Add option to change the local group size

Signed-off-by: Trivernis <trivernis@protonmail.com>
pull/1/head
Trivernis 4 years ago
parent 16e3e4a1bc
commit 8704bd387c
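This commit threads an optional local work-group size from the command line down to every kernel launch. The recurring pattern in the hunks below: start a `KernelBuilder`, call `local_work_size` only when a value was supplied, and otherwise let OpenCL pick the work-group size. A self-contained sketch of that pattern with the ocl crate (kernel name and source are illustrative, not taken from this repository):

```rust
use ocl::ProQue;

/// Launches a trivial kernel, setting the local work size only when the
/// caller provided one -- the same conditional-builder pattern used in
/// bench_int, filter_primes and filter_primes_simple below.
fn run(local_size: Option<usize>) -> ocl::Result<()> {
    let pro_que = ProQue::builder()
        // Illustrative kernel: increments every element of a buffer.
        .src("__kernel void noop(__global ulong* buf) { buf[get_global_id(0)] += 1; }")
        .dims(1 << 10)
        .build()?;
    let buffer = pro_que.create_buffer::<u64>()?;

    let mut builder = pro_que.kernel_builder("noop");
    if let Some(local_size) = local_size {
        // Only constrain the work-group size when explicitly requested.
        builder.local_work_size(local_size);
    }
    let kernel = builder
        .arg(&buffer)
        .global_work_size(buffer.len())
        .build()?;

    unsafe { kernel.enq()? };
    pro_que.finish()
}

fn main() -> ocl::Result<()> {
    run(Some(64))?; // 64 divides the global size of 1024
    run(None) // let OpenCL choose
}
```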

@@ -18,6 +18,7 @@ impl ConcurrentKernelExecutor {
&self,
mut offset: u64,
numbers_per_step: usize,
local_size: Option<usize>,
stop: u64,
no_cache: bool,
num_threads: usize,
@@ -35,6 +36,7 @@ impl ConcurrentKernelExecutor {
let controller = self.kernel_controller.clone();
let offset = Arc::clone(&offset);
let panic = Arc::clone(&panic);
let local_size = local_size.clone();
handles.push(
ThreadBuilder::new()
@@ -54,7 +56,7 @@ impl ConcurrentKernelExecutor {
.collect::<Vec<u64>>();
let prime_result = if no_cache {
controller
.filter_primes_simple(numbers)
.filter_primes_simple(numbers, local_size.clone())
.map_err(|e| {
panic.store(true, Ordering::Relaxed);
e
@@ -62,7 +64,7 @@ impl ConcurrentKernelExecutor {
.unwrap()
} else {
controller
.filter_primes(numbers)
.filter_primes(numbers, local_size.clone())
.map_err(|e| {
panic.store(true, Ordering::Relaxed);
e

@@ -11,6 +11,7 @@ use std::time::{Duration, Instant};
pub struct BenchStatistics {
pub calc_count: u32,
pub num_tasks: usize,
pub local_size: Option<usize>,
pub write_duration: Duration,
pub calc_duration: Duration,
pub read_duration: Duration,
@@ -20,9 +21,10 @@ impl Display for BenchStatistics {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
write!(
f,
"Calculation Count: {}\nTask Count: {}\nWrite Duration: {} ms\nGPU Duration: {} ms\nRead Duration: {} ms",
"Calculation Count: {}\nTask Count: {}\nLocal Size: {}\nWrite Duration: {} ms\nGPU Duration: {} ms\nRead Duration: {} ms",
self.calc_count,
self.num_tasks,
self.local_size.map(|v|v.to_string()).unwrap_or("n/a".to_string()),
self.write_duration.as_secs_f64() * 1000f64,
self.calc_duration.as_secs_f64() * 1000f64,
self.read_duration.as_secs_f64() * 1000f64
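The Display impl now also prints the optional local size, with "n/a" when OpenCL chose the size itself. The Option-to-string fallback in isolation, as a small illustrative snippet:

```rust
/// Illustrative only: render an optional local size the way the updated
/// Display impl does, falling back to "n/a" when no value was provided.
fn local_size_label(local_size: Option<usize>) -> String {
    local_size
        .map(|v| v.to_string())
        .unwrap_or_else(|| "n/a".to_string())
}

fn main() {
    assert_eq!(local_size_label(Some(128)), "128");
    assert_eq!(local_size_label(None), "n/a");
}
```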
@@ -40,7 +42,12 @@ impl BenchStatistics {
impl KernelController {
/// Benches an integer
pub fn bench_int(&self, calc_count: u32, num_tasks: usize) -> ocl::Result<BenchStatistics> {
pub fn bench_int(
&self,
calc_count: u32,
num_tasks: usize,
local_size: Option<usize>,
) -> ocl::Result<BenchStatistics> {
let write_start = Instant::now();
let input_buffer = self
.pro_que
@@ -50,9 +57,13 @@ impl KernelController {
.build()?;
let write_duration = write_start.elapsed();
let kernel = self
.pro_que
.kernel_builder("bench_int")
let mut builder = self.pro_que.kernel_builder("bench_int");
if let Some(local_size) = local_size {
builder.local_work_size(local_size);
}
let kernel = builder
.arg(calc_count)
.arg(&input_buffer)
.global_work_size(num_tasks)
@@ -71,6 +82,7 @@ impl KernelController {
Ok(BenchStatistics {
num_tasks,
calc_count,
local_size,
read_duration,
calc_duration,
write_duration,
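With `local_size` carried in `BenchStatistics`, every benchmark result can report which work-group size produced it; the CSV written by `bench_task_count` further down gains a matching `local_size` column, again with "n/a" when the size was left to OpenCL.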

@@ -6,7 +6,7 @@
use ocl::core::DeviceInfo;
use ocl::enums::DeviceInfoResult;
use ocl::ProQue;
use ocl::{CommandQueueProperties, ProQue};
pub mod bench;
pub mod primes;
@@ -21,12 +21,14 @@ impl KernelController {
let pro_que = ProQue::builder()
.src(include_str!("kernel.cl"))
.dims(1 << 20)
.queue_properties(CommandQueueProperties::PROFILING_ENABLE)
.build()?;
println!("Using device {}", pro_que.device().name()?);
Ok(Self { pro_que })
}
/// Prints information about the gpu capabilities
pub fn print_info(&self) -> ocl::Result<()> {
let device = self.pro_que.device();
let info_keys = vec![
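Enabling `CommandQueueProperties::PROFILING_ENABLE` on the queue is a prerequisite for the event-timestamp measurements introduced in the primes hunks below: OpenCL only records profiling information for events on a queue created with profiling enabled, so without this flag the `get_event_profiling_info` calls would fail.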

@@ -5,6 +5,8 @@
*/
use crate::kernel_controller::KernelController;
use ocl::core::{get_event_profiling_info, wait_for_event, ProfilingInfo};
use ocl::EventList;
use parking_lot::Mutex;
use std::mem::size_of;
use std::sync::Arc;
@@ -19,7 +21,11 @@ pub struct PrimeCalculationResult {
impl KernelController {
/// Filters all primes from the input without using a precalculated list of primes
/// for divisibility checks
pub fn filter_primes_simple(&self, input: Vec<u64>) -> ocl::Result<PrimeCalculationResult> {
pub fn filter_primes_simple(
&self,
input: Vec<u64>,
local_size: Option<usize>,
) -> ocl::Result<PrimeCalculationResult> {
let input_buffer = self.pro_que.buffer_builder().len(input.len()).build()?;
input_buffer.write(&input[..]).enq()?;
@@ -30,27 +36,37 @@ impl KernelController {
.fill_val(0u8)
.build()?;
let kernel = self
.pro_que
.kernel_builder("check_prime")
let mut builder = self.pro_que.kernel_builder("check_prime");
if let Some(local_size) = local_size {
builder.local_work_size(local_size);
}
let kernel = builder
.arg(&input_buffer)
.arg(&output_buffer)
.global_work_size(input.len())
.build()?;
let start = Instant::now();
let start_cpu = Instant::now();
let event_start = self.pro_que.queue().enqueue_marker::<EventList>(None)?;
unsafe {
kernel.enq()?;
}
let event_stop = self.pro_que.queue().enqueue_marker::<EventList>(None)?;
wait_for_event(&event_start)?;
wait_for_event(&event_stop)?;
self.pro_que.finish()?;
let gpu_calc_duration = start.elapsed();
let start = get_event_profiling_info(&event_start, ProfilingInfo::End)?;
let stop = get_event_profiling_info(&event_stop, ProfilingInfo::Start)?;
let gpu_calc_duration = Duration::from_nanos(stop.time()? - start.time()?);
let mut output = vec![0u8; output_buffer.len()];
output_buffer.read(&mut output).enq()?;
println!(
"GPU IO + Calculation took {} ms",
gpu_calc_duration.as_secs_f64() * 1000f64
"GPU Calculation: {} ms\nGPU IO + Calculation: {} ms",
gpu_calc_duration.as_secs_f64() * 1000f64,
start_cpu.elapsed().as_secs_f64() * 1000f64
);
let filter_start = Instant::now();
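The timing change above swaps a host-side `Instant` around the enqueue for OpenCL event profiling: two markers bracket the kernel, the GPU-only duration is taken from their device timestamps, and `start_cpu` keeps the old combined IO + calculation figure. A standalone sketch of that technique, assuming a queue built with `PROFILING_ENABLE` (as done in mod.rs above); kernel name and source are illustrative:

```rust
use ocl::core::{get_event_profiling_info, wait_for_event, ProfilingInfo};
use ocl::{CommandQueueProperties, EventList, ProQue};
use std::time::{Duration, Instant};

fn main() -> ocl::Result<()> {
    let pro_que = ProQue::builder()
        .src("__kernel void busy(__global ulong* buf) { buf[get_global_id(0)] *= 3; }")
        .dims(1 << 16)
        // Required for ProfilingInfo queries on events from this queue.
        .queue_properties(CommandQueueProperties::PROFILING_ENABLE)
        .build()?;
    let buffer = pro_que.create_buffer::<u64>()?;
    let kernel = pro_que
        .kernel_builder("busy")
        .arg(&buffer)
        .global_work_size(buffer.len())
        .build()?;

    // Host-side stopwatch: includes enqueue overhead and the final finish().
    let start_cpu = Instant::now();
    // Device-side timing: bracket the kernel with two markers and read
    // their profiling timestamps once everything has completed.
    let event_start = pro_que.queue().enqueue_marker::<EventList>(None)?;
    unsafe { kernel.enq()? };
    let event_stop = pro_que.queue().enqueue_marker::<EventList>(None)?;
    wait_for_event(&event_start)?;
    wait_for_event(&event_stop)?;
    pro_que.finish()?;

    let start = get_event_profiling_info(&event_start, ProfilingInfo::End)?;
    let stop = get_event_profiling_info(&event_stop, ProfilingInfo::Start)?;
    let gpu_calc_duration = Duration::from_nanos(stop.time()? - start.time()?);
    println!(
        "GPU Calculation: {} ms\nGPU IO + Calculation: {} ms",
        gpu_calc_duration.as_secs_f64() * 1000f64,
        start_cpu.elapsed().as_secs_f64() * 1000f64
    );
    Ok(())
}
```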
@@ -65,7 +81,11 @@ impl KernelController {
/// Filters the primes from a list of numbers by using a precalculated list of primes to check
/// for divisibility
pub fn filter_primes(&self, input: Vec<u64>) -> ocl::Result<PrimeCalculationResult> {
pub fn filter_primes(
&self,
input: Vec<u64>,
local_size: Option<usize>,
) -> ocl::Result<PrimeCalculationResult> {
lazy_static::lazy_static! {static ref PRIME_CACHE: Arc<Mutex<Vec<u64>>> = Arc::new(Mutex::new(Vec::new()));}
if PRIME_CACHE.lock().len() == 0 {
PRIME_CACHE.lock().append(&mut get_primes(
@@ -96,30 +116,41 @@ impl KernelController {
.fill_val(0u8)
.build()?;
let kernel = self
.pro_que
.kernel_builder("check_prime_cached")
let mut builder = self.pro_que.kernel_builder("check_prime_cached");
if let Some(local_size) = local_size {
builder.local_work_size(local_size);
}
let kernel = builder
.arg(prime_buffer.len() as u32)
.arg(&prime_buffer)
.arg(&input_buffer)
.arg(&output_buffer)
.local_work_size(2)
.global_work_size(input.len())
.build()?;
let start = Instant::now();
let event_start = self.pro_que.queue().enqueue_marker::<EventList>(None)?;
let start_cpu = Instant::now();
unsafe {
kernel.enq()?;
}
let event_stop = self.pro_que.queue().enqueue_marker::<EventList>(None)?;
wait_for_event(&event_start)?;
wait_for_event(&event_stop)?;
self.pro_que.finish()?;
let gpu_calc_duration = start.elapsed();
let start = get_event_profiling_info(&event_start, ProfilingInfo::End)?;
let stop = get_event_profiling_info(&event_stop, ProfilingInfo::Start)?;
let gpu_calc_duration = Duration::from_nanos(stop.time()? - start.time()?);
let mut output = vec![0u8; output_buffer.len()];
output_buffer.read(&mut output).enq()?;
println!(
"GPU IO + Calculation took {} ms",
gpu_calc_duration.as_secs_f64() * 1000f64
"GPU Calculation: {} ms\nGPU IO + Calculation: {} ms",
gpu_calc_duration.as_secs_f64() * 1000f64,
start_cpu.elapsed().as_secs_f64() * 1000f64
);
let prime_filter_start = Instant::now();
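Also worth noting in this hunk: the cached-prime kernel previously pinned `.local_work_size(2)`; that hard-coded value is gone, and both prime kernels now use the same caller-supplied, optional local size.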

@@ -55,6 +55,13 @@ struct CalculatePrimes {
#[structopt(long = "timings-output", default_value = "timings.csv")]
timings_file: PathBuf,
/// The local size for the tasks.
/// The value for numbers_per_step needs to be divisible by this number.
/// The maximum local size depends on the gpu capabilities.
/// If no value is provided, OpenCL chooses it automatically.
#[structopt(long = "local-size")]
local_size: Option<usize>,
/// The amount of numbers that are checked per step. Even numbers are ignored so the
/// Range actually goes to numbers_per_step * 2.
#[structopt(long = "numbers-per-step", default_value = "33554432")]
@@ -83,6 +90,20 @@ struct BenchmarkTaskCount {
#[structopt(long = "num-tasks-start", default_value = "1")]
num_tasks_start: usize,
/// The initial number for the local size
#[structopt(long = "local-size-start")]
local_size_start: Option<usize>,
/// The amount the local size increases by every step
#[structopt(long = "local-size-step", default_value = "10")]
local_size_step: usize,
/// The maximum value for the local size
/// Can't be greater than the maximum local size of the gpu
/// that can be retrieved with the info command
#[structopt(long = "local-size-stop")]
local_size_stop: Option<usize>,
/// The maximum number of tasks for the benchmark
#[structopt(long = "num-tasks-stop", default_value = "10000000")]
num_tasks_stop: usize,
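Taken together, the new structopt flags give a single `--local-size` for the prime calculation and a `--local-size-start`/`--local-size-step`/`--local-size-stop` sweep for the task-count benchmark. For example, `--local-size 64` works with the default `--numbers-per-step` of 33554432, since 33554432 is divisible by 64; these values are only illustrative, and the subcommand names are not shown in this diff.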
@@ -154,6 +175,7 @@ fn calculate_primes(prime_opts: CalculatePrimes, controller: KernelController) -
executor.calculate_primes(
prime_opts.start_offset,
prime_opts.numbers_per_step,
prime_opts.local_size,
prime_opts.max_number,
prime_opts.no_cache,
prime_opts.num_threads,
@@ -205,6 +227,7 @@ fn bench_task_count(opts: BenchmarkTaskCount, controller: KernelController) -> o
let csv_writer = CSVWriter::new(
bench_writer,
&[
"local_size",
"num_tasks",
"calc_count",
"write_duration",
@@ -214,15 +237,37 @@ fn bench_task_count(opts: BenchmarkTaskCount, controller: KernelController) -> o
)
.unwrap();
let (bench_sender, bench_handle) = create_csv_write_thread(csv_writer);
for n in (opts.num_tasks_start..opts.num_tasks_stop).step_by(opts.num_tasks_step) {
let mut stats = controller.bench_int(opts.calculation_steps, n)?;
for n in (opts.num_tasks_start..=opts.num_tasks_stop).step_by(opts.num_tasks_step) {
if let (Some(start), Some(stop)) = (opts.local_size_start, opts.local_size_stop) {
for l in (start..=stop)
.step_by(opts.local_size_step)
.filter(|v| n % v == 0)
{
let mut stats = controller.bench_int(opts.calculation_steps, n, Some(l))?;
for _ in 1..opts.average_of {
stats.avg(controller.bench_int(opts.calculation_steps, n)?)
stats.avg(controller.bench_int(opts.calculation_steps, n, Some(l))?)
}
println!("{}\n", stats);
bench_sender
.send(vec![
l.to_string(),
n.to_string(),
opts.calculation_steps.to_string(),
duration_to_ms_string(&stats.write_duration),
duration_to_ms_string(&stats.calc_duration),
duration_to_ms_string(&stats.read_duration),
])
.unwrap();
}
} else {
let mut stats = controller.bench_int(opts.calculation_steps, n, None)?;
for _ in 1..opts.average_of {
stats.avg(controller.bench_int(opts.calculation_steps, n, None)?)
}
println!("{}\n", stats);
bench_sender
.send(vec![
"n/a".to_string(),
n.to_string(),
opts.calculation_steps.to_string(),
duration_to_ms_string(&stats.write_duration),
@@ -231,6 +276,7 @@ fn bench_task_count(opts: BenchmarkTaskCount, controller: KernelController) -> o
])
.unwrap();
}
}
mem::drop(bench_sender);
bench_handle.join().unwrap();
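The sweep above only benches local sizes that evenly divide the current task count, since OpenCL (before the non-uniform work-groups of 2.0) rejects launches whose global work size is not a multiple of the local work size. The candidate filter in isolation, with illustrative numbers:

```rust
/// Mirrors the filter used in bench_task_count: keep only local sizes
/// that evenly divide the global task count.
fn candidate_local_sizes(num_tasks: usize, start: usize, step: usize, stop: usize) -> Vec<usize> {
    (start..=stop)
        .step_by(step)
        .filter(|l| num_tasks % l == 0)
        .collect()
}

fn main() {
    // With 1024 tasks and a sweep from 2 to 128 in steps of 2, only the
    // power-of-two divisors of 1024 in that range survive the filter.
    let sizes = candidate_local_sizes(1024, 2, 2, 128);
    assert!(sizes.iter().all(|l| 1024 % l == 0));
    println!("{:?}", sizes); // [2, 4, 8, 16, 32, 64, 128]
}
```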
