From 8704bd387c98ed7f4bb8c3d9447a93eca113903d Mon Sep 17 00:00:00 2001 From: Trivernis Date: Mon, 11 Jan 2021 14:49:22 +0100 Subject: [PATCH] Add option to change the local group size Signed-off-by: Trivernis --- src/concurrency/executor.rs | 6 ++- src/kernel_controller/bench.rs | 22 +++++++--- src/kernel_controller/mod.rs | 4 +- src/kernel_controller/primes.rs | 67 +++++++++++++++++++++-------- src/main.rs | 76 ++++++++++++++++++++++++++------- 5 files changed, 134 insertions(+), 41 deletions(-) diff --git a/src/concurrency/executor.rs b/src/concurrency/executor.rs index 7096866..49df147 100644 --- a/src/concurrency/executor.rs +++ b/src/concurrency/executor.rs @@ -18,6 +18,7 @@ impl ConcurrentKernelExecutor { &self, mut offset: u64, numbers_per_step: usize, + local_size: Option, stop: u64, no_cache: bool, num_threads: usize, @@ -35,6 +36,7 @@ impl ConcurrentKernelExecutor { let controller = self.kernel_controller.clone(); let offset = Arc::clone(&offset); let panic = Arc::clone(&panic); + let local_size = local_size.clone(); handles.push( ThreadBuilder::new() @@ -54,7 +56,7 @@ impl ConcurrentKernelExecutor { .collect::>(); let prime_result = if no_cache { controller - .filter_primes_simple(numbers) + .filter_primes_simple(numbers, local_size.clone()) .map_err(|e| { panic.store(true, Ordering::Relaxed); e @@ -62,7 +64,7 @@ impl ConcurrentKernelExecutor { .unwrap() } else { controller - .filter_primes(numbers) + .filter_primes(numbers, local_size.clone()) .map_err(|e| { panic.store(true, Ordering::Relaxed); e diff --git a/src/kernel_controller/bench.rs b/src/kernel_controller/bench.rs index 2b86926..4a63ad0 100644 --- a/src/kernel_controller/bench.rs +++ b/src/kernel_controller/bench.rs @@ -11,6 +11,7 @@ use std::time::{Duration, Instant}; pub struct BenchStatistics { pub calc_count: u32, pub num_tasks: usize, + pub local_size: Option, pub write_duration: Duration, pub calc_duration: Duration, pub read_duration: Duration, @@ -20,9 +21,10 @@ impl Display for BenchStatistics { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, - "Calculation Count: {}\nTask Count: {}\nWrite Duration: {} ms\nGPU Duration: {} ms\nRead Duration: {} ms", + "Calculation Count: {}\nTask Count: {}\nLocal Size: {}\nWrite Duration: {} ms\nGPU Duration: {} ms\nRead Duration: {} ms", self.calc_count, self.num_tasks, + self.local_size.map(|v|v.to_string()).unwrap_or("n/a".to_string()), self.write_duration.as_secs_f64() * 1000f64, self.calc_duration.as_secs_f64() * 1000f64, self.read_duration.as_secs_f64() * 1000f64 @@ -40,7 +42,12 @@ impl BenchStatistics { impl KernelController { /// Benches an integer - pub fn bench_int(&self, calc_count: u32, num_tasks: usize) -> ocl::Result { + pub fn bench_int( + &self, + calc_count: u32, + num_tasks: usize, + local_size: Option, + ) -> ocl::Result { let write_start = Instant::now(); let input_buffer = self .pro_que @@ -50,9 +57,13 @@ impl KernelController { .build()?; let write_duration = write_start.elapsed(); - let kernel = self - .pro_que - .kernel_builder("bench_int") + let mut builder = self.pro_que.kernel_builder("bench_int"); + + if let Some(local_size) = local_size { + builder.local_work_size(local_size); + } + + let kernel = builder .arg(calc_count) .arg(&input_buffer) .global_work_size(num_tasks) @@ -71,6 +82,7 @@ impl KernelController { Ok(BenchStatistics { num_tasks, calc_count, + local_size, read_duration, calc_duration, write_duration, diff --git a/src/kernel_controller/mod.rs b/src/kernel_controller/mod.rs index 5d454f3..ddc8940 100644 --- a/src/kernel_controller/mod.rs +++ b/src/kernel_controller/mod.rs @@ -6,7 +6,7 @@ use ocl::core::DeviceInfo; use ocl::enums::DeviceInfoResult; -use ocl::ProQue; +use ocl::{CommandQueueProperties, ProQue}; pub mod bench; pub mod primes; @@ -21,12 +21,14 @@ impl KernelController { let pro_que = ProQue::builder() .src(include_str!("kernel.cl")) .dims(1 << 20) + .queue_properties(CommandQueueProperties::PROFILING_ENABLE) .build()?; println!("Using device {}", pro_que.device().name()?); Ok(Self { pro_que }) } + /// Prints information about the gpu capabilities pub fn print_info(&self) -> ocl::Result<()> { let device = self.pro_que.device(); let info_keys = vec![ diff --git a/src/kernel_controller/primes.rs b/src/kernel_controller/primes.rs index a939a8b..fd3fac9 100644 --- a/src/kernel_controller/primes.rs +++ b/src/kernel_controller/primes.rs @@ -5,6 +5,8 @@ */ use crate::kernel_controller::KernelController; +use ocl::core::{get_event_profiling_info, wait_for_event, ProfilingInfo}; +use ocl::EventList; use parking_lot::Mutex; use std::mem::size_of; use std::sync::Arc; @@ -19,7 +21,11 @@ pub struct PrimeCalculationResult { impl KernelController { /// Filters all primes from the input without using a precalculated list of primes /// for divisibility checks - pub fn filter_primes_simple(&self, input: Vec) -> ocl::Result { + pub fn filter_primes_simple( + &self, + input: Vec, + local_size: Option, + ) -> ocl::Result { let input_buffer = self.pro_que.buffer_builder().len(input.len()).build()?; input_buffer.write(&input[..]).enq()?; @@ -30,27 +36,37 @@ impl KernelController { .fill_val(0u8) .build()?; - let kernel = self - .pro_que - .kernel_builder("check_prime") + let mut builder = self.pro_que.kernel_builder("check_prime"); + if let Some(local_size) = local_size { + builder.local_work_size(local_size); + } + let kernel = builder .arg(&input_buffer) .arg(&output_buffer) .global_work_size(input.len()) .build()?; - let start = Instant::now(); + let start_cpu = Instant::now(); + let event_start = self.pro_que.queue().enqueue_marker::(None)?; + unsafe { kernel.enq()?; } + let event_stop = self.pro_que.queue().enqueue_marker::(None)?; + + wait_for_event(&event_start)?; + wait_for_event(&event_stop)?; - self.pro_que.finish()?; - let gpu_calc_duration = start.elapsed(); + let start = get_event_profiling_info(&event_start, ProfilingInfo::End)?; + let stop = get_event_profiling_info(&event_stop, ProfilingInfo::Start)?; + let gpu_calc_duration = Duration::from_nanos(stop.time()? - start.time()?); let mut output = vec![0u8; output_buffer.len()]; output_buffer.read(&mut output).enq()?; println!( - "GPU IO + Calculation took {} ms", - gpu_calc_duration.as_secs_f64() * 1000f64 + "GPU Calculation: {} ms\nGPU IO + Calculation: {} ms", + gpu_calc_duration.as_secs_f64() * 1000f64, + start_cpu.elapsed().as_secs_f64() * 1000f64 ); let filter_start = Instant::now(); @@ -65,7 +81,11 @@ impl KernelController { /// Filters the primes from a list of numbers by using a precalculated list of primes to check /// for divisibility - pub fn filter_primes(&self, input: Vec) -> ocl::Result { + pub fn filter_primes( + &self, + input: Vec, + local_size: Option, + ) -> ocl::Result { lazy_static::lazy_static! {static ref PRIME_CACHE: Arc>> = Arc::new(Mutex::new(Vec::new()));} if PRIME_CACHE.lock().len() == 0 { PRIME_CACHE.lock().append(&mut get_primes( @@ -96,30 +116,41 @@ impl KernelController { .fill_val(0u8) .build()?; - let kernel = self - .pro_que - .kernel_builder("check_prime_cached") + let mut builder = self.pro_que.kernel_builder("check_prime_cached"); + if let Some(local_size) = local_size { + builder.local_work_size(local_size); + } + let kernel = builder .arg(prime_buffer.len() as u32) .arg(&prime_buffer) .arg(&input_buffer) .arg(&output_buffer) + .local_work_size(2) .global_work_size(input.len()) .build()?; - let start = Instant::now(); + let event_start = self.pro_que.queue().enqueue_marker::(None)?; + let start_cpu = Instant::now(); + unsafe { kernel.enq()?; } + let event_stop = self.pro_que.queue().enqueue_marker::(None)?; + + wait_for_event(&event_start)?; + wait_for_event(&event_stop)?; - self.pro_que.finish()?; - let gpu_calc_duration = start.elapsed(); + let start = get_event_profiling_info(&event_start, ProfilingInfo::End)?; + let stop = get_event_profiling_info(&event_stop, ProfilingInfo::Start)?; + let gpu_calc_duration = Duration::from_nanos(stop.time()? - start.time()?); let mut output = vec![0u8; output_buffer.len()]; output_buffer.read(&mut output).enq()?; println!( - "GPU IO + Calculation took {} ms", - gpu_calc_duration.as_secs_f64() * 1000f64 + "GPU Calculation: {} ms\nGPU IO + Calculation: {} ms", + gpu_calc_duration.as_secs_f64() * 1000f64, + start_cpu.elapsed().as_secs_f64() * 1000f64 ); let prime_filter_start = Instant::now(); diff --git a/src/main.rs b/src/main.rs index 2ad185c..0eb5f92 100644 --- a/src/main.rs +++ b/src/main.rs @@ -55,6 +55,13 @@ struct CalculatePrimes { #[structopt(long = "timings-output", default_value = "timings.csv")] timings_file: PathBuf, + /// The local size for the tasks. + /// The value for numbers_per_step needs to be divisible by this number. + /// The maximum local size depends on the gpu capabilities. + /// If no value is provided, OpenCL chooses it automatically. + #[structopt(long = "local-size")] + local_size: Option, + /// The amount of numbers that are checked per step. Even numbers are ignored so the /// Range actually goes to numbers_per_step * 2. #[structopt(long = "numbers-per-step", default_value = "33554432")] @@ -83,6 +90,20 @@ struct BenchmarkTaskCount { #[structopt(long = "num-tasks-start", default_value = "1")] num_tasks_start: usize, + /// The initial number for the local size + #[structopt(long = "local-size-start")] + local_size_start: Option, + + /// The amount the local size increases by every step + #[structopt(long = "local-size-step", default_value = "10")] + local_size_step: usize, + + /// The maximum amount of the local size + /// Can't be greater than the maximum local size of the gpu + /// that can be retrieved with the info command + #[structopt(long = "local-size-stop")] + local_size_stop: Option, + /// The maximum number of tasks for the benchmark #[structopt(long = "num-tasks-stop", default_value = "10000000")] num_tasks_stop: usize, @@ -154,6 +175,7 @@ fn calculate_primes(prime_opts: CalculatePrimes, controller: KernelController) - executor.calculate_primes( prime_opts.start_offset, prime_opts.numbers_per_step, + prime_opts.local_size, prime_opts.max_number, prime_opts.no_cache, prime_opts.num_threads, @@ -205,6 +227,7 @@ fn bench_task_count(opts: BenchmarkTaskCount, controller: KernelController) -> o let csv_writer = CSVWriter::new( bench_writer, &[ + "local_size", "num_tasks", "calc_count", "write_duration", @@ -214,22 +237,45 @@ fn bench_task_count(opts: BenchmarkTaskCount, controller: KernelController) -> o ) .unwrap(); let (bench_sender, bench_handle) = create_csv_write_thread(csv_writer); - for n in (opts.num_tasks_start..opts.num_tasks_stop).step_by(opts.num_tasks_step) { - let mut stats = controller.bench_int(opts.calculation_steps, n)?; - for _ in 1..opts.average_of { - stats.avg(controller.bench_int(opts.calculation_steps, n)?) + for n in (opts.num_tasks_start..=opts.num_tasks_stop).step_by(opts.num_tasks_step) { + if let (Some(start), Some(stop)) = (opts.local_size_start, opts.local_size_stop) { + for l in (start..=stop) + .step_by(opts.local_size_step) + .filter(|v| n % v == 0) + { + let mut stats = controller.bench_int(opts.calculation_steps, n, Some(l))?; + for _ in 1..opts.average_of { + stats.avg(controller.bench_int(opts.calculation_steps, n, Some(l))?) + } + println!("{}\n", stats); + bench_sender + .send(vec![ + l.to_string(), + n.to_string(), + opts.calculation_steps.to_string(), + duration_to_ms_string(&stats.write_duration), + duration_to_ms_string(&stats.calc_duration), + duration_to_ms_string(&stats.read_duration), + ]) + .unwrap(); + } + } else { + let mut stats = controller.bench_int(opts.calculation_steps, n, None)?; + for _ in 1..opts.average_of { + stats.avg(controller.bench_int(opts.calculation_steps, n, None)?) + } + println!("{}\n", stats); + bench_sender + .send(vec![ + "n/a".to_string(), + n.to_string(), + opts.calculation_steps.to_string(), + duration_to_ms_string(&stats.write_duration), + duration_to_ms_string(&stats.calc_duration), + duration_to_ms_string(&stats.read_duration), + ]) + .unwrap(); } - - println!("{}\n", stats); - bench_sender - .send(vec![ - n.to_string(), - opts.calculation_steps.to_string(), - duration_to_ms_string(&stats.write_duration), - duration_to_ms_string(&stats.calc_duration), - duration_to_ms_string(&stats.read_duration), - ]) - .unwrap(); } mem::drop(bench_sender);