Add option to change the local group size

Signed-off-by: Trivernis <trivernis@protonmail.com>
4 years ago · 8704bd387c
parent 16e3e4a1bc
commit 8704bd387c
5 changed files with 134 additions and 41 deletions
--- a/src/concurrency/executor.rs
+++ b/src/concurrency/executor.rs
@ -18,6 +18,7 @@ impl ConcurrentKernelExecutor {
        &self,
        mut offset: u64,
        numbers_per_step: usize,
        local_size: Option<usize>,
        stop: u64,
        no_cache: bool,
        num_threads: usize,
@ -35,6 +36,7 @@ impl ConcurrentKernelExecutor {
            let controller = self.kernel_controller.clone();
            let offset = Arc::clone(&offset);
            let panic = Arc::clone(&panic);
            let local_size = local_size.clone();
            handles.push(
                ThreadBuilder::new()
@ -54,7 +56,7 @@ impl ConcurrentKernelExecutor {
                            .collect::<Vec<u64>>();
                        let prime_result = if no_cache {
                            controller
-                                .filter_primes_simple(numbers)
+                                .filter_primes_simple(numbers, local_size.clone())
                                .map_err(|e| {
                                    panic.store(true, Ordering::Relaxed);
                                    e
@ -62,7 +64,7 @@ impl ConcurrentKernelExecutor {
                                .unwrap()
                        } else {
                            controller
-                                .filter_primes(numbers)
+                                .filter_primes(numbers, local_size.clone())
                                .map_err(|e| {
                                    panic.store(true, Ordering::Relaxed);
                                    e
--- a/src/kernel_controller/bench.rs
+++ b/src/kernel_controller/bench.rs
@ -11,6 +11,7 @@ use std::time::{Duration, Instant};
 pub struct BenchStatistics {
    pub calc_count: u32,
    pub num_tasks: usize,
    pub local_size: Option<usize>,
    pub write_duration: Duration,
    pub calc_duration: Duration,
    pub read_duration: Duration,
@ -20,9 +21,10 @@ impl Display for BenchStatistics {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        write!(
            f,
-            "Calculation Count: {}\nTask Count: {}\nWrite Duration: {} ms\nGPU Duration: {} ms\nRead Duration: {} ms",
+            "Calculation Count: {}\nTask Count: {}\nLocal Size: {}\nWrite Duration: {} ms\nGPU Duration: {} ms\nRead Duration: {} ms",
            self.calc_count,
            self.num_tasks,
            self.local_size.map(|v|v.to_string()).unwrap_or("n/a".to_string()),
            self.write_duration.as_secs_f64() * 1000f64,
            self.calc_duration.as_secs_f64() * 1000f64,
            self.read_duration.as_secs_f64() * 1000f64
@ -40,7 +42,12 @@ impl BenchStatistics {
 impl KernelController {
    /// Benches an integer
-    pub fn bench_int(&self, calc_count: u32, num_tasks: usize) -> ocl::Result<BenchStatistics> {
+    pub fn bench_int(
        &self,
        calc_count: u32,
        num_tasks: usize,
        local_size: Option<usize>,
    ) -> ocl::Result<BenchStatistics> {
        let write_start = Instant::now();
        let input_buffer = self
            .pro_que
@ -50,9 +57,13 @@ impl KernelController {
            .build()?;
        let write_duration = write_start.elapsed();
-        let kernel = self
+        let mut builder = self.pro_que.kernel_builder("bench_int");
-            .pro_que
+
-            .kernel_builder("bench_int")
+        if let Some(local_size) = local_size {
            builder.local_work_size(local_size);
        }
        let kernel = builder
            .arg(calc_count)
            .arg(&input_buffer)
            .global_work_size(num_tasks)
@ -71,6 +82,7 @@ impl KernelController {
        Ok(BenchStatistics {
            num_tasks,
            calc_count,
            local_size,
            read_duration,
            calc_duration,
            write_duration,
--- a/src/kernel_controller/mod.rs
+++ b/src/kernel_controller/mod.rs
@ -6,7 +6,7 @@
 use ocl::core::DeviceInfo;
 use ocl::enums::DeviceInfoResult;
-use ocl::ProQue;
+use ocl::{CommandQueueProperties, ProQue};
 pub mod bench;
 pub mod primes;
@ -21,12 +21,14 @@ impl KernelController {
        let pro_que = ProQue::builder()
            .src(include_str!("kernel.cl"))
            .dims(1 << 20)
            .queue_properties(CommandQueueProperties::PROFILING_ENABLE)
            .build()?;
        println!("Using device {}", pro_que.device().name()?);
        Ok(Self { pro_que })
    }
    /// Prints information about the gpu capabilities
    pub fn print_info(&self) -> ocl::Result<()> {
        let device = self.pro_que.device();
        let info_keys = vec![
--- a/src/kernel_controller/primes.rs
+++ b/src/kernel_controller/primes.rs
@ -5,6 +5,8 @@
 */
 use crate::kernel_controller::KernelController;
 use ocl::core::{get_event_profiling_info, wait_for_event, ProfilingInfo};
 use ocl::EventList;
 use parking_lot::Mutex;
 use std::mem::size_of;
 use std::sync::Arc;
@ -19,7 +21,11 @@ pub struct PrimeCalculationResult {
 impl KernelController {
    /// Filters all primes from the input without using a precalculated list of primes
    /// for divisibility checks
-    pub fn filter_primes_simple(&self, input: Vec<u64>) -> ocl::Result<PrimeCalculationResult> {
+    pub fn filter_primes_simple(
        &self,
        input: Vec<u64>,
        local_size: Option<usize>,
    ) -> ocl::Result<PrimeCalculationResult> {
        let input_buffer = self.pro_que.buffer_builder().len(input.len()).build()?;
        input_buffer.write(&input[..]).enq()?;
@ -30,27 +36,37 @@ impl KernelController {
            .fill_val(0u8)
            .build()?;
-        let kernel = self
+        let mut builder = self.pro_que.kernel_builder("check_prime");
-            .pro_que
+        if let Some(local_size) = local_size {
-            .kernel_builder("check_prime")
+            builder.local_work_size(local_size);
        }
        let kernel = builder
            .arg(&input_buffer)
            .arg(&output_buffer)
            .global_work_size(input.len())
            .build()?;
-        let start = Instant::now();
+        let start_cpu = Instant::now();
        let event_start = self.pro_que.queue().enqueue_marker::<EventList>(None)?;
        unsafe {
            kernel.enq()?;
        }
        let event_stop = self.pro_que.queue().enqueue_marker::<EventList>(None)?;
        wait_for_event(&event_start)?;
        wait_for_event(&event_stop)?;
-        self.pro_que.finish()?;
+        let start = get_event_profiling_info(&event_start, ProfilingInfo::End)?;
-        let gpu_calc_duration = start.elapsed();
+        let stop = get_event_profiling_info(&event_stop, ProfilingInfo::Start)?;
        let gpu_calc_duration = Duration::from_nanos(stop.time()? - start.time()?);
        let mut output = vec![0u8; output_buffer.len()];
        output_buffer.read(&mut output).enq()?;
        println!(
-            "GPU IO + Calculation took {} ms",
+            "GPU Calculation: {} ms\nGPU IO + Calculation: {} ms",
-            gpu_calc_duration.as_secs_f64() * 1000f64
+            gpu_calc_duration.as_secs_f64() * 1000f64,
            start_cpu.elapsed().as_secs_f64() * 1000f64
        );
        let filter_start = Instant::now();
@ -65,7 +81,11 @@ impl KernelController {
    /// Filters the primes from a list of numbers by using a precalculated list of primes to check
    /// for divisibility
-    pub fn filter_primes(&self, input: Vec<u64>) -> ocl::Result<PrimeCalculationResult> {
+    pub fn filter_primes(
        &self,
        input: Vec<u64>,
        local_size: Option<usize>,
    ) -> ocl::Result<PrimeCalculationResult> {
        lazy_static::lazy_static! {static ref PRIME_CACHE: Arc<Mutex<Vec<u64>>> = Arc::new(Mutex::new(Vec::new()));}
        if PRIME_CACHE.lock().len() == 0 {
            PRIME_CACHE.lock().append(&mut get_primes(
@ -96,30 +116,41 @@ impl KernelController {
            .fill_val(0u8)
            .build()?;
-        let kernel = self
+        let mut builder = self.pro_que.kernel_builder("check_prime_cached");
-            .pro_que
+        if let Some(local_size) = local_size {
-            .kernel_builder("check_prime_cached")
+            builder.local_work_size(local_size);
        }
        let kernel = builder
            .arg(prime_buffer.len() as u32)
            .arg(&prime_buffer)
            .arg(&input_buffer)
            .arg(&output_buffer)
            .local_work_size(2)
            .global_work_size(input.len())
            .build()?;
-        let start = Instant::now();
+        let event_start = self.pro_que.queue().enqueue_marker::<EventList>(None)?;
        let start_cpu = Instant::now();
        unsafe {
            kernel.enq()?;
        }
        let event_stop = self.pro_que.queue().enqueue_marker::<EventList>(None)?;
        wait_for_event(&event_start)?;
        wait_for_event(&event_stop)?;
-        self.pro_que.finish()?;
+        let start = get_event_profiling_info(&event_start, ProfilingInfo::End)?;
-        let gpu_calc_duration = start.elapsed();
+        let stop = get_event_profiling_info(&event_stop, ProfilingInfo::Start)?;
        let gpu_calc_duration = Duration::from_nanos(stop.time()? - start.time()?);
        let mut output = vec![0u8; output_buffer.len()];
        output_buffer.read(&mut output).enq()?;
        println!(
-            "GPU IO + Calculation took {} ms",
+            "GPU Calculation: {} ms\nGPU IO + Calculation: {} ms",
-            gpu_calc_duration.as_secs_f64() * 1000f64
+            gpu_calc_duration.as_secs_f64() * 1000f64,
            start_cpu.elapsed().as_secs_f64() * 1000f64
        );
        let prime_filter_start = Instant::now();
--- a/src/main.rs
+++ b/src/main.rs
@ -55,6 +55,13 @@ struct CalculatePrimes {
    #[structopt(long = "timings-output", default_value = "timings.csv")]
    timings_file: PathBuf,
    /// The local size for the tasks.
    /// The value for numbers_per_step needs to be divisible by this number.
    /// The maximum local size depends on the gpu capabilities.
    /// If no value is provided, OpenCL chooses it automatically.
    #[structopt(long = "local-size")]
    local_size: Option<usize>,
    /// The amount of numbers that are checked per step. Even numbers are ignored so the
    /// Range actually goes to numbers_per_step * 2.
    #[structopt(long = "numbers-per-step", default_value = "33554432")]
@ -83,6 +90,20 @@ struct BenchmarkTaskCount {
    #[structopt(long = "num-tasks-start", default_value = "1")]
    num_tasks_start: usize,
    /// The initial number for the local size
    #[structopt(long = "local-size-start")]
    local_size_start: Option<usize>,
    /// The amount the local size increases by every step
    #[structopt(long = "local-size-step", default_value = "10")]
    local_size_step: usize,
    /// The maximum amount of the local size
    /// Can't be greater than the maximum local size of the gpu
    /// that can be retrieved with the info command
    #[structopt(long = "local-size-stop")]
    local_size_stop: Option<usize>,
    /// The maximum number of tasks for the benchmark
    #[structopt(long = "num-tasks-stop", default_value = "10000000")]
    num_tasks_stop: usize,
@ -154,6 +175,7 @@ fn calculate_primes(prime_opts: CalculatePrimes, controller: KernelController) -
            executor.calculate_primes(
                prime_opts.start_offset,
                prime_opts.numbers_per_step,
                prime_opts.local_size,
                prime_opts.max_number,
                prime_opts.no_cache,
                prime_opts.num_threads,
@ -205,6 +227,7 @@ fn bench_task_count(opts: BenchmarkTaskCount, controller: KernelController) -> o
    let csv_writer = CSVWriter::new(
        bench_writer,
        &[
            "local_size",
            "num_tasks",
            "calc_count",
            "write_duration",
@ -214,15 +237,37 @@ fn bench_task_count(opts: BenchmarkTaskCount, controller: KernelController) -> o
    )
    .unwrap();
    let (bench_sender, bench_handle) = create_csv_write_thread(csv_writer);
-    for n in (opts.num_tasks_start..opts.num_tasks_stop).step_by(opts.num_tasks_step) {
+    for n in (opts.num_tasks_start..=opts.num_tasks_stop).step_by(opts.num_tasks_step) {
-        let mut stats = controller.bench_int(opts.calculation_steps, n)?;
+        if let (Some(start), Some(stop)) = (opts.local_size_start, opts.local_size_stop) {
            for l in (start..=stop)
                .step_by(opts.local_size_step)
                .filter(|v| n % v == 0)
            {
                let mut stats = controller.bench_int(opts.calculation_steps, n, Some(l))?;
                for _ in 1..opts.average_of {
-            stats.avg(controller.bench_int(opts.calculation_steps, n)?)
+                    stats.avg(controller.bench_int(opts.calculation_steps, n, Some(l))?)
                }
                println!("{}\n", stats);
                bench_sender
                    .send(vec![
                        l.to_string(),
                        n.to_string(),
                        opts.calculation_steps.to_string(),
                        duration_to_ms_string(&stats.write_duration),
                        duration_to_ms_string(&stats.calc_duration),
                        duration_to_ms_string(&stats.read_duration),
                    ])
                    .unwrap();
            }
        } else {
            let mut stats = controller.bench_int(opts.calculation_steps, n, None)?;
            for _ in 1..opts.average_of {
                stats.avg(controller.bench_int(opts.calculation_steps, n, None)?)
            }
            println!("{}\n", stats);
            bench_sender
                .send(vec![
                    "n/a".to_string(),
                    n.to_string(),
                    opts.calculation_steps.to_string(),
                    duration_to_ms_string(&stats.write_duration),
@ -231,6 +276,7 @@ fn bench_task_count(opts: BenchmarkTaskCount, controller: KernelController) -> o
                ])
                .unwrap();
        }
    }
    mem::drop(bench_sender);
    bench_handle.join().unwrap();