Separate local and global size bench and use streamed executor

Signed-off-by: Trivernis <trivernis@protonmail.com>
4 years ago · 5659ee2923
parent 32bb3f32f6
commit 5659ee2923
5 changed files with 222 additions and 112 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -517,13 +517,14 @@ dependencies = [
 [[package]]
 name = "ocl-stream"
-version = "0.3.0"
+version = "0.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6571c0dc580e1603bdf23e277402b8dea73c0631de6a123b796da9a27681c960"
+checksum = "2cc003c0e91a8daaa706bd4231a05080d18346c97dc051955cce45de60a54ac7"
 dependencies = [
 "crossbeam-channel 0.5.0",
 "num_cpus",
 "ocl",
 "parking_lot",
 "thiserror",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@ -12,5 +12,5 @@ structopt = "0.3.20"
 lazy_static = "1.4.0"
 parking_lot = "0.11.1"
 rayon = "1.5.0"
-ocl-stream = "0.3.0"
+ocl-stream = "0.3.4"
 crossbeam-channel = "0.5.0"
--- a/src/kernel_controller/bench.rs
+++ b/src/kernel_controller/bench.rs
@ -4,14 +4,22 @@
 * See LICENSE for more information
 */
 use crate::benching::enqueue_profiled;
 use crate::kernel_controller::KernelController;
 use ocl_stream::executor::context::ExecutorContext;
 use ocl_stream::executor::stream::OCLStream;
 use ocl_stream::traits::*;
 use ocl_stream::utils::result::OCLStreamResult;
 use ocl_stream::utils::shared_buffer::SharedBuffer;
 use std::fmt::{self, Display, Formatter};
 use std::ops::Deref;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::time::{Duration, Instant};
 pub struct BenchStatistics {
    pub calc_count: u32,
-    pub num_tasks: usize,
+    pub global_size: usize,
-    pub local_size: Option<usize>,
+    pub local_size: usize,
    pub write_duration: Duration,
    pub calc_duration: Duration,
    pub read_duration: Duration,
@ -23,8 +31,8 @@ impl Display for BenchStatistics {
            f,
            "Calculation Count: {}\nTask Count: {}\nLocal Size: {}\nWrite Duration: {} ms\nGPU Duration: {} ms\nRead Duration: {} ms",
            self.calc_count,
-            self.num_tasks,
+            self.global_size,
-            self.local_size.map(|v|v.to_string()).unwrap_or("n/a".to_string()),
+            self.local_size,
            self.write_duration.as_secs_f64() * 1000f64,
            self.calc_duration.as_secs_f64() * 1000f64,
            self.read_duration.as_secs_f64() * 1000f64
@ -32,55 +40,108 @@ impl Display for BenchStatistics {
    }
 }
-impl BenchStatistics {
+impl KernelController {
-    pub fn avg(&mut self, other: Self) {
+    /// Benchmarks the value for the global size
-        self.read_duration = (self.read_duration + other.read_duration) / 2;
+    pub fn bench_global_size(
-        self.write_duration = (self.write_duration + other.write_duration) / 2;
+        &self,
-        self.calc_duration = (self.calc_duration + other.calc_duration) / 2;
+        local_size: usize,
        global_size_start: usize,
        global_size_step: usize,
        global_size_stop: usize,
        calc_count: u32,
        repetitions: usize,
    ) -> OCLStreamResult<OCLStream<BenchStatistics>> {
        let global_size = AtomicUsize::new(global_size_start);
        let stream = self.executor.execute_bounded(global_size_stop, move |ctx| {
            loop {
                if global_size.load(Ordering::SeqCst) > global_size_stop {
                    break;
                }
                let global_size = global_size.fetch_add(global_size_step, Ordering::SeqCst);
                if global_size % local_size != 0 {
                    continue;
                }
                let input_buffer: SharedBuffer<u32> =
                    vec![0u32; global_size].to_shared_buffer(ctx.pro_que())?;
                for _ in 0..repetitions {
                    let stats =
                        Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone())?;
                    ctx.sender().send(stats)?;
                }
            }
            Ok(())
        });
        Ok(stream)
    }
 }
-impl KernelController {
+    /// Benchmarks the value for the local size
-    /// Benches an integer
+    pub fn bench_local_size(
    pub fn bench_int(
        &self,
        global_size: usize,
        local_size_start: usize,
        local_size_step: usize,
        local_size_stop: usize,
        calc_count: u32,
-        num_tasks: usize,
+        repetitions: usize,
-        local_size: Option<usize>,
+    ) -> OCLStreamResult<OCLStream<BenchStatistics>> {
        let input_buffer: SharedBuffer<u32> =
            vec![0u32; global_size].to_shared_buffer(self.executor.pro_que())?;
        let local_size = AtomicUsize::new(local_size_start);
        let stream = self.executor.execute_bounded(global_size, move |ctx| {
            loop {
                if local_size.load(Ordering::SeqCst) > local_size_stop {
                    break;
                }
                let local_size = local_size.fetch_add(local_size_step, Ordering::SeqCst);
                if local_size > 1024 || global_size % local_size != 0 {
                    continue;
                }
                for _ in 0..repetitions {
                    let stats =
                        Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone())?;
                    ctx.sender().send(stats)?;
                }
            }
            Ok(())
        });
        Ok(stream)
    }
    /// Benches an integer
    fn bench_int(
        ctx: &ExecutorContext<BenchStatistics>,
        local_size: usize,
        calc_count: u32,
        input_buffer: SharedBuffer<u32>,
    ) -> ocl::Result<BenchStatistics> {
        let num_tasks = input_buffer.inner().lock().len();
        let write_start = Instant::now();
        let input_buffer = self
            .pro_que
            .buffer_builder()
            .len(num_tasks)
            .fill_val(0u32)
            .build()?;
        let write_duration = write_start.elapsed();
-        let mut builder = self.pro_que.kernel_builder("bench_int");
+        let kernel = ctx
-
+            .pro_que()
-        if let Some(local_size) = local_size {
+            .kernel_builder("bench_int")
-            builder.local_work_size(local_size);
+            .local_work_size(local_size)
        }
        let kernel = builder
            .arg(calc_count)
            .arg(&input_buffer)
            .global_work_size(num_tasks)
            .arg(calc_count)
            .arg(input_buffer.inner().lock().deref())
            .build()?;
-        let calc_start = Instant::now();
+
-        unsafe {
+        let calc_duration = enqueue_profiled(ctx.pro_que(), &kernel)?;
-            kernel.enq()?;
+
        }
        self.pro_que.finish()?;
        let calc_duration = calc_start.elapsed();
        let mut output = vec![0u32; num_tasks];
        let read_start = Instant::now();
-        input_buffer.read(&mut output).enq()?;
+        input_buffer.read(&mut output)?;
        let read_duration = read_start.elapsed();
        Ok(BenchStatistics {
-            num_tasks,
+            global_size: num_tasks,
            calc_count,
            local_size,
            read_duration,
--- a/src/kernel_controller/primes_streamed.rs
+++ b/src/kernel_controller/primes_streamed.rs
@ -68,6 +68,7 @@ impl KernelController {
            .fill_val(0u8)
            .build()?;
        let input_buffer = numbers.to_ocl_buffer(pro_que)?;
        let kernel = pro_que
            .kernel_builder("check_prime")
            .local_work_size(local_size)
--- a/src/main.rs
+++ b/src/main.rs
@ -11,7 +11,9 @@ use crate::output::create_prime_write_thread;
 use crate::output::csv::ThreadedCSVWriter;
 use crate::output::threaded::ThreadedWriter;
-use ocl_stream::utils::result::OCLStreamResult;
+use crate::kernel_controller::bench::BenchStatistics;
 use ocl_stream::stream::OCLStream;
 use ocl_stream::utils::result::{OCLStreamError, OCLStreamResult};
 use rayon::prelude::*;
 use std::fs::{File, OpenOptions};
 use std::io::BufWriter;
@ -33,9 +35,13 @@ enum Opts {
    #[structopt(name = "calculate-primes")]
    CalculatePrimes(CalculatePrimes),
-    /// Benchmarks the number of tasks used for the calculations
+    /// Benchmarks the local size value
-    #[structopt(name = "bench-task-count")]
+    #[structopt(name = "bench-local-size")]
-    BenchmarkTaskCount(BenchmarkTaskCount),
+    BenchLocalSize(BenchLocalSize),
    /// Benchmarks the global size (number of tasks) value
    #[structopt(name = "bench-global-size")]
    BenchGlobalSize(BenchGlobalSize),
    /// Prints GPU information
    Info,
@ -90,62 +96,82 @@ struct CalculatePrimes {
 }
 #[derive(StructOpt, Clone, Debug)]
-struct BenchmarkTaskCount {
+struct BenchLocalSize {
-    /// How many calculations steps should be done per GPU thread
+    #[structopt(flatten)]
-    #[structopt(long = "calculation-steps", default_value = "1000000")]
+    bench_options: BenchOptions,
    calculation_steps: u32,
    /// The initial number of tasks for the benchmark
    #[structopt(long = "num-tasks-start", default_value = "1")]
    num_tasks_start: usize,
    /// The initial number for the local size
-    #[structopt(long = "local-size-start")]
+    #[structopt(long = "local-size-start", default_value = "4")]
-    local_size_start: Option<usize>,
+    local_size_start: usize,
    /// The amount the local size increases by every step
-    #[structopt(long = "local-size-step", default_value = "10")]
+    #[structopt(long = "local-size-step", default_value = "4")]
    local_size_step: usize,
    /// The maximum amount of the local size
    /// Can't be greater than the maximum local size of the gpu
    /// that can be retrieved with the info command
-    #[structopt(long = "local-size-stop")]
+    #[structopt(long = "local-size-stop", default_value = "1024")]
-    local_size_stop: Option<usize>,
+    local_size_stop: usize,
    /// The maximum number of tasks for the benchmark
-    #[structopt(long = "num-tasks-stop", default_value = "10000000")]
+    #[structopt(long = "global-size", default_value = "6144")]
-    num_tasks_stop: usize,
+    global_size: usize,
 }
 #[derive(StructOpt, Clone, Debug)]
 pub struct BenchGlobalSize {
    #[structopt(flatten)]
    options: BenchOptions,
-    /// The amount the task number increases per step
+    /// The start value for the used global size
-    #[structopt(long = "num-tasks-step", default_value = "10")]
+    #[structopt(long = "global-size-start", default_value = "1024")]
-    num_tasks_step: usize,
+    global_size_start: usize,
-    /// The average of n runs that is used instead of using one value only.
+    /// The step value for the used global size
-    /// By default the benchmark for each step is only run once
+    #[structopt(long = "global-size-step", default_value = "128")]
-    #[structopt(long = "average-of", default_value = "1")]
+    global_size_step: usize,
-    average_of: usize,
+
    /// The stop value for the used global size
    #[structopt(long = "global-size-stop", default_value = "1048576")]
    global_size_stop: usize,
    /// The local size used for the benchmark
    #[structopt(long = "local-size", default_value = "128")]
    local_size: usize,
 }
 #[derive(StructOpt, Clone, Debug)]
 pub struct BenchOptions {
    /// How many calculations steps should be done per GPU thread
    #[structopt(short = "n", long = "calculation-steps", default_value = "1000000")]
    calculation_steps: u32,
    /// The output file for timings
-    #[structopt(long = "bench-output", default_value = "bench.csv")]
+    #[structopt(short = "o", long = "bench-output", default_value = "bench.csv")]
    benchmark_file: PathBuf,
    /// The number of times the benchmark is repeated for each step.
    /// Every run is reported separately; by default each step is only run once
    #[structopt(short = "r", long = "repetitions", default_value = "1")]
    repetitions: usize,
 }
-fn main() -> ocl::Result<()> {
+fn main() -> OCLStreamResult<()> {
    let opts: Opts = Opts::from_args();
    let controller = KernelController::new()?;
    match opts {
-        Opts::Info => controller.print_info(),
+        Opts::Info => controller.print_info().map_err(OCLStreamError::from),
        Opts::CalculatePrimes(prime_opts) => {
            if prime_opts.streamed {
-                calculate_primes_streamed(prime_opts, controller).unwrap();
+                calculate_primes_streamed(prime_opts, controller)
                Ok(())
            } else {
-                calculate_primes(prime_opts, controller)
+                calculate_primes(prime_opts, controller).map_err(OCLStreamError::from)
            }
        }
-        Opts::BenchmarkTaskCount(bench_opts) => bench_task_count(bench_opts, controller),
+        Opts::BenchGlobalSize(bench_opts) => bench_global_size(bench_opts, controller),
        Opts::BenchLocalSize(bench_opts) => bench_local_size(bench_opts, controller),
    }
 }
@ -275,65 +301,86 @@ fn calculate_primes(prime_opts: CalculatePrimes, controller: KernelController) -
    Ok(())
 }
-fn bench_task_count(opts: BenchmarkTaskCount, controller: KernelController) -> ocl::Result<()> {
+/// Benchmarks the local size used for calculations
-    let bench_writer = BufWriter::new(
+fn bench_local_size(opts: BenchLocalSize, controller: KernelController) -> OCLStreamResult<()> {
-        OpenOptions::new()
+    let bench_writer = open_write_buffered(&opts.bench_options.benchmark_file);
-            .truncate(true)
+    let csv_writer = ThreadedCSVWriter::new(
-            .write(true)
+        bench_writer,
-            .create(true)
+        &[
-            .open(opts.benchmark_file)
+            "local_size",
-            .unwrap(),
+            "global_size",
            "calc_count",
            "write_duration",
            "gpu_duration",
            "read_duration",
        ],
    );
-    let mut csv_writer = ThreadedCSVWriter::new(
+    let stream = controller.bench_local_size(
        opts.global_size,
        opts.local_size_start,
        opts.local_size_step,
        opts.local_size_stop,
        opts.bench_options.calculation_steps,
        opts.bench_options.repetitions,
    )?;
    read_bench_results(opts.bench_options.calculation_steps, csv_writer, stream);
    Ok(())
 }
 /// Benchmarks the global size used for calculations
 fn bench_global_size(opts: BenchGlobalSize, controller: KernelController) -> OCLStreamResult<()> {
    let bench_writer = open_write_buffered(&opts.options.benchmark_file);
    let csv_writer = ThreadedCSVWriter::new(
        bench_writer,
        &[
            "local_size",
-            "num_tasks",
+            "global_size",
            "calc_count",
            "write_duration",
            "gpu_duration",
            "read_duration",
        ],
    );
-    for n in (opts.num_tasks_start..=opts.num_tasks_stop).step_by(opts.num_tasks_step) {
+    let stream = controller.bench_global_size(
-        if let (Some(start), Some(stop)) = (opts.local_size_start, opts.local_size_stop) {
+        opts.local_size,
-            for l in (start..=stop)
+        opts.global_size_start,
-                .step_by(opts.local_size_step)
+        opts.global_size_step,
-                .filter(|v| n % v == 0)
+        opts.global_size_stop,
-            {
+        opts.options.calculation_steps,
-                let mut stats = controller.bench_int(opts.calculation_steps, n, Some(l))?;
+        opts.options.repetitions,
-                for _ in 1..opts.average_of {
+    )?;
-                    stats.avg(controller.bench_int(opts.calculation_steps, n, Some(l))?)
+    read_bench_results(opts.options.calculation_steps, csv_writer, stream);
-                }
+
    Ok(())
 }
 /// Reads benchmark results from the stream, prints them
 /// to the console and adds them as rows to the CSV writer
 fn read_bench_results(
    calculation_steps: u32,
    mut csv_writer: ThreadedCSVWriter,
    mut stream: OCLStream<BenchStatistics>,
 ) {
    loop {
        match stream.next() {
            Ok(stats) => {
                println!("{}\n", stats);
                csv_writer.add_row(vec![
-                    l.to_string(),
+                    stats.local_size.to_string(),
-                    n.to_string(),
+                    stats.global_size.to_string(),
-                    opts.calculation_steps.to_string(),
+                    calculation_steps.to_string(),
                    duration_to_ms_string(&stats.write_duration),
                    duration_to_ms_string(&stats.calc_duration),
                    duration_to_ms_string(&stats.read_duration),
                ])
            }
-        } else {
+            _ => {
-            let mut stats = controller.bench_int(opts.calculation_steps, n, None)?;
+                break;
            for _ in 1..opts.average_of {
                stats.avg(controller.bench_int(opts.calculation_steps, n, None)?)
            }
            println!("{}\n", stats);
            csv_writer.add_row(vec![
                "n/a".to_string(),
                n.to_string(),
                opts.calculation_steps.to_string(),
                duration_to_ms_string(&stats.write_duration),
                duration_to_ms_string(&stats.calc_duration),
                duration_to_ms_string(&stats.read_duration),
            ]);
        }
    }
    csv_writer.close();
    Ok(())
 }
 fn validate_primes_on_cpu(primes: &Vec<u64>) {