Add semaphores for more accurate results

Signed-off-by: Trivernis <trivernis@protonmail.com>
main
Trivernis 4 years ago
parent f9ab4f66fe
commit cded1c7701
No known key found for this signature in database
GPG Key ID: EB543D89E02BC83F

8
Cargo.lock generated

@ -776,6 +776,7 @@ name = "rust-opencl-demo"
version = "0.1.0"
dependencies = [
"chrono",
"clap",
"colored",
"crossbeam-channel 0.5.0",
"fern",
@ -786,6 +787,7 @@ dependencies = [
"ocl-stream",
"parking_lot",
"rayon",
"std-semaphore",
"structopt",
]
@ -828,6 +830,12 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7acad6f34eb9e8a259d3283d1e8c1d34d7415943d4895f65cc73813c7396fc85"
[[package]]
name = "std-semaphore"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ae9eec00137a8eed469fb4148acd9fc6ac8c3f9b110f52cd34698c8b5bfa0e"
[[package]]
name = "strsim"
version = "0.8.0"

@ -18,4 +18,6 @@ log = "0.4.13"
fern = "0.6.0"
colored = "2.0.0"
chrono = "0.4.19"
indicatif = "0.15.0"
indicatif = "0.15.0"
clap = "2.33.3"
std-semaphore = "0.1.0"

File diff suppressed because it is too large Load Diff

@ -7,13 +7,20 @@
use ocl::core::{get_event_profiling_info, wait_for_event, ProfilingInfo};
use ocl::{EventList, Kernel, ProQue};
use std::time::Duration;
use std_semaphore::Semaphore;
pub mod result;
/// Runs a benchmark on the kernel
/// The ProQue needs to have profiling enabled
pub fn enqueue_profiled(pro_que: &ProQue, kernel: &Kernel) -> ocl::Result<Duration> {
pub fn enqueue_profiled(
pro_que: &ProQue,
kernel: &Kernel,
sem: &Semaphore,
) -> ocl::Result<Duration> {
log::trace!("Running kernel with profiling");
log::trace!("Acquiring lock for enqueueing");
sem.acquire();
log::trace!("Enqueueing start event");
let event_start = pro_que.queue().enqueue_marker::<EventList>(None)?;
log::trace!("Enqueueing Kernel");
@ -23,6 +30,8 @@ pub fn enqueue_profiled(pro_que: &ProQue, kernel: &Kernel) -> ocl::Result<Durati
}
log::trace!("Enqueueing stop event");
let event_stop = pro_que.queue().enqueue_marker::<EventList>(None)?;
log::trace!("Releasing enqueueing lock");
sem.release();
log::trace!("Waiting for start event");
wait_for_event(&event_start)?;

@ -18,6 +18,7 @@ use ocl_stream::utils::shared_buffer::SharedBuffer;
use crate::benching::enqueue_profiled;
use crate::kernel_controller::KernelController;
use crate::utils::progress::get_progress_bar;
use std_semaphore::Semaphore;
#[derive(Clone, Debug)]
pub struct BenchStatistics {
@ -61,6 +62,7 @@ impl KernelController {
let pb = get_progress_bar(
((global_size_stop - global_size_start) / global_size_step) as u64 * repetitions as u64,
);
let sem = Semaphore::new(1);
let stream = self.executor.execute_bounded(global_size_stop, move |ctx| {
loop {
@ -85,7 +87,7 @@ impl KernelController {
);
for _ in 0..repetitions {
let stats =
Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone())?;
Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone(), &sem)?;
ctx.sender().send(stats)?;
pb.inc(1);
}
@ -115,6 +117,7 @@ impl KernelController {
let pb = get_progress_bar(
((local_size_stop - local_size_start) / local_size_step) as u64 * repetitions as u64,
);
let sem = Semaphore::new(1);
let stream = self.executor.execute_bounded(global_size, move |ctx| {
loop {
@ -137,7 +140,7 @@ impl KernelController {
);
for _ in 0..repetitions {
let stats =
Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone())?;
Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone(), &sem)?;
ctx.sender().send(stats)?;
pb.inc(1);
}
@ -154,6 +157,7 @@ impl KernelController {
local_size: usize,
calc_count: u32,
input_buffer: SharedBuffer<u32>,
sem: &Semaphore,
) -> ocl::Result<BenchStatistics> {
let num_tasks = input_buffer.inner().lock().len();
@ -167,7 +171,7 @@ impl KernelController {
.arg(input_buffer.inner().lock().deref())
.build()?;
let calc_duration = enqueue_profiled(ctx.pro_que(), &kernel)?;
let calc_duration = enqueue_profiled(ctx.pro_que(), &kernel, sem)?;
log::trace!("Reading output");
let mut output = vec![0u32; num_tasks];

@ -16,6 +16,7 @@ use std::mem::size_of;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Instant;
use std_semaphore::Semaphore;
const MEMORY_LIMIT: u64 = 4 * 1024 * 1024 * 1024;
@ -48,6 +49,7 @@ impl KernelController {
}
let pb = get_progress_bar((stop - start) / (step * 2) as u64);
let sem = Semaphore::new(1);
self.executor.execute_bounded(step * 10, move |ctx| {
loop {
@ -66,10 +68,10 @@ impl KernelController {
let result = if use_cache {
let prime_cache = Arc::clone(&prime_cache);
log::trace!("Using optimized function with cached primes");
Self::filter_primes_cached(pro_que, numbers, local_size, prime_cache)?
Self::filter_primes_cached(pro_que, numbers, local_size, prime_cache, &sem)?
} else {
log::trace!("Using normal prime calculation function");
Self::filter_primes(pro_que, numbers, local_size)?
Self::filter_primes(pro_que, numbers, local_size, &sem)?
};
sender.send(result)?;
pb.inc(1);
@ -84,6 +86,7 @@ impl KernelController {
pro_que: &ProQue,
numbers: Vec<u64>,
local_size: usize,
sem: &Semaphore,
) -> ocl::Result<ProfiledResult<Vec<u64>>> {
log::trace!("Creating 0u8 output buffer");
let output_buffer = pro_que
@ -102,7 +105,7 @@ impl KernelController {
.arg(&output_buffer)
.global_work_size(numbers.len())
.build()?;
let duration = enqueue_profiled(pro_que, &kernel)?;
let duration = enqueue_profiled(pro_que, &kernel, &sem)?;
log::trace!("Reading output");
let mut output = vec![0u8; output_buffer.len()];
@ -119,6 +122,7 @@ impl KernelController {
numbers: Vec<u64>,
local_size: usize,
prime_cache: Arc<Mutex<Vec<u64>>>,
sem: &Semaphore,
) -> ocl::Result<ProfiledResult<Vec<u64>>> {
let prime_buffer = prime_cache.lock().to_ocl_buffer(pro_que)?;
let input_buffer = numbers.to_ocl_buffer(pro_que)?;
@ -141,7 +145,7 @@ impl KernelController {
.global_work_size(numbers.len())
.build()?;
let duration = enqueue_profiled(pro_que, &kernel)?;
let duration = enqueue_profiled(pro_que, &kernel, sem)?;
log::trace!("Reading output");
let mut output = vec![0u8; output_buffer.len()];

@ -4,152 +4,34 @@
* See LICENSE for more information
*/
use crate::kernel_controller::primes::is_prime;
use crate::kernel_controller::KernelController;
use crate::output::csv::ThreadedCSVWriter;
use crate::output::threaded::ThreadedWriter;
#[macro_use]
extern crate clap;
use crate::kernel_controller::bench::BenchStatistics;
use crate::utils::logging::init_logger;
use ocl_stream::stream::OCLStream;
use ocl_stream::utils::result::{OCLStreamError, OCLStreamResult};
use rayon::prelude::*;
use std::fs::{File, OpenOptions};
use std::io::BufWriter;
use std::path::PathBuf;
use std::time::Duration;
use ocl_stream::stream::OCLStream;
use ocl_stream::utils::result::{OCLStreamError, OCLStreamResult};
use rayon::prelude::*;
use structopt::StructOpt;
use utils::args::{BenchGlobalSize, BenchLocalSize, CalculatePrimes, Opts};
use crate::kernel_controller::bench::BenchStatistics;
use crate::kernel_controller::primes::is_prime;
use crate::kernel_controller::KernelController;
use crate::output::csv::ThreadedCSVWriter;
use crate::output::threaded::ThreadedWriter;
use crate::utils::args::UseColors;
use crate::utils::logging::init_logger;
mod benching;
mod kernel_controller;
mod output;
mod utils;
#[derive(StructOpt, Clone, Debug)]
#[structopt()]
enum Opts {
/// Calculates primes on the GPU
#[structopt(name = "calculate-primes")]
CalculatePrimes(CalculatePrimes),
/// Benchmarks the local size value
#[structopt(name = "bench-local-size")]
BenchLocalSize(BenchLocalSize),
/// Benchmarks the global size (number of tasks) value
#[structopt(name = "bench-global-size")]
BenchGlobalSize(BenchGlobalSize),
/// Prints GPU information
Info,
}
#[derive(StructOpt, Clone, Debug)]
struct CalculatePrimes {
/// The number to start with
#[structopt(long = "start", default_value = "0")]
start_offset: u64,
/// The maximum number to calculate to
#[structopt(long = "end", default_value = "9223372036854775807")]
max_number: u64,
/// The output file for the calculated prime numbers
#[structopt(short = "o", long = "output", default_value = "primes.txt")]
output_file: PathBuf,
/// The output file for timings
#[structopt(long = "timings-output", default_value = "timings.csv")]
timings_file: PathBuf,
/// The local size for the tasks.
/// The value for numbers_per_step needs to be divisible by this number.
/// The maximum local size depends on the gpu capabilities.
/// If no value is provided, OpenCL chooses it automatically.
#[structopt(long = "local-size")]
local_size: Option<usize>,
/// The amount of numbers that are checked per step. Even numbers are ignored so the
/// Range actually goes to numbers_per_step * 2.
#[structopt(long = "numbers-per-step", default_value = "33554432")]
numbers_per_step: usize,
/// If the prime numbers should be used for the divisibility check instead of using
/// an optimized auto-increment loop.
#[structopt(long = "no-cache")]
no_cache: bool,
/// If the calculated prime numbers should be validated on the cpu by a simple prime algorithm
#[structopt(long = "cpu-validate")]
cpu_validate: bool,
/// number of used threads
#[structopt(short = "p", long = "parallel", default_value = "2")]
num_threads: usize,
}
#[derive(StructOpt, Clone, Debug)]
struct BenchLocalSize {
#[structopt(flatten)]
bench_options: BenchOptions,
/// The initial number for the local size
#[structopt(long = "local-size-start", default_value = "4")]
local_size_start: usize,
/// The amount the local size increases by every step
#[structopt(long = "local-size-step", default_value = "4")]
local_size_step: usize,
/// The maximum amount of the local size
/// Can't be greater than the maximum local size of the gpu
/// that can be retrieved with the info command
#[structopt(long = "local-size-stop", default_value = "1024")]
local_size_stop: usize,
/// The maximum number of tasks for the benchmark
#[structopt(long = "global-size", default_value = "6144")]
global_size: usize,
}
#[derive(StructOpt, Clone, Debug)]
pub struct BenchGlobalSize {
#[structopt(flatten)]
options: BenchOptions,
/// The start value for the used global size
#[structopt(long = "global-size-start", default_value = "1024")]
global_size_start: usize,
/// The step value for the used global size
#[structopt(long = "global-size-step", default_value = "128")]
global_size_step: usize,
/// The stop value for the used global size
#[structopt(long = "global-size-stop", default_value = "1048576")]
global_size_stop: usize,
/// The maximum number of tasks for the benchmark
#[structopt(long = "local-size", default_value = "128")]
local_size: usize,
}
#[derive(StructOpt, Clone, Debug)]
pub struct BenchOptions {
/// How many calculations steps should be done per GPU thread
#[structopt(short = "n", long = "calculation-steps", default_value = "1000000")]
calculation_steps: u32,
/// The output file for timings
#[structopt(short = "o", long = "bench-output", default_value = "bench.csv")]
benchmark_file: PathBuf,
/// The average of n runs that is used instead of using one value only.
/// By default the benchmark for each step is only run once
#[structopt(short = "r", long = "repetitions", default_value = "1")]
repetitions: usize,
}
fn main() -> OCLStreamResult<()> {
let opts: Opts = Opts::from_args();
let controller = KernelController::new()?;
@ -168,7 +50,8 @@ fn calculate_primes(
prime_opts: CalculatePrimes,
mut controller: KernelController,
) -> OCLStreamResult<()> {
controller.set_concurrency(prime_opts.num_threads);
set_output_colored(prime_opts.general_options.color);
controller.set_concurrency(prime_opts.general_options.threads);
let csv_file = open_write_buffered(&prime_opts.timings_file);
let mut csv_writer = ThreadedCSVWriter::new(csv_file, &["first", "count", "gpu_duration"]);
@ -214,7 +97,9 @@ fn calculate_primes(
}
/// Benchmarks the local size used for calculations
fn bench_local_size(opts: BenchLocalSize, controller: KernelController) -> OCLStreamResult<()> {
fn bench_local_size(opts: BenchLocalSize, mut controller: KernelController) -> OCLStreamResult<()> {
set_output_colored(opts.bench_options.general_options.color);
controller.set_concurrency(opts.bench_options.general_options.threads);
let bench_writer = open_write_buffered(&opts.bench_options.benchmark_file);
let csv_writer = ThreadedCSVWriter::new(
bench_writer,
@ -241,8 +126,13 @@ fn bench_local_size(opts: BenchLocalSize, controller: KernelController) -> OCLSt
}
/// Benchmarks the global size used for calculations
fn bench_global_size(opts: BenchGlobalSize, controller: KernelController) -> OCLStreamResult<()> {
let bench_writer = open_write_buffered(&opts.options.benchmark_file);
fn bench_global_size(
opts: BenchGlobalSize,
mut controller: KernelController,
) -> OCLStreamResult<()> {
set_output_colored(opts.bench_options.general_options.color);
controller.set_concurrency(opts.bench_options.general_options.threads);
let bench_writer = open_write_buffered(&opts.bench_options.benchmark_file);
let csv_writer = ThreadedCSVWriter::new(
bench_writer,
&[
@ -259,10 +149,10 @@ fn bench_global_size(opts: BenchGlobalSize, controller: KernelController) -> OCL
opts.global_size_start,
opts.global_size_step,
opts.global_size_stop,
opts.options.calculation_steps,
opts.options.repetitions,
opts.bench_options.calculation_steps,
opts.bench_options.repetitions,
)?;
read_bench_results(opts.options.calculation_steps, csv_writer, stream);
read_bench_results(opts.bench_options.calculation_steps, csv_writer, stream);
Ok(())
}
@ -328,3 +218,11 @@ fn open_write_buffered(path: &PathBuf) -> BufWriter<File> {
.expect("Failed to open file!"),
)
}
/// Applies the user's color preference to the `colored` crate's global state.
///
/// `On` / `Off` force colored output on or off via `colored::control::set_override`;
/// `Auto` applies no override, leaving the library's own detection in place.
fn set_output_colored(colored: UseColors) {
    match colored {
        UseColors::On => colored::control::set_override(true),
        UseColors::Off => colored::control::set_override(false),
        // Exhaustive match (no `_` arm): adding a new UseColors variant
        // forces a conscious decision here instead of silently doing nothing.
        UseColors::Auto => {}
    }
}

@ -0,0 +1,155 @@
/*
* opencl demos with rust
* Copyright (C) 2021 trivernis
* See LICENSE for more information
*/
use std::path::PathBuf;
use structopt::StructOpt;
// Top-level CLI: one subcommand per mode of the demo binary.
// (Field/variant `///` docs double as structopt help text, so new
// documentation here is in `//` comments to avoid changing --help output.)
#[derive(StructOpt, Clone, Debug)]
#[structopt()]
pub enum Opts {
    /// Calculates primes on the GPU
    #[structopt(name = "calculate-primes")]
    CalculatePrimes(CalculatePrimes),

    /// Benchmarks the local size value
    #[structopt(name = "bench-local-size")]
    BenchLocalSize(BenchLocalSize),

    /// Benchmarks the global size (number of tasks) value
    #[structopt(name = "bench-global-size")]
    BenchGlobalSize(BenchGlobalSize),

    /// Prints GPU information
    // No payload: handled directly in main without extra options.
    Info,
}
// Options for the `calculate-primes` subcommand.
// (`///` field docs are the runtime --help text; kept byte-identical.
// New notes are `//` comments.)
#[derive(StructOpt, Clone, Debug)]
pub struct CalculatePrimes {
    // Shared options (color, thread count), flattened into this command's flags.
    #[structopt(flatten)]
    pub general_options: GeneralOptions,

    /// The number to start with
    #[structopt(long = "start", default_value = "0")]
    pub start_offset: u64,

    /// The maximum number to calculate to
    // Default is i64::MAX, i.e. effectively unbounded.
    #[structopt(long = "end", default_value = "9223372036854775807")]
    pub max_number: u64,

    /// The output file for the calculated prime numbers
    #[structopt(short = "o", long = "output", default_value = "primes.txt")]
    pub output_file: PathBuf,

    /// The output file for timings
    #[structopt(long = "timings-output", default_value = "timings.csv")]
    pub timings_file: PathBuf,

    /// The local size for the tasks.
    /// The value for numbers_per_step needs to be divisible by this number.
    /// The maximum local size depends on the gpu capabilities.
    /// If no value is provided, OpenCL chooses it automatically.
    pub local_size: Option<usize>,

    /// The amount of numbers that are checked per step. Even numbers are ignored so the
    /// Range actually goes to numbers_per_step * 2.
    // Default is 2^25.
    #[structopt(long = "numbers-per-step", default_value = "33554432")]
    pub numbers_per_step: usize,

    /// If the prime numbers should be used for the divisibility check instead of using
    /// an optimized auto-increment loop.
    #[structopt(long = "no-cache")]
    pub no_cache: bool,

    /// If the calculated prime numbers should be validated on the cpu by a simple prime algorithm
    #[structopt(long = "cpu-validate")]
    pub cpu_validate: bool,
}
// Options for the `bench-local-size` subcommand: sweeps the OpenCL local
// (work-group) size from start to stop in fixed steps at a fixed global size.
// (`///` field docs are the runtime --help text; kept byte-identical.)
#[derive(StructOpt, Clone, Debug)]
pub struct BenchLocalSize {
    // Common benchmark options (output file, repetitions, general options).
    #[structopt(flatten)]
    pub bench_options: BenchOptions,

    /// The initial number for the local size
    #[structopt(long = "local-size-start", default_value = "4")]
    pub local_size_start: usize,

    /// The amount the local size increases by every step
    #[structopt(long = "local-size-step", default_value = "4")]
    pub local_size_step: usize,

    /// The maximum amount of the local size
    /// Can't be greater than the maximum local size of the gpu
    /// that can be retrieved with the info command
    #[structopt(long = "local-size-stop", default_value = "1024")]
    pub local_size_stop: usize,

    /// The maximum number of tasks for the benchmark
    // Held constant while the local size is swept.
    #[structopt(long = "global-size", default_value = "6144")]
    pub global_size: usize,
}
// Options for the `bench-global-size` subcommand: sweeps the global size
// (number of tasks) from start to stop in fixed steps at a fixed local size.
// (`///` field docs are the runtime --help text; kept byte-identical.)
#[derive(StructOpt, Clone, Debug)]
pub struct BenchGlobalSize {
    // Common benchmark options (output file, repetitions, general options).
    #[structopt(flatten)]
    pub bench_options: BenchOptions,

    /// The start value for the used global size
    #[structopt(long = "global-size-start", default_value = "1024")]
    pub global_size_start: usize,

    /// The step value for the used global size
    #[structopt(long = "global-size-step", default_value = "128")]
    pub global_size_step: usize,

    /// The stop value for the used global size
    #[structopt(long = "global-size-stop", default_value = "1048576")]
    pub global_size_stop: usize,

    /// The maximum number of tasks for the benchmark
    // NOTE(review): this help text looks copy-pasted from `global_size` in
    // BenchLocalSize — `--local-size` is the work-group size held constant
    // during the sweep, not a task count. Consider rewording the help string.
    #[structopt(long = "local-size", default_value = "128")]
    pub local_size: usize,
}
// Options shared by both benchmark subcommands, flattened into their flag sets.
// (`///` field docs are the runtime --help text; kept byte-identical.)
#[derive(StructOpt, Clone, Debug)]
pub struct BenchOptions {
    // Shared options (color, thread count), flattened a second level deep.
    #[structopt(flatten)]
    pub general_options: GeneralOptions,

    /// How many calculations steps should be done per GPU thread
    #[structopt(short = "n", long = "calculation-steps", default_value = "1000000")]
    pub calculation_steps: u32,

    /// The output file for timings
    #[structopt(short = "o", long = "bench-output", default_value = "bench.csv")]
    pub benchmark_file: PathBuf,

    /// The average of n runs that is used instead of using one value only.
    /// By default the benchmark for each step is only run once
    #[structopt(short = "r", long = "repetitions", default_value = "1")]
    pub repetitions: usize,
}
// Options common to every subcommand, attached via `#[structopt(flatten)]`.
// (`///` field docs are the runtime --help text; kept byte-identical.)
#[derive(StructOpt, Clone, Debug)]
pub struct GeneralOptions {
    /// If the output should be colored
    // Parsed via arg_enum!-generated FromStr; valid values are off/on/auto
    // (case-insensitive), defaulting to auto.
    #[structopt(long = "color", possible_values = &UseColors::variants(), case_insensitive = true, default_value = "auto")]
    pub color: UseColors,

    /// number of used threads
    // Used as the executor's concurrency via controller.set_concurrency.
    #[structopt(short = "p", long = "threads", default_value = "2")]
    pub threads: usize,
}
// Color preference for terminal output. Wrapped in clap's arg_enum! so the
// enum gains FromStr and variants() and can be used directly as a CLI value.
// (Line comments only: `///` docs inside arg_enum! would alter help output.)
arg_enum! {
    #[derive(Clone, Debug)]
    pub enum UseColors {
        // Force colors off (set_output_colored applies a false override).
        Off,
        // Force colors on (set_output_colored applies a true override).
        On,
        // No override applied; the colored crate's own detection decides.
        Auto,
    }
}

@ -3,5 +3,6 @@
* Copyright (C) 2021 trivernis
* See LICENSE for more information
*/
pub mod args;
pub mod logging;
pub mod progress;

Loading…
Cancel
Save