Add semaphores for more accurate results

Signed-off-by: Trivernis <trivernis@protonmail.com>
main
Trivernis 4 years ago
parent f9ab4f66fe
commit cded1c7701
No known key found for this signature in database
GPG Key ID: EB543D89E02BC83F

8
Cargo.lock generated

@ -776,6 +776,7 @@ name = "rust-opencl-demo"
version = "0.1.0"
dependencies = [
"chrono",
"clap",
"colored",
"crossbeam-channel 0.5.0",
"fern",
@ -786,6 +787,7 @@ dependencies = [
"ocl-stream",
"parking_lot",
"rayon",
"std-semaphore",
"structopt",
]
@ -828,6 +830,12 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7acad6f34eb9e8a259d3283d1e8c1d34d7415943d4895f65cc73813c7396fc85"
[[package]]
name = "std-semaphore"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ae9eec00137a8eed469fb4148acd9fc6ac8c3f9b110f52cd34698c8b5bfa0e"
[[package]]
name = "strsim"
version = "0.8.0"

@ -18,4 +18,6 @@ log = "0.4.13"
fern = "0.6.0"
colored = "2.0.0"
chrono = "0.4.19"
indicatif = "0.15.0"
indicatif = "0.15.0"
clap = "2.33.3"
std-semaphore = "0.1.0"

File diff suppressed because it is too large Load Diff

@ -7,13 +7,20 @@
use ocl::core::{get_event_profiling_info, wait_for_event, ProfilingInfo};
use ocl::{EventList, Kernel, ProQue};
use std::time::Duration;
use std_semaphore::Semaphore;
pub mod result;
/// Runs a benchmark on the kernel
/// The ProQue needs to have profiling enabled
pub fn enqueue_profiled(pro_que: &ProQue, kernel: &Kernel) -> ocl::Result<Duration> {
pub fn enqueue_profiled(
pro_que: &ProQue,
kernel: &Kernel,
sem: &Semaphore,
) -> ocl::Result<Duration> {
log::trace!("Running kernel with profiling");
log::trace!("Acquiring lock for enqueueing");
sem.acquire();
log::trace!("Enqueueing start event");
let event_start = pro_que.queue().enqueue_marker::<EventList>(None)?;
log::trace!("Enqueueing Kernel");
@ -23,6 +30,8 @@ pub fn enqueue_profiled(pro_que: &ProQue, kernel: &Kernel) -> ocl::Result<Durati
}
log::trace!("Enqueueing stop event");
let event_stop = pro_que.queue().enqueue_marker::<EventList>(None)?;
log::trace!("Releasing enqueueing lock");
sem.release();
log::trace!("Waiting for start event");
wait_for_event(&event_start)?;

@ -18,6 +18,7 @@ use ocl_stream::utils::shared_buffer::SharedBuffer;
use crate::benching::enqueue_profiled;
use crate::kernel_controller::KernelController;
use crate::utils::progress::get_progress_bar;
use std_semaphore::Semaphore;
#[derive(Clone, Debug)]
pub struct BenchStatistics {
@ -61,6 +62,7 @@ impl KernelController {
let pb = get_progress_bar(
((global_size_stop - global_size_start) / global_size_step) as u64 * repetitions as u64,
);
let sem = Semaphore::new(1);
let stream = self.executor.execute_bounded(global_size_stop, move |ctx| {
loop {
@ -85,7 +87,7 @@ impl KernelController {
);
for _ in 0..repetitions {
let stats =
Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone())?;
Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone(), &sem)?;
ctx.sender().send(stats)?;
pb.inc(1);
}
@ -115,6 +117,7 @@ impl KernelController {
let pb = get_progress_bar(
((local_size_stop - local_size_start) / local_size_step) as u64 * repetitions as u64,
);
let sem = Semaphore::new(1);
let stream = self.executor.execute_bounded(global_size, move |ctx| {
loop {
@ -137,7 +140,7 @@ impl KernelController {
);
for _ in 0..repetitions {
let stats =
Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone())?;
Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone(), &sem)?;
ctx.sender().send(stats)?;
pb.inc(1);
}
@ -154,6 +157,7 @@ impl KernelController {
local_size: usize,
calc_count: u32,
input_buffer: SharedBuffer<u32>,
sem: &Semaphore,
) -> ocl::Result<BenchStatistics> {
let num_tasks = input_buffer.inner().lock().len();
@ -167,7 +171,7 @@ impl KernelController {
.arg(input_buffer.inner().lock().deref())
.build()?;
let calc_duration = enqueue_profiled(ctx.pro_que(), &kernel)?;
let calc_duration = enqueue_profiled(ctx.pro_que(), &kernel, sem)?;
log::trace!("Reading output");
let mut output = vec![0u32; num_tasks];

@ -16,6 +16,7 @@ use std::mem::size_of;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Instant;
use std_semaphore::Semaphore;
const MEMORY_LIMIT: u64 = 4 * 1024 * 1024 * 1024;
@ -48,6 +49,7 @@ impl KernelController {
}
let pb = get_progress_bar((stop - start) / (step * 2) as u64);
let sem = Semaphore::new(1);
self.executor.execute_bounded(step * 10, move |ctx| {
loop {
@ -66,10 +68,10 @@ impl KernelController {
let result = if use_cache {
let prime_cache = Arc::clone(&prime_cache);
log::trace!("Using optimized function with cached primes");
Self::filter_primes_cached(pro_que, numbers, local_size, prime_cache)?
Self::filter_primes_cached(pro_que, numbers, local_size, prime_cache, &sem)?
} else {
log::trace!("Using normal prime calculation function");
Self::filter_primes(pro_que, numbers, local_size)?
Self::filter_primes(pro_que, numbers, local_size, &sem)?
};
sender.send(result)?;
pb.inc(1);
@ -84,6 +86,7 @@ impl KernelController {
pro_que: &ProQue,
numbers: Vec<u64>,
local_size: usize,
sem: &Semaphore,
) -> ocl::Result<ProfiledResult<Vec<u64>>> {
log::trace!("Creating 0u8 output buffer");
let output_buffer = pro_que
@ -102,7 +105,7 @@ impl KernelController {
.arg(&output_buffer)
.global_work_size(numbers.len())
.build()?;
let duration = enqueue_profiled(pro_que, &kernel)?;
let duration = enqueue_profiled(pro_que, &kernel, &sem)?;
log::trace!("Reading output");
let mut output = vec![0u8; output_buffer.len()];
@ -119,6 +122,7 @@ impl KernelController {
numbers: Vec<u64>,
local_size: usize,
prime_cache: Arc<Mutex<Vec<u64>>>,
sem: &Semaphore,
) -> ocl::Result<ProfiledResult<Vec<u64>>> {
let prime_buffer = prime_cache.lock().to_ocl_buffer(pro_que)?;
let input_buffer = numbers.to_ocl_buffer(pro_que)?;
@ -141,7 +145,7 @@ impl KernelController {
.global_work_size(numbers.len())
.build()?;
let duration = enqueue_profiled(pro_que, &kernel)?;
let duration = enqueue_profiled(pro_que, &kernel, sem)?;
log::trace!("Reading output");
let mut output = vec![0u8; output_buffer.len()];

@ -4,152 +4,34 @@
* See LICENSE for more information
*/
use crate::kernel_controller::primes::is_prime;
use crate::kernel_controller::KernelController;
use crate::output::csv::ThreadedCSVWriter;
use crate::output::threaded::ThreadedWriter;
#[macro_use]
extern crate clap;
use crate::kernel_controller::bench::BenchStatistics;
use crate::utils::logging::init_logger;
use ocl_stream::stream::OCLStream;
use ocl_stream::utils::result::{OCLStreamError, OCLStreamResult};
use rayon::prelude::*;
use std::fs::{File, OpenOptions};
use std::io::BufWriter;
use std::path::PathBuf;
use std::time::Duration;
use ocl_stream::stream::OCLStream;
use ocl_stream::utils::result::{OCLStreamError, OCLStreamResult};
use rayon::prelude::*;
use structopt::StructOpt;
use utils::args::{BenchGlobalSize, BenchLocalSize, CalculatePrimes, Opts};
use crate::kernel_controller::bench::BenchStatistics;
use crate::kernel_controller::primes::is_prime;
use crate::kernel_controller::KernelController;
use crate::output::csv::ThreadedCSVWriter;
use crate::output::threaded::ThreadedWriter;
use crate::utils::args::UseColors;
use crate::utils::logging::init_logger;
mod benching;
mod kernel_controller;
mod output;
mod utils;
#[derive(StructOpt, Clone, Debug)]
#[structopt()]
enum Opts {
/// Calculates primes on the GPU
#[structopt(name = "calculate-primes")]
CalculatePrimes(CalculatePrimes),
/// Benchmarks the local size value
#[structopt(name = "bench-local-size")]
BenchLocalSize(BenchLocalSize),
/// Benchmarks the global size (number of tasks) value
#[structopt(name = "bench-global-size")]
BenchGlobalSize(BenchGlobalSize),
/// Prints GPU information
Info,
}
#[derive(StructOpt, Clone, Debug)]
struct CalculatePrimes {
/// The number to start with
#[structopt(long = "start", default_value = "0")]
start_offset: u64,
/// The maximum number to calculate to
#[structopt(long = "end", default_value = "9223372036854775807")]
max_number: u64,
/// The output file for the calculated prime numbers
#[structopt(short = "o", long = "output", default_value = "primes.txt")]
output_file: PathBuf,
/// The output file for timings
#[structopt(long = "timings-output", default_value = "timings.csv")]
timings_file: PathBuf,
/// The local size for the tasks.
/// The value for numbers_per_step needs to be divisible by this number.
/// The maximum local size depends on the gpu capabilities.
/// If no value is provided, OpenCL chooses it automatically.
#[structopt(long = "local-size")]
local_size: Option<usize>,
/// The amount of numbers that are checked per step. Even numbers are ignored so the
/// Range actually goes to numbers_per_step * 2.
#[structopt(long = "numbers-per-step", default_value = "33554432")]
numbers_per_step: usize,
/// If the prime numbers should be used for the divisibility check instead of using
/// an optimized auto-increment loop.
#[structopt(long = "no-cache")]
no_cache: bool,
/// If the calculated prime numbers should be validated on the cpu by a simple prime algorithm
#[structopt(long = "cpu-validate")]
cpu_validate: bool,
/// number of used threads
#[structopt(short = "p", long = "parallel", default_value = "2")]
num_threads: usize,
}
#[derive(StructOpt, Clone, Debug)]
struct BenchLocalSize {
#[structopt(flatten)]
bench_options: BenchOptions,
/// The initial number for the local size
#[structopt(long = "local-size-start", default_value = "4")]
local_size_start: usize,
/// The amount the local size increases by every step
#[structopt(long = "local-size-step", default_value = "4")]
local_size_step: usize,
/// The maximum amount of the local size
/// Can't be greater than the maximum local size of the gpu
/// that can be retrieved with the info command
#[structopt(long = "local-size-stop", default_value = "1024")]
local_size_stop: usize,
/// The maximum number of tasks for the benchmark
#[structopt(long = "global-size", default_value = "6144")]
global_size: usize,
}
#[derive(StructOpt, Clone, Debug)]
pub struct BenchGlobalSize {
#[structopt(flatten)]
options: BenchOptions,
/// The start value for the used global size
#[structopt(long = "global-size-start", default_value = "1024")]
global_size_start: usize,
/// The step value for the used global size
#[structopt(long = "global-size-step", default_value = "128")]
global_size_step: usize,
/// The stop value for the used global size
#[structopt(long = "global-size-stop", default_value = "1048576")]
global_size_stop: usize,
/// The maximum number of tasks for the benchmark
#[structopt(long = "local-size", default_value = "128")]
local_size: usize,
}
#[derive(StructOpt, Clone, Debug)]
pub struct BenchOptions {
/// How many calculations steps should be done per GPU thread
#[structopt(short = "n", long = "calculation-steps", default_value = "1000000")]
calculation_steps: u32,
/// The output file for timings
#[structopt(short = "o", long = "bench-output", default_value = "bench.csv")]
benchmark_file: PathBuf,
/// The average of n runs that is used instead of using one value only.
/// By default the benchmark for each step is only run once
#[structopt(short = "r", long = "repetitions", default_value = "1")]
repetitions: usize,
}
fn main() -> OCLStreamResult<()> {
let opts: Opts = Opts::from_args();
let controller = KernelController::new()?;
@ -168,7 +50,8 @@ fn calculate_primes(
prime_opts: CalculatePrimes,
mut controller: KernelController,
) -> OCLStreamResult<()> {
controller.set_concurrency(prime_opts.num_threads);
set_output_colored(prime_opts.general_options.color);
controller.set_concurrency(prime_opts.general_options.threads);
let csv_file = open_write_buffered(&prime_opts.timings_file);
let mut csv_writer = ThreadedCSVWriter::new(csv_file, &["first", "count", "gpu_duration"]);
@ -214,7 +97,9 @@ fn calculate_primes(
}
/// Benchmarks the local size used for calculations
fn bench_local_size(opts: BenchLocalSize, controller: KernelController) -> OCLStreamResult<()> {
fn bench_local_size(opts: BenchLocalSize, mut controller: KernelController) -> OCLStreamResult<()> {
set_output_colored(opts.bench_options.general_options.color);
controller.set_concurrency(opts.bench_options.general_options.threads);
let bench_writer = open_write_buffered(&opts.bench_options.benchmark_file);
let csv_writer = ThreadedCSVWriter::new(
bench_writer,
@ -241,8 +126,13 @@ fn bench_local_size(opts: BenchLocalSize, controller: KernelController) -> OCLSt
}
/// Benchmarks the global size used for calculations
fn bench_global_size(opts: BenchGlobalSize, controller: KernelController) -> OCLStreamResult<()> {
let bench_writer = open_write_buffered(&opts.options.benchmark_file);
fn bench_global_size(
opts: BenchGlobalSize,
mut controller: KernelController,
) -> OCLStreamResult<()> {
set_output_colored(opts.bench_options.general_options.color);
controller.set_concurrency(opts.bench_options.general_options.threads);
let bench_writer = open_write_buffered(&opts.bench_options.benchmark_file);
let csv_writer = ThreadedCSVWriter::new(
bench_writer,
&[
@ -259,10 +149,10 @@ fn bench_global_size(opts: BenchGlobalSize, controller: KernelController) -> OCL
opts.global_size_start,
opts.global_size_step,
opts.global_size_stop,
opts.options.calculation_steps,
opts.options.repetitions,
opts.bench_options.calculation_steps,
opts.bench_options.repetitions,
)?;
read_bench_results(opts.options.calculation_steps, csv_writer, stream);
read_bench_results(opts.bench_options.calculation_steps, csv_writer, stream);
Ok(())
}
@ -328,3 +218,11 @@ fn open_write_buffered(path: &PathBuf) -> BufWriter<File> {
.expect("Failed to open file!"),
)
}
/// Applies the user's color preference to the `colored` crate's global state.
///
/// `On` / `Off` force colored output on or off via `colored::control::set_override`;
/// `Auto` applies no override, leaving the library's own detection in place.
fn set_output_colored(colored: UseColors) {
    match colored {
        UseColors::On => colored::control::set_override(true),
        UseColors::Off => colored::control::set_override(false),
        // Exhaustive match (no `_` arm): adding a new UseColors variant
        // forces a conscious decision here instead of silently doing nothing.
        UseColors::Auto => {}
    }
}

@ -0,0 +1,155 @@
/*
* opencl demos with rust
* Copyright (C) 2021 trivernis
* See LICENSE for more information
*/
use std::path::PathBuf;
use structopt::StructOpt;
// Top-level CLI: one subcommand per mode of the demo binary.
// (Field/variant `///` docs double as structopt help text, so new
// documentation here is in `//` comments to avoid changing --help output.)
#[derive(StructOpt, Clone, Debug)]
#[structopt()]
pub enum Opts {
    /// Calculates primes on the GPU
    #[structopt(name = "calculate-primes")]
    CalculatePrimes(CalculatePrimes),

    /// Benchmarks the local size value
    #[structopt(name = "bench-local-size")]
    BenchLocalSize(BenchLocalSize),

    /// Benchmarks the global size (number of tasks) value
    #[structopt(name = "bench-global-size")]
    BenchGlobalSize(BenchGlobalSize),

    /// Prints GPU information
    // No payload: handled directly in main without extra options.
    Info,
}
// Options for the `calculate-primes` subcommand.
// (`///` field docs are the runtime --help text; kept byte-identical.
// New notes are `//` comments.)
#[derive(StructOpt, Clone, Debug)]
pub struct CalculatePrimes {
    // Shared options (color, thread count), flattened into this command's flags.
    #[structopt(flatten)]
    pub general_options: GeneralOptions,

    /// The number to start with
    #[structopt(long = "start", default_value = "0")]
    pub start_offset: u64,

    /// The maximum number to calculate to
    // Default is i64::MAX, i.e. effectively unbounded.
    #[structopt(long = "end", default_value = "9223372036854775807")]
    pub max_number: u64,

    /// The output file for the calculated prime numbers
    #[structopt(short = "o", long = "output", default_value = "primes.txt")]
    pub output_file: PathBuf,

    /// The output file for timings
    #[structopt(long = "timings-output", default_value = "timings.csv")]
    pub timings_file: PathBuf,

    /// The local size for the tasks.
    /// The value for numbers_per_step needs to be divisible by this number.
    /// The maximum local size depends on the gpu capabilities.
    /// If no value is provided, OpenCL chooses it automatically.
    pub local_size: Option<usize>,

    /// The amount of numbers that are checked per step. Even numbers are ignored so the
    /// Range actually goes to numbers_per_step * 2.
    // Default is 2^25.
    #[structopt(long = "numbers-per-step", default_value = "33554432")]
    pub numbers_per_step: usize,

    /// If the prime numbers should be used for the divisibility check instead of using
    /// an optimized auto-increment loop.
    #[structopt(long = "no-cache")]
    pub no_cache: bool,

    /// If the calculated prime numbers should be validated on the cpu by a simple prime algorithm
    #[structopt(long = "cpu-validate")]
    pub cpu_validate: bool,
}
// Options for the `bench-local-size` subcommand: sweeps the OpenCL local
// (work-group) size from start to stop in fixed steps at a fixed global size.
// (`///` field docs are the runtime --help text; kept byte-identical.)
#[derive(StructOpt, Clone, Debug)]
pub struct BenchLocalSize {
    // Common benchmark options (output file, repetitions, general options).
    #[structopt(flatten)]
    pub bench_options: BenchOptions,

    /// The initial number for the local size
    #[structopt(long = "local-size-start", default_value = "4")]
    pub local_size_start: usize,

    /// The amount the local size increases by every step
    #[structopt(long = "local-size-step", default_value = "4")]
    pub local_size_step: usize,

    /// The maximum amount of the local size
    /// Can't be greater than the maximum local size of the gpu
    /// that can be retrieved with the info command
    #[structopt(long = "local-size-stop", default_value = "1024")]
    pub local_size_stop: usize,

    /// The maximum number of tasks for the benchmark
    // Held constant while the local size is swept.
    #[structopt(long = "global-size", default_value = "6144")]
    pub global_size: usize,
}
// Options for the `bench-global-size` subcommand: sweeps the global size
// (number of tasks) from start to stop in fixed steps at a fixed local size.
// (`///` field docs are the runtime --help text; kept byte-identical.)
#[derive(StructOpt, Clone, Debug)]
pub struct BenchGlobalSize {
    // Common benchmark options (output file, repetitions, general options).
    #[structopt(flatten)]
    pub bench_options: BenchOptions,

    /// The start value for the used global size
    #[structopt(long = "global-size-start", default_value = "1024")]
    pub global_size_start: usize,

    /// The step value for the used global size
    #[structopt(long = "global-size-step", default_value = "128")]
    pub global_size_step: usize,

    /// The stop value for the used global size
    #[structopt(long = "global-size-stop", default_value = "1048576")]
    pub global_size_stop: usize,

    /// The maximum number of tasks for the benchmark
    // NOTE(review): this help text looks copy-pasted from `global_size` in
    // BenchLocalSize — `--local-size` is the work-group size held constant
    // during the sweep, not a task count. Consider rewording the help string.
    #[structopt(long = "local-size", default_value = "128")]
    pub local_size: usize,
}
// Options shared by both benchmark subcommands, flattened into their flag sets.
// (`///` field docs are the runtime --help text; kept byte-identical.)
#[derive(StructOpt, Clone, Debug)]
pub struct BenchOptions {
    // Shared options (color, thread count), flattened a second level deep.
    #[structopt(flatten)]
    pub general_options: GeneralOptions,

    /// How many calculations steps should be done per GPU thread
    #[structopt(short = "n", long = "calculation-steps", default_value = "1000000")]
    pub calculation_steps: u32,

    /// The output file for timings
    #[structopt(short = "o", long = "bench-output", default_value = "bench.csv")]
    pub benchmark_file: PathBuf,

    /// The average of n runs that is used instead of using one value only.
    /// By default the benchmark for each step is only run once
    #[structopt(short = "r", long = "repetitions", default_value = "1")]
    pub repetitions: usize,
}
// Options common to every subcommand, attached via `#[structopt(flatten)]`.
// (`///` field docs are the runtime --help text; kept byte-identical.)
#[derive(StructOpt, Clone, Debug)]
pub struct GeneralOptions {
    /// If the output should be colored
    // Parsed via arg_enum!-generated FromStr; valid values are off/on/auto
    // (case-insensitive), defaulting to auto.
    #[structopt(long = "color", possible_values = &UseColors::variants(), case_insensitive = true, default_value = "auto")]
    pub color: UseColors,

    /// number of used threads
    // Used as the executor's concurrency via controller.set_concurrency.
    #[structopt(short = "p", long = "threads", default_value = "2")]
    pub threads: usize,
}
// Color preference for terminal output. Wrapped in clap's arg_enum! so the
// enum gains FromStr and variants() and can be used directly as a CLI value.
// (Line comments only: `///` docs inside arg_enum! would alter help output.)
arg_enum! {
    #[derive(Clone, Debug)]
    pub enum UseColors {
        // Force colors off (set_output_colored applies a false override).
        Off,
        // Force colors on (set_output_colored applies a true override).
        On,
        // No override applied; the colored crate's own detection decides.
        Auto,
    }
}

@ -3,5 +3,6 @@
* Copyright (C) 2021 trivernis
* See LICENSE for more information
*/
pub mod args;
pub mod logging;
pub mod progress;

Loading…
Cancel
Save