Separate local and global size bench and use streamed executor

Signed-off-by: Trivernis <trivernis@protonmail.com>
main
Trivernis 4 years ago
parent 32bb3f32f6
commit 5659ee2923

5
Cargo.lock generated

@ -517,13 +517,14 @@ dependencies = [
[[package]] [[package]]
name = "ocl-stream" name = "ocl-stream"
version = "0.3.0" version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6571c0dc580e1603bdf23e277402b8dea73c0631de6a123b796da9a27681c960" checksum = "2cc003c0e91a8daaa706bd4231a05080d18346c97dc051955cce45de60a54ac7"
dependencies = [ dependencies = [
"crossbeam-channel 0.5.0", "crossbeam-channel 0.5.0",
"num_cpus", "num_cpus",
"ocl", "ocl",
"parking_lot",
"thiserror", "thiserror",
] ]

@ -12,5 +12,5 @@ structopt = "0.3.20"
lazy_static = "1.4.0" lazy_static = "1.4.0"
parking_lot = "0.11.1" parking_lot = "0.11.1"
rayon = "1.5.0" rayon = "1.5.0"
ocl-stream = "0.3.0" ocl-stream = "0.3.4"
crossbeam-channel = "0.5.0" crossbeam-channel = "0.5.0"

@ -4,14 +4,22 @@
* See LICENSE for more information * See LICENSE for more information
*/ */
use crate::benching::enqueue_profiled;
use crate::kernel_controller::KernelController; use crate::kernel_controller::KernelController;
use ocl_stream::executor::context::ExecutorContext;
use ocl_stream::executor::stream::OCLStream;
use ocl_stream::traits::*;
use ocl_stream::utils::result::OCLStreamResult;
use ocl_stream::utils::shared_buffer::SharedBuffer;
use std::fmt::{self, Display, Formatter}; use std::fmt::{self, Display, Formatter};
use std::ops::Deref;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
pub struct BenchStatistics { pub struct BenchStatistics {
pub calc_count: u32, pub calc_count: u32,
pub num_tasks: usize, pub global_size: usize,
pub local_size: Option<usize>, pub local_size: usize,
pub write_duration: Duration, pub write_duration: Duration,
pub calc_duration: Duration, pub calc_duration: Duration,
pub read_duration: Duration, pub read_duration: Duration,
@ -23,8 +31,8 @@ impl Display for BenchStatistics {
f, f,
"Calculation Count: {}\nTask Count: {}\nLocal Size: {}\nWrite Duration: {} ms\nGPU Duration: {} ms\nRead Duration: {} ms", "Calculation Count: {}\nTask Count: {}\nLocal Size: {}\nWrite Duration: {} ms\nGPU Duration: {} ms\nRead Duration: {} ms",
self.calc_count, self.calc_count,
self.num_tasks, self.global_size,
self.local_size.map(|v|v.to_string()).unwrap_or("n/a".to_string()), self.local_size,
self.write_duration.as_secs_f64() * 1000f64, self.write_duration.as_secs_f64() * 1000f64,
self.calc_duration.as_secs_f64() * 1000f64, self.calc_duration.as_secs_f64() * 1000f64,
self.read_duration.as_secs_f64() * 1000f64 self.read_duration.as_secs_f64() * 1000f64
@ -32,55 +40,108 @@ impl Display for BenchStatistics {
} }
} }
impl BenchStatistics { impl KernelController {
pub fn avg(&mut self, other: Self) { /// Benchmarks the value for the global size
self.read_duration = (self.read_duration + other.read_duration) / 2; pub fn bench_global_size(
self.write_duration = (self.write_duration + other.write_duration) / 2; &self,
self.calc_duration = (self.calc_duration + other.calc_duration) / 2; local_size: usize,
global_size_start: usize,
global_size_step: usize,
global_size_stop: usize,
calc_count: u32,
repetitions: usize,
) -> OCLStreamResult<OCLStream<BenchStatistics>> {
let global_size = AtomicUsize::new(global_size_start);
let stream = self.executor.execute_bounded(global_size_stop, move |ctx| {
loop {
if global_size.load(Ordering::SeqCst) > global_size_stop {
break;
}
let global_size = global_size.fetch_add(global_size_step, Ordering::SeqCst);
if global_size % local_size != 0 {
continue;
}
let input_buffer: SharedBuffer<u32> =
vec![0u32; global_size].to_shared_buffer(ctx.pro_que())?;
for _ in 0..repetitions {
let stats =
Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone())?;
ctx.sender().send(stats)?;
}
}
Ok(())
});
Ok(stream)
} }
}
impl KernelController { /// Benchmarks the value for the local size
/// Benches an integer pub fn bench_local_size(
pub fn bench_int(
&self, &self,
global_size: usize,
local_size_start: usize,
local_size_step: usize,
local_size_stop: usize,
calc_count: u32, calc_count: u32,
num_tasks: usize, repetitions: usize,
local_size: Option<usize>, ) -> OCLStreamResult<OCLStream<BenchStatistics>> {
let input_buffer: SharedBuffer<u32> =
vec![0u32; global_size].to_shared_buffer(self.executor.pro_que())?;
let local_size = AtomicUsize::new(local_size_start);
let stream = self.executor.execute_bounded(global_size, move |ctx| {
loop {
if local_size.load(Ordering::SeqCst) > local_size_stop {
break;
}
let local_size = local_size.fetch_add(local_size_step, Ordering::SeqCst);
if local_size > 1024 || global_size % local_size != 0 {
continue;
}
for _ in 0..repetitions {
let stats =
Self::bench_int(&ctx, local_size, calc_count, input_buffer.clone())?;
ctx.sender().send(stats)?;
}
}
Ok(())
});
Ok(stream)
}
/// Benches an integer
fn bench_int(
ctx: &ExecutorContext<BenchStatistics>,
local_size: usize,
calc_count: u32,
input_buffer: SharedBuffer<u32>,
) -> ocl::Result<BenchStatistics> { ) -> ocl::Result<BenchStatistics> {
let num_tasks = input_buffer.inner().lock().len();
let write_start = Instant::now(); let write_start = Instant::now();
let input_buffer = self
.pro_que
.buffer_builder()
.len(num_tasks)
.fill_val(0u32)
.build()?;
let write_duration = write_start.elapsed(); let write_duration = write_start.elapsed();
let mut builder = self.pro_que.kernel_builder("bench_int"); let kernel = ctx
.pro_que()
if let Some(local_size) = local_size { .kernel_builder("bench_int")
builder.local_work_size(local_size); .local_work_size(local_size)
}
let kernel = builder
.arg(calc_count)
.arg(&input_buffer)
.global_work_size(num_tasks) .global_work_size(num_tasks)
.arg(calc_count)
.arg(input_buffer.inner().lock().deref())
.build()?; .build()?;
let calc_start = Instant::now();
unsafe { let calc_duration = enqueue_profiled(ctx.pro_que(), &kernel)?;
kernel.enq()?;
}
self.pro_que.finish()?;
let calc_duration = calc_start.elapsed();
let mut output = vec![0u32; num_tasks]; let mut output = vec![0u32; num_tasks];
let read_start = Instant::now(); let read_start = Instant::now();
input_buffer.read(&mut output).enq()?; input_buffer.read(&mut output)?;
let read_duration = read_start.elapsed(); let read_duration = read_start.elapsed();
Ok(BenchStatistics { Ok(BenchStatistics {
num_tasks, global_size: num_tasks,
calc_count, calc_count,
local_size, local_size,
read_duration, read_duration,

@ -68,6 +68,7 @@ impl KernelController {
.fill_val(0u8) .fill_val(0u8)
.build()?; .build()?;
let input_buffer = numbers.to_ocl_buffer(pro_que)?; let input_buffer = numbers.to_ocl_buffer(pro_que)?;
let kernel = pro_que let kernel = pro_que
.kernel_builder("check_prime") .kernel_builder("check_prime")
.local_work_size(local_size) .local_work_size(local_size)

@ -11,7 +11,9 @@ use crate::output::create_prime_write_thread;
use crate::output::csv::ThreadedCSVWriter; use crate::output::csv::ThreadedCSVWriter;
use crate::output::threaded::ThreadedWriter; use crate::output::threaded::ThreadedWriter;
use ocl_stream::utils::result::OCLStreamResult; use crate::kernel_controller::bench::BenchStatistics;
use ocl_stream::stream::OCLStream;
use ocl_stream::utils::result::{OCLStreamError, OCLStreamResult};
use rayon::prelude::*; use rayon::prelude::*;
use std::fs::{File, OpenOptions}; use std::fs::{File, OpenOptions};
use std::io::BufWriter; use std::io::BufWriter;
@ -33,9 +35,13 @@ enum Opts {
#[structopt(name = "calculate-primes")] #[structopt(name = "calculate-primes")]
CalculatePrimes(CalculatePrimes), CalculatePrimes(CalculatePrimes),
/// Benchmarks the number of tasks used for the calculations /// Benchmarks the local size value
#[structopt(name = "bench-task-count")] #[structopt(name = "bench-local-size")]
BenchmarkTaskCount(BenchmarkTaskCount), BenchLocalSize(BenchLocalSize),
/// Benchmarks the global size (number of tasks) value
#[structopt(name = "bench-global-size")]
BenchGlobalSize(BenchGlobalSize),
/// Prints GPU information /// Prints GPU information
Info, Info,
@ -90,62 +96,82 @@ struct CalculatePrimes {
} }
#[derive(StructOpt, Clone, Debug)] #[derive(StructOpt, Clone, Debug)]
struct BenchmarkTaskCount { struct BenchLocalSize {
/// How many calculations steps should be done per GPU thread #[structopt(flatten)]
#[structopt(long = "calculation-steps", default_value = "1000000")] bench_options: BenchOptions,
calculation_steps: u32,
/// The initial number of tasks for the benchmark
#[structopt(long = "num-tasks-start", default_value = "1")]
num_tasks_start: usize,
/// The initial number for the local size /// The initial number for the local size
#[structopt(long = "local-size-start")] #[structopt(long = "local-size-start", default_value = "4")]
local_size_start: Option<usize>, local_size_start: usize,
/// The amount the local size increases by every step /// The amount the local size increases by every step
#[structopt(long = "local-size-step", default_value = "10")] #[structopt(long = "local-size-step", default_value = "4")]
local_size_step: usize, local_size_step: usize,
/// The maximum amount of the local size /// The maximum amount of the local size
/// Can't be greater than the maximum local size of the gpu /// Can't be greater than the maximum local size of the gpu
/// that can be retrieved with the info command /// that can be retrieved with the info command
#[structopt(long = "local-size-stop")] #[structopt(long = "local-size-stop", default_value = "1024")]
local_size_stop: Option<usize>, local_size_stop: usize,
/// The maximum number of tasks for the benchmark /// The number of tasks (global size) used for the benchmark
#[structopt(long = "num-tasks-stop", default_value = "10000000")] #[structopt(long = "global-size", default_value = "6144")]
num_tasks_stop: usize, global_size: usize,
}
#[derive(StructOpt, Clone, Debug)]
pub struct BenchGlobalSize {
#[structopt(flatten)]
options: BenchOptions,
/// The amount the task number increases per step /// The start value for the used global size
#[structopt(long = "num-tasks-step", default_value = "10")] #[structopt(long = "global-size-start", default_value = "1024")]
num_tasks_step: usize, global_size_start: usize,
/// The average of n runs that is used instead of using one value only. /// The step value for the used global size
/// By default the benchmark for each step is only run once #[structopt(long = "global-size-step", default_value = "128")]
#[structopt(long = "average-of", default_value = "1")] global_size_step: usize,
average_of: usize,
/// The stop value for the used global size
#[structopt(long = "global-size-stop", default_value = "1048576")]
global_size_stop: usize,
/// The local size used for the benchmark
#[structopt(long = "local-size", default_value = "128")]
local_size: usize,
}
#[derive(StructOpt, Clone, Debug)]
pub struct BenchOptions {
/// How many calculations steps should be done per GPU thread
#[structopt(short = "n", long = "calculation-steps", default_value = "1000000")]
calculation_steps: u32,
/// The output file for timings /// The output file for timings
#[structopt(long = "bench-output", default_value = "bench.csv")] #[structopt(short = "o", long = "bench-output", default_value = "bench.csv")]
benchmark_file: PathBuf, benchmark_file: PathBuf,
/// The number of times the benchmark is run for each step.
/// Every run is reported as its own result; by default each step is only run once
#[structopt(short = "r", long = "repetitions", default_value = "1")]
repetitions: usize,
} }
fn main() -> ocl::Result<()> { fn main() -> OCLStreamResult<()> {
let opts: Opts = Opts::from_args(); let opts: Opts = Opts::from_args();
let controller = KernelController::new()?; let controller = KernelController::new()?;
match opts { match opts {
Opts::Info => controller.print_info(), Opts::Info => controller.print_info().map_err(OCLStreamError::from),
Opts::CalculatePrimes(prime_opts) => { Opts::CalculatePrimes(prime_opts) => {
if prime_opts.streamed { if prime_opts.streamed {
calculate_primes_streamed(prime_opts, controller).unwrap(); calculate_primes_streamed(prime_opts, controller)
Ok(())
} else { } else {
calculate_primes(prime_opts, controller) calculate_primes(prime_opts, controller).map_err(OCLStreamError::from)
} }
} }
Opts::BenchmarkTaskCount(bench_opts) => bench_task_count(bench_opts, controller), Opts::BenchGlobalSize(bench_opts) => bench_global_size(bench_opts, controller),
Opts::BenchLocalSize(bench_opts) => bench_local_size(bench_opts, controller),
} }
} }
@ -275,65 +301,86 @@ fn calculate_primes(prime_opts: CalculatePrimes, controller: KernelController) -
Ok(()) Ok(())
} }
fn bench_task_count(opts: BenchmarkTaskCount, controller: KernelController) -> ocl::Result<()> { /// Benchmarks the local size used for calculations
let bench_writer = BufWriter::new( fn bench_local_size(opts: BenchLocalSize, controller: KernelController) -> OCLStreamResult<()> {
OpenOptions::new() let bench_writer = open_write_buffered(&opts.bench_options.benchmark_file);
.truncate(true) let csv_writer = ThreadedCSVWriter::new(
.write(true) bench_writer,
.create(true) &[
.open(opts.benchmark_file) "local_size",
.unwrap(), "global_size",
"calc_count",
"write_duration",
"gpu_duration",
"read_duration",
],
); );
let mut csv_writer = ThreadedCSVWriter::new( let stream = controller.bench_local_size(
opts.global_size,
opts.local_size_start,
opts.local_size_step,
opts.local_size_stop,
opts.bench_options.calculation_steps,
opts.bench_options.repetitions,
)?;
read_bench_results(opts.bench_options.calculation_steps, csv_writer, stream);
Ok(())
}
/// Benchmarks the global size used for calculations
fn bench_global_size(opts: BenchGlobalSize, controller: KernelController) -> OCLStreamResult<()> {
let bench_writer = open_write_buffered(&opts.options.benchmark_file);
let csv_writer = ThreadedCSVWriter::new(
bench_writer, bench_writer,
&[ &[
"local_size", "local_size",
"num_tasks", "global_size",
"calc_count", "calc_count",
"write_duration", "write_duration",
"gpu_duration", "gpu_duration",
"read_duration", "read_duration",
], ],
); );
for n in (opts.num_tasks_start..=opts.num_tasks_stop).step_by(opts.num_tasks_step) { let stream = controller.bench_global_size(
if let (Some(start), Some(stop)) = (opts.local_size_start, opts.local_size_stop) { opts.local_size,
for l in (start..=stop) opts.global_size_start,
.step_by(opts.local_size_step) opts.global_size_step,
.filter(|v| n % v == 0) opts.global_size_stop,
{ opts.options.calculation_steps,
let mut stats = controller.bench_int(opts.calculation_steps, n, Some(l))?; opts.options.repetitions,
for _ in 1..opts.average_of { )?;
stats.avg(controller.bench_int(opts.calculation_steps, n, Some(l))?) read_bench_results(opts.options.calculation_steps, csv_writer, stream);
}
Ok(())
}
/// Reads benchmark results from the stream, prints them
/// to the console and adds each one as a row to the CSV writer
fn read_bench_results(
calculation_steps: u32,
mut csv_writer: ThreadedCSVWriter,
mut stream: OCLStream<BenchStatistics>,
) {
loop {
match stream.next() {
Ok(stats) => {
println!("{}\n", stats); println!("{}\n", stats);
csv_writer.add_row(vec![ csv_writer.add_row(vec![
l.to_string(), stats.local_size.to_string(),
n.to_string(), stats.global_size.to_string(),
opts.calculation_steps.to_string(), calculation_steps.to_string(),
duration_to_ms_string(&stats.write_duration), duration_to_ms_string(&stats.write_duration),
duration_to_ms_string(&stats.calc_duration), duration_to_ms_string(&stats.calc_duration),
duration_to_ms_string(&stats.read_duration), duration_to_ms_string(&stats.read_duration),
]) ])
} }
} else { _ => {
let mut stats = controller.bench_int(opts.calculation_steps, n, None)?; break;
for _ in 1..opts.average_of {
stats.avg(controller.bench_int(opts.calculation_steps, n, None)?)
} }
println!("{}\n", stats);
csv_writer.add_row(vec![
"n/a".to_string(),
n.to_string(),
opts.calculation_steps.to_string(),
duration_to_ms_string(&stats.write_duration),
duration_to_ms_string(&stats.calc_duration),
duration_to_ms_string(&stats.read_duration),
]);
} }
} }
csv_writer.close(); csv_writer.close();
Ok(())
} }
fn validate_primes_on_cpu(primes: &Vec<u64>) { fn validate_primes_on_cpu(primes: &Vec<u64>) {

Loading…
Cancel
Save