Add streaming executor
Signed-off-by: Trivernis <trivernis@protonmail.com>pull/1/head
parent
8704bd387c
commit
d89c574589
@ -0,0 +1,22 @@
|
||||
use ocl::core::{get_event_profiling_info, wait_for_event, ProfilingInfo};
|
||||
use ocl::{EventList, Kernel, ProQue};
|
||||
use std::time::Duration;
|
||||
|
||||
pub mod result;
|
||||
|
||||
/// Runs a benchmark on the kernel
/// The ProQue needs to have profiling enabled
///
/// Measures the GPU-side execution time by enqueueing a marker event
/// directly before and directly after the kernel, then reading the
/// profiling timestamps of those markers: the reported duration spans
/// from the *end* of the start marker to the *start* of the stop marker,
/// i.e. the window in which the kernel executed on this queue.
///
/// NOTE(review): this assumes an in-order command queue so that
/// marker -> kernel -> marker execute sequentially — confirm the queue
/// is not created with out-of-order execution enabled.
pub fn enqueue_profiled(pro_que: &ProQue, kernel: &Kernel) -> ocl::Result<Duration> {
    // Marker enqueued immediately before the kernel.
    let event_start = pro_que.queue().enqueue_marker::<EventList>(None)?;
    // SAFETY: `Kernel::enq` is unsafe in ocl because invalid kernel
    // arguments or work sizes can cause undefined behavior on the device;
    // the caller is responsible for having built the kernel correctly.
    unsafe {
        kernel.enq()?;
    }
    // Marker enqueued immediately after the kernel.
    let event_stop = pro_que.queue().enqueue_marker::<EventList>(None)?;
    // Block until both markers have executed so their profiling
    // information is available to query.
    wait_for_event(&event_start)?;
    wait_for_event(&event_stop)?;
    // End of the start marker = earliest moment the kernel could begin;
    // start of the stop marker = moment the kernel had finished.
    let start = get_event_profiling_info(&event_start, ProfilingInfo::End)?;
    let stop = get_event_profiling_info(&event_stop, ProfilingInfo::Start)?;
    // OpenCL profiling timestamps are in nanoseconds.
    let gpu_calc_duration = Duration::from_nanos(stop.time()? - start.time()?);

    Ok(gpu_calc_duration)
}
|
@ -0,0 +1,34 @@
|
||||
use std::time::Duration;
|
||||
|
||||
/// Result of a benched kernel execution
///
/// Bundles the value produced by a kernel run together with the time the
/// GPU spent executing it (as measured via OpenCL event profiling).
///
/// The trait bounds formerly declared on this struct (`Send + Sync +
/// Clone`) have been moved entirely to the `impl` block: idiomatic Rust
/// keeps struct definitions bound-free, and the derives below already
/// add their own conditional bounds (e.g. `Clone` only when `T: Clone`).
/// This is a backward-compatible loosening for callers.
#[derive(Clone, Debug)]
pub struct ProfiledResult<T> {
    // Time between the enqueued start/stop profiling markers.
    gpu_duration: Duration,
    // The value computed by the profiled kernel run.
    value: T,
}
|
||||
|
||||
impl<T> ProfiledResult<T>
where
    T: Send + Sync + Clone,
{
    /// Creates a new profiled result with the given duration and value
    pub fn new(gpu_duration: Duration, value: T) -> Self {
        Self {
            gpu_duration,
            value,
        }
    }

    /// Returns the execution duration on the gpu
    ///
    /// This is the time span measured between the profiling markers
    /// enqueued around the kernel, not wall-clock host time.
    pub fn gpu_duration(&self) -> &Duration {
        &self.gpu_duration
    }

    /// Returns the value of the result
    pub fn value(&self) -> &T {
        &self.value
    }
}
|
@ -0,0 +1,69 @@
|
||||
use crate::benching::enqueue_profiled;
|
||||
use crate::benching::result::ProfiledResult;
|
||||
use crate::kernel_controller::primes::map_gpu_prime_result;
|
||||
use crate::kernel_controller::KernelController;
|
||||
use ocl::ProQue;
|
||||
use ocl_stream::stream::OCLStream;
|
||||
use ocl_stream::traits::ToOclBuffer;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
impl KernelController {
    /// Streams primes found in `start..stop`, checking `step` odd
    /// candidates per kernel invocation.
    ///
    /// Each item emitted on the returned stream carries the primes found
    /// in one chunk together with the measured GPU execution time for
    /// that chunk's kernel run.
    ///
    /// NOTE(review): only odd candidates are generated (an even `start`
    /// is bumped to the next odd number), so the prime 2 is never
    /// reported — confirm this is intended.
    /// NOTE(review): the final chunk may extend past `stop`, because the
    /// stop check runs before a full `step`-sized chunk is claimed.
    pub fn get_primes(
        &self,
        mut start: u64,
        stop: u64,
        step: usize,
        local_size: usize,
    ) -> OCLStream<ProfiledResult<Vec<u64>>> {
        // Work only on odd numbers; evens (other than 2) cannot be prime.
        if start % 2 == 0 {
            start += 1;
        }
        // Shared cursor: each worker atomically claims the next chunk of
        // candidates from this offset.
        let offset = Arc::new(AtomicU64::new(start));
        // Bounded stream: at most `step * 10` results are buffered before
        // producers block.
        self.executor.execute_bounded(step * 10, move |ctx| {
            loop {
                let pro_que = ctx.pro_que();
                let sender = ctx.sender();
                if offset.load(Ordering::SeqCst) >= stop {
                    break;
                }
                // Claim `step` odd candidates: a stride-2 range covers a
                // span of `2 * step` consecutive integers.
                let offset = offset.fetch_add(step as u64 * 2, Ordering::SeqCst);
                let numbers = (offset..(step as u64 * 2 + offset))
                    .step_by(2)
                    .collect::<Vec<u64>>();
                let result = Self::filter_primes_streamed(pro_que, numbers, local_size)?;
                sender.send(result)?;
            }

            Ok(())
        })
    }

    /// Creates the prime filter kernel and executes it
    ///
    /// Uploads `numbers`, runs the `check_prime` kernel with one work
    /// item per candidate and the given local work size, then maps the
    /// per-candidate result flags back to the prime values, returning
    /// them with the profiled GPU duration.
    ///
    /// NOTE(review): presumably `numbers.len()` must be compatible with
    /// `local_size` (global size a multiple of local size) — verify
    /// against the callers.
    fn filter_primes_streamed(
        pro_que: &ProQue,
        numbers: Vec<u64>,
        local_size: usize,
    ) -> ocl::Result<ProfiledResult<Vec<u64>>> {
        // One result flag per input number, zero-initialized.
        let output_buffer = pro_que
            .buffer_builder()
            .len(numbers.len())
            .fill_val(0u8)
            .build()?;
        let input_buffer = numbers.to_ocl_buffer(pro_que)?;
        let kernel = pro_que
            .kernel_builder("check_prime")
            .local_work_size(local_size)
            .arg(&input_buffer)
            .arg(&output_buffer)
            .global_work_size(numbers.len())
            .build()?;
        // Enqueue with profiling markers; see benching::enqueue_profiled.
        let duration = enqueue_profiled(pro_que, &kernel)?;

        // Read back the flags and keep the candidates marked as prime.
        let mut output = vec![0u8; output_buffer.len()];
        output_buffer.read(&mut output).enq()?;
        let primes = map_gpu_prime_result(numbers, output);

        Ok(ProfiledResult::new(duration, primes))
    }
}
|
@ -0,0 +1,45 @@
|
||||
use crossbeam_channel::Sender;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::thread::{self, JoinHandle};
|
||||
|
||||
pub struct ThreadedWriter<T>
|
||||
where
|
||||
T: Send + Sync,
|
||||
{
|
||||
handle: JoinHandle<()>,
|
||||
tx: Sender<T>,
|
||||
}
|
||||
|
||||
impl<T> ThreadedWriter<T>
|
||||
where
|
||||
T: Send + Sync + 'static,
|
||||
{
|
||||
/// Creates a new threaded writer
|
||||
pub fn new<W, F>(mut writer: W, serializer: F) -> Self
|
||||
where
|
||||
F: Fn(T) -> Vec<u8> + Send + Sync + 'static,
|
||||
W: Write + Send + Sync + 'static,
|
||||
{
|
||||
let (tx, rx) = crossbeam_channel::bounded(1024);
|
||||
let handle = thread::spawn(move || {
|
||||
for value in rx {
|
||||
let mut bytes = serializer(value);
|
||||
writer.write_all(&mut bytes[..]).unwrap();
|
||||
writer.flush().unwrap();
|
||||
}
|
||||
});
|
||||
Self { handle, tx }
|
||||
}
|
||||
|
||||
/// Writes a value
|
||||
pub fn write(&self, value: T) {
|
||||
self.tx.send(value).unwrap();
|
||||
}
|
||||
|
||||
/// Closes the channel to the writer and waits for the writer thread to stop
|
||||
pub fn close(self) {
|
||||
mem::drop(self.tx);
|
||||
self.handle.join().unwrap();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue