Add streaming executor
Signed-off-by: Trivernis <trivernis@protonmail.com>
parent
8704bd387c
commit
d89c574589
@ -0,0 +1,22 @@
|
|||||||
|
use ocl::core::{get_event_profiling_info, wait_for_event, ProfilingInfo};
|
||||||
|
use ocl::{EventList, Kernel, ProQue};
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
pub mod result;
|
||||||
|
|
||||||
|
/// Runs a benchmark on the kernel
|
||||||
|
/// The ProQue needs to have profiling enabled
|
||||||
|
pub fn enqueue_profiled(pro_que: &ProQue, kernel: &Kernel) -> ocl::Result<Duration> {
|
||||||
|
let event_start = pro_que.queue().enqueue_marker::<EventList>(None)?;
|
||||||
|
unsafe {
|
||||||
|
kernel.enq()?;
|
||||||
|
}
|
||||||
|
let event_stop = pro_que.queue().enqueue_marker::<EventList>(None)?;
|
||||||
|
wait_for_event(&event_start)?;
|
||||||
|
wait_for_event(&event_stop)?;
|
||||||
|
let start = get_event_profiling_info(&event_start, ProfilingInfo::End)?;
|
||||||
|
let stop = get_event_profiling_info(&event_stop, ProfilingInfo::Start)?;
|
||||||
|
let gpu_calc_duration = Duration::from_nanos(stop.time()? - start.time()?);
|
||||||
|
|
||||||
|
Ok(gpu_calc_duration)
|
||||||
|
}
|
@ -0,0 +1,34 @@
|
|||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
/// Result of a benched kernel execution, pairing the value produced by the
/// kernel with the time it spent executing on the GPU.
//
// NOTE: no trait bounds on the struct itself — per the Rust API guidelines
// (C-STRUCT-BOUNDS) bounds belong on the impl blocks that need them. The
// existing bounds are kept on the impl below so the public API is unchanged.
#[derive(Clone, Debug)]
pub struct ProfiledResult<T> {
    // Time measured between the profiling markers around the kernel run.
    gpu_duration: Duration,
    // The value read back after the kernel finished.
    value: T,
}

impl<T> ProfiledResult<T>
where
    T: Send + Sync + Clone,
{
    /// Creates a new profiled result with the given duration and value.
    pub fn new(gpu_duration: Duration, value: T) -> Self {
        Self {
            gpu_duration,
            value,
        }
    }

    /// Returns the execution duration on the gpu.
    pub fn gpu_duration(&self) -> &Duration {
        &self.gpu_duration
    }

    /// Returns the value of the result.
    pub fn value(&self) -> &T {
        &self.value
    }
}
|
@ -0,0 +1,69 @@
|
|||||||
|
use crate::benching::enqueue_profiled;
|
||||||
|
use crate::benching::result::ProfiledResult;
|
||||||
|
use crate::kernel_controller::primes::map_gpu_prime_result;
|
||||||
|
use crate::kernel_controller::KernelController;
|
||||||
|
use ocl::ProQue;
|
||||||
|
use ocl_stream::stream::OCLStream;
|
||||||
|
use ocl_stream::traits::ToOclBuffer;
|
||||||
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
impl KernelController {
    /// Streams primes starting at `start` (bumped to the next odd number if
    /// even) up to `stop`, testing `step` odd candidates per kernel run.
    ///
    /// Chunks are claimed by the executor's workers through a shared atomic
    /// cursor, so several chunks may be in flight concurrently.
    ///
    /// NOTE(review): a worker can pass the `offset >= stop` check and then
    /// `fetch_add` a chunk whose upper end exceeds `stop`, so a few
    /// candidates above `stop` may still be tested and emitted — confirm
    /// callers tolerate this overshoot.
    pub fn get_primes(
        &self,
        mut start: u64,
        stop: u64,
        step: usize,
        local_size: usize,
    ) -> OCLStream<ProfiledResult<Vec<u64>>> {
        // Only odd candidates are generated; make sure we start on one.
        if start % 2 == 0 {
            start += 1;
        }
        // Shared cursor: each worker atomically claims the next chunk.
        let offset = Arc::new(AtomicU64::new(start));
        // Bound the stream buffer to `step * 10` pending results.
        self.executor.execute_bounded(step * 10, move |ctx| {
            loop {
                let pro_que = ctx.pro_que();
                let sender = ctx.sender();
                // Stop once the shared cursor has reached the upper bound.
                if offset.load(Ordering::SeqCst) >= stop {
                    break;
                }
                // Claim `step` odd numbers (a span of `step * 2`) and
                // advance the cursor past them in one atomic step.
                let offset = offset.fetch_add(step as u64 * 2, Ordering::SeqCst);
                let numbers = (offset..(step as u64 * 2 + offset))
                    .step_by(2)
                    .collect::<Vec<u64>>();
                let result = Self::filter_primes_streamed(pro_que, numbers, local_size)?;
                sender.send(result)?;
            }

            Ok(())
        })
    }

    /// Creates the prime filter kernel and executes it.
    ///
    /// Returns the primes found among `numbers` together with the profiled
    /// GPU execution time.
    fn filter_primes_streamed(
        pro_que: &ProQue,
        numbers: Vec<u64>,
        local_size: usize,
    ) -> ocl::Result<ProfiledResult<Vec<u64>>> {
        // One zero-initialized byte per candidate; presumably the kernel
        // flags primes here — verify against the `check_prime` kernel source.
        let output_buffer = pro_que
            .buffer_builder()
            .len(numbers.len())
            .fill_val(0u8)
            .build()?;
        let input_buffer = numbers.to_ocl_buffer(pro_que)?;
        let kernel = pro_que
            .kernel_builder("check_prime")
            .local_work_size(local_size)
            .arg(&input_buffer)
            .arg(&output_buffer)
            .global_work_size(numbers.len())
            .build()?;
        // Run between profiling markers; see `benching::enqueue_profiled`.
        let duration = enqueue_profiled(pro_que, &kernel)?;

        // Read the flags back and keep the candidates marked as prime.
        let mut output = vec![0u8; output_buffer.len()];
        output_buffer.read(&mut output).enq()?;
        let primes = map_gpu_prime_result(numbers, output);

        Ok(ProfiledResult::new(duration, primes))
    }
}
|
@ -0,0 +1,45 @@
|
|||||||
|
use crossbeam_channel::Sender;
|
||||||
|
use std::io::Write;
|
||||||
|
use std::mem;
|
||||||
|
use std::thread::{self, JoinHandle};
|
||||||
|
|
||||||
|
pub struct ThreadedWriter<T>
|
||||||
|
where
|
||||||
|
T: Send + Sync,
|
||||||
|
{
|
||||||
|
handle: JoinHandle<()>,
|
||||||
|
tx: Sender<T>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> ThreadedWriter<T>
|
||||||
|
where
|
||||||
|
T: Send + Sync + 'static,
|
||||||
|
{
|
||||||
|
/// Creates a new threaded writer
|
||||||
|
pub fn new<W, F>(mut writer: W, serializer: F) -> Self
|
||||||
|
where
|
||||||
|
F: Fn(T) -> Vec<u8> + Send + Sync + 'static,
|
||||||
|
W: Write + Send + Sync + 'static,
|
||||||
|
{
|
||||||
|
let (tx, rx) = crossbeam_channel::bounded(1024);
|
||||||
|
let handle = thread::spawn(move || {
|
||||||
|
for value in rx {
|
||||||
|
let mut bytes = serializer(value);
|
||||||
|
writer.write_all(&mut bytes[..]).unwrap();
|
||||||
|
writer.flush().unwrap();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
Self { handle, tx }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Writes a value
|
||||||
|
pub fn write(&self, value: T) {
|
||||||
|
self.tx.send(value).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Closes the channel to the writer and waits for the writer thread to stop
|
||||||
|
pub fn close(self) {
|
||||||
|
mem::drop(self.tx);
|
||||||
|
self.handle.join().unwrap();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue