From 1ed8539e730eed39b3f0d73c23dcb72c9ed486fe Mon Sep 17 00:00:00 2001 From: trivernis Date: Wed, 2 Dec 2020 18:14:50 +0100 Subject: [PATCH] Change gpu timing to not include transmissions Signed-off-by: trivernis --- src/kernel_controller/mod.rs | 209 +----------------------------- src/kernel_controller/primes.rs | 218 ++++++++++++++++++++++++++++++++ src/main.rs | 3 +- 3 files changed, 222 insertions(+), 208 deletions(-) create mode 100644 src/kernel_controller/primes.rs diff --git a/src/kernel_controller/mod.rs b/src/kernel_controller/mod.rs index 60fb75d..8003837 100644 --- a/src/kernel_controller/mod.rs +++ b/src/kernel_controller/mod.rs @@ -7,10 +7,8 @@ use ocl::core::DeviceInfo; use ocl::enums::DeviceInfoResult; use ocl::ProQue; -use parking_lot::Mutex; -use std::mem::size_of; -use std::sync::Arc; -use std::time::{Duration, Instant}; + +pub mod primes; pub struct KernelController { pro_que: ProQue, @@ -43,207 +41,4 @@ impl KernelController { _ => Ok(0), } } - - /// Filters all primes from the input without using a precalculated list of primes - /// for divisibility checks - pub fn filter_primes_simple(&self, input: Vec) -> ocl::Result { - let input_buffer = self.pro_que.buffer_builder().len(input.len()).build()?; - input_buffer.write(&input[..]).enq()?; - - let output_buffer = self - .pro_que - .buffer_builder() - .len(input.len()) - .fill_val(0u8) - .build()?; - - let kernel = self - .pro_que - .kernel_builder("check_prime") - .arg(&input_buffer) - .arg(&output_buffer) - .global_work_size(input.len()) - .build()?; - - let start = Instant::now(); - unsafe { - kernel.enq()?; - } - - let mut output = vec![0u8; output_buffer.len()]; - output_buffer.read(&mut output).enq()?; - let gpu_calc_duration = start.elapsed(); - println!( - "GPU IO + Calculation took {} ms", - gpu_calc_duration.as_secs_f64() * 1000f64 - ); - - let filter_start = Instant::now(); - let primes = input - .iter() - .enumerate() - .filter(|(index, _)| output[*index] == 1) - .map(|(_, v)| *v) - .collect::>(); - - Ok(PrimeCalculationResult { - primes, - filter_duration: filter_start.elapsed(), - gpu_duration: gpu_calc_duration, - }) - } - - /// Filters the primes from a list of numbers by using a precalculated list of primes to check - /// for divisibility - pub fn filter_primes(&self, input: Vec) -> ocl::Result { - lazy_static::lazy_static! {static ref PRIME_CACHE: Arc>> = Arc::new(Mutex::new(Vec::new()));} - if PRIME_CACHE.lock().len() == 0 { - PRIME_CACHE.lock().append(&mut get_primes( - (*input.iter().max().unwrap_or(&1024) as f64).sqrt().ceil() as u64, - )); - } - - let prime_buffer = self - .pro_que - .buffer_builder() - .len(PRIME_CACHE.lock().len()) - .build()?; - - prime_buffer.write(&PRIME_CACHE.lock()[..]).enq()?; - - let input_buffer = self.pro_que.buffer_builder().len(input.len()).build()?; - input_buffer.write(&input[..]).enq()?; - - let output_buffer = self - .pro_que - .buffer_builder() - .len(input.len()) - .fill_val(0u8) - .build()?; - - let kernel = self - .pro_que - .kernel_builder("check_prime_cached") - .arg(prime_buffer.len() as u32) - .arg(&prime_buffer) - .arg(&input_buffer) - .arg(&output_buffer) - .global_work_size(input.len()) - .build()?; - - let start = Instant::now(); - unsafe { - kernel.enq()?; - } - - let mut output = vec![0u8; output_buffer.len()]; - output_buffer.read(&mut output).enq()?; - - let gpu_calc_duration = start.elapsed(); - - println!( - "GPU IO + Calculation took {} ms", - gpu_calc_duration.as_secs_f64() * 1000f64 - ); - - let prime_filter_start = Instant::now(); - let primes = input - .iter() - .enumerate() - .filter(|(index, _)| output[*index] == 1) - .map(|(_, v)| *v) - .collect::>(); - let filter_duration = prime_filter_start.elapsed(); - - let prime_calc_start = Instant::now(); - let mut prime_cache = PRIME_CACHE.lock(); - - if (prime_cache.len() + primes.len()) * size_of::() - < self.available_memory()? as usize / 4 - { - prime_cache.append(&mut primes.clone()); - prime_cache.sort(); - prime_cache.dedup(); - } - let cache_duration = prime_calc_start.elapsed(); - println!( - "Prime caching took: {} ms, size: {}", - cache_duration.as_secs_f64() * 1000f64, - prime_cache.len(), - ); - - Ok(PrimeCalculationResult { - primes, - gpu_duration: gpu_calc_duration, - filter_duration, - }) - } -} - -/// Returns a list of prime numbers that can be used to speed up the divisibility check -fn get_primes(max_number: u64) -> Vec { - let start = Instant::now(); - let mut primes = Vec::with_capacity((max_number as f64).sqrt() as usize); - let mut num = 1; - - while num < max_number { - let mut is_prime = true; - - if num == 2 || num == 3 { - is_prime = true; - } else if num == 1 || num % 2 == 0 { - is_prime = false; - } else { - let check_stop = (num as f64).sqrt().ceil() as u64; - - if check_stop <= 9 { - for i in (3..check_stop).step_by(2) { - if num % i == 0 { - is_prime = false; - } - } - } else { - for i in (9..(check_stop + 6)).step_by(6) { - if num % (i - 2) == 0 || num % (i - 4) == 0 { - is_prime = false; - } - } - } - } - if is_prime { - primes.push(num) - } - num += 2; - } - println!( - "Generated {} primes on the cpu in {} ms", - primes.len(), - start.elapsed().as_secs_f64() * 1000f64, - ); - - primes -} - -#[allow(dead_code)] -pub fn is_prime(number: u64) -> bool { - if number == 2 || number == 3 { - return true; - } - if number == 1 || number % 2 == 0 { - return false; - } - let limit = (number as f64).sqrt().ceil() as u64; - for i in (3..limit).step_by(2) { - if number % i == 0 { - return false; - } - } - - return true; -} - -pub struct PrimeCalculationResult { - pub primes: Vec, - pub gpu_duration: Duration, - pub filter_duration: Duration, } diff --git a/src/kernel_controller/primes.rs b/src/kernel_controller/primes.rs new file mode 100644 index 0000000..059ce41 --- /dev/null +++ b/src/kernel_controller/primes.rs @@ -0,0 +1,218 @@ +/* + * opencl demos with rust + * Copyright (C) 2020 trivernis + * See LICENSE for more information + */ + +use crate::kernel_controller::KernelController; +use parking_lot::Mutex; +use std::mem::size_of; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +pub struct PrimeCalculationResult { + pub primes: Vec, + pub gpu_duration: Duration, + pub filter_duration: Duration, +} + +impl KernelController { + /// Filters all primes from the input without using a precalculated list of primes + /// for divisibility checks + pub fn filter_primes_simple(&self, input: Vec) -> ocl::Result { + let input_buffer = self.pro_que.buffer_builder().len(input.len()).build()?; + input_buffer.write(&input[..]).enq()?; + + let output_buffer = self + .pro_que + .buffer_builder() + .len(input.len()) + .fill_val(0u8) + .build()?; + + let kernel = self + .pro_que + .kernel_builder("check_prime") + .arg(&input_buffer) + .arg(&output_buffer) + .global_work_size(input.len()) + .build()?; + + let start = Instant::now(); + unsafe { + kernel.enq()?; + } + + self.pro_que.finish()?; + let gpu_calc_duration = start.elapsed(); + + let mut output = vec![0u8; output_buffer.len()]; + output_buffer.read(&mut output).enq()?; + println!( + "GPU IO + Calculation took {} ms", + gpu_calc_duration.as_secs_f64() * 1000f64 + ); + + let filter_start = Instant::now(); + let primes = map_gpu_prime_result(input, output); + + Ok(PrimeCalculationResult { + primes, + filter_duration: filter_start.elapsed(), + gpu_duration: gpu_calc_duration, + }) + } + + /// Filters the primes from a list of numbers by using a precalculated list of primes to check + /// for divisibility + pub fn filter_primes(&self, input: Vec) -> ocl::Result { + lazy_static::lazy_static! {static ref PRIME_CACHE: Arc>> = Arc::new(Mutex::new(Vec::new()));} + if PRIME_CACHE.lock().len() == 0 { + PRIME_CACHE.lock().append(&mut get_primes( + (*input.iter().max().unwrap_or(&1024) as f64).sqrt().ceil() as u64, + )); + } + + let prime_buffer = self + .pro_que + .buffer_builder() + .len(PRIME_CACHE.lock().len()) + .build()?; + + prime_buffer.write(&PRIME_CACHE.lock()[..]).enq()?; + + let input_buffer = self.pro_que.buffer_builder().len(input.len()).build()?; + input_buffer.write(&input[..]).enq()?; + + let output_buffer = self + .pro_que + .buffer_builder() + .len(input.len()) + .fill_val(0u8) + .build()?; + + let kernel = self + .pro_que + .kernel_builder("check_prime_cached") + .arg(prime_buffer.len() as u32) + .arg(&prime_buffer) + .arg(&input_buffer) + .arg(&output_buffer) + .global_work_size(input.len()) + .build()?; + + let start = Instant::now(); + unsafe { + kernel.enq()?; + } + + self.pro_que.finish()?; + let gpu_calc_duration = start.elapsed(); + + let mut output = vec![0u8; output_buffer.len()]; + output_buffer.read(&mut output).enq()?; + + println!( + "GPU IO + Calculation took {} ms", + gpu_calc_duration.as_secs_f64() * 1000f64 + ); + + let prime_filter_start = Instant::now(); + let primes = map_gpu_prime_result(input, output); + let filter_duration = prime_filter_start.elapsed(); + + let prime_calc_start = Instant::now(); + let mut prime_cache = PRIME_CACHE.lock(); + + if (prime_cache.len() + primes.len()) * size_of::() + < self.available_memory()? as usize / 4 + { + prime_cache.append(&mut primes.clone()); + prime_cache.sort(); + prime_cache.dedup(); + } + let cache_duration = prime_calc_start.elapsed(); + println!( + "Prime caching took: {} ms, size: {}", + cache_duration.as_secs_f64() * 1000f64, + prime_cache.len(), + ); + + Ok(PrimeCalculationResult { + primes, + gpu_duration: gpu_calc_duration, + filter_duration, + }) + } +} + +/// Returns a list of prime numbers that can be used to speed up the divisibility check +fn get_primes(max_number: u64) -> Vec { + let start = Instant::now(); + let mut primes = Vec::with_capacity((max_number as f64).sqrt() as usize); + let mut num = 1; + + while num < max_number { + let mut is_prime = true; + + if num == 2 || num == 3 { + is_prime = true; + } else if num == 1 || num % 2 == 0 { + is_prime = false; + } else { + let check_stop = (num as f64).sqrt().ceil() as u64; + + if check_stop <= 9 { + for i in (3..check_stop).step_by(2) { + if num % i == 0 { + is_prime = false; + } + } + } else { + for i in (9..(check_stop + 6)).step_by(6) { + if num % (i - 2) == 0 || num % (i - 4) == 0 { + is_prime = false; + } + } + } + } + if is_prime { + primes.push(num) + } + num += 2; + } + println!( + "Generated {} primes on the cpu in {} ms", + primes.len(), + start.elapsed().as_secs_f64() * 1000f64, + ); + + primes +} + +pub fn is_prime(number: u64) -> bool { + if number == 2 || number == 3 { + return true; + } + if number == 1 || number % 2 == 0 { + return false; + } + let limit = (number as f64).sqrt().ceil() as u64; + for i in (3..limit).step_by(2) { + if number % i == 0 { + return false; + } + } + + return true; +} + +#[inline] +fn map_gpu_prime_result(input: Vec, output: Vec) -> Vec { + input + .into_iter() + .enumerate() + .filter(|(index, _)| output[*index] == 1) + .map(|(_, v)| v) + .collect::>() +} diff --git a/src/main.rs b/src/main.rs index e31d19c..3783564 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,7 +4,8 @@ * See LICENSE for more information */ -use crate::kernel_controller::{is_prime, KernelController}; +use crate::kernel_controller::primes::is_prime; +use crate::kernel_controller::KernelController; use rayon::prelude::*; use std::fs::{File, OpenOptions}; use std::io::{BufWriter, Write};