From d4081c7debddf03648e210e66d680ded69c2f431 Mon Sep 17 00:00:00 2001
From: Max Ehrlicher-Schmidt
Date: Fri, 15 Jan 2021 14:46:20 +0100
Subject: [PATCH] Delete unnecessary scripts

---
 LIFE.py         | 101 --------------------------------
 README.md       |   2 +-
 customKernel.py |  37 ------------
 main.py         | 153 ++++++++++++++++++++++++++++++++++++++++++------
 primesGPU.py    | 142 --------------------------------------------
 5 files changed, 135 insertions(+), 300 deletions(-)
 delete mode 100644 LIFE.py
 delete mode 100644 customKernel.py
 delete mode 100644 primesGPU.py

diff --git a/LIFE.py b/LIFE.py
deleted file mode 100644
index cbee769..0000000
--- a/LIFE.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Iterative Conway's game of life in Python / CUDA C
-# this version is meant to illustrate the use of shared kernel memory in CUDA.
-# written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA"
-
-import pycuda.autoinit
-import pycuda.driver as drv
-from pycuda import gpuarray
-from pycuda.compiler import SourceModule
-import numpy as np
-import matplotlib.pyplot as plt
-from time import time
-
-shared_ker = SourceModule("""
-#define _iters 1000000
-
-#define _X ( threadIdx.x + blockIdx.x * blockDim.x )
-#define _Y ( threadIdx.y + blockIdx.y * blockDim.y )
-
-#define _WIDTH ( blockDim.x * gridDim.x )
-#define _HEIGHT ( blockDim.y * gridDim.y )
-
-#define _XM(x) ( (x + _WIDTH) % _WIDTH )
-#define _YM(y) ( (y + _HEIGHT) % _HEIGHT )
-
-#define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH )
-
-// return the number of living neighbors for a given cell
-__device__ int nbrs(int x, int y, int * in)
-{
-    return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \
-             + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \
-             + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] );
-}
-
-__global__ void conway_ker_shared(int * p_lattice, int iters)
-{
-    // x, y are the appropriate values for the cell covered by this thread
-    int x = _X, y = _Y;
-    __shared__ int lattice[32*32];
-
-
-    lattice[_INDEX(x,y)] = p_lattice[_INDEX(x,y)];
-    __syncthreads();
-
-    for (int i = 0; i < iters; i++)
-    {
-
-        // count the number of neighbors around the current cell
-        int n = nbrs(x, y, lattice);
-
-        int cell_value;
-
-
-        // if the current cell is alive, then determine if it lives or dies for the next generation.
-        if ( lattice[_INDEX(x,y)] == 1)
-            switch(n)
-            {
-                // if the cell is alive: it remains alive only if it has 2 or 3 neighbors.
-                case 2:
-                case 3: cell_value = 1;
-                        break;
-                default: cell_value = 0;
-            }
-        else if( lattice[_INDEX(x,y)] == 0 )
-            switch(n)
-            {
-                // a dead cell comes to life only if it has 3 neighbors that are alive.
-                case 3: cell_value = 1;
-                        break;
-                default: cell_value = 0;
-            }
-
-        __syncthreads();
-        lattice[_INDEX(x,y)] = cell_value;
-        __syncthreads();
-
-    }
-
-    __syncthreads();
-    p_lattice[_INDEX(x,y)] = lattice[_INDEX(x,y)];
-    __syncthreads();
-
-}
-""")
-
-conway_ker_shared = shared_ker.get_function("conway_ker_shared")
-
-if __name__ == '__main__':
-    # set lattice size
-    N = 32
-
-    lattice = np.int32(np.random.choice([1, 0], N * N, p=[0.25, 0.75]).reshape(N, N))
-    lattice_gpu = gpuarray.to_gpu(lattice)
-
-    conway_ker_shared(lattice_gpu, np.int32(1000000), grid=(1, 1, 1), block=(32, 32, 1))
-
-    fig = plt.figure(1)
-    plt.imshow(lattice_gpu.get())
-    plt.show()
-
-
diff --git a/README.md b/README.md
index 0ade61f..b5ac008 100644
--- a/README.md
+++ b/README.md
@@ -1 +1 @@
-helloCuda
+Cuda Benchmark
diff --git a/customKernel.py b/customKernel.py
deleted file mode 100644
index 7583d91..0000000
--- a/customKernel.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import numpy as np
-import pycuda.autoinit
-from pycuda import gpuarray
-from time import time
-from pycuda.elementwise import ElementwiseKernel
-
-host_data = np.float32(np.random.random(50000000))
-
-gpu_2x_ker = ElementwiseKernel(
-    "float *in, float *out",
-    "out[i] = 2*in[i];",
-    "gpu_2x_ker")
-
-# warm up
-test_data = gpuarray.to_gpu(host_data)
-gpu_2x_ker(test_data, gpuarray.empty_like(test_data))
-
-
-def speed_comparison():
-    t1 = time()
-    host_data_2x = host_data * np.float32(2)
-    t2 = time()
-    print('total time to compute on CPU: %f' % (t2 - t1))
-    device_data = gpuarray.to_gpu(host_data)
-    # allocate memory for output
-    device_data_2x = gpuarray.empty_like(device_data)
-    t1 = time()
-    gpu_2x_ker(device_data, device_data_2x)
-    t2 = time()
-    from_device = device_data_2x.get()
-    print('total time to compute on GPU: %f' % (t2 - t1))
-    print(
-        'Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x)))
-
-
-if __name__ == '__main__':
-    speed_comparison()
diff --git a/main.py b/main.py
index 74fcc65..86a59aa 100644
--- a/main.py
+++ b/main.py
@@ -1,30 +1,145 @@
+import math
+from time import time
+
+import pycuda.autoinit
+import pycuda.driver as drv
 import numpy as np
 from pycuda import gpuarray
+from pycuda.compiler import SourceModule
-# -- initialize the device
-import pycuda.autoinit
+from optparse import OptionParser
+
+ker = SourceModule("""
+__global__ void
+check_prime(unsigned long long *input, bool *output)
+{
+    int i = threadIdx.x + blockDim.x * blockIdx.x;
+
+    unsigned long long num = input[i];
+    if (num == 2) {
+        output[i] = true;
+        return;
+    } else if (num < 3 || num % 2 == 0) {
+        return;
+    }
+    unsigned long long limit = (long) sqrt((double) num) + 1;
+    for (unsigned long long i = 3; i <= limit; i += 2) {
+        if (num % i == 0) {
+            return;
+        }
+    }
+    output[i] = true;
+}
+""")
+
+ker2 = SourceModule("""
+__global__ void check_prime2(const unsigned long long *IN, bool *OUT) {
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned long long num = IN[id];
+    unsigned long long limit = (unsigned long long) sqrt((double) num) + 1;
+
+    if (num == 2 || num == 3) {
+        OUT[id] = true;
+        return;
+    } else if (num == 1 || num % 2 == 0) {
+        return;
+    }
+    if (limit < 9) {
+        for (unsigned long long i = 3; i <= limit; i++) {
+            if (num % i == 0) {
+                return;
+            }
+        }
+    } else {
+        if (num > 3 && num % 3 == 0) {
+            return;
+        }
+        for (unsigned long long i = 9; i <= (limit + 6); i += 6) {
+            if (num % (i - 2) == 0 || num % (i - 4) == 0) {
+                return;
+            }
+        }
+    }
+
+    OUT[id] = true;
+}
+""")
+
+
+def calc_primes(start: int = 1, grid_size: int = 1000, block_size: int = 1024):
+    check_prime = ker2.get_function("check_prime2")
+
+    primes = []
+    if start < 2:
+        primes = [2]
+        start = 3
+    if start % 2 == 0:
+        start = start + 1
+
+    startEvent = drv.Event()
+    endEvent = drv.Event()
+
+    testvec = np.arange(start, block_size * grid_size * 2 + start, step=2).astype(np.ulonglong)
+
+    testvec_gpu = gpuarray.to_gpu(testvec)
+    outvec_gpu = gpuarray.to_gpu(np.full(block_size * grid_size, False, dtype=bool))
+    startEvent.record()
+    check_prime(testvec_gpu, outvec_gpu, block=(block_size, 1, 1), grid=(grid_size, 1, 1))
+    endEvent.record()
+    endEvent.synchronize()
+    kernel_execution_time = startEvent.time_till(endEvent)
+
+    result = outvec_gpu.get()
+
+    for idx, val in enumerate(result):
+        if val:
+            primes.append(testvec[idx])
-dev = pycuda.autoinit.device
+    print('checked ' + str(block_size * grid_size) + ' numbers' + ' (' + str(start) + ' - ' + str(
+        start + block_size * grid_size) + ')')
+    print('last prime: ' + str(primes[-1]))
+    print('The GPU needed ' + str(kernel_execution_time) + ' milliseconds')
-print(dev.name())
-print('\t Total Memory: {} megabytes'.format(dev.total_memory() // (1024 ** 2)))
+    with open(options.timings_output, 'a') as file:
+        file.write(str(start) + "," + str(kernel_execution_time) + "," + str((block_size * grid_size)/(kernel_execution_time/1000)) + "\n")
-device_attributes = {}
-for k, v in dev.get_attributes().items():
-    device_attributes[str(k)] = v
-    print('\t ' + str(k) + ': ' + str(v))
+    return primes
-host_data = np.array([1, 2, 3, 4, 5], dtype=np.float32)
-host_data_2 = np.array([7, 12, 3, 5, 4], dtype=np.float32)
-device_data = gpuarray.to_gpu(host_data)
-device_data_2 = gpuarray.to_gpu(host_data_2)
+if __name__ == "__main__":
+    parser = OptionParser()
+    parser.add_option("-e", "--end", dest="end",
+                      help="numbers to check without even numbers", default="5000000000", type="int")
+    parser.add_option("--numbers-per-step", dest="numbers_per_step",
+                      help="amount of uneven numbers checked in each step (even number are skipped)", default="8000000",
+                      type="int")
+    parser.add_option("--block_size", dest="block_size",
+                      help="number of threads per block, max = 1024", default="1024",
+                      type="int")
+    parser.add_option("--output", dest="output",
+                      help="name of the file, where the primes should be stored", default="primes.txt", type="string")
+    parser.add_option("--timings-output", dest="timings_output",
+                      help="name of the csv file, where the timing is logged as csv", default="timings.csv",
+                      type="string")
+    parser.add_option("--save-primes", dest="save_primes",
+                      help="whether the calculated primes should be saved in a txt file", default=False)
+    (options, args) = parser.parse_args()
-print(host_data * host_data_2)
-print((device_data * device_data_2).get())
+    block_size = options.block_size
+    start = 1
+    grid_size = int(math.ceil(options.numbers_per_step / block_size))
+    resulting_numbers_per_step = block_size * grid_size
+    last_number_checked = start - 1
-print(host_data / 2)
-print((device_data / 2).get())
+    with open(options.timings_output, 'w') as file:
+        file.write("offset,duration,numbers_per_second\n")
+    if options.save_primes:
+        with open(options.output, 'w') as file:
+            file.write("")
-print(host_data - host_data_2)
-print((device_data - device_data_2).get())
+    while last_number_checked < options.end:
+        calculated_primes = calc_primes(last_number_checked + 1, grid_size, block_size)
+        if options.save_primes:
+            with open(options.output, 'a') as file:
+                file.write("\n".join([str(p) for p in calculated_primes]))
+        last_number_checked = last_number_checked + resulting_numbers_per_step * 2
diff --git a/primesGPU.py b/primesGPU.py
deleted file mode 100644
index 9df20af..0000000
--- a/primesGPU.py
+++ /dev/null
@@ -1,142 +0,0 @@
-import math
-from time import time
-
-import pycuda.autoinit
-import pycuda.driver as drv
-import numpy as np
-from pycuda import gpuarray
-from pycuda.compiler import SourceModule
-
-from optparse import OptionParser
-
-ker = SourceModule("""
-__global__ void
-check_prime(unsigned long long *input, bool *output)
-{
-    int i = threadIdx.x + blockDim.x * blockIdx.x;
-
-    unsigned long long num = input[i];
-    if (num == 2) {
-        output[i] = true;
-        return;
-    } else if (num < 3 || num % 2 == 0) {
-        return;
-    }
-    unsigned long long limit = (long) sqrt((double) num) + 1;
-    for (unsigned long long i = 3; i <= limit; i += 2) {
-        if (num % i == 0) {
-            return;
-        }
-    }
-    output[i] = true;
-}
-""")
-
-ker2 = SourceModule("""
-__global__ void check_prime2(const unsigned long long *IN, bool *OUT) {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    unsigned long long num = IN[id];
-    unsigned long long limit = (unsigned long long) sqrt((double) num) + 1;
-
-    if (num == 2 || num == 3) {
-        OUT[id] = true;
-        return;
-    } else if (num == 1 || num % 2 == 0) {
-        return;
-    }
-    if (limit < 9) {
-        for (unsigned long long i = 3; i <= limit; i++) {
-            if (num % i == 0) {
-                return;
-            }
-        }
-    } else {
-        if (num > 3 && num % 3 == 0) {
-            return;
-        }
-        for (unsigned long long i = 9; i <= (limit + 6); i += 6) {
-            if (num % (i - 2) == 0 || num % (i - 4) == 0) {
-                return;
-            }
-        }
-    }
-
-    OUT[id] = true;
-}
-""")
-
-
-def calc_primes(start: int = 1, grid_size: int = 1000, block_size: int = 1024):
-    check_prime = ker2.get_function("check_prime2")
-
-    primes = []
-    if start < 2:
-        primes = [2]
-        start = 3
-    if start % 2 == 0:
-        start = start + 1
-
-    startEvent = drv.Event()
-    endEvent = drv.Event()
-
-    testvec = np.arange(start, block_size * grid_size * 2 + start, step=2).astype(np.ulonglong)
-
-    testvec_gpu = gpuarray.to_gpu(testvec)
-    outvec_gpu = gpuarray.to_gpu(np.full(block_size * grid_size, False, dtype=bool))
-    startEvent.record()
-    check_prime(testvec_gpu, outvec_gpu, block=(block_size, 1, 1), grid=(grid_size, 1, 1))
-    endEvent.record()
-    endEvent.synchronize()
-    kernel_execution_time = startEvent.time_till(endEvent)
-
-    result = outvec_gpu.get()
-
-    for idx, val in enumerate(result):
-        if val:
-            primes.append(testvec[idx])
-
-    print('checked ' + str(block_size * grid_size) + ' numbers' + ' (' + str(start) + ' - ' + str(
-        start + block_size * grid_size) + ')')
-    print('last prime: ' + str(primes[-1]))
-    print('The GPU needed ' + str(kernel_execution_time) + ' milliseconds')
-
-    with open(options.timings_output, 'a') as file:
-        file.write(str(start) + "," + str(kernel_execution_time) + "," + str((block_size * grid_size)/(kernel_execution_time/1000)) + "\n")
-
-    return primes
-
-
-if __name__ == "__main__":
-    parser = OptionParser()
-    parser.add_option("-e", "--end", dest="end",
-                      help="numbers to check without even numbers", default="5000000000", type="int")
-    parser.add_option("--numbers-per-step", dest="numbers_per_step",
-                      help="amount of uneven numbers checked in each step (even number are skipped)", default="8000000",
-                      type="int")
-    parser.add_option("--output", dest="output",
-                      help="name of the file, where the primes should be stored", default="primes.txt", type="string")
-    parser.add_option("--timings-output", dest="timings_output",
-                      help="name of the csv file, where the timing is logged as csv", default="timings.csv",
-                      type="string")
-    parser.add_option("--save-primes", dest="save_primes",
-                      help="whether the calculated primes should be saved in a txt file", default=False)
-    (options, args) = parser.parse_args()
-
-    block_size = 1024
-    start = 1
-    grid_size = int(math.ceil(options.numbers_per_step / block_size))
-    resulting_numbers_per_step = block_size * grid_size
-    last_number_checked = start - 1
-
-    with open(options.timings_output, 'w') as file:
-        file.write("offset,duration,numbers_per_second\n")
-    if options.save_primes:
-        with open(options.output, 'w') as file:
-            file.write("")
-
-    while last_number_checked < options.end:
-        calculated_primes = calc_primes(last_number_checked + 1, grid_size, block_size)
-        if options.save_primes:
-            with open(options.output, 'a') as file:
-                file.write("\n".join([str(p) for p in calculated_primes]))
-        last_number_checked = last_number_checked + resulting_numbers_per_step * 2
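
Note (outside the patch): calc_primes() in the new main.py can be cross-checked against a
plain CPU trial-division pass over the same range. This is a minimal sketch, assuming a
CUDA-capable GPU with PyCUDA and the CUDA toolkit installed; cpu_is_prime and the
SimpleNamespace stand-in for main.py's module-level `options` name are illustrative only
and not part of this repository.

    import types
    import main  # the main.py added by this patch

    # calc_primes() appends to options.timings_output, and main.py only defines
    # `options` when run as a script, so provide a stand-in before calling it directly.
    main.options = types.SimpleNamespace(timings_output="timings_check.csv")

    def cpu_is_prime(n: int) -> bool:
        # Same idea as check_prime2: trial division over 6k +/- 1 candidates.
        if n < 2:
            return False
        if n in (2, 3):
            return True
        if n % 2 == 0 or n % 3 == 0:
            return False
        i = 5
        while i * i <= n:
            if n % i == 0 or n % (i + 2) == 0:
                return False
            i += 6
        return True

    # One block of 256 threads starting at 1 covers 2 plus the odd numbers 3..513.
    gpu_primes = [int(p) for p in main.calc_primes(start=1, grid_size=1, block_size=256)]
    cpu_primes = [n for n in range(2, 514) if cpu_is_prime(n)]
    assert gpu_primes == cpu_primes, "GPU and CPU prime lists differ"
    print("check_prime2 matches the CPU reference for 2..513")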