diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..95b0275
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+.idea
diff --git a/LIFE.py b/LIFE.py
new file mode 100644
index 0000000..cbee769
--- /dev/null
+++ b/LIFE.py
@@ -0,0 +1,101 @@
+# Iterative Conway's Game of Life in Python / CUDA C.
+# This version is meant to illustrate the use of shared kernel memory in CUDA.
+# Written by Brian Tuomanen for "Hands-On GPU Programming with Python and CUDA".
+
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+import matplotlib.pyplot as plt
+from time import time
+
+shared_ker = SourceModule("""
+#define _iters 1000000
+
+#define _X ( threadIdx.x + blockIdx.x * blockDim.x )
+#define _Y ( threadIdx.y + blockIdx.y * blockDim.y )
+
+#define _WIDTH ( blockDim.x * gridDim.x )
+#define _HEIGHT ( blockDim.y * gridDim.y )
+
+// wrap coordinates around the lattice edges (toroidal topology)
+#define _XM(x) ( ((x) + _WIDTH) % _WIDTH )
+#define _YM(y) ( ((y) + _HEIGHT) % _HEIGHT )
+
+#define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH )
+
+// return the number of living neighbors of a given cell
+__device__ int nbrs(int x, int y, int *in)
+{
+    return ( in[_INDEX(x-1, y+1)] + in[_INDEX(x-1, y)] + in[_INDEX(x-1, y-1)] \
+           + in[_INDEX(x,   y+1)]                      + in[_INDEX(x,   y-1)] \
+           + in[_INDEX(x+1, y+1)] + in[_INDEX(x+1, y)] + in[_INDEX(x+1, y-1)] );
+}
+
+__global__ void conway_ker_shared(int *p_lattice, int iters)
+{
+    // x, y are the coordinates of the cell covered by this thread
+    int x = _X, y = _Y;
+
+    // the shared copy of the lattice must cover the whole world, so this
+    // kernel only works when launched as a single 32x32 block
+    __shared__ int lattice[32*32];
+
+    lattice[_INDEX(x,y)] = p_lattice[_INDEX(x,y)];
+    __syncthreads();
+
+    for (int i = 0; i < iters; i++)
+    {
+        // count the living neighbors of the current cell
+        int n = nbrs(x, y, lattice);
+
+        int cell_value = 0;
+
+        // if the current cell is alive, determine whether it survives
+        // into the next generation
+        if (lattice[_INDEX(x,y)] == 1)
+            switch (n)
+            {
+                // a live cell stays alive only with 2 or 3 living neighbors
+                case 2:
+                case 3: cell_value = 1;
+                        break;
+                default: cell_value = 0;
+            }
+        else if (lattice[_INDEX(x,y)] == 0)
+            switch (n)
+            {
+                // a dead cell comes to life only with exactly 3 living neighbors
+                case 3: cell_value = 1;
+                        break;
+                default: cell_value = 0;
+            }
+
+        __syncthreads();
+        lattice[_INDEX(x,y)] = cell_value;
+        __syncthreads();
+    }
+
+    __syncthreads();
+    p_lattice[_INDEX(x,y)] = lattice[_INDEX(x,y)];
+    __syncthreads();
+}
+""")
+
+conway_ker_shared = shared_ker.get_function("conway_ker_shared")
+
+if __name__ == '__main__':
+    # set the lattice size; must match the 32x32 block used in the launch below
+    N = 32
+
+    lattice = np.int32(np.random.choice([1, 0], N * N, p=[0.25, 0.75]).reshape(N, N))
+    lattice_gpu = gpuarray.to_gpu(lattice)
+
+    conway_ker_shared(lattice_gpu, np.int32(1000000), grid=(1, 1, 1), block=(32, 32, 1))
+
+    fig = plt.figure(1)
+    plt.imshow(lattice_gpu.get())
+    plt.show()
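Editor's note: because LIFE.py keeps the whole lattice in one block's shared memory, its update rule is easy to cross-check on the host. The sketch below is an editorial addition (not part of the original listing): one generation of the same rule in NumPy, with `np.roll` providing the toroidal wrap-around that `_INDEX` implements in the kernel.

```python
import numpy as np

def cpu_life_step(lat):
    # sum the eight neighbors with wrap-around at the edges, like _INDEX
    n = sum(np.roll(np.roll(lat, dy, axis=0), dx, axis=1)
            for dy in (-1, 0, 1) for dx in (-1, 0, 1)
            if (dy, dx) != (0, 0))
    # alive next generation: exactly 3 neighbors, or 2 if already alive
    return ((n == 3) | ((lat == 1) & (n == 2))).astype(np.int32)
```

Iterating `cpu_life_step` a modest number of generations, with the kernel launched for the same `iters`, should match `lattice_gpu.get()` under `np.array_equal`, since the rule is deterministic.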
diff --git a/customKernel.py b/customKernel.py
new file mode 100644
index 0000000..7583d91
--- /dev/null
+++ b/customKernel.py
@@ -0,0 +1,37 @@
+import numpy as np
+import pycuda.autoinit
+from pycuda import gpuarray
+from time import time
+from pycuda.elementwise import ElementwiseKernel
+
+host_data = np.float32(np.random.random(50000000))
+
+gpu_2x_ker = ElementwiseKernel(
+    "float *in, float *out",
+    "out[i] = 2*in[i];",
+    "gpu_2x_ker")
+
+# warm up: the first call compiles and caches the kernel, keeping it out of the timing
+test_data = gpuarray.to_gpu(host_data)
+gpu_2x_ker(test_data, gpuarray.empty_like(test_data))
+
+
+def speed_comparison():
+    t1 = time()
+    host_data_2x = host_data * np.float32(2)
+    t2 = time()
+    print('total time to compute on CPU: %f' % (t2 - t1))
+    device_data = gpuarray.to_gpu(host_data)
+    # allocate memory for the output
+    device_data_2x = gpuarray.empty_like(device_data)
+    t1 = time()
+    gpu_2x_ker(device_data, device_data_2x)
+    t2 = time()
+    from_device = device_data_2x.get()
+    print('total time to compute on GPU: %f' % (t2 - t1))
+    print(
+        'Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x)))
+
+
+if __name__ == '__main__':
+    speed_comparison()
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..74fcc65
--- /dev/null
+++ b/main.py
@@ -0,0 +1,30 @@
+import numpy as np
+from pycuda import gpuarray
+
+# -- initialize the device
+import pycuda.autoinit
+
+dev = pycuda.autoinit.device
+
+print(dev.name())
+print('\t Total Memory: {} megabytes'.format(dev.total_memory() // (1024 ** 2)))
+
+device_attributes = {}
+for k, v in dev.get_attributes().items():
+    device_attributes[str(k)] = v
+    print('\t ' + str(k) + ': ' + str(v))
+
+host_data = np.array([1, 2, 3, 4, 5], dtype=np.float32)
+host_data_2 = np.array([7, 12, 3, 5, 4], dtype=np.float32)
+
+device_data = gpuarray.to_gpu(host_data)
+device_data_2 = gpuarray.to_gpu(host_data_2)
+
+print(host_data * host_data_2)
+print((device_data * device_data_2).get())
+
+print(host_data / 2)
+print((device_data / 2).get())
+
+print(host_data - host_data_2)
+print((device_data - device_data_2).get())
diff --git a/multiplyCPUGPU.py b/multiplyCPUGPU.py
new file mode 100644
index 0000000..afa4513
--- /dev/null
+++ b/multiplyCPUGPU.py
@@ -0,0 +1,30 @@
+import numpy as np
+from pycuda import gpuarray
+from time import time
+
+# -- initialize the device
+import pycuda.autoinit
+
+# warm-up call: it triggers compilation of the element-wise kernel, which is
+# then cached, so the GPU calculation timed below is not distorted
+print((gpuarray.to_gpu(np.array([1], dtype=np.float32)) * 1).get())
+
+# compare CPU and GPU
+host_data = np.float32(np.random.random(50000000))
+
+t1 = time()
+host_data_2x = host_data * np.float32(2)
+t2 = time()
+
+print('total time to compute on CPU: %f' % (t2 - t1))
+
+device_data = gpuarray.to_gpu(host_data)
+
+t1 = time()
+device_data_2x = device_data * np.float32(2)
+t2 = time()
+
+from_device = device_data_2x.get()
+
+print('total time to compute on GPU: %f' % (t2 - t1))
+print('Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x)))
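Editor's note: a caveat applying to both timing scripts above is that PyCUDA kernel launches are asynchronous, so a `time()` call taken immediately after the multiplication can under-measure the actual compute time (the later `.get()` transfer is what forces completion). A hedged sketch of a stricter measurement using an explicit context synchronize; the setup mirrors multiplyCPUGPU.py:

```python
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from time import time

device_data = gpuarray.to_gpu(np.float32(np.random.random(50000000)))

t1 = time()
device_data_2x = device_data * np.float32(2)  # enqueues the kernel and returns
pycuda.autoinit.context.synchronize()         # block until the GPU has finished
t2 = time()
print('GPU compute time with synchronization: %f' % (t2 - t1))
```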
diff --git a/primesCPU.py b/primesCPU.py
new file mode 100644
index 0000000..abc97dd
--- /dev/null
+++ b/primesCPU.py
@@ -0,0 +1,23 @@
+import math
+from time import time
+
+lower = 3
+upper = 2000000
+
+primes = [2]
+
+t1 = time()
+if (lower % 2) == 0:
+    lower = lower + 1
+for num in range(lower, upper + 1, 2):
+    # trial division by every candidate up to sqrt(num)
+    for i in range(2, int(math.sqrt(num) + 1)):
+        if (num % i) == 0:
+            break
+    else:
+        primes.append(num)
+t2 = time()
+
+print(len(primes))
+print(primes)
+print('The CPU needed ' + str(t2 - t1) + ' seconds')
diff --git a/primesGPU.py b/primesGPU.py
new file mode 100644
index 0000000..3d4040c
--- /dev/null
+++ b/primesGPU.py
@@ -0,0 +1,89 @@
+from time import time
+
+import pycuda.autoinit
+import pycuda.driver as drv
+import numpy as np
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+
+# simple variant: trial division by the odd candidates up to sqrt(num)
+ker = SourceModule("""
+__global__ void
+check_prime(unsigned long long *input, bool *output)
+{
+    int i = threadIdx.x + blockDim.x * blockIdx.x;
+
+    unsigned long long num = input[i];
+    if (num == 2) {
+        output[i] = true;
+        return;
+    } else if (num < 3 || num % 2 == 0) {
+        return;
+    }
+    unsigned long long limit = (unsigned long long) sqrt((double) num) + 1;
+    for (unsigned long long i = 3; i <= limit; i += 2) {
+        if (num % i == 0) {
+            return;
+        }
+    }
+    output[i] = true;
+}
+""")
+
+# faster variant: after handling 2 and 3, only candidates of the
+# form 6k - 1 and 6k + 1 need to be tested
+ker2 = SourceModule("""
+__global__ void check_prime2(const unsigned long *IN, bool *OUT) {
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned long num = IN[id];
+    unsigned long limit = (unsigned long) sqrt((double) num) + 1;
+
+    if (num == 2 || num == 3) {
+        OUT[id] = true;
+        return;
+    } else if (num == 1 || num % 2 == 0) {
+        return;
+    }
+    if (limit < 9) {
+        for (unsigned long i = 3; i <= limit; i++) {
+            if (num % i == 0) {
+                return;
+            }
+        }
+    } else {
+        if (num > 3 && num % 3 == 0) {
+            return;
+        }
+        // i - 4 and i - 2 enumerate the 6k - 1 and 6k + 1 candidates
+        for (unsigned long i = 9; i <= (limit + 6); i += 6) {
+            if (num % (i - 2) == 0 || num % (i - 4) == 0) {
+                return;
+            }
+        }
+    }
+
+    OUT[id] = true;
+}
+""")
+
+block_size = 1024
+grid_size = 50000
+
+check_prime = ker2.get_function("check_prime2")
+
+# test the odd numbers 1, 3, 5, ... below 2 * block_size * grid_size;
+# the entry at index idx represents the number 2 * idx + 1
+# (np.uint matches the kernel's unsigned long on LP64 platforms such as 64-bit Linux)
+testvec = np.arange(1, block_size * grid_size * 2, step=2).astype(np.uint)
+
+testvec_gpu = gpuarray.to_gpu(testvec)
+outvec_gpu = gpuarray.to_gpu(np.full(block_size * grid_size, False, dtype=bool))
+t1 = time()
+check_prime(testvec_gpu, outvec_gpu, block=(block_size, 1, 1), grid=(grid_size, 1, 1))
+result = outvec_gpu.get()
+t2 = time()
+
+primes = []
+
+# map each True flag back to the number it represents;
+# note that 2 itself is never tested, since testvec holds only odd numbers
+for idx, val in enumerate(result):
+    if val:
+        primes.append(2 * idx + 1)
+
+# print(primes)
+print(len(primes))
+print('checked the ' + str(block_size * grid_size) + ' odd numbers below ' + str(block_size * grid_size * 2))
+print('last prime: ' + str(primes[-1]))
+print('The GPU needed ' + str(t2 - t1) + ' seconds')
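Editor's note: since `testvec` holds only the odd numbers 1, 3, 5, ..., the flag at index `idx` stands for the number `2 * idx + 1`; the original loop appended the bare index, which is corrected above. The reconstruction can also be written as one vectorized expression; a small sketch reusing the variables from primesGPU.py:

```python
import numpy as np

result = outvec_gpu.get()                 # boolean flags copied back from the GPU
primes = 2 * np.flatnonzero(result) + 1   # True at idx  ->  the number 2*idx + 1
print(len(primes), primes[-1])
```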
diff --git a/simpleCustomKernel.py b/simpleCustomKernel.py
new file mode 100644
index 0000000..188f8d1
--- /dev/null
+++ b/simpleCustomKernel.py
@@ -0,0 +1,24 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+import numpy as np
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+
+ker = SourceModule("""
+__global__ void scalar_multiply_kernel(float *outvec, float scalar, float *vec)
+{
+    // one thread per element; indexing by threadIdx.x alone is valid here
+    // because the launch below uses a single 512-thread block
+    int i = threadIdx.x;
+    outvec[i] = scalar*vec[i];
+}
+""")
+
+scalar_multiply_gpu = ker.get_function("scalar_multiply_kernel")
+
+testvec = np.random.randn(512).astype(np.float32)
+testvec_gpu = gpuarray.to_gpu(testvec)
+outvec_gpu = gpuarray.empty_like(testvec_gpu)
+
+scalar_multiply_gpu(outvec_gpu, np.float32(2), testvec_gpu, block=(512, 1, 1), grid=(1, 1, 1))
+print("Does our kernel work correctly? : {}".format(np.allclose(outvec_gpu.get(), 2 * testvec)))
+print(outvec_gpu.get())
+print(2 * testvec)
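Editor's note: because the kernel indexes with `threadIdx.x` alone, it is restricted to vectors that fit in a single block (at most 1024 threads on current NVIDIA GPUs). A hedged sketch of how it might be generalized to arbitrary lengths; the kernel name `scalar_multiply_kernel_n` and the extra `n` bounds parameter are editorial additions, not part of the original file:

```python
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.compiler import SourceModule

ker = SourceModule("""
__global__ void scalar_multiply_kernel_n(float *outvec, float scalar, float *vec, int n)
{
    // global index across all blocks, guarded for the last partial block
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < n)
        outvec[i] = scalar * vec[i];
}
""")

scalar_multiply_n = ker.get_function("scalar_multiply_kernel_n")

n = 100000
vec = np.random.randn(n).astype(np.float32)
vec_gpu = gpuarray.to_gpu(vec)
out_gpu = gpuarray.empty_like(vec_gpu)

block = 256
grid = (n + block - 1) // block  # enough blocks to cover all n elements
scalar_multiply_n(out_gpu, np.float32(2), vec_gpu, np.int32(n),
                  block=(block, 1, 1), grid=(grid, 1, 1))
print(np.allclose(out_gpu.get(), 2 * vec))
```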