Init

4 years ago · 4b5ae5b82b
parent d330fa8526
commit 4b5ae5b82b
8 changed files with 338 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+.idea
--- a/LIFE.py
+++ b/LIFE.py
@ -0,0 +1,101 @@
+# Iterative Conway's game of life in Python / CUDA C
+# this version is meant to illustrate the use of shared kernel memory in CUDA.
+# written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA"
+
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+import numpy as np
+import matplotlib.pyplot as plt
+from time import time
+
+shared_ker = SourceModule("""    
+#define _iters 1000000                       
+
+#define _X  ( threadIdx.x + blockIdx.x * blockDim.x )
+#define _Y  ( threadIdx.y + blockIdx.y * blockDim.y )
+
+#define _WIDTH  ( blockDim.x * gridDim.x )
+#define _HEIGHT ( blockDim.y * gridDim.y  )
+
+#define _XM(x)  ( (x + _WIDTH) % _WIDTH )
+#define _YM(y)  ( (y + _HEIGHT) % _HEIGHT )
+
+#define _INDEX(x,y)  ( _XM(x)  + _YM(y) * _WIDTH )
+
+// return the number of living neighbors for a given cell                
+__device__ int nbrs(int x, int y, int * in)
+{
+     return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \
+                   + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \
+                   + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] );
+}
+
+__global__ void conway_ker_shared(int * p_lattice, int iters)
+{
+   // x, y are the appropriate values for the cell covered by this thread
+   int x = _X, y = _Y;
+   __shared__ int lattice[32*32];
+
+
+   lattice[_INDEX(x,y)] = p_lattice[_INDEX(x,y)];
+   __syncthreads();
+
+   for (int i = 0; i < iters; i++)
+   {
+
+       // count the number of neighbors around the current cell
+       int n = nbrs(x, y, lattice);
+
+       int cell_value;
+
+
+        // if the current cell is alive, then determine if it lives or dies for the next generation.
+        if ( lattice[_INDEX(x,y)] == 1)
+           switch(n)
+           {
+              // if the cell is alive: it remains alive only if it has 2 or 3 neighbors.
+              case 2:
+              case 3: cell_value = 1;
+                      break;
+              default: cell_value = 0;                   
+           }
+        else if( lattice[_INDEX(x,y)] == 0 )
+             switch(n)
+             {
+                // a dead cell comes to life only if it has 3 neighbors that are alive.
+                case 3: cell_value = 1;
+                        break;
+                default: cell_value = 0;         
+             }
+
+        __syncthreads();
+        lattice[_INDEX(x,y)] = cell_value;
+        __syncthreads();
+
+    }
+
+    __syncthreads();
+    p_lattice[_INDEX(x,y)] = lattice[_INDEX(x,y)];
+    __syncthreads();
+
+}
+""")
+
+conway_ker_shared = shared_ker.get_function("conway_ker_shared")
+
+if __name__ == '__main__':
+    # set lattice size
+    N = 32
+
+    lattice = np.int32(np.random.choice([1, 0], N * N, p=[0.25, 0.75]).reshape(N, N))
+    lattice_gpu = gpuarray.to_gpu(lattice)
+
+    conway_ker_shared(lattice_gpu, np.int32(1000000), grid=(1, 1, 1), block=(32, 32, 1))
+
+    fig = plt.figure(1)
+    plt.imshow(lattice_gpu.get())
+    plt.show()
+
+
--- a/customKernel.py
+++ b/customKernel.py
@ -0,0 +1,37 @@
+import numpy as np
+import pycuda.autoinit
+from pycuda import gpuarray
+from time import time
+from pycuda.elementwise import ElementwiseKernel
+
+host_data = np.float32(np.random.random(50000000))
+
+gpu_2x_ker = ElementwiseKernel(
+    "float *in, float *out",
+    "out[i] = 2*in[i];",
+    "gpu_2x_ker")
+
+# warm up
+test_data = gpuarray.to_gpu(host_data)
+gpu_2x_ker(test_data, gpuarray.empty_like(test_data))
+
+
+def speed_comparison():
+    t1 = time()
+    host_data_2x = host_data * np.float32(2)
+    t2 = time()
+    print('total time to compute on CPU: %f' % (t2 - t1))
+    device_data = gpuarray.to_gpu(host_data)
+    # allocate memory for output
+    device_data_2x = gpuarray.empty_like(device_data)
+    t1 = time()
+    gpu_2x_ker(device_data, device_data_2x)
+    t2 = time()
+    from_device = device_data_2x.get()
+    print('total time to compute on GPU: %f' % (t2 - t1))
+    print(
+        'Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x)))
+
+
+if __name__ == '__main__':
+    speed_comparison()
--- a/main.py
+++ b/main.py
@ -0,0 +1,30 @@
+import numpy as np
+from pycuda import gpuarray
+
+# -- initialize the device
+import pycuda.autoinit
+
+dev = pycuda.autoinit.device
+
+print(dev.name())
+print('\t Total Memory: {} megabytes'.format(dev.total_memory() // (1024 ** 2)))
+
+device_attributes = {}
+for k, v in dev.get_attributes().items():
+    device_attributes[str(k)] = v
+    print('\t ' + str(k) + ': ' + str(v))
+
+host_data = np.array([1, 2, 3, 4, 5], dtype=np.float32)
+host_data_2 = np.array([7, 12, 3, 5, 4], dtype=np.float32)
+
+device_data = gpuarray.to_gpu(host_data)
+device_data_2 = gpuarray.to_gpu(host_data_2)
+
+print(host_data * host_data_2)
+print((device_data * device_data_2).get())
+
+print(host_data / 2)
+print((device_data / 2).get())
+
+print(host_data - host_data_2)
+print((device_data - device_data_2).get())
--- a/multiplyCPUGPU.py
+++ b/multiplyCPUGPU.py
@ -0,0 +1,30 @@
+import numpy as np
+from pycuda import gpuarray
+from time import time
+
+# -- initialize the device
+import pycuda.autoinit
+
+print((gpuarray.to_gpu(np.array([1], dtype=np.float32)) * 1).get())  # this call speeds up the following gpu
+# calculation, because at this point the gpu code gets compiled and cached for the next calls
+
+
+# compare cpu and gpu
+host_data = np.float32(np.random.random(50000000))
+
+t1 = time()
+host_data_2x = host_data * np.float32(2)
+t2 = time()
+
+print('total time to compute on CPU: %f' % (t2 - t1))
+
+device_data = gpuarray.to_gpu(host_data)
+
+t1 = time()
+device_data_2x = device_data * np.float32(2)
+t2 = time()
+
+from_device = device_data_2x.get()
+
+print('total time to compute on GPU: %f' % (t2 - t1))
+print('Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x)))
--- a/primesCPU.py
+++ b/primesCPU.py
@ -0,0 +1,23 @@
+import math
+from time import time
+
+lower = 3
+upper = 2000000
+
+primes = [2]
+
+t1 = time()
+if (lower % 2) == 0:
+    lower = lower + 1
+for num in range(lower, upper + 1, 2):
+    # all prime numbers are greater than 1
+    for i in range(2, int(math.sqrt(num) + 1)):
+        if (num % i) == 0:
+            break
+    else:
+        primes.append(num)
+t2 = time()
+
+print(len(primes))
+print(primes)
+print('The CPU needed ' + str(t2 - t1) + ' seconds')
--- a/primesGPU.py
+++ b/primesGPU.py
@ -0,0 +1,89 @@
+from time import time
+
+import pycuda.autoinit
+import pycuda.driver as drv
+import numpy as np
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+
+ker = SourceModule("""
+__global__ void
+check_prime(unsigned long long *input, bool *output)
+{
+    int i = threadIdx.x + blockDim.x * blockIdx.x;
+    
+    unsigned long long num = input[i];
+    if (num == 2) {
+        output[i] = true;
+        return;
+    } else if (num < 3 || num % 2 == 0) {
+        return;
+    } 
+    unsigned long long limit = (long) sqrt((double) num) + 1;
+    for (unsigned long long i = 3; i <= limit; i += 2) {
+        if (num % i == 0) {
+            return;
+        }
+    }
+    output[i] = true;
+}
+""")
+
+ker2 = SourceModule("""
+__global__ void check_prime2(const unsigned long *IN, bool *OUT) {
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned long num = IN[id];
+    unsigned long limit = (unsigned long) sqrt((double) num) + 1;
+
+    if (num == 2 || num == 3) {
+        OUT[id] = true;
+        return;
+    } else if (num == 1 || num % 2 == 0) {
+        return;
+    }
+    if (limit < 9) {
+        for (unsigned long i = 3; i <= limit; i++) {
+            if (num % i == 0) {
+                return;
+            }
+        }
+    } else {
+        if (num > 3 && num % 3 == 0) {
+            return;
+        }
+        for (unsigned long i = 9; i <= (limit + 6); i += 6) {
+            if (num % (i - 2) == 0 || num % (i - 4) == 0) {
+                return;
+            }
+        }
+    }
+
+    OUT[id] = true;
+}
+""")
+
+block_size = 1024
+grid_size = 50000
+
+check_prime = ker2.get_function("check_prime2")
+
+testvec = np.arange(1, block_size * grid_size * 2, step=2).astype(np.uint)
+
+testvec_gpu = gpuarray.to_gpu(testvec)
+outvec_gpu = gpuarray.to_gpu(np.full(block_size * grid_size, False, dtype=bool))
+t1 = time()
+check_prime(testvec_gpu, outvec_gpu, block=(block_size, 1, 1), grid=(grid_size, 1, 1))
+result = outvec_gpu.get()
+t2 = time()
+
+primes = []
+
+for idx, val in enumerate(result):
+    if val:
+        primes.append(idx)
+
+#print(primes)
+print(len(primes))
+print('checked the first ' + str(block_size * grid_size) + ' numbers')
+print('last prime: ' + str(primes[-1]))
+print('The GPU needed ' + str(t2 - t1) + ' seconds')
--- a/simpleCustomKernel.py
+++ b/simpleCustomKernel.py
@ -0,0 +1,24 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+import numpy as np
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+
+ker = SourceModule("""
+__global__ void scalar_multiply_kernel(float *outvec, float scalar, float *vec)
+{
+     int i = threadIdx.x;
+     outvec[i] = scalar*vec[i];
+}
+""")
+
+scalar_multiply_gpu = ker.get_function("scalar_multiply_kernel")
+
+testvec = np.random.randn(512).astype(np.float32)
+testvec_gpu = gpuarray.to_gpu(testvec)
+outvec_gpu = gpuarray.empty_like(testvec_gpu)
+
+scalar_multiply_gpu(outvec_gpu, np.float32(2), testvec_gpu, block=(512, 1, 1), grid=(1, 1, 1))
+print("Does our kernel work correctly? : {}".format(np.allclose(outvec_gpu.get(), 2 * testvec)))
+print(outvec_gpu.get())
+print(2 * testvec)