From 4b5ae5b82b8c58e8e152b4cde1901f157007eb77 Mon Sep 17 00:00:00 2001 From: Max Ehrlicher-Schmidt Date: Mon, 30 Nov 2020 11:33:36 +0100 Subject: [PATCH] Init --- .gitignore | 4 ++ LIFE.py | 101 ++++++++++++++++++++++++++++++++++++++++++ customKernel.py | 37 ++++++++++++++++ main.py | 30 +++++++++++++ multiplyCPUGPU.py | 30 +++++++++++++ primesCPU.py | 23 ++++++++++ primesGPU.py | 89 +++++++++++++++++++++++++++++++++++++ simpleCustomKernel.py | 24 ++++++++++ 8 files changed, 338 insertions(+) create mode 100644 .gitignore create mode 100644 LIFE.py create mode 100644 customKernel.py create mode 100644 main.py create mode 100644 multiplyCPUGPU.py create mode 100644 primesCPU.py create mode 100644 primesGPU.py create mode 100644 simpleCustomKernel.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..95b0275 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +# Default ignored files +/shelf/ +/workspace.xml +.idea diff --git a/LIFE.py b/LIFE.py new file mode 100644 index 0000000..cbee769 --- /dev/null +++ b/LIFE.py @@ -0,0 +1,101 @@ +# Iterative Conway's game of life in Python / CUDA C +# this version is meant to illustrate the use of shared kernel memory in CUDA. +# written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA" + +import pycuda.autoinit +import pycuda.driver as drv +from pycuda import gpuarray +from pycuda.compiler import SourceModule +import numpy as np +import matplotlib.pyplot as plt +from time import time + +shared_ker = SourceModule(""" +#define _iters 1000000 + +#define _X ( threadIdx.x + blockIdx.x * blockDim.x ) +#define _Y ( threadIdx.y + blockIdx.y * blockDim.y ) + +#define _WIDTH ( blockDim.x * gridDim.x ) +#define _HEIGHT ( blockDim.y * gridDim.y ) + +#define _XM(x) ( (x + _WIDTH) % _WIDTH ) +#define _YM(y) ( (y + _HEIGHT) % _HEIGHT ) + +#define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH ) + +// return the number of living neighbors for a given cell +__device__ int nbrs(int x, int y, int * in) +{ + return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \ + + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \ + + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] ); +} + +__global__ void conway_ker_shared(int * p_lattice, int iters) +{ + // x, y are the appropriate values for the cell covered by this thread + int x = _X, y = _Y; + __shared__ int lattice[32*32]; + + + lattice[_INDEX(x,y)] = p_lattice[_INDEX(x,y)]; + __syncthreads(); + + for (int i = 0; i < iters; i++) + { + + // count the number of neighbors around the current cell + int n = nbrs(x, y, lattice); + + int cell_value; + + + // if the current cell is alive, then determine if it lives or dies for the next generation. + if ( lattice[_INDEX(x,y)] == 1) + switch(n) + { + // if the cell is alive: it remains alive only if it has 2 or 3 neighbors. + case 2: + case 3: cell_value = 1; + break; + default: cell_value = 0; + } + else if( lattice[_INDEX(x,y)] == 0 ) + switch(n) + { + // a dead cell comes to life only if it has 3 neighbors that are alive. 
+ case 3: cell_value = 1; + break; + default: cell_value = 0; + } + + __syncthreads(); + lattice[_INDEX(x,y)] = cell_value; + __syncthreads(); + + } + + __syncthreads(); + p_lattice[_INDEX(x,y)] = lattice[_INDEX(x,y)]; + __syncthreads(); + +} +""") + +conway_ker_shared = shared_ker.get_function("conway_ker_shared") + +if __name__ == '__main__': + # set lattice size + N = 32 + + lattice = np.int32(np.random.choice([1, 0], N * N, p=[0.25, 0.75]).reshape(N, N)) + lattice_gpu = gpuarray.to_gpu(lattice) + + conway_ker_shared(lattice_gpu, np.int32(1000000), grid=(1, 1, 1), block=(32, 32, 1)) + + fig = plt.figure(1) + plt.imshow(lattice_gpu.get()) + plt.show() + + diff --git a/customKernel.py b/customKernel.py new file mode 100644 index 0000000..7583d91 --- /dev/null +++ b/customKernel.py @@ -0,0 +1,37 @@ +import numpy as np +import pycuda.autoinit +from pycuda import gpuarray +from time import time +from pycuda.elementwise import ElementwiseKernel + +host_data = np.float32(np.random.random(50000000)) + +gpu_2x_ker = ElementwiseKernel( + "float *in, float *out", + "out[i] = 2*in[i];", + "gpu_2x_ker") + +# warm up +test_data = gpuarray.to_gpu(host_data) +gpu_2x_ker(test_data, gpuarray.empty_like(test_data)) + + +def speed_comparison(): + t1 = time() + host_data_2x = host_data * np.float32(2) + t2 = time() + print('total time to compute on CPU: %f' % (t2 - t1)) + device_data = gpuarray.to_gpu(host_data) + # allocate memory for output + device_data_2x = gpuarray.empty_like(device_data) + t1 = time() + gpu_2x_ker(device_data, device_data_2x) + t2 = time() + from_device = device_data_2x.get() + print('total time to compute on GPU: %f' % (t2 - t1)) + print( + 'Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x))) + + +if __name__ == '__main__': + speed_comparison() diff --git a/main.py b/main.py new file mode 100644 index 0000000..74fcc65 --- /dev/null +++ b/main.py @@ -0,0 +1,30 @@ +import numpy as np +from pycuda import gpuarray + +# -- initialize the device +import pycuda.autoinit + +dev = pycuda.autoinit.device + +print(dev.name()) +print('\t Total Memory: {} megabytes'.format(dev.total_memory() // (1024 ** 2))) + +device_attributes = {} +for k, v in dev.get_attributes().items(): + device_attributes[str(k)] = v + print('\t ' + str(k) + ': ' + str(v)) + +host_data = np.array([1, 2, 3, 4, 5], dtype=np.float32) +host_data_2 = np.array([7, 12, 3, 5, 4], dtype=np.float32) + +device_data = gpuarray.to_gpu(host_data) +device_data_2 = gpuarray.to_gpu(host_data_2) + +print(host_data * host_data_2) +print((device_data * device_data_2).get()) + +print(host_data / 2) +print((device_data / 2).get()) + +print(host_data - host_data_2) +print((device_data - device_data_2).get()) diff --git a/multiplyCPUGPU.py b/multiplyCPUGPU.py new file mode 100644 index 0000000..afa4513 --- /dev/null +++ b/multiplyCPUGPU.py @@ -0,0 +1,30 @@ +import numpy as np +from pycuda import gpuarray +from time import time + +# -- initialize the device +import pycuda.autoinit + +print((gpuarray.to_gpu(np.array([1], dtype=np.float32)) * 1).get()) # this call speeds up the following gpu +# calculation, because at this point the gpu code gets compiled and cached for the next calls + + +# compare cpu and gpu +host_data = np.float32(np.random.random(50000000)) + +t1 = time() +host_data_2x = host_data * np.float32(2) +t2 = time() + +print('total time to compute on CPU: %f' % (t2 - t1)) + +device_data = gpuarray.to_gpu(host_data) + +t1 = time() +device_data_2x = device_data * np.float32(2) 
+t2 = time()
+
+from_device = device_data_2x.get()
+
+print('total time to compute on GPU: %f' % (t2 - t1))
+print('Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x)))
diff --git a/primesCPU.py b/primesCPU.py
new file mode 100644
index 0000000..abc97dd
--- /dev/null
+++ b/primesCPU.py
@@ -0,0 +1,23 @@
+import math
+from time import time
+
+lower = 3
+upper = 2000000
+
+primes = [2]
+
+t1 = time()
+if (lower % 2) == 0:
+    lower = lower + 1
+for num in range(lower, upper + 1, 2):
+    # trial division: num is prime if no factor up to sqrt(num) divides it
+    for i in range(2, int(math.sqrt(num) + 1)):
+        if (num % i) == 0:
+            break
+    else:
+        primes.append(num)
+t2 = time()
+
+print(len(primes))
+print(primes)
+print('The CPU needed ' + str(t2 - t1) + ' seconds')
diff --git a/primesGPU.py b/primesGPU.py
new file mode 100644
index 0000000..3d4040c
--- /dev/null
+++ b/primesGPU.py
@@ -0,0 +1,89 @@
+from time import time
+
+import pycuda.autoinit
+import pycuda.driver as drv
+import numpy as np
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+
+ker = SourceModule("""
+__global__ void
+check_prime(unsigned long long *input, bool *output)
+{
+    int i = threadIdx.x + blockDim.x * blockIdx.x;
+
+    unsigned long long num = input[i];
+    if (num == 2) {
+        output[i] = true;
+        return;
+    } else if (num < 3 || num % 2 == 0) {
+        return;
+    }
+    unsigned long long limit = (long) sqrt((double) num) + 1;
+    for (unsigned long long i = 3; i <= limit; i += 2) {
+        if (num % i == 0) {
+            return;
+        }
+    }
+    output[i] = true;
+}
+""")
+
+ker2 = SourceModule("""
+__global__ void check_prime2(const unsigned long *IN, bool *OUT) {
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned long num = IN[id];
+    unsigned long limit = (unsigned long) sqrt((double) num) + 1;
+
+    if (num == 2 || num == 3) {
+        OUT[id] = true;
+        return;
+    } else if (num == 1 || num % 2 == 0) {
+        return;
+    }
+    if (limit < 9) {
+        for (unsigned long i = 3; i <= limit; i++) {
+            if (num % i == 0) {
+                return;
+            }
+        }
+    } else {
+        if (num > 3 && num % 3 == 0) {
+            return;
+        }
+        for (unsigned long i = 9; i <= (limit + 6); i += 6) {
+            if (num % (i - 2) == 0 || num % (i - 4) == 0) {
+                return;
+            }
+        }
+    }
+
+    OUT[id] = true;
+}
+""")
+
+block_size = 1024
+grid_size = 50000
+
+check_prime = ker2.get_function("check_prime2")
+
+testvec = np.arange(1, block_size * grid_size * 2, step=2).astype(np.uint)
+
+testvec_gpu = gpuarray.to_gpu(testvec)
+outvec_gpu = gpuarray.to_gpu(np.full(block_size * grid_size, False, dtype=bool))
+t1 = time()
+check_prime(testvec_gpu, outvec_gpu, block=(block_size, 1, 1), grid=(grid_size, 1, 1))
+result = outvec_gpu.get()
+t2 = time()
+
+primes = []
+
+for idx, val in enumerate(result):
+    if val:
+        primes.append(int(testvec[idx]))  # append the tested number itself, not its index in the result array
+
+#print(primes)
+print(len(primes))
+print('checked the first ' + str(block_size * grid_size) + ' odd numbers')
+print('last prime: ' + str(primes[-1]))
+print('The GPU needed ' + str(t2 - t1) + ' seconds')
diff --git a/simpleCustomKernel.py b/simpleCustomKernel.py
new file mode 100644
index 0000000..188f8d1
--- /dev/null
+++ b/simpleCustomKernel.py
@@ -0,0 +1,24 @@
+import pycuda.autoinit
+import pycuda.driver as drv
+import numpy as np
+from pycuda import gpuarray
+from pycuda.compiler import SourceModule
+
+ker = SourceModule("""
+__global__ void scalar_multiply_kernel(float *outvec, float scalar, float *vec)
+{
+    int i = threadIdx.x;
+    outvec[i] = scalar*vec[i];
+}
+""")
+
+scalar_multiply_gpu = ker.get_function("scalar_multiply_kernel")
+
+testvec = np.random.randn(512).astype(np.float32)
+testvec_gpu = gpuarray.to_gpu(testvec)
+outvec_gpu = gpuarray.empty_like(testvec_gpu)
+
+scalar_multiply_gpu(outvec_gpu, np.float32(2), testvec_gpu, block=(512, 1, 1), grid=(1, 1, 1))
+print("Does our kernel work correctly? : {}".format(np.allclose(outvec_gpu.get(), 2 * testvec)))
+print(outvec_gpu.get())
+print(2 * testvec)