From d4081c7debddf03648e210e66d680ded69c2f431 Mon Sep 17 00:00:00 2001
From: Max Ehrlicher-Schmidt
Date: Fri, 15 Jan 2021 14:46:20 +0100
Subject: [PATCH] Delete unnecessary scripts

---
 LIFE.py         | 101 --------------------------------
 README.md       |   2 +-
 customKernel.py |  37 ------------
 main.py         | 153 ++++++++++++++++++++++++++++++++++++++++++------
 primesGPU.py    | 142 --------------------------------------------
 5 files changed, 135 insertions(+), 300 deletions(-)
 delete mode 100644 LIFE.py
 delete mode 100644 customKernel.py
 delete mode 100644 primesGPU.py

diff --git a/LIFE.py b/LIFE.py
deleted file mode 100644
index cbee769..0000000
--- a/LIFE.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Iterative Conway's game of life in Python / CUDA C
-# this version is meant to illustrate the use of shared kernel memory in CUDA.
-# written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA"
-
-import pycuda.autoinit
-import pycuda.driver as drv
-from pycuda import gpuarray
-from pycuda.compiler import SourceModule
-import numpy as np
-import matplotlib.pyplot as plt
-from time import time
-
-shared_ker = SourceModule("""
-#define _iters 1000000
-
-#define _X ( threadIdx.x + blockIdx.x * blockDim.x )
-#define _Y ( threadIdx.y + blockIdx.y * blockDim.y )
-
-#define _WIDTH ( blockDim.x * gridDim.x )
-#define _HEIGHT ( blockDim.y * gridDim.y )
-
-#define _XM(x) ( (x + _WIDTH) % _WIDTH )
-#define _YM(y) ( (y + _HEIGHT) % _HEIGHT )
-
-#define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH )
-
-// return the number of living neighbors for a given cell
-__device__ int nbrs(int x, int y, int * in)
-{
-    return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \
-             + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \
-             + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] );
-}
-
-__global__ void conway_ker_shared(int * p_lattice, int iters)
-{
-    // x, y are the appropriate values for the cell covered by this thread
-    int x = _X, y = _Y;
-    __shared__ int lattice[32*32];
-
-
-    lattice[_INDEX(x,y)] = p_lattice[_INDEX(x,y)];
-    __syncthreads();
-
-    for (int i = 0; i < iters; i++)
-    {
-
-        // count the number of neighbors around the current cell
-        int n = nbrs(x, y, lattice);
-
-        int cell_value;
-
-
-        // if the current cell is alive, then determine if it lives or dies for the next generation.
-        if ( lattice[_INDEX(x,y)] == 1)
-            switch(n)
-            {
-                // if the cell is alive: it remains alive only if it has 2 or 3 neighbors.
-                case 2:
-                case 3: cell_value = 1;
-                        break;
-                default: cell_value = 0;
-            }
-        else if( lattice[_INDEX(x,y)] == 0 )
-            switch(n)
-            {
-                // a dead cell comes to life only if it has 3 neighbors that are alive.
-                case 3: cell_value = 1;
-                        break;
-                default: cell_value = 0;
-            }
-
-        __syncthreads();
-        lattice[_INDEX(x,y)] = cell_value;
-        __syncthreads();
-
-    }
-
-    __syncthreads();
-    p_lattice[_INDEX(x,y)] = lattice[_INDEX(x,y)];
-    __syncthreads();
-
-}
-""")
-
-conway_ker_shared = shared_ker.get_function("conway_ker_shared")
-
-if __name__ == '__main__':
-    # set lattice size
-    N = 32
-
-    lattice = np.int32(np.random.choice([1, 0], N * N, p=[0.25, 0.75]).reshape(N, N))
-    lattice_gpu = gpuarray.to_gpu(lattice)
-
-    conway_ker_shared(lattice_gpu, np.int32(1000000), grid=(1, 1, 1), block=(32, 32, 1))
-
-    fig = plt.figure(1)
-    plt.imshow(lattice_gpu.get())
-    plt.show()
-
-
diff --git a/README.md b/README.md
index 0ade61f..b5ac008 100644
--- a/README.md
+++ b/README.md
@@ -1 +1 @@
-helloCuda
+Cuda Benchmark
diff --git a/customKernel.py b/customKernel.py
deleted file mode 100644
index 7583d91..0000000
--- a/customKernel.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import numpy as np
-import pycuda.autoinit
-from pycuda import gpuarray
-from time import time
-from pycuda.elementwise import ElementwiseKernel
-
-host_data = np.float32(np.random.random(50000000))
-
-gpu_2x_ker = ElementwiseKernel(
-    "float *in, float *out",
-    "out[i] = 2*in[i];",
-    "gpu_2x_ker")
-
-# warm up
-test_data = gpuarray.to_gpu(host_data)
-gpu_2x_ker(test_data, gpuarray.empty_like(test_data))
-
-
-def speed_comparison():
-    t1 = time()
-    host_data_2x = host_data * np.float32(2)
-    t2 = time()
-    print('total time to compute on CPU: %f' % (t2 - t1))
-    device_data = gpuarray.to_gpu(host_data)
-    # allocate memory for output
-    device_data_2x = gpuarray.empty_like(device_data)
-    t1 = time()
-    gpu_2x_ker(device_data, device_data_2x)
-    t2 = time()
-    from_device = device_data_2x.get()
-    print('total time to compute on GPU: %f' % (t2 - t1))
-    print(
-        'Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x)))
-
-
-if __name__ == '__main__':
-    speed_comparison()
diff --git a/main.py b/main.py
index 74fcc65..86a59aa 100644
--- a/main.py
+++ b/main.py
@@ -1,30 +1,145 @@
+import math
+from time import time
+
+import pycuda.autoinit
+import pycuda.driver as drv
 import numpy as np
 from pycuda import gpuarray
+from pycuda.compiler import SourceModule
-# -- initialize the device
-import pycuda.autoinit
+from optparse import OptionParser
+
+ker = SourceModule("""
+__global__ void
+check_prime(unsigned long long *input, bool *output)
+{
+    int i = threadIdx.x + blockDim.x * blockIdx.x;
+
+    unsigned long long num = input[i];
+    if (num == 2) {
+        output[i] = true;
+        return;
+    } else if (num < 3 || num % 2 == 0) {
+        return;
+    }
+    unsigned long long limit = (long) sqrt((double) num) + 1;
+    for (unsigned long long i = 3; i <= limit; i += 2) {
+        if (num % i == 0) {
+            return;
+        }
+    }
+    output[i] = true;
+}
+""")
+
+ker2 = SourceModule("""
+__global__ void check_prime2(const unsigned long long *IN, bool *OUT) {
+    int id = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned long long num = IN[id];
+    unsigned long long limit = (unsigned long long) sqrt((double) num) + 1;
+
+    if (num == 2 || num == 3) {
+        OUT[id] = true;
+        return;
+    } else if (num == 1 || num % 2 == 0) {
+        return;
+    }
+    if (limit < 9) {
+        for (unsigned long long i = 3; i <= limit; i++) {
+            if (num % i == 0) {
+                return;
+            }
+        }
+    } else {
+        if (num > 3 && num % 3 == 0) {
+            return;
+        }
+        for (unsigned long long i = 9; i <= (limit + 6); i += 6) {
+            if (num % (i - 2) == 0 || num % (i - 4) == 0) {
+                return;
+            }
+        }
+    }
+
+    OUT[id] = true;
+}
+""")
+
+
+def calc_primes(start: int = 1, grid_size: int = 1000, block_size: int = 1024):
+    check_prime = ker2.get_function("check_prime2")
+
+    primes = []
+    if start < 2:
+        primes = [2]
+        start = 3
+    if start % 2 == 0:
+        start = start + 1
+
+    startEvent = drv.Event()
+    endEvent = drv.Event()
+
+    testvec = np.arange(start, block_size * grid_size * 2 + start, step=2).astype(np.ulonglong)
+
+    testvec_gpu = gpuarray.to_gpu(testvec)
+    outvec_gpu = gpuarray.to_gpu(np.full(block_size * grid_size, False, dtype=bool))
+    startEvent.record()
+    check_prime(testvec_gpu, outvec_gpu, block=(block_size, 1, 1), grid=(grid_size, 1, 1))
+    endEvent.record()
+    endEvent.synchronize()
+    kernel_execution_time = startEvent.time_till(endEvent)
+
+    result = outvec_gpu.get()
+
+    for idx, val in enumerate(result):
+        if val:
+            primes.append(testvec[idx])
-dev = pycuda.autoinit.device
+    print('checked ' + str(block_size * grid_size) + ' numbers' + ' (' + str(start) + ' - ' + str(
+        start + block_size * grid_size) + ')')
+    print('last prime: ' + str(primes[-1]))
+    print('The GPU needed ' + str(kernel_execution_time) + ' milliseconds')
-print(dev.name())
-print('\t Total Memory: {} megabytes'.format(dev.total_memory() // (1024 ** 2)))
+    with open(options.timings_output, 'a') as file:
+        file.write(str(start) + "," + str(kernel_execution_time) + "," + str((block_size * grid_size)/(kernel_execution_time/1000)) + "\n")
-device_attributes = {}
-for k, v in dev.get_attributes().items():
-    device_attributes[str(k)] = v
-    print('\t ' + str(k) + ': ' + str(v))
+    return primes
-host_data = np.array([1, 2, 3, 4, 5], dtype=np.float32)
-host_data_2 = np.array([7, 12, 3, 5, 4], dtype=np.float32)
-device_data = gpuarray.to_gpu(host_data)
-device_data_2 = gpuarray.to_gpu(host_data_2)
+if __name__ == "__main__":
+    parser = OptionParser()
+    parser.add_option("-e", "--end", dest="end",
+                      help="numbers to check without even numbers", default="5000000000", type="int")
+    parser.add_option("--numbers-per-step", dest="numbers_per_step",
+                      help="amount of uneven numbers checked in each step (even number are skipped)", default="8000000",
+                      type="int")
+    parser.add_option("--block_size", dest="block_size",
+                      help="number of threads per block, max = 1024", default="1024",
+                      type="int")
+    parser.add_option("--output", dest="output",
+                      help="name of the file, where the primes should be stored", default="primes.txt", type="string")
+    parser.add_option("--timings-output", dest="timings_output",
+                      help="name of the csv file, where the timing is logged as csv", default="timings.csv",
+                      type="string")
+    parser.add_option("--save-primes", dest="save_primes",
+                      help="whether the calculated primes should be saved in a txt file", default=False)
+    (options, args) = parser.parse_args()
-print(host_data * host_data_2)
-print((device_data * device_data_2).get())
+    block_size = options.block_size
+    start = 1
+    grid_size = int(math.ceil(options.numbers_per_step / block_size))
+    resulting_numbers_per_step = block_size * grid_size
+    last_number_checked = start - 1
-print(host_data / 2)
-print((device_data / 2).get())
+    with open(options.timings_output, 'w') as file:
+        file.write("offset,duration,numbers_per_second\n")
+    if options.save_primes:
+        with open(options.output, 'w') as file:
+            file.write("")
-print(host_data - host_data_2)
-print((device_data - device_data_2).get())
+    while last_number_checked < options.end:
+        calculated_primes = calc_primes(last_number_checked + 1, grid_size, block_size)
+        if options.save_primes:
+            with open(options.output, 'a') as file:
+                file.write("\n".join([str(p) for p in calculated_primes]))
+        last_number_checked = last_number_checked + resulting_numbers_per_step * 2
diff --git a/primesGPU.py b/primesGPU.py
deleted file mode 100644
index 9df20af..0000000
--- a/primesGPU.py
+++ /dev/null
@@ -1,142 +0,0 @@
-import math
-from time import time
-
-import pycuda.autoinit
-import pycuda.driver as drv
-import numpy as np
-from pycuda import gpuarray
-from pycuda.compiler import SourceModule
-
-from optparse import OptionParser
-
-ker = SourceModule("""
-__global__ void
-check_prime(unsigned long long *input, bool *output)
-{
-    int i = threadIdx.x + blockDim.x * blockIdx.x;
-
-    unsigned long long num = input[i];
-    if (num == 2) {
-        output[i] = true;
-        return;
-    } else if (num < 3 || num % 2 == 0) {
-        return;
-    }
-    unsigned long long limit = (long) sqrt((double) num) + 1;
-    for (unsigned long long i = 3; i <= limit; i += 2) {
-        if (num % i == 0) {
-            return;
-        }
-    }
-    output[i] = true;
-}
-""")
-
-ker2 = SourceModule("""
-__global__ void check_prime2(const unsigned long long *IN, bool *OUT) {
-    int id = threadIdx.x + blockDim.x * blockIdx.x;
-    unsigned long long num = IN[id];
-    unsigned long long limit = (unsigned long long) sqrt((double) num) + 1;
-
-    if (num == 2 || num == 3) {
-        OUT[id] = true;
-        return;
-    } else if (num == 1 || num % 2 == 0) {
-        return;
-    }
-    if (limit < 9) {
-        for (unsigned long long i = 3; i <= limit; i++) {
-            if (num % i == 0) {
-                return;
-            }
-        }
-    } else {
-        if (num > 3 && num % 3 == 0) {
-            return;
-        }
-        for (unsigned long long i = 9; i <= (limit + 6); i += 6) {
-            if (num % (i - 2) == 0 || num % (i - 4) == 0) {
-                return;
-            }
-        }
-    }
-
-    OUT[id] = true;
-}
-""")
-
-
-def calc_primes(start: int = 1, grid_size: int = 1000, block_size: int = 1024):
-    check_prime = ker2.get_function("check_prime2")
-
-    primes = []
-    if start < 2:
-        primes = [2]
-        start = 3
-    if start % 2 == 0:
-        start = start + 1
-
-    startEvent = drv.Event()
-    endEvent = drv.Event()
-
-    testvec = np.arange(start, block_size * grid_size * 2 + start, step=2).astype(np.ulonglong)
-
-    testvec_gpu = gpuarray.to_gpu(testvec)
-    outvec_gpu = gpuarray.to_gpu(np.full(block_size * grid_size, False, dtype=bool))
-    startEvent.record()
-    check_prime(testvec_gpu, outvec_gpu, block=(block_size, 1, 1), grid=(grid_size, 1, 1))
-    endEvent.record()
-    endEvent.synchronize()
-    kernel_execution_time = startEvent.time_till(endEvent)
-
-    result = outvec_gpu.get()
-
-    for idx, val in enumerate(result):
-        if val:
-            primes.append(testvec[idx])
-
-    print('checked ' + str(block_size * grid_size) + ' numbers' + ' (' + str(start) + ' - ' + str(
-        start + block_size * grid_size) + ')')
-    print('last prime: ' + str(primes[-1]))
-    print('The GPU needed ' + str(kernel_execution_time) + ' milliseconds')
-
-    with open(options.timings_output, 'a') as file:
-        file.write(str(start) + "," + str(kernel_execution_time) + "," + str((block_size * grid_size)/(kernel_execution_time/1000)) + "\n")
-
-    return primes
-
-
-if __name__ == "__main__":
-    parser = OptionParser()
-    parser.add_option("-e", "--end", dest="end",
-                      help="numbers to check without even numbers", default="5000000000", type="int")
-    parser.add_option("--numbers-per-step", dest="numbers_per_step",
-                      help="amount of uneven numbers checked in each step (even number are skipped)", default="8000000",
-                      type="int")
-    parser.add_option("--output", dest="output",
-                      help="name of the file, where the primes should be stored", default="primes.txt", type="string")
-    parser.add_option("--timings-output", dest="timings_output",
-                      help="name of the csv file, where the timing is logged as csv", default="timings.csv",
-                      type="string")
-    parser.add_option("--save-primes", dest="save_primes",
-                      help="whether the calculated primes should be saved in a txt file", default=False)
-    (options, args) = parser.parse_args()
-
-    block_size = 1024
-    start = 1
-    grid_size = int(math.ceil(options.numbers_per_step / block_size))
-    resulting_numbers_per_step = block_size * grid_size
-    last_number_checked = start - 1
-
-    with open(options.timings_output, 'w') as file:
-        file.write("offset,duration,numbers_per_second\n")
-    if options.save_primes:
-        with open(options.output, 'w') as file:
-            file.write("")
-
-    while last_number_checked < options.end:
-        calculated_primes = calc_primes(last_number_checked + 1, grid_size, block_size)
-        if options.save_primes:
-            with open(options.output, 'a') as file:
-                file.write("\n".join([str(p) for p in calculated_primes]))
-        last_number_checked = last_number_checked + resulting_numbers_per_step * 2
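
Note (outside the patch): calc_primes() in the new main.py can be cross-checked against a
plain CPU trial-division pass over the same range. This is a minimal sketch, assuming a
CUDA-capable GPU with PyCUDA and the CUDA toolkit installed; cpu_is_prime and the
SimpleNamespace stand-in for main.py's module-level `options` name are illustrative only
and not part of this repository.

    import types
    import main  # the main.py added by this patch

    # calc_primes() appends to options.timings_output, and main.py only defines
    # `options` when run as a script, so provide a stand-in before calling it directly.
    main.options = types.SimpleNamespace(timings_output="timings_check.csv")

    def cpu_is_prime(n: int) -> bool:
        # Same idea as check_prime2: trial division over 6k +/- 1 candidates.
        if n < 2:
            return False
        if n in (2, 3):
            return True
        if n % 2 == 0 or n % 3 == 0:
            return False
        i = 5
        while i * i <= n:
            if n % i == 0 or n % (i + 2) == 0:
                return False
            i += 6
        return True

    # One block of 256 threads starting at 1 covers 2 plus the odd numbers 3..513.
    gpu_primes = [int(p) for p in main.calc_primes(start=1, grid_size=1, block_size=256)]
    cpu_primes = [n for n in range(2, 514) if cpu_is_prime(n)]
    assert gpu_primes == cpu_primes, "GPU and CPU prime lists differ"
    print("check_prime2 matches the CPU reference for 2..513")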