Delete unnecessary scripts

master
Max Ehrlicher-Schmidt 3 years ago
parent c854f55af0
commit d4081c7deb

@ -1,101 +0,0 @@
# Iterative Conway's game of life in Python / CUDA C
# this version is meant to illustrate the use of shared kernel memory in CUDA.
# written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA"
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
import matplotlib.pyplot as plt
from time import time
# Compile the CUDA C source for the Game of Life kernel.
#
# The source defines:
#   * coordinate macros: _X/_Y are the thread's global x/y position,
#     _WIDTH/_HEIGHT the total extent of the launch, and _XM/_YM reduce a
#     coordinate modulo that extent, so _INDEX(x, y) addresses the lattice
#     with wrap-around at the edges;
#   * nbrs(): sums the eight cells surrounding (x, y);
#   * conway_ker_shared(): copies the lattice from p_lattice into a
#     __shared__ buffer, advances it `iters` generations there (with a
#     __syncthreads() between reading neighbours and writing the new cell
#     value), then writes the result back to p_lattice.
#
# The shared buffer is declared as 32*32 ints, so a launch must not address
# more than 1024 cells.
# NOTE(review): the `_iters` macro is defined but never referenced in this
# source, and `cell_value` is only assigned when the cell holds 0 or 1 --
# presumably the lattice only ever contains those two values; confirm.
shared_ker = SourceModule("""
#define _iters 1000000
#define _X ( threadIdx.x + blockIdx.x * blockDim.x )
#define _Y ( threadIdx.y + blockIdx.y * blockDim.y )
#define _WIDTH ( blockDim.x * gridDim.x )
#define _HEIGHT ( blockDim.y * gridDim.y )
#define _XM(x) ( (x + _WIDTH) % _WIDTH )
#define _YM(y) ( (y + _HEIGHT) % _HEIGHT )
#define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH )
// return the number of living neighbors for a given cell
__device__ int nbrs(int x, int y, int * in)
{
return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \
+ in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \
+ in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] );
}
__global__ void conway_ker_shared(int * p_lattice, int iters)
{
// x, y are the appropriate values for the cell covered by this thread
int x = _X, y = _Y;
__shared__ int lattice[32*32];
lattice[_INDEX(x,y)] = p_lattice[_INDEX(x,y)];
__syncthreads();
for (int i = 0; i < iters; i++)
{
// count the number of neighbors around the current cell
int n = nbrs(x, y, lattice);
int cell_value;
// if the current cell is alive, then determine if it lives or dies for the next generation.
if ( lattice[_INDEX(x,y)] == 1)
switch(n)
{
// if the cell is alive: it remains alive only if it has 2 or 3 neighbors.
case 2:
case 3: cell_value = 1;
break;
default: cell_value = 0;
}
else if( lattice[_INDEX(x,y)] == 0 )
switch(n)
{
// a dead cell comes to life only if it has 3 neighbors that are alive.
case 3: cell_value = 1;
break;
default: cell_value = 0;
}
__syncthreads();
lattice[_INDEX(x,y)] = cell_value;
__syncthreads();
}
__syncthreads();
p_lattice[_INDEX(x,y)] = lattice[_INDEX(x,y)];
__syncthreads();
}
""")
# Python-callable handle to the compiled kernel.
conway_ker_shared = shared_ker.get_function("conway_ker_shared")


if __name__ == '__main__':
    # Side length of the square lattice. The kernel's shared buffer holds
    # 32*32 cells, and the whole lattice is processed by a single block.
    N = 32

    # Random initial generation: each cell is alive (1) with probability 0.25.
    random_cells = np.random.choice([1, 0], N * N, p=[0.25, 0.75])
    lattice = np.int32(random_cells.reshape(N, N))
    lattice_gpu = gpuarray.to_gpu(lattice)

    # Run one million generations on the device in a single launch.
    conway_ker_shared(
        lattice_gpu,
        np.int32(1000000),
        grid=(1, 1, 1),
        block=(N, N, 1),
    )

    # Show the final generation.
    fig = plt.figure(1)
    plt.imshow(lattice_gpu.get())
    plt.show()

@ -1 +1 @@
helloCuda
Cuda Benchmark

@ -1,37 +0,0 @@
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from time import time
from pycuda.elementwise import ElementwiseKernel
# 50 million random single-precision values used as the benchmark input.
host_data = np.random.random(50000000).astype(np.float32)

# Elementwise GPU kernel that writes twice each input element to the output.
gpu_2x_ker = ElementwiseKernel(
    "float *in, float *out",
    "out[i] = 2*in[i];",
    "gpu_2x_ker",
)

# warm up: run the kernel once before anything is timed
test_data = gpuarray.to_gpu(host_data)
gpu_2x_ker(test_data, gpuarray.empty_like(test_data))
def speed_comparison():
    """Time doubling ``host_data`` on the CPU and on the GPU and compare results.

    The CPU path multiplies the module-level ``host_data`` array with NumPy.
    The GPU path copies the same array to the device, runs ``gpu_2x_ker`` on
    it and copies the result back. Only the computation itself is timed in
    both cases; the host/device transfers are outside the timed interval.

    Prints both timings and whether the two results agree
    (``np.allclose``). Returns nothing.
    """
    t1 = time()
    host_data_2x = host_data * np.float32(2)
    t2 = time()
    print('total time to compute on CPU: %f' % (t2 - t1))

    device_data = gpuarray.to_gpu(host_data)
    # allocate memory for output
    device_data_2x = gpuarray.empty_like(device_data)

    t1 = time()
    gpu_2x_ker(device_data, device_data_2x)
    # Kernel launches return before the GPU has finished. Wait for the device
    # here so the interval covers the execution and not just the launch.
    pycuda.autoinit.context.synchronize()
    t2 = time()

    from_device = device_data_2x.get()
    print('total time to compute on GPU: %f' % (t2 - t1))
    print(
        'Is the host computation the same as the GPU computation? : {}'.format(
            np.allclose(from_device, host_data_2x)))


if __name__ == '__main__':
    speed_comparison()

@ -1,30 +1,145 @@
import math
from time import time
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda import gpuarray
from pycuda.compiler import SourceModule
# -- initialize the device
import pycuda.autoinit
from optparse import OptionParser
# CUDA kernel `check_prime`: each thread handles one element of `input`.
#
# A thread sets output[i] to true when input[i] is 2, or when it is an odd
# number >= 3 that no odd i in 3..limit divides, where
# limit = (long) sqrt(num) + 1. In every other case the thread returns
# without touching output[i], so the caller has to initialise the output
# buffer to false.
# The thread index is not checked against any array length, so the launch
# must not create more threads than there are elements.
# NOTE(review): nothing in the visible code calls get_function on `ker`;
# calc_primes uses `ker2` -- confirm whether this module is still needed.
ker = SourceModule("""
__global__ void
check_prime(unsigned long long *input, bool *output)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
unsigned long long num = input[i];
if (num == 2) {
output[i] = true;
return;
} else if (num < 3 || num % 2 == 0) {
return;
}
unsigned long long limit = (long) sqrt((double) num) + 1;
for (unsigned long long i = 3; i <= limit; i += 2) {
if (num % i == 0) {
return;
}
}
output[i] = true;
}
""")
# CUDA kernel `check_prime2`: each thread handles one element of IN.
#
# OUT[id] is set to true for 2 and 3 and for every number that survives the
# trial division below. 1, even numbers and numbers with a divisor return
# early without writing, so the caller has to initialise OUT to false.
#
# Trial division, with limit = (unsigned long long) sqrt(num) + 1:
#   * limit < 9: every i in 3..limit is tried;
#   * otherwise: num is tested against 3 and then against i - 4 and i - 2 for
#     i = 9, 15, 21, ... up to limit + 6, i.e. against 5, 7, 11, 13, ...
#     (the numbers of the form 6k - 1 and 6k + 1).
#
# The thread index is not checked against any array length, so the launch
# must not create more threads than there are elements.
ker2 = SourceModule("""
__global__ void check_prime2(const unsigned long long *IN, bool *OUT) {
int id = threadIdx.x + blockDim.x * blockIdx.x;
unsigned long long num = IN[id];
unsigned long long limit = (unsigned long long) sqrt((double) num) + 1;
if (num == 2 || num == 3) {
OUT[id] = true;
return;
} else if (num == 1 || num % 2 == 0) {
return;
}
if (limit < 9) {
for (unsigned long long i = 3; i <= limit; i++) {
if (num % i == 0) {
return;
}
}
} else {
if (num > 3 && num % 3 == 0) {
return;
}
for (unsigned long long i = 9; i <= (limit + 6); i += 6) {
if (num % (i - 2) == 0 || num % (i - 4) == 0) {
return;
}
}
}
OUT[id] = true;
}
""")
def calc_primes(start: int = 1, grid_size: int = 1000, block_size: int = 1024):
    """Test one batch of consecutive odd numbers for primality on the GPU.

    ``grid_size * block_size`` odd candidates are tested, one per GPU thread,
    beginning at ``start`` (rounded up to the next odd number when even).

    Besides returning the primes, the function prints a summary of the batch
    and of the device, and appends one ``offset,duration,numbers_per_second``
    line to the CSV file named by the module-level ``options.timings_output``.
    ``options`` is only created by the script entry point, so the function
    cannot be used before the command line has been parsed.

    Args:
        start: first number of the batch. Values below 2 make the batch begin
            at 1 and put the prime 2 in front of the result.
        grid_size: number of thread blocks to launch.
        block_size: number of threads per block.

    Returns:
        The primes found in the batch, in ascending order.
    """
    check_prime = ker2.get_function("check_prime2")
    candidate_count = block_size * grid_size

    primes = []
    if start < 2:
        # 2 is the only even prime and is never among the odd candidates.
        primes = [2]
        # Begin at 1, which the kernel rejects, instead of 3. The caller
        # advances by exactly 2 * candidate_count per batch, so starting at 3
        # made the first batch overlap the first candidate of the second one.
        start = 1
    if start % 2 == 0:
        start = start + 1

    testvec = np.arange(start, candidate_count * 2 + start, step=2).astype(np.ulonglong)
    testvec_gpu = gpuarray.to_gpu(testvec)
    # The kernel only ever writes True, so the flags must start out False.
    outvec_gpu = gpuarray.to_gpu(np.full(candidate_count, False, dtype=bool))

    # Time the kernel alone with CUDA events.
    startEvent = drv.Event()
    endEvent = drv.Event()
    startEvent.record()
    check_prime(testvec_gpu, outvec_gpu, block=(block_size, 1, 1), grid=(grid_size, 1, 1))
    endEvent.record()
    endEvent.synchronize()
    kernel_execution_time = startEvent.time_till(endEvent)

    # Select the candidates flagged as prime with a boolean mask instead of a
    # per-element Python loop.
    result = outvec_gpu.get()
    primes.extend(testvec[result])

    # Candidates are 2 apart, so the last one is 2 * (count - 1) past the first.
    last_candidate = start + 2 * (candidate_count - 1)
    print('checked {} numbers ({} - {})'.format(candidate_count, start, last_candidate))
    # A batch may contain no prime at all; do not index an empty list.
    print('last prime: ' + (str(primes[-1]) if primes else 'none'))
    print('The GPU needed ' + str(kernel_execution_time) + ' milliseconds')

    dev = pycuda.autoinit.device
    print(dev.name())
    print('\t Total Memory: {} megabytes'.format(dev.total_memory() // (1024 ** 2)))

    with open(options.timings_output, 'a') as file:
        file.write("{},{},{}\n".format(
            start,
            kernel_execution_time,
            candidate_count / (kernel_execution_time / 1000)))

    for k, v in dev.get_attributes().items():
        print('\t ' + str(k) + ': ' + str(v))

    return primes
# Two small sample vectors, each kept both on the host and on the GPU.
# NOTE(review): these are only used by the arithmetic demo prints in the
# script entry point and look unrelated to the prime search -- confirm they
# are still wanted.
host_data = np.asarray([1, 2, 3, 4, 5], dtype=np.float32)
device_data = gpuarray.to_gpu(host_data)

host_data_2 = np.asarray([7, 12, 3, 5, 4], dtype=np.float32)
device_data_2 = gpuarray.to_gpu(host_data_2)
# Script entry point: parse the command line, reset the output files, then
# test odd numbers batch by batch until the --end limit has been passed.
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-e", "--end", dest="end",
                      help="numbers to check without even numbers", default="5000000000", type="int")
    parser.add_option("--numbers-per-step", dest="numbers_per_step",
                      help="amount of uneven numbers checked in each step (even number are skipped)",
                      default="8000000",
                      type="int")
    parser.add_option("--block_size", dest="block_size",
                      help="number of threads per block, max = 1024", default="1024",
                      type="int")
    parser.add_option("--output", dest="output",
                      help="name of the file, where the primes should be stored", default="primes.txt",
                      type="string")
    parser.add_option("--timings-output", dest="timings_output",
                      help="name of the csv file, where the timing is logged as csv", default="timings.csv",
                      type="string")
    # NOTE(review): without an action this option expects a value and any
    # non-empty string (even "False") enables saving; action="store_true"
    # may be what was intended.
    parser.add_option("--save-primes", dest="save_primes",
                      help="whether the calculated primes should be saved in a txt file", default=False)
    (options, args) = parser.parse_args()

    # NOTE(review): the host/device arithmetic prints below look like leftover
    # demo output unrelated to the prime search; kept to preserve behaviour.
    print(host_data * host_data_2)
    print((device_data * device_data_2).get())

    block_size = options.block_size
    start = 1
    # Round the step up to a whole number of blocks.
    grid_size = int(math.ceil(options.numbers_per_step / block_size))
    resulting_numbers_per_step = block_size * grid_size
    last_number_checked = start - 1

    print(host_data / 2)
    print((device_data / 2).get())

    # Start a fresh timings CSV; calc_primes appends one row per batch.
    with open(options.timings_output, 'w') as file:
        file.write("offset,duration,numbers_per_second\n")

    if options.save_primes:
        # Truncate the primes file so the appends below start from empty.
        with open(options.output, 'w') as file:
            file.write("")

    print(host_data - host_data_2)
    print((device_data - device_data_2).get())

    while last_number_checked < options.end:
        calculated_primes = calc_primes(last_number_checked + 1, grid_size, block_size)
        if options.save_primes:
            with open(options.output, 'a') as file:
                # Terminate every prime with a newline. Joining with "\n" left
                # the last line of a batch open, so the first prime of the
                # next batch was glued onto it.
                file.write("".join([str(p) + "\n" for p in calculated_primes]))
        # Only odd numbers are tested, so each batch spans twice its count.
        last_number_checked = last_number_checked + resulting_numbers_per_step * 2

@ -1,142 +0,0 @@
import math
from time import time
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda import gpuarray
from pycuda.compiler import SourceModule
from optparse import OptionParser
# CUDA kernel `check_prime`: each thread handles one element of `input`.
#
# A thread sets output[i] to true when input[i] is 2, or when it is an odd
# number >= 3 that no odd i in 3..limit divides, where
# limit = (long) sqrt(num) + 1. In every other case the thread returns
# without touching output[i], so the caller has to initialise the output
# buffer to false.
# The thread index is not checked against any array length, so the launch
# must not create more threads than there are elements.
# NOTE(review): nothing in the visible code calls get_function on `ker`;
# calc_primes uses `ker2` -- confirm whether this module is still needed.
ker = SourceModule("""
__global__ void
check_prime(unsigned long long *input, bool *output)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
unsigned long long num = input[i];
if (num == 2) {
output[i] = true;
return;
} else if (num < 3 || num % 2 == 0) {
return;
}
unsigned long long limit = (long) sqrt((double) num) + 1;
for (unsigned long long i = 3; i <= limit; i += 2) {
if (num % i == 0) {
return;
}
}
output[i] = true;
}
""")
# CUDA kernel `check_prime2`: each thread handles one element of IN.
#
# OUT[id] is set to true for 2 and 3 and for every number that survives the
# trial division below. 1, even numbers and numbers with a divisor return
# early without writing, so the caller has to initialise OUT to false.
#
# Trial division, with limit = (unsigned long long) sqrt(num) + 1:
#   * limit < 9: every i in 3..limit is tried;
#   * otherwise: num is tested against 3 and then against i - 4 and i - 2 for
#     i = 9, 15, 21, ... up to limit + 6, i.e. against 5, 7, 11, 13, ...
#     (the numbers of the form 6k - 1 and 6k + 1).
#
# The thread index is not checked against any array length, so the launch
# must not create more threads than there are elements.
ker2 = SourceModule("""
__global__ void check_prime2(const unsigned long long *IN, bool *OUT) {
int id = threadIdx.x + blockDim.x * blockIdx.x;
unsigned long long num = IN[id];
unsigned long long limit = (unsigned long long) sqrt((double) num) + 1;
if (num == 2 || num == 3) {
OUT[id] = true;
return;
} else if (num == 1 || num % 2 == 0) {
return;
}
if (limit < 9) {
for (unsigned long long i = 3; i <= limit; i++) {
if (num % i == 0) {
return;
}
}
} else {
if (num > 3 && num % 3 == 0) {
return;
}
for (unsigned long long i = 9; i <= (limit + 6); i += 6) {
if (num % (i - 2) == 0 || num % (i - 4) == 0) {
return;
}
}
}
OUT[id] = true;
}
""")
def calc_primes(start: int = 1, grid_size: int = 1000, block_size: int = 1024):
    """Test one batch of consecutive odd numbers for primality on the GPU.

    ``grid_size * block_size`` odd candidates are tested, one per GPU thread,
    beginning at ``start`` (rounded up to the next odd number when even).

    Besides returning the primes, the function prints a summary of the batch
    and appends one ``offset,duration,numbers_per_second`` line to the CSV
    file named by the module-level ``options.timings_output``. ``options`` is
    only created by the script entry point, so the function cannot be used
    before the command line has been parsed.

    Args:
        start: first number of the batch. Values below 2 make the batch begin
            at 1 and put the prime 2 in front of the result.
        grid_size: number of thread blocks to launch.
        block_size: number of threads per block.

    Returns:
        The primes found in the batch, in ascending order.
    """
    check_prime = ker2.get_function("check_prime2")
    candidate_count = block_size * grid_size

    primes = []
    if start < 2:
        # 2 is the only even prime and is never among the odd candidates.
        primes = [2]
        # Begin at 1, which the kernel rejects, instead of 3. The caller
        # advances by exactly 2 * candidate_count per batch, so starting at 3
        # made the first batch overlap the first candidate of the second one.
        start = 1
    if start % 2 == 0:
        start = start + 1

    testvec = np.arange(start, candidate_count * 2 + start, step=2).astype(np.ulonglong)
    testvec_gpu = gpuarray.to_gpu(testvec)
    # The kernel only ever writes True, so the flags must start out False.
    outvec_gpu = gpuarray.to_gpu(np.full(candidate_count, False, dtype=bool))

    # Time the kernel alone with CUDA events.
    startEvent = drv.Event()
    endEvent = drv.Event()
    startEvent.record()
    check_prime(testvec_gpu, outvec_gpu, block=(block_size, 1, 1), grid=(grid_size, 1, 1))
    endEvent.record()
    endEvent.synchronize()
    kernel_execution_time = startEvent.time_till(endEvent)

    # Select the candidates flagged as prime with a boolean mask instead of a
    # per-element Python loop.
    result = outvec_gpu.get()
    primes.extend(testvec[result])

    # Candidates are 2 apart, so the last one is 2 * (count - 1) past the first.
    last_candidate = start + 2 * (candidate_count - 1)
    print('checked {} numbers ({} - {})'.format(candidate_count, start, last_candidate))
    # A batch may contain no prime at all; do not index an empty list.
    print('last prime: ' + (str(primes[-1]) if primes else 'none'))
    print('The GPU needed ' + str(kernel_execution_time) + ' milliseconds')

    with open(options.timings_output, 'a') as file:
        file.write("{},{},{}\n".format(
            start,
            kernel_execution_time,
            candidate_count / (kernel_execution_time / 1000)))

    return primes
# Script entry point: parse the command line, reset the output files, then
# test odd numbers batch by batch until the --end limit has been passed.
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-e", "--end", dest="end",
                      help="numbers to check without even numbers", default="5000000000", type="int")
    parser.add_option("--numbers-per-step", dest="numbers_per_step",
                      help="amount of uneven numbers checked in each step (even number are skipped)",
                      default="8000000",
                      type="int")
    parser.add_option("--output", dest="output",
                      help="name of the file, where the primes should be stored", default="primes.txt",
                      type="string")
    parser.add_option("--timings-output", dest="timings_output",
                      help="name of the csv file, where the timing is logged as csv", default="timings.csv",
                      type="string")
    # NOTE(review): without an action this option expects a value and any
    # non-empty string (even "False") enables saving; action="store_true"
    # may be what was intended.
    parser.add_option("--save-primes", dest="save_primes",
                      help="whether the calculated primes should be saved in a txt file", default=False)
    (options, args) = parser.parse_args()

    block_size = 1024
    start = 1
    # Round the step up to a whole number of blocks.
    grid_size = int(math.ceil(options.numbers_per_step / block_size))
    resulting_numbers_per_step = block_size * grid_size
    last_number_checked = start - 1

    # Start a fresh timings CSV; calc_primes appends one row per batch.
    with open(options.timings_output, 'w') as file:
        file.write("offset,duration,numbers_per_second\n")

    if options.save_primes:
        # Truncate the primes file so the appends below start from empty.
        with open(options.output, 'w') as file:
            file.write("")

    while last_number_checked < options.end:
        calculated_primes = calc_primes(last_number_checked + 1, grid_size, block_size)
        if options.save_primes:
            with open(options.output, 'a') as file:
                # Terminate every prime with a newline. Joining with "\n" left
                # the last line of a batch open, so the first prime of the
                # next batch was glued onto it.
                file.write("".join([str(p) + "\n" for p in calculated_primes]))
        # Only odd numbers are tested, so each batch spans twice its count.
        last_number_checked = last_number_checked + resulting_numbers_per_step * 2
Loading…
Cancel
Save