Delete unnecessary scripts
parent
c854f55af0
commit
d4081c7deb
@ -1,101 +0,0 @@
|
|||||||
# Iterative Conway's game of life in Python / CUDA C
|
|
||||||
# this version is meant to illustrate the use of shared kernel memory in CUDA.
|
|
||||||
# written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA"
|
|
||||||
|
|
||||||
import pycuda.autoinit
|
|
||||||
import pycuda.driver as drv
|
|
||||||
from pycuda import gpuarray
|
|
||||||
from pycuda.compiler import SourceModule
|
|
||||||
import numpy as np
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
from time import time
|
|
||||||
|
|
||||||
shared_ker = SourceModule("""
// Global coordinates of the cell handled by the calling thread.
#define _X  ( threadIdx.x + blockIdx.x * blockDim.x )
#define _Y  ( threadIdx.y + blockIdx.y * blockDim.y )

// Lattice dimensions are taken from the launch configuration.
#define _WIDTH  ( blockDim.x * gridDim.x )
#define _HEIGHT ( blockDim.y * gridDim.y )

// Wrap a coordinate around the lattice edges. The argument is parenthesised
// so that expressions such as `x - 1` expand safely.
#define _XM(x)  ( ((x) + _WIDTH) % _WIDTH )
#define _YM(y)  ( ((y) + _HEIGHT) % _HEIGHT )

// Flattened row-major index of the (wrapped) cell (x, y).
#define _INDEX(x,y)  ( _XM(x) + _YM(y) * _WIDTH )

// return the number of living neighbors for a given cell
__device__ int nbrs(int x, int y, int * in)
{
    return ( in[ _INDEX(x - 1, y + 1) ] + in[ _INDEX(x - 1, y) ] + in[ _INDEX(x - 1, y - 1) ]
           + in[ _INDEX(x,     y + 1) ]                          + in[ _INDEX(x,     y - 1) ]
           + in[ _INDEX(x + 1, y + 1) ] + in[ _INDEX(x + 1, y) ] + in[ _INDEX(x + 1, y - 1) ] );
}

// Run `iters` generations of Conway's game of life in place on p_lattice,
// keeping the working copy of the board in shared memory.
//
// The shared buffer holds 32*32 cells and is indexed with the same global
// coordinates as p_lattice, so the whole launch must cover at most 32*32
// cells (the host script launches a single 32x32 block).
__global__ void conway_ker_shared(int * p_lattice, int iters)
{
    // x, y are the appropriate values for the cell covered by this thread
    const int x = _X, y = _Y;
    const int idx = _INDEX(x, y);

    __shared__ int lattice[32*32];

    lattice[idx] = p_lattice[idx];
    __syncthreads();

    for (int i = 0; i < iters; i++)
    {
        // count the number of neighbors around the current cell
        const int n = nbrs(x, y, lattice);

        // A live cell survives with 2 or 3 live neighbors; a dead cell comes
        // to life with exactly 3. Any value other than 1 is treated as dead,
        // so cell_value is always defined.
        const int alive = ( lattice[idx] == 1 );
        const int cell_value = ( n == 3 || (alive && n == 2) ) ? 1 : 0;

        // every thread must finish reading the old generation before any
        // thread overwrites it ...
        __syncthreads();
        lattice[idx] = cell_value;
        // ... and the new generation must be complete before it is read.
        __syncthreads();
    }

    // Each thread writes back only its own cell, which it also wrote last,
    // so no further synchronisation is needed here.
    p_lattice[idx] = lattice[idx];
}
""")

conway_ker_shared = shared_ker.get_function("conway_ker_shared")
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # edge length of the square lattice
    N = 32

    # random starting board: each cell is 1 with probability 0.25, else 0
    initial_cells = np.random.choice([1, 0], N * N, p=[0.25, 0.75])
    lattice = initial_cells.reshape(N, N).astype(np.int32)
    lattice_gpu = gpuarray.to_gpu(lattice)

    # run one million generations in a single launch: one 32x32 block
    n_generations = np.int32(1000000)
    conway_ker_shared(lattice_gpu, n_generations, grid=(1, 1, 1), block=(32, 32, 1))

    # display the final board
    fig = plt.figure(1)
    final_board = lattice_gpu.get()
    plt.imshow(final_board)
    plt.show()
|
|
||||||
|
|
||||||
|
|
@ -1,37 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
import pycuda.autoinit
|
|
||||||
from pycuda import gpuarray
|
|
||||||
from time import time
|
|
||||||
from pycuda.elementwise import ElementwiseKernel
|
|
||||||
|
|
||||||
# 50 million random samples, stored as float32, shared by the CPU and GPU runs
host_data = np.random.random(50000000).astype(np.float32)

# element-wise kernel: out[i] = 2 * in[i]
gpu_2x_ker = ElementwiseKernel(
    arguments="float *in, float *out",
    operation="out[i] = 2*in[i];",
    name="gpu_2x_ker",
)

# warm up: run the kernel once before anything is timed
# (presumably so one-time setup cost is excluded from the measurement)
test_data = gpuarray.to_gpu(host_data)
gpu_2x_ker(test_data, gpuarray.empty_like(test_data))
|
|
||||||
|
|
||||||
|
|
||||||
def speed_comparison():
    """Time doubling `host_data` on the CPU (NumPy) and on the GPU.

    Prints both wall-clock durations and whether the two results agree.
    Host-to-device and device-to-host transfers are outside the timed
    GPU section; only the kernel itself is measured.
    """
    t1 = time()
    host_data_2x = host_data * np.float32(2)
    t2 = time()
    print('total time to compute on CPU: %f' % (t2 - t1))

    device_data = gpuarray.to_gpu(host_data)
    # allocate memory for output
    device_data_2x = gpuarray.empty_like(device_data)

    t1 = time()
    gpu_2x_ker(device_data, device_data_2x)
    # Kernel launches return before the GPU has finished; wait for completion
    # so the interval covers the computation and not just the launch.
    pycuda.autoinit.context.synchronize()
    t2 = time()

    from_device = device_data_2x.get()
    print('total time to compute on GPU: %f' % (t2 - t1))
    print(
        'Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x)))


if __name__ == '__main__':
    speed_comparison()
|
|
@ -1,30 +1,145 @@
|
|||||||
|
import math
|
||||||
|
from time import time
|
||||||
|
|
||||||
|
import pycuda.autoinit
|
||||||
|
import pycuda.driver as drv
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from pycuda import gpuarray
|
from pycuda import gpuarray
|
||||||
|
from pycuda.compiler import SourceModule
|
||||||
|
|
||||||
# -- initialize the device
|
from optparse import OptionParser
|
||||||
import pycuda.autoinit
|
|
||||||
|
ker = SourceModule("""
// Trial-division primality test, one thread per element.
// output[idx] is set to true when input[idx] is prime and to false otherwise,
// so the result does not depend on how the caller initialised `output`.
// There is no bounds check: the launch must cover exactly the array length.
__global__ void
check_prime(unsigned long long *input, bool *output)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const unsigned long long num = input[idx];

    bool is_prime;
    if (num == 2) {
        is_prime = true;
    } else if (num < 3 || num % 2 == 0) {
        // 0, 1 and every even number other than 2
        is_prime = false;
    } else {
        is_prime = true;
        // +1 absorbs any rounding in the double-precision square root
        const unsigned long long limit = (unsigned long long) sqrt((double) num) + 1;
        // odd candidate divisors only; `d` is distinct from the thread index
        for (unsigned long long d = 3; d <= limit; d += 2) {
            if (num % d == 0) {
                is_prime = false;
                break;
            }
        }
    }
    output[idx] = is_prime;
}
""")
|
||||||
|
|
||||||
|
ker2 = SourceModule("""
// Primality test by trial division over 2, 3 and the numbers of the form
// 6k - 1 / 6k + 1 (every prime above 3 has that form).
__device__ bool is_prime_6k(unsigned long long num)
{
    if (num == 2 || num == 3) {
        return true;
    }
    if (num < 2 || num % 2 == 0 || num % 3 == 0) {
        return false;
    }
    // +1 absorbs any rounding in the double-precision square root
    const unsigned long long limit = (unsigned long long) sqrt((double) num) + 1;
    // d runs over 5, 11, 17, ...; each step tests the pair (d, d + 2).
    // For every num that reaches the loop body, d + 2 < num, so a number
    // is never reported composite because of itself.
    for (unsigned long long d = 5; d <= limit; d += 6) {
        if (num % d == 0 || num % (d + 2) == 0) {
            return false;
        }
    }
    return true;
}

// One thread per element: OUT[id] becomes true when IN[id] is prime and
// false otherwise, independent of the previous contents of OUT.
// There is no bounds check: the launch must cover exactly the array length.
__global__ void check_prime2(const unsigned long long *IN, bool *OUT) {
    const int id = threadIdx.x + blockDim.x * blockIdx.x;
    OUT[id] = is_prime_6k(IN[id]);
}
""")
|
||||||
|
|
||||||
|
|
||||||
|
def calc_primes(start: int = 1, grid_size: int = 1000, block_size: int = 1024):
|
||||||
|
check_prime = ker2.get_function("check_prime2")
|
||||||
|
|
||||||
|
primes = []
|
||||||
|
if start < 2:
|
||||||
|
primes = [2]
|
||||||
|
start = 3
|
||||||
|
if start % 2 == 0:
|
||||||
|
start = start + 1
|
||||||
|
|
||||||
|
startEvent = drv.Event()
|
||||||
|
endEvent = drv.Event()
|
||||||
|
|
||||||
|
testvec = np.arange(start, block_size * grid_size * 2 + start, step=2).astype(np.ulonglong)
|
||||||
|
|
||||||
|
testvec_gpu = gpuarray.to_gpu(testvec)
|
||||||
|
outvec_gpu = gpuarray.to_gpu(np.full(block_size * grid_size, False, dtype=bool))
|
||||||
|
startEvent.record()
|
||||||
|
check_prime(testvec_gpu, outvec_gpu, block=(block_size, 1, 1), grid=(grid_size, 1, 1))
|
||||||
|
endEvent.record()
|
||||||
|
endEvent.synchronize()
|
||||||
|
kernel_execution_time = startEvent.time_till(endEvent)
|
||||||
|
|
||||||
|
result = outvec_gpu.get()
|
||||||
|
|
||||||
|
for idx, val in enumerate(result):
|
||||||
|
if val:
|
||||||
|
primes.append(testvec[idx])
|
||||||
|
|
||||||
dev = pycuda.autoinit.device
|
print('checked ' + str(block_size * grid_size) + ' numbers' + ' (' + str(start) + ' - ' + str(
|
||||||
|
start + block_size * grid_size) + ')')
|
||||||
|
print('last prime: ' + str(primes[-1]))
|
||||||
|
print('The GPU needed ' + str(kernel_execution_time) + ' milliseconds')
|
||||||
|
|
||||||
print(dev.name())
|
with open(options.timings_output, 'a') as file:
|
||||||
print('\t Total Memory: {} megabytes'.format(dev.total_memory() // (1024 ** 2)))
|
file.write(str(start) + "," + str(kernel_execution_time) + "," + str((block_size * grid_size)/(kernel_execution_time/1000)) + "\n")
|
||||||
|
|
||||||
device_attributes = {}
|
return primes
|
||||||
for k, v in dev.get_attributes().items():
|
|
||||||
device_attributes[str(k)] = v
|
|
||||||
print('\t ' + str(k) + ': ' + str(v))
|
|
||||||
|
|
||||||
host_data = np.array([1, 2, 3, 4, 5], dtype=np.float32)
|
|
||||||
host_data_2 = np.array([7, 12, 3, 5, 4], dtype=np.float32)
|
|
||||||
|
|
||||||
device_data = gpuarray.to_gpu(host_data)
|
if __name__ == "__main__":
|
||||||
device_data_2 = gpuarray.to_gpu(host_data_2)
|
parser = OptionParser()
|
||||||
|
parser.add_option("-e", "--end", dest="end",
|
||||||
|
help="numbers to check without even numbers", default="5000000000", type="int")
|
||||||
|
parser.add_option("--numbers-per-step", dest="numbers_per_step",
|
||||||
|
help="amount of uneven numbers checked in each step (even number are skipped)", default="8000000",
|
||||||
|
type="int")
|
||||||
|
parser.add_option("--block_size", dest="block_size",
|
||||||
|
help="number of threads per block, max = 1024", default="1024",
|
||||||
|
type="int")
|
||||||
|
parser.add_option("--output", dest="output",
|
||||||
|
help="name of the file, where the primes should be stored", default="primes.txt", type="string")
|
||||||
|
parser.add_option("--timings-output", dest="timings_output",
|
||||||
|
help="name of the csv file, where the timing is logged as csv", default="timings.csv",
|
||||||
|
type="string")
|
||||||
|
parser.add_option("--save-primes", dest="save_primes",
|
||||||
|
help="whether the calculated primes should be saved in a txt file", default=False)
|
||||||
|
(options, args) = parser.parse_args()
|
||||||
|
|
||||||
print(host_data * host_data_2)
|
block_size = options.block_size
|
||||||
print((device_data * device_data_2).get())
|
start = 1
|
||||||
|
grid_size = int(math.ceil(options.numbers_per_step / block_size))
|
||||||
|
resulting_numbers_per_step = block_size * grid_size
|
||||||
|
last_number_checked = start - 1
|
||||||
|
|
||||||
print(host_data / 2)
|
with open(options.timings_output, 'w') as file:
|
||||||
print((device_data / 2).get())
|
file.write("offset,duration,numbers_per_second\n")
|
||||||
|
if options.save_primes:
|
||||||
|
with open(options.output, 'w') as file:
|
||||||
|
file.write("")
|
||||||
|
|
||||||
print(host_data - host_data_2)
|
while last_number_checked < options.end:
|
||||||
print((device_data - device_data_2).get())
|
calculated_primes = calc_primes(last_number_checked + 1, grid_size, block_size)
|
||||||
|
if options.save_primes:
|
||||||
|
with open(options.output, 'a') as file:
|
||||||
|
file.write("\n".join([str(p) for p in calculated_primes]))
|
||||||
|
last_number_checked = last_number_checked + resulting_numbers_per_step * 2
|
||||||
|
@ -1,142 +0,0 @@
|
|||||||
import math
|
|
||||||
from time import time
|
|
||||||
|
|
||||||
import pycuda.autoinit
|
|
||||||
import pycuda.driver as drv
|
|
||||||
import numpy as np
|
|
||||||
from pycuda import gpuarray
|
|
||||||
from pycuda.compiler import SourceModule
|
|
||||||
|
|
||||||
from optparse import OptionParser
|
|
||||||
|
|
||||||
ker = SourceModule("""
// Trial-division primality test, one thread per element.
// output[idx] is set to true when input[idx] is prime and to false otherwise,
// so the result does not depend on how the caller initialised `output`.
// There is no bounds check: the launch must cover exactly the array length.
__global__ void
check_prime(unsigned long long *input, bool *output)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const unsigned long long num = input[idx];

    bool is_prime;
    if (num == 2) {
        is_prime = true;
    } else if (num < 3 || num % 2 == 0) {
        // 0, 1 and every even number other than 2
        is_prime = false;
    } else {
        is_prime = true;
        // +1 absorbs any rounding in the double-precision square root
        const unsigned long long limit = (unsigned long long) sqrt((double) num) + 1;
        // odd candidate divisors only; `d` is distinct from the thread index
        for (unsigned long long d = 3; d <= limit; d += 2) {
            if (num % d == 0) {
                is_prime = false;
                break;
            }
        }
    }
    output[idx] = is_prime;
}
""")
|
|
||||||
|
|
||||||
ker2 = SourceModule("""
// Primality test by trial division over 2, 3 and the numbers of the form
// 6k - 1 / 6k + 1 (every prime above 3 has that form).
__device__ bool is_prime_6k(unsigned long long num)
{
    if (num == 2 || num == 3) {
        return true;
    }
    if (num < 2 || num % 2 == 0 || num % 3 == 0) {
        return false;
    }
    // +1 absorbs any rounding in the double-precision square root
    const unsigned long long limit = (unsigned long long) sqrt((double) num) + 1;
    // d runs over 5, 11, 17, ...; each step tests the pair (d, d + 2).
    // For every num that reaches the loop body, d + 2 < num, so a number
    // is never reported composite because of itself.
    for (unsigned long long d = 5; d <= limit; d += 6) {
        if (num % d == 0 || num % (d + 2) == 0) {
            return false;
        }
    }
    return true;
}

// One thread per element: OUT[id] becomes true when IN[id] is prime and
// false otherwise, independent of the previous contents of OUT.
// There is no bounds check: the launch must cover exactly the array length.
__global__ void check_prime2(const unsigned long long *IN, bool *OUT) {
    const int id = threadIdx.x + blockDim.x * blockIdx.x;
    OUT[id] = is_prime_6k(IN[id]);
}
""")
|
|
||||||
|
|
||||||
|
|
||||||
def calc_primes(start: int = 1, grid_size: int = 1000, block_size: int = 1024,
                timings_output: "str | None" = None):
    """Test ``grid_size * block_size`` consecutive odd numbers for primality on the GPU.

    The window begins at ``start`` (rounded up to the next odd number, and
    to 1 if smaller) and advances in steps of 2. The prime 2 is added by
    hand whenever ``start <= 2``, since even numbers are never sent to the GPU.

    Args:
        start: first number of the window.
        grid_size: number of thread blocks to launch.
        block_size: number of threads per block.
        timings_output: CSV file that receives one appended row
            ``offset,duration_ms,numbers_per_second``. When omitted, the
            module-level ``options.timings_output`` set by the command-line
            entry point is used.

    Returns:
        The primes found in the window, in ascending order.
    """
    check_prime = ker2.get_function("check_prime2")
    count = block_size * grid_size

    primes = [2] if start <= 2 else []
    # Begin at 1 rather than 3 for small starts: the kernel rejects 1, and the
    # window then ends at start + 2 * count - 2, exactly where a caller
    # stepping by 2 * count expects the next window to begin (no overlap).
    start = max(start, 1)
    if start % 2 == 0:
        start += 1

    startEvent = drv.Event()
    endEvent = drv.Event()

    testvec = np.arange(start, start + 2 * count, step=2).astype(np.ulonglong)

    testvec_gpu = gpuarray.to_gpu(testvec)
    # the kernel only writes `true`, so the flags must start out as False
    outvec_gpu = gpuarray.to_gpu(np.zeros(count, dtype=bool))

    startEvent.record()
    check_prime(testvec_gpu, outvec_gpu, block=(block_size, 1, 1), grid=(grid_size, 1, 1))
    endEvent.record()
    endEvent.synchronize()
    kernel_execution_time = startEvent.time_till(endEvent)  # milliseconds

    result = outvec_gpu.get()
    # boolean-mask selection replaces a Python-level loop over every flag
    primes.extend(testvec[result].tolist())

    last_checked = start + 2 * (count - 1)
    print('checked ' + str(count) + ' numbers' + ' (' + str(start) + ' - ' + str(last_checked) + ')')
    print('last prime: ' + (str(primes[-1]) if primes else 'none found'))
    print('The GPU needed ' + str(kernel_execution_time) + ' milliseconds')

    if kernel_execution_time > 0:
        numbers_per_second = count / (kernel_execution_time / 1000)
    else:
        # elapsed time below timer resolution: rate is undefined
        numbers_per_second = float('nan')

    if timings_output is None:
        # fall back to the options parsed by the command-line entry point
        timings_output = options.timings_output
    with open(timings_output, 'a') as file:
        file.write(str(start) + "," + str(kernel_execution_time) + "," + str(numbers_per_second) + "\n")

    return primes
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Command-line driver: walks the odd numbers from 1 up to --end in fixed
    # size steps, logging one timing row per step and optionally saving the
    # primes that were found.
    #
    # NOTE: `options` must stay a module-level name, because calc_primes()
    # reads options.timings_output when it appends its timing row.
    parser = OptionParser()
    parser.add_option("-e", "--end", dest="end",
                      help="numbers to check without even numbers", default="5000000000", type="int")
    parser.add_option("--numbers-per-step", dest="numbers_per_step",
                      help="amount of uneven numbers checked in each step (even number are skipped)",
                      default="8000000", type="int")
    parser.add_option("--block_size", dest="block_size",
                      help="number of threads per block, max = 1024", default="1024",
                      type="int")
    parser.add_option("--output", dest="output",
                      help="name of the file, where the primes should be stored", default="primes.txt",
                      type="string")
    parser.add_option("--timings-output", dest="timings_output",
                      help="name of the csv file, where the timing is logged as csv", default="timings.csv",
                      type="string")
    # A real flag: previously the option consumed a value and any non-empty
    # value (even "False") switched saving on.
    parser.add_option("--save-primes", dest="save_primes", action="store_true",
                      help="whether the calculated primes should be saved in a txt file", default=False)
    (options, args) = parser.parse_args()

    block_size = options.block_size
    if not 1 <= block_size <= 1024:
        parser.error("--block_size must be between 1 and 1024")
    if options.numbers_per_step < 1:
        # a step of zero would never advance the loop below
        parser.error("--numbers-per-step must be at least 1")

    start = 1
    # round the step up to a whole number of thread blocks
    grid_size = int(math.ceil(options.numbers_per_step / block_size))
    resulting_numbers_per_step = block_size * grid_size
    last_number_checked = start - 1

    with open(options.timings_output, 'w') as file:
        file.write("offset,duration,numbers_per_second\n")
    if options.save_primes:
        # truncate any previous result file
        with open(options.output, 'w') as file:
            file.write("")

    while last_number_checked < options.end:
        calculated_primes = calc_primes(last_number_checked + 1, grid_size, block_size)
        if options.save_primes:
            with open(options.output, 'a') as file:
                # one prime per line, each newline-terminated so that
                # consecutive chunks do not run together
                file.writelines(str(p) + "\n" for p in calculated_primes)
        # only odd numbers are tested, so each step spans twice its count
        last_number_checked = last_number_checked + resulting_numbers_per_step * 2
|
|
Loading…
Reference in New Issue