# python-cuda-demo/customKernel.py

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from time import time
from pycuda.elementwise import ElementwiseKernel

# 50 million uniform random samples, stored as single-precision floats.
host_data = np.random.random(50000000).astype(np.float32)

# Element-wise GPU kernel: writes twice each input element to the output.
gpu_2x_ker = ElementwiseKernel(
    arguments="float *in, float *out",
    operation="out[i] = 2*in[i];",
    name="gpu_2x_ker",
)

# Warm-up launch on a device copy of the data, run once before any timed
# call (presumably so one-time setup cost is kept out of the measurement).
test_data = gpuarray.to_gpu(host_data)
gpu_2x_ker(test_data, gpuarray.empty_like(test_data))


def speed_comparison():
    """Time doubling ``host_data`` on the CPU and on the GPU and compare results.

    Prints the wall-clock time of the NumPy multiplication, the wall-clock
    time of the ``gpu_2x_ker`` launch (host/device transfers and the output
    allocation are excluded from the GPU figure), and whether both results
    agree according to ``np.allclose``.

    Returns:
        None. All results are reported via ``print``.
    """
    # CPU reference computation.
    t1 = time()
    host_data_2x = host_data * np.float32(2)
    t2 = time()
    print('total time to compute on CPU: %f' % (t2 - t1))

    device_data = gpuarray.to_gpu(host_data)
    # allocate memory for output
    device_data_2x = gpuarray.empty_like(device_data)

    # Kernel launches return to the host before the GPU has finished, so the
    # context is synchronized on both sides of the timed region; otherwise
    # the timer would mostly measure launch overhead and the real work would
    # be hidden inside the later .get() call.
    pycuda.autoinit.context.synchronize()
    t1 = time()
    gpu_2x_ker(device_data, device_data_2x)
    pycuda.autoinit.context.synchronize()
    t2 = time()

    from_device = device_data_2x.get()
    print('total time to compute on GPU: %f' % (t2 - t1))
    print(
        'Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x)))

# Run the CPU/GPU comparison only when this file is executed as a script.
if __name__ == '__main__':
    speed_comparison()