## Jupyter Snippet CB2nd 08_cuda

Jupyter Snippet CB2nd 08_cuda

# 5.8. Writing massively parallel code for NVIDIA graphics cards (GPUs) with CUDA

``````import math
import numpy as np
from numba import cuda
import matplotlib.pyplot as plt
%matplotlib inline
``````
``````len(cuda.gpus)
``````
``````1
``````
``````cuda.gpus[0].name
``````
``````b'GeForce GTX 980M'
``````
``````@cuda.jit
def mandelbrot_numba(m, iterations):
# Matrix index.
i, j = cuda.grid(2)
size = m.shape[0]
# Skip threads outside the matrix.
if i >= size or j >= size:
return
# Run the simulation.
c = (-2 + 3. / size * j +
1j * (1.5 - 3. / size * i))
z = 0
for n in range(iterations):
if abs(z) <= 10:
z = z * z + c
m[i, j] = n
else:
break
``````
``````size = 400
iterations = 100
``````
``````m = np.zeros((size, size))
``````
``````# 16x16 threads per block.
bs = 16
# Number of blocks in the grid.
bpg = math.ceil(size / bs)
# We prepare the GPU function.
f = mandelbrot_numba[(bpg, bpg), (bs, bs)]
``````
``````f(m, iterations)
``````
``````fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.imshow(np.log(m), cmap=plt.cm.hot)
ax.set_axis_off()
``````

``````%timeit -n10 -r100 f(m, iterations)
``````
``````2.99 ms ± 173 µs per loop (mean ± std. dev. of 100 runs,
10 loops each)
``````
``````%timeit -n10 -r100 cuda.to_device(m)
``````
``````481 µs ± 106 µs per loop (mean ± std. dev. of 100 runs,
10 loops each)
``````
``````%%timeit -n10 -r100 m_gpu = cuda.to_device(m)
f(m_gpu, iterations)
``````
``````101 µs ± 11.8 µs per loop (mean ± std. dev. of 100 runs,
10 loops each)
``````
``````m_gpu = cuda.to_device(m)
``````
``````%timeit -n10 -r100 m_gpu.copy_to_host()
``````
``````238 µs ± 67.8 µs per loop (mean ± std. dev. of 100 runs,
10 loops each)
``````
``````# Thread id in a 1D block