I just started to learn CUDA this summer (isn't it too late? No, it is never too late to learn!)
Processors today no longer get faster mainly through higher clock speeds; instead, the growing transistor budget goes into many smaller, more efficient processors computing side by side. That is why learning parallel computing is essential for GPGPU programming.
A great place to learn CUDA is Udacity's Intro to Parallel Programming (CS344): https://classroom.udacity.com/courses/cs344/
A simple CUDA program is something like this:
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #define SIZE 1024 cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size); __global__ void VectorAdd(int *a, int *b, int *c, int n) { int i = threadIdx.x; if (i < n) c[i] = a[i] + b[i]; } int main() { int *a, *b, *c; int *d_a, *d_b, *d_c; a = (int *)malloc(SIZE * sizeof(int)); b = (int *)malloc(SIZE * sizeof(int)); c = (int *)malloc(SIZE * sizeof(int)); cudaMalloc(&d_a, SIZE * sizeof(int)); cudaMalloc(&d_b, SIZE * sizeof(int)); cudaMalloc(&d_c, SIZE * sizeof(int)); for (int i = 0; i < SIZE; ++i) { a[i] = i; b[i] = i; c[i] = 0; } cudaMemcpy(d_a, a, SIZE * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_b, b, SIZE * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_c, c, SIZE * sizeof(int), cudaMemcpyHostToDevice); // 1 block of threads, SIZE of total threads in the block VectorAdd <<< 1, SIZE >>> (d_a, d_b, d_c, SIZE); cudaMemcpy(c, d_c, SIZE * sizeof(int), cudaMemcpyDeviceToHost); for (int i = 0; i < 10; ++i) printf("c[%d] = %d\n", i, c[i]); free(a); free(b); free(c); cudaFree(a); cudaFree(b); cudaFree(c); return 0; } |
KERNEL <<< GRID OF BLOCKS, BLOCK OF THREADS >>> ( … )
(the launch configuration also accepts an optional third argument: the number of bytes of shared memory per block)
dim3(x, y, z)
dim3(w, 1, 1) == dim3(w) == w
square <<< 5, 256 >>> (…) == square <<< dim3(5,1,1), dim3(256,1,1) >>> (…)
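With 5 blocks of 256 threads there are 5 * 256 = 1280 threads in total, so each thread has to combine blockIdx, blockDim, and threadIdx to find its own element. A sketch of what the course's square kernel boils down to (the exact body is from my recollection, treat it as illustrative):

// One thread per element: squares d_in[i] into d_out[i].
__global__ void square(float *d_out, float *d_in)
{
    // global index: which of the 5 * 256 = 1280 elements is mine?
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    float f = d_in[i];
    d_out[i] = f * f;
}

// launch: 5 blocks, 256 threads per block, one thread per array element
// square <<< 5, 256 >>> (d_out, d_in);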
For a 128 * 128 image, a single block is not enough: a block is limited to 1024 threads on current GPUs, but the image needs 128 * 128 = 16384 threads (one per pixel). So we launch a 2D grid of 2D blocks instead, e.g. dim3(8, 8) blocks of dim3(16, 16) threads each.
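Here is a minimal sketch of what that 2D launch and indexing look like (the kernel name brighten and the per-pixel operation are made up, just to show the pattern):

// Hypothetical per-pixel kernel: brightens an 8-bit grayscale image.
__global__ void brighten(unsigned char *d_img, int width, int height)
{
    // 2D coordinates of this thread's pixel, across the whole grid
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        int idx = y * width + x;          // row-major pixel index
        int v = d_img[idx] + 50;          // arbitrary example operation
        d_img[idx] = (v > 255) ? 255 : v; // clamp to 8 bits
    }
}

// dim3(8, 8) blocks of dim3(16, 16) threads:
// (8 * 16) x (8 * 16) = 128 x 128 threads, one per pixel
// brighten <<< dim3(8, 8), dim3(16, 16) >>> (d_img, 128, 128);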