Cuda Memo

Hello world

#include <stdio.h>
#include <cuda.h>

/*Device code: runs on GPU*/
__global__ void Hello(void) { // always return type with void
	printf("Hello from thread %d!\n", threadIdx.x);
}

/*Host code: runs on CPU*/
int main(int argc, char* argv[]) {
	int thread_count;
	thread_count = strtol(argv[1], NULL, 10);
	Hello <<<1, thread_count>>>(); // 1st arg specify how many SMs, 2nd arg specify how many SPs in each SMs. i.e. 1st arg specify the number of thread blocks. 2nd arg specifies the number of threads in each thread block.
	cudaDeviceSynchronize();  // this will cause main program to wait until all the threads have finished.
	return 0;
}

nvcc -o cuda_hello cuda_hello.cu
./cuda_hello 1

Build-in variables

threadIdx: the rank or index of the thread in its thread blocks
blockDim: the dimensions, shape, or size of the thread of blocks (how many thread in block)
blockIdx: the rank or index of the block within the grid
griddim: the dimensions, shape, or size of grid. (how many block in grid) they all get fields x, y, and z

int blk_ct, th_per_blk;
...
Hello <<<blk_ct, th_per_blk>>>();
//there gridDim.x = blk_ct, blockDim.x = th_per_blk

//if 3 dim,
dim3 grid_dims, block_dims;
grid_dims.x = 2;
grid_dims.y = 3;
grid_dims.z = 1;
block_dims.x = 4;
block_dims.y = 4;
block_dims.z = 4;
Kenerl <<<grid_dims, block_dims>>>();

Vec-add

__global__ void Vec_add(
	const float x[], //in
	const float y[], //in
	float z[],       //out
	const int n //in
) {
	int my_elt = blockDim.x * blockIdx.x + threadIdx.x ;  //important
	if (my_elt < n) // global variable n
		z[my_elt] = x[my_elt] + y[my_elt];
}

key idea: assign the iterations of a loop to a individual threads

Memory Malloc

We can use normal malloc in cuda c, but only for host.

cudaMalloc, it allocate memory on the device
cudaMallocManaged, it allocates memory that can be accessed by both CPU and GPU (auto transfer, only supported on newer version, Unified Memory).
cudaHostAlloc, allocates pinned host memory for faster CPU-GPU transfers.
cudaMemcpy, copy memo, use key word cudaMemcpyHostToDevice,etc.

API

cudaFree

__host__ __device__ cudaError_t cudaFree(void* ptr)
//__device__ indicate this function can be called from the device
// it frees storage allocated on the device.

cudaMemcpy

__host__ cudaError_t cudaMemcpy(
	void* dest, //out, dest is the first.
	const void* source, //in
	size_t count, //in
	cudaMemcpyKind kind //in cudaMemcpyHostToDevice or cudaMemcpyDeviceToHost
) // this function is synchronous

Returing results from cuda kernels

// wrong way
__global__ void Add(int x, int y, int* sum_p) {
	*sum_p = x + y;  // invalid address in device
}
int main(void) {
	int sum = -5;
	Add <<<1, 1>>> (2, 3, &sum);
	cudaDeviceSynchronize();
	printf("The sum is %d\n", sum);
}

//using managed memory
__global__ void Add(int x, int y, int* sum_p) {
	*sum_p = x + y;  // invalid address in device
}
int main(void) {
	int* sum_p;
	cudaMallocManaged(&sum_p, sizeof(int));
	*sum_p = -5
	Add <<<1, 1>>> (2, 3, sum_p);
	cudaDeviceSynchronize();
	printf("The sum is %d\n", sum);
}

//using explicity copying 
__global__ void Add(int x, int y, int* sum_p) {
	*sum_p = x + y;  // invalid address in device
}
int main(void) {
	int* hsum_p, *dsum_p;
	hsum_p = (int*) malloc(sizeof(int));
	cudaMalloc(&dsum_p, sizeof(int));
	*hsum_p = -5
	Add <<<1, 1>>> (2, 3, dsum_p);
	cudaMemcpy(hsum_p, dsum_p, sizeof(int), cudaMemcpyDeviceToHost);
	cudaDeviceSynchronize();
	printf("The sum is %d\n", sum);
	free(hsum_p);
	cudaFree(dsum_p);
}

// we can a managed qualifier, it's just like a variable managed by cudaMallocManaged
__managed__ int sum;
__global__ void Add(int x, int y) {
	sum = x + y;
}
int main(void) {
	sum = -5;
	Add <<<1, 1>>> (2, 3);
	cudaDeviceSynchronize();
	printf("After kernel: The sum is %d \n", sum);
}

Recall of serial trapezoidal rule

float Serial_trap(
	const float a, //in
	const float b, //in
	const float n, //in
) {
	float x, h = (b - a) / n;
	float trap = 0.5 * (f(a) + f(b));
	for (int i = 1; i <= n - 1; i++) {
		x = a + i * h;
		trap += f(x);
	}
	trap = trap * h;
	return trap;
}

A kernel version:

__global__ void Dev_trap(
	const float a, // in
	const float b, // in
	const float h, // in
	const float n, // in
	float* trap_p //in and out
) {
	int my_i = blockDim.x * blockIdx.x + threadIdx.x;
	if (0 < my_i && my_i < n) {
		float my_x = a + my_i * h;
		float my_trap = f(my_x);
		atomicAdd(trap_p, my_trap);
	}
}

void Trap_wrapper(
	const float a, //in
	const float b, //in
	const float n, //in
	float* trap_p, //out
	const int blk_ct, //in
	const int th_per_blk, //in
) {
	// trap p is allocated in main with cudaMallocManaged, and it's shared for all threads
	*trap_p = 0.5 * (f(a) + f(b));
	float h = (b - a) / n;
	Dev_trap<<<blk_ct, th_per_blk>>>(a, b, h, n, trap_p);
	cudaDeviceSynchronoize();
	*trap_p = h*(*trap_p);
}

warp shuffle

warp shuffle allow a collection of threads within a warp to read variables stored by other threads in the warp.

In cuda, warp is a set of threads with consecutive ranks belonging to a thread block. The threads in a warp operate in SIMD fashion. So threads in different warps can execute different statements with no penalty. If threads in a warp take different branches in an if-else, the threads are said to have diverged.

The rank of a thread within a warp is called the thread’s lane, and it can be computed using the formula:

lane = threadIdx.x % warpSize;

Coalesced Access

If memory address accessed by threads in the same thread block are consecutive, then these memory accesses are grouped into one memory transaction.

Qualifier

device

callable from the device only
synchronous

host

can be used with device to compile the function both on host and the device

Variable Type Qualifiers

a variable declared in device code without a type qualifier typically resides in the register.

device

declare a variable that resides on the device.

resides in global memory space
has lifetime of an application
is accessible from all the threads within the grid

__device__ int d_value;
__global__ void test_Kernel()
{
	int threadID = threadIdx.x;
	d_value = 1;
	printf("threadID %-3d d_value %3d\n", threadID, d_value);
}

constant

reside in constant memory space
is accessible from all the threads within the grid and fom the host

shared

resides in the shared memory space of a thread block
has lifetime of the block
is accessible from all the threads within the block