LeetGPU Memo

reduce kernel

#include <cuda_runtime.h>

  

#define T 256

// T is the number of threads per block. It has to be a compile-time constant
// because it sizes the statically declared __shared__ array in reduction_kernel.

// Block-level sum reduction.
//
// Each thread loads one element of `input` (or 0.0f when its global index is
// past N) into shared memory, the block folds those values into a single
// partial sum, and thread 0 of the block adds that partial sum to *output
// with atomicAdd.
//
// Preconditions visible from the code:
//   - blockDim.x must not exceed T, the size of the shared array.
//   - blockDim.x should be a power of two; the halving loop pairs element i
//     with element i + stride, so any other size leaves elements unsummed.
//   - *output is only ever added to, so it must hold the desired starting
//     value (normally 0) before the launch.
__global__ void reduction_kernel(const float* input, float* output, int N) {
	__shared__ float partial[T];

	const int lane = threadIdx.x;
	const int global = blockDim.x * blockIdx.x + threadIdx.x;

	// Stage this thread's element; out-of-range threads contribute zero.
	float value = 0.0f;
	if (global < N) {
		value = input[global];
	}
	partial[lane] = value;
	__syncthreads();

	// Halve the active range each round: the lower half accumulates the
	// upper half. The barrier sits outside the branch so every thread of
	// the block reaches it.
	int stride = blockDim.x / 2;
	while (stride > 0) {
		if (lane < stride) {
			partial[lane] += partial[lane + stride];
		}
		__syncthreads();
		stride >>= 1;
	}

	// Alternative formulation (kept for reference): let the stride grow from
	// 1 upward and have only threads whose index is a multiple of 2 * s
	// accumulate.
	//	for (int s = 1; s < blockDim.x; s <<= 1) {
	//		if (tid % (2 * s) == 0) {
	//			shared[tid] += shared[tid + s];
	//		}
	//		__syncthreads();
	//	}

	// Only the first thread of the block publishes the block's partial sum.
	if (lane != 0) {
		return;
	}
	atomicAdd(output, partial[0]);
}
// Host entry point for the reduction: writes the sum of the N floats in
// `input` to *output.
//
// input, output are device pointers; they are handed straight to the kernel.
// The call blocks until the kernel has finished.
extern "C" void solve(const float* input, float* output, int N) {
	// The kernel only ever atomicAdd's into *output, so the accumulator has
	// to start at zero or the result depends on whatever was there before.
	// An all-zero byte pattern is 0.0f.
	cudaMemset(output, 0, sizeof(float));

	// Empty input: the sum is 0, and launching a grid with zero blocks would
	// be rejected as an invalid configuration.
	if (N <= 0) {
		return;
	}

	const int threadsPerBlock = T;
	// Ceiling division written so that the intermediate value cannot
	// overflow int the way N + threadsPerBlock - 1 can for very large N.
	const int blocksPerGrid = (N - 1) / threadsPerBlock + 1;
	reduction_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, N);
	cudaDeviceSynchronize();
}

Count array element

#include <cuda_runtime.h>

// Counts occurrences of K among the first N ints of `input`.
//
// One thread inspects one element; every thread that finds a match bumps
// the single counter at *output with atomicAdd. The kernel only increments
// *output, so the counter must already hold its starting value (normally 0)
// before the launch.
__global__ void count_equal_kernel(const int* input, int* output, int N, int K) {
	const int i = blockDim.x * blockIdx.x + threadIdx.x;

	// Threads past the end of the array have nothing to inspect.
	if (i >= N) {
		return;
	}

	// Non-matching elements leave the counter untouched.
	if (input[i] != K) {
		return;
	}

	atomicAdd(output, 1);
}


// Host entry point for the count-array-element problem: writes to *output
// the number of elements among the N ints in `input` that equal K.
//
// The previous version of this wrapper was a copy of the reduction wrapper:
// it took float pointers and launched reduction_kernel, so
// count_equal_kernel was never run. The signature now mirrors the kernel's
// parameters.
//
// input and output are passed straight to the kernel, so they must be
// device-accessible pointers. The call blocks until the kernel has finished.
extern "C" void solve(const int* input, int* output, int N, int K) {
	// The kernel only increments *output, so the counter has to start at 0.
	cudaMemset(output, 0, sizeof(int));

	// Empty input: the count is 0, and launching a grid with zero blocks
	// would be rejected as an invalid configuration.
	if (N <= 0) {
		return;
	}

	const int threadsPerBlock = 256;
	// Ceiling division written so that the intermediate value cannot
	// overflow int the way N + threadsPerBlock - 1 can for very large N.
	const int blocksPerGrid = (N - 1) / threadsPerBlock + 1;
	count_equal_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, N, K);
	cudaDeviceSynchronize();
}