一、直方圖

在Ubuntu系統上的編譯命令（nvcc 只負責編譯；編譯後執行 ./histo 0 或 ./histo 1 來選擇要運行的核函數）：

nvcc -o histo histo.cu

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>

// Integer base-2 logarithm.
// Returns floor(log2(i)) for i >= 1, i.e. the index of the highest set bit.
// Returns 0 for i <= 0: zero already produced 0, and negative values are
// rejected up front because an arithmetic right shift never drives them to
// zero, which previously made the loop below spin forever.
int log2(int i)
{
	if (i <= 0)
		return 0;

	int r = 0;
	while (i >>= 1) r++;
	return r;
}

// Mirrors the lowest `bits` bits of w: bit k of the input becomes bit
// (bits - 1 - k) of the result. Bits of w at or above position `bits`
// are ignored, and the result is 0 when bits <= 0.
int bit_reverse(int w, int bits)
{
	int result = 0;
	int src = 0;
	while (src < bits)
	{
		const int mask = 1 << src;
		// Isolate the source bit as 0 or 1, then drop it at the mirrored position.
		const int bit = (w & mask) >> src;
		result |= bit << (bits - 1 - src);
		++src;
	}
	return result;
}

// Deliberately incorrect histogram kernel, kept as a counter-example.
// Each thread reads one element of d_in, maps it to bin (value % BIN_COUNT)
// and bumps that bin with a plain, unsynchronized read-modify-write. Threads
// that hit the same bin can overwrite each other's update, so counts get lost.
// There is no bounds check: the launch must supply exactly one thread per
// input element.
__global__ void naive_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
	const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
	const int bin = d_in[globalIdx] % BIN_COUNT;
	// Not atomic on purpose: this is the data race being demonstrated.
	d_bins[bin] += 1;
}

// Correct histogram kernel: same mapping as naive_histo, but the bin is
// incremented with atomicAdd so concurrent updates are not lost. Updates that
// target the same bin are serialized, so the achievable parallelism depends
// on BIN_COUNT (fewer bins means more contention).
// There is no bounds check: the launch must supply exactly one thread per
// input element.
__global__ void simple_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
{
	const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
	const int bin = d_in[globalIdx] % BIN_COUNT;
	atomicAdd(d_bins + bin, 1);
}


// Aborts the program with a diagnostic when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess) {
		fprintf(stderr, "error: %s failed: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Histogram demo driver.
// Usage: histo [0|1]   (0 = naive_histo, the default; 1 = simple_histo)
// Builds a 65536-element input of bit-reversed indices, bins it into 16 bins
// on the GPU with the selected kernel and prints the per-bin counts.
// Returns 0 on success; exits with EXIT_FAILURE on any CUDA error or when the
// kernel selector is not 0 or 1.
int main(int argc, char **argv)
{
	// Initialized so that a failing query cannot leave it indeterminate.
	int deviceCount = 0;
	if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0) {
		fprintf(stderr, "error: no devices supporting CUDA.\n");
		exit(EXIT_FAILURE);
	}
	int dev = 0;
	checkCuda(cudaSetDevice(dev), "cudaSetDevice");

	cudaDeviceProp devProps;
	if (cudaGetDeviceProperties(&devProps, dev) == cudaSuccess)
	{
		printf("Using device %d:\n", dev);
		// totalGlobalMem is a size_t; print it at full width instead of
		// truncating it to int (which overflows above 2 GiB).
		printf("%s; global mem: %lluB; compute v%d.%d; clock: %d kHz\n",
			devProps.name, (unsigned long long)devProps.totalGlobalMem,
			(int)devProps.major, (int)devProps.minor,
			(int)devProps.clockRate);
	}

	const int ARRAY_SIZE = 65536;
	const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
	const int BIN_COUNT = 16;
	const int BIN_BYTES = BIN_COUNT * sizeof(int);
	// The kernels have no bounds check, so ARRAY_SIZE must stay a multiple
	// of THREADS_PER_BLOCK for the launch to cover the input exactly.
	const int THREADS_PER_BLOCK = 64;

	// generate the input array on the host
	int h_in[ARRAY_SIZE];
	for (int i = 0; i < ARRAY_SIZE; i++) {
		h_in[i] = bit_reverse(i, log2(ARRAY_SIZE));
	}
	int h_bins[BIN_COUNT];
	for (int i = 0; i < BIN_COUNT; i++) {
		h_bins[i] = 0;
	}

	// declare GPU memory pointers
	int * d_in = NULL;
	int * d_bins = NULL;

	// allocate GPU memory
	checkCuda(cudaMalloc((void **)&d_in, ARRAY_BYTES), "cudaMalloc(d_in)");
	checkCuda(cudaMalloc((void **)&d_bins, BIN_BYTES), "cudaMalloc(d_bins)");

	// transfer the arrays to the GPU (h_bins is all zeros, so this also clears d_bins)
	checkCuda(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_in)");
	checkCuda(cudaMemcpy(d_bins, h_bins, BIN_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(d_bins)");

	int whichKernel = 0;
	if (argc == 2) {
		whichKernel = atoi(argv[1]);
	}

	// launch the kernel
	switch (whichKernel) {
	case 0:
		printf("Running naive histo\n");
		naive_histo<<<ARRAY_SIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_bins, d_in, BIN_COUNT);
		break;
	case 1:
		printf("Running simple histo\n");
		simple_histo<<<ARRAY_SIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_bins, d_in, BIN_COUNT);
		break;
	default:
		fprintf(stderr, "error: ran no kernel\n");
		// Release the device buffers before bailing out.
		cudaFree(d_in);
		cudaFree(d_bins);
		exit(EXIT_FAILURE);
	}
	// A launch does not return a status; a bad configuration shows up here.
	checkCuda(cudaGetLastError(), "kernel launch");

	// copy back the bins from the GPU; this blocking copy also surfaces any
	// error raised while the kernel was executing
	checkCuda(cudaMemcpy(h_bins, d_bins, BIN_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(h_bins)");

	for (int i = 0; i < BIN_COUNT; i++) {
		printf("bin %d: count %d\n", i, h_bins[i]);
	}

	// free GPU memory allocation
	checkCuda(cudaFree(d_in), "cudaFree(d_in)");
	checkCuda(cudaFree(d_bins), "cudaFree(d_bins)");

	return 0;
}

二、彩色轉灰度

Ubuntu上的編譯命令（編譯後執行 ./rgb2gray；輸入圖片 123.jpg 需放在當前目錄）：

nvcc -o rgb2gray rgb2gray.cu -lopencv_highgui -lopencv_core -lopencv_imgcodecs -lopencv_imgproc

#include <iostream>
#include <string>
#include <cassert>

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>

// Wraps a CUDA call: forwards its result to check() together with the
// call's source text (#val) and the file/line of the call site, so a failure
// is reported with the exact expression that failed.
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)

// Host images. imageRGBA receives the RGBA conversion of the loaded file and
// imageGrey is created as a single-channel 8-bit image of the same size;
// both are set up in preProcess().
cv::Mat imageRGBA;
cv::Mat imageGrey;

// Copies of the device pointers handed out by preProcess(), kept at file
// scope so that cleanup() can free them.
uchar4        *d_rgbaImage__;
unsigned char *d_greyImage__;

// Height, in rows, of the currently loaded RGBA image.
size_t numRows()
{
	return static_cast<size_t>(imageRGBA.rows);
}

// Width, in columns, of the currently loaded RGBA image.
size_t numCols()
{
	return static_cast<size_t>(imageRGBA.cols);
}

// Backend of the checkCudaErrors macro.
//   err  - status returned by a CUDA call
//   func - source text of the call, printed after the error string
//   file - source file of the call site
//   line - source line of the call site
// Returns normally when err equals cudaSuccess; otherwise prints the location
// and the CUDA error string to std::cerr and terminates with exit code 1.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
	const bool failed = (err != cudaSuccess);
	if (!failed)
		return;

	std::cerr << "CUDA error at: " << file << ":" << line << std::endl
		<< cudaGetErrorString(err) << " " << func << std::endl;
	exit(1);
}

// Loads the input picture and prepares every host/device buffer needed for
// the greyscale conversion.
//   inputImage  - out: host pointer to the RGBA pixels (storage owned by imageRGBA)
//   greyImage   - out: host pointer to the grey output (storage owned by imageGrey)
//   d_rgbaImage - out: device buffer holding a copy of the RGBA pixels
//   d_greyImage - out: device buffer for the grey result, zero-filled
//   filename    - path of the image to read
// Exits with code 1 if the file cannot be read or if either cv::Mat is not
// stored contiguously; any failing CUDA call terminates through checkCudaErrors.
// The two device pointers are also stored in d_rgbaImage__ / d_greyImage__.
void preProcess(uchar4 **inputImage, unsigned char **greyImage,
	uchar4 **d_rgbaImage, unsigned char **d_greyImage,
	const std::string &filename) {
	//make sure the context initializes ok
	checkCudaErrors(cudaFree(0));

	cv::Mat image;
	image = cv::imread(filename.c_str());
	if (image.empty()) {
		std::cerr << "Couldn't open file: " << filename << std::endl;
		exit(1);
	}

	// Convert into the file-scope imageRGBA, whose size numRows()/numCols() report.
	cv::cvtColor(image, imageRGBA, cv::COLOR_BGR2RGBA);

	//allocate memory for the output
	imageGrey.create(image.rows, image.cols, CV_8UC1);

	//This shouldn't ever happen given the way the images are created
	//at least based upon my limited understanding of OpenCV, but better to check
	// (the raw row-0 pointers taken below are later copied as one flat block)
	if (!imageRGBA.isContinuous() || !imageGrey.isContinuous()) {
		std::cerr << "Images aren't continuous!! Exiting." << std::endl;
		exit(1);
	}

	// Reinterpret the 4-channel byte rows as uchar4 pixels.
	*inputImage = (uchar4 *)imageRGBA.ptr<unsigned char>(0);
	*greyImage = imageGrey.ptr<unsigned char>(0);

	const size_t numPixels = numRows() * numCols();
	//allocate memory on the device for both input and output
	checkCudaErrors(cudaMalloc(d_rgbaImage, sizeof(uchar4) * numPixels));
	checkCudaErrors(cudaMalloc(d_greyImage, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(*d_greyImage, 0, numPixels * sizeof(unsigned char))); //make sure no memory is left laying around

	//copy input array to the GPU
	checkCudaErrors(cudaMemcpy(*d_rgbaImage, *inputImage, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));

	// Remember the device pointers so cleanup() can release them.
	d_rgbaImage__ = *d_rgbaImage;
	d_greyImage__ = *d_greyImage;
}

// Converts one RGBA pixel per thread to an 8-bit grey value using the weights
// 0.299 R + 0.587 G + 0.114 B (alpha is ignored).
// Indexing assumes a 1-D grid of 2-D blocks: the flat pixel index is the block
// index times the block's thread count plus the thread's row-major position
// inside the block. Threads whose index is >= numRows * numCols do nothing.
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage, unsigned char* const greyImage, int numRows, int numCols) {
	const unsigned int threadsPerBlock = blockDim.x * blockDim.y;
	const unsigned int indexInBlock = threadIdx.y * blockDim.x + threadIdx.x;
	const int pixel = blockIdx.x * threadsPerBlock + indexInBlock;

	if (pixel >= numRows * numCols)
		return;

	const unsigned char red = rgbaImage[pixel].x;
	const unsigned char green = rgbaImage[pixel].y;
	const unsigned char blue = rgbaImage[pixel].z;
	// The float result is truncated when stored into the unsigned char output.
	greyImage[pixel] = .299f * red + .587f * green + .114f * blue;
}

// Writes the grey result to disk.
//   output_file - destination path handed to cv::imwrite
//   data_ptr    - host buffer of numRows() x numCols() 8-bit pixels; it is
//                 wrapped in a cv::Mat header, not copied
void postProcess(const std::string& output_file, unsigned char* data_ptr) {
	const int rows = static_cast<int>(numRows());
	const int cols = static_cast<int>(numCols());
	cv::Mat greyView(rows, cols, CV_8UC1, static_cast<void*>(data_ptr));
	cv::imwrite(output_file.c_str(), greyView);
}

// Releases the two device buffers recorded by preProcess().
// The cudaFree results are not inspected.
void cleanup() {
	void *deviceBuffers[2] = { d_rgbaImage__, d_greyImage__ };
	for (int i = 0; i < 2; ++i)
		cudaFree(deviceBuffers[i]);
}

// Greyscale demo driver: reads "123.jpg", converts it to grey on the GPU and
// writes "gray.jpg". The command line is currently unused (the argv-based
// paths are left commented out). Any CUDA failure terminates the program
// through checkCudaErrors.
int main(int argc, char* argv[]) {

	//load input file
	//std::string input_file = argv[1];
	std::string input_file = "123.jpg";
	//define output file
	//std::string output_file = argv[2];
	std::string output_file = "gray.jpg";

	uchar4 *h_rgbaImage, *d_rgbaImage;
	unsigned char *h_greyImage, *d_greyImage;

	//load the image and give us our input and output pointers
	preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

	const size_t numPixels = numRows()*numCols();

	// Each block is thread x thread = 256 threads; the kernel expects a 1-D
	// grid of such blocks.
	const int thread = 16;
	const size_t threadsPerBlock = (size_t)thread * thread;
	// Ceil-divide by the full block size. The previous formula added only
	// (thread - 1) before dividing by thread * thread, which rounded down for
	// most image sizes and left the last pixels unconverted.
	const unsigned int grid = (unsigned int)((numPixels + threadsPerBlock - 1) / threadsPerBlock);
	const dim3 blockSize(thread, thread);
	const dim3 gridSize(grid);
	rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows(), numCols());

	// A launch returns no status: configuration errors are reported by
	// cudaGetLastError(), execution errors by the synchronizing call.
	checkCudaErrors(cudaGetLastError());
	checkCudaErrors(cudaDeviceSynchronize());

	checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost));

	//check results and output the grey image
	postProcess(output_file, h_greyImage);

	cleanup();

	return 0;
}

三、高斯模糊

Ubuntu上的編譯命令（編譯後執行 ./blur；輸入圖片 123.jpg 需放在當前目錄）：

nvcc -o blur blur.cu -lopencv_highgui -lopencv_core -lopencv_imgcodecs -lopencv_imgproc

#include <iostream>
#include <string>
#include <cassert>

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>

// Wraps a CUDA call: forwards its result to check() together with the
// call's source text (#val) and the file/line of the call site, so a failure
// is reported with the exact expression that failed.
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)

// Host images. imageInputRGBA receives the RGBA conversion of the loaded file
// and imageOutputRGBA is created as a 4-channel 8-bit image of the same size;
// both are set up in preProcess().
cv::Mat imageInputRGBA;
cv::Mat imageOutputRGBA;

// Copies of the device image pointers handed out by preProcess(), kept at
// file scope so that cleanup() can free them.
uchar4 *d_inputImageRGBA__;
uchar4 *d_outputImageRGBA__;

// Copy of the host filter pointer allocated with new[] in preProcess();
// released with delete[] in cleanup().
float *h_filter__;

// Height, in rows, of the currently loaded RGBA input image.
size_t numRows()
{
	return static_cast<size_t>(imageInputRGBA.rows);
}

// Width, in columns, of the currently loaded RGBA input image.
size_t numCols()
{
	return static_cast<size_t>(imageInputRGBA.cols);
}

// Backend of the checkCudaErrors macro.
//   err  - status returned by a CUDA call
//   func - source text of the call, printed after the error string
//   file - source file of the call site
//   line - source line of the call site
// Returns normally when err equals cudaSuccess; otherwise prints the location
// and the CUDA error string to std::cerr and terminates with exit code 1.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
	const bool failed = (err != cudaSuccess);
	if (!failed)
		return;

	std::cerr << "CUDA error at: " << file << ":" << line << std::endl
		<< cudaGetErrorString(err) << " " << func << std::endl;
	exit(1);
}

// Loads the input picture, builds the Gaussian filter and prepares the
// host/device buffers used by the blur pipeline.
//   h_inputImageRGBA  - out: host pointer to the RGBA input (storage owned by imageInputRGBA)
//   h_outputImageRGBA - out: host pointer to the RGBA output (storage owned by imageOutputRGBA)
//   d_inputImageRGBA  - out: device buffer holding a copy of the input pixels
//   d_outputImageRGBA - out: device buffer for the result, zero-filled
//   d_redBlurred, d_greenBlurred, d_blueBlurred
//                     - out: zero-filled device buffers, one byte per pixel each
//   h_filter          - out: host array of filterWidth * filterWidth normalized
//                       weights, allocated with new[] (also stored in h_filter__)
//   filterWidth       - out: side length of the square filter (9)
//   filename          - path of the image to read
// Exits with code 1 if the file cannot be read or if either cv::Mat is not
// stored contiguously; any failing CUDA call terminates through checkCudaErrors.
void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA,
	uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA,
	unsigned char **d_redBlurred,
	unsigned char **d_greenBlurred,
	unsigned char **d_blueBlurred,
	float **h_filter, int *filterWidth,
	const std::string &filename) {
	//make sure the context initializes ok
	checkCudaErrors(cudaFree(0));

	cv::Mat image = cv::imread(filename.c_str());
	if (image.empty()) {
		std::cerr << "Couldn't open file: " << filename << std::endl;
		exit(1);
	}

	// Convert into the file-scope imageInputRGBA, whose size numRows()/numCols() report.
	cv::cvtColor(image, imageInputRGBA, cv::COLOR_BGR2RGBA);

	//allocate memory for the output
	imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);

	//This shouldn't ever happen given the way the images are created
	//at least based upon my limited understanding of OpenCV, but better to check
	// (the raw row-0 pointers taken below are later copied as one flat block)
	if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) {
		std::cerr << "Images aren't continuous!! Exiting." << std::endl;
		exit(1);
	}

	// Reinterpret the 4-channel byte rows as uchar4 pixels.
	*h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0);
	*h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0);

	const size_t numPixels = numRows() * numCols();
	//allocate memory on the device for both input and output
	checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels));
	checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels));
	checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around

	//copy input array to the GPU
	checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));

	// Remember the device pointers so cleanup() can release them.
	d_inputImageRGBA__ = *d_inputImageRGBA;
	d_outputImageRGBA__ = *d_outputImageRGBA;

	//now create the filter that will be used for the blur
	const int blurKernelWidth = 9;
	const float blurKernelSigma = 2.;

	*filterWidth = blurKernelWidth;

	//create and fill the filter we will convolve with
	*h_filter = new float[blurKernelWidth * blurKernelWidth];
	h_filter__ = *h_filter;

	float filterSum = 0.f; //for normalization

	// r and c are offsets from the filter centre; each weight is
	// exp(-(c^2 + r^2) / (2 * sigma^2)), stored row-major.
	for (int r = -blurKernelWidth / 2; r <= blurKernelWidth / 2; ++r) {
		for (int c = -blurKernelWidth / 2; c <= blurKernelWidth / 2; ++c) {
			float filterValue = expf(-(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma));
			(*h_filter)[(r + blurKernelWidth / 2) * blurKernelWidth + c + blurKernelWidth / 2] = filterValue;
			filterSum += filterValue;
		}
	}

	float normalizationFactor = 1.f / filterSum;

	// Scale every weight by 1 / filterSum so the weights sum to one.
	for (int r = -blurKernelWidth / 2; r <= blurKernelWidth / 2; ++r) {
		for (int c = -blurKernelWidth / 2; c <= blurKernelWidth / 2; ++c) {
			(*h_filter)[(r + blurKernelWidth / 2) * blurKernelWidth + c + blurKernelWidth / 2] *= normalizationFactor;
		}
	}

	//per-channel buffers that receive the blurred result, zero-filled
	checkCudaErrors(cudaMalloc(d_redBlurred, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMalloc(d_greenBlurred, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMalloc(d_blueBlurred, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(*d_redBlurred, 0, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
	checkCudaErrors(cudaMemset(*d_blueBlurred, 0, sizeof(unsigned char) * numPixels));
	// Repeats the cudaFree(0) call made at the top of this function.
	checkCudaErrors(cudaFree(0));

}

// Convolves one colour channel with a square filter, one output pixel per thread.
//   inputChannel  - numRows x numCols source bytes, row-major
//   outputChannel - destination of the same size
//   filter        - filterWidth x filterWidth weights, row-major
// Expects a 2-D launch where x indexes columns and y indexes rows; threads
// that fall outside the image return immediately. Samples that would fall
// outside the image are clamped to the nearest edge pixel.
__global__
void gaussian_blur(const unsigned char* const inputChannel,
	unsigned char* const outputChannel,
	int numRows, int numCols,
	const float* const filter, const int filterWidth)
{
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	if (col >= numCols || row >= numRows)
		return;

	const int halfWidth = filterWidth / 2;
	float accum = 0.0f;
	for (int fy = 0; fy < filterWidth; ++fy) {
		// Clamp the sampled row into [0, numRows - 1].
		const int srcRow = min(max(row + fy - halfWidth, 0), numRows - 1);
		for (int fx = 0; fx < filterWidth; ++fx) {
			// Clamp the sampled column into [0, numCols - 1].
			const int srcCol = min(max(col + fx - halfWidth, 0), numCols - 1);
			const float weight = filter[fy * filterWidth + fx];
			accum += weight * static_cast<float>(inputChannel[srcRow * numCols + srcCol]);
		}
	}
	// The float sum is converted to unsigned char on store.
	outputChannel[row * numCols + col] = accum;
}

// Splits an RGBA image into three single-channel images (alpha is dropped).
// One thread handles one pixel; expects a 2-D launch where x indexes columns
// and y indexes rows. Threads mapped outside the numRows x numCols image
// touch no memory.
__global__
void separateChannels(const uchar4* const inputImageRGBA,
	int numRows,
	int numCols,
	unsigned char* const redChannel,
	unsigned char* const greenChannel,
	unsigned char* const blueChannel)
{
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	if (col < numCols && row < numRows)
	{
		const int idx = row * numCols + col;
		const uchar4* const pixel = inputImageRGBA + idx;
		redChannel[idx] = pixel->x;
		greenChannel[idx] = pixel->y;
		blueChannel[idx] = pixel->z;
	}
}

// Merges three single-channel images back into one RGBA image. Alpha is
// written as 255 (fully opaque). One thread handles one pixel; expects a 2-D
// launch where x indexes columns and y indexes rows. Threads mapped outside
// the numRows x numCols image return without touching memory.
__global__
void recombineChannels(const unsigned char* const redChannel,
	const unsigned char* const greenChannel,
	const unsigned char* const blueChannel,
	uchar4* const outputImageRGBA,
	int numRows,
	int numCols)
{
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	if (col >= numCols || row >= numRows)
		return;

	const int idx = row * numCols + col;
	outputImageRGBA[idx] = make_uchar4(redChannel[idx], greenChannel[idx], blueChannel[idx], 255);
}

// Device buffers for the three unblurred colour channels and for the filter
// weights; allocated by allocateMemoryAndCopyToGPU().
unsigned char *d_red, *d_green, *d_blue;
float         *d_filter;

// Allocates the per-channel device buffers and uploads the filter.
//   numRowsImage, numColsImage - image size; each channel buffer holds
//                                numRowsImage * numColsImage bytes
//   h_filter                   - host array of filterWidth * filterWidth floats
//   filterWidth                - side length of the square filter
// Results are stored in the file-scope pointers d_red, d_green, d_blue and
// d_filter. Any failing CUDA call terminates through checkCudaErrors.
void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsImage,
	const float* const h_filter, const size_t filterWidth)
{

	//allocate memory for the three different channels
	//original
	checkCudaErrors(cudaMalloc(&d_red, sizeof(unsigned char) * numRowsImage * numColsImage));
	checkCudaErrors(cudaMalloc(&d_green, sizeof(unsigned char) * numRowsImage * numColsImage));
	checkCudaErrors(cudaMalloc(&d_blue, sizeof(unsigned char) * numRowsImage * numColsImage));

	//Allocate device memory for the filter.
	//Note that cudaMalloc takes a pointer to the pointer it fills in.
	checkCudaErrors(cudaMalloc(&d_filter, sizeof(float) * filterWidth * filterWidth));
	//Copy the host filter (h_filter) into the device buffer just allocated.
	checkCudaErrors(cudaMemcpy(d_filter, h_filter, sizeof(float) * filterWidth * filterWidth, cudaMemcpyHostToDevice));

}

// Writes the blurred result to disk.
//   output_file - destination path handed to cv::imwrite
//   data_ptr    - host buffer of numRows() x numCols() RGBA pixels; it is
//                 wrapped in a cv::Mat header (not copied) and converted to
//                 BGR before being written
void postProcess(const std::string& output_file, uchar4* data_ptr) {
	const int rows = static_cast<int>(numRows());
	const int cols = static_cast<int>(numCols());
	cv::Mat rgbaView(rows, cols, CV_8UC4, static_cast<void*>(data_ptr));

	cv::Mat bgr;
	cv::cvtColor(rgbaView, bgr, cv::COLOR_RGBA2BGR);
	cv::imwrite(output_file.c_str(), bgr);
}

// Releases the resources tracked at file scope: the input/output device
// images recorded by preProcess(), the channel and filter buffers allocated by
// allocateMemoryAndCopyToGPU(), and the host filter array.
// The cudaFree results are not inspected.
void cleanup() {
	cudaFree(d_inputImageRGBA__);
	cudaFree(d_outputImageRGBA__);

	// These four buffers were previously never freed. The pointers are
	// zero-initialized globals, so freeing them is a no-op if
	// allocateMemoryAndCopyToGPU() was never called.
	cudaFree(d_red);
	cudaFree(d_green);
	cudaFree(d_blue);
	cudaFree(d_filter);

	delete[] h_filter__;
}

// Gaussian blur demo driver: reads "123.jpg", blurs each colour channel on
// the GPU and writes "blur.jpg". The command line is currently unused (the
// argv-based paths are left commented out). Any CUDA failure terminates the
// program through checkCudaErrors.
int main(int argc, char* argv[]) {

	//load input file
	//std::string input_file = argv[1];
	std::string input_file = "123.jpg";
	//define output file
	//std::string output_file = argv[2];
	std::string output_file = "blur.jpg";

	uchar4 *h_inputImageRGBA, *d_inputImageRGBA;
	uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
	unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

	float *h_filter;
	int    filterWidth;

	//load the image and give us our input and output pointers
	preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
		&d_redBlurred, &d_greenBlurred, &d_blueBlurred,
		&h_filter, &filterWidth, input_file);

	allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);

	// 2-D launch: x covers columns, y covers rows. Ceil-division gives just
	// enough blocks to cover the image (the kernels bounds-check the tail),
	// without the extra block that "n / 16 + 1" adds when n is a multiple of 16.
	const dim3 blockSize(16, 16);
	const dim3 gridSize((numCols() + blockSize.x - 1) / blockSize.x,
		(numRows() + blockSize.y - 1) / blockSize.y);

	// All kernels below are launched back to back without an explicit stream,
	// so they execute in issue order and need no device-wide sync in between.
	// A launch returns no status; cudaGetLastError() reports a bad configuration.

	//Split the RGBA image into separate colour channels
	separateChannels<<<gridSize, blockSize>>>(d_inputImageRGBA,
		numRows(),
		numCols(),
		d_red,
		d_green,
		d_blue);
	checkCudaErrors(cudaGetLastError());

	//Blur each colour channel with the same filter
	gaussian_blur<<<gridSize, blockSize>>>(d_red,
		d_redBlurred,
		numRows(),
		numCols(),
		d_filter,
		filterWidth);
	checkCudaErrors(cudaGetLastError());
	gaussian_blur<<<gridSize, blockSize>>>(d_green,
		d_greenBlurred,
		numRows(),
		numCols(),
		d_filter,
		filterWidth);
	checkCudaErrors(cudaGetLastError());
	gaussian_blur<<<gridSize, blockSize>>>(d_blue,
		d_blueBlurred,
		numRows(),
		numCols(),
		d_filter,
		filterWidth);
	checkCudaErrors(cudaGetLastError());

	//Recombine the blurred channels into one RGBA image
	recombineChannels<<<gridSize, blockSize>>>(d_redBlurred,
		d_greenBlurred,
		d_blueBlurred,
		d_outputImageRGBA,
		numRows(),
		numCols());
	checkCudaErrors(cudaGetLastError());

	// Wait for the whole pipeline; errors raised while the kernels were
	// executing are reported here instead of being dropped.
	checkCudaErrors(cudaDeviceSynchronize());

	size_t numPixels = numRows()*numCols();
	//copy the output back to the host
	checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));

	postProcess(output_file, h_outputImageRGBA);

	checkCudaErrors(cudaFree(d_redBlurred));
	checkCudaErrors(cudaFree(d_greenBlurred));
	checkCudaErrors(cudaFree(d_blueBlurred));

	cleanup();

	return 0;
}

cuda編程（4）：常見的例程

一、直方圖

二、彩色轉灰度

三、高斯模糊

cuda編程（7）：實現LK稀疏光流算法--完整的cuda程序

cuda編程（6）：實現knn算法

cuda編程可以使用的庫函數

cuda編程（5）：優化理論

cuda編程（4）：常見的例程

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結