一、原理

二、算法的假設（其實這樣的條件比較苛刻）

三、具體實現

main.cc

#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "cudaLK.h"
#include <stdio.h>
#include <time.h>
#include <omp.h>
#include <vector>

using namespace std;
using namespace cv;

int main(int argc, char **argv)
{
	//if (argc != 3) {
	//	printf("./cudaLK img1.png img2.png\n");
	//	return 0;
	//}

	//Mat img1 = imread(argv[1]);
	//Mat img2 = imread(argv[2]);

	Mat img1 = imread("img1.jpg");
	Mat img2 = imread("img2.jpg");

	if (img1.empty() || img2.empty()) {
		printf("Error loading images\n");
		return 1;
	}

	cudaLK LK;

	LK.run(img1.data, img2.data, img1.size().width, img2.size().height);

	Mat clone = img1.clone();
	Mat grey1 = Mat(img1.rows, img1.cols, CV_8UC1);
	cvtColor(img1, grey1, COLOR_BGR2GRAY);
	cvtColor(grey1, clone, COLOR_GRAY2BGR);

	for (int y = 0; y < img1.size().height; y += 16) {
		for (int x = 0; x < img1.size().width; x += 16) {
			int idx = y * img1.size().width + x;

			if (LK.status[idx] == 0)
				continue;

			line(clone, Point(x, y), Point(x + LK.dx[idx], y + LK.dy[idx]), Scalar(255, 0, 0));
		}
	}

	imwrite("cudaLK.png", clone);

	return 0;
}

cudaLK.h

#ifndef CUDALK_H
#define CUDALK_H

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <time.h>

#define LEVELS 3
#define PATCH_R 6 // patch radius, patch size is (2*PATCH_R+1)*(2*PATCH_R+1)
#define NTHREAD_X 16
#define NTHREAD_Y 16

// Pyramidal Lucas-Kanade optical flow on the GPU.
//
// run() uploads two interleaved 3-byte-per-pixel frames, builds LEVELS-deep
// greyscale pyramids and tracks every pixel. Results are left in the public
// host arrays dx, dy and status (each w*h elements, row-major).
class cudaLK
{
public:
	cudaLK();
	~cudaLK();

	// prev/cur: host pointers to w*h*3 bytes each; w/h: frame dimensions in pixels.
	void run(unsigned char *prev, unsigned char *cur, int w, int h);

	float *dx, *dy;                    // per-pixel displacement (host copies)
	char *status;                      // 0 where tracking was abandoned for the pixel
	int pyr_w[LEVELS], pyr_h[LEVELS];  // dimensions of each pyramid level

private:
	// The object owns raw host and device allocations that the destructor
	// releases, so copying it would free them twice. Declared but not defined
	// to forbid copies.
	cudaLK(const cudaLK &);
	cudaLK &operator=(const cudaLK &);

	void initMem();
	void checkCUDAError(const char *msg);

	int w, h;

	unsigned char *gpu_img_prev_RGB;
	unsigned char *gpu_img_cur_RGB;

	float *gpu_img_pyramid_prev[LEVELS];
	float *gpu_img_pyramid_cur[LEVELS];
	float *gpu_smoothed_prev_x;
	float *gpu_smoothed_cur_x;
	float *gpu_smoothed_prev;
	float *gpu_smoothed_cur;

	cudaArray *gpu_array_pyramid_prev;
	//cudaArray *gpu_array_pyramid_prev_Ix;
	//cudaArray *gpu_array_pyramid_prev_Iy;
	cudaArray *gpu_array_pyramid_cur;
	float *gpu_dx, *gpu_dy;
	char *gpu_status;

	// Time reported by clock(), converted to milliseconds.
	// clock() counts in units of 1/CLOCKS_PER_SEC seconds, so the raw tick
	// count is only in milliseconds when CLOCKS_PER_SEC happens to be 1000;
	// callers print this value with an "ms" suffix.
	inline int getTimeNow() {
		return (int)((double)clock() * 1000.0 / CLOCKS_PER_SEC);
	}
};

#endif

cudaLK.cu

#include "cudaLK.h"
#include <stdio.h>

// Coordinate scale factor per pyramid level: run() passes scaling[l] to track(),
// which multiplies full-resolution pixel coordinates by it. run() only indexes
// this table with l in [0, LEVELS - 1].
const float scaling[] = { 1, 0.5f, 0.25f, 0.125f, 0.0625f, 0.03125f, 0.015625f, 0.0078125f };

// Can't use an array of texture<> !! so we'll just re-use the one texture buffer for each image
// (run() copies the current pyramid level of each frame into the arrays bound
// to these two references before every track() launch).
texture<float, 2, cudaReadModeElementType> texRef_pyramid_prev;
texture<float, 2, cudaReadModeElementType> texRef_pyramid_cur;

// Converts an interleaved 3-byte-per-pixel image into single-channel float
// greyscale.
//
// Launch: 1D grid, one thread per pixel; threads whose index is >= N do nothing.
//   d_in  - N*3 input bytes
//   d_out - N output floats
// Channel 0 is weighted 0.1144, channel 1 0.5867 and channel 2 0.2989.
__global__ void convertToGrey(unsigned char *d_in, float *d_out, int N)
{
	const int pixel = blockIdx.x*blockDim.x + threadIdx.x;

	if (pixel >= N)
		return;

	const unsigned char *src = d_in + pixel * 3;
	d_out[pixel] = src[0] * 0.1144f + src[1] * 0.5867f + src[2] * 0.2989f;
}

// Produces one pyramid level from the level above it by sampling every second
// pixel through a 3x3 weighted window (centre 0.25, edge neighbours 0.125,
// corner neighbours 0.0625).
//
// Launch: 2D grid, one thread per output pixel; threads outside w2 x h2 do nothing.
//   in  - greyscale input, w1 x h1, row-major
//   out - output, w2 x h2, row-major
// Neighbours that fall outside the input are clamped to the nearest border pixel.
__global__ void pyrDownsample(float *in, int w1, int h1, float *out, int w2, int h2)
{
	const int outX = blockIdx.x*blockDim.x + threadIdx.x;
	const int outY = blockIdx.y*blockDim.y + threadIdx.y;

	if (outX >= w2 || outY >= h2)
		return;

	// Centre of the window in input coordinates
	const int cx = outX * 2;
	const int cy = outY * 2;

	// Clamped neighbour coordinates
	const int left = (cx - 1 < 0) ? 0 : cx - 1;
	const int up = (cy - 1 < 0) ? 0 : cy - 1;
	const int right = (cx + 1 >= w1) ? w1 - 1 : cx + 1;
	const int down = (cy + 1 >= h1) ? h1 - 1 : cy + 1;

	const float *rowUp = in + up * w1;
	const float *rowMid = in + cy * w1;
	const float *rowDown = in + down * w1;

	const float centre = rowMid[cx];
	const float edges = rowMid[left] + rowMid[right] + rowUp[cx] + rowDown[cx];
	const float corners = rowUp[left] + rowDown[left] + rowUp[right] + rowDown[right];

	out[outY*w2 + outX] = 0.25f*centre + 0.125f*edges +
		0.0625f*corners;
}


// Horizontal pass of a separable 5-tap smoothing filter with weights
// 1/16, 4/16, 6/16, 4/16, 1/16.
//
// Launch: 2D grid, one thread per pixel; threads outside w x h do nothing.
//   in  - input image, w x h, row-major
//   out - output image, same layout (must not alias in)
// Taps that fall outside the row are clamped to the nearest border pixel.
__global__ void smoothX(float *in, int w, int h, float *out)
{
	int x = blockIdx.x*blockDim.x + threadIdx.x;
	int y = blockIdx.y*blockDim.y + threadIdx.y;

	if (x >= w || y >= h)
		return;

	const float *row = in + y * w;

	int a = x - 2;
	int b = x - 1;
	int d = x + 1;
	int e = x + 2;

	// Clamp both sides. The original clamped the centre tap (which can never
	// be out of range) and left x + 2 unclamped, reading past the end of the
	// row and, on the last row, past the end of the buffer.
	if (a < 0) a = 0;
	if (b < 0) b = 0;
	if (d >= w) d = w - 1;
	if (e >= w) e = w - 1;

	out[y*w + x] = 0.0625f*row[a] + 0.25f*row[b] + 0.375f*row[x] + 0.25f*row[d] + 0.0625f*row[e];
}

// Vertical pass of a separable 5-tap smoothing filter with weights
// 1/16, 4/16, 6/16, 4/16, 1/16.
//
// Launch: 2D grid, one thread per pixel; threads outside w x h do nothing.
//   in  - input image, w x h, row-major
//   out - output image, same layout (must not alias in)
// Taps that fall outside the column are clamped to the nearest border pixel.
__global__ void smoothY(float *in, int w, int h, float *out)
{
	int x = blockIdx.x*blockDim.x + threadIdx.x;
	int y = blockIdx.y*blockDim.y + threadIdx.y;

	if (x >= w || y >= h)
		return;

	int a = y - 2;
	int b = y - 1;
	int d = y + 1;
	int e = y + 2;

	// Clamp both sides. The original clamped the centre tap (which can never
	// be out of range) and left y + 2 unclamped, reading rows h and h + 1,
	// which lie beyond the image.
	if (a < 0) a = 0;
	if (b < 0) b = 0;
	if (d >= h) d = h - 1;
	if (e >= h) e = h - 1;

	out[y*w + x] = 0.0625f*in[a*w + x] + 0.25f*in[b*w + x] + 0.375f*in[y*w + x] + 0.25f*in[d*w + x] + 0.0625f*in[e*w + x];
}

// Lucas-Kanade tracking step for one pyramid level.
//
// Launched from the host once per pyramid level, coarsest level first. Each
// thread handles the full-resolution pixel (x, y) and reads/writes entry
// y * w + x of dx, dy and status.
//
//   w, h         - original dimension of image (also the extent of dx/dy/status)
//   pyr_w, pyr_h - bounds used to reject a tracked position at this level
//   scaling      - factor that maps full-resolution coordinates to this level
//   level        - pyramid level index; for any level other than 0 the stored
//                  displacement is doubled before it is written back
//   initGuess    - non-zero: start from zero displacement;
//                  zero: start from the displacement already in dx/dy
//   dx, dy       - per-pixel displacement, read (unless initGuess) and written
//   status       - per-pixel flag; pixels whose flag is 0 are skipped, and the
//                  flag is cleared when tracking is abandoned
//
// Image data is fetched from texRef_pyramid_prev and texRef_pyramid_cur.

__global__ void track(const int w, const int h,
	const int pyr_w, const int pyr_h,
	float scaling, int level, char initGuess,
	float *dx, float *dy, char *status)
{
	int x = blockIdx.x*blockDim.x + threadIdx.x;
	int y = blockIdx.y*blockDim.y + threadIdx.y;
	int idx = y * w + x;

	// Threads outside the image do nothing (idx is not dereferenced before this check)
	if (x > w - 1 || y > h - 1)
		return;

	// Pixel was already abandoned
	if (status[idx] == 0)
		return;

	// Position of this pixel in the coordinates of the current level
	float prev_x = x * scaling;
	float prev_y = y * scaling;

	float Vx, Vy;
	float cur_x, cur_y;
	float sum_Ixx = 0;
	float sum_Ixy = 0;
	float sum_Iyy = 0;
	float sum_Ixt;
	float sum_Iyt;
	float Ix, Iy, It;
	int xx, yy;
	float det, D;
	float I, J;
	float vx, vy;
	int j;

	if (initGuess) {
		Vx = 0;
		Vy = 0;
		cur_x = prev_x;
		cur_y = prev_y;
	}
	else {
		// Continue from the displacement stored for this pixel
		Vx = dx[idx];
		Vy = dy[idx];
		cur_x = prev_x + Vx;
		cur_y = prev_y + Vy;
	}

	// Calculate spatial gradient:
	// accumulate the sums of Ix*Ix, Ix*Iy and Iy*Iy over the
	// (2*PATCH_R+1) x (2*PATCH_R+1) patch, with Ix/Iy taken as central
	// differences of the previous image.
	for (yy = -PATCH_R; yy <= PATCH_R; yy++) {
		for (xx = -PATCH_R; xx <= PATCH_R; xx++) {
			Ix = (tex2D(texRef_pyramid_prev, prev_x + xx + 1, prev_y + yy) - tex2D(texRef_pyramid_prev, prev_x + xx - 1, prev_y + yy))*0.5f;
			Iy = (tex2D(texRef_pyramid_prev, prev_x + xx, prev_y + yy + 1) - tex2D(texRef_pyramid_prev, prev_x + xx, prev_y + yy - 1))*0.5f;

			sum_Ixx += Ix * Ix;
			sum_Ixy += Ix * Iy;
			sum_Iyy += Iy * Iy;
		}
	}

	// Determinant of the 2x2 matrix [sum_Ixx sum_Ixy; sum_Ixy sum_Iyy]
	det = sum_Ixx * sum_Iyy - sum_Ixy * sum_Ixy;

	// Matrix is (close to) singular: give up on this pixel
	if (det < 0.00001f) {
		status[idx] = 0;
		return;
	}

	D = 1 / det;

	// Iteration part: at most 10 refinement steps
	for (j = 0; j < 10; j++) {
		// Tracked position left [0, pyr_w] x [0, pyr_h]: give up on this pixel
		if (cur_x < 0 || cur_x > pyr_w || cur_y < 0 || cur_y > pyr_h) {
			status[idx] = 0;
			return;
		}

		sum_Ixt = 0;
		sum_Iyt = 0;

		// No explicit handling of pixels outside the image ... maybe we don't have to because the hardware interpolation scheme
		// will always give a result for pixels outside the image. How greatly the duplicated pixel values affect the result is unknown at the moment.
		for (yy = -PATCH_R; yy <= PATCH_R; yy++) {
			for (xx = -PATCH_R; xx <= PATCH_R; xx++) {
				I = tex2D(texRef_pyramid_prev, prev_x + xx, prev_y + yy);
				J = tex2D(texRef_pyramid_cur, cur_x + xx, cur_y + yy);

				// NOTE(review): Ix/Iy depend only on the previous image and prev_x/prev_y,
				// so the same values are recomputed on every iteration.
				Ix = (tex2D(texRef_pyramid_prev, prev_x + xx + 1, prev_y + yy) - tex2D(texRef_pyramid_prev, prev_x + xx - 1, prev_y + yy))*0.5f;
				Iy = (tex2D(texRef_pyramid_prev, prev_x + xx, prev_y + yy + 1) - tex2D(texRef_pyramid_prev, prev_x + xx, prev_y + yy - 1))*0.5f;

				// Difference between the current and previous image at the patch sample
				It = J - I;

				sum_Ixt += Ix * It;
				sum_Iyt += Iy * It;
			}
		}

		// Find the inverse of the 2x2 matrix using a mix of determinant and adjugate matrix
		// http://cnx.org/content/m19446/latest/
		vx = D * (-sum_Iyy * sum_Ixt + sum_Ixy * sum_Iyt);
		vy = D * (sum_Ixy*sum_Ixt - sum_Ixx * sum_Iyt);

		// Apply the update to both the accumulated displacement and the tracked position
		Vx += vx;
		Vy += vy;
		cur_x += vx;
		cur_y += vy;

		// Movement very small
		if (fabsf(vx) < 0.01f && fabsf(vy) < 0.01f)
			break;
	}

	// Not the last level: double the displacement before storing it, so the
	// next launch reads it back through dx/dy as its starting guess.
	if (level != 0) {
		// NOTE(review): cur_x/cur_y are not read after this point.
		cur_x += cur_x;
		cur_y += cur_y;

		Vx += Vx;
		Vy += Vy;
	}

	dx[idx] = Vx;
	dy[idx] = Vy;
}

// Puts the object into a well-defined empty state.
//
// Every owned pointer starts out null so that the destructor, which frees all
// of them unconditionally, is safe even when run() was never called
// (cudaFree, cudaFreeArray and delete[] all accept a null pointer).
cudaLK::cudaLK()
	: dx(NULL), dy(NULL), status(NULL),
	w(0), h(0),
	gpu_img_prev_RGB(NULL), gpu_img_cur_RGB(NULL),
	gpu_smoothed_prev_x(NULL), gpu_smoothed_cur_x(NULL),
	gpu_smoothed_prev(NULL), gpu_smoothed_cur(NULL),
	gpu_array_pyramid_prev(NULL), gpu_array_pyramid_cur(NULL),
	gpu_dx(NULL), gpu_dy(NULL), gpu_status(NULL)
{
	for (int i = 0; i < LEVELS; i++) {
		pyr_w[i] = 0;
		pyr_h[i] = 0;
		gpu_img_pyramid_prev[i] = NULL;
		gpu_img_pyramid_cur[i] = NULL;
	}
}

// Releases every device and host buffer allocated by initMem().
cudaLK::~cudaLK()
{
	// Uploaded input frames (these two were allocated but never freed before)
	cudaFree(gpu_img_prev_RGB);
	cudaFree(gpu_img_cur_RGB);

	// Greyscale pyramids
	for (int i = 0; i < LEVELS; i++) {
		cudaFree(gpu_img_pyramid_prev[i]);
		cudaFree(gpu_img_pyramid_cur[i]);
	}

	// Scratch buffers of the smoothing passes
	cudaFree(gpu_smoothed_prev_x);
	cudaFree(gpu_smoothed_cur_x);
	cudaFree(gpu_smoothed_prev);
	cudaFree(gpu_smoothed_cur);

	// Arrays backing the two textures
	cudaFreeArray(gpu_array_pyramid_prev);
	//cudaFreeArray(gpu_array_pyramid_prev_Ix);
	//cudaFreeArray(gpu_array_pyramid_prev_Iy);  // this one raises an error; commenting it out avoids it
	cudaFreeArray(gpu_array_pyramid_cur);

	// Tracking results on the device
	cudaFree(gpu_dx);
	cudaFree(gpu_dy);
	cudaFree(gpu_status);

	// Host copies of the results
	delete[] dx;
	delete[] dy;
	delete[] status;
}

void cudaLK::checkCUDAError(const char *msg) {
	cudaError_t err = cudaGetLastError();
	if (cudaSuccess != err) {
		fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

void cudaLK::initMem()
{
	cudaMalloc((void**)&gpu_img_prev_RGB, sizeof(char)*w*h * 3);
	cudaMalloc((void**)&gpu_img_cur_RGB, sizeof(char)*w*h * 3);
	cudaMalloc((void**)&gpu_img_pyramid_prev[0], sizeof(float)*w*h);
	cudaMalloc((void**)&gpu_img_pyramid_cur[0], sizeof(float)*w*h);

	cudaMalloc((void**)&gpu_smoothed_prev_x, sizeof(float)*w*h);
	cudaMalloc((void**)&gpu_smoothed_cur_x, sizeof(float)*w*h);
	cudaMalloc((void**)&gpu_smoothed_prev, sizeof(float)*w*h);
	cudaMalloc((void**)&gpu_smoothed_cur, sizeof(float)*w*h);

	// Texture
	cudaMallocArray(&gpu_array_pyramid_prev, &texRef_pyramid_prev.channelDesc, w, h);
	cudaMallocArray(&gpu_array_pyramid_cur, &texRef_pyramid_cur.channelDesc, w, h);
	cudaBindTextureToArray(texRef_pyramid_prev, gpu_array_pyramid_prev);
	cudaBindTextureToArray(texRef_pyramid_cur, gpu_array_pyramid_cur);

	texRef_pyramid_prev.normalized = 0;
	texRef_pyramid_prev.filterMode = cudaFilterModeLinear;
	texRef_pyramid_prev.addressMode[0] = cudaAddressModeClamp;
	texRef_pyramid_prev.addressMode[1] = cudaAddressModeClamp;

	texRef_pyramid_cur.normalized = 0;
	texRef_pyramid_cur.filterMode = cudaFilterModeLinear;
	texRef_pyramid_cur.addressMode[0] = cudaAddressModeClamp;
	texRef_pyramid_cur.addressMode[1] = cudaAddressModeClamp;

	cudaMalloc((void**)&gpu_dx, sizeof(float)*w*h);
	cudaMalloc((void**)&gpu_dy, sizeof(float)*w*h);
	cudaMalloc((void**)&gpu_status, sizeof(char)*w*h);

	int _w = w;
	int _h = h;

	dx = new float[w*h];
	dy = new float[w*h];
	status = new char[w*h];

	pyr_w[0] = w;
	pyr_h[0] = h;

	for (int i = 1; i < LEVELS; i++) {
		_w /= 2;
		_h /= 2;
		pyr_w[i] = _w;
		pyr_h[i] = _h;

		cudaMalloc((void**)&gpu_img_pyramid_prev[i], sizeof(float)*_w*_h);
		cudaMalloc((void**)&gpu_img_pyramid_cur[i], sizeof(float)*_w*_h);
	}
}

void cudaLK::run(unsigned char *prev, unsigned char *cur, int _w, int _h)
{
	;
	w = _w;
	h = _h;
	initMem();

	int nThreadsX = NTHREAD_X;
	int nThreadsY = NTHREAD_Y;

	int blocksW = w / nThreadsX + ((w % nThreadsX) ? 1 : 0);
	int blocksH = h / nThreadsY + ((h % nThreadsY) ? 1 : 0);
	dim3 blocks(blocksW, blocksH);
	dim3 threads(nThreadsX, nThreadsY);
	int blocks1D = (w*h) / 256 + (w*h % 256 ? 1 : 0); // for greyscale

	int start = getTimeNow();
	int s;

	// Copy image to GPU 
	s = getTimeNow();
	cudaMemcpy(gpu_img_prev_RGB, prev, w*h * 3, cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_img_cur_RGB, cur, w*h * 3, cudaMemcpyHostToDevice);
	checkCUDAError("start");

	printf("Copying 2 images from CPU to GPU: %d ms\n", getTimeNow() - s);

	// RGB -> grey
	s = getTimeNow();
	convertToGrey << <blocks1D, 256 >> > (gpu_img_prev_RGB, gpu_img_pyramid_prev[0], w*h);
	convertToGrey << <blocks1D, 256 >> > (gpu_img_cur_RGB, gpu_img_pyramid_cur[0], w*h);
	cudaThreadSynchronize();
	checkCUDAError("convertToGrey");
	printf("Converting from RGB to greyscale: %d ms\n", getTimeNow() - s);


	s = getTimeNow();

	for (int i = 0; i < LEVELS - 1; i++) {
		smoothX << <blocks, threads >> > (gpu_img_pyramid_prev[i], pyr_w[i], pyr_h[i], gpu_smoothed_prev_x);
		smoothX << <blocks, threads >> > (gpu_img_pyramid_cur[i], pyr_w[i], pyr_h[i], gpu_smoothed_cur_x);
		cudaThreadSynchronize();
		smoothY << <blocks, threads >> > (gpu_smoothed_prev_x, pyr_w[i], pyr_h[i], gpu_smoothed_prev);
		smoothY << <blocks, threads >> > (gpu_smoothed_cur_x, pyr_w[i], pyr_h[i], gpu_smoothed_cur);
		cudaThreadSynchronize();

		pyrDownsample << <blocks, threads >> > (gpu_smoothed_prev, pyr_w[i], pyr_h[i], gpu_img_pyramid_prev[i + 1], pyr_w[i + 1], pyr_h[i + 1]);
		pyrDownsample << <blocks, threads >> > (gpu_smoothed_cur, pyr_w[i], pyr_h[i], gpu_img_pyramid_cur[i + 1], pyr_w[i + 1], pyr_h[i + 1]);
		cudaThreadSynchronize();

		checkCUDAError("pyrDownsample here");
	}

	printf("Generating the pyramids: %d ms\n", getTimeNow() - s);

	s = getTimeNow();
	cudaMemset(gpu_status, 1, sizeof(char)*w*h);

	// Do the actual tracking
	for (int l = LEVELS - 1; l >= 0; l--) {

		cudaMemcpy2DToArray(gpu_array_pyramid_prev, 0, 0, gpu_img_pyramid_prev[l],
			sizeof(float)*pyr_w[l], sizeof(float)*pyr_w[l], pyr_h[l], cudaMemcpyDeviceToDevice);

		cudaMemcpy2DToArray(gpu_array_pyramid_cur, 0, 0, gpu_img_pyramid_cur[l],
			sizeof(float)*pyr_w[l], sizeof(float)*pyr_w[l], pyr_h[l], cudaMemcpyDeviceToDevice);

		track << <blocks, threads >> > (w, h, pyr_w[l], pyr_w[l], scaling[l], l, (l == LEVELS - 1), gpu_dx, gpu_dy, gpu_status);

		cudaThreadSynchronize();
	}

	printf("Tracking: %d ms\n", getTimeNow() - s);

	// Copy back results 
	s = getTimeNow();
	cudaMemcpy(dx, gpu_dx, sizeof(float)*w*h, cudaMemcpyDeviceToHost);
	cudaMemcpy(dy, gpu_dy, sizeof(float)*w*h, cudaMemcpyDeviceToHost);
	cudaMemcpy(status, gpu_status, sizeof(char)*w*h, cudaMemcpyDeviceToHost);
	printf("Copying results from GPU to CPU: %d ms\n", getTimeNow() - s);

	printf("Total time for cudaLK: %d ms\n", getTimeNow() - start);
}

cuda編程（7）：實現LK稀疏光流算法--完整的cuda程序

一、原理

二、算法的假設（其實這樣的條件比較苛刻）

三、具體實現

cuda編程（7）：實現LK稀疏光流算法--完整的cuda程序

cuda編程（6）：實現knn算法

cuda編程可以使用的庫函數

cuda編程（5）：優化理論

cuda編程（4）：常見的例程

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結