本文通過使用malloc分配內存和cudaHostAlloc分配頁鎖定內存,說明使用頁鎖定內存可提高運算效率,並指出哪些場合適合使用頁鎖定內存。
malloc分配的是標準的可分頁(pageable)主機內存,操作系統在對內存進行調度的時候可能會將這種內存分頁或者交換到磁盤上,需要的時候再調回內存,這樣就會增加運算時間。此外,GPU通過DMA傳輸數據時需要固定的物理地址,因此拷貝可分頁內存時,驅動要先把數據複製到一塊臨時的頁鎖定緩衝區,再傳輸到設備,多了一次拷貝。而cudaHostAlloc分配的是頁鎖定的(page-locked)主機內存,操作系統不會對這塊內存分頁和交換到磁盤上,確保該內存始終駐留在物理內存中。
下面通過100M個int(約400MB)數據在主機和設備之間的傳輸說明二者的差異。貼上代碼:
/********************************************************************
* PageLockedMem.cu
* Compare the performance of general mem and page locked mem.
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
/* Number of int elements in each transfer buffer (100 * 1024 * 1024).
 * The expansion is parenthesized so the macro behaves as a single value
 * inside larger expressions such as x / _SIZE. */
#define _SIZE (100*1024*1024)
/************************************************************************/
/* Init CUDA */
/************************************************************************/
// NOTE(review): the body of InitCUDA is elided ("......") in this listing.
// main() aborts when it returns false; presumably it selects and initializes
// a CUDA device -- TODO confirm against the full source.
bool InitCUDA(void)
{
......
}
/*
 * Times 10 blocking cudaMemcpy transfers between a pageable host buffer
 * (allocated with malloc) and a device buffer (allocated with cudaMalloc).
 *
 * size: number of int elements in each buffer.
 * dir:  true  -> copy host to device,
 *       false -> copy device to host.
 *
 * Returns the time in milliseconds between the start and stop CUDA events
 * recorded around the 10 copies. Terminates the process if the host
 * allocation fails. CUDA API calls are wrapped in cutilSafeCall.
 */
float cudaMallocTest(int size, bool dir)
{
    cudaEvent_t start, stop;
    float elapsedTime;
    int *a, *dev_a;
    const size_t bytes = (size_t)size * sizeof(int);

    a = (int*)malloc(bytes);
    if (!a)
    {
        // Continuing would hand a NULL host pointer to cudaMemcpy, so stop here.
        printf("Mem error!\n");
        exit(EXIT_FAILURE);
    }
    cutilSafeCall(cudaMalloc((void**)&dev_a, bytes));

    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    cutilSafeCall(cudaEventRecord(start, 0));
    for (int i = 0; i < 10; i++)
    {
        if (dir)
        {
            cutilSafeCall(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
        }
        else
        {
            cutilSafeCall(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));
        }
    }
    cutilSafeCall(cudaEventRecord(stop, 0));
    // Wait for the stop event before reading the elapsed time; check the
    // result like every other CUDA call in this function.
    cutilSafeCall(cudaEventSynchronize(stop));
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime, start, stop));

    free(a);
    cutilSafeCall(cudaFree(dev_a));
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    return elapsedTime;
}
/*
 * Times 10 blocking cudaMemcpy transfers between a page-locked host buffer
 * (allocated with cudaHostAlloc, cudaHostAllocDefault) and a device buffer
 * (allocated with cudaMalloc).
 *
 * size: number of int elements in each buffer.
 * dir:  true  -> copy host to device,
 *       false -> copy device to host.
 *
 * Returns the time in milliseconds between the start and stop CUDA events
 * recorded around the 10 copies. CUDA API calls are wrapped in cutilSafeCall.
 */
float cudaHostAllocTest(int size, bool dir)
{
    cudaEvent_t start, stop;
    float elapsedTime;
    int *a, *dev_a;
    const size_t bytes = (size_t)size * sizeof(int);

    cutilSafeCall(cudaHostAlloc((void**)&a, bytes, cudaHostAllocDefault));
    cutilSafeCall(cudaMalloc((void**)&dev_a, bytes));

    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    cutilSafeCall(cudaEventRecord(start, 0));
    for (int i = 0; i < 10; i++)
    {
        if (dir)
        {
            cutilSafeCall(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
        }
        else
        {
            cutilSafeCall(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));
        }
    }
    cutilSafeCall(cudaEventRecord(stop, 0));
    // Wait for the stop event before reading the elapsed time; check the
    // result like every other CUDA call in this function.
    cutilSafeCall(cudaEventSynchronize(stop));
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime, start, stop));

    // Page-locked host memory must be released with cudaFreeHost, not free().
    cutilSafeCall(cudaFreeHost(a));
    cutilSafeCall(cudaFree(dev_a));
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    return elapsedTime;
}
/*
 * Runs the four transfer benchmarks -- pageable (malloc) vs. page-locked
 * (cudaHostAlloc) host memory, each in the host-to-device ("copy up") and
 * device-to-host ("copy down") direction -- and prints the elapsed time and
 * the resulting throughput for each.
 */
int main(int argc, char* argv[])
{
    if(!InitCUDA()) {
        return 0;
    }

    // Each test function performs 10 copies of _SIZE ints; keep this value in
    // sync with the loop count in cudaMallocTest / cudaHostAllocTest.
    const int copies = 10;
    // Total megabytes moved per test. _SIZE is wrapped in parentheses here so
    // the result does not depend on how the macro body is written.
    const float MB = (float)copies*(_SIZE)*sizeof(int)/1024/1024;
    float elapsedTime;

    // Pageable host memory, host -> device.
    elapsedTime = cudaMallocTest(_SIZE, true);
    printf("Time using cudaMalloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy up: %3.1f \n", MB/(elapsedTime/1000));

    // Pageable host memory, device -> host (dir must be false for "copy down").
    elapsedTime = cudaMallocTest(_SIZE, false);
    printf("Time using cudaMalloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy down: %3.1f \n", MB/(elapsedTime/1000));

    // Page-locked host memory, host -> device.
    elapsedTime = cudaHostAllocTest(_SIZE, true);
    printf("Time using cudaHostAlloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy up: %3.1f \n", MB/(elapsedTime/1000));

    // Page-locked host memory, device -> host (dir must be false for "copy down").
    elapsedTime = cudaHostAllocTest(_SIZE, false);
    printf("Time using cudaHostAlloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy down: %3.1f \n", MB/(elapsedTime/1000));

    return 0;
}
可以看出,使用頁鎖定內存後傳輸耗時縮短到原來的約一半(即傳輸速率約爲原來的2倍)。
但是並不是所有的場合都適合用頁鎖定內存,因爲使用頁鎖定內存(又稱固定內存,pinned memory)時,將失去虛擬內存的所有功能,即需要爲每塊頁鎖定內存分配物理內存,系統將比使用普通內存時更快耗盡內存。所以要根據需要進行選擇。