CUDA Programming Structure
CUDA Device Memory Management
- Allocating device memory
cudaError_t cudaMalloc ( void** devPtr, size_t size )
- Transferring data
cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
// transfer directions (the kind argument):
cudaMemcpyHostToHost
cudaMemcpyHostToDevice
cudaMemcpyDeviceToHost
cudaMemcpyDeviceToDevice
Example:
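A minimal sketch of the allocate/copy/free pattern (the names h_A, d_A, and nElem are illustrative, and error checking is omitted here):
int nElem = 1024;
size_t nBytes = nElem * sizeof(float);
float *h_A = (float *)malloc(nBytes);                     // host buffer
float *d_A = NULL;
cudaMalloc((void **)&d_A, nBytes);                        // allocate device memory
cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);     // host -> device
// ... launch a kernel that works on d_A ...
cudaMemcpy(h_A, d_A, nBytes, cudaMemcpyDeviceToHost);     // device -> host
cudaFree(d_A);                                            // release device memory
free(h_A);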
- Return values
cudaSuccess
cudaErrorMemoryAllocation
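The CHECK macro used in the code examples below comes from the book's common.h helper header; a minimal sketch of such a wrapper (an assumption about its exact contents) is:
#define CHECK(call)                                                    \
{                                                                      \
    const cudaError_t error = (call);                                  \
    if (error != cudaSuccess)                                          \
    {                                                                  \
        printf("Error: %s:%d, code:%d, reason: %s\n",                  \
               __FILE__, __LINE__, error, cudaGetErrorString(error));  \
        exit(1);                                                       \
    }                                                                  \
}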
CUDA Memory Model
Threads
A kernel is launched from the host (CPU) side and executed on the GPU. Threads are organized into blocks, and blocks are organized into a grid; one kernel launch corresponds to one grid. A block can be one-, two-, or three-dimensional, and so can a grid.
block: threads within the same block share memory, so threads in the same block can communicate with one another.
block: block-local synchronization. Threads within the same block can synchronize with each other.
A thread can be uniquely identified by blockIdx and threadIdx.
blockIdx (block index within a grid)
threadIdx (thread index within a block)
blockIdx and threadIdx are built-in GPU variables of type uint3; each has three components: x, y, and z.
threadIdx.x、threadIdx.y惰赋、threadIdx.z
blockIdx.x宰掉、blockIdx.y呵哨、blockIdx.z
- blockDim (block dimension, measured in threads)
- gridDim (grid dimension, measured in blocks)
Note in particular: the built-in thread-index variables on the GPU have type uint3, while the block and grid dimensions configured in host code use type dim3. dim3 is used on the host side, and uint3 on the device side.
- Decide the block size.
- Calculate the grid dimension based on the application data size and the block size.
(That is: start from the data size, set the block size, then derive the grid size.)
Factors that determine the block dimensions:
- Performance characteristics of the kernel
- Limitations on GPU resources
Example: defining block and grid
#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>
/*
* Display the dimensionality of a thread block and grid from the host and
* device.
*/
__global__ void checkIndex(void)
{
    printf("threadIdx:(%d, %d, %d)\n", threadIdx.x, threadIdx.y, threadIdx.z);
    printf("blockIdx:(%d, %d, %d)\n", blockIdx.x, blockIdx.y, blockIdx.z);
    printf("blockDim:(%d, %d, %d)\n", blockDim.x, blockDim.y, blockDim.z);
    printf("gridDim:(%d, %d, %d)\n", gridDim.x, gridDim.y, gridDim.z);
}
int main(int argc, char **argv)
{
    // define total data element
    int nElem = 6;

    // define grid and block structure
    dim3 block(3);
    dim3 grid((nElem + block.x - 1) / block.x);

    // check grid and block dimension from host side
    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);

    // check grid and block dimension from device side
    checkIndex<<<grid, block>>>();

    // reset device before you leave
    CHECK(cudaDeviceReset());
    return(0);
}
Launching a CUDA Kernel
kernel_name <<<grid, block>>>(argument list);
// The grid and block settings determine the kernel's total number of threads and its thread hierarchy.
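For example, 32 threads in total can be laid out in different ways; both launches below run the same number of threads but with different hierarchies:
kernel_name<<<1, 32>>>(argument list);   // 1 block of 32 threads
kernel_name<<<32, 1>>>(argument list);   // 32 blocks of 1 thread each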
**Key points**
1. Data is stored linearly in global memory, and each thread can be uniquely identified by the built-in variables blockIdx and threadIdx.
2. Establish a mapping between threads and the data elements they process.
3. A kernel call is asynchronous: once the kernel is launched, control returns to the host immediately, before the kernel's threads have finished executing.
// force the host to wait until all device threads have finished executing
cudaError_t cudaDeviceSynchronize(void);
// Note that cudaMemcpy is synchronous: it waits until all previously launched kernel threads have finished before performing the copy.
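A typical host-side sequence (a sketch; CHECK and the buffer names d_C/gpuRef are placeholders from the examples in this note):
kernel_name<<<grid, block>>>(argument list);                     // returns immediately (asynchronous)
CHECK(cudaDeviceSynchronize());                                  // optional explicit wait
CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));  // implicit wait before copying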
Writing a Kernel Function
__global__ void kernel_name(argument list)
// 1. A kernel must have a void return type
// 2. It is marked with the __global__ qualifier
The following restrictions apply for all kernels:
- Access to device memory only
- Must have void return type
- No support for a variable number of arguments
- No support for static variables
- No support for function pointers
- Exhibit an asynchronous behavior
Example: adding two arrays
#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
/*
* This example demonstrates a simple vector sum on the GPU and on the host.
* sumArraysOnGPU splits the work of the vector sum across CUDA threads on the
* GPU. Only a single thread block is used in this small case, for simplicity.
* sumArraysOnHost sequentially iterates through vector elements on the host.
*/
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;

    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match.\n\n");

    return;
}
void initialData(float *ip, int size)
{
    // generate different seed for random number
    time_t t;
    srand((unsigned) time(&t));

    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }

    return;
}
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
        C[idx] = A[idx] + B[idx];
}

// Uses only threadIdx.x, which is valid here because the kernel is
// launched with a single block.
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int i = threadIdx.x;

    if (i < N) C[i] = A[i] + B[i];
}
int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // set up data size of vectors
    int nElem = 1 << 5;
    printf("Vector size %d\n", nElem);

    // malloc host memory
    size_t nBytes = nElem * sizeof(float);

    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A     = (float *)malloc(nBytes);
    h_B     = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef  = (float *)malloc(nBytes);

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);

    memset(hostRef, 0, nBytes);
    memset(gpuRef,  0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side
    dim3 block (nElem);
    dim3 grid  (1);

    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    printf("Execution configure <<<%d, %d>>>\n", grid.x, block.x);

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // add vector at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    CHECK(cudaDeviceReset());
    return(0);
}
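The kernel above indexes with threadIdx.x only, which works because a single block is launched. When the data spans multiple blocks, the global index must be built from both the block index and the thread index, as in this more general version: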
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N) C[i] = A[i] + B[i];
}
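A matching launch configuration (a sketch reusing the names from the program above, with an assumed block size of 32 threads) would then be:
dim3 block(32);
dim3 grid((nElem + block.x - 1) / block.x);
sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);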
Timing a kernel on Linux with a host timer
#include <sys/time.h>

double cpuSecond()
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
// time the kernel
double iStart = cpuSecond();
kernel_name<<<grid, block>>>(argument list);
cudaDeviceSynchronize();
double iElaps = cpuSecond() - iStart;
Timing with nvprof
$ nvprof [nvprof_args] <application> [application_args]
$ nvprof --help
$ nvprof ./sumArraysOnGPU-timer
// On my machine (Ubuntu 16.04 + CUDA 8.0), nvprof reports an error:
Error:unified memory profiling failed.
// I am not yet sure why this happens, but it can be worked around as follows:
$ nvprof --unified-memory-profiling off ./sumArraysOnGPU-timer
Thread Organization
Different block and grid configurations can have a large impact on kernel performance. The following uses matrix addition as an example.
① 2D grid and 2D blocks; each thread processes one element.
In general, the elements of a matrix are stored linearly in memory, in row-major order.
In the kernel, one thread can handle the addition at one element position. Three things must be considered first:
- the thread and block indices
- the coordinates of a given point in the matrix
- the offset in linear global memory
For a given thread, its offset in global memory is obtained from the block and thread indices by first mapping the thread and block indices to coordinates in the matrix, and then mapping those matrix coordinates to a global memory location.
- 1. Map the thread and block indices to coordinates in the matrix:
ix = threadIdx.x + blockIdx.x * blockDim.x
iy = threadIdx.y + blockIdx.y * blockDim.y
- 2. Compute the offset in linear global memory from the matrix coordinates (row-major):
idx = iy * nx + ix
Note: here x is the horizontal coordinate (column index) and y is the vertical coordinate (row index); be careful not to confuse them with matrix rows and columns.
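For example, with hypothetical sizes nx = 8 and ny = 6, the thread that maps to (ix, iy) = (2, 1) handles the element at offset idx = 1*8 + 2 = 10 in the linear array.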
- 3. Set up the block and grid:
dim3 block(32, 32);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC,
                                 int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;

    if (ix < nx && iy < ny)
        MatC[idx] = MatA[idx] + MatB[idx];
}
② 1D grid and 1D blocks; each thread processes multiple elements.
If both the grid and the block are one-dimensional, each dimension still has a maximum size limit, so when the data set is large a thread cannot be limited to a single element; instead, one thread can process multiple elements. For example, each thread can process ny elements (one element from every row of its column).
dim3 block(32, 1);
dim3 grid((nx + block.x - 1) / block.x, 1);
__global__ void sumMatrixOnGPU1D(float *MatA, float *MatB, float *MatC,
                                 int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;

    if (ix < nx)
    {
        for (int iy = 0; iy < ny; iy++)
        {
            int idx = iy * nx + ix;
            MatC[idx] = MatA[idx] + MatB[idx];
        }
    }
}
③ 2D grid and 1D blocks; each thread processes one element.
ix=threadIdx.x+blockIdx.x*blockDim.x;
iy=blockIdx.y;
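A matching launch configuration (a sketch following the same pattern as case ①, with an assumed block width of 32 and one grid row per matrix row) would be:
dim3 block(32);
dim3 grid((nx + block.x - 1) / block.x, ny);
sumMatrixOnGPUMix<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);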
__global__ void sumMatrixOnGPUMix(float *MatA, float *MatB, float *MatC,
                                  int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = blockIdx.y;
    unsigned int idx = iy * nx + ix;

    if (ix < nx && iy < ny)
        MatC[idx] = MatA[idx] + MatB[idx];
}
Querying GPU Device Information
- CUDA runtime API functions
cudaError_t cudaGetDeviceProperties(cudaDeviceProp *prop, int device);
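A minimal sketch of querying the properties of device 0 (only a few of the cudaDeviceProp fields are shown):
#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);                 // query device 0
    printf("Device name: %s\n", prop.name);
    printf("Total global memory: %zu bytes\n", prop.totalGlobalMem);
    printf("Multiprocessor count: %d\n", prop.multiProcessorCount);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    return 0;
}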
- nvidia-smi
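Typical queries from the shell:
$ nvidia-smi -L            # list all GPUs in the system
$ nvidia-smi -q -i 0       # detailed information for device 0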