一、Hello World
CPU和CPU内存被称为host memory,GPU和显存被称为device memory,下面我们来看英伟达官网给出的一段代码:
#include <iostream>
#include <algorithm>
using namespace std;
#define N 1024
#define RADIUS 3
#define BLOCK_SIZE 16
// parallel fn(并行函數(shù))
// 1D stencil: out[i] = sum of in[i-RADIUS .. i+RADIUS].
// Expects a <<<N/BLOCK_SIZE, BLOCK_SIZE>>> launch where `in`/`out` point
// RADIUS elements into buffers of N + 2*RADIUS ints (a halo on both ends),
// so the halo reads below stay in bounds (see the driver in main()).
__global__ void stencil_1d(int *in, int *out) {
// Block-local staging buffer: BLOCK_SIZE interior elements plus a
// RADIUS-wide halo on each side.
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
int gindex = threadIdx.x + blockIdx.x * blockDim.x; // global element index
int lindex = threadIdx.x + RADIUS; // index into temp, past the left halo
// Read input elements into shared memory
temp[lindex] = in[gindex];
// The first RADIUS threads also fetch the left and right halo elements.
if (threadIdx.x < RADIUS) {
temp[lindex - RADIUS] = in[gindex - RADIUS];
temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
}
// Synchronize (ensure all the data is staged before any thread reads temp)
__syncthreads();
// Apply the stencil
int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
result += temp[lindex + offset];
// Store the result
out[gindex] = result;
}
// Set every element of x[0..n-1] to 1; no-op when n <= 0.
void fill_ints(int *x, int n) {
for (int i = 0; i < n; ++i)
x[i] = 1;
}
// CPU代码(包括CPU数据拷贝到GPU,并行计算,GPU数据拷贝到CPU)
// Host driver: allocates host/device buffers with a RADIUS halo on each
// side, runs stencil_1d over the N interior elements, and copies the
// result back. Every CUDA call's status is now checked; on failure we
// return a non-zero exit code instead of silently continuing (the OS
// reclaims the allocations on the early-exit paths).
int main(void) {
int *in, *out; // host copies of the input/output buffers
int *d_in, *d_out; // device copies
int size = (N + 2*RADIUS) * sizeof(int);
// Alloc space for host copies and setup values
in = (int *)malloc(size);
out = (int *)malloc(size);
if (!in || !out) return 1; // host allocation failed
fill_ints(in, N + 2*RADIUS);
fill_ints(out, N + 2*RADIUS);
// Alloc space for device copies
if (cudaMalloc((void **)&d_in, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_out, size) != cudaSuccess) return 1;
// Copy to device
if (cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
if (cudaMemcpy(d_out, out, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
// Launch stencil_1d() on the GPU, offsetting the pointers past the left
// halo so the kernel's gindex-RADIUS / gindex+BLOCK_SIZE reads stay in bounds.
stencil_1d<<<N/BLOCK_SIZE,BLOCK_SIZE>>>(d_in + RADIUS, d_out + RADIUS);
// Kernel launches do not return a status; query it explicitly.
if (cudaGetLastError() != cudaSuccess) return 1;
// Copy result back to host (this cudaMemcpy synchronizes with the kernel)
if (cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost) != cudaSuccess) return 1;
// Cleanup
free(in); free(out);
cudaFree(d_in); cudaFree(d_out);
return 0;
}
下面三張是表示上面代碼的整體過程:
比較下C的hello world和cuda的hello world
// C
#include <stdio.h>
int main(void) {
/* Print the greeting; puts appends the trailing newline for us. */
puts("Hello World!");
return 0;
}
//Cuda
#include <cstdlib>
#include <cstdio>
#include <cuda.h>
// Empty placeholder kernel: demonstrates that device code is marked with
// __global__ and compiled by nvcc; it performs no work.
__global__ void mykernel(void) {
}
// Host entry point: launches the (empty) kernel on a single thread and
// prints from the host. cudaDeviceSynchronize() makes the host wait for
// the kernel to complete (and surfaces any launch/execution error)
// before the process exits.
int main(void) {
mykernel<<<1,1>>> ();
if (cudaDeviceSynchronize() != cudaSuccess) return 1;
printf("Hello World!\n");
return 0;
}
CUDA device代码中参数的使用方式和C类似,例如:
// Adds the two scalars *a and *b into *c (device pointers, one thread).
__global__ void add(int *a, int *b, int *c) {
c[0] = a[0] + b[0];
}
內(nèi)存管理
cuda中和c類似的malloc(),free(),memcpy()有cudaMalloc(),cudaFree(),cudaMemory()
// Host driver for the single-element add kernel: copies two ints to the
// device, launches one thread, and copies the sum back. All CUDA calls
// are now checked; a non-zero exit code reports failure (the OS reclaims
// device memory on the early-exit paths).
int main(void) {
int a, b, c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = sizeof(int);
// Allocate space for device copies of a, b, c
if (cudaMalloc((void **)&d_a, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_b, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_c, size) != cudaSuccess) return 1;
// Setup input values
a = 2;
b = 7;
// Copy inputs to device
if (cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
if (cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
// Launch add() kernel on GPU
add<<<1,1>>>(d_a, d_b, d_c);
// Kernel launches return no status; fetch any launch error explicitly.
if (cudaGetLastError() != cudaSuccess) return 1;
// Copy result back to host (synchronizes with the kernel)
if (cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost) != cudaSuccess) return 1;
// Cleanup
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
二、CUDA Block
GPU可以將add<<<1,1>>>() 修改為add<<<N, 1>>>()來(lái)并行計(jì)算代碼,對(duì)向量的加法的簡(jiǎn)單例子:
// One block per element: block i computes c[i] = a[i] + b[i].
__global__ void add(int *a, int *b, int *c) {
int i = blockIdx.x;
c[i] = a[i] + b[i];
}
//使用blockIdx.x可以使得不同block處理不同的index
block運(yùn)算如下圖.
向量加法的并行運(yùn)算示例代碼:
#define N 512
// Host driver for the one-element-per-block vector add: allocates N-int
// buffers on host and device, fills the inputs, launches N blocks of one
// thread each, and copies the sums back. CUDA and malloc calls are now
// checked; a non-zero exit code reports failure.
// NOTE(review): random_ints() is not declared in this snippet; it is
// presumably "fill n random ints" — confirm its prototype is in scope.
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int);
// Alloc space for device copies of a, b, c
if (cudaMalloc((void **)&d_a, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_b, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_c, size) != cudaSuccess) return 1;
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size);
b = (int *)malloc(size);
c = (int *)malloc(size);
if (!a || !b || !c) return 1; // host allocation failed
random_ints(a, N);
random_ints(b, N);
// Copy inputs to device
if (cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
if (cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
// Launch add() kernel on GPU with N blocks (one thread per block)
add<<<N,1>>>(d_a, d_b, d_c);
// Kernel launches return no status; query it explicitly.
if (cudaGetLastError() != cudaSuccess) return 1;
// Copy result back to host (synchronizes with the kernel)
if (cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost) != cudaSuccess) return 1;
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
三、CUDA Thread
類似的和上面的blockIdx.x類似的,有
// One thread per element within a single block: thread t sums index t.
__global__ void add(int *a, int *b, int *c) {
int t = threadIdx.x;
c[t] = a[t] + b[t];
}
類似的也有使用線程向量加法的代碼:
#define N 512
// Host driver for the one-thread-per-element vector add: identical to
// the block version except the kernel runs as a single block of N
// threads. CUDA and malloc calls are now checked; a non-zero exit code
// reports failure.
// NOTE(review): random_ints() is not declared in this snippet; it is
// presumably "fill n random ints" — confirm its prototype is in scope.
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int);
// Alloc space for device copies of a, b, c
if (cudaMalloc((void **)&d_a, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_b, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_c, size) != cudaSuccess) return 1;
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size);
b = (int *)malloc(size);
c = (int *)malloc(size);
if (!a || !b || !c) return 1; // host allocation failed
random_ints(a, N);
random_ints(b, N);
// Copy inputs to device
if (cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
if (cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
// Launch add() kernel on GPU with one block of N threads
add<<<1,N>>>(d_a, d_b, d_c);
// Kernel launches return no status; query it explicitly.
if (cudaGetLastError() != cudaSuccess) return 1;
// Copy result back to host (synchronizes with the kernel)
if (cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost) != cudaSuccess) return 1;
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
可以很明顯的看出,唯一的不同其實(shí)就是add<<<N,1>>>和add<<<1,N>>>的區(qū)別.
四、Blocks和Threads的結(jié)合
先來(lái)看一張圖
對(duì)于第M個(gè)blockIdx的threadInx可以到新的index:
int index = threadIdx.x + blockIdx.x * M
分別可以寫出它們的add函數(shù)和main函數(shù)
// Blocks-and-threads vector add: one thread per element, addressed by a
// flat global index across the whole grid.
__global__ void add(int *a, int *b, int *c) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
c[i] = a[i] + b[i];
}
#define N (2048*2048)
#define THREADS_PER_BLOCK 512
// Host driver combining blocks and threads: N elements processed by
// N/THREADS_PER_BLOCK blocks of THREADS_PER_BLOCK threads. CUDA and
// malloc calls are now checked, and the grid size uses ceil-division so
// the launch stays correct even if N stops being an exact multiple of
// THREADS_PER_BLOCK (here it divides evenly, so behavior is unchanged).
// NOTE(review): random_ints() is not declared in this snippet; it is
// presumably "fill n random ints" — confirm its prototype is in scope.
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int);
// Alloc space for device copies of a, b, c
if (cudaMalloc((void **)&d_a, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_b, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_c, size) != cudaSuccess) return 1;
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size);
b = (int *)malloc(size);
c = (int *)malloc(size);
if (!a || !b || !c) return 1; // host allocation failed
random_ints(a, N);
random_ints(b, N);
// Copy inputs to device
if (cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
if (cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
// Launch add() kernel on GPU; ceil-div so every element gets a thread
add<<<(N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK,THREADS_PER_BLOCK>>>(d_a, d_b, d_c);
// Kernel launches return no status; query it explicitly.
if (cudaGetLastError() != cudaSuccess) return 1;
// Copy result back to host (synchronizes with the kernel)
if (cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost) != cudaSuccess) return 1;
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
在通常情況下,數(shù)組長(zhǎng)度并不一定等于blocks* threads,上面的add函數(shù)和kernel啟動(dòng)代碼可以修改如下
// Bounds-checked element-wise add: safe when the grid launches more
// threads than there are elements (gridDim.x * blockDim.x >= n).
__global__ void add(int *a, int *b, int *c, int n) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= n)
return; // tail threads past the end of the array do nothing
c[index] = a[index] + b[index];
}
add<<<(N + M - 1) / M, M>>>(d_a, d_b, d_c);
五、Cooperation Threads(線程間的協(xié)作)
考慮這么個(gè)問題,指定半徑radius,將一個(gè)1維的stencil(數(shù)組環(huán))中的值變成另外一個(gè)一維數(shù)組;
如果半徑是3,一維數(shù)組的每個(gè)數(shù)都是stencil中數(shù)和相鄰距離小于或等于3的總共7個(gè)數(shù)的和,在轉(zhuǎn)變過程中,stencil中每個(gè)數(shù)都會(huì)被訪問7次.
CUDA中線程數(shù)據(jù):
- 在同一個(gè)block,threads通過shared memory來(lái)共享數(shù)據(jù);
- Extremely fast on-chip memory, user-managed(由用户自行管理的片上高速内存)
- 使用__shared__来声明shared memory
- shared memory中的数据对其他block的线程是不可见的,这部分内存需要单独管理.
// 1D stencil kernel (same as in section 1): out[i] = sum of
// in[i-RADIUS .. i+RADIUS], staged through shared memory so each input
// element is read from global memory only once per block.
__global__ void stencil_1d(int *in, int *out) {
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS]; // corresponds to the first bar in the figure below
int gindex = threadIdx.x + blockIdx.x * blockDim.x;
int lindex = threadIdx.x + RADIUS;
// Read input elements into shared memory
temp[lindex] = in[gindex]; // corresponds to the second bar in the figure below
if (threadIdx.x < RADIUS) {
temp[lindex - RADIUS] = in[gindex - RADIUS]; // corresponds to the third bar in the figure below
temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE]; // corresponds to the fourth bar in the figure below
}
// Synchronize (ensure all the data is available)
__syncthreads();
// Apply the stencil
int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
result += temp[lindex + offset];
// Store the result
out[gindex] = result;
}
上述操作過程中__syncthreds()適用于同步一個(gè)block的所有線程.
六脖隶、同步host和device
CUDA所有錯(cuò)誤都會(huì)返回cuda錯(cuò)誤碼(cudaError_t)
// 獲取cuda error string
const char *cudaGetErrorString(cudaError_t)
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
另外還有一些常用的方法:
cudaGetDeviceCount(int *count)
cudaSetDevice(int device)
cudaGetDevice(int *device)
cudaGetDeviceProperties(cudaDeviceProp *prop, int device)
七、冒泡排序的CUDA版本
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define N 400
void random_ints(int *);
// Odd-even transposition sort of the N ints in d_a, run by a single
// block of N threads (launch: myKernel<<<1, N>>>). The array is staged
// in shared memory, sorted in at most N alternating odd/even
// compare-exchange passes, then written back to global memory.
__global__ void myKernel(int *d_a)
{
__shared__ int s_a[N]; // shared-memory staging buffer for the block
int tid = threadIdx.x;
s_a[tid] = d_a[tid]; // each thread copies its own element into shared memory
__syncthreads(); // wait until the whole array is staged
for (int i = 1; i <= N; i++) { // at most N passes guarantee the array is sorted
if (i % 2 == 1 && (2 * tid + 1) < N) { // odd pass: compare-exchange pairs (0,1),(2,3),...
if (s_a[2 * tid] > s_a[2 * tid + 1]) {
int temp = s_a[2 * tid];
s_a[2 * tid] = s_a[2 * tid+1];
s_a[2 * tid + 1] = temp;
}
}
__syncthreads(); // barrier sits outside the branch, so every thread reaches it
if (i % 2 == 0 && (2 * tid + 2) < N ) { // even pass: compare-exchange pairs (1,2),(3,4),...
if (s_a[2 * tid+1] > s_a[2 * tid + 2]) {
int temp = s_a[2 * tid+1];
s_a[2 * tid+1] = s_a[2 * tid + 2];
s_a[2 * tid + 2] = temp;
}
}
__syncthreads(); // make the even pass visible before the next iteration
}
d_a[tid] = s_a[tid]; // write the sorted result back to global memory
}
// Sorts N random ints on the GPU with a single block of N threads
// running odd-even transposition sort, then prints the result. CUDA and
// malloc failures are now detected and reported on stderr, and the
// printed output line is newline-terminated.
int main()
{
int *a, *d_a; // host array and its device copy
int size = N * sizeof(int);
// Allocate and initialize the host array
a = (int *)malloc(size);
if (!a) {
fprintf(stderr, "host malloc failed\n");
return 1;
}
random_ints(a);
// Allocate the device array
cudaError_t err = cudaMalloc((void **)&d_a, size);
if (err != cudaSuccess) {
fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
return 1;
}
// Copy the unsorted data from Host to Device
err = cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "cudaMemcpy H2D: %s\n", cudaGetErrorString(err));
return 1;
}
// One block of N threads; thread i owns element i in shared memory
myKernel<<<1, N >>> (d_a);
// Kernel launches return no status; query it explicitly.
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(err));
return 1;
}
// Copy the sorted data back to the host (synchronizes with the kernel)
err = cudaMemcpy(a, d_a, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "cudaMemcpy D2H: %s\n", cudaGetErrorString(err));
return 1;
}
// Print the sorted result
for (int i = 0; i < N; i++) {
printf("%d ", a[i]);
}
printf("\n"); // terminate the output line
// Cleanup
free(a);
cudaFree(d_a);
return 0;
}
void random_ints(int *a)
{
if (!a) { //異常判斷
return;
}
for (int i = 0; i < N; i++) {
a[i] = rand() % N; //產(chǎn)生0-N之間的隨機(jī)整數(shù)
}