一、Hello World
CPU和CPU内存被称为host memory,GPU和显存被称为device memory,下面我们来看英伟达官网给出的一段代码:
#include <iostream>
#include <algorithm>
using namespace std;
#define N 1024
#define RADIUS 3
#define BLOCK_SIZE 16
// parallel fn(并行函數(shù))
// 1D stencil: out[i] = sum of in[i-RADIUS .. i+RADIUS].
// Expects a <<<N/BLOCK_SIZE, BLOCK_SIZE>>> launch where `in`/`out` point
// RADIUS elements into buffers of N + 2*RADIUS ints (a halo on both ends),
// so the halo reads below stay in bounds (see the driver in main()).
__global__ void stencil_1d(int *in, int *out) {
// Block-local staging buffer: BLOCK_SIZE interior elements plus a
// RADIUS-wide halo on each side.
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
int gindex = threadIdx.x + blockIdx.x * blockDim.x; // global element index
int lindex = threadIdx.x + RADIUS; // index into temp, past the left halo
// Read input elements into shared memory
temp[lindex] = in[gindex];
// The first RADIUS threads also fetch the left and right halo elements.
if (threadIdx.x < RADIUS) {
temp[lindex - RADIUS] = in[gindex - RADIUS];
temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
}
// Synchronize (ensure all the data is staged before any thread reads temp)
__syncthreads();
// Apply the stencil
int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
result += temp[lindex + offset];
// Store the result
out[gindex] = result;
}
// Set every element of x[0..n-1] to 1; no-op when n <= 0.
void fill_ints(int *x, int n) {
for (int i = 0; i < n; ++i)
x[i] = 1;
}
// CPU代码(包括CPU数据拷贝到GPU,并行计算,GPU数据拷贝到CPU)
// Host driver: allocates host/device buffers with a RADIUS halo on each
// side, runs stencil_1d over the N interior elements, and copies the
// result back. Every CUDA call's status is now checked; on failure we
// return a non-zero exit code instead of silently continuing (the OS
// reclaims the allocations on the early-exit paths).
int main(void) {
int *in, *out; // host copies of the input/output buffers
int *d_in, *d_out; // device copies
int size = (N + 2*RADIUS) * sizeof(int);
// Alloc space for host copies and setup values
in = (int *)malloc(size);
out = (int *)malloc(size);
if (!in || !out) return 1; // host allocation failed
fill_ints(in, N + 2*RADIUS);
fill_ints(out, N + 2*RADIUS);
// Alloc space for device copies
if (cudaMalloc((void **)&d_in, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_out, size) != cudaSuccess) return 1;
// Copy to device
if (cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
if (cudaMemcpy(d_out, out, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
// Launch stencil_1d() on the GPU, offsetting the pointers past the left
// halo so the kernel's gindex-RADIUS / gindex+BLOCK_SIZE reads stay in bounds.
stencil_1d<<<N/BLOCK_SIZE,BLOCK_SIZE>>>(d_in + RADIUS, d_out + RADIUS);
// Kernel launches do not return a status; query it explicitly.
if (cudaGetLastError() != cudaSuccess) return 1;
// Copy result back to host (this cudaMemcpy synchronizes with the kernel)
if (cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost) != cudaSuccess) return 1;
// Cleanup
free(in); free(out);
cudaFree(d_in); cudaFree(d_out);
return 0;
}
下面三張是表示上面代碼的整體過程:
比較下C的hello world和cuda的hello world
// C
#include <stdio.h>
int main(void) {
/* Print the greeting; puts appends the trailing newline for us. */
puts("Hello World!");
return 0;
}
//Cuda
#include <cstdlib>
#include <cstdio>
#include <cuda.h>
// Empty placeholder kernel: demonstrates that device code is marked with
// __global__ and compiled by nvcc; it performs no work.
__global__ void mykernel(void) {
}
// Host entry point: launches the (empty) kernel on a single thread and
// prints from the host. cudaDeviceSynchronize() makes the host wait for
// the kernel to complete (and surfaces any launch/execution error)
// before the process exits.
int main(void) {
mykernel<<<1,1>>> ();
if (cudaDeviceSynchronize() != cudaSuccess) return 1;
printf("Hello World!\n");
return 0;
}
CUDA device代码中参数的使用方式和C类似,例如:
// Adds the two scalars *a and *b into *c (device pointers, one thread).
__global__ void add(int *a, int *b, int *c) {
c[0] = a[0] + b[0];
}
內(nèi)存管理
cuda中和c類似的malloc(),free(),memcpy()有cudaMalloc(),cudaFree(),cudaMemory()
// Host driver for the single-element add kernel: copies two ints to the
// device, launches one thread, and copies the sum back. All CUDA calls
// are now checked; a non-zero exit code reports failure (the OS reclaims
// device memory on the early-exit paths).
int main(void) {
int a, b, c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = sizeof(int);
// Allocate space for device copies of a, b, c
if (cudaMalloc((void **)&d_a, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_b, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_c, size) != cudaSuccess) return 1;
// Setup input values
a = 2;
b = 7;
// Copy inputs to device
if (cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
if (cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
// Launch add() kernel on GPU
add<<<1,1>>>(d_a, d_b, d_c);
// Kernel launches return no status; fetch any launch error explicitly.
if (cudaGetLastError() != cudaSuccess) return 1;
// Copy result back to host (synchronizes with the kernel)
if (cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost) != cudaSuccess) return 1;
// Cleanup
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
二、CUDA Block
GPU可以將add<<<1,1>>>() 修改為add<<<N, 1>>>()來(lái)并行計(jì)算代碼,對(duì)向量的加法的簡(jiǎn)單例子:
// One block per element: block i computes c[i] = a[i] + b[i].
__global__ void add(int *a, int *b, int *c) {
int i = blockIdx.x;
c[i] = a[i] + b[i];
}
//使用blockIdx.x可以使得不同block處理不同的index
block運(yùn)算如下圖.
向量加法的并行運(yùn)算示例代碼:
#define N 512
// Host driver for the one-element-per-block vector add: allocates N-int
// buffers on host and device, fills the inputs, launches N blocks of one
// thread each, and copies the sums back. CUDA and malloc calls are now
// checked; a non-zero exit code reports failure.
// NOTE(review): random_ints() is not declared in this snippet; it is
// presumably "fill n random ints" — confirm its prototype is in scope.
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int);
// Alloc space for device copies of a, b, c
if (cudaMalloc((void **)&d_a, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_b, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_c, size) != cudaSuccess) return 1;
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size);
b = (int *)malloc(size);
c = (int *)malloc(size);
if (!a || !b || !c) return 1; // host allocation failed
random_ints(a, N);
random_ints(b, N);
// Copy inputs to device
if (cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
if (cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
// Launch add() kernel on GPU with N blocks (one thread per block)
add<<<N,1>>>(d_a, d_b, d_c);
// Kernel launches return no status; query it explicitly.
if (cudaGetLastError() != cudaSuccess) return 1;
// Copy result back to host (synchronizes with the kernel)
if (cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost) != cudaSuccess) return 1;
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
三、CUDA Thread
類似的和上面的blockIdx.x類似的,有
// One thread per element within a single block: thread t sums index t.
__global__ void add(int *a, int *b, int *c) {
int t = threadIdx.x;
c[t] = a[t] + b[t];
}
類似的也有使用線程向量加法的代碼:
#define N 512
// Host driver for the one-thread-per-element vector add: identical to
// the block version except the kernel runs as a single block of N
// threads. CUDA and malloc calls are now checked; a non-zero exit code
// reports failure.
// NOTE(review): random_ints() is not declared in this snippet; it is
// presumably "fill n random ints" — confirm its prototype is in scope.
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int);
// Alloc space for device copies of a, b, c
if (cudaMalloc((void **)&d_a, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_b, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_c, size) != cudaSuccess) return 1;
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size);
b = (int *)malloc(size);
c = (int *)malloc(size);
if (!a || !b || !c) return 1; // host allocation failed
random_ints(a, N);
random_ints(b, N);
// Copy inputs to device
if (cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
if (cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
// Launch add() kernel on GPU with one block of N threads
add<<<1,N>>>(d_a, d_b, d_c);
// Kernel launches return no status; query it explicitly.
if (cudaGetLastError() != cudaSuccess) return 1;
// Copy result back to host (synchronizes with the kernel)
if (cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost) != cudaSuccess) return 1;
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
可以很明顯的看出,唯一的不同其實(shí)就是add<<<N,1>>>和add<<<1,N>>>的區(qū)別.
四、Blocks和Threads的結(jié)合
先來(lái)看一張圖
對(duì)于第M個(gè)blockIdx的threadInx可以到新的index:
int index = threadIdx.x + blockIdx.x * M
分別可以寫出它們的add函數(shù)和main函數(shù)
// Blocks-and-threads vector add: one thread per element, addressed by a
// flat global index across the whole grid.
__global__ void add(int *a, int *b, int *c) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
c[i] = a[i] + b[i];
}
#define N (2048*2048)
#define THREADS_PER_BLOCK 512
// Host driver combining blocks and threads: N elements processed by
// N/THREADS_PER_BLOCK blocks of THREADS_PER_BLOCK threads. CUDA and
// malloc calls are now checked, and the grid size uses ceil-division so
// the launch stays correct even if N stops being an exact multiple of
// THREADS_PER_BLOCK (here it divides evenly, so behavior is unchanged).
// NOTE(review): random_ints() is not declared in this snippet; it is
// presumably "fill n random ints" — confirm its prototype is in scope.
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int);
// Alloc space for device copies of a, b, c
if (cudaMalloc((void **)&d_a, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_b, size) != cudaSuccess) return 1;
if (cudaMalloc((void **)&d_c, size) != cudaSuccess) return 1;
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size);
b = (int *)malloc(size);
c = (int *)malloc(size);
if (!a || !b || !c) return 1; // host allocation failed
random_ints(a, N);
random_ints(b, N);
// Copy inputs to device
if (cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
if (cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice) != cudaSuccess) return 1;
// Launch add() kernel on GPU; ceil-div so every element gets a thread
add<<<(N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK,THREADS_PER_BLOCK>>>(d_a, d_b, d_c);
// Kernel launches return no status; query it explicitly.
if (cudaGetLastError() != cudaSuccess) return 1;
// Copy result back to host (synchronizes with the kernel)
if (cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost) != cudaSuccess) return 1;
// Cleanup
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
在通常情況下,數(shù)組長(zhǎng)度并不一定等于blocks* threads,上面的add函數(shù)和kernel啟動(dòng)代碼可以修改如下
// Bounds-checked element-wise add: safe when the grid launches more
// threads than there are elements (gridDim.x * blockDim.x >= n).
__global__ void add(int *a, int *b, int *c, int n) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= n)
return; // tail threads past the end of the array do nothing
c[index] = a[index] + b[index];
}
add<<<(N + M - 1) / M, M>>>(d_a, d_b, d_c);
五、Cooperation Threads(線程間的協(xié)作)
考慮這么個(gè)問題,指定半徑radius,將一個(gè)1維的stencil(數(shù)組環(huán))中的值變成另外一個(gè)一維數(shù)組;
如果半徑是3,一維數(shù)組的每個(gè)數(shù)都是stencil中數(shù)和相鄰距離小于或等于3的總共7個(gè)數(shù)的和,在轉(zhuǎn)變過程中,stencil中每個(gè)數(shù)都會(huì)被訪問7次.
CUDA中線程數(shù)據(jù):
- 在同一個(gè)block,threads通過shared memory來(lái)共享數(shù)據(jù);
- Extremely fast on-chip memory, user-managed(由用户自行管理的片上高速内存)
- 使用__shared__来声明shared memory
- shared memory中的数据对其他block的线程是不可见的,这部分内存需要单独管理.
// 1D stencil kernel (same as in section 1): out[i] = sum of
// in[i-RADIUS .. i+RADIUS], staged through shared memory so each input
// element is read from global memory only once per block.
__global__ void stencil_1d(int *in, int *out) {
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS]; // corresponds to the first bar in the figure below
int gindex = threadIdx.x + blockIdx.x * blockDim.x;
int lindex = threadIdx.x + RADIUS;
// Read input elements into shared memory
temp[lindex] = in[gindex]; // corresponds to the second bar in the figure below
if (threadIdx.x < RADIUS) {
temp[lindex - RADIUS] = in[gindex - RADIUS]; // corresponds to the third bar in the figure below
temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE]; // corresponds to the fourth bar in the figure below
}
// Synchronize (ensure all the data is available)
__syncthreads();
// Apply the stencil
int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
result += temp[lindex + offset];
// Store the result
out[gindex] = result;
}
上述操作過程中__syncthreds()適用于同步一個(gè)block的所有線程.
六脖隶、同步host和device
CUDA所有錯(cuò)誤都會(huì)返回cuda錯(cuò)誤碼(cudaError_t)
// 獲取cuda error string
const char *cudaGetErrorString(cudaError_t)
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
另外還有一些常用的方法:
cudaGetDeviceCount(int *count)
cudaSetDevice(int device)
cudaGetDevice(int *device)
cudaGetDeviceProperties(cudaDeviceProp *prop, int device)
七、冒泡排序的CUDA版本
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#define N 400
void random_ints(int *);
// Odd-even transposition sort of the N ints in d_a, run by a single
// block of N threads (launch: myKernel<<<1, N>>>). The array is staged
// in shared memory, sorted in at most N alternating odd/even
// compare-exchange passes, then written back to global memory.
__global__ void myKernel(int *d_a)
{
__shared__ int s_a[N]; // shared-memory staging buffer for the block
int tid = threadIdx.x;
s_a[tid] = d_a[tid]; // each thread copies its own element into shared memory
__syncthreads(); // wait until the whole array is staged
for (int i = 1; i <= N; i++) { // at most N passes guarantee the array is sorted
if (i % 2 == 1 && (2 * tid + 1) < N) { // odd pass: compare-exchange pairs (0,1),(2,3),...
if (s_a[2 * tid] > s_a[2 * tid + 1]) {
int temp = s_a[2 * tid];
s_a[2 * tid] = s_a[2 * tid+1];
s_a[2 * tid + 1] = temp;
}
}
__syncthreads(); // barrier sits outside the branch, so every thread reaches it
if (i % 2 == 0 && (2 * tid + 2) < N ) { // even pass: compare-exchange pairs (1,2),(3,4),...
if (s_a[2 * tid+1] > s_a[2 * tid + 2]) {
int temp = s_a[2 * tid+1];
s_a[2 * tid+1] = s_a[2 * tid + 2];
s_a[2 * tid + 2] = temp;
}
}
__syncthreads(); // make the even pass visible before the next iteration
}
d_a[tid] = s_a[tid]; // write the sorted result back to global memory
}
// Sorts N random ints on the GPU with a single block of N threads
// running odd-even transposition sort, then prints the result. CUDA and
// malloc failures are now detected and reported on stderr, and the
// printed output line is newline-terminated.
int main()
{
int *a, *d_a; // host array and its device copy
int size = N * sizeof(int);
// Allocate and initialize the host array
a = (int *)malloc(size);
if (!a) {
fprintf(stderr, "host malloc failed\n");
return 1;
}
random_ints(a);
// Allocate the device array
cudaError_t err = cudaMalloc((void **)&d_a, size);
if (err != cudaSuccess) {
fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
return 1;
}
// Copy the unsorted data from Host to Device
err = cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "cudaMemcpy H2D: %s\n", cudaGetErrorString(err));
return 1;
}
// One block of N threads; thread i owns element i in shared memory
myKernel<<<1, N >>> (d_a);
// Kernel launches return no status; query it explicitly.
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(err));
return 1;
}
// Copy the sorted data back to the host (synchronizes with the kernel)
err = cudaMemcpy(a, d_a, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "cudaMemcpy D2H: %s\n", cudaGetErrorString(err));
return 1;
}
// Print the sorted result
for (int i = 0; i < N; i++) {
printf("%d ", a[i]);
}
printf("\n"); // terminate the output line
// Cleanup
free(a);
cudaFree(d_a);
return 0;
}
void random_ints(int *a)
{
if (!a) { //異常判斷
return;
}
for (int i = 0; i < N; i++) {
a[i] = rand() % N; //產(chǎn)生0-N之間的隨機(jī)整數(shù)
}