Calling a pybind11-wrapped CUDA C++ shared library from Python
pybind11 is a lightweight, header-only library that lets C++ and Python code call each other; with it, C++ code can be compiled into a shared library that Python imports directly.
pybind11 automatically converts C++ containers such as vector and list to and from Python lists, and it can also expose C++ multidimensional arrays to Python as numpy.ndarray objects.
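As a minimal sketch of that container conversion (the module name stl_demo and the function scale are illustrative, not part of this project), including pybind11/stl.h is what switches on the automatic vector/list conversion:

#include <pybind11/pybind11.h>
#include <pybind11/stl.h> // enables automatic std::vector <-> Python list conversion
#include <vector>
namespace py = pybind11;

// Python passes a list of floats in and gets a list back;
// pybind11 performs both conversions behind the scenes.
std::vector<float> scale(const std::vector<float> &v, float factor)
{
    std::vector<float> out;
    out.reserve(v.size());
    for (float x : v)
        out.push_back(x * factor);
    return out;
}

PYBIND11_MODULE(stl_demo, m)
{
    m.def("scale", &scale, "Scale a list of floats");
}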
How a numpy.ndarray appears on the C++ side of pybind11:
py::array_t<float> np_multiply_Cublas(py::array_t<float> &inLeft, py::array_t<float> &inRight)
{
//a numpy.ndarray passed in from Python arrives here as a py::array_t<T>
py::buffer_info bufLeft = inLeft.request(), bufRight = inRight.request();
//request() returns the py::buffer_info bound to the py::array_t<T>: ndim, data pointer, size, shape, and so on
assert(bufLeft.ndim == bufRight.ndim);
assert(bufLeft.ndim == 2);
assert(bufLeft.shape[1] == bufRight.shape[0]);
const int M = bufLeft.shape[0], K = bufLeft.shape[1], N = bufRight.shape[1];
// implements the matrix product C = A * B
//M, K, N are the rows of A, the columns of A (= rows of B), and the columns of B; C has shape {M, N}
auto result = py::array_t<float>(M * N);
result.resize({M, N});
py::buffer_info bufResult = result.request();
float *ptrLeft = (float *)bufLeft.ptr,
*ptrRight = (float *)bufRight.ptr,
*ptrResult = (float *)bufResult.ptr; //obtain the raw data pointers
Gpu_mul(ptrLeft, ptrRight, ptrResult, M, K, N, 0);//pass the pointers to the CUDA C++ multiplication; flag 0 selects the cuBLAS path (see Gpu_mul below), and the result is written into ptrResult
return result;
//result is also a py::array_t<T>, i.e. a numpy.ndarray on the Python side
}
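One caveat about the assert checks above: assert compiles away in release builds. A sketch of a safer alternative (the helper name check_matmul_shapes is made up for illustration) relies on pybind11 translating std::runtime_error into a Python RuntimeError:

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <stdexcept>
namespace py = pybind11;

// Throwing here surfaces as a Python RuntimeError instead of
// aborting the interpreter, and the check survives release builds.
static void check_matmul_shapes(const py::buffer_info &a, const py::buffer_info &b)
{
    if (a.ndim != 2 || b.ndim != 2)
        throw std::runtime_error("inputs must be 2-D arrays");
    if (a.shape[1] != b.shape[0])
        throw std::runtime_error("inner dimensions do not match");
}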
Below is a simple wrapper exposing the interface to Python:
PYBIND11_MODULE(mylib, m)
{
// m.doc() = "use cuda and demo"; //optional module docstring
//mylib is the name of the generated Python module
m.def("np_sum", &np_sum, "Add two Numpy arrays use cuda");
m.def("Gpu_mul", &np_multiply, "Multuply tow arrays use cuda flag==1 use shared memory,flag==2 use global memory");
m.def("Gpu_Cublas", &np_multiply_Cublas, "Multuply tow arrays use cublas");
//"Gpu_Cublas"代表python中對應的函數(shù),&np_multiply_Cublas是對應的C++函數(shù)指針压彭,之后的字符串是python中的函數(shù)doc
}
Below is an example of calling the module from Python:
import os
os.sys.path.append("../build/bin")
os.sys.path.append("build/bin")
os.sys.path.append("build/bin/Release")#這三個路徑是 mylib.cp37-win_amd64.pyd 動態(tài)庫可能存在的路徑
import mylib as my # load the mylib module
import time
import numpy as np
Ga = np.random.rand(50* 100, 31 * 100).astype(np.float32)
Gb = np.random.rand(31 * 100, 50 * 100).astype(np.float32)
# generate random float32 numpy.ndarray inputs
Gstart = time.time()
Ghc = Ga @ Gb # numpy's built-in matrix multiplication
Gpause = time.time()
Gc = my.Gpu_Cublas(Ga, Gb) # call the cuBLAS-based matrix multiplication on the C++ side
Gend = time.time()
print("the numpy cost", (Gpaush - Gstart)) # 比較兩者耗時
print("the cublas cost time ", (Gend - Gpaush))
print("the two result are same?", (Ghc == Gc).any()) # 比較兩個答案是否相等
Below is the C++ code implementing matrix multiplication with cuBLAS:
void useCublas(float *_A, float *_B, float *_C, int M, int K, int N)
{
cublasHandle_t handle;
cublasCreate(&handle);
float alpha = 1, beta = 0;
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, _B, N, _A, K, &beta, _C, N);//C = alpha*A*B + beta*C
//cuBLAS matrices are column-major while C++ arrays are row-major, so _B is passed first and _A second: C^T = B^T * A^T
}
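Why the operand swap works: handing a row-major array to column-major cuBLAS implicitly transposes it, and

\[ C = A B \iff C^{T} = B^{T} A^{T}, \]

so the call above asks cuBLAS for B^T * A^T (with leading dimensions N, K, and N for the reinterpreted _B, _A, and _C), which leaves C laid out in memory exactly in the row-major order Python expects.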
void Gpu_mul(float *ptrLeft, float *ptrRight, float *ptrResult, int M, int K, int N, int flag)
{
float *d_a, *d_b, *d_c;
cudaMalloc((void **)&d_a, M * K * sizeof(float));
cudaMalloc((void **)&d_b, K * N * sizeof(float));
cudaMalloc((void **)&d_c, M * N * sizeof(float));
CHECK(cudaMemcpy(d_a, ptrLeft, M * K * sizeof(float), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(d_b, ptrRight, K * N * sizeof(float), cudaMemcpyHostToDevice));
constexpr const int TP = 16;
dim3 threadsPer(TP, TP);
dim3 blocksPer((N + TP - 1) / TP, (M + TP - 1) / TP);//grid x spans the N columns of C, grid y its M rows
if (flag == 1)
{
matrix_MulG<TP><<<blocksPer, threadsPer>>>(d_a, d_b, d_c, M, K, N);
}
else if (flag == 0)
{
useCublas(d_a, d_b, d_c, M, K, N); //flag==0 selects the cuBLAS multiplication; otherwise a hand-written CUDA kernel does the work
}
else if (flag == 2)
{
matrix_glbal_mul<<<blocksPer, threadsPer>>>(d_a, d_b, d_c, M, K, N);
}
CHECK(cudaMemcpy(ptrResult, d_c, M * N * sizeof(float), cudaMemcpyDeviceToHost));
CHECK(cudaFree(d_a));
CHECK(cudaFree(d_b));
CHECK(cudaFree(d_c));
}
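The CHECK macro used above is not shown in the original post; a typical definition (an assumption, modeled on the usual CUDA error-checking helper) looks like this:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with file/line context if a CUDA runtime call fails.
#define CHECK(call)                                               \
    do                                                            \
    {                                                             \
        const cudaError_t err = (call);                           \
        if (err != cudaSuccess)                                   \
        {                                                         \
            fprintf(stderr, "CUDA error %s at %s:%d\n",           \
                    cudaGetErrorString(err), __FILE__, __LINE__); \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)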
The CUDA-kernel version of the matrix multiplication. The most direct variant reads everything from global memory:
__global__ void matrix_glbal_mul(float*_A ,float* _B,float* _C, int M,int K,int N)
{
int x = threadIdx.x + blockIdx.x * blockDim.x; //column index into _C
int y = threadIdx.y + blockIdx.y * blockDim.y; //row index into _C
if(y>=M || x>=N)
return;//guard against out-of-bounds threads
float sum = 0.;
for (int i = 0;i<K; ++i)
{
sum += _A[y * K + i] * _B[i * N + x];//accumulate A[y][i]*B[i][x] (B is K x N, so its row stride is N)
}
_C[y * N + x] = sum;
}
//In this version every element of A and B is read from global memory many times, and global-memory bandwidth is far lower than each block's shared memory.
By staging A and B in shared memory, each element of global memory has to be loaded only once per block:
template <int TILE_WIDTH>
__global__ void matrix_MulG(float *_A, float *_B, float *_C, int M, int K, int N)
{
__shared__ float subM[TILE_WIDTH][TILE_WIDTH];
__shared__ float subN[TILE_WIDTH][TILE_WIDTH];//all threads of a block share the same shared memory
int x = threadIdx.x + blockIdx.x * blockDim.x; //x is the column of the output element of C
int y = threadIdx.y + blockIdx.y * blockDim.y; //y is its row
int tx = threadIdx.x;
int ty = threadIdx.y;//the thread's own index within its block
float tmp = 0.;
for (int i = 0; i < ((K + TILE_WIDTH - 1) / TILE_WIDTH); ++i)
{
if ((tx + i * TILE_WIDTH) < K && (y < M))
subM[ty][tx] = _A[y * K + tx + i * TILE_WIDTH]; //load one tile of A into shared memory
else
{
subM[ty][tx] = 0.;
}
if ((ty + i * TILE_WIDTH) < K && (x < N))
subN[ty][tx] = _B[(ty + i * TILE_WIDTH) * N + x];//load the matching tile of B
else
{
subN[ty][tx] = 0.;
}
__syncthreads();//synchronize the block so both tiles are fully loaded
for (int j = 0; j < TILE_WIDTH; ++j)
{
tmp += subM[ty][j] * subN[j][tx];
}
__syncthreads();//wait before the next iteration overwrites the tiles
}
if (y < M && x < N) //bounds check
_C[y * N + x] = tmp;
}
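A rough count of the payoff (the standard analysis of tiled matrix multiplication, not a measurement from this post): without tiles, every thread issues K loads from A and K from B, about $2MNK$ global-memory reads in total; with tiling, each block of $T^2$ threads ($T$ = TILE_WIDTH) performs $2KT$ loads, so the total drops to

\[ \frac{MN}{T^{2}} \cdot 2KT = \frac{2MNK}{T}, \]

a 16x reduction in global traffic for TILE_WIDTH = 16.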
Output of the final Python test; the timings may differ from machine to machine:
the numpy cost 4.054656028747559
the cublas cost time 2.555680513381958
the two result are same? True
the Gpu with shared memory cost time 4.681894779205322
the two result are same? True
the Gpu with global memory cost time 4.836901903152466
the two result are same? True
In fact, if all you need is to call CUDA kernels from Python code, the numba.cuda module of the numba acceleration library is another option.