1、并行算法介紹
Reduce:
對于一般可以加法運算绒净,如進(jìn)行8個數(shù)相加:
串行計算
由于對于符合交換律见咒,其運算順序可以改變,可以使用并行計算疯溺,如下:
并行計算
對比串行運算论颅,與并行運算的速度:
運算量 | 8 | ... | N |
---|---|---|---|
并行Step | 7 | ... | N-1 |
串行Step | 3 | ... | log2(N) |
對于并行計算的實現(xiàn),可參照udacity的課程GitHub:reduce.cu
Scan
Scan 可稱為prefix sum囱嫩,即求前N項(或N-1項)的和恃疯,如下:
prefix sum
串行計算流程如上求和無太大差異,而并行計算起來相當(dāng)講技巧墨闲,以N=8舉例:
黑\紅實線:加法 黑虛線:賦值
詳細(xì)可閱讀:Parallel prefix sum (scan) with CUDA
2今妄、圖像的直方圖均衡
假設(shè)存在8個灰度級的圖像如下圖,那么它實際可見的范圍為1~4
1 | 2 | 3 | 4 |
---|
1 | 2 | 3 | 4 |
---|
1 | 2 | 3 | 4 |
---|
1 | 2 | 3 | 4 |
---|
頻數(shù)圖如下:
image.png
histogram_equalize.png
即原來的灰度級為1映射后為0鸳碧,2映射后為2盾鳞,3映射后為3,4映射后為5
均衡后新灰度頻數(shù)表如下:
灰度級 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
---|---|---|---|---|---|---|---|---|
頻數(shù) | 4 | 0 | 4 | 4 | 0 | 4 | 0 | 0 |
image.png
映射結(jié)果:
0 | 2 | 3 | 5 |
---|
0 | 2 | 3 | 5 |
---|
0 | 2 | 3 | 5 |
---|
0 | 2 | 3 | 5 |
---|
總的來說沒有整個過程沒有改變像素點灰度值點之間大小瞻离、數(shù)量關(guān)系腾仅,或者說將可視區(qū)間拉伸,讓色彩對比度更加明顯套利。
3推励、并行圖像直方圖均衡化
1)統(tǒng)計每個灰度級像素的數(shù)目
__global__ void histogram_init(char *src,unsigned int *out)
//針對512*512大小灰度圖 開啟512*512線程鹤耍,計算256個灰度級相應(yīng)頻數(shù)
2)應(yīng)用Parallel Scan(prefix sum) 計算出映射關(guān)系
__global__ void scan_downsweep(unsigned int *in,unsigned int *out,int n)
//輸入256點頻數(shù)表,輸出均衡化后256對應(yīng)得映射值
3)將原圖按照映射關(guān)系映射到結(jié)果圖
__global__ void equal_remap(char *img,char *out,unsigned int *idetity)
//根據(jù)映射表验辞,重新映射圖像
程序效果圖如下:
histogram_result
源代碼:
#include "cuda.h"
#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_functions.h"
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/opencv.hpp"
#include "stdio.h"
#include <vector>
using namespace std;
using namespace cv;
#define IMAGE_DIR "/home/dzqiu/Documents/image/chaozhou.JPG"
__global__ void histogram_init(char *src,unsigned int *out)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockDim.y;
int offset = x + y * gridDim.x * blockDim.x;
unsigned char value = src[offset];
//原子操作稿黄,否則會出錯。
atomicAdd(&out[value],1);
}
//refer: Parallel Prefix Sum (Scan) with CUDA, Mark Harris, April 2007
//link : http: //developer.download.nvidia.com/compute/cuda/2_2/sdk/website/projects/scan/doc/scan.pdf
__global__ void scan_downsweep(unsigned int *in,unsigned int *out,int n)
{
int Idx = threadIdx.x ;
extern __shared__ float sdata[];
sdata[Idx] = in[Idx];
__syncthreads();
unsigned int offset=1;
for(unsigned int i=n>>1;i>0;i>>=1) //build sum iin place up the tree
{
__syncthreads();
if(Idx<i)
{
int ai = offset*(2*Idx+1)-1;
int bi = offset*(2*Idx+2)-1;
sdata[bi] += sdata[ai];
}
offset *= 2 ;
}
if(Idx==0) sdata[n-1]=0; //travers down tree &build scan
for(unsigned int i=1;i<n;i<<=1)
{
offset >>= 1;
__syncthreads();
if(Idx<i)
{
int ai = offset*(2*Idx+1)-1;
int bi = offset*(2*Idx+2)-1;
int tmp =sdata[ai];
sdata[ai] = sdata[bi];
sdata[bi] += tmp;
}
}
__syncthreads();
//out[Idx]=sdata[Idx];
out[Idx]=((float)sdata[Idx]/512/512*256)-1;
}
__global__ void equal_remap(char *img,char *out,unsigned int *idetity)
{
int x = threadIdx.x+blockIdx.x*blockDim.x;
int y = threadIdx.y+blockIdx.y*blockDim.y;
int offset = x+y*blockDim.x*gridDim.x;
unsigned char value = img[offset];
out[offset] = (unsigned char) idetity[value];
}
#define DIM 512
#define GRAY_IDENTITY 256
int main(int argc,char** argv)
{
Mat img_src = imread(IMAGE_DIR);
Mat img_gray;
cvtColor(img_src,img_gray,CV_RGB2GRAY);
resize(img_gray,img_gray,Size(DIM,DIM));
unsigned int *dev_histogram;
cudaMalloc((void**)&dev_histogram,GRAY_IDENTITY*sizeof(int));
cudaMemset(dev_histogram,0,GRAY_IDENTITY*sizeof(int));
/*build the histogram of the image*/
char *dev_gray;
cudaMalloc((void**)&dev_gray,DIM*DIM);
cudaMemcpy(dev_gray,img_gray.data,DIM*DIM,cudaMemcpyHostToDevice);
dim3 blocks(DIM/16,DIM/16);dim3 threads(16,16);
histogram_init<<<blocks,threads>>>(dev_gray,dev_histogram);
int host_histogram[GRAY_IDENTITY]={0};
cudaMemcpy(host_histogram,dev_histogram,GRAY_IDENTITY*sizeof(int),cudaMemcpyDeviceToHost);
/*equalize the histogram*/
unsigned int *dev_histogram_equal;
cudaMalloc((void**)&dev_histogram_equal,GRAY_IDENTITY*sizeof(int));
blocks=dim3(1,1);threads=dim3(GRAY_IDENTITY,1);
scan_downsweep<<<blocks,threads,sizeof(int)*GRAY_IDENTITY>>>(dev_histogram,dev_histogram_equal,GRAY_IDENTITY);
int host_histogram_equal[GRAY_IDENTITY]={0};
cudaMemcpy(host_histogram_equal,dev_histogram_equal,GRAY_IDENTITY*sizeof(int),cudaMemcpyDeviceToHost);
for(int i=0;i<GRAY_IDENTITY/4;i++)
{
//if(host_histogram[i]!=0)
printf("bin[%3d]:%6d -> %6d| bin[%3d]:%6d -> %6d| bin[%3d]:%6d -> %6d| bin[%3d]:%6d -> %6d\n",
4*i,host_histogram[4*i],host_histogram_equal[4*i],
4*i+1,host_histogram[4*i+1],host_histogram_equal[4*i+1],
4*i+2,host_histogram[4*i+2],host_histogram_equal[4*i+2],
4*i+3,host_histogram[4*i+3],host_histogram_equal[4*i+3]);
}
/*remap the image use the equalized histogram*/
char *dev_equalImg;
cudaMalloc((void**)&dev_equalImg,DIM*DIM);
blocks=dim3(DIM/16,DIM/16);threads=dim3(16,16);
equal_remap<<<blocks,threads>>>(dev_gray,dev_equalImg,dev_histogram_equal);
Mat equalImg(DIM,DIM,CV_8UC1,Scalar(255));
cudaMemcpy(equalImg.data,dev_equalImg,DIM*DIM,cudaMemcpyDeviceToHost);
cudaFree(dev_gray);
cudaFree(dev_histogram);
cudaFree(dev_histogram_equal);
imshow("gray_img",img_gray);
imshow("histogram equalization use GPU",equalImg);
waitKey(0);
return 0;
}
源碼下載:GitHub
參考:Parallel Prefix Sum (Scan) with CUDA, Mark Harris, April 2007