已知有一系列觀測贱案,且
刀脏,其表達式如下
其中噪聲,現(xiàn)需要求解
根據(jù)問題最住,易得的概率密度函數(shù)為
因此可以得到
其中钞澳,為標準累積分布函數(shù)
如果認為每次測量獨立,則所有觀測的先驗概率為
現(xiàn)在需要求解使得上式最大涨缚,這里采用數(shù)值解法
- 首先假設(shè)
轧粟,直接通過最小二乘(高斯下的最優(yōu)解)得到
的初始值
- 求解導(dǎo)數(shù)
- 更新直到收斂
=================================================================================
cmake_minimum_required(VERSION 3.9)
project(aoi)
enable_language(CUDA)
set(CMAKE_BUILD_TYPE Release)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/bin)
set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib)
# set(CMAKE_CUDA_ARCHITECTURES 61)
include_directories(${PROJECT_SOURCE_DIR}/include)
aux_source_directory(src SRC_LIST)
add_executable(${PROJECT_NAME} ${SRC_LIST})
#include <iostream>
#include <chrono>
#include "cuda_operator.cuh"
int main() {
auto time_start = std::chrono::system_clock::now();
auto time_end = std::chrono::system_clock::now();
auto duration = time_end - time_start;
std::cout << "cost time: " << duration.count() / 1000000.0 << " ms" << std::endl;
MatchTemplateGPU();
return 0;
}
#include <iostream>
#include <vector>
#include <chrono>
#include "cuda_operator.cuh"
__global__ void MatchTemplateKernel(int *image, int image_width, int image_height, float *conv_res) {
// image size : height * width
// conv_res size : (height + 1) * (width + 1)
int row_col = blockIdx.x * blockDim.x + threadIdx.x;
int *image_ptr = image + row_col * image_width;
float *res_ptr = conv_res + (row_col + 1) * (image_width + 1);
float temp = 0.0;
if(row_col < image_height) {
*res_ptr = 0;
for(int i = 0; i < image_width; ++i) {
temp += *image_ptr;
++image_ptr;
++res_ptr;
*res_ptr = temp;
}
}
// __syncthreads();
__threadfence();
res_ptr = conv_res + row_col + 1;
temp = 0.0;
if(row_col < image_width) {
*res_ptr = 0;
for(int i = 0; i < image_height; ++i) {
res_ptr += image_width + 1;
temp += *res_ptr;
*res_ptr = temp;
}
}
}
__global__ void PrefixSumRow(int *image, int image_width, int image_height, float *res) {
// image size : height * width
// res size : (height + 1) * (width + 1)
int row = blockIdx.x * blockDim.x + threadIdx.x;
int *image_ptr = image + row * image_width;
float *res_ptr = res + (row + 1) * (image_width + 1);
float temp = 0.0;
if(row < image_height) {
*res_ptr = 0;
for(int i = 0; i < image_width; ++i) {
temp += *image_ptr;
++image_ptr;
++res_ptr;
*res_ptr = temp;
}
}
}
__global__ void PrefixSumColumn(float *res, int res_width, int res_height) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
float *res_ptr = res + col;
float pre_sum = *res_ptr;
if(col < res_width) {
*res_ptr = pre_sum;
for(int i = 0; i < res_height; ++i) {
res_ptr += res_width;
pre_sum += *res_ptr;
*res_ptr = pre_sum;
}
}
}
__global__ void PrefixSumColumn1(float *img, int img_width, int img_height) {
__shared__ float integral_img[32][32];
int bundle_row_num = blockDim.x / 32;
int local_row = threadIdx.x / 32;
int local_col = threadIdx.x % 32;
int global_col = blockIdx.x * 32 + local_col;
int row_start = 0;
float cur_thread_value = 0.0;
float pre_sum = 0.0;
float step_sum = 0.0;
float *integral_ptr = NULL;
// integral_img[local_row][local_col] = 0.0;
// __syncthreads();
for(row_start = 0; row_start + bundle_row_num <= img_height; row_start += bundle_row_num) {
if(global_col < img_width) {
integral_ptr = img + (row_start + local_row) * img_width + global_col;
integral_img[local_row][local_col] = *integral_ptr;
__syncthreads();
step_sum = integral_img[threadIdx.x % bundle_row_num][threadIdx.x / bundle_row_num];
#pragma unroll
for(int i = 1; i <= 16; i *= 2) {
cur_thread_value = __shfl_up_sync(0xffffffff, step_sum, i, 32);
if(threadIdx.x % bundle_row_num >= i) {
step_sum += cur_thread_value;
}
}
step_sum += pre_sum;
pre_sum = __shfl_sync(0xffffffff, step_sum, bundle_row_num - 1, bundle_row_num);
__syncthreads();
*integral_ptr = step_sum;
}
}
if(row_start + local_row < img_height && global_col < img_width) {
integral_ptr = img + (row_start + local_row) * img_width + global_col;
integral_img[local_row][local_col] = *integral_ptr;
__syncthreads();
step_sum = integral_img[threadIdx.x % bundle_row_num][threadIdx.x / bundle_row_num];
#pragma unroll
for(int i = 1; i <= 16; i *= 2) {
cur_thread_value = __shfl_up_sync(0xffffffff, step_sum, i, 32);
if(threadIdx.x % bundle_row_num >= i) {
step_sum += cur_thread_value;
}
}
step_sum += pre_sum;
__syncthreads();
*integral_ptr = step_sum;
}
}
void MatchTemplateGPU() {
int image_width = 10;
int image_height = 10;
int image_size = image_width * image_height;
int integral_image_size = (image_width + 1) * (image_height + 1);
std::vector<float> res(integral_image_size, 0);
std::vector<int> image(image_size, 0);
for(int i = 0; i < image.size(); ++i) {
image[i] = i + 1;
}
float count = 1;
for(int i = image_width + 1; i < res.size(); ++i) {
if(i % (image_width + 1) == 0) continue;
res[i] = count++;
}
std::cout << "conv res" << std::endl;
for(int i = 0; i <= image_height; ++i) {
for(int j = 0; j <= image_width; ++j) {
std::cout << res[i * (image_width + 1) + j] << " ";
}
std::cout << std::endl;
}
image_size = image_size * sizeof(int);
integral_image_size = integral_image_size * sizeof(float);
dim3 threads_per_block(32 * 32);
dim3 blocks_per_grid((max(image_width, image_height) + threads_per_block.x - 1) / threads_per_block.x);
int *image_gpu;
float *res_gpu;
cudaMalloc(&image_gpu, image_size);
cudaMalloc(&res_gpu, integral_image_size);
// cudaMemset(res_gpu, 0, integral_image_size);
cudaMemcpy(image_gpu, &image[0], image_size, cudaMemcpyHostToDevice);
cudaMemcpy(res_gpu, &res[0], integral_image_size, cudaMemcpyHostToDevice);
auto time_start = std::chrono::system_clock::now();
// MatchTemplateKernel<<<blocks_per_grid, threads_per_block>>>(image_gpu, image_width, image_height, res_gpu);
// cudaDeviceSynchronize();
// PrefixSumRow<<<blocks_per_grid, threads_per_block>>>(image_gpu, image_width, image_height, res_gpu);
// PrefixSumColumn<<<blocks_per_grid, threads_per_block>>>(res_gpu, image_width + 1, image_height + 1);
PrefixSumColumn1<<<blocks_per_grid, threads_per_block>>>(res_gpu, image_width + 1, image_height + 1);
// cudaError_t err = cudaGetLastError();
// if(err != cudaSuccess) {
// printf("CUDA Error: %s\n", cudaGetErrorString(err));
// }
auto time_end = std::chrono::system_clock::now();
auto duration = time_end - time_start;
cudaMemcpy(&res[0], res_gpu, integral_image_size, cudaMemcpyDeviceToHost);
std::cout << "GPU cost time: " << duration.count() / 1000000.0 << " ms" << std::endl;
cudaFree(image_gpu);
cudaFree(res_gpu);
// std::cout << "image" << std::endl;
// for(int i = 0; i < image_height; ++i) {
// for(int j = 0; j < image_width; ++j) {
// std::cout << image[i * image_width + j] << " ";
// }
// std::cout << std::endl;
// }
std::cout << "conv res" << std::endl;
for(int i = 0; i <= image_height; ++i) {
for(int j = 0; j <= image_width; ++j) {
std::cout << res[i * (image_width + 1) + j] << " ";
}
std::cout << std::endl;
}
}