CPU優化測試
結論:預處理速度方面,採用三方庫進行處理時還不是瓶頸,難度在於保持結果與速度和三方庫一致時預處理的自實現。
實現與測試內容:resize 函數實現和優化(雙線性插值)
測試工程代碼github: https://github.com/sisong/demoForHssBlog/tree/master/ZoomDemo
測試環境: i5-8400 2.8GHz 6核心
圖像縮放大小: 1960 * 1080 -> 1360 * 720
優化方式 | 函數名稱 | 時間 (ms) |
---|---|---|
1、基礎版本(浮點實現版本) | PicZoom_Bilinear0 | 23.66 |
2、浮點優化為整數 | PicZoom_Bilinear1 | 12.18 |
3、圖像邊界與其他區域分開計算 | PicZoom_Bilinear2 | 7.11 |
4、邊界處使用近似值處理 | PicZoom_ftBilinear_Common | 5.36 |
5、MMX指令改寫(3) | PicZoom_Bilinear_MMX | 4.01 |
6、SSE2指令集改寫(4) | PicZoom_ftBilinear_SSE2 | 2.83 |
7、SSE2指令集改寫(4)+ 預計算縮放系數表 | PicZoom_ftBilinearTable_SSE2 | 1.12 |
Opencv3.1 安裝版 | — | 1.36 |
雙線性插值公式
如圖,已知Q12,Q22,Q11,Q21,但是要插值的點為P點,這就要用雙線性插值:
首先在x軸方向上,對R1和R2兩個點進行插值,然後根據R1和R2對P點進行插值,這就是所謂的雙線性插值。
在圖像處理的時候,我們先根據
srcX = dstX * (srcWidth/dstWidth)
srcY = dstY * (srcHeight/dstHeight) 來計算目標像素在源圖像中的位置,這裡計算的srcX和srcY一般都是浮點數,比如f(1.2, 3.4)這個像素點是虛擬存在的,先找到與它臨近的四個實際存在的像素點
(1,3) (2,3) (1,4) (2,4)
寫成f(i+u,j+v)的形式,則u=0.2, v=0.4, i=1, j=3
直接整理一步計算,f(i+u,j+v) = (1-u)(1-v)f(i,j) + (1-u)vf(i,j+1) + u(1-v)f(i+1,j) + uvf(i+1,j+1)。
假設源圖像是3 * 3,中心點坐標(1,1);目標圖像是9 * 9,中心點坐標(4,4)。我們在進行插值映射的時候,盡可能希望均勻的用到源圖像的像素信息,最直觀的就是(4,4)映射到(1,1)。現在直接計算srcX=4*3/9=1.3333 != 1,也就是我們在插值的時候所利用的像素集中在圖像的右下方,而不是均勻分布整個圖像。
為了保證圖像縮放時候均勻的用到源圖像的像素信息,我們在原始的浮點坐標上加上了0.5*(srcWidth/dstWidth-1)這樣一個控制因子,即:
srcX=dstX* (srcWidth/dstWidth)+0.5*(srcWidth/dstWidth-1)
此時 srcX=(4+0.5)*3/9-0.5=1
所以,在雙線性插值計算時候,大多都採用中心對齊方式(Opencv、Matlab也是)。中心對齊公式: SrcX=(dstX+0.5)*(srcWidth/dstWidth)-0.5; SrcY=(dstY+0.5)*(srcHeight/dstHeight)-0.5
代碼塊:
注:為保證代碼更為方便的閱讀,均將調用的子函數寫在了主函數的下方(未將聲明寫在主函數前);
一、像素格式
//圖像數(shù)據(jù)區(qū)的描述信息
struct TPixels32Ref{
public:
Color32* pdata; //圖像數(shù)據(jù)區(qū)首地址 即 y==0行的顏色首地址
long byte_width; //一行圖像數(shù)據(jù)的字節(jié)寬度 正負(fù)值都有可能
long width; //圖像寬度
long height; //圖像高度
inline TPixels32Ref() :pdata(0),byte_width(0),width(0),height(0){}
inline TPixels32Ref(const TPixels32Ref& ref) :pdata(ref.pdata),byte_width(ref.byte_width),width(ref.width),height(ref.height){}
//訪問(x,y)坐標(biāo)處的顏色
inline Color32& pixels(const long x,const long y) const { return getLinePixels(y)[x]; }
//得到y(tǒng)行的顏色首地址
inline Color32* getLinePixels(const long y) const { return (Color32*) ( ((UInt8*)pdata) + byte_width*y ); }
//是否是空圖像區(qū)
inline bool getIsEmpty()const { return ((width<=0)||(height<=0)); }
//將pline指向下一行顏色
inline void nextLine(Color32*& pline)const { ((UInt8*&)pline)+=byte_width; }
//坐標(biāo)邊界飽和 如果(x,y)坐標(biāo)在圖片數(shù)據(jù)區(qū)外,(x,y)值會被設(shè)置到圖片最近的邊界內(nèi),并返回false(否則什么也不做,返回true) //警告! 圖片區(qū)域不能為空
inline bool clipToBorder(long& x, long& y)const{ //a=2 /a+1
bool isIn = true;
if (x < 0) {
isIn = false; x = 0;
} else if (x >= width) {
isIn = false; x = width - 1;
}
if (y < 0) {
isIn = false; y = 0;
} else if (y >= height) {
isIn = false; y = height - 1;
}
return isIn;
}
//獲取一個點(diǎn)的顏色,默認(rèn)執(zhí)行邊界飽和測試 當(dāng)坐標(biāo)超出區(qū)域的時候返回的顏色為最近的邊界上的顏色值并且其alpha通道置零 //警告! 圖片區(qū)域不能為空 速度很慢
inline Color32 getPixelsBorder(long x, long y) const {
bool isInPic = clipToBorder(x,y);
Color32 result = pixels(x,y);
if (!isInPic)
result.a=0;
return result;
}
};
二、雙線性插值函數:
1、基礎版本(浮點實現版本)
//主函數(shù)
void PicZoom_Bilinear0(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(0==Src.width)||(0==Src.height)) return;
long dst_width=Dst.width;
Color32* pDstLine=Dst.pdata;
for (long y=0;y<Dst.height;++y)
{
double srcy=(y+0.4999999)*Src.height/Dst.height-0.5; //幾合中心對齊方式香椎,
for (long x=0;x<dst_width;++x)
{
double srcx=(x+0.4999999)*Src.width/Dst.width-0.5;//幾合中心對齊方式
Bilinear0(Src,srcx,srcy,&pDstLine[x]);
}
((UInt8*&)pDstLine)+=Dst.byte_width;
}
}
// Samples pic at the fractional position (fx,fy) by blending the four
// surrounding pixels (fetched with border saturation) and stores the blended
// colour in *result.
must_inline void Bilinear0(const TPixels32Ref& pic,double fx,double fy,Color32* result)
{
    // floor(): the cast truncates toward zero, so negative values need one step down.
    long left=(long)fx;
    if (left>fx) left-=1;
    long top=(long)fy;
    if (top>fy) top-=1;

    const Color32 topLeft    =pic.getPixelsBorder(left,  top);
    const Color32 topRight   =pic.getPixelsBorder(left+1,top);
    const Color32 bottomLeft =pic.getPixelsBorder(left,  top+1);
    const Color32 bottomRight=pic.getPixelsBorder(left+1,top+1);

    // Fractional offsets inside the cell and the four blend weights.
    const double u=fx-left;
    const double v=fy-top;
    const double wBottomRight=u*v;
    const double wTopRight   =u*(1-v);
    const double wBottomLeft =v*(1-u);
    const double wTopLeft    =(1-u)*(1-v);

    result->a=(UInt8)(wTopLeft*topLeft.a+wBottomLeft*bottomLeft.a+wTopRight*topRight.a+wBottomRight*bottomRight.a);
    result->r=(UInt8)(wTopLeft*topLeft.r+wBottomLeft*bottomLeft.r+wTopRight*topRight.r+wBottomRight*bottomRight.r);
    result->g=(UInt8)(wTopLeft*topLeft.g+wBottomLeft*bottomLeft.g+wTopRight*topRight.g+wBottomRight*bottomRight.g);
    result->b=(UInt8)(wTopLeft*topLeft.b+wBottomLeft*bottomLeft.b+wTopRight*topRight.b+wBottomRight*bottomRight.b);
}
2、浮點優化為整數
//主函數(shù)
//將浮點(diǎn)數(shù)改成整數(shù)計算
void PicZoom_Bilinear1(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(0==Src.width)||(0==Src.height)) return;
long xrIntFloat_16=((Src.width)<<16)/Dst.width+1;
long yrIntFloat_16=((Src.height)<<16)/Dst.height+1;
const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);
long dst_width=Dst.width;
Color32* pDstLine=Dst.pdata;
long srcy_16=csDErrorY;
long y;
for (y=0;y<Dst.height;++y)
{
long srcx_16=csDErrorX;
for (long x=0;x<dst_width;++x)
{
Bilinear1(Src,srcx_16,srcy_16,&pDstLine[x]); //border
srcx_16+=xrIntFloat_16;
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
}
// Samples pic at the 16.16 fixed-point position (x_16,y_16): the integer part
// selects the 2x2 neighbourhood (fetched with border saturation), the top 8
// fraction bits give the blend weights. The blended colour goes to *result.
must_inline void Bilinear1(const TPixels32Ref& pic,const long x_16,const long y_16,Color32* result)
{
    const long left=x_16>>16;
    const long top =y_16>>16;
    const Color32 topLeft    =pic.getPixelsBorder(left,  top);
    const Color32 topRight   =pic.getPixelsBorder(left+1,top);
    const Color32 bottomLeft =pic.getPixelsBorder(left,  top+1);
    const Color32 bottomRight=pic.getPixelsBorder(left+1,top+1);

    // 8-bit fractions and their complements; each product is a 16-bit weight
    // and the four weights add up to 1<<16.
    const unsigned long u_8=(x_16 & 0xFFFF)>>8;
    const unsigned long v_8=(y_16 & 0xFFFF)>>8;
    const unsigned long ru_8=256-u_8;
    const unsigned long rv_8=256-v_8;
    const unsigned long wBottomRight=u_8*v_8;
    const unsigned long wTopRight   =u_8*rv_8;
    const unsigned long wBottomLeft =v_8*ru_8;
    const unsigned long wTopLeft    =ru_8*rv_8;

    result->a=(UInt8)((wTopLeft*topLeft.a+wBottomLeft*bottomLeft.a+wTopRight*topRight.a+wBottomRight*bottomRight.a)>>16);
    result->r=(UInt8)((wTopLeft*topLeft.r+wBottomLeft*bottomLeft.r+wTopRight*topRight.r+wBottomRight*bottomRight.r)>>16);
    result->g=(UInt8)((wTopLeft*topLeft.g+wBottomLeft*bottomLeft.g+wTopRight*topRight.g+wBottomRight*bottomRight.g)>>16);
    result->b=(UInt8)((wTopLeft*topLeft.b+wBottomLeft*bottomLeft.b+wTopRight*topRight.b+wBottomRight*bottomRight.b)>>16);
}
3、圖像邊界與其他區域分開計算
//主函數(shù)
void PicZoom_Bilinear2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(0==Src.width)||(0==Src.height)) return;
long xrIntFloat_16=((Src.width)<<16)/Dst.width+1;
long yrIntFloat_16=((Src.height)<<16)/Dst.height+1;
const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);
long dst_width=Dst.width;
//計算出需要特殊處理的邊界
long border_y0=-csDErrorY/yrIntFloat_16+1; //y0+y*yr>=0; y0=csDErrorY => y>=-csDErrorY/yr
if (border_y0>=Dst.height) border_y0=Dst.height;
long border_x0=-csDErrorX/xrIntFloat_16+1;
if (border_x0>=Dst.width ) border_x0=Dst.width;
long border_y1=(((Src.height-2)<<16)-csDErrorY)/yrIntFloat_16+1; //y0+y*yr<=(height-2) => y<=(height-2-csDErrorY)/yr
if (border_y1<border_y0) border_y1=border_y0;
long border_x1=(((Src.width-2)<<16)-csDErrorX)/xrIntFloat_16+1;
if (border_x1<border_x0) border_x1=border_x0;
Color32* pDstLine=Dst.pdata;
long Src_byte_width=Src.byte_width;
long srcy_16=csDErrorY;
long y;
for (y=0;y<border_y0;++y) //
{
long srcx_16=csDErrorX;
for (long x=0;x<dst_width;++x)
{
Bilinear2_Border(Src,srcx_16,srcy_16,&pDstLine[x]); //單獨(dú)計算邊界border
srcx_16+=xrIntFloat_16;
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
for (y=border_y0;y<border_y1;++y)
{
long srcx_16=csDErrorX;
long x;
for (x=0;x<border_x0;++x)
{
Bilinear2_Border(Src,srcx_16,srcy_16,&pDstLine[x]);//單獨(dú)計算邊界border
srcx_16+=xrIntFloat_16;
}
{
unsigned long v_8=(srcy_16 & 0xFFFF)>>8;
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
for (long x=border_x0;x<border_x1;++x)
{
Color32* PColor0=&PSrcLineColor[srcx_16>>16];
Color32* PColor1=(Color32*)((UInt8*)(PColor0)+Src_byte_width);
Bilinear2_Fast(PColor0,PColor1,(srcx_16 & 0xFFFF)>>8,v_8,&pDstLine[x]);
srcx_16+=xrIntFloat_16;
}
}
for (x=border_x1;x<dst_width;++x)
{
Bilinear2_Border(Src,srcx_16,srcy_16,&pDstLine[x]);//單獨(dú)計算邊界border
srcx_16+=xrIntFloat_16;
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
for (y=border_y1;y<Dst.height;++y)
{
long srcx_16=csDErrorX;
for (long x=0;x<dst_width;++x)
{
Bilinear2_Border(Src,srcx_16,srcy_16,&pDstLine[x]); //單獨(dú)計算邊界border
srcx_16+=xrIntFloat_16;
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
}
// Blends a 2x2 neighbourhood without any bounds checks.
// PColor0 points at the upper pixel pair, PColor1 at the lower pair;
// u_8 / v_8 are the horizontal / vertical fractions in 1/256 units.
must_inline void Bilinear2_Fast(Color32* PColor0,Color32* PColor1,unsigned long u_8,unsigned long v_8,Color32* result)
{
    // 16-bit weights that sum to 1<<16. Only one multiplication is needed:
    // the other three weights are derived from u*v by subtraction.
    const unsigned long wLowerRight=u_8*v_8;
    const unsigned long wUpperRight=(u_8<<8)-wLowerRight;
    const unsigned long wLowerLeft =(v_8<<8)-wLowerRight;
    const unsigned long wUpperLeft =(1<<16)-wLowerLeft-wUpperRight-wLowerRight;

    result->a=(UInt8)((wUpperLeft*PColor0[0].a+wUpperRight*PColor0[1].a
                      +wLowerLeft*PColor1[0].a+wLowerRight*PColor1[1].a)>>16);
    result->r=(UInt8)((wUpperLeft*PColor0[0].r+wUpperRight*PColor0[1].r
                      +wLowerLeft*PColor1[0].r+wLowerRight*PColor1[1].r)>>16);
    result->g=(UInt8)((wUpperLeft*PColor0[0].g+wUpperRight*PColor0[1].g
                      +wLowerLeft*PColor1[0].g+wLowerRight*PColor1[1].g)>>16);
    result->b=(UInt8)((wUpperLeft*PColor0[0].b+wUpperRight*PColor0[1].b
                      +wLowerLeft*PColor1[0].b+wLowerRight*PColor1[1].b)>>16);
}
inline void Bilinear2_Border(const TPixels32Ref& pic,const long x_16,const long y_16,Color32* result)
{
long x=(x_16>>16);
long y=(y_16>>16);
unsigned long u_16=((unsigned short)(x_16));
unsigned long v_16=((unsigned short)(y_16));
Color32 pixel[4];
pixel[0]=pic.getPixelsBorder(x,y);
pixel[1]=pic.getPixelsBorder(x+1,y);
pixel[2]=pic.getPixelsBorder(x,y+1);
pixel[3]=pic.getPixelsBorder(x+1,y+1);
Bilinear2_Fast(&pixel[0],&pixel[2],u_16>>8,v_16>>8,result);
}
4、邊界處使用近似值處理
如果不想處理邊界訪問超界問題,可以考慮擴大源圖片的尺寸,加一個邊框(“哨兵”優化);這樣插值算法就不用考慮邊界問題了,程序寫起來也簡單很多。如果對縮放結果的邊界像素級精度要求不是太高,可使用如下縮放公式: Sx=Dx*(SW-1)/DW; Sy=Dy*(SH-1)/DH; (源圖片寬和高:SW>=2;SH>=2)
這個公式不會造成內存訪問超界:
//主函數(shù)
void PicZoom_ftBilinear_Common(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(2>Src.width)||(2>Src.height)) return;
// Sx=Dx*(SW-1)/DW; Sy=Dy*(SH-1)/DH
long xrIntFloat_16=((Src.width-1)<<16)/Dst.width;
long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;
long dst_width=Dst.width;
long Src_byte_width=Src.byte_width;
Color32* pDstLine=Dst.pdata;
long srcy_16=0;
for (long y=0;y<Dst.height;++y)
{
unsigned long v_8=(srcy_16 & 0xFFFF)>>8;
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
long srcx_16=0;
for (long x=0;x<dst_width;++x)
{
Color32* PColor0=&PSrcLineColor[srcx_16>>16];
Bilinear_Fast_Common(PColor0,(Color32*)((UInt8*)(PColor0)+Src_byte_width),(srcx_16 & 0xFFFF)>>8,v_8,&pDstLine[x]);
srcx_16+=xrIntFloat_16;
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
}
// Blends a 2x2 neighbourhood with 8-bit weights, processing two colour
// channels per multiplication.
//   PColor0 : upper pixel pair (PColor0[0], PColor0[1])
//   PColor1 : lower pixel pair (PColor1[0], PColor1[1])
//   u_8,v_8 : horizontal / vertical fraction in 1/256 units (0..255)
//   result  : receives the blended pixel
// Each pixel is handled as one packed 32-bit word: the bytes at bits 0-7 and
// 16-23 travel together in `BR`, the bytes at bits 8-15 and 24-31 in `GA`.
// NOTE(review): assumes sizeof(Color32)==4 and still reads pixels through an
// integer pointer, exactly as before - confirm that is acceptable for the
// compilers in use.
must_inline void Bilinear_Fast_Common(Color32* PColor0,Color32* PColor1,unsigned long u_8,unsigned long v_8,Color32* result)
{
    // The word type must be exactly 32 bits wide. `unsigned long` is 64 bits on
    // LP64 targets, where word index [1] would address the pixel two positions
    // away and the final store would overwrite the neighbouring destination
    // pixel; `unsigned int` is 32 bits on ILP32, LLP64 and LP64 alike.
    typedef unsigned int Packed32;

    // 8-bit weights that sum to 256.
    const Packed32 pm3_8=(Packed32)((u_8*v_8)>>8);
    const Packed32 pm2_8=(Packed32)u_8-pm3_8;
    const Packed32 pm1_8=(Packed32)v_8-pm3_8;
    const Packed32 pm0_8=256-pm1_8-pm2_8-pm3_8;

    // Upper-left pixel.
    Packed32 Color=*(const Packed32*)(PColor0);
    Packed32 BR=(Color & 0x00FF00FF)*pm0_8;
    Packed32 GA=((Color & 0xFF00FF00)>>8)*pm0_8;
    // Upper-right pixel.
    Color=((const Packed32*)(PColor0))[1];
    GA+=((Color & 0xFF00FF00)>>8)*pm2_8;
    BR+=(Color & 0x00FF00FF)*pm2_8;
    // Lower-left pixel.
    Color=*(const Packed32*)(PColor1);
    GA+=((Color & 0xFF00FF00)>>8)*pm1_8;
    BR+=(Color & 0x00FF00FF)*pm1_8;
    // Lower-right pixel.
    Color=((const Packed32*)(PColor1))[1];
    GA+=((Color & 0xFF00FF00)>>8)*pm3_8;
    BR+=(Color & 0x00FF00FF)*pm3_8;

    // Each 16-bit lane now holds channel*256; keep the high byte of every lane
    // and interleave the two halves back into one pixel.
    *(Packed32*)(result)=(GA & 0xFF00FF00)|((BR & 0xFF00FF00)>>8);
}
5、MMX指令改寫(3)
//主函數(shù)
void PicZoom_Bilinear_MMX(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(0==Src.width)||(0==Src.height)) return;
long xrIntFloat_16=((Src.width)<<16)/Dst.width+1;
long yrIntFloat_16=((Src.height)<<16)/Dst.height+1;
const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);
long dst_width=Dst.width;
//計算出需要特殊處理的邊界
long border_y0=-csDErrorY/yrIntFloat_16+1; //y0+y*yr>=0; y0=csDErrorY => y>=-csDErrorY/yr
if (border_y0>=Dst.height) border_y0=Dst.height;
long border_x0=-csDErrorX/xrIntFloat_16+1;
if (border_x0>=Dst.width ) border_x0=Dst.width;
long border_y1=(((Src.height-2)<<16)-csDErrorY)/yrIntFloat_16+1; //y0+y*yr<=(height-2) => y<=(height-2-csDErrorY)/yr
if (border_y1<border_y0) border_y1=border_y0;
long border_x1=(((Src.width-2)<<16)-csDErrorX)/xrIntFloat_16+1;
if (border_x1<border_x0) border_x1=border_x0;
Color32* pDstLine=Dst.pdata;
long Src_byte_width=Src.byte_width;
long srcy_16=csDErrorY;
long y;
for (y=0;y<border_y0;++y)
{
long srcx_16=csDErrorX;
for (long x=0;x<dst_width;++x)
{
Bilinear_Border_MMX(Src,srcx_16,srcy_16,&pDstLine[x]); //border
srcx_16+=xrIntFloat_16;
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
for (y=border_y0;y<border_y1;++y)
{
long srcx_16=csDErrorX;
long x;
for (x=0;x<border_x0;++x)
{
Bilinear_Border_MMX(Src,srcx_16,srcy_16,&pDstLine[x]);//border
srcx_16+=xrIntFloat_16;
}
{
unsigned long v_8=(srcy_16 & 0xFFFF)>>8;
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
for (long x=border_x0;x<border_x1;++x)
{
Color32* PColor0=&PSrcLineColor[srcx_16>>16];
Color32* PColor1=(Color32*)((UInt8*)(PColor0)+Src_byte_width);
Bilinear_Fast_MMX(PColor0,PColor1,(srcx_16 & 0xFFFF)>>8,v_8,&pDstLine[x]);
srcx_16+=xrIntFloat_16;
}
}
for (x=border_x1;x<dst_width;++x)
{
Bilinear_Border_MMX(Src,srcx_16,srcy_16,&pDstLine[x]);//border
srcx_16+=xrIntFloat_16;
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
for (y=border_y1;y<Dst.height;++y)
{
long srcx_16=csDErrorX;
for (long x=0;x<dst_width;++x)
{
Bilinear_Border_MMX(Src,srcx_16,srcy_16,&pDstLine[x]); //border
srcx_16+=xrIntFloat_16;
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
asm emms
}
void Bilinear_Border_MMX(const TPixels32Ref& pic,const long x_16,const long y_16,Color32* result)
{
long x=(x_16>>16);
long y=(y_16>>16);
unsigned long u_16=((unsigned short)(x_16));
unsigned long v_16=((unsigned short)(y_16));
Color32 pixel[4];
pixel[0]=pic.getPixelsBorder(x,y);
pixel[1]=pic.getPixelsBorder(x+1,y);
pixel[2]=pic.getPixelsBorder(x,y+1);
pixel[3]=pic.getPixelsBorder(x+1,y+1);
Bilinear_Fast_MMX(&pixel[0],&pixel[2],u_16>>8,v_16>>8,result);
}
// MMX blend of a 2x2 neighbourhood, all four channels at once.
//   PColor0 : upper pixel pair (read at [PColor0] and [PColor0+4])
//   PColor1 : lower pixel pair (read at [PColor1] and [PColor1+4])
//   u_8,v_8 : horizontal / vertical fraction in 1/256 units
//   result  : receives the blended pixel (4 bytes)
// Each row is interpolated horizontally as (left*256+(right-left)*u)>>8, then
// the two rows are interpolated vertically the same way with v.
// emms is NOT executed here; the visible caller PicZoom_Bilinear_MMX issues it
// once after the whole image has been processed.
must_inline void Bilinear_Fast_MMX(Color32* PColor0,Color32* PColor1,unsigned long u_8,unsigned long v_8,Color32* result)
{
asm
{
MOVD MM6,v_8
MOVD MM5,u_8
mov edx,PColor0
mov eax,PColor1
PXOR mm7,mm7 //mm7=0, used to widen bytes to words and for the final pack
MOVD MM2,dword ptr [eax] //MM2=PColor1[0]
MOVD MM0,dword ptr [eax+4] //MM0=PColor1[1]
PUNPCKLWD MM5,MM5
PUNPCKLWD MM6,MM6
MOVD MM3,dword ptr [edx] //MM3=PColor0[0]
MOVD MM1,dword ptr [edx+4] //MM1=PColor0[1]
PUNPCKLDQ MM5,MM5 //MM5=u_8 replicated into all four words
PUNPCKLBW MM0,MM7 //widen the four channel bytes of each pixel to words
PUNPCKLBW MM1,MM7
PUNPCKLBW MM2,MM7
PUNPCKLBW MM3,MM7
PSUBw MM0,MM2 //MM0=PColor1[1]-PColor1[0]
PSUBw MM1,MM3 //MM1=PColor0[1]-PColor0[0]
PSLLw MM2,8 //MM2=PColor1[0]*256
PSLLw MM3,8 //MM3=PColor0[0]*256
PMULlw MM0,MM5 //difference*u_8
PMULlw MM1,MM5
PUNPCKLDQ MM6,MM6 //MM6=v_8 replicated into all four words
PADDw MM0,MM2
PADDw MM1,MM3
PSRLw MM0,8 //MM0=lower row interpolated horizontally
PSRLw MM1,8 //MM1=upper row interpolated horizontally
PSUBw MM0,MM1 //MM0=lower-upper
PSLLw MM1,8 //MM1=upper*256
PMULlw MM0,MM6 //(lower-upper)*v_8
mov eax,result
PADDw MM0,MM1
PSRLw MM0,8 //final channel values, one per word
PACKUSwb MM0,MM7 //pack the words back to bytes with unsigned saturation
movd [eax],MM0 //store the blended pixel
//emms
}
}
6、SSE2指令集改寫(4)
//主函數(shù)
// SSE2 version of the approximate-mapping bilinear zoom (Sx=Dx*(SW-1)/DW,
// Sy=Dy*(SH-1)/DH). Each destination row is produced by an inline-assembly
// loop that calls ftBilinear_SSE2_expand2 for pairs of pixels and
// ftBilinear_SSE2 for a trailing odd pixel.
// Register contract set up here for those two routines:
//   xmm6=v_8 in every word, xmm7=0, edx=srcx_16, esi=PSrcLineColor,
//   ecx=PSrcLineColorNext, ebp=xrIntFloat_16, output written at [edi+ebx*4].
void PicZoom_ftBilinear_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
// Nothing to draw for an empty destination; the source must be at least 2x2.
if ( (0==Dst.width)||(0==Dst.height)
||(2>Src.width)||(2>Src.height)) return;
// 16.16 fixed-point source step per destination pixel.
long xrIntFloat_16=((Src.width-1)<<16)/Dst.width;
long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;
long dst_width=Dst.width;
long Src_byte_width=Src.byte_width;
Color32* pDstLine=Dst.pdata;
long srcy_16=0;
asm pxor xmm7,xmm7 //xmm7=0
for (long y=0;y<Dst.height;++y)
{
// Vertical fraction (top 8 bits) and the two source rows blended for this output row.
unsigned long v_8=(srcy_16 & 0xFFFF)>>8;
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src_byte_width*(srcy_16>>16)) ;
Color32* PSrcLineColorNext= (Color32*)((UInt8*)(PSrcLineColor)+Src_byte_width) ;
asm
{
movd xmm6,v_8
PUNPCKLWD xmm6,xmm6
PUNPCKLDQ xmm6,xmm6
PUNPCKLQDQ xmm6,xmm6//xmm6=v_8
mov esi,PSrcLineColor
mov ecx,PSrcLineColorNext
xor edx,edx //srcx_16=0
mov ebx,dst_width
mov edi,pDstLine
push ebp //ebp is borrowed to hold the x step; restored at end_write
mov ebp,xrIntFloat_16
push ebx //keep dst_width for the odd-pixel tail
and ebx,(not 1) //ebx=number of pixels handled in pairs
test ebx,ebx //nop
jle end_loop2
lea edi,[edi+ebx*4] //edi=end of the paired region
neg ebx //ebx counts up from -pairedCount to 0
loop2_start:
call ftBilinear_SSE2_expand2
lea edx,[edx+ebp*2] //advance srcx_16 by two steps
add ebx,2
jnz loop2_start
end_loop2:
pop ebx
and ebx,1 //ebx=1 when one trailing pixel remains
test ebx,ebx
jle end_write
lea edi,[edi+ebx*4]
neg ebx
loop1_start:
call ftBilinear_SSE2
lea edx,[edx+ebp] //advance srcx_16 by one step
add ebx,1
jnz loop1_start
end_write:
pop ebp
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
// ftBilinear_SSE2_expand2 stores through the MMX register mm4, so the MMX
// state is cleared before returning.
// NOTE(review): that store uses movntq (non-temporal); confirm whether an
// sfence is required before other code reads Dst.
asm emms
}
//ftBilinear_SSE2_expand2(out [edi+ebx*4];xmm6=v_8,xmm7=0,edx=srcx_16,esi=PSrcLineColor,ecx=PSrcLineColorNext,ebp=xrIntFloat_16)
// Naked helper that produces TWO adjacent destination pixels: the first is
// sampled at srcx_16 (edx), the second at srcx_16+xrIntFloat_16 (edx+ebp).
// It takes no C++ arguments; all inputs arrive in the registers listed above
// and it is reached via `call` from PicZoom_ftBilinear_SSE2.
// Modifies eax, xmm0-xmm5 and mm4.
void __declspec(naked) ftBilinear_SSE2_expand2()
{
asm
{
lea eax,[edx+ebp] //eax=source x of the second pixel
MOVD XMM5,edx
MOVD XMM4,eax
PUNPCKLWD XMM5,XMM4 //word0=low 16 bits of first x, word1=low 16 bits of second x
PSRLW XMM5,8 //word0=u_8, word1=u_8' (top 8 fraction bits)
mov eax,edx
shr eax,16 //srcx_16>>16
PUNPCKLWD XMM5,XMM5
MOVQ XMM2, qword ptr [ecx+eax*4]//XMM2=left/right pixel pair of the first sample, next row
MOVQ XMM3, qword ptr [esi+eax*4]//XMM3=left/right pixel pair of the first sample, current row
lea eax,[edx+ebp]
shr eax,16 //srcx_16>>16
PUNPCKLDQ XMM5,XMM5 //XMM5=u_8' u_8' u_8' u_8' u_8 u_8 u_8 u_8 (high word to low word)
movq xmm4,qword ptr [ecx+eax*4] //pixel pair of the second sample, next row
PUNPCKLDQ XMM2,xmm4//XMM2 (low to high)=left, left', right, right' of the next row
movq xmm4,qword ptr [esi+eax*4] //pixel pair of the second sample, current row
PUNPCKLDQ XMM3,xmm4//XMM3 (low to high)=left, left', right, right' of the current row
MOVHLPS XMM0,XMM2 //XMM0 low half=right, right' of the next row
MOVHLPS XMM1,XMM3 //XMM1 low half=right, right' of the current row
PUNPCKLBW XMM0,XMM7 //widen channel bytes to words
PUNPCKLBW XMM1,XMM7
PUNPCKLBW XMM2,XMM7
PUNPCKLBW XMM3,XMM7
PSUBw XMM0,XMM2 //right-left, next row
PSUBw XMM1,XMM3 //right-left, current row
PSLLw XMM2,8 //left*256
PSLLw XMM3,8
PMULlw XMM0,XMM5 //(right-left)*u, each pixel with its own fraction
PMULlw XMM1,XMM5
PADDw XMM0,XMM2
PADDw XMM1,XMM3
PSRLw XMM0,8 //XMM0=next row interpolated horizontally
PSRLw XMM1,8 //XMM1=current row interpolated horizontally
PSUBw XMM0,XMM1 //next-current
PSLLw XMM1,8 //current*256
PMULlw XMM0,XMM6 //(next-current)*v_8
PADDw XMM0,XMM1
PSRLw XMM0,8
PACKUSwb XMM0,XMM7 //pack both pixels back to bytes with unsigned saturation
//MOVQ qword ptr [edi+ebx*4], xmm0//write two DstColor
MOVDQ2Q mm4,xmm0
movntq qword ptr [edi+ebx*4],mm4 //non-temporal store of the two destination pixels
ret
}
}
//ftBilinear_SSE2(out [edi+ebx*4];xmm6=v_8,xmm7=0,edx=srcx_16,esi=PSrcLineColor,ecx=PSrcLineColorNext,ebp=xrIntFloat_16)
// Naked helper that produces ONE destination pixel sampled at srcx_16 (edx).
// It takes no C++ arguments; all inputs arrive in the registers listed above
// and it is reached via `call` from PicZoom_ftBilinear_SSE2.
// Modifies eax and xmm0-xmm3, xmm5.
void __declspec(naked) ftBilinear_SSE2()
{
asm
{
mov eax,edx
shl eax,16
shr eax,24
//== movzx eax,dh //eax=u_8
MOVD XMM5,eax
mov eax,edx
shr eax,16 //srcx_16>>16
MOVD XMM0, dword ptr [ecx+eax*4+4]//XMM0=right pixel, next row
MOVD XMM2, dword ptr [ecx+eax*4] //XMM2=left pixel, next row
PUNPCKLWD XMM5,XMM5
MOVD XMM1, dword ptr [esi+eax*4+4]//XMM1=right pixel, current row
MOVD XMM3, dword ptr [esi+eax*4] //XMM3=left pixel, current row
PUNPCKLDQ XMM5,XMM5 //XMM5=u_8 in its four low words
PUNPCKLBW XMM0,XMM7 //widen channel bytes to words
PUNPCKLBW XMM1,XMM7
PUNPCKLBW XMM2,XMM7
PUNPCKLBW XMM3,XMM7
PSUBw XMM0,XMM2 //right-left, next row
PSUBw XMM1,XMM3 //right-left, current row
PSLLw XMM2,8 //left*256
PSLLw XMM3,8
PMULlw XMM0,XMM5 //(right-left)*u_8
PMULlw XMM1,XMM5
PADDw XMM0,XMM2
PADDw XMM1,XMM3
PSRLw XMM0,8 //XMM0=next row interpolated horizontally
PSRLw XMM1,8 //XMM1=current row interpolated horizontally
PSUBw XMM0,XMM1 //next-current
PSLLw XMM1,8 //current*256
PMULlw XMM0,XMM6 //(next-current)*v_8
PADDw XMM0,XMM1
PSRLw XMM0,8
PACKUSwb XMM0,XMM7 //pack back to bytes with unsigned saturation
MOVd dword ptr [edi+ebx*4],XMM0 //write DstColor
ret
}
}
7、SSE2指令集改寫(4)+預計算縮放系數表
// SSE2 bilinear zoom (approximate mapping Sx=Dx*(SW-1)/DW, Sy=Dy*(SH-1)/DH)
// that precomputes, once per call, the source column index and the horizontal
// weights of every destination column, so the per-row loop only does table
// look-ups. Pixels are produced by the macros ftBilinearTable_SSE2_expand2()
// (two at a time) and ftBilinearTable_SSE2() (trailing odd pixel).
void PicZoom_ftBilinearTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
// Nothing to draw for an empty destination; the source must be at least 2x2.
if ( (0==Dst.width)||(0==Dst.height)
||(2>Src.width)||(2>Src.height)) return;
// 16.16 fixed-point source step per destination pixel.
long xrIntFloat_16=((Src.width-1)<<16)/Dst.width;
long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;
long dst_width=Dst.width;
// One allocation holds both tables: two TMMXData64 weight entries per column
// (plus 15 bytes of alignment slack) followed by one Int32 index per column.
// NOTE(review): TMMXData64 is presumably a 64-bit integer type - confirm.
UInt8* _bufMem=new UInt8[(dst_width*2*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)];
TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4); //round up to a 16-byte boundary
Int32* xList=(Int32*)(uList+dst_width*2);
{//init u table
long srcx_16=0;
for (long x=0;x<dst_width*2;x+=2){
xList[x>>1]=(srcx_16>>16); // integer source column
unsigned long u=(srcx_16>>8)&0xFF; // top 8 fraction bits
unsigned long ur=(256-u)<<1; // weight of the left pixel, doubled
u=u<<1; // weight of the right pixel, doubled
// Replicate each 16-bit weight into all four words of a 64-bit entry.
uList[x+0]=(ur|(ur<<16));
uList[x+0]|=uList[x+0]<<32;
uList[x+1]=u|(u<<16);
uList[x+1]|=uList[x+1]<<32;
srcx_16+=xrIntFloat_16;
}
}
Color32* pDstLine=Dst.pdata;
long srcy_16=0;
asm pxor xmm7,xmm7 //xmm7=0
for (long y=0;y<Dst.height;++y){
// Vertical weights, halved: vr applies to the current source row, v to the next one.
unsigned long v=(srcy_16>>8) & 0xFF;
unsigned long vr=(256-v)>>1;
v>>=1;
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src.byte_width*(srcy_16>>16)) ;
Color32* PSrcLineColorNext= (Color32*)((UInt8*)(PSrcLineColor)+Src.byte_width) ;
asm{
movd xmm5,vr
movd xmm6,v
punpcklwd xmm5,xmm5
punpcklwd xmm6,xmm6
punpckldq xmm5,xmm5
punpckldq xmm6,xmm6
punpcklqdq xmm5,xmm5 //xmm5=vr in every word
punpcklqdq xmm6,xmm6 //xmm6=v in every word
mov esi,PSrcLineColor
mov ecx,PSrcLineColorNext
mov edx,xList //x
mov ebx,dst_width
mov edi,pDstLine
push ebp //ebp is borrowed for the weight table; restored at end_write
mov ebp,uList
push ebx //keep dst_width for the odd-pixel tail
and ebx,(not 1) //ebx=number of pixels handled in pairs
test ebx,ebx
jle end_loop2
lea ebx,[ebx*4] //convert the pixel count to a byte offset (4 bytes per pixel / index)
lea edi,[edi+ebx] //end of the paired region in the destination row
lea edx,[edx+ebx] //... in the index table
lea ebp,[ebp+ebx*4] //... in the weight table (16 bytes per column)
neg ebx //ebx counts up from -offset to 0
loop2_start:
//call ftBilinearTable_SSE2_expand2
ftBilinearTable_SSE2_expand2()
add ebx,8 //two pixels done
jnz loop2_start
end_loop2:
pop ebx
and ebx,1 //ebx=1 when one trailing pixel remains
test ebx,ebx
jle end_write
lea ebx,[ebx*4]
lea edi,[edi+ebx]
lea edx,[edx+ebx]
lea ebp,[ebp+ebx*4]
neg ebx
loop1_start:
//call ftBilinearTable_SSE2
ftBilinearTable_SSE2()
add ebx,4 //one pixel done
jnz loop1_start
end_write:
pop ebp
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
delete []_bufMem;
}
//ftBilinearTable_SSE2(out [edi+ebx]; xmm5=vr,xmm6=v,xmm7=0,[ebp+ebx*4]=(ur,u),[edx+ebx]=src_x,esi=PSrcLineColor,ecx=PSrcLineColorNext)
// Inline-assembly macro that produces ONE destination pixel, as set up by
// PicZoom_ftBilinearTable_SSE2:
//   1. eax = source column from the index table at [edx+ebx];
//   2. load the left/right pixel pair of the current row (esi) and of the next
//      row (ecx) and widen the channel bytes to words;
//   3. vertical blend: current*xmm5 + next*xmm6;
//   4. horizontal weights: pmulhw with the 16-byte table entry at [ebp+ebx*4]
//      (left-pixel weight in the low four words, right-pixel weight in the
//      high four words);
//   5. add the high half to the low half, pack to bytes, store 4 bytes at [edi+ebx].
// Modifies eax, xmm0 and xmm1.
// Comments must stay above the #define: a // comment placed on a continuation
// line would swallow the remainder of the macro.
//void __declspec(naked) ftBilinearTable_SSE2(){
#define ftBilinearTable_SSE2() \
asm mov eax,[edx+ebx] \
asm movq xmm0,qword ptr[esi+eax*4] \
asm movq xmm1,qword ptr[ecx+eax*4] \
asm punpcklbw xmm0,xmm7 \
asm punpcklbw xmm1,xmm7 \
asm pmullw xmm0,xmm5 \
asm pmullw xmm1,xmm6 \
asm paddw xmm0,xmm1 \
asm pmulhw xmm0,xmmword ptr [ebp+ebx*4] \
asm movdqa xmm1,xmm0 \
asm punpckhqdq xmm0,xmm0 \
asm paddw xmm0,xmm1 \
asm packuswb xmm0,xmm7 \
asm movd dword ptr [edi+ebx],xmm0
//ret //for __declspec(naked)
//}
//}
//ftBilinearTable_SSE2_expand2(out [edi+ebx]; xmm5=vr,xmm6=v,xmm7=0,[ebp+ebx*4]=(ur,u) for two columns,[edx+ebx]=src_x for two columns,esi=PSrcLineColor,ecx=PSrcLineColorNext)
// Inline-assembly macro that produces TWO adjacent destination pixels, as set
// up by PicZoom_ftBilinearTable_SSE2:
//   1. fetch the source columns of both pixels from [edx+ebx] and [edx+ebx+4];
//   2. for each, load the left/right pixel pair of the current row (esi) and of
//      the next row (ecx) and widen the channel bytes to words;
//   3. vertical blend: current*xmm5 + next*xmm6;
//   4. horizontal weights: pmulhw with the table entries at [ebp+ebx*4] and
//      [ebp+ebx*4+16];
//   5. regroup so the left-pixel terms of both outputs sit in one register and
//      the right-pixel terms in another, add them, pack to bytes and store
//      8 bytes at [edi+ebx].
// Modifies eax and xmm0-xmm3.
// The last line of the macro must NOT end with a backslash, otherwise whatever
// line follows the macro is spliced into its body. Comments must stay above
// the #define: a // comment on a continuation line would swallow the rest.
//void __declspec(naked) ftBilinearTable_SSE2_expand2(){
#define ftBilinearTable_SSE2_expand2() \
asm mov eax,[edx+ebx] \
asm movq xmm0,qword ptr[esi+eax*4] \
asm movq xmm1,qword ptr[ecx+eax*4] \
asm mov eax,[edx+ebx+4] \
asm movq xmm2,qword ptr[esi+eax*4] \
asm movq xmm3,qword ptr[ecx+eax*4] \
asm punpcklbw xmm0,xmm7 \
asm punpcklbw xmm1,xmm7 \
asm punpcklbw xmm2,xmm7 \
asm punpcklbw xmm3,xmm7 \
asm pmullw xmm0,xmm5 \
asm pmullw xmm1,xmm6 \
asm pmullw xmm2,xmm5 \
asm pmullw xmm3,xmm6 \
asm paddw xmm0,xmm1 \
asm paddw xmm2,xmm3 \
asm pmulhw xmm0,xmmword ptr [ebp+ebx*4] \
asm pmulhw xmm2,xmmword ptr [ebp+ebx*4+16] \
asm movdqa xmm1,xmm0 \
asm punpcklqdq xmm0,xmm2 \
asm punpckhqdq xmm1,xmm2 \
asm paddw xmm0,xmm1 \
asm packuswb xmm0,xmm7 \
asm movq qword ptr [edi+ebx],xmm0
//ret //for __declspec(naked)
//}
//}