【影象縮放篇之一】近鄰取樣插值和其速度優化
阿新 • • 發佈:2019-02-19
unsigned long dst_width=Dst.width;
TARGB32* pDstLine=Dst.pdata;
unsigned long srcy_16=0;
unsigned long for4count=dst_width/4*4;
for (unsigned long y=0;y<Dst.height;++y)
{
TARGB32* pSrcLine=((TARGB32*)((TUInt8*)Src.pdata+Src.byte_width*(srcy_16>>16)));
unsigned long srcx_16=0;
unsigned long x;
for (x=0;x<for4count;x+=4)//迴圈4次展開
{
__m64 m0=_m_from_int(*(int*)(&pSrcLine[srcx_16>>16]));
srcx_16+=xrIntFloat_16;
m0=_m_punpckldq(m0, _m_from_int(*(int*)(&pSrcLine[srcx_16>>16])) );
srcx_16+=xrIntFloat_16;
__m64 m1=_m_from_int(*(int*)(&pSrcLine[srcx_16>>16]));
srcx_16+=xrIntFloat_16;
m1=_m_punpckldq(m1, _m_from_int(*(int*)(&pSrcLine[srcx_16>>16])) );
srcx_16+=xrIntFloat_16;
_mm_stream_pi((__m64 *)&pDstLine[x],m0); //不使用快取的寫入指令
_mm_stream_pi((__m64 *)&pDstLine[x+2],m1); //不使用快取的寫入指令
}
for (x=for4count;x<dst_width;++x)//處理邊界
{
pDstLine[x]=pSrcLine[srcx_16>>16];
srcx_16+=xrIntFloat_16;
}
srcy_16+=yrIntFloat_16;
((TUInt8*&)pDstLine)+=Dst.byte_width;
}
_m_empty();
}
TARGB32* pDstLine=Dst.pdata;
unsigned long srcy_16=0;
unsigned long for4count=dst_width/4*4;
for (unsigned long y=0;y<Dst.height;++y)
{
TARGB32* pSrcLine=((TARGB32*)((TUInt8*)Src.pdata+Src.byte_width*(srcy_16>>16)));
unsigned long srcx_16=0;
unsigned long x;
for (x=0;x<for4count;x+=4)//迴圈4次展開
{
__m64 m0=_m_from_int(*(int*)(&pSrcLine[srcx_16>>16]));
srcx_16+=xrIntFloat_16;
m0=_m_punpckldq(m0, _m_from_int(*(int*)(&pSrcLine[srcx_16>>16])) );
srcx_16+=xrIntFloat_16;
__m64 m1=_m_from_int(*(int*)(&pSrcLine[srcx_16>>16]));
srcx_16+=xrIntFloat_16;
m1=_m_punpckldq(m1, _m_from_int(*(int*)(&pSrcLine[srcx_16>>16])) );
srcx_16+=xrIntFloat_16;
_mm_stream_pi((__m64 *)&pDstLine[x],m0); //不使用快取的寫入指令
_mm_stream_pi((__m64 *)&pDstLine[x+2],m1); //不使用快取的寫入指令
}
for (x=for4count;x<dst_width;++x)//處理邊界
{
pDstLine[x]=pSrcLine[srcx_16>>16];
srcx_16+=xrIntFloat_16;
}
srcy_16+=yrIntFloat_16;
((TUInt8*&)pDstLine)+=Dst.byte_width;
}
_m_empty();
}