
Optimizing YUYV-to-RGB24 Conversion on XBurst with the MXU Instruction Set

The formulas for converting YUV to RGB are fixed, and YUV-to-RGB code is easy to find online. But once you drop that standard code into a real project, you discover that its performance is still not good enough.

Recently, while implementing YUYV-to-RGB24 conversion on Ingenic's X1000 CPU, I used the following integer-optimized formulas to avoid floating-point arithmetic:

	B = y + ((443 * (u - 128)) >> 8);
	G = y - ((179 * (v - 128) + 86 * (u - 128)) >> 8);
	R = y + ((351 * (v - 128)) >> 8);
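
These constants are just the usual floating-point conversion coefficients scaled by 256, which the final >> 8 undoes. As a quick check (assuming the commonly cited 1.732/0.699/0.336/1.371 coefficient set):

	443 / 256 ≈ 1.732   (U coefficient for B)
	179 / 256 ≈ 0.699   (V coefficient for G)
	 86 / 256 ≈ 0.336   (U coefficient for G)
	351 / 256 ≈ 1.371   (V coefficient for R)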

Based on the formulas above, I implemented YUYV-to-RGB24 in standard C. The real-world performance was still unsatisfactory, however, so I turned to XBurst's SIMD extension instruction set (MXU) to optimize the YUYV-to-RGB24 function and push the efficiency further.
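
For reference, here is a minimal sketch of such a scalar baseline, assuming packed YUYV input (Y0 U Y1 V, 2 bytes per pixel) and strides given in bytes (the function name and the clamp helper are mine, not the original code):

#include <stddef.h>
#include <stdint.h>

static inline uint8_t clamp_u8(int v)
{
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Plain-C YUYV -> RGB24: two pixels share one (u,v) pair.
 * Relies on >> being an arithmetic shift for negative ints
 * (true for gcc on MIPS). */
int yuyv_to_rgb888_c(const void *src_buf, void *dst_buf,
                     size_t width, size_t height,
                     size_t src_stride, size_t dst_stride)
{
    const uint8_t *src = (const uint8_t *)src_buf;
    uint8_t *dst = (uint8_t *)dst_buf;
    size_t line, col;

    for (line = 0; line < height; line++) {
        const uint8_t *s = src;
        uint8_t *d = dst;
        for (col = 0; col < width; col += 2) {
            int y0 = s[0], u = s[1] - 128, y1 = s[2], v = s[3] - 128;
            int rr = (351 * v) >> 8;
            int gg = (179 * v + 86 * u) >> 8;
            int bb = (443 * u) >> 8;
            d[0] = clamp_u8(y0 + rr);   /* R0 */
            d[1] = clamp_u8(y0 - gg);   /* G0 */
            d[2] = clamp_u8(y0 + bb);   /* B0 */
            d[3] = clamp_u8(y1 + rr);   /* R1 */
            d[4] = clamp_u8(y1 - gg);   /* G1 */
            d[5] = clamp_u8(y1 + bb);   /* B1 */
            s += 4;
            d += 6;
        }
        src += src_stride;
        dst += dst_stride;
    }
    return 0;
}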

For the XBurst MXU instruction set, see ftp://ftp.ingenic.com/SOC/X1000/X1000_M200_XBurst_ISA_MXU_PM.pdf

Below is the complete YUYV-to-RGB24 implementation. It is still based on the YUV-to-RGB formulas above, but translated from C into MXU code it grew to nearly ten times the length. Was it worth it? The improvement in execution efficiency is obvious: CPU usage dropped from 40% to 18%.

#include <stddef.h>
#include <stdint.h>

#include <jzmedia.h>

/* MIPS "pref" cache hint: 4 = load_streamed, 5 = store_streamed */
#define i_pref(hint, base, offset) \
		({ __asm__ __volatile__("pref %0,%2(%1)" :: "i"(hint), "r"(base), "i"(offset) : "memory"); })
int YUYV_to_RGB888(void *src_buf, void *dst_buf, size_t src_width,
                   size_t src_height, size_t srcStride, size_t dstStride)
{
    size_t line, col, dcol;
    const uint8_t *src = (const uint8_t *)src_buf;
    uint8_t *dst = (uint8_t *)dst_buf;
    uint32_t blue4, green4, red4;

    i_pref(4, src, 0);                       // load_streamed
    i_pref(5, dst, 0);                       // store_streamed
    for (line = 0; line < src_height; line++) {
        i_pref(4, src, 0);                   // load_streamed
        i_pref(5, dst, 0);                   // store_streamed
        // 4 pixels per iteration. col indexes source pixels (YUYV is
        // 2 bytes/pixel, so the source byte offset is col*2); dcol indexes
        // the RGB24 output in 2-byte units and advances 6 per group, so the
        // output byte offset is col*3 (12 bytes per 4 pixels).
        for (col = 0, dcol = 0; col < src_width; col += 4, dcol += 6) {
            S32LDDV(xr3, src, col, 1);       // xr3: v0,y01,u0,y00
            S32LDDV(xr4, src, col + 2, 1);   // xr4: v1,y11,u1,y10
            S32SFL(xr3, xr4, xr3, xr1, ptn1);
            // xr1: y11,y10,y01,y00
            // xr3: v1,u1,v0,u0
            S32LUI(xr13, 128, ptn7);         // xr13 = 128,128,128,128
            Q8ADDE_SS(xr3, xr3, xr13, xr2);
            // xr2[15:0 ] = xr3[07:00] - xr13[07:00]   u0-128
            // xr2[31:16] = xr3[15:08] - xr13[15:08]   v0-128
            // xr3[15:0 ] = xr3[23:16] - xr13[23:16]   u1-128
            // xr3[31:16] = xr3[31:24] - xr13[31:24]   v1-128
            S32SFL(xr3, xr3, xr2, xr2, ptn3);
            // xr2[15:0 ] = u0-128, xr2[31:16] = u1-128
            // xr3[15:0 ] = v0-128, xr3[31:16] = v1-128
            // From here on xr1 -> y, xr2 -> u, xr3 -> v, live until loop end.

            /**************************** BLUE ****************************/
            // B = y + 222*(u-128)/128   (222/128 is the same ratio as 443/256)
            S32LUI(xr14, 222, ptn4);         // xr14 = 0,222,0,222
            D16MUL_WW(xr6, xr2, xr14, xr5);
            // xr5[31:00] = xr2[15:00] * xr14[15:00]   (u0-128)*222
            // xr6[31:00] = xr2[31:16] * xr14[31:16]   (u1-128)*222
            D32SAR(xr6, xr6, xr5, xr5, 7);
            // xr5[31:00] >>= 7   (u0-128)*222/128
            // xr6[31:00] >>= 7   (u1-128)*222/128
            // compact to short
            S32SFL(xr14, xr5, xr5, xr5, ptn3);
            // xr5[15:00] = xr5[31:16] = (u0-128)*222/128   (xr14 unused)
            S32SFL(xr14, xr6, xr6, xr6, ptn3);
            // xr6[15:00] = xr6[31:16] = (u1-128)*222/128   (xr14 unused)
            // expand y11,y10,y01,y00 to short
            S32LUI(xr14, 0, ptn7);
            S32SFL(xr8, xr14, xr1, xr7, ptn0);
            // xr7[15:00] = xr1[07:00]   y00
            // xr7[31:16] = xr1[15:08]   y01
            // xr8[15:00] = xr1[23:16]   y10
            // xr8[31:16] = xr1[31:24]   y11
            Q16ACCM_AA(xr6, xr8, xr7, xr5);
            // xr5[15:0 ] += xr7[15:0 ]   y00 + (u0-128)*222/128
            // xr5[31:16] += xr7[31:16]   y01 + (u0-128)*222/128
            // xr6[15:0 ] += xr8[15:0 ]   y10 + (u1-128)*222/128
            // xr6[31:16] += xr8[31:16]   y11 + (u1-128)*222/128
            Q16SAT(xr9, xr6, xr5);
            // xr9[07:00] = sat8(xr5[15:0 ])   b0
            // xr9[15:08] = sat8(xr5[31:16])   b1
            // xr9[23:16] = sat8(xr6[15:0 ])   b2
            // xr9[31:24] = sat8(xr6[31:16])   b3
            S32STD(xr9, &blue4, 0);          // blue4 = {b3,b2,b1,b0}

            /**************************** GREEN ***************************/
            // G = y - (86*(u-128) + 179*(v-128))/256
            S32LUI(xr14, 86, ptn4);          // xr14 = 0,86,0,86
            D16MUL_WW(xr6, xr2, xr14, xr5);
            // xr5[31:00] = xr2[15:00] * xr14[15:00]   (u0-128)*86
            // xr6[31:00] = xr2[31:16] * xr14[31:16]   (u1-128)*86
            // compact to short
            S32SFL(xr6, xr6, xr5, xr5, ptn3);
            // xr5[15:00] = (u0-128)*86
            // xr5[31:16] = (u1-128)*86
            S32LUI(xr14, 179, ptn4);         // xr14 = 0,179,0,179
            D16MADL_AA_WW(xr5, xr3, xr14, xr5);
            // xr5[15:00] += xr3[15:00] * xr14[15:00]   (u0-128)*86 + (v0-128)*179
            // xr5[31:16] += xr3[31:16] * xr14[31:16]   (u1-128)*86 + (v1-128)*179
            // expand to xr5, xr6
            S32SFL(xr6, xr5, xr5, xr5, ptn3);
            // xr5[15:00] = xr5[31:16] = (u0-128)*86 + (v0-128)*179
            // xr6[15:00] = xr6[31:16] = (u1-128)*86 + (v1-128)*179
            Q16SAR(xr6, xr6, xr5, xr5, 8);
            // xr5 halves >>= 8   ((u0-128)*86 + (v0-128)*179)/256
            // xr6 halves >>= 8   ((u1-128)*86 + (v1-128)*179)/256
            // expand y11,y10,y01,y00 to short
            S32SFL(xr8, xr0, xr1, xr7, ptn0);
            // xr7[15:00] = y00, xr7[31:16] = y01
            // xr8[15:00] = y10, xr8[31:16] = y11
            Q16ACCM_SS(xr8, xr6, xr5, xr7);  // SS variant: the G formula subtracts
            // xr7[15:00] -= xr5[15:00]   y00 - ((u0-128)*86 + (v0-128)*179)/256
            // xr7[31:16] -= xr5[31:16]   y01 - ((u0-128)*86 + (v0-128)*179)/256
            // xr8[15:00] -= xr6[15:00]   y10 - ((u1-128)*86 + (v1-128)*179)/256
            // xr8[31:16] -= xr6[31:16]   y11 - ((u1-128)*86 + (v1-128)*179)/256
            Q16SAT(xr9, xr8, xr7);
            // xr9[07:00] = sat8(xr7[15:00])   g0
            // xr9[15:08] = sat8(xr7[31:16])   g1
            // xr9[23:16] = sat8(xr8[15:00])   g2
            // xr9[31:24] = sat8(xr8[31:16])   g3
            S32STD(xr9, &green4, 0);         // green4 = {g3,g2,g1,g0}

            /***************************** RED ****************************/
            // R = y + 175*(v-128)/128   (175/128 ~ 351/256 from the formula)
            S32LUI(xr14, 175, ptn4);         // xr14 = 0,175,0,175
            D16MUL_WW(xr6, xr3, xr14, xr5);
            // xr5[31:00] = xr3[15:00] * xr14[15:00]   (v0-128)*175
            // xr6[31:00] = xr3[31:16] * xr14[31:16]   (v1-128)*175
            D32SAR(xr6, xr6, xr5, xr5, 7);
            // xr5[31:00] >>= 7   (v0-128)*175/128
            // xr6[31:00] >>= 7   (v1-128)*175/128
            // compact to short
            S32SFL(xr6, xr6, xr5, xr5, ptn3);
            // xr5[15:00] = (v0-128)*175/128
            // xr5[31:16] = (v1-128)*175/128
            S32SFL(xr6, xr5, xr5, xr5, ptn3);
            // xr5[15:00] = xr5[31:16] = (v0-128)*175/128
            // xr6[15:00] = xr6[31:16] = (v1-128)*175/128
            // expand y11,y10,y01,y00 to short
            S32SFL(xr8, xr0, xr1, xr7, ptn0);
            // xr7[15:00] = y00, xr7[31:16] = y01
            // xr8[15:00] = y10, xr8[31:16] = y11
            Q16ACCM_AA(xr6, xr8, xr7, xr5);
            // xr5[15:0 ] += xr7[15:0 ]   y00 + (v0-128)*175/128
            // xr5[31:16] += xr7[31:16]   y01 + (v0-128)*175/128
            // xr6[15:0 ] += xr8[15:0 ]   y10 + (v1-128)*175/128
            // xr6[31:16] += xr8[31:16]   y11 + (v1-128)*175/128
            Q16SAT(xr9, xr6, xr5);
            // xr9[07:00] = sat8(xr5[15:0 ])   r0
            // xr9[15:08] = sat8(xr5[31:16])   r1
            // xr9[23:16] = sat8(xr6[15:0 ])   r2
            // xr9[31:24] = sat8(xr6[31:16])   r3
            S32STD(xr9, &red4, 0);           // red4 = {r3,r2,r1,r0}

            /********************** COMPACT TO RGB24 **********************/
            S32LDD(xr2, &green4, 0);         // xr2 {g3,g2,g1,g0}
            S32SFL(xr3, xr2, xr9, xr2, ptn0);
            // xr3 {g3,r3,g2,r2}
            // xr2 {g1,r1,g0,r0}
            S32LDD(xr4, &blue4, 0);          // xr4 {b3,b2,b1,b0}
            S32SFL(xr5, xr0, xr4, xr4, ptn0);
            // xr5 {0,b3,0,b2}
            // xr4 {0,b1,0,b0}
            S32SFL(xr5, xr5, xr3, xr3, ptn3);
            // xr5 {0,b3,g3,r3}
            // xr3 {0,b2,g2,r2}
            S32SFL(xr4, xr4, xr2, xr2, ptn3);
            // xr4 {0,b1,g1,r1}
            // xr2 {0,b0,g0,r0}
            D32SLL(xr3, xr3, xr2, xr2, 8);
            // xr3 {b2,g2,r2,0}, xr2 {b0,g0,r0,0}
            // final output words: xr8, xr7, xr6
            S32ALNI(xr8, xr5, xr3, ptn1);    // xr8 {b3,g3,r3,b2}
            D32SARW(xr7, xr3, xr4, 8);       // xr7 {b2,g2,r1,b1}
            S32ALNI(xr6, xr4, xr2, ptn3);    // xr6 {b1,g0,r0,b0}

            /**************************** STORE ***************************/
            S32STDV(xr6, dst, dcol, 1);      // output bytes 0..3  of this group
            S32STDV(xr7, dst, dcol + 2, 1);  // output bytes 4..7
            S32STDV(xr8, dst, dcol + 4, 1);  // output bytes 8..11
        }
        src += srcStride;
        dst += dstStride;
    }
    return 0;
}
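
Finally, a minimal usage sketch (the frame size and buffer names are hypothetical, for illustration). YUYV occupies 2 bytes per pixel and RGB24 occupies 3, so for tightly packed buffers the strides, in bytes, are width*2 and width*3:

#include <stdlib.h>
#include <stdint.h>

int YUYV_to_RGB888(void *src_buf, void *dst_buf, size_t src_width,
                   size_t src_height, size_t srcStride, size_t dstStride);

int main(void)
{
    const size_t w = 640, h = 480;      /* hypothetical frame size */
    uint8_t *yuyv = malloc(w * h * 2);  /* YUYV:  2 bytes/pixel */
    uint8_t *rgb  = malloc(w * h * 3);  /* RGB24: 3 bytes/pixel */
    if (!yuyv || !rgb)
        return 1;

    /* ... fill yuyv, e.g. with a camera frame ... */

    YUYV_to_RGB888(yuyv, rgb, w, h, w * 2, w * 3);

    free(yuyv);
    free(rgb);
    return 0;
}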