雙目相機獲得深度圖小結
阿新 • • 發佈:2018-11-07
1、真實場景的雙目立體匹配(Stereo Matching)獲取深度圖詳解
2、OpenCV3.2.0 雙目標定+立體匹配(官方自帶例子的使用方法)
3、Simple stereo matching (BM/SGBM) example
4、簡單可以執行的demo
main.cpp
#include "opencv2/core/core.hpp" #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/highgui/highgui.hpp" #include "stereoSGBM.h" #include <iostream> using namespace cv; using namespace std; int main(int argc, char ** argv) { Mat L = imread("left.png", CV_LOAD_IMAGE_GRAYSCALE), R = imread("right.png", CV_LOAD_IMAGE_GRAYSCALE); Mat disp; StereoSGBMParams params; int cn = 1; cout << "beign---" << endl; params.preFilterCap = 63; params.SADWindowSize = 5; params.P1 = 8 * cn * params.SADWindowSize*params.SADWindowSize; params.P2 = 32 * cn * params.SADWindowSize*params.SADWindowSize; params.minDisparity = 0; params.numDisparities = 64; params.uniquenessRatio = 10; params.speckleWindowSize = 100; params.speckleRange = 128; params.disp12MaxDiff = 10; cout << "params----" << endl; int64 t = getTickCount(); cout << t << endl; compute(L, R, disp, params); t = getTickCount() - t; printf("StereoSGBM comput(): number of disparity: %d | time: %fms\n", params.numDisparities, t * 1000 / getTickFrequency()); Mat visibleMap; disp.convertTo(visibleMap, CV_8U, 255 / (params.numDisparities*16.0)); namedWindow("disp", CV_WINDOW_AUTOSIZE); imwrite("disp.png", visibleMap); //imshow("disp", visibleMap); waitKey(0); system("PAUSE"); return 0; }
stereoSGBM.cpp
#include "stereoSGBM.h" #include <stdio.h> static void calcPixelCostBT(const Mat& img1, const Mat& img2, int y, int minD, int maxD, CostType* cost, PixType* buffer, const PixType* tab, int tabOfs, int) { int x, c, width = img1.cols, cn = img1.channels(); int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width); int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2; const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y); PixType *prow1 = buffer + width2 * 2, *prow2 = prow1 + width*cn * 2; /* ** 查表 ** TAB_OFS = 1024 TAB_SIZE = 2304 clipTab[0] ~ clipTab[961] = 0 clipTab[962] = 1 clipTab[963] = 2 clipTab[964] = 3 ...... clipTab[1087] = 126 clipTab[1088] = 126 ...... clipTab[2303] = 126 */ tab += tabOfs; for (c = 0; c < cn * 2; c++) { prow1[width*c] = prow1[width*c + width - 1] = prow2[width*c] = prow2[width*c + width - 1] = tab[0]; } int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows - 1 ? (int)img1.step : 0; int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows - 1 ? (int)img2.step : 0; /* n1, n2 第一列 (y軸) = 0, 其他 = (-) width * cn n1,n2用來存取上一個row的資料 row1[x + n1) => (x, y-1) s1, s2 最後一列(y軸) = 0, 其他 = (+) width * cn s1,s2用來存取下一個row的資料 row1[x + s1) => (x, y+1) */ if (cn == 1) { //灰階影像 for (x = 1; x < width - 1; x++) { //左影像 位置 x 的 pixDiff (查表得到) prow1[x] = tab[(row1[x + 1] - row1[x - 1]) * 2 + row1[x + n1 + 1] - row1[x + n1 - 1] + row1[x + s1 + 1] - row1[x + s1 - 1]]; //右影像 位置 x (從右邊往回數, 因為是 width-1-x) 的 pixDiff prow2[width - 1 - x] = tab[(row2[x + 1] - row2[x - 1]) * 2 + row2[x + n2 + 1] - row2[x + n2 - 1] + row2[x + s2 + 1] - row2[x + s2 - 1]]; //prow為每個pixel開了兩格的空間 //以 x 來說, prow1[x] 放的是 x 的 cost, prow1[x+width] 放的是 x 的 畫素值 //存放 左影像 位置 x 的畫素到 prow1 prow1[x + width] = row1[x]; //存放 右影像 位置 x 的畫素到 prow2 prow2[width - 1 - x + width] = row2[x]; } } else { //彩色影像 for (x = 1; x < width - 1; x++) { prow1[x] = tab[(row1[x * 3 + 3] - row1[x * 3 - 3]) * 2 + row1[x * 3 + n1 + 3] - row1[x * 3 + n1 - 3] + row1[x * 3 + s1 + 3] - row1[x * 3 + s1 - 3]]; prow1[x + width] = tab[(row1[x * 3 + 4] - row1[x * 3 - 2]) * 2 + row1[x * 3 + n1 + 4] - row1[x * 3 + n1 - 2] + row1[x * 3 + s1 + 4] - row1[x * 3 + s1 - 2]]; prow1[x + width * 2] = tab[(row1[x * 3 + 5] - row1[x * 3 - 1]) * 2 + row1[x * 3 + n1 + 5] - row1[x * 3 + n1 - 1] + row1[x * 3 + s1 + 5] - row1[x * 3 + s1 - 1]]; prow2[width - 1 - x] = tab[(row2[x * 3 + 3] - row2[x * 3 - 3]) * 2 + row2[x * 3 + n2 + 3] - row2[x * 3 + n2 - 3] + row2[x * 3 + s2 + 3] - row2[x * 3 + s2 - 3]]; prow2[width - 1 - x + width] = tab[(row2[x * 3 + 4] - row2[x * 3 - 2]) * 2 + row2[x * 3 + n2 + 4] - row2[x * 3 + n2 - 2] + row2[x * 3 + s2 + 4] - row2[x * 3 + s2 - 2]]; prow2[width - 1 - x + width * 2] = tab[(row2[x * 3 + 5] - row2[x * 3 - 1]) * 2 + row2[x * 3 + n2 + 5] - row2[x * 3 + n2 - 1] + row2[x * 3 + s2 + 5] - row2[x * 3 + s2 - 1]]; prow1[x + width * 3] = row1[x * 3]; prow1[x + width * 4] = row1[x * 3 + 1]; prow1[x + width * 5] = row1[x * 3 + 2]; prow2[width - 1 - x + width * 3] = row2[x * 3]; prow2[width - 1 - x + width * 4] = row2[x * 3 + 1]; prow2[width - 1 - x + width * 5] = row2[x * 3 + 2]; } } memset(cost, 0, width1*D*sizeof(cost[0])); buffer -= minX2; cost -= minX1*D + minD; // simplify the cost indices inside the loop #if 1 for (c = 0; c < cn * 2; c++, prow1 += width, prow2 += width) { int diff_scale = c < cn ? 0 : 2; // precompute // v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and // v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and //做右影像 for (x = minX2; x < maxX2; x++) { int v = prow2[x]; int vl = x > 0 ? (v + prow2[x - 1]) / 2 : v; int vr = x < width - 1 ? (v + prow2[x + 1]) / 2 : v; int v0 = std::min(vl, vr); v0 = std::min(v0, v); int v1 = std::max(vl, vr); v1 = std::max(v1, v); buffer[x] = (PixType)v0; buffer[x + width2] = (PixType)v1; } //做左影像 for (x = minX1; x < maxX1; x++) { int u = prow1[x]; int ul = x > 0 ? (u + prow1[x - 1]) / 2 : u; int ur = x < width - 1 ? (u + prow1[x + 1]) / 2 : u; int u0 = std::min(ul, ur); u0 = std::min(u0, u); int u1 = std::max(ul, ur); u1 = std::max(u1, u); #if CV_SIMD128 v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0); v_uint8x16 _u1 = v_setall_u8((uchar)u1); for (int d = minD; d < maxD; d += 16) { v_uint8x16 _v = v_load(prow2 + width - x - 1 + d); v_uint8x16 _v0 = v_load(buffer + width - x - 1 + d); v_uint8x16 _v1 = v_load(buffer + width - x - 1 + d + width2); v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u); v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v); v_uint8x16 diff = v_min(c0, c1); v_int16x8 _c0 = v_load_aligned(cost + x*D + d); v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8); v_uint16x8 diff1, diff2; v_expand(diff, diff1, diff2); v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale)); v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale)); } #else for (int d = minD; d < maxD; d++) { int v = prow2[width - x - 1 + d]; int v0 = buffer[width - x - 1 + d]; int v1 = buffer[width - x - 1 + d + width2]; int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u); int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v); cost[x*D + d] = (CostType)(cost[x*D + d] + (std::min(c0, c1) >> diff_scale)); } #endif } } #else for (c = 0; c < cn * 2; c++, prow1 += width, prow2 += width) { for (x = minX1; x < maxX1; x++) { int u = prow1[x]; #if CV_SSE2 if (useSIMD) { __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128(); for (int d = minD; d < maxD; d += 16) { __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width - 1 - x + d)); __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u, _v), _mm_subs_epu8(_v, _u)); __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d)); __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8)); _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff, z))); _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff, z))); } } else #endif { for (int d = minD; d < maxD; d++) { int v = prow2[width - 1 - x + d]; cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v)); } } } } #endif } static void computeDisparitySGBM(const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params, Mat& buffer) { #if CV_SSE2 static const uchar LSBTab[] = { 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 }; volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif const int ALIGN = 16; //const int DISP_SHIFT = StereoMatcher::DISP_SHIFT; const int DISP_SCALE = (1 << DISP_SHIFT); const CostType MAX_COST = SHRT_MAX; int minD = params.minDisparity, maxD = minD + params.numDisparities; Size SADWindowSize; SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5; //int ftzero = std::max(params.preFilterCap, 15) | 1; int ftzero = 63; int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10; int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1; //int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? (int)params.P2 : 5, (int)(P1 + 1)); int P1 = params.P1 > 0 ? params.P1 : 2, P2 = params.P2; P2 = std::max(P2 > 0 ? P2 : 5, P1 + 1); int k, width = disp1.cols, height = disp1.rows; int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0); int D = maxD - minD, width1 = maxX1 - minX1; int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE; int SW2 = SADWindowSize.width / 2, SH2 = SADWindowSize.height / 2; //bool fullDP = params.mode == StereoSGBM::MODE_HH; bool fullDP = params.mode == MODE_HH; int npasses = fullDP ? 2 : 1; const int TAB_OFS = 256 * 4, TAB_SIZE = 256 + TAB_OFS * 2; PixType clipTab[TAB_SIZE]; /************************************************************ OpenCV 將 gradient 的值限制在 0 ~ ftzero*2 之間 因此如果 gradient score 的值為 0 => 則 tab[0] = ftzero 若 gradient 極大,值為255 => tab[255] 最多隻到 ftzero * 2 若 gradient 極小,值為-255 => tab[-255] 最小隻到 0 /************************************************************/ for (k = 0; k < TAB_SIZE; k++) clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero); if (minX1 >= maxX1) { disp1 = Scalar::all(INVALID_DISP_SCALED); return; } CV_Assert(D % 16 == 0); // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8. // if you change NR, please, modify the loop as well. int D2 = D + 16, NRD2 = NR2*D2; // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: // for 8-way dynamic programming we need the current row and // the previous row, i.e. 2 rows in total const int NLR = 2; const int LrBorder = NLR - 1; // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) // we keep pixel difference cost (C) and the summary cost over NR directions (S). // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) size_t costBufSize = width1*D; size_t CSBufSize = costBufSize*(fullDP ? height : 1); size_t minLrSize = (width1 + LrBorder * 2)*NR2, LrSize = minLrSize*D2; int hsumBufNRows = SH2 * 2 + 2; size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType)+ // minLr[] and Lr[] costBufSize*(hsumBufNRows + 1)*sizeof(CostType)+ // hsumBuf, pixdiff CSBufSize * 2 * sizeof(CostType)+ // C, S width * 16 * img1.channels()*sizeof(PixType)+ // temp buffer for computing per-pixel cost width*(sizeof(CostType)+sizeof(DispType)) + 1024; // disp2cost + disp2 if (buffer.empty() || !buffer.isContinuous() || buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize) buffer.create(1, (int)totalBufSize, CV_8U); // summary cost over different (nDirs) directions CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN); CostType* Sbuf = Cbuf + CSBufSize; CostType* hsumBuf = Sbuf + CSBufSize; CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows; CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR; DispType* disp2ptr = (DispType*)(disp2cost + width); PixType* tempBuf = (PixType*)(disp2ptr + width); // add P2 to every C(x,y). it saves a few operations in the inner loops for (k = 0; k < width1*D; k++) Cbuf[k] = (CostType)P2; /************* note *************/ /* 設 params.preFilterCap = 63 則 int ftzero = std::max(params.preFilterCap, 15) | 1 = 63 (params.preFilterCap = 62或63 ftzero的結果都是63) TAB_OFS = 1024 TAB_SIZE = 2304 clipTab[0] ~ clipTab[961] = 0 clipTab[962] = 1 clipTab[963] = 2 clipTab[964] = 3 ...... clipTab[1087] = 126 clipTab[1088] = 126 ...... clipTab[2303] = 126 (目前合理猜測stereoSGBM 5 way aggregation是指: 左 左上 上 右上 右, 因為每個 row 在掃完就決定了 bestD, 那能夠利用的就只有 "該row" 跟 "之前算過的資訊" 了) NR - the number of direction (NR從頭到尾沒被用過)(如果是 5 way NR = 5) NR2 似乎才是實際用的值 (預設NR = 16, NR2 = NR/2) dx, dy => x,y loop 時累進的方向(pass1時:+1(遞增) or pass2:-1(從後面算回來)) maxD = minD + params.numDisparities; (128 in my experiment, minD = 0) width = disp1.cols, height = disp1.rows; int minX1 = std::max(maxD, 0) (minX1 = 128) maxX1 = width + std::min(minD, 0); (maxX1 = 612) D = maxD - minD (D = 128) D2 = D + 16 (D2 = D + 16) width1 = maxX1 - minX1; (width1 = 484) DISP_SCALE = 16(應該是) int SW2 = SADWindowSize.width/2 SH2 = SADWindowSize.height/2; // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer: // for 8-way dynamic programming we need the current row and // the previous row, i.e. 2 rows in total // 意思就是 aggregation 需要 NLR = 2 個 row 去做 aggregation (這兩個row包含前一個row, 與現在的row, 共兩個row) const int NLR = 2; const int LrBorder = NLR - 1; // for each possible stereo match (img1(x,y) <=> img2(x-d,y)) // we keep pixel difference cost (C) and the summary cost over NR directions (S). // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k) */ for (int pass = 1; pass <= npasses; pass++) { int x1, y1, x2, y2, dx, dy; if (pass == 1) { y1 = 0; y2 = height; dy = 1; x1 = 0; x2 = width1; dx = 1; } else { y1 = height - 1; y2 = -1; dy = -1; x1 = width1 - 1; x2 = -1; dx = -1; } CostType *Lr[NLR] = { 0 }, *minLr[NLR] = { 0 }; for (k = 0; k < NLR; k++) { // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders, // and will occasionally use negative indices with the arrays // we need to shift Lr[k] pointers by 1, to give the space for d=-1. // however, then the alignment will be imperfect, i.e. bad for SSE, // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment) Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8; memset(Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType)); minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder; memset(minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType)); } for (int y = y1; y != y2; y += dy) { int x, d; DispType* disp1ptr = disp1.ptr<DispType>(y); CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize); CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize); if (pass == 1) // compute C on the first pass, and reuse it on the second pass, if any. { int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1; /* 只有 pass == 1時才會做 在y == 0時 dy1 = 0 dy2 = SH2 = 2 在y == rest dy1 = y + SH2 = y + 2 dy2 = dy1 = y + 2 演演算法會預先做下兩個row的結果 */ for (k = dy1; k <= dy2; k++) { /*************************************************** 在此迴圈中, 會計算 SAD 的 cost (Block Matching), 步驟如下 (此處假設 SADWindowSize = 5, SH2 = 2, SW2 = 2) 1. 計算第一個 row 中每個pixel 的 pixDiff (calcPixelCostBT) 2. 將水平的 pixDiff 累加到 hsum 中 (pixDiff * scale => 若上面會左右邊沒有pixel, 則用第一個來補, scale就是控制的權重, 因此對第一個來說scale都是 3 (SW2+1或SH2+1)) (hsum[x] = (x-2) ~ (x+2) 的 pixDiff總和 ) 3. 將 hsum 的值加到 C 中 (透過C來把Cost傳遞一下面的row) C[x] = hsumAdd[x] (現在的 C 在經過後面的 cost aggregation 後, 就會成為下次的 Cprev) 4. 計算第二個 row 中每個pixel 的 pixDiff 5. 將水平的 pixDiff 累加到 hsum 中 6. C[x] = Cprev[x] + hsumAdd - hsumSub (hsumAdd 是 y+2 那個 row 的水平累加結果) (hsumSub 是 y-3 那個 row 的水平累加結果) (加入新的, 減掉最後的, 加快運算速度, 重複利用, 不用每次都重算) 之後就是不斷重複這個過程 特別需要注意的是 此程式的流程每個 row 都會通過 semi-global cost aggregation 因此 Cprev 都是經過 cost aggregation的(這其實也是一條由上往下的cost aggregation路徑) /***************************************************/ //除了第一個row以外, 之後做的都是第y+2個row //costBufSize 是 一個 row 的cost buffer size大小 = width1 * D bytes //hsumBufNRows 是 水平kernel size + 1 = 5 + 1 // +1 的話 mod 的結果才會控制在 0 ~ 5 CostType* hsumAdd = hsumBuf + (std::min(k, height - 1) % hsumBufNRows)*costBufSize; if (k < height) { calcPixelCostBT(img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero); memset(hsumAdd, 0, D*sizeof(CostType)); /* ** 累加 kernel中 水平的pixDif ** 邊緣處理 x=0 hsumAdd = (x=0) + (x=1) + (x=2) 的 pixDiff (各 disparity 獨立) scale 在 x==0 時 為 SW2 + 1, 因為 kernel 左邊沒有畫素(少 SW2 個畫素), 所以就直接拿中心點的值來充數 中間 x=1 ~ width1 hsumAdd[x] = (x-2) + (x-1) + x + (x+1) + (x+2) 的 pixDiff */ for (x = 0; x <= SW2*D; x += D) { int scale = x == 0 ? SW2 + 1 : 1; for (d = 0; d < D; d++){ hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d] * scale); //printf("x:%d d:%d; hsumAdd[d] = %d\n", x, d, hsumAdd[d]); } } if (y > 0) { const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize; const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize; for (x = D; x < width1*D; x += D) { const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1 - 1)*D); const CostType* pixSub = pixDiff + std::max(x - (SW2 + 1)*D, 0); #if CV_SSE2 if (useSIMD) { for (d = 0; d < D; d += 8) { __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d)); __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d)); hv = _mm_adds_epi16(_mm_subs_epi16(hv, _mm_load_si128((const __m128i*)(pixSub + d))), _mm_load_si128((const __m128i*)(pixAdd + d))); Cx = _mm_adds_epi16(_mm_subs_epi16(Cx, _mm_load_si128((const __m128i*)(hsumSub + x + d))), hv); _mm_store_si128((__m128i*)(hsumAdd + x + d), hv); _mm_store_si128((__m128i*)(C + x + d), Cx); } } else #endif { for (d = 0; d < D; d++) { //這次的 horizontal kernel sum = 上次的hsum + 下一個pixDiff - 上次的最後一個pixDiff int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); //這次的Cost = C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]); } } } } else { for (x = D; x < width1*D; x += D) { const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1 - 1)*D); const CostType* pixSub = pixDiff + std::max(x - (SW2 + 1)*D, 0); for (d = 0; d < D; d++) hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]); } } } /* 第一列的 C[x] = hsumAdd[x] * scale (因為沒有上兩個row, 所以用第一個row的值補) */ if (y == 0) { int scale = k == 0 ? SH2 + 1 : 1; for (x = 0; x < width1*D; x++) C[x] = (CostType)(C[x] + hsumAdd[x] * scale); } } // also, clear the S buffer for (k = 0; k < width1*D; k++) S[k] = 0; } // clear the left and the right borders memset(Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType)); memset(Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType)); memset(minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType)); memset(minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType)); /* [formula 13 in the paper] compute L_r(p, d) = C(p, d) + min(L_r(p-r, d), L_r(p-r, d-1) + P1, L_r(p-r, d+1) + P1, min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k) where p = (x,y), r is one of the directions. we process all the directions at once: 0: r=(-dx, 0) 1: r=(-1, -dy) 2: r=(0, -dy) 3: r=(1, -dy) 4: r=(-2, -dy) 5: r=(-1, -dy*2) 6: r=(1, -dy*2) 7: r=(2, -dy) */ for (x = x1; x != x2; x += dx) { int xm = x*NR2, xd = xm*D2; int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2; int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2; CostType* Lr_p0 = Lr[0] + xd - dx*NRD2; CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2; CostType* Lr_p2 = Lr[1] + xd + D2 * 2; CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2 * 3; Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] = Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST; CostType* Lr_p = Lr[0] + xd; const CostType* Cp = C + x*D; CostType* Sp = S + x*D; #if CV_SSE2 if (useSIMD) { __m128i _P1 = _mm_set1_epi16((short)P1); __m128i _delta0 = _mm_set1_epi16((short)delta0); __m128i _delta1 = _mm_set1_epi16((short)delta1); __m128i _delta2 = _mm_set1_epi16((short)delta2); __m128i _delta3 = _mm_set1_epi16((short)delta3); __m128i _minL0 = _mm_set1_epi16((short)MAX_COST); for (d = 0; d < D; d += 8) { __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)); __m128i L0, L1, L2, L3; L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d)); L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d)); L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d)); L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d)); L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1)); L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1)); L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1)); L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1)); L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1)); L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1)); L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1)); L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1)); L0 = _mm_min_epi16(L0, _delta0); L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd); L1 = _mm_min_epi16(L1, _delta1); L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd); L2 = _mm_min_epi16(L2, _delta2); L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd); L3 = _mm_min_epi16(L3, _delta3); L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd); _mm_store_si128((__m128i*)(Lr_p + d), L0); _mm_store_si128((__m128i*)(Lr_p + d + D2), L1); _mm_store_si128((__m128i*)(Lr_p + d + D2 * 2), L2); _mm_store_si128((__m128i*)(Lr_p + d + D2 * 3), L3); __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2)); __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3)); t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1)); _minL0 = _mm_min_epi16(_minL0, t0); __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d)); L0 = _mm_adds_epi16(L0, L1); L2 = _mm_adds_epi16(L2, L3); Sval = _mm_adds_epi16(Sval, L0); Sval = _mm_adds_epi16(Sval, L2); _mm_store_si128((__m128i*)(Sp + d), Sval); } _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8)); _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0); } else #endif { int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST; for (d = 0; d < D; d++) { int Cpd = Cp[d], L0, L1, L2, L3; L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d - 1] + P1, std::min(Lr_p0[d + 1] + P1, delta0))) - delta0; L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d - 1] + P1, std::min(Lr_p1[d + 1] + P1, delta1))) - delta1; L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d - 1] + P1, std::min(Lr_p2[d + 1] + P1, delta2))) - delta2; L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d - 1] + P1, std::min(Lr_p3[d + 1] + P1, delta3))) - delta3; Lr_p[d] = (CostType)L0; minL0 = std::min(minL0, L0); Lr_p[d + D2] = (CostType)L1; minL1 = std::min(minL1, L1); Lr_p[d + D2 * 2] = (CostType)L2; minL2 = std::min(minL2, L2); Lr_p[d + D2 * 3] = (CostType)L3; minL3 = std::min(minL3, L3); Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3); } minLr[0][xm] = (CostType)minL0; minLr[0][xm + 1] = (CostType)minL1; minLr[0][xm + 2] = (CostType)minL2; minLr[0][xm + 3] = (CostType)minL3; } } if (pass == npasses) { for (x = 0; x < width; x++) { disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED; disp2cost[x] = MAX_COST; } for (x = width1 - 1; x >= 0; x--) { CostType* Sp = S + x*D; int minS = MAX_COST, bestDisp = -1; if (npasses == 1) { int xm = x*NR2, xd = xm*D2; int minL0 = MAX_COST; int delta0 = minLr[0][xm + NR2] + P2; CostType* Lr_p0 = Lr[0] + xd + NRD2; Lr_p0[-1] = Lr_p0[D] = MAX_COST; CostType* Lr_p = Lr[0] + xd; const CostType* Cp = C + x*D; #if CV_SSE2 if (useSIMD) { __m128i _P1 = _mm_set1_epi16((short)P1); __m128i _delta0 = _mm_set1_epi16((short)delta0); __m128i _minL0 = _mm_set1_epi16((short)minL0); __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1); __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8); for (d = 0; d < D; d += 8) { __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0; L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d)); L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1)); L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1)); L0 = _mm_min_epi16(L0, _delta0); L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd); _mm_store_si128((__m128i*)(Lr_p + d), L0); _minL0 = _mm_min_epi16(_minL0, L0); L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d)); _mm_store_si128((__m128i*)(Sp + d), L0); __m128i mask = _mm_cmpgt_epi16(_minS, L0); _minS = _mm_min_epi16(_minS, L0); _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp, _d8), mask)); _d8 = _mm_adds_epi16(_d8, _8); } short CV_DECL_ALIGNED(16) bestDispBuf[8]; _mm_store_si128((__m128i*)bestDispBuf, _bestDisp); _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8)); _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4)); _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2)); __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8)); qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4)); qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2)); minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0); minS = (CostType)_mm_cvtsi128_si32(qS); qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0); qS = _mm_cmpeq_epi16(_minS, qS); int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255; bestDisp = bestDispBuf[LSBTab[idx]]; } else #endif { for (d = 0; d < D; d++) { int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d - 1] + P1, std::min(Lr_p0[d + 1] + P1, delta0))) - delta0; Lr_p[d] = (CostType)L0; minL0 = std::min(minL0, L0); int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0); if (Sval < minS) { minS = Sval; bestDisp = d; } } minLr[0][xm] = (CostType)minL0; } } else { for (d = 0; d < D; d++) { int Sval = Sp[d]; if (Sval < minS) { minS = Sval; bestDisp = d; } } } for (d = 0; d < D; d++) { if (Sp[d] * (100 - uniquenessRatio) < minS * 100 && std::abs(bestDisp - d) > 1) break; } if (d < D) continue; d = bestDisp; int _x2 = x + minX1 - d - minD; if (disp2cost[_x2] > minS) { disp2cost[_x2] = (CostType)minS; disp2ptr[_x2] = (DispType)(d + minD); } if (0 < d && d < D - 1) { // do subpixel quadratic interpolation: // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1]) // then find minimum of the parabola. /******************************************************************* (Subpixel) 用 Cost 當作 y 軸座標 (aggregation後的) 用 d 當作 x 軸座標 用 (d-1) (d) (d+1) 三個點,使用 parabola fitting, 找出該 拋物線真正的 minimum 位置 作為最後 subpixel 的結果 可參考 Sub-pixel Estimation of Local Extrema (不過公式有些不同, 目前找不到OpenCV這個版本的演算法從哪來的) /*******************************************************************/ int denom2 = std::max(Sp[d - 1] + Sp[d + 1] - 2 * Sp[d], 1); d = d*DISP_SCALE + ((Sp[d - 1] - Sp[d + 1])*DISP_SCALE + denom2) / (denom2 * 2); } else d *= DISP_SCALE; disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE); } for (x = minX1; x < maxX1; x++) { // we round the computed disparity both towards -inf and +inf and check // if either of the corresponding disparities in disp2 is consistent. // This is to give the computed disparity a chance to look valid if it is. int d1 = disp1ptr[x]; if (d1 == INVALID_DISP_SCALED) continue; int _d = d1 >> DISP_SHIFT; int d_ = (d1 + DISP_SCALE - 1) >> DISP_SHIFT; int _x = x - _d, x_ = x - d_; if (0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff && 0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff) disp1ptr[x] = (DispType)INVALID_DISP_SCALED; } } // now shift the cyclic buffers std::swap(Lr[0], Lr[1]); std::swap(minLr[0], minLr[1]); } } } void compute(InputArray leftarr, InputArray rightarr, OutputArray disparr, StereoSGBMParams params) { Mat buffer; const int DISP_SCALE = (1 << DISP_SHIFT); Mat left = leftarr.getMat(), right = rightarr.getMat(); //CV_ASSERT(left.size() == right.size() && left.type() == right.type() && // left.depth() == CV_8U); disparr.create(left.size(), CV_16S); Mat disp = disparr.getMat(); computeDisparitySGBM(left, right, disp, params, buffer); medianBlur(disp, disp, 3); if (params.speckleWindowSize > 0) filterSpeckles(disp, (params.minDisparity - 1) * DISP_SCALE, params.speckleWindowSize, DISP_SCALE*params.speckleRange, buffer); }
stereoSGBM.h
#include <opencv2\opencv.hpp> #include <opencv2\calib3d\calib3d.hpp> using namespace cv; //copy stereoSGBM setting typedef uchar PixType; typedef short CostType; typedef short DispType; enum { NR = 16, NR2 = NR / 2 }; #define DISP_SHIFT 4 #define MODE_SGBM 2 #define MODE_HH 3 struct StereoSGBMParams { StereoSGBMParams() { minDisparity = numDisparities = 0; SADWindowSize = 0; P1 = P2 = 0; disp12MaxDiff = 0; preFilterCap = 0; uniquenessRatio = 0; speckleWindowSize = 0; speckleRange = 0; //mode = StereoSGBM::MODE_SGBM; mode = MODE_SGBM; } StereoSGBMParams(int _minDisparity, int _numDisparities, int _SADWindowSize, int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap, int _uniquenessRatio, int _speckleWindowSize, int _speckleRange, int _mode) { minDisparity = _minDisparity; numDisparities = _numDisparities; SADWindowSize = _SADWindowSize; P1 = _P1; P2 = _P2; disp12MaxDiff = _disp12MaxDiff; preFilterCap = _preFilterCap; uniquenessRatio = _uniquenessRatio; speckleWindowSize = _speckleWindowSize; speckleRange = _speckleRange; mode = _mode; } int minDisparity; int numDisparities; int SADWindowSize; int preFilterCap; int uniquenessRatio; int P1; int P2; int speckleWindowSize; int speckleRange; int disp12MaxDiff; int mode; }; void compute(InputArray leftarr, InputArray rightarr, OutputArray disparr, StereoSGBMParams params);