1. 程式人生 > >雙目相機獲得深度圖小結

雙目相機獲得深度圖小結

1、真實場景的雙目立體匹配(Stereo Matching)獲取深度圖詳解

2、OpenCV3.2.0 雙目標定+立體匹配(官方自帶例子的使用方法)

3、Simple stereo matching (BM/SGBM) example

4、簡單可以執行的demo

main.cpp

#include "opencv2/core/core.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"

#include "stereoSGBM.h"

#include <iostream>

using namespace cv;
using namespace std;
int main(int argc, char ** argv)
{
	Mat L = imread("left.png", CV_LOAD_IMAGE_GRAYSCALE), R = imread("right.png", CV_LOAD_IMAGE_GRAYSCALE);
	Mat disp;
	StereoSGBMParams params;
	int cn = 1;
	cout << "beign---" << endl;

	params.preFilterCap = 63;
	params.SADWindowSize = 5;
	params.P1 = 8 * cn * params.SADWindowSize*params.SADWindowSize;
	params.P2 = 32 * cn * params.SADWindowSize*params.SADWindowSize;
	params.minDisparity = 0;
	params.numDisparities = 64;
	params.uniquenessRatio = 10;
	params.speckleWindowSize = 100;
	params.speckleRange = 128;
	params.disp12MaxDiff = 10;

	cout << "params----" << endl;

	int64 t = getTickCount();
	cout << t << endl;

	compute(L, R, disp, params);
	t = getTickCount() - t;
	printf("StereoSGBM comput(): number of disparity: %d | time: %fms\n", params.numDisparities, t * 1000 / getTickFrequency());

	Mat visibleMap;
	disp.convertTo(visibleMap, CV_8U, 255 / (params.numDisparities*16.0));


	namedWindow("disp", CV_WINDOW_AUTOSIZE);
	imwrite("disp.png", visibleMap);
	//imshow("disp", visibleMap);
	waitKey(0);

	system("PAUSE");
	return 0;
}

stereoSGBM.cpp

#include "stereoSGBM.h"

#include <stdio.h>

static void calcPixelCostBT(const Mat& img1, const Mat& img2, int y,
	int minD, int maxD, CostType* cost,
	PixType* buffer, const PixType* tab,
	int tabOfs, int)
{
	int x, c, width = img1.cols, cn = img1.channels();
	int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
	int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
	int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
	const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
	PixType *prow1 = buffer + width2 * 2, *prow2 = prow1 + width*cn * 2;

	/*
	**	查表	  **
	TAB_OFS = 1024
	TAB_SIZE = 2304
	clipTab[0] ~ clipTab[961] = 0
	clipTab[962] = 1
	clipTab[963] = 2
	clipTab[964] = 3
	......
	clipTab[1087] = 126
	clipTab[1088] = 126
	......
	clipTab[2303] = 126
	*/

	tab += tabOfs;

	for (c = 0; c < cn * 2; c++)
	{
		prow1[width*c] = prow1[width*c + width - 1] =
			prow2[width*c] = prow2[width*c + width - 1] = tab[0];
	}

	int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows - 1 ? (int)img1.step : 0;
	int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows - 1 ? (int)img2.step : 0;

	/*
	n1, n2 第一列  (y軸) = 0, 其他 = (-) width * cn
		n1,n2用來存取上一個row的資料 row1[x + n1) => (x, y-1)
	s1, s2 最後一列(y軸) = 0, 其他 = (+) width * cn
		s1,s2用來存取下一個row的資料 row1[x + s1) => (x, y+1)
	*/
	if (cn == 1)
	{	//灰階影像
		for (x = 1; x < width - 1; x++)
		{
			//左影像 位置 x 的 pixDiff (查表得到)
			prow1[x] = tab[(row1[x + 1] - row1[x - 1]) * 2 + row1[x + n1 + 1] - row1[x + n1 - 1] + row1[x + s1 + 1] - row1[x + s1 - 1]];
			//右影像 位置 x (從右邊往回數, 因為是 width-1-x) 的 pixDiff
			prow2[width - 1 - x] = tab[(row2[x + 1] - row2[x - 1]) * 2 + row2[x + n2 + 1] - row2[x + n2 - 1] + row2[x + s2 + 1] - row2[x + s2 - 1]];

			//prow為每個pixel開了兩格的空間
			//以 x 來說, prow1[x] 放的是 x 的 cost, prow1[x+width] 放的是 x 的 畫素值
			
			//存放 左影像 位置 x 的畫素到 prow1
			prow1[x + width] = row1[x];
			//存放 右影像 位置 x 的畫素到 prow2
			prow2[width - 1 - x + width] = row2[x];
		}
	}
	else
	{	//彩色影像
		for (x = 1; x < width - 1; x++)
		{
			prow1[x] = tab[(row1[x * 3 + 3] - row1[x * 3 - 3]) * 2 + row1[x * 3 + n1 + 3] - row1[x * 3 + n1 - 3] + row1[x * 3 + s1 + 3] - row1[x * 3 + s1 - 3]];
			prow1[x + width] = tab[(row1[x * 3 + 4] - row1[x * 3 - 2]) * 2 + row1[x * 3 + n1 + 4] - row1[x * 3 + n1 - 2] + row1[x * 3 + s1 + 4] - row1[x * 3 + s1 - 2]];
			prow1[x + width * 2] = tab[(row1[x * 3 + 5] - row1[x * 3 - 1]) * 2 + row1[x * 3 + n1 + 5] - row1[x * 3 + n1 - 1] + row1[x * 3 + s1 + 5] - row1[x * 3 + s1 - 1]];

			prow2[width - 1 - x] = tab[(row2[x * 3 + 3] - row2[x * 3 - 3]) * 2 + row2[x * 3 + n2 + 3] - row2[x * 3 + n2 - 3] + row2[x * 3 + s2 + 3] - row2[x * 3 + s2 - 3]];
			prow2[width - 1 - x + width] = tab[(row2[x * 3 + 4] - row2[x * 3 - 2]) * 2 + row2[x * 3 + n2 + 4] - row2[x * 3 + n2 - 2] + row2[x * 3 + s2 + 4] - row2[x * 3 + s2 - 2]];
			prow2[width - 1 - x + width * 2] = tab[(row2[x * 3 + 5] - row2[x * 3 - 1]) * 2 + row2[x * 3 + n2 + 5] - row2[x * 3 + n2 - 1] + row2[x * 3 + s2 + 5] - row2[x * 3 + s2 - 1]];

			prow1[x + width * 3] = row1[x * 3];
			prow1[x + width * 4] = row1[x * 3 + 1];
			prow1[x + width * 5] = row1[x * 3 + 2];

			prow2[width - 1 - x + width * 3] = row2[x * 3];
			prow2[width - 1 - x + width * 4] = row2[x * 3 + 1];
			prow2[width - 1 - x + width * 5] = row2[x * 3 + 2];
		}
	}

	memset(cost, 0, width1*D*sizeof(cost[0]));

	buffer -= minX2;
	cost -= minX1*D + minD; // simplify the cost indices inside the loop

#if 1
	for (c = 0; c < cn * 2; c++, prow1 += width, prow2 += width)
	{
		int diff_scale = c < cn ? 0 : 2;

		// precompute
		//   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
		//   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and

		//做右影像
		for (x = minX2; x < maxX2; x++)
		{
			int v = prow2[x];
			int vl = x > 0 ? (v + prow2[x - 1]) / 2 : v;
			int vr = x < width - 1 ? (v + prow2[x + 1]) / 2 : v;
			int v0 = std::min(vl, vr); v0 = std::min(v0, v);
			int v1 = std::max(vl, vr); v1 = std::max(v1, v);
			buffer[x] = (PixType)v0;
			buffer[x + width2] = (PixType)v1;
		}

		//做左影像
		for (x = minX1; x < maxX1; x++)
		{
			int u = prow1[x];
			int ul = x > 0 ? (u + prow1[x - 1]) / 2 : u;
			int ur = x < width - 1 ? (u + prow1[x + 1]) / 2 : u;
			int u0 = std::min(ul, ur); u0 = std::min(u0, u);
			int u1 = std::max(ul, ur); u1 = std::max(u1, u);

#if CV_SIMD128
			v_uint8x16 _u = v_setall_u8((uchar)u), _u0 = v_setall_u8((uchar)u0);
			v_uint8x16 _u1 = v_setall_u8((uchar)u1);

			for (int d = minD; d < maxD; d += 16)
			{
				v_uint8x16 _v = v_load(prow2 + width - x - 1 + d);
				v_uint8x16 _v0 = v_load(buffer + width - x - 1 + d);
				v_uint8x16 _v1 = v_load(buffer + width - x - 1 + d + width2);
				v_uint8x16 c0 = v_max(_u - _v1, _v0 - _u);
				v_uint8x16 c1 = v_max(_v - _u1, _u0 - _v);
				v_uint8x16 diff = v_min(c0, c1);

				v_int16x8 _c0 = v_load_aligned(cost + x*D + d);
				v_int16x8 _c1 = v_load_aligned(cost + x*D + d + 8);

				v_uint16x8 diff1, diff2;
				v_expand(diff, diff1, diff2);
				v_store_aligned(cost + x*D + d, _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
				v_store_aligned(cost + x*D + d + 8, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
			}
#else
			for (int d = minD; d < maxD; d++)
			{
				int v = prow2[width - x - 1 + d];
				int v0 = buffer[width - x - 1 + d];
				int v1 = buffer[width - x - 1 + d + width2];
				int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
				int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);

				cost[x*D + d] = (CostType)(cost[x*D + d] + (std::min(c0, c1) >> diff_scale));
			}
#endif
		}
	}
#else
	for (c = 0; c < cn * 2; c++, prow1 += width, prow2 += width)
	{
		for (x = minX1; x < maxX1; x++)
		{
			int u = prow1[x];
#if CV_SSE2
			if (useSIMD)
			{
				__m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();

				for (int d = minD; d < maxD; d += 16)
				{
					__m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width - 1 - x + d));
					__m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u, _v), _mm_subs_epu8(_v, _u));
					__m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
					__m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));

					_mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff, z)));
					_mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff, z)));
				}
			}
			else
#endif
			{
				for (int d = minD; d < maxD; d++)
				{
					int v = prow2[width - 1 - x + d];
					cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
				}
			}
		}
	}
#endif
}

static void computeDisparitySGBM(const Mat& img1, const Mat& img2,
	Mat& disp1, const StereoSGBMParams& params,
	Mat& buffer)
{
#if CV_SSE2
	static const uchar LSBTab[] =
	{
		0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
		5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
		6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
		5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
		7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
		5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
		6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
		5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
	};

	volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif

	const int ALIGN = 16;
	//const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
	const int DISP_SCALE = (1 << DISP_SHIFT);
	const CostType MAX_COST = SHRT_MAX;

	int minD = params.minDisparity, maxD = minD + params.numDisparities;
	Size SADWindowSize;
	SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
	//int ftzero = std::max(params.preFilterCap, 15) | 1;
	int ftzero = 63;
	int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
	int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
	//int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? (int)params.P2 : 5, (int)(P1 + 1));
	int P1 = params.P1 > 0 ? params.P1 : 2, P2 = params.P2;
	P2 = std::max(P2 > 0 ? P2 : 5, P1 + 1);
	int k, width = disp1.cols, height = disp1.rows;
	int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
	int D = maxD - minD, width1 = maxX1 - minX1;
	int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
	int SW2 = SADWindowSize.width / 2, SH2 = SADWindowSize.height / 2;
	//bool fullDP = params.mode == StereoSGBM::MODE_HH;
	bool fullDP = params.mode == MODE_HH;
	int npasses = fullDP ? 2 : 1;
	const int TAB_OFS = 256 * 4, TAB_SIZE = 256 + TAB_OFS * 2;
	PixType clipTab[TAB_SIZE];

	/************************************************************
		OpenCV 將 gradient 的值限制在 0 ~ ftzero*2 之間
		因此如果 gradient score 的值為 0 => 則 tab[0] = ftzero
		若 gradient 極大,值為255 => tab[255] 最多隻到 ftzero * 2
		若 gradient 極小,值為-255 => tab[-255] 最小隻到 0
	/************************************************************/
	for (k = 0; k < TAB_SIZE; k++)
		clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);

	if (minX1 >= maxX1)
	{
		disp1 = Scalar::all(INVALID_DISP_SCALED);
		return;
	}

	CV_Assert(D % 16 == 0);

	// NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
	// if you change NR, please, modify the loop as well.
	int D2 = D + 16, NRD2 = NR2*D2;

	// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
	// for 8-way dynamic programming we need the current row and
	// the previous row, i.e. 2 rows in total
	const int NLR = 2;
	const int LrBorder = NLR - 1;

	// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
	// we keep pixel difference cost (C) and the summary cost over NR directions (S).
	// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
	size_t costBufSize = width1*D;
	size_t CSBufSize = costBufSize*(fullDP ? height : 1);
	size_t minLrSize = (width1 + LrBorder * 2)*NR2, LrSize = minLrSize*D2;
	int hsumBufNRows = SH2 * 2 + 2;
	size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType)+ // minLr[] and Lr[]
		costBufSize*(hsumBufNRows + 1)*sizeof(CostType)+ // hsumBuf, pixdiff
		CSBufSize * 2 * sizeof(CostType)+ // C, S
		width * 16 * img1.channels()*sizeof(PixType)+ // temp buffer for computing per-pixel cost
		width*(sizeof(CostType)+sizeof(DispType)) + 1024; // disp2cost + disp2

	if (buffer.empty() || !buffer.isContinuous() ||
		buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize)
		buffer.create(1, (int)totalBufSize, CV_8U);

	// summary cost over different (nDirs) directions
	CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);
	CostType* Sbuf = Cbuf + CSBufSize;
	CostType* hsumBuf = Sbuf + CSBufSize;
	CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;

	CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
	DispType* disp2ptr = (DispType*)(disp2cost + width);
	PixType* tempBuf = (PixType*)(disp2ptr + width);

	// add P2 to every C(x,y). it saves a few operations in the inner loops
	for (k = 0; k < width1*D; k++)
		Cbuf[k] = (CostType)P2;

	/************* note *************/
	/*
	設 params.preFilterCap = 63 則
	int ftzero = std::max(params.preFilterCap, 15) | 1
	= 63 (params.preFilterCap = 62或63 ftzero的結果都是63)
	TAB_OFS = 1024
	TAB_SIZE = 2304
	clipTab[0] ~ clipTab[961] = 0
	clipTab[962] = 1
	clipTab[963] = 2
	clipTab[964] = 3
	......
	clipTab[1087] = 126
	clipTab[1088] = 126
	......
	clipTab[2303] = 126

	(目前合理猜測stereoSGBM 5 way aggregation是指: 左 左上 上 右上 右,
	因為每個 row 在掃完就決定了 bestD, 那能夠利用的就只有 "該row" 跟 "之前算過的資訊" 了)
	NR - the number of direction (NR從頭到尾沒被用過)(如果是 5 way NR = 5)
	NR2 似乎才是實際用的值 (預設NR = 16, NR2 = NR/2)

	dx, dy => x,y loop 時累進的方向(pass1時:+1(遞增) or pass2:-1(從後面算回來))

	maxD = minD + params.numDisparities;     (128 in my experiment, minD = 0)

	width = disp1.cols, height = disp1.rows;
	int minX1 = std::max(maxD, 0)            (minX1 = 128)
	maxX1 = width + std::min(minD, 0);   (maxX1 = 612)

	D  = maxD - minD						 (D = 128)
	D2 = D + 16								 (D2 = D + 16)
	width1 = maxX1 - minX1;              (width1 = 484)

	DISP_SCALE = 16(應該是)

	int SW2 = SADWindowSize.width/2
	SH2 = SADWindowSize.height/2;

	// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
	// for 8-way dynamic programming we need the current row and
	// the previous row, i.e. 2 rows in total
	// 意思就是 aggregation 需要 NLR = 2 個 row 去做 aggregation (這兩個row包含前一個row, 與現在的row, 共兩個row)
	const int NLR = 2;
	const int LrBorder = NLR - 1;

	// for each possible stereo match (img1(x,y) <=> img2(x-d,y))
	// we keep pixel difference cost (C) and the summary cost over NR directions (S).
	// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
	*/

	for (int pass = 1; pass <= npasses; pass++)
	{
		int x1, y1, x2, y2, dx, dy;

		if (pass == 1)
		{
			y1 = 0; y2 = height; dy = 1;
			x1 = 0; x2 = width1; dx = 1;
		}
		else
		{
			y1 = height - 1; y2 = -1; dy = -1;
			x1 = width1 - 1; x2 = -1; dx = -1;
		}

		CostType *Lr[NLR] = { 0 }, *minLr[NLR] = { 0 };

		for (k = 0; k < NLR; k++)
		{
			// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
			// and will occasionally use negative indices with the arrays
			// we need to shift Lr[k] pointers by 1, to give the space for d=-1.
			// however, then the alignment will be imperfect, i.e. bad for SSE,
			// thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
			Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
			memset(Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType));
			minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;
			memset(minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType));
		}

		for (int y = y1; y != y2; y += dy)
		{
			int x, d;
			DispType* disp1ptr = disp1.ptr<DispType>(y);
			CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);
			CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);

			if (pass == 1) // compute C on the first pass, and reuse it on the second pass, if any.
			{
				int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
				/*
				只有 pass == 1時才會做
				在y == 0時
				dy1 = 0
				dy2 = SH2 = 2
				在y == rest
				dy1 = y + SH2 = y + 2
				dy2 = dy1     = y + 2
				
				演演算法會預先做下兩個row的結果
				*/

				for (k = dy1; k <= dy2; k++)
				{
					/***************************************************
						在此迴圈中, 會計算 SAD 的 cost (Block Matching), 步驟如下
						(此處假設 SADWindowSize = 5, SH2 = 2, SW2 = 2)
						1. 計算第一個 row 中每個pixel 的 pixDiff (calcPixelCostBT)
						2. 將水平的 pixDiff 累加到 hsum 中
						(pixDiff * scale => 若上面會左右邊沒有pixel, 則用第一個來補, scale就是控制的權重, 因此對第一個來說scale都是 3 (SW2+1或SH2+1))
						(hsum[x] = (x-2) ~ (x+2) 的 pixDiff總和 )
						3. 將 hsum 的值加到 C 中 (透過C來把Cost傳遞一下面的row)
						C[x] = hsumAdd[x]
						(現在的 C 在經過後面的 cost aggregation 後, 就會成為下次的 Cprev)

						4. 計算第二個 row 中每個pixel 的 pixDiff
						5. 將水平的 pixDiff 累加到 hsum 中
						6. C[x] = Cprev[x] + hsumAdd - hsumSub
						(hsumAdd 是 y+2 那個 row 的水平累加結果)
						(hsumSub 是 y-3 那個 row 的水平累加結果)
						(加入新的, 減掉最後的, 加快運算速度, 重複利用, 不用每次都重算)
						之後就是不斷重複這個過程

						特別需要注意的是 此程式的流程每個 row 都會通過 semi-global cost aggregation
						因此 Cprev 都是經過 cost aggregation的(這其實也是一條由上往下的cost aggregation路徑)
					/***************************************************/
					//除了第一個row以外, 之後做的都是第y+2個row
					//costBufSize  是 一個 row 的cost buffer size大小 = width1 * D bytes
					//hsumBufNRows 是 水平kernel size + 1 = 5 + 1 
					//			   +1 的話 mod 的結果才會控制在 0 ~ 5
					CostType* hsumAdd = hsumBuf + (std::min(k, height - 1) % hsumBufNRows)*costBufSize;

					if (k < height)
					{
						calcPixelCostBT(img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero);

						memset(hsumAdd, 0, D*sizeof(CostType));
						/*
								** 累加 kernel中 水平的pixDif **
							邊緣處理 x=0
								hsumAdd = (x=0) + (x=1) + (x=2) 的 pixDiff (各 disparity 獨立) 
								scale 在 x==0 時 為 SW2 + 1, 因為 kernel 左邊沒有畫素(少 SW2 個畫素), 所以就直接拿中心點的值來充數
							中間     x=1 ~ width1
								hsumAdd[x] = (x-2) + (x-1) + x + (x+1) + (x+2) 的 pixDiff 
								
						*/
						for (x = 0; x <= SW2*D; x += D)
						{
							int scale = x == 0 ? SW2 + 1 : 1;
							for (d = 0; d < D; d++){
								hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d] * scale);
								//printf("x:%d d:%d; hsumAdd[d] = %d\n", x, d, hsumAdd[d]);
							}
						}

						if (y > 0)
						{
							const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
							const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;

							for (x = D; x < width1*D; x += D)
							{
								const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1 - 1)*D);
								const CostType* pixSub = pixDiff + std::max(x - (SW2 + 1)*D, 0);

#if CV_SSE2
								if (useSIMD)
								{
									for (d = 0; d < D; d += 8)
									{
										__m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
										__m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
										hv = _mm_adds_epi16(_mm_subs_epi16(hv,
											_mm_load_si128((const __m128i*)(pixSub + d))),
											_mm_load_si128((const __m128i*)(pixAdd + d)));
										Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
											_mm_load_si128((const __m128i*)(hsumSub + x + d))),
											hv);
										_mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
										_mm_store_si128((__m128i*)(C + x + d), Cx);
									}
								}
								else
#endif
								{
									for (d = 0; d < D; d++)
									{
										//這次的 horizontal kernel sum = 上次的hsum + 下一個pixDiff - 上次的最後一個pixDiff
										int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
										//這次的Cost = 
										C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
									}
								}
							}
						}
						else
						{
							for (x = D; x < width1*D; x += D)
							{
								const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1 - 1)*D);
								const CostType* pixSub = pixDiff + std::max(x - (SW2 + 1)*D, 0);

								for (d = 0; d < D; d++)
									hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
							}
						}
					}

					/*
						第一列的 C[x] = hsumAdd[x] * scale (因為沒有上兩個row, 所以用第一個row的值補)
					*/
					if (y == 0)
					{
						int scale = k == 0 ? SH2 + 1 : 1;
						for (x = 0; x < width1*D; x++)
							C[x] = (CostType)(C[x] + hsumAdd[x] * scale);
					}
				}

				// also, clear the S buffer
				for (k = 0; k < width1*D; k++)
					S[k] = 0;
			}

			// clear the left and the right borders
			memset(Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType));
			memset(Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType));
			memset(minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType));
			memset(minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType));

			/*
			[formula 13 in the paper]
			compute L_r(p, d) = C(p, d) +
			min(L_r(p-r, d),
			L_r(p-r, d-1) + P1,
			L_r(p-r, d+1) + P1,
			min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
			where p = (x,y), r is one of the directions.
			we process all the directions at once:
			0: r=(-dx, 0)
			1: r=(-1, -dy)
			2: r=(0, -dy)
			3: r=(1, -dy)
			4: r=(-2, -dy)
			5: r=(-1, -dy*2)
			6: r=(1, -dy*2)
			7: r=(2, -dy)
			*/
			for (x = x1; x != x2; x += dx)
			{
				int xm = x*NR2, xd = xm*D2;

				int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
				int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;

				CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
				CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
				CostType* Lr_p2 = Lr[1] + xd + D2 * 2;
				CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2 * 3;

				Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
					Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;

				CostType* Lr_p = Lr[0] + xd;
				const CostType* Cp = C + x*D;
				CostType* Sp = S + x*D;

#if CV_SSE2
				if (useSIMD)
				{
					__m128i _P1 = _mm_set1_epi16((short)P1);

					__m128i _delta0 = _mm_set1_epi16((short)delta0);
					__m128i _delta1 = _mm_set1_epi16((short)delta1);
					__m128i _delta2 = _mm_set1_epi16((short)delta2);
					__m128i _delta3 = _mm_set1_epi16((short)delta3);
					__m128i _minL0 = _mm_set1_epi16((short)MAX_COST);

					for (d = 0; d < D; d += 8)
					{
						__m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
						__m128i L0, L1, L2, L3;

						L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
						L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
						L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
						L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));

						L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
						L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));

						L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
						L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));

						L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
						L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));

						L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
						L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));

						L0 = _mm_min_epi16(L0, _delta0);
						L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);

						L1 = _mm_min_epi16(L1, _delta1);
						L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);

						L2 = _mm_min_epi16(L2, _delta2);
						L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);

						L3 = _mm_min_epi16(L3, _delta3);
						L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);

						_mm_store_si128((__m128i*)(Lr_p + d), L0);
						_mm_store_si128((__m128i*)(Lr_p + d + D2), L1);
						_mm_store_si128((__m128i*)(Lr_p + d + D2 * 2), L2);
						_mm_store_si128((__m128i*)(Lr_p + d + D2 * 3), L3);

						__m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
						__m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
						t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
						_minL0 = _mm_min_epi16(_minL0, t0);

						__m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));

						L0 = _mm_adds_epi16(L0, L1);
						L2 = _mm_adds_epi16(L2, L3);
						Sval = _mm_adds_epi16(Sval, L0);
						Sval = _mm_adds_epi16(Sval, L2);

						_mm_store_si128((__m128i*)(Sp + d), Sval);
					}

					_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
					_mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
				}
				else
#endif
				{
					int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;

					for (d = 0; d < D; d++)
					{
						int Cpd = Cp[d], L0, L1, L2, L3;

						L0 = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d - 1] + P1, std::min(Lr_p0[d + 1] + P1, delta0))) - delta0;
						L1 = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d - 1] + P1, std::min(Lr_p1[d + 1] + P1, delta1))) - delta1;
						L2 = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d - 1] + P1, std::min(Lr_p2[d + 1] + P1, delta2))) - delta2;
						L3 = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d - 1] + P1, std::min(Lr_p3[d + 1] + P1, delta3))) - delta3;

						Lr_p[d] = (CostType)L0;
						minL0 = std::min(minL0, L0);

						Lr_p[d + D2] = (CostType)L1;
						minL1 = std::min(minL1, L1);

						Lr_p[d + D2 * 2] = (CostType)L2;
						minL2 = std::min(minL2, L2);

						Lr_p[d + D2 * 3] = (CostType)L3;
						minL3 = std::min(minL3, L3);

						Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
					}
					minLr[0][xm] = (CostType)minL0;
					minLr[0][xm + 1] = (CostType)minL1;
					minLr[0][xm + 2] = (CostType)minL2;
					minLr[0][xm + 3] = (CostType)minL3;
				}
			}

			if (pass == npasses)
			{
				for (x = 0; x < width; x++)
				{
					disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
					disp2cost[x] = MAX_COST;
				}

				for (x = width1 - 1; x >= 0; x--)
				{
					CostType* Sp = S + x*D;
					int minS = MAX_COST, bestDisp = -1;

					if (npasses == 1)
					{
						int xm = x*NR2, xd = xm*D2;

						int minL0 = MAX_COST;
						int delta0 = minLr[0][xm + NR2] + P2;
						CostType* Lr_p0 = Lr[0] + xd + NRD2;
						Lr_p0[-1] = Lr_p0[D] = MAX_COST;
						CostType* Lr_p = Lr[0] + xd;

						const CostType* Cp = C + x*D;

#if CV_SSE2
						if (useSIMD)
						{
							__m128i _P1 = _mm_set1_epi16((short)P1);
							__m128i _delta0 = _mm_set1_epi16((short)delta0);

							__m128i _minL0 = _mm_set1_epi16((short)minL0);
							__m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
							__m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);

							for (d = 0; d < D; d += 8)
							{
								__m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;

								L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
								L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
								L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
								L0 = _mm_min_epi16(L0, _delta0);
								L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);

								_mm_store_si128((__m128i*)(Lr_p + d), L0);
								_minL0 = _mm_min_epi16(_minL0, L0);
								L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
								_mm_store_si128((__m128i*)(Sp + d), L0);

								__m128i mask = _mm_cmpgt_epi16(_minS, L0);
								_minS = _mm_min_epi16(_minS, L0);
								_bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp, _d8), mask));
								_d8 = _mm_adds_epi16(_d8, _8);
							}

							short CV_DECL_ALIGNED(16) bestDispBuf[8];
							_mm_store_si128((__m128i*)bestDispBuf, _bestDisp);

							_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
							_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
							_minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));

							__m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
							qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
							qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));

							minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
							minS = (CostType)_mm_cvtsi128_si32(qS);

							qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
							qS = _mm_cmpeq_epi16(_minS, qS);
							int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;

							bestDisp = bestDispBuf[LSBTab[idx]];
						}
						else
#endif
						{
							for (d = 0; d < D; d++)
							{
								int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d - 1] + P1, std::min(Lr_p0[d + 1] + P1, delta0))) - delta0;

								Lr_p[d] = (CostType)L0;
								minL0 = std::min(minL0, L0);

								int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
								if (Sval < minS)
								{
									minS = Sval;
									bestDisp = d;
								}
							}
							minLr[0][xm] = (CostType)minL0;
						}
					}
					else
					{
						for (d = 0; d < D; d++)
						{
							int Sval = Sp[d];
							if (Sval < minS)
							{
								minS = Sval;
								bestDisp = d;
							}
						}
					}

					for (d = 0; d < D; d++)
					{
						if (Sp[d] * (100 - uniquenessRatio) < minS * 100 && std::abs(bestDisp - d) > 1)
							break;
					}
					if (d < D)
						continue;
					d = bestDisp;
					int _x2 = x + minX1 - d - minD;
					if (disp2cost[_x2] > minS)
					{
						disp2cost[_x2] = (CostType)minS;
						disp2ptr[_x2] = (DispType)(d + minD);
					}

					if (0 < d && d < D - 1)
					{
						// do subpixel quadratic interpolation:
						//   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
						//   then find minimum of the parabola.
						/*******************************************************************
							(Subpixel)
							用 Cost 當作 y 軸座標 (aggregation後的)
							用 d    當作 x 軸座標
							用 (d-1) (d) (d+1) 三個點,使用 parabola fitting,
							找出該 拋物線真正的 minimum 位置 作為最後 subpixel 的結果
							可參考 Sub-pixel Estimation of Local Extrema (不過公式有些不同, 目前找不到OpenCV這個版本的演算法從哪來的)
						/*******************************************************************/
						int denom2 = std::max(Sp[d - 1] + Sp[d + 1] - 2 * Sp[d], 1);
						d = d*DISP_SCALE + ((Sp[d - 1] - Sp[d + 1])*DISP_SCALE + denom2) / (denom2 * 2);
					}
					else
						d *= DISP_SCALE;
					disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
				}

				for (x = minX1; x < maxX1; x++)
				{
					// we round the computed disparity both towards -inf and +inf and check
					// if either of the corresponding disparities in disp2 is consistent.
					// This is to give the computed disparity a chance to look valid if it is.
					int d1 = disp1ptr[x];
					if (d1 == INVALID_DISP_SCALED)
						continue;
					int _d = d1 >> DISP_SHIFT;
					int d_ = (d1 + DISP_SCALE - 1) >> DISP_SHIFT;
					int _x = x - _d, x_ = x - d_;
					if (0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
						0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff)
						disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
				}
			}

			// now shift the cyclic buffers
			std::swap(Lr[0], Lr[1]);
			std::swap(minLr[0], minLr[1]);
		}
	}
}

void compute(InputArray leftarr, InputArray rightarr, OutputArray disparr, StereoSGBMParams params)
{
	Mat buffer;
	const int DISP_SCALE = (1 << DISP_SHIFT);

	Mat left = leftarr.getMat(), right = rightarr.getMat();
	//CV_ASSERT(left.size() == right.size() && left.type() == right.type() &&
	//		  left.depth() == CV_8U);
	disparr.create(left.size(), CV_16S);
	Mat disp = disparr.getMat();

	computeDisparitySGBM(left, right, disp, params, buffer);
	medianBlur(disp, disp, 3);

	if (params.speckleWindowSize > 0)
		filterSpeckles(disp, (params.minDisparity - 1) * DISP_SCALE, params.speckleWindowSize, DISP_SCALE*params.speckleRange, buffer);
}

stereoSGBM.h

#include <opencv2\opencv.hpp>
#include <opencv2\calib3d\calib3d.hpp>
using namespace cv;

//copy stereoSGBM setting
typedef uchar PixType;
typedef short CostType;
typedef short DispType;
enum { NR = 16, NR2 = NR / 2 };
#define DISP_SHIFT 4

#define MODE_SGBM 2
#define MODE_HH 3


struct StereoSGBMParams
{
	StereoSGBMParams()
	{
		minDisparity = numDisparities = 0;
		SADWindowSize = 0;
		P1 = P2 = 0;
		disp12MaxDiff = 0;
		preFilterCap = 0;
		uniquenessRatio = 0;
		speckleWindowSize = 0;
		speckleRange = 0;
		//mode = StereoSGBM::MODE_SGBM;
		mode = MODE_SGBM;
	}

	StereoSGBMParams(int _minDisparity, int _numDisparities, int _SADWindowSize,
		int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
		int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
		int _mode)
	{
		minDisparity = _minDisparity;
		numDisparities = _numDisparities;
		SADWindowSize = _SADWindowSize;
		P1 = _P1;
		P2 = _P2;
		disp12MaxDiff = _disp12MaxDiff;
		preFilterCap = _preFilterCap;
		uniquenessRatio = _uniquenessRatio;
		speckleWindowSize = _speckleWindowSize;
		speckleRange = _speckleRange;
		mode = _mode;
	}

	int minDisparity;
	int numDisparities;
	int SADWindowSize;
	int preFilterCap;
	int uniquenessRatio;
	int P1;
	int P2;
	int speckleWindowSize;
	int speckleRange;
	int disp12MaxDiff;
	int mode;
};

void compute(InputArray leftarr, InputArray rightarr, OutputArray disparr, StereoSGBMParams params);