自制記憶體拷貝函式學習筆記
本文嘗試自己實現記憶體拷貝函式,並對各種大小的拷貝效能進行測試,與linux系統上C函式庫提供的memcpy進行對比(memcpy是函式庫函式,並非系統呼叫),旨在深入理解記憶體訪問方式,並對記憶體訪問在程式執行時間中所佔的比例有一定的認識。
測試環境為:
* 64位linux
* Intel(R) Xeon(R) 8 Core 2GHZ
* cache大小8192KB,cache對齊位元組數64位元組,一次快取分組數:8
* gcc-4.1.2,編譯引數-O2
幾種實現方式:
1. 無任何優化
/*
 * Baseline copy with no optimisation: moves len bytes from src to dest one
 * byte at a time, walking both buffers from the lowest address upward.
 * Returns dest.
 */
void* mymemcpy_naive(void* dest, const void* src, size_t len)
{
    const char* in = (const char*)src;
    char* out = (char*)dest;
    size_t i;

    for (i = 0; i < len; ++i) {
        out[i] = in[i];
    }
    return dest;
}
2. 以64/32/16/8位元組單位進行傳輸
/*
 * Copy len bytes from src to dest using progressively smaller block sizes.
 *
 * Steps:
 *   1. Copy single bytes until the source pointer is 64-byte aligned (or
 *      len is exhausted).
 *   2. Hand the remainder to COPY64, COPY32, COPY16, COPY8 and COPY4 in
 *      turn. These macros operate on destc/srcc and on the local variable
 *      `len`, which they read and update implicitly.
 *   3. Copy whatever is left one byte at a time.
 *
 * Returns dest.
 *
 * NOTE(review): only the source pointer is aligned; dest may still be
 * unaligned when the block macros run -- confirm this is acceptable on the
 * target.
 */
void* mymemcpy_64(void* dest, const void* src, size_t len)
{
    char* destc = (char*)dest;
    const char* srcc = (const char*)src;

    /*
     * First do 64-byte alignment of the source pointer.
     * The mask must be parenthesised: `!=` binds tighter than `&`, so
     * `x & 0x3F != 0` would evaluate as `x & 1` and test only bit 0.
     */
    while ((((size_t)srcc) & 0x3F) != 0 && len > 0) {
        *destc++ = *srcc++;
        --len;
    }

    COPY64(destc, srcc);
    COPY32(destc, srcc);
    COPY16(destc, srcc);
    COPY8(destc, srcc);
    COPY4(destc, srcc);

    /* Tail: fewer bytes remain than the smallest block size. */
    while (len-- > 0) {
        *destc++ = *srcc++;
    }
    return dest;
}
其中COPY64巨集定義如下:
/*
 * COPY64(destc, srcc): copy as many whole 64-byte blocks as `len` allows.
 *
 * `len` is not a macro parameter; a variable with that name must be in scope
 * where the macro is expanded. Each iteration first loads eight unsigned long
 * values from srcc (offsets 0..56) into locals, then stores them to the same
 * offsets of destc, then advances both pointers by 64 and subtracts 64 from
 * len. destc and srcc must be modifiable character pointers.
 *
 * NOTE(review): the offsets assume sizeof(unsigned long) == 8 (64-bit Linux
 * per the stated test environment) -- confirm before reusing elsewhere.
 */
#define COPY64(destc, srcc) \
    while (len >= 64) { \
        register unsigned long q1 = *(const unsigned long*)(srcc); \
        register unsigned long q2 = *(const unsigned long*)((srcc)+8); \
        register unsigned long q3 = *(const unsigned long*)((srcc)+16); \
        register unsigned long q4 = *(const unsigned long*)((srcc)+24); \
        register unsigned long q5 = *(const unsigned long*)((srcc)+32); \
        register unsigned long q6 = *(const unsigned long*)((srcc)+40); \
        register unsigned long q7 = *(const unsigned long*)((srcc)+48); \
        register unsigned long q8 = *(const unsigned long*)((srcc)+56); \
        *(unsigned long*)(destc) = q1; \
        *(unsigned long*)((destc)+8) = q2; \
        *(unsigned long*)((destc)+16) = q3; \
        *(unsigned long*)((destc)+24) = q4; \
        *(unsigned long*)((destc)+32) = q5; \
        *(unsigned long*)((destc)+40) = q6; \
        *(unsigned long*)((destc)+48) = q7; \
        *(unsigned long*)((destc)+56) = q8; \
        (srcc) += 64; \
        (destc) += 64; \
        len -= 64; \
    }
COPY32/COPY16/COPY8等類似COPY64定義。此處略。
3. 在傳輸中間加上memory barrier。程式碼同上,只是COPY64/COPY32/COPY16等換為COPY64B/COPY32B/COPY16B。其中COPY64B定義如下:
#define COPY64B(destc, srcc) \
while (len >= 64) { \
register unsigned long q1 = *(unsigned long*)srcc; \
register unsigned long q2 = *(unsigned long*)(srcc+8); \
register unsigned long q3 = *(unsigned long*)(srcc+16); \
register unsigned long q4 = *(unsigned long*)(srcc+24); \
register unsigned long q5 = *(unsigned long*)(srcc+32); \
register unsigned long q6 = *(unsigned long*)(srcc+40); \
register unsigned long q7 = *(unsigned long*)(srcc+48); \
register unsigned long q8 = *(unsigned long*)(srcc+56); \
__memory_barrier(); \
*(unsigned long*)destc = q1; \
*(unsigned long*)(destc+8) = q2; \
*(unsigned long*)(destc+16) = q3; \
*(unsigned long*)(destc+24) = q4; \
*(unsigned long*)(destc+32) = q5; \
*(unsigned long*)(destc+40) = q6; \
*(unsigned long*)(destc+48) = q7; \
*(unsigned long*)(destc+56) = q8; \
srcc+=64; \
destc+=64; \
len-=64; \
}
__memory_barrier()定義如下:
/*
 * Compiler-level memory barrier: an empty inline-asm statement with a
 * "memory" clobber, which tells the compiler not to move memory accesses
 * across this point. The asm template is empty, so no instruction is emitted
 * for it.
 *
 * Spelled with the __asm__/__volatile__ alternate keywords so the macro also
 * compiles in strict ISO modes (-std=c99, -std=c11), where plain `asm` is
 * not recognised as a keyword.
 *
 * NOTE(review): identifiers starting with a double underscore are reserved
 * for the implementation; the name is kept because COPY64B expands to it.
 */
#define __memory_barrier() __asm__ __volatile__("" ::: "memory")
注:
* 加上memory barrier有什麼效果?
實測-O2優化情況下,COPY64巨集展開之後的迴圈體內部,會編譯為以下程式碼:
mov 0x8(%r11),%rdx
mov 0x10(%r11),%rcx
sub $0x40,%rbp
mov 0x18(%r11),%rsi
mov 0x20(%r11),%rdi
mov 0x28(%r11),%r8
mov 0x30(%r11),%r9
mov 0x38(%r11),%r10
mov (%r11),%rax
add $0x40,%r11
mov %rdx,0x8(%rbx)
mov %rcx,0x10(%rbx)
mov %rsi,0x18(%rbx)
mov %rdi,0x20(%rbx)
mov %rax,(%rbx)
mov %r8,0x28(%rbx)
mov %r9,0x30(%rbx)
mov %r10,0x38(%rbx)
關於在mov過程中夾雜無關運算程式碼sub $0x40,%rbp,add $0x40,%r11,可以參考編譯器為隱藏記憶體傳輸延時而打亂指令順序的優化資料,在讀記憶體指令後到記憶體實際被fetch到暫存器,該暫存器可用的過程之間是需要一定的等待週期的,在現代CPU和記憶體上,此週期為100-200ns之間,在這期間可以執行其它指令,否則必須等待記憶體讀取完成。
關於mov (%r11),%rax這一句。編譯器沒有按照實際讀取順序從src+0開始訪問,估計跟gcc優化中關於打亂指令順序以增強暫存器耦合性有關。但是這一舉動可能造成cache失效。加上memory barrier之後,編譯的結果就是按順序訪問記憶體了。具體有沒有影響請參見後面的測試資料。
4. 不使用中間變數直接賦值。程式碼同上,只是將COPY64B/COPY32B/COPY16B/COPY8B換成COPY64D/COPY32D/COPY16D/COPY8D,其中COPY64D定義如下:
#define COPY64D(destc, srcc) \
while (len >= 64) { \
*(unsigned long*)destc = *(unsigned long*)srcc; \
*(unsigned long*)(destc+8) = *(unsigned long*)(srcc+8); \
*(unsigned long*)(destc+16) = *(unsigned long*)(srcc+16); \
*(unsigned long*)(destc+24) = *(unsigned long*)(srcc+24); \
*(unsigned long*)(destc+32) = *(unsigned long*)(srcc+32); \
*(unsigned long*)(destc+40) = *(unsigned long*)(srcc+40); \
*(unsigned long*)(destc+48) = *(unsigned long*)(srcc+48); \
*(unsigned long*)(destc+56) = *(unsigned long*)(srcc+56); \
srcc+=64; \
destc+=64; \
len-=64; \
}
實測編譯器並不能生成x86上的repz movsq之類的程式碼,因此這種方式展開之後也是需要藉助中間暫存器的。但是區別是讀和寫交錯進行了。具體效果請見後文資料。
函式名定義如下:
mymemcpy_ 傳輸位元組 _ (d=直接傳輸 b=加memory barrier)
表頭:一次性拷貝的位元組數 memcpy(dest, src, N)中的N
資料:每秒鐘拷貝位元組數(MB/s) 這裡MB是Mega BYTE
函式 | 8 | 16 | 25 | 32 | 50 | 64 | 100 | 200 | 256 | 500 | 1000 | 1024 | 4096 | 16384 | 512000 | 1000000 |
sys_memcpy | 871.06 | 1755.35 | 3361.04 | 4028.17 | 3885.78 | 6162.04 | 6724.57 | 10101.9 | 12263.3 | 13648.7 | 14932.5 | 15035.8 | 7501.12 | 7889.67 | 3903.07 | 3465.7 |
mymemcpy_naive | 690.748 | 817.927 | 782.5 | 824.169 | 893.4 | 916.579 | 897.421 | 945.511 | 962.502 | 985.278 | 996.581 | 997.625 | 963.121 | 1003.89 | 1004.52 | 982.129 |
mymemcpy_8 | 1093.51 | 1776.38 | 2620.14 | 2898.56 | 3431.27 | 3960.09 | 5772.17 | 4684.11 | 6080.54 | 7483.41 | 5143.5 | 4991.39 | 5298.27 | 5359.06 | 5146.09 | 5234.19 |
mymemcpy_8d | 1076.36 | 1684.14 | 2294.36 | 2444.5 | 2730.97 | 3009.11 | 3940.89 | 4293.32 | 4495.53 | 4977.22 | 4923.46 | 4903.33 | 5265.59 | 5349.46 | 5137.51 | 5230.2 |
mymemcpy_16 | 1242.03 | 1984.16 | 2501.14 | 3342.26 | 4139.8 | 5070.09 | 5689.43 | 8239.53 | 7837.96 | 8841.32 | 10146.2 | 9234.83 | 8045.63 | 8053.07 | 5710.49 | 6204.91 |
mymemcpy_16b | 1242.68 | 1984.72 | 2800.19 | 3340.88 | 4149.63 | 5036.35 | 5685.04 | 8234.77 | 7721.36 | 8770.93 | 10139.4 | 9164.73 | 8047.52 | 8053.32 | 5625.9 | 4903.46 |
mymemcpy_16d | 1242.86 | 2018.99 | 3154.52 | 3286.88 | 3975.27 | 4969.42 | 5771.7 | 6985.4 | 7717.84 | 8761.11 | 7201.82 | 9201.3 | 8040.79 | 8055.22 | 5666.88 | 6200.88 |
mymemcpy_32 | 897.28 | 2308.46 | 2651.23 | 3231.68 | 4719.01 | 5385.69 | 7347.54 | 8392.39 | 9747.96 | 13096.5 | 13617.4 | 10656.8 | 10582.6 | 10573.9 | 6189.46 | 6284.59 |
mymemcpy_32b | 897.659 | 2306.73 | 2657.88 | 3229.03 | 4699.63 | 5387.46 | 7345.72 | 8389.51 | 9758.69 | 13125.7 | 13652 | 13868 | 10623 | 10585.1 | 6131.78 | 6252.79 |
mymemcpy_32d | 1000.06 | 2486 | 2966.07 | 3523.38 | 4925.87 | 6062.82 | 8417.91 | 10896.9 | 12216.2 | 13648.2 | 11190.6 | 14841.4 | 8045.32 | 8053.23 | 5710.89 | 6216.18 |
mymemcpy_64 | 863.286 | 1794.37 | 2295.35 | 3400.56 | 3882.92 | 6155.23 | 7481.61 | 11199.2 | 12606.5 | 13463.7 | 14755 | 14677.9 | 12453 | 10880.5 | 3705.59 | 3818.43 |
mymemcpy_64b | 897.59 | 1794.84 | 2294.51 | 3395.98 | 4040.51 | 6155.34 | 7471 | 11195.1 | 12610.5 | 13645.9 | 14828 | 14471.4 | 12513 | 12262 | 6199.8 | 6226.03 |
mymemcpy_64d | 1002 | 1583.03 | 2509.21 | 3800.17 | 4811.31 | 6250.39 | 8782.94 | 12382.3 | 14160.7 | 14873.9 | 15363.8 | 15538.1 | 8048.33 | 8054.37 | 5688.67 | 6213.36 |
結論:
* 不加優化的memcpy僅在拷貝資料較小時速度和其它函式差不多
* 資料量在cache可以完全容納的大小之內時,傳輸速度可以超過記憶體的理論最大頻寬(本機記憶體訪問速度理論值為8GB/s)
* 如果不考慮通用性,較大或較小記憶體的傳輸可以考慮用自制函式替代系統memory copy
* 指令的亂序可能對記憶體訪問的優化造成一定影響