剩餘塊用switch處理
阿新 • • 發佈:2020-12-05
在做迴圈展開時,處理完可整除的塊之後,還需要處理剩餘塊。這裡做了個實驗對比,嘗試用 switch 來加速剩餘塊的處理。
// switch0.c
//
// Micro-benchmark for handling the remainder ("tail") of an unrolled loop
// with a fall-through switch. Elements are addressed from the START of the
// array (vec[k - 1]).
//
// Usage: switch0 <tail-size>      (tail-size must be in 0..16)
// Output: the accumulated sum, then the elapsed clock() ticks.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Largest tail size the switch below has a case label for. */
enum { MAX_TAIL = 16 };

int main(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <tail-size 0..%d>\n",
                argc > 0 ? argv[0] : "switch0", MAX_TAIL);
        return EXIT_FAILURE;
    }

    /* strtol (unlike atoi) lets us reject garbage; overflow saturates to
     * LONG_MIN/LONG_MAX, which the range check below also rejects. */
    char *parse_end;
    long parsed = strtol(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' ||
        parsed < 0 || parsed > MAX_TAIL) {
        fprintf(stderr, "tail-size must be an integer in 0..%d\n", MAX_TAIL);
        return EXIT_FAILURE;
    }
    int res = (int)parsed;

    /* Fixed-size buffer instead of a VLA: a VLA of size <= 0 is undefined
     * behaviour, and the switch never reads past index MAX_TAIL - 1. */
    int vec[MAX_TAIL] = {0};
    for (int i = 0; i < res; i++) {
        vec[i] = rand();
    }

    uint32_t accum = 0;
    clock_t start = clock();
    for (uint32_t i = UINT32_MAX; i; i--) {
        /* Enter at the case matching the tail size and fall through, so
         * exactly `res` elements are added per iteration. */
        switch (res) {
        case 16: accum += vec[16 - 1]; /* fallthrough */
        case 15: accum += vec[15 - 1]; /* fallthrough */
        case 14: accum += vec[14 - 1]; /* fallthrough */
        case 13: accum += vec[13 - 1]; /* fallthrough */
        case 12: accum += vec[12 - 1]; /* fallthrough */
        case 11: accum += vec[11 - 1]; /* fallthrough */
        case 10: accum += vec[10 - 1]; /* fallthrough */
        case 9:  accum += vec[9 - 1];  /* fallthrough */
        case 8:  accum += vec[8 - 1];  /* fallthrough */
        case 7:  accum += vec[7 - 1];  /* fallthrough */
        case 6:  accum += vec[6 - 1];  /* fallthrough */
        case 5:  accum += vec[5 - 1];  /* fallthrough */
        case 4:  accum += vec[4 - 1];  /* fallthrough */
        case 3:  accum += vec[3 - 1];  /* fallthrough */
        case 2:  accum += vec[2 - 1];  /* fallthrough */
        case 1:  accum += vec[1 - 1];
            break;
        default: /* res == 0: nothing to add */
            break;
        }
    }
    clock_t end = clock();

    /* Printing accum keeps the summation loop observable. */
    printf("%u\n", (unsigned)accum);
    /* clock_t has no dedicated printf conversion; cast to match %lu. */
    printf("%lu\n", (unsigned long)(end - start));
    return EXIT_SUCCESS;
}
// switch1.c
//
// Micro-benchmark for handling the remainder ("tail") of an unrolled loop
// with a fall-through switch. Elements are addressed relative to the END of
// the tail (vec_end[-k]).
//
// Usage: switch1 <tail-size>      (tail-size must be in 0..16)
// Output: the accumulated sum, then the elapsed clock() ticks.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Largest tail size the switch below has a case label for. */
enum { MAX_TAIL = 16 };

int main(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <tail-size 0..%d>\n",
                argc > 0 ? argv[0] : "switch1", MAX_TAIL);
        return EXIT_FAILURE;
    }

    /* strtol (unlike atoi) lets us reject garbage; overflow saturates to
     * LONG_MIN/LONG_MAX, which the range check below also rejects. */
    char *parse_end;
    long parsed = strtol(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' ||
        parsed < 0 || parsed > MAX_TAIL) {
        fprintf(stderr, "tail-size must be an integer in 0..%d\n", MAX_TAIL);
        return EXIT_FAILURE;
    }
    int res = (int)parsed;

    /* Fixed-size buffer instead of a VLA: a VLA of size <= 0 is undefined
     * behaviour. vec_end points one past the last used element, so
     * vec_end[-k] stays inside vec for every k in 1..res. */
    int vec[MAX_TAIL] = {0};
    const int *vec_end = vec + res;
    for (int i = 0; i < res; i++) {
        vec[i] = rand();
    }

    uint32_t accum = 0;
    clock_t start = clock();
    for (uint32_t i = UINT32_MAX; i; i--) {
        /* Enter at the case matching the tail size and fall through, so
         * exactly `res` elements are added per iteration. */
        switch (res) {
        case 16: accum += vec_end[-16]; /* fallthrough */
        case 15: accum += vec_end[-15]; /* fallthrough */
        case 14: accum += vec_end[-14]; /* fallthrough */
        case 13: accum += vec_end[-13]; /* fallthrough */
        case 12: accum += vec_end[-12]; /* fallthrough */
        case 11: accum += vec_end[-11]; /* fallthrough */
        case 10: accum += vec_end[-10]; /* fallthrough */
        case 9:  accum += vec_end[-9];  /* fallthrough */
        case 8:  accum += vec_end[-8];  /* fallthrough */
        case 7:  accum += vec_end[-7];  /* fallthrough */
        case 6:  accum += vec_end[-6];  /* fallthrough */
        case 5:  accum += vec_end[-5];  /* fallthrough */
        case 4:  accum += vec_end[-4];  /* fallthrough */
        case 3:  accum += vec_end[-3];  /* fallthrough */
        case 2:  accum += vec_end[-2];  /* fallthrough */
        case 1:  accum += vec_end[-1];
            break;
        default: /* res == 0: nothing to add */
            break;
        }
    }
    clock_t end = clock();

    /* Printing accum keeps the summation loop observable. */
    printf("%u\n", (unsigned)accum);
    /* clock_t has no dedicated printf conversion; cast to match %lu. */
    printf("%lu\n", (unsigned long)(end - start));
    return EXIT_SUCCESS;
}
filename | 剩餘塊大小為7 | 剩餘塊大小為14
---|---|---
switch0.c | 9265104 |
switch1.c | 9250006 |
好像效能差不多(我原以為第二種寫法會快一些)。
另外,在 datasketches-cpp/common/include/MurmurHash3.h 裡也看到了類似於第一種的寫法。另外,或許可以用 accum 陣列代替單一的 accum 變數來進一步加速。
一般 Intel CPU 的 cache line 大小為 64 位元組。