1. 程式人生 > >用C語言推測Cache相關引數

用C語言推測Cache相關引數

1)塊大小

思路:

假定塊大小為8*4B(其中&為未命中,@為命中)

| 步長 | 命中情況 |
| --- | --- |
| 1 | & @ @ @ @ @ @ @ |
| 2 | & @ @ @ & @ @ @ |
| 4 | & @ & @ & @ & @ |
| 8 | & & & & & & & & |
| 16 | & & & & & & & & |
| …… | …… …… |

因為不命中時間要遠遠大於命中時間,所以步長為2時平均每次訪問的執行時間應該約是步長1的2倍,以此類推,步長每加倍一次,平均時間約增加為2倍,直到步長為8(注:假定塊中為8個字時)每次訪問都不命中為止;步長再增大時,平均時間就應該不會再增加,或者增幅很小,程式碼如下:

#include <stdio.h>
#include <time.h>

/* Number of ints in the test array. Parenthesized so the macro expands
 * safely inside larger expressions. */
#define N (256 * 1024 * 1024)

/* Kept with external linkage on purpose: an optimizer may discard stores to
 * a file-local array that is never read, which would empty the timed loop. */
int arr[N];

/*
 * Estimate the cache block size.
 *
 * Writes to arr with strides of 1, 2, 4, ..., 512 ints and prints the
 * average number of clock() ticks per access for each stride. The average
 * is expected to grow with the stride until the stride reaches one block,
 * and to stay roughly flat afterwards.
 */
int main(void)
{
	/* Warm-up pass: touch the whole array once before timing so that any
	 * one-time cost of first touching the memory is not charged to the
	 * first (step == 1) measurement. */
	for (int i = 0; i < N; i++) {
		arr[i] = 0;
	}

	/* step is the stride in ints; block sizes above 512*4B are not probed. */
	for (int step = 1; step <= 512; step <<= 1) {
		/* Number of accesses performed by the timed loop below. */
		int accesses = N / step;

		clock_t start = clock();
		for (int i = 0; i < N; i += step) {
			arr[i] = 1;
		}
		clock_t finish = clock();

		printf("when the step is %3d, it takes %f clock numbers\n",
			step, (double)(finish - start) / accesses);
	}

	return 0;
}
2)容量及層級數  

思路:

不妨假定,cache是1KB的整數倍,可能值為1KB,2KB,4KB……8MB(即程式碼中的length),由上一步可知塊大小為64B,每塊有16字。每次訪問都保證落在1KB或者2KB或者4KB……大小的範圍內,假設L1大小為16KB,當訪問範圍剛超出16KB時,訪問時間會出現一個急劇的上升,因為這時L1 cache大小已不滿足需要,必須藉助L2,據此寫如下程式碼:
#include <stdio.h>
#include <time.h>
#define PATH "/home/deropty/cache/cachesize.txt"
#define N (2 * 1024 * 1024)	/* max size is 2*1024*1024*4B = 8MB */
#define INTS_PER_KB 256		/* 256 * 4B = 1KB */
#define INTS_PER_BLOCK 16	/* stride between accesses, in ints */
int arr[N];
int loop = (1<<24);

/*
 * Estimate the cache capacity and the number of cache levels.
 *
 * For every working-set size from 1KB to 8MB (in 1KB steps) performs `loop`
 * increments spread over the first `length` ints of arr, one access per
 * INTS_PER_BLOCK ints, and reports the elapsed CPU time in milliseconds on
 * stdout and, when it can be opened, in the file PATH.
 */
int main(void)
{
	FILE *fp = fopen(PATH, "w");
	if (fp == NULL) {
		/* Results are still printed to stdout; only the file copy is lost. */
		perror(PATH);
	}

	int count = 1;	/* working-set size in KB, used as the row label */
	for (int length = INTS_PER_KB; length <= N; length += INTS_PER_KB) {
		/* length controls the range of indices that are touched. */
		int index = 0;

		clock_t start = clock();
		for (int i = 0; i < loop; ++i) {
			++arr[index];
			/* Wrap explicitly: masking with (length - 1) only covers the
			 * whole range when length is a power of two, and length
			 * grows linearly here. */
			index += INTS_PER_BLOCK;
			if (index >= length) {
				index = 0;
			}
		}
		clock_t finish = clock();

		double ms = (double)(finish - start) * 1000.0 / CLOCKS_PER_SEC;
		printf("%4dk: when the array length is %7d, the cost time is %7.3fms\n",
			count, length, ms);
		if (fp != NULL) {
			fprintf(fp, "%4dk\t\t%7.3fms\n", count, ms);
		}
		++count;
	}

	if (fp != NULL && fclose(fp) != 0) {
		perror(PATH);
	}
	return 0;
}

3)命中時間和缺失代價
由1)已知塊大小為16字,所以每次訪問的下標都相隔16個字(即16的倍數),使每次訪問都落在不同的塊上、每次都缺失,對大量缺失時間做均值即可求出缺失代價。對於命中時間,可一直訪問同一資料,對大量命中時間的資料做均值也可得出命中時間,程式碼如下:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N (1024 * 1024)		/* array length in ints; must be a power of two */
#define INTS_PER_BLOCK 16	/* stride between accesses, in ints */
#define ROUNDS 16		/* number of measurements that are averaged */
int loop = (1<<25);

/* File scope instead of the stack: the array is several megabytes and is
 * zero-initialized here. Unsigned so that the repeated "*= 3" wraps instead
 * of overflowing a signed int. */
unsigned int arr[N];

/* Convert a clock() tick count to microseconds. */
static double ticks_to_us(clock_t ticks)
{
	return (double)ticks * 1e6 / CLOCKS_PER_SEC;
}

/*
 * Estimate the miss penalty and the hit time.
 *
 * Miss time: every access is INTS_PER_BLOCK ints after the previous one.
 * Hit time: every access goes to arr[0].
 * In both cases the time of a loop without the array access is subtracted,
 * and the per-access result of ROUNDS runs is averaged and printed.
 */
int main(void)
{
	double sumtime = 0;
	for (int k = 0; k < ROUNDS; k++) {
		clock_t start = clock();
		for (int i = 0; i < loop; ++i) {
			/* intended to make each access miss */
			int index = (i * INTS_PER_BLOCK) & (N - 1);
			arr[index] *= 3;	/* the value itself is meaningless */
		}
		clock_t duration = clock() - start;

		/* Subtract the time taken by the loop and index arithmetic.
		 * NOTE(review): an optimizing compiler may remove this loop
		 * entirely because it has no side effects. */
		start = clock();
		for (int i = 0; i < loop; ++i) {
			int index = (i * INTS_PER_BLOCK) & (N - 1);
			(void)index;
		}
		clock_t overhead = clock() - start;

		double tmp = ticks_to_us(duration - overhead) / loop;
		sumtime += tmp;
		printf("the miss time is %fus\n", tmp);
	}
	printf("\nThe average miss time is %fus\n", sumtime / ROUNDS);

	sumtime = 0;
	for (int k = 0; k < ROUNDS; k++) {
		arr[0] = 1;
		clock_t start = clock();
		for (int i = 0; i < loop; ++i) {
			arr[0] *= 3;		/* always visit arr[0] */
		}
		clock_t duration = clock() - start;

		/* Same loop without the array access, used as the baseline. */
		start = clock();
		for (int i = 0; i < loop; ++i) {

		}
		clock_t overhead = clock() - start;

		double tmp = ticks_to_us(duration - overhead) / loop;
		sumtime += tmp;
		printf("the hit  time is %fus\n", tmp);
	}
	printf("\nThe average hit  time is %fus\n", sumtime / ROUNDS);

	return 0;
}
4)求快取的關聯度

         思路:

假設每個快取有16個塊,則直接對映情況如下:

塊編號

0

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

@

@

@

@

@

@

@

@

@

@

@

@

@

@

@

@

對映情況

0

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

2路組相聯對映情況:

塊編號

0

1

2

3

4

5

6

7

@

@

@

@

@

@

@

@

@

@

@

@

@

@

@

@

對映情況

0

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

4路組相聯對映情況

塊編號

0

1

2

3

@

@

@

@

@

@

@

@

@

@

@

@

@

@

@

@

對映情況

0

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

如上圖所示,對於編號為0和16的塊,不停訪問,在直接對映中會產生衝突,但對2路、4路、8路……則不會衝突;對於編號為0、16、32、48的塊,不停訪問,在1路、2路中會產生衝突,但對於4路、8路、16路……則不會衝突,以此類推,我們讓程式每次都出現可能導致塊缺失的情況,即訪問編號為0、16的多次,計算平均訪問時間;訪問0、16、32、48的塊多次,計算平均訪問時間;訪問編號為0、16、32、48、64、80的塊多次,計算平均訪問時間……假定CPU為4路組相聯,則不停訪問0、16、32、48不會出現缺失,而再增大為0、16、32、48、64,就會出現不停缺失的情況,據此思路,寫出程式碼如下:

#include <stdio.h>
#include <time.h>

#define N (1<<8<<10<<10)
/* Distance in ints between two accesses of one probe:
 * 512 blocks in cache L1, 16 ints per block. */
#define WAY_STRIDE_INTS (512 * 16)
int arr[N];
int loop = (1<<16);	/* repetitions, for producing an accurate result */

/* Convert a clock() tick count to microseconds. */
static double ticks_to_us(clock_t ticks)
{
	return (double)ticks * 1e6 / CLOCKS_PER_SEC;
}

/*
 * Estimate the cache associativity.
 *
 * For asso = 2, 4, ..., 16 repeatedly writes to `asso` elements of arr that
 * are WAY_STRIDE_INTS apart, subtracts the time of the same loops without
 * the write, and prints the average time per access. A jump in the average
 * is expected once asso exceeds the number of ways.
 */
int main(void)
{
	/* suppose that the associativity is in the range 2-16 */
	for (int asso = 2; asso <= 16; asso += 2) {
		clock_t start = clock();
		for (int k = 0; k < loop; ++k) {
			for (int i = 0; i < asso; ++i) {
				arr[i * WAY_STRIDE_INTS] = 0;
			}
		}
		clock_t duration = clock() - start;

		/* Baseline: same loops and index arithmetic, without the store.
		 * NOTE(review): an optimizing compiler may remove these loops
		 * entirely because they have no side effects. */
		start = clock();
		for (int k = 0; k < loop; ++k) {
			for (int i = 0; i < asso; ++i) {
				(void)(i * WAY_STRIDE_INTS);
			}
		}
		clock_t overhead = clock() - start;

		printf("%2dways associative's average time is %fus\n",
			asso, ticks_to_us(duration - overhead) / ((double)loop * asso));
	}
	return 0;
}


上述程式碼還有很多問題,但現在作為一個階段的總結,先貼出來,以後有時間再回來修改(估計沒時間了,哈哈)