1. 程式人生 > >XV6學習(12)Lab lock: Parallelism/locking

XV6學習(12)Lab lock: Parallelism/locking

程式碼在[github](https://github.com/weijunji/xv6-6.S081/tree/lock)上 這一次實驗是要對XV6內部的鎖進行優化,減少鎖爭用,提高系統的效能。 ## Memory allocator (moderate) 第一個實驗是對XV6核心的記憶體頁面分配器進行改進,改進的策略在前面的章節中也講過了。XV6原本是使用一個空閒頁面連結串列,但是這樣就會導致不同CPU上的`kalloc`和`kfree`會產生鎖爭用,記憶體頁面的分配被完全序列化了,降低了系統的效能。 而一個改進策略就是為每個CPU核心分配一個空閒連結串列,`kalloc`和`kfree`都在本核心的連結串列上進行,只有噹噹前核心的連結串列為空時才去訪問其他核心的連結串列。通過這種策略就可以減少鎖的爭用,只有當某核心的連結串列為空時才會發生鎖爭用。 首先定義NCPU個`kmem`結構體,並在`kinit`函式中對鎖進行初始化。 ```c struct { struct spinlock lock; struct run *freelist; char lock_name[7]; } kmem[NCPU]; void kinit() { for (int i = 0; i < NCPU; i++) { snprintf(kmem[i].lock_name, sizeof(kmem[i].lock_name), "kmem_%d", i); initlock(&kmem[i].lock, kmem[i].lock_name); } freerange(end, (void*)PHYSTOP); } ``` 對於`kfree`函式只需要將釋放的頁面插入到當前核心對應連結串列上就行了 ```c void kfree(void *pa) { ... r = (struct run*)pa; push_off(); int id = cpuid(); acquire(&kmem[id].lock); r->next = kmem[id].freelist; kmem[id].freelist = r; release(&kmem[id].lock); pop_off(); } ``` 對於`kalloc`函式,當在當前核心上申請失敗時,就嘗試從其他核心上獲取頁面。使用快慢指標來找到連結串列的中點,之後將一半的頁面移動到當前核心的連結串列上。 ```c void * kalloc(void) { struct run *r; push_off(); int id = cpuid(); acquire(&kmem[id].lock); r = kmem[id].freelist; if(r) { kmem[id].freelist = r->next; } else { // alloc failed, try to steal from other cpu int success = 0; int i = 0; for(i = 0; i < NCPU; i++) { if (i == id) continue; acquire(&kmem[i].lock); struct run *p = kmem[i].freelist; if(p) { // steal half of memory struct run *fp = p; // faster pointer struct run *pre = p; while (fp && fp->
next) { fp = fp->next->next; pre = p; p = p->next; } kmem[id].freelist = kmem[i].freelist; if (p == kmem[i].freelist) { // only have one page kmem[i].freelist = 0; } else { kmem[i].freelist = p; pre->
next = 0; } success = 1; } release(&kmem[i].lock); if (success) { r = kmem[id].freelist; kmem[id].freelist = r->next; break; } } } release(&kmem[id].lock); pop_off(); if(r) memset((char*)r, 5, PGSIZE); // fill with junk return (void*)r; } ``` 實驗結果如下: ```shell $ kalloctest start test1 test1 results: --- lock kmem/bcache stats lock: kmem_0: #fetch-and-add 0 #acquire() 77186 lock: kmem_1: #fetch-and-add 0 #acquire() 182362 lock: kmem_2: #fetch-and-add 0 #acquire() 173534 lock: bcache_bucket: #fetch-and-add 0 #acquire() 128 lock: bcache_bucket: #fetch-and-add 0 #acquire() 138 lock: bcache_bucket: #fetch-and-add 0 #acquire() 142 lock: bcache_bucket: #fetch-and-add 0 #acquire() 148 lock: bcache_bucket: #fetch-and-add 0 #acquire() 132 lock: bcache_bucket: #fetch-and-add 0 #acquire() 6 lock: bcache_bucket: #fetch-and-add 0 #acquire() 42 lock: bcache_bucket: #fetch-and-add 0 #acquire() 34 lock: bcache_bucket: #fetch-and-add 0 #acquire() 5916 lock: bcache_bucket: #fetch-and-add 0 #acquire() 32 lock: bcache_bucket: #fetch-and-add 0 #acquire() 242 lock: bcache_bucket: #fetch-and-add 0 #acquire() 128 lock: bcache_bucket: #fetch-and-add 0 #acquire() 128 --- top 5 contended locks: lock: proc: #fetch-and-add 31954 #acquire() 206502 lock: proc: #fetch-and-add 24395 #acquire() 206518 lock: proc: #fetch-and-add 9306 #acquire() 206501 lock: proc: #fetch-and-add 7463 #acquire() 206481 lock: proc: #fetch-and-add 5209 #acquire() 206480 tot= 0 test1 OK start test2 total free number of pages: 32493 (out of 32768) ..... test2 OK ``` ## Buffer cache (hard) 這個實驗是要對XV6的磁碟緩衝區進行優化。在初始的XV6磁碟緩衝區中是使用一個LRU連結串列來維護的,而這就導致了每次獲取、釋放緩衝區時就要對整個連結串列加鎖,也就是說緩衝區的操作是完全序列進行的。 為了提高並行效能,我們可以用雜湊表來代替連結串列,這樣每次獲取和釋放的時候,都只需要對雜湊表的一個桶進行加鎖,桶之間的操作就可以並行進行。只有當需要對緩衝區進行驅逐替換時,才需要對整個雜湊表加鎖來查詢要替換的塊。 使用雜湊表就不能使用連結串列來維護LRU資訊,因此需要在`buf`結構體中新增`timestamp`域來記錄釋放的事件,同時`prev`域也不再需要。 ```c struct buf { int valid; // has data been read from disk? int disk; // does disk "own" buf? uint dev; uint blockno; struct sleeplock lock; uint refcnt; // struct buf *prev; // LRU cache list struct buf *next; uchar data[BSIZE]; uint timestamp; }; ``` 在`brelse`函式中對`timestamp`域進行維護,同時將連結串列的鎖替換為桶級鎖: ```c void brelse(struct buf *b) { if(!holdingsleep(&b->
lock)) panic("brelse"); releasesleep(&b->lock); int idx = hash(b->dev, b->blockno); acquire(&hashtable[idx].lock); b->refcnt--; if (b->refcnt == 0) { // no one is waiting for it. b->timestamp = ticks; } release(&hashtable[idx].lock); } ``` 定義雜湊表的結構體,`bcache.lock`為表級鎖,而`hashtable[i].lock`為桶級鎖: ```c #define NBUCKET 13 #define NBUF (NBUCKET * 3) struct { struct spinlock lock; struct buf buf[NBUF]; } bcache; struct bucket { struct spinlock lock; struct buf head; }hashtable[NBUCKET]; int hash(uint dev, uint blockno) { return blockno % NBUCKET; } ``` 在`binit`函式中對雜湊表進行初始化,將`bcache.buf[NBUF]`中的塊平均分配給每個桶,記得設定`b->blockno`使塊的hash與桶相對應,後面需要根據塊來查詢對應的桶。 ```c void binit(void) { struct buf *b; initlock(&bcache.lock, "bcache"); for(b = bcache.buf; b < bcache.buf+NBUF; b++){ initsleeplock(&b->lock, "buffer"); } b = bcache.buf; for (int i = 0; i < NBUCKET; i++) { initlock(&hashtable[i].lock, "bcache_bucket"); for (int j = 0; j < NBUF / NBUCKET; j++) { b->blockno = i; // hash(b) should equal to i b->next = hashtable[i].head.next; hashtable[i].head.next = b; b++; } } } ``` 之後就是重點`bget`函式,首先在對應的桶當中查詢當前塊是否被快取,如果被快取就直接返回;如果沒有被快取的話,就需要查詢一個塊並將其逐出替換。我這裡使用的策略是先在當前桶當中查詢,當前桶沒有查詢到再去全域性陣列中查詢,這樣的話如果當前桶中有空閒塊就可以避免全域性鎖。 在全域性陣列中查詢時,要先加上表級鎖,當找到一個塊之後,就可以根據塊的資訊查詢到對應的桶,之後再對該桶加鎖,將塊從桶的連結串列上取下來,釋放鎖,最後再加到當前桶的連結串列上去。 這裡有個小問題就是全域性陣列中找到一個塊之後,到對該桶加上鎖之間有一個視窗,可能就在這個窗口裡面這個塊就被那個桶對應的本地查詢階段用掉了。因此,需要在加上鎖之後判斷是否被用了,如果被用了就要重新查詢。 ```c static struct buf* bget(uint dev, uint blockno) { // printf("dev: %d blockno: %d Status: ", dev, blockno); struct buf *b; int idx = hash(dev, blockno); struct bucket* bucket = hashtable + idx; acquire(&bucket->lock); // Is the block already cached? for(b = bucket->head.next; b != 0; b = b->next){ if(b->dev == dev && b->blockno == blockno){ b->refcnt++; release(&bucket->lock); acquiresleep(&b->lock); // printf("Cached %p\n", b); return b; } } // Not cached. // First try to find in current bucket. int min_time = 0x8fffffff; struct buf* replace_buf = 0; for(b = bucket->head.next; b != 0; b = b->next){ if(b->refcnt == 0 && b->timestamp < min_time) { replace_buf = b; min_time = b->timestamp; } } if(replace_buf) { // printf("Local %d %p\n", idx, replace_buf); goto find; } // Try to find in other bucket. acquire(&bcache.lock); refind: for(b = bcache.buf; b < bcache.buf + NBUF; b++) { if(b->refcnt == 0 && b->timestamp < min_time) { replace_buf = b; min_time = b->timestamp; } } if (replace_buf) { // remove from old bucket int ridx = hash(replace_buf->dev, replace_buf->blockno); acquire(&hashtable[ridx].lock); if(replace_buf->refcnt != 0) // be used in another bucket's local find between finded and acquire { release(&hashtable[ridx].lock); goto refind; } struct buf *pre = &hashtable[ridx].head; struct buf *p = hashtable[ridx].head.next; while (p != replace_buf) { pre = pre->next; p = p->next; } pre->next = p->next; release(&hashtable[ridx].lock); // add to current bucket replace_buf->next = hashtable[idx].head.next; hashtable[idx].head.next = replace_buf; release(&bcache.lock); // printf("Global %d -> %d %p\n", ridx, idx, replace_buf); goto find; } else { panic("bget: no buffers"); } find: replace_buf->dev = dev; replace_buf->blockno = blockno; replace_buf->valid = 0; replace_buf->refcnt = 1; release(&bucket->lock); acquiresleep(&replace_buf->lock); return replace_buf; } ``` 最後將`bpin`和`bunpin`的鎖替換為桶級鎖就行了: ```c void bpin(struct buf *b) { int idx = hash(b->dev, b->blockno); acquire(&hashtable[idx].lock); b->refcnt++; release(&hashtable[idx].lock); } void bunpin(struct buf *b) { int idx = hash(b->dev, b->blockno); acquire(&hashtable[idx].lock); b->refcnt--; release(&hashtable[idx].lock); } ``` 實驗結果如下: ```shell start test0 test0 results: --- lock kmem/bcache stats ... lock: bcache_bucket: #fetch-and-add 0 #acquire() 4244 lock: bcache_bucket: #fetch-and-add 0 #acquire() 2246 lock: bcache_bucket: #fetch-and-add 0 #acquire() 4402 lock: bcache_bucket: #fetch-and-add 0 #acquire() 4458 lock: bcache_bucket: #fetch-and-add 0 #acquire() 6450 lock: bcache_bucket: #fetch-and-add 0 #acquire() 6312 lock: bcache_bucket: #fetch-and-add 0 #acquire() 6624 lock: bcache_bucket: #fetch-and-add 0 #acquire() 6634 lock: bcache_bucket: #fetch-and-add 0 #acquire() 12706 lock: bcache_bucket: #fetch-and-add 0 #acquire() 6208 lock: bcache_bucket: #fetch-and-add 0 #acquire() 4360 lock: bcache_bucket: #fetch-and-add 0 #acquire() 4246 lock: bcache_bucket: #fetch-and-add 0 #acquire() 2236 --- top 5 contended locks: lock: proc: #fetch-and-add 269741 #acquire() 4551132 lock: proc: #fetch-and-add 236112 #acquire() 4551131 lock: proc: #fetch-and-add 186278 #acquire() 4551151 lock: proc: #fetch-and-add 167286 #acquire() 4551164 lock: proc: #fetch-and-add 151922 #acquire() 4551132 tot= 0 test0: OK start test1 test1