1. 程式人生 > >XV6學習(15)Lab mmap: Mmap

XV6學習(15)Lab mmap: Mmap

程式碼在[Github](https://github.com/weijunji/xv6-6.S081/tree/mmap)上。

這一個實驗是要實現最基礎的`mmap`功能。mmap即記憶體對映檔案,將一個檔案直接對映到記憶體當中,之後對檔案的讀寫就可以直接通過對記憶體進行讀寫來進行,而對檔案的同步則由作業系統來負責完成。使用`mmap`可以避免對檔案大量`read`和`write`操作帶來的核心緩衝區和使用者緩衝區之間的頻繁的資料拷貝。在Kafka訊息佇列等軟體中藉助`mmap`來實現零拷貝(zero-copy)。

首先定義`vma`結構體用於儲存記憶體對映資訊,並在`proc`結構體中加入`struct vma *vma`指標:

```c
#define NVMA 16
#define VMA_START (MAXVA / 2)

struct vma{
  uint64 start;
  uint64 end;
  uint64 length; // 0 means vma not used
  uint64 off;
  int permission;
  int flags;
  struct file *file;
  struct vma *next;
  struct spinlock lock;
};

// Per-process state
struct proc {
  ...
  struct vma *vma;
  ...
};
```

之後實現對`vma`分配的程式碼:

```c
struct vma vma_list[NVMA];

struct vma* vma_alloc(){
  for(int i = 0; i < NVMA; i++){
    acquire(&vma_list[i].lock);
    if(vma_list[i].length == 0){
      return &vma_list[i];
    }else{
      release(&vma_list[i].lock);
    }
  }
  panic("no enough vma");
}
```

實現`mmap`系統呼叫,這個函式主要就是申請一個`vma`,之後查詢一塊空閒記憶體,填入相關資訊,將`vma`插入到程序的`vma`連結串列中去:

```c
uint64
sys_mmap(void)
{
  uint64 addr;
  int length, prot, flags, fd, offset;
  if(argaddr(0, &addr) < 0 || argint(1, &length) < 0 ||
     argint(2, &prot) < 0 || argint(3, &flags) < 0 ||
     argint(4, &fd) < 0 || argint(5, &offset) < 0){
    return -1;
  }

  if(addr != 0)
    panic("mmap: addr not 0");
  if(offset != 0)
    panic("mmap: offset not 0");

  struct proc *p = myproc();
  struct file* f = p->ofile[fd];

  int pte_flag = PTE_U;
  if (prot & PROT_WRITE) {
    if(!f->writable && !(flags & MAP_PRIVATE)) return -1; // map to a unwritable file with PROT_WRITE
    pte_flag |= PTE_W;
  }
  if (prot & PROT_READ) {
    if(!f->readable) return -1; // map to a unreadable file with PROT_READ
    pte_flag |= PTE_R;
  }

  struct vma* v = vma_alloc();
  v->permission = pte_flag;
  v->length = length;
  v->off = offset;
  v->file = myproc()->ofile[fd];
  v->flags = flags;
  filedup(f);
  struct vma* pv = p->vma;
  if(pv == 0){
    v->start = VMA_START;
    v->end = v->start + length;
    p->vma = v;
  }else{
    while(pv->next) pv = pv->next;
    v->start = PGROUNDUP(pv->end);
    v->end = v->start + length;
    pv->next = v;
    v->next = 0;
  }
  addr = v->start;
  printf("mmap: [%p, %p)\n", addr, v->end);

  release(&v->lock);
  return addr;
}
```

接下來就可以在`usertrap`中對缺頁中斷進行處理:查詢程序的`vma`連結串列,判斷該地址是否為對映地址,如果不是就說明出錯,直接返回;如果在`vma`連結串列中,就可以申請並對映一個頁面,之後根據`vma`從對應的檔案中讀取資料:

```c
int
mmap_handler(uint64 va, int scause)
{
  struct proc *p = myproc();
  struct vma* v = p->vma;
  while(v != 0){
    if(va >= v->start && va < v->end){
      break;
    }
    //printf("%p\n", v);
    v = v->next;
  }

  if(v == 0) return -1; // not mmap addr
  if(scause == 13 && !(v->permission & PTE_R)) return -2; // unreadable vma
  if(scause == 15 && !(v->permission & PTE_W)) return -3; // unwritable vma

  // load page from file
  va = PGROUNDDOWN(va);
  char* mem = kalloc();
  if (mem == 0) return -4; // kalloc failed

  memset(mem, 0, PGSIZE);

  if(mappages(p->pagetable, va, PGSIZE, (uint64)mem, v->permission) != 0){
    kfree(mem);
    return -5; // map page failed
  }

  struct file *f = v->file;
  ilock(f->ip);
  readi(f->ip, 0, (uint64)mem, v->off + va - v->start, PGSIZE);
  iunlock(f->ip);
  return 0;
}
```

之後就是`munmap`的實現,同樣先從連結串列中找到對應的`vma`結構體,之後根據三種不同情況(頭部、尾部、整個)來寫回並釋放對應的頁面並更新`vma`資訊,如果整個區域都被釋放就將`vma`和檔案釋放。

```c
uint64
sys_munmap(void)
{
  uint64 addr;
  int length;
  if(argaddr(0, &addr) < 0 || argint(1, &length) < 0){
    return -1;
  }

  struct proc *p = myproc();
  struct vma *v = p->vma;
  struct vma *pre = 0;
  while(v != 0){
    if(addr >= v->start && addr < v->end) break; // found
    pre = v;
    v = v->next;
  }

  if(v == 0) return -1; // not mapped
  printf("munmap: %p %d\n", addr, length);
  if(addr != v->start && addr + length != v->end) panic("munmap middle of vma");

  if(addr == v->start){
    writeback(v, addr, length);
    uvmunmap(p->pagetable, addr, length / PGSIZE, 1);
    if(length == v->length){
      // free all
      fileclose(v->file);
      if(pre == 0){
        p->vma = v->next; // head
      }else{
        pre->next = v->next;
        v->next = 0;
      }
      acquire(&v->lock);
      v->length = 0;
      release(&v->lock);
    }else{
      // free head
      v->start -= length;
      v->off += length;
      v->length -= length;
    }
  }else{
    // free tail
    v->length -= length;
    v->end -= length;
  }
  return 0;
}
```
寫回函式先判斷是否需要寫回,當需要寫回時就仿照`filewrite`的實現,將資料寫回到對應的檔案當中去,這裡的實現是直接寫回所有頁面,但實際可以根據`PTE_D`來判斷記憶體是否被寫入,如果沒有寫入就不用寫回: ```c void writeback(struct vma* v, uint64 addr, int n) { if(!(v->permission & PTE_W) || (v->flags & MAP_PRIVATE)) // no need to writeback return; if((addr % PGSIZE) != 0) panic("unmap: not aligned"); printf("starting writeback: %p %d\n", addr, n); struct file* f = v->file; int max = ((MAXOPBLOCKS-1-1-2) / 2) * BSIZE; int i = 0; while(i < n){ int n1 = n - i; if(n1 > max) n1 = max; begin_op(); ilock(f->ip); printf("%p %d %d\n",addr + i, v->off + v->start - addr, n1); int r = writei(f->ip, 1, addr + i, v->off + v->start - addr + i, n1); iunlock(f->ip); end_op(); i += r; } } ``` 最後就是在`fork`當中複製`vma`到子程序,在`exit`中當前程序的`vma`連結串列釋放,在`exit`時要對頁面進行寫回: ```c int fork(void) { ... np->state = RUNNABLE; np->vma = 0; struct vma *pv = p->vma; struct vma *pre = 0; while(pv){ struct vma *vma = vma_alloc(); vma->start = pv->start; vma->end = pv->end; vma->off = pv->off; vma->length = pv->length; vma->permission = pv->permission; vma->flags = pv->flags; vma->file = pv->file; filedup(vma->file); vma->next = 0; if(pre == 0){ np->vma = vma; }else{ pre->next = vma; } pre = vma; release(&vma->lock); pv = pv->next; } ... } void exit(int status) { struct proc *p = myproc(); if(p == initproc) panic("init exiting"); // munmap all mmap vma struct vma* v = p->vma; struct vma* pv; while(v){ writeback(v, v->start, v->length); uvmunmap(p->pagetable, v->start, PGROUNDUP(v->length) / PGSIZE, 1); fileclose(v->file); pv = v->next; acquire(&v->lock); v->next = 0; v->length = 0; release(&v->lock); v = pv; } .