MIT 6.S081的Lab 5: Copy-on-Write Fork for xv6 题解
前置 
内核可以通过将 PTE 标记为无效或只读来截获内存引用,导致页面错误, 并且可以通过修改 PTE 来更改地址的含义。
为优化内存,比如父进程fork后复制可能需大量时间,但子进程不一定用,很可能浪费了,只有在父/子进程需要写内容时才需要各自一个单独的page,否则都可以共享。
COW fork() 只为子项创建一个分页表,为用户提供 PTE 指向父级物理页的内存,标记父子的PTE只读。因需要写page fault时,内核复制并分配新page,标记为可写。物理页面需要引用计数,没有引用时才可释放。
要求 
实现copy-on-write fork。
计划:
修改kernel/vm.c中的uvmcopy使其不分配新物理内存而是映射到原本的页表。 
修改kernel/trap.c中的usertrap使其识别page fault,用kalloc分配新内存,复制page,设PTE可写。 
在kalloc时更新引用计数, kfree() 将page放在free list中但只有引用计数为0才正式释放。可以将这些计数放在一个固定大小的数组里,找一个方法让page能找到数组下标,比如模数,give the array a number of elements equal to highest physical address of any page placed on the free list by kinit() in kalloc.c.(没看懂?) 
kernel/vm.c中的copyout也需要为引用计数修改。 
提示:
PTE最后两位RSW保留给supervisor software(内核),将bit8标识为当前是一个copy-on-write page。 
pagetable的很多define在 kernel/riscv.h 
COW的page fault,如果没有内存可分配,进程应该kill了。 
 
分析 
fork后不分配直接映射 
fork()调用uvmcopy复制page,在uvmcopy中首先删除父进程的可写位,并设置第8位(在riscv.h中define一个),之后分配内存的部分全部删掉,将映射mappages中的新内存mem改成已有的pa就行了。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 #define  PTE_C (1L << 7) int uvmcopy (pagetable_t  old, pagetable_t  new, uint64 sz) {   ...   for (i = 0 ; i < sz; i += PGSIZE){     ...     pa = PTE2PA(*pte);          *pte = *pte&(~PTE_W)|PTE_C;     flags = PTE_FLAGS(*pte);                         if (mappages(new, i, PGSIZE, pa, flags) != 0 ){              goto  err;     }          addref((void  *)pa);   }   ... } 
识别page fault 
在usertrap中增加page fault的处理,根据文档:
应当是在r_scause() == 15 时是因为page fault而trap的(其他的page fault不是复制这些操作),所以在这里复制原映射的page给当前进程,注意判断是否是COW的page fault,剩下就是普通的复制页面,可参考原本的uvmcopy:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 if (r_scause() == 15 ) {    uint64 va = PGROUNDDOWN(r_stval());     if (va >= MAXVA || va > p->sz)        goto  err;     pte_t  *pte;     pte = walk(p->pagetable, va, 0 );     if (pte && (*pte & PTE_C)) {       char  *mem;       if ((mem = kalloc()) == 0 )         goto  err;       memset (mem, 0 , PGSIZE);       uint64 pa = walkaddr(p->pagetable, va);       if (pa == 0 )          goto  err;       memmove(mem, (char *)pa, PGSIZE);       int  flags = PTE_FLAGS(*pte);              flags = (flags | PTE_W) & (~PTE_C);       if (mappages(p->pagetable, va, PGSIZE, (uint64)mem, flags) != 0 ){         printf ("usertrap: COW can't map new page" );         kfree(mem);         goto  err;       }       kfree((void * )pa);     } else  {       printf ("usertrap: not COW page fault" );       err:         p->killed = 1 ;         goto  end;     }   } ...   end:   if (p->killed)     exit (-1 ); ... 
引用计数的设计 
剩下的部分是关于引用计数的,设计要非常精巧(/(ㄒoㄒ)/~~)。
注意这里涉及到可能多个进程改一个引用计数,要加锁,至于数组的大小,从kinit找到了一些奇怪的define的数,点进去发现kernel/memlayout.h中有这样一段:
1 2 3 4 5 #define  KERNBASE 0x80000000L #define  PHYSTOP (KERNBASE + 128*1024*1024) 
所以页表的个数就是128*1024*1024/PGSIZE,index就是第几块,(位置-KERNBASE)/PGSIZE就可算:
1 2 3 4 5 6 7 8 9 10 11 12 struct  page_ref {  struct  spinlock  lock ;   int  num; }; #define  PGNUM 128*1024*1024/PGSIZE struct  page_ref  page_ref [PGNUM ];int getrefindex (void  *pa) {   return  ((uint64)pa-KERNBASE)/PGSIZE; } 
在freerange中一起初始化,注意kfree了所以计数先设为1:
1 2 3 4 5 6 7 8 9 10 11 void freerange (void  *pa_start, void  *pa_end) {   char  *p;   p = (char *)PGROUNDUP((uint64)pa_start);   for (; p + PGSIZE <= (char *)pa_end; p += PGSIZE) {     initlock(&(page_ref[getrefindex((void  *)p)].lock), "page_ref" );     page_ref[getrefindex(p)].num = 1 ;     kfree(p);   } } 
写一个加引用计数和设置引用计数为1的函数,不需要写减法因为减的时候可能涉及到释放内存,有并行的风险(先减再读,两步必须在一个锁里):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 void addref (void  *pa) {   uint64 index = getrefindex(pa);   acquire(&page_ref[index].lock);   page_ref[index].num++;   release(&page_ref[index].lock); } void setref (void  *pa) {   uint64 index = getrefindex(pa);   acquire(&page_ref[index].lock);   page_ref[index].num = 1 ;   release(&page_ref[index].lock); } 
先改kalloc,在分配过内存后调用一个上述updateref即可:
1 2 3 4 5 6 7 8 9 10 void  *kalloc (void ) {   ...   if (r) {     memset ((char *)r, 5 , PGSIZE);      setref((void *)r);   }   return  (void *)r; } 
再改kfree,计数减少前先检验是否已经为0(报panic),之后加锁,修改后检查计数值,如果为0则按原来代码释放内存,最后释放锁。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 void kfree (void  *pa) {   struct  run  *r ;   if (((uint64)pa % PGSIZE) != 0  || (char *)pa < end || (uint64)pa >= PHYSTOP)     panic("kfree" );      uint64 index = getrefindex(pa);   acquire(&page_ref[index].lock);   page_ref[index].num--;   if (page_ref[index].num == 0 ) {          memset (pa, 1 , PGSIZE);     r = (struct  run*)pa;     acquire(&kmem.lock);     r->next = kmem.freelist;     kmem.freelist = r;     release(&kmem.lock);   }   release(&page_ref[index].lock); } 
这里有个细节,是在读网上的博客看到的,如果引用计数的减少写在kfree里是最方便的,因为原本的代码中kfree包括了映射变更后的释放,如果引用计数减少写在每个映射改动的地方会很多很麻烦。
内核态的COW 
内核态复制页面COW在copyout中,与usertrap基本一致:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 int copyout (pagetable_t  pagetable, uint64 dstva, char  *src, uint64 len) {   uint64 n, va0, pa0;   pte_t  *pte;   while (len > 0 ){     va0 = PGROUNDDOWN(dstva);     if (va0 >= MAXVA)        return  -1 ;     pte = walk(pagetable, va0, 0 );     if (pte && (*pte & PTE_C)) {       char  *mem;       if ((mem = kalloc()) == 0 ) {         printf ("copyout: kalloc failed\n" );         return  -1 ;       }       memset (mem, 0 , sizeof (mem));       uint64 pa = walkaddr(pagetable, va0);       if (pa) {         memmove(mem, (char *)pa, PGSIZE);         int  flags = PTE_FLAGS(*pte);                  flags = (flags | PTE_W) & (~PTE_C);         if (mappages(pagetable, va0, PGSIZE, (uint64)mem, flags) != 0 ){           printf ("copyout: COW can't map new page" );           kfree(mem);           return  -1 ;         }         kfree((void *)pa);       }     }     pa0 = walkaddr(pagetable, va0);     if (pa0 == 0 )       return  -1 ;     n = PGSIZE - (dstva - va0);     if (n > len)       n = len;     memmove((void  *)(pa0 + (dstva - va0)), src, n);     len -= n;     src += n;     dstva = va0 + PGSIZE;   }   return  0 ; } 
函数定义补充 
使用的函数添加到kernel/def.h:
1 2 3 4 5 6 7 ... void             addref (void  *) ;... ... pte_t  *         walk (pagetable_t , uint64, int ) ;
重复映射判断的修改 
上面代码写完如果直接跑会报panic: mappages: remap,因为重复映射时原本会报错,但现在如果发现page有标记PTE_C就可以不报错,在mappages()中修改:
1 2 3 4 5 6 7 8 int mappages (pagetable_t  pagetable, uint64 va, uint64 size, uint64 pa, int  perm) {   ...     if ((*pte & PTE_C) == 0  && (*pte & PTE_V))       panic("mappages: remap" );   ... } 
实现 
以下文件都在kernel中,所以不写一级目录了。
def.h:
1 2 3 4 5 6 7 ... void             addref (void  *) ;... ... pte_t  *         walk (pagetable_t , uint64, int ) ;
kalloc.c
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 struct  page_ref {  struct  spinlock  lock ;   int  num; }; #define  PGNUM 128*1024*1024/PGSIZE struct  page_ref  page_ref [PGNUM ];uint64 getrefindex (void  *pa) {   return  ((uint64)pa-KERNBASE)/PGSIZE; } void addref (void  *pa) {   uint64 index = getrefindex(pa);   acquire(&page_ref[index].lock);   page_ref[index].num++;   release(&page_ref[index].lock); } void setref (void  *pa) {   uint64 index = getrefindex(pa);   acquire(&page_ref[index].lock);   page_ref[index].num = 1 ;   release(&page_ref[index].lock); } ... void freerange (void  *pa_start, void  *pa_end) {   char  *p;   p = (char *)PGROUNDUP((uint64)pa_start);   for (; p + PGSIZE <= (char *)pa_end; p += PGSIZE) {     initlock(&(page_ref[getrefindex((void  *)p)].lock), "page_ref" );     page_ref[getrefindex(p)].num = 1 ;     kfree(p);   } } ... void kfree (void  *pa) {   struct  run  *r ;   if (((uint64)pa % PGSIZE) != 0  || (char *)pa < end || (uint64)pa >= PHYSTOP)     panic("kfree" );      uint64 index = getrefindex(pa);   acquire(&page_ref[index].lock);   page_ref[index].num--;   if (page_ref[index].num == 0 ) {          memset (pa, 1 , PGSIZE);     r = (struct  run*)pa;     acquire(&kmem.lock);     r->next = kmem.freelist;     kmem.freelist = r;     release(&kmem.lock);   }   release(&page_ref[index].lock); } void  *kalloc (void ) {   struct  run  *r ;   acquire(&kmem.lock);   r = kmem.freelist;   if (r)     kmem.freelist = r->next;   release(&kmem.lock);   if (r) {     memset ((char *)r, 5 , PGSIZE);      setref((void *)r);   }   return  (void *)r; } 
riscv.h
trap.c
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 ... else  if (r_scause() == 15 ) {    uint64 va = PGROUNDDOWN(r_stval());     if (va >= MAXVA || va > p->sz)        goto  err;     pte_t  *pte;     pte = walk(p->pagetable, va, 0 );     if (pte && (*pte & PTE_C)) {       char  *mem;       if ((mem = kalloc()) == 0 )         goto  err;       uint64 pa = walkaddr(p->pagetable, va);       if (pa == 0 )          goto  err;       memmove(mem, (char *)pa, PGSIZE);       int  flags = PTE_FLAGS(*pte);              flags = (flags | PTE_W) & (~PTE_C);       if (mappages(p->pagetable, va, PGSIZE, (uint64)mem, flags) != 0 ){         printf ("usertrap: COW can't map new page" );         kfree(mem);         goto  err;       }       kfree((void * )pa);     } else  {       printf ("usertrap: not COW page fault" );       err:         p->killed = 1 ;         goto  end;     }   } else  if ((which_dev = devintr()) != 0 ){        } else  {     printf ("usertrap(): unexpected scause %p pid=%d\n" , r_scause(), p->pid);     printf ("            sepc=%p stval=%p\n" , r_sepc(), r_stval());     p->killed = 1 ;   }   end:   if (p->killed)     exit (-1 ); ... 
vm.c
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 int mappages (pagetable_t  pagetable, uint64 va, uint64 size, uint64 pa, int  perm) {   ...     if ((*pte & PTE_C) == 0  && (*pte & PTE_V))       panic("mappages: remap" );   ... } int uvmcopy (pagetable_t  old, pagetable_t  new, uint64 sz) {   pte_t  *pte;   uint64 pa, i;   uint flags;   for (i = 0 ; i < sz; i += PGSIZE){     if ((pte = walk(old, i, 0 )) == 0 )       panic("uvmcopy: pte should exist" );     if ((*pte & PTE_V) == 0 )       panic("uvmcopy: page not present" );     pa = PTE2PA(*pte);          *pte = (*pte & (~PTE_W)) | PTE_C;     flags = PTE_FLAGS(*pte);     if (mappages(new, i, PGSIZE, (uint64)pa, flags) != 0 ){       goto  err;     }     addref((void  *)pa);   }   return  0 ;  err:   uvmunmap(new, 0 , i / PGSIZE, 1 );   return  -1 ; } int copyout (pagetable_t  pagetable, uint64 dstva, char  *src, uint64 len) {   uint64 n, va0, pa0;   pte_t  *pte;   while (len > 0 ){     va0 = PGROUNDDOWN(dstva);     if (va0 >= MAXVA)        return  -1 ;     pte = walk(pagetable, va0, 0 );     if (pte && (*pte & PTE_C)) {       char  *mem;       if ((mem = kalloc()) == 0 ) {         printf ("copyout: kalloc failed\n" );         return  -1 ;       }       uint64 pa = walkaddr(pagetable, va0);       if (pa) {         memmove(mem, (char *)pa, PGSIZE);         int  flags = PTE_FLAGS(*pte);                  flags = (flags | PTE_W) & (~PTE_C);         if (mappages(pagetable, va0, PGSIZE, (uint64)mem, flags) != 0 ){           printf ("copyout: COW can't map new page" );           kfree(mem);           return  -1 ;         }         kfree((void *)pa);       }     }     pa0 = walkaddr(pagetable, va0);     if (pa0 == 0 )       return  -1 ;     n = PGSIZE - (dstva - va0);     if (n > len)       n = len;     memmove((void  *)(pa0 + (dstva - va0)), src, n);     len -= n;     src += n;     dstva = va0 + PGSIZE;   }   return  0 ; } 
测试 
记得加一个time.txt的文件记录时间。
1 sudo python3 grade-lab-cow