Linux 0.11 程序0、1、2
本部落格研究重點主要放在程序0、1、2的建立和fork、execve等系統呼叫,因此會省去很多內部細節,只在程式碼上加上必要註釋,這對我們理解重點沒有影響。文章全部參考《Linux核心設計的藝術(第二版)》。
程序0初始化
程序0是Linux作業系統中執行的第一個程序,也是Linux作業系統父子程序建立機制的第一個父程序。程序0的初始化主要包含以下三方面:
- 系統初始化程序0。程序管理結構task_struct的母本(init_task = {INIT_TASK})已經在程式碼設計階段事先準備好了。之後要對程序0的task_struct中的LDT、TSS與GDT掛接,並對GDT、task[64]以及與程序排程有關的暫存器進行初始化
- 設定時鐘中斷,支援多程序輪轉。
- 程序0要具備處理系統呼叫的能力,通過set_system_gate將system_call與IDT相掛接。
這三點的實現都是在sched_init()函式中實現的,具體程式碼如下:
//程式碼路徑:include/linux/head.h typedef struct desc_struct { unsigned long a,b; } desc_table[256]; //程式碼路徑:kernel/sched.c #define LATCH (1193180/HZ) union task_union { struct task_struct task; char stack[PAGE_SIZE]; }; static union task_union init_task = {INIT_TASK,}; struct task_struct *current = &(init_task.task); struct task_struct *last_task_used_math = NULL; //初始化程序槽task[NR_TASKS]的第一項為程序0,即task[0]為程序0佔用 struct task_struct * task[NR_TASKS] = {&(init_task.task), }; void sched_init(void) { int i; struct desc_struct * p; if (sizeof(struct sigaction) != 16) panic("Struct sigaction MUST be 16 bytes"); set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss)); set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt)); p = gdt+2+FIRST_TSS_ENTRY; for(i=1;i<NR_TASKS;i++) { task[i] = NULL; p->a=p->b=0; p++; p->a=p->b=0; p++; } /* Clear NT, so that we won't have troubles with that later on */ __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); ltr(0); lldt(0); outb_p(0x36,0x43); /* binary, mode 3, LSB/MSB, ch 0 */ outb_p(LATCH & 0xff , 0x40); /* LSB */ outb(LATCH >> 8 , 0x40); /* MSB */ set_intr_gate(0x20,&timer_interrupt); outb(inb_p(0x21)&~0x01,0x21); set_system_gate(0x80,&system_call); } //程式碼路徑:include/linux/sched.h struct tss_struct { long back_link; /* 16 high bits zero */ long esp0; long ss0; /* 16 high bits zero */ long esp1; long ss1; /* 16 high bits zero */ long esp2; long ss2; /* 16 high bits zero */ long cr3; long eip; long eflags; long eax,ecx,edx,ebx; long esp; long ebp; long esi; long edi; long es; /* 16 high bits zero */ long cs; /* 16 high bits zero */ long ss; /* 16 high bits zero */ long ds; /* 16 high bits zero */ long fs; /* 16 high bits zero */ long gs; /* 16 high bits zero */ long ldt; /* 16 high bits zero */ long trace_bitmap; /* bits: trace 0, bitmap 16-31 */ struct i387_struct i387; }; struct task_struct { /* these are hardcoded - don't touch */ long state; /* -1 unrunnable, 0 runnable, >0 stopped */ long counter; long priority; long signal; struct 
sigaction sigaction[32]; long blocked; /* bitmap of masked signals */ /* various fields */ int exit_code; unsigned long start_code,end_code,end_data,brk,start_stack; long pid,father,pgrp,session,leader; unsigned short uid,euid,suid; unsigned short gid,egid,sgid; long alarm; long utime,stime,cutime,cstime,start_time; unsigned short used_math; /* file system info */ int tty; /* -1 if no tty, so it must be signed */ unsigned short umask; struct m_inode * pwd; struct m_inode * root; struct m_inode * executable; unsigned long close_on_exec; struct file * filp[NR_OPEN]; /* ldt for this task 0 - zero 1 - cs 2 - ds&ss */ struct desc_struct ldt[3]; /* tss for this task */ struct tss_struct tss; }; /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x9ffff (=640kB) */ #define INIT_TASK \ /* state etc */ { 0,15,15, \ /* signals */ 0,{{},},0, \ /* ec,brk... */ 0,0,0,0,0,0, \ /* pid etc.. */ 0,-1,0,0,0, \ /* uid etc */ 0,0,0,0,0,0, \ /* alarm */ 0,0,0,0,0,0, \ /* math */ 0, \ /* fs info */ -1,0022,NULL,NULL,NULL,0, \ /* filp */ {NULL,}, \ { \ {0,0}, \ /* ldt */ {0x9f,0xc0fa00}, \ {0x9f,0xc0f200}, \ }, \ /*tss*/ {0,PAGE_SIZE+(long)&init_task,0x10,0,0,0,0,(long)&pg_dir,\ 0,0,0,0,0,0,0,0, \ 0,0,0x17,0x17,0x17,0x17,0x17,0x17, \ _LDT(0),0x80000000, \ {} \ }, \ } extern struct task_struct *task[NR_TASKS]; extern struct task_struct *last_task_used_math; extern struct task_struct *current; /* * Entry into gdt where to find first TSS. 0-nul, 1-cs, 2-ds, 3-syscall * 4-TSS0, 5-LDT0, 6-TSS1 etc ... */ #define FIRST_TSS_ENTRY 4 #define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1) #define _TSS(n) ((((unsigned long) n)<<4)+(FIRST_TSS_ENTRY<<3)) #define _LDT(n) ((((unsigned long) n)<<4)+(FIRST_LDT_ENTRY<<3)) #define ltr(n) __asm__("ltr %%ax"::"a" (_TSS(n))) #define lldt(n) __asm__("lldt %%ax"::"a" (_LDT(n)))
設定GDT中對應程序0的TSS、LDT描述符
#define FIRST_TSS_ENTRY 4
#define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)
set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));
設定GDT中和程序0相關的TSS和LDT描述符,程序0的tss內容和ldt內容均放在自己的task_struct中,因此向set_xxx_desc傳入描述符對應的偏移量和地址即可進行初始化。
p = gdt+2+FIRST_TSS_ENTRY;
for(i=1;i<NR_TASKS;i++) {
task[i] = NULL; //程序i的指標為null
p->a=p->b=0; //程序i的tss描述符
p++;
p->a=p->b=0; //程序i的ldt描述符
p++;
}
之後對gdt中的其他內容和task陣列的其他內容都清零。
#define FIRST_TSS_ENTRY 4
#define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)
#define _TSS(n) ((((unsigned long) n)<<4)+(FIRST_TSS_ENTRY<<3))
#define _LDT(n) ((((unsigned long) n)<<4)+(FIRST_LDT_ENTRY<<3))
#define ltr(n) __asm__("ltr %%ax"::"a" (_TSS(n)))
#define lldt(n) __asm__("lldt %%ax"::"a" (_LDT(n)))
ltr(0);
lldt(0);
設定tr暫存器和ldt暫存器,指向程序0的tss描述符和ldt描述符。
設定時鐘中斷
outb_p(0x36,0x43); /* binary, mode 3, LSB/MSB, ch 0 */
outb_p(LATCH & 0xff , 0x40); /* LSB */
outb(LATCH >> 8 , 0x40); /* MSB */
set_intr_gate(0x20,&timer_interrupt);
outb(inb_p(0x21)&~0x01,0x21); //開啟時鐘中斷
時鐘中斷處理程式timer_interrupt的程式碼程序排程時再進行介紹,先略過。
設定系統呼叫總入口
#define set_system_gate(n,addr) \
_set_gate(&idt[n],15,3,addr) //陷阱門
set_system_gate(0x80,&system_call);
將系統呼叫處理函式system_call與int0x80中斷描述符表掛接。system_call是整個作業系統中系統呼叫軟中斷的總入口。所有使用者程式使用系統呼叫,產生int 0x80軟中斷後,作業系統都是通過這個總入口找到具體的系統呼叫函式。
程序0由0特權級反轉到3特權級,成為真正的程序
Linux作業系統規定,除程序0之外,所有程序都要由一個已有程序在3特權級下建立。
在Linux0.11中,程序0的程式碼和資料都是由作業系統的設計者寫在核心程式碼、資料區,並且,此前處在0特權級,嚴格說還不是真正意義上的程序。為了遵守規則,在程序0正式建立程序1之前,要將程序0由0特權級轉變為3特權級。方法是呼叫move_to_user_mode()函式,模仿中斷返回動作,實現程序0的特權級從0轉變為3。
//程式碼路徑:init/main.c
void main()
{
...
move_to_user_mode();
...
}
//程式碼路徑:include/system.h
/*
 * Switch from privilege level 0 to level 3 by faking an interrupt return:
 * push the frame that iret pops (ss, esp, eflags, cs, eip), execute iret,
 * then reload ds/es/fs/gs with selector 0x17 at label 1.
 * Annotations are placed before the trailing backslash so that every line
 * of the macro still ends in a line continuation.
 */
#define move_to_user_mode() \
__asm__ ("movl %%esp,%%eax\n\t" /* eax = current stack pointer */ \
"pushl $0x17\n\t" /* ss: 0x17 = 0b10111 -> index 2 (third entry), TI=1 (LDT), RPL=3 */ \
"pushl %%eax\n\t" /* esp */ \
"pushfl\n\t" /* eflags */ \
"pushl $0x0f\n\t" /* cs: 0x0f = 0b1111 -> index 1 (second entry), TI=1 (LDT), RPL=3 */ \
"pushl $1f\n\t" /* eip: address of label 1 below */ \
"iret\n" \
"1:\tmovl $0x17,%%eax\n\t" \
"movw %%ax,%%ds\n\t" \
"movw %%ax,%%es\n\t" \
"movw %%ax,%%fs\n\t" \
"movw %%ax,%%gs" \
:::"ax")
手工模擬特權級從0到3的棧的內容,使用iret反轉到3特權級,執行3特權級的程式碼
此時我們再來看一下程序0的LDT內容
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x9ffff (=640kB)
*/
#define INIT_TASK \
/* state etc */ { 0,15,15, \
/* signals */ 0,{{},},0, \
/* ec,brk... */ 0,0,0,0,0,0, \
/* pid etc.. */ 0,-1,0,0,0, \
/* uid etc */ 0,0,0,0,0,0, \
/* alarm */ 0,0,0,0,0,0, \
/* math */ 0, \
/* fs info */ -1,0022,NULL,NULL,NULL,0, \
/* filp */ {NULL,}, \
{ \
{0,0}, \
/* ldt */ {0x9f,0xc0fa00}, /* code segment: base=0, G=1, DPL=3, execute/read */ \
{0x9f,0xc0f200}, /* data segment: base=0, G=1, DPL=3, read/write */ \
}, \
/*tss*/ {0,PAGE_SIZE+(long)&init_task,0x10,0,0,0,0,(long)&pg_dir,\
0,0,0,0,0,0,0,0, \
0,0,0x17,0x17,0x17,0x17,0x17,0x17, \
_LDT(0),0x80000000, \
{} \
}, \
}
可以看到此時程序0的程式碼段和資料段基址都是0,界限為640KB。ss0為0x10,指向GDT中核心資料段,esp0為init_task頁面最頂端。
程序1的建立
程序0現在處在3特權級狀態,即程序狀態。正式開始執行要做的第一件事就是作為父程序呼叫fork函式建立第一個子程序—程序 1,這是父子程序建立機制的第一次實際運用。以後,所有程序都是基於父子程序建立機制由父程序創建出來的。
//程式碼路徑:init/main.c
static inline _syscall0(int, fork)
void main(void)
{
move_to_user_mode();
if(!fork()){
init();
}
/*
* NOTE!! For any other task 'pause()' would mean we have to get a
* signal to awaken, but task0 is the sole exception (see 'schedule()')
* as task 0 gets activated at every idle moment (when no other tasks
* can run). For task0 'pause()' just means we go check if some other
* task can run, and if not we return here.
*/
for(;;) pause();
}
//程式碼路徑:include/unistd.h
#define __NR_setup 0 /* used only by init, to get system going */
#define __NR_exit 1
#define __NR_fork 2
/*
 * Generate a zero-argument system call stub called `name` returning `type`.
 * The stub loads __NR_<name> into eax, executes int $0x80 and reads the
 * result back from eax. A negative result is negated into errno and the
 * stub returns -1; otherwise the result is returned as `type`.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
: "=a" (__res) \
: "0" (__NR_##name)); /* eax = call number; for fork it is 2, the index of sys_fork in sys_call_table */ \
if (__res >= 0) \
return (type) __res; \
errno = -__res; \
return -1; \
}
extern int errno;
//程式碼路徑:include/linux/sys.h
extern int sys_setup();
extern int sys_exit();
extern int sys_fork();
fn_ptr sys_call_table[] = { sys_setup, sys_exit, sys_fork, sys_read,
sys_write, sys_open, sys_close, sys_waitpid, sys_creat, sys_link,
sys_unlink, sys_execve, sys_chdir, sys_time, sys_mknod, sys_chmod,
sys_chown, sys_break, sys_stat, sys_lseek, sys_getpid, sys_mount,
sys_umount, sys_setuid, sys_getuid, sys_stime, sys_ptrace, sys_alarm,
sys_fstat, sys_pause, sys_utime, sys_stty, sys_gtty, sys_access,
sys_nice, sys_ftime, sys_sync, sys_kill, sys_rename, sys_mkdir,
sys_rmdir, sys_dup, sys_pipe, sys_times, sys_prof, sys_brk, sys_setgid,
sys_getgid, sys_signal, sys_geteuid, sys_getegid, sys_acct, sys_phys,
sys_lock, sys_ioctl, sys_fcntl, sys_mpx, sys_setpgid, sys_ulimit,
sys_uname, sys_umask, sys_chroot, sys_ustat, sys_dup2, sys_getppid,
sys_getpgrp, sys_setsid, sys_sigaction, sys_sgetmask, sys_ssetmask,
sys_setreuid,sys_setregid };
執行int 0x80程式後,產生一個軟中斷,CPU從特權級3的程序0程式碼跳轉到0特權級的核心程式碼中執行,中斷將SS、ESP、EFLAGS、CS和EIP這5個暫存器自動壓棧,壓入init_task中的程序0核心棧。
# 程式碼路徑:kernel/system_call.s
nr_system_calls = 72
.align 2
bad_sys_call:
movl $-1,%eax
iret
.align 2
reschedule:
pushl $ret_from_sys_call
jmp _schedule
.align 2
# Common entry point for int 0x80. eax carries the system call number and
# ebx/ecx/edx carry up to three arguments. The handler's return value is
# pushed, the scheduler is entered if the current task is no longer runnable
# or has used up its counter, and pending signals are checked before iret.
_system_call:
cmpl $nr_system_calls-1,%eax # eax holds the system call number (not a process number)
ja bad_sys_call # out of range: fail the call
push %ds # save the caller's segment registers
push %es
push %fs
pushl %edx
pushl %ecx # push %ebx,%ecx,%edx as parameters
pushl %ebx # to the system call
movl $0x10,%edx # set up ds,es to kernel space
mov %dx,%ds
mov %dx,%es
movl $0x17,%edx # fs points to local data space
mov %dx,%fs
call _sys_call_table(,%eax,4) # dispatch through sys_call_table[eax] (4-byte entries)
pushl %eax # save the handler's return value
movl _current,%eax
cmpl $0,state(%eax) # state
jne reschedule
cmpl $0,counter(%eax) # counter
je reschedule
ret_from_sys_call:
movl _current,%eax # task[0] cannot have signals
cmpl _task,%eax
je 3f
cmpw $0x0f,CS(%esp) # was old code segment supervisor ?
jne 3f
cmpw $0x17,OLDSS(%esp) # was stack segment = 0x17 ?
jne 3f
movl signal(%eax),%ebx
movl blocked(%eax),%ecx
notl %ecx
andl %ebx,%ecx # ecx = pending signals that are not blocked
bsfl %ecx,%ecx # index of the lowest such signal; ZF set if there is none
je 3f
btrl %ecx,%ebx # clear that signal bit
movl %ebx,signal(%eax)
incl %ecx # bit index -> signal number (1-based)
pushl %ecx
call _do_signal
popl %eax # discard the signal number pushed for do_signal
3: popl %eax # restore the saved return value into eax
popl %ebx
popl %ecx
popl %edx
pop %fs
pop %es
pop %ds
iret
接下來到了我們的重點fork函式,程式碼如下
.align 2
# fork system call: obtain a free task slot, then let copy_process build the
# child from the register values currently on the kernel stack.
_sys_fork:
call _find_empty_process
testl %eax,%eax
js 1f # negative result: return it unchanged
push %gs
pushl %esi
pushl %edi
pushl %ebp
pushl %eax # slot number, becomes copy_process's first argument (nr)
call _copy_process
addl $20,%esp # drop the five values pushed above
1: ret
-
首先在task[64]中為程序1申請一個空閒位置並獲取程序號
呼叫find_empty_process()函式為程序1獲得一個可用的程序號和task[64]中的一個位置,將該位置放入eax暫存器中返回。
在該函式中,核心用全域性變數last_pid來存放系統自開機以來累計的程序數,也將此變數用作新建程序的程序號(task_struct中的pid變數)。因為Linux 0.11的task[64]只有64項,最多隻能同時執行64個程序,如果find_empty_process()函式返回-EAGAIN,意味著當前已經有64個程序在執行。
-
繼續壓入暫存器
-
呼叫copy_process函式
copy_process是我們關注的重點,接下來好好研究一下。
copy_process
程序0已經成為一個可以建立子程序的父程序,在核心中有“程序0的task_struct”和“程序0的頁表項”等專屬程序0的管理資訊。程序0將在copy_process()函式中做非常重要的、體現父子程序建立機制的工作:
- 為程序1建立task_struct,將程序0的task_struct的內容複製給程序1
- 為程序1的task_struct、tss以及LDT中的內容做個性化設定
- 為程序1建立第一個頁表,將程序0的頁表項內容賦給這個頁表
- 程序1共享程序0的檔案
- 設定程序1的GDT項
- 最後將程序1設定為就緒態,使其可以參與程序間的輪轉。
//程式碼路徑:kernel/fork.c
/*
* Ok, this is the main fork-routine. It copies the system process
* information (task[nr]) and sets up the necessary registers. It
* also copies the data segment in it's entirety.
*/
/*
 * Build the task structure for slot `nr` as a copy of the current task and
 * then customise it.
 *
 * The parameters are not supplied by a C caller; they are the values lying
 * on the kernel stack when _sys_fork executes `call _copy_process`:
 *   nr, ebp, edi, esi, gs     - pushed by _sys_fork (nr is the eax returned
 *                               by find_empty_process)
 *   none                      - return address of the call into _sys_fork
 *   ebx, ecx, edx, fs, es, ds - pushed by _system_call
 *   eip, cs, eflags, esp, ss  - pushed by the CPU on int 0x80
 *
 * Returns last_pid (the child's pid) to the parent, or -EAGAIN when no page
 * could be obtained or copy_mem() failed.
 */
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
long ebx,long ecx,long edx,
long fs,long es,long ds,
long eip,long cs,long eflags,long esp,long ss)
{
struct task_struct *p;
int i;
struct file *f;
/* The child's task_struct lives at the bottom of a freshly obtained page. */
p = (struct task_struct *) get_free_page();
if (!p)
return -EAGAIN;
task[nr] = p;
*p = *current; /* NOTE! this doesn't copy the supervisor stack */
/* Not TASK_RUNNING yet, so schedule() will not pick the half-built child. */
p->state = TASK_UNINTERRUPTIBLE;
p->pid = last_pid;
p->father = current->pid;
p->counter = p->priority;
p->signal = 0;
p->alarm = 0;
p->leader = 0; /* process leadership doesn't inherit */
p->utime = p->stime = 0;
p->cutime = p->cstime = 0;
p->start_time = jiffies;
p->tss.back_link = 0;
p->tss.esp0 = PAGE_SIZE + (long) p; /* kernel stack pointer: top of the child's own page */
p->tss.ss0 = 0x10;
p->tss.eip = eip; /* resume at the instruction following int 0x80 */
p->tss.eflags = eflags;
p->tss.eax = 0; /* the child sees fork() return 0 */
p->tss.ecx = ecx;
p->tss.edx = edx;
p->tss.ebx = ebx;
p->tss.esp = esp;
p->tss.ebp = ebp;
p->tss.esi = esi;
p->tss.edi = edi;
p->tss.es = es & 0xffff; /* es arrives as a long; keep only the low 16 bits */
p->tss.cs = cs & 0xffff;
p->tss.ss = ss & 0xffff;
p->tss.ds = ds & 0xffff;
p->tss.fs = fs & 0xffff;
p->tss.gs = gs & 0xffff;
p->tss.ldt = _LDT(nr); /* selector of this task's LDT descriptor in the GDT */
p->tss.trace_bitmap = 0x80000000;
if (last_task_used_math == current)
__asm__("clts ; fnsave %0"::"m" (p->tss.i387));
if (copy_mem(nr,p)) { /* set the new segment bases in the child's LDT and copy the page tables */
task[nr] = NULL;
free_page((long) p);
return -EAGAIN;
}
for (i=0; i<NR_OPEN;i++) /* the child shares the parent's open files: bump each reference count */
if (f=p->filp[i])
f->f_count++;
if (current->pwd)
current->pwd->i_count++;
if (current->root)
current->root->i_count++;
if (current->executable)
current->executable->i_count++;
set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss)); /* install the child's TSS and LDT descriptors in the GDT */
set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
p->state = TASK_RUNNING; /* do this last, just in case */
return last_pid; /* the parent's return value is the child's pid */
}
其中比較重要的copy_mem函式,該函式改變了LDT中程式碼段和資料段的基址,將程序0的頁表拷貝給程序1。
//程式碼路徑:include/linux/sched.h
#define _set_base(addr,base) \
__asm__("movw %%dx,%0\n\t" \
"rorl $16,%%edx\n\t" \
"movb %%dl,%1\n\t" \
"movb %%dh,%2" \
::"m" (*((addr)+2)), \
"m" (*((addr)+4)), \
"m" (*((addr)+7)), \
"d" (base) \
:"dx")
#define _set_limit(addr,limit) \
__asm__("movw %%dx,%0\n\t" \
"rorl $16,%%edx\n\t" \
"movb %1,%%dh\n\t" \
"andb $0xf0,%%dh\n\t" \
"orb %%dh,%%dl\n\t" \
"movb %%dl,%1" \
::"m" (*(addr)), \
"m" (*((addr)+6)), \
"d" (limit) \
:"dx")
#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , base )
#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , (limit-1)>>12 )
#define _get_base(addr) ({\
unsigned long __base; \
__asm__("movb %3,%%dh\n\t" \
"movb %2,%%dl\n\t" \
"shll $16,%%edx\n\t" \
"movw %1,%%dx" \
:"=d" (__base) \
:"m" (*((addr)+2)), \
"m" (*((addr)+4)), \
"m" (*((addr)+7))); \
__base;})
#define get_base(ldt) _get_base( ((char *)&(ldt)) )
#define get_limit(segment) ({ \
unsigned long __limit; \
__asm__("lsll %1,%0\n\t incl %0":"=r" (__limit):"r" (segment)); \
__limit;})
//程式碼路徑:kernel/fork.c
/*
 * Give task `nr` its own linear address range and copy the parent's page
 * tables into it.
 *
 * Reads the current code/data limits and bases, panics if code and data
 * bases differ or the data limit is smaller than the code limit, places
 * the new task at nr * 0x4000000 (64MB per task), rewrites the bases in
 * p->ldt[1] and p->ldt[2], and calls copy_page_tables().
 *
 * Returns 0 on success, -ENOMEM if copy_page_tables() fails (after freeing
 * whatever was set up for the new range).
 */
int copy_mem(int nr,struct task_struct * p)
{
unsigned long old_data_base,new_data_base,data_limit;
unsigned long old_code_base,new_code_base,code_limit;
code_limit=get_limit(0x0f);
data_limit=get_limit(0x17);
old_code_base = get_base(current->ldt[1]);
old_data_base = get_base(current->ldt[2]);
if (old_data_base != old_code_base)
panic("We don't support separate I&D");
if (data_limit < code_limit)
panic("Bad data_limit");
new_data_base = new_code_base = nr * 0x4000000;
p->start_code = new_code_base;
set_base(p->ldt[1],new_code_base);
set_base(p->ldt[2],new_data_base);
if (copy_page_tables(old_data_base,new_data_base,data_limit)) { /* first fork: old_data_base is 0, new_data_base is nr * 0x4000000, data_limit is 640KB */
free_page_tables(new_data_base,data_limit);
return -ENOMEM;
}
return 0;
}
該函式更改程序1的程式碼段和資料段基址後呼叫copy_page_tables設定頁目錄項和複製頁表。
進入copy_page_tables()函式後,先為新的頁表申請一個空閒頁面,並把程序0中第一個頁表裡面前160個頁表項複製到這個頁面中(1個頁表項控制一個頁面4KB記憶體空間,160個頁表項可以控制640KB記憶體空間)。程序0和程序1的頁表暫時都指向了相同的頁面,意味著程序1也可以操作程序0的頁面。之後對程序1的頁目錄表進行設定。最後,用重置CR3的方法重新整理頁變換快取記憶體。程序1的頁表和頁目錄表設定完畢。
/*
* Well, here is one of the most complicated functions in mm. It
* copies a range of linerar addresses by copying only the pages.
* Let's hope this is bug-free, 'cause this one I don't want to debug :-)
*
* Note! We don't copy just any chunks of memory - addresses have to
* be divisible by 4Mb (one page-directory entry), as this makes the
* function easier. It's used only by fork anyway.
*
* NOTE 2!! When from==0 we are copying kernel space for the first
* fork(). Then we DONT want to copy a full page-directory entry, as
* that would lead to some serious memory waste - we just copy the
* first 160 pages - 640kB. Even that is more than we need, but it
* doesn't take any more memory - we don't copy-on-write in the low
* 1 Mb-range, so the pages can be shared with the kernel. Thus the
* special case for nr=xxxx.
*/
/*
 * Copy the page tables covering `size` bytes starting at linear address
 * `from` to linear address `to`. Both addresses must be 4MB aligned.
 * For every present source directory entry a new page table is allocated;
 * the copied entries are made read-only, and for pages above LOW_MEM the
 * source entry is made read-only as well and mem_map is incremented.
 *
 * Returns 0 on success, -1 if a page table could not be allocated.
 */
int copy_page_tables(unsigned long from,unsigned long to,long size)
{
unsigned long * from_page_table;
unsigned long * to_page_table;
unsigned long this_page;
unsigned long * from_dir, * to_dir;
unsigned long nr;
if ((from&0x3fffff) || (to&0x3fffff))
panic("copy_page_tables called with wrong alignment");
from_dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */
to_dir = (unsigned long *) ((to>>20) & 0xffc);
size = ((unsigned) (size+0x3fffff)) >> 22; /* number of directory entries to copy (size rounded up to 4MB units); 1 for a 640KB limit */
for( ; size-->0 ; from_dir++,to_dir++) {
if (1 & *to_dir)
panic("copy_page_tables: already exist");
if (!(1 & *from_dir)) /* bit 0 (P, present) clear: nothing to copy for this entry */
continue;
from_page_table = (unsigned long *) (0xfffff000 & *from_dir);
if (!(to_page_table = (unsigned long *) get_free_page())) /* allocate a page for the new page table */
return -1; /* Out of memory, see freeing */
*to_dir = ((unsigned long) to_page_table) | 7; /* user, writable, present */
nr = (from==0)?0xA0:1024; /* from==0: copy only the first 160 entries; otherwise all 1024 */
for ( ; nr-- > 0 ; from_page_table++,to_page_table++) {
this_page = *from_page_table;
if (!(1 & this_page))
continue;
this_page &= ~2; /* clear the R/W bit: the copy is read-only */
*to_page_table = this_page;
if (this_page > LOW_MEM) { /* pages at or below LOW_MEM are not reference-counted */
*from_page_table = this_page;
this_page -= LOW_MEM;
this_page >>= 12;
mem_map[this_page]++;
}
}
}
invalidate(); /* per the article: reloads CR3 to flush the translation cache */
return 0;
}
執行結束後,程序1和程序0管理頁面完全一致,因為頁表項是完全拷貝過來的,所以他們共享頁面。效果如下:
copy_process結束時,程序1的建立工作完成,程序1已經具備了程序0的全部能力,可以在主機中正常地執行。返回sys_fork()中call _copy_process的下一行執行,執行程式碼如下:
.align 2
# fork system call: obtain a free task slot, then let copy_process build the
# child from the register values currently on the kernel stack.
_sys_fork:
call _find_empty_process
testl %eax,%eax
js 1f # negative result: return it unchanged
push %gs
pushl %esi
pushl %edi
pushl %ebp
pushl %eax # slot number, becomes copy_process's first argument (nr)
call _copy_process # returns last_pid in eax, which is passed back up the call chain
addl $20,%esp # drop the five values pushed above (gs, esi, edi, ebp, eax)
1: ret
.align 2
# Return path of the fork system call, seen from process 0: the value
# returned by sys_fork is saved on the stack, the scheduling checks run,
# and since the current task is task[0] the signal code is skipped.
_system_call:
cmpl $nr_system_calls-1,%eax
ja bad_sys_call
push %ds
push %es
push %fs
pushl %edx
pushl %ecx # push %ebx,%ecx,%edx as parameters
pushl %ebx # to the system call
movl $0x10,%edx # set up ds,es to kernel space
mov %dx,%ds
mov %dx,%es
movl $0x17,%edx # fs points to local data space
mov %dx,%fs
call _sys_call_table(,%eax,4)
pushl %eax # save the handler's return value (for fork: the new pid)
movl _current,%eax # check the current task's state and counter
cmpl $0,state(%eax) # state
jne reschedule
cmpl $0,counter(%eax) # counter
je reschedule
ret_from_sys_call:
movl _current,%eax # task[0] cannot have signals
cmpl _task,%eax # if the current task is task[0], jump to label 3 below
je 3f
cmpw $0x0f,CS(%esp) # was old code segment supervisor ?
jne 3f
cmpw $0x17,OLDSS(%esp) # was stack segment = 0x17 ?
jne 3f
movl signal(%eax),%ebx
movl blocked(%eax),%ecx
notl %ecx
andl %ebx,%ecx
bsfl %ecx,%ecx
je 3f
btrl %ecx,%ebx
movl %ebx,signal(%eax)
incl %ecx
pushl %ecx
call _do_signal
popl %eax # discard the signal number pushed for do_signal
3: popl %eax # restore the saved return value into eax (here: the child's pid)
popl %ebx
popl %ecx
popl %edx
pop %fs
pop %es
pop %ds
iret
由於當前程序是程序0,所以就跳到標號3處,將壓棧的各個暫存器數值還原。之後iret中斷返回,CPU硬體自動將int 0x80的中斷時壓入的ss、esp、eflags、cs和eip值按壓棧的反序出棧給CPU對應暫存器,從0特權級的核心程式碼切換到3特權級的程序0程式碼執行,CS:EIP指向fork()中int 0x80的下一行if (__res >= 0)。
/*
 * Zero-argument system call stub generator: loads __NR_<name> into eax,
 * executes int $0x80 and reads the result back from eax. A negative result
 * is negated into errno and the stub returns -1.
 * The walkthrough note is written as a comment before the backslash so that
 * the macro's line continuation stays intact.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
: "=a" (__res) \
: "0" (__NR_##name)); \
if (__res >= 0) /* execution resumes here after iret; __res receives the value left in eax */ \
return (type) __res; \
errno = -__res; \
return -1; \
}
return (type) __res將程序號1返回,回到呼叫點if(!fork())處執行,!1為假,這樣就不會執行到init()函式中,而是程序0繼續執行,直到pause()函式。
void main(void)
{
move_to_user_mode();
if(!fork()) {
init();
}
for(; ;) pause(); //執行
}
進入pause函式後,最終會對映到sys_pause函式,該函式將程序置為可中斷等待狀態,同時呼叫schedule函式。
int sys_pause(void)
{
current->state = TASK_INTERRUPTIBLE;
schedule();
return 0;
}
schedule程式碼如下:
/*
* 'schedule()' is the scheduler function. This is GOOD CODE! There
* probably won't be any reason to change this, as it should work well
* in all circumstances (ie gives IO-bound processes good response etc).
* The one thing you might take a look at is the signal-handler code here.
*
* NOTE!! Task 0 is the 'idle' task, which gets called when no other
* tasks can run. It can not be killed, and it cannot sleep. The 'state'
* information in task[0] is never used.
*/
void schedule(void)
{
int i,next,c;
struct task_struct ** p;
/* check alarm, wake up any interruptible tasks that have got a signal */
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
if (*p) {
if ((*p)->alarm && (*p)->alarm < jiffies) {
(*p)->signal |= (1<<(SIGALRM-1));
(*p)->alarm = 0;
}
if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
(*p)->state==TASK_INTERRUPTIBLE)
(*p)->state=TASK_RUNNING;
}
/* this is the scheduler proper: */
while (1) {
c = -1;
next = 0;
i = NR_TASKS;
p = &task[NR_TASKS];
while (--i) {
if (!*--p)
continue;
if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
c = (*p)->counter, next = i;
}
if (c) break;
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
if (*p)
(*p)->counter = ((*p)->counter >> 1) +
(*p)->priority;
}
switch_to(next);
}
/*
* switch_to(n) should switch tasks to task nr n, first
* checking that n isn't the current task, in which case it does nothing.
* This also clears the TS-flag if the task we switched to has used
* tha math co-processor latest.
*/
/*
 * Annotations are placed before the trailing backslash so that every line
 * of the macro still ends in a line continuation.
 */
#define switch_to(n) {\
struct {long a,b;} __tmp; \
__asm__("cmpl %%ecx,_current\n\t" \
"je 1f\n\t" /* task n is already the current task: nothing to switch */ \
"movw %%dx,%1\n\t" /* store the TSS selector of task n in __tmp.b */ \
"xchgl %%ecx,_current\n\t" /* current = task[n]; ecx = previous task */ \
"ljmp %0\n\t" /* far jump through the TSS selector: the article notes the outgoing task's registers are saved in its TSS here */ \
"cmpl %%ecx,_last_task_used_math\n\t" \
"jne 1f\n\t" \
"clts\n" \
"1:" \
::"m" (*&__tmp.a),"m" (*&__tmp.b), \
"d" (_TSS(n)),"c" ((long) task[n])); \
}
ljmp跳轉到程序1執行,將CPU的各個暫存器值儲存在程序0的TSS中,將程序1的TSS資料以及LDT的程式碼段、資料段描述符資料恢復給CPU的各個暫存器,實現從0特權級的核心程式碼切換到3特權級的程序1程式碼執行。
程序1執行
當時為程序1的TSS中eax設定為0,eip設定為int 0x80下一指令的地址,即if (__res >= 0)。程序1開始從這一行執行。
/*
 * Zero-argument system call stub generator: loads __NR_<name> into eax,
 * executes int $0x80 and reads the result back from eax. A negative result
 * is negated into errno and the stub returns -1.
 * The walkthrough note is written as a comment before the backslash so that
 * the macro's line continuation stays intact.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
: "=a" (__res) \
: "0" (__NR_##name)); \
if (__res >= 0) /* process 1 starts executing here; copy_process set its tss.eax to 0, so __res is 0 */ \
return (type) __res; \
errno = -__res; \
return -1; \
}
返回後,執行到main()函式中if(!fork())這一行,!0為“真”,呼叫init()函式!
void init(void)
{
int pid,i;
setup((void *) &drive_info);
(void) open("/dev/tty0",O_RDWR,0);
(void) dup(0);
(void) dup(0);
printf("%d buffers = %d bytes buffer space\n\r",NR_BUFFERS,
NR_BUFFERS*BLOCK_SIZE);
printf("Free mem: %d bytes\n\r",memory_end-main_memory_start);
if (!(pid=fork())) {
close(0);
if (open("/etc/rc",O_RDONLY,0))
_exit(1);
execve("/bin/sh",argv_rc,envp_rc);
_exit(2);
}
if (pid>0)
while (pid != wait(&i))
/* nothing */;
while (1) {
if ((pid=fork())<0) {
printf("Fork failed in init\r\n");
continue;
}
if (!pid) {
close(0);close(1);close(2);
setsid();
(void) open("/dev/tty0",O_RDWR,0);
(void) dup(0);
(void) dup(0);
_exit(execve("/bin/sh",argv,envp));
}
while (1)
if (pid == wait(&i))
break;
printf("\n\rchild %d died with code %04x\n\r",pid,i);
sync();
}
_exit(0); /* NOTE! _exit, not exit() */
}
執行setup函式對根檔案系統初始化後,程序1又打開了/dev/tty0字元檔案,之後呼叫fork函式建立程序2。
fork對映到sys_fork,執行過程和之前是一樣的,會為程序2的task_struct以及核心棧申請頁面,並複製task_struct,隨後對程序2的task_struct進行各種個性化設定,包括各個暫存器的設定、記憶體頁面的管理設定、共享檔案的設定、GDT表項的設定等。不太一樣的是程序2複製了程序1的1024個頁表項。
程序2 shell
程序2建立完畢後,fork()函式返回,返回值為2,因此!(pid=fork())為假,於是呼叫wait()函式。此函式的功能是:如果程序1有等待退出的子程序,就為該程序的退出做善後工作;如果有子程序,但並不等待退出,則進行程序切換;如果沒有子程序,函式返回。
//程式碼路徑:/lib/wait.c
_syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options)
pid_t wait(int * wait_stat)
{
return waitpid(-1,wait_stat,0);
}
//程式碼路徑:kernel/exit.c
int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options)
{
int flag, code;
struct task_struct ** p;
verify_area(stat_addr,4);
repeat:
flag=0;
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) {
if (!*p || *p == current)
continue;
if ((*p)->father != current->pid)
continue;
if (pid>0) { //wait時傳入pid為-1
if ((*p)->pid != pid)
continue;
} else if (!pid) {
if ((*p)->pgrp != current->pgrp)
continue;
} else if (pid != -1) {
if ((*p)->pgrp != -pid)
continue;
}
switch ((*p)->state) {
case TASK_STOPPED:
if (!(options & WUNTRACED))
continue;
put_fs_long(0x7f,stat_addr);
return (*p)->pid;
case TASK_ZOMBIE:
current->cutime += (*p)->utime;
current->cstime += (*p)->stime;
flag = (*p)->pid;
code = (*p)->exit_code;
release(*p);
put_fs_long(code,stat_addr);
return flag;
default: //此時程序2為就緒態,執行到這裡
flag=1;
continue;
}
}
if (flag) {
if (options & WNOHANG)
return 0;
current->state=TASK_INTERRUPTIBLE; //將程序1設定為可中斷等待狀態
schedule(); //執行程序2
if (!(current->signal &= ~(1<<(SIGCHLD-1))))
goto repeat;
else
return -EINTR;
}
return -ECHILD;
}
輪轉到程序2執行,程序2關閉檔案後開啟檔案/etc/rc
if (!(pid=fork())) {
close(0);
if (open("/etc/rc",O_RDONLY,0))
_exit(1);
execve("/bin/sh",argv_rc,envp_rc);
_exit(2);
}
之後執行execve函式,進入核心對應_sys_execve函式
_sys_execve:
lea EIP(%esp),%eax
pushl %eax
call _do_execve
addl $4,%esp
ret
#define MAX_ARG_PAGES 32
/*
* 'do_execve()' executes a new program.
*/
int do_execve(unsigned long * eip,long tmp,char * filename,
char ** argv, char ** envp)
{
struct m_inode * inode;
struct buffer_head * bh;
struct exec ex;
unsigned long page[MAX_ARG_PAGES];
int i,argc,envc;
int e_uid, e_gid;
int retval;
int sh_bang = 0;
unsigned long p=PAGE_SIZE*MAX_ARG_PAGES-4;
if ((0xffff & eip[1]) != 0x000f)
panic("execve called from supervisor mode");
for (i=0 ; i<MAX_ARG_PAGES ; i++) /* clear page-table */
page[i]=0;
if (!(inode=namei(filename))) /* get executables inode */
return -ENOENT;
argc = count(argv);
envc = count(envp);
restart_interp:
if (!S_ISREG(inode->i_mode)) { /* must be regular file */
retval = -EACCES;
goto exec_error2;
}
i = inode->i_mode; //檔案uid、gid暫時還看不懂
e_uid = (i & S_ISUID) ? inode->i_uid : current->euid;
e_gid = (i & S_ISGID) ? inode->i_gid : current->egid;
if (current->euid == inode->i_uid)
i >>= 6;
else if (current->egid == inode->i_gid)
i >>= 3;
if (!(i & 1) &&
!((inode->i_mode & 0111) && suser())) {
retval = -ENOEXEC;
goto exec_error2;
}
if (!(bh = bread(inode->i_dev,inode->i_zone[0]))) {
retval = -EACCES;
goto exec_error2;
}
ex = *((struct exec *) bh->b_data); /* read exec-header */
if ((bh->b_data[0] == '#') && (bh->b_data[1] == '!') && (!sh_bang)) {
/*
* This section does the #! interpretation.
* Sorta complicated, but hopefully it will work. -TYT
*/
char buf[1023], *cp, *interp, *i_name, *i_arg;
unsigned long old_fs;
strncpy(buf, bh->b_data+2, 1022);
brelse(bh);
iput(inode);
buf[1022] = '\0';
if (cp = strchr(buf, '\n')) {
*cp = '\0';
for (cp = buf; (*cp == ' ') || (*cp == '\t'); cp++);
}
if (!cp || *cp == '\0') {
retval = -ENOEXEC; /* No interpreter name found */
goto exec_error1;
}
interp = i_name = cp;
i_arg = 0;
for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++) {
if (*cp == '/')
i_name = cp+1;
}
if (*cp) {
*cp++ = '\0';
i_arg = cp;
}
/*
* OK, we've parsed out the interpreter name and
* (optional) argument.
*/
if (sh_bang++ == 0) {
p = copy_strings(envc, envp, page, p, 0);
p = copy_strings(--argc, argv+1, page, p, 0);
}
/*
* Splice in (1) the interpreter's name for argv[0]
* (2) (optional) argument to interpreter
* (3) filename of shell script
*
* This is done in reverse order, because of how the
* user environment and arguments are stored.
*/
p = copy_strings(1, &filename, page, p, 1);
argc++;
if (i_arg) {
p = copy_strings(1, &i_arg, page, p, 2);
argc++;
}
p = copy_strings(1, &i_name, page, p, 2);
argc++;
if (!p) {
retval = -ENOMEM;
goto exec_error1;
}
/*
* OK, now restart the process with the interpreter's inode.
*/
old_fs = get_fs();
set_fs(get_ds());
if (!(inode=namei(interp))) { /* get executables inode */
set_fs(old_fs);
retval = -ENOENT;
goto exec_error1;
}
set_fs(old_fs);
goto restart_interp;
}
brelse(bh); //檢測可執行檔案是否可執行
if (N_MAGIC(ex) != ZMAGIC || ex.a_trsize || ex.a_drsize ||
ex.a_text+ex.a_data+ex.a_bss>0x3000000 ||
inode->i_size < ex.a_text+ex.a_data+ex.a_syms+N_TXTOFF(ex)) {
retval = -ENOEXEC;
goto exec_error2;
}
if (N_TXTOFF(ex) != BLOCK_SIZE) {
printk("%s: N_TXTOFF != BLOCK_SIZE. See a.out.h.", filename);
retval = -ENOEXEC;
goto exec_error2;
}
if (!sh_bang) {
p = copy_strings(envc,envp,page,p,0); //分配頁面,將環境變數和引數拷貝到新分配的頁面中
p = copy_strings(argc,argv,page,p,0); //page陣列存放的分配頁面的實體地址
if (!p) {
retval = -ENOMEM;
goto exec_error2;
}
}
/* OK, This is the point of no return */
if (current->executable)
iput(current->executable); //減少inode的引用數,由於程序2複製程序1的執行檔案是0,所以不會執行
current->executable = inode;
for (i=0 ; i<32 ; i++)
current->sigaction[i].sa_handler = NULL;
for (i=0 ; i<NR_OPEN ; i++)
if ((current->close_on_exec>>i)&1) //遍歷close_on_exec,關閉需要關閉的檔案
sys_close(i);
current->close_on_exec = 0;
free_page_tables(get_base(current->ldt[1]),get_limit(0x0f)); //非常關鍵,並不直接將可執行檔案讀進記憶體然後設定頁表來對映,需要觸發缺頁中斷開始執行
free_page_tables(get_base(current->ldt[2]),get_limit(0x17));
if (last_task_used_math == current)
last_task_used_math = NULL;
current->used_math = 0;
p += change_ldt(ex.a_text,page)-MAX_ARG_PAGES*PAGE_SIZE; //修改資料段的data limit為data_limit = 0x4000000;最高地址準備存放環境變數和引數
p = (unsigned long) create_tables((char *)p,argc,envc); //設定環境變數parses the env- and arg-strings in new user memory and creates the pointer tables from them, and puts their addresses on the "stack", returning the new stack pointer value.
current->brk = ex.a_bss +
(current->end_data = ex.a_data +
(current->end_code = ex.a_text));
current->start_stack = p & 0xfffff000; //
current->euid = e_uid;
current->egid = e_gid;
i = ex.a_text+ex.a_data;
while (i&0xfff)
put_fs_byte(0,(char *) (i++));
eip[0] = ex.a_entry; /* eip, magic happens :-) */ //改變棧上的內容
eip[3] = p; /* stack pointer */ //設定eip和esp
return 0;
exec_error2:
iput(inode);
exec_error1:
for (i=0 ; i<MAX_ARG_PAGES ; i++)
free_page(page[i]);
return(retval);
}
shell程式開始執行時,其線性地址空間對應的程式內容並未載入,也就不存在相應的頁面,因此會產生一個“頁異常”中斷,此中斷會進一步呼叫“缺頁中斷”處理程式來分配該頁面,並載入一頁shell程式。
//程式碼路徑:mm/page.s
.globl _page_fault
# Page-fault entry: saves registers, switches ds/es/fs to selector 0x10,
# then calls do_no_page or do_wp_page with (error code, CR2) as arguments
# depending on bit 0 of the error code.
_page_fault:
xchgl %eax,(%esp) # eax = error code from the stack; the interrupted eax is stored in its place
pushl %ecx
pushl %edx
push %ds
push %es
push %fs
movl $0x10,%edx
mov %dx,%ds
mov %dx,%es
mov %dx,%fs
movl %cr2,%edx # edx = contents of CR2 (the faulting address)
pushl %edx # second argument: address
pushl %eax # first argument: error code
testl $1,%eax # test bit 0 of the error code (P)
jne 1f # bit set: go to do_wp_page
call _do_no_page # bit clear: the page is not present
jmp 2f
1: call _do_wp_page
2: addl $8,%esp # drop the two arguments
pop %fs
pop %es
pop %ds
popl %edx
popl %ecx
popl %eax
iret
通過檢測error code後呼叫do_no_page()函式,先確定缺頁的原因。假如是由於需要載入程式才缺頁,會嘗試與其他程序共享shell(顯然此前沒有程序載入過shell,無法共享),於是申請一個頁面,呼叫bread_page()函式,從虛擬盤上讀取4塊(4KB,一頁)shell程式內容,載入記憶體頁面。
/*
 * Handle a fault on a page that is not present (called from _page_fault
 * when bit 0 of the error code is clear).
 *
 * The address is rounded down to a page boundary and converted to an
 * offset from current->start_code. If the task has no executable or the
 * offset is at or beyond end_data, an empty page is mapped. Otherwise the
 * function first tries share_page(); failing that it obtains a free page,
 * reads four blocks of the executable into it, zeroes the bytes that lie
 * beyond end_data, and maps the page at `address`.
 */
void do_no_page(unsigned long error_code,unsigned long address)
{
int nr[4];
unsigned long tmp;
unsigned long page;
int block,i;
address &= 0xfffff000;
tmp = address - current->start_code;
if (!current->executable || tmp >= current->end_data) {
get_empty_page(address);
return;
}
if (share_page(tmp))
return;
if (!(page = get_free_page()))
oom();
/* remember that 1 block is used for header */
block = 1 + tmp/BLOCK_SIZE;
for (i=0 ; i<4 ; block++,i++)
nr[i] = bmap(current->executable,block);
bread_page(page,current->executable->i_dev,nr); /* read four logical blocks (one page) of the program into the new page */
i = tmp + 4096 - current->end_data; /* number of bytes in this page that lie past end_data */
tmp = page + 4096;
while (i-- > 0) {
tmp--;
*(char *)tmp = 0;
}
if (put_page(page,address)) /* update the page table to establish the mapping */
return;
free_page(page);
oom();
}
載入一頁的shell程式後,核心會將該頁內容對映到shell程序的線性地址空間內,建立頁目錄表->頁表->頁面的三級對映管理關係。
程序3 update
之後程序2讀取rc檔案上的資訊,fork出了update程序,這個新程序的程序號為3。update程序有一項很重要的任務:將緩衝區中的資料同步到外設(軟盤、硬碟)上。由於主機與外設的資料交換速度遠低於主機內部的資料處理速度,因此,當核心需要往外設上寫資料的時候,為了提高系統的整體執行效率,並不把資料直接寫入到外設上,而是先寫入緩衝區,之後根據實際情況,再將資料從緩衝區同步到外設。
每隔一段時間,update程序就會被喚醒,把資料往外設上同步一次,之後這個程序會被掛起,即被設定為可中斷等待狀態,等待著下一次被喚醒後繼續執行,如此周而復始。
update程序執行後,沒有同步任務,於是該程序被掛起,系統執行程序排程,最終切換到shell程序繼續執行。
shell程序完成工作後呼叫exit()函式,對應的系統呼叫函式為sys_exit(),執行程式碼如下:
/*
 * exit system call: keeps the low 8 bits of error_code, shifts them into
 * bits 8-15, and hands the result to do_exit() as the exit code.
 */
int sys_exit(int error_code)
{
return do_exit((error_code&0xff)<<8);
}
/*
 * Terminate the current task: free its page tables, hand its children over
 * to task 1, close its open files, drop its pwd/root/executable inodes,
 * mark it TASK_ZOMBIE with exit code `code`, signal its father and call
 * schedule(). The trailing return only suppresses a compiler warning.
 */
int do_exit(long code)
{
int i;
free_page_tables(get_base(current->ldt[1]),get_limit(0x0f));
free_page_tables(get_base(current->ldt[2]),get_limit(0x17));
for (i=0 ; i<NR_TASKS ; i++)
if (task[i] && task[i]->father == current->pid) { /* look for children of the exiting task */
task[i]->father = 1; /* re-parent the child to process 1 */
if (task[i]->state == TASK_ZOMBIE)
/* assumption task[1] is always init */
(void) send_sig(SIGCHLD, task[1], 1); /* the child is already a zombie, so signal process 1 */
}
for (i=0 ; i<NR_OPEN ; i++)
if (current->filp[i])
sys_close(i);
iput(current->pwd);
current->pwd=NULL;
iput(current->root);
current->root=NULL;
iput(current->executable);
current->executable=NULL;
if (current->leader && current->tty >= 0)
tty_table[current->tty].pgrp = 0;
if (last_task_used_math == current)
last_task_used_math = NULL;
if (current->leader)
kill_session();
current->state = TASK_ZOMBIE;
current->exit_code = code;
tell_father(current->father);
schedule();
return (-1); /* just to suppress warnings */
}
值得注意的是tell_father()和schedule()函式的執行,tell_father向父程序傳送SIGCHLD訊號
/*
 * Tell the father task that one of its children has exited.
 *
 * Scans the task table for the task whose pid equals `pid` and sets the
 * SIGCHLD bit in its pending-signal mask. When pid is 0, or no task with
 * that pid exists, a diagnostic is printed and the current task is
 * released directly.
 */
static void tell_father(int pid)
{
	int slot;

	if (pid != 0) {
		for (slot = 0; slot < NR_TASKS; slot++) {
			if (task[slot] && task[slot]->pid == pid) {
				task[slot]->signal |= (1 << (SIGCHLD - 1));
				return;
			}
		}
	}

	/*
	 * No father was found, so the task releases itself. The original
	 * authors note this is not really right: the father should be
	 * changed to task 1 instead.
	 */
	printk("BAD BAD - no father found\n\r");
	release(current);
}
tell_father()函式執行完畢後,呼叫schedule()函式準備程序切換。此次schedule()函式中對訊號的檢測,影響了程序切換
/*
 * 'schedule()' is the scheduler function. This is GOOD CODE! There
 * probably won't be any reason to change this, as it should work well
 * in all circumstances (ie gives IO-bound processes good response etc).
 * The one thing you might take a look at is the signal-handler code here.
 *
 * NOTE!! Task 0 is the 'idle' task, which gets called when no other
 * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 * information in task[0] is never used.
 *
 * Two phases:
 *  1. Walk the task table, turn expired alarms into SIGALRM and mark
 *     TASK_INTERRUPTIBLE tasks that have a deliverable signal as
 *     TASK_RUNNING.
 *  2. Pick the TASK_RUNNING task with the largest counter and switch to
 *     it; if every runnable task has counter 0, recompute all counters
 *     and pick again.
 */
void schedule(void)
{
int i,next,c;
struct task_struct ** p;
/* check alarm, wake up any interruptible tasks that have got a signal */
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
if (*p) {
/* alarm time has passed: raise SIGALRM and disarm the alarm */
if ((*p)->alarm && (*p)->alarm < jiffies) {
(*p)->signal |= (1<<(SIGALRM-1));
(*p)->alarm = 0;
}
/* a pending signal that is not (blockable and blocked) wakes an
 * interruptible sleeper */
if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
(*p)->state==TASK_INTERRUPTIBLE)
(*p)->state=TASK_RUNNING; // in the article's walkthrough this is where process 1 becomes TASK_RUNNING
}
/* this is the scheduler proper: */
while (1) {
c = -1;
next = 0;
i = NR_TASKS;
p = &task[NR_TASKS];
/* scan slots NR_TASKS-1 down to 1; slot 0 is never examined, so
 * next stays 0 when nothing else is runnable */
while (--i) {
if (!*--p)
continue;
if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
c = (*p)->counter, next = i;
}
/* c != 0: either a task with time left was found (c > 0) or no task
 * is runnable at all (c == -1, next == 0) */
if (c) break;
/* every runnable task has counter 0: recompute the counter of every
 * task (runnable or not) as counter/2 + priority, then rescan */
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
if (*p)
(*p)->counter = ((*p)->counter >> 1) +
(*p)->priority;
}
switch_to(next);
}
將程序1設定為task_running後,後面排程到程序1執行,此時task1還在schedule函式中,執行完畢後繼續執行sys_waitpid()函式
/*
 * waitpid() system call: wait for a child of the current task.
 *
 * pid selects which children are considered:
 *   pid > 0   only the child with that pid
 *   pid == 0  children whose pgrp equals the caller's pgrp
 *   pid == -1 any child
 *   pid < -1  children whose pgrp equals -pid
 * stat_addr receives the status word (written with put_fs_long).
 * options: WUNTRACED also reports stopped children; WNOHANG returns 0
 * instead of sleeping when matching children exist but none is ready.
 *
 * Returns the pid of the reported child, 0 (WNOHANG), -EINTR when woken
 * by a signal other than SIGCHLD, or -ECHILD when no child matches.
 */
int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options)
{
int flag, code;
struct task_struct ** p;
verify_area(stat_addr,4);
repeat:
/* flag becomes non-zero once a matching child that is still alive is seen */
flag=0;
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) {
if (!*p || *p == current)
continue;
/* only our own children are of interest */
if ((*p)->father != current->pid)
continue;
if (pid>0) {
if ((*p)->pid != pid)
continue;
} else if (!pid) {
if ((*p)->pgrp != current->pgrp)
continue;
} else if (pid != -1) {
if ((*p)->pgrp != -pid)
continue;
}
/* note: every `continue` inside this switch continues the enclosing
 * for loop */
switch ((*p)->state) {
case TASK_STOPPED:
if (!(options & WUNTRACED))
continue;
put_fs_long(0x7f,stat_addr);
return (*p)->pid;
case TASK_ZOMBIE: // a child in state TASK_ZOMBIE was found
/* fold the child's user/system time into the parent's totals */
current->cutime += (*p)->utime;
current->cstime += (*p)->stime;
flag = (*p)->pid; // save the child's pid (process 2 in the article's walkthrough) before the slot is released
code = (*p)->exit_code;
release(*p); // release the child's task slot
put_fs_long(code,stat_addr);
return flag;
default:
flag=1;
continue;
}
}
if (flag) {
if (options & WNOHANG)
return 0;
/* sleep until a signal arrives; schedule() puts us back to
 * TASK_RUNNING when one is pending */
current->state=TASK_INTERRUPTIBLE;
schedule();
/* clear the pending SIGCHLD bit; if no other signal remains pending,
 * a child changed state, so rescan the task table - otherwise report
 * the interruption */
if (!(current->signal &= ~(1<<(SIGCHLD-1))))
goto repeat;
else
return -EINTR;
}
return -ECHILD;
}
sys_waitpid()函式執行完畢後,會回到wait()函式,最後返回到init()函式中,程序1繼續執行。
/*
 * Body of process 1.
 *
 * Calls setup(), opens /dev/tty0 and duplicates it twice, prints the
 * buffer and free-memory figures, then:
 *   1. forks a child that reopens descriptor 0 on /etc/rc and execs
 *      /bin/sh with argv_rc/envp_rc, and waits for that child;
 *   2. loops forever forking a child that starts a new session, reopens
 *      /dev/tty0 and execs /bin/sh with argv/envp, waiting for it to die
 *      before starting the next one.
 */
void init(void)
{
int pid,i;
setup((void *) &drive_info);
/* open the terminal, then duplicate descriptor 0 twice */
(void) open("/dev/tty0",O_RDWR,0);
(void) dup(0);
(void) dup(0);
printf("%d buffers = %d bytes buffer space\n\r",NR_BUFFERS,
NR_BUFFERS*BLOCK_SIZE);
printf("Free mem: %d bytes\n\r",memory_end-main_memory_start);
/* child branch: fork() returned 0 */
if (!(pid=fork())) {
close(0);
/* a non-zero return from open() is treated as failure here */
if (open("/etc/rc",O_RDONLY,0))
_exit(1);
execve("/bin/sh",argv_rc,envp_rc);
_exit(2); // only reached if execve() returns
}
if (pid>0)
while (pid != wait(&i)) // in the article's walkthrough wait() returns 2 here, so the condition is false and the loop ends
/* nothing */;
while (1) {
if ((pid=fork())<0) { // fork a new child; negative means the fork failed
printf("Fork failed in init\r\n");
continue;
}
if (!pid) { // child branch; per the article this child has pid 4 although it sits at task[] index 2, because the earlier shell forked update (pid 3)
close(0);close(1);close(2);
setsid();
(void) open("/dev/tty0",O_RDWR,0); // reopen the terminal; the dup(0) calls below duplicate descriptor 0
(void) dup(0);
(void) dup(0);
_exit(execve("/bin/sh",argv,envp));
}
while (1)
if (pid == wait(&i)) // process 1 is the father of orphaned tasks, so keep calling wait() (which releases exited children) until this shell's pid comes back
break;
printf("\n\rchild %d died with code %04x\n\r",pid,i);
sync();
}
_exit(0); /* NOTE! _exit, not exit() */
}
程序4 shell
這次shell開啟的是標準輸入裝置檔案tty0而不是rc,這使得shell開始執行後,不再退出。進入rw_char()函式後,shell程序將被設定為可中斷等待狀態,這樣所有的程序都處於可中斷等待狀態,再次切換到程序0去執行,系統實現怠速。
怠速以後,作業系統使用者將通過shell程序提供的平臺與計算機進行互動,shell程序處理使用者指令的工作原理如下:使用者通過鍵盤輸入資訊,儲存在指定的字元緩衝佇列上。該緩衝佇列上的內容,就是tty0檔案的內容,shell程序會不斷讀取緩衝佇列上的資料資訊。如果使用者沒有下達指令,緩衝佇列就不會有資料,shell程序將會被設定為可中斷等待狀態,即被掛起。如果使用者通過鍵盤下達指令,將產生鍵盤中斷,中斷程式會將字元資訊儲存在緩衝佇列上,並給shell程序發訊號,訊號將導致shell程序被設定為就緒狀態,即被喚醒,喚醒後的shell繼續從緩衝佇列中讀取資料資訊並處理,完畢後,shell程序將再次被掛起,等待下一次鍵盤中斷被喚醒。