1. 程式人生 > 其它 >Linux 0.11 程序0、1、2

Linux 0.11 程序0、1、2

本部落格研究重點主要放在程序0、1、2的建立和fork、execv等系統呼叫,因此會省去很多內部細節,只在程式碼上加上必要註釋,這對我們理解重點沒有影響。文章全部參考《Linux核心設計的藝術(第二版)》

程序0初始化

程序0是Linux作業系統中執行的第一個程序,也是Linux作業系統父子程序建立機制的第一個父程序。Linux初始化主要包含以下三方面:

  1. 系統初始化程序0。程序管理結構task_struct的母本(init_task = {INIT_TASK})已經在程式碼設計階段事先準備好了。之後要對程序0的task_struct中的LDT、TSS與GDT掛接,並對GDT、task[64]以及與程序排程有關的暫存器進行初始化
  2. 設定時鐘中斷,支援多程序輪轉。
  3. 程序0要具備處理系統呼叫的能力,通過set_system_gate將system_call與IDT相掛接。

這三點的實現都是在sched_init()函式中實現的,具體程式碼如下:

//程式碼路徑:include/linux/head.h
/*
 * One descriptor-table slot, kept as two unsigned long words (a and b).
 * desc_table is the type of a whole table: an array of 256 such slots.
 */
typedef struct desc_struct {
	unsigned long a,b;	/* the two words of the descriptor; sched_init() zeroes both to clear a slot */
} desc_table[256];


//程式碼路徑:kernel/sched.c
/*
 * Divisor that sched_init() writes to port 0x40 (low byte, then high byte).
 * NOTE(review): 1193180 is presumably the timer chip's input clock in Hz,
 * which would yield HZ timer interrupts per second - confirm.
 */
#define LATCH (1193180/HZ)

/*
 * A task_struct overlaid on a PAGE_SIZE byte array, so that one page holds
 * both the task structure and the stack space above it.
 */
union task_union {
	struct task_struct task;
	char stack[PAGE_SIZE];
};

/* Statically initialised task structure (and stack page) of process 0. */
static union task_union init_task = {INIT_TASK,};

/* The task that is currently running; initially process 0. */
struct task_struct *current = &(init_task.task);
/* Compared against 'current' by copy_process() and switch_to(); none yet. */
struct task_struct *last_task_used_math = NULL;

/* Task slot table: slot 0 is occupied by process 0 from the start. */
struct task_struct * task[NR_TASKS] = {&(init_task.task), };

/*
 * sched_init() - set up process 0 and the scheduling machinery.
 *
 * Installs the TSS and LDT descriptors of process 0 in the GDT, clears all
 * other task slots together with their GDT descriptor pairs, loads the task
 * register and LDT register for task 0, programs the timer, and installs the
 * timer-interrupt gate (0x20) and the system-call gate (0x80).
 */
void sched_init(void)
{
	int i;
	struct desc_struct * p;

	/* layout sanity check: other code relies on this size */
	if (sizeof(struct sigaction) != 16)
		panic("Struct sigaction MUST be 16 bytes");
	/* process 0's TSS and LDT live inside its task_struct; hook both into the GDT */
	set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
	set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));
	/* skip process 0's two descriptors; p now points at the pair of task 1 */
	p = gdt+2+FIRST_TSS_ENTRY;
	for(i=1;i<NR_TASKS;i++) {
		task[i] = NULL;		/* task slot i is free */
		p->a=p->b=0;		/* clear the TSS descriptor of task i */
		p++;
		p->a=p->b=0;		/* clear the LDT descriptor of task i */
		p++;
	}
/* Clear NT, so that we won't have troubles with that later on */
	__asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");
	/* point TR and LDTR at the descriptors of task 0 */
	ltr(0);
	lldt(0);
	/* program the timer with the LATCH divisor */
	outb_p(0x36,0x43);		/* binary, mode 3, LSB/MSB, ch 0 */
	outb_p(LATCH & 0xff , 0x40);	/* LSB */
	outb(LATCH >> 8 , 0x40);	/* MSB */
	set_intr_gate(0x20,&timer_interrupt);
	/* clear bit 0 of the mask at port 0x21: enables the timer interrupt */
	outb(inb_p(0x21)&~0x01,0x21);
	/* int 0x80 becomes the single entry point for all system calls */
	set_system_gate(0x80,&system_call);
}

//程式碼路徑:include/linux/sched.h
/*
 * In-memory image of a task state segment.  INIT_TASK provides the values for
 * process 0 and copy_process() fills one in for every new task; the trailing
 * i387 member is where copy_process() stores the math state with fnsave.
 */
struct tss_struct {
	long	back_link;	/* 16 high bits zero */
	long	esp0;		/* stack pointer used at privilege level 0 */
	long	ss0;		/* 16 high bits zero */
	long	esp1;
	long	ss1;		/* 16 high bits zero */
	long	esp2;
	long	ss2;		/* 16 high bits zero */
	long	cr3;
	long	eip;
	long	eflags;
	long	eax,ecx,edx,ebx;
	long	esp;
	long	ebp;
	long	esi;
	long	edi;
	long	es;		/* 16 high bits zero */
	long	cs;		/* 16 high bits zero */
	long	ss;		/* 16 high bits zero */
	long	ds;		/* 16 high bits zero */
	long	fs;		/* 16 high bits zero */
	long	gs;		/* 16 high bits zero */
	long	ldt;		/* 16 high bits zero */
	long	trace_bitmap;	/* bits: trace 0, bitmap 16-31 */
	struct i387_struct i387;
};

/*
 * Per-process management structure.  One instance exists for every slot in
 * task[]; it also embeds the process's LDT and TSS.  The order of the members
 * matters: INIT_TASK initialises them positionally.
 */
struct task_struct {
/* these are hardcoded - don't touch */
	long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	long counter;	/* schedule() runs the runnable task with the largest counter */
	long priority;	/* added to counter/2 when schedule() refills the counters */
	long signal;	/* bitmap of pending signals */
	struct sigaction sigaction[32];
	long blocked;	/* bitmap of masked signals */
/* various fields */
	int exit_code;
	unsigned long start_code,end_code,end_data,brk,start_stack;
	long pid,father,pgrp,session,leader;	/* father holds the parent's pid */
	unsigned short uid,euid,suid;
	unsigned short gid,egid,sgid;
	long alarm;
	long utime,stime,cutime,cstime,start_time;
	unsigned short used_math;
/* file system info */
	int tty;		/* -1 if no tty, so it must be signed */
	unsigned short umask;
	struct m_inode * pwd;
	struct m_inode * root;
	struct m_inode * executable;
	unsigned long close_on_exec;	/* bit i set: descriptor i is closed by do_execve() */
	struct file * filp[NR_OPEN];
/* ldt for this task 0 - zero 1 - cs 2 - ds&ss */
	struct desc_struct ldt[3];
/* tss for this task */
	struct tss_struct tss;
};

/*
 *  INIT_TASK is used to set up the first task table, touch at
 * your own risk!. Base=0, limit=0x9ffff (=640kB)
 *
 * Positional initialiser for process 0's task_struct: the values follow the
 * member order of struct task_struct and end with the three LDT descriptors
 * and the TSS image.
 */
#define INIT_TASK \
/* state etc */	{ 0,15,15, \
/* signals */	0,{{},},0, \
/* ec,brk... */	0,0,0,0,0,0, \
/* pid etc.. */	0,-1,0,0,0, \
/* uid etc */	0,0,0,0,0,0, \
/* alarm */	0,0,0,0,0,0, \
/* math */	0, \
/* fs info */	-1,0022,NULL,NULL,NULL,0, \
/* filp */	{NULL,}, \
	{ \
		{0,0}, /* ldt[0]: zero entry */ \
/* ldt */	{0x9f,0xc0fa00}, /* ldt[1]: code segment */ \
		{0x9f,0xc0f200}, /* ldt[2]: data and stack segment */ \
	}, \
/*tss*/	{0,PAGE_SIZE+(long)&init_task,0x10,0,0,0,0,(long)&pg_dir,\
	 0,0,0,0,0,0,0,0, \
	 0,0,0x17,0x17,0x17,0x17,0x17,0x17, \
	 _LDT(0),0x80000000, \
		{} \
	}, \
}

/* Scheduler globals, defined in kernel/sched.c. */
extern struct task_struct *task[NR_TASKS];		/* task slot table, slot 0 = process 0 */
extern struct task_struct *last_task_used_math;
extern struct task_struct *current;			/* the running task */

/*
 * Entry into gdt where to find first TSS. 0-nul, 1-cs, 2-ds, 3-syscall
 * 4-TSS0, 5-LDT0, 6-TSS1 etc ...
 *
 * _TSS(n) / _LDT(n) yield the GDT selector of task n's TSS / LDT descriptor:
 * each task owns two consecutive 8-byte descriptors (hence n<<4) placed after
 * the first FIRST_TSS_ENTRY / FIRST_LDT_ENTRY entries (hence entry<<3).
 * The argument is parenthesised so that any expression may be passed safely.
 *
 * ltr(n) / lldt(n) load the task register / LDT register with that selector.
 */
#define FIRST_TSS_ENTRY 4
#define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)
#define _TSS(n) ((((unsigned long) (n))<<4)+(FIRST_TSS_ENTRY<<3))
#define _LDT(n) ((((unsigned long) (n))<<4)+(FIRST_LDT_ENTRY<<3))
#define ltr(n) __asm__("ltr %%ax"::"a" (_TSS(n)))
#define lldt(n) __asm__("lldt %%ax"::"a" (_LDT(n)))

設定GDT中對應程序0的TSS、LDT描述符

/* GDT indices of the descriptor pair that belongs to process 0 */
#define FIRST_TSS_ENTRY 4
#define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)

/* install process 0's TSS and LDT (both stored in its task_struct) into the GDT */
set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));

設定GDT中和程序0相關的TSS和LDT描述符,程序0的tss內容和ldt內容均放在自己的task_struct中,因此向set_xxx_desc傳入描述符對應的偏移量和地址即可進行初始化。

/* start at the descriptor pair that follows process 0's TSS/LDT entries */
p = gdt+2+FIRST_TSS_ENTRY;
for(i=1;i<NR_TASKS;i++) {
	task[i] = NULL;		//task slot i is empty
	p->a=p->b=0;		//TSS descriptor of task i
	p++;
	p->a=p->b=0;		//LDT descriptor of task i
	p++;
}

之後對gdt中的其他內容和task陣列的其他內容都清零。

/* _TSS(n)/_LDT(n): GDT selector of task n's TSS/LDT descriptor (two descriptors per task) */
#define FIRST_TSS_ENTRY 4
#define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)
#define _TSS(n) ((((unsigned long) n)<<4)+(FIRST_TSS_ENTRY<<3))
#define _LDT(n) ((((unsigned long) n)<<4)+(FIRST_LDT_ENTRY<<3))
#define ltr(n) __asm__("ltr %%ax"::"a" (_TSS(n)))
#define lldt(n) __asm__("lldt %%ax"::"a" (_LDT(n)))

/* load the task register and the LDT register with process 0's selectors */
ltr(0);
lldt(0);

設定tr暫存器和ldt暫存器,指向程序0的tss描述符和ldt描述符。

設定時鐘中斷

/* program the timer with the LATCH divisor, then hook and enable its interrupt */
outb_p(0x36,0x43);		/* binary, mode 3, LSB/MSB, ch 0 */
outb_p(LATCH & 0xff , 0x40);	/* LSB */
outb(LATCH >> 8 , 0x40);	/* MSB */
set_intr_gate(0x20,&timer_interrupt);
outb(inb_p(0x21)&~0x01,0x21);		//clear mask bit 0: enable the timer interrupt

時鐘中斷處理程式timer_interrupt的程式碼程序排程時再進行介紹,先略過。

設定系統呼叫總入口

/* install addr as the handler of IDT entry n */
#define set_system_gate(n,addr) \
	_set_gate(&idt[n],15,3,addr)		//trap gate

/* int 0x80 -> system_call, the single entry point of every system call */
set_system_gate(0x80,&system_call);

將系統呼叫處理函式system_call與int0x80中斷描述符表掛接。system_call是整個作業系統中系統呼叫軟中斷的總入口。所有使用者程式使用系統呼叫,產生int 0x80軟中斷後,作業系統都是通過這個總入口找到具體的系統呼叫函式。

程序0由0特權級反轉到3特權級,成為真正的程序

Linux作業系統規定,除程序0之外,所有程序都要由一個已有程序在3特權級下建立。
在Linux0.11中,程序0的程式碼和資料都是由作業系統的設計者寫在核心程式碼、資料區,並且,此前處在0特權級,嚴格說還不是真正意義上的程序。為了遵守規則,在程序0正式建立程序1之前,要將程序0由0特權級轉變為3特權級。方法是呼叫move_to_user_mode()函式,模仿中斷返回動作,實現程序0的特權級從0轉變為3。

//程式碼路徑:init/main.c
/* Excerpt of the kernel's main(): only the switch of process 0 to privilege level 3 is shown. */
void main()
{
	...
	move_to_user_mode();	/* from here on process 0 runs at privilege level 3 */
	...
}

//程式碼路徑:include/system.h
/*
 * move_to_user_mode() - drop process 0 from privilege level 0 to level 3.
 *
 * Builds by hand the stack frame that an interrupt taken at level 3 would
 * have left (ss, esp, eflags, cs, eip) and executes iret, which "returns" to
 * label 1 at privilege level 3.  The remaining data segment registers are
 * then loaded with the level-3 LDT data selector.
 *
 * The annotations are block comments placed before the line-continuation
 * backslash: a comment after the backslash would end the macro prematurely.
 */
#define move_to_user_mode() \
__asm__ ("movl %%esp,%%eax\n\t" \
	"pushl $0x17\n\t"	/* ss: 0b10111 = RPL 3, LDT, third descriptor */ \
	"pushl %%eax\n\t"	/* esp: keep using the current stack pointer */ \
	"pushfl\n\t"		/* eflags */ \
	"pushl $0x0f\n\t"	/* cs: 0b1111 = RPL 3, LDT, second descriptor */ \
	"pushl $1f\n\t"		/* eip: address of label 1 below */ \
	"iret\n" \
	"1:\tmovl $0x17,%%eax\n\t" \
	"movw %%ax,%%ds\n\t" \
	"movw %%ax,%%es\n\t" \
	"movw %%ax,%%fs\n\t" \
	"movw %%ax,%%gs" \
	:::"ax")

手工模擬特權級從0到3的棧的內容,使用iret反轉到3特權級,執行3特權級的程式碼

此時我們再來看一下程序0的LDT內容

/*
 *  INIT_TASK is used to set up the first task table, touch at
 * your own risk!. Base=0, limit=0x9ffff (=640kB)
 */
/*
 * Positional initialiser for process 0's task_struct (member order of
 * struct task_struct).  Annotations sit before the line-continuation
 * backslash so that the macro stays one logical line.
 */
#define INIT_TASK \
/* state etc */	{ 0,15,15, \
/* signals */	0,{{},},0, \
/* ec,brk... */	0,0,0,0,0,0, \
/* pid etc.. */	0,-1,0,0,0, \
/* uid etc */	0,0,0,0,0,0, \
/* alarm */	0,0,0,0,0,0, \
/* math */	0, \
/* fs info */	-1,0022,NULL,NULL,NULL,0, \
/* filp */	{NULL,}, \
	{ \
		{0,0}, \
/* ldt */	{0x9f,0xc0fa00}, /* code segment: base=0, G=1, DPL=3, execute/read */ \
		{0x9f,0xc0f200}, /* data segment: base=0, G=1, DPL=3, read/write */ \
	}, \
/*tss*/	{0,PAGE_SIZE+(long)&init_task,0x10,0,0,0,0,(long)&pg_dir,\
	 0,0,0,0,0,0,0,0, \
	 0,0,0x17,0x17,0x17,0x17,0x17,0x17, \
	 _LDT(0),0x80000000, \
		{} \
	}, \
}

可以看到此時程序0的程式碼段和資料段都是從0開始,界限為640KB。ss0為0x10,指向GDT中核心資料段,esp0為init_task頁面最頂端。

程序1的建立

程序0現在處在3特權級狀態,即程序狀態。正式開始執行要做的第一件事就是作為父程序呼叫fork函式建立第一個子程序—程序 1,這是父子程序建立機制的第一次實際運用。以後,所有程序都是基於父子程序建立機制由父程序創建出來的。

//程式碼路徑:init/main.c
/* fork() is expanded inline from the _syscall0 stub macro. */
static inline _syscall0(int, fork)

/*
 * Excerpt of the kernel's main(): process 0 switches to privilege level 3,
 * forks process 1 (which runs init()), and then loops in pause() forever.
 */
void main(void)
{
	move_to_user_mode();
	if(!fork()){		/* fork() returns 0 in the child, i.e. in process 1 */
		init();
	}
    /*
 *   NOTE!!   For any other task 'pause()' would mean we have to get a
 * signal to awaken, but task0 is the sole exception (see 'schedule()')
 * as task 0 gets activated at every idle moment (when no other tasks
 * can run). For task0 'pause()' just means we go check if some other
 * task can run, and if not we return here.
 */
	for(;;) pause();
}

//程式碼路徑:include/unistd.h
/* System call numbers: each is the index of the handler in sys_call_table. */
#define __NR_setup	0	/* used only by init, to get system going */
#define __NR_exit	1
#define __NR_fork	2

/*
 * _syscall0(type,name) - define "type name(void)", the user-side stub of a
 * system call that takes no arguments.
 *
 * The stub puts the call number __NR_<name> into eax and executes int 0x80;
 * the value the kernel leaves in eax is read back into __res.  A
 * non-negative result is returned as is; a negative one is negated and
 * stored in errno, and the stub returns -1.
 *
 * The annotation sits before the line-continuation backslash: a comment
 * after the backslash would cut the macro short.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
	: "=a" (__res) \
	: "0" (__NR_##name));	/* eax = call number; for fork it is 2, the index of sys_fork in sys_call_table */ \
if (__res >= 0) \
	return (type) __res; \
errno = -__res; \
return -1; \
}

extern int errno;

//程式碼路徑:include/linux/sys.h
/* Handlers of the first three system calls (only these declarations are shown in this excerpt). */
extern int sys_setup();
extern int sys_exit();
extern int sys_fork();

/*
 * Dispatch table used by system_call: entry i is the handler of system call
 * number i, so the order must match the __NR_xxx numbers
 * (sys_setup = 0, sys_exit = 1, sys_fork = 2, ...).
 */
fn_ptr sys_call_table[] = { sys_setup, sys_exit, sys_fork, sys_read,
sys_write, sys_open, sys_close, sys_waitpid, sys_creat, sys_link,
sys_unlink, sys_execve, sys_chdir, sys_time, sys_mknod, sys_chmod,
sys_chown, sys_break, sys_stat, sys_lseek, sys_getpid, sys_mount,
sys_umount, sys_setuid, sys_getuid, sys_stime, sys_ptrace, sys_alarm,
sys_fstat, sys_pause, sys_utime, sys_stty, sys_gtty, sys_access,
sys_nice, sys_ftime, sys_sync, sys_kill, sys_rename, sys_mkdir,
sys_rmdir, sys_dup, sys_pipe, sys_times, sys_prof, sys_brk, sys_setgid,
sys_getgid, sys_signal, sys_geteuid, sys_getegid, sys_acct, sys_phys,
sys_lock, sys_ioctl, sys_fcntl, sys_mpx, sys_setpgid, sys_ulimit,
sys_uname, sys_umask, sys_chroot, sys_ustat, sys_dup2, sys_getppid,
sys_getpgrp, sys_setsid, sys_sigaction, sys_sgetmask, sys_ssetmask,
sys_setreuid,sys_setregid };

執行int 0x80指令後,產生一個軟中斷,CPU從特權級3的程序0程式碼跳轉到0特權級的核心程式碼中執行,中斷將SS、ESP、EFLAGS、CS和EIP這5個暫存器自動壓棧,壓入init_task中的程序0核心棧。

# 程式碼路徑:kernel/system_call.s
nr_system_calls = 72		# upper bound used to validate the call number

.align 2
bad_sys_call:			# call number out of range: return -1 in eax
	movl $-1,%eax
	iret
.align 2
reschedule:			# run schedule(), which then returns to ret_from_sys_call
	pushl $ret_from_sys_call
	jmp _schedule
.align 2
_system_call:			# entry point of int 0x80
	cmpl $nr_system_calls-1,%eax		# eax holds the system call number;
	ja bad_sys_call				# reject it if it is out of range
	push %ds				# save the segment registers
	push %es
	push %fs
	pushl %edx
	pushl %ecx		# push %ebx,%ecx,%edx as parameters
	pushl %ebx		# to the system call
	movl $0x10,%edx		# set up ds,es to kernel space
	mov %dx,%ds
	mov %dx,%es
	movl $0x17,%edx		# fs points to local data space
	mov %dx,%fs
	call _sys_call_table(,%eax,4)		# call handler number eax (4-byte table entries)
	pushl %eax		# save the handler's return value
	movl _current,%eax
	cmpl $0,state(%eax)		# state
	jne reschedule
	cmpl $0,counter(%eax)		# counter
	je reschedule
ret_from_sys_call:
	movl _current,%eax		# task[0] cannot have signals
	cmpl _task,%eax
	je 3f
	cmpw $0x0f,CS(%esp)		# was old code segment supervisor ?
	jne 3f
	cmpw $0x17,OLDSS(%esp)		# was stack segment = 0x17 ?
	jne 3f
	movl signal(%eax),%ebx		# ebx = pending signals
	movl blocked(%eax),%ecx
	notl %ecx
	andl %ebx,%ecx			# ecx = pending and not blocked
	bsfl %ecx,%ecx			# index of the lowest such bit
	je 3f				# none set: nothing to deliver
	btrl %ecx,%ebx			# clear that bit in the pending set
	movl %ebx,signal(%eax)
	incl %ecx			# signal number = bit index + 1
	pushl %ecx
	call _do_signal
	popl %eax			# drop the signal number argument
3:	popl %eax			# restore the saved return value into eax
	popl %ebx
	popl %ecx
	popl %edx
	pop %fs
	pop %es
	pop %ds
	iret

接下來到了我們的重點fork函式,程式碼如下

.align 2
_sys_fork:			# handler of the fork system call
	call _find_empty_process	# eax = free task slot number, or a negative error
	testl %eax,%eax
	js 1f				# negative: no slot available, return the error
	push %gs			# push further registers as arguments
	pushl %esi
	pushl %edi
	pushl %ebp
	pushl %eax			# slot number (first argument, nr)
	call _copy_process
	addl $20,%esp			# discard the five values pushed above
1:	ret
  1. 首先在task[64]中為程序1申請一個空閒位置並獲取程序號

    呼叫find_empty_process()函式為程序1獲得一個可用的程序號和task[64]中的一個位置,將該位置放入eax暫存器中返回。

    在該函式中,核心用全域性變數last_pid來存放系統自開機以來累計的程序數,也將此變數用作新建程序的程序號(task_struct中的pid變數)。因為Linux 0.11的task[64]只有64項,最多隻能同時執行64個程序,如果find_empty_process()函式返回-EAGAIN,意味著當前已經有64個程序在執行。

  2. 繼續壓入暫存器

  3. 呼叫copy_process函式

copy_process是我們關注的重點,接下來好好研究一下。

copy_process

程序0已經成為一個可以建立子程序的父程序,在核心中有“程序0的task_struct”和“程序0的頁表項”等專屬程序0的管理資訊。程序0將在copy_process()函式中做非常重要的、體現父子程序建立機制的工作:

  1. 為程序1建立task_struct,將程序0的task_struct的內容複製給程序1
  2. 為程序1的task_struct、tss以及LDT中的內容做個性化設定
  3. 為程序1建立第一個頁表,將程序0的頁表項內容賦給這個頁表
  4. 程序1共享程序0的檔案
  5. 設定程序1的GDT項
  6. 最後將程序1設定為就緒態,使其可以參與程序間的輪轉。
//程式碼路徑:kernel/fork.c

/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in it's entirety.
 *
 * nr is the task slot returned by find_empty_process().  The remaining
 * parameters are the values found on the kernel stack at the time of the
 * call: ebp/edi/esi/gs pushed by sys_fork, ebx/ecx/edx/fs/es/ds pushed by
 * system_call, and eip/cs/eflags/esp/ss pushed by the CPU for int 0x80.
 * NOTE(review): 'none' is presumably the return address left by
 * "call _sys_call_table" - confirm.
 *
 * Returns last_pid (the child's pid) to the parent, or -EAGAIN when no page
 * is available or the memory copy fails.
 */
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
		long ebx,long ecx,long edx,
		long fs,long es,long ds,
		long eip,long cs,long eflags,long esp,long ss)
{
	struct task_struct *p;
	int i;
	struct file *f;

	/* one page holds the child's task_struct (and stack space above it) */
	p = (struct task_struct *) get_free_page();
	if (!p)
		return -EAGAIN;
	task[nr] = p;
	*p = *current;	/* NOTE! this doesn't copy the supervisor stack */
	/* keep the child out of scheduling until it is fully set up */
	p->state = TASK_UNINTERRUPTIBLE;
	p->pid = last_pid;
	p->father = current->pid;
	p->counter = p->priority;
	p->signal = 0;
	p->alarm = 0;
	p->leader = 0;		/* process leadership doesn't inherit */
	p->utime = p->stime = 0;
	p->cutime = p->cstime = 0;
	p->start_time = jiffies;
	p->tss.back_link = 0;
	p->tss.esp0 = PAGE_SIZE + (long) p;		/* the child gets its own kernel stack: top of its own page */
	p->tss.ss0 = 0x10;
	p->tss.eip = eip;						/* the instruction following int 0x80 */
	p->tss.eflags = eflags;
	p->tss.eax = 0;						/* fork() returns 0 in the child */
	p->tss.ecx = ecx;
	p->tss.edx = edx;
	p->tss.ebx = ebx;
	p->tss.esp = esp;
	p->tss.ebp = ebp;
	p->tss.esi = esi;
	p->tss.edi = edi;
	p->tss.es = es & 0xffff;				/* the parameters are longs; only the low 16 bits are the selector */
	p->tss.cs = cs & 0xffff;
	p->tss.ss = ss & 0xffff;
	p->tss.ds = ds & 0xffff;
	p->tss.fs = fs & 0xffff;
	p->tss.gs = gs & 0xffff;
	p->tss.ldt = _LDT(nr);					/* selector of the child's LDT descriptor in the GDT */
	p->tss.trace_bitmap = 0x80000000;
	if (last_task_used_math == current)
		__asm__("clts ; fnsave %0"::"m" (p->tss.i387));
	if (copy_mem(nr,p)) {				/* sets the new segment bases in the child's LDT and copies the page tables */
		task[nr] = NULL;
		free_page((long) p);
		return -EAGAIN;
	}
	for (i=0; i<NR_OPEN;i++)			/* the child shares the parent's open files */
		if (f=p->filp[i])
			f->f_count++;
	if (current->pwd)
		current->pwd->i_count++;
	if (current->root)
		current->root->i_count++;
	if (current->executable)
		current->executable->i_count++;
	set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));		/* install the child's TSS and LDT descriptors in the GDT */
	set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
	p->state = TASK_RUNNING;	/* do this last, just in case */
	return last_pid;			/* the parent receives the child's pid */
}

其中比較重要的copy_mem函式,該函式改變了LDT中程式碼段和資料段的基址,將程序0的頁表拷貝給程序1。

//程式碼路徑:include/linux/sched.h

/*
 * _set_base(addr,base) - store a segment base into the descriptor at byte
 * address addr: the low 16 bits go to bytes 2-3, the next 8 bits to byte 4
 * and the top 8 bits to byte 7.
 */
#define _set_base(addr,base) \
__asm__("movw %%dx,%0\n\t" \
	"rorl $16,%%edx\n\t" \
	"movb %%dl,%1\n\t" \
	"movb %%dh,%2" \
	::"m" (*((addr)+2)), \
	  "m" (*((addr)+4)), \
	  "m" (*((addr)+7)), \
	  "d" (base) \
	:"dx")

/*
 * _set_limit(addr,limit) - store a segment limit into the descriptor at byte
 * address addr: the low 16 bits go to bytes 0-1; the remaining bits are
 * OR-ed into byte 6, whose upper four bits are preserved.
 */
#define _set_limit(addr,limit) \
__asm__("movw %%dx,%0\n\t" \
	"rorl $16,%%edx\n\t" \
	"movb %1,%%dh\n\t" \
	"andb $0xf0,%%dh\n\t" \
	"orb %%dh,%%dl\n\t" \
	"movb %%dl,%1" \
	::"m" (*(addr)), \
	  "m" (*((addr)+6)), \
	  "d" (limit) \
	:"dx")

/* Wrappers taking a descriptor object; set_limit converts a byte size to (limit-1)>>12. */
#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , base )
#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , (limit-1)>>12 )

/*
 * _get_base(addr) - reassemble the segment base from the descriptor at byte
 * address addr (bytes 2-3, 4 and 7); the inverse of _set_base.
 */
#define _get_base(addr) ({\
unsigned long __base; \
__asm__("movb %3,%%dh\n\t" \
	"movb %2,%%dl\n\t" \
	"shll $16,%%edx\n\t" \
	"movw %1,%%dx" \
	:"=d" (__base) \
	:"m" (*((addr)+2)), \
	 "m" (*((addr)+4)), \
	 "m" (*((addr)+7))); \
__base;})

#define get_base(ldt) _get_base( ((char *)&(ldt)) )

/* get_limit(segment) - result of the lsl instruction for the selector, plus one. */
#define get_limit(segment) ({ \
unsigned long __limit; \
__asm__("lsll %1,%0\n\t incl %0":"=r" (__limit):"r" (segment)); \
__limit;})

//程式碼路徑:kernel/fork.c

/*
 * copy_mem() - give the new task in slot nr its own linear address range.
 *
 * Reads the limits and bases of the current task's code (0x0f) and data
 * (0x17) segments, places the child's segments at nr * 0x4000000, records
 * that base in p->start_code and in both LDT descriptors of p, and copies the
 * parent's page tables for the data range.  For the first fork the parent
 * base is 0 and the limit is 640kB.
 *
 * Returns 0 on success, -ENOMEM if the page tables could not be copied (the
 * partially built tables are released first).
 */
int copy_mem(int nr,struct task_struct * p)
{
	unsigned long code_size, data_size;
	unsigned long parent_code, parent_data;
	unsigned long child_base;

	code_size = get_limit(0x0f);
	data_size = get_limit(0x17);
	parent_code = get_base(current->ldt[1]);
	parent_data = get_base(current->ldt[2]);
	if (parent_code != parent_data)
		panic("We don't support separate I&D");
	if (code_size > data_size)
		panic("Bad data_limit");

	/* code and data share one base: one 64MB window per task slot */
	child_base = nr * 0x4000000;
	p->start_code = child_base;
	set_base(p->ldt[1],child_base);
	set_base(p->ldt[2],child_base);

	if (!copy_page_tables(parent_data,child_base,data_size))
		return 0;
	free_page_tables(child_base,data_size);
	return -ENOMEM;
}

該函式更改程序1的程式碼段和資料段基址後呼叫copy_page_tables設定頁目錄項和複製頁表。

進入copy_page_tables()函式後,先為新的頁表申請一個空閒頁面,並把程序0中第一個頁表裡面前160個頁表項複製到這個頁面中(1個頁表項控制一個頁面4KB記憶體空間,160個頁表項可以控制640KB記憶體空間)。程序0和程序1的頁表暫時都指向了相同的頁面,意味著程序1也可以操作程序0的頁面。之後對程序1的頁目錄表進行設定。最後,用重置CR3的方法重新整理頁變換快取記憶體。程序1的頁表和頁目錄表設定完畢。

/*
 *  Well, here is one of the most complicated functions in mm. It
 * copies a range of linerar addresses by copying only the pages.
 * Let's hope this is bug-free, 'cause this one I don't want to debug :-)
 *
 * Note! We don't copy just any chunks of memory - addresses have to
 * be divisible by 4Mb (one page-directory entry), as this makes the
 * function easier. It's used only by fork anyway.
 *
 * NOTE 2!! When from==0 we are copying kernel space for the first
 * fork(). Then we DONT want to copy a full page-directory entry, as
 * that would lead to some serious memory waste - we just copy the
 * first 160 pages - 640kB. Even that is more than we need, but it
 * doesn't take any more memory - we don't copy-on-write in the low
 * 1 Mb-range, so the pages can be shared with the kernel. Thus the
 * special case for nr=xxxx.
 *
 * from/to are the source and destination linear addresses, size the length
 * of the range in bytes.  Returns 0 on success and -1 when no page could be
 * obtained for a new page table.
 */
int copy_page_tables(unsigned long from,unsigned long to,long size)
{
	unsigned long * from_page_table;
	unsigned long * to_page_table;
	unsigned long this_page;
	unsigned long * from_dir, * to_dir;
	unsigned long nr;

	if ((from&0x3fffff) || (to&0x3fffff))
		panic("copy_page_tables called with wrong alignment");
	from_dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */
	to_dir = (unsigned long *) ((to>>20) & 0xffc);
	size = ((unsigned) (size+0x3fffff)) >> 22;		/* number of page-directory entries to copy (size rounded up to 4MB units); 1 for the first fork */
	for( ; size-->0 ; from_dir++,to_dir++) {
		if (1 & *to_dir)
			panic("copy_page_tables: already exist");
		if (!(1 & *from_dir))		/* bit 0 (present) of the source directory entry is clear: nothing to copy */
			continue;
		from_page_table = (unsigned long *) (0xfffff000 & *from_dir);
		if (!(to_page_table = (unsigned long *) get_free_page()))	/* allocate a page for the new page table */
			return -1;	/* Out of memory, see freeing */
		*to_dir = ((unsigned long) to_page_table) | 7;	/* user, writable, present */
		nr = (from==0)?0xA0:1024;		/* copying from process 0: only 160 entries; otherwise all 1024 */
		for ( ; nr-- > 0 ; from_page_table++,to_page_table++) {
			this_page = *from_page_table;
			if (!(1 & this_page))
				continue;
			this_page &= ~2;					/* clear the write bit: the copy is read-only */
			*to_page_table = this_page;
			if (this_page > LOW_MEM) {			/* pages at or below LOW_MEM are not tracked in mem_map */
				*from_page_table = this_page;		/* the source entry becomes read-only as well */
				this_page -= LOW_MEM;
				this_page >>= 12;
				mem_map[this_page]++;			/* one more user of this page */
			}
		}
	}
	invalidate();		/* reload CR3 to flush the cached page translations */
	return 0;
}

執行結束後,程序1和程序0管理頁面完全一致,因為頁表項是完全拷貝過來的,所以他們共享頁面。效果如下:

copy_process結束時,程序1的建立工作完成,程序1已經具備了程序0的全部能力,可以在主機中正常地執行。返回sys_fork中call _copy_process的下一行執行,執行程式碼如下:

.align 2
_sys_fork:
	call _find_empty_process
	testl %eax,%eax
	js 1f
	push %gs
	pushl %esi
	pushl %edi
	pushl %ebp
	pushl %eax
	call _copy_process		# the return value last_pid is left in eax and passed back up
	addl $20,%esp			# discard the pushed gs, esi, edi, ebp and eax
1:	ret


.align 2
_system_call:
	cmpl $nr_system_calls-1,%eax
	ja bad_sys_call
	push %ds
	push %es
	push %fs
	pushl %edx
	pushl %ecx		# push %ebx,%ecx,%edx as parameters
	pushl %ebx		# to the system call
	movl $0x10,%edx		# set up ds,es to kernel space
	mov %dx,%ds
	mov %dx,%es
	movl $0x17,%edx		# fs points to local data space
	mov %dx,%fs
	call _sys_call_table(,%eax,4)
	pushl %eax					# save the return value (for fork: the new pid)
	movl _current,%eax			# check the state and counter of the current task
	cmpl $0,state(%eax)		# state
	jne reschedule
	cmpl $0,counter(%eax)		# counter
	je reschedule
ret_from_sys_call:
	movl _current,%eax		# task[0] cannot have signals
	cmpl _task,%eax				# the current task is process 0: skip to label 3
	je 3f
	cmpw $0x0f,CS(%esp)		# was old code segment supervisor ?
	jne 3f
	cmpw $0x17,OLDSS(%esp)		# was stack segment = 0x17 ?
	jne 3f
	movl signal(%eax),%ebx
	movl blocked(%eax),%ecx
	notl %ecx
	andl %ebx,%ecx
	bsfl %ecx,%ecx
	je 3f
	btrl %ecx,%ebx
	movl %ebx,signal(%eax)
	incl %ecx
	pushl %ecx
	call _do_signal
	popl %eax				# drop the signal number pushed for do_signal
3:	popl %eax				# restore the saved return value (the pid) into eax
	popl %ebx
	popl %ecx
	popl %edx
	pop %fs
	pop %es
	pop %ds
	iret

由於當前程序是程序0,所以就跳到標號3處,將壓棧的各個暫存器數值還原。之後iret中斷返回,CPU硬體自動將int 0x80的中斷時壓入的ss、esp、eflags、cs和eip值按壓棧的反序出棧給CPU對應暫存器,從0特權級的核心程式碼切換到3特權級的程序0程式碼執行,CS:EIP指向fork()中int 0x80的下一行if(__res >= 0)。

/*
 * _syscall0(type,name) - stub for a system call without arguments: eax
 * carries the call number in and the kernel's result out (into __res).
 * The annotation is a block comment placed before the line-continuation
 * backslash so that the macro remains valid C.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
	: "=a" (__res) \
	: "0" (__NR_##name)); \
if (__res >= 0)	/* <-- execution resumes here after iret; the value of eax is now in __res */ \
	return (type) __res; \
errno = -__res; \
return -1; \
}

return (type) __res將程序號1返回,回到呼叫點if(!fork())處執行,!1為假,這樣就不會執行到init()函式中,而是程序0繼續執行,直到pause()函式

/* Excerpt of main(): in process 0 fork() returned the child's pid, so init() is skipped. */
void main(void)
{
	move_to_user_mode();
	if(!fork()) {
		init();
	}
	
	for(; ;) pause();		//process 0 continues here
}

進入pause函式後,最終會對映到sys_pause函式,該函式將程序置為可中斷等待狀態,同時呼叫schedule函式。

/*
 * sys_pause() - handler of the pause system call: marks the current task as
 * interruptibly waiting and runs the scheduler.  Returns 0 once the task is
 * scheduled again.
 */
int sys_pause(void)
{
	current->state = TASK_INTERRUPTIBLE;
	schedule();
	return 0;
}

schedule程式碼如下:

/*
 *  'schedule()' is the scheduler function. This is GOOD CODE! There
 * probably won't be any reason to change this, as it should work well
 * in all circumstances (ie gives IO-bound processes good response etc).
 * The one thing you might take a look at is the signal-handler code here.
 *
 *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 * information in task[0] is never used.
 */
void schedule(void)
{
	int i,next,c;
	struct task_struct ** p;

/* check alarm, wake up any interruptible tasks that have got a signal */

	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
		if (*p) {
			/* expired alarm: raise SIGALRM and disarm it */
			if ((*p)->alarm && (*p)->alarm < jiffies) {
					(*p)->signal |= (1<<(SIGALRM-1));
					(*p)->alarm = 0;
				}
			if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
			(*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;
		}

/* this is the scheduler proper: */

	while (1) {
		c = -1;
		next = 0;	/* falls back to task 0 when nothing else is runnable */
		i = NR_TASKS;
		p = &task[NR_TASKS];
		/* scan slots NR_TASKS-1 .. 1 for the runnable task with the largest counter */
		while (--i) {
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		}
		/* c != 0: either a task with time left was found, or none is runnable (c == -1) */
		if (c) break;
		/* every runnable task has used up its slice: refill all counters */
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}
	switch_to(next);
}

/*
 *	switch_to(n) should switch tasks to task nr n, first
 * checking that n isn't the current task, in which case it does nothing.
 * This also clears the TS-flag if the task we switched to has used
 * tha math co-processor latest.
 *
 * __tmp is the operand of the far jump: __tmp.b receives the TSS selector
 * _TSS(n) and __tmp.a is left unset.  The annotations are block comments
 * placed before the line-continuation backslash so the macro stays valid.
 */
#define switch_to(n) {\
struct {long a,b;} __tmp; \
__asm__("cmpl %%ecx,_current\n\t" \
	"je 1f\n\t"	/* task n is already the current task: nothing to switch */ \
	"movw %%dx,%1\n\t" \
	"xchgl %%ecx,_current\n\t" \
	"ljmp %0\n\t"	/* far jump through the TSS selector: the registers are saved into the outgoing task's TSS */ \
	"cmpl %%ecx,_last_task_used_math\n\t" \
	"jne 1f\n\t" \
	"clts\n" \
	"1:" \
	::"m" (*&__tmp.a),"m" (*&__tmp.b), \
	"d" (_TSS(n)),"c" ((long) task[n])); \
}

ljmp跳轉到程序1執行,將CPU的各個暫存器值儲存在程序0的TSS中,將程序1的TSS資料以及LDT的程式碼段、資料段描述符資料恢復給CPU的各個暫存器,實現從0特權級的核心程式碼切換到3特權級的程序1程式碼執行。

程序1執行

當時為程序1的TSS中eax設定為0,eip設定為int 0x80下一指令的地址,即if(__res >= 0)。程序1開始從這一行執行。

/*
 * _syscall0(type,name) - stub for a system call without arguments: eax
 * carries the call number in and the kernel's result out (into __res).
 * The annotation is a block comment placed before the line-continuation
 * backslash so that the macro remains valid C.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
	: "=a" (__res) \
	: "0" (__NR_##name)); \
if (__res >= 0)	/* <-- process 1 starts executing here, with eax (and thus __res) equal to 0 */ \
	return (type) __res; \
errno = -__res; \
return -1; \
}

返回後,執行到main()函式中if(!fork())這一行,!0為“真”,呼叫init()函式!

/*
 * init() - body of process 1.
 *
 * Calls setup(), opens /dev/tty0 and duplicates the descriptor twice, prints
 * the buffer and memory statistics, then forks a child that runs /bin/sh with
 * /etc/rc as its input.  After that child has terminated it loops forever,
 * starting a new /bin/sh on /dev/tty0 each time the previous one dies.
 */
void init(void)
{
	int pid,i;

	setup((void *) &drive_info);
	/* first open: this becomes descriptor 0; the two dup(0) calls duplicate it */
	(void) open("/dev/tty0",O_RDWR,0);
	(void) dup(0);
	(void) dup(0);
	printf("%d buffers = %d bytes buffer space\n\r",NR_BUFFERS,
		NR_BUFFERS*BLOCK_SIZE);
	printf("Free mem: %d bytes\n\r",memory_end-main_memory_start);
	if (!(pid=fork())) {
		/* child (process 2): reopen descriptor 0 on /etc/rc and run the shell on it */
		close(0);
		if (open("/etc/rc",O_RDONLY,0))
			_exit(1);
		execve("/bin/sh",argv_rc,envp_rc);
		_exit(2);
	}
	/* parent: wait until that particular child has exited */
	if (pid>0)
		while (pid != wait(&i))
			/* nothing */;
	while (1) {
		if ((pid=fork())<0) {
			printf("Fork failed in init\r\n");
			continue;
		}
		if (!pid) {
			/* child: new session, fresh descriptors 0-2 on the console, then the shell */
			close(0);close(1);close(2);
			setsid();
			(void) open("/dev/tty0",O_RDWR,0);
			(void) dup(0);
			(void) dup(0);
			_exit(execve("/bin/sh",argv,envp));
		}
		while (1)
			if (pid == wait(&i))
				break;
		printf("\n\rchild %d died with code %04x\n\r",pid,i);
		sync();
	}
	_exit(0);	/* NOTE! _exit, not exit() */
}

執行setup函式對根檔案系統初始化後,程序1又打開了/dev/tty0字元檔案,之後呼叫fork函式建立程序2。

fork對映到sys_fork,執行過程和之前是一樣的,會為程序2的task_struct以及核心棧申請頁面,並複製task_struct,隨後對程序2的task_struct進行各種個性化設定,包括各個暫存器的設定、記憶體頁面的管理設定、共享檔案的設定、GDT表項的設定等。不太一樣的是程序2複製了程序1的1024個頁表項。

程序2 shell

程序2建立完畢後,fork()函式返回,返回值為2,因此!(pid=fork())為假,於是呼叫wait()函式。此函式的功能是:如果程序1有等待退出的子程序,就為該程序的退出做善後工作;如果有子程序,但並不等待退出,則進行程序切換;如果沒有子程序,函式返回。

//程式碼路徑:/lib/wait.c

/* waitpid() is generated as a three-argument system call stub. */
_syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options)

/*
 * wait() - wait for any child: waitpid() with pid -1 and no options.
 * The child's status is stored through wait_stat.
 */
pid_t wait(int * wait_stat)
{
	return waitpid(-1,wait_stat,0);
}

//程式碼路徑:kernel/exit.c
/*
 * sys_waitpid() - handler of the waitpid system call.
 *
 * Scans the task table for children of the current task selected by pid:
 *   pid > 0   the child with exactly that pid
 *   pid == 0  children in the caller's process group
 *   pid == -1 any child
 *   pid < -1  children in process group -pid
 * A stopped child (only with WUNTRACED) or a zombie child is reported at
 * once: its status is written to *stat_addr and its pid is returned; a zombie
 * is also released and its times are added to the caller's cutime/cstime.
 * If matching children exist but none is ready, the caller sleeps
 * interruptibly (or gets 0 with WNOHANG) and then rescans; any signal other
 * than SIGCHLD pending after the sleep yields -EINTR.  Without any matching
 * child the result is -ECHILD.
 */
int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options)
{
	int flag, code;
	struct task_struct ** p;

	verify_area(stat_addr,4);
repeat:
	flag=0;
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) {
		if (!*p || *p == current)
			continue;
		if ((*p)->father != current->pid)	/* not a child of the caller */
			continue;
		if (pid>0) {						/* wait() passes pid == -1, so this branch is not taken for it */
			if ((*p)->pid != pid)
				continue;
		} else if (!pid) {
			if ((*p)->pgrp != current->pgrp)
				continue;
		} else if (pid != -1) {
			if ((*p)->pgrp != -pid)
				continue;
		}
		switch ((*p)->state) {
			case TASK_STOPPED:
				if (!(options & WUNTRACED))
					continue;
				put_fs_long(0x7f,stat_addr);
				return (*p)->pid;
			case TASK_ZOMBIE:
				current->cutime += (*p)->utime;
				current->cstime += (*p)->stime;
				flag = (*p)->pid;
				code = (*p)->exit_code;
				release(*p);
				put_fs_long(code,stat_addr);
				return flag;
			default:					/* e.g. process 2, which is runnable at this point, ends up here */
				flag=1;
				continue;
		}
	}
	if (flag) {
		if (options & WNOHANG)
			return 0;
		current->state=TASK_INTERRUPTIBLE;		/* put the caller (process 1) into interruptible sleep */
		schedule();								/* switch away; process 2 gets to run */
		if (!(current->signal &= ~(1<<(SIGCHLD-1))))
			goto repeat;
		else
			return -EINTR;
	}
	return -ECHILD;
}

輪轉到程序2執行,程序2關閉檔案後開啟檔案/etc/rc

	/* excerpt of init(): the branch executed by process 2 */
	if (!(pid=fork())) {
		close(0);				/* free descriptor 0 ... */
		if (open("/etc/rc",O_RDONLY,0))		/* ... so that /etc/rc is opened as descriptor 0 */
			_exit(1);
		execve("/bin/sh",argv_rc,envp_rc);
		_exit(2);				/* only reached when execve fails */
	}

之後執行execve函式,進入核心對應_sys_execve函式

_sys_execve:			# handler of the execve system call
	lea EIP(%esp),%eax	# eax = address of the saved user eip on the kernel stack
	pushl %eax		# passed to do_execve as its first argument (eip)
	call _do_execve
	addl $4,%esp		# drop that argument
	ret
/* Number of pages reserved for the argument and environment strings of a new program. */
#define MAX_ARG_PAGES 32
/*
 * 'do_execve()' executes a new program.
 *
 * eip points at the saved user-mode return frame on the kernel stack: eip[0]
 * is the saved eip, eip[1] the saved cs and eip[3] the saved esp.  filename,
 * argv and envp are the arguments of the execve call; tmp is not used.
 *
 * Returns 0 on success - the saved eip/esp then refer to the new program -
 * or a negative error code, in which case the pages collected in page[] and
 * the inode reference are released.
 */
int do_execve(unsigned long * eip,long tmp,char * filename,
	char ** argv, char ** envp)
{
	struct m_inode * inode;
	struct buffer_head * bh;
	struct exec ex;
	unsigned long page[MAX_ARG_PAGES];
	int i,argc,envc;
	int e_uid, e_gid;
	int retval;
	int sh_bang = 0;
	unsigned long p=PAGE_SIZE*MAX_ARG_PAGES-4;

	/* the saved cs must be the user code selector */
	if ((0xffff & eip[1]) != 0x000f)
		panic("execve called from supervisor mode");
	for (i=0 ; i<MAX_ARG_PAGES ; i++)	/* clear page-table */
		page[i]=0;
	if (!(inode=namei(filename)))		/* get executables inode */
		return -ENOENT;
	argc = count(argv);
	envc = count(envp);
	
restart_interp:
	if (!S_ISREG(inode->i_mode)) {	/* must be regular file */
		retval = -EACCES;
		goto exec_error2;
	}
	i = inode->i_mode;					/* file mode: selects the effective ids and the execute bit to test */
	/* set-uid / set-gid files run with the file's owner / group as effective id */
	e_uid = (i & S_ISUID) ? inode->i_uid : current->euid;
	e_gid = (i & S_ISGID) ? inode->i_gid : current->egid;
	/* shift the owner or group permission bits down into the low bits */
	if (current->euid == inode->i_uid)
		i >>= 6;
	else if (current->egid == inode->i_gid)
		i >>= 3;
	/* need the execute bit, unless some execute bit is set and suser() allows it */
	if (!(i & 1) &&
	    !((inode->i_mode & 0111) && suser())) {
		retval = -ENOEXEC;
		goto exec_error2;
	}
	if (!(bh = bread(inode->i_dev,inode->i_zone[0]))) {
		retval = -EACCES;
		goto exec_error2;
	}
	ex = *((struct exec *) bh->b_data);	/* read exec-header */
	if ((bh->b_data[0] == '#') && (bh->b_data[1] == '!') && (!sh_bang)) {
		/*
		 * This section does the #! interpretation.
		 * Sorta complicated, but hopefully it will work.  -TYT
		 */

		char buf[1023], *cp, *interp, *i_name, *i_arg;
		unsigned long old_fs;

		strncpy(buf, bh->b_data+2, 1022);
		brelse(bh);
		iput(inode);
		buf[1022] = '\0';
		/* keep only the first line and skip leading blanks */
		if (cp = strchr(buf, '\n')) {
			*cp = '\0';
			for (cp = buf; (*cp == ' ') || (*cp == '\t'); cp++);
		}
		if (!cp || *cp == '\0') {
			retval = -ENOEXEC; /* No interpreter name found */
			goto exec_error1;
		}
		interp = i_name = cp;
		i_arg = 0;
		/* i_name ends up pointing just past the last '/' of the interpreter path */
		for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++) {
 			if (*cp == '/')
				i_name = cp+1;
		}
		if (*cp) {
			*cp++ = '\0';
			i_arg = cp;
		}
		/*
		 * OK, we've parsed out the interpreter name and
		 * (optional) argument.
		 */
		if (sh_bang++ == 0) {
			p = copy_strings(envc, envp, page, p, 0);
			p = copy_strings(--argc, argv+1, page, p, 0);
		}
		/*
		 * Splice in (1) the interpreter's name for argv[0]
		 *           (2) (optional) argument to interpreter
		 *           (3) filename of shell script
		 *
		 * This is done in reverse order, because of how the
		 * user environment and arguments are stored.
		 */
		p = copy_strings(1, &filename, page, p, 1);
		argc++;
		if (i_arg) {
			p = copy_strings(1, &i_arg, page, p, 2);
			argc++;
		}
		p = copy_strings(1, &i_name, page, p, 2);
		argc++;
		if (!p) {
			retval = -ENOMEM;
			goto exec_error1;
		}
		/*
		 * OK, now restart the process with the interpreter's inode.
		 */
		old_fs = get_fs();
		set_fs(get_ds());
		if (!(inode=namei(interp))) { /* get executables inode */
			set_fs(old_fs);
			retval = -ENOENT;
			goto exec_error1;
		}
		set_fs(old_fs);
		goto restart_interp;
	}
	brelse(bh);		/* the header is already copied into ex; the checks below validate it */
	if (N_MAGIC(ex) != ZMAGIC || ex.a_trsize || ex.a_drsize ||
		ex.a_text+ex.a_data+ex.a_bss>0x3000000 ||
		inode->i_size < ex.a_text+ex.a_data+ex.a_syms+N_TXTOFF(ex)) {
		retval = -ENOEXEC;
		goto exec_error2;
	}
	if (N_TXTOFF(ex) != BLOCK_SIZE) {
		printk("%s: N_TXTOFF != BLOCK_SIZE. See a.out.h.", filename);
		retval = -ENOEXEC;
		goto exec_error2;
	}
	if (!sh_bang) {
		p = copy_strings(envc,envp,page,p,0);		/* allocates pages and copies the environment and argument strings into them; */
		p = copy_strings(argc,argv,page,p,0);		/* page[] records the physical addresses of the allocated pages */
		if (!p) {
			retval = -ENOMEM;
			goto exec_error2;
		}
	}
/* OK, This is the point of no return */
	if (current->executable)
		iput(current->executable);			/* drop the old executable's inode reference; skipped for process 2, whose inherited executable is NULL */
	current->executable = inode;
	for (i=0 ; i<32 ; i++)
		current->sigaction[i].sa_handler = NULL;
	for (i=0 ; i<NR_OPEN ; i++)
		if ((current->close_on_exec>>i)&1)		/* close every descriptor flagged in close_on_exec */
			sys_close(i);
	current->close_on_exec = 0;
	/*
	 * Key point: the executable is not read into memory and mapped here.
	 * The old page tables are simply released; running the new program
	 * then causes page faults, which load it page by page.
	 */
	free_page_tables(get_base(current->ldt[1]),get_limit(0x0f));
	free_page_tables(get_base(current->ldt[2]),get_limit(0x17));
	if (last_task_used_math == current)
		last_task_used_math = NULL;
	current->used_math = 0;
	/* NOTE(review): per the original annotation, change_ldt sets the data limit to 0x4000000 and maps the argument pages at the top of that range - confirm against change_ldt */
	p += change_ldt(ex.a_text,page)-MAX_ARG_PAGES*PAGE_SIZE;
	/* parses the env- and arg-strings in new user memory, creates the pointer tables from them, puts their addresses on the "stack" and returns the new stack pointer value */
	p = (unsigned long) create_tables((char *)p,argc,envc);
	current->brk = ex.a_bss +
		(current->end_data = ex.a_data +
		(current->end_code = ex.a_text));
	current->start_stack = p & 0xfffff000;						/* page containing the initial stack pointer */
	current->euid = e_uid;
	current->egid = e_gid;
	/* zero the bytes from the end of text+data up to the next page boundary */
	i = ex.a_text+ex.a_data;
	while (i&0xfff)
		put_fs_byte(0,(char *) (i++));
	/* overwrite the saved return frame on the kernel stack: iret will start the new program */
	eip[0] = ex.a_entry;		/* eip, magic happens :-) */
	eip[3] = p;			/* stack pointer */
	return 0;
exec_error2:
	iput(inode);
exec_error1:
	for (i=0 ; i<MAX_ARG_PAGES ; i++)
		free_page(page[i]);
	return(retval);
}

shell程式開始執行時,其線性地址空間對應的程式內容並未載入,也就不存在相應的頁面,因此會產生一個“頁異常”中斷,此中斷會進一步呼叫“缺頁中斷”處理程式來分配該頁面,並載入一頁shell程式。

//程式碼路徑:mm/page.s
.globl _page_fault

_page_fault:			# page-fault exception entry
	xchgl %eax,(%esp)			# eax = error code; the old eax is saved in its place
	pushl %ecx
	pushl %edx
	push %ds
	push %es
	push %fs
	movl $0x10,%edx				# ds, es and fs all use selector 0x10 here
	mov %dx,%ds
	mov %dx,%es
	mov %dx,%fs
	movl %cr2,%edx				# faulting address
	pushl %edx					# argument: address
	pushl %eax					# argument: error code
	testl $1,%eax				# bit 0 of the error code (P)
	jne 1f						# P != 0: handled by do_wp_page
	call _do_no_page			# P == 0: the page is not present
	jmp 2f
1:	call _do_wp_page
2:	addl $8,%esp				# drop the two arguments
	pop %fs
	pop %es
	pop %ds
	popl %edx
	popl %ecx
	popl %eax
	iret

通過檢測error code後呼叫do_no_page()函式,先確定缺頁的原因。假如是由於需要載入程式才缺頁,會嘗試與其他程序共享shell(顯然此前沒有程序載入過shell,無法共享),於是申請一個頁面,呼叫bread_page()函式,從虛擬盤上讀取4塊(4KB,一頁)shell程式內容,載入記憶體頁面。

/*
 * do_no_page() - handle a fault on a page that is not present.
 *
 * error_code: error code handed over by the fault entry code (not used here).
 * address:    faulting address; rounded down to its page boundary.
 *
 * If the task has no executable, or the page lies at or beyond end_data,
 * get_empty_page() is called for the address. Otherwise share_page() is
 * tried first; if that fails a free page is taken, four blocks (one page)
 * of the executable are read into it, the part past end_data is zeroed and
 * the page is mapped with put_page().
 */
void do_no_page(unsigned long error_code,unsigned long address)
{
	int blocks[4];
	unsigned long offset;
	unsigned long page;
	int first_block, k, pad;
	char *tail;

	address &= 0xfffff000;
	offset = address - current->start_code;

	/* nothing to load from disk for this address */
	if (!current->executable || offset >= current->end_data) {
		get_empty_page(address);
		return;
	}

	if (share_page(offset))
		return;

	page = get_free_page();
	if (!page)
		oom();

	/* remember that 1 block is used for header */
	first_block = 1 + offset/BLOCK_SIZE;
	for (k = 0; k < 4; k++)
		blocks[k] = bmap(current->executable, first_block + k);
	/* read four logical blocks (one page) of the program into the page */
	bread_page(page, current->executable->i_dev, blocks);

	/* zero the bytes of this page that lie past end_data, back to front */
	pad = offset + 4096 - current->end_data;
	tail = (char *) (page + 4096);
	while (pad-- > 0)
		*--tail = 0;

	/* update the page table to establish the mapping */
	if (!put_page(page, address)) {
		free_page(page);
		oom();
	}
}

載入一頁的shell程式後,核心會將該頁內容對映到shell程序的線性地址空間內,建立頁目錄表->頁表->頁面的三級對映管理關係。

程序3 update

之後程序2讀取rc檔案上的資訊,fork出了update程序,這個新程序的程序號為3。update程序有一項很重要的任務:將緩衝區中的資料同步到外設(軟盤、硬碟)上。由於主機與外設的資料交換速度遠低於主機內部的資料處理速度,因此,當核心需要往外設上寫資料的時候,為了提高系統的整體執行效率,並不把資料直接寫入到外設上,而是先寫入緩衝區,之後根據實際情況,再將資料從緩衝區同步到外設。

每隔一段時間,update程序就會被喚醒,把資料往外設上同步一次,之後這個程序會被掛起,即被設定為可中斷等待狀態,等待著下一次被喚醒後繼續執行,如此周而復始。

update程序執行後,沒有同步任務,於是該程序被掛起,系統執行程序排程,最終切換到shell程序繼續執行。

shell程序(程序2)完成工作後呼叫exit()函式,對應的系統呼叫函式為sys_exit(),執行程式碼如下:

/*
 * sys_exit() - exit() system-call entry.
 *
 * Keeps only the low 8 bits of error_code, shifts them into bits 8-15
 * and hands the resulting status to do_exit(), whose result is returned.
 */
int sys_exit(int error_code)
{
	int status = (error_code & 0xff) << 8;

	return do_exit(status);
}

int do_exit(long code)
{
	int i;

	free_page_tables(get_base(current->ldt[1]),get_limit(0x0f));
	free_page_tables(get_base(current->ldt[2]),get_limit(0x17));
	for (i=0 ; i<NR_TASKS ; i++)
		if (task[i] && task[i]->father == current->pid) {			//尋找子程序
			task[i]->father = 1;									//將子程序父程序設定為1
			if (task[i]->state == TASK_ZOMBIE)
				/* assumption task[1] is always init */
				(void) send_sig(SIGCHLD, task[1], 1);				//如果子程序為zombie,即已經呼叫了exit函式,給程序1傳送訊號。
		}
	for (i=0 ; i<NR_OPEN ; i++)
		if (current->filp[i])
			sys_close(i);
	iput(current->pwd);
	current->pwd=NULL;
	iput(current->root);
	current->root=NULL;
	iput(current->executable);
	current->executable=NULL;
	if (current->leader && current->tty >= 0)
		tty_table[current->tty].pgrp = 0;
	if (last_task_used_math == current)
		last_task_used_math = NULL;
	if (current->leader)
		kill_session();
	current->state = TASK_ZOMBIE;
	current->exit_code = code;
	tell_father(current->father);
	schedule();
	return (-1);	/* just to suppress warnings */
}

值得注意的是tell_father()和schedule()函式的執行,tell_father向父程序傳送SIGCHLD訊號

/*
 * tell_father() - raise SIGCHLD on the task whose pid equals 'pid'.
 *
 * pid: pid of the father to notify.
 *
 * When pid is 0, or no task with that pid exists, a diagnostic is
 * printed and the current task is released instead.
 */
static void tell_father(int pid)
{
	int slot;

	if (pid != 0) {
		for (slot = 0; slot < NR_TASKS; slot++) {
			if (task[slot] && task[slot]->pid == pid) {
				task[slot]->signal |= (1 << (SIGCHLD - 1));
				return;
			}
		}
	}
/* if we don't find any fathers, we just release ourselves */
/* This is not really OK. Must change it to make father 1 */
	printk("BAD BAD - no father found\n\r");
	release(current);
}

tell_father()函式執行完畢後,呼叫schedule()函式準備程序切換。此次schedule()函式中對訊號的檢測,影響了程序切換

/*
 *  'schedule()' is the scheduler function. This is GOOD CODE! There
 * probably won't be any reason to change this, as it should work well
 * in all circumstances (ie gives IO-bound processes good response etc).
 * The one thing you might take a look at is the signal-handler code here.
 *
 *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other
 * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 * information in task[0] is never used.
 *
 * Two phases:
 *  1. Walk the slots from LAST_TASK down to (but not including)
 *     FIRST_TASK: turn expired alarms into SIGALRM and make any
 *     interruptible task with a deliverable signal runnable.
 *  2. Pick the TASK_RUNNING task with the largest counter (slot 0 is
 *     never examined; it is the default choice) and switch to it. If
 *     every runnable task has counter 0, recompute all counters and
 *     search again.
 */
void schedule(void)
{
	int i,next,c;
	struct task_struct ** p;

/* check alarm, wake up any interruptible tasks that have got a signal */

	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
		if (*p) {
			/* alarm set and already in the past: raise SIGALRM, disarm it */
			if ((*p)->alarm && (*p)->alarm < jiffies) {
					(*p)->signal |= (1<<(SIGALRM-1));
					(*p)->alarm = 0;
				}
			/* a pending signal that is not (blockable and blocked) wakes
			 * a task sleeping in TASK_INTERRUPTIBLE */
			if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
			(*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;			// in the article's walkthrough, this is where process 1 becomes TASK_RUNNING
		}

/* this is the scheduler proper: */

	while (1) {
		c = -1;		/* best counter seen so far; -1 = no runnable task found */
		next = 0;	/* falls back to task 0 when nothing else is runnable */
		i = NR_TASKS;
		p = &task[NR_TASKS];
		/* scans slots NR_TASKS-1 .. 1; the loop ends before slot 0 */
		while (--i) {
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		}
		/* c != 0: either a task with time left was found (c > 0) or
		 * nothing is runnable (c == -1); in both cases stop searching */
		if (c) break;
		/* c == 0: all runnable tasks used up their counters; refill
		 * every task: counter = counter/2 + priority */
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}
	switch_to(next);
}

將程序1設定為task_running後,後面排程到程序1執行,此時task1還在schedule函式中,執行完畢後繼續執行sys_waitpid()函式

/*
 * sys_waitpid() - wait for a child of the current task to stop or exit.
 *
 * pid:       > 0  only the child with that pid
 *            == 0 children whose pgrp equals the caller's pgrp
 *            == -1 any child
 *            < -1 children whose pgrp equals -pid
 * stat_addr: user address that receives the status word
 * options:   WUNTRACED also reports stopped children; WNOHANG returns 0
 *            instead of sleeping
 *
 * Returns the pid of the reported child, 0 (WNOHANG with matching children
 * that are neither stopped nor zombie), -EINTR when a signal other than
 * SIGCHLD is pending after sleeping, or -ECHILD when no child matches.
 */
int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options)
{
	int flag, code;
	struct task_struct ** p;

	/* NOTE(review): presumably checks/prepares 4 writable bytes at stat_addr - confirm */
	verify_area(stat_addr,4);
repeat:
	flag=0;		/* set to 1 once a matching child that is still alive is seen */
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) {
		/* skip empty slots and the caller itself */
		if (!*p || *p == current)
			continue;
		/* only children of the caller are of interest */
		if ((*p)->father != current->pid)
			continue;
		/* apply the pid selector described in the header comment */
		if (pid>0) {
			if ((*p)->pid != pid)
				continue;
		} else if (!pid) {
			if ((*p)->pgrp != current->pgrp)
				continue;
		} else if (pid != -1) {
			if ((*p)->pgrp != -pid)
				continue;
		}
		switch ((*p)->state) {
			case TASK_STOPPED:
				/* stopped children are reported only with WUNTRACED;
				 * 'continue' here advances the enclosing for loop */
				if (!(options & WUNTRACED))
					continue;
				put_fs_long(0x7f,stat_addr);
				return (*p)->pid;
			case TASK_ZOMBIE:					// found a child whose state is TASK_ZOMBIE
				/* add the child's user/system times to the caller's child totals */
				current->cutime += (*p)->utime;
				current->cstime += (*p)->stime;
				flag = (*p)->pid;				// remember the child's pid (process 2 in the article's walkthrough)
				code = (*p)->exit_code;
				release(*p);					// release the child's task_struct page
				put_fs_long(code,stat_addr);
				return flag;
			default:
				flag=1;
				continue;
		}
	}
	if (flag) {
		if (options & WNOHANG)
			return 0;
		/* sleep until a signal makes schedule() mark this task runnable again */
		current->state=TASK_INTERRUPTIBLE;
		schedule();
		// clear the SIGCHLD bit; if no other signal remains pending, rescan the children (goto repeat)
		if (!(current->signal &= ~(1<<(SIGCHLD-1))))
			goto repeat;
		else
			return -EINTR;
	}
	return -ECHILD;
}

sys_waitpid()函式執行完畢後,會回到wait()函式,最後返回到init()函式中,程序1繼續執行。

/*
 * init() - body of the init task.
 *
 * Calls setup(), opens /dev/tty0 as descriptor 0 and duplicates it twice,
 * prints buffer and memory statistics, then runs /bin/sh once with /etc/rc
 * as its standard input. Afterwards it loops forever: fork a child that
 * opens /dev/tty0 in a new session and executes /bin/sh, wait until that
 * child has died, report it and sync.
 */
void init(void)
{
	int child, status;

	setup((void *) &drive_info);
	(void) open("/dev/tty0", O_RDWR, 0);
	(void) dup(0);
	(void) dup(0);
	printf("%d buffers = %d bytes buffer space\n\r", NR_BUFFERS,
		NR_BUFFERS*BLOCK_SIZE);
	printf("Free mem: %d bytes\n\r", memory_end-main_memory_start);

	child = fork();
	if (child == 0) {
		/* child: make /etc/rc descriptor 0, then become the shell */
		close(0);
		if (open("/etc/rc", O_RDONLY, 0) != 0)
			_exit(1);
		execve("/bin/sh", argv_rc, envp_rc);
		_exit(2);		/* only reached if execve() returns */
	}
	if (child > 0) {
		/* wait until that particular child is reaped (in the article's
		 * walkthrough wait() returns 2 here and the loop ends) */
		while (wait(&status) != child)
			/* nothing */;
	}

	for (;;) {
		child = fork();
		if (child < 0) {
			printf("Fork failed in init\r\n");
			continue;
		}
		if (child == 0) {
			/* child: per the article's walkthrough this is pid 4, although
			 * it sits in task slot 2, because the earlier shell had
			 * forked the update process (pid 3) */
			close(0);
			close(1);
			close(2);
			setsid();
			(void) open("/dev/tty0", O_RDWR, 0);
			(void) dup(0);
			(void) dup(0);
			_exit(execve("/bin/sh", argv, envp));
		}
		/* keep calling wait() until it returns the pid of the shell
		 * just started */
		do {
			/* nothing */
		} while (wait(&status) != child);
		printf("\n\rchild %d died with code %04x\n\r", child, status);
		sync();
	}
	_exit(0);	/* NOTE! _exit, not exit() */
}

程序4 shell

這次shell開啟的是標準輸入裝置檔案tty0而不是rc,這使得shell開始執行後,不再退出。進入rw_char()函式後,shell程序將被設定為可中斷等待狀態,這樣所有的程序都處於可中斷等待狀態,再次切換到程序0去執行,系統實現怠速。

怠速以後,作業系統使用者將通過shell程序提供的平臺與計算機進行互動,shell程序處理使用者指令的工作原理如下:使用者通過鍵盤輸入資訊,儲存在指定的字元緩衝佇列上。該緩衝佇列上的內容,就是tty0檔案的內容,shell程序會不斷讀取緩衝佇列上的資料資訊。如果使用者沒有下達指令,緩衝佇列就不會有資料,shell程序將會被設定為可中斷等待狀態,即被掛起。如果使用者通過鍵盤下達指令,將產生鍵盤中斷,中斷程式會將字元資訊儲存在緩衝佇列上,並給shell程序發訊號,訊號將導致shell程序被設定為就緒狀態,即被喚醒,喚醒後的shell繼續從緩衝佇列中讀取資料資訊並處理,完畢後,shell程序將再次被掛起,等待下一次鍵盤中斷被喚醒。