安大大 + Original work; please credit the source when reposting + Linux Operating System Analysis (《Linux操作系统分析》) MOOC course
The process control block (PCB): task_struct
To manage processes, the kernel must describe each one clearly; the process descriptor provides the information about a process that the kernel needs to know.
- The struct task_struct data structure is very large.
- The states of a Linux process do not quite match the process states described in operating-system theory; for example, both the ready state and the running state are TASK_RUNNING. Why?
- The process identifier pid.
- The list of all processes: struct list_head tasks;
- How the kernel implements its doubly linked circular list; a simplified doubly linked circular list is sketched right after this list.
- Processes created by a program form parent-child relationships, which programs often need to refer to. Several fields of the process descriptor represent these relationships.
- Linux allocates an 8 KB memory area for each process, holding two different data structures: the thread_info and the process's kernel stack.
- The kernel stack is used while the process runs in kernel mode and is distinct from the user-mode stack. The PCB records the kernel stack, so why does it not record a user-mode stack? How is the user-mode stack set up?
- Kernel control paths use very little stack space, so 8 KB is enough for both the stack and the thread_info.
- struct thread_struct thread; //CPU-specific state of this task
- File system and file descriptors
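As a side note on the bullets above about struct list_head: the kernel embeds a bare prev/next pair in the containing structure and uses a container_of-style macro to get back to it, and that design is easy to mimic in user space. Below is a minimal, self-contained sketch; the toy_task type and the helper names are made up for illustration and are not the kernel's own definitions:

#include <stdio.h>
#include <stddef.h>

/* Simplified imitation of the kernel's struct list_head. */
struct list_head {
    struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

/* Insert item just before head, i.e. at the tail of the circular list. */
static void list_add_tail(struct list_head *item, struct list_head *head)
{
    item->prev = head->prev;
    item->next = head;
    head->prev->next = item;
    head->prev = item;
}

/* list_entry: recover the enclosing structure from a pointer to its list field. */
#define list_entry(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

/* Toy stand-in for task_struct: only a pid and the tasks linkage. */
struct toy_task {
    int pid;
    struct list_head tasks;
};

int main(void)
{
    struct list_head init_task = LIST_HEAD_INIT(init_task);
    struct toy_task a = { .pid = 1 }, b = { .pid = 2 }, c = { .pid = 3 };

    list_add_tail(&a.tasks, &init_task);
    list_add_tail(&b.tasks, &init_task);
    list_add_tail(&c.tasks, &init_task);

    /* Walk the circular list, the way the kernel walks init_task's tasks list. */
    for (struct list_head *pos = init_task.next; pos != &init_task; pos = pos->next)
        printf("pid %d\n", list_entry(pos, struct toy_task, tasks)->pid);
    return 0;
}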
Memory management: the process address space
References - ProgramAndProcess
Describing a process
An operating system has three major functions: process management, memory management, and the file system. The most central one is process management.
The process descriptor: the task_struct data structure
My notes on the structure are in the code comments below.
struct task_struct {
volatile long state; /* run state: -1 unrunnable, 0 runnable, >0 stopped */
void *stack; // points to the process's kernel stack
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;
#ifdef CONFIG_SMP
// conditional compilation: fields used only on SMP (multiprocessor) systems
struct llist_node wake_entry;
int on_cpu;
struct task_struct *last_wakee;
unsigned long wakee_flips;
unsigned long wakee_flip_decay_ts;
int wake_cpu;
#endif
// the following fields are related to priority and scheduling; different scheduling classes and flags are used in different situations
int on_rq; // whether the task is on a run queue
int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
struct sched_dl_entity dl;
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
union rcu_special rcu_read_unlock_special;
struct list_head rcu_node_entry;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
unsigned long rcu_tasks_nvcsw;
bool rcu_tasks_holdout;
struct list_head rcu_tasks_holdout_list;
int rcu_tasks_idle_cpu;
#endif /* #ifdef CONFIG_TASKS_RCU */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
#endif
struct list_head tasks; // links every process in the system into one doubly linked circular list
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
#endif
struct mm_struct *mm, *active_mm; // memory management: the process address space; each process's code and data segments are described through these
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
/* per-thread vma caching */
u32 vmacache_seqnum;
struct vm_area_struct *vmacache[VMACACHE_SIZE];
#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
/* task state */
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
unsigned int jobctl; /* JOBCTL_*, siglock protected */
/* Used for emulating ABI behavior of previous Linux versions */
unsigned int personality;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
unsigned in_iowait:1;
/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned long atomic_flags; /* Flags needing atomic access. */
pid_t pid; // the process id, identifying this process
pid_t tgid;
#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
#endif
/* the fields below manage the parent/child relationships of the process; all are linked through doubly linked lists
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/
struct task_struct __rcu *real_parent; /* real parent process */
struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
/*
* children/sibling forms the list of my natural children
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */
/*
* ptraced is the list of tasks this task is using ptrace on.
* This includes both natural children and PTRACE_ATTACH targets.
* p->ptrace_entry is p's link on the p->parent->ptraced list.
*/
struct list_head ptraced; // used for debugging (ptrace)
struct list_head ptrace_entry;
/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX]; // PID hash table linkage, for fast lookup
struct list_head thread_group;
struct list_head thread_node;
struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
cputime_t utime, stime, utimescaled, stimescaled; // time-related fields follow
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
struct cputime prev_cputime;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_t vtime_seqlock;
unsigned long long vtime_snap;
enum {
VTIME_SLEEPING = 0,
VTIME_USER,
VTIME_SYS,
} vtime_snap_whence;
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
u64 start_time; /* monotonic time in nsec */
u64 real_start_time; /* boot based time in nsec */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
/* process credentials */
const struct cred __rcu *real_cred; /* objective and real subjective task
* credentials (COW) */
const struct cred __rcu *cred; /* effective (overridable) subjective task
* credentials (COW) */
char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
it with task_lock())
- initialized normally by setup_new_exec */
/* file system info */
int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
unsigned long last_switch_count;
#endif
/* CPU-specific state of this task; it plays a key role during a process context switch */
struct thread_struct thread;
/* filesystem information */
struct fs_struct *fs;
/* open file information (the list of open file descriptors) */
struct files_struct *files;
/* namespaces */
struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;
sigset_t blocked, real_blocked;
sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
struct sigpending pending;
unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;
struct callback_head *task_works;
struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
kuid_t loginuid;
unsigned int sessionid;
#endif
struct seccomp seccomp;
/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
* mempolicy */
spinlock_t alloc_lock;
/* Protection of the PI data structures: */
raw_spinlock_t pi_lock;
#ifdef CONFIG_RT_MUTEXES // RT mutexes (priority inheritance)
/* PI waiters blocked on a rt_mutex held by this task */
struct rb_root pi_waiters;
struct rb_node *pi_waiters_leftmost;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS // debugging-related
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
gfp_t lockdep_reclaim_gfp;
#endif
/* journalling filesystem info */
void *journal_info;
/* stacked block device info */
struct bio_list *bio_list;
#ifdef CONFIG_BLOCK
/* stack plugging */
struct blk_plug *plug;
#endif
/* VM state */
struct reclaim_state *reclaim_state;
struct backing_dev_info *backing_dev_info;
struct io_context *io_context;
unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed; /* Protected by alloc_lock */
seqcount_t mems_allowed_seq; /* Seqence no to catch updates */
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock */
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy; /* Protected by alloc_lock */
short il_next;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
u64 node_stamp; /* migration stamp */
u64 last_task_numa_placement;
u64 last_sum_exec_runtime;
struct callback_head numa_work;
struct list_head numa_entry;
struct numa_group *numa_group;
/*
* Exponential decaying average of faults on a per-node basis.
* Scheduling placement decisions are made based on the these counts.
* The values remain static for the duration of a PTE scan
*/
unsigned long *numa_faults_memory;
unsigned long total_numa_faults;
/*
* numa_faults_buffer records faults per node during the current
* scan window. When the scan completes, the counts in
* numa_faults_memory decay and these values are copied.
*/
unsigned long *numa_faults_buffer_memory;
/*
* Track the nodes the process was running on when a NUMA hinting
* fault was incurred.
*/
unsigned long *numa_faults_cpu;
unsigned long *numa_faults_buffer_cpu;
/*
* numa_faults_locality tracks if faults recorded during the last
* scan window were remote/local. The task scan period is adapted
* based on the locality of the faults with different weights
* depending on whether they were shared or private faults
*/
unsigned long numa_faults_locality[2];
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
struct rcu_head rcu;
/*
* cache last used pipe for splice (pipe-related)
*/
struct pipe_inode_info *splice_pipe;
struct page_frag task_frag;
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
/*
* when (nr_dirtied >= nr_dirtied_pause), it's time to call
* balance_dirty_pages() for some dirty throttling pause
*/
int nr_dirtied;
int nr_dirtied_pause;
unsigned long dirty_paused_when; /* start of a write-and-pause period */
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
/*
* time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
unsigned long timer_slack_ns;
unsigned long default_timer_slack_ns;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack */
int curr_ret_stack;
/* Stack of return addresses for return function tracing */
struct ftrace_ret_stack *ret_stack;
/* time stamp for last schedule */
unsigned long long ftrace_timestamp;
/*
* Number of functions that haven't been traced
* because of depth overrun.
*/
atomic_t trace_overrun;
/* Pause for the tracing */
atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
/* state flags for use by tracers */
unsigned long trace;
/* bitmask and counter of trace recursion */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
unsigned int memcg_kmem_skip_account;
struct memcg_oom_info {
struct mem_cgroup *memcg;
gfp_t gfp_mask;
int order;
unsigned int may_oom:1;
} memcg_oom;
#endif
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
};
The CPU-specific state of the current task plays a key role during a process context switch; it contains sp, ip, and other CPU-related state:
struct thread_struct {
/* Cached TLS descriptors: */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
unsigned long sp0;
unsigned long sp;
#ifdef CONFIG_X86_32
unsigned long sysenter_cs;
#else
unsigned long usersp; /* Copy from PDA */
unsigned short es;
unsigned short ds;
unsigned short fsindex;
unsigned short gsindex;
#endif
#ifdef CONFIG_X86_32
unsigned long ip;
#endif
#ifdef CONFIG_X86_64
unsigned long fs;
#endif
unsigned long gs;
/* Save middle states of ptrace breakpoints */
struct perf_event *ptrace_bps[HBP_NUM];
/* Debug status used for traps, single steps, etc... */
unsigned long debugreg6;
/* Keep track of the exact dr7 value set by the user */
unsigned long ptrace_dr7;
/* Fault info: */
unsigned long cr2;
unsigned long trap_nr;
unsigned long error_code;
/* floating point and extended processor state */
struct fpu fpu;
#ifdef CONFIG_X86_32
/* Virtual 86 mode info */
struct vm86_struct __user *vm86_info;
unsigned long screen_bitmap;
unsigned long v86flags;
unsigned long v86mask;
unsigned long saved_sp0;
unsigned int saved_fs;
unsigned int saved_gs;
#endif
/* IO permissions: */
unsigned long *io_bitmap_ptr;
unsigned long iopl;
/* Max allowed port in the bitmap, in bytes: */
unsigned io_bitmap_max;
/*
* fpu_counter contains the number of consecutive context switches
* that the FPU is used. If this is over a threshold, the lazy fpu
* saving becomes unlazy to save the trap. This is an unsigned char
* so that after 256 times the counter wraps and the behavior turns
* lazy again; this to deal with bursty apps that only use FPU for
* a short time
*/
unsigned char fpu_counter;
};
Process creation
Code that forks a child process. In MenuOS, fork is actually issued as sys_clone, but in the end both paths call do_fork.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(int argc, char * argv[])
{
    int pid;
    /* fork another process */
    pid = fork();
    if (pid < 0)
    {
        /* error occurred */
        fprintf(stderr, "Fork Failed!");
        exit(-1);
    }
    else if (pid == 0)
    {
        /* child process */
        printf("This is Child Process!\n");
    }
    else
    {
        /* parent process */
        printf("This is Parent Process!\n");
        /* parent will wait for the child to complete */
        wait(NULL);
        printf("Child Complete!\n");
    }
    return 0;
}
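One thing worth noting about this example: the parent's wait(NULL) is what guarantees that "Child Complete!" is printed only after the child has exited; without it, the parent and child are scheduled independently once fork returns, and their output order is not deterministic. fork's return value is the only thing distinguishing the two flows: 0 in the child, the child's pid in the parent.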
How a new process is created inside the kernel
- The fork, vfork, and clone system calls can all create a new process, and all three create it by calling do_fork;
- Linux creates a new process by copying the parent process, which gives us a mental framework for understanding the whole procedure:
- Copy a PCB (task_struct):
err = arch_dup_task_struct(tsk, orig); // copy the parent's task_struct
- Allocate a new kernel stack for the new process:
ti = alloc_thread_info_node(tsk, node);
tsk->stack = ti; // attach the newly allocated kernel stack
setup_thread_stack(tsk, orig); // this only copies the thread_info, not the kernel stack itself
- Modify the copied process data: the pid, the process list linkage, and so on all have to be changed; see the body of copy_process.
- Seen from user-space code, fork() returns twice, once in the parent and once in the child. The parent returning from the system call is easy to understand; but when the child returns from the system call, where in the system-call handling path does it start executing? This comes down to keeping the child's kernel-stack contents consistent with the sp and ip recorded in the thread field of its task_struct. Where is that set up? In copy_thread, called from copy_process:
*childregs = *current_pt_regs(); // copy the (pt_regs part of the) kernel stack
childregs->ax = 0; // this is exactly why fork returns 0 in the child!
p->thread.sp = (unsigned long) childregs; // top of the kernel stack when the child is scheduled
p->thread.ip = (unsigned long) ret_from_fork; // address of the first instruction executed when the child is scheduled
Overview of process creation, and the user-space code that forks a process
A new process is created by copying the current process
Creating a process means copying the current process's information to produce a new one, that is, forking a process. The newly created child is identical to its parent in most respects, but some things must differ, such as the pid, the list linkage, the kernel stack, and the thread field that records ip and sp.
Imagine what has to be done while creating a new process
When a parent creates a child, somewhere the parent's PCB (task_struct) is copied, and in many places that copy is then modified, because the child needs its own independent information; somewhere a new kernel stack is also allocated. Since the child returns to user space from fork, part of its kernel stack must also be copied from the parent, otherwise the child could not return from the kernel. Finally, based on the copied kernel-stack contents, the child's eip and esp must be set to matching values; if they were wrong, then when the stack is popped and iret executes, the register state and the stack would be inconsistent.
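To tie these steps together before diving into the real kernel code below, here is a heavily simplified user-space sketch of the same flow. Everything in it (toy_task, toy_regs, toy_fork, the placeholder ip value) is invented for illustration; the real work is done by copy_process, dup_task_struct, and copy_thread shown in the following sections:

#include <stdio.h>
#include <stdlib.h>

#define STACK_SIZE 8192                            /* stands in for THREAD_SIZE (8 KB) */

struct toy_regs   { unsigned long ax, ip, sp; };   /* stands in for pt_regs            */
struct toy_thread { unsigned long sp, ip; };       /* like thread_struct.sp / .ip      */
struct toy_task {
    int pid;
    void *stack;                                   /* base of the thread_info + stack  */
    struct toy_thread thread;
};

/* Roughly what dup_task_struct + copy_thread achieve, in toy form. */
static struct toy_task *toy_fork(struct toy_task *parent,
                                 struct toy_regs *parent_regs, int new_pid)
{
    struct toy_task *child = malloc(sizeof(*child));
    *child = *parent;                              /* copy the whole PCB               */
    child->pid = new_pid;                          /* then fix what must differ        */
    child->stack = malloc(STACK_SIZE);             /* new kernel stack                 */

    /* Only the register frame at the top of the stack is copied over. */
    struct toy_regs *childregs =
        (struct toy_regs *)((char *)child->stack + STACK_SIZE) - 1;
    *childregs = *parent_regs;
    childregs->ax = 0;                             /* fork returns 0 in the child      */

    child->thread.sp = (unsigned long)childregs;   /* stack top when first scheduled   */
    child->thread.ip = 0xdeadbeef;                 /* placeholder for ret_from_fork    */
    return child;
}

int main(void)
{
    struct toy_task parent = { .pid = 100, .stack = malloc(STACK_SIZE) };
    struct toy_regs regs = { .ax = 120 /* e.g. __NR_clone */, .ip = 0x8048000, .sp = 0xbffff000 };
    struct toy_task *child = toy_fork(&parent, &regs, 101);
    printf("child pid=%d, child ax=%lu\n", child->pid,
           ((struct toy_regs *)child->thread.sp)->ax);
    free(child->stack); free(child); free(parent.stack);
    return 0;
}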
The kernel handlers for these system calls: sys_fork, sys_clone, sys_vfork
linux-3.18.6/kernel/fork.c
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
return do_fork(SIGCHLD, 0, 0, NULL, NULL);
#else
/* can not support in nommu mode */
return -EINVAL;
#endif
}
#endif
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
0, NULL, NULL);
}
#endif
Several clone variants with different argument orders:
#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int, tls_val,
int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
int __user *, parent_tidptr,
int __user *, child_tidptr,
int, tls_val)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
int, stack_size,
int __user *, parent_tidptr,
int __user *, child_tidptr,
int, tls_val)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int __user *, child_tidptr,
int, tls_val)
#endif
{
return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
}
#endif
As you can see, no matter which of these three system calls user space invokes, they all end up calling do_fork.
do_fork:
/*
* Ok, this is the main fork-routine. // the main handling routine of fork
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
{
struct task_struct *p;
int trace = 0;
long nr;
/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace); // the main code that creates the new process
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
*/
if (!IS_ERR(p)) {
struct completion vfork;
struct pid *pid;
trace_sched_process_fork(current, p);
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
get_task_struct(p);
}
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
ptrace_event_pid(trace, pid);
if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
put_pid(pid);
} else {
nr = PTR_ERR(p);
}
return nr;
}
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
int trace)
{
......
error handling
......
p = dup_task_struct(current); // duplicate the task_struct; p now points to the child's process descriptor
......
initialization and modification of the child process
......
/* copy all the process information */
shm_init_task(p);
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
retval = copy_files(clone_flags, p); // copy/initialize open-file information
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p); // copy/initialize filesystem information
if (retval)
goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p);
if (retval)
goto bad_fork_cleanup_fs;
retval = copy_signal(clone_flags, p);
if (retval)
goto bad_fork_cleanup_sighand;
retval = copy_mm(clone_flags, p); // copy/initialize the memory descriptor
if (retval)
goto bad_fork_cleanup_signal;
retval = copy_namespaces(clone_flags, p);
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p); // copy/initialize the I/O context
if (retval)
goto bad_fork_cleanup_namespaces;
retval = copy_thread(clone_flags, stack_start, stack_size, p); // copy the kernel-stack data and set the new process's first instruction address
if (retval)
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
retval = -ENOMEM;
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
if (!pid)
goto bad_fork_cleanup_io;
}
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
......
}
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
struct task_struct *tsk;
struct thread_info *ti;
int node = tsk_fork_get_node(orig);
int err;
tsk = alloc_task_struct_node(node); // allocate a new task_struct
if (!tsk)
return NULL;
ti = alloc_thread_info_node(tsk, node); // effectively allocates the kernel-stack space
if (!ti)
goto free_tsk;
err = arch_dup_task_struct(tsk, orig); // performs the actual copy
if (err)
goto free_ti;
tsk->stack = ti;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
* the sighand lock in case orig has changed between now and
* then. Until then, filter must be NULL to avoid messing up
* the usage counts on the error path calling free_task.
*/
tsk->seccomp.filter = NULL;
#endif
setup_thread_stack(tsk, orig); // copy the thread_info
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
tsk->stack_canary = get_random_int();
#endif
/*
* One for us, one for whoever does the "release_task()" (usually
* parent)
*/
atomic_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
account_kernel_stack(ti, 1);
return tsk;
free_ti:
free_thread_info(ti);
free_tsk:
free_task_struct(tsk);
return NULL;
}
int __weak arch_dup_task_struct(struct task_struct *dst,
struct task_struct *src)
{
*dst = *src;
return 0;
}
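Note that the whole duplication in arch_dup_task_struct is the single struct assignment *dst = *src, which copies every member of the task_struct by value. A trivial stand-alone illustration of the same C behavior, using a made-up small_pcb type:

#include <stdio.h>

struct small_pcb {
    int  pid;
    long state;
    char comm[16];
};

int main(void)
{
    struct small_pcb parent = { .pid = 100, .state = 0, .comm = "menuos" };
    struct small_pcb child  = parent;     /* one assignment copies all fields      */

    child.pid = 101;                      /* afterwards the copies are independent */
    printf("parent: pid=%d comm=%s\n", parent.pid, parent.comm);
    printf("child : pid=%d comm=%s\n", child.pid, child.comm);
    return 0;
}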
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
* kmemcache based allocator.
*/
# if THREAD_SIZE >= PAGE_SIZE
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
int node)
{
// allocates two pages: thread_info sits at the low end, and the kernel stack grows from the high address down toward the low address
struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
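Since the thread_info and the kernel stack share one THREAD_SIZE-aligned block (thread_info at the low addresses, the stack growing down from the high end), the kernel can recover the current thread_info simply by masking the stack pointer with ~(THREAD_SIZE - 1); this is essentially what GET_THREAD_INFO(%ebp) does in the assembly further below. Here is a small user-space sketch of that address arithmetic, with a made-up toy_thread_info type:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define THREAD_SIZE 8192                            /* two 4 KB pages */

struct toy_thread_info { int cpu; void *task; };    /* toy stand-in for thread_info */

int main(void)
{
    /* One THREAD_SIZE-aligned block: thread_info at the bottom,
     * the kernel stack grows downward from the top. */
    void *block = aligned_alloc(THREAD_SIZE, THREAD_SIZE);
    if (!block)
        return 1;
    struct toy_thread_info *ti = block;
    ti->cpu = 0;

    /* Pretend esp currently points somewhere inside the stack region. */
    uintptr_t esp = (uintptr_t)block + THREAD_SIZE - 256;

    /* "current_thread_info()": mask off the low bits of the stack pointer. */
    struct toy_thread_info *found =
        (struct toy_thread_info *)(esp & ~((uintptr_t)THREAD_SIZE - 1));

    printf("thread_info at %p, recovered from esp: %p\n", (void *)ti, (void *)found);
    free(block);
    return 0;
}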
Where does the newly created process start executing?
int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p)
{
struct pt_regs *childregs = task_pt_regs(p);//pt_regs
struct task_struct *tsk;
int err;
p->thread.sp = (unsigned long) childregs;
p->thread.sp0 = (unsigned long) (childregs+1);
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
if (unlikely(p->flags & PF_KTHREAD)) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
p->thread.ip = (unsigned long) ret_from_kernel_thread;
task_user_gs(p) = __KERNEL_STACK_CANARY;
childregs->ds = __USER_DS;
childregs->es = __USER_DS;
childregs->fs = __KERNEL_PERCPU;
childregs->bx = sp; /* function */
childregs->bp = arg;
childregs->orig_ax = -1;
childregs->cs = __KERNEL_CS | get_kernel_rpl();
childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
p->thread.io_bitmap_ptr = NULL;
return 0;
}
*childregs = *current_pt_regs();
childregs->ax = 0;
if (sp)
childregs->sp = sp;
p->thread.ip = (unsigned long) ret_from_fork;
task_user_gs(p) = get_user_gs(current_pt_regs());
p->thread.io_bitmap_ptr = NULL;
tsk = current;
err = -ENOMEM;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
return -ENOMEM;
}
set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
err = 0;
/*
* Set a new TLS for the child thread?
*/
if (clone_flags & CLONE_SETTLS)
err = do_set_thread_area(p, -1,
(struct user_desc __user *)childregs->si, 0);
if (err && p->thread.io_bitmap_ptr) {
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
return err;
}
What the int instruction and SAVE_ALL push onto the kernel stack
Copying the kernel stack copies only the following:
struct pt_regs {
unsigned long bx;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
unsigned long bp;
unsigned long ax; // carries the system call number
unsigned long ds;
unsigned long es;
unsigned long fs;
unsigned long gs;
unsigned long orig_ax;
unsigned long ip;
unsigned long cs;
unsigned long flags;
unsigned long sp;
unsigned long ss;
};
ENTRY(ret_from_fork)
CFI_STARTPROC
pushl_cfi %eax
call schedule_tail
GET_THREAD_INFO(%ebp)
popl_cfi %eax
pushl_cfi $0x0202 # Reset kernel eflags
popfl_cfi
jmp syscall_exit # jumps to syscall_exit inside system_call and continues from there
CFI_ENDPROC
END(ret_from_fork)
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
ASM_CLAC
pushl_cfi %eax # save orig_eax
SAVE_ALL
GET_THREAD_INFO(%ebp)
# system call tracing in operation / emulation
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(NR_syscalls), %eax
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
syscall_after_call:
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $_TIF_ALLWORK_MASK, %ecx # current->work
jne syscall_exit_work
restore_all:
TRACE_IRQS_IRET
restore_all_notrace:
Refer to "关于kernel_thread的补充说明" (supplementary notes on kernel_thread), which analyzes this very thoroughly.
Tracing the creation of a new process with gdb
Add a fork command to MenuOS by overwriting test.c with test_fork.c, which contains the fork code.
After recompiling, the command list shows the extra fork command. Running fork creates a child process, and both the parent and the child print their messages.
Tracing with gdb:
Set a few breakpoints:
Continue execution; it stops at do_fork.