LCOV - code coverage report
Current view: top level - kernel - fork.c
Test: coverage.info            Lines:      547 / 630    86.8 %
Date: 2015-04-12 14:34:49      Functions:   44 / 50     88.0 %

          Line data    Source code
       1             : /*
       2             :  *  linux/kernel/fork.c
       3             :  *
       4             :  *  Copyright (C) 1991, 1992  Linus Torvalds
       5             :  */
       6             : 
       7             : /*
       8             :  *  'fork.c' contains the help-routines for the 'fork' system call
       9             :  * (see also entry.S and others).
      10             :  * Fork is rather simple, once you get the hang of it, but the memory
      11             :  * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
      12             :  */
      13             : 
      14             : #include <linux/slab.h>
      15             : #include <linux/init.h>
      16             : #include <linux/unistd.h>
      17             : #include <linux/module.h>
      18             : #include <linux/vmalloc.h>
      19             : #include <linux/completion.h>
      20             : #include <linux/personality.h>
      21             : #include <linux/mempolicy.h>
      22             : #include <linux/sem.h>
      23             : #include <linux/file.h>
      24             : #include <linux/fdtable.h>
      25             : #include <linux/iocontext.h>
      26             : #include <linux/key.h>
      27             : #include <linux/binfmts.h>
      28             : #include <linux/mman.h>
      29             : #include <linux/mmu_notifier.h>
      30             : #include <linux/fs.h>
      31             : #include <linux/mm.h>
      32             : #include <linux/vmacache.h>
      33             : #include <linux/nsproxy.h>
      34             : #include <linux/capability.h>
      35             : #include <linux/cpu.h>
      36             : #include <linux/cgroup.h>
      37             : #include <linux/security.h>
      38             : #include <linux/hugetlb.h>
      39             : #include <linux/seccomp.h>
      40             : #include <linux/swap.h>
      41             : #include <linux/syscalls.h>
      42             : #include <linux/jiffies.h>
      43             : #include <linux/futex.h>
      44             : #include <linux/compat.h>
      45             : #include <linux/kthread.h>
      46             : #include <linux/task_io_accounting_ops.h>
      47             : #include <linux/rcupdate.h>
      48             : #include <linux/ptrace.h>
      49             : #include <linux/mount.h>
      50             : #include <linux/audit.h>
      51             : #include <linux/memcontrol.h>
      52             : #include <linux/ftrace.h>
      53             : #include <linux/proc_fs.h>
      54             : #include <linux/profile.h>
      55             : #include <linux/rmap.h>
      56             : #include <linux/ksm.h>
      57             : #include <linux/acct.h>
      58             : #include <linux/tsacct_kern.h>
      59             : #include <linux/cn_proc.h>
      60             : #include <linux/freezer.h>
      61             : #include <linux/delayacct.h>
      62             : #include <linux/taskstats_kern.h>
      63             : #include <linux/random.h>
      64             : #include <linux/tty.h>
      65             : #include <linux/blkdev.h>
      66             : #include <linux/fs_struct.h>
      67             : #include <linux/magic.h>
      68             : #include <linux/perf_event.h>
      69             : #include <linux/posix-timers.h>
      70             : #include <linux/user-return-notifier.h>
      71             : #include <linux/oom.h>
      72             : #include <linux/khugepaged.h>
      73             : #include <linux/signalfd.h>
      74             : #include <linux/uprobes.h>
      75             : #include <linux/aio.h>
      76             : #include <linux/compiler.h>
      77             : 
      78             : #include <asm/pgtable.h>
      79             : #include <asm/pgalloc.h>
      80             : #include <asm/uaccess.h>
      81             : #include <asm/mmu_context.h>
      82             : #include <asm/cacheflush.h>
      83             : #include <asm/tlbflush.h>
      84             : 
      85             : #include <trace/events/sched.h>
      86             : 
      87             : #define CREATE_TRACE_POINTS
      88             : #include <trace/events/task.h>
      89             : 
      90             : /*
      91             :  * Protected counters by write_lock_irq(&tasklist_lock)
      92             :  */
      93             : unsigned long total_forks;      /* Handle normal Linux uptimes. */
      94             : int nr_threads;                 /* The idle threads do not count.. */
      95             : 
      96             : int max_threads;                /* tunable limit on nr_threads */
      97             : 
      98             : DEFINE_PER_CPU(unsigned long, process_counts) = 0;
      99             : 
     100             : __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
     101             : 
     102             : #ifdef CONFIG_PROVE_RCU
     103             : int lockdep_tasklist_lock_is_held(void)
     104             : {
     105             :         return lockdep_is_held(&tasklist_lock);
     106             : }
     107             : EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
     108             : #endif /* #ifdef CONFIG_PROVE_RCU */
     109             : 
     110          15 : int nr_processes(void)
     111             : {
     112             :         int cpu;
     113             :         int total = 0;
     114             : 
     115          30 :         for_each_possible_cpu(cpu)
     116          15 :                 total += per_cpu(process_counts, cpu);
     117             : 
     118          15 :         return total;
     119             : }
     120             : 
     121        2915 : void __weak arch_release_task_struct(struct task_struct *tsk)
     122             : {
     123        2915 : }
     124             : 
     125             : #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
     126             : static struct kmem_cache *task_struct_cachep;
     127             : 
     128             : static inline struct task_struct *alloc_task_struct_node(int node)
     129             : {
     130        2993 :         return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
     131             : }
     132             : 
     133             : static inline void free_task_struct(struct task_struct *tsk)
     134             : {
     135        2915 :         kmem_cache_free(task_struct_cachep, tsk);
     136             : }
     137             : #endif
     138             : 
     139        2915 : void __weak arch_release_thread_info(struct thread_info *ti)
     140             : {
     141        2915 : }
     142             : 
     143             : #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
     144             : 
     145             : /*
     146             :  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
     147             :  * kmemcache based allocator.
     148             :  */
     149             : # if THREAD_SIZE >= PAGE_SIZE
     150        2993 : static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
     151             :                                                   int node)
     152             : {
     153        2993 :         struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
     154             :                                                   THREAD_SIZE_ORDER);
     155             : 
     156        5986 :         return page ? page_address(page) : NULL;
     157             : }
     158             : 
     159             : static inline void free_thread_info(struct thread_info *ti)
     160             : {
     161        2915 :         free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
     162             : }
     163             : # else
     164             : static struct kmem_cache *thread_info_cache;
     165             : 
     166             : static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
     167             :                                                   int node)
     168             : {
     169             :         return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
     170             : }
     171             : 
     172             : static void free_thread_info(struct thread_info *ti)
     173             : {
     174             :         kmem_cache_free(thread_info_cache, ti);
     175             : }
     176             : 
     177             : void thread_info_cache_init(void)
     178             : {
     179             :         thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
     180             :                                               THREAD_SIZE, 0, NULL);
     181             :         BUG_ON(thread_info_cache == NULL);
     182             : }
     183             : # endif
     184             : #endif
     185             : 
     186             : /* SLAB cache for signal_struct structures (tsk->signal) */
     187             : static struct kmem_cache *signal_cachep;
     188             : 
     189             : /* SLAB cache for sighand_struct structures (tsk->sighand) */
     190             : struct kmem_cache *sighand_cachep;
     191             : 
     192             : /* SLAB cache for files_struct structures (tsk->files) */
     193             : struct kmem_cache *files_cachep;
     194             : 
     195             : /* SLAB cache for fs_struct structures (tsk->fs) */
     196             : struct kmem_cache *fs_cachep;
     197             : 
     198             : /* SLAB cache for vm_area_struct structures */
     199             : struct kmem_cache *vm_area_cachep;
     200             : 
     201             : /* SLAB cache for mm_struct structures (tsk->mm) */
     202             : static struct kmem_cache *mm_cachep;
     203             : 
     204        5908 : static void account_kernel_stack(struct thread_info *ti, int account)
     205             : {
     206       11816 :         struct zone *zone = page_zone(virt_to_page(ti));
     207             : 
     208             :         mod_zone_page_state(zone, NR_KERNEL_STACK, account);
     209        5908 : }
     210             : 
     211        2915 : void free_task(struct task_struct *tsk)
     212             : {
     213        2915 :         account_kernel_stack(tsk->stack, -1);
     214        2915 :         arch_release_thread_info(tsk->stack);
     215        2915 :         free_thread_info(tsk->stack);
     216             :         rt_mutex_debug_task_free(tsk);
     217             :         ftrace_graph_exit_task(tsk);
     218        2915 :         put_seccomp_filter(tsk);
     219        2915 :         arch_release_task_struct(tsk);
     220             :         free_task_struct(tsk);
     221        2915 : }
     222             : EXPORT_SYMBOL(free_task);
     223             : 
     224        2915 : static inline void free_signal_struct(struct signal_struct *sig)
     225             : {
     226             :         taskstats_tgid_free(sig);
     227        2915 :         sched_autogroup_exit(sig);
     228        2915 :         kmem_cache_free(signal_cachep, sig);
     229             : }
     230             : 
     231             : static inline void put_signal_struct(struct signal_struct *sig)
     232             : {
     233        5828 :         if (atomic_dec_and_test(&sig->sigcnt))
     234             :                 free_signal_struct(sig);
     235             : }
     236             : 
     237        2914 : void __put_task_struct(struct task_struct *tsk)
     238             : {
     239             :         WARN_ON(!tsk->exit_state);
     240        2914 :         WARN_ON(atomic_read(&tsk->usage));
     241             :         WARN_ON(tsk == current);
     242             : 
     243             :         task_numa_free(tsk);
     244             :         security_task_free(tsk);
     245        2914 :         exit_creds(tsk);
     246             :         delayacct_tsk_free(tsk);
     247        2914 :         put_signal_struct(tsk->signal);
     248             : 
     249             :         if (!profile_handoff_task(tsk))
     250        2914 :                 free_task(tsk);
     251        2914 : }
     252             : EXPORT_SYMBOL_GPL(__put_task_struct);
     253             : 
     254           1 : void __init __weak arch_task_cache_init(void) { }
     255             : 
     256           1 : void __init fork_init(unsigned long mempages)
     257             : {
     258             : #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
     259             : #ifndef ARCH_MIN_TASKALIGN
     260             : #define ARCH_MIN_TASKALIGN      L1_CACHE_BYTES
     261             : #endif
     262             :         /* create a slab on which task_structs can be allocated */
     263           1 :         task_struct_cachep =
     264           1 :                 kmem_cache_create("task_struct", sizeof(struct task_struct),
     265             :                         ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
     266             : #endif
     267             : 
     268             :         /* do the arch specific task caches init */
     269           1 :         arch_task_cache_init();
     270             : 
     271             :         /*
     272             :          * The default maximum number of threads is set to a safe
     273             :          * value: the thread structures can take up at most half
     274             :          * of memory.
     275             :          */
     276           1 :         max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
     277             : 
     278             :         /*
     279             :          * we need to allow at least 20 threads to boot a system
     280             :          */
     281           1 :         if (max_threads < 20)
     282           0 :                 max_threads = 20;
     283             : 
     284           1 :         init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
     285           1 :         init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
     286           1 :         init_task.signal->rlim[RLIMIT_SIGPENDING] =
     287           1 :                 init_task.signal->rlim[RLIMIT_NPROC];
     288           1 : }
     289             : 
     290        2993 : int __weak arch_dup_task_struct(struct task_struct *dst,
     291             :                                                struct task_struct *src)
     292             : {
     293        2993 :         *dst = *src;
     294        2993 :         return 0;
     295             : }
     296             : 
     297           1 : void set_task_stack_end_magic(struct task_struct *tsk)
     298             : {
     299             :         unsigned long *stackend;
     300             : 
     301             :         stackend = end_of_stack(tsk);
     302        2994 :         *stackend = STACK_END_MAGIC;    /* for overflow detection */
     303           1 : }
     304             : 
     305        2993 : static struct task_struct *dup_task_struct(struct task_struct *orig)
     306             : {
     307             :         struct task_struct *tsk;
     308             :         struct thread_info *ti;
     309        2993 :         int node = tsk_fork_get_node(orig);
     310             :         int err;
     311             : 
     312             :         tsk = alloc_task_struct_node(node);
     313        2993 :         if (!tsk)
     314             :                 return NULL;
     315             : 
     316        2993 :         ti = alloc_thread_info_node(tsk, node);
     317        2993 :         if (!ti)
     318             :                 goto free_tsk;
     319             : 
     320        2993 :         err = arch_dup_task_struct(tsk, orig);
     321        2993 :         if (err)
     322             :                 goto free_ti;
     323             : 
     324        2993 :         tsk->stack = ti;
     325             : #ifdef CONFIG_SECCOMP
     326             :         /*
     327             :          * We must handle setting up seccomp filters once we're under
     328             :          * the sighand lock in case orig has changed between now and
     329             :          * then. Until then, filter must be NULL to avoid messing up
     330             :          * the usage counts on the error path calling free_task.
     331             :          */
     332        2993 :         tsk->seccomp.filter = NULL;
     333             : #endif
     334             : 
     335             :         setup_thread_stack(tsk, orig);
     336             :         clear_user_return_notifier(tsk);
     337             :         clear_tsk_need_resched(tsk);
     338             :         set_task_stack_end_magic(tsk);
     339             : 
     340             : #ifdef CONFIG_CC_STACKPROTECTOR
     341             :         tsk->stack_canary = get_random_int();
     342             : #endif
     343             : 
     344             :         /*
     345             :          * One for us, one for whoever does the "release_task()" (usually
     346             :          * parent)
     347             :          */
     348        2993 :         atomic_set(&tsk->usage, 2);
     349             : #ifdef CONFIG_BLK_DEV_IO_TRACE
     350             :         tsk->btrace_seq = 0;
     351             : #endif
     352        2993 :         tsk->splice_pipe = NULL;
     353        2993 :         tsk->task_frag.page = NULL;
     354             : 
     355        2993 :         account_kernel_stack(ti, 1);
     356             : 
     357        2993 :         return tsk;
     358             : 
     359             : free_ti:
     360             :         free_thread_info(ti);
     361             : free_tsk:
     362             :         free_task_struct(tsk);
     363           0 :         return NULL;
     364             : }
     365             : 
     366             : #ifdef CONFIG_MMU
     367        2895 : static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
     368             : {
     369       45204 :         struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
     370             :         struct rb_node **rb_link, *rb_parent;
     371             :         int retval;
     372             :         unsigned long charge;
     373             : 
     374             :         uprobe_start_dup_mmap();
     375        2895 :         down_write(&oldmm->mmap_sem);
     376        2895 :         flush_cache_dup_mm(oldmm);
     377             :         uprobe_dup_mmap(oldmm, mm);
     378             :         /*
     379             :          * Not linked in yet - no deadlock potential:
     380             :          */
     381        2895 :         down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
     382             : 
     383        2895 :         mm->total_vm = oldmm->total_vm;
     384        2895 :         mm->shared_vm = oldmm->shared_vm;
     385        2895 :         mm->exec_vm = oldmm->exec_vm;
     386        2895 :         mm->stack_vm = oldmm->stack_vm;
     387             : 
     388        2895 :         rb_link = &mm->mm_rb.rb_node;
     389             :         rb_parent = NULL;
     390        2895 :         pprev = &mm->mmap;
     391             :         retval = ksm_fork(mm, oldmm);
     392             :         if (retval)
     393             :                 goto out;
     394             :         retval = khugepaged_fork(mm, oldmm);
     395             :         if (retval)
     396             :                 goto out;
     397             : 
     398             :         prev = NULL;
     399       81610 :         for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
     400       60132 :                 struct file *file;
     401             : 
     402       78715 :                 if (mpnt->vm_flags & VM_DONTCOPY) {
     403           0 :                         vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
     404           0 :                                                         -vma_pages(mpnt));
     405           0 :                         continue;
     406             :                 }
     407             :                 charge = 0;
     408       78715 :                 if (mpnt->vm_flags & VM_ACCOUNT) {
     409             :                         unsigned long len = vma_pages(mpnt);
     410             : 
     411       90408 :                         if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
     412             :                                 goto fail_nomem;
     413             :                         charge = len;
     414             :                 }
     415       78715 :                 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
     416       78715 :                 if (!tmp)
     417             :                         goto fail_nomem;
     418       78715 :                 *tmp = *mpnt;
     419       78715 :                 INIT_LIST_HEAD(&tmp->anon_vma_chain);
     420             :                 retval = vma_dup_policy(mpnt, tmp);
     421             :                 if (retval)
     422             :                         goto fail_nomem_policy;
     423       78715 :                 tmp->vm_mm = mm;
     424       78715 :                 if (anon_vma_fork(tmp, mpnt))
     425             :                         goto fail_nomem_anon_vma_fork;
     426       78715 :                 tmp->vm_flags &= ~VM_LOCKED;
     427       78715 :                 tmp->vm_next = tmp->vm_prev = NULL;
     428       78715 :                 file = tmp->vm_file;
     429       78715 :                 if (file) {
     430             :                         struct inode *inode = file_inode(file);
     431       60132 :                         struct address_space *mapping = file->f_mapping;
     432             : 
     433             :                         get_file(file);
     434       60132 :                         if (tmp->vm_flags & VM_DENYWRITE)
     435       15156 :                                 atomic_dec(&inode->i_writecount);
     436             :                         i_mmap_lock_write(mapping);
     437       60132 :                         if (tmp->vm_flags & VM_SHARED)
     438         141 :                                 atomic_inc(&mapping->i_mmap_writable);
     439             :                         flush_dcache_mmap_lock(mapping);
     440             :                         /* insert tmp into the share list, just after mpnt */
     441       60132 :                         if (unlikely(tmp->vm_flags & VM_NONLINEAR))
     442           0 :                                 vma_nonlinear_insert(tmp,
     443             :                                                 &mapping->i_mmap_nonlinear);
     444             :                         else
     445       60132 :                                 vma_interval_tree_insert_after(tmp, mpnt,
     446             :                                                         &mapping->i_mmap);
     447             :                         flush_dcache_mmap_unlock(mapping);
     448             :                         i_mmap_unlock_write(mapping);
     449             :                 }
     450             : 
     451             :                 /*
     452             :                  * Clear hugetlb-related page reserves for children. This only
     453             :                  * affects MAP_PRIVATE mappings. Faults generated by the child
     454             :                  * are not guaranteed to succeed, even if read-only
     455             :                  */
     456             :                 if (is_vm_hugetlb_page(tmp))
     457             :                         reset_vma_resv_huge_pages(tmp);
     458             : 
     459             :                 /*
     460             :                  * Link in the new vma and copy the page table entries.
     461             :                  */
     462       78715 :                 *pprev = tmp;
     463       78715 :                 pprev = &tmp->vm_next;
     464       78715 :                 tmp->vm_prev = prev;
     465             :                 prev = tmp;
     466             : 
     467       78715 :                 __vma_link_rb(mm, tmp, rb_link, rb_parent);
     468       78715 :                 rb_link = &tmp->vm_rb.rb_right;
     469       78715 :                 rb_parent = &tmp->vm_rb;
     470             : 
     471       78715 :                 mm->map_count++;
     472       78715 :                 retval = copy_page_range(mm, oldmm, mpnt);
     473             : 
     474       78715 :                 if (tmp->vm_ops && tmp->vm_ops->open)
     475           0 :                         tmp->vm_ops->open(tmp);
     476             : 
     477       78715 :                 if (retval)
     478             :                         goto out;
     479             :         }
     480             :         /* a new mm has just been created */
     481             :         arch_dup_mmap(oldmm, mm);
     482             :         retval = 0;
     483             : out:
     484        2895 :         up_write(&mm->mmap_sem);
     485             :         flush_tlb_mm(oldmm);
     486        2895 :         up_write(&oldmm->mmap_sem);
     487             :         uprobe_end_dup_mmap();
     488        2895 :         return retval;
     489             : fail_nomem_anon_vma_fork:
     490             :         mpol_put(vma_policy(tmp));
     491             : fail_nomem_policy:
     492           0 :         kmem_cache_free(vm_area_cachep, tmp);
     493             : fail_nomem:
     494             :         retval = -ENOMEM;
     495           0 :         vm_unacct_memory(charge);
     496             :         goto out;
     497             : }
     498             : 
     499             : static inline int mm_alloc_pgd(struct mm_struct *mm)
     500             : {
     501        5044 :         mm->pgd = pgd_alloc(mm);
     502        5044 :         if (unlikely(!mm->pgd))
     503             :                 return -ENOMEM;
     504             :         return 0;
     505             : }
     506             : 
     507             : static inline void mm_free_pgd(struct mm_struct *mm)
     508             : {
     509        5008 :         pgd_free(mm, mm->pgd);
     510             : }
     511             : #else
     512             : #define dup_mmap(mm, oldmm)     (0)
     513             : #define mm_alloc_pgd(mm)        (0)
     514             : #define mm_free_pgd(mm)
     515             : #endif /* CONFIG_MMU */
     516             : 
     517             : __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
     518             : 
     519             : #define allocate_mm()   (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
     520             : #define free_mm(mm)     (kmem_cache_free(mm_cachep, (mm)))
     521             : 
     522             : static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
     523             : 
     524           0 : static int __init coredump_filter_setup(char *s)
     525             : {
     526           0 :         default_dump_filter =
     527           0 :                 (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
     528             :                 MMF_DUMP_FILTER_MASK;
     529           0 :         return 1;
     530             : }
     531             : 
     532             : __setup("coredump_filter=", coredump_filter_setup);
     533             : 
     534             : #include <linux/init_task.h>
     535             : 
     536             : static void mm_init_aio(struct mm_struct *mm)
     537             : {
     538             : #ifdef CONFIG_AIO
     539             :         spin_lock_init(&mm->ioctx_lock);
     540        5044 :         mm->ioctx_table = NULL;
     541             : #endif
     542             : }
     543             : 
     544             : static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
     545             : {
     546             : #ifdef CONFIG_MEMCG
     547        5044 :         mm->owner = p;
     548             : #endif
     549             : }
     550             : 
     551        5044 : static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
     552             : {
     553        5044 :         mm->mmap = NULL;
     554        5044 :         mm->mm_rb = RB_ROOT;
     555        5044 :         mm->vmacache_seqnum = 0;
     556        5044 :         atomic_set(&mm->mm_users, 1);
     557        5044 :         atomic_set(&mm->mm_count, 1);
     558        5044 :         init_rwsem(&mm->mmap_sem);
     559        5044 :         INIT_LIST_HEAD(&mm->mmlist);
     560        5044 :         mm->core_state = NULL;
     561             :         atomic_long_set(&mm->nr_ptes, 0);
     562        5044 :         mm->map_count = 0;
     563        5044 :         mm->locked_vm = 0;
     564        5044 :         mm->pinned_vm = 0;
     565        5044 :         memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
     566             :         spin_lock_init(&mm->page_table_lock);
     567             :         mm_init_cpumask(mm);
     568             :         mm_init_aio(mm);
     569             :         mm_init_owner(mm, p);
     570             :         mmu_notifier_mm_init(mm);
     571             :         clear_tlb_flush_pending(mm);
     572             : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
     573             :         mm->pmd_huge_pte = NULL;
     574             : #endif
     575             : 
     576        5044 :         if (current->mm) {
     577        5021 :                 mm->flags = current->mm->flags & MMF_INIT_MASK;
     578        5021 :                 mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
     579             :         } else {
     580          23 :                 mm->flags = default_dump_filter;
     581          23 :                 mm->def_flags = 0;
     582             :         }
     583             : 
     584        5044 :         if (mm_alloc_pgd(mm))
     585             :                 goto fail_nopgd;
     586             : 
     587        5044 :         if (init_new_context(p, mm))
     588             :                 goto fail_nocontext;
     589             : 
     590        5044 :         return mm;
     591             : 
     592             : fail_nocontext:
     593             :         mm_free_pgd(mm);
     594             : fail_nopgd:
     595           0 :         free_mm(mm);
     596           0 :         return NULL;
     597             : }
     598             : 
     599        5008 : static void check_mm(struct mm_struct *mm)
     600             : {
     601             :         int i;
     602             : 
     603       20032 :         for (i = 0; i < NR_MM_COUNTERS; i++) {
     604       15024 :                 long x = atomic_long_read(&mm->rss_stat.count[i]);
     605             : 
     606       15024 :                 if (unlikely(x))
     607           0 :                         printk(KERN_ALERT "BUG: Bad rss-counter state "
     608             :                                           "mm:%p idx:%d val:%ld\n", mm, i, x);
     609             :         }
     610             : #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
     611             :         VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
     612             : #endif
     613        5008 : }
     614             : 
     615             : /*
     616             :  * Allocate and initialize an mm_struct.
     617             :  */
     618        2149 : struct mm_struct *mm_alloc(void)
     619             : {
     620             :         struct mm_struct *mm;
     621             : 
     622        2149 :         mm = allocate_mm();
     623        2149 :         if (!mm)
     624             :                 return NULL;
     625             : 
     626        2149 :         memset(mm, 0, sizeof(*mm));
     627        2149 :         return mm_init(mm, current);
     628             : }
     629             : 
     630             : /*
     631             :  * Called when the last reference to the mm
     632             :  * is dropped: either by a lazy thread or by
     633             :  * mmput. Free the page directory and the mm.
     634             :  */
     635        5008 : void __mmdrop(struct mm_struct *mm)
     636             : {
     637             :         BUG_ON(mm == &init_mm);
     638             :         mm_free_pgd(mm);
     639             :         destroy_context(mm);
     640             :         mmu_notifier_mm_destroy(mm);
     641        5008 :         check_mm(mm);
     642        5008 :         free_mm(mm);
     643        5008 : }
     644             : EXPORT_SYMBOL_GPL(__mmdrop);
     645             : 
     646             : /*
     647             :  * Decrement the use count and release all resources for an mm.
     648             :  */
     649        6093 : void mmput(struct mm_struct *mm)
     650             : {
     651             :         might_sleep();
     652             : 
     653       12186 :         if (atomic_dec_and_test(&mm->mm_users)) {
     654             :                 uprobe_clear_state(mm);
     655        5008 :                 exit_aio(mm);
     656             :                 ksm_exit(mm);
     657             :                 khugepaged_exit(mm); /* must run before exit_mmap */
     658        5008 :                 exit_mmap(mm);
     659        5008 :                 set_mm_exe_file(mm, NULL);
     660       10016 :                 if (!list_empty(&mm->mmlist)) {
     661             :                         spin_lock(&mmlist_lock);
     662             :                         list_del(&mm->mmlist);
     663             :                         spin_unlock(&mmlist_lock);
     664             :                 }
     665        5008 :                 if (mm->binfmt)
     666        5008 :                         module_put(mm->binfmt->module);
     667             :                 mmdrop(mm);
     668             :         }
     669        6093 : }
     670             : EXPORT_SYMBOL_GPL(mmput);
     671             : 
     672        7157 : void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
     673             : {
     674        7157 :         if (new_exe_file)
     675             :                 get_file(new_exe_file);
     676        7157 :         if (mm->exe_file)
     677        5008 :                 fput(mm->exe_file);
     678        7157 :         mm->exe_file = new_exe_file;
     679        7157 : }
     680             : 
     681        3065 : struct file *get_mm_exe_file(struct mm_struct *mm)
     682             : {
     683             :         struct file *exe_file;
     684             : 
     685             :         /* We need mmap_sem to protect against races with removal of exe_file */
     686        3065 :         down_read(&mm->mmap_sem);
     687        3065 :         exe_file = mm->exe_file;
     688        3065 :         if (exe_file)
     689             :                 get_file(exe_file);
     690        3065 :         up_read(&mm->mmap_sem);
     691        3065 :         return exe_file;
     692             : }
     693             : 
     694             : static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
     695             : {
     696             :         /* It's safe to write the exe_file pointer without exe_file_lock because
     697             :          * this is called during fork when the task is not yet in /proc */
     698        2895 :         newmm->exe_file = get_mm_exe_file(oldmm);
     699             : }
     700             : 
     701             : /**
     702             :  * get_task_mm - acquire a reference to the task's mm
     703             :  *
     704             :  * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
     705             :  * this kernel workthread has transiently adopted a user mm with use_mm,
     706             :  * to do its AIO) is not set and if so returns a reference to it, after
     707             :  * bumping up the use count.  User must release the mm via mmput()
     708             :  * after use.  Typically used by /proc and ptrace.
     709             :  */
     710        2175 : struct mm_struct *get_task_mm(struct task_struct *task)
     711             : {
     712             :         struct mm_struct *mm;
     713             : 
     714             :         task_lock(task);
     715        2175 :         mm = task->mm;
     716        2175 :         if (mm) {
     717        1085 :                 if (task->flags & PF_KTHREAD)
     718             :                         mm = NULL;
     719             :                 else
     720        1085 :                         atomic_inc(&mm->mm_users);
     721             :         }
     722             :         task_unlock(task);
     723        2175 :         return mm;
     724             : }
     725             : EXPORT_SYMBOL_GPL(get_task_mm);
     726             : 
     727           0 : struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
     728             : {
     729             :         struct mm_struct *mm;
     730             :         int err;
     731             : 
     732           0 :         err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
     733           0 :         if (err)
     734           0 :                 return ERR_PTR(err);
     735             : 
     736           0 :         mm = get_task_mm(task);
     737           0 :         if (mm && mm != current->mm &&
     738           0 :                         !ptrace_may_access(task, mode)) {
     739           0 :                 mmput(mm);
     740             :                 mm = ERR_PTR(-EACCES);
     741             :         }
     742           0 :         mutex_unlock(&task->signal->cred_guard_mutex);
     743             : 
     744           0 :         return mm;
     745             : }
     746             : 
     747          11 : static void complete_vfork_done(struct task_struct *tsk)
     748             : {
     749             :         struct completion *vfork;
     750             : 
     751             :         task_lock(tsk);
     752          11 :         vfork = tsk->vfork_done;
     753          11 :         if (likely(vfork)) {
     754          11 :                 tsk->vfork_done = NULL;
     755          11 :                 complete(vfork);
     756             :         }
     757             :         task_unlock(tsk);
     758          11 : }
     759             : 
     760           0 : static int wait_for_vfork_done(struct task_struct *child,
     761             :                                 struct completion *vfork)
     762             : {
     763             :         int killed;
     764             : 
     765             :         freezer_do_not_count();
     766           0 :         killed = wait_for_completion_killable(vfork);
     767             :         freezer_count();
     768             : 
     769           0 :         if (killed) {
     770             :                 task_lock(child);
     771           0 :                 child->vfork_done = NULL;
     772             :                 task_unlock(child);
     773             :         }
     774             : 
     775             :         put_task_struct(child);
     776           0 :         return killed;
     777             : }
     778             : 
     779             : /* Please note the differences between mmput and mm_release.
     780             :  * mmput is called whenever we stop holding onto a mm_struct,
     781             :  * error success whatever.
     782             :  *
     783             :  * mm_release is called after a mm_struct has been removed
     784             :  * from the current process.
     785             :  *
     786             :  * This difference is important for error handling, when we
     787             :  * only half set up a mm_struct for a new process and need to restore
     788             :  * the old one.  Because we mmput the new mm_struct before
     789             :  * restoring the old one. . .
     790             :  * Eric Biederman 10 January 1998
     791             :  */
     792        5063 : void mm_release(struct task_struct *tsk, struct mm_struct *mm)
     793             : {
     794             :         /* Get rid of any futexes when releasing the mm */
     795             : #ifdef CONFIG_FUTEX
     796        5063 :         if (unlikely(tsk->robust_list)) {
     797         236 :                 exit_robust_list(tsk);
     798         236 :                 tsk->robust_list = NULL;
     799             :         }
     800             : #ifdef CONFIG_COMPAT
     801             :         if (unlikely(tsk->compat_robust_list)) {
     802             :                 compat_exit_robust_list(tsk);
     803             :                 tsk->compat_robust_list = NULL;
     804             :         }
     805             : #endif
     806       10126 :         if (unlikely(!list_empty(&tsk->pi_state_list)))
     807           0 :                 exit_pi_state_list(tsk);
     808             : #endif
     809             : 
     810             :         uprobe_free_utask(tsk);
     811             : 
     812             :         /* Get rid of any cached register state */
     813             :         deactivate_mm(tsk, mm);
     814             : 
     815             :         /*
     816             :          * If we're exiting normally, clear a user-space tid field if
     817             :          * requested.  We leave this alone when dying by signal, to leave
     818             :          * the value intact in a core dump, and to save the unnecessary
     819             :          * trouble, say, a killed vfork parent shouldn't touch this mm.
     820             :          * Userland only wants this done for a sys_exit.
     821             :          */
     822        5063 :         if (tsk->clear_child_tid) {
     823        6220 :                 if (!(tsk->flags & PF_SIGNALED) &&
     824        3107 :                     atomic_read(&mm->mm_users) > 1) {
     825             :                         /*
     826             :                          * We don't check the error code - if userspace has
     827             :                          * not set up a proper pointer then tough luck.
     828             :                          */
     829           0 :                         put_user(0, tsk->clear_child_tid);
     830           0 :                         sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
     831             :                                         1, NULL, NULL, 0);
     832             :                 }
     833        3113 :                 tsk->clear_child_tid = NULL;
     834             :         }
     835             : 
     836             :         /*
     837             :          * All done, finally we can wake up parent and return this mm to him.
     838             :          * Also kthread_stop() uses this completion for synchronization.
     839             :          */
     840        5063 :         if (tsk->vfork_done)
     841          11 :                 complete_vfork_done(tsk);
     842        5063 : }
     843             : 
     844             : /*
     845             :  * Allocate a new mm structure and copy contents from the
     846             :  * mm structure of the passed in task structure.
     847             :  */
     848        2895 : static struct mm_struct *dup_mm(struct task_struct *tsk)
     849             : {
     850        2895 :         struct mm_struct *mm, *oldmm = current->mm;
     851             :         int err;
     852             : 
     853        2895 :         mm = allocate_mm();
     854        2895 :         if (!mm)
     855             :                 goto fail_nomem;
     856             : 
     857        2895 :         memcpy(mm, oldmm, sizeof(*mm));
     858             : 
     859        2895 :         if (!mm_init(mm, tsk))
     860             :                 goto fail_nomem;
     861             : 
     862             :         dup_mm_exe_file(oldmm, mm);
     863             : 
     864        2895 :         err = dup_mmap(mm, oldmm);
     865        2895 :         if (err)
     866             :                 goto free_pt;
     867             : 
     868        2895 :         mm->hiwater_rss = get_mm_rss(mm);
     869        2895 :         mm->hiwater_vm = mm->total_vm;
     870             : 
     871        2895 :         if (mm->binfmt && !try_module_get(mm->binfmt->module))
     872             :                 goto free_pt;
     873             : 
     874        2895 :         return mm;
     875             : 
     876             : free_pt:
     877             :         /* don't put binfmt in mmput, we haven't got module yet */
     878           0 :         mm->binfmt = NULL;
     879           0 :         mmput(mm);
     880             : 
     881             : fail_nomem:
     882             :         return NULL;
     883             : }
     884             : 
     885        2993 : static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
     886             : {
     887             :         struct mm_struct *mm, *oldmm;
     888             :         int retval;
     889             : 
     890        2993 :         tsk->min_flt = tsk->maj_flt = 0;
     891        2993 :         tsk->nvcsw = tsk->nivcsw = 0;
     892             : #ifdef CONFIG_DETECT_HUNG_TASK
     893        2993 :         tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
     894             : #endif
     895             : 
     896        2993 :         tsk->mm = NULL;
     897        2993 :         tsk->active_mm = NULL;
     898             : 
     899             :         /*
     900             :          * Are we cloning a kernel thread?
     901             :          *
     902             :          * We need to steal a active VM for that..
     903             :          */
     904        2993 :         oldmm = current->mm;
     905        2993 :         if (!oldmm)
     906             :                 return 0;
     907             : 
     908             :         /* initialize the new vmacache entries */
     909             :         vmacache_flush(tsk);
     910             : 
     911        2901 :         if (clone_flags & CLONE_VM) {
     912           6 :                 atomic_inc(&oldmm->mm_users);
     913             :                 mm = oldmm;
     914           6 :                 goto good_mm;
     915             :         }
     916             : 
     917             :         retval = -ENOMEM;
     918        2895 :         mm = dup_mm(tsk);
     919        2895 :         if (!mm)
     920             :                 goto fail_nomem;
     921             : 
     922             : good_mm:
     923        2901 :         tsk->mm = mm;
     924        2901 :         tsk->active_mm = mm;
     925        2901 :         return 0;
     926             : 
     927             : fail_nomem:
     928             :         return retval;
     929             : }
     930             : 
     931        2993 : static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
     932             : {
     933        2993 :         struct fs_struct *fs = current->fs;
     934        2993 :         if (clone_flags & CLONE_FS) {
     935             :                 /* tsk->fs is already what we want */
     936             :                 spin_lock(&fs->lock);
     937          76 :                 if (fs->in_exec) {
     938             :                         spin_unlock(&fs->lock);
     939             :                         return -EAGAIN;
     940             :                 }
     941          76 :                 fs->users++;
     942             :                 spin_unlock(&fs->lock);
     943             :                 return 0;
     944             :         }
     945        2917 :         tsk->fs = copy_fs_struct(fs);
     946        2917 :         if (!tsk->fs)
     947             :                 return -ENOMEM;
     948        2917 :         return 0;
     949             : }
     950             : 
     951        2993 : static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
     952             : {
     953             :         struct files_struct *oldf, *newf;
     954        2993 :         int error = 0;
     955             : 
     956             :         /*
     957             :          * A background process may not have any files ...
     958             :          */
     959        2993 :         oldf = current->files;
     960        2993 :         if (!oldf)
     961             :                 goto out;
     962             : 
     963        2993 :         if (clone_flags & CLONE_FILES) {
     964          75 :                 atomic_inc(&oldf->count);
     965             :                 goto out;
     966             :         }
     967             : 
     968        2918 :         newf = dup_fd(oldf, &error);
     969        2918 :         if (!newf)
     970             :                 goto out;
     971             : 
     972        2918 :         tsk->files = newf;
     973        2918 :         error = 0;
     974             : out:
     975        2993 :         return error;
     976             : }
     977             : 
     978        2993 : static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
     979             : {
     980             : #ifdef CONFIG_BLOCK
     981        2993 :         struct io_context *ioc = current->io_context;
     982             :         struct io_context *new_ioc;
     983             : 
     984        2993 :         if (!ioc)
     985             :                 return 0;
     986             :         /*
     987             :          * Share io context with parent, if CLONE_IO is set
     988             :          */
     989        1017 :         if (clone_flags & CLONE_IO) {
     990             :                 ioc_task_link(ioc);
     991           0 :                 tsk->io_context = ioc;
     992        1017 :         } else if (ioprio_valid(ioc->ioprio)) {
     993           0 :                 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
     994           0 :                 if (unlikely(!new_ioc))
     995             :                         return -ENOMEM;
     996             : 
     997           0 :                 new_ioc->ioprio = ioc->ioprio;
     998           0 :                 put_io_context(new_ioc);
     999             :         }
    1000             : #endif
    1001             :         return 0;
    1002             : }
    1003             : 
    1004        2993 : static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
    1005             : {
    1006             :         struct sighand_struct *sig;
    1007             : 
    1008        2993 :         if (clone_flags & CLONE_SIGHAND) {
    1009           6 :                 atomic_inc(&current->sighand->count);
    1010           6 :                 return 0;
    1011             :         }
    1012        2987 :         sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
    1013        2987 :         rcu_assign_pointer(tsk->sighand, sig);
    1014        2987 :         if (!sig)
    1015             :                 return -ENOMEM;
    1016        2987 :         atomic_set(&sig->count, 1);
    1017        5974 :         memcpy(sig->action, current->sighand->action, sizeof(sig->action));
    1018        2987 :         return 0;
    1019             : }
    1020             : 
    1021        2915 : void __cleanup_sighand(struct sighand_struct *sighand)
    1022             : {
    1023        5830 :         if (atomic_dec_and_test(&sighand->count)) {
    1024        2915 :                 signalfd_cleanup(sighand);
    1025             :                 /*
    1026             :                  * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
    1027             :                  * without an RCU grace period, see __lock_task_sighand().
    1028             :                  */
    1029        2915 :                 kmem_cache_free(sighand_cachep, sighand);
    1030             :         }
    1031        2915 : }
    1032             : 
    1033             : /*
    1034             :  * Initialize POSIX timer handling for a thread group.
    1035             :  */
    1036             : static void posix_cpu_timers_init_group(struct signal_struct *sig)
    1037             : {
    1038             :         unsigned long cpu_limit;
    1039             : 
    1040             :         /* Thread group counters. */
    1041             :         thread_group_cputime_init(sig);
    1042             : 
    1043        2987 :         cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
    1044        2987 :         if (cpu_limit != RLIM_INFINITY) {
    1045           0 :                 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
    1046           0 :                 sig->cputimer.running = 1;
    1047             :         }
    1048             : 
    1049             :         /* The timer lists. */
    1050        2987 :         INIT_LIST_HEAD(&sig->cpu_timers[0]);
    1051        2987 :         INIT_LIST_HEAD(&sig->cpu_timers[1]);
    1052        2987 :         INIT_LIST_HEAD(&sig->cpu_timers[2]);
    1053             : }
    1054             : 
    1055        2993 : static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
    1056             : {
    1057             :         struct signal_struct *sig;
    1058             : 
    1059        2993 :         if (clone_flags & CLONE_THREAD)
    1060             :                 return 0;
    1061             : 
    1062        2987 :         sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
    1063        2987 :         tsk->signal = sig;
    1064        2987 :         if (!sig)
    1065             :                 return -ENOMEM;
    1066             : 
    1067        2987 :         sig->nr_threads = 1;
    1068        2987 :         atomic_set(&sig->live, 1);
    1069        2987 :         atomic_set(&sig->sigcnt, 1);
    1070             : 
    1071             :         /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
    1072        2987 :         sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
    1073        2987 :         tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
    1074             : 
    1075        2987 :         init_waitqueue_head(&sig->wait_chldexit);
    1076        2987 :         sig->curr_target = tsk;
    1077             :         init_sigpending(&sig->shared_pending);
    1078        2987 :         INIT_LIST_HEAD(&sig->posix_timers);
    1079             :         seqlock_init(&sig->stats_lock);
    1080             : 
    1081        2987 :         hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    1082        2987 :         sig->real_timer.function = it_real_fn;
    1083             : 
    1084             :         task_lock(current->group_leader);
    1085        5974 :         memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
    1086             :         task_unlock(current->group_leader);
    1087             : 
    1088             :         posix_cpu_timers_init_group(sig);
    1089             : 
    1090        2987 :         tty_audit_fork(sig);
    1091        2987 :         sched_autogroup_fork(sig);
    1092             : 
    1093             : #ifdef CONFIG_CGROUPS
    1094        2987 :         init_rwsem(&sig->group_rwsem);
    1095             : #endif
    1096             : 
    1097        2987 :         sig->oom_score_adj = current->signal->oom_score_adj;
    1098        2987 :         sig->oom_score_adj_min = current->signal->oom_score_adj_min;
    1099             : 
    1100        2987 :         sig->has_child_subreaper = current->signal->has_child_subreaper ||
    1101             :                                    current->signal->is_child_subreaper;
    1102             : 
    1103        2987 :         mutex_init(&sig->cred_guard_mutex);
    1104             : 
    1105        2987 :         return 0;
    1106             : }
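
The cross-initialization above (source lines 1072-1073) is, as its comment
notes, list_add(thread_node, thread_head) without a separate INIT_LIST_HEAD().
A standalone sketch, using a minimal stand-in for struct list_head rather than
<linux/list.h>, showing that both forms leave the same one-element circular
list:

    #include <assert.h>

    struct list_head { struct list_head *next, *prev; };
    #define LIST_HEAD_INIT(name) { &(name), &(name) }

    int main(void)
    {
            struct list_head head, node;

            /* same pattern as copy_signal(): each entry initialised to
             * point at the other one */
            head = (struct list_head)LIST_HEAD_INIT(node);
            node = (struct list_head)LIST_HEAD_INIT(head);

            /* exactly the state INIT_LIST_HEAD(&head); list_add(&node, &head)
             * would produce: a circular list with one element on it */
            assert(head.next == &node && head.prev == &node);
            assert(node.next == &head && node.prev == &head);
            return 0;
    }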
    1107             : 
    1108        2993 : static void copy_seccomp(struct task_struct *p)
    1109             : {
    1110             : #ifdef CONFIG_SECCOMP
    1111             :         /*
    1112             :          * Must be called with sighand->siglock held, which is common to
    1113             :          * all threads in the group. Holding cred_guard_mutex is not
    1114             :          * needed because this new task is not yet running and cannot
    1115             :          * be racing exec.
    1116             :          */
    1117             :         assert_spin_locked(&current->sighand->siglock);
    1118             : 
    1119             :         /* Ref-count the new filter user, and assign it. */
    1120        2993 :         get_seccomp_filter(current);
    1121        2993 :         p->seccomp = current->seccomp;
    1122             : 
    1123             :         /*
    1124             :          * Explicitly enable no_new_privs here in case it got set
    1125             :          * between the task_struct being duplicated and holding the
    1126             :          * sighand lock. The seccomp state and nnp must be in sync.
    1127             :          */
    1128        5986 :         if (task_no_new_privs(current))
    1129             :                 task_set_no_new_privs(p);
    1130             : 
    1131             :         /*
    1132             :          * If the parent gained a seccomp mode after copying thread
    1133             :          * flags but before we took the sighand lock, we have to
    1134             :          * manually enable the seccomp thread flag here.
    1135             :          */
    1136        2993 :         if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
    1137             :                 set_tsk_thread_flag(p, TIF_SECCOMP);
    1138             : #endif
    1139        2993 : }
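
copy_seccomp() guarantees that the child's seccomp state and no_new_privs flag
match the parent's at fork time, even if they changed concurrently. A minimal
userspace sketch, assuming headers that define PR_SET_NO_NEW_PRIVS, showing
the inherited flag; illustrative only:

    #include <sys/prctl.h>
    #include <sys/wait.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
            if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
                    perror("prctl");
                    return 1;
            }
            if (fork() == 0) {
                    /* child: copied from the parent by copy_seccomp() */
                    printf("child no_new_privs = %d\n",
                           prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
                    _exit(0);
            }
            wait(NULL);
            return 0;
    }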
    1140             : 
    1141         478 : SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
    1142             : {
    1143         239 :         current->clear_child_tid = tidptr;
    1144             : 
    1145         239 :         return task_pid_vnr(current);
    1146             : }
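
set_tid_address() records a user pointer in clear_child_tid and returns the
caller's TID; C libraries call it at startup so that mm_release() can clear
the slot and wake futex waiters when the thread exits. A hypothetical direct
invocation through syscall(2); names are illustrative:

    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    static int tid_slot;            /* kernel will clear this at thread exit */

    int main(void)
    {
            long tid = syscall(SYS_set_tid_address, &tid_slot);

            printf("kernel reports tid %ld\n", tid);
            return 0;
    }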
    1147             : 
    1148             : static void rt_mutex_init_task(struct task_struct *p)
    1149             : {
    1150             :         raw_spin_lock_init(&p->pi_lock);
    1151             : #ifdef CONFIG_RT_MUTEXES
    1152        2993 :         p->pi_waiters = RB_ROOT;
    1153        2993 :         p->pi_waiters_leftmost = NULL;
    1154        2993 :         p->pi_blocked_on = NULL;
    1155             : #endif
    1156             : }
    1157             : 
    1158             : /*
    1159             :  * Initialize POSIX timer handling for a single task.
    1160             :  */
    1161             : static void posix_cpu_timers_init(struct task_struct *tsk)
    1162             : {
    1163        2993 :         tsk->cputime_expires.prof_exp = 0;
    1164        2993 :         tsk->cputime_expires.virt_exp = 0;
    1165        2993 :         tsk->cputime_expires.sched_exp = 0;
    1166        2993 :         INIT_LIST_HEAD(&tsk->cpu_timers[0]);
    1167        2993 :         INIT_LIST_HEAD(&tsk->cpu_timers[1]);
    1168        2993 :         INIT_LIST_HEAD(&tsk->cpu_timers[2]);
    1169             : }
    1170             : 
    1171             : static inline void
    1172             : init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
    1173             : {
    1174        8964 :          task->pids[type].pid = pid;
    1175             : }
    1176             : 
    1177             : /*
    1178             :  * This creates a new process as a copy of the old one,
    1179             :  * but does not actually start it yet.
    1180             :  *
    1181             :  * It copies the registers, and all the appropriate
    1182             :  * parts of the process environment (as per the clone
    1183             :  * flags). The actual kick-off is left to the caller.
    1184             :  */
    1185        2993 : static struct task_struct *copy_process(unsigned long clone_flags,
    1186             :                                         unsigned long stack_start,
    1187             :                                         unsigned long stack_size,
    1188             :                                         int __user *child_tidptr,
    1189             :                                         struct pid *pid,
    1190             :                                         int trace)
    1191             : {
    1192             :         int retval;
    1193             :         struct task_struct *p;
    1194             : 
    1195        2993 :         if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
    1196             :                 return ERR_PTR(-EINVAL);
    1197             : 
    1198        2993 :         if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
    1199             :                 return ERR_PTR(-EINVAL);
    1200             : 
    1201             :         /*
    1202             :          * Thread groups must share signals as well, and detached threads
    1203             :          * can only be started up within the thread group.
    1204             :          */
    1205        2993 :         if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
    1206             :                 return ERR_PTR(-EINVAL);
    1207             : 
    1208             :         /*
    1209             :          * Shared signal handlers imply shared VM. By way of the above,
    1210             :          * thread groups also imply shared VM. Blocking this case allows
    1211             :          * for various simplifications in other code.
    1212             :          */
    1213        2993 :         if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
    1214             :                 return ERR_PTR(-EINVAL);
    1215             : 
    1216             :         /*
    1217             :          * Siblings of global init remain as zombies on exit since they are
    1218             :          * not reaped by their parent (swapper). To solve this and to avoid
    1219             :          * multi-rooted process trees, prevent global and container-inits
    1220             :          * from creating siblings.
    1221             :          */
    1222        2993 :         if ((clone_flags & CLONE_PARENT) &&
    1223           0 :                                 current->signal->flags & SIGNAL_UNKILLABLE)
    1224             :                 return ERR_PTR(-EINVAL);
    1225             : 
    1226             :         /*
    1227             :          * If the new process will be in a different pid or user namespace
    1228             :          * do not allow it to share a thread group or signal handlers or
    1229             :          * parent with the forking task.
    1230             :          */
    1231        2993 :         if (clone_flags & CLONE_SIGHAND) {
    1232          12 :                 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
    1233           6 :                     (task_active_pid_ns(current) !=
    1234           6 :                                 current->nsproxy->pid_ns_for_children))
    1235             :                         return ERR_PTR(-EINVAL);
    1236             :         }
    1237             : 
    1238             :         retval = security_task_create(clone_flags);
    1239             :         if (retval)
    1240             :                 goto fork_out;
    1241             : 
    1242             :         retval = -ENOMEM;
    1243        2993 :         p = dup_task_struct(current);
    1244        2993 :         if (!p)
    1245             :                 goto fork_out;
    1246             : 
    1247             :         ftrace_graph_init_task(p);
    1248             : 
    1249             :         rt_mutex_init_task(p);
    1250             : 
    1251             : #ifdef CONFIG_PROVE_LOCKING
    1252             :         DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    1253             :         DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
    1254             : #endif
    1255             :         retval = -EAGAIN;
    1256        5986 :         if (atomic_read(&p->real_cred->user->processes) >=
    1257             :                         task_rlimit(p, RLIMIT_NPROC)) {
    1258           0 :                 if (p->real_cred->user != INIT_USER &&
    1259           0 :                     !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
    1260             :                         goto bad_fork_free;
    1261             :         }
    1262        2993 :         current->flags &= ~PF_NPROC_EXCEEDED;
    1263             : 
    1264        2993 :         retval = copy_creds(p, clone_flags);
    1265        2993 :         if (retval < 0)
    1266             :                 goto bad_fork_free;
    1267             : 
    1268             :         /*
    1269             :          * If multiple threads are within copy_process(), then this check
    1270             :          * triggers too late. This doesn't hurt; the check is only there
    1271             :          * to stop root fork bombs.
    1272             :          */
    1273             :         retval = -EAGAIN;
    1274        2993 :         if (nr_threads >= max_threads)
    1275             :                 goto bad_fork_cleanup_count;
    1276             : 
    1277        2993 :         if (!try_module_get(task_thread_info(p)->exec_domain->module))
    1278             :                 goto bad_fork_cleanup_count;
    1279             : 
    1280             :         delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
    1281        2993 :         p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
    1282        2993 :         p->flags |= PF_FORKNOEXEC;
    1283        2993 :         INIT_LIST_HEAD(&p->children);
    1284        2993 :         INIT_LIST_HEAD(&p->sibling);
    1285             :         rcu_copy_process(p);
    1286        2993 :         p->vfork_done = NULL;
    1287             :         spin_lock_init(&p->alloc_lock);
    1288             : 
    1289             :         init_sigpending(&p->pending);
    1290             : 
    1291        2993 :         p->utime = p->stime = p->gtime = 0;
    1292        2993 :         p->utimescaled = p->stimescaled = 0;
    1293             : #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
    1294        2993 :         p->prev_cputime.utime = p->prev_cputime.stime = 0;
    1295             : #endif
    1296             : #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
    1297             :         seqlock_init(&p->vtime_seqlock);
    1298             :         p->vtime_snap = 0;
    1299             :         p->vtime_snap_whence = VTIME_SLEEPING;
    1300             : #endif
    1301             : 
    1302             : #if defined(SPLIT_RSS_COUNTING)
    1303             :         memset(&p->rss_stat, 0, sizeof(p->rss_stat));
    1304             : #endif
    1305             : 
    1306        2993 :         p->default_timer_slack_ns = current->timer_slack_ns;
    1307             : 
    1308        2993 :         task_io_accounting_init(&p->ioac);
    1309        2993 :         acct_clear_integrals(p);
    1310             : 
    1311             :         posix_cpu_timers_init(p);
    1312             : 
    1313        2993 :         p->start_time = ktime_get_ns();
    1314        2993 :         p->real_start_time = ktime_get_boot_ns();
    1315        2993 :         p->io_context = NULL;
    1316        2993 :         p->audit_context = NULL;
    1317        2993 :         if (clone_flags & CLONE_THREAD)
    1318           6 :                 threadgroup_change_begin(current);
    1319        2993 :         cgroup_fork(p);
    1320             : #ifdef CONFIG_NUMA
    1321             :         p->mempolicy = mpol_dup(p->mempolicy);
    1322             :         if (IS_ERR(p->mempolicy)) {
    1323             :                 retval = PTR_ERR(p->mempolicy);
    1324             :                 p->mempolicy = NULL;
    1325             :                 goto bad_fork_cleanup_threadgroup_lock;
    1326             :         }
    1327             : #endif
    1328             : #ifdef CONFIG_CPUSETS
    1329             :         p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
    1330             :         p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
    1331             :         seqcount_init(&p->mems_allowed_seq);
    1332             : #endif
    1333             : #ifdef CONFIG_TRACE_IRQFLAGS
    1334             :         p->irq_events = 0;
    1335             :         p->hardirqs_enabled = 0;
    1336             :         p->hardirq_enable_ip = 0;
    1337             :         p->hardirq_enable_event = 0;
    1338             :         p->hardirq_disable_ip = _THIS_IP_;
    1339             :         p->hardirq_disable_event = 0;
    1340             :         p->softirqs_enabled = 1;
    1341             :         p->softirq_enable_ip = _THIS_IP_;
    1342             :         p->softirq_enable_event = 0;
    1343             :         p->softirq_disable_ip = 0;
    1344             :         p->softirq_disable_event = 0;
    1345             :         p->hardirq_context = 0;
    1346             :         p->softirq_context = 0;
    1347             : #endif
    1348             : #ifdef CONFIG_LOCKDEP
    1349             :         p->lockdep_depth = 0; /* no locks held yet */
    1350             :         p->curr_chain_key = 0;
    1351             :         p->lockdep_recursion = 0;
    1352             : #endif
    1353             : 
    1354             : #ifdef CONFIG_DEBUG_MUTEXES
    1355             :         p->blocked_on = NULL; /* not blocked yet */
    1356             : #endif
    1357             : #ifdef CONFIG_BCACHE
    1358             :         p->sequential_io     = 0;
    1359             :         p->sequential_io_avg = 0;
    1360             : #endif
    1361             : 
    1362             :         /* Perform scheduler related setup. Assign this task to a CPU. */
    1363        2993 :         retval = sched_fork(clone_flags, p);
    1364        2993 :         if (retval)
    1365             :                 goto bad_fork_cleanup_policy;
    1366             : 
    1367        2993 :         retval = perf_event_init_task(p);
    1368        2993 :         if (retval)
    1369             :                 goto bad_fork_cleanup_policy;
    1370             :         retval = audit_alloc(p);
    1371             :         if (retval)
    1372             :                 goto bad_fork_cleanup_perf;
    1373             :         /* copy all the process information */
    1374        2993 :         shm_init_task(p);
    1375        2993 :         retval = copy_semundo(clone_flags, p);
    1376        2993 :         if (retval)
    1377             :                 goto bad_fork_cleanup_audit;
    1378        2993 :         retval = copy_files(clone_flags, p);
    1379        2993 :         if (retval)
    1380             :                 goto bad_fork_cleanup_semundo;
    1381        2993 :         retval = copy_fs(clone_flags, p);
    1382        2993 :         if (retval)
    1383             :                 goto bad_fork_cleanup_files;
    1384        2993 :         retval = copy_sighand(clone_flags, p);
    1385        2993 :         if (retval)
    1386             :                 goto bad_fork_cleanup_fs;
    1387        2993 :         retval = copy_signal(clone_flags, p);
    1388        2993 :         if (retval)
    1389             :                 goto bad_fork_cleanup_sighand;
    1390        2993 :         retval = copy_mm(clone_flags, p);
    1391        2992 :         if (retval)
    1392             :                 goto bad_fork_cleanup_signal;
    1393        2993 :         retval = copy_namespaces(clone_flags, p);
    1394        2993 :         if (retval)
    1395             :                 goto bad_fork_cleanup_mm;
    1396        2993 :         retval = copy_io(clone_flags, p);
    1397        2993 :         if (retval)
    1398             :                 goto bad_fork_cleanup_namespaces;
    1399        2993 :         retval = copy_thread(clone_flags, stack_start, stack_size, p);
    1400        2993 :         if (retval)
    1401             :                 goto bad_fork_cleanup_io;
    1402             : 
    1403        2993 :         if (pid != &init_struct_pid) {
    1404             :                 retval = -ENOMEM;
    1405        2993 :                 pid = alloc_pid(p->nsproxy->pid_ns_for_children);
    1406        2993 :                 if (!pid)
    1407             :                         goto bad_fork_cleanup_io;
    1408             :         }
    1409             : 
    1410        2993 :         p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    1411             :         /*
    1412             :          * Clear TID on mm_release()?
    1413             :          */
    1414        2993 :         p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
    1415             : #ifdef CONFIG_BLOCK
    1416        2993 :         p->plug = NULL;
    1417             : #endif
    1418             : #ifdef CONFIG_FUTEX
    1419        2993 :         p->robust_list = NULL;
    1420             : #ifdef CONFIG_COMPAT
    1421             :         p->compat_robust_list = NULL;
    1422             : #endif
    1423        2993 :         INIT_LIST_HEAD(&p->pi_state_list);
    1424        2993 :         p->pi_state_cache = NULL;
    1425             : #endif
    1426             :         /*
    1427             :          * sigaltstack should be cleared when sharing the same VM
    1428             :          */
    1429        2993 :         if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
    1430          98 :                 p->sas_ss_sp = p->sas_ss_size = 0;
    1431             : 
    1432             :         /*
    1433             :          * Syscall tracing and stepping should be turned off in the
    1434             :          * child regardless of CLONE_PTRACE.
    1435             :          */
    1436             :         user_disable_single_step(p);
    1437             :         clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
    1438             : #ifdef TIF_SYSCALL_EMU
    1439             :         clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
    1440             : #endif
    1441             :         clear_all_latency_tracing(p);
    1442             : 
    1443             :         /* ok, now we should be set up.. */
    1444        2993 :         p->pid = pid_nr(pid);
    1445        2993 :         if (clone_flags & CLONE_THREAD) {
    1446           6 :                 p->exit_signal = -1;
    1447           6 :                 p->group_leader = current->group_leader;
    1448           6 :                 p->tgid = current->tgid;
    1449             :         } else {
    1450        2987 :                 if (clone_flags & CLONE_PARENT)
    1451           0 :                         p->exit_signal = current->group_leader->exit_signal;
    1452             :                 else
    1453        2987 :                         p->exit_signal = (clone_flags & CSIGNAL);
    1454        2987 :                 p->group_leader = p;
    1455        2987 :                 p->tgid = p->pid;
    1456             :         }
    1457             : 
    1458        2993 :         p->nr_dirtied = 0;
    1459        2993 :         p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
    1460        2993 :         p->dirty_paused_when = 0;
    1461             : 
    1462        2993 :         p->pdeath_signal = 0;
    1463        2993 :         INIT_LIST_HEAD(&p->thread_group);
    1464        2993 :         p->task_works = NULL;
    1465             : 
    1466             :         /*
    1467             :          * Make it visible to the rest of the system, but don't wake it up yet.
    1468             :          * Need tasklist lock for parent etc handling!
    1469             :          */
    1470        2993 :         write_lock_irq(&tasklist_lock);
    1471             : 
    1472             :         /* CLONE_PARENT re-uses the old parent */
    1473        2993 :         if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
    1474           6 :                 p->real_parent = current->real_parent;
    1475           6 :                 p->parent_exec_id = current->parent_exec_id;
    1476             :         } else {
    1477        2987 :                 p->real_parent = current;
    1478        2987 :                 p->parent_exec_id = current->self_exec_id;
    1479             :         }
    1480             : 
    1481             :         spin_lock(&current->sighand->siglock);
    1482             : 
    1483             :         /*
    1484             :          * Copy seccomp details explicitly here, in case they were changed
    1485             :          * before holding sighand lock.
    1486             :          */
    1487        2993 :         copy_seccomp(p);
    1488             : 
    1489             :         /*
    1490             :          * Process group and session signals need to be delivered to just the
    1491             :          * parent before the fork or both the parent and the child after the
    1492             :          * fork. Restart if a signal comes in before we add the new process to
    1493             :          * its process group.
    1494             :          * A fatal signal pending means that current will exit, so the new
    1495             :          * thread can't slip out of an OOM kill (or normal SIGKILL).
    1496             :          */
    1497        2993 :         recalc_sigpending();
    1498        5986 :         if (signal_pending(current)) {
    1499             :                 spin_unlock(&current->sighand->siglock);
    1500           2 :                 write_unlock_irq(&tasklist_lock);
    1501             :                 retval = -ERESTARTNOINTR;
    1502             :                 goto bad_fork_free_pid;
    1503             :         }
    1504             : 
    1505        2992 :         if (likely(p->pid)) {
    1506        2992 :                 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
    1507             : 
    1508             :                 init_task_pid(p, PIDTYPE_PID, pid);
    1509        2992 :                 if (thread_group_leader(p)) {
    1510        2986 :                         init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
    1511        2986 :                         init_task_pid(p, PIDTYPE_SID, task_session(current));
    1512             : 
    1513        2986 :                         if (is_child_reaper(pid)) {
    1514           1 :                                 ns_of_pid(pid)->child_reaper = p;
    1515           1 :                                 p->signal->flags |= SIGNAL_UNKILLABLE;
    1516             :                         }
    1517             : 
    1518        2986 :                         p->signal->leader_pid = pid;
    1519        5972 :                         p->signal->tty = tty_kref_get(current->signal->tty);
    1520        2986 :                         list_add_tail(&p->sibling, &p->real_parent->children);
    1521        2986 :                         list_add_tail_rcu(&p->tasks, &init_task.tasks);
    1522        2986 :                         attach_pid(p, PIDTYPE_PGID);
    1523        2986 :                         attach_pid(p, PIDTYPE_SID);
    1524        2986 :                         __this_cpu_inc(process_counts);
    1525             :                 } else {
    1526           6 :                         current->signal->nr_threads++;
    1527           6 :                         atomic_inc(&current->signal->live);
    1528           6 :                         atomic_inc(&current->signal->sigcnt);
    1529           6 :                         list_add_tail_rcu(&p->thread_group,
    1530           6 :                                           &p->group_leader->thread_group);
    1531          12 :                         list_add_tail_rcu(&p->thread_node,
    1532           6 :                                           &p->signal->thread_head);
    1533             :                 }
    1534        2992 :                 attach_pid(p, PIDTYPE_PID);
    1535        2992 :                 nr_threads++;
    1536             :         }
    1537             : 
    1538        2992 :         total_forks++;
    1539             :         spin_unlock(&current->sighand->siglock);
    1540             :         syscall_tracepoint_update(p);
    1541        5984 :         write_unlock_irq(&tasklist_lock);
    1542             : 
    1543             :         proc_fork_connector(p);
    1544        2992 :         cgroup_post_fork(p);
    1545        2992 :         if (clone_flags & CLONE_THREAD)
    1546           6 :                 threadgroup_change_end(current);
    1547        2992 :         perf_event_fork(p);
    1548             : 
    1549             :         trace_task_newtask(p, clone_flags);
    1550             :         uprobe_copy_process(p, clone_flags);
    1551             : 
    1552        2992 :         return p;
    1553             : 
    1554             : bad_fork_free_pid:
    1555           1 :         if (pid != &init_struct_pid)
    1556           1 :                 free_pid(pid);
    1557             : bad_fork_cleanup_io:
    1558           1 :         if (p->io_context)
    1559           0 :                 exit_io_context(p);
    1560             : bad_fork_cleanup_namespaces:
    1561           1 :         exit_task_namespaces(p);
    1562             : bad_fork_cleanup_mm:
    1563           2 :         if (p->mm)
    1564           1 :                 mmput(p->mm);
    1565             : bad_fork_cleanup_signal:
    1566           1 :         if (!(clone_flags & CLONE_THREAD))
    1567           1 :                 free_signal_struct(p->signal);
    1568             : bad_fork_cleanup_sighand:
    1569           1 :         __cleanup_sighand(p->sighand);
    1570             : bad_fork_cleanup_fs:
    1571           1 :         exit_fs(p); /* blocking */
    1572             : bad_fork_cleanup_files:
    1573           1 :         exit_files(p); /* blocking */
    1574             : bad_fork_cleanup_semundo:
    1575           1 :         exit_sem(p);
    1576             : bad_fork_cleanup_audit:
    1577             :         audit_free(p);
    1578             : bad_fork_cleanup_perf:
    1579           1 :         perf_event_free_task(p);
    1580             : bad_fork_cleanup_policy:
    1581             : #ifdef CONFIG_NUMA
    1582             :         mpol_put(p->mempolicy);
    1583             : bad_fork_cleanup_threadgroup_lock:
    1584             : #endif
    1585           1 :         if (clone_flags & CLONE_THREAD)
    1586           0 :                 threadgroup_change_end(current);
    1587             :         delayacct_tsk_free(p);
    1588           1 :         module_put(task_thread_info(p)->exec_domain->module);
    1589             : bad_fork_cleanup_count:
    1590           1 :         atomic_dec(&p->cred->user->processes);
    1591           1 :         exit_creds(p);
    1592             : bad_fork_free:
    1593           1 :         free_task(p);
    1594             : fork_out:
    1595           1 :         return ERR_PTR(retval);
    1596             : }
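
The EINVAL checks at the top of copy_process() reject incompatible flag
combinations before any state is allocated. A hypothetical userspace sketch
(glibc clone() wrapper; the stack and function names are illustrative) of the
CLONE_THREAD-without-CLONE_SIGHAND rejection:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <errno.h>
    #include <stdio.h>

    static char child_stack[64 * 1024];

    static int child_fn(void *arg)
    {
            return 0;               /* never runs: the clone() below fails */
    }

    int main(void)
    {
            /* CLONE_THREAD requires CLONE_SIGHAND (which requires CLONE_VM) */
            int ret = clone(child_fn, child_stack + sizeof(child_stack),
                            CLONE_THREAD, NULL);

            if (ret == -1 && errno == EINVAL)
                    printf("clone(CLONE_THREAD) rejected as expected\n");
            return 0;
    }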
    1597             : 
    1598             : static inline void init_idle_pids(struct pid_link *links)
    1599             : {
    1600             :         enum pid_type type;
    1601             : 
    1602           0 :         for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
    1603           0 :                 INIT_HLIST_NODE(&links[type].node); /* not really needed */
    1604           0 :                 links[type].pid = &init_struct_pid;
    1605             :         }
    1606             : }
    1607             : 
    1608           0 : struct task_struct *fork_idle(int cpu)
    1609             : {
    1610             :         struct task_struct *task;
    1611           0 :         task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
    1612           0 :         if (!IS_ERR(task)) {
    1613           0 :                 init_idle_pids(task->pids);
    1614           0 :                 init_idle(task, cpu);
    1615             :         }
    1616             : 
    1617           0 :         return task;
    1618             : }
    1619             : 
    1620             : /*
    1621             :  *  Ok, this is the main fork-routine.
    1622             :  *
    1623             :  * It copies the process, and if successful kick-starts
    1624             :  * it and waits for it to finish using the VM if required.
    1625             :  */
    1626        2993 : long do_fork(unsigned long clone_flags,
    1627             :               unsigned long stack_start,
    1628             :               unsigned long stack_size,
    1629             :               int __user *parent_tidptr,
    1630             :               int __user *child_tidptr)
    1631             : {
    1632             :         struct task_struct *p;
    1633             :         int trace = 0;
    1634             :         long nr;
    1635             : 
    1636             :         /*
    1637             :          * Determine whether and which event to report to ptracer.  When
    1638             :          * called from kernel_thread or CLONE_UNTRACED is explicitly
    1639             :          * requested, no event is reported; otherwise, report if the event
    1640             :          * for the type of forking is enabled.
    1641             :          */
    1642        2993 :         if (!(clone_flags & CLONE_UNTRACED)) {
    1643        2901 :                 if (clone_flags & CLONE_VFORK)
    1644             :                         trace = PTRACE_EVENT_VFORK;
    1645        2901 :                 else if ((clone_flags & CSIGNAL) != SIGCHLD)
    1646             :                         trace = PTRACE_EVENT_CLONE;
    1647             :                 else
    1648             :                         trace = PTRACE_EVENT_FORK;
    1649             : 
    1650        5802 :                 if (likely(!ptrace_event_enabled(current, trace)))
    1651             :                         trace = 0;
    1652             :         }
    1653             : 
    1654        2993 :         p = copy_process(clone_flags, stack_start, stack_size,
    1655             :                          child_tidptr, NULL, trace);
    1656             :         /*
    1657             :          * Do this prior to waking up the new thread - the thread pointer
    1658             :          * might get invalid after that point, if the thread exits quickly.
    1659             :          */
    1660        2993 :         if (!IS_ERR(p)) {
    1661             :                 struct completion vfork;
    1662             :                 struct pid *pid;
    1663             : 
    1664             :                 trace_sched_process_fork(current, p);
    1665             : 
    1666        2992 :                 pid = get_task_pid(p, PIDTYPE_PID);
    1667        2992 :                 nr = pid_vnr(pid);
    1668             : 
    1669        2992 :                 if (clone_flags & CLONE_PARENT_SETTID)
    1670           6 :                         put_user(nr, parent_tidptr);
    1671             : 
    1672        2992 :                 if (clone_flags & CLONE_VFORK) {
    1673           0 :                         p->vfork_done = &vfork;
    1674             :                         init_completion(&vfork);
    1675           0 :                         get_task_struct(p);
    1676             :                 }
    1677             : 
    1678        2992 :                 wake_up_new_task(p);
    1679             : 
    1680             :                 /* forking complete and child started to run, tell ptracer */
    1681        2992 :                 if (unlikely(trace))
    1682             :                         ptrace_event_pid(trace, pid);
    1683             : 
    1684        2992 :                 if (clone_flags & CLONE_VFORK) {
    1685           0 :                         if (!wait_for_vfork_done(p, &vfork))
    1686             :                                 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
    1687             :                 }
    1688             : 
    1689        2992 :                 put_pid(pid);
    1690             :         } else {
    1691             :                 nr = PTR_ERR(p);
    1692             :         }
    1693        2993 :         return nr;
    1694             : }
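
When CLONE_VFORK is set, do_fork() makes the parent block in
wait_for_vfork_done() until the child exits or execs. A minimal userspace
sketch of that behaviour through vfork(); illustrative only:

    #include <sys/wait.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
            pid_t pid = vfork();

            if (pid == 0)
                    _exit(0);       /* a vfork() child may only _exit() or exec */

            /* vfork() does not return in the parent until the child is done */
            printf("parent resumed after child %d finished\n", (int)pid);
            wait(NULL);
            return 0;
    }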
    1695             : 
    1696             : /*
    1697             :  * Create a kernel thread.
    1698             :  */
    1699          92 : pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
    1700             : {
    1701          92 :         return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
    1702             :                 (unsigned long)arg, NULL, NULL);
    1703             : }
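
kernel_thread() is the low-level helper; most kernel code creates threads
through the kthread API, which reaches kernel_thread() via kthreadd. A hedged
in-kernel sketch, assuming module context, with illustrative names:

    #include <linux/module.h>
    #include <linux/kthread.h>
    #include <linux/delay.h>
    #include <linux/err.h>

    static struct task_struct *worker;

    static int worker_fn(void *data)
    {
            while (!kthread_should_stop())
                    msleep(100);            /* placeholder work loop */
            return 0;
    }

    static int __init example_init(void)
    {
            worker = kthread_run(worker_fn, NULL, "fork-example");
            return PTR_ERR_OR_ZERO(worker); /* fail module load on error */
    }

    static void __exit example_exit(void)
    {
            kthread_stop(worker);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");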
    1704             : 
    1705             : #ifdef __ARCH_WANT_SYS_FORK
    1706           0 : SYSCALL_DEFINE0(fork)
    1707             : {
    1708             : #ifdef CONFIG_MMU
    1709           0 :         return do_fork(SIGCHLD, 0, 0, NULL, NULL);
    1710             : #else
    1711             :         /* can not support in nommu mode */
    1712             :         return -EINVAL;
    1713             : #endif
    1714             : }
    1715             : #endif
    1716             : 
    1717             : #ifdef __ARCH_WANT_SYS_VFORK
    1718           0 : SYSCALL_DEFINE0(vfork)
    1719             : {
    1720           0 :         return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
    1721             :                         0, NULL, NULL);
    1722             : }
    1723             : #endif
    1724             : 
    1725             : #ifdef __ARCH_WANT_SYS_CLONE
    1726             : #ifdef CONFIG_CLONE_BACKWARDS
    1727        5802 : SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
    1728             :                  int __user *, parent_tidptr,
    1729             :                  int, tls_val,
    1730             :                  int __user *, child_tidptr)
    1731             : #elif defined(CONFIG_CLONE_BACKWARDS2)
    1732             : SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
    1733             :                  int __user *, parent_tidptr,
    1734             :                  int __user *, child_tidptr,
    1735             :                  int, tls_val)
    1736             : #elif defined(CONFIG_CLONE_BACKWARDS3)
    1737             : SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
    1738             :                 int, stack_size,
    1739             :                 int __user *, parent_tidptr,
    1740             :                 int __user *, child_tidptr,
    1741             :                 int, tls_val)
    1742             : #else
    1743             : SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
    1744             :                  int __user *, parent_tidptr,
    1745             :                  int __user *, child_tidptr,
    1746             :                  int, tls_val)
    1747             : #endif
    1748             : {
    1749        2901 :         return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
    1750             : }
    1751             : #endif
    1752             : 
    1753             : #ifndef ARCH_MIN_MMSTRUCT_ALIGN
    1754             : #define ARCH_MIN_MMSTRUCT_ALIGN 0
    1755             : #endif
    1756             : 
    1757         108 : static void sighand_ctor(void *data)
    1758             : {
    1759             :         struct sighand_struct *sighand = data;
    1760             : 
    1761             :         spin_lock_init(&sighand->siglock);
    1762         108 :         init_waitqueue_head(&sighand->signalfd_wqh);
    1763         108 : }
    1764             : 
    1765           1 : void __init proc_caches_init(void)
    1766             : {
    1767           1 :         sighand_cachep = kmem_cache_create("sighand_cache",
    1768             :                         sizeof(struct sighand_struct), 0,
    1769             :                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
    1770             :                         SLAB_NOTRACK, sighand_ctor);
    1771           1 :         signal_cachep = kmem_cache_create("signal_cache",
    1772             :                         sizeof(struct signal_struct), 0,
    1773             :                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
    1774           1 :         files_cachep = kmem_cache_create("files_cache",
    1775             :                         sizeof(struct files_struct), 0,
    1776             :                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
    1777           1 :         fs_cachep = kmem_cache_create("fs_cache",
    1778             :                         sizeof(struct fs_struct), 0,
    1779             :                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
    1780             :         /*
    1781             :          * FIXME! The "sizeof(struct mm_struct)" currently includes the
    1782             :          * whole struct cpumask for the OFFSTACK case. We could change
    1783             :          * this to *only* allocate as much of it as required by the
    1784             :          * maximum number of CPUs we can ever have.  The cpumask_allocation
    1785             :          * is at the end of the structure, exactly for that reason.
    1786             :          */
    1787           1 :         mm_cachep = kmem_cache_create("mm_struct",
    1788             :                         sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
    1789             :                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
    1790           1 :         vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
    1791           1 :         mmap_init();
    1792           1 :         nsproxy_cache_init();
    1793           1 : }
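
The caches created above are used elsewhere in this file; copy_signal(), for
example, allocates from signal_cachep and the error paths free back into it.
A hypothetical helper, written as if it lived in fork.c so signal_cachep is
in scope, showing that allocate/free pairing:

    /* Illustrative only; not part of fork.c. */
    static int example_signal_alloc(void)
    {
            struct signal_struct *sig;

            sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); /* zeroed object */
            if (!sig)
                    return -ENOMEM;

            /* ... fields would be initialised here, as copy_signal() does ... */

            kmem_cache_free(signal_cachep, sig);                /* return to cache */
            return 0;
    }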
    1794             : 
    1795             : /*
    1796             :  * Check constraints on flags passed to the unshare system call.
    1797             :  */
    1798           1 : static int check_unshare_flags(unsigned long unshare_flags)
    1799             : {
    1800           1 :         if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
    1801             :                                 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
    1802             :                                 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
    1803             :                                 CLONE_NEWUSER|CLONE_NEWPID))
    1804             :                 return -EINVAL;
    1805             :         /*
    1806             :          * Not implemented, but pretend it works if there is nothing to
    1807             :          * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
    1808             :          * needs to unshare vm.
    1809             :          */
    1810           1 :         if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
    1811             :                 /* FIXME: get_task_mm() increments ->mm_users */
    1812           0 :                 if (atomic_read(&current->mm->mm_users) > 1)
    1813             :                         return -EINVAL;
    1814             :         }
    1815             : 
    1816           1 :         return 0;
    1817             : }
    1818             : 
    1819             : /*
    1820             :  * Unshare the filesystem structure if it is being shared
    1821             :  */
    1822           1 : static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
    1823             : {
    1824           1 :         struct fs_struct *fs = current->fs;
    1825             : 
    1826           1 :         if (!(unshare_flags & CLONE_FS) || !fs)
    1827             :                 return 0;
    1828             : 
    1829             :         /* don't need lock here; in the worst case we'll do useless copy */
    1830           1 :         if (fs->users == 1)
    1831             :                 return 0;
    1832             : 
    1833           1 :         *new_fsp = copy_fs_struct(fs);
    1834           1 :         if (!*new_fsp)
    1835             :                 return -ENOMEM;
    1836             : 
    1837           1 :         return 0;
    1838             : }
    1839             : 
    1840             : /*
    1841             :  * Unshare file descriptor table if it is being shared
    1842             :  */
    1843        2248 : static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
    1844             : {
    1845        2248 :         struct files_struct *fd = current->files;
    1846        2248 :         int error = 0;
    1847             : 
    1848        2248 :         if ((unshare_flags & CLONE_FILES) &&
    1849        2247 :             (fd && atomic_read(&fd->count) > 1)) {
    1850           0 :                 *new_fdp = dup_fd(fd, &error);
    1851           0 :                 if (!*new_fdp)
    1852           0 :                         return error;
    1853             :         }
    1854             : 
    1855             :         return 0;
    1856             : }
    1857             : 
    1858             : /*
    1859             :  * unshare allows a process to 'unshare' part of the process
    1860             :  * context which was originally shared using clone.  copy_*
    1861             :  * functions used by do_fork() cannot be used here directly
    1862             :  * because they modify an inactive task_struct that is being
    1863             :  * constructed. Here we are modifying the current, active,
    1864             :  * task_struct.
    1865             :  */
    1866           2 : SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
    1867             : {
    1868           1 :         struct fs_struct *fs, *new_fs = NULL;
    1869           1 :         struct files_struct *fd, *new_fd = NULL;
    1870             :         struct cred *new_cred = NULL;
    1871           1 :         struct nsproxy *new_nsproxy = NULL;
    1872             :         int do_sysvsem = 0;
    1873             :         int err;
    1874             : 
    1875             :         /*
    1876             :          * If unsharing a user namespace, we must also unshare the thread.
    1877             :          */
    1878           1 :         if (unshare_flags & CLONE_NEWUSER)
    1879           0 :                 unshare_flags |= CLONE_THREAD | CLONE_FS;
    1880             :         /*
    1881             :          * If unsharing a thread from a thread group, we must also unshare the vm.
    1882             :          */
    1883           1 :         if (unshare_flags & CLONE_THREAD)
    1884           0 :                 unshare_flags |= CLONE_VM;
    1885             :         /*
    1886             :          * If unsharing the vm, we must also unshare signal handlers.
    1887             :          */
    1888           1 :         if (unshare_flags & CLONE_VM)
    1889           0 :                 unshare_flags |= CLONE_SIGHAND;
    1890             :         /*
    1891             :          * If unsharing a namespace, we must also unshare filesystem information.
    1892             :          */
    1893           1 :         if (unshare_flags & CLONE_NEWNS)
    1894           1 :                 unshare_flags |= CLONE_FS;
    1895             : 
    1896           1 :         err = check_unshare_flags(unshare_flags);
    1897           1 :         if (err)
    1898             :                 goto bad_unshare_out;
    1899             :         /*
    1900             :          * CLONE_NEWIPC must also detach from the undolist: after switching
    1901             :          * to a new ipc namespace, the semaphore arrays from the old
    1902             :          * namespace are unreachable.
    1903             :          */
    1904           1 :         if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
    1905             :                 do_sysvsem = 1;
    1906           1 :         err = unshare_fs(unshare_flags, &new_fs);
    1907           1 :         if (err)
    1908             :                 goto bad_unshare_out;
    1909           1 :         err = unshare_fd(unshare_flags, &new_fd);
    1910           1 :         if (err)
    1911             :                 goto bad_unshare_cleanup_fs;
    1912             :         err = unshare_userns(unshare_flags, &new_cred);
    1913           1 :         if (err)
    1914             :                 goto bad_unshare_cleanup_fd;
    1915           1 :         err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
    1916             :                                          new_cred, new_fs);
    1917           1 :         if (err)
    1918             :                 goto bad_unshare_cleanup_cred;
    1919             : 
    1920           1 :         if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
    1921           1 :                 if (do_sysvsem) {
    1922             :                         /*
    1923             :                          * CLONE_SYSVSEM is equivalent to sys_exit().
    1924             :                          */
    1925           0 :                         exit_sem(current);
    1926             :                 }
    1927           1 :                 if (unshare_flags & CLONE_NEWIPC) {
    1928             :                         /* Orphan segments in old ns (see sem above). */
    1929           0 :                         exit_shm(current);
    1930           0 :                         shm_init_task(current);
    1931             :                 }
    1932             : 
    1933           1 :                 if (new_nsproxy)
    1934           1 :                         switch_task_namespaces(current, new_nsproxy);
    1935             : 
    1936             :                 task_lock(current);
    1937             : 
    1938           1 :                 if (new_fs) {
    1939           1 :                         fs = current->fs;
    1940             :                         spin_lock(&fs->lock);
    1941           1 :                         current->fs = new_fs;
    1942           1 :                         if (--fs->users)
    1943           1 :                                 new_fs = NULL;
    1944             :                         else
    1945           0 :                                 new_fs = fs;
    1946             :                         spin_unlock(&fs->lock);
    1947             :                 }
    1948             : 
    1949           1 :                 if (new_fd) {
    1950           0 :                         fd = current->files;
    1951           0 :                         current->files = new_fd;
    1952           0 :                         new_fd = fd;
    1953             :                 }
    1954             : 
    1955             :                 task_unlock(current);
    1956             : 
    1957             :                 if (new_cred) {
    1958             :                         /* Install the new user namespace */
    1959             :                         commit_creds(new_cred);
    1960             :                         new_cred = NULL;
    1961             :                 }
    1962             :         }
    1963             : 
    1964             : bad_unshare_cleanup_cred:
    1965             :         if (new_cred)
    1966             :                 put_cred(new_cred);
    1967             : bad_unshare_cleanup_fd:
    1968           1 :         if (new_fd)
    1969           0 :                 put_files_struct(new_fd);
    1970             : 
    1971             : bad_unshare_cleanup_fs:
    1972           1 :         if (new_fs)
    1973           0 :                 free_fs_struct(new_fs);
    1974             : 
    1975             : bad_unshare_out:
    1976             :         return err;
    1977             : }
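
The one unshare() call recorded in this report took the CLONE_NEWNS path,
unsharing the mount namespace (and, per the implications above, the
filesystem structure). A minimal userspace sketch of that case, assuming
CAP_SYS_ADMIN; illustrative only:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            if (unshare(CLONE_NEWNS) != 0) {
                    perror("unshare");
                    return 1;
            }
            /* this process now has its own copy of the mount table */
            printf("now in a private mount namespace\n");
            return 0;
    }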
    1978             : 
    1979             : /*
    1980             :  *      Helper to unshare the files of the current task.
    1981             :  *      We don't want to expose copy_files internals to
    1982             :  *      the exec layer of the kernel.
    1983             :  */
    1984             : 
    1985        2247 : int unshare_files(struct files_struct **displaced)
    1986             : {
    1987        2247 :         struct task_struct *task = current;
    1988        2247 :         struct files_struct *copy = NULL;
    1989             :         int error;
    1990             : 
    1991        2247 :         error = unshare_fd(CLONE_FILES, &copy);
    1992        2247 :         if (error || !copy) {
    1993        2247 :                 *displaced = NULL;
    1994        2247 :                 return error;
    1995             :         }
    1996           0 :         *displaced = task->files;
    1997             :         task_lock(task);
    1998           0 :         task->files = copy;
    1999             :         task_unlock(task);
    2000             :         return 0;
    2001             : }

Generated by: LCOV version 1.11