Line data Source code
1 : /*
2 : * linux/kernel/exit.c
3 : *
4 : * Copyright (C) 1991, 1992 Linus Torvalds
5 : */
6 :
7 : #include <linux/mm.h>
8 : #include <linux/slab.h>
9 : #include <linux/interrupt.h>
10 : #include <linux/module.h>
11 : #include <linux/capability.h>
12 : #include <linux/completion.h>
13 : #include <linux/personality.h>
14 : #include <linux/tty.h>
15 : #include <linux/iocontext.h>
16 : #include <linux/key.h>
17 : #include <linux/security.h>
18 : #include <linux/cpu.h>
19 : #include <linux/acct.h>
20 : #include <linux/tsacct_kern.h>
21 : #include <linux/file.h>
22 : #include <linux/fdtable.h>
23 : #include <linux/freezer.h>
24 : #include <linux/binfmts.h>
25 : #include <linux/nsproxy.h>
26 : #include <linux/pid_namespace.h>
27 : #include <linux/ptrace.h>
28 : #include <linux/profile.h>
29 : #include <linux/mount.h>
30 : #include <linux/proc_fs.h>
31 : #include <linux/kthread.h>
32 : #include <linux/mempolicy.h>
33 : #include <linux/taskstats_kern.h>
34 : #include <linux/delayacct.h>
35 : #include <linux/cgroup.h>
36 : #include <linux/syscalls.h>
37 : #include <linux/signal.h>
38 : #include <linux/posix-timers.h>
39 : #include <linux/cn_proc.h>
40 : #include <linux/mutex.h>
41 : #include <linux/futex.h>
42 : #include <linux/pipe_fs_i.h>
43 : #include <linux/audit.h> /* for audit_free() */
44 : #include <linux/resource.h>
45 : #include <linux/blkdev.h>
46 : #include <linux/task_io_accounting_ops.h>
47 : #include <linux/tracehook.h>
48 : #include <linux/fs_struct.h>
49 : #include <linux/init_task.h>
50 : #include <linux/perf_event.h>
51 : #include <trace/events/sched.h>
52 : #include <linux/hw_breakpoint.h>
53 : #include <linux/oom.h>
54 : #include <linux/writeback.h>
55 : #include <linux/shm.h>
56 :
57 : #include <asm/uaccess.h>
58 : #include <asm/unistd.h>
59 : #include <asm/pgtable.h>
60 : #include <asm/mmu_context.h>
61 :
62 : static void exit_mm(struct task_struct *tsk);
63 :
64 2914 : static void __unhash_process(struct task_struct *p, bool group_dead)
65 : {
66 2914 : nr_threads--;
67 2914 : detach_pid(p, PIDTYPE_PID);
68 2914 : if (group_dead) {
69 2914 : detach_pid(p, PIDTYPE_PGID);
70 2914 : detach_pid(p, PIDTYPE_SID);
71 :
72 : list_del_rcu(&p->tasks);
73 2914 : list_del_init(&p->sibling);
74 2914 : __this_cpu_dec(process_counts);
75 : }
76 : list_del_rcu(&p->thread_group);
77 : list_del_rcu(&p->thread_node);
78 2914 : }
79 :
80 : /*
81 : * This function expects the tasklist_lock write-locked.
82 : */
83 2914 : static void __exit_signal(struct task_struct *tsk)
84 : {
85 2914 : struct signal_struct *sig = tsk->signal;
86 : bool group_dead = thread_group_leader(tsk);
87 : struct sighand_struct *sighand;
88 : struct tty_struct *uninitialized_var(tty);
89 : cputime_t utime, stime;
90 :
91 2914 : sighand = rcu_dereference_check(tsk->sighand,
92 : lockdep_tasklist_lock_is_held());
93 : spin_lock(&sighand->siglock);
94 :
95 2914 : posix_cpu_timers_exit(tsk);
96 2914 : if (group_dead) {
97 2914 : posix_cpu_timers_exit_group(tsk);
98 2914 : tty = sig->tty;
99 2914 : sig->tty = NULL;
100 : } else {
101 : /*
102 : * This can only happen if the caller is de_thread().
103 : * FIXME: this is the temporary hack, we should teach
104 : * posix-cpu-timers to handle this case correctly.
105 : */
106 0 : if (unlikely(has_group_leader_pid(tsk)))
107 0 : posix_cpu_timers_exit_group(tsk);
108 :
109 : /*
110 : * If there is any task waiting for the group exit
111 : * then notify it:
112 : */
113 0 : if (sig->notify_count > 0 && !--sig->notify_count)
114 0 : wake_up_process(sig->group_exit_task);
115 :
116 0 : if (tsk == sig->curr_target)
117 0 : sig->curr_target = next_thread(tsk);
118 : }
119 :
120 : /*
121 : * Accumulate here the counters for all threads as they die. We could
122 : * skip the group leader because it is the last user of signal_struct,
123 : * but we want to avoid the race with thread_group_cputime() which can
124 : * see the empty ->thread_head list.
125 : */
126 : task_cputime(tsk, &utime, &stime);
127 : write_seqlock(&sig->stats_lock);
128 2914 : sig->utime += utime;
129 2914 : sig->stime += stime;
130 5828 : sig->gtime += task_gtime(tsk);
131 2914 : sig->min_flt += tsk->min_flt;
132 2914 : sig->maj_flt += tsk->maj_flt;
133 2914 : sig->nvcsw += tsk->nvcsw;
134 2914 : sig->nivcsw += tsk->nivcsw;
135 5828 : sig->inblock += task_io_get_inblock(tsk);
136 5828 : sig->oublock += task_io_get_oublock(tsk);
137 : task_io_accounting_add(&sig->ioac, &tsk->ioac);
138 2914 : sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
139 2914 : sig->nr_threads--;
140 2914 : __unhash_process(tsk, group_dead);
141 : write_sequnlock(&sig->stats_lock);
142 :
143 : /*
144 : * Do this under ->siglock, we can race with another thread
145 : * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
146 : */
147 2914 : flush_sigqueue(&tsk->pending);
148 2914 : tsk->sighand = NULL;
149 : spin_unlock(&sighand->siglock);
150 :
151 2914 : __cleanup_sighand(sighand);
152 : clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
153 2914 : if (group_dead) {
154 2914 : flush_sigqueue(&sig->shared_pending);
155 2914 : tty_kref_put(tty);
156 : }
157 2914 : }
158 :
159 2914 : static void delayed_put_task_struct(struct rcu_head *rhp)
160 : {
161 2914 : struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
162 :
163 2914 : perf_event_delayed_put(tsk);
164 : trace_sched_process_free(tsk);
165 : put_task_struct(tsk);
166 2914 : }
167 :
168 :
169 2914 : void release_task(struct task_struct *p)
170 : {
171 : struct task_struct *leader;
172 : int zap_leader;
173 : repeat:
174 : /* don't need to get the RCU readlock here - the process is dead and
175 : * can't be modifying its own credentials. But shut RCU-lockdep up */
176 : rcu_read_lock();
177 2914 : atomic_dec(&__task_cred(p)->user->processes);
178 : rcu_read_unlock();
179 :
180 2914 : proc_flush_task(p);
181 :
182 2914 : write_lock_irq(&tasklist_lock);
183 : ptrace_release_task(p);
184 2914 : __exit_signal(p);
185 :
186 : /*
187 : * If we are the last non-leader member of the thread
188 : * group, and the leader is zombie, then notify the
189 : * group leader's parent process. (if it wants notification.)
190 : */
191 : zap_leader = 0;
192 2914 : leader = p->group_leader;
193 2914 : if (leader != p && thread_group_empty(leader)
194 0 : && leader->exit_state == EXIT_ZOMBIE) {
195 : /*
196 : * If we were the last child thread and the leader has
197 : * exited already, and the leader's parent ignores SIGCHLD,
198 : * then we are the one who should release the leader.
199 : */
200 0 : zap_leader = do_notify_parent(leader, leader->exit_signal);
201 0 : if (zap_leader)
202 0 : leader->exit_state = EXIT_DEAD;
203 : }
204 :
205 5828 : write_unlock_irq(&tasklist_lock);
206 2914 : release_thread(p);
207 2914 : call_rcu(&p->rcu, delayed_put_task_struct);
208 :
209 : p = leader;
210 2914 : if (unlikely(zap_leader))
211 : goto repeat;
212 2914 : }
213 :
214 : /*
215 : * Determine if a process group is "orphaned", according to the POSIX
216 : * definition in 2.2.2.52. Orphaned process groups are not to be affected
217 : * by terminal-generated stop signals. Newly orphaned process groups are
218 : * to receive a SIGHUP and a SIGCONT.
219 : *
220 : * "I ask you, have you ever known what it is to be an orphan?"
221 : */
222 138 : static int will_become_orphaned_pgrp(struct pid *pgrp,
223 : struct task_struct *ignored_task)
224 : {
225 : struct task_struct *p;
226 :
227 279 : do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
228 147 : if ((p == ignored_task) ||
229 10 : (p->exit_state && thread_group_empty(p)) ||
230 3 : is_global_init(p->real_parent))
231 141 : continue;
232 :
233 2 : if (task_pgrp(p->real_parent) != pgrp &&
234 : task_session(p->real_parent) == task_session(p))
235 : return 0;
236 : } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
237 :
238 : return 1;
239 : }
240 :
241 0 : int is_current_pgrp_orphaned(void)
242 : {
243 : int retval;
244 :
245 0 : read_lock(&tasklist_lock);
246 0 : retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
247 0 : read_unlock(&tasklist_lock);
248 :
249 0 : return retval;
250 : }
251 :
252 137 : static bool has_stopped_jobs(struct pid *pgrp)
253 : {
254 : struct task_struct *p;
255 :
256 278 : do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
257 141 : if (p->signal->flags & SIGNAL_STOP_STOPPED)
258 : return true;
259 : } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
260 :
261 : return false;
262 : }
263 :
264 : /*
265 : * Check to see if any process groups have become orphaned as
266 : * a result of our exiting, and if they have any stopped jobs,
267 : * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
268 : */
269 : static void
270 2952 : kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
271 : {
272 : struct pid *pgrp = task_pgrp(tsk);
273 : struct task_struct *ignored_task = tsk;
274 :
275 2952 : if (!parent)
276 : /* exit: our father is in a different pgrp than
277 : * we are and we were the only connection outside.
278 : */
279 2914 : parent = tsk->real_parent;
280 : else
281 : /* reparent: our child is in a different pgrp than
282 : * we are, and it was the only connection outside.
283 : */
284 : ignored_task = NULL;
285 :
286 3130 : if (task_pgrp(parent) != pgrp &&
287 138 : task_session(parent) == task_session(tsk) &&
288 275 : will_become_orphaned_pgrp(pgrp, ignored_task) &&
289 137 : has_stopped_jobs(pgrp)) {
290 0 : __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
291 0 : __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
292 : }
293 2952 : }
294 :
295 : #ifdef CONFIG_MEMCG
296 : /*
297 : * A task is exiting. If it owned this mm, find a new owner for the mm.
298 : */
299 5007 : void mm_update_next_owner(struct mm_struct *mm)
300 : {
301 5007 : struct task_struct *c, *g, *p = current;
302 :
303 : retry:
304 : /*
305 : * If the exiting or execing task is not the owner, it's
306 : * someone else's problem.
307 : */
308 5007 : if (mm->owner != p)
309 : return;
310 : /*
311 : * The current owner is exiting/execing and there are no other
312 : * candidates. Do not leave the mm pointing to a possibly
313 : * freed task structure.
314 : */
315 5007 : if (atomic_read(&mm->mm_users) <= 1) {
316 5007 : mm->owner = NULL;
317 5007 : return;
318 : }
319 :
320 0 : read_lock(&tasklist_lock);
321 : /*
322 : * Search in the children
323 : */
324 0 : list_for_each_entry(c, &p->children, sibling) {
325 0 : if (c->mm == mm)
326 : goto assign_new_owner;
327 : }
328 :
329 : /*
330 : * Search in the siblings
331 : */
332 0 : list_for_each_entry(c, &p->real_parent->children, sibling) {
333 0 : if (c->mm == mm)
334 : goto assign_new_owner;
335 : }
336 :
337 : /*
338 : * Search through everything else, we should not get here often.
339 : */
340 0 : for_each_process(g) {
341 0 : if (g->flags & PF_KTHREAD)
342 0 : continue;
343 0 : for_each_thread(g, c) {
344 0 : if (c->mm == mm)
345 : goto assign_new_owner;
346 0 : if (c->mm)
347 : break;
348 : }
349 : }
350 0 : read_unlock(&tasklist_lock);
351 : /*
352 : * We found no owner yet mm_users > 1: this implies that we are
353 : * most likely racing with swapoff (try_to_unuse()) or /proc or
354 : * ptrace or page migration (get_task_mm()). Mark owner as NULL.
355 : */
356 0 : mm->owner = NULL;
357 0 : return;
358 :
359 : assign_new_owner:
360 : BUG_ON(c == p);
361 0 : get_task_struct(c);
362 : /*
363 : * The task_lock protects c->mm from changing.
364 : * We always want mm->owner->mm == mm
365 : */
366 : task_lock(c);
367 : /*
368 : * Delay read_unlock() till we have the task_lock()
369 : * to ensure that c does not slip away underneath us
370 : */
371 0 : read_unlock(&tasklist_lock);
372 0 : if (c->mm != mm) {
373 : task_unlock(c);
374 : put_task_struct(c);
375 : goto retry;
376 : }
377 0 : mm->owner = c;
378 : task_unlock(c);
379 : put_task_struct(c);
380 : }
381 : #endif /* CONFIG_MEMCG */
382 :
383 : /*
384 : * Turn us into a lazy TLB process if we
385 : * aren't already..
386 : */
387 2914 : static void exit_mm(struct task_struct *tsk)
388 : {
389 2914 : struct mm_struct *mm = tsk->mm;
390 : struct core_state *core_state;
391 :
392 2914 : mm_release(tsk, mm);
393 2914 : if (!mm)
394 2914 : return;
395 : sync_mm_rss(mm);
396 : /*
397 : * Serialize with any possible pending coredump.
398 : * We must hold mmap_sem around checking core_state
399 : * and clearing tsk->mm. The core-inducing thread
400 : * will increment ->nr_threads for each thread in the
401 : * group with ->mm != NULL.
402 : */
403 2881 : down_read(&mm->mmap_sem);
404 2881 : core_state = mm->core_state;
405 2881 : if (core_state) {
406 : struct core_thread self;
407 :
408 0 : up_read(&mm->mmap_sem);
409 :
410 0 : self.task = tsk;
411 0 : self.next = xchg(&core_state->dumper.next, &self);
412 : /*
413 : * Implies mb(), the result of xchg() must be visible
414 : * to core_state->dumper.
415 : */
416 0 : if (atomic_dec_and_test(&core_state->nr_threads))
417 0 : complete(&core_state->startup);
418 :
419 : for (;;) {
420 0 : set_task_state(tsk, TASK_UNINTERRUPTIBLE);
421 0 : if (!self.task) /* see coredump_finish() */
422 : break;
423 : freezable_schedule();
424 : }
425 0 : __set_task_state(tsk, TASK_RUNNING);
426 0 : down_read(&mm->mmap_sem);
427 : }
428 2881 : atomic_inc(&mm->mm_count);
429 : BUG_ON(mm != tsk->active_mm);
430 : /* more a memory barrier than a real lock */
431 : task_lock(tsk);
432 2881 : tsk->mm = NULL;
433 2881 : up_read(&mm->mmap_sem);
434 : enter_lazy_tlb(mm, current);
435 : task_unlock(tsk);
436 2881 : mm_update_next_owner(mm);
437 2881 : mmput(mm);
438 : clear_thread_flag(TIF_MEMDIE);
439 : }
440 :
441 : static struct task_struct *find_alive_thread(struct task_struct *p)
442 : {
443 : struct task_struct *t;
444 :
445 74 : for_each_thread(p, t) {
446 37 : if (!(t->flags & PF_EXITING))
447 : return t;
448 : }
449 : return NULL;
450 : }
451 :
452 2914 : static struct task_struct *find_child_reaper(struct task_struct *father)
453 : __releases(&tasklist_lock)
454 : __acquires(&tasklist_lock)
455 : {
456 2914 : struct pid_namespace *pid_ns = task_active_pid_ns(father);
457 2914 : struct task_struct *reaper = pid_ns->child_reaper;
458 :
459 2914 : if (likely(reaper != father))
460 : return reaper;
461 :
462 : reaper = find_alive_thread(father);
463 0 : if (reaper) {
464 0 : pid_ns->child_reaper = reaper;
465 0 : return reaper;
466 : }
467 :
468 0 : write_unlock_irq(&tasklist_lock);
469 0 : if (unlikely(pid_ns == &init_pid_ns)) {
470 0 : panic("Attempted to kill init! exitcode=0x%08x\n",
471 0 : father->signal->group_exit_code ?: father->exit_code);
472 : }
473 0 : zap_pid_ns_processes(pid_ns);
474 0 : write_lock_irq(&tasklist_lock);
475 :
476 0 : return father;
477 : }
478 :
479 : /*
480 : * When we die, we re-parent all our children, and try to:
481 : * 1. give them to another thread in our thread group, if such a member exists
482 : * 2. give it to the first ancestor process which prctl'd itself as a
483 : * child_subreaper for its children (like a service manager)
484 : * 3. give it to the init process (PID 1) in our pid namespace
485 : */
486 37 : static struct task_struct *find_new_reaper(struct task_struct *father,
487 : struct task_struct *child_reaper)
488 : {
489 : struct task_struct *thread, *reaper;
490 :
491 : thread = find_alive_thread(father);
492 37 : if (thread)
493 : return thread;
494 :
495 37 : if (father->signal->has_child_subreaper) {
496 : /*
497 : * Find the first ->is_child_subreaper ancestor in our pid_ns.
498 : * We start from father to ensure we can not look into another
499 : * namespace, this is safe because all its threads are dead.
500 : */
501 0 : for (reaper = father;
502 : !same_thread_group(reaper, child_reaper);
503 0 : reaper = reaper->real_parent) {
504 : /* call_usermodehelper() descendants need this check */
505 0 : if (reaper == &init_task)
506 : break;
507 0 : if (!reaper->signal->is_child_subreaper)
508 0 : continue;
509 : thread = find_alive_thread(reaper);
510 0 : if (thread)
511 : return thread;
512 : }
513 : }
514 :
515 37 : return child_reaper;
516 : }
517 :
518 : /*
519 : * Any that need to be release_task'd are put on the @dead list.
520 : */
521 38 : static void reparent_leader(struct task_struct *father, struct task_struct *p,
522 : struct list_head *dead)
523 : {
524 38 : if (unlikely(p->exit_state == EXIT_DEAD))
525 38 : return;
526 :
527 : /* We don't want people slaying init. */
528 38 : p->exit_signal = SIGCHLD;
529 :
530 : /* If it has exited notify the new parent about this child's death. */
531 38 : if (!p->ptrace &&
532 11 : p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
533 11 : if (do_notify_parent(p, p->exit_signal)) {
534 0 : p->exit_state = EXIT_DEAD;
535 0 : list_add(&p->ptrace_entry, dead);
536 : }
537 : }
538 :
539 38 : kill_orphaned_pgrp(p, father);
540 : }
541 :
542 : /*
543 : * This does two things:
544 : *
545 : * A. Make init inherit all the child processes
546 : * B. Check to see if any process groups have become orphaned
547 : * as a result of our exiting, and if they have any stopped
548 : * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
549 : */
550 2914 : static void forget_original_parent(struct task_struct *father,
551 : struct list_head *dead)
552 : {
553 : struct task_struct *p, *t, *reaper;
554 :
555 5828 : if (unlikely(!list_empty(&father->ptraced)))
556 0 : exit_ptrace(father, dead);
557 :
558 : /* Can drop and reacquire tasklist_lock */
559 2914 : reaper = find_child_reaper(father);
560 5828 : if (list_empty(&father->children))
561 2914 : return;
562 :
563 37 : reaper = find_new_reaper(father, reaper);
564 75 : list_for_each_entry(p, &father->children, sibling) {
565 77 : for_each_thread(p, t) {
566 39 : t->real_parent = reaper;
567 : BUG_ON((!t->ptrace) != (t->parent == father));
568 39 : if (likely(!t->ptrace))
569 39 : t->parent = t->real_parent;
570 39 : if (t->pdeath_signal)
571 0 : group_send_sig_info(t->pdeath_signal,
572 : SEND_SIG_NOINFO, t);
573 : }
574 : /*
575 : * If this is a threaded reparent there is no need to
576 : * notify anyone anything has happened.
577 : */
578 38 : if (!same_thread_group(reaper, father))
579 38 : reparent_leader(father, p, dead);
580 : }
581 37 : list_splice_tail_init(&father->children, &reaper->children);
582 : }
583 :
584 : /*
585 : * Send signals to all our closest relatives so that they know
586 : * to properly mourn us..
587 : */
588 2914 : static void exit_notify(struct task_struct *tsk, int group_dead)
589 : {
590 : bool autoreap;
591 : struct task_struct *p, *n;
592 2914 : LIST_HEAD(dead);
593 :
594 2914 : write_lock_irq(&tasklist_lock);
595 2914 : forget_original_parent(tsk, &dead);
596 :
597 2914 : if (group_dead)
598 2914 : kill_orphaned_pgrp(tsk->group_leader, NULL);
599 :
600 2914 : if (unlikely(tsk->ptrace)) {
601 0 : int sig = thread_group_leader(tsk) &&
602 0 : thread_group_empty(tsk) &&
603 : !ptrace_reparented(tsk) ?
604 0 : tsk->exit_signal : SIGCHLD;
605 0 : autoreap = do_notify_parent(tsk, sig);
606 2914 : } else if (thread_group_leader(tsk)) {
607 5828 : autoreap = thread_group_empty(tsk) &&
608 2914 : do_notify_parent(tsk, tsk->exit_signal);
609 : } else {
610 : autoreap = true;
611 : }
612 :
613 2914 : tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
614 2914 : if (tsk->exit_state == EXIT_DEAD)
615 33 : list_add(&tsk->ptrace_entry, &dead);
616 :
617 : /* mt-exec, de_thread() is waiting for group leader */
618 2914 : if (unlikely(tsk->signal->notify_count < 0))
619 0 : wake_up_process(tsk->signal->group_exit_task);
620 5828 : write_unlock_irq(&tasklist_lock);
621 :
622 2947 : list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
623 : list_del_init(&p->ptrace_entry);
624 33 : release_task(p);
625 : }
626 2914 : }
627 :
628 : #ifdef CONFIG_DEBUG_STACK_USAGE
629 : static void check_stack_usage(void)
630 : {
631 : static DEFINE_SPINLOCK(low_water_lock);
632 : static int lowest_to_date = THREAD_SIZE;
633 : unsigned long free;
634 :
635 : free = stack_not_used(current);
636 :
637 : if (free >= lowest_to_date)
638 : return;
639 :
640 : spin_lock(&low_water_lock);
641 : if (free < lowest_to_date) {
642 : pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
643 : current->comm, task_pid_nr(current), free);
644 : lowest_to_date = free;
645 : }
646 : spin_unlock(&low_water_lock);
647 : }
648 : #else
649 : static inline void check_stack_usage(void) {}
650 : #endif
651 :
652 2914 : void do_exit(long code)
653 : {
654 2914 : struct task_struct *tsk = current;
655 : int group_dead;
656 : TASKS_RCU(int tasks_rcu_i);
657 :
658 : profile_task_exit(tsk);
659 :
660 : WARN_ON(blk_needs_flush_plug(tsk));
661 :
662 2914 : if (unlikely(in_interrupt()))
663 0 : panic("Aiee, killing interrupt handler!");
664 2914 : if (unlikely(!tsk->pid))
665 0 : panic("Attempted to kill the idle task!");
666 :
667 : /*
668 : * If do_exit is called because this processes oopsed, it's possible
669 : * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
670 : * continuing. Amongst other possible reasons, this is to prevent
671 : * mm_release()->clear_child_tid() from writing to a user-controlled
672 : * kernel address.
673 : */
674 : set_fs(USER_DS);
675 :
676 2914 : ptrace_event(PTRACE_EVENT_EXIT, code);
677 :
678 : validate_creds_for_do_exit(tsk);
679 :
680 : /*
681 : * We're taking recursive faults here in do_exit. Safest is to just
682 : * leave this task alone and wait for reboot.
683 : */
684 2914 : if (unlikely(tsk->flags & PF_EXITING)) {
685 0 : pr_alert("Fixing recursive fault but reboot is needed!\n");
686 : /*
687 : * We can do this unlocked here. The futex code uses
688 : * this flag just to verify whether the pi state
689 : * cleanup has been done or not. In the worst case it
690 : * loops once more. We pretend that the cleanup was
691 : * done as there is no way to return. Either the
692 : * OWNER_DIED bit is set by now or we push the blocked
693 : * task into the wait for ever nirwana as well.
694 : */
695 0 : tsk->flags |= PF_EXITPIDONE;
696 0 : set_current_state(TASK_UNINTERRUPTIBLE);
697 0 : schedule();
698 : }
699 :
700 2914 : exit_signals(tsk); /* sets PF_EXITING */
701 : /*
702 : * tsk->flags are checked in the futex code to protect against
703 : * an exiting task cleaning up the robust pi futexes.
704 : */
705 2914 : smp_mb();
706 2914 : raw_spin_unlock_wait(&tsk->pi_lock);
707 :
708 2914 : if (unlikely(in_atomic()))
709 0 : pr_info("note: %s[%d] exited with preempt_count %d\n",
710 : current->comm, task_pid_nr(current),
711 : preempt_count());
712 :
713 2914 : acct_update_integrals(tsk);
714 : /* sync mm's RSS info before statistics gathering */
715 : if (tsk->mm)
716 : sync_mm_rss(tsk->mm);
717 5828 : group_dead = atomic_dec_and_test(&tsk->signal->live);
718 2914 : if (group_dead) {
719 2914 : hrtimer_cancel(&tsk->signal->real_timer);
720 2914 : exit_itimers(tsk->signal);
721 2914 : if (tsk->mm)
722 2881 : setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
723 : }
724 2914 : acct_collect(code, group_dead);
725 2914 : if (group_dead)
726 2914 : tty_audit_exit();
727 : audit_free(tsk);
728 :
729 2914 : tsk->exit_code = code;
730 2914 : taskstats_exit(tsk, group_dead);
731 :
732 2914 : exit_mm(tsk);
733 :
734 2914 : if (group_dead)
735 2914 : acct_process();
736 : trace_sched_process_exit(tsk);
737 :
738 2914 : exit_sem(tsk);
739 2914 : exit_shm(tsk);
740 2914 : exit_files(tsk);
741 2914 : exit_fs(tsk);
742 2914 : if (group_dead)
743 2914 : disassociate_ctty(1);
744 2914 : exit_task_namespaces(tsk);
745 : exit_task_work(tsk);
746 2914 : exit_thread();
747 :
748 : /*
749 : * Flush inherited counters to the parent - before the parent
750 : * gets woken up by child-exit notifications.
751 : *
752 : * because of cgroup mode, must be called before cgroup_exit()
753 : */
754 2914 : perf_event_exit_task(tsk);
755 :
756 2914 : cgroup_exit(tsk);
757 :
758 2914 : module_put(task_thread_info(tsk)->exec_domain->module);
759 :
760 : /*
761 : * FIXME: do that only when needed, using sched_exit tracepoint
762 : */
763 2914 : flush_ptrace_hw_breakpoint(tsk);
764 :
765 : TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
766 2914 : exit_notify(tsk, group_dead);
767 : proc_exit_connector(tsk);
768 : #ifdef CONFIG_NUMA
769 : task_lock(tsk);
770 : mpol_put(tsk->mempolicy);
771 : tsk->mempolicy = NULL;
772 : task_unlock(tsk);
773 : #endif
774 : #ifdef CONFIG_FUTEX
775 2914 : if (unlikely(current->pi_state_cache))
776 0 : kfree(current->pi_state_cache);
777 : #endif
778 : /*
779 : * Make sure we are holding no locks:
780 : */
781 : debug_check_no_locks_held();
782 : /*
783 : * We can do this unlocked here. The futex code uses this flag
784 : * just to verify whether the pi state cleanup has been done
785 : * or not. In the worst case it loops once more.
786 : */
787 2914 : tsk->flags |= PF_EXITPIDONE;
788 :
789 2914 : if (tsk->io_context)
790 212 : exit_io_context(tsk);
791 :
792 2914 : if (tsk->splice_pipe)
793 0 : free_pipe_info(tsk->splice_pipe);
794 :
795 2914 : if (tsk->task_frag.page)
796 0 : put_page(tsk->task_frag.page);
797 :
798 : validate_creds_for_do_exit(tsk);
799 :
800 : check_stack_usage();
801 2914 : preempt_disable();
802 2914 : if (tsk->nr_dirtied)
803 15 : __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
804 2914 : exit_rcu();
805 : TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
806 :
807 : /*
808 : * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
809 : * when the following two conditions become true.
810 : * - There is race condition of mmap_sem (It is acquired by
811 : * exit_mm()), and
812 : * - SMI occurs before setting TASK_RUNINNG.
813 : * (or hypervisor of virtual machine switches to other guest)
814 : * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
815 : *
816 : * To avoid it, we have to wait for releasing tsk->pi_lock which
817 : * is held by try_to_wake_up()
818 : */
819 2914 : smp_mb();
820 2914 : raw_spin_unlock_wait(&tsk->pi_lock);
821 :
822 : /* causes final put_task_struct in finish_task_switch(). */
823 2914 : tsk->state = TASK_DEAD;
824 2914 : tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
825 2914 : schedule();
826 : BUG();
827 : /* Avoid "noreturn function does return". */
828 : for (;;)
829 : cpu_relax(); /* For when BUG is null */
830 : }
831 : EXPORT_SYMBOL_GPL(do_exit);
832 :
833 0 : void complete_and_exit(struct completion *comp, long code)
834 : {
835 0 : if (comp)
836 0 : complete(comp);
837 :
838 0 : do_exit(code);
839 : }
840 : EXPORT_SYMBOL(complete_and_exit);
841 :
842 0 : SYSCALL_DEFINE1(exit, int, error_code)
843 : {
844 0 : do_exit((error_code&0xff)<<8);
845 : }
846 :
847 : /*
848 : * Take down every thread in the group. This is called by fatal signals
849 : * as well as by sys_exit_group (below).
850 : */
851 : void
852 2881 : do_group_exit(int exit_code)
853 : {
854 2881 : struct signal_struct *sig = current->signal;
855 :
856 : BUG_ON(exit_code & 0x80); /* core dumps don't get here */
857 :
858 2881 : if (signal_group_exit(sig))
859 10 : exit_code = sig->group_exit_code;
860 2871 : else if (!thread_group_empty(current)) {
861 : struct sighand_struct *const sighand = current->sighand;
862 :
863 : spin_lock_irq(&sighand->siglock);
864 0 : if (signal_group_exit(sig))
865 : /* Another thread got here before we took the lock. */
866 0 : exit_code = sig->group_exit_code;
867 : else {
868 0 : sig->group_exit_code = exit_code;
869 0 : sig->flags = SIGNAL_GROUP_EXIT;
870 0 : zap_other_threads(current);
871 : }
872 : spin_unlock_irq(&sighand->siglock);
873 : }
874 :
875 2881 : do_exit(exit_code);
876 : /* NOTREACHED */
877 : }
878 :
879 : /*
880 : * this kills every thread in the thread group. Note that any externally
881 : * wait4()-ing process will get the correct exit code - even if this
882 : * thread is not the thread group leader.
883 : */
884 2871 : SYSCALL_DEFINE1(exit_group, int, error_code)
885 : {
886 2871 : do_group_exit((error_code & 0xff) << 8);
887 : /* NOTREACHED */
888 : return 0;
889 : }
890 :
891 : struct wait_opts {
892 : enum pid_type wo_type;
893 : int wo_flags;
894 : struct pid *wo_pid;
895 :
896 : struct siginfo __user *wo_info;
897 : int __user *wo_stat;
898 : struct rusage __user *wo_rusage;
899 :
900 : wait_queue_t child_wait;
901 : int notask_error;
902 : };
903 :
904 : static inline
905 : struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
906 : {
907 1919 : if (type != PIDTYPE_PID)
908 0 : task = task->group_leader;
909 1919 : return task->pids[type].pid;
910 : }
911 :
912 : static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
913 : {
914 13124 : return wo->wo_type == PIDTYPE_MAX ||
915 1919 : task_pid_type(p, wo->wo_type) == wo->wo_pid;
916 : }
917 :
918 8577 : static int eligible_child(struct wait_opts *wo, struct task_struct *p)
919 : {
920 8577 : if (!eligible_pid(wo, p))
921 : return 0;
922 : /* Wait for all children (clone and not) if __WALL is set;
923 : * otherwise, wait for clone children *only* if __WCLONE is
924 : * set; otherwise, wait for non-clone children *only*. (Note:
925 : * A "clone" child here is one that reports to its parent
926 : * using a signal other than SIGCHLD.) */
927 8577 : if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
928 0 : && !(wo->wo_flags & __WALL))
929 : return 0;
930 :
931 8577 : return 1;
932 : }
933 :
934 0 : static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
935 : pid_t pid, uid_t uid, int why, int status)
936 : {
937 : struct siginfo __user *infop;
938 0 : int retval = wo->wo_rusage
939 0 : ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
940 :
941 : put_task_struct(p);
942 0 : infop = wo->wo_info;
943 0 : if (infop) {
944 0 : if (!retval)
945 0 : retval = put_user(SIGCHLD, &infop->si_signo);
946 0 : if (!retval)
947 0 : retval = put_user(0, &infop->si_errno);
948 0 : if (!retval)
949 0 : retval = put_user((short)why, &infop->si_code);
950 0 : if (!retval)
951 0 : retval = put_user(pid, &infop->si_pid);
952 0 : if (!retval)
953 0 : retval = put_user(uid, &infop->si_uid);
954 0 : if (!retval)
955 0 : retval = put_user(status, &infop->si_status);
956 : }
957 0 : if (!retval)
958 : retval = pid;
959 0 : return retval;
960 : }
961 :
962 : /*
963 : * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
964 : * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
965 : * the lock and this task is uninteresting. If we return nonzero, we have
966 : * released the lock and the system call should return.
967 : */
968 2881 : static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
969 : {
970 : int state, retval, status;
971 : pid_t pid = task_pid_vnr(p);
972 2881 : uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
973 : struct siginfo __user *infop;
974 :
975 2881 : if (!likely(wo->wo_flags & WEXITED))
976 : return 0;
977 :
978 2881 : if (unlikely(wo->wo_flags & WNOWAIT)) {
979 0 : int exit_code = p->exit_code;
980 : int why;
981 :
982 0 : get_task_struct(p);
983 0 : read_unlock(&tasklist_lock);
984 : sched_annotate_sleep();
985 :
986 0 : if ((exit_code & 0x7f) == 0) {
987 : why = CLD_EXITED;
988 0 : status = exit_code >> 8;
989 : } else {
990 0 : why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
991 : status = exit_code & 0x7f;
992 : }
993 0 : return wait_noreap_copyout(wo, p, pid, uid, why, status);
994 : }
995 : /*
996 : * Move the task's state to DEAD/TRACE, only one thread can do this.
997 : */
998 0 : state = (ptrace_reparented(p) && thread_group_leader(p)) ?
999 2881 : EXIT_TRACE : EXIT_DEAD;
1000 5762 : if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
1001 : return 0;
1002 : /*
1003 : * We own this thread, nobody else can reap it.
1004 : */
1005 5762 : read_unlock(&tasklist_lock);
1006 : sched_annotate_sleep();
1007 :
1008 : /*
1009 : * Check thread_group_leader() to exclude the traced sub-threads.
1010 : */
1011 5762 : if (state == EXIT_DEAD && thread_group_leader(p)) {
1012 2881 : struct signal_struct *sig = p->signal;
1013 2881 : struct signal_struct *psig = current->signal;
1014 : unsigned long maxrss;
1015 : cputime_t tgutime, tgstime;
1016 :
1017 : /*
1018 : * The resource counters for the group leader are in its
1019 : * own task_struct. Those for dead threads in the group
1020 : * are in its signal_struct, as are those for the child
1021 : * processes it has previously reaped. All these
1022 : * accumulate in the parent's signal_struct c* fields.
1023 : *
1024 : * We don't bother to take a lock here to protect these
1025 : * p->signal fields because the whole thread group is dead
1026 : * and nobody can change them.
1027 : *
1028 : * psig->stats_lock also protects us from our sub-theads
1029 : * which can reap other children at the same time. Until
1030 : * we change k_getrusage()-like users to rely on this lock
1031 : * we have to take ->siglock as well.
1032 : *
1033 : * We use thread_group_cputime_adjusted() to get times for
1034 : * the thread group, which consolidates times for all threads
1035 : * in the group including the group leader.
1036 : */
1037 2881 : thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1038 : spin_lock_irq(¤t->sighand->siglock);
1039 : write_seqlock(&psig->stats_lock);
1040 2881 : psig->cutime += tgutime + sig->cutime;
1041 2881 : psig->cstime += tgstime + sig->cstime;
1042 5762 : psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1043 5762 : psig->cmin_flt +=
1044 2881 : p->min_flt + sig->min_flt + sig->cmin_flt;
1045 5762 : psig->cmaj_flt +=
1046 2881 : p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1047 5762 : psig->cnvcsw +=
1048 2881 : p->nvcsw + sig->nvcsw + sig->cnvcsw;
1049 5762 : psig->cnivcsw +=
1050 2881 : p->nivcsw + sig->nivcsw + sig->cnivcsw;
1051 5762 : psig->cinblock +=
1052 2881 : task_io_get_inblock(p) +
1053 5762 : sig->inblock + sig->cinblock;
1054 5762 : psig->coublock +=
1055 2881 : task_io_get_oublock(p) +
1056 5762 : sig->oublock + sig->coublock;
1057 2881 : maxrss = max(sig->maxrss, sig->cmaxrss);
1058 2881 : if (psig->cmaxrss < maxrss)
1059 1110 : psig->cmaxrss = maxrss;
1060 : task_io_accounting_add(&psig->ioac, &p->ioac);
1061 : task_io_accounting_add(&psig->ioac, &sig->ioac);
1062 : write_sequnlock(&psig->stats_lock);
1063 : spin_unlock_irq(¤t->sighand->siglock);
1064 : }
1065 :
1066 2881 : retval = wo->wo_rusage
1067 2881 : ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1068 2881 : status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1069 2881 : ? p->signal->group_exit_code : p->exit_code;
1070 2881 : if (!retval && wo->wo_stat)
1071 2879 : retval = put_user(status, wo->wo_stat);
1072 :
1073 2881 : infop = wo->wo_info;
1074 2881 : if (!retval && infop)
1075 0 : retval = put_user(SIGCHLD, &infop->si_signo);
1076 2881 : if (!retval && infop)
1077 0 : retval = put_user(0, &infop->si_errno);
1078 2881 : if (!retval && infop) {
1079 : int why;
1080 :
1081 0 : if ((status & 0x7f) == 0) {
1082 : why = CLD_EXITED;
1083 0 : status >>= 8;
1084 : } else {
1085 0 : why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1086 : status &= 0x7f;
1087 : }
1088 0 : retval = put_user((short)why, &infop->si_code);
1089 0 : if (!retval)
1090 0 : retval = put_user(status, &infop->si_status);
1091 : }
1092 2881 : if (!retval && infop)
1093 0 : retval = put_user(pid, &infop->si_pid);
1094 2881 : if (!retval && infop)
1095 0 : retval = put_user(uid, &infop->si_uid);
1096 2881 : if (!retval)
1097 : retval = pid;
1098 :
1099 2881 : if (state == EXIT_TRACE) {
1100 0 : write_lock_irq(&tasklist_lock);
1101 : /* We dropped tasklist, ptracer could die and untrace */
1102 : ptrace_unlink(p);
1103 :
1104 : /* If parent wants a zombie, don't release it now */
1105 : state = EXIT_ZOMBIE;
1106 0 : if (do_notify_parent(p, p->exit_signal))
1107 : state = EXIT_DEAD;
1108 0 : p->exit_state = state;
1109 0 : write_unlock_irq(&tasklist_lock);
1110 : }
1111 2881 : if (state == EXIT_DEAD)
1112 2881 : release_task(p);
1113 :
1114 2881 : return retval;
1115 : }
1116 :
1117 : static int *task_stopped_code(struct task_struct *p, bool ptrace)
1118 : {
1119 446 : if (ptrace) {
1120 0 : if (task_is_stopped_or_traced(p) &&
1121 0 : !(p->jobctl & JOBCTL_LISTENING))
1122 0 : return &p->exit_code;
1123 : } else {
1124 446 : if (p->signal->flags & SIGNAL_STOP_STOPPED)
1125 0 : return &p->signal->group_exit_code;
1126 : }
1127 : return NULL;
1128 : }
1129 :
1130 : /**
1131 : * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
1132 : * @wo: wait options
1133 : * @ptrace: is the wait for ptrace
1134 : * @p: task to wait for
1135 : *
1136 : * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
1137 : *
1138 : * CONTEXT:
1139 : * read_lock(&tasklist_lock), which is released if return value is
1140 : * non-zero. Also, grabs and releases @p->sighand->siglock.
1141 : *
1142 : * RETURNS:
1143 : * 0 if wait condition didn't exist and search for other wait conditions
1144 : * should continue. Non-zero return, -errno on failure and @p's pid on
1145 : * success, implies that tasklist_lock is released and wait condition
1146 : * search should terminate.
1147 : */
1148 5696 : static int wait_task_stopped(struct wait_opts *wo,
1149 : int ptrace, struct task_struct *p)
1150 : {
1151 : struct siginfo __user *infop;
1152 : int retval, exit_code, *p_code, why;
1153 : uid_t uid = 0; /* unneeded, required by compiler */
1154 : pid_t pid;
1155 :
1156 : /*
1157 : * Traditionally we see ptrace'd stopped tasks regardless of options.
1158 : */
1159 5696 : if (!ptrace && !(wo->wo_flags & WUNTRACED))
1160 : return 0;
1161 :
1162 446 : if (!task_stopped_code(p, ptrace))
1163 : return 0;
1164 :
1165 : exit_code = 0;
1166 : spin_lock_irq(&p->sighand->siglock);
1167 :
1168 : p_code = task_stopped_code(p, ptrace);
1169 0 : if (unlikely(!p_code))
1170 : goto unlock_sig;
1171 :
1172 0 : exit_code = *p_code;
1173 0 : if (!exit_code)
1174 : goto unlock_sig;
1175 :
1176 0 : if (!unlikely(wo->wo_flags & WNOWAIT))
1177 0 : *p_code = 0;
1178 :
1179 0 : uid = from_kuid_munged(current_user_ns(), task_uid(p));
1180 : unlock_sig:
1181 : spin_unlock_irq(&p->sighand->siglock);
1182 0 : if (!exit_code)
1183 : return 0;
1184 :
1185 : /*
1186 : * Now we are pretty sure this task is interesting.
1187 : * Make sure it doesn't get reaped out from under us while we
1188 : * give up the lock and then examine it below. We don't want to
1189 : * keep holding onto the tasklist_lock while we call getrusage and
1190 : * possibly take page faults for user memory.
1191 : */
1192 0 : get_task_struct(p);
1193 : pid = task_pid_vnr(p);
1194 0 : why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1195 0 : read_unlock(&tasklist_lock);
1196 : sched_annotate_sleep();
1197 :
1198 0 : if (unlikely(wo->wo_flags & WNOWAIT))
1199 0 : return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1200 :
1201 0 : retval = wo->wo_rusage
1202 0 : ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1203 0 : if (!retval && wo->wo_stat)
1204 0 : retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1205 :
1206 0 : infop = wo->wo_info;
1207 0 : if (!retval && infop)
1208 0 : retval = put_user(SIGCHLD, &infop->si_signo);
1209 0 : if (!retval && infop)
1210 0 : retval = put_user(0, &infop->si_errno);
1211 0 : if (!retval && infop)
1212 0 : retval = put_user((short)why, &infop->si_code);
1213 0 : if (!retval && infop)
1214 0 : retval = put_user(exit_code, &infop->si_status);
1215 0 : if (!retval && infop)
1216 0 : retval = put_user(pid, &infop->si_pid);
1217 0 : if (!retval && infop)
1218 0 : retval = put_user(uid, &infop->si_uid);
1219 0 : if (!retval)
1220 : retval = pid;
1221 : put_task_struct(p);
1222 :
1223 : BUG_ON(!retval);
1224 0 : return retval;
1225 : }
1226 :
1227 : /*
1228 : * Handle do_wait work for one task in a live, non-stopped state.
1229 : * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1230 : * the lock and this task is uninteresting. If we return nonzero, we have
1231 : * released the lock and the system call should return.
1232 : */
1233 5696 : static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1234 : {
1235 : int retval;
1236 : pid_t pid;
1237 : uid_t uid;
1238 :
1239 5696 : if (!unlikely(wo->wo_flags & WCONTINUED))
1240 : return 0;
1241 :
1242 303 : if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1243 : return 0;
1244 :
1245 : spin_lock_irq(&p->sighand->siglock);
1246 : /* Re-check with the lock held. */
1247 0 : if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1248 : spin_unlock_irq(&p->sighand->siglock);
1249 : return 0;
1250 : }
1251 0 : if (!unlikely(wo->wo_flags & WNOWAIT))
1252 0 : p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1253 0 : uid = from_kuid_munged(current_user_ns(), task_uid(p));
1254 : spin_unlock_irq(&p->sighand->siglock);
1255 :
1256 : pid = task_pid_vnr(p);
1257 0 : get_task_struct(p);
1258 0 : read_unlock(&tasklist_lock);
1259 : sched_annotate_sleep();
1260 :
1261 0 : if (!wo->wo_info) {
1262 0 : retval = wo->wo_rusage
1263 0 : ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1264 : put_task_struct(p);
1265 0 : if (!retval && wo->wo_stat)
1266 0 : retval = put_user(0xffff, wo->wo_stat);
1267 0 : if (!retval)
1268 : retval = pid;
1269 : } else {
1270 0 : retval = wait_noreap_copyout(wo, p, pid, uid,
1271 : CLD_CONTINUED, SIGCONT);
1272 : BUG_ON(retval == 0);
1273 : }
1274 :
1275 0 : return retval;
1276 : }
1277 :
1278 : /*
1279 : * Consider @p for a wait by @parent.
1280 : *
1281 : * -ECHILD should be in ->notask_error before the first call.
1282 : * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1283 : * Returns zero if the search for a child should continue;
1284 : * then ->notask_error is 0 if @p is an eligible child,
1285 : * or another error from security_task_wait(), or still -ECHILD.
1286 : */
1287 8577 : static int wait_consider_task(struct wait_opts *wo, int ptrace,
1288 : struct task_struct *p)
1289 : {
1290 : /*
1291 : * We can race with wait_task_zombie() from another thread.
1292 : * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
1293 : * can't confuse the checks below.
1294 : */
1295 8577 : int exit_state = ACCESS_ONCE(p->exit_state);
1296 : int ret;
1297 :
1298 8577 : if (unlikely(exit_state == EXIT_DEAD))
1299 : return 0;
1300 :
1301 8577 : ret = eligible_child(wo, p);
1302 8577 : if (!ret)
1303 : return ret;
1304 :
1305 : ret = security_task_wait(p);
1306 : if (unlikely(ret < 0)) {
1307 : /*
1308 : * If we have not yet seen any eligible child,
1309 : * then let this error code replace -ECHILD.
1310 : * A permission error will give the user a clue
1311 : * to look for security policy problems, rather
1312 : * than for mysterious wait bugs.
1313 : */
1314 : if (wo->notask_error)
1315 : wo->notask_error = ret;
1316 : return 0;
1317 : }
1318 :
1319 8577 : if (unlikely(exit_state == EXIT_TRACE)) {
1320 : /*
1321 : * ptrace == 0 means we are the natural parent. In this case
1322 : * we should clear notask_error, debugger will notify us.
1323 : */
1324 0 : if (likely(!ptrace))
1325 0 : wo->notask_error = 0;
1326 : return 0;
1327 : }
1328 :
1329 8577 : if (likely(!ptrace) && unlikely(p->ptrace)) {
1330 : /*
1331 : * If it is traced by its real parent's group, just pretend
1332 : * the caller is ptrace_do_wait() and reap this child if it
1333 : * is zombie.
1334 : *
1335 : * This also hides group stop state from real parent; otherwise
1336 : * a single stop can be reported twice as group and ptrace stop.
1337 : * If a ptracer wants to distinguish these two events for its
1338 : * own children it should create a separate process which takes
1339 : * the role of real parent.
1340 : */
1341 0 : if (!ptrace_reparented(p))
1342 : ptrace = 1;
1343 : }
1344 :
1345 : /* slay zombie? */
1346 8577 : if (exit_state == EXIT_ZOMBIE) {
1347 : /* we don't reap group leaders with subthreads */
1348 5762 : if (!delay_group_leader(p)) {
1349 : /*
1350 : * A zombie ptracee is only visible to its ptracer.
1351 : * Notification and reaping will be cascaded to the
1352 : * real parent when the ptracer detaches.
1353 : */
1354 2881 : if (unlikely(ptrace) || likely(!p->ptrace))
1355 2881 : return wait_task_zombie(wo, p);
1356 : }
1357 :
1358 : /*
1359 : * Allow access to stopped/continued state via zombie by
1360 : * falling through. Clearing of notask_error is complex.
1361 : *
1362 : * When !@ptrace:
1363 : *
1364 : * If WEXITED is set, notask_error should naturally be
1365 : * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
1366 : * so, if there are live subthreads, there are events to
1367 : * wait for. If all subthreads are dead, it's still safe
1368 : * to clear - this function will be called again in finite
1369 : * amount time once all the subthreads are released and
1370 : * will then return without clearing.
1371 : *
1372 : * When @ptrace:
1373 : *
1374 : * Stopped state is per-task and thus can't change once the
1375 : * target task dies. Only continued and exited can happen.
1376 : * Clear notask_error if WCONTINUED | WEXITED.
1377 : */
1378 0 : if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
1379 0 : wo->notask_error = 0;
1380 : } else {
1381 : /*
1382 : * @p is alive and it's gonna stop, continue or exit, so
1383 : * there always is something to wait for.
1384 : */
1385 5696 : wo->notask_error = 0;
1386 : }
1387 :
1388 : /*
1389 : * Wait for stopped. Depending on @ptrace, different stopped state
1390 : * is used and the two don't interact with each other.
1391 : */
1392 5696 : ret = wait_task_stopped(wo, ptrace, p);
1393 5696 : if (ret)
1394 : return ret;
1395 :
1396 : /*
1397 : * Wait for continued. There's only one continued state and the
1398 : * ptracer can consume it which can confuse the real parent. Don't
1399 : * use WCONTINUED from ptracer. You don't need or want it.
1400 : */
1401 5696 : return wait_task_continued(wo, p);
1402 : }
1403 :
1404 : /*
1405 : * Do the work of do_wait() for one thread in the group, @tsk.
1406 : *
1407 : * -ECHILD should be in ->notask_error before the first call.
1408 : * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1409 : * Returns zero if the search for a child should continue; then
1410 : * ->notask_error is 0 if there were any eligible children,
1411 : * or another error from security_task_wait(), or still -ECHILD.
1412 : */
1413 7426 : static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1414 : {
1415 : struct task_struct *p;
1416 :
1417 13122 : list_for_each_entry(p, &tsk->children, sibling) {
1418 8577 : int ret = wait_consider_task(wo, 0, p);
1419 :
1420 8577 : if (ret)
1421 : return ret;
1422 : }
1423 :
1424 : return 0;
1425 : }
1426 :
1427 4545 : static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1428 : {
1429 : struct task_struct *p;
1430 :
1431 4545 : list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1432 0 : int ret = wait_consider_task(wo, 1, p);
1433 :
1434 0 : if (ret)
1435 : return ret;
1436 : }
1437 :
1438 : return 0;
1439 : }
1440 :
1441 2628 : static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1442 : int sync, void *key)
1443 : {
1444 2628 : struct wait_opts *wo = container_of(wait, struct wait_opts,
1445 : child_wait);
1446 : struct task_struct *p = key;
1447 :
1448 2628 : if (!eligible_pid(wo, p))
1449 : return 0;
1450 :
1451 2628 : if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1452 : return 0;
1453 :
1454 2628 : return default_wake_function(wait, mode, sync, key);
1455 : }
1456 :
1457 2925 : void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1458 : {
1459 2925 : __wake_up_sync_key(&parent->signal->wait_chldexit,
1460 : TASK_INTERRUPTIBLE, 1, p);
1461 2925 : }
1462 :
1463 4827 : static long do_wait(struct wait_opts *wo)
1464 : {
1465 : struct task_struct *tsk;
1466 : int retval;
1467 :
1468 : trace_sched_process_wait(wo->wo_pid);
1469 :
1470 : init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1471 4827 : wo->child_wait.private = current;
1472 4827 : add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait);
1473 : repeat:
1474 : /*
1475 : * If there is nothing that can match our critiera just get out.
1476 : * We will clear ->notask_error to zero if we see any child that
1477 : * might later match our criteria, even if we are not able to reap
1478 : * it yet.
1479 : */
1480 7426 : wo->notask_error = -ECHILD;
1481 9235 : if ((wo->wo_type < PIDTYPE_MAX) &&
1482 5427 : (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1483 : goto notask;
1484 :
1485 7426 : set_current_state(TASK_INTERRUPTIBLE);
1486 7426 : read_lock(&tasklist_lock);
1487 7426 : tsk = current;
1488 : do {
1489 7426 : retval = do_wait_thread(wo, tsk);
1490 7426 : if (retval)
1491 : goto end;
1492 :
1493 4545 : retval = ptrace_do_wait(wo, tsk);
1494 4545 : if (retval)
1495 : goto end;
1496 :
1497 4545 : if (wo->wo_flags & __WNOTHREAD)
1498 : break;
1499 9090 : } while_each_thread(current, tsk);
1500 9090 : read_unlock(&tasklist_lock);
1501 :
1502 : notask:
1503 4545 : retval = wo->notask_error;
1504 4545 : if (!retval && !(wo->wo_flags & WNOHANG)) {
1505 : retval = -ERESTARTSYS;
1506 5206 : if (!signal_pending(current)) {
1507 2602 : schedule();
1508 2599 : goto repeat;
1509 : }
1510 : }
1511 : end:
1512 4824 : __set_current_state(TASK_RUNNING);
1513 4824 : remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait);
1514 4824 : return retval;
1515 : }
1516 :
1517 0 : SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1518 : infop, int, options, struct rusage __user *, ru)
1519 : {
1520 : struct wait_opts wo;
1521 : struct pid *pid = NULL;
1522 : enum pid_type type;
1523 : long ret;
1524 :
1525 0 : if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
1526 : return -EINVAL;
1527 0 : if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1528 : return -EINVAL;
1529 :
1530 0 : switch (which) {
1531 : case P_ALL:
1532 : type = PIDTYPE_MAX;
1533 : break;
1534 : case P_PID:
1535 : type = PIDTYPE_PID;
1536 0 : if (upid <= 0)
1537 : return -EINVAL;
1538 : break;
1539 : case P_PGID:
1540 : type = PIDTYPE_PGID;
1541 0 : if (upid <= 0)
1542 : return -EINVAL;
1543 : break;
1544 : default:
1545 : return -EINVAL;
1546 : }
1547 :
1548 0 : if (type < PIDTYPE_MAX)
1549 0 : pid = find_get_pid(upid);
1550 :
1551 0 : wo.wo_type = type;
1552 0 : wo.wo_pid = pid;
1553 0 : wo.wo_flags = options;
1554 0 : wo.wo_info = infop;
1555 0 : wo.wo_stat = NULL;
1556 0 : wo.wo_rusage = ru;
1557 0 : ret = do_wait(&wo);
1558 :
1559 0 : if (ret > 0) {
1560 : ret = 0;
1561 0 : } else if (infop) {
1562 : /*
1563 : * For a WNOHANG return, clear out all the fields
1564 : * we would set so the user can easily tell the
1565 : * difference.
1566 : */
1567 0 : if (!ret)
1568 0 : ret = put_user(0, &infop->si_signo);
1569 0 : if (!ret)
1570 0 : ret = put_user(0, &infop->si_errno);
1571 0 : if (!ret)
1572 0 : ret = put_user(0, &infop->si_code);
1573 0 : if (!ret)
1574 0 : ret = put_user(0, &infop->si_pid);
1575 0 : if (!ret)
1576 0 : ret = put_user(0, &infop->si_uid);
1577 0 : if (!ret)
1578 0 : ret = put_user(0, &infop->si_status);
1579 : }
1580 :
1581 0 : put_pid(pid);
1582 : return ret;
1583 : }
1584 :
1585 9651 : SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1586 : int, options, struct rusage __user *, ru)
1587 : {
1588 : struct wait_opts wo;
1589 : struct pid *pid = NULL;
1590 : enum pid_type type;
1591 : long ret;
1592 :
1593 4827 : if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1594 : __WNOTHREAD|__WCLONE|__WALL))
1595 : return -EINVAL;
1596 :
1597 4827 : if (upid == -1)
1598 : type = PIDTYPE_MAX;
1599 1699 : else if (upid < 0) {
1600 : type = PIDTYPE_PGID;
1601 0 : pid = find_get_pid(-upid);
1602 1699 : } else if (upid == 0) {
1603 : type = PIDTYPE_PGID;
1604 0 : pid = get_task_pid(current, PIDTYPE_PGID);
1605 : } else /* upid > 0 */ {
1606 : type = PIDTYPE_PID;
1607 1699 : pid = find_get_pid(upid);
1608 : }
1609 :
1610 4827 : wo.wo_type = type;
1611 4827 : wo.wo_pid = pid;
1612 4827 : wo.wo_flags = options | WEXITED;
1613 4827 : wo.wo_info = NULL;
1614 4827 : wo.wo_stat = stat_addr;
1615 4827 : wo.wo_rusage = ru;
1616 4827 : ret = do_wait(&wo);
1617 4824 : put_pid(pid);
1618 :
1619 : return ret;
1620 : }
1621 :
1622 : #ifdef __ARCH_WANT_SYS_WAITPID
1623 :
1624 : /*
1625 : * sys_waitpid() remains for compatibility. waitpid() should be
1626 : * implemented by calling sys_wait4() from libc.a.
1627 : */
1628 : SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1629 : {
1630 : return sys_wait4(pid, stat_addr, options, NULL);
1631 : }
1632 :
1633 : #endif
|