LCOV - code coverage report
Current view: top level - kernel - cgroup.c (source / functions)
Test: coverage.info
Date: 2015-04-12 14:34:49

                  Hit     Total   Coverage
Lines:            356      1586     22.4 %
Functions:         37       141     26.2 %

          Line data    Source code
       1             : /*
       2             :  *  Generic process-grouping system.
       3             :  *
       4             :  *  Based originally on the cpuset system, extracted by Paul Menage
       5             :  *  Copyright (C) 2006 Google, Inc
       6             :  *
       7             :  *  Notifications support
       8             :  *  Copyright (C) 2009 Nokia Corporation
       9             :  *  Author: Kirill A. Shutemov
      10             :  *
      11             :  *  Copyright notices from the original cpuset code:
      12             :  *  --------------------------------------------------
      13             :  *  Copyright (C) 2003 BULL SA.
      14             :  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
      15             :  *
      16             :  *  Portions derived from Patrick Mochel's sysfs code.
      17             :  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
      18             :  *
      19             :  *  2003-10-10 Written by Simon Derr.
      20             :  *  2003-10-22 Updates by Stephen Hemminger.
      21             :  *  2004 May-July Rework by Paul Jackson.
      22             :  *  ---------------------------------------------------
      23             :  *
      24             :  *  This file is subject to the terms and conditions of the GNU General Public
      25             :  *  License.  See the file COPYING in the main directory of the Linux
      26             :  *  distribution for more details.
      27             :  */
      28             : 
      29             : #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      30             : 
      31             : #include <linux/cgroup.h>
      32             : #include <linux/cred.h>
      33             : #include <linux/ctype.h>
      34             : #include <linux/errno.h>
      35             : #include <linux/init_task.h>
      36             : #include <linux/kernel.h>
      37             : #include <linux/list.h>
      38             : #include <linux/magic.h>
      39             : #include <linux/mm.h>
      40             : #include <linux/mutex.h>
      41             : #include <linux/mount.h>
      42             : #include <linux/pagemap.h>
      43             : #include <linux/proc_fs.h>
      44             : #include <linux/rcupdate.h>
      45             : #include <linux/sched.h>
      46             : #include <linux/slab.h>
      47             : #include <linux/spinlock.h>
      48             : #include <linux/rwsem.h>
      49             : #include <linux/string.h>
      50             : #include <linux/sort.h>
      51             : #include <linux/kmod.h>
      52             : #include <linux/delayacct.h>
      53             : #include <linux/cgroupstats.h>
      54             : #include <linux/hashtable.h>
      55             : #include <linux/pid_namespace.h>
      56             : #include <linux/idr.h>
      57             : #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
      58             : #include <linux/kthread.h>
      59             : #include <linux/delay.h>
      60             : 
      61             : #include <linux/atomic.h>
      62             : 
      63             : /*
      64             :  * pidlists linger the following amount before being destroyed.  The goal
       65             :  * is avoiding frequent destruction in the middle of consecutive read calls.
       66             :  * Expiring in the middle is a performance problem, not a correctness one.
      67             :  * 1 sec should be enough.
      68             :  */
      69             : #define CGROUP_PIDLIST_DESTROY_DELAY    HZ
      70             : 
      71             : #define CGROUP_FILE_NAME_MAX            (MAX_CGROUP_TYPE_NAMELEN +      \
      72             :                                          MAX_CFTYPE_NAME + 2)
      73             : 
      74             : /*
      75             :  * cgroup_mutex is the master lock.  Any modification to cgroup or its
      76             :  * hierarchy must be performed while holding it.
      77             :  *
      78             :  * css_set_rwsem protects task->cgroups pointer, the list of css_set
      79             :  * objects, and the chain of tasks off each css_set.
      80             :  *
      81             :  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
      82             :  * cgroup.h can use them for lockdep annotations.
      83             :  */
      84             : #ifdef CONFIG_PROVE_RCU
      85             : DEFINE_MUTEX(cgroup_mutex);
      86             : DECLARE_RWSEM(css_set_rwsem);
      87             : EXPORT_SYMBOL_GPL(cgroup_mutex);
      88             : EXPORT_SYMBOL_GPL(css_set_rwsem);
      89             : #else
      90             : static DEFINE_MUTEX(cgroup_mutex);
      91             : static DECLARE_RWSEM(css_set_rwsem);
      92             : #endif
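                      : 
                      : /*
                      :  * Editorial sketch (not part of the original file): a typical reader
                      :  * of task->cgroups pins the css_set lists with css_set_rwsem, e.g.:
                      :  *
                      :  *      down_read(&css_set_rwsem);
                      :  *      cset = task_css_set(task);      /* then walk cset->tasks etc. */
                      :  *      ...
                      :  *      up_read(&css_set_rwsem);
                      :  *
                      :  * while anything that modifies the hierarchy itself must hold
                      :  * cgroup_mutex as described above.
                      :  */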
      93             : 
      94             : /*
      95             :  * Protects cgroup_idr and css_idr so that IDs can be released without
      96             :  * grabbing cgroup_mutex.
      97             :  */
      98             : static DEFINE_SPINLOCK(cgroup_idr_lock);
      99             : 
     100             : /*
      101             :  * Protects cgroup_root->release_agent_path.  Modifying it also requires
     102             :  * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
     103             :  */
     104             : static DEFINE_SPINLOCK(release_agent_path_lock);
     105             : 
     106             : #define cgroup_assert_mutex_or_rcu_locked()                             \
     107             :         rcu_lockdep_assert(rcu_read_lock_held() ||                      \
     108             :                            lockdep_is_held(&cgroup_mutex),          \
     109             :                            "cgroup_mutex or RCU read lock required");
     110             : 
     111             : /*
     112             :  * cgroup destruction makes heavy use of work items and there can be a lot
     113             :  * of concurrent destructions.  Use a separate workqueue so that cgroup
     114             :  * destruction work items don't end up filling up max_active of system_wq
     115             :  * which may lead to deadlock.
     116             :  */
     117             : static struct workqueue_struct *cgroup_destroy_wq;
     118             : 
     119             : /*
     120             :  * pidlist destructions need to be flushed on cgroup destruction.  Use a
     121             :  * separate workqueue as flush domain.
     122             :  */
     123             : static struct workqueue_struct *cgroup_pidlist_destroy_wq;
     124             : 
     125             : /* generate an array of cgroup subsystem pointers */
     126             : #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
     127             : static struct cgroup_subsys *cgroup_subsys[] = {
     128             : #include <linux/cgroup_subsys.h>
     129             : };
     130             : #undef SUBSYS
     131             : 
     132             : /* array of cgroup subsystem names */
     133             : #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
     134             : static const char *cgroup_subsys_name[] = {
     135             : #include <linux/cgroup_subsys.h>
     136             : };
     137             : #undef SUBSYS
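                      : 
                      : /*
                      :  * Editorial illustration: each SUBSYS(name) entry in
                      :  * <linux/cgroup_subsys.h> expands into one designated array slot.
                      :  * If the header contained SUBSYS(cpu) and SUBSYS(memory), the two
                      :  * x-macro includes above would expand to roughly:
                      :  *
                      :  *      static struct cgroup_subsys *cgroup_subsys[] = {
                      :  *              [cpu_cgrp_id]    = &cpu_cgrp_subsys,
                      :  *              [memory_cgrp_id] = &memory_cgrp_subsys,
                      :  *      };
                      :  *      static const char *cgroup_subsys_name[] = {
                      :  *              [cpu_cgrp_id]    = "cpu",
                      :  *              [memory_cgrp_id] = "memory",
                      :  *      };
                      :  */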
     138             : 
     139             : /*
     140             :  * The default hierarchy, reserved for the subsystems that are otherwise
     141             :  * unattached - it never has more than a single cgroup, and all tasks are
     142             :  * part of that cgroup.
     143             :  */
     144             : struct cgroup_root cgrp_dfl_root;
     145             : 
     146             : /*
     147             :  * The default hierarchy always exists but is hidden until mounted for the
     148             :  * first time.  This is for backward compatibility.
     149             :  */
     150             : static bool cgrp_dfl_root_visible;
     151             : 
     152             : /*
      153             :  * Set by the boot param of the same name.  Makes subsystems with NULL
      154             :  * ->dfl_files use ->legacy_files on the default hierarchy.
     155             :  */
     156             : static bool cgroup_legacy_files_on_dfl;
     157             : 
     158             : /* some controllers are not supported in the default hierarchy */
     159             : static unsigned int cgrp_dfl_root_inhibit_ss_mask;
     160             : 
     161             : /* The list of hierarchy roots */
     162             : 
     163             : static LIST_HEAD(cgroup_roots);
     164             : static int cgroup_root_count;
     165             : 
     166             : /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
     167             : static DEFINE_IDR(cgroup_hierarchy_idr);
     168             : 
     169             : /*
     170             :  * Assign a monotonically increasing serial number to csses.  It guarantees
     171             :  * cgroups with bigger numbers are newer than those with smaller numbers.
     172             :  * Also, as csses are always appended to the parent's ->children list, it
     173             :  * guarantees that sibling csses are always sorted in the ascending serial
     174             :  * number order on the list.  Protected by cgroup_mutex.
     175             :  */
     176             : static u64 css_serial_nr_next = 1;
     177             : 
     178             : /* This flag indicates whether tasks in the fork and exit paths should
     179             :  * check for fork/exit handlers to call. This avoids us having to do
     180             :  * extra work in the fork/exit path if none of the subsystems need to
     181             :  * be called.
     182             :  */
     183             : static int need_forkexit_callback __read_mostly;
     184             : 
     185             : static struct cftype cgroup_dfl_base_files[];
     186             : static struct cftype cgroup_legacy_base_files[];
     187             : 
     188             : static int rebind_subsystems(struct cgroup_root *dst_root,
     189             :                              unsigned int ss_mask);
     190             : static int cgroup_destroy_locked(struct cgroup *cgrp);
     191             : static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
     192             :                       bool visible);
     193             : static void css_release(struct percpu_ref *ref);
     194             : static void kill_css(struct cgroup_subsys_state *css);
     195             : static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
     196             :                               bool is_add);
     197             : 
     198             : /* IDR wrappers which synchronize using cgroup_idr_lock */
     199           8 : static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
     200             :                             gfp_t gfp_mask)
     201             : {
     202             :         int ret;
     203             : 
     204           8 :         idr_preload(gfp_mask);
     205             :         spin_lock_bh(&cgroup_idr_lock);
                      :         /*
                      :          * Never sleep while holding the spinlock; the idr_preload()
                      :          * above has already reserved the memory for us.
                      :          */
      206           8 :         ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
     207             :         spin_unlock_bh(&cgroup_idr_lock);
     208             :         idr_preload_end();
     209           8 :         return ret;
     210             : }
     211             : 
     212           0 : static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
     213             : {
     214             :         void *ret;
     215             : 
     216             :         spin_lock_bh(&cgroup_idr_lock);
     217           0 :         ret = idr_replace(idr, ptr, id);
     218             :         spin_unlock_bh(&cgroup_idr_lock);
     219           0 :         return ret;
     220             : }
     221             : 
     222           0 : static void cgroup_idr_remove(struct idr *idr, int id)
     223             : {
     224             :         spin_lock_bh(&cgroup_idr_lock);
     225           0 :         idr_remove(idr, id);
     226             :         spin_unlock_bh(&cgroup_idr_lock);
     227           0 : }
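                      : 
                      : /*
                      :  * Editorial sketch of the intended usage (the call site below is
                      :  * hypothetical): IDs are allocated while holding cgroup_mutex, but
                      :  * may be released from contexts which cannot take it:
                      :  *
                      :  *      id = cgroup_idr_alloc(&ss->css_idr, css, 1, 0, GFP_KERNEL);
                      :  *      if (id < 0)
                      :  *              return id;
                      :  *      ...
                      :  *      cgroup_idr_remove(&ss->css_idr, id);    /* no cgroup_mutex needed */
                      :  */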
     228             : 
     229             : static struct cgroup *cgroup_parent(struct cgroup *cgrp)
     230             : {
     231             :         struct cgroup_subsys_state *parent_css = cgrp->self.parent;
     232             : 
     233          11 :         if (parent_css)
     234             :                 return container_of(parent_css, struct cgroup, self);
     235             :         return NULL;
     236             : }
     237             : 
     238             : /**
     239             :  * cgroup_css - obtain a cgroup's css for the specified subsystem
     240             :  * @cgrp: the cgroup of interest
     241             :  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
     242             :  *
     243             :  * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
     244             :  * function must be called either under cgroup_mutex or rcu_read_lock() and
     245             :  * the caller is responsible for pinning the returned css if it wants to
     246             :  * keep accessing it outside the said locks.  This function may return
      247             :  * %NULL if @cgrp doesn't have @ss enabled.
     248             :  */
     249             : static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
     250             :                                               struct cgroup_subsys *ss)
     251             : {
     252          30 :         if (ss)
     253          30 :                 return rcu_dereference_check(cgrp->subsys[ss->id],
     254             :                                         lockdep_is_held(&cgroup_mutex));
     255             :         else
     256           0 :                 return &cgrp->self;
     257             : }
     258             : 
     259             : /**
     260             :  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
     261             :  * @cgrp: the cgroup of interest
     262             :  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
     263             :  *
      264             :  * Similar to cgroup_css() but returns the effective css, which is defined
      265             :  * as the matching css of the nearest ancestor including self which has @ss
      266             :  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
      267             :  * function is guaranteed to return a non-NULL css.
     268             :  */
     269           0 : static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
     270             :                                                 struct cgroup_subsys *ss)
     271             : {
     272             :         lockdep_assert_held(&cgroup_mutex);
     273             : 
     274           0 :         if (!ss)
     275           0 :                 return &cgrp->self;
     276             : 
     277           0 :         if (!(cgrp->root->subsys_mask & (1 << ss->id)))
     278             :                 return NULL;
     279             : 
     280             :         /*
     281             :          * This function is used while updating css associations and thus
     282             :          * can't test the csses directly.  Use ->child_subsys_mask.
     283             :          */
     284           0 :         while (cgroup_parent(cgrp) &&
     285           0 :                !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
     286             :                 cgrp = cgroup_parent(cgrp);
     287             : 
     288           0 :         return cgroup_css(cgrp, ss);
     289             : }
     290             : 
     291             : /**
     292             :  * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
     293             :  * @cgrp: the cgroup of interest
     294             :  * @ss: the subsystem of interest
     295             :  *
     296             :  * Find and get the effective css of @cgrp for @ss.  The effective css is
     297             :  * defined as the matching css of the nearest ancestor including self which
     298             :  * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
     299             :  * the root css is returned, so this function always returns a valid css.
     300             :  * The returned css must be put using css_put().
     301             :  */
     302           0 : struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
     303             :                                              struct cgroup_subsys *ss)
     304             : {
     305             :         struct cgroup_subsys_state *css;
     306             : 
     307             :         rcu_read_lock();
     308             : 
     309             :         do {
     310             :                 css = cgroup_css(cgrp, ss);
     311             : 
     312           0 :                 if (css && css_tryget_online(css))
     313             :                         goto out_unlock;
     314             :                 cgrp = cgroup_parent(cgrp);
     315           0 :         } while (cgrp);
     316             : 
     317           0 :         css = init_css_set.subsys[ss->id];
     318             :         css_get(css);
     319             : out_unlock:
     320             :         rcu_read_unlock();
     321           0 :         return css;
     322             : }
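                      : 
                      : /*
                      :  * Editorial usage sketch: a caller that needs a pinned effective css
                      :  * (the memory controller is assumed compiled in here):
                      :  *
                      :  *      struct cgroup_subsys_state *css;
                      :  *
                      :  *      css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
                      :  *      ... use css ...
                      :  *      css_put(css);           /* pairs with the get above */
                      :  */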
     323             : 
     324             : /* convenient tests for these bits */
     325             : static inline bool cgroup_is_dead(const struct cgroup *cgrp)
     326             : {
     327           8 :         return !(cgrp->self.flags & CSS_ONLINE);
     328             : }
     329             : 
     330           0 : struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
     331             : {
     332           0 :         struct cgroup *cgrp = of->kn->parent->priv;
     333             :         struct cftype *cft = of_cft(of);
     334             : 
     335             :         /*
      336             :  * This is an open and unprotected implementation of cgroup_css().
     337             :          * seq_css() is only called from a kernfs file operation which has
     338             :          * an active reference on the file.  Because all the subsystem
      339             :  * files are drained before a css is disassociated from a cgroup,
     340             :          * the matching css from the cgroup's subsys table is guaranteed to
     341             :          * be and stay valid until the enclosing operation is complete.
     342             :          */
     343           0 :         if (cft->ss)
     344           0 :                 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
     345             :         else
     346           0 :                 return &cgrp->self;
     347             : }
     348             : EXPORT_SYMBOL_GPL(of_css);
     349             : 
     350             : /**
     351             :  * cgroup_is_descendant - test ancestry
     352             :  * @cgrp: the cgroup to be tested
     353             :  * @ancestor: possible ancestor of @cgrp
     354             :  *
     355             :  * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
     356             :  * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
     357             :  * and @ancestor are accessible.
     358             :  */
     359           0 : bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
     360             : {
     361           0 :         while (cgrp) {
     362           0 :                 if (cgrp == ancestor)
     363             :                         return true;
     364             :                 cgrp = cgroup_parent(cgrp);
     365             :         }
     366             :         return false;
     367             : }
     368             : 
     369             : static int notify_on_release(const struct cgroup *cgrp)
     370             : {
     371             :         return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
     372             : }
     373             : 
     374             : /**
     375             :  * for_each_css - iterate all css's of a cgroup
     376             :  * @css: the iteration cursor
     377             :  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
     378             :  * @cgrp: the target cgroup to iterate css's of
     379             :  *
     380             :  * Should be called under cgroup_[tree_]mutex.
     381             :  */
     382             : #define for_each_css(css, ssid, cgrp)                                   \
     383             :         for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)     \
     384             :                 if (!((css) = rcu_dereference_check(                    \
     385             :                                 (cgrp)->subsys[(ssid)],                      \
     386             :                                 lockdep_is_held(&cgroup_mutex)))) { }       \
     387             :                 else
     388             : 
     389             : /**
     390             :  * for_each_e_css - iterate all effective css's of a cgroup
     391             :  * @css: the iteration cursor
     392             :  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
     393             :  * @cgrp: the target cgroup to iterate css's of
     394             :  *
     395             :  * Should be called under cgroup_[tree_]mutex.
     396             :  */
     397             : #define for_each_e_css(css, ssid, cgrp)                                 \
     398             :         for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)     \
     399             :                 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
     400             :                         ;                                               \
     401             :                 else
     402             : 
     403             : /**
     404             :  * for_each_subsys - iterate all enabled cgroup subsystems
     405             :  * @ss: the iteration cursor
     406             :  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
     407             :  */
     408             : #define for_each_subsys(ss, ssid)                                       \
     409             :         for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&             \
     410             :              (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
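                      : 
                      : /*
                      :  * Editorial usage sketch for the iterators above, e.g. listing every
                      :  * compiled-in subsystem:
                      :  *
                      :  *      struct cgroup_subsys *ss;
                      :  *      int ssid;
                      :  *
                      :  *      for_each_subsys(ss, ssid)
                      :  *              pr_info("%d: %s\n", ssid, cgroup_subsys_name[ssid]);
                      :  */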
     411             : 
     412             : /* iterate across the hierarchies */
     413             : #define for_each_root(root)                                             \
     414             :         list_for_each_entry((root), &cgroup_roots, root_list)
     415             : 
     416             : /* iterate over child cgrps, lock should be held throughout iteration */
     417             : #define cgroup_for_each_live_child(child, cgrp)                         \
     418             :         list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
     419             :                 if (({ lockdep_assert_held(&cgroup_mutex);          \
     420             :                        cgroup_is_dead(child); }))                       \
     421             :                         ;                                               \
     422             :                 else
     423             : 
     424             : static void cgroup_release_agent(struct work_struct *work);
     425             : static void check_for_release(struct cgroup *cgrp);
     426             : 
     427             : /*
     428             :  * A cgroup can be associated with multiple css_sets as different tasks may
     429             :  * belong to different cgroups on different hierarchies.  In the other
     430             :  * direction, a css_set is naturally associated with multiple cgroups.
     431             :  * This M:N relationship is represented by the following link structure
     432             :  * which exists for each association and allows traversing the associations
     433             :  * from both sides.
     434             :  */
     435             : struct cgrp_cset_link {
     436             :         /* the cgroup and css_set this link associates */
     437             :         struct cgroup           *cgrp;
     438             :         struct css_set          *cset;
     439             : 
     440             :         /* list of cgrp_cset_links anchored at cgrp->cset_links */
     441             :         struct list_head        cset_link;
     442             : 
     443             :         /* list of cgrp_cset_links anchored at css_set->cgrp_links */
     444             :         struct list_head        cgrp_link;
     445             : };
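                      : 
                      : /*
                      :  * Editorial illustration of the M:N linkage: with two hierarchies A
                      :  * and B, a task in A:/foo and B:/bar has a single css_set which is
                      :  * joined to both cgroups through two cgrp_cset_links:
                      :  *
                      :  *      A:/foo <--cgrp_cset_link--> css_set <--cgrp_cset_link--> B:/bar
                      :  *
                      :  * A cgroup reaches its links through ->cset_links and a css_set
                      :  * through ->cgrp_links.
                      :  */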
     446             : 
     447             : /*
     448             :  * The default css_set - used by init and its children prior to any
     449             :  * hierarchies being mounted. It contains a pointer to the root state
     450             :  * for each subsystem. Also used to anchor the list of css_sets. Not
     451             :  * reference-counted, to improve performance when child cgroups
     452             :  * haven't been created.
     453             :  */
     454             : struct css_set init_css_set = {
     455             :         .refcount               = ATOMIC_INIT(1),
     456             :         .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
     457             :         .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
     458             :         .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
     459             :         .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
     460             :         .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
     461             : };
     462             : 
     463             : static int css_set_count        = 1;    /* 1 for init_css_set */
     464             : 
     465             : /**
      466             :  * cgroup_update_populated - update the populated count of a cgroup
     467             :  * @cgrp: the target cgroup
     468             :  * @populated: inc or dec populated count
     469             :  *
     470             :  * @cgrp is either getting the first task (css_set) or losing the last.
     471             :  * Update @cgrp->populated_cnt accordingly.  The count is propagated
     472             :  * towards root so that a given cgroup's populated_cnt is zero iff the
     473             :  * cgroup and all its descendants are empty.
     474             :  *
     475             :  * @cgrp's interface file "cgroup.populated" is zero if
     476             :  * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
     477             :  * changes from or to zero, userland is notified that the content of the
     478             :  * interface file has changed.  This can be used to detect when @cgrp and
     479             :  * its descendants become populated or empty.
     480             :  */
     481           2 : static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
     482             : {
     483             :         lockdep_assert_held(&css_set_rwsem);
     484             : 
     485             :         do {
     486             :                 bool trigger;
     487             : 
     488           1 :                 if (populated)
     489           1 :                         trigger = !cgrp->populated_cnt++;
     490             :                 else
     491           0 :                         trigger = !--cgrp->populated_cnt;
     492             : 
     493           1 :                 if (!trigger)
     494             :                         break;
     495             : 
     496           1 :                 if (cgrp->populated_kn)
     497           0 :                         kernfs_notify(cgrp->populated_kn);
     498             :                 cgrp = cgroup_parent(cgrp);
     499           1 :         } while (cgrp);
     500           1 : }
     501             : 
     502             : /*
      503             :  * hash table for css_sets.  This improves the performance of finding
     504             :  * an existing css_set. This hash doesn't (currently) take into
     505             :  * account cgroups in empty hierarchies.
     506             :  */
     507             : #define CSS_SET_HASH_BITS       7
     508             : static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
     509             : 
     510             : static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
     511             : {
     512             :         unsigned long key = 0UL;
     513             :         struct cgroup_subsys *ss;
     514             :         int i;
     515             : 
     516          15 :         for_each_subsys(ss, i)
     517           7 :                 key += (unsigned long)css[i];
     518           1 :         key = (key >> 16) ^ key;
     519             : 
     520             :         return key;
     521             : }
     522             : 
     523           0 : static void put_css_set_locked(struct css_set *cset)
     524             : {
     525             :         struct cgrp_cset_link *link, *tmp_link;
     526             :         struct cgroup_subsys *ss;
     527             :         int ssid;
     528             : 
     529             :         lockdep_assert_held(&css_set_rwsem);
     530             : 
     531           0 :         if (!atomic_dec_and_test(&cset->refcount))
     532           0 :                 return;
     533             : 
       534             :         /* This css_set is dead. Unlink it and release cgroup refcounts */
     535           0 :         for_each_subsys(ss, ssid)
     536           0 :                 list_del(&cset->e_cset_node[ssid]);
     537             :         hash_del(&cset->hlist);
     538           0 :         css_set_count--;
     539             : 
     540           0 :         list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
     541           0 :                 struct cgroup *cgrp = link->cgrp;
     542             : 
     543             :                 list_del(&link->cset_link);
     544             :                 list_del(&link->cgrp_link);
     545             : 
     546             :                 /* @cgrp can't go away while we're holding css_set_rwsem */
     547           0 :                 if (list_empty(&cgrp->cset_links)) {
     548           0 :                         cgroup_update_populated(cgrp, false);
     549           0 :                         check_for_release(cgrp);
     550             :                 }
     551             : 
     552           0 :                 kfree(link);
     553             :         }
     554             : 
     555           0 :         kfree_rcu(cset, rcu_head);
     556             : }
     557             : 
     558         887 : static void put_css_set(struct css_set *cset)
     559             : {
     560             :         /*
     561             :          * Ensure that the refcount doesn't hit zero while any readers
     562             :          * can see it. Similar to atomic_dec_and_lock(), but for an
       563             :          * rwsem.
     564             :          */
     565        1774 :         if (atomic_add_unless(&cset->refcount, -1, 1))
     566         887 :                 return;
     567             : 
     568           0 :         down_write(&css_set_rwsem);
     569           0 :         put_css_set_locked(cset);
     570           0 :         up_write(&css_set_rwsem);
     571             : }
     572             : 
     573             : /*
     574             :  * refcounted get/put for css_set objects
     575             :  */
     576             : static inline void get_css_set(struct css_set *cset)
     577             : {
     578         965 :         atomic_inc(&cset->refcount);
     579             : }
     580             : 
     581             : /**
     582             :  * compare_css_sets - helper function for find_existing_css_set().
     583             :  * @cset: candidate css_set being tested
     584             :  * @old_cset: existing css_set for a task
     585             :  * @new_cgrp: cgroup that's being entered by the task
     586             :  * @template: desired set of css pointers in css_set (pre-calculated)
     587             :  *
     588             :  * Returns true if "cset" matches "old_cset" except for the hierarchy
     589             :  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
     590             :  */
     591           0 : static bool compare_css_sets(struct css_set *cset,
     592             :                              struct css_set *old_cset,
     593             :                              struct cgroup *new_cgrp,
     594             :                              struct cgroup_subsys_state *template[])
     595             : {
     596             :         struct list_head *l1, *l2;
     597             : 
     598             :         /*
     599             :          * On the default hierarchy, there can be csets which are
     600             :          * associated with the same set of cgroups but different csses.
     601             :          * Let's first ensure that csses match.
     602             :          */
     603           0 :         if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
     604             :                 return false;
     605             : 
     606             :         /*
     607             :          * Compare cgroup pointers in order to distinguish between
     608             :          * different cgroups in hierarchies.  As different cgroups may
     609             :          * share the same effective css, this comparison is always
     610             :          * necessary.
     611             :          */
     612           0 :         l1 = &cset->cgrp_links;
     613           0 :         l2 = &old_cset->cgrp_links;
     614             :         while (1) {
     615             :                 struct cgrp_cset_link *link1, *link2;
     616             :                 struct cgroup *cgrp1, *cgrp2;
     617             : 
     618           0 :                 l1 = l1->next;
     619           0 :                 l2 = l2->next;
       620             :                 /* See if we reached the end - both lists are of equal length. */
     621           0 :                 if (l1 == &cset->cgrp_links) {
     622             :                         BUG_ON(l2 != &old_cset->cgrp_links);
     623             :                         break;
     624             :                 } else {
     625             :                         BUG_ON(l2 == &old_cset->cgrp_links);
     626             :                 }
     627             :                 /* Locate the cgroups associated with these links. */
     628             :                 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
     629             :                 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
     630           0 :                 cgrp1 = link1->cgrp;
     631           0 :                 cgrp2 = link2->cgrp;
     632             :                 /* Hierarchies should be linked in the same order. */
     633             :                 BUG_ON(cgrp1->root != cgrp2->root);
     634             : 
     635             :                 /*
     636             :                  * If this hierarchy is the hierarchy of the cgroup
     637             :                  * that's changing, then we need to check that this
     638             :                  * css_set points to the new cgroup; if it's any other
     639             :                  * hierarchy, then this css_set should point to the
     640             :                  * same cgroup as the old css_set.
     641             :                  */
     642           0 :                 if (cgrp1->root == new_cgrp->root) {
     643           0 :                         if (cgrp1 != new_cgrp)
     644             :                                 return false;
     645             :                 } else {
     646           0 :                         if (cgrp1 != cgrp2)
     647             :                                 return false;
     648             :                 }
     649             :         }
     650             :         return true;
     651             : }
     652             : 
     653             : /**
     654             :  * find_existing_css_set - init css array and find the matching css_set
     655             :  * @old_cset: the css_set that we're using before the cgroup transition
     656             :  * @cgrp: the cgroup that we're moving into
     657             :  * @template: out param for the new set of csses, should be clear on entry
     658             :  */
     659           0 : static struct css_set *find_existing_css_set(struct css_set *old_cset,
     660             :                                         struct cgroup *cgrp,
     661             :                                         struct cgroup_subsys_state *template[])
     662             : {
     663           0 :         struct cgroup_root *root = cgrp->root;
     664             :         struct cgroup_subsys *ss;
     665             :         struct css_set *cset;
     666             :         unsigned long key;
     667             :         int i;
     668             : 
     669             :         /*
     670             :          * Build the set of subsystem state objects that we want to see in the
       671             :          * new css_set.  While subsystems can change globally, the entries here
     672             :          * won't change, so no need for locking.
     673             :          */
     674           0 :         for_each_subsys(ss, i) {
     675           0 :                 if (root->subsys_mask & (1UL << i)) {
     676             :                         /*
     677             :                          * @ss is in this hierarchy, so we want the
     678             :                          * effective css from @cgrp.
     679             :                          */
     680           0 :                         template[i] = cgroup_e_css(cgrp, ss);
     681             :                 } else {
     682             :                         /*
     683             :                          * @ss is not in this hierarchy, so we don't want
     684             :                          * to change the css.
     685             :                          */
     686           0 :                         template[i] = old_cset->subsys[i];
     687             :                 }
     688             :         }
     689             : 
     690             :         key = css_set_hash(template);
     691           0 :         hash_for_each_possible(css_set_table, cset, hlist, key) {
     692           0 :                 if (!compare_css_sets(cset, old_cset, cgrp, template))
     693           0 :                         continue;
     694             : 
     695             :                 /* This css_set matches what we need */
     696             :                 return cset;
     697             :         }
     698             : 
       699             :         /* No existing css_set matched */
     700             :         return NULL;
     701             : }
     702             : 
     703           1 : static void free_cgrp_cset_links(struct list_head *links_to_free)
     704             : {
     705             :         struct cgrp_cset_link *link, *tmp_link;
     706             : 
     707           1 :         list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
     708             :                 list_del(&link->cset_link);
     709           0 :                 kfree(link);
     710             :         }
     711           1 : }
     712             : 
     713             : /**
     714             :  * allocate_cgrp_cset_links - allocate cgrp_cset_links
     715             :  * @count: the number of links to allocate
     716             :  * @tmp_links: list_head the allocated links are put on
     717             :  *
     718             :  * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
     719             :  * through ->cset_link.  Returns 0 on success or -errno.
     720             :  */
     721           1 : static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
     722             : {
     723             :         struct cgrp_cset_link *link;
     724             :         int i;
     725             : 
     726             :         INIT_LIST_HEAD(tmp_links);
     727             : 
     728           2 :         for (i = 0; i < count; i++) {
     729             :                 link = kzalloc(sizeof(*link), GFP_KERNEL);
     730           1 :                 if (!link) {
     731           0 :                         free_cgrp_cset_links(tmp_links);
     732           0 :                         return -ENOMEM;
     733             :                 }
     734           1 :                 list_add(&link->cset_link, tmp_links);
     735             :         }
     736             :         return 0;
     737             : }
     738             : 
     739             : /**
     740             :  * link_css_set - a helper function to link a css_set to a cgroup
     741             :  * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
     742             :  * @cset: the css_set to be linked
     743             :  * @cgrp: the destination cgroup
     744             :  */
     745           1 : static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
     746           1 :                          struct cgroup *cgrp)
     747             : {
     748             :         struct cgrp_cset_link *link;
     749             : 
     750             :         BUG_ON(list_empty(tmp_links));
     751             : 
     752           1 :         if (cgroup_on_dfl(cgrp))
     753           1 :                 cset->dfl_cgrp = cgrp;
     754             : 
     755           1 :         link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
     756           1 :         link->cset = cset;
     757           1 :         link->cgrp = cgrp;
     758             : 
     759           2 :         if (list_empty(&cgrp->cset_links))
     760           1 :                 cgroup_update_populated(cgrp, true);
     761           1 :         list_move(&link->cset_link, &cgrp->cset_links);
     762             : 
     763             :         /*
     764             :          * Always add links to the tail of the list so that the list
     765             :          * is sorted by order of hierarchy creation
     766             :          */
     767           1 :         list_add_tail(&link->cgrp_link, &cset->cgrp_links);
     768           1 : }
     769             : 
     770             : /**
     771             :  * find_css_set - return a new css_set with one cgroup updated
     772             :  * @old_cset: the baseline css_set
     773             :  * @cgrp: the cgroup to be updated
     774             :  *
     775             :  * Return a new css_set that's equivalent to @old_cset, but with @cgrp
     776             :  * substituted into the appropriate hierarchy.
     777             :  */
     778           0 : static struct css_set *find_css_set(struct css_set *old_cset,
     779             :                                     struct cgroup *cgrp)
     780             : {
     781           0 :         struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
     782             :         struct css_set *cset;
     783             :         struct list_head tmp_links;
     784             :         struct cgrp_cset_link *link;
     785             :         struct cgroup_subsys *ss;
     786             :         unsigned long key;
     787             :         int ssid;
     788             : 
     789             :         lockdep_assert_held(&cgroup_mutex);
     790             : 
       791             :         /* First see if we already have a css_set that matches
     792             :          * the desired set */
     793           0 :         down_read(&css_set_rwsem);
     794           0 :         cset = find_existing_css_set(old_cset, cgrp, template);
     795           0 :         if (cset)
     796             :                 get_css_set(cset);
     797           0 :         up_read(&css_set_rwsem);
     798             : 
     799           0 :         if (cset)
     800             :                 return cset;
     801             : 
     802             :         cset = kzalloc(sizeof(*cset), GFP_KERNEL);
     803           0 :         if (!cset)
     804             :                 return NULL;
     805             : 
     806             :         /* Allocate all the cgrp_cset_link objects that we'll need */
     807           0 :         if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
     808           0 :                 kfree(cset);
     809           0 :                 return NULL;
     810             :         }
     811             : 
     812           0 :         atomic_set(&cset->refcount, 1);
     813           0 :         INIT_LIST_HEAD(&cset->cgrp_links);
     814           0 :         INIT_LIST_HEAD(&cset->tasks);
     815           0 :         INIT_LIST_HEAD(&cset->mg_tasks);
     816           0 :         INIT_LIST_HEAD(&cset->mg_preload_node);
     817           0 :         INIT_LIST_HEAD(&cset->mg_node);
     818             :         INIT_HLIST_NODE(&cset->hlist);
     819             : 
     820             :         /* Copy the set of subsystem state objects generated in
     821             :          * find_existing_css_set() */
     822           0 :         memcpy(cset->subsys, template, sizeof(cset->subsys));
     823             : 
     824           0 :         down_write(&css_set_rwsem);
     825             :         /* Add reference counts and links from the new css_set. */
     826           0 :         list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
     827           0 :                 struct cgroup *c = link->cgrp;
     828             : 
     829           0 :                 if (c->root == cgrp->root)
     830             :                         c = cgrp;
     831           0 :                 link_css_set(&tmp_links, cset, c);
     832             :         }
     833             : 
     834             :         BUG_ON(!list_empty(&tmp_links));
     835             : 
     836           0 :         css_set_count++;
     837             : 
     838             :         /* Add @cset to the hash table */
     839             :         key = css_set_hash(cset->subsys);
     840           0 :         hash_add(css_set_table, &cset->hlist, key);
     841             : 
     842           0 :         for_each_subsys(ss, ssid)
     843           0 :                 list_add_tail(&cset->e_cset_node[ssid],
     844           0 :                               &cset->subsys[ssid]->cgroup->e_csets[ssid]);
     845             : 
     846           0 :         up_write(&css_set_rwsem);
     847             : 
     848           0 :         return cset;
     849             : }
     850             : 
     851             : static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
     852             : {
     853           0 :         struct cgroup *root_cgrp = kf_root->kn->priv;
     854             : 
     855           0 :         return root_cgrp->root;
     856             : }
     857             : 
     858           1 : static int cgroup_init_root_id(struct cgroup_root *root)
     859             : {
     860             :         int id;
     861             : 
     862             :         lockdep_assert_held(&cgroup_mutex);
     863             : 
     864           1 :         id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
     865           1 :         if (id < 0)
     866             :                 return id;
     867             : 
     868           1 :         root->hierarchy_id = id;
     869           1 :         return 0;
     870             : }
     871             : 
     872             : static void cgroup_exit_root_id(struct cgroup_root *root)
     873             : {
     874             :         lockdep_assert_held(&cgroup_mutex);
     875             : 
     876           0 :         if (root->hierarchy_id) {
     877           0 :                 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
     878           0 :                 root->hierarchy_id = 0;
     879             :         }
     880             : }
     881             : 
     882           0 : static void cgroup_free_root(struct cgroup_root *root)
     883             : {
     884           0 :         if (root) {
       885             :                 /* hierarchy ID should already have been released */
     886             :                 WARN_ON_ONCE(root->hierarchy_id);
     887             : 
     888           0 :                 idr_destroy(&root->cgroup_idr);
     889           0 :                 kfree(root);
     890             :         }
     891           0 : }
     892             : 
     893           0 : static void cgroup_destroy_root(struct cgroup_root *root)
     894             : {
     895             :         struct cgroup *cgrp = &root->cgrp;
     896             :         struct cgrp_cset_link *link, *tmp_link;
     897             : 
     898           0 :         mutex_lock(&cgroup_mutex);
     899             : 
     900           0 :         BUG_ON(atomic_read(&root->nr_cgrps));
     901             :         BUG_ON(!list_empty(&cgrp->self.children));
     902             : 
     903             :         /* Rebind all subsystems back to the default hierarchy */
     904           0 :         rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
     905             : 
     906             :         /*
     907             :          * Release all the links from cset_links to this hierarchy's
     908             :          * root cgroup
     909             :          */
     910           0 :         down_write(&css_set_rwsem);
     911             : 
     912           0 :         list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
     913             :                 list_del(&link->cset_link);
     914             :                 list_del(&link->cgrp_link);
     915           0 :                 kfree(link);
     916             :         }
     917           0 :         up_write(&css_set_rwsem);
     918             : 
     919           0 :         if (!list_empty(&root->root_list)) {
     920             :                 list_del(&root->root_list);
     921           0 :                 cgroup_root_count--;
     922             :         }
     923             : 
     924             :         cgroup_exit_root_id(root);
     925             : 
     926           0 :         mutex_unlock(&cgroup_mutex);
     927             : 
     928           0 :         kernfs_destroy_root(root->kf_root);
     929           0 :         cgroup_free_root(root);
     930           0 : }
     931             : 
     932             : /* look up cgroup associated with given css_set on the specified hierarchy */
     933             : static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
     934             :                                             struct cgroup_root *root)
     935             : {
     936             :         struct cgroup *res = NULL;
     937             : 
     938             :         lockdep_assert_held(&cgroup_mutex);
     939             :         lockdep_assert_held(&css_set_rwsem);
     940             : 
     941           0 :         if (cset == &init_css_set) {
     942           0 :                 res = &root->cgrp;
     943             :         } else {
     944             :                 struct cgrp_cset_link *link;
     945             : 
     946           0 :                 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
     947           0 :                         struct cgroup *c = link->cgrp;
     948             : 
     949           0 :                         if (c->root == root) {
     950             :                                 res = c;
     951             :                                 break;
     952             :                         }
     953             :                 }
     954             :         }
     955             : 
     956             :         BUG_ON(!res);
     957             :         return res;
     958             : }
     959             : 
     960             : /*
     961             :  * Return the cgroup for "task" from the given hierarchy. Must be
     962             :  * called with cgroup_mutex and css_set_rwsem held.
     963             :  */
     964             : static struct cgroup *task_cgroup_from_root(struct task_struct *task,
     965             :                                             struct cgroup_root *root)
     966             : {
     967             :         /*
     968             :          * No need to lock the task - since we hold cgroup_mutex the
     969             :          * task can't change groups, so the only thing that can happen
     970             :          * is that it exits and its css is set back to init_css_set.
     971             :          */
     972             :         return cset_cgroup_from_root(task_css_set(task), root);
     973             : }
     974             : 
     975             : /*
     976             :  * A task must hold cgroup_mutex to modify cgroups.
     977             :  *
     978             :  * Any task can increment and decrement the count field without lock.
     979             :  * So in general, code holding cgroup_mutex can't rely on the count
     980             :  * field not changing.  However, if the count goes to zero, then only
     981             :  * cgroup_attach_task() can increment it again.  Because a count of zero
     982             :  * means that no tasks are currently attached, therefore there is no
     983             :  * way a task attached to that cgroup can fork (the other way to
     984             :  * increment the count).  So code holding cgroup_mutex can safely
     985             :  * assume that if the count is zero, it will stay zero. Similarly, if
     986             :  * a task holds cgroup_mutex on a cgroup with zero count, it
     987             :  * knows that the cgroup won't be removed, as cgroup_rmdir()
     988             :  * needs that mutex.
     989             :  *
     990             :  * A cgroup can only be deleted if both its 'count' of using tasks
     991             :  * is zero, and its list of 'children' cgroups is empty.  Since all
       992             :  * tasks in the system use _some_ cgroup, and since there is always at
       993             :  * least one task in the system (init, pid == 1), the root cgroup
       994             :  * always has child cgroups and/or attached tasks.  So we don't
       995             :  * need a special hack to ensure that the root cgroup cannot be deleted.
     996             :  *
     997             :  * P.S.  One more locking exception.  RCU is used to guard the
      998             :  * update of a task's cgroup pointer by cgroup_attach_task().
     999             :  */
    1000             : 
    1001             : static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
    1002             : static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
    1003             : static const struct file_operations proc_cgroupstats_operations;
    1004             : 
    1005           3 : static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
    1006             :                               char *buf)
    1007             : {
    1008           3 :         if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
    1009           0 :             !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
    1010           0 :                 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
    1011           0 :                          cft->ss->name, cft->name);
    1012             :         else
    1013           3 :                 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
    1014           3 :         return buf;
    1015             : }
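
For illustration, a minimal userspace sketch of the naming rule above; FILE_NAME_MAX, file_name() and the demo values are assumptions, not taken from cgroup.c. A cftype named "shares" owned by the cpu subsystem yields "cpu.shares", while the no-prefix paths keep the bare name.

#include <stdio.h>
#include <string.h>

#define FILE_NAME_MAX 128                       /* assumed demo size */

/* mirror of the prefixing rule: "<ss>.<name>" unless prefixing is off */
static char *file_name(const char *ss_name, const char *cft_name,
                       int no_prefix, char *buf)
{
        if (ss_name && !no_prefix)
                snprintf(buf, FILE_NAME_MAX, "%s.%s", ss_name, cft_name);
        else
                strncpy(buf, cft_name, FILE_NAME_MAX);
        return buf;
}

int main(void)
{
        char buf[FILE_NAME_MAX];

        printf("%s\n", file_name("cpu", "shares", 0, buf)); /* cpu.shares */
        printf("%s\n", file_name(NULL, "tasks", 0, buf));   /* tasks */
        return 0;
}
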
    1016             : 
    1017             : /**
    1018             :  * cgroup_file_mode - deduce file mode of a control file
    1019             :  * @cft: the control file in question
    1020             :  *
    1021             :  * returns cft->mode if ->mode is not 0
    1022             :  * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
    1023             :  * returns S_IRUGO if it has only a read handler
     1024             :  * returns S_IWUSR if it has only a write handler
    1025             :  */
    1026           3 : static umode_t cgroup_file_mode(const struct cftype *cft)
    1027             : {
    1028             :         umode_t mode = 0;
    1029             : 
    1030           3 :         if (cft->mode)
    1031             :                 return cft->mode;
    1032             : 
    1033           2 :         if (cft->read_u64 || cft->read_s64 || cft->seq_show)
    1034             :                 mode |= S_IRUGO;
    1035             : 
    1036           2 :         if (cft->write_u64 || cft->write_s64 || cft->write)
    1037           1 :                 mode |= S_IWUSR;
    1038             : 
    1039           2 :         return mode;
    1040             : }
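
A hedged standalone sketch of the same mode deduction in userspace C; file_mode() and its boolean flags are demo stand-ins, with S_IRUGO and S_IWUSR given their usual octal values.

#include <stdio.h>

#define S_IRUGO 0444
#define S_IWUSR 0200

static unsigned int file_mode(int has_read, int has_write,
                              unsigned int forced)
{
        unsigned int mode = 0;

        if (forced)
                return forced;          /* a non-zero cft->mode wins */
        if (has_read)
                mode |= S_IRUGO;
        if (has_write)
                mode |= S_IWUSR;
        return mode;
}

int main(void)
{
        printf("%04o\n", file_mode(1, 1, 0));   /* 0644: read + write */
        printf("%04o\n", file_mode(1, 0, 0));   /* 0444: read only */
        printf("%04o\n", file_mode(0, 1, 0));   /* 0200: write only */
        return 0;
}
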
    1041             : 
    1042           7 : static void cgroup_get(struct cgroup *cgrp)
    1043             : {
    1044             :         WARN_ON_ONCE(cgroup_is_dead(cgrp));
    1045             :         css_get(&cgrp->self);
    1046           7 : }
    1047             : 
    1048           0 : static bool cgroup_tryget(struct cgroup *cgrp)
    1049             : {
    1050           0 :         return css_tryget(&cgrp->self);
    1051             : }
    1052             : 
    1053           0 : static void cgroup_put(struct cgroup *cgrp)
    1054             : {
    1055             :         css_put(&cgrp->self);
    1056           0 : }
    1057             : 
    1058             : /**
    1059             :  * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
    1060             :  * @cgrp: the target cgroup
    1061             :  * @subtree_control: the new subtree_control mask to consider
    1062             :  *
    1063             :  * On the default hierarchy, a subsystem may request other subsystems to be
    1064             :  * enabled together through its ->depends_on mask.  In such cases, more
    1065             :  * subsystems than specified in "cgroup.subtree_control" may be enabled.
    1066             :  *
    1067             :  * This function calculates which subsystems need to be enabled if
    1068             :  * @subtree_control is to be applied to @cgrp.  The returned mask is always
    1069             :  * a superset of @subtree_control and follows the usual hierarchy rules.
    1070             :  */
    1071           0 : static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
    1072             :                                                   unsigned int subtree_control)
    1073             : {
    1074             :         struct cgroup *parent = cgroup_parent(cgrp);
    1075             :         unsigned int cur_ss_mask = subtree_control;
    1076             :         struct cgroup_subsys *ss;
    1077             :         int ssid;
    1078             : 
    1079             :         lockdep_assert_held(&cgroup_mutex);
    1080             : 
    1081           0 :         if (!cgroup_on_dfl(cgrp))
    1082             :                 return cur_ss_mask;
    1083             : 
    1084             :         while (true) {
    1085             :                 unsigned int new_ss_mask = cur_ss_mask;
    1086             : 
    1087           0 :                 for_each_subsys(ss, ssid)
    1088           0 :                         if (cur_ss_mask & (1 << ssid))
    1089           0 :                                 new_ss_mask |= ss->depends_on;
    1090             : 
    1091             :                 /*
    1092             :                  * Mask out subsystems which aren't available.  This can
    1093             :                  * happen only if some depended-upon subsystems were bound
    1094             :                  * to non-default hierarchies.
    1095             :                  */
    1096           0 :                 if (parent)
    1097           0 :                         new_ss_mask &= parent->child_subsys_mask;
    1098             :                 else
    1099           0 :                         new_ss_mask &= cgrp->root->subsys_mask;
    1100             : 
    1101           0 :                 if (new_ss_mask == cur_ss_mask)
    1102             :                         break;
    1103             :                 cur_ss_mask = new_ss_mask;
    1104             :         }
    1105             : 
    1106             :         return cur_ss_mask;
    1107             : }
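
The loop above is a fixed-point computation: OR in ->depends_on for every enabled subsystem, mask out what isn't available, and stop once the mask no longer grows. A sketch under that reading, with an invented depends_on[] table (the dependency values are illustrative, not the kernel's):

#include <stdio.h>

#define NSS 4
static const unsigned int depends_on[NSS] = {
        0x0,    /* ss0 depends on nothing */
        0x1,    /* ss1 depends on ss0 */
        0x2,    /* ss2 depends on ss1, transitively on ss0 */
        0x0,
};

static unsigned int calc_closure(unsigned int mask, unsigned int available)
{
        for (;;) {
                unsigned int new_mask = mask;
                int i;

                for (i = 0; i < NSS; i++)
                        if (mask & (1u << i))
                                new_mask |= depends_on[i];
                new_mask &= available;  /* drop unavailable subsystems */
                if (new_mask == mask)
                        return mask;    /* reached a fixed point */
                mask = new_mask;
        }
}

int main(void)
{
        /* enabling ss2 pulls in ss1 and then ss0: 0x4 -> 0x7 */
        printf("0x%x\n", calc_closure(0x4, 0xf));
        return 0;
}
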
    1108             : 
    1109             : /**
    1110             :  * cgroup_refresh_child_subsys_mask - update child_subsys_mask
    1111             :  * @cgrp: the target cgroup
    1112             :  *
    1113             :  * Update @cgrp->child_subsys_mask according to the current
    1114             :  * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
    1115             :  */
    1116             : static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
    1117             : {
    1118           0 :         cgrp->child_subsys_mask =
    1119           0 :                 cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
    1120             : }
    1121             : 
    1122             : /**
    1123             :  * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
    1124             :  * @kn: the kernfs_node being serviced
    1125             :  *
    1126             :  * This helper undoes cgroup_kn_lock_live() and should be invoked before
    1127             :  * the method finishes if locking succeeded.  Note that once this function
    1128             :  * returns the cgroup returned by cgroup_kn_lock_live() may become
     1129             :  * inaccessible at any time.  If the caller intends to continue to access the
    1130             :  * cgroup, it should pin it before invoking this function.
    1131             :  */
    1132           0 : static void cgroup_kn_unlock(struct kernfs_node *kn)
    1133             : {
    1134             :         struct cgroup *cgrp;
    1135             : 
    1136           0 :         if (kernfs_type(kn) == KERNFS_DIR)
    1137           0 :                 cgrp = kn->priv;
    1138             :         else
    1139           0 :                 cgrp = kn->parent->priv;
    1140             : 
    1141           0 :         mutex_unlock(&cgroup_mutex);
    1142             : 
    1143           0 :         kernfs_unbreak_active_protection(kn);
    1144           0 :         cgroup_put(cgrp);
    1145           0 : }
    1146             : 
    1147             : /**
    1148             :  * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
    1149             :  * @kn: the kernfs_node being serviced
    1150             :  *
    1151             :  * This helper is to be used by a cgroup kernfs method currently servicing
    1152             :  * @kn.  It breaks the active protection, performs cgroup locking and
    1153             :  * verifies that the associated cgroup is alive.  Returns the cgroup if
    1154             :  * alive; otherwise, %NULL.  A successful return should be undone by a
    1155             :  * matching cgroup_kn_unlock() invocation.
    1156             :  *
    1157             :  * Any cgroup kernfs method implementation which requires locking the
    1158             :  * associated cgroup should use this helper.  It avoids nesting cgroup
    1159             :  * locking under kernfs active protection and allows all kernfs operations
    1160             :  * including self-removal.
    1161             :  */
    1162           0 : static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
    1163             : {
    1164           0 :         struct cgroup *cgrp;
    1165             : 
    1166           0 :         if (kernfs_type(kn) == KERNFS_DIR)
    1167           0 :                 cgrp = kn->priv;
    1168             :         else
    1169           0 :                 cgrp = kn->parent->priv;
    1170             : 
    1171             :         /*
     1172             :          * We're going to grab cgroup_mutex, which nests outside kernfs
     1173             :          * active_ref.  The cgroup liveness check alone provides enough
    1174             :          * protection against removal.  Ensure @cgrp stays accessible and
    1175             :          * break the active_ref protection.
    1176             :          */
    1177           0 :         if (!cgroup_tryget(cgrp))
    1178             :                 return NULL;
    1179           0 :         kernfs_break_active_protection(kn);
    1180             : 
    1181           0 :         mutex_lock(&cgroup_mutex);
    1182             : 
    1183           0 :         if (!cgroup_is_dead(cgrp))
    1184             :                 return cgrp;
    1185             : 
    1186           0 :         cgroup_kn_unlock(kn);
    1187           0 :         return NULL;
    1188             : }
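
A hedged usage sketch for the pair above: example_kn_write() is a hypothetical kernfs write handler, not part of cgroup.c, showing the intended lock/work/unlock shape.

static ssize_t example_kn_write(struct kernfs_open_file *of, char *buf,
                                size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;

        cgrp = cgroup_kn_lock_live(of->kn);
        if (!cgrp)
                return -ENODEV;         /* the cgroup died under us */

        /* ... operate on @cgrp with cgroup_mutex held ... */

        cgroup_kn_unlock(of->kn);       /* @cgrp may vanish after this */
        return nbytes;
}
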
    1189             : 
    1190           0 : static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
    1191             : {
    1192             :         char name[CGROUP_FILE_NAME_MAX];
    1193             : 
    1194             :         lockdep_assert_held(&cgroup_mutex);
    1195           0 :         kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
    1196           0 : }
    1197             : 
    1198             : /**
    1199             :  * cgroup_clear_dir - remove subsys files in a cgroup directory
    1200             :  * @cgrp: target cgroup
    1201             :  * @subsys_mask: mask of the subsystem ids whose files should be removed
    1202             :  */
    1203           0 : static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
    1204             : {
    1205             :         struct cgroup_subsys *ss;
    1206             :         int i;
    1207             : 
    1208           0 :         for_each_subsys(ss, i) {
    1209             :                 struct cftype *cfts;
    1210             : 
    1211           0 :                 if (!(subsys_mask & (1 << i)))
    1212           0 :                         continue;
    1213           0 :                 list_for_each_entry(cfts, &ss->cfts, node)
    1214           0 :                         cgroup_addrm_files(cgrp, cfts, false);
    1215             :         }
    1216           0 : }
    1217             : 
    1218           1 : static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
    1219             : {
    1220             :         struct cgroup_subsys *ss;
    1221             :         unsigned int tmp_ss_mask;
    1222             :         int ssid, i, ret;
    1223             : 
    1224             :         lockdep_assert_held(&cgroup_mutex);
    1225             : 
    1226           8 :         for_each_subsys(ss, ssid) {
    1227           7 :                 if (!(ss_mask & (1 << ssid)))
    1228           7 :                         continue;
    1229             : 
    1230             :                 /* if @ss has non-root csses attached to it, can't move */
    1231           0 :                 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
    1232             :                         return -EBUSY;
    1233             : 
    1234             :                 /* can't move between two non-dummy roots either */
    1235           0 :                 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
    1236             :                         return -EBUSY;
    1237             :         }
    1238             : 
    1239             :         /* skip creating root files on dfl_root for inhibited subsystems */
    1240             :         tmp_ss_mask = ss_mask;
    1241           1 :         if (dst_root == &cgrp_dfl_root)
    1242           1 :                 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
    1243             : 
    1244           1 :         ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
    1245           1 :         if (ret) {
    1246           0 :                 if (dst_root != &cgrp_dfl_root)
    1247             :                         return ret;
    1248             : 
    1249             :                 /*
    1250             :                  * Rebinding back to the default root is not allowed to
    1251             :                  * fail.  Using both default and non-default roots should
    1252             :                  * be rare.  Moving subsystems back and forth even more so.
    1253             :                  * Just warn about it and continue.
    1254             :                  */
    1255           0 :                 if (cgrp_dfl_root_visible) {
    1256           0 :                         pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
    1257             :                                 ret, ss_mask);
    1258           0 :                         pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
    1259             :                 }
    1260             :         }
    1261             : 
    1262             :         /*
    1263             :          * Nothing can fail from this point on.  Remove files for the
    1264             :          * removed subsystems and rebind each subsystem.
    1265             :          */
    1266           7 :         for_each_subsys(ss, ssid)
    1267           7 :                 if (ss_mask & (1 << ssid))
    1268           0 :                         cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
    1269             : 
    1270           7 :         for_each_subsys(ss, ssid) {
    1271             :                 struct cgroup_root *src_root;
    1272             :                 struct cgroup_subsys_state *css;
    1273             :                 struct css_set *cset;
    1274             : 
    1275           7 :                 if (!(ss_mask & (1 << ssid)))
    1276           7 :                         continue;
    1277             : 
    1278           0 :                 src_root = ss->root;
    1279             :                 css = cgroup_css(&src_root->cgrp, ss);
    1280             : 
    1281           0 :                 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
    1282             : 
    1283           0 :                 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
    1284           0 :                 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
    1285           0 :                 ss->root = dst_root;
    1286           0 :                 css->cgroup = &dst_root->cgrp;
    1287             : 
    1288           0 :                 down_write(&css_set_rwsem);
    1289           0 :                 hash_for_each(css_set_table, i, cset, hlist)
    1290           0 :                         list_move_tail(&cset->e_cset_node[ss->id],
    1291             :                                        &dst_root->cgrp.e_csets[ss->id]);
    1292           0 :                 up_write(&css_set_rwsem);
    1293             : 
    1294           0 :                 src_root->subsys_mask &= ~(1 << ssid);
    1295           0 :                 src_root->cgrp.subtree_control &= ~(1 << ssid);
    1296           0 :                 cgroup_refresh_child_subsys_mask(&src_root->cgrp);
    1297             : 
    1298             :                 /* default hierarchy doesn't enable controllers by default */
    1299           0 :                 dst_root->subsys_mask |= 1 << ssid;
    1300           0 :                 if (dst_root != &cgrp_dfl_root) {
    1301           0 :                         dst_root->cgrp.subtree_control |= 1 << ssid;
    1302             :                         cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
    1303             :                 }
    1304             : 
    1305           0 :                 if (ss->bind)
    1306           0 :                         ss->bind(css);
    1307             :         }
    1308             : 
    1309           1 :         kernfs_activate(dst_root->cgrp.kn);
    1310           1 :         return 0;
    1311             : }
    1312             : 
    1313           0 : static int cgroup_show_options(struct seq_file *seq,
    1314           0 :                                struct kernfs_root *kf_root)
    1315             : {
    1316             :         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
    1317             :         struct cgroup_subsys *ss;
    1318             :         int ssid;
    1319             : 
    1320           0 :         for_each_subsys(ss, ssid)
    1321           0 :                 if (root->subsys_mask & (1 << ssid))
    1322           0 :                         seq_printf(seq, ",%s", ss->name);
    1323           0 :         if (root->flags & CGRP_ROOT_NOPREFIX)
    1324           0 :                 seq_puts(seq, ",noprefix");
    1325           0 :         if (root->flags & CGRP_ROOT_XATTR)
    1326           0 :                 seq_puts(seq, ",xattr");
    1327             : 
    1328             :         spin_lock(&release_agent_path_lock);
    1329           0 :         if (strlen(root->release_agent_path))
    1330           0 :                 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
    1331             :         spin_unlock(&release_agent_path_lock);
    1332             : 
    1333           0 :         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
    1334           0 :                 seq_puts(seq, ",clone_children");
    1335           0 :         if (strlen(root->name))
    1336           0 :                 seq_printf(seq, ",name=%s", root->name);
    1337           0 :         return 0;
    1338             : }
    1339             : 
    1340             : struct cgroup_sb_opts {
    1341             :         unsigned int subsys_mask;
    1342             :         unsigned int flags;
    1343             :         char *release_agent;
    1344             :         bool cpuset_clone_children;
    1345             :         char *name;
    1346             :         /* User explicitly requested empty subsystem */
    1347             :         bool none;
    1348             : };
    1349             : 
    1350           1 : static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
    1351             : {
    1352           1 :         char *token, *o = data;
    1353             :         bool all_ss = false, one_ss = false;
    1354             :         unsigned int mask = -1U;
    1355             :         struct cgroup_subsys *ss;
    1356             :         int nr_opts = 0;
    1357             :         int i;
    1358             : 
    1359             : #ifdef CONFIG_CPUSETS
    1360             :         mask = ~(1U << cpuset_cgrp_id);
    1361             : #endif
    1362             : 
    1363           1 :         memset(opts, 0, sizeof(*opts));
    1364             : 
    1365           2 :         while ((token = strsep(&o, ",")) != NULL) {
    1366           1 :                 nr_opts++;
    1367             : 
    1368           1 :                 if (!*token)
    1369             :                         return -EINVAL;
    1370           1 :                 if (!strcmp(token, "none")) {
    1371             :                         /* Explicitly have no subsystems */
    1372           0 :                         opts->none = true;
    1373           0 :                         continue;
    1374             :                 }
    1375           1 :                 if (!strcmp(token, "all")) {
    1376             :                         /* Mutually exclusive option 'all' + subsystem name */
    1377           0 :                         if (one_ss)
    1378             :                                 return -EINVAL;
    1379             :                         all_ss = true;
    1380           0 :                         continue;
    1381             :                 }
    1382           1 :                 if (!strcmp(token, "__DEVEL__sane_behavior")) {
    1383           0 :                         opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
    1384           0 :                         continue;
    1385             :                 }
    1386           1 :                 if (!strcmp(token, "noprefix")) {
    1387           0 :                         opts->flags |= CGRP_ROOT_NOPREFIX;
    1388           0 :                         continue;
    1389             :                 }
    1390           1 :                 if (!strcmp(token, "clone_children")) {
    1391           0 :                         opts->cpuset_clone_children = true;
    1392           0 :                         continue;
    1393             :                 }
    1394           1 :                 if (!strcmp(token, "xattr")) {
    1395           0 :                         opts->flags |= CGRP_ROOT_XATTR;
    1396           0 :                         continue;
    1397             :                 }
    1398           1 :                 if (!strncmp(token, "release_agent=", 14)) {
    1399             :                         /* Specifying two release agents is forbidden */
    1400           0 :                         if (opts->release_agent)
    1401             :                                 return -EINVAL;
    1402           0 :                         opts->release_agent =
    1403           0 :                                 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
    1404           0 :                         if (!opts->release_agent)
    1405             :                                 return -ENOMEM;
    1406           0 :                         continue;
    1407             :                 }
    1408           1 :                 if (!strncmp(token, "name=", 5)) {
    1409           0 :                         const char *name = token + 5;
    1410             :                         /* Can't specify an empty name */
    1411           0 :                         if (!strlen(name))
    1412             :                                 return -EINVAL;
    1413             :                         /* Must match [\w.-]+ */
    1414           0 :                         for (i = 0; i < strlen(name); i++) {
    1415           0 :                                 char c = name[i];
    1416           0 :                                 if (isalnum(c))
    1417           0 :                                         continue;
    1418           0 :                                 if ((c == '.') || (c == '-') || (c == '_'))
    1419           0 :                                         continue;
    1420             :                                 return -EINVAL;
    1421             :                         }
    1422             :                         /* Specifying two names is forbidden */
    1423           0 :                         if (opts->name)
    1424             :                                 return -EINVAL;
    1425           0 :                         opts->name = kstrndup(name,
    1426             :                                               MAX_CGROUP_ROOT_NAMELEN - 1,
    1427             :                                               GFP_KERNEL);
    1428           0 :                         if (!opts->name)
    1429             :                                 return -ENOMEM;
    1430             : 
    1431           0 :                         continue;
    1432             :                 }
    1433             : 
    1434           7 :                 for_each_subsys(ss, i) {
    1435           7 :                         if (strcmp(token, ss->name))
    1436           6 :                                 continue;
    1437           1 :                         if (ss->disabled)
    1438           1 :                                 continue;
    1439             : 
    1440             :                         /* Mutually exclusive option 'all' + subsystem name */
    1441           0 :                         if (all_ss)
    1442             :                                 return -EINVAL;
    1443           0 :                         opts->subsys_mask |= (1 << i);
    1444             :                         one_ss = true;
    1445             : 
    1446           0 :                         break;
    1447             :                 }
    1448           1 :                 if (i == CGROUP_SUBSYS_COUNT)
    1449             :                         return -ENOENT;
    1450             :         }
    1451             : 
    1452           0 :         if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
    1453           0 :                 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
    1454           0 :                 if (nr_opts != 1) {
    1455           0 :                         pr_err("sane_behavior: no other mount options allowed\n");
    1456           0 :                         return -EINVAL;
    1457             :                 }
    1458             :                 return 0;
    1459             :         }
    1460             : 
    1461             :         /*
     1462             :          * If the 'all' option was specified, select all the subsystems;
     1463             :          * otherwise, if the 'none', 'name=' and subsystem name options
     1464             :          * were not specified, default to 'all'.
    1465             :          */
    1466           0 :         if (all_ss || (!one_ss && !opts->none && !opts->name))
    1467           0 :                 for_each_subsys(ss, i)
    1468           0 :                         if (!ss->disabled)
    1469           0 :                                 opts->subsys_mask |= (1 << i);
    1470             : 
    1471             :         /*
     1472             :          * We have to specify either by name or by subsystems (so all
     1473             :          * empty hierarchies must have a name).
    1474             :          */
    1475           0 :         if (!opts->subsys_mask && !opts->name)
    1476             :                 return -EINVAL;
    1477             : 
    1478             :         /*
    1479             :          * Option noprefix was introduced just for backward compatibility
    1480             :          * with the old cpuset, so we allow noprefix only if mounting just
    1481             :          * the cpuset subsystem.
    1482             :          */
    1483           0 :         if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
    1484             :                 return -EINVAL;
    1485             : 
    1486             :         /* Can't specify "none" and some subsystems */
    1487           0 :         if (opts->subsys_mask && opts->none)
    1488             :                 return -EINVAL;
    1489             : 
    1490           0 :         return 0;
    1491             : }
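
For illustration, the strsep() tokenizing loop above in miniature, as a runnable userspace sketch; the option string is invented.

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
        char data[] = "noprefix,name=mygrp,cpuset";    /* made-up options */
        char *o = data, *token;

        while ((token = strsep(&o, ",")) != NULL) {
                if (!*token)
                        continue;       /* the kernel returns -EINVAL here */
                if (!strncmp(token, "name=", 5))
                        printf("name option: \"%s\"\n", token + 5);
                else
                        printf("flag or subsystem: \"%s\"\n", token);
        }
        return 0;
}
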
    1492             : 
    1493           0 : static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
    1494             : {
    1495             :         int ret = 0;
    1496             :         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
    1497             :         struct cgroup_sb_opts opts;
    1498             :         unsigned int added_mask, removed_mask;
    1499             : 
    1500           0 :         if (root == &cgrp_dfl_root) {
    1501           0 :                 pr_err("remount is not allowed\n");
    1502           0 :                 return -EINVAL;
    1503             :         }
    1504             : 
    1505           0 :         mutex_lock(&cgroup_mutex);
    1506             : 
    1507             :         /* See what subsystems are wanted */
    1508           0 :         ret = parse_cgroupfs_options(data, &opts);
    1509           0 :         if (ret)
    1510             :                 goto out_unlock;
    1511             : 
    1512           0 :         if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
    1513           0 :                 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
    1514             :                         task_tgid_nr(current), current->comm);
    1515             : 
    1516           0 :         added_mask = opts.subsys_mask & ~root->subsys_mask;
    1517           0 :         removed_mask = root->subsys_mask & ~opts.subsys_mask;
    1518             : 
    1519             :         /* Don't allow flags or name to change at remount */
    1520           0 :         if ((opts.flags ^ root->flags) ||
    1521           0 :             (opts.name && strcmp(opts.name, root->name))) {
    1522           0 :                 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
    1523             :                        opts.flags, opts.name ?: "", root->flags, root->name);
    1524             :                 ret = -EINVAL;
    1525           0 :                 goto out_unlock;
    1526             :         }
    1527             : 
    1528             :         /* remounting is not allowed for populated hierarchies */
    1529           0 :         if (!list_empty(&root->cgrp.self.children)) {
    1530             :                 ret = -EBUSY;
    1531             :                 goto out_unlock;
    1532             :         }
    1533             : 
    1534           0 :         ret = rebind_subsystems(root, added_mask);
    1535           0 :         if (ret)
    1536             :                 goto out_unlock;
    1537             : 
    1538           0 :         rebind_subsystems(&cgrp_dfl_root, removed_mask);
    1539             : 
    1540           0 :         if (opts.release_agent) {
    1541             :                 spin_lock(&release_agent_path_lock);
    1542           0 :                 strcpy(root->release_agent_path, opts.release_agent);
    1543             :                 spin_unlock(&release_agent_path_lock);
    1544             :         }
    1545             :  out_unlock:
    1546           0 :         kfree(opts.release_agent);
    1547           0 :         kfree(opts.name);
    1548           0 :         mutex_unlock(&cgroup_mutex);
    1549           0 :         return ret;
    1550             : }
    1551             : 
    1552             : /*
    1553             :  * To reduce the fork() overhead for systems that are not actually using
    1554             :  * their cgroups capability, we don't maintain the lists running through
    1555             :  * each css_set to its tasks until we see the list actually used - in other
     1556             :  * words, after the first mount.
    1557             :  */
    1558             : static bool use_task_css_set_links __read_mostly;
    1559             : 
    1560           1 : static void cgroup_enable_task_cg_lists(void)
    1561             : {
    1562             :         struct task_struct *p, *g;
    1563             : 
    1564           1 :         down_write(&css_set_rwsem);
    1565             : 
    1566           1 :         if (use_task_css_set_links)
    1567             :                 goto out_unlock;
    1568             : 
    1569           1 :         use_task_css_set_links = true;
    1570             : 
    1571             :         /*
    1572             :          * We need tasklist_lock because RCU is not safe against
    1573             :          * while_each_thread(). Besides, a forking task that has passed
    1574             :          * cgroup_post_fork() without seeing use_task_css_set_links = 1
    1575             :          * is not guaranteed to have its child immediately visible in the
    1576             :          * tasklist if we walk through it with RCU.
    1577             :          */
    1578           1 :         read_lock(&tasklist_lock);
    1579          58 :         do_each_thread(g, p) {
    1580         112 :                 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
    1581             :                              task_css_set(p) != &init_css_set);
    1582             : 
    1583             :                 /*
     1584             :                  * We must check whether the process is exiting; otherwise
     1585             :                  * we race with cgroup_exit() and the list entry won't be
     1586             :                  * deleted even though the process has exited.
    1587             :                  * Do it while holding siglock so that we don't end up
    1588             :                  * racing against cgroup_exit().
    1589             :                  */
    1590             :                 spin_lock_irq(&p->sighand->siglock);
    1591          56 :                 if (!(p->flags & PF_EXITING)) {
    1592             :                         struct css_set *cset = task_css_set(p);
    1593             : 
    1594          56 :                         list_add(&p->cg_list, &cset->tasks);
    1595             :                         get_css_set(cset);
    1596             :                 }
    1597             :                 spin_unlock_irq(&p->sighand->siglock);
    1598          56 :         } while_each_thread(g, p);
    1599           2 :         read_unlock(&tasklist_lock);
    1600             : out_unlock:
    1601           1 :         up_write(&css_set_rwsem);
    1602           1 : }
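
The function is a one-shot initializer: take the write lock, re-check the guard flag, and populate only on the first call. A sketch of just that pattern in userspace C with pthreads; the names are demo stand-ins.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t set_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool links_enabled;      /* plays the role of use_task_css_set_links */

static void enable_links(void)
{
        pthread_rwlock_wrlock(&set_lock);
        if (links_enabled)
                goto out_unlock;        /* another caller already won */
        links_enabled = true;
        /* ... walk the existing population and thread it onto lists ... */
        puts("lists populated");
out_unlock:
        pthread_rwlock_unlock(&set_lock);
}

int main(void)
{
        enable_links();
        enable_links();         /* second call sees the flag and backs off */
        return 0;
}
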
    1603             : 
    1604           1 : static void init_cgroup_housekeeping(struct cgroup *cgrp)
    1605             : {
    1606             :         struct cgroup_subsys *ss;
    1607             :         int ssid;
    1608             : 
    1609           1 :         INIT_LIST_HEAD(&cgrp->self.sibling);
    1610           1 :         INIT_LIST_HEAD(&cgrp->self.children);
    1611           1 :         INIT_LIST_HEAD(&cgrp->cset_links);
    1612           1 :         INIT_LIST_HEAD(&cgrp->pidlists);
    1613           1 :         mutex_init(&cgrp->pidlist_mutex);
    1614           1 :         cgrp->self.cgroup = cgrp;
    1615           1 :         cgrp->self.flags |= CSS_ONLINE;
    1616             : 
    1617           8 :         for_each_subsys(ss, ssid)
    1618           7 :                 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
    1619             : 
    1620           1 :         init_waitqueue_head(&cgrp->offline_waitq);
    1621           2 :         INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
    1622           1 : }
    1623             : 
    1624           1 : static void init_cgroup_root(struct cgroup_root *root,
    1625             :                              struct cgroup_sb_opts *opts)
    1626             : {
    1627           1 :         struct cgroup *cgrp = &root->cgrp;
    1628             : 
    1629           1 :         INIT_LIST_HEAD(&root->root_list);
    1630           1 :         atomic_set(&root->nr_cgrps, 1);
    1631           1 :         cgrp->root = root;
    1632           1 :         init_cgroup_housekeeping(cgrp);
    1633           1 :         idr_init(&root->cgroup_idr);
    1634             : 
    1635           1 :         root->flags = opts->flags;
    1636           1 :         if (opts->release_agent)
    1637           0 :                 strcpy(root->release_agent_path, opts->release_agent);
    1638           1 :         if (opts->name)
    1639           0 :                 strcpy(root->name, opts->name);
    1640           1 :         if (opts->cpuset_clone_children)
    1641             :                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
    1642           1 : }
    1643             : 
    1644           1 : static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
    1645             : {
    1646           1 :         LIST_HEAD(tmp_links);
    1647           1 :         struct cgroup *root_cgrp = &root->cgrp;
    1648             :         struct cftype *base_files;
    1649             :         struct css_set *cset;
    1650             :         int i, ret;
    1651             : 
    1652             :         lockdep_assert_held(&cgroup_mutex);
    1653             : 
    1654           1 :         ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
    1655           1 :         if (ret < 0)
    1656             :                 goto out;
    1657           1 :         root_cgrp->id = ret;
    1658             : 
    1659           1 :         ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
    1660             :                               GFP_KERNEL);
    1661           1 :         if (ret)
    1662             :                 goto out;
    1663             : 
    1664             :         /*
    1665             :          * We're accessing css_set_count without locking css_set_rwsem here,
    1666             :          * but that's OK - it can only be increased by someone holding
    1667             :          * cgroup_lock, and that's us. The worst that can happen is that we
     1668             :          * have some link structures left over.
    1669             :          */
    1670           1 :         ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
    1671           1 :         if (ret)
    1672             :                 goto cancel_ref;
    1673             : 
    1674           1 :         ret = cgroup_init_root_id(root);
    1675           1 :         if (ret)
    1676             :                 goto cancel_ref;
    1677             : 
    1678           1 :         root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
    1679             :                                            KERNFS_ROOT_CREATE_DEACTIVATED,
    1680             :                                            root_cgrp);
    1681           1 :         if (IS_ERR(root->kf_root)) {
    1682             :                 ret = PTR_ERR(root->kf_root);
    1683           0 :                 goto exit_root_id;
    1684             :         }
    1685           1 :         root_cgrp->kn = root->kf_root->kn;
    1686             : 
    1687           1 :         if (root == &cgrp_dfl_root)
    1688             :                 base_files = cgroup_dfl_base_files;
    1689             :         else
    1690             :                 base_files = cgroup_legacy_base_files;
    1691             : 
    1692           1 :         ret = cgroup_addrm_files(root_cgrp, base_files, true);
    1693           1 :         if (ret)
    1694             :                 goto destroy_root;
    1695             : 
    1696           1 :         ret = rebind_subsystems(root, ss_mask);
    1697           1 :         if (ret)
    1698             :                 goto destroy_root;
    1699             : 
    1700             :         /*
    1701             :          * There must be no failure case after here, since rebinding takes
    1702             :          * care of subsystems' refcounts, which are explicitly dropped in
    1703             :          * the failure exit path.
    1704             :          */
    1705           1 :         list_add(&root->root_list, &cgroup_roots);
    1706           1 :         cgroup_root_count++;
    1707             : 
    1708             :         /*
    1709             :          * Link the root cgroup in this hierarchy into all the css_set
    1710             :          * objects.
    1711             :          */
    1712           1 :         down_write(&css_set_rwsem);
    1713           2 :         hash_for_each(css_set_table, i, cset, hlist)
    1714           1 :                 link_css_set(&tmp_links, cset, root_cgrp);
    1715           1 :         up_write(&css_set_rwsem);
    1716             : 
    1717             :         BUG_ON(!list_empty(&root_cgrp->self.children));
    1718           1 :         BUG_ON(atomic_read(&root->nr_cgrps) != 1);
    1719             : 
    1720           1 :         kernfs_activate(root_cgrp->kn);
    1721             :         ret = 0;
    1722           1 :         goto out;
    1723             : 
    1724             : destroy_root:
    1725           0 :         kernfs_destroy_root(root->kf_root);
    1726           0 :         root->kf_root = NULL;
    1727             : exit_root_id:
    1728             :         cgroup_exit_root_id(root);
    1729             : cancel_ref:
    1730           0 :         percpu_ref_exit(&root_cgrp->self.refcnt);
    1731             : out:
    1732           1 :         free_cgrp_cset_links(&tmp_links);
    1733           1 :         return ret;
    1734             : }
    1735             : 
    1736           1 : static struct dentry *cgroup_mount(struct file_system_type *fs_type,
    1737             :                          int flags, const char *unused_dev_name,
    1738             :                          void *data)
    1739             : {
    1740             :         struct super_block *pinned_sb = NULL;
    1741             :         struct cgroup_subsys *ss;
    1742             :         struct cgroup_root *root;
    1743             :         struct cgroup_sb_opts opts;
    1744             :         struct dentry *dentry;
    1745             :         int ret;
    1746             :         int i;
    1747             :         bool new_sb;
    1748             : 
    1749             :         /*
    1750             :          * The first time anyone tries to mount a cgroup, enable the list
    1751             :          * linking each css_set to its tasks and fix up all existing tasks.
    1752             :          */
    1753           1 :         if (!use_task_css_set_links)
    1754           1 :                 cgroup_enable_task_cg_lists();
    1755             : 
    1756           1 :         mutex_lock(&cgroup_mutex);
    1757             : 
    1758             :         /* First find the desired set of subsystems */
    1759           1 :         ret = parse_cgroupfs_options(data, &opts);
    1760           1 :         if (ret)
    1761             :                 goto out_unlock;
    1762             : 
    1763             :         /* look for a matching existing root */
    1764           0 :         if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
    1765           0 :                 cgrp_dfl_root_visible = true;
    1766             :                 root = &cgrp_dfl_root;
    1767           0 :                 cgroup_get(&root->cgrp);
    1768             :                 ret = 0;
    1769           0 :                 goto out_unlock;
    1770             :         }
    1771             : 
    1772             :         /*
    1773             :          * Destruction of cgroup root is asynchronous, so subsystems may
    1774             :          * still be dying after the previous unmount.  Let's drain the
    1775             :          * dying subsystems.  We just need to ensure that the ones
    1776             :          * unmounted previously finish dying and don't care about new ones
     1777             :          * starting.  Testing ref liveness is good enough.
    1778             :          */
    1779           0 :         for_each_subsys(ss, i) {
    1780           0 :                 if (!(opts.subsys_mask & (1 << i)) ||
    1781           0 :                     ss->root == &cgrp_dfl_root)
    1782           0 :                         continue;
    1783             : 
    1784           0 :                 if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
    1785           0 :                         mutex_unlock(&cgroup_mutex);
    1786           0 :                         msleep(10);
    1787             :                         ret = restart_syscall();
    1788           0 :                         goto out_free;
    1789             :                 }
    1790           0 :                 cgroup_put(&ss->root->cgrp);
    1791             :         }
    1792             : 
    1793           0 :         for_each_root(root) {
    1794             :                 bool name_match = false;
    1795             : 
    1796           0 :                 if (root == &cgrp_dfl_root)
    1797           0 :                         continue;
    1798             : 
    1799             :                 /*
    1800             :                  * If we asked for a name then it must match.  Also, if
     1801             :                  * name matches but subsys_mask doesn't, we should fail.
    1802             :                  * Remember whether name matched.
    1803             :                  */
    1804           0 :                 if (opts.name) {
    1805           0 :                         if (strcmp(opts.name, root->name))
    1806           0 :                                 continue;
    1807             :                         name_match = true;
    1808             :                 }
    1809             : 
    1810             :                 /*
    1811             :                  * If we asked for subsystems (or explicitly for no
    1812             :                  * subsystems) then they must match.
    1813             :                  */
    1814           0 :                 if ((opts.subsys_mask || opts.none) &&
    1815           0 :                     (opts.subsys_mask != root->subsys_mask)) {
    1816           0 :                         if (!name_match)
    1817           0 :                                 continue;
    1818             :                         ret = -EBUSY;
    1819             :                         goto out_unlock;
    1820             :                 }
    1821             : 
    1822           0 :                 if (root->flags ^ opts.flags)
    1823           0 :                         pr_warn("new mount options do not match the existing superblock, will be ignored\n");
    1824             : 
    1825             :                 /*
    1826             :                  * We want to reuse @root whose lifetime is governed by its
    1827             :                  * ->cgrp.  Let's check whether @root is alive and keep it
    1828             :                  * that way.  As cgroup_kill_sb() can happen anytime, we
    1829             :                  * want to block it by pinning the sb so that @root doesn't
    1830             :                  * get killed before mount is complete.
    1831             :                  *
    1832             :                  * With the sb pinned, tryget_live can reliably indicate
    1833             :                  * whether @root can be reused.  If it's being killed,
    1834             :                  * drain it.  We can use wait_queue for the wait but this
    1835             :                  * path is super cold.  Let's just sleep a bit and retry.
    1836             :                  */
    1837           0 :                 pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
    1838           0 :                 if (IS_ERR(pinned_sb) ||
    1839             :                     !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
    1840           0 :                         mutex_unlock(&cgroup_mutex);
    1841           0 :                         if (!IS_ERR_OR_NULL(pinned_sb))
    1842           0 :                                 deactivate_super(pinned_sb);
    1843           0 :                         msleep(10);
    1844             :                         ret = restart_syscall();
    1845           0 :                         goto out_free;
    1846             :                 }
    1847             : 
    1848             :                 ret = 0;
    1849             :                 goto out_unlock;
    1850             :         }
    1851             : 
    1852             :         /*
     1853             :          * No such root exists; create a new one.  name= matching without a
     1854             :          * subsys specification is allowed for already existing hierarchies,
     1855             :          * but we can't create a new one without a subsys specification.
    1856             :          */
    1857           0 :         if (!opts.subsys_mask && !opts.none) {
    1858             :                 ret = -EINVAL;
    1859             :                 goto out_unlock;
    1860             :         }
    1861             : 
    1862             :         root = kzalloc(sizeof(*root), GFP_KERNEL);
    1863           0 :         if (!root) {
    1864             :                 ret = -ENOMEM;
    1865             :                 goto out_unlock;
    1866             :         }
    1867             : 
    1868           0 :         init_cgroup_root(root, &opts);
    1869             : 
    1870           0 :         ret = cgroup_setup_root(root, opts.subsys_mask);
    1871           0 :         if (ret)
    1872           0 :                 cgroup_free_root(root);
    1873             : 
    1874             : out_unlock:
    1875           1 :         mutex_unlock(&cgroup_mutex);
    1876             : out_free:
    1877           1 :         kfree(opts.release_agent);
    1878           1 :         kfree(opts.name);
    1879             : 
    1880           1 :         if (ret)
    1881           1 :                 return ERR_PTR(ret);
    1882             : 
    1883           0 :         dentry = kernfs_mount(fs_type, flags, root->kf_root,
    1884             :                                 CGROUP_SUPER_MAGIC, &new_sb);
    1885           0 :         if (IS_ERR(dentry) || !new_sb)
    1886           0 :                 cgroup_put(&root->cgrp);
    1887             : 
    1888             :         /*
    1889             :          * If @pinned_sb, we're reusing an existing root and holding an
    1890             :          * extra ref on its sb.  Mount is complete.  Put the extra ref.
    1891             :          */
    1892           0 :         if (pinned_sb) {
    1893             :                 WARN_ON(new_sb);
    1894           0 :                 deactivate_super(pinned_sb);
    1895             :         }
    1896             : 
    1897           0 :         return dentry;
    1898             : }
    1899             : 
    1900           0 : static void cgroup_kill_sb(struct super_block *sb)
    1901             : {
    1902           0 :         struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
    1903             :         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
    1904             : 
    1905             :         /*
    1906             :          * If @root doesn't have any mounts or children, start killing it.
    1907             :          * This prevents new mounts by disabling percpu_ref_tryget_live().
    1908             :          * cgroup_mount() may wait for @root's release.
    1909             :          *
    1910             :          * And don't kill the default root.
    1911             :          */
    1912           0 :         if (!list_empty(&root->cgrp.self.children) ||
    1913             :             root == &cgrp_dfl_root)
    1914           0 :                 cgroup_put(&root->cgrp);
    1915             :         else
    1916           0 :                 percpu_ref_kill(&root->cgrp.self.refcnt);
    1917             : 
    1918           0 :         kernfs_kill_sb(sb);
    1919           0 : }
    1920             : 
    1921             : static struct file_system_type cgroup_fs_type = {
    1922             :         .name = "cgroup",
    1923             :         .mount = cgroup_mount,
    1924             :         .kill_sb = cgroup_kill_sb,
    1925             : };
    1926             : 
    1927             : static struct kobject *cgroup_kobj;
    1928             : 
    1929             : /**
    1930             :  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
    1931             :  * @task: target task
    1932             :  * @buf: the buffer to write the path into
    1933             :  * @buflen: the length of the buffer
    1934             :  *
    1935             :  * Determine @task's cgroup on the first (the one with the lowest non-zero
    1936             :  * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
     1937             :  * function grabs cgroup_mutex and therefore shouldn't be called while
     1938             :  * holding locks used by cgroup controller callbacks.
    1939             :  *
    1940             :  * Return value is the same as kernfs_path().
    1941             :  */
    1942           0 : char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
    1943             : {
    1944             :         struct cgroup_root *root;
    1945             :         struct cgroup *cgrp;
    1946           0 :         int hierarchy_id = 1;
    1947             :         char *path = NULL;
    1948             : 
    1949           0 :         mutex_lock(&cgroup_mutex);
    1950           0 :         down_read(&css_set_rwsem);
    1951             : 
    1952           0 :         root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
    1953             : 
    1954           0 :         if (root) {
    1955             :                 cgrp = task_cgroup_from_root(task, root);
    1956             :                 path = cgroup_path(cgrp, buf, buflen);
    1957             :         } else {
    1958             :                 /* if no hierarchy exists, everyone is in "/" */
    1959           0 :                 if (strlcpy(buf, "/", buflen) < buflen)
    1960             :                         path = buf;
    1961             :         }
    1962             : 
    1963           0 :         up_read(&css_set_rwsem);
    1964           0 :         mutex_unlock(&cgroup_mutex);
    1965           0 :         return path;
    1966             : }
    1967             : EXPORT_SYMBOL_GPL(task_cgroup_path);
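
A hedged caller sketch (log_task_cgroup() is hypothetical, not part of cgroup.c): since task_cgroup_path() grabs cgroup_mutex itself, the caller must not already hold it or css_set_rwsem.

static void log_task_cgroup(struct task_struct *task)
{
        char buf[256];
        char *path;

        path = task_cgroup_path(task, buf, sizeof(buf));  /* may be NULL */
        if (path)
                pr_info("task %d cgroup: %s\n", task_pid_nr(task), path);
}
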
    1968             : 
    1969             : /* used to track tasks and other necessary states during migration */
    1970             : struct cgroup_taskset {
    1971             :         /* the src and dst cset list running through cset->mg_node */
    1972             :         struct list_head        src_csets;
    1973             :         struct list_head        dst_csets;
    1974             : 
    1975             :         /*
    1976             :          * Fields for cgroup_taskset_*() iteration.
    1977             :          *
    1978             :          * Before migration is committed, the target migration tasks are on
    1979             :          * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
     1980             :          * the csets on ->dst_csets.  ->csets points to either ->src_csets
    1981             :          * or ->dst_csets depending on whether migration is committed.
    1982             :          *
     1983             :          * ->cur_cset and ->cur_task point to the current task position
    1984             :          * during iteration.
    1985             :          */
    1986             :         struct list_head        *csets;
    1987             :         struct css_set          *cur_cset;
    1988             :         struct task_struct      *cur_task;
    1989             : };
    1990             : 
    1991             : /**
    1992             :  * cgroup_taskset_first - reset taskset and return the first task
    1993             :  * @tset: taskset of interest
    1994             :  *
    1995             :  * @tset iteration is initialized and the first task is returned.
    1996             :  */
    1997           0 : struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
    1998             : {
    1999           0 :         tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
    2000           0 :         tset->cur_task = NULL;
    2001             : 
    2002           0 :         return cgroup_taskset_next(tset);
    2003             : }
    2004             : 
    2005             : /**
    2006             :  * cgroup_taskset_next - iterate to the next task in taskset
    2007             :  * @tset: taskset of interest
    2008             :  *
    2009             :  * Return the next task in @tset.  Iteration must have been initialized
    2010             :  * with cgroup_taskset_first().
    2011             :  */
    2012           0 : struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
    2013             : {
    2014           0 :         struct css_set *cset = tset->cur_cset;
    2015           0 :         struct task_struct *task = tset->cur_task;
    2016             : 
    2017           0 :         while (&cset->mg_node != tset->csets) {
    2018           0 :                 if (!task)
    2019           0 :                         task = list_first_entry(&cset->mg_tasks,
    2020             :                                                 struct task_struct, cg_list);
    2021             :                 else
    2022           0 :                         task = list_next_entry(task, cg_list);
    2023             : 
    2024           0 :                 if (&task->cg_list != &cset->mg_tasks) {
    2025           0 :                         tset->cur_cset = cset;
    2026           0 :                         tset->cur_task = task;
    2027           0 :                         return task;
    2028             :                 }
    2029             : 
    2030           0 :                 cset = list_next_entry(cset, mg_node);
    2031             :                 task = NULL;
    2032             :         }
    2033             : 
    2034             :         return NULL;
    2035             : }
    2036             : 
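             : /*
             :  * Iterator usage sketch (hypothetical controller callback): a
             :  * controller typically walks a taskset in its ->can_attach() method
             :  * through the cgroup_taskset_for_each() wrapper, which is built on
             :  * cgroup_taskset_first()/cgroup_taskset_next() above.
             :  */
             : static int example_can_attach(struct cgroup_subsys_state *css,
             :                               struct cgroup_taskset *tset)
             : {
             :         struct task_struct *task;
             : 
             :         cgroup_taskset_for_each(task, tset) {
             :                 /* e.g. refuse to adopt kernel threads */
             :                 if (task->flags & PF_KTHREAD)
             :                         return -EINVAL;
             :         }
             :         return 0;
             : }
             : 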
    2037             : /**
    2038             :  * cgroup_task_migrate - move a task from one cgroup to another.
    2039             :  * @old_cgrp: the cgroup @tsk is being migrated from
    2040             :  * @tsk: the task being migrated
    2041             :  * @new_cset: the new css_set @tsk is being attached to
    2042             :  *
     2043             :  * Must be called with cgroup_mutex, threadgroup_lock and css_set_rwsem held.
    2044             :  */
    2045           0 : static void cgroup_task_migrate(struct cgroup *old_cgrp,
    2046             :                                 struct task_struct *tsk,
    2047             :                                 struct css_set *new_cset)
    2048             : {
    2049             :         struct css_set *old_cset;
    2050             : 
    2051             :         lockdep_assert_held(&cgroup_mutex);
    2052             :         lockdep_assert_held(&css_set_rwsem);
    2053             : 
    2054             :         /*
    2055             :          * We are synchronized through threadgroup_lock() against PF_EXITING
    2056             :          * setting such that we can't race against cgroup_exit() changing the
    2057             :          * css_set to init_css_set and dropping the old one.
    2058             :          */
    2059             :         WARN_ON_ONCE(tsk->flags & PF_EXITING);
    2060             :         old_cset = task_css_set(tsk);
    2061             : 
    2062             :         get_css_set(new_cset);
    2063           0 :         rcu_assign_pointer(tsk->cgroups, new_cset);
    2064             : 
    2065             :         /*
    2066             :          * Use move_tail so that cgroup_taskset_first() still returns the
    2067             :          * leader after migration.  This works because cgroup_migrate()
    2068             :          * ensures that the dst_cset of the leader is the first on the
    2069             :          * tset's dst_csets list.
    2070             :          */
    2071           0 :         list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
    2072             : 
    2073             :         /*
    2074             :          * We just gained a reference on old_cset by taking it from the
    2075             :          * task. As trading it for new_cset is protected by cgroup_mutex,
    2076             :          * we're safe to drop it here; it will be freed under RCU.
    2077             :          */
    2078           0 :         put_css_set_locked(old_cset);
    2079           0 : }
    2080             : 
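             : /*
             :  * Reader-side sketch (assumption, mirroring how task_css_set() is
             :  * used elsewhere in this file): because tsk->cgroups is published
             :  * with rcu_assign_pointer() above, a lockless reader dereferences it
             :  * under rcu_read_lock():
             :  *
             :  *      rcu_read_lock();
             :  *      cset = task_css_set(task);
             :  *      ... cset remains valid until rcu_read_unlock() ...
             :  *      rcu_read_unlock();
             :  */
             : 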
    2081             : /**
    2082             :  * cgroup_migrate_finish - cleanup after attach
    2083             :  * @preloaded_csets: list of preloaded css_sets
    2084             :  *
    2085             :  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
    2086             :  * those functions for details.
    2087             :  */
    2088           0 : static void cgroup_migrate_finish(struct list_head *preloaded_csets)
    2089             : {
    2090             :         struct css_set *cset, *tmp_cset;
    2091             : 
    2092             :         lockdep_assert_held(&cgroup_mutex);
    2093             : 
    2094           0 :         down_write(&css_set_rwsem);
    2095           0 :         list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
    2096           0 :                 cset->mg_src_cgrp = NULL;
    2097           0 :                 cset->mg_dst_cset = NULL;
    2098             :                 list_del_init(&cset->mg_preload_node);
    2099           0 :                 put_css_set_locked(cset);
    2100             :         }
    2101           0 :         up_write(&css_set_rwsem);
    2102           0 : }
    2103             : 
    2104             : /**
    2105             :  * cgroup_migrate_add_src - add a migration source css_set
    2106             :  * @src_cset: the source css_set to add
    2107             :  * @dst_cgrp: the destination cgroup
    2108             :  * @preloaded_csets: list of preloaded css_sets
    2109             :  *
    2110             :  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
    2111             :  * @src_cset and add it to @preloaded_csets, which should later be cleaned
    2112             :  * up by cgroup_migrate_finish().
    2113             :  *
    2114             :  * This function may be called without holding threadgroup_lock even if the
    2115             :  * target is a process.  Threads may be created and destroyed but as long
    2116             :  * as cgroup_mutex is not dropped, no new css_set can be put into play and
    2117             :  * the preloaded css_sets are guaranteed to cover all migrations.
    2118             :  */
    2119           0 : static void cgroup_migrate_add_src(struct css_set *src_cset,
    2120             :                                    struct cgroup *dst_cgrp,
    2121             :                                    struct list_head *preloaded_csets)
    2122             : {
    2123             :         struct cgroup *src_cgrp;
    2124             : 
    2125             :         lockdep_assert_held(&cgroup_mutex);
    2126             :         lockdep_assert_held(&css_set_rwsem);
    2127             : 
    2128           0 :         src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
    2129             : 
    2130           0 :         if (!list_empty(&src_cset->mg_preload_node))
    2131           0 :                 return;
    2132             : 
    2133             :         WARN_ON(src_cset->mg_src_cgrp);
    2134             :         WARN_ON(!list_empty(&src_cset->mg_tasks));
    2135             :         WARN_ON(!list_empty(&src_cset->mg_node));
    2136             : 
    2137           0 :         src_cset->mg_src_cgrp = src_cgrp;
    2138             :         get_css_set(src_cset);
    2139             :         list_add(&src_cset->mg_preload_node, preloaded_csets);
    2140             : }
    2141             : 
    2142             : /**
    2143             :  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
    2144             :  * @dst_cgrp: the destination cgroup (may be %NULL)
    2145             :  * @preloaded_csets: list of preloaded source css_sets
    2146             :  *
    2147             :  * Tasks are about to be moved to @dst_cgrp and all the source css_sets
    2148             :  * have been preloaded to @preloaded_csets.  This function looks up and
     2149             :  * pins all destination css_sets, links each to its source, and appends them
    2150             :  * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
    2151             :  * source css_set is assumed to be its cgroup on the default hierarchy.
    2152             :  *
    2153             :  * This function must be called after cgroup_migrate_add_src() has been
    2154             :  * called on each migration source css_set.  After migration is performed
    2155             :  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
    2156             :  * @preloaded_csets.
    2157             :  */
    2158           0 : static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
    2159             :                                       struct list_head *preloaded_csets)
    2160             : {
    2161           0 :         LIST_HEAD(csets);
    2162             :         struct css_set *src_cset, *tmp_cset;
    2163             : 
    2164             :         lockdep_assert_held(&cgroup_mutex);
    2165             : 
    2166             :         /*
    2167             :          * Except for the root, child_subsys_mask must be zero for a cgroup
    2168             :          * with tasks so that child cgroups don't compete against tasks.
    2169             :          */
    2170           0 :         if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
    2171           0 :             dst_cgrp->child_subsys_mask)
    2172             :                 return -EBUSY;
    2173             : 
    2174             :         /* look up the dst cset for each src cset and link it to src */
    2175           0 :         list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
    2176             :                 struct css_set *dst_cset;
    2177             : 
    2178           0 :                 dst_cset = find_css_set(src_cset,
    2179             :                                         dst_cgrp ?: src_cset->dfl_cgrp);
    2180           0 :                 if (!dst_cset)
    2181             :                         goto err;
    2182             : 
    2183             :                 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
    2184             : 
    2185             :                 /*
     2186             :          * If src cset equals dst, it's a noop.  Drop the src.
    2187             :                  * cgroup_migrate() will skip the cset too.  Note that we
    2188             :                  * can't handle src == dst as some nodes are used by both.
    2189             :                  */
    2190           0 :                 if (src_cset == dst_cset) {
    2191           0 :                         src_cset->mg_src_cgrp = NULL;
    2192             :                         list_del_init(&src_cset->mg_preload_node);
    2193           0 :                         put_css_set(src_cset);
    2194           0 :                         put_css_set(dst_cset);
    2195           0 :                         continue;
    2196             :                 }
    2197             : 
    2198           0 :                 src_cset->mg_dst_cset = dst_cset;
    2199             : 
    2200           0 :                 if (list_empty(&dst_cset->mg_preload_node))
    2201             :                         list_add(&dst_cset->mg_preload_node, &csets);
    2202             :                 else
    2203           0 :                         put_css_set(dst_cset);
    2204             :         }
    2205             : 
    2206             :         list_splice_tail(&csets, preloaded_csets);
    2207             :         return 0;
    2208             : err:
    2209           0 :         cgroup_migrate_finish(&csets);
    2210           0 :         return -ENOMEM;
    2211             : }
    2212             : 
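             : /*
             :  * Protocol sketch: the preload helpers above and cgroup_migrate()
             :  * below are always used in the same sequence; this condensed form
             :  * mirrors what cgroup_attach_task() does further down:
             :  *
             :  *      LIST_HEAD(preloaded_csets);
             :  *
             :  *      down_read(&css_set_rwsem);
             :  *      cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
             :  *                             &preloaded_csets);
             :  *      up_read(&css_set_rwsem);
             :  *
             :  *      ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
             :  *      if (!ret)
             :  *              ret = cgroup_migrate(dst_cgrp, task, false);
             :  *      cgroup_migrate_finish(&preloaded_csets);
             :  */
             : 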
    2213             : /**
    2214             :  * cgroup_migrate - migrate a process or task to a cgroup
    2215             :  * @cgrp: the destination cgroup
    2216             :  * @leader: the leader of the process or the task to migrate
    2217             :  * @threadgroup: whether @leader points to the whole process or a single task
    2218             :  *
    2219             :  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
    2220             :  * process, the caller must be holding threadgroup_lock of @leader.  The
    2221             :  * caller is also responsible for invoking cgroup_migrate_add_src() and
    2222             :  * cgroup_migrate_prepare_dst() on the targets before invoking this
    2223             :  * function and following up with cgroup_migrate_finish().
    2224             :  *
    2225             :  * As long as a controller's ->can_attach() doesn't fail, this function is
    2226             :  * guaranteed to succeed.  This means that, excluding ->can_attach()
    2227             :  * failure, when migrating multiple targets, the success or failure can be
     2228             :  * decided for all targets by invoking cgroup_migrate_prepare_dst() before
     2229             :  * actually starting the migration.
    2230             :  */
    2231           0 : static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
    2232             :                           bool threadgroup)
    2233             : {
    2234           0 :         struct cgroup_taskset tset = {
    2235             :                 .src_csets      = LIST_HEAD_INIT(tset.src_csets),
    2236             :                 .dst_csets      = LIST_HEAD_INIT(tset.dst_csets),
    2237             :                 .csets          = &tset.src_csets,
    2238             :         };
    2239             :         struct cgroup_subsys_state *css, *failed_css = NULL;
    2240             :         struct css_set *cset, *tmp_cset;
    2241             :         struct task_struct *task, *tmp_task;
    2242             :         int i, ret;
    2243             : 
    2244             :         /*
    2245             :          * Prevent freeing of tasks while we take a snapshot. Tasks that are
    2246             :          * already PF_EXITING could be freed from underneath us unless we
    2247             :          * take an rcu_read_lock.
    2248             :          */
    2249           0 :         down_write(&css_set_rwsem);
    2250             :         rcu_read_lock();
    2251             :         task = leader;
    2252             :         do {
    2253             :                 /* @task either already exited or can't exit until the end */
    2254           0 :                 if (task->flags & PF_EXITING)
    2255             :                         goto next;
    2256             : 
    2257             :                 /* leave @task alone if post_fork() hasn't linked it yet */
    2258           0 :                 if (list_empty(&task->cg_list))
    2259             :                         goto next;
    2260             : 
    2261             :                 cset = task_css_set(task);
    2262           0 :                 if (!cset->mg_src_cgrp)
    2263             :                         goto next;
    2264             : 
    2265             :                 /*
    2266             :                  * cgroup_taskset_first() must always return the leader.
    2267             :                  * Take care to avoid disturbing the ordering.
    2268             :                  */
    2269           0 :                 list_move_tail(&task->cg_list, &cset->mg_tasks);
    2270           0 :                 if (list_empty(&cset->mg_node))
    2271             :                         list_add_tail(&cset->mg_node, &tset.src_csets);
    2272           0 :                 if (list_empty(&cset->mg_dst_cset->mg_node))
    2273             :                         list_move_tail(&cset->mg_dst_cset->mg_node,
    2274             :                                        &tset.dst_csets);
    2275             :         next:
    2276           0 :                 if (!threadgroup)
    2277             :                         break;
    2278           0 :         } while_each_thread(leader, task);
    2279             :         rcu_read_unlock();
    2280           0 :         up_write(&css_set_rwsem);
    2281             : 
    2282             :         /* methods shouldn't be called if no task is actually migrating */
    2283           0 :         if (list_empty(&tset.src_csets))
    2284             :                 return 0;
    2285             : 
    2286             :         /* check that we can legitimately attach to the cgroup */
    2287           0 :         for_each_e_css(css, i, cgrp) {
    2288           0 :                 if (css->ss->can_attach) {
    2289           0 :                         ret = css->ss->can_attach(css, &tset);
    2290           0 :                         if (ret) {
    2291             :                                 failed_css = css;
    2292             :                                 goto out_cancel_attach;
    2293             :                         }
    2294             :                 }
    2295             :         }
    2296             : 
    2297             :         /*
    2298             :          * Now that we're guaranteed success, proceed to move all tasks to
    2299             :          * the new cgroup.  There are no failure cases after here, so this
    2300             :          * is the commit point.
    2301             :          */
    2302           0 :         down_write(&css_set_rwsem);
    2303           0 :         list_for_each_entry(cset, &tset.src_csets, mg_node) {
    2304           0 :                 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
    2305           0 :                         cgroup_task_migrate(cset->mg_src_cgrp, task,
    2306             :                                             cset->mg_dst_cset);
    2307             :         }
    2308           0 :         up_write(&css_set_rwsem);
    2309             : 
    2310             :         /*
    2311             :          * Migration is committed, all target tasks are now on dst_csets.
    2312             :          * Nothing is sensitive to fork() after this point.  Notify
    2313             :          * controllers that migration is complete.
    2314             :          */
    2315           0 :         tset.csets = &tset.dst_csets;
    2316             : 
    2317           0 :         for_each_e_css(css, i, cgrp)
    2318           0 :                 if (css->ss->attach)
    2319           0 :                         css->ss->attach(css, &tset);
    2320             : 
    2321             :         ret = 0;
    2322             :         goto out_release_tset;
    2323             : 
    2324             : out_cancel_attach:
    2325           0 :         for_each_e_css(css, i, cgrp) {
    2326           0 :                 if (css == failed_css)
    2327             :                         break;
    2328           0 :                 if (css->ss->cancel_attach)
    2329           0 :                         css->ss->cancel_attach(css, &tset);
    2330             :         }
    2331             : out_release_tset:
    2332           0 :         down_write(&css_set_rwsem);
    2333             :         list_splice_init(&tset.dst_csets, &tset.src_csets);
    2334           0 :         list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
    2335           0 :                 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
    2336             :                 list_del_init(&cset->mg_node);
    2337             :         }
    2338           0 :         up_write(&css_set_rwsem);
    2339           0 :         return ret;
    2340             : }
    2341             : 
    2342             : /**
    2343             :  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
    2344             :  * @dst_cgrp: the cgroup to attach to
    2345             :  * @leader: the task or the leader of the threadgroup to be attached
    2346             :  * @threadgroup: attach the whole threadgroup?
    2347             :  *
    2348             :  * Call holding cgroup_mutex and threadgroup_lock of @leader.
    2349             :  */
    2350           0 : static int cgroup_attach_task(struct cgroup *dst_cgrp,
    2351             :                               struct task_struct *leader, bool threadgroup)
    2352             : {
    2353           0 :         LIST_HEAD(preloaded_csets);
    2354             :         struct task_struct *task;
    2355             :         int ret;
    2356             : 
    2357             :         /* look up all src csets */
    2358           0 :         down_read(&css_set_rwsem);
    2359             :         rcu_read_lock();
    2360             :         task = leader;
    2361             :         do {
    2362           0 :                 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
    2363             :                                        &preloaded_csets);
    2364           0 :                 if (!threadgroup)
    2365             :                         break;
    2366           0 :         } while_each_thread(leader, task);
    2367             :         rcu_read_unlock();
    2368           0 :         up_read(&css_set_rwsem);
    2369             : 
    2370             :         /* prepare dst csets and commit */
    2371           0 :         ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
    2372           0 :         if (!ret)
    2373           0 :                 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
    2374             : 
    2375           0 :         cgroup_migrate_finish(&preloaded_csets);
    2376           0 :         return ret;
    2377             : }
    2378             : 
    2379             : /*
    2380             :  * Find the task_struct of the task to attach by vpid and pass it along to the
     2381             :  * Find the task_struct of the task to attach by vpid and pass it along to
     2382             :  * cgroup_attach_task(), attaching either that task alone or all tasks in
     2383             :  * its threadgroup.  Locks cgroup_mutex and threadgroup.
    2384           0 : static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
    2385             :                                     size_t nbytes, loff_t off, bool threadgroup)
    2386             : {
    2387             :         struct task_struct *tsk;
    2388           0 :         const struct cred *cred = current_cred(), *tcred;
    2389             :         struct cgroup *cgrp;
    2390             :         pid_t pid;
    2391             :         int ret;
    2392             : 
    2393           0 :         if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
    2394             :                 return -EINVAL;
    2395             : 
    2396           0 :         cgrp = cgroup_kn_lock_live(of->kn);
    2397           0 :         if (!cgrp)
    2398             :                 return -ENODEV;
    2399             : 
    2400             : retry_find_task:
    2401             :         rcu_read_lock();
    2402           0 :         if (pid) {
    2403           0 :                 tsk = find_task_by_vpid(pid);
    2404           0 :                 if (!tsk) {
    2405             :                         rcu_read_unlock();
    2406             :                         ret = -ESRCH;
    2407             :                         goto out_unlock_cgroup;
    2408             :                 }
    2409             :                 /*
    2410             :                  * even if we're attaching all tasks in the thread group, we
    2411             :                  * only need to check permissions on one of them.
    2412             :                  */
    2413           0 :                 tcred = __task_cred(tsk);
    2414           0 :                 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
    2415           0 :                     !uid_eq(cred->euid, tcred->uid) &&
    2416             :                     !uid_eq(cred->euid, tcred->suid)) {
    2417             :                         rcu_read_unlock();
    2418             :                         ret = -EACCES;
    2419             :                         goto out_unlock_cgroup;
    2420             :                 }
    2421             :         } else
    2422           0 :                 tsk = current;
    2423             : 
    2424           0 :         if (threadgroup)
    2425           0 :                 tsk = tsk->group_leader;
    2426             : 
    2427             :         /*
    2428             :          * Workqueue threads may acquire PF_NO_SETAFFINITY and become
     2429             :          * trapped in a cpuset, or an RT worker may be born in a cgroup
    2430             :          * with no rt_runtime allocated.  Just say no.
    2431             :          */
    2432           0 :         if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
    2433             :                 ret = -EINVAL;
    2434             :                 rcu_read_unlock();
    2435             :                 goto out_unlock_cgroup;
    2436             :         }
    2437             : 
    2438           0 :         get_task_struct(tsk);
    2439             :         rcu_read_unlock();
    2440             : 
    2441             :         threadgroup_lock(tsk);
    2442           0 :         if (threadgroup) {
    2443           0 :                 if (!thread_group_leader(tsk)) {
    2444             :                         /*
    2445             :                          * a race with de_thread from another thread's exec()
     2446             :                          * may strip us of our leadership; if this happens,
    2447             :                          * there is no choice but to throw this task away and
    2448             :                          * try again; this is
    2449             :                          * "double-double-toil-and-trouble-check locking".
    2450             :                          */
    2451             :                         threadgroup_unlock(tsk);
    2452             :                         put_task_struct(tsk);
    2453             :                         goto retry_find_task;
    2454             :                 }
    2455             :         }
    2456             : 
    2457           0 :         ret = cgroup_attach_task(cgrp, tsk, threadgroup);
    2458             : 
    2459             :         threadgroup_unlock(tsk);
    2460             : 
    2461             :         put_task_struct(tsk);
    2462             : out_unlock_cgroup:
    2463           0 :         cgroup_kn_unlock(of->kn);
    2464           0 :         return ret ?: nbytes;
    2465             : }
    2466             : 
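             : /*
             :  * Userland view (illustrative mount point; sketch only): attaching a
             :  * whole process is a single write of its pid to "cgroup.procs",
             :  * which arrives in __cgroup_procs_write() above with
             :  * threadgroup == true; writing to "tasks" attaches just one thread:
             :  *
             :  *      int fd = open("/sys/fs/cgroup/memory/mygrp/cgroup.procs",
             :  *                    O_WRONLY);
             :  *      dprintf(fd, "%d", getpid());
             :  *      close(fd);
             :  */
             : 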
    2467             : /**
    2468             :  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
    2469             :  * @from: attach to all cgroups of a given task
    2470             :  * @tsk: the task to be attached
    2471             :  */
    2472           0 : int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
    2473             : {
    2474             :         struct cgroup_root *root;
    2475             :         int retval = 0;
    2476             : 
    2477           0 :         mutex_lock(&cgroup_mutex);
    2478           0 :         for_each_root(root) {
    2479             :                 struct cgroup *from_cgrp;
    2480             : 
    2481           0 :                 if (root == &cgrp_dfl_root)
    2482           0 :                         continue;
    2483             : 
    2484           0 :                 down_read(&css_set_rwsem);
    2485             :                 from_cgrp = task_cgroup_from_root(from, root);
    2486           0 :                 up_read(&css_set_rwsem);
    2487             : 
    2488           0 :                 retval = cgroup_attach_task(from_cgrp, tsk, false);
    2489           0 :                 if (retval)
    2490             :                         break;
    2491             :         }
    2492           0 :         mutex_unlock(&cgroup_mutex);
    2493             : 
    2494           0 :         return retval;
    2495             : }
    2496             : EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
    2497             : 
    2498           0 : static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
    2499             :                                   char *buf, size_t nbytes, loff_t off)
    2500             : {
    2501           0 :         return __cgroup_procs_write(of, buf, nbytes, off, false);
    2502             : }
    2503             : 
    2504           0 : static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
    2505             :                                   char *buf, size_t nbytes, loff_t off)
    2506             : {
    2507           0 :         return __cgroup_procs_write(of, buf, nbytes, off, true);
    2508             : }
    2509             : 
    2510           0 : static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
    2511             :                                           char *buf, size_t nbytes, loff_t off)
    2512             : {
    2513             :         struct cgroup *cgrp;
    2514             : 
    2515             :         BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
    2516             : 
    2517           0 :         cgrp = cgroup_kn_lock_live(of->kn);
    2518           0 :         if (!cgrp)
    2519             :                 return -ENODEV;
    2520             :         spin_lock(&release_agent_path_lock);
    2521           0 :         strlcpy(cgrp->root->release_agent_path, strstrip(buf),
    2522             :                 sizeof(cgrp->root->release_agent_path));
    2523             :         spin_unlock(&release_agent_path_lock);
    2524           0 :         cgroup_kn_unlock(of->kn);
    2525           0 :         return nbytes;
    2526             : }
    2527             : 
    2528           0 : static int cgroup_release_agent_show(struct seq_file *seq, void *v)
    2529             : {
    2530           0 :         struct cgroup *cgrp = seq_css(seq)->cgroup;
    2531             : 
    2532             :         spin_lock(&release_agent_path_lock);
    2533           0 :         seq_puts(seq, cgrp->root->release_agent_path);
    2534             :         spin_unlock(&release_agent_path_lock);
    2535           0 :         seq_putc(seq, '\n');
    2536           0 :         return 0;
    2537             : }
    2538             : 
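             : /*
             :  * Userland view (illustrative path, sketch): the release agent is an
             :  * absolute binary path written to "release_agent" at the root of a
             :  * hierarchy and read back the same way, e.g.:
             :  *
             :  *      int fd = open("/sys/fs/cgroup/memory/release_agent", O_WRONLY);
             :  *      dprintf(fd, "/sbin/my_release_agent");
             :  *      close(fd);
             :  */
             : 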
    2539           0 : static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
    2540             : {
    2541           0 :         seq_puts(seq, "0\n");
    2542           0 :         return 0;
    2543             : }
    2544             : 
    2545           0 : static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
    2546             : {
    2547             :         struct cgroup_subsys *ss;
    2548             :         bool printed = false;
    2549             :         int ssid;
    2550             : 
    2551           0 :         for_each_subsys(ss, ssid) {
    2552           0 :                 if (ss_mask & (1 << ssid)) {
    2553           0 :                         if (printed)
    2554           0 :                                 seq_putc(seq, ' ');
    2555           0 :                         seq_printf(seq, "%s", ss->name);
    2556             :                         printed = true;
    2557             :                 }
    2558             :         }
    2559           0 :         if (printed)
    2560           0 :                 seq_putc(seq, '\n');
    2561           0 : }
    2562             : 
    2563             : /* show controllers which are currently attached to the default hierarchy */
    2564           0 : static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
    2565             : {
    2566           0 :         struct cgroup *cgrp = seq_css(seq)->cgroup;
    2567             : 
    2568           0 :         cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
    2569           0 :                              ~cgrp_dfl_root_inhibit_ss_mask);
    2570           0 :         return 0;
    2571             : }
    2572             : 
    2573             : /* show controllers which are enabled from the parent */
    2574           0 : static int cgroup_controllers_show(struct seq_file *seq, void *v)
    2575             : {
    2576           0 :         struct cgroup *cgrp = seq_css(seq)->cgroup;
    2577             : 
    2578           0 :         cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
    2579           0 :         return 0;
    2580             : }
    2581             : 
    2582             : /* show controllers which are enabled for a given cgroup's children */
    2583           0 : static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
    2584             : {
    2585           0 :         struct cgroup *cgrp = seq_css(seq)->cgroup;
    2586             : 
    2587           0 :         cgroup_print_ss_mask(seq, cgrp->subtree_control);
    2588           0 :         return 0;
    2589             : }
    2590             : 
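             : /*
             :  * Userland view (illustrative values): the three *_show() helpers
             :  * above render a controller mask as a space-separated list of names
             :  * terminated by a newline, so reading "cgroup.controllers" might
             :  * yield "cpu memory\n" and reading "cgroup.subtree_control" might
             :  * yield "memory\n".
             :  */
             : 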
    2591             : /**
    2592             :  * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
    2593             :  * @cgrp: root of the subtree to update csses for
    2594             :  *
    2595             :  * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
    2596             :  * css associations need to be updated accordingly.  This function looks up
    2597             :  * all css_sets which are attached to the subtree, creates the matching
    2598             :  * updated css_sets and migrates the tasks to the new ones.
    2599             :  */
    2600           0 : static int cgroup_update_dfl_csses(struct cgroup *cgrp)
    2601             : {
    2602           0 :         LIST_HEAD(preloaded_csets);
    2603             :         struct cgroup_subsys_state *css;
    2604             :         struct css_set *src_cset;
    2605             :         int ret;
    2606             : 
    2607             :         lockdep_assert_held(&cgroup_mutex);
    2608             : 
    2609             :         /* look up all csses currently attached to @cgrp's subtree */
    2610           0 :         down_read(&css_set_rwsem);
    2611           0 :         css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
    2612             :                 struct cgrp_cset_link *link;
    2613             : 
    2614             :                 /* self is not affected by child_subsys_mask change */
    2615           0 :                 if (css->cgroup == cgrp)
    2616           0 :                         continue;
    2617             : 
    2618           0 :                 list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
    2619           0 :                         cgroup_migrate_add_src(link->cset, cgrp,
    2620             :                                                &preloaded_csets);
    2621             :         }
    2622           0 :         up_read(&css_set_rwsem);
    2623             : 
    2624             :         /* NULL dst indicates self on default hierarchy */
    2625           0 :         ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
    2626           0 :         if (ret)
    2627             :                 goto out_finish;
    2628             : 
    2629           0 :         list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
    2630             :                 struct task_struct *last_task = NULL, *task;
    2631             : 
    2632             :                 /* src_csets precede dst_csets, break on the first dst_cset */
    2633           0 :                 if (!src_cset->mg_src_cgrp)
    2634             :                         break;
    2635             : 
    2636             :                 /*
    2637             :                  * All tasks in src_cset need to be migrated to the
    2638             :                  * matching dst_cset.  Empty it process by process.  We
    2639             :                  * walk tasks but migrate processes.  The leader might even
    2640             :                  * belong to a different cset but such src_cset would also
    2641             :                  * be among the target src_csets because the default
    2642             :                  * hierarchy enforces per-process membership.
    2643             :                  */
    2644             :                 while (true) {
    2645           0 :                         down_read(&css_set_rwsem);
    2646           0 :                         task = list_first_entry_or_null(&src_cset->tasks,
    2647             :                                                 struct task_struct, cg_list);
    2648           0 :                         if (task) {
    2649           0 :                                 task = task->group_leader;
    2650             :                                 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
    2651           0 :                                 get_task_struct(task);
    2652             :                         }
    2653           0 :                         up_read(&css_set_rwsem);
    2654             : 
    2655           0 :                         if (!task)
    2656             :                                 break;
    2657             : 
    2658             :                         /* guard against possible infinite loop */
    2659           0 :                         if (WARN(last_task == task,
    2660             :                                  "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
    2661             :                                 goto out_finish;
    2662             :                         last_task = task;
    2663             : 
    2664             :                         threadgroup_lock(task);
    2665             :                         /* raced against de_thread() from another thread? */
    2666           0 :                         if (!thread_group_leader(task)) {
    2667             :                                 threadgroup_unlock(task);
    2668             :                                 put_task_struct(task);
    2669           0 :                                 continue;
    2670             :                         }
    2671             : 
    2672           0 :                         ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
    2673             : 
    2674             :                         threadgroup_unlock(task);
    2675             :                         put_task_struct(task);
    2676             : 
    2677           0 :                         if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
    2678             :                                 goto out_finish;
    2679             :                 }
    2680             :         }
    2681             : 
    2682             : out_finish:
    2683           0 :         cgroup_migrate_finish(&preloaded_csets);
    2684           0 :         return ret;
    2685             : }
    2686             : 
    2687             : /* change the enabled child controllers for a cgroup in the default hierarchy */
    2688           0 : static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
    2689             :                                             char *buf, size_t nbytes,
    2690             :                                             loff_t off)
    2691             : {
    2692             :         unsigned int enable = 0, disable = 0;
    2693             :         unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
    2694           0 :         struct cgroup *cgrp, *child;
    2695             :         struct cgroup_subsys *ss;
    2696             :         char *tok;
    2697             :         int ssid, ret;
    2698             : 
    2699             :         /*
     2700             :          * Parse input - a space-separated list of subsystem names prefixed
    2701             :          * with either + or -.
    2702             :          */
    2703           0 :         buf = strstrip(buf);
    2704           0 :         while ((tok = strsep(&buf, " "))) {
    2705           0 :                 if (tok[0] == '\0')
    2706           0 :                         continue;
    2707           0 :                 for_each_subsys(ss, ssid) {
    2708           0 :                         if (ss->disabled || strcmp(tok + 1, ss->name) ||
    2709           0 :                             ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
    2710           0 :                                 continue;
    2711             : 
    2712           0 :                         if (*tok == '+') {
    2713           0 :                                 enable |= 1 << ssid;
    2714           0 :                                 disable &= ~(1 << ssid);
    2715           0 :                         } else if (*tok == '-') {
    2716           0 :                                 disable |= 1 << ssid;
    2717           0 :                                 enable &= ~(1 << ssid);
    2718             :                         } else {
    2719             :                                 return -EINVAL;
    2720             :                         }
    2721             :                         break;
    2722             :                 }
    2723           0 :                 if (ssid == CGROUP_SUBSYS_COUNT)
    2724             :                         return -EINVAL;
    2725             :         }
    2726             : 
    2727           0 :         cgrp = cgroup_kn_lock_live(of->kn);
    2728           0 :         if (!cgrp)
    2729             :                 return -ENODEV;
    2730             : 
    2731           0 :         for_each_subsys(ss, ssid) {
    2732           0 :                 if (enable & (1 << ssid)) {
    2733           0 :                         if (cgrp->subtree_control & (1 << ssid)) {
    2734           0 :                                 enable &= ~(1 << ssid);
    2735           0 :                                 continue;
    2736             :                         }
    2737             : 
    2738             :                         /* unavailable or not enabled on the parent? */
    2739           0 :                         if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
    2740           0 :                             (cgroup_parent(cgrp) &&
    2741           0 :                              !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
    2742             :                                 ret = -ENOENT;
    2743             :                                 goto out_unlock;
    2744             :                         }
    2745           0 :                 } else if (disable & (1 << ssid)) {
    2746           0 :                         if (!(cgrp->subtree_control & (1 << ssid))) {
    2747           0 :                                 disable &= ~(1 << ssid);
    2748           0 :                                 continue;
    2749             :                         }
    2750             : 
    2751             :                         /* a child has it enabled? */
    2752           0 :                         cgroup_for_each_live_child(child, cgrp) {
    2753           0 :                                 if (child->subtree_control & (1 << ssid)) {
    2754             :                                         ret = -EBUSY;
    2755             :                                         goto out_unlock;
    2756             :                                 }
    2757             :                         }
    2758             :                 }
    2759             :         }
    2760             : 
    2761           0 :         if (!enable && !disable) {
    2762             :                 ret = 0;
    2763             :                 goto out_unlock;
    2764             :         }
    2765             : 
    2766             :         /*
    2767             :          * Except for the root, subtree_control must be zero for a cgroup
    2768             :          * with tasks so that child cgroups don't compete against tasks.
    2769             :          */
    2770           0 :         if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
    2771             :                 ret = -EBUSY;
    2772             :                 goto out_unlock;
    2773             :         }
    2774             : 
    2775             :         /*
    2776             :          * Update subsys masks and calculate what needs to be done.  More
    2777             :          * subsystems than specified may need to be enabled or disabled
    2778             :          * depending on subsystem dependencies.
    2779             :          */
    2780           0 :         old_sc = cgrp->subtree_control;
    2781           0 :         old_ss = cgrp->child_subsys_mask;
    2782           0 :         new_sc = (old_sc | enable) & ~disable;
    2783           0 :         new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
    2784             : 
    2785           0 :         css_enable = ~old_ss & new_ss;
    2786           0 :         css_disable = old_ss & ~new_ss;
    2787           0 :         enable |= css_enable;
    2788           0 :         disable |= css_disable;
    2789             : 
    2790             :         /*
    2791             :          * Because css offlining is asynchronous, userland might try to
    2792             :          * re-enable the same controller while the previous instance is
    2793             :          * still around.  In such cases, wait till it's gone using
    2794             :          * offline_waitq.
    2795             :          */
    2796           0 :         for_each_subsys(ss, ssid) {
    2797           0 :                 if (!(css_enable & (1 << ssid)))
    2798           0 :                         continue;
    2799             : 
    2800           0 :                 cgroup_for_each_live_child(child, cgrp) {
    2801           0 :                         DEFINE_WAIT(wait);
    2802             : 
    2803           0 :                         if (!cgroup_css(child, ss))
    2804           0 :                                 continue;
    2805             : 
    2806           0 :                         cgroup_get(child);
    2807           0 :                         prepare_to_wait(&child->offline_waitq, &wait,
    2808             :                                         TASK_UNINTERRUPTIBLE);
    2809           0 :                         cgroup_kn_unlock(of->kn);
    2810           0 :                         schedule();
    2811           0 :                         finish_wait(&child->offline_waitq, &wait);
    2812           0 :                         cgroup_put(child);
    2813             : 
    2814           0 :                         return restart_syscall();
    2815             :                 }
    2816             :         }
    2817             : 
    2818           0 :         cgrp->subtree_control = new_sc;
    2819           0 :         cgrp->child_subsys_mask = new_ss;
    2820             : 
    2821             :         /*
    2822             :          * Create new csses or make the existing ones visible.  A css is
    2823             :          * created invisible if it's being implicitly enabled through
    2824             :          * dependency.  An invisible css is made visible when the userland
    2825             :          * explicitly enables it.
    2826             :          */
    2827           0 :         for_each_subsys(ss, ssid) {
    2828           0 :                 if (!(enable & (1 << ssid)))
    2829           0 :                         continue;
    2830             : 
    2831           0 :                 cgroup_for_each_live_child(child, cgrp) {
    2832           0 :                         if (css_enable & (1 << ssid))
    2833           0 :                                 ret = create_css(child, ss,
    2834           0 :                                         cgrp->subtree_control & (1 << ssid));
    2835             :                         else
    2836           0 :                                 ret = cgroup_populate_dir(child, 1 << ssid);
    2837           0 :                         if (ret)
    2838             :                                 goto err_undo_css;
    2839             :                 }
    2840             :         }
    2841             : 
    2842             :         /*
    2843             :          * At this point, cgroup_e_css() results reflect the new csses
    2844             :          * making the following cgroup_update_dfl_csses() properly update
    2845             :          * css associations of all tasks in the subtree.
    2846             :          */
    2847           0 :         ret = cgroup_update_dfl_csses(cgrp);
    2848           0 :         if (ret)
    2849             :                 goto err_undo_css;
    2850             : 
    2851             :         /*
    2852             :          * All tasks are migrated out of disabled csses.  Kill or hide
    2853             :          * them.  A css is hidden when the userland requests it to be
    2854             :          * disabled while other subsystems are still depending on it.  The
    2855             :          * css must not actively control resources and be in the vanilla
    2856             :          * state if it's made visible again later.  Controllers which may
    2857             :          * be depended upon should provide ->css_reset() for this purpose.
    2858             :          */
    2859           0 :         for_each_subsys(ss, ssid) {
    2860           0 :                 if (!(disable & (1 << ssid)))
    2861           0 :                         continue;
    2862             : 
    2863           0 :                 cgroup_for_each_live_child(child, cgrp) {
    2864             :                         struct cgroup_subsys_state *css = cgroup_css(child, ss);
    2865             : 
    2866           0 :                         if (css_disable & (1 << ssid)) {
    2867           0 :                                 kill_css(css);
    2868             :                         } else {
    2869           0 :                                 cgroup_clear_dir(child, 1 << ssid);
    2870           0 :                                 if (ss->css_reset)
    2871           0 :                                         ss->css_reset(css);
    2872             :                         }
    2873             :                 }
    2874             :         }
    2875             : 
    2876             :         /*
    2877             :          * The effective csses of all the descendants (excluding @cgrp) may
    2878             :          * have changed.  Subsystems can optionally subscribe to this event
    2879             :          * by implementing ->css_e_css_changed() which is invoked if any of
    2880             :          * the effective csses seen from the css's cgroup may have changed.
    2881             :          */
    2882           0 :         for_each_subsys(ss, ssid) {
    2883             :                 struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
    2884             :                 struct cgroup_subsys_state *css;
    2885             : 
    2886           0 :                 if (!ss->css_e_css_changed || !this_css)
    2887           0 :                         continue;
    2888             : 
    2889           0 :                 css_for_each_descendant_pre(css, this_css)
    2890           0 :                         if (css != this_css)
    2891           0 :                                 ss->css_e_css_changed(css);
    2892             :         }
    2893             : 
    2894           0 :         kernfs_activate(cgrp->kn);
    2895             :         ret = 0;
    2896             : out_unlock:
    2897           0 :         cgroup_kn_unlock(of->kn);
    2898           0 :         return ret ?: nbytes;
    2899             : 
    2900             : err_undo_css:
    2901           0 :         cgrp->subtree_control = old_sc;
    2902           0 :         cgrp->child_subsys_mask = old_ss;
    2903             : 
    2904           0 :         for_each_subsys(ss, ssid) {
    2905           0 :                 if (!(enable & (1 << ssid)))
    2906           0 :                         continue;
    2907             : 
    2908           0 :                 cgroup_for_each_live_child(child, cgrp) {
    2909             :                         struct cgroup_subsys_state *css = cgroup_css(child, ss);
    2910             : 
    2911           0 :                         if (!css)
    2912           0 :                                 continue;
    2913             : 
    2914           0 :                         if (css_enable & (1 << ssid))
    2915           0 :                                 kill_css(css);
    2916             :                         else
    2917           0 :                                 cgroup_clear_dir(child, 1 << ssid);
    2918             :                 }
    2919             :         }
    2920             :         goto out_unlock;
    2921             : }
    2922             : 
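             : /*
             :  * Userland view (illustrative names, sketch): child controllers are
             :  * toggled by writing "+name"/"-name" tokens, which the strsep()
             :  * loop at the top of cgroup_subtree_control_write() parses:
             :  *
             :  *      int fd = open("/sys/fs/cgroup/mygrp/cgroup.subtree_control",
             :  *                    O_WRONLY);
             :  *      dprintf(fd, "+memory -cpu");
             :  *      close(fd);
             :  */
             : 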
    2923           0 : static int cgroup_populated_show(struct seq_file *seq, void *v)
    2924             : {
    2925           0 :         seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
    2926           0 :         return 0;
    2927             : }
    2928             : 
    2929           0 : static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
    2930             :                                  size_t nbytes, loff_t off)
    2931             : {
    2932           0 :         struct cgroup *cgrp = of->kn->parent->priv;
    2933           0 :         struct cftype *cft = of->kn->priv;
    2934             :         struct cgroup_subsys_state *css;
    2935             :         int ret;
    2936             : 
    2937           0 :         if (cft->write)
    2938           0 :                 return cft->write(of, buf, nbytes, off);
    2939             : 
    2940             :         /*
    2941             :          * kernfs guarantees that a file isn't deleted with operations in
    2942             :          * flight, which means that the matching css is and stays alive and
    2943             :          * doesn't need to be pinned.  The RCU locking is not necessary
    2944             :          * either.  It's just for the convenience of using cgroup_css().
    2945             :          */
    2946             :         rcu_read_lock();
    2947           0 :         css = cgroup_css(cgrp, cft->ss);
    2948             :         rcu_read_unlock();
    2949             : 
    2950           0 :         if (cft->write_u64) {
    2951             :                 unsigned long long v;
    2952           0 :                 ret = kstrtoull(buf, 0, &v);
    2953           0 :                 if (!ret)
    2954           0 :                         ret = cft->write_u64(css, cft, v);
    2955           0 :         } else if (cft->write_s64) {
    2956             :                 long long v;
    2957           0 :                 ret = kstrtoll(buf, 0, &v);
    2958           0 :                 if (!ret)
    2959           0 :                         ret = cft->write_s64(css, cft, v);
    2960             :         } else {
    2961             :                 ret = -EINVAL;
    2962             :         }
    2963             : 
    2964           0 :         return ret ?: nbytes;
    2965             : }
    2966             : 
    2967           0 : static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
    2968             : {
    2969           0 :         return seq_cft(seq)->seq_start(seq, ppos);
    2970             : }
    2971             : 
    2972           0 : static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
    2973             : {
    2974           0 :         return seq_cft(seq)->seq_next(seq, v, ppos);
    2975             : }
    2976             : 
    2977           0 : static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
    2978             : {
    2979           0 :         seq_cft(seq)->seq_stop(seq, v);
    2980           0 : }
    2981             : 
    2982           0 : static int cgroup_seqfile_show(struct seq_file *m, void *arg)
    2983             : {
    2984             :         struct cftype *cft = seq_cft(m);
    2985             :         struct cgroup_subsys_state *css = seq_css(m);
    2986             : 
    2987           0 :         if (cft->seq_show)
    2988           0 :                 return cft->seq_show(m, arg);
    2989             : 
    2990           0 :         if (cft->read_u64)
    2991           0 :                 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
    2992           0 :         else if (cft->read_s64)
    2993           0 :                 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
    2994             :         else
    2995             :                 return -EINVAL;
    2996             :         return 0;
    2997             : }
    2998             : 
    2999             : static struct kernfs_ops cgroup_kf_single_ops = {
    3000             :         .atomic_write_len       = PAGE_SIZE,
    3001             :         .write                  = cgroup_file_write,
    3002             :         .seq_show               = cgroup_seqfile_show,
    3003             : };
    3004             : 
    3005             : static struct kernfs_ops cgroup_kf_ops = {
    3006             :         .atomic_write_len       = PAGE_SIZE,
    3007             :         .write                  = cgroup_file_write,
    3008             :         .seq_start              = cgroup_seqfile_start,
    3009             :         .seq_next               = cgroup_seqfile_next,
    3010             :         .seq_stop               = cgroup_seqfile_stop,
    3011             :         .seq_show               = cgroup_seqfile_show,
    3012             : };
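                      : 
                      : /*
                      :  * For illustration only -- a hedged sketch, not part of cgroup.c, of a
                      :  * cftype that cgroup_kf_single_ops above would service: lacking its own
                      :  * ->write or ->seq_show, its reads route through cgroup_seqfile_show()
                      :  * to ->read_u64 and its writes through cgroup_file_write() to
                      :  * ->write_u64.  All ex_* names are hypothetical.
                      :  */
                      : static u64 ex_weight_read(struct cgroup_subsys_state *css,
                      :                           struct cftype *cft)
                      : {
                      :         return 10;      /* placeholder value */
                      : }
                      : 
                      : static int ex_weight_write(struct cgroup_subsys_state *css,
                      :                            struct cftype *cft, u64 val)
                      : {
                      :         /* 1..1000 is an assumed valid range for the example */
                      :         return (val >= 1 && val <= 1000) ? 0 : -EINVAL;
                      : }
                      : 
                      : static struct cftype ex_files[] = {
                      :         {
                      :                 .name = "example.weight",
                      :                 .read_u64 = ex_weight_read,
                      :                 .write_u64 = ex_weight_write,
                      :         },
                      :         { }     /* zero-length name terminates the array */
                      : };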
    3013             : 
    3014             : /*
    3015             :  * cgroup_rename - Only allow simple rename of directories in place.
    3016             :  */
    3017           0 : static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
    3018             :                          const char *new_name_str)
    3019             : {
    3020           0 :         struct cgroup *cgrp = kn->priv;
    3021             :         int ret;
    3022             : 
    3023           0 :         if (kernfs_type(kn) != KERNFS_DIR)
    3024             :                 return -ENOTDIR;
    3025           0 :         if (kn->parent != new_parent)
    3026             :                 return -EIO;
    3027             : 
    3028             :         /*
    3029             :          * This isn't a proper migration and its usefulness is very
    3030             :          * limited.  Disallow on the default hierarchy.
    3031             :          */
    3032           0 :         if (cgroup_on_dfl(cgrp))
    3033             :                 return -EPERM;
    3034             : 
    3035             :         /*
    3036             :          * We're gonna grab cgroup_mutex which nests outside kernfs
    3037             :          * active_ref.  kernfs_rename() doesn't require active_ref
    3038             :          * protection.  Break them before grabbing cgroup_mutex.
    3039             :          */
    3040           0 :         kernfs_break_active_protection(new_parent);
    3041           0 :         kernfs_break_active_protection(kn);
    3042             : 
    3043           0 :         mutex_lock(&cgroup_mutex);
    3044             : 
    3045             :         ret = kernfs_rename(kn, new_parent, new_name_str);
    3046             : 
    3047           0 :         mutex_unlock(&cgroup_mutex);
    3048             : 
    3049           0 :         kernfs_unbreak_active_protection(kn);
    3050           0 :         kernfs_unbreak_active_protection(new_parent);
    3051           0 :         return ret;
    3052             : }
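                      : 
                      : /*
                      :  * A sketch of the userspace-visible behavior implied above (the paths
                      :  * are hypothetical): on a legacy hierarchy only an in-place rename
                      :  * succeeds,
                      :  *
                      :  *      mv /sys/fs/cgroup/cpu/foo /sys/fs/cgroup/cpu/bar      succeeds
                      :  *      mv /sys/fs/cgroup/cpu/a/foo /sys/fs/cgroup/cpu/b/foo  fails, -EIO
                      :  *
                      :  * and any rename of a cgroup on the default hierarchy fails, -EPERM.
                      :  */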
    3053             : 
    3054             : /* set uid and gid of cgroup dirs and files to that of the creator */
    3055           3 : static int cgroup_kn_set_ugid(struct kernfs_node *kn)
    3056             : {
    3057           6 :         struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
    3058           3 :                                .ia_uid = current_fsuid(),
    3059             :                                .ia_gid = current_fsgid(), };
    3060             : 
    3061           3 :         if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
    3062             :             gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
    3063             :                 return 0;
    3064             : 
    3065           0 :         return kernfs_setattr(kn, &iattr);
    3066             : }
    3067             : 
    3068           3 : static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
    3069             : {
    3070             :         char name[CGROUP_FILE_NAME_MAX];
    3071             :         struct kernfs_node *kn;
    3072             :         struct lock_class_key *key = NULL;
    3073             :         int ret;
    3074             : 
    3075             : #ifdef CONFIG_DEBUG_LOCK_ALLOC
    3076             :         key = &cft->lockdep_key;
    3077             : #endif
    3078           6 :         kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
    3079           3 :                                   cgroup_file_mode(cft), 0, cft->kf_ops, cft,
    3080             :                                   NULL, false, key);
    3081           3 :         if (IS_ERR(kn))
    3082           0 :                 return PTR_ERR(kn);
    3083             : 
    3084           3 :         ret = cgroup_kn_set_ugid(kn);
    3085           3 :         if (ret) {
    3086           0 :                 kernfs_remove(kn);
    3087           0 :                 return ret;
    3088             :         }
    3089             : 
    3090           3 :         if (cft->seq_show == cgroup_populated_show)
    3091           0 :                 cgrp->populated_kn = kn;
    3092             :         return 0;
    3093             : }
    3094             : 
    3095             : /**
     3096             :  * cgroup_addrm_files - add or remove files in a cgroup directory
    3097             :  * @cgrp: the target cgroup
    3098             :  * @cfts: array of cftypes to be added
    3099             :  * @is_add: whether to add or remove
    3100             :  *
    3101             :  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
    3102             :  * For removals, this function never fails.  If addition fails, this
    3103             :  * function doesn't remove files already added.  The caller is responsible
    3104             :  * for cleaning up.
    3105             :  */
    3106          52 : static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
    3107             :                               bool is_add)
    3108             : {
    3109             :         struct cftype *cft;
    3110             :         int ret;
    3111             : 
    3112             :         lockdep_assert_held(&cgroup_mutex);
    3113             : 
    3114          54 :         for (cft = cfts; cft->name[0] != '\0'; cft++) {
    3115             :                 /* does cft->flags tell us to skip this file on @cgrp? */
    3116          45 :                 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
    3117           0 :                         continue;
    3118          85 :                 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
    3119          40 :                         continue;
    3120           7 :                 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
    3121           2 :                         continue;
    3122           4 :                 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
    3123           0 :                         continue;
    3124             : 
    3125           3 :                 if (is_add) {
    3126           3 :                         ret = cgroup_add_file(cgrp, cft);
    3127           3 :                         if (ret) {
    3128           0 :                                 pr_warn("%s: failed to add %s, err=%d\n",
    3129             :                                         __func__, cft->name, ret);
    3130           0 :                                 return ret;
    3131             :                         }
    3132             :                 } else {
    3133           0 :                         cgroup_rm_file(cgrp, cft);
    3134             :                 }
    3135             :         }
    3136             :         return 0;
    3137             : }
    3138             : 
    3139           8 : static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
    3140             : {
    3141           8 :         LIST_HEAD(pending);
    3142           8 :         struct cgroup_subsys *ss = cfts[0].ss;
    3143           8 :         struct cgroup *root = &ss->root->cgrp;
    3144             :         struct cgroup_subsys_state *css;
    3145             :         int ret = 0;
    3146             : 
    3147             :         lockdep_assert_held(&cgroup_mutex);
    3148             : 
    3149             :         /* add/rm files for all cgroups created before */
    3150          16 :         css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
    3151           8 :                 struct cgroup *cgrp = css->cgroup;
    3152             : 
    3153           8 :                 if (cgroup_is_dead(cgrp))
    3154           0 :                         continue;
    3155             : 
    3156           8 :                 ret = cgroup_addrm_files(cgrp, cfts, is_add);
    3157           8 :                 if (ret)
    3158             :                         break;
    3159             :         }
    3160             : 
    3161           8 :         if (is_add && !ret)
    3162           8 :                 kernfs_activate(root->kn);
    3163           8 :         return ret;
    3164             : }
    3165             : 
    3166           0 : static void cgroup_exit_cftypes(struct cftype *cfts)
    3167             : {
    3168             :         struct cftype *cft;
    3169             : 
    3170           0 :         for (cft = cfts; cft->name[0] != '\0'; cft++) {
    3171             :                 /* free copy for custom atomic_write_len, see init_cftypes() */
    3172           0 :                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
    3173           0 :                         kfree(cft->kf_ops);
    3174           0 :                 cft->kf_ops = NULL;
    3175           0 :                 cft->ss = NULL;
    3176             : 
    3177             :                 /* revert flags set by cgroup core while adding @cfts */
    3178           0 :                 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
    3179             :         }
    3180           0 : }
    3181             : 
    3182          10 : static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
    3183             : {
    3184             :         struct cftype *cft;
    3185             : 
    3186          61 :         for (cft = cfts; cft->name[0] != '\0'; cft++) {
    3187             :                 struct kernfs_ops *kf_ops;
    3188             : 
    3189             :                 WARN_ON(cft->ss || cft->kf_ops);
    3190             : 
    3191          51 :                 if (cft->seq_start)
    3192             :                         kf_ops = &cgroup_kf_ops;
    3193             :                 else
    3194             :                         kf_ops = &cgroup_kf_single_ops;
    3195             : 
    3196             :                 /*
    3197             :                  * Ugh... if @cft wants a custom max_write_len, we need to
    3198             :                  * make a copy of kf_ops to set its atomic_write_len.
    3199             :                  */
    3200          51 :                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
    3201           1 :                         kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
    3202           1 :                         if (!kf_ops) {
    3203           0 :                                 cgroup_exit_cftypes(cfts);
    3204           0 :                                 return -ENOMEM;
    3205             :                         }
    3206           1 :                         kf_ops->atomic_write_len = cft->max_write_len;
    3207             :                 }
    3208             : 
    3209          51 :                 cft->kf_ops = kf_ops;
    3210          51 :                 cft->ss = ss;
    3211             :         }
    3212             : 
    3213             :         return 0;
    3214             : }
    3215             : 
    3216           0 : static int cgroup_rm_cftypes_locked(struct cftype *cfts)
    3217             : {
    3218             :         lockdep_assert_held(&cgroup_mutex);
    3219             : 
    3220           0 :         if (!cfts || !cfts[0].ss)
    3221             :                 return -ENOENT;
    3222             : 
    3223             :         list_del(&cfts->node);
    3224           0 :         cgroup_apply_cftypes(cfts, false);
    3225           0 :         cgroup_exit_cftypes(cfts);
    3226           0 :         return 0;
    3227             : }
    3228             : 
    3229             : /**
    3230             :  * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
    3231             :  * @cfts: zero-length name terminated array of cftypes
    3232             :  *
    3233             :  * Unregister @cfts.  Files described by @cfts are removed from all
    3234             :  * existing cgroups and all future cgroups won't have them either.  This
    3235             :  * function can be called anytime whether @cfts' subsys is attached or not.
    3236             :  *
    3237             :  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
    3238             :  * registered.
    3239             :  */
    3240           0 : int cgroup_rm_cftypes(struct cftype *cfts)
    3241             : {
    3242             :         int ret;
    3243             : 
    3244           0 :         mutex_lock(&cgroup_mutex);
    3245           0 :         ret = cgroup_rm_cftypes_locked(cfts);
    3246           0 :         mutex_unlock(&cgroup_mutex);
    3247           0 :         return ret;
    3248             : }
    3249             : 
    3250             : /**
    3251             :  * cgroup_add_cftypes - add an array of cftypes to a subsystem
    3252             :  * @ss: target cgroup subsystem
    3253             :  * @cfts: zero-length name terminated array of cftypes
    3254             :  *
    3255             :  * Register @cfts to @ss.  Files described by @cfts are created for all
    3256             :  * existing cgroups to which @ss is attached and all future cgroups will
    3257             :  * have them too.  This function can be called anytime whether @ss is
    3258             :  * attached or not.
    3259             :  *
    3260             :  * Returns 0 on successful registration, -errno on failure.  Note that this
    3261             :  * function currently returns 0 as long as @cfts registration is successful
    3262             :  * even if some file creation attempts on existing cgroups fail.
    3263             :  */
    3264          14 : static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
    3265             : {
    3266             :         int ret;
    3267             : 
    3268          14 :         if (ss->disabled)
    3269             :                 return 0;
    3270             : 
    3271          14 :         if (!cfts || cfts[0].name[0] == '\0')
    3272             :                 return 0;
    3273             : 
    3274           8 :         ret = cgroup_init_cftypes(ss, cfts);
    3275           8 :         if (ret)
    3276             :                 return ret;
    3277             : 
    3278           8 :         mutex_lock(&cgroup_mutex);
    3279             : 
    3280           8 :         list_add_tail(&cfts->node, &ss->cfts);
    3281           8 :         ret = cgroup_apply_cftypes(cfts, true);
    3282           8 :         if (ret)
    3283           0 :                 cgroup_rm_cftypes_locked(cfts);
    3284             : 
    3285           8 :         mutex_unlock(&cgroup_mutex);
    3286           8 :         return ret;
    3287             : }
    3288             : 
    3289             : /**
    3290             :  * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
    3291             :  * @ss: target cgroup subsystem
    3292             :  * @cfts: zero-length name terminated array of cftypes
    3293             :  *
    3294             :  * Similar to cgroup_add_cftypes() but the added files are only used for
    3295             :  * the default hierarchy.
    3296             :  */
    3297           6 : int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
    3298             : {
    3299             :         struct cftype *cft;
    3300             : 
    3301           6 :         for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
    3302           0 :                 cft->flags |= __CFTYPE_ONLY_ON_DFL;
    3303           6 :         return cgroup_add_cftypes(ss, cfts);
    3304             : }
    3305             : 
    3306             : /**
    3307             :  * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
    3308             :  * @ss: target cgroup subsystem
    3309             :  * @cfts: zero-length name terminated array of cftypes
    3310             :  *
    3311             :  * Similar to cgroup_add_cftypes() but the added files are only used for
    3312             :  * the legacy hierarchies.
    3313             :  */
    3314           8 : int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
    3315             : {
    3316             :         struct cftype *cft;
    3317             : 
    3318             :         /*
     3319             :          * If cgroup_legacy_files_on_dfl, we want to show the legacy
     3320             :          * files on the dfl hierarchy, but only if the target subsystem
     3321             :          * hasn't been updated for the dfl hierarchy yet.
    3322             :          */
    3323           8 :         if (!cgroup_legacy_files_on_dfl ||
    3324           0 :             ss->dfl_cftypes != ss->legacy_cftypes) {
    3325          40 :                 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
    3326          40 :                         cft->flags |= __CFTYPE_NOT_ON_DFL;
    3327             :         }
    3328             : 
    3329           8 :         return cgroup_add_cftypes(ss, cfts);
    3330             : }
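                      : 
                      : /*
                      :  * A hedged usage note: a controller would typically hand an array like
                      :  * ex_files (sketched earlier) to one of the helpers above during init,
                      :  * e.g. cgroup_add_legacy_cftypes(&ex_cgrp_subsys, ex_files) for files
                      :  * meant only for legacy hierarchies; ex_cgrp_subsys is an assumed
                      :  * subsystem.  Per the comment on cgroup_add_cftypes(), a zero return
                      :  * means registration succeeded, not that every per-cgroup file was
                      :  * created.
                      :  */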
    3331             : 
    3332             : /**
    3333             :  * cgroup_task_count - count the number of tasks in a cgroup.
    3334             :  * @cgrp: the cgroup in question
    3335             :  *
    3336             :  * Return the number of tasks in the cgroup.
    3337             :  */
    3338           0 : static int cgroup_task_count(const struct cgroup *cgrp)
    3339             : {
    3340             :         int count = 0;
    3341             :         struct cgrp_cset_link *link;
    3342             : 
    3343           0 :         down_read(&css_set_rwsem);
    3344           0 :         list_for_each_entry(link, &cgrp->cset_links, cset_link)
    3345           0 :                 count += atomic_read(&link->cset->refcount);
    3346           0 :         up_read(&css_set_rwsem);
    3347           0 :         return count;
    3348             : }
    3349             : 
    3350             : /**
    3351             :  * css_next_child - find the next child of a given css
    3352             :  * @pos: the current position (%NULL to initiate traversal)
    3353             :  * @parent: css whose children to walk
    3354             :  *
    3355             :  * This function returns the next child of @parent and should be called
    3356             :  * under either cgroup_mutex or RCU read lock.  The only requirement is
    3357             :  * that @parent and @pos are accessible.  The next sibling is guaranteed to
    3358             :  * be returned regardless of their states.
    3359             :  *
    3360             :  * If a subsystem synchronizes ->css_online() and the start of iteration, a
    3361             :  * css which finished ->css_online() is guaranteed to be visible in the
    3362             :  * future iterations and will stay visible until the last reference is put.
    3363             :  * A css which hasn't finished ->css_online() or already finished
    3364             :  * ->css_offline() may show up during traversal.  It's each subsystem's
    3365             :  * responsibility to synchronize against on/offlining.
    3366             :  */
    3367           8 : struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
    3368             :                                            struct cgroup_subsys_state *parent)
    3369             : {
    3370             :         struct cgroup_subsys_state *next;
    3371             : 
    3372             :         cgroup_assert_mutex_or_rcu_locked();
    3373             : 
    3374             :         /*
    3375             :          * @pos could already have been unlinked from the sibling list.
    3376             :          * Once a cgroup is removed, its ->sibling.next is no longer
    3377             :          * updated when its next sibling changes.  CSS_RELEASED is set when
    3378             :          * @pos is taken off list, at which time its next pointer is valid,
    3379             :          * and, as releases are serialized, the one pointed to by the next
    3380             :          * pointer is guaranteed to not have started release yet.  This
    3381             :          * implies that if we observe !CSS_RELEASED on @pos in this RCU
    3382             :          * critical section, the one pointed to by its next pointer is
    3383             :          * guaranteed to not have finished its RCU grace period even if we
     3384             :          * have dropped rcu_read_lock() in between iterations.
    3385             :          *
    3386             :          * If @pos has CSS_RELEASED set, its next pointer can't be
    3387             :          * dereferenced; however, as each css is given a monotonically
    3388             :          * increasing unique serial number and always appended to the
    3389             :          * sibling list, the next one can be found by walking the parent's
    3390             :          * children until the first css with higher serial number than
    3391             :          * @pos's.  While this path can be slower, it happens iff iteration
    3392             :          * races against release and the race window is very small.
    3393             :          */
    3394           8 :         if (!pos) {
    3395           8 :                 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
    3396           0 :         } else if (likely(!(pos->flags & CSS_RELEASED))) {
    3397           0 :                 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
    3398             :         } else {
    3399           0 :                 list_for_each_entry_rcu(next, &parent->children, sibling)
    3400           0 :                         if (next->serial_nr > pos->serial_nr)
    3401             :                                 break;
    3402             :         }
    3403             : 
    3404             :         /*
    3405             :          * @next, if not pointing to the head, can be dereferenced and is
    3406             :          * the next sibling.
    3407             :          */
    3408           8 :         if (&next->sibling != &parent->children)
    3409           0 :                 return next;
    3410             :         return NULL;
    3411             : }
    3412             : 
    3413             : /**
    3414             :  * css_next_descendant_pre - find the next descendant for pre-order walk
    3415             :  * @pos: the current position (%NULL to initiate traversal)
    3416             :  * @root: css whose descendants to walk
    3417             :  *
    3418             :  * To be used by css_for_each_descendant_pre().  Find the next descendant
    3419             :  * to visit for pre-order traversal of @root's descendants.  @root is
    3420             :  * included in the iteration and the first node to be visited.
    3421             :  *
    3422             :  * While this function requires cgroup_mutex or RCU read locking, it
    3423             :  * doesn't require the whole traversal to be contained in a single critical
    3424             :  * section.  This function will return the correct next descendant as long
    3425             :  * as both @pos and @root are accessible and @pos is a descendant of @root.
    3426             :  *
    3427             :  * If a subsystem synchronizes ->css_online() and the start of iteration, a
    3428             :  * css which finished ->css_online() is guaranteed to be visible in the
    3429             :  * future iterations and will stay visible until the last reference is put.
    3430             :  * A css which hasn't finished ->css_online() or already finished
    3431             :  * ->css_offline() may show up during traversal.  It's each subsystem's
    3432             :  * responsibility to synchronize against on/offlining.
    3433             :  */
    3434             : struct cgroup_subsys_state *
    3435          16 : css_next_descendant_pre(struct cgroup_subsys_state *pos,
    3436             :                         struct cgroup_subsys_state *root)
    3437             : {
    3438             :         struct cgroup_subsys_state *next;
    3439             : 
    3440             :         cgroup_assert_mutex_or_rcu_locked();
    3441             : 
    3442             :         /* if first iteration, visit @root */
    3443          16 :         if (!pos)
    3444             :                 return root;
    3445             : 
    3446             :         /* visit the first child if exists */
    3447           8 :         next = css_next_child(NULL, pos);
    3448           8 :         if (next)
    3449             :                 return next;
    3450             : 
    3451             :         /* no child, visit my or the closest ancestor's next sibling */
    3452           8 :         while (pos != root) {
    3453           0 :                 next = css_next_child(pos, pos->parent);
    3454           0 :                 if (next)
    3455             :                         return next;
    3456           0 :                 pos = pos->parent;
    3457             :         }
    3458             : 
    3459             :         return NULL;
    3460             : }
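                      : 
                      : /*
                      :  * A hedged usage sketch, not part of cgroup.c: a pre-order subtree walk
                      :  * under RCU via the css_for_each_descendant_pre() wrapper around the
                      :  * function above.  ex_walk_subtree() and its pr_info() payload are
                      :  * hypothetical.
                      :  */
                      : static void ex_walk_subtree(struct cgroup_subsys_state *root_css)
                      : {
                      :         struct cgroup_subsys_state *pos;
                      : 
                      :         rcu_read_lock();
                      :         css_for_each_descendant_pre(pos, root_css) {
                      :                 /* @root_css itself is the first position visited */
                      :                 pr_info("visiting cgroup id %d\n", pos->cgroup->id);
                      :         }
                      :         rcu_read_unlock();
                      : }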
    3461             : 
    3462             : /**
    3463             :  * css_rightmost_descendant - return the rightmost descendant of a css
    3464             :  * @pos: css of interest
    3465             :  *
    3466             :  * Return the rightmost descendant of @pos.  If there's no descendant, @pos
    3467             :  * is returned.  This can be used during pre-order traversal to skip
    3468             :  * subtree of @pos.
    3469             :  *
    3470             :  * While this function requires cgroup_mutex or RCU read locking, it
    3471             :  * doesn't require the whole traversal to be contained in a single critical
    3472             :  * section.  This function will return the correct rightmost descendant as
    3473             :  * long as @pos is accessible.
    3474             :  */
    3475             : struct cgroup_subsys_state *
    3476           0 : css_rightmost_descendant(struct cgroup_subsys_state *pos)
    3477             : {
    3478             :         struct cgroup_subsys_state *last, *tmp;
    3479             : 
    3480             :         cgroup_assert_mutex_or_rcu_locked();
    3481             : 
    3482             :         do {
    3483             :                 last = pos;
    3484             :                 /* ->prev isn't RCU safe, walk ->next till the end */
    3485             :                 pos = NULL;
    3486           0 :                 css_for_each_child(tmp, last)
    3487             :                         pos = tmp;
    3488           0 :         } while (pos);
    3489             : 
    3490           0 :         return last;
    3491             : }
    3492             : 
    3493             : static struct cgroup_subsys_state *
    3494             : css_leftmost_descendant(struct cgroup_subsys_state *pos)
    3495             : {
    3496             :         struct cgroup_subsys_state *last;
    3497             : 
    3498             :         do {
    3499             :                 last = pos;
    3500           0 :                 pos = css_next_child(NULL, pos);
    3501           0 :         } while (pos);
    3502             : 
    3503             :         return last;
    3504             : }
    3505             : 
    3506             : /**
    3507             :  * css_next_descendant_post - find the next descendant for post-order walk
    3508             :  * @pos: the current position (%NULL to initiate traversal)
    3509             :  * @root: css whose descendants to walk
    3510             :  *
    3511             :  * To be used by css_for_each_descendant_post().  Find the next descendant
    3512             :  * to visit for post-order traversal of @root's descendants.  @root is
    3513             :  * included in the iteration and the last node to be visited.
    3514             :  *
    3515             :  * While this function requires cgroup_mutex or RCU read locking, it
    3516             :  * doesn't require the whole traversal to be contained in a single critical
    3517             :  * section.  This function will return the correct next descendant as long
     3518             :  * as both @pos and @root are accessible and @pos is a descendant of
     3519             :  * @root.
    3520             :  *
    3521             :  * If a subsystem synchronizes ->css_online() and the start of iteration, a
    3522             :  * css which finished ->css_online() is guaranteed to be visible in the
    3523             :  * future iterations and will stay visible until the last reference is put.
    3524             :  * A css which hasn't finished ->css_online() or already finished
    3525             :  * ->css_offline() may show up during traversal.  It's each subsystem's
    3526             :  * responsibility to synchronize against on/offlining.
    3527             :  */
    3528             : struct cgroup_subsys_state *
    3529           0 : css_next_descendant_post(struct cgroup_subsys_state *pos,
    3530             :                          struct cgroup_subsys_state *root)
    3531             : {
    3532             :         struct cgroup_subsys_state *next;
    3533             : 
    3534             :         cgroup_assert_mutex_or_rcu_locked();
    3535             : 
    3536             :         /* if first iteration, visit leftmost descendant which may be @root */
    3537           0 :         if (!pos)
    3538             :                 return css_leftmost_descendant(root);
    3539             : 
    3540             :         /* if we visited @root, we're done */
    3541           0 :         if (pos == root)
    3542             :                 return NULL;
    3543             : 
    3544             :         /* if there's an unvisited sibling, visit its leftmost descendant */
    3545           0 :         next = css_next_child(pos, pos->parent);
    3546           0 :         if (next)
    3547             :                 return css_leftmost_descendant(next);
    3548             : 
    3549             :         /* no sibling left, visit parent */
    3550           0 :         return pos->parent;
    3551             : }
    3552             : 
    3553             : /**
    3554             :  * css_has_online_children - does a css have online children
    3555             :  * @css: the target css
    3556             :  *
    3557             :  * Returns %true if @css has any online children; otherwise, %false.  This
    3558             :  * function can be called from any context but the caller is responsible
    3559             :  * for synchronizing against on/offlining as necessary.
    3560             :  */
    3561           0 : bool css_has_online_children(struct cgroup_subsys_state *css)
    3562             : {
    3563             :         struct cgroup_subsys_state *child;
    3564             :         bool ret = false;
    3565             : 
    3566             :         rcu_read_lock();
    3567           0 :         css_for_each_child(child, css) {
    3568           0 :                 if (child->flags & CSS_ONLINE) {
    3569             :                         ret = true;
    3570             :                         break;
    3571             :                 }
    3572             :         }
    3573             :         rcu_read_unlock();
    3574           0 :         return ret;
    3575             : }
    3576             : 
    3577             : /**
     3578             :  * css_advance_task_iter - advance a task iterator to the next css_set
    3579             :  * @it: the iterator to advance
    3580             :  *
    3581             :  * Advance @it to the next css_set to walk.
    3582             :  */
    3583           0 : static void css_advance_task_iter(struct css_task_iter *it)
    3584             : {
    3585           0 :         struct list_head *l = it->cset_pos;
    3586             :         struct cgrp_cset_link *link;
    3587             :         struct css_set *cset;
    3588             : 
    3589             :         /* Advance to the next non-empty css_set */
    3590             :         do {
    3591           0 :                 l = l->next;
    3592           0 :                 if (l == it->cset_head) {
    3593           0 :                         it->cset_pos = NULL;
    3594           0 :                         return;
    3595             :                 }
    3596             : 
    3597           0 :                 if (it->ss) {
    3598           0 :                         cset = container_of(l, struct css_set,
    3599             :                                             e_cset_node[it->ss->id]);
    3600             :                 } else {
    3601             :                         link = list_entry(l, struct cgrp_cset_link, cset_link);
    3602           0 :                         cset = link->cset;
    3603             :                 }
    3604           0 :         } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
    3605             : 
    3606           0 :         it->cset_pos = l;
    3607             : 
    3608           0 :         if (!list_empty(&cset->tasks))
    3609           0 :                 it->task_pos = cset->tasks.next;
    3610             :         else
    3611           0 :                 it->task_pos = cset->mg_tasks.next;
    3612             : 
    3613           0 :         it->tasks_head = &cset->tasks;
    3614           0 :         it->mg_tasks_head = &cset->mg_tasks;
    3615             : }
    3616             : 
    3617             : /**
    3618             :  * css_task_iter_start - initiate task iteration
    3619             :  * @css: the css to walk tasks of
    3620             :  * @it: the task iterator to use
    3621             :  *
    3622             :  * Initiate iteration through the tasks of @css.  The caller can call
    3623             :  * css_task_iter_next() to walk through the tasks until the function
    3624             :  * returns NULL.  On completion of iteration, css_task_iter_end() must be
    3625             :  * called.
    3626             :  *
    3627             :  * Note that this function acquires a lock which is released when the
    3628             :  * iteration finishes.  The caller can't sleep while iteration is in
    3629             :  * progress.
    3630             :  */
    3631           0 : void css_task_iter_start(struct cgroup_subsys_state *css,
    3632             :                          struct css_task_iter *it)
    3633             :         __acquires(css_set_rwsem)
    3634             : {
    3635             :         /* no one should try to iterate before mounting cgroups */
    3636             :         WARN_ON_ONCE(!use_task_css_set_links);
    3637             : 
    3638           0 :         down_read(&css_set_rwsem);
    3639             : 
    3640           0 :         it->ss = css->ss;
    3641             : 
    3642           0 :         if (it->ss)
    3643           0 :                 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
    3644             :         else
    3645           0 :                 it->cset_pos = &css->cgroup->cset_links;
    3646             : 
    3647           0 :         it->cset_head = it->cset_pos;
    3648             : 
    3649           0 :         css_advance_task_iter(it);
    3650           0 : }
    3651             : 
    3652             : /**
    3653             :  * css_task_iter_next - return the next task for the iterator
    3654             :  * @it: the task iterator being iterated
    3655             :  *
    3656             :  * The "next" function for task iteration.  @it should have been
    3657             :  * initialized via css_task_iter_start().  Returns NULL when the iteration
    3658             :  * reaches the end.
    3659             :  */
    3660           0 : struct task_struct *css_task_iter_next(struct css_task_iter *it)
    3661             : {
    3662             :         struct task_struct *res;
    3663           0 :         struct list_head *l = it->task_pos;
    3664             : 
     3665             :         /* If the iterator's cset position is NULL, we have no tasks */
    3666           0 :         if (!it->cset_pos)
    3667             :                 return NULL;
    3668             :         res = list_entry(l, struct task_struct, cg_list);
    3669             : 
    3670             :         /*
    3671             :          * Advance iterator to find next entry.  cset->tasks is consumed
    3672             :          * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
    3673             :          * next cset.
    3674             :          */
    3675           0 :         l = l->next;
    3676             : 
    3677           0 :         if (l == it->tasks_head)
    3678           0 :                 l = it->mg_tasks_head->next;
    3679             : 
    3680           0 :         if (l == it->mg_tasks_head)
    3681           0 :                 css_advance_task_iter(it);
    3682             :         else
    3683           0 :                 it->task_pos = l;
    3684             : 
    3685           0 :         return res;
    3686             : }
    3687             : 
    3688             : /**
    3689             :  * css_task_iter_end - finish task iteration
    3690             :  * @it: the task iterator to finish
    3691             :  *
    3692             :  * Finish task iteration started by css_task_iter_start().
    3693             :  */
    3694           0 : void css_task_iter_end(struct css_task_iter *it)
    3695             :         __releases(css_set_rwsem)
    3696             : {
    3697           0 :         up_read(&css_set_rwsem);
    3698           0 : }
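                      : 
                      : /*
                      :  * A hedged sketch, not part of cgroup.c, tying the three iterator calls
                      :  * above together: count the tasks of a css.  css_set_rwsem stays
                      :  * read-held from start to end, so the loop body must not sleep.
                      :  */
                      : static int ex_count_tasks(struct cgroup_subsys_state *css)
                      : {
                      :         struct css_task_iter it;
                      :         struct task_struct *task;
                      :         int n = 0;
                      : 
                      :         css_task_iter_start(css, &it);
                      :         while ((task = css_task_iter_next(&it)))
                      :                 n++;
                      :         css_task_iter_end(&it);
                      :         return n;
                      : }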
    3699             : 
    3700             : /**
     3701             :  * cgroup_transfer_tasks - move tasks from one cgroup to another
    3702             :  * @to: cgroup to which the tasks will be moved
    3703             :  * @from: cgroup in which the tasks currently reside
    3704             :  *
    3705             :  * Locking rules between cgroup_post_fork() and the migration path
    3706             :  * guarantee that, if a task is forking while being migrated, the new child
    3707             :  * is guaranteed to be either visible in the source cgroup after the
    3708             :  * parent's migration is complete or put into the target cgroup.  No task
    3709             :  * can slip out of migration through forking.
    3710             :  */
    3711           0 : int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
    3712             : {
    3713           0 :         LIST_HEAD(preloaded_csets);
    3714             :         struct cgrp_cset_link *link;
    3715             :         struct css_task_iter it;
    3716             :         struct task_struct *task;
    3717             :         int ret;
    3718             : 
    3719           0 :         mutex_lock(&cgroup_mutex);
    3720             : 
    3721             :         /* all tasks in @from are being moved, all csets are source */
    3722           0 :         down_read(&css_set_rwsem);
    3723           0 :         list_for_each_entry(link, &from->cset_links, cset_link)
    3724           0 :                 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
    3725           0 :         up_read(&css_set_rwsem);
    3726             : 
    3727           0 :         ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
    3728           0 :         if (ret)
    3729             :                 goto out_err;
    3730             : 
    3731             :         /*
     3732             :          * Migrate tasks one-by-one until @from is empty.  This fails iff
    3733             :          * ->can_attach() fails.
    3734             :          */
    3735             :         do {
    3736           0 :                 css_task_iter_start(&from->self, &it);
    3737           0 :                 task = css_task_iter_next(&it);
    3738           0 :                 if (task)
    3739           0 :                         get_task_struct(task);
    3740             :                 css_task_iter_end(&it);
    3741             : 
    3742           0 :                 if (task) {
    3743           0 :                         ret = cgroup_migrate(to, task, false);
    3744             :                         put_task_struct(task);
    3745             :                 }
    3746           0 :         } while (task && !ret);
    3747             : out_err:
    3748           0 :         cgroup_migrate_finish(&preloaded_csets);
    3749           0 :         mutex_unlock(&cgroup_mutex);
    3750           0 :         return ret;
    3751             : }
    3752             : 
    3753             : /*
    3754             :  * Stuff for reading the 'tasks'/'procs' files.
    3755             :  *
    3756             :  * Reading this file can return large amounts of data if a cgroup has
    3757             :  * *lots* of attached tasks. So it may need several calls to read(),
    3758             :  * but we cannot guarantee that the information we produce is correct
    3759             :  * unless we produce it entirely atomically.
    3760             :  *
    3761             :  */
    3762             : 
    3763             : /* which pidlist file are we talking about? */
    3764             : enum cgroup_filetype {
    3765             :         CGROUP_FILE_PROCS,
    3766             :         CGROUP_FILE_TASKS,
    3767             : };
    3768             : 
    3769             : /*
    3770             :  * A pidlist is a list of pids that virtually represents the contents of one
    3771             :  * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
    3772             :  * a pair (one each for procs, tasks) for each pid namespace that's relevant
    3773             :  * to the cgroup.
    3774             :  */
    3775             : struct cgroup_pidlist {
    3776             :         /*
    3777             :          * used to find which pidlist is wanted. doesn't change as long as
    3778             :          * this particular list stays in the list.
     3779             :          */
    3780             :         struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
    3781             :         /* array of xids */
    3782             :         pid_t *list;
    3783             :         /* how many elements the above list has */
    3784             :         int length;
    3785             :         /* each of these stored in a list by its cgroup */
    3786             :         struct list_head links;
    3787             :         /* pointer to the cgroup we belong to, for list removal purposes */
    3788             :         struct cgroup *owner;
    3789             :         /* for delayed destruction */
    3790             :         struct delayed_work destroy_dwork;
    3791             : };
    3792             : 
    3793             : /*
    3794             :  * The following two functions "fix" the issue where there are more pids
    3795             :  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
    3796             :  * TODO: replace with a kernel-wide solution to this problem
    3797             :  */
    3798             : #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
    3799           0 : static void *pidlist_allocate(int count)
    3800             : {
    3801           0 :         if (PIDLIST_TOO_LARGE(count))
    3802           0 :                 return vmalloc(count * sizeof(pid_t));
    3803             :         else
    3804           0 :                 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
    3805             : }
    3806             : 
    3807           0 : static void pidlist_free(void *p)
    3808             : {
    3809           0 :         if (is_vmalloc_addr(p))
    3810           0 :                 vfree(p);
    3811             :         else
    3812           0 :                 kfree(p);
    3813           0 : }
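                      : 
                      : /*
                      :  * Concretely: with 4 KiB pages and a 4-byte pid_t, PIDLIST_TOO_LARGE()
                      :  * trips once a list exceeds 2048 entries, so larger pidlists fall back
                      :  * to vmalloc(); the exact cutoff scales with PAGE_SIZE.
                      :  */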
    3814             : 
    3815             : /*
    3816             :  * Used to destroy all pidlists lingering waiting for destroy timer.  None
    3817             :  * should be left afterwards.
    3818             :  */
    3819           0 : static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
    3820             : {
    3821             :         struct cgroup_pidlist *l, *tmp_l;
    3822             : 
    3823           0 :         mutex_lock(&cgrp->pidlist_mutex);
    3824           0 :         list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
    3825           0 :                 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
    3826           0 :         mutex_unlock(&cgrp->pidlist_mutex);
    3827             : 
    3828           0 :         flush_workqueue(cgroup_pidlist_destroy_wq);
    3829             :         BUG_ON(!list_empty(&cgrp->pidlists));
    3830           0 : }
    3831             : 
    3832           0 : static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
    3833             : {
    3834             :         struct delayed_work *dwork = to_delayed_work(work);
    3835           0 :         struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
    3836             :                                                 destroy_dwork);
    3837             :         struct cgroup_pidlist *tofree = NULL;
    3838             : 
    3839           0 :         mutex_lock(&l->owner->pidlist_mutex);
    3840             : 
    3841             :         /*
    3842             :          * Destroy iff we didn't get queued again.  The state won't change
    3843             :          * as destroy_dwork can only be queued while locked.
    3844             :          */
    3845           0 :         if (!delayed_work_pending(dwork)) {
    3846             :                 list_del(&l->links);
    3847           0 :                 pidlist_free(l->list);
    3848           0 :                 put_pid_ns(l->key.ns);
    3849             :                 tofree = l;
    3850             :         }
    3851             : 
    3852           0 :         mutex_unlock(&l->owner->pidlist_mutex);
    3853           0 :         kfree(tofree);
    3854           0 : }
    3855             : 
    3856             : /*
    3857             :  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
    3858             :  * Returns the number of unique elements.
    3859             :  */
    3860           0 : static int pidlist_uniq(pid_t *list, int length)
    3861             : {
    3862             :         int src, dest = 1;
    3863             : 
    3864             :         /*
     3865             :          * we presume the 0th element is unique, so src starts at 1. trivial
    3866             :          * edge cases first; no work needs to be done for either
    3867             :          */
    3868           0 :         if (length == 0 || length == 1)
    3869             :                 return length;
    3870             :         /* src and dest walk down the list; dest counts unique elements */
    3871           0 :         for (src = 1; src < length; src++) {
    3872             :                 /* find next unique element */
    3873           0 :                 while (list[src] == list[src-1]) {
    3874           0 :                         src++;
    3875           0 :                         if (src == length)
    3876             :                                 goto after;
    3877             :                 }
    3878             :                 /* dest always points to where the next unique element goes */
    3879           0 :                 list[dest] = list[src];
    3880           0 :                 dest++;
    3881             :         }
    3882             : after:
    3883             :         return dest;
    3884             : }
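                      : 
                      : /*
                      :  * Worked example for the pass above: the sorted input {3, 3, 5, 7, 7, 7}
                      :  * of length 6 is compacted in place to {3, 5, 7, ...} and 3 is returned;
                      :  * entries past the returned length are stale and ignored by the caller.
                      :  */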
    3885             : 
    3886             : /*
     3887             :  * The two pid files - tasks and cgroup.procs - guaranteed that the result
     3888             :  * is sorted, which forced this whole pidlist fiasco.  As pid order is
     3889             :  * different per namespace, each namespace needs a differently sorted list,
    3890             :  * making it impossible to use, for example, single rbtree of member tasks
    3891             :  * sorted by task pointer.  As pidlists can be fairly large, allocating one
    3892             :  * per open file is dangerous, so cgroup had to implement shared pool of
    3893             :  * pidlists keyed by cgroup and namespace.
    3894             :  *
    3895             :  * All this extra complexity was caused by the original implementation
    3896             :  * committing to an entirely unnecessary property.  In the long term, we
    3897             :  * want to do away with it.  Explicitly scramble sort order if on the
    3898             :  * default hierarchy so that no such expectation exists in the new
    3899             :  * interface.
    3900             :  *
    3901             :  * Scrambling is done by swapping every two consecutive bits, which is
     3902             :  * a non-identity one-to-one mapping that disturbs sort order sufficiently.
    3903             :  */
    3904             : static pid_t pid_fry(pid_t pid)
    3905             : {
    3906           0 :         unsigned a = pid & 0x55555555;
    3907           0 :         unsigned b = pid & 0xAAAAAAAA;
    3908             : 
    3909           0 :         return (a << 1) | (b >> 1);
    3910             : }
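                      : 
                      : /*
                      :  * Worked example of the adjacent-bit swap above: pid 1 (0b01) becomes
                      :  * 0b10, so pid_fry(1) == 2 and pid_fry(2) == 1; pid 5 (0b0101) maps to
                      :  * 0b1010 == 10.  Applying pid_fry() twice restores the original pid,
                      :  * so the mapping is its own inverse.
                      :  */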
    3911             : 
    3912           0 : static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
    3913             : {
    3914           0 :         if (cgroup_on_dfl(cgrp))
    3915             :                 return pid_fry(pid);
    3916             :         else
    3917             :                 return pid;
    3918             : }
    3919             : 
    3920           0 : static int cmppid(const void *a, const void *b)
    3921             : {
    3922           0 :         return *(pid_t *)a - *(pid_t *)b;
    3923             : }
    3924             : 
    3925           0 : static int fried_cmppid(const void *a, const void *b)
    3926             : {
    3927           0 :         return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
    3928             : }
    3929             : 
    3930           0 : static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
    3931             :                                                   enum cgroup_filetype type)
    3932             : {
    3933             :         struct cgroup_pidlist *l;
    3934             :         /* don't need task_nsproxy() if we're looking at ourself */
    3935           0 :         struct pid_namespace *ns = task_active_pid_ns(current);
    3936             : 
    3937             :         lockdep_assert_held(&cgrp->pidlist_mutex);
    3938             : 
    3939           0 :         list_for_each_entry(l, &cgrp->pidlists, links)
    3940           0 :                 if (l->key.type == type && l->key.ns == ns)
    3941             :                         return l;
    3942             :         return NULL;
    3943             : }
    3944             : 
    3945             : /*
     3946             :  * find the appropriate pidlist for our purpose (given procs vs tasks),
     3947             :  * creating a new one if necessary.  Must be called with
     3948             :  * cgrp->pidlist_mutex held, which stays held on return.  Returns NULL
     3949             :  * if we're out of memory.
    3950             :  */
    3951           0 : static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
    3952             :                                                 enum cgroup_filetype type)
    3953             : {
    3954             :         struct cgroup_pidlist *l;
    3955             : 
    3956             :         lockdep_assert_held(&cgrp->pidlist_mutex);
    3957             : 
    3958           0 :         l = cgroup_pidlist_find(cgrp, type);
    3959           0 :         if (l)
    3960             :                 return l;
    3961             : 
    3962             :         /* entry not found; create a new one */
    3963             :         l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
    3964           0 :         if (!l)
    3965             :                 return l;
    3966             : 
    3967           0 :         INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
    3968           0 :         l->key.type = type;
    3969             :         /* don't need task_nsproxy() if we're looking at ourselves */
    3970           0 :         l->key.ns = get_pid_ns(task_active_pid_ns(current));
    3971           0 :         l->owner = cgrp;
    3972           0 :         list_add(&l->links, &cgrp->pidlists);
    3973           0 :         return l;
    3974             : }
    3975             : 
    3976             : /*
    3977             :  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
    3978             :  */
    3979           0 : static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
    3980             :                               struct cgroup_pidlist **lp)
    3981             : {
    3982             :         pid_t *array;
    3983             :         int length;
    3984             :         int pid, n = 0; /* used for populating the array */
    3985             :         struct css_task_iter it;
    3986             :         struct task_struct *tsk;
    3987             :         struct cgroup_pidlist *l;
    3988             : 
    3989             :         lockdep_assert_held(&cgrp->pidlist_mutex);
    3990             : 
    3991             :         /*
    3992             :          * If the cgroup gets more users after we read the count, we won't
    3993             :          * have enough space - tough.  To the caller, this race is
    3994             :          * indistinguishable from the case where the additional cgroup
    3995             :          * users didn't show up until sometime later on.
    3996             :          */
    3997           0 :         length = cgroup_task_count(cgrp);
    3998           0 :         array = pidlist_allocate(length);
    3999           0 :         if (!array)
    4000             :                 return -ENOMEM;
    4001             :         /* now, populate the array */
    4002           0 :         css_task_iter_start(&cgrp->self, &it);
    4003           0 :         while ((tsk = css_task_iter_next(&it))) {
    4004           0 :                 if (unlikely(n == length))
    4005             :                         break;
    4006             :                 /* get tgid or pid for procs or tasks file respectively */
    4007           0 :                 if (type == CGROUP_FILE_PROCS)
    4008             :                         pid = task_tgid_vnr(tsk);
    4009             :                 else
    4010             :                         pid = task_pid_vnr(tsk);
    4011           0 :                 if (pid > 0) /* make sure to only use valid results */
    4012           0 :                         array[n++] = pid;
    4013             :         }
    4014             :         css_task_iter_end(&it);
    4015             :         length = n;
    4016             :         /* now sort & (if procs) strip out duplicates */
    4017           0 :         if (cgroup_on_dfl(cgrp))
    4018           0 :                 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
    4019             :         else
    4020           0 :                 sort(array, length, sizeof(pid_t), cmppid, NULL);
    4021           0 :         if (type == CGROUP_FILE_PROCS)
    4022           0 :                 length = pidlist_uniq(array, length);
    4023             : 
    4024           0 :         l = cgroup_pidlist_find_create(cgrp, type);
    4025           0 :         if (!l) {
    4026           0 :                 pidlist_free(array);
    4027           0 :                 return -ENOMEM;
    4028             :         }
    4029             : 
    4030             :         /* store array, freeing old if necessary */
    4031           0 :         pidlist_free(l->list);
    4032           0 :         l->list = array;
    4033           0 :         l->length = length;
    4034           0 :         *lp = l;
    4035           0 :         return 0;
    4036             : }
    4037             : 
    4038             : /**
    4039             :  * cgroupstats_build - build and fill cgroupstats
    4040             :  * @stats: cgroupstats to fill information into
    4041             :  * @dentry: A dentry entry belonging to the cgroup for which stats have
    4042             :  * been requested.
    4043             :  *
    4044             :  * Build and fill cgroupstats so that taskstats can export it to user
    4045             :  * space.
    4046             :  */
    4047           0 : int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
    4048             : {
    4049           0 :         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
    4050           0 :         struct cgroup *cgrp;
    4051             :         struct css_task_iter it;
    4052             :         struct task_struct *tsk;
    4053             : 
    4054             :         /* it should be a kernfs_node belonging to cgroupfs and be a directory */
    4055           0 :         if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
    4056             :             kernfs_type(kn) != KERNFS_DIR)
    4057             :                 return -EINVAL;
    4058             : 
    4059           0 :         mutex_lock(&cgroup_mutex);
    4060             : 
    4061             :         /*
    4062             :          * We aren't being called from kernfs and there's no guarantee on
    4063             :          * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
    4064             :          * @kn->priv is RCU safe.  Let's do the RCU dancing.
    4065             :          */
    4066             :         rcu_read_lock();
    4067           0 :         cgrp = rcu_dereference(kn->priv);
    4068           0 :         if (!cgrp || cgroup_is_dead(cgrp)) {
    4069             :                 rcu_read_unlock();
    4070           0 :                 mutex_unlock(&cgroup_mutex);
    4071           0 :                 return -ENOENT;
    4072             :         }
    4073             :         rcu_read_unlock();
    4074             : 
    4075           0 :         css_task_iter_start(&cgrp->self, &it);
    4076           0 :         while ((tsk = css_task_iter_next(&it))) {
    4077           0 :                 switch (tsk->state) {
    4078             :                 case TASK_RUNNING:
    4079           0 :                         stats->nr_running++;
    4080           0 :                         break;
    4081             :                 case TASK_INTERRUPTIBLE:
    4082           0 :                         stats->nr_sleeping++;
    4083           0 :                         break;
    4084             :                 case TASK_UNINTERRUPTIBLE:
    4085           0 :                         stats->nr_uninterruptible++;
    4086           0 :                         break;
    4087             :                 case TASK_STOPPED:
    4088           0 :                         stats->nr_stopped++;
    4089           0 :                         break;
    4090             :                 default:
    4091           0 :                         if (delayacct_is_task_waiting_on_io(tsk))
    4092           0 :                                 stats->nr_io_wait++;
    4093             :                         break;
    4094             :                 }
    4095             :         }
    4096             :         css_task_iter_end(&it);
    4097             : 
    4098           0 :         mutex_unlock(&cgroup_mutex);
    4099           0 :         return 0;
    4100             : }
    4101             : 
    4102             : 
    4103             : /*
    4104             :  * seq_file methods for the tasks/procs files. The seq_file position is the
    4105             :  * next pid to display; the seq_file iterator is a pointer to the pid
    4106             :  * in the cgroup_pidlist's ->list array.
    4107             :  */
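                     :
                     : /*
                     :  * For orientation, the seq_file core drives these methods roughly as
                     :  * follows (simplified sketch; error handling and buffering omitted):
                     :  *
                     :  *      p = start(s, &pos);
                     :  *      while (p) {
                     :  *              show(s, p);
                     :  *              p = next(s, p, &pos);
                     :  *      }
                     :  *      stop(s, p);
                     :  */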
    4108             : 
    4109           0 : static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
    4110             : {
    4111             :         /*
    4112             :          * Initially we receive a position value that corresponds to
    4113             :          * one more than the last pid shown (or 0 on the first call or
    4114             :          * after a seek to the start). Use a binary search to find the
    4115             :          * next pid to display, if any.
    4116             :          */
    4117           0 :         struct kernfs_open_file *of = s->private;
    4118           0 :         struct cgroup *cgrp = seq_css(s)->cgroup;
    4119             :         struct cgroup_pidlist *l;
    4120           0 :         enum cgroup_filetype type = seq_cft(s)->private;
    4121           0 :         int index = 0, pid = *pos;
    4122             :         int *iter, ret;
    4123             : 
    4124           0 :         mutex_lock(&cgrp->pidlist_mutex);
    4125             : 
    4126             :         /*
    4127             :          * !NULL @of->priv indicates that this isn't the first start()
    4128             :          * after open.  If the matching pidlist is around, we can use that.
    4129             :          * Look for it.  Note that @of->priv can't be used directly.  It
    4130             :          * could already have been destroyed.
    4131             :          */
    4132           0 :         if (of->priv)
    4133           0 :                 of->priv = cgroup_pidlist_find(cgrp, type);
    4134             : 
    4135             :         /*
    4136             :          * Either this is the first start() after open or the matching
    4137             :          * pidlist has been destroyed in between.  Create a new one.
    4138             :          */
    4139           0 :         if (!of->priv) {
    4140           0 :                 ret = pidlist_array_load(cgrp, type,
    4141           0 :                                          (struct cgroup_pidlist **)&of->priv);
    4142           0 :                 if (ret)
    4143           0 :                         return ERR_PTR(ret);
    4144             :         }
    4145           0 :         l = of->priv;
    4146             : 
    4147           0 :         if (pid) {
    4148           0 :                 int end = l->length;
    4149             : 
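                     :                 /*
                     :                  * Binary search: land on the entry that matches @pid
                     :                  * exactly if it's still in the array, or on the first
                     :                  * entry greater than it otherwise (comparisons use the
                     :                  * same fried/unfried order the array was sorted in).
                     :                  */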
    4150           0 :                 while (index < end) {
    4151           0 :                         int mid = (index + end) / 2;
    4152           0 :                         if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
    4153             :                                 index = mid;
    4154             :                                 break;
    4155           0 :                         } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
    4156           0 :                                 index = mid + 1;
    4157             :                         else
    4158             :                                 end = mid;
    4159             :                 }
    4160             :         }
    4161             :         /* If we're off the end of the array, we're done */
    4162           0 :         if (index >= l->length)
    4163             :                 return NULL;
    4164             :         /* Update the abstract position to be the actual pid that we found */
    4165           0 :         iter = l->list + index;
    4166           0 :         *pos = cgroup_pid_fry(cgrp, *iter);
    4167           0 :         return iter;
    4168             : }
    4169             : 
    4170           0 : static void cgroup_pidlist_stop(struct seq_file *s, void *v)
    4171             : {
    4172           0 :         struct kernfs_open_file *of = s->private;
    4173           0 :         struct cgroup_pidlist *l = of->priv;
    4174             : 
    4175           0 :         if (l)
    4176           0 :                 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
    4177             :                                  CGROUP_PIDLIST_DESTROY_DELAY);
    4178           0 :         mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
    4179           0 : }
    4180             : 
    4181           0 : static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
    4182             : {
    4183           0 :         struct kernfs_open_file *of = s->private;
    4184           0 :         struct cgroup_pidlist *l = of->priv;
    4185             :         pid_t *p = v;
    4186           0 :         pid_t *end = l->list + l->length;
    4187             :         /*
    4188             :          * Advance to the next pid in the array. If this goes off the
    4189             :          * end, we're done
    4190             :          */
    4191           0 :         p++;
    4192           0 :         if (p >= end) {
    4193             :                 return NULL;
    4194             :         } else {
    4195           0 :                 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
    4196           0 :                 return p;
    4197             :         }
    4198             : }
    4199             : 
    4200           0 : static int cgroup_pidlist_show(struct seq_file *s, void *v)
    4201             : {
    4202           0 :         return seq_printf(s, "%d\n", *(int *)v);
    4203             : }
    4204             : 
    4205           0 : static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
    4206             :                                          struct cftype *cft)
    4207             : {
    4208           0 :         return notify_on_release(css->cgroup);
    4209             : }
    4210             : 
    4211           0 : static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
    4212             :                                           struct cftype *cft, u64 val)
    4213             : {
    4214           0 :         if (val)
    4215           0 :                 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
    4216             :         else
    4217           0 :                 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
    4218           0 :         return 0;
    4219             : }
    4220             : 
    4221           0 : static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
    4222             :                                       struct cftype *cft)
    4223             : {
    4224           0 :         return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
    4225             : }
    4226             : 
    4227           0 : static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
    4228             :                                        struct cftype *cft, u64 val)
    4229             : {
    4230           0 :         if (val)
    4231           0 :                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
    4232             :         else
    4233           0 :                 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
    4234           0 :         return 0;
    4235             : }
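                     :
                     : /*
                     :  * These read_u64/write_u64 handlers back simple boolean knobs; e.g.
                     :  * (illustrative) "echo 1 > cgroup.clone_children" ends up in
                     :  * cgroup_clone_children_write() with val == 1.
                     :  */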
    4236             : 
    4237             : /* cgroup core interface files for the default hierarchy */
    4238             : static struct cftype cgroup_dfl_base_files[] = {
    4239             :         {
    4240             :                 .name = "cgroup.procs",
    4241             :                 .seq_start = cgroup_pidlist_start,
    4242             :                 .seq_next = cgroup_pidlist_next,
    4243             :                 .seq_stop = cgroup_pidlist_stop,
    4244             :                 .seq_show = cgroup_pidlist_show,
    4245             :                 .private = CGROUP_FILE_PROCS,
    4246             :                 .write = cgroup_procs_write,
    4247             :                 .mode = S_IRUGO | S_IWUSR,
    4248             :         },
    4249             :         {
    4250             :                 .name = "cgroup.controllers",
    4251             :                 .flags = CFTYPE_ONLY_ON_ROOT,
    4252             :                 .seq_show = cgroup_root_controllers_show,
    4253             :         },
    4254             :         {
    4255             :                 .name = "cgroup.controllers",
    4256             :                 .flags = CFTYPE_NOT_ON_ROOT,
    4257             :                 .seq_show = cgroup_controllers_show,
    4258             :         },
    4259             :         {
    4260             :                 .name = "cgroup.subtree_control",
    4261             :                 .seq_show = cgroup_subtree_control_show,
    4262             :                 .write = cgroup_subtree_control_write,
    4263             :         },
    4264             :         {
    4265             :                 .name = "cgroup.populated",
    4266             :                 .flags = CFTYPE_NOT_ON_ROOT,
    4267             :                 .seq_show = cgroup_populated_show,
    4268             :         },
    4269             :         { }     /* terminate */
    4270             : };
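                     :
                     : /*
                     :  * From userland these appear as files in cgroup directories on the
                     :  * default hierarchy, subject to the ONLY_ON_ROOT/NOT_ON_ROOT flags;
                     :  * e.g. (illustrative paths):
                     :  *
                     :  *      cat /sys/fs/cgroup/cgroup.controllers
                     :  *      echo "+memory" > /sys/fs/cgroup/cgroup.subtree_control
                     :  */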
    4271             : 
    4272             : /* cgroup core interface files for the legacy hierarchies */
    4273             : static struct cftype cgroup_legacy_base_files[] = {
    4274             :         {
    4275             :                 .name = "cgroup.procs",
    4276             :                 .seq_start = cgroup_pidlist_start,
    4277             :                 .seq_next = cgroup_pidlist_next,
    4278             :                 .seq_stop = cgroup_pidlist_stop,
    4279             :                 .seq_show = cgroup_pidlist_show,
    4280             :                 .private = CGROUP_FILE_PROCS,
    4281             :                 .write = cgroup_procs_write,
    4282             :                 .mode = S_IRUGO | S_IWUSR,
    4283             :         },
    4284             :         {
    4285             :                 .name = "cgroup.clone_children",
    4286             :                 .read_u64 = cgroup_clone_children_read,
    4287             :                 .write_u64 = cgroup_clone_children_write,
    4288             :         },
    4289             :         {
    4290             :                 .name = "cgroup.sane_behavior",
    4291             :                 .flags = CFTYPE_ONLY_ON_ROOT,
    4292             :                 .seq_show = cgroup_sane_behavior_show,
    4293             :         },
    4294             :         {
    4295             :                 .name = "tasks",
    4296             :                 .seq_start = cgroup_pidlist_start,
    4297             :                 .seq_next = cgroup_pidlist_next,
    4298             :                 .seq_stop = cgroup_pidlist_stop,
    4299             :                 .seq_show = cgroup_pidlist_show,
    4300             :                 .private = CGROUP_FILE_TASKS,
    4301             :                 .write = cgroup_tasks_write,
    4302             :                 .mode = S_IRUGO | S_IWUSR,
    4303             :         },
    4304             :         {
    4305             :                 .name = "notify_on_release",
    4306             :                 .read_u64 = cgroup_read_notify_on_release,
    4307             :                 .write_u64 = cgroup_write_notify_on_release,
    4308             :         },
    4309             :         {
    4310             :                 .name = "release_agent",
    4311             :                 .flags = CFTYPE_ONLY_ON_ROOT,
    4312             :                 .seq_show = cgroup_release_agent_show,
    4313             :                 .write = cgroup_release_agent_write,
    4314             :                 .max_write_len = PATH_MAX - 1,
    4315             :         },
    4316             :         { }     /* terminate */
    4317             : };
    4318             : 
    4319             : /**
    4320             :  * cgroup_populate_dir - create subsys files in a cgroup directory
    4321             :  * @cgrp: target cgroup
    4322             :  * @subsys_mask: mask of the subsystem ids whose files should be added
    4323             :  *
    4324             :  * On failure, no file is added.
    4325             :  */
    4326           1 : static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
    4327             : {
    4328             :         struct cgroup_subsys *ss;
    4329             :         int i, ret = 0;
    4330             : 
    4331             :         /* process cftsets of each subsystem */
    4332           8 :         for_each_subsys(ss, i) {
    4333             :                 struct cftype *cfts;
    4334             : 
    4335           7 :                 if (!(subsys_mask & (1 << i)))
    4336           7 :                         continue;
    4337             : 
    4338           0 :                 list_for_each_entry(cfts, &ss->cfts, node) {
    4339           0 :                         ret = cgroup_addrm_files(cgrp, cfts, true);
    4340           0 :                         if (ret < 0)
    4341             :                                 goto err;
    4342             :                 }
    4343             :         }
    4344             :         return 0;
    4345             : err:
    4346           0 :         cgroup_clear_dir(cgrp, subsys_mask);
    4347           0 :         return ret;
    4348             : }
    4349             : 
    4350             : /*
    4351             :  * css destruction is a four-stage process.
    4352             :  *
    4353             :  * 1. Destruction starts.  Killing of the percpu_ref is initiated.
    4354             :  *    Implemented in kill_css().
    4355             :  *
    4356             :  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
    4357             :  *    and thus css_tryget_online() is guaranteed to fail, the css can be
    4358             :  *    offlined by invoking offline_css().  After offlining, the base ref is
    4359             :  *    put.  Implemented in css_killed_work_fn().
    4360             :  *
    4361             :  * 3. When the percpu_ref reaches zero, the only possible remaining
    4362             :  *    accessors are inside RCU read sections.  css_release() schedules the
    4363             :  *    RCU callback.
    4364             :  *
    4365             :  * 4. After the grace period, the css can be freed.  Implemented in
    4366             :  *    css_free_work_fn().
    4367             :  *
    4368             :  * It is actually hairier because both steps 2 and 4 require process
    4369             :  * context and thus involve punting to css->destroy_work, adding two
    4370             :  * additional steps to the already complex sequence.
    4371             :  */
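                     :
                     : /*
                     :  * Putting the functions below together, the whole path looks roughly
                     :  * like this (sketch; each "->" involves a workqueue or RCU hop):
                     :  *
                     :  *      kill_css()
                     :  *        -> css_killed_ref_fn()     (percpu_ref confirmed killed)
                     :  *        -> css_killed_work_fn()    (process context: offline_css() + css_put())
                     :  *        -> css_release()           (refcnt reaches zero)
                     :  *        -> css_release_work_fn()   (process context)
                     :  *        -> css_free_rcu_fn()       (after an RCU grace period)
                     :  *        -> css_free_work_fn()      (process context: final freeing)
                     :  */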
    4372           0 : static void css_free_work_fn(struct work_struct *work)
    4373             : {
    4374           0 :         struct cgroup_subsys_state *css =
    4375             :                 container_of(work, struct cgroup_subsys_state, destroy_work);
    4376           0 :         struct cgroup *cgrp = css->cgroup;
    4377             : 
    4378           0 :         percpu_ref_exit(&css->refcnt);
    4379             : 
    4380           0 :         if (css->ss) {
    4381             :                 /* css free path */
    4382           0 :                 if (css->parent)
    4383             :                         css_put(css->parent);
    4384             : 
    4385           0 :                 css->ss->css_free(css);
    4386           0 :                 cgroup_put(cgrp);
    4387             :         } else {
    4388             :                 /* cgroup free path */
    4389           0 :                 atomic_dec(&cgrp->root->nr_cgrps);
    4390           0 :                 cgroup_pidlist_destroy_all(cgrp);
    4391           0 :                 cancel_work_sync(&cgrp->release_agent_work);
    4392             : 
    4393           0 :                 if (cgroup_parent(cgrp)) {
    4394             :                         /*
    4395             :                          * We get a ref to the parent, and put the ref when
    4396             :                          * this cgroup is being freed, so it's guaranteed
    4397             :                          * that the parent won't be destroyed before its
    4398             :                          * children.
    4399             :                          */
    4400           0 :                         cgroup_put(cgroup_parent(cgrp));
    4401           0 :                         kernfs_put(cgrp->kn);
    4402           0 :                         kfree(cgrp);
    4403             :                 } else {
    4404             :                         /*
    4405             :                          * This is root cgroup's refcnt reaching zero,
    4406             :                          * which indicates that the root should be
    4407             :                          * released.
    4408             :                          */
    4409           0 :                         cgroup_destroy_root(cgrp->root);
    4410             :                 }
    4411             :         }
    4412           0 : }
    4413             : 
    4414           0 : static void css_free_rcu_fn(struct rcu_head *rcu_head)
    4415             : {
    4416             :         struct cgroup_subsys_state *css =
    4417             :                 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
    4418             : 
    4419           0 :         INIT_WORK(&css->destroy_work, css_free_work_fn);
    4420           0 :         queue_work(cgroup_destroy_wq, &css->destroy_work);
    4421           0 : }
    4422             : 
    4423           0 : static void css_release_work_fn(struct work_struct *work)
    4424             : {
    4425           0 :         struct cgroup_subsys_state *css =
    4426             :                 container_of(work, struct cgroup_subsys_state, destroy_work);
    4427           0 :         struct cgroup_subsys *ss = css->ss;
    4428           0 :         struct cgroup *cgrp = css->cgroup;
    4429             : 
    4430           0 :         mutex_lock(&cgroup_mutex);
    4431             : 
    4432           0 :         css->flags |= CSS_RELEASED;
    4433             :         list_del_rcu(&css->sibling);
    4434             : 
    4435           0 :         if (ss) {
    4436             :                 /* css release path */
    4437           0 :                 cgroup_idr_remove(&ss->css_idr, css->id);
    4438           0 :                 if (ss->css_released)
    4439           0 :                         ss->css_released(css);
    4440             :         } else {
    4441             :                 /* cgroup release path */
    4442           0 :                 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
    4443           0 :                 cgrp->id = -1;
    4444             : 
    4445             :                 /*
    4446             :                  * There are two control paths which try to determine a
    4447             :                  * cgroup from a dentry without going through kernfs -
    4448             :                  * cgroupstats_build() and css_tryget_online_from_dir().
    4449             :                  * Those are supported by RCU-protected clearing of the
    4450             :                  * cgrp->kn->priv backpointer.
    4451             :                  */
    4452           0 :                 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
    4453             :         }
    4454             : 
    4455           0 :         mutex_unlock(&cgroup_mutex);
    4456             : 
    4457           0 :         call_rcu(&css->rcu_head, css_free_rcu_fn);
    4458           0 : }
    4459             : 
    4460           0 : static void css_release(struct percpu_ref *ref)
    4461             : {
    4462             :         struct cgroup_subsys_state *css =
    4463             :                 container_of(ref, struct cgroup_subsys_state, refcnt);
    4464             : 
    4465           0 :         INIT_WORK(&css->destroy_work, css_release_work_fn);
    4466           0 :         queue_work(cgroup_destroy_wq, &css->destroy_work);
    4467           0 : }
    4468             : 
    4469           7 : static void init_and_link_css(struct cgroup_subsys_state *css,
    4470           7 :                               struct cgroup_subsys *ss, struct cgroup *cgrp)
    4471             : {
    4472             :         lockdep_assert_held(&cgroup_mutex);
    4473             : 
    4474           7 :         cgroup_get(cgrp);
    4475             : 
    4476           7 :         memset(css, 0, sizeof(*css));
    4477           7 :         css->cgroup = cgrp;
    4478           7 :         css->ss = ss;
    4479           7 :         INIT_LIST_HEAD(&css->sibling);
    4480           7 :         INIT_LIST_HEAD(&css->children);
    4481           7 :         css->serial_nr = css_serial_nr_next++;
    4482             : 
    4483           7 :         if (cgroup_parent(cgrp)) {
    4484           0 :                 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
    4485             :                 css_get(css->parent);
    4486             :         }
    4487             : 
    4488             :         BUG_ON(cgroup_css(cgrp, ss));
    4489           7 : }
    4490             : 
    4491             : /* invoke ->css_online() on a new CSS and mark it online if successful */
    4492           7 : static int online_css(struct cgroup_subsys_state *css)
    4493             : {
    4494           7 :         struct cgroup_subsys *ss = css->ss;
    4495             :         int ret = 0;
    4496             : 
    4497             :         lockdep_assert_held(&cgroup_mutex);
    4498             : 
    4499           7 :         if (ss->css_online)
    4500           5 :                 ret = ss->css_online(css);
    4501           7 :         if (!ret) {
    4502           7 :                 css->flags |= CSS_ONLINE;
    4503           7 :                 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
    4504             :         }
    4505           7 :         return ret;
    4506             : }
    4507             : 
    4508             : /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
    4509           0 : static void offline_css(struct cgroup_subsys_state *css)
    4510             : {
    4511           0 :         struct cgroup_subsys *ss = css->ss;
    4512             : 
    4513             :         lockdep_assert_held(&cgroup_mutex);
    4514             : 
    4515           0 :         if (!(css->flags & CSS_ONLINE))
    4516           0 :                 return;
    4517             : 
    4518           0 :         if (ss->css_offline)
    4519           0 :                 ss->css_offline(css);
    4520             : 
    4521           0 :         css->flags &= ~CSS_ONLINE;
    4522           0 :         RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
    4523             : 
    4524           0 :         wake_up_all(&css->cgroup->offline_waitq);
    4525             : }
    4526             : 
    4527             : /**
    4528             :  * create_css - create a cgroup_subsys_state
    4529             :  * @cgrp: the cgroup new css will be associated with
    4530             :  * @ss: the subsys of new css
    4531             :  * @visible: whether to create control knobs for the new css or not
    4532             :  *
    4533             :  * Create a new css associated with @cgrp - @ss pair.  On success, the new
    4534             :  * css is online and installed in @cgrp with all interface files created if
    4535             :  * @visible.  Returns 0 on success, -errno on failure.
    4536             :  */
    4537           0 : static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
    4538             :                       bool visible)
    4539             : {
    4540           0 :         struct cgroup *parent = cgroup_parent(cgrp);
    4541             :         struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
    4542             :         struct cgroup_subsys_state *css;
    4543             :         int err;
    4544             : 
    4545             :         lockdep_assert_held(&cgroup_mutex);
    4546             : 
    4547           0 :         css = ss->css_alloc(parent_css);
    4548           0 :         if (IS_ERR(css))
    4549           0 :                 return PTR_ERR(css);
    4550             : 
    4551           0 :         init_and_link_css(css, ss, cgrp);
    4552             : 
    4553           0 :         err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
    4554           0 :         if (err)
    4555             :                 goto err_free_css;
    4556             : 
    4557           0 :         err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
    4558           0 :         if (err < 0)
    4559             :                 goto err_free_percpu_ref;
    4560           0 :         css->id = err;
    4561             : 
    4562           0 :         if (visible) {
    4563           0 :                 err = cgroup_populate_dir(cgrp, 1 << ss->id);
    4564           0 :                 if (err)
    4565             :                         goto err_free_id;
    4566             :         }
    4567             : 
    4568             :         /* @css is ready to be brought online now, make it visible */
    4569           0 :         list_add_tail_rcu(&css->sibling, &parent_css->children);
    4570           0 :         cgroup_idr_replace(&ss->css_idr, css, css->id);
    4571             : 
    4572           0 :         err = online_css(css);
    4573           0 :         if (err)
    4574             :                 goto err_list_del;
    4575             : 
    4576           0 :         if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
    4577             :             cgroup_parent(parent)) {
    4578           0 :                 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
    4579             :                         current->comm, current->pid, ss->name);
    4580           0 :                 if (!strcmp(ss->name, "memory"))
    4581           0 :                         pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
    4582           0 :                 ss->warned_broken_hierarchy = true;
    4583             :         }
    4584             : 
    4585             :         return 0;
    4586             : 
    4587             : err_list_del:
    4588             :         list_del_rcu(&css->sibling);
    4589           0 :         cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
    4590             : err_free_id:
    4591           0 :         cgroup_idr_remove(&ss->css_idr, css->id);
    4592             : err_free_percpu_ref:
    4593           0 :         percpu_ref_exit(&css->refcnt);
    4594             : err_free_css:
    4595           0 :         call_rcu(&css->rcu_head, css_free_rcu_fn);
    4596           0 :         return err;
    4597             : }
    4598             : 
    4599           0 : static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
    4600             :                         umode_t mode)
    4601             : {
    4602           0 :         struct cgroup *parent, *cgrp;
    4603             :         struct cgroup_root *root;
    4604             :         struct cgroup_subsys *ss;
    4605             :         struct kernfs_node *kn;
    4606             :         struct cftype *base_files;
    4607             :         int ssid, ret;
    4608             : 
    4609             :         /* Do not accept '\n' to prevent making /proc/<pid>/cgroup
    4610             :          * unparsable. */
    4611           0 :         if (strchr(name, '\n'))
    4612             :                 return -EINVAL;
    4613             : 
    4614           0 :         parent = cgroup_kn_lock_live(parent_kn);
    4615           0 :         if (!parent)
    4616             :                 return -ENODEV;
    4617           0 :         root = parent->root;
    4618             : 
    4619             :         /* allocate the cgroup and its ID, 0 is reserved for the root */
    4620             :         cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
    4621           0 :         if (!cgrp) {
    4622             :                 ret = -ENOMEM;
    4623             :                 goto out_unlock;
    4624             :         }
    4625             : 
    4626           0 :         ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
    4627           0 :         if (ret)
    4628             :                 goto out_free_cgrp;
    4629             : 
    4630             :         /*
    4631             :          * Temporarily set the pointer to NULL, so idr_find() won't return
    4632             :          * a half-baked cgroup.
    4633             :          */
    4634           0 :         cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
    4635           0 :         if (cgrp->id < 0) {
    4636             :                 ret = -ENOMEM;
    4637             :                 goto out_cancel_ref;
    4638             :         }
    4639             : 
    4640           0 :         init_cgroup_housekeeping(cgrp);
    4641             : 
    4642           0 :         cgrp->self.parent = &parent->self;
    4643           0 :         cgrp->root = root;
    4644             : 
    4645           0 :         if (notify_on_release(parent))
    4646             :                 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
    4647             : 
    4648           0 :         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
    4649             :                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
    4650             : 
    4651             :         /* create the directory */
    4652           0 :         kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
    4653           0 :         if (IS_ERR(kn)) {
    4654             :                 ret = PTR_ERR(kn);
    4655             :                 goto out_free_id;
    4656             :         }
    4657           0 :         cgrp->kn = kn;
    4658             : 
    4659             :         /*
    4660             :          * This extra ref will be put in css_free_work_fn() and guarantees
    4661             :          * that @cgrp->kn is always accessible.
    4662             :          */
    4663           0 :         kernfs_get(kn);
    4664             : 
    4665           0 :         cgrp->self.serial_nr = css_serial_nr_next++;
    4666             : 
    4667             :         /* allocation complete, commit to creation */
    4668           0 :         list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
    4669           0 :         atomic_inc(&root->nr_cgrps);
    4670           0 :         cgroup_get(parent);
    4671             : 
    4672             :         /*
    4673             :          * @cgrp is now fully operational.  If something fails after this
    4674             :          * point, it'll be released via the normal destruction path.
    4675             :          */
    4676           0 :         cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
    4677             : 
    4678           0 :         ret = cgroup_kn_set_ugid(kn);
    4679           0 :         if (ret)
    4680             :                 goto out_destroy;
    4681             : 
    4682           0 :         if (cgroup_on_dfl(cgrp))
    4683             :                 base_files = cgroup_dfl_base_files;
    4684             :         else
    4685             :                 base_files = cgroup_legacy_base_files;
    4686             : 
    4687           0 :         ret = cgroup_addrm_files(cgrp, base_files, true);
    4688           0 :         if (ret)
    4689             :                 goto out_destroy;
    4690             : 
    4691             :         /* let's create and online css's */
    4692           0 :         for_each_subsys(ss, ssid) {
    4693           0 :                 if (parent->child_subsys_mask & (1 << ssid)) {
    4694           0 :                         ret = create_css(cgrp, ss,
    4695           0 :                                          parent->subtree_control & (1 << ssid));
    4696           0 :                         if (ret)
    4697             :                                 goto out_destroy;
    4698             :                 }
    4699             :         }
    4700             : 
    4701             :         /*
    4702             :          * On the default hierarchy, a child doesn't automatically inherit
    4703             :          * subtree_control from the parent.  Each is configured manually.
    4704             :          */
    4705           0 :         if (!cgroup_on_dfl(cgrp)) {
    4706           0 :                 cgrp->subtree_control = parent->subtree_control;
    4707             :                 cgroup_refresh_child_subsys_mask(cgrp);
    4708             :         }
    4709             : 
    4710           0 :         kernfs_activate(kn);
    4711             : 
    4712             :         ret = 0;
    4713           0 :         goto out_unlock;
    4714             : 
    4715             : out_free_id:
    4716           0 :         cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
    4717             : out_cancel_ref:
    4718           0 :         percpu_ref_exit(&cgrp->self.refcnt);
    4719             : out_free_cgrp:
    4720           0 :         kfree(cgrp);
    4721             : out_unlock:
    4722           0 :         cgroup_kn_unlock(parent_kn);
    4723           0 :         return ret;
    4724             : 
    4725             : out_destroy:
    4726           0 :         cgroup_destroy_locked(cgrp);
    4727           0 :         goto out_unlock;
    4728             : }
    4729             : 
    4730             : /*
    4731             :  * This is called when the refcnt of a css is confirmed to be killed.
    4732             :  * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
    4733             :  * initiate destruction and put the css ref from kill_css().
    4734             :  */
    4735           0 : static void css_killed_work_fn(struct work_struct *work)
    4736             : {
    4737           0 :         struct cgroup_subsys_state *css =
    4738             :                 container_of(work, struct cgroup_subsys_state, destroy_work);
    4739             : 
    4740           0 :         mutex_lock(&cgroup_mutex);
    4741           0 :         offline_css(css);
    4742           0 :         mutex_unlock(&cgroup_mutex);
    4743             : 
    4744             :         css_put(css);
    4745           0 : }
    4746             : 
    4747             : /* css kill confirmation processing requires process context, bounce */
    4748           0 : static void css_killed_ref_fn(struct percpu_ref *ref)
    4749             : {
    4750             :         struct cgroup_subsys_state *css =
    4751             :                 container_of(ref, struct cgroup_subsys_state, refcnt);
    4752             : 
    4753           0 :         INIT_WORK(&css->destroy_work, css_killed_work_fn);
    4754           0 :         queue_work(cgroup_destroy_wq, &css->destroy_work);
    4755           0 : }
    4756             : 
    4757             : /**
    4758             :  * kill_css - destroy a css
    4759             :  * @css: css to destroy
    4760             :  *
    4761             :  * This function initiates destruction of @css by removing cgroup interface
    4762             :  * files and putting its base reference.  ->css_offline() will be invoked
    4763             :  * asynchronously once css_tryget_online() is guaranteed to fail and when
    4764             :  * the reference count reaches zero, @css will be released.
    4765             :  */
    4766           0 : static void kill_css(struct cgroup_subsys_state *css)
    4767             : {
    4768             :         lockdep_assert_held(&cgroup_mutex);
    4769             : 
    4770             :         /*
    4771             :          * This must happen before css is disassociated with its cgroup.
    4772             :          * See seq_css() for details.
    4773             :          */
    4774           0 :         cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
    4775             : 
    4776             :         /*
    4777             :          * Killing would put the base ref, but we need to keep it alive
    4778             :          * until after ->css_offline().
    4779             :          */
    4780             :         css_get(css);
    4781             : 
    4782             :         /*
    4783             :          * cgroup core guarantees that, by the time ->css_offline() is
    4784             :          * invoked, no new css reference will be given out via
    4785             :          * css_tryget_online().  We can't simply call percpu_ref_kill() and
    4786             :          * proceed to offlining css's because percpu_ref_kill() doesn't
    4787             :          * guarantee that the ref is seen as killed on all CPUs on return.
    4788             :          *
    4789             :          * Use percpu_ref_kill_and_confirm() to get notifications as each
    4790             :          * css is confirmed to be seen as killed on all CPUs.
    4791             :          */
    4792           0 :         percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
    4793           0 : }
    4794             : 
    4795             : /**
    4796             :  * cgroup_destroy_locked - the first stage of cgroup destruction
    4797             :  * @cgrp: cgroup to be destroyed
    4798             :  *
    4799             :  * css's make use of percpu refcnts whose killing latency shouldn't be
    4800             :  * exposed to userland and which are RCU protected.  Also, cgroup core needs to
    4801             :  * guarantee that css_tryget_online() won't succeed by the time
    4802             :  * ->css_offline() is invoked.  To satisfy all the requirements,
    4803             :  * destruction is implemented in the following two steps.
    4804             :  *
    4805             :  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
    4806             :  *     userland visible parts and start killing the percpu refcnts of
    4807             :  *     css's.  Set up so that the next stage will be kicked off once all
    4808             :  *     the percpu refcnts are confirmed to be killed.
    4809             :  *
    4810             :  * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
    4811             :  *     rest of destruction.  Once all cgroup references are gone, the
    4812             :  *     cgroup is RCU-freed.
    4813             :  *
    4814             :  * This function implements s1.  After this step, @cgrp is gone as far as
    4815             :  * the userland is concerned and a new cgroup with the same name may be
    4816             :  * created.  As cgroup doesn't care about the names internally, this
    4817             :  * doesn't cause any problem.
    4818             :  */
    4819           0 : static int cgroup_destroy_locked(struct cgroup *cgrp)
    4820             :         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
    4821             : {
    4822             :         struct cgroup_subsys_state *css;
    4823             :         bool empty;
    4824             :         int ssid;
    4825             : 
    4826             :         lockdep_assert_held(&cgroup_mutex);
    4827             : 
    4828             :         /*
    4829             :          * css_set_rwsem synchronizes access to ->cset_links and prevents
    4830             :          * @cgrp from being removed while put_css_set() is in progress.
    4831             :          */
    4832           0 :         down_read(&css_set_rwsem);
    4833           0 :         empty = list_empty(&cgrp->cset_links);
    4834           0 :         up_read(&css_set_rwsem);
    4835           0 :         if (!empty)
    4836             :                 return -EBUSY;
    4837             : 
    4838             :         /*
    4839             :          * Make sure there are no live children.  We can't test emptiness of
    4840             :          * ->self.children as dead children linger on it while being
    4841             :          * drained; otherwise, "rmdir parent/child parent" may fail.
    4842             :          */
    4843           0 :         if (css_has_online_children(&cgrp->self))
    4844             :                 return -EBUSY;
    4845             : 
    4846             :         /*
    4847             :          * Mark @cgrp dead.  This prevents further task migration and child
    4848             :          * creation by disabling cgroup_lock_live_group().
    4849             :          */
    4850           0 :         cgrp->self.flags &= ~CSS_ONLINE;
    4851             : 
    4852             :         /* initiate massacre of all css's */
    4853           0 :         for_each_css(css, ssid, cgrp)
    4854           0 :                 kill_css(css);
    4855             : 
    4856             :         /*
    4857             :          * Remove @cgrp directory along with the base files.  @cgrp has an
    4858             :          * extra ref on its kn.
    4859             :          */
    4860           0 :         kernfs_remove(cgrp->kn);
    4861             : 
    4862           0 :         check_for_release(cgroup_parent(cgrp));
    4863             : 
    4864             :         /* put the base reference */
    4865           0 :         percpu_ref_kill(&cgrp->self.refcnt);
    4866             : 
    4867           0 :         return 0;
    4868             : }
    4869             : 
    4870           0 : static int cgroup_rmdir(struct kernfs_node *kn)
    4871             : {
    4872             :         struct cgroup *cgrp;
    4873             :         int ret = 0;
    4874             : 
    4875           0 :         cgrp = cgroup_kn_lock_live(kn);
    4876           0 :         if (!cgrp)
    4877             :                 return 0;
    4878             : 
    4879           0 :         ret = cgroup_destroy_locked(cgrp);
    4880             : 
    4881           0 :         cgroup_kn_unlock(kn);
    4882           0 :         return ret;
    4883             : }
    4884             : 
    4885             : static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
    4886             :         .remount_fs             = cgroup_remount,
    4887             :         .show_options           = cgroup_show_options,
    4888             :         .mkdir                  = cgroup_mkdir,
    4889             :         .rmdir                  = cgroup_rmdir,
    4890             :         .rename                 = cgroup_rename,
    4891             : };
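                     :
                     : /*
                     :  * With these ops registered, plain mkdir(2) and rmdir(2) on a cgroupfs
                     :  * mount are routed by kernfs into cgroup_mkdir() and cgroup_rmdir()
                     :  * above; e.g. (illustrative) "mkdir /sys/fs/cgroup/mygroup" creates a
                     :  * child cgroup.
                     :  */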
    4892             : 
    4893           7 : static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
    4894             : {
    4895             :         struct cgroup_subsys_state *css;
    4896             : 
    4897           7 :         printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
    4898             : 
    4899           7 :         mutex_lock(&cgroup_mutex);
    4900             : 
    4901           7 :         idr_init(&ss->css_idr);
    4902           7 :         INIT_LIST_HEAD(&ss->cfts);
    4903             : 
    4904             :         /* Create the root cgroup state for this subsystem */
    4905           7 :         ss->root = &cgrp_dfl_root;
    4906          14 :         css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
    4907             :         /* We don't handle early failures gracefully */
    4908             :         BUG_ON(IS_ERR(css));
    4909           7 :         init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
    4910             : 
    4911             :         /*
    4912             :          * Root csses are never destroyed and we can't initialize
    4913             :          * percpu_ref during early init.  Disable refcnting.
    4914             :          */
    4915           7 :         css->flags |= CSS_NO_REF;
    4916             : 
    4917           7 :         if (early) {
    4918             :                 /* allocation can't be done safely during early init */
    4919           2 :                 css->id = 1;
    4920             :         } else {
    4921           5 :                 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
    4922             :                 BUG_ON(css->id < 0);
    4923             :         }
    4924             : 
    4925             :         /* Update the init_css_set to contain a subsys
    4926             :          * pointer to this state - since the subsystem is
    4927             :          * newly registered, all tasks, and hence the
    4928             :          * init_css_set, are in the subsystem's root cgroup. */
    4929           7 :         init_css_set.subsys[ss->id] = css;
    4930             : 
    4931           7 :         need_forkexit_callback |= ss->fork || ss->exit;
    4932             : 
    4933             :         /* At system boot, before all subsystems have been
    4934             :          * registered, no tasks have been forked, so we don't
    4935             :          * need to invoke fork callbacks here. */
    4936             :         BUG_ON(!list_empty(&init_task.tasks));
    4937             : 
    4938           7 :         BUG_ON(online_css(css));
    4939             : 
    4940           7 :         mutex_unlock(&cgroup_mutex);
    4941           7 : }
    4942             : 
    4943             : /**
    4944             :  * cgroup_init_early - cgroup initialization at system boot
    4945             :  *
    4946             :  * Initialize cgroups at system boot, and initialize any
    4947             :  * subsystems that request early init.
    4948             :  */
    4949           1 : int __init cgroup_init_early(void)
    4950             : {
    4951             :         static struct cgroup_sb_opts __initdata opts;
    4952             :         struct cgroup_subsys *ss;
    4953             :         int i;
    4954             : 
    4955           1 :         init_cgroup_root(&cgrp_dfl_root, &opts);
    4956           1 :         cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
    4957             : 
    4958           1 :         RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
    4959             : 
    4960           8 :         for_each_subsys(ss, i) {
    4961             :                 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
    4962             :                      "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
    4963             :                      i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
    4964             :                      ss->id, ss->name);
    4965             :                 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
    4966             :                      "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
    4967             : 
    4968           7 :                 ss->id = i;
    4969           7 :                 ss->name = cgroup_subsys_name[i];
    4970             : 
    4971           7 :                 if (ss->early_init)
    4972           2 :                         cgroup_init_subsys(ss, true);
    4973             :         }
    4974           1 :         return 0;
    4975             : }
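
Only subsystems flagged ->early_init are set up by cgroup_init_early();
the rest wait for cgroup_init() below.  A hypothetical subsystem
declaration wiring up the sketch above (real subsystems are enumerated in
linux/cgroup_subsys.h and their structs must be named <name>_cgrp_subsys):

        struct cgroup_subsys example_cgrp_subsys = {
                .css_alloc  = example_css_alloc,
                .css_free   = example_css_free,
                .early_init = 1,
        };
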
    4976             : 
    4977             : /**
    4978             :  * cgroup_init - cgroup initialization
    4979             :  *
    4980             :  * Register cgroup filesystem and /proc file, and initialize
    4981             :  * any subsystems that didn't request early init.
    4982             :  */
    4983           1 : int __init cgroup_init(void)
    4984             : {
    4985             :         struct cgroup_subsys *ss;
    4986             :         unsigned long key;
    4987             :         int ssid, err;
    4988             : 
    4989           1 :         BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
    4990           1 :         BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
    4991             : 
    4992           1 :         mutex_lock(&cgroup_mutex);
    4993             : 
    4994             :         /* Add init_css_set to the hash table */
    4995             :         key = css_set_hash(init_css_set.subsys);
    4996           1 :         hash_add(css_set_table, &init_css_set.hlist, key);
    4997             : 
    4998           1 :         BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
    4999             : 
    5000           1 :         mutex_unlock(&cgroup_mutex);
    5001             : 
    5002           8 :         for_each_subsys(ss, ssid) {
    5003           7 :                 if (ss->early_init) {
    5004           2 :                         struct cgroup_subsys_state *css =
    5005           2 :                                 init_css_set.subsys[ss->id];
    5006             : 
    5007           2 :                         css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
    5008             :                                                    GFP_KERNEL);
    5009             :                         BUG_ON(css->id < 0);
    5010             :                 } else {
    5011           5 :                         cgroup_init_subsys(ss, false);
    5012             :                 }
    5013             : 
    5014           7 :                 list_add_tail(&init_css_set.e_cset_node[ssid],
    5015             :                               &cgrp_dfl_root.cgrp.e_csets[ssid]);
    5016             : 
    5017             :                 /*
    5018             :                  * Setting dfl_root subsys_mask needs to consider the
    5019             :                  * disabled flag and cftype registration needs kmalloc,
    5020             :                  * both of which aren't available during early_init.
    5021             :                  */
    5022           7 :                 if (ss->disabled)
    5023           1 :                         continue;
    5024             : 
    5025           6 :                 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
    5026             : 
    5027           6 :                 if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
    5028           0 :                         ss->dfl_cftypes = ss->legacy_cftypes;
    5029             : 
    5030           6 :                 if (!ss->dfl_cftypes)
    5031           6 :                         cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
    5032             : 
    5033           6 :                 if (ss->dfl_cftypes == ss->legacy_cftypes) {
    5034           0 :                         WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
    5035             :                 } else {
    5036           6 :                         WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
    5037           6 :                         WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
    5038             :                 }
    5039             :         }
    5040             : 
    5041           1 :         cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
    5042           1 :         if (!cgroup_kobj)
    5043             :                 return -ENOMEM;
    5044             : 
    5045           1 :         err = register_filesystem(&cgroup_fs_type);
    5046           1 :         if (err < 0) {
    5047           0 :                 kobject_put(cgroup_kobj);
    5048           0 :                 return err;
    5049             :         }
    5050             : 
    5051             :         proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
    5052           1 :         return 0;
    5053             : }
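
Once register_filesystem() succeeds, hierarchies can be mounted from
userspace.  A hedged userspace sketch of the equivalent of
"mount -t cgroup -o cpuset none /sys/fs/cgroup/cpuset" (mount point and
controller name are illustrative):

        #include <sys/mount.h>

        int mount_cpuset_hierarchy(void)
        {
                /* the "cpuset" data argument selects the controller */
                return mount("none", "/sys/fs/cgroup/cpuset", "cgroup",
                             0, "cpuset");
        }
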
    5054             : 
    5055           1 : static int __init cgroup_wq_init(void)
    5056             : {
    5057             :         /*
    5058             :          * There isn't much point in executing the destruction path in
    5059             :          * parallel.  A good chunk is serialized with cgroup_mutex anyway.
    5060             :          * Use 1 for @max_active.
    5061             :          *
    5062             :          * We would prefer to do this in cgroup_init() above, but that
    5063             :          * is called before init_workqueues(): so leave this until after.
    5064             :          */
    5065           1 :         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
    5066             :         BUG_ON(!cgroup_destroy_wq);
    5067             : 
    5068             :         /*
    5069             :          * Used to destroy pidlists; kept separate so it can serve as a flush domain.
    5070             :          * Cap @max_active to 1 too.
    5071             :          */
    5072           1 :         cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
    5073             :                                                     0, 1);
    5074             :         BUG_ON(!cgroup_pidlist_destroy_wq);
    5075             : 
    5076           1 :         return 0;
    5077             : }
    5078             : core_initcall(cgroup_wq_init);
    5079             : 
    5080             : /*
    5081             :  * proc_cgroup_show()
    5082             :  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
    5083             :  *  - Used for /proc/<pid>/cgroup.
    5084             :  */
    5085           0 : int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
    5086             :                      struct pid *pid, struct task_struct *tsk)
    5087             : {
    5088             :         char *buf, *path;
    5089             :         int retval;
    5090             :         struct cgroup_root *root;
    5091             : 
    5092             :         retval = -ENOMEM;
    5093             :         buf = kmalloc(PATH_MAX, GFP_KERNEL);
    5094           0 :         if (!buf)
    5095             :                 goto out;
    5096             : 
    5097           0 :         mutex_lock(&cgroup_mutex);
    5098           0 :         down_read(&css_set_rwsem);
    5099             : 
    5100           0 :         for_each_root(root) {
    5101             :                 struct cgroup_subsys *ss;
    5102             :                 struct cgroup *cgrp;
    5103             :                 int ssid, count = 0;
    5104             : 
    5105           0 :                 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
    5106           0 :                         continue;
    5107             : 
    5108           0 :                 seq_printf(m, "%d:", root->hierarchy_id);
    5109           0 :                 for_each_subsys(ss, ssid)
    5110           0 :                         if (root->subsys_mask & (1 << ssid))
    5111           0 :                                 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
    5112           0 :                 if (strlen(root->name))
    5113           0 :                         seq_printf(m, "%sname=%s", count ? "," : "",
    5114           0 :                                    root->name);
    5115           0 :                 seq_putc(m, ':');
    5116             :                 cgrp = task_cgroup_from_root(tsk, root);
    5117             :                 path = cgroup_path(cgrp, buf, PATH_MAX);
    5118           0 :                 if (!path) {
    5119             :                         retval = -ENAMETOOLONG;
    5120             :                         goto out_unlock;
    5121             :                 }
    5122           0 :                 seq_puts(m, path);
    5123           0 :                 seq_putc(m, '\n');
    5124             :         }
    5125             : 
    5126             :         retval = 0;
    5127             : out_unlock:
    5128           0 :         up_read(&css_set_rwsem);
    5129           0 :         mutex_unlock(&cgroup_mutex);
    5130           0 :         kfree(buf);
    5131             : out:
    5132           0 :         return retval;
    5133             : }
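
Each iteration above emits one "hierarchy-id:controller-list:path" line,
with "name=<name>" appended to the controller list for named hierarchies.
Illustrative /proc/<pid>/cgroup output (ids, controllers, and paths vary
by configuration):

        4:cpu,cpuacct:/mygroup
        2:cpuset:/
        1:name=systemd:/user.slice/user-1000.slice/session-1.scope
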
    5134             : 
    5135             : /* Display information about each subsystem and each hierarchy */
    5136           0 : static int proc_cgroupstats_show(struct seq_file *m, void *v)
    5137             : {
    5138             :         struct cgroup_subsys *ss;
    5139             :         int i;
    5140             : 
    5141           0 :         seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
    5142             :         /*
    5143             :          * ideally we don't want subsystems moving around while we do this.
    5144             :          * cgroup_mutex is also necessary to guarantee an atomic snapshot of
    5145             :          * subsys/hierarchy state.
    5146             :          */
    5147           0 :         mutex_lock(&cgroup_mutex);
    5148             : 
    5149           0 :         for_each_subsys(ss, i)
    5150           0 :                 seq_printf(m, "%s\t%d\t%d\t%d\n",
    5151           0 :                            ss->name, ss->root->hierarchy_id,
    5152           0 :                            atomic_read(&ss->root->nr_cgrps), !ss->disabled);
    5153             : 
    5154           0 :         mutex_unlock(&cgroup_mutex);
    5155           0 :         return 0;
    5156             : }
    5157             : 
    5158           0 : static int cgroupstats_open(struct inode *inode, struct file *file)
    5159             : {
    5160           0 :         return single_open(file, proc_cgroupstats_show, NULL);
    5161             : }
    5162             : 
    5163             : static const struct file_operations proc_cgroupstats_operations = {
    5164             :         .open = cgroupstats_open,
    5165             :         .read = seq_read,
    5166             :         .llseek = seq_lseek,
    5167             :         .release = single_release,
    5168             : };
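
Reading /proc/cgroups therefore yields one row per compiled-in subsystem.
Illustrative output (hierarchy ids and cgroup counts vary; unmounted
controllers typically show hierarchy 0):

        #subsys_name    hierarchy       num_cgroups     enabled
        cpuset          2               1               1
        cpu             4               3               1
        debug           0               1               1
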
    5169             : 
    5170             : /**
    5171             :  * cgroup_fork - initialize cgroup related fields during copy_process()
    5172             :  * @child: pointer to task_struct of the newly forked child process.
    5173             :  *
    5174             :  * A task is associated with the init_css_set until cgroup_post_fork()
    5175             :  * attaches it to the parent's css_set.  An empty cg_list indicates that
    5176             :  * @child isn't holding a reference to its css_set.
    5177             :  */
    5178        2993 : void cgroup_fork(struct task_struct *child)
    5179             : {
    5180        2993 :         RCU_INIT_POINTER(child->cgroups, &init_css_set);
    5181        2993 :         INIT_LIST_HEAD(&child->cg_list);
    5182        2993 : }
    5183             : 
    5184             : /**
    5185             :  * cgroup_post_fork - called on a new task after adding it to the task list
    5186             :  * @child: the task in question
    5187             :  *
    5188             :  * Adds the task to the list running through its css_set if necessary and
    5189             :  * calls the subsystem fork() callbacks.  Has to run after the task is
    5190             :  * visible on the task list in case we race with the first call to
    5191             :  * css_task_iter_start() - to guarantee that the new task ends up on its
    5192             :  * list.
    5193             :  */
    5194        2992 : void cgroup_post_fork(struct task_struct *child)
    5195             : {
    5196             :         struct cgroup_subsys *ss;
    5197             :         int i;
    5198             : 
    5199             :         /*
    5200             :          * This may race against cgroup_enable_task_cg_lists().  As that
    5201             :          * function sets use_task_css_set_links before grabbing
    5202             :          * tasklist_lock and we just went through tasklist_lock to add
    5203             :          * @child, it's guaranteed that either we see the set
    5204             :          * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
    5205             :          * @child during its iteration.
    5206             :          *
    5207             :          * If we won the race, @child is associated with %current's
    5208             :          * css_set.  Grabbing css_set_rwsem guarantees both that the
    5209             :          * association is stable, and, on completion of the parent's
    5210             :          * migration, @child is visible in the source of migration or
    5211             :          * already in the destination cgroup.  This guarantee is necessary
    5212             :          * when implementing operations which need to migrate all tasks of
    5213             :          * a cgroup to another.
    5214             :          *
    5215             :          * Note that if we lose to cgroup_enable_task_cg_lists(), @child
    5216             :          * will remain in init_css_set.  This is safe because all tasks are
    5217             :          * in the init_css_set before cg_links is enabled and there's no
    5218             :          * operation which transfers all tasks out of init_css_set.
    5219             :          */
    5220        2992 :         if (use_task_css_set_links) {
    5221             :                 struct css_set *cset;
    5222             : 
    5223         909 :                 down_write(&css_set_rwsem);
    5224         909 :                 cset = task_css_set(current);
    5225        1818 :                 if (list_empty(&child->cg_list)) {
    5226         909 :                         rcu_assign_pointer(child->cgroups, cset);
    5227         909 :                         list_add(&child->cg_list, &cset->tasks);
    5228             :                         get_css_set(cset);
    5229             :                 }
    5230         909 :                 up_write(&css_set_rwsem);
    5231             :         }
    5232             : 
    5233             :         /*
    5234             :          * Call ss->fork().  This must happen after @child is linked on
    5235             :          * css_set; otherwise, @child might change state between ->fork()
    5236             :          * and addition to css_set.
    5237             :          */
    5238        2992 :         if (need_forkexit_callback) {
    5239       20944 :                 for_each_subsys(ss, i)
    5240       20944 :                         if (ss->fork)
    5241        5984 :                                 ss->fork(child);
    5242             :         }
    5243        2992 : }
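
A hypothetical ->fork() callback matching the ss->fork(child) calls at
the end of cgroup_post_fork(); per-subsystem fork handlers take only the
new task:

        static void example_fork(struct task_struct *task)
        {
                /* @task is already on the task list and linked to its
                 * css_set when this runs; fork-time accounting goes here */
        }
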
    5244             : 
    5245             : /**
    5246             :  * cgroup_exit - detach cgroup from exiting task
    5247             :  * @tsk: pointer to task_struct of exiting process
    5248             :  *
    5249             :  * Description: Detach cgroup from @tsk and release it.
    5250             :  *
    5251             :  * Note that cgroups marked notify_on_release force every task in
    5252             :  * them to take the global cgroup_mutex mutex when exiting.
    5253             :  * This could impact scaling on very large systems.  Be reluctant to
    5254             :  * use notify_on_release cgroups where very high task exit scaling
    5255             :  * is required on large systems.
    5256             :  *
    5257             :  * We set the exiting task's cgroup to the root cgroup (top_cgroup).  We
    5258             :  * call cgroup_exit() while the task is still competent to handle
    5259             :  * notify_on_release(), then leave the task attached to the root cgroup in
    5260             :  * each hierarchy for the remainder of its exit.  No need to bother with
    5261             :  * init_css_set refcnting.  init_css_set never goes away and we can't race
    5262             :  * with migration path - PF_EXITING is visible to migration path.
    5263             :  */
    5264        2914 : void cgroup_exit(struct task_struct *tsk)
    5265             : {
    5266             :         struct cgroup_subsys *ss;
    5267             :         struct css_set *cset;
    5268             :         bool put_cset = false;
    5269             :         int i;
    5270             : 
    5271             :         /*
    5272             :  * Unlink @tsk from its css_set.  As the migration path can't race
    5273             :          * with us, we can check cg_list without grabbing css_set_rwsem.
    5274             :          */
    5275        5828 :         if (!list_empty(&tsk->cg_list)) {
    5276         887 :                 down_write(&css_set_rwsem);
    5277             :                 list_del_init(&tsk->cg_list);
    5278         887 :                 up_write(&css_set_rwsem);
    5279             :                 put_cset = true;
    5280             :         }
    5281             : 
    5282             :         /* Reassign the task to the init_css_set. */
    5283             :         cset = task_css_set(tsk);
    5284        2914 :         RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
    5285             : 
    5286        2914 :         if (need_forkexit_callback) {
    5287             :                 /* see cgroup_post_fork() for details */
    5288       20398 :                 for_each_subsys(ss, i) {
    5289       20398 :                         if (ss->exit) {
    5290        2914 :                                 struct cgroup_subsys_state *old_css = cset->subsys[i];
    5291             :                                 struct cgroup_subsys_state *css = task_css(tsk, i);
    5292             : 
    5293        2914 :                                 ss->exit(css, old_css, tsk);
    5294             :                         }
    5295             :                 }
    5296             :         }
    5297             : 
    5298        2914 :         if (put_cset)
    5299         887 :                 put_css_set(cset);
    5300        2914 : }
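
A hypothetical ->exit() callback with the signature used by the
ss->exit(css, old_css, tsk) call above; @css is the css @task holds
after being reassigned to init_css_set, and @old_css is the one it is
leaving:

        static void example_exit(struct cgroup_subsys_state *css,
                                 struct cgroup_subsys_state *old_css,
                                 struct task_struct *task)
        {
                /* uncharge/untrack @task from @old_css here; @task is
                 * PF_EXITING and can't be migrated concurrently */
        }
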
    5301             : 
    5302           0 : static void check_for_release(struct cgroup *cgrp)
    5303             : {
    5304           0 :         if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
    5305           0 :             !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
    5306           0 :                 schedule_work(&cgrp->release_agent_work);
    5307           0 : }
    5308             : 
    5309             : /*
    5310             :  * Notify userspace when a cgroup is released, by running the
    5311             :  * configured release agent with the name of the cgroup (path
    5312             :  * relative to the root of cgroup file system) as the argument.
    5313             :  *
    5314             :  * Most likely, this user command will try to rmdir this cgroup.
    5315             :  *
    5316             :  * This races with the possibility that some other task will be
    5317             :  * attached to this cgroup before it is removed, or that some other
    5318             :  * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
    5319             :  * The presumed 'rmdir' will fail quietly if this cgroup is in use
    5320             :  * again, and this cgroup will be reprieved from its death sentence,
    5321             :  * to continue to serve a useful existence.  Next time it's released,
    5322             :  * we will get notified again, if it still has 'notify_on_release' set.
    5323             :  *
    5324             :  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
    5325             :  * means only wait until the task is successfully execve()'d.  The
    5326             :  * separate release agent task is forked by call_usermodehelper(),
    5327             :  * then control in this thread returns here, without waiting for the
    5328             :  * release agent task.  We don't bother to wait because the caller of
    5329             :  * this routine has no use for the exit status of the release agent
    5330             :  * task, so no sense holding our caller up for that.
    5331             :  */
    5332           0 : static void cgroup_release_agent(struct work_struct *work)
    5333             : {
    5334             :         struct cgroup *cgrp =
    5335             :                 container_of(work, struct cgroup, release_agent_work);
    5336             :         char *pathbuf = NULL, *agentbuf = NULL, *path;
    5337             :         char *argv[3], *envp[3];
    5338             : 
    5339           0 :         mutex_lock(&cgroup_mutex);
    5340             : 
    5341             :         pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
    5342           0 :         agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
    5343           0 :         if (!pathbuf || !agentbuf)
    5344             :                 goto out;
    5345             : 
    5346             :         path = cgroup_path(cgrp, pathbuf, PATH_MAX);
    5347           0 :         if (!path)
    5348             :                 goto out;
    5349             : 
    5350           0 :         argv[0] = agentbuf;
    5351           0 :         argv[1] = path;
    5352           0 :         argv[2] = NULL;
    5353             : 
    5354             :         /* minimal command environment */
    5355           0 :         envp[0] = "HOME=/";
    5356           0 :         envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
    5357           0 :         envp[2] = NULL;
    5358             : 
    5359           0 :         mutex_unlock(&cgroup_mutex);
    5360           0 :         call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
    5361           0 :         goto out_free;
    5362             : out:
    5363           0 :         mutex_unlock(&cgroup_mutex);
    5364             : out_free:
    5365           0 :         kfree(agentbuf);
    5366           0 :         kfree(pathbuf);
    5367           0 : }
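
The agent configured in a hierarchy's release_agent file is exec'd with
the released cgroup's path, relative to the hierarchy root, as argv[1]
and the minimal HOME/PATH environment built above.  A hypothetical
userspace agent (the mount point is illustrative):

        #include <stdio.h>
        #include <stdlib.h>

        int main(int argc, char **argv)
        {
                char cmd[4096];

                if (argc < 2)
                        return 1;
                /* the usual action: remove the now-empty cgroup dir */
                snprintf(cmd, sizeof(cmd),
                         "rmdir /sys/fs/cgroup/cpuset%s", argv[1]);
                return system(cmd);
        }
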
    5368             : 
    5369           0 : static int __init cgroup_disable(char *str)
    5370             : {
    5371             :         struct cgroup_subsys *ss;
    5372             :         char *token;
    5373             :         int i;
    5374             : 
    5375           0 :         while ((token = strsep(&str, ",")) != NULL) {
    5376           0 :                 if (!*token)
    5377           0 :                         continue;
    5378             : 
    5379           0 :                 for_each_subsys(ss, i) {
    5380           0 :                         if (!strcmp(token, ss->name)) {
    5381           0 :                                 ss->disabled = 1;
    5382           0 :                                 printk(KERN_INFO "Disabling %s control group"
    5383             :                                         " subsystem\n", ss->name);
    5384           0 :                                 break;
    5385             :                         }
    5386             :                 }
    5387             :         }
    5388           0 :         return 1;
    5389             : }
    5390             : __setup("cgroup_disable=", cgroup_disable);
    5391             : 
    5392           0 : static int __init cgroup_enable(char *str)
    5393             : {
    5394             :         struct cgroup_subsys *ss;
    5395             :         char *token;
    5396             :         int i;
    5397             : 
    5398           0 :         while ((token = strsep(&str, ",")) != NULL) {
    5399           0 :                 if (!*token)
    5400           0 :                         continue;
    5401             : 
    5402           0 :                 for_each_subsys(ss, i) {
    5403           0 :                         if (!strcmp(token, ss->name)) {
    5404           0 :                                 ss->disabled = 0;
    5405           0 :                                 printk(KERN_INFO "Enabling %s control group"
    5406             :                                         " subsystem\n", ss->name);
    5407           0 :                                 break;
    5408             :                         }
    5409             :                 }
    5410             :         }
    5411           0 :         return 1;
    5412             : }
    5413             : __setup("cgroup_enable=", cgroup_enable);
    5414             : 
    5415           0 : static int __init cgroup_set_legacy_files_on_dfl(char *str)
    5416             : {
    5417           0 :         printk("cgroup: using legacy files on the default hierarchy\n");
    5418           0 :         cgroup_legacy_files_on_dfl = true;
    5419           0 :         return 0;
    5420             : }
    5421             : __setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
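
All three __setup() handlers above parse kernel command-line parameters.
A boot line exercising them might read (the controller names are
illustrative):

        cgroup_disable=memory,blkio cgroup_enable=cpuset cgroup__DEVEL__legacy_files_on_dfl
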
    5422             : 
    5423             : /**
    5424             :  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
    5425             :  * @dentry: directory dentry of interest
    5426             :  * @ss: subsystem of interest
    5427             :  *
    5428             :  * If @dentry is a directory for a cgroup which has @ss enabled on it, try
    5429             :  * to get the corresponding css and return it.  If such css doesn't exist
    5430             :  * or can't be pinned, an ERR_PTR value is returned.
    5431             :  */
    5432           0 : struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
    5433             :                                                        struct cgroup_subsys *ss)
    5434             : {
    5435           0 :         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
    5436             :         struct cgroup_subsys_state *css = NULL;
    5437             :         struct cgroup *cgrp;
    5438             : 
    5439             :         /* is @dentry a cgroup dir? */
    5440           0 :         if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
    5441             :             kernfs_type(kn) != KERNFS_DIR)
    5442             :                 return ERR_PTR(-EBADF);
    5443             : 
    5444             :         rcu_read_lock();
    5445             : 
    5446             :         /*
    5447             :          * This path doesn't originate from kernfs and @kn could already
    5448             :          * have been or be removed at any point.  @kn->priv is RCU
    5449             :          * protected for this access.  See css_release_work_fn() for details.
    5450             :          */
    5451           0 :         cgrp = rcu_dereference(kn->priv);
    5452           0 :         if (cgrp)
    5453             :                 css = cgroup_css(cgrp, ss);
    5454             : 
    5455           0 :         if (!css || !css_tryget_online(css))
    5456             :                 css = ERR_PTR(-ENOENT);
    5457             : 
    5458             :         rcu_read_unlock();
    5459           0 :         return css;
    5460             : }
    5461             : 
    5462             : /**
    5463             :  * css_from_id - lookup css by id
    5464             :  * @id: the cgroup id
    5465             :  * @ss: cgroup subsys to be looked into
    5466             :  *
    5467             :  * Returns the css if there's valid one with @id, otherwise returns NULL.
    5468             :  * Should be called under rcu_read_lock().
    5469             :  */
    5470           0 : struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
    5471             : {
    5472             :         WARN_ON_ONCE(!rcu_read_lock_held());
    5473           0 :         return idr_find(&ss->css_idr, id);
    5474             : }
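
Because css_from_id() is a bare idr lookup, a caller that wants to use
the css outside the RCU critical section must pin it first.  A hedged
in-kernel sketch (the function name is hypothetical):

        static struct cgroup_subsys_state *
        example_get_css(struct cgroup_subsys *ss, int id)
        {
                struct cgroup_subsys_state *css;

                rcu_read_lock();
                css = css_from_id(id, ss);
                /* pin so the css stays valid after rcu_read_unlock() */
                if (css && !css_tryget_online(css))
                        css = NULL;
                rcu_read_unlock();
                return css;     /* caller drops with css_put() */
        }
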
    5475             : 
    5476             : #ifdef CONFIG_CGROUP_DEBUG
    5477             : static struct cgroup_subsys_state *
    5478             : debug_css_alloc(struct cgroup_subsys_state *parent_css)
    5479             : {
    5480             :         struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
    5481             : 
    5482             :         if (!css)
    5483             :                 return ERR_PTR(-ENOMEM);
    5484             : 
    5485             :         return css;
    5486             : }
    5487             : 
    5488             : static void debug_css_free(struct cgroup_subsys_state *css)
    5489             : {
    5490             :         kfree(css);
    5491             : }
    5492             : 
    5493             : static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
    5494             :                                 struct cftype *cft)
    5495             : {
    5496             :         return cgroup_task_count(css->cgroup);
    5497             : }
    5498             : 
    5499             : static u64 current_css_set_read(struct cgroup_subsys_state *css,
    5500             :                                 struct cftype *cft)
    5501             : {
    5502             :         return (u64)(unsigned long)current->cgroups;
    5503             : }
    5504             : 
    5505             : static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
    5506             :                                          struct cftype *cft)
    5507             : {
    5508             :         u64 count;
    5509             : 
    5510             :         rcu_read_lock();
    5511             :         count = atomic_read(&task_css_set(current)->refcount);
    5512             :         rcu_read_unlock();
    5513             :         return count;
    5514             : }
    5515             : 
    5516             : static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
    5517             : {
    5518             :         struct cgrp_cset_link *link;
    5519             :         struct css_set *cset;
    5520             :         char *name_buf;
    5521             : 
    5522             :         name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
    5523             :         if (!name_buf)
    5524             :                 return -ENOMEM;
    5525             : 
    5526             :         down_read(&css_set_rwsem);
    5527             :         rcu_read_lock();
    5528             :         cset = rcu_dereference(current->cgroups);
    5529             :         list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
    5530             :                 struct cgroup *c = link->cgrp;
    5531             : 
    5532             :                 cgroup_name(c, name_buf, NAME_MAX + 1);
    5533             :                 seq_printf(seq, "Root %d group %s\n",
    5534             :                            c->root->hierarchy_id, name_buf);
    5535             :         }
    5536             :         rcu_read_unlock();
    5537             :         up_read(&css_set_rwsem);
    5538             :         kfree(name_buf);
    5539             :         return 0;
    5540             : }
    5541             : 
    5542             : #define MAX_TASKS_SHOWN_PER_CSS 25
    5543             : static int cgroup_css_links_read(struct seq_file *seq, void *v)
    5544             : {
    5545             :         struct cgroup_subsys_state *css = seq_css(seq);
    5546             :         struct cgrp_cset_link *link;
    5547             : 
    5548             :         down_read(&css_set_rwsem);
    5549             :         list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
    5550             :                 struct css_set *cset = link->cset;
    5551             :                 struct task_struct *task;
    5552             :                 int count = 0;
    5553             : 
    5554             :                 seq_printf(seq, "css_set %p\n", cset);
    5555             : 
    5556             :                 list_for_each_entry(task, &cset->tasks, cg_list) {
    5557             :                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
    5558             :                                 goto overflow;
    5559             :                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
    5560             :                 }
    5561             : 
    5562             :                 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
    5563             :                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
    5564             :                                 goto overflow;
    5565             :                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
    5566             :                 }
    5567             :                 continue;
    5568             :         overflow:
    5569             :                 seq_puts(seq, "  ...\n");
    5570             :         }
    5571             :         up_read(&css_set_rwsem);
    5572             :         return 0;
    5573             : }
    5574             : 
    5575             : static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
    5576             : {
    5577             :         return (!cgroup_has_tasks(css->cgroup) &&
    5578             :                 !css_has_online_children(&css->cgroup->self));
    5579             : }
    5580             : 
    5581             : static struct cftype debug_files[] =  {
    5582             :         {
    5583             :                 .name = "taskcount",
    5584             :                 .read_u64 = debug_taskcount_read,
    5585             :         },
    5586             : 
    5587             :         {
    5588             :                 .name = "current_css_set",
    5589             :                 .read_u64 = current_css_set_read,
    5590             :         },
    5591             : 
    5592             :         {
    5593             :                 .name = "current_css_set_refcount",
    5594             :                 .read_u64 = current_css_set_refcount_read,
    5595             :         },
    5596             : 
    5597             :         {
    5598             :                 .name = "current_css_set_cg_links",
    5599             :                 .seq_show = current_css_set_cg_links_read,
    5600             :         },
    5601             : 
    5602             :         {
    5603             :                 .name = "cgroup_css_links",
    5604             :                 .seq_show = cgroup_css_links_read,
    5605             :         },
    5606             : 
    5607             :         {
    5608             :                 .name = "releasable",
    5609             :                 .read_u64 = releasable_read,
    5610             :         },
    5611             : 
    5612             :         { }     /* terminate */
    5613             : };
    5614             : 
    5615             : struct cgroup_subsys debug_cgrp_subsys = {
    5616             :         .css_alloc = debug_css_alloc,
    5617             :         .css_free = debug_css_free,
    5618             :         .legacy_cftypes = debug_files,
    5619             : };
    5620             : #endif /* CONFIG_CGROUP_DEBUG */
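
When CONFIG_CGROUP_DEBUG is enabled, the files above appear with the
usual "debug." prefix once the controller is mounted.  Illustrative
usage, as comments (paths are examples):

        /*
         *   mount -t cgroup -o debug none /sys/fs/cgroup/debug
         *   cat /sys/fs/cgroup/debug/debug.taskcount
         */
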

Generated by: LCOV version 1.11