// SPDX-License-Identifier: GPL-2.0
/*
 * Basic worker thread pool for io_uring
 *
 * Copyright (C) 2019 Jens Axboe
 *
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sched/signal.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/rculist_nulls.h>
#include <linux/fs_struct.h>
#include <linux/task_work.h>
#include <linux/blk-cgroup.h>
#include <linux/audit.h>

#include "io-wq.h"

#define WORKER_IDLE_TIMEOUT	(5 * HZ)

enum {
	IO_WORKER_F_UP		= 1,	/* up and active */
	IO_WORKER_F_RUNNING	= 2,	/* account as running */
	IO_WORKER_F_FREE	= 4,	/* worker on free list */
	IO_WORKER_F_FIXED	= 8,	/* static idle worker */
	IO_WORKER_F_BOUND	= 16,	/* is doing bounded work */
};

enum {
	IO_WQ_BIT_EXIT		= 0,	/* wq exiting */
	IO_WQ_BIT_CANCEL	= 1,	/* cancel work on list */
	IO_WQ_BIT_ERROR		= 2,	/* error on setup */
};

enum {
	IO_WQE_FLAG_STALLED	= 1,	/* stalled on hash */
};

/*
 * One for each thread in a wqe pool
 */
struct io_worker {
	refcount_t ref;
	unsigned flags;
	struct hlist_nulls_node nulls_node;
	struct list_head all_list;
	struct task_struct *task;
	struct io_wqe *wqe;

	struct io_wq_work *cur_work;
	spinlock_t lock;

	struct rcu_head rcu;
	struct mm_struct *mm;
#ifdef CONFIG_BLK_CGROUP
	struct cgroup_subsys_state *blkcg_css;
#endif
	const struct cred *cur_creds;
	const struct cred *saved_creds;
	struct files_struct *restore_files;
	struct nsproxy *restore_nsproxy;
	struct fs_struct *restore_fs;
};

#if BITS_PER_LONG == 64
#define IO_WQ_HASH_ORDER	6
#else
#define IO_WQ_HASH_ORDER	5
#endif

#define IO_WQ_NR_HASH_BUCKETS	(1u << IO_WQ_HASH_ORDER)

struct io_wqe_acct {
	unsigned nr_workers;
	unsigned max_workers;
	atomic_t nr_running;
};

enum {
	IO_WQ_ACCT_BOUND,
	IO_WQ_ACCT_UNBOUND,
};

/*
 * Per-node worker thread pool
 */
struct io_wqe {
	struct {
		raw_spinlock_t lock;
		struct io_wq_work_list work_list;
		unsigned long hash_map;
		unsigned flags;
	} ____cacheline_aligned_in_smp;

	int node;
	struct io_wqe_acct acct[2];

	struct hlist_nulls_head free_list;
	struct list_head all_list;

	struct io_wq *wq;
	struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
};

/*
 * Per io_wq state
 */
struct io_wq {
	struct io_wqe **wqes;
	unsigned long state;

	free_work_fn *free_work;
	io_wq_work_fn *do_work;

	struct task_struct *manager;
	struct user_struct *user;
	refcount_t refs;
	struct completion done;

	refcount_t use_refs;
};

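/*
 * Worker reference counting: io_worker_get() only succeeds while the worker
 * is still alive, and io_worker_release() wakes the worker task once the
 * last reference is dropped so io_worker_exit() can finish tearing it down.
 */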
static bool io_worker_get(struct io_worker *worker)
{
	return refcount_inc_not_zero(&worker->ref);
}

static void io_worker_release(struct io_worker *worker)
{
	if (refcount_dec_and_test(&worker->ref))
		wake_up_process(worker->task);
}

/*
 * Note: drops the wqe->lock if returning true! The caller must re-acquire
 * the lock in that case. Some callers need to restart handling if this
 * happens, so we can't just re-acquire the lock on behalf of the caller.
 */
static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
{
	bool dropped_lock = false;

	if (worker->saved_creds) {
		revert_creds(worker->saved_creds);
		worker->cur_creds = worker->saved_creds = NULL;
	}

	if (current->files != worker->restore_files) {
		__acquire(&wqe->lock);
		raw_spin_unlock_irq(&wqe->lock);
		dropped_lock = true;

		task_lock(current);
		current->files = worker->restore_files;
		current->nsproxy = worker->restore_nsproxy;
		task_unlock(current);
	}

	if (current->fs != worker->restore_fs)
		current->fs = worker->restore_fs;

	/*
	 * If we have an active mm, we need to drop the wq lock before unusing
	 * it. If we do, return true and let the caller retry the idle loop.
	 */
	if (worker->mm) {
		if (!dropped_lock) {
			__acquire(&wqe->lock);
			raw_spin_unlock_irq(&wqe->lock);
			dropped_lock = true;
		}
		__set_current_state(TASK_RUNNING);
		kthread_unuse_mm(worker->mm);
		mmput(worker->mm);
		worker->mm = NULL;
	}

#ifdef CONFIG_BLK_CGROUP
	if (worker->blkcg_css) {
		kthread_associate_blkcg(NULL);
		worker->blkcg_css = NULL;
	}
#endif

	return dropped_lock;
}

static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
						   struct io_wq_work *work)
{
	if (work->flags & IO_WQ_WORK_UNBOUND)
		return &wqe->acct[IO_WQ_ACCT_UNBOUND];

	return &wqe->acct[IO_WQ_ACCT_BOUND];
}

static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
						  struct io_worker *worker)
{
	if (worker->flags & IO_WORKER_F_BOUND)
		return &wqe->acct[IO_WQ_ACCT_BOUND];

	return &wqe->acct[IO_WQ_ACCT_UNBOUND];
}

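/*
 * Final exit path for a worker: wait for any short-lived references to go
 * away, undo the running/unbound accounting, unlink the worker from the
 * free and all lists under wqe->lock, and drop the io_wq reference that
 * was taken on its behalf at creation time.
 */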
static void io_worker_exit(struct io_worker *worker)
{
	struct io_wqe *wqe = worker->wqe;
	struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);

	/*
	 * If we're not at zero, someone else is holding a brief reference
	 * to the worker. Wait for that to go away.
	 */
	set_current_state(TASK_INTERRUPTIBLE);
	if (!refcount_dec_and_test(&worker->ref))
		schedule();
	__set_current_state(TASK_RUNNING);

	preempt_disable();
	current->flags &= ~PF_IO_WORKER;
	if (worker->flags & IO_WORKER_F_RUNNING)
		atomic_dec(&acct->nr_running);
	if (!(worker->flags & IO_WORKER_F_BOUND))
		atomic_dec(&wqe->wq->user->processes);
	worker->flags = 0;
	preempt_enable();

	raw_spin_lock_irq(&wqe->lock);
	hlist_nulls_del_rcu(&worker->nulls_node);
	list_del_rcu(&worker->all_list);
	if (__io_worker_unuse(wqe, worker)) {
		__release(&wqe->lock);
		raw_spin_lock_irq(&wqe->lock);
	}
	acct->nr_workers--;
	raw_spin_unlock_irq(&wqe->lock);

	kfree_rcu(worker, rcu);
	if (refcount_dec_and_test(&wqe->wq->refs))
		complete(&wqe->wq->done);
}

static inline bool io_wqe_run_queue(struct io_wqe *wqe)
	__must_hold(wqe->lock)
{
	if (!wq_list_empty(&wqe->work_list) &&
	    !(wqe->flags & IO_WQE_FLAG_STALLED))
		return true;
	return false;
}

/*
 * Check head of free list for an available worker. If one isn't available,
 * caller must wake up the wq manager to create one.
 */
static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
	__must_hold(RCU)
{
	struct hlist_nulls_node *n;
	struct io_worker *worker;

	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
	if (is_a_nulls(n))
		return false;

	worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
	if (io_worker_get(worker)) {
		wake_up_process(worker->task);
		io_worker_release(worker);
		return true;
	}

	return false;
}

/*
 * We need a worker. If we find a free one, we're good. If not, and we're
 * below the max number of workers, wake up the manager to create one.
 */
static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
{
	bool ret;

	/*
	 * Most likely an attempt to queue unbounded work on an io_wq that
	 * wasn't setup with any unbounded workers.
	 */
	WARN_ON_ONCE(!acct->max_workers);

	rcu_read_lock();
	ret = io_wqe_activate_free_worker(wqe);
	rcu_read_unlock();

	if (!ret && acct->nr_workers < acct->max_workers)
		wake_up_process(wqe->wq->manager);
}

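/*
 * nr_running tracks how many workers of an accounting class are currently
 * runnable. When it drops to zero while work is still queued, wake a free
 * worker (or the manager) so the queue keeps making progress.
 */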
static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker)
{
	struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);

	atomic_inc(&acct->nr_running);
}

static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker)
	__must_hold(wqe->lock)
{
	struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);

	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
		io_wqe_wake_worker(wqe, acct);
}

static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
{
	allow_kernel_signal(SIGINT);

	current->flags |= PF_IO_WORKER;

	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
	worker->restore_files = current->files;
	worker->restore_nsproxy = current->nsproxy;
	worker->restore_fs = current->fs;
	io_wqe_inc_running(wqe, worker);
}

/*
 * Worker will start processing some work. Move it to the busy list, if
 * it's currently on the freelist
 */
static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
			     struct io_wq_work *work)
	__must_hold(wqe->lock)
{
	bool worker_bound, work_bound;

	if (worker->flags & IO_WORKER_F_FREE) {
		worker->flags &= ~IO_WORKER_F_FREE;
		hlist_nulls_del_init_rcu(&worker->nulls_node);
	}

	/*
	 * If worker is moving from bound to unbound (or vice versa), then
	 * ensure we update the running accounting.
	 */
	worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
	work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
	if (worker_bound != work_bound) {
		io_wqe_dec_running(wqe, worker);
		if (work_bound) {
			worker->flags |= IO_WORKER_F_BOUND;
			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
			atomic_dec(&wqe->wq->user->processes);
		} else {
			worker->flags &= ~IO_WORKER_F_BOUND;
			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
			atomic_inc(&wqe->wq->user->processes);
		}
		io_wqe_inc_running(wqe, worker);
	}
}

/*
 * No work, worker going to sleep. Move to freelist, and unuse mm if we
 * have one attached. Dropping the mm may potentially sleep, so we drop
 * the lock in that case and return success. Since the caller has to
 * retry the loop in that case (we changed task state), we don't regrab
 * the lock if we return success.
 */
static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
	__must_hold(wqe->lock)
{
	if (!(worker->flags & IO_WORKER_F_FREE)) {
		worker->flags |= IO_WORKER_F_FREE;
		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
	}

	return __io_worker_unuse(wqe, worker);
}

static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{
	return work->flags >> IO_WQ_HASH_SHIFT;
}

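/*
 * Pick the next runnable work item. Unhashed work can run immediately;
 * hashed work only runs if nothing with the same hash is in flight, in
 * which case the whole [work, tail] chain for that hash is spliced off
 * the list in one go.
 */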
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
	__must_hold(wqe->lock)
{
	struct io_wq_work_node *node, *prev;
	struct io_wq_work *work, *tail;
	unsigned int hash;

	wq_list_for_each(node, prev, &wqe->work_list) {
		work = container_of(node, struct io_wq_work, list);

		/* not hashed, can run anytime */
		if (!io_wq_is_hashed(work)) {
			wq_list_del(&wqe->work_list, node, prev);
			return work;
		}

		/* hashed, can run if not already running */
		hash = io_get_work_hash(work);
		if (!(wqe->hash_map & BIT(hash))) {
			wqe->hash_map |= BIT(hash);
			/* all items with this hash lie in [work, tail] */
			tail = wqe->hash_tail[hash];
			wqe->hash_tail[hash] = NULL;
			wq_list_cut(&wqe->work_list, &tail->list, prev);
			return work;
		}
	}

	return NULL;
}

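/*
 * Switch this worker kthread to the mm recorded in the work identity. If
 * the mm can no longer be grabbed, flag the work for cancellation instead.
 */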
static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
{
	if (worker->mm) {
		kthread_unuse_mm(worker->mm);
		mmput(worker->mm);
		worker->mm = NULL;
	}

	if (mmget_not_zero(work->identity->mm)) {
		kthread_use_mm(work->identity->mm);
		worker->mm = work->identity->mm;
		return;
	}

	/* failed grabbing mm, ensure work gets cancelled */
	work->flags |= IO_WQ_WORK_CANCEL;
}

static inline void io_wq_switch_blkcg(struct io_worker *worker,
				      struct io_wq_work *work)
{
#ifdef CONFIG_BLK_CGROUP
	if (!(work->flags & IO_WQ_WORK_BLKCG))
		return;
	if (work->identity->blkcg_css != worker->blkcg_css) {
		kthread_associate_blkcg(work->identity->blkcg_css);
		worker->blkcg_css = work->identity->blkcg_css;
	}
#endif
}

static void io_wq_switch_creds(struct io_worker *worker,
			       struct io_wq_work *work)
{
	const struct cred *old_creds = override_creds(work->identity->creds);

	worker->cur_creds = work->identity->creds;
	if (worker->saved_creds)
		put_cred(old_creds); /* creds set by previous switch */
	else
		worker->saved_creds = old_creds;
}

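/*
 * Adopt the submitting task's context (files, fs, mm, creds, blkcg, audit
 * info and RLIMIT_FSIZE) as recorded in work->identity, so the request runs
 * as if it were issued by the original task.
 */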
static void io_impersonate_work(struct io_worker *worker,
				struct io_wq_work *work)
{
	if ((work->flags & IO_WQ_WORK_FILES) &&
	    current->files != work->identity->files) {
		task_lock(current);
		current->files = work->identity->files;
		current->nsproxy = work->identity->nsproxy;
		task_unlock(current);
	}
	if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs)
		current->fs = work->identity->fs;
	if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm)
		io_wq_switch_mm(worker, work);
	if ((work->flags & IO_WQ_WORK_CREDS) &&
	    worker->cur_creds != work->identity->creds)
		io_wq_switch_creds(worker, work);
	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize;
	io_wq_switch_blkcg(worker, work);
#ifdef CONFIG_AUDIT
	current->loginuid = work->identity->loginuid;
	current->sessionid = work->identity->sessionid;
#endif
}

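/*
 * Publish worker->cur_work under worker->lock so that cancellation can
 * safely inspect what this worker is currently running.
 */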
static void io_assign_current_work(struct io_worker *worker,
				   struct io_wq_work *work)
{
	if (work) {
		/* flush pending signals before assigning new work */
		if (signal_pending(current))
			flush_signals(current);
		cond_resched();
	}

#ifdef CONFIG_AUDIT
	current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET);
	current->sessionid = AUDIT_SID_UNSET;
#endif

	spin_lock_irq(&worker->lock);
	worker->cur_work = work;
	spin_unlock_irq(&worker->lock);
}

static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);

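/*
 * Core processing loop for a worker: pull the next runnable item (or mark
 * the wqe stalled if only blocked hashed work remains), then run the whole
 * dependent link it heads, re-queueing linked work and releasing the hash
 * bit so other items with the same hash can run again.
 */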
static void io_worker_handle_work(struct io_worker *worker)
	__releases(wqe->lock)
{
	struct io_wqe *wqe = worker->wqe;
	struct io_wq *wq = wqe->wq;

	do {
		struct io_wq_work *work;
get_next:
		/*
		 * If we got some work, mark us as busy. If we didn't, but
		 * the list isn't empty, it means we stalled on hashed work.
		 * Mark us stalled so we don't keep looking for work when we
		 * can't make progress, any work completion or insertion will
		 * clear the stalled flag.
		 */
		work = io_get_next_work(wqe);
		if (work)
			__io_worker_busy(wqe, worker, work);
		else if (!wq_list_empty(&wqe->work_list))
			wqe->flags |= IO_WQE_FLAG_STALLED;

		raw_spin_unlock_irq(&wqe->lock);
		if (!work)
			break;
		io_assign_current_work(worker, work);

		/* handle a whole dependent link */
		do {
			struct io_wq_work *old_work, *next_hashed, *linked;
			unsigned int hash = io_get_work_hash(work);

			next_hashed = wq_next_work(work);
			io_impersonate_work(worker, work);
			/*
			 * OK to set IO_WQ_WORK_CANCEL even for uncancellable
			 * work, the worker function will do the right thing.
			 */
			if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
				work->flags |= IO_WQ_WORK_CANCEL;

			old_work = work;
			linked = wq->do_work(work);

			work = next_hashed;
			if (!work && linked && !io_wq_is_hashed(linked)) {
				work = linked;
				linked = NULL;
			}
			io_assign_current_work(worker, work);
			wq->free_work(old_work);

			if (linked)
				io_wqe_enqueue(wqe, linked);

			if (hash != -1U && !next_hashed) {
				raw_spin_lock_irq(&wqe->lock);
				wqe->hash_map &= ~BIT_ULL(hash);
				wqe->flags &= ~IO_WQE_FLAG_STALLED;
				/* skip unnecessary unlock-lock wqe->lock */
				if (!work)
					goto get_next;
				raw_spin_unlock_irq(&wqe->lock);
			}
		} while (work);

		raw_spin_lock_irq(&wqe->lock);
	} while (1);
}

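/*
 * Worker thread main loop: handle work while it is available, otherwise
 * idle on the free list until new work arrives or the idle timeout fires.
 * Only the fixed (first bound) worker on each node survives the timeout;
 * the others exit.
 */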
static int io_wqe_worker(void *data)
{
	struct io_worker *worker = data;
	struct io_wqe *wqe = worker->wqe;
	struct io_wq *wq = wqe->wq;

	io_worker_start(wqe, worker);

	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
		set_current_state(TASK_INTERRUPTIBLE);
loop:
		raw_spin_lock_irq(&wqe->lock);
		if (io_wqe_run_queue(wqe)) {
			__set_current_state(TASK_RUNNING);
			io_worker_handle_work(worker);
			goto loop;
		}
		/* drops the lock on success, retry */
		if (__io_worker_idle(wqe, worker)) {
			__release(&wqe->lock);
			goto loop;
		}
		raw_spin_unlock_irq(&wqe->lock);
		if (signal_pending(current))
			flush_signals(current);
		if (schedule_timeout(WORKER_IDLE_TIMEOUT))
			continue;
		/* timed out, exit unless we're the fixed worker */
		if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
		    !(worker->flags & IO_WORKER_F_FIXED))
			break;
	}

	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
		raw_spin_lock_irq(&wqe->lock);
		if (!wq_list_empty(&wqe->work_list))
			io_worker_handle_work(worker);
		else
			raw_spin_unlock_irq(&wqe->lock);
	}

	io_worker_exit(worker);
	return 0;
}

/*
 * Called when a worker is scheduled in. Mark us as currently running.
 */
void io_wq_worker_running(struct task_struct *tsk)
{
	struct io_worker *worker = kthread_data(tsk);
	struct io_wqe *wqe = worker->wqe;

	if (!(worker->flags & IO_WORKER_F_UP))
		return;
	if (worker->flags & IO_WORKER_F_RUNNING)
		return;
	worker->flags |= IO_WORKER_F_RUNNING;
	io_wqe_inc_running(wqe, worker);
}

/*
 * Called when worker is going to sleep. If there are no workers currently
 * running and we have work pending, wake up a free one or have the manager
 * set one up.
 */
void io_wq_worker_sleeping(struct task_struct *tsk)
{
	struct io_worker *worker = kthread_data(tsk);
	struct io_wqe *wqe = worker->wqe;

	if (!(worker->flags & IO_WORKER_F_UP))
		return;
	if (!(worker->flags & IO_WORKER_F_RUNNING))
		return;

	worker->flags &= ~IO_WORKER_F_RUNNING;

	raw_spin_lock_irq(&wqe->lock);
	io_wqe_dec_running(wqe, worker);
	raw_spin_unlock_irq(&wqe->lock);
}

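/*
 * Create a worker kthread for the given accounting class (bound/unbound),
 * bind it to the wqe's NUMA node, put it on the free and all lists, and
 * take an io_wq reference on its behalf.
 */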
static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
	struct io_wqe_acct *acct = &wqe->acct[index];
	struct io_worker *worker;

	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
	if (!worker)
		return false;

	refcount_set(&worker->ref, 1);
	worker->nulls_node.pprev = NULL;
	worker->wqe = wqe;
	spin_lock_init(&worker->lock);

	worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
				"io_wqe_worker-%d/%d", index, wqe->node);
	if (IS_ERR(worker->task)) {
		kfree(worker);
		return false;
	}
	kthread_bind_mask(worker->task, cpumask_of_node(wqe->node));

	raw_spin_lock_irq(&wqe->lock);
	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
	list_add_tail_rcu(&worker->all_list, &wqe->all_list);
	worker->flags |= IO_WORKER_F_FREE;
	if (index == IO_WQ_ACCT_BOUND)
		worker->flags |= IO_WORKER_F_BOUND;
	if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
		worker->flags |= IO_WORKER_F_FIXED;
	acct->nr_workers++;
	raw_spin_unlock_irq(&wqe->lock);

	if (index == IO_WQ_ACCT_UNBOUND)
		atomic_inc(&wq->user->processes);

	refcount_inc(&wq->refs);
	wake_up_process(worker->task);
	return true;
}

static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
	__must_hold(wqe->lock)
{
	struct io_wqe_acct *acct = &wqe->acct[index];

	/* if we have available workers or no work, no need */
	if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
		return false;
	return acct->nr_workers < acct->max_workers;
}

static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
{
	send_sig(SIGINT, worker->task, 1);
	return false;
}

/*
 * Iterate the passed in list and call the specific function for each
 * worker that isn't exiting
 */
static bool io_wq_for_each_worker(struct io_wqe *wqe,
				  bool (*func)(struct io_worker *, void *),
				  void *data)
{
	struct io_worker *worker;
	bool ret = false;

	list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
		if (io_worker_get(worker)) {
			/* no task if node is/was offline */
			if (worker->task)
				ret = func(worker, data);
			io_worker_release(worker);
			if (ret)
				break;
		}
	}

	return ret;
}

static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
	wake_up_process(worker->task);
	return false;
}

/*
 * Manager thread. Tasked with creating new workers, if we need them.
 */
static int io_wq_manager(void *data)
{
	struct io_wq *wq = data;
	int node;

	/* create fixed workers */
	refcount_set(&wq->refs, 1);
	for_each_node(node) {
		if (!node_online(node))
			continue;
		if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
			continue;
		set_bit(IO_WQ_BIT_ERROR, &wq->state);
		set_bit(IO_WQ_BIT_EXIT, &wq->state);
		goto out;
	}

	complete(&wq->done);

	while (!kthread_should_stop()) {
		if (current->task_works)
			task_work_run();

		for_each_node(node) {
			struct io_wqe *wqe = wq->wqes[node];
			bool fork_worker[2] = { false, false };

			if (!node_online(node))
				continue;

			raw_spin_lock_irq(&wqe->lock);
			if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
				fork_worker[IO_WQ_ACCT_BOUND] = true;
			if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
				fork_worker[IO_WQ_ACCT_UNBOUND] = true;
			raw_spin_unlock_irq(&wqe->lock);
			if (fork_worker[IO_WQ_ACCT_BOUND])
				create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
			if (fork_worker[IO_WQ_ACCT_UNBOUND])
				create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
		}
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);
	}

	if (current->task_works)
		task_work_run();

out:
	if (refcount_dec_and_test(&wq->refs)) {
		complete(&wq->done);
		return 0;
	}
	/* if ERROR is set and we get here, we have workers to wake */
	if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
		rcu_read_lock();
		for_each_node(node)
			io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
		rcu_read_unlock();
	}
	return 0;
}

static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
			    struct io_wq_work *work)
{
	bool free_worker;

	if (!(work->flags & IO_WQ_WORK_UNBOUND))
		return true;
	if (atomic_read(&acct->nr_running))
		return true;

	rcu_read_lock();
	free_worker = !hlist_nulls_empty(&wqe->free_list);
	rcu_read_unlock();
	if (free_worker)
		return true;

	if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
	    !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
		return false;

	return true;
}

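/*
 * Execute a work item (and the dependent link it produces) directly via the
 * do_work/free_work callbacks with IO_WQ_WORK_CANCEL set, instead of
 * queueing it to a worker.
 */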
static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
{
	struct io_wq *wq = wqe->wq;

	do {
		struct io_wq_work *old_work = work;

		work->flags |= IO_WQ_WORK_CANCEL;
		work = wq->do_work(work);
		wq->free_work(old_work);
	} while (work);
}

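/*
 * Add work to the pending list. Hashed work is chained behind the current
 * tail of its hash bucket so io_get_next_work() can later splice the whole
 * run of same-hash items off the list in one operation.
 */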
static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
{
	unsigned int hash;
	struct io_wq_work *tail;

	if (!io_wq_is_hashed(work)) {
append:
		wq_list_add_tail(&work->list, &wqe->work_list);
		return;
	}

	hash = io_get_work_hash(work);
	tail = wqe->hash_tail[hash];
	wqe->hash_tail[hash] = work;
	if (!tail)
		goto append;

	wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
}

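/*
 * Queue work on this wqe and wake a worker (or have the manager create one)
 * if the work is marked concurrent or nothing of its accounting class is
 * currently running.
 */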
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
	int work_flags;
	unsigned long flags;

	/*
	 * Do early check to see if we need a new unbound worker, and if we do,
	 * if we're allowed to do so. This isn't 100% accurate as there's a
	 * gap between this check and incrementing the value, but that's OK.
	 * It's close enough to not be an issue, fork() has the same delay.
	 */
	if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
		io_run_cancel(work, wqe);
		return;
	}

	work_flags = work->flags;
	raw_spin_lock_irqsave(&wqe->lock, flags);
	io_wqe_insert_work(wqe, work);
	wqe->flags &= ~IO_WQE_FLAG_STALLED;
	raw_spin_unlock_irqrestore(&wqe->lock, flags);

	if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
	    !atomic_read(&acct->nr_running))
		io_wqe_wake_worker(wqe, acct);
}

void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
{
	struct io_wqe *wqe = wq->wqes[numa_node_id()];

	io_wqe_enqueue(wqe, work);
}

/*
 * Work items that hash to the same value will not be done in parallel.
 * Used to limit concurrent writes, generally hashed by inode.
 */
void io_wq_hash_work(struct io_wq_work *work, void *val)
{
	unsigned int bit;

	bit = hash_ptr(val, IO_WQ_HASH_ORDER);
	work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
}

void io_wq_cancel_all(struct io_wq *wq)
{
	int node;

	set_bit(IO_WQ_BIT_CANCEL, &wq->state);

	rcu_read_lock();
	for_each_node(node) {
		struct io_wqe *wqe = wq->wqes[node];

		io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
	}
	rcu_read_unlock();
}

struct io_cb_cancel_data {
	work_cancel_fn *fn;
	void *data;
	int nr_running;
	int nr_pending;
	bool cancel_all;
};

static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
	struct io_cb_cancel_data *match = data;
	unsigned long flags;

	/*
	 * Hold the lock to avoid ->cur_work going out of scope, caller
	 * may dereference the passed in work.
	 */
	spin_lock_irqsave(&worker->lock, flags);
	if (worker->cur_work &&
	    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
	    match->fn(worker->cur_work, match->data)) {
		send_sig(SIGINT, worker->task, 1);
		match->nr_running++;
	}
	spin_unlock_irqrestore(&worker->lock, flags);

	return match->nr_running && !match->cancel_all;
}

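/*
 * Remove a pending work item, fixing up the hash_tail pointer if the item
 * happened to be the tail of its hash chain.
 */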
static inline void io_wqe_remove_pending(struct io_wqe *wqe,
					 struct io_wq_work *work,
					 struct io_wq_work_node *prev)
{
	unsigned int hash = io_get_work_hash(work);
	struct io_wq_work *prev_work = NULL;

	if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) {
		if (prev)
			prev_work = container_of(prev, struct io_wq_work, list);
		if (prev_work && io_get_work_hash(prev_work) == hash)
			wqe->hash_tail[hash] = prev_work;
		else
			wqe->hash_tail[hash] = NULL;
	}
	wq_list_del(&wqe->work_list, &work->list, prev);
}

static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
				       struct io_cb_cancel_data *match)
{
	struct io_wq_work_node *node, *prev;
	struct io_wq_work *work;
	unsigned long flags;

retry:
	raw_spin_lock_irqsave(&wqe->lock, flags);
	wq_list_for_each(node, prev, &wqe->work_list) {
		work = container_of(node, struct io_wq_work, list);
		if (!match->fn(work, match->data))
			continue;
		io_wqe_remove_pending(wqe, work, prev);
		raw_spin_unlock_irqrestore(&wqe->lock, flags);
		io_run_cancel(work, wqe);
		match->nr_pending++;
		if (!match->cancel_all)
			return;

		/* not safe to continue after unlock */
		goto retry;
	}
	raw_spin_unlock_irqrestore(&wqe->lock, flags);
}

static void io_wqe_cancel_running_work(struct io_wqe *wqe,
				       struct io_cb_cancel_data *match)
{
	rcu_read_lock();
	io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
	rcu_read_unlock();
}

enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
				  void *data, bool cancel_all)
{
	struct io_cb_cancel_data match = {
		.fn		= cancel,
		.data		= data,
		.cancel_all	= cancel_all,
	};
	int node;

	/*
	 * First check pending list, if we're lucky we can just remove it
	 * from there. CANCEL_OK means that the work is returned as-new,
	 * no completion will be posted for it.
	 */
	for_each_node(node) {
		struct io_wqe *wqe = wq->wqes[node];

		io_wqe_cancel_pending_work(wqe, &match);
		if (match.nr_pending && !match.cancel_all)
			return IO_WQ_CANCEL_OK;
	}

	/*
	 * Now check if a free (going busy) or busy worker has the work
	 * currently running. If we find it there, we'll return CANCEL_RUNNING
	 * as an indication that we attempt to signal cancellation. The
	 * completion will run normally in this case.
	 */
	for_each_node(node) {
		struct io_wqe *wqe = wq->wqes[node];

		io_wqe_cancel_running_work(wqe, &match);
		if (match.nr_running && !match.cancel_all)
			return IO_WQ_CANCEL_RUNNING;
	}

	if (match.nr_running)
		return IO_WQ_CANCEL_RUNNING;
	if (match.nr_pending)
		return IO_WQ_CANCEL_OK;
	return IO_WQ_CANCEL_NOTFOUND;
}

static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data)
{
	return work == data;
}

enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
{
	return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false);
}

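/*
 * Create an io_wq with @bounded max bounded workers per node. The caller
 * supplies the do_work/free_work callbacks and, optionally, a user to
 * charge unbounded workers against. Roughly (illustrative sketch only, the
 * real caller lives in fs/io_uring.c):
 *
 *	struct io_wq_data data = {
 *		.user		= user,
 *		.free_work	= my_free_work,
 *		.do_work	= my_do_work,
 *	};
 *	struct io_wq *wq = io_wq_create(concurrency, &data);
 *
 * Work is then submitted with io_wq_enqueue() and the pool is torn down
 * with io_wq_destroy(). my_free_work/my_do_work above are placeholder
 * names, not functions defined anywhere in this file.
 */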
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
	int ret = -ENOMEM, node;
	struct io_wq *wq;

	if (WARN_ON_ONCE(!data->free_work || !data->do_work))
		return ERR_PTR(-EINVAL);

	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
	if (!wq)
		return ERR_PTR(-ENOMEM);

	wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
	if (!wq->wqes) {
		kfree(wq);
		return ERR_PTR(-ENOMEM);
	}

	wq->free_work = data->free_work;
	wq->do_work = data->do_work;

	/* caller must already hold a reference to this */
	wq->user = data->user;

	for_each_node(node) {
		struct io_wqe *wqe;
		int alloc_node = node;

		if (!node_online(alloc_node))
			alloc_node = NUMA_NO_NODE;
		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
		if (!wqe)
			goto err;
		wq->wqes[node] = wqe;
		wqe->node = alloc_node;
		wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
		atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
		if (wq->user) {
			wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
					task_rlimit(current, RLIMIT_NPROC);
		}
		atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
		wqe->wq = wq;
		raw_spin_lock_init(&wqe->lock);
		INIT_WQ_LIST(&wqe->work_list);
		INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
		INIT_LIST_HEAD(&wqe->all_list);
	}

	init_completion(&wq->done);

	wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
	if (!IS_ERR(wq->manager)) {
		wake_up_process(wq->manager);
		wait_for_completion(&wq->done);
		if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
			ret = -ENOMEM;
			goto err;
		}
		refcount_set(&wq->use_refs, 1);
		reinit_completion(&wq->done);
		return wq;
	}

	ret = PTR_ERR(wq->manager);
	complete(&wq->done);
err:
	for_each_node(node)
		kfree(wq->wqes[node]);
	kfree(wq->wqes);
	kfree(wq);
	return ERR_PTR(ret);
}

bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{
	if (data->free_work != wq->free_work || data->do_work != wq->do_work)
		return false;

	return refcount_inc_not_zero(&wq->use_refs);
}

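/*
 * Tear the pool down: stop the manager, wake every worker so it notices
 * IO_WQ_BIT_EXIT, wait for them all to finish, then free the per-node
 * structures.
 */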
static void __io_wq_destroy(struct io_wq *wq)
{
	int node;

	set_bit(IO_WQ_BIT_EXIT, &wq->state);
	if (wq->manager)
		kthread_stop(wq->manager);

	rcu_read_lock();
	for_each_node(node)
		io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
	rcu_read_unlock();

	wait_for_completion(&wq->done);

	for_each_node(node)
		kfree(wq->wqes[node]);
	kfree(wq->wqes);
	kfree(wq);
}

void io_wq_destroy(struct io_wq *wq)
{
	if (refcount_dec_and_test(&wq->use_refs))
		__io_wq_destroy(wq);
}

struct task_struct *io_wq_get_task(struct io_wq *wq)
{
	return wq->manager;
}