From 556c19f563b64dd5463b0030d19c8681f4f7446e Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Mon, 22 Jan 2024 14:31:03 +0800
Subject: [PATCH] ubifs: Queue up space reservation tasks if retrying many
 times

Recently we catched ENOSPC returned by make_reservation() while doing
fsstress on UBIFS, we got following information when it occurred (See
details in Link):

 UBIFS error (ubi0:0 pid 3640152): make_reservation [ubifs]: cannot
 reserve 112 bytes in jhead 2, error -28
 CPU: 2 PID: 3640152 Comm: kworker/u16:2 Tainted: G    B   W
 Hardware name: Hisilicon PhosphorHi1230 EMU (DT)
 Workqueue: writeback wb_workfn (flush-ubifs_0_0)
 Call trace:
  dump_stack+0x114/0x198
  make_reservation+0x564/0x610 [ubifs]
  ubifs_jnl_write_data+0x328/0x48c [ubifs]
  do_writepage+0x2a8/0x3e4 [ubifs]
  ubifs_writepage+0x16c/0x374 [ubifs]
  generic_writepages+0xb4/0x114
  do_writepages+0xcc/0x11c
  writeback_sb_inodes+0x2d0/0x564
  wb_writeback+0x20c/0x2b4
  wb_workfn+0x404/0x510
  process_one_work+0x304/0x4ac
  worker_thread+0x31c/0x4e4
  kthread+0x23c/0x290
  Budgeting info: data budget sum 17576, total budget sum 17768
	budg_data_growth 4144, budg_dd_growth 13432, budg_idx_growth 192
	min_idx_lebs 13, old_idx_sz 988640, uncommitted_idx 0
	page_budget 4144, inode_budget 160, dent_budget 312
	nospace 0, nospace_rp 0
	dark_wm 8192, dead_wm 4096, max_idx_node_sz 192
	freeable_cnt 0, calc_idx_sz 988640, idx_gc_cnt 0
	dirty_pg_cnt 4, dirty_zn_cnt 0, clean_zn_cnt 4811
	gc_lnum 21, ihead_lnum 14
	jhead 0 (GC)	 LEB 16
	jhead 1 (base)	 LEB 34
	jhead 2 (data)	 LEB 23
	bud LEB 16
	bud LEB 23
	bud LEB 34
	old bud LEB 33
	old bud LEB 31
	old bud LEB 15
	commit state 4
 Budgeting predictions:
	available: 33832, outstanding 17576, free 15356
 (pid 3640152) start dumping LEB properties
 (pid 3640152) Lprops statistics: empty_lebs 3, idx_lebs  11
	taken_empty_lebs 1, total_free 1253376, total_dirty 2445736
	total_used 3438712, total_dark 65536, total_dead 17248
 LEB 15 free 0      dirty 248000   used 5952   (taken)
 LEB 16 free 110592 dirty 896      used 142464 (taken, jhead 0 (GC))
 LEB 21 free 253952 dirty 0        used 0      (taken, GC LEB)
 LEB 23 free 0      dirty 248104   used 5848   (taken, jhead 2 (data))
 LEB 29 free 253952 dirty 0        used 0      (empty)
 LEB 33 free 0      dirty 253952   used 0      (taken)
 LEB 34 free 217088 dirty 36544    used 320    (taken, jhead 1 (base))
 LEB 37 free 253952 dirty 0        used 0      (empty)
 OTHERS: index lebs, zero-available non-index lebs

According to the budget algorithm, there are 5 LEBs reserved for budget:
three journal heads(16,23,34), 1 GC LEB(21) and 1 deletion LEB(can be
used in make_reservation()). There are 2 empty LEBs used for index nodes,
which is calculated as min_idx_lebs - idx_lebs = 2. In theory, LEB 15
and 33 should be reclaimed as free state after committing, but it is now
in taken state. After looking the realization of reserve_space(), there's
a possible situation:

LEB 15: free 2000 dirty 248000 used 3952 (jhead 2)
LEB 23: free 2000 dirty 248104 used 3848 (bud, taken)
LEB 33: free 2000 dirty 251952 used 0    (bud, taken)

      wb_workfn          wb_workfn_2
do_writepage // write 3000 bytes
 ubifs_jnl_write_data
  make_reservation
   reserve_space
    ubifs_garbage_collect
     ubifs_find_dirty_leb // ret ENOSPC, dirty LEBs are taken
   nospc_retries++  // 1
   ubifs_run_commit
    do_commit

LEB 15: free 2000 dirty 248000 used 3952 (jhead 2)
LEB 23: free 2000 dirty 248104 used 3848 (dirty)
LEB 33: free 2000 dirty 251952 used 0    (dirty)

                   do_writepage // write 2000 bytes for 3 times
		    ubifs_jnl_write_data
		    // grabs 15\23\33

LEB 15: free 0    dirty 248000 used 5952 (bud, taken)
LEB 23: free 0    dirty 248104 used 5848 (jhead 2)
LEB 33: free 0    dirty 253952 used 0    (bud, taken)

   reserve_space
    ubifs_garbage_collect
     ubifs_find_dirty_leb // ret ENOSPC, dirty LEBs are taken
   if (nospc_retries++ < 2) // false
 ubifs_ro_mode !

Fetch a reproducer in Link.

The dirty LEBs could be grabbed by other threads, which fails finding dirty
LEBs of GC in current thread, so make_reservation() could try many times to
invoke GC&&committing, but current realization limits the times of retrying
as 'nospc_retries'(twice).
Fix it by adding a wait queue, start queuing up space reservation tasks
when someone task has retried gc + commit for many times. Then there is
only one task making space reservation at any time, and it can always make
success under the premise of correct budgeting.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=218164
Fixes: 1e51764a3c2a ("UBIFS: add new flash file system")
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/journal.c | 171 +++++++++++++++++++++++++++++++++++++++------
 fs/ubifs/super.c   |   2 +
 fs/ubifs/ubifs.h   |   4 ++
 3 files changed, 155 insertions(+), 22 deletions(-)

diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f0a5538c84b0..74aee92433d7 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -292,6 +292,96 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
 	return err;
 }
 
+/**
+ * __queue_and_wait - queue a task and wait until the task is waked up.
+ * @c: UBIFS file-system description object
+ *
+ * This function adds current task in queue and waits until the task is waked
+ * up. This function should be called with @c->reserve_space_wq locked.
+ */
+static void __queue_and_wait(struct ubifs_info *c)
+{
+	DEFINE_WAIT(wait);
+
+	__add_wait_queue_entry_tail_exclusive(&c->reserve_space_wq, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_unlock(&c->reserve_space_wq.lock);
+
+	schedule();
+	finish_wait(&c->reserve_space_wq, &wait);
+}
+
+/**
+ * wait_for_reservation - try queuing current task to wait until waked up.
+ * @c: UBIFS file-system description object
+ *
+ * This function queues current task to wait until waked up, if queuing is
+ * started(@c->need_wait_space is not %0). Returns %true if current task is
+ * added in queue, otherwise %false is returned.
+ */
+static bool wait_for_reservation(struct ubifs_info *c)
+{
+	if (likely(atomic_read(&c->need_wait_space) == 0))
+		/* Quick path to check whether queuing is started. */
+		return false;
+
+	spin_lock(&c->reserve_space_wq.lock);
+	if (atomic_read(&c->need_wait_space) == 0) {
+		/* Queuing is not started, don't queue current task. */
+		spin_unlock(&c->reserve_space_wq.lock);
+		return false;
+	}
+
+	__queue_and_wait(c);
+	return true;
+}
+
+/**
+ * wake_up_reservation - wake up first task in queue or stop queuing.
+ * @c: UBIFS file-system description object
+ *
+ * This function wakes up the first task in queue if it exists, or stops
+ * queuing if no tasks in queue.
+ */
+static void wake_up_reservation(struct ubifs_info *c)
+{
+	spin_lock(&c->reserve_space_wq.lock);
+	if (waitqueue_active(&c->reserve_space_wq))
+		wake_up_locked(&c->reserve_space_wq);
+	else
+		/*
+		 * Compared with wait_for_reservation(), set @c->need_wait_space
+		 * under the protection of wait queue lock, which can avoid that
+		 * @c->need_wait_space is set to 0 after new task queued.
+		 */
+		atomic_set(&c->need_wait_space, 0);
+	spin_unlock(&c->reserve_space_wq.lock);
+}
+
+/**
+ * wake_up_reservation - add current task in queue or start queuing.
+ * @c: UBIFS file-system description object
+ *
+ * This function starts queuing if queuing is not started, otherwise adds
+ * current task in queue.
+ */
+static void add_or_start_queue(struct ubifs_info *c)
+{
+	spin_lock(&c->reserve_space_wq.lock);
+	if (atomic_cmpxchg(&c->need_wait_space, 0, 1) == 0) {
+		/* Starts queuing, task can go on directly. */
+		spin_unlock(&c->reserve_space_wq.lock);
+		return;
+	}
+
+	/*
+	 * There are at least two tasks have retried more than 32 times
+	 * at certain point, first task has started queuing, just queue
+	 * the left tasks.
+	 */
+	__queue_and_wait(c);
+}
+
 /**
  * make_reservation - reserve journal space.
  * @c: UBIFS file-system description object
@@ -311,33 +401,27 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
 static int make_reservation(struct ubifs_info *c, int jhead, int len)
 {
 	int err, cmt_retries = 0, nospc_retries = 0;
+	bool blocked = wait_for_reservation(c);
 
 again:
 	down_read(&c->commit_sem);
 	err = reserve_space(c, jhead, len);
-	if (!err)
+	if (!err) {
 		/* c->commit_sem will get released via finish_reservation(). */
-		return 0;
+		goto out_wake_up;
+	}
 	up_read(&c->commit_sem);
 
 	if (err == -ENOSPC) {
 		/*
 		 * GC could not make any progress. We should try to commit
-		 * once because it could make some dirty space and GC would
-		 * make progress, so make the error -EAGAIN so that the below
+		 * because it could make some dirty space and GC would make
+		 * progress, so make the error -EAGAIN so that the below
 		 * will commit and re-try.
 		 */
-		if (nospc_retries++ < 2) {
-			dbg_jnl("no space, retry");
-			err = -EAGAIN;
-		}
-
-		/*
-		 * This means that the budgeting is incorrect. We always have
-		 * to be able to write to the media, because all operations are
-		 * budgeted. Deletions are not budgeted, though, but we reserve
-		 * an extra LEB for them.
-		 */
+		nospc_retries++;
+		dbg_jnl("no space, retry");
+		err = -EAGAIN;
 	}
 
 	if (err != -EAGAIN)
@@ -349,15 +433,37 @@ again:
 	 */
 	if (cmt_retries > 128) {
 		/*
-		 * This should not happen unless the journal size limitations
-		 * are too tough.
+		 * This should not happen unless:
+		 * 1. The journal size limitations are too tough.
+		 * 2. The budgeting is incorrect. We always have to be able to
+		 *    write to the media, because all operations are budgeted.
+		 *    Deletions are not budgeted, though, but we reserve an
+		 *    extra LEB for them.
 		 */
-		ubifs_err(c, "stuck in space allocation");
+		ubifs_err(c, "stuck in space allocation, nospc_retries %d",
+			  nospc_retries);
 		err = -ENOSPC;
 		goto out;
-	} else if (cmt_retries > 32)
-		ubifs_warn(c, "too many space allocation re-tries (%d)",
-			   cmt_retries);
+	} else if (cmt_retries > 32) {
+		/*
+		 * It's almost impossible to happen, unless there are many tasks
+		 * making reservation concurrently and someone task has retried
+		 * gc + commit for many times, generated available space during
+		 * this period are grabbed by other tasks.
+		 * But if it happens, start queuing up all tasks that will make
+		 * space reservation, then there is only one task making space
+		 * reservation at any time, and it can always make success under
+		 * the premise of correct budgeting.
+		 */
+		ubifs_warn(c, "too many space allocation cmt_retries (%d) "
+			   "nospc_retries (%d), start queuing tasks",
+			   cmt_retries, nospc_retries);
+
+		if (!blocked) {
+			blocked = true;
+			add_or_start_queue(c);
+		}
+	}
 
 	dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
 		cmt_retries);
@@ -365,7 +471,7 @@ again:
 
 	err = ubifs_run_commit(c);
 	if (err)
-		return err;
+		goto out_wake_up;
 	goto again;
 
 out:
@@ -380,6 +486,27 @@ out:
 		cmt_retries = dbg_check_lprops(c);
 		up_write(&c->commit_sem);
 	}
+out_wake_up:
+	if (blocked) {
+		/*
+		 * Only tasks that have ever started queuing or ever been queued
+		 * can wake up other queued tasks, which can make sure that
+		 * there is only one task waked up to make space reservation.
+		 * For example:
+		 *      task A          task B           task C
+		 *                 make_reservation  make_reservation
+		 * reserve_space // 0
+		 * wake_up_reservation
+		 *                  atomic_cmpxchg // 0, start queuing
+		 *                  reserve_space
+		 *                                    wait_for_reservation
+		 *                                     __queue_and_wait
+		 *                                      add_wait_queue
+		 *  if (blocked) // false
+		 *  // So that task C won't be waked up to race with task B
+		 */
+		wake_up_reservation(c);
+	}
 	return err;
 }
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 09e270d6ed02..571c9dc92b94 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2151,6 +2151,8 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
 		mutex_init(&c->bu_mutex);
 		mutex_init(&c->write_reserve_mutex);
 		init_waitqueue_head(&c->cmt_wq);
+		init_waitqueue_head(&c->reserve_space_wq);
+		atomic_set(&c->need_wait_space, 0);
 		c->buds = RB_ROOT;
 		c->old_idx = RB_ROOT;
 		c->size_tree = RB_ROOT;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 6eba287ae66c..1f3ea879d93a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1047,6 +1047,8 @@ struct ubifs_debug_info;
  * @bg_bud_bytes: number of bud bytes when background commit is initiated
  * @old_buds: buds to be released after commit ends
  * @max_bud_cnt: maximum number of buds
+ * @need_wait_space: Non %0 means space reservation tasks need to wait in queue
+ * @reserve_space_wq: wait queue to sleep on if @need_wait_space is not %0
  *
  * @commit_sem: synchronizes committer with other processes
  * @cmt_state: commit state
@@ -1305,6 +1307,8 @@ struct ubifs_info {
 	long long bg_bud_bytes;
 	struct list_head old_buds;
 	int max_bud_cnt;
+	atomic_t need_wait_space;
+	wait_queue_head_t reserve_space_wq;
 
 	struct rw_semaphore commit_sem;
 	int cmt_state;