1

bcachefs: bch_sb_field_errors

Add a new superblock section to keep counts of errors seen since
filesystem creation: we'll be adding counters for every distinct fsck
error.

The new superblock section has entries of the form [ id, count,
time_of_last_error ]; this is intended to let us see what errors are
occurring - and getting fixed - via show-super output.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2023-10-25 15:51:16 -04:00
parent 94119eeb02
commit f5d26fa31e
13 changed files with 270 additions and 23 deletions

View File

@ -70,6 +70,7 @@ bcachefs-y := \
reflink.o \
replicas.o \
sb-clean.o \
sb-errors.o \
sb-members.o \
siphash.o \
six.o \

View File

@ -209,6 +209,7 @@
#include "nocow_locking_types.h"
#include "opts.h"
#include "recovery_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
#include "util.h"
@ -992,11 +993,6 @@ struct bch_fs {
struct bio_set dio_read_bioset;
struct bio_set nocow_flush_bioset;
/* ERRORS */
struct list_head fsck_errors;
struct mutex fsck_error_lock;
bool fsck_alloc_err;
/* QUOTAS */
struct bch_memquota_type quotas[QTYP_NR];
@ -1045,6 +1041,14 @@ struct bch_fs {
struct bch2_time_stats times[BCH_TIME_STAT_NR];
struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
/* ERRORS */
struct list_head fsck_error_msgs;
struct mutex fsck_error_msgs_lock;
bool fsck_alloc_msgs_err;
bch_sb_errors_cpu fsck_error_counts;
struct mutex fsck_error_counts_lock;
};
extern struct wait_queue_head bch2_read_only_wait;

View File

@ -1218,7 +1218,8 @@ struct bch_sb_field {
x(journal_seq_blacklist, 8) \
x(journal_v2, 9) \
x(counters, 10) \
x(members_v2, 11)
x(members_v2, 11) \
x(errors, 12)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@ -1621,6 +1622,17 @@ struct bch_sb_field_journal_seq_blacklist {
__u64 _data[];
};
/*
 * Superblock section holding per-error counters: a generic field header
 * followed by a variable number of entries.
 */
struct bch_sb_field_errors {
struct bch_sb_field field;
/* One counter; @v packs the error id and the count (see bitmasks below) */
struct bch_sb_field_error_entry {
__le64 v;
/* filled from ktime_get_real_seconds() when the error was last counted */
__le64 last_error_time;
} entries[];
};
/* Accessors for the two sub-fields of @v: id in bits 0..16, count in bits 16..64 */
LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
/* Superblock: */
/*

View File

@ -213,6 +213,7 @@
x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
x(BCH_ERR_invalid_sb, invalid_sb_clean) \
x(BCH_ERR_invalid_sb, invalid_sb_quota) \
x(BCH_ERR_invalid_sb, invalid_sb_errors) \
x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \

View File

@ -117,27 +117,27 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
return NULL;
list_for_each_entry(s, &c->fsck_errors, list)
list_for_each_entry(s, &c->fsck_error_msgs, list)
if (s->fmt == fmt) {
/*
* move it to the head of the list: repeated fsck errors
* are common
*/
list_move(&s->list, &c->fsck_errors);
list_move(&s->list, &c->fsck_error_msgs);
return s;
}
s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s) {
if (!c->fsck_alloc_err)
if (!c->fsck_alloc_msgs_err)
bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
c->fsck_alloc_err = true;
c->fsck_alloc_msgs_err = true;
return NULL;
}
INIT_LIST_HEAD(&s->list);
s->fmt = fmt;
list_add(&s->list, &c->fsck_errors);
list_add(&s->list, &c->fsck_error_msgs);
return s;
}
@ -153,7 +153,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
prt_vprintf(out, fmt, args);
va_end(args);
mutex_lock(&c->fsck_error_lock);
mutex_lock(&c->fsck_error_msgs_lock);
s = fsck_err_get(c, fmt);
if (s) {
/*
@ -163,7 +163,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
*/
if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
ret = s->ret;
mutex_unlock(&c->fsck_error_lock);
mutex_unlock(&c->fsck_error_msgs_lock);
printbuf_exit(&buf);
return ret;
}
@ -258,7 +258,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
if (s)
s->ret = ret;
mutex_unlock(&c->fsck_error_lock);
mutex_unlock(&c->fsck_error_msgs_lock);
printbuf_exit(&buf);
@ -279,9 +279,9 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
{
struct fsck_err_state *s, *n;
mutex_lock(&c->fsck_error_lock);
mutex_lock(&c->fsck_error_msgs_lock);
list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
if (s->ratelimited && s->last_msg)
bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
@ -290,5 +290,5 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
kfree(s);
}
mutex_unlock(&c->fsck_error_lock);
mutex_unlock(&c->fsck_error_msgs_lock);
}

175
fs/bcachefs/sb-errors.c Normal file
View File

@ -0,0 +1,175 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "sb-errors.h"
#include "super-io.h"
/*
 * Names of the fsck errors, indexed by error id.
 *
 * Built from the BCH_SB_ERRS() x-macro list with designated initializers, so
 * any id that is not in the list leaves a NULL slot; a trailing NULL follows
 * the last listed id.
 */
static const char * const bch2_sb_error_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_SB_ERRS()
#undef x	/* don't leak the helper macro into the rest of the file */
	NULL
};
static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
{
if (id < BCH_SB_ERR_MAX)
prt_str(out, bch2_sb_error_strs[id]);
else
prt_printf(out, "(unknown error %u)", id);
}
/*
 * Number of entries stored in errors section @e: the field's size minus the
 * fixed header, divided by the size of one entry.  A NULL @e counts as empty.
 */
static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e)
{
	if (!e)
		return 0;

	return (bch2_sb_field_bytes(&e->field) - sizeof(*e)) / sizeof(e->entries[0]);
}
static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
{
return (sizeof(struct bch_sb_field_errors) +
sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64);
}
/*
 * Validate the errors superblock section @f.
 *
 * Every entry must have a nonzero count, and entry ids must be strictly
 * increasing.  On failure a description is appended to @err and
 * -BCH_ERR_invalid_sb_errors is returned; returns 0 if the section is valid.
 * @sb is not used.
 */
static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
				   struct printbuf *err)
{
	struct bch_sb_field_errors *e = field_to_type(f, errors);
	unsigned i, nr = bch2_sb_field_errors_nr_entries(e);

	for (i = 0; i < nr; i++) {
		struct bch_sb_field_error_entry *cur = &e->entries[i];

		if (!BCH_SB_ERROR_ENTRY_NR(cur)) {
			prt_printf(err, "entry with count 0 (id ");
			bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(cur));
			prt_printf(err, ")");
			return -BCH_ERR_invalid_sb_errors;
		}

		/* the last entry has no successor to compare against */
		if (i + 1 >= nr)
			continue;

		if (BCH_SB_ERROR_ENTRY_ID(cur) >=
		    BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) {
			prt_printf(err, "entries out of order");
			return -BCH_ERR_invalid_sb_errors;
		}
	}

	return 0;
}
static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_errors *e = field_to_type(f, errors);
unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
u64 now = ktime_get_real_seconds();
if (out->nr_tabstops <= 1)
printbuf_tabstop_push(out, 16);
for (i = 0; i < nr; i++) {
bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
prt_tab(out);
prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
prt_tab(out);
bch2_pr_time_units(out, (now - le64_to_cpu(e->entries[i].last_error_time)) *
NSEC_PER_SEC);
prt_str(out, " ago");
prt_newline(out);
}
}
/* Superblock field operations for the errors section */
const struct bch_sb_field_ops bch_sb_field_ops_errors = {
	.to_text	= bch2_sb_errors_to_text,
	.validate	= bch2_sb_errors_validate,
};
/*
 * Record one occurrence of fsck error @err in the in-memory counters.
 *
 * The counter array is kept sorted by id: an existing entry has its count
 * bumped and its timestamp refreshed, otherwise a new entry with count 1 is
 * inserted at the sorted position.  If room for a new entry cannot be made
 * the occurrence is silently dropped.
 */
void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
{
	bch_sb_errors_cpu *e = &c->fsck_error_counts;
	struct bch_sb_error_entry_cpu n = {
		.id = err,
		.nr = 1,
		.last_error_time = ktime_get_real_seconds()
	};
	unsigned i = 0;

	mutex_lock(&c->fsck_error_counts_lock);

	/* skip every entry that sorts before @err */
	while (i < e->nr && e->data[i].id < err)
		i++;

	if (i < e->nr && e->data[i].id == err) {
		e->data[i].nr++;
		e->data[i].last_error_time = n.last_error_time;
	} else if (!darray_make_room(e, 1)) {
		darray_insert_item(e, i, n);
	}

	mutex_unlock(&c->fsck_error_counts_lock);
}
/*
 * Copy the in-memory error counters into the errors section of the
 * superblock held in c->disk_sb, resizing the section to fit.
 *
 * fsck_error_counts_lock is held for the whole copy: bch2_sb_error_count()
 * may concurrently grow the array or modify entries, so both the entry count
 * used for sizing and the entries themselves must be read under the lock.
 *
 * If the section cannot be resized, the superblock is left unchanged.
 */
void bch2_sb_errors_from_cpu(struct bch_fs *c)
{
	bch_sb_errors_cpu *src = &c->fsck_error_counts;
	struct bch_sb_field_errors *dst;
	unsigned i;

	mutex_lock(&c->fsck_error_counts_lock);

	dst = bch2_sb_field_resize(&c->disk_sb, errors,
				   bch2_sb_field_errors_u64s(src->nr));
	if (!dst)
		goto out;

	for (i = 0; i < src->nr; i++) {
		SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
		SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
		dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
	}
out:
	mutex_unlock(&c->fsck_error_counts_lock);
}
static int bch2_sb_errors_to_cpu(struct bch_fs *c)
{
struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
bch_sb_errors_cpu *dst = &c->fsck_error_counts;
unsigned i, nr = bch2_sb_field_errors_nr_entries(src);
int ret;
if (!nr)
return 0;
mutex_lock(&c->fsck_error_counts_lock);
ret = darray_make_room(dst, nr);
if (ret)
goto err;
dst->nr = nr;
for (i = 0; i < nr; i++) {
dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time);
}
err:
mutex_unlock(&c->fsck_error_counts_lock);
return ret;
}
/* Tear down the in-memory error counter array of @c via darray_exit() */
void bch2_fs_sb_errors_exit(struct bch_fs *c)
{
darray_exit(&c->fsck_error_counts);
}
/*
 * Early initialization of the error counter state in @c: an empty counter
 * array and the mutex that protects it.
 */
void bch2_fs_sb_errors_init_early(struct bch_fs *c)
{
	darray_init(&c->fsck_error_counts);
	mutex_init(&c->fsck_error_counts_lock);
}
/*
 * Populate the in-memory error counters of @c from its superblock.
 * Returns 0 on success or the error from bch2_sb_errors_to_cpu().
 */
int bch2_fs_sb_errors_init(struct bch_fs *c)
{
	int ret = bch2_sb_errors_to_cpu(c);

	return ret;
}

26
fs/bcachefs/sb-errors.h Normal file
View File

@ -0,0 +1,26 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SB_ERRORS_H
#define _BCACHEFS_SB_ERRORS_H
#include "sb-errors_types.h"
/*
 * X-macro list of fsck errors, expanded as x(name, id) below.
 * Currently empty: no error ids are defined yet.
 */
#define BCH_SB_ERRS()
/*
 * Error ids, one BCH_FSCK_ERR_<name> constant per list entry;
 * BCH_SB_ERR_MAX follows the last listed id.
 */
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
BCH_SB_ERRS()
#undef x
BCH_SB_ERR_MAX
};
/* validate/to_text operations for the errors superblock section */
extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
/* count one occurrence of the given error in the in-memory counters */
void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
/* copy the in-memory counters into the superblock's errors section */
void bch2_sb_errors_from_cpu(struct bch_fs *);
/* tear down the in-memory counter array */
void bch2_fs_sb_errors_exit(struct bch_fs *);
/* initialize the counter array and its mutex */
void bch2_fs_sb_errors_init_early(struct bch_fs *);
/* load the counters from the superblock; returns 0 or an error code */
int bch2_fs_sb_errors_init(struct bch_fs *);
#endif /* _BCACHEFS_SB_ERRORS_H */

View File

@ -0,0 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
#define _BCACHEFS_SB_ERRORS_TYPES_H
#include "darray.h"
/*
 * In-memory form of one error counter: error id (16 bits), number of
 * occurrences (48 bits) and the time the error was last counted.
 */
struct bch_sb_error_entry_cpu {
u64 id:16,
nr:48;
/* set from ktime_get_real_seconds() when counting in memory */
u64 last_error_time;
};
/* Growable array of in-memory error counters */
typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */

View File

@ -84,7 +84,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c)
return 0;
}
int bch2_members_v2_init(struct bch_fs *c)
int bch2_sb_members_v2_init(struct bch_fs *c)
{
struct bch_sb_field_members_v1 *mi1;
struct bch_sb_field_members_v2 *mi2;

View File

@ -4,7 +4,7 @@
extern char * const bch2_member_error_strs[];
int bch2_members_v2_init(struct bch_fs *c);
int bch2_sb_members_v2_init(struct bch_fs *c);
int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);

View File

@ -13,6 +13,7 @@
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "super-io.h"
#include "super.h"
@ -897,7 +898,9 @@ int bch2_write_super(struct bch_fs *c)
SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
bch2_sb_counters_from_cpu(c);
bch2_sb_members_from_cpu(c);
bch2_sb_members_cpy_v2_v1(&c->disk_sb);
bch2_sb_errors_from_cpu(c);
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);

View File

@ -23,6 +23,11 @@ u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
unsigned,
unsigned);
/* Size in bytes of superblock field @f, whose length is stored in u64 units */
static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
{
	size_t nr_u64s = le32_to_cpu(f->u64s);

	return nr_u64s * sizeof(u64);
}
/*
 * Convert a generic struct bch_sb_field pointer @_f into a pointer to the
 * struct bch_sb_field_<_name> that embeds it as its 'field' member.
 * NOTE(review): presumably yields NULL for a NULL @_f (container_of_or_null)
 * - confirm against that helper's definition.
 */
#define field_to_type(_f, _name) \
container_of_or_null(_f, struct bch_sb_field_##_name, field)

View File

@ -49,6 +49,7 @@
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
#include "subvolume.h"
@ -400,7 +401,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch_info(c, "going read-write");
ret = bch2_members_v2_init(c);
ret = bch2_sb_members_v2_init(c);
if (ret)
goto err;
@ -481,6 +482,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_time_stats_exit(&c->times[i]);
bch2_free_pending_node_rewrites(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
bch2_fs_quota_exit(c);
@ -713,6 +715,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_quota_init(c);
bch2_fs_ec_init_early(c);
bch2_fs_move_init(c);
bch2_fs_sb_errors_init_early(c);
INIT_LIST_HEAD(&c->list);
@ -729,8 +732,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
INIT_LIST_HEAD(&c->fsck_error_msgs);
mutex_init(&c->fsck_error_msgs_lock);
seqcount_init(&c->gc_pos_lock);
@ -840,6 +843,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
}
ret = bch2_fs_counters_init(c) ?:
bch2_fs_sb_errors_init(c) ?:
bch2_io_clock_init(&c->io_clock[READ]) ?:
bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?:
@ -942,7 +946,7 @@ int bch2_fs_start(struct bch_fs *c)
mutex_lock(&c->sb_lock);
ret = bch2_members_v2_init(c);
ret = bch2_sb_members_v2_init(c);
if (ret) {
mutex_unlock(&c->sb_lock);
goto err;