1
linux/fs/afs/fs_probe.c
David Howells 495f2ae9e3 afs: Fix fileserver rotation
Fix the fileserver rotation so that it doesn't use RTT as the basis for
deciding which server and address to use as this doesn't necessarily give a
good indication of the best path.  Instead, use the configurable preference
list in conjunction with whatever probes have succeeded at the time of
looking.

To this end, make the following changes:

 (1) Keep an array of "server states" to track what addresses we've tried
     on each server and move the waitqueue entries there that we'll need
     for probing.

 (2) Each afs_server_state struct is made to pin the corresponding server's
     endpoint state rather than the afs_operation struct carrying a pin on
     the server we're currently looking at.

 (3) Drop the server list preference; we now always rescan the server list.

 (4) afs_wait_for_probes() now uses the server state list to guide it in
     what it waits for (and to provide the waitqueue entries) and returns
     an indication of whether we'd got a response, run out of responsive
     addresses or the endpoint state had been superseded and we need to
     restart the iteration.

 (5) Call afs_get_address_preferences*() occasionally to refresh the
     preference values.

 (6) When picking a server, scan the addresses of the servers for which we
     have as-yet untested communications, looking for the highest priority
     one and use that instead of trying all the addresses for a particular
     server in ascending-RTT order.

 (7) When a Busy or Offline state is seen across all available servers, do
     a short sleep.

 (8) If we detect that we accessed a future RO volume version whilst it is
     undergoing replication, reissue the op against the older version until
     at least half of the servers are replicated.

 (9) Whilst RO replication is ongoing, increase the frequency of Volume
     Location server checks for that volume to every ten minutes instead of
     hourly.

Also add a tracepoint to track progress through the rotation algorithm.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
2024-01-01 16:37:27 +00:00

528 lines
14 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/* AFS fileserver probing
*
* Copyright (C) 2018, 2020 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include "afs_fs.h"
#include "internal.h"
#include "protocol_afs.h"
#include "protocol_yfs.h"
static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ;
static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ;
struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate,
enum afs_estate_trace where)
{
if (estate) {
int r;
__refcount_inc(&estate->ref, &r);
trace_afs_estate(estate->server_id, estate->probe_seq, r, where);
}
return estate;
}
static void afs_endpoint_state_rcu(struct rcu_head *rcu)
{
struct afs_endpoint_state *estate = container_of(rcu, struct afs_endpoint_state, rcu);
trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
afs_estate_trace_free);
afs_put_addrlist(estate->addresses, afs_alist_trace_put_estate);
kfree(estate);
}
void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where)
{
if (estate) {
unsigned int server_id = estate->server_id, probe_seq = estate->probe_seq;
bool dead;
int r;
dead = __refcount_dec_and_test(&estate->ref, &r);
trace_afs_estate(server_id, probe_seq, r, where);
if (dead)
call_rcu(&estate->rcu, afs_endpoint_state_rcu);
}
}
/*
* Start the probe polling timer. We have to supply it with an inc on the
* outstanding server count.
*/
static void afs_schedule_fs_probe(struct afs_net *net,
struct afs_server *server, bool fast)
{
unsigned long atj;
if (!net->live)
return;
atj = server->probed_at;
atj += fast ? afs_fs_probe_fast_poll_interval : afs_fs_probe_slow_poll_interval;
afs_inc_servers_outstanding(net);
if (timer_reduce(&net->fs_probe_timer, atj))
afs_dec_servers_outstanding(net);
}
/*
* Handle the completion of a set of probes.
*/
static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server,
struct afs_endpoint_state *estate)
{
bool responded = test_bit(AFS_ESTATE_RESPONDED, &estate->flags);
write_seqlock(&net->fs_lock);
if (responded) {
list_add_tail(&server->probe_link, &net->fs_probe_slow);
} else {
server->rtt = UINT_MAX;
clear_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
list_add_tail(&server->probe_link, &net->fs_probe_fast);
}
write_sequnlock(&net->fs_lock);
afs_schedule_fs_probe(net, server, !responded);
}
/*
* Handle the completion of a probe.
*/
static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server,
struct afs_endpoint_state *estate)
{
_enter("");
if (atomic_dec_and_test(&estate->nr_probing))
afs_finished_fs_probe(net, server, estate);
wake_up_all(&server->probe_wq);
}
/*
* Handle inability to send a probe due to ENOMEM when trying to allocate a
* call struct.
*/
static void afs_fs_probe_not_done(struct afs_net *net,
struct afs_server *server,
struct afs_endpoint_state *estate,
int index)
{
_enter("");
trace_afs_io_error(0, -ENOMEM, afs_io_error_fs_probe_fail);
spin_lock(&server->probe_lock);
set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags);
if (estate->error == 0)
estate->error = -ENOMEM;
set_bit(index, &estate->failed_set);
spin_unlock(&server->probe_lock);
return afs_done_one_fs_probe(net, server, estate);
}
/*
* Process the result of probing a fileserver. This is called after successful
* or failed delivery of an FS.GetCapabilities operation.
*/
void afs_fileserver_probe_result(struct afs_call *call)
{
struct afs_endpoint_state *estate = call->probe;
struct afs_addr_list *alist = estate->addresses;
struct afs_address *addr = &alist->addrs[call->probe_index];
struct afs_server *server = call->server;
unsigned int index = call->probe_index;
unsigned int rtt_us = -1, cap0;
int ret = call->error;
_enter("%pU,%u", &server->uuid, index);
WRITE_ONCE(addr->last_error, ret);
spin_lock(&server->probe_lock);
switch (ret) {
case 0:
estate->error = 0;
goto responded;
case -ECONNABORTED:
if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) {
estate->abort_code = call->abort_code;
estate->error = ret;
}
goto responded;
case -ENOMEM:
case -ENONET:
clear_bit(index, &estate->responsive_set);
set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags);
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
case -ERFKILL:
case -EADDRNOTAVAIL:
case -ENETUNREACH:
case -EHOSTUNREACH:
case -EHOSTDOWN:
case -ECONNREFUSED:
case -ETIMEDOUT:
case -ETIME:
default:
clear_bit(index, &estate->responsive_set);
set_bit(index, &estate->failed_set);
if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags) &&
(estate->error == 0 ||
estate->error == -ETIMEDOUT ||
estate->error == -ETIME))
estate->error = ret;
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
goto out;
}
responded:
clear_bit(index, &estate->failed_set);
if (call->service_id == YFS_FS_SERVICE) {
set_bit(AFS_ESTATE_IS_YFS, &estate->flags);
set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
server->service_id = call->service_id;
} else {
set_bit(AFS_ESTATE_NOT_YFS, &estate->flags);
if (!test_bit(AFS_ESTATE_IS_YFS, &estate->flags)) {
clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
server->service_id = call->service_id;
}
cap0 = ntohl(call->tmp);
if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
set_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
else
clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
}
rtt_us = rxrpc_kernel_get_srtt(addr->peer);
if (rtt_us < estate->rtt) {
estate->rtt = rtt_us;
server->rtt = rtt_us;
alist->preferred = index;
}
smp_wmb(); /* Set rtt before responded. */
set_bit(AFS_ESTATE_RESPONDED, &estate->flags);
set_bit(index, &estate->responsive_set);
set_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
out:
spin_unlock(&server->probe_lock);
trace_afs_fs_probe(server, false, estate, index, call->error, call->abort_code, rtt_us);
_debug("probe[%x] %pU [%u] %pISpc rtt=%d ret=%d",
estate->probe_seq, &server->uuid, index,
rxrpc_kernel_remote_addr(alist->addrs[index].peer),
rtt_us, ret);
return afs_done_one_fs_probe(call->net, server, estate);
}
/*
* Probe all of a fileserver's addresses to find out the best route and to
* query its capabilities.
*/
void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
struct afs_addr_list *new_alist, struct key *key)
{
struct afs_endpoint_state *estate, *old;
struct afs_addr_list *alist;
unsigned long unprobed;
_enter("%pU", &server->uuid);
estate = kzalloc(sizeof(*estate), GFP_KERNEL);
if (!estate)
return;
refcount_set(&estate->ref, 1);
estate->server_id = server->debug_id;
estate->rtt = UINT_MAX;
write_lock(&server->fs_lock);
old = rcu_dereference_protected(server->endpoint_state,
lockdep_is_held(&server->fs_lock));
estate->responsive_set = old->responsive_set;
estate->addresses = afs_get_addrlist(new_alist ?: old->addresses,
afs_alist_trace_get_estate);
alist = estate->addresses;
estate->probe_seq = ++server->probe_counter;
atomic_set(&estate->nr_probing, alist->nr_addrs);
rcu_assign_pointer(server->endpoint_state, estate);
set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
write_unlock(&server->fs_lock);
trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
afs_estate_trace_alloc_probe);
afs_get_address_preferences(net, alist);
server->probed_at = jiffies;
unprobed = (1UL << alist->nr_addrs) - 1;
while (unprobed) {
unsigned int index = 0, i;
int best_prio = -1;
for (i = 0; i < alist->nr_addrs; i++) {
if (test_bit(i, &unprobed) &&
alist->addrs[i].prio > best_prio) {
index = i;
best_prio = alist->addrs[i].prio;
}
}
__clear_bit(index, &unprobed);
trace_afs_fs_probe(server, true, estate, index, 0, 0, 0);
if (!afs_fs_get_capabilities(net, server, estate, index, key))
afs_fs_probe_not_done(net, server, estate, index);
}
afs_put_endpoint_state(old, afs_estate_trace_put_probe);
}
/*
* Wait for the first as-yet untried fileserver to respond, for the probe state
* to be superseded or for all probes to finish.
*/
int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr)
{
struct afs_endpoint_state *estate;
struct afs_server_list *slist = op->server_list;
bool still_probing = true;
int ret = 0, i;
_enter("%u", slist->nr_servers);
for (i = 0; i < slist->nr_servers; i++) {
estate = states[i].endpoint_state;
if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
return 2;
if (atomic_read(&estate->nr_probing))
still_probing = true;
if (estate->responsive_set & states[i].untried_addrs)
return 1;
}
if (!still_probing)
return 0;
for (i = 0; i < slist->nr_servers; i++)
add_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter);
for (;;) {
still_probing = false;
set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
for (i = 0; i < slist->nr_servers; i++) {
estate = states[i].endpoint_state;
if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) {
ret = 2;
goto stop;
}
if (atomic_read(&estate->nr_probing))
still_probing = true;
if (estate->responsive_set & states[i].untried_addrs) {
ret = 1;
goto stop;
}
}
if (!still_probing || signal_pending(current))
goto stop;
schedule();
}
stop:
set_current_state(TASK_RUNNING);
for (i = 0; i < slist->nr_servers; i++)
remove_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter);
if (!ret && signal_pending(current))
ret = -ERESTARTSYS;
return ret;
}
/*
* Probe timer. We have an increment on fs_outstanding that we need to pass
* along to the work item.
*/
void afs_fs_probe_timer(struct timer_list *timer)
{
struct afs_net *net = container_of(timer, struct afs_net, fs_probe_timer);
if (!net->live || !queue_work(afs_wq, &net->fs_prober))
afs_dec_servers_outstanding(net);
}
/*
* Dispatch a probe to a server.
*/
static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server)
__releases(&net->fs_lock)
{
struct key *key = NULL;
/* We remove it from the queues here - it will be added back to
* one of the queues on the completion of the probe.
*/
list_del_init(&server->probe_link);
afs_get_server(server, afs_server_trace_get_probe);
write_sequnlock(&net->fs_lock);
afs_fs_probe_fileserver(net, server, NULL, key);
afs_put_server(net, server, afs_server_trace_put_probe);
}
/*
* Probe a server immediately without waiting for its due time to come
* round. This is used when all of the addresses have been tried.
*/
void afs_probe_fileserver(struct afs_net *net, struct afs_server *server)
{
write_seqlock(&net->fs_lock);
if (!list_empty(&server->probe_link))
return afs_dispatch_fs_probe(net, server);
write_sequnlock(&net->fs_lock);
}
/*
* Probe dispatcher to regularly dispatch probes to keep NAT alive.
*/
void afs_fs_probe_dispatcher(struct work_struct *work)
{
struct afs_net *net = container_of(work, struct afs_net, fs_prober);
struct afs_server *fast, *slow, *server;
unsigned long nowj, timer_at, poll_at;
bool first_pass = true, set_timer = false;
if (!net->live) {
afs_dec_servers_outstanding(net);
return;
}
_enter("");
if (list_empty(&net->fs_probe_fast) && list_empty(&net->fs_probe_slow)) {
afs_dec_servers_outstanding(net);
_leave(" [none]");
return;
}
again:
write_seqlock(&net->fs_lock);
fast = slow = server = NULL;
nowj = jiffies;
timer_at = nowj + MAX_JIFFY_OFFSET;
if (!list_empty(&net->fs_probe_fast)) {
fast = list_first_entry(&net->fs_probe_fast, struct afs_server, probe_link);
poll_at = fast->probed_at + afs_fs_probe_fast_poll_interval;
if (time_before(nowj, poll_at)) {
timer_at = poll_at;
set_timer = true;
fast = NULL;
}
}
if (!list_empty(&net->fs_probe_slow)) {
slow = list_first_entry(&net->fs_probe_slow, struct afs_server, probe_link);
poll_at = slow->probed_at + afs_fs_probe_slow_poll_interval;
if (time_before(nowj, poll_at)) {
if (time_before(poll_at, timer_at))
timer_at = poll_at;
set_timer = true;
slow = NULL;
}
}
server = fast ?: slow;
if (server)
_debug("probe %pU", &server->uuid);
if (server && (first_pass || !need_resched())) {
afs_dispatch_fs_probe(net, server);
first_pass = false;
goto again;
}
write_sequnlock(&net->fs_lock);
if (server) {
if (!queue_work(afs_wq, &net->fs_prober))
afs_dec_servers_outstanding(net);
_leave(" [requeue]");
} else if (set_timer) {
if (timer_reduce(&net->fs_probe_timer, timer_at))
afs_dec_servers_outstanding(net);
_leave(" [timer]");
} else {
afs_dec_servers_outstanding(net);
_leave(" [quiesce]");
}
}
/*
* Wait for a probe on a particular fileserver to complete for 2s.
*/
int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate,
unsigned long exclude, bool is_intr)
{
struct wait_queue_entry wait;
unsigned long timo = 2 * HZ;
if (atomic_read(&estate->nr_probing) == 0)
goto dont_wait;
init_wait_entry(&wait, 0);
for (;;) {
prepare_to_wait_event(&server->probe_wq, &wait,
is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
if (timo == 0 ||
test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags) ||
(estate->responsive_set & ~exclude) ||
atomic_read(&estate->nr_probing) == 0 ||
(is_intr && signal_pending(current)))
break;
timo = schedule_timeout(timo);
}
finish_wait(&server->probe_wq, &wait);
dont_wait:
if (estate->responsive_set & ~exclude)
return 1;
if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
return 0;
if (is_intr && signal_pending(current))
return -ERESTARTSYS;
if (timo == 0)
return -ETIME;
return -EDESTADDRREQ;
}
/*
* Clean up the probing when the namespace is killed off.
*/
void afs_fs_probe_cleanup(struct afs_net *net)
{
if (del_timer_sync(&net->fs_probe_timer))
afs_dec_servers_outstanding(net);
}