1
linux/net/dccp/ccids/ccid3.h
Gerrit Renker de6f2b59e5 dccp ccid-3: Bug fix for the inter-packet scheduling algorithm
This fixes a subtle bug in the calculation of the inter-packet gap and shows
that t_delta, as it is currently used, is not needed; it is hence replaced by
a constant.

The algorithm from RFC 3448, 4.6 below continually computes a send time t_nom,
which is initialised with the current time t_now; t_gran = 1E6 / HZ specifies
the scheduling granularity, s the packet size, and X the sending rate:

  t_distance = t_nom - t_now;		// in microseconds
  t_delta    = min(t_ipi, t_gran) / 2;	// `delta' parameter in microseconds

  if (t_distance >= t_delta) {
	reschedule after (t_distance / 1000) milliseconds;
  } else {
  	t_ipi  = s / X;			// inter-packet interval in usec
	t_nom += t_ipi;			// compute the next send time
	send packet now;
  }


1) Description of the bug
-------------------------
Rescheduling requires a conversion into milliseconds, due to this call chain:

 * ccid3_hc_tx_send_packet() returns a timeout in milliseconds,
 * this value is converted by msecs_to_jiffies() in dccp_write_xmit(),
 * and finally used as jiffy-expires-value for sk_reset_timer().

The highest jiffy resolution with HZ=1000 is 1 millisecond, so using a higher
granularity does not make much sense here.

As a consequence, values of t_distance < 1000 are truncated to 0. This issue 
has so far been resolved by using instead

  if (t_distance >= t_delta + 1000)
	reschedule after (t_distance / 1000) milliseconds;

The bug is in artificially inflating t_delta to t_delta' = t_delta + 1000. This
is unnecessarily large, a more adequate value is t_delta' = max(t_delta, 1000).


2) Consequences of using the corrected t_delta'
-----------------------------------------------
Since t_delta <= t_gran/2 = 10^6/(2*HZ), we have t_delta <= 1000 as long as
HZ >= 500. This means that t_delta' = max(1000, t_delta) is constant at 1000.

On the other hand, when using a coarse HZ value of HZ < 500, we have three
sub-cases that can all be reduced to using another constant of t_gran/2.

 (a) The first case arises when t_ipi > t_gran. Here t_delta' is the constant
     t_delta' = max(1000, t_gran/2) = t_gran/2.

 (b) If t_ipi <= 2000 < t_gran = 10^6/HZ usec, then t_delta = t_ipi/2 <= 1000,
     so that t_delta' = max(1000, t_delta) = 1000 < t_gran/2. 

 (c) If 2000 < t_ipi <= t_gran, we have t_delta' = max(t_delta, 1000) = t_ipi/2.

In the second and third cases we have delay values less than t_gran/2, that
is, less than half a jiffy.

How these are treated depends on how fractions of a jiffy are handled: they
are either always rounded down to 0, or always rounded up to 1 jiffy (assuming
non-zero values). In both cases the error is on average in the order of 50%.

Thus we are not increasing the error when in the second/third case we replace
a value less than t_gran/2 with 0, by setting t_delta' to the constant t_gran/2.


3) Summary
----------
Fixing (1) and considering (2), the patch replaces t_delta with a constant,
whose value depends on CONFIG_HZ, changing the above algorithm to:
 
  if (t_distance >= t_delta')
	reschedule after (t_distance / 1000) milliseconds;

where t_delta' = 10^6/(2*HZ) if HZ < 500, and t_delta' = 1000 otherwise.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
2008-09-04 07:45:33 +02:00

180 lines
5.9 KiB
C

/*
* net/dccp/ccids/ccid3.h
*
* Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
*
* An implementation of the DCCP protocol
*
* This code has been developed by the University of Waikato WAND
* research group. For further information please see http://www.wand.net.nz/
* or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
*
* This code also uses code from Lulea University, rereleased as GPL by its
* authors:
* Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
*
* Changes to meet Linux coding standards, to make it meet latest ccid3 draft
* and to make it work as a loadable module in the DCCP stack written by
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
*
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _DCCP_CCID3_H_
#define _DCCP_CCID3_H_
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/tfrc.h>
#include "lib/tfrc.h"
#include "../ccid.h"
/* Initial timeout of two seconds, in microseconds (RFC 3448, 4.2) */
#define TFRC_INITIAL_TIMEOUT	(2 * USEC_PER_SEC)

/* Backoff interval t_mbi, in seconds [RFC 3448, 4.3] */
#define TFRC_T_MBI		64

/*
 * t_delta (RFC 3448, 4.6) as a compile-time constant in microseconds.
 *
 * sk_reset_timer() is used with millisecond granularity here, so any delay
 * shorter than %USEC_PER_MSEC ends up rounded down to 0:
 *  - HZ >= 500: the constant t_delta = %USEC_PER_MSEC is sufficient;
 *  - HZ <  500: t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ) keeps the
 *               error below one timer tick (t_gran).
 */
#if (HZ < 500)
# define TFRC_T_DELTA		(USEC_PER_SEC / (2 * HZ))
#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC.
#else
# define TFRC_T_DELTA		USEC_PER_MSEC
#endif
/*
 * Option type numbers used by CCID3 (TFRC); the three values are
 * consecutive, starting at 192.
 */
enum ccid3_options {
	TFRC_OPT_LOSS_EVENT_RATE = 192,	/* loss event rate option */
	TFRC_OPT_LOSS_INTERVALS  = 193,	/* loss intervals option  */
	TFRC_OPT_RECEIVE_RATE    = 194,	/* receive rate option    */
};
/** struct ccid3_options_received - Parsed set of received CCID3 options
*
* Embedded in struct ccid3_hc_tx_sock as its options_received member.
*
* @ccid3or_seqno - 48-bit sequence number (NOTE(review): presumably that of
* the packet which carried the options - confirm in the parser)
* @ccid3or_loss_intervals_idx - 16-bit index associated with the Loss
* Intervals option (NOTE(review): looks like an offset into the
* option data - confirm in the parser)
* @ccid3or_loss_intervals_len - Length associated with the Loss Intervals
* option (units not visible here)
* @ccid3or_loss_event_rate - Value associated with %TFRC_OPT_LOSS_EVENT_RATE
* @ccid3or_receive_rate - Value associated with %TFRC_OPT_RECEIVE_RATE
*/
struct ccid3_options_received {
/* both bitfields share one u64: 48 + 16 = 64 bits */
u64 ccid3or_seqno:48,
ccid3or_loss_intervals_idx:16;
u16 ccid3or_loss_intervals_len;
u32 ccid3or_loss_event_rate;
u32 ccid3or_receive_rate;
};
/*
 * TFRC sender states, with every value spelled out. Numbering starts at 1,
 * so a zero-filled state field matches none of the enumerators.
 */
enum ccid3_hc_tx_states {
	TFRC_SSTATE_NO_SENT  = 1,
	TFRC_SSTATE_NO_FBACK = 2,
	TFRC_SSTATE_FBACK    = 3,
	TFRC_SSTATE_TERM     = 4,
};
/** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
*
* Per-socket sender state; ccid3_hc_tx_sk() returns a pointer to it.
* Note that @x and @x_recv are scaled by 64, whereas @x_calc is not.
*
* @x - Current sending rate in 64 * bytes per second
* @x_recv - Receive rate in 64 * bytes per second
* @x_calc - Calculated rate in bytes per second
* @rtt - Estimate of current round trip time in usecs
* @p - Current loss event rate (0-1) scaled by 1000000
* @t_rto - Nofeedback Timer setting in usecs
* @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs
* @s - Packet size in bytes
* @state - Sender state, one of %ccid3_hc_tx_states (8-bit bitfield)
* @last_win_count - Last window counter sent
* @t_last_win_count - Timestamp of earliest packet with
* last_win_count value sent
* @no_feedback_timer - Handle to no feedback timer
* @t_ld - Time last doubled during slow start
* @t_nom - Nominal send time of next packet
* @hist - Packet history
* @options_received - Parsed set of retrieved options
*/
struct ccid3_hc_tx_sock {
u64 x;
u64 x_recv;
u32 x_calc;
u32 rtt;
u32 p;
u32 t_rto;
u32 t_ipi;
u16 s;
enum ccid3_hc_tx_states state:8;
u8 last_win_count;
ktime_t t_last_win_count;
struct timer_list no_feedback_timer;
ktime_t t_ld;
ktime_t t_nom;
struct tfrc_tx_hist_entry *hist;
struct ccid3_options_received options_received;
};
/**
 * ccid3_hc_tx_sk - Look up the CCID3 sender state of a socket
 *
 * @sk - socket whose TX half-connection CCID is queried
 *
 * Returns what ccid_priv() yields for the socket's dccps_hc_tx_ccid.
 * A NULL result is caught by BUG_ON() before returning.
 */
static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
{
	struct ccid3_hc_tx_sock *tx_priv = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);

	BUG_ON(!tx_priv);
	return tx_priv;
}
/*
 * TFRC receiver states, with every value spelled out. The largest value
 * (127) still fits the 8-bit state bitfield of struct ccid3_hc_rx_sock.
 */
enum ccid3_hc_rx_states {
	TFRC_RSTATE_NO_DATA = 1,
	TFRC_RSTATE_DATA    = 2,
	TFRC_RSTATE_TERM    = 127,
};
/** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
*
* Per-socket receiver state; ccid3_hc_rx_sk() returns a pointer to it.
*
* @last_counter - Tracks window counter (RFC 4342, 8.1), 4-bit bitfield
* @state - Receiver state, one of %ccid3_hc_rx_states (8-bit bitfield)
* @bytes_recv - Total sum of DCCP payload bytes
* @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3)
* @rtt - Receiver estimate of RTT (non-standard)
* @tstamp_last_feedback - Time at which last feedback was sent
* @hist - Packet history (loss detection + RTT sampling)
* @li_hist - Loss Interval database
* @s - Received packet size in bytes
* @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5); not a
* member of its own, but a macro alias for li_hist.i_mean
*/
struct ccid3_hc_rx_sock {
u8 last_counter:4;
enum ccid3_hc_rx_states state:8;
u32 bytes_recv;
u32 x_recv;
u32 rtt;
ktime_t tstamp_last_feedback;
struct tfrc_rx_hist hist;
struct tfrc_loss_hist li_hist;
u16 s;
/*
* Preprocessor alias: hcrx->p_inverse expands to hcrx->li_hist.i_mean.
* Being a macro, it applies to every later use of the identifier
* p_inverse, not only to members of this struct.
*/
#define p_inverse li_hist.i_mean
};
/**
 * ccid3_hc_rx_sk - Look up the CCID3 receiver state of a socket
 *
 * @sk - socket whose RX half-connection CCID is queried
 *
 * Returns what ccid_priv() yields for the socket's dccps_hc_rx_ccid.
 * A NULL result is caught by BUG_ON() before returning.
 */
static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
{
	struct ccid3_hc_rx_sock *rx_priv = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);

	BUG_ON(!rx_priv);
	return rx_priv;
}
#endif /* _DCCP_CCID3_H_ */