patch-2.1.99 linux/net/sched/sch_tbf.c
- Lines: 450
- Date: Tue Apr 28 11:10:11 1998
- Orig file: v2.1.98/linux/net/sched/sch_tbf.c
- Orig date: Thu Feb 12 20:56:15 1998
diff -u --recursive --new-file v2.1.98/linux/net/sched/sch_tbf.c linux/net/sched/sch_tbf.c
@@ -1,5 +1,5 @@
/*
- * net/sched/sch_tbf.c Token Bucket Filter.
+ * net/sched/sch_tbf.c Token Bucket Filter queue.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -10,6 +10,8 @@
*
*/
+#include <linux/config.h>
+#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
@@ -39,69 +41,91 @@
=======================================
SOURCE.
+ -------
None.
- ALGORITHM.
+ Description.
+ ------------
+
+ Data flow obeys TBF with rate R and depth B, if for any
+ time interval t_i...t_f number of transmitted bits
+ does not exceed B + R*(t_f-t_i).
+
+ Packetized version of this definition:
+ sequence of packets of sizes s_i served at moments t_i
+ obeys TBF, if for any i<=k:
+
+ s_i+....+s_k <= B + R*(t_k - t_i)
+
+ Algorithm.
+ ----------
+
+ Let N(t_i) be B/R initially and N(t) grows continuously with time as:
+
+ N(t+delta) = min{B/R, N(t) + delta}
+
+ If the first packet in queue has length S, it may be
+ transmitted only at the time t_* when S/R <= N(t_*),
+ and in this case N(t) jumps:
+
+ N(t_* + 0) = N(t_* - 0) - S/R.
+
+
- Sequence of packets satisfy token bucket filter with
- rate $r$ and depth $b$, if all the numbers defined by:
- \begin{eqnarray*}
- n_0 &=& b, \\
- n_i &=& {\rm max} ( b, n_{i-1} + r*(t_i-t_{i-1}) - L_i ),
- \end{eqnarray*}
- where $t_i$ --- departure time of $i$-th packet and
- $L_i$ -- its length, never less than zero.
-
- It is convenient to rescale $n_i$ by factor $r$, so
- that the sequence has "canonical" form:
- \[
- n_0 = b/r,
- n_i = max { b/r, n_{i-1} + t_i - t_{i-1} - L_i/r },
- \]
+ Actually, QoS requires two TBF to be applied to data stream.
+ One of them controls steady state burst size, another
+ with rate P (peak rate) and depth M (equal to link MTU)
+ limits bursts at smaller time scale.
+
+ Apparently, P>R, and B>M. If P is infinity, this double
+ TBF is equivalent to single one.
+
+ When TBF works in reshaping mode, latency is estimated as:
+
+ lat = max ((L-B)/R, (L-M)/P)
- If a packet has n_i < 0, we throttle filter
- by $-n_i$ usecs.
NOTES.
+ ------
If TBF throttles, it starts watchdog timer, which will wake up it
- after 0...10 msec.
+ when it will be ready to transmit.
+ Note, that minimal timer resolution is 1/HZ.
If no new packets will arrive during this period,
or device will not be awaken by EOI for previous packet,
- tbf could stop its activity for 10 msec.
+ tbf could stop its activity for 1/HZ.
+
+
+ It means, that with depth B, the maximal rate is
+
+ R_crit = B*HZ
- It means that tbf will sometimes introduce pathological
- 10msec delays to flow corresponding to rate*10msec bytes.
- For 10Mbit/sec flow it is about 12Kb, on 100Mbit/sec -- ~100Kb.
- This number puts lower reasonbale bound on token bucket depth,
- but even if depth is larger traffic is erratic at large rates.
-
- This problem is not specific for THIS implementation. Really,
- there exists statement that any attempt to shape traffic
- in transit will increase delays and jitter much more than
- we expected naively.
+ F.e. for 10Mbit ethernet and HZ=100 minimal allowed B is ~10Kbytes.
- Particularily, it means that delay/jitter sensitive traffic
- MUST NOT be shaped. Cf. CBQ (wrong) and CSZ (correct) approaches.
+ Note, that peak rate TBF is much more tough: with MTU 1500
+ P_crit = 150Kbytes/sec. So that, if you need greater peak
+ rates, use alpha with HZ=1000 :-)
*/
struct tbf_sched_data
{
/* Parameters */
- int cell_log; /* 1<<cell_log is quantum of packet size */
- unsigned long L_tab[256]; /* Lookup table for L/B values */
- unsigned long depth; /* Token bucket depth/B: MUST BE >= MTU/B */
- unsigned long max_bytes; /* Maximal length of backlog: bytes */
+ u32 limit; /* Maximal length of backlog: bytes */
+ u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
+ u32 mtu;
+ struct qdisc_rate_table *R_tab;
+ struct qdisc_rate_table *P_tab;
/* Variables */
- unsigned long bytes; /* Current length of backlog */
- unsigned long tokens; /* Current number of tokens */
+ long tokens; /* Current number of B tokens */
+ long ptokens; /* Current number of P tokens */
psched_time_t t_c; /* Time check-point */
struct timer_list wd_timer; /* Watchdog timer */
};
-#define L2T(q,L) ((q)->L_tab[(L)>>(q)->cell_log])
+#define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
+#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
static int
tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
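
The header comment above measures the bucket in units of time (B/R, S/R), and the L2T() macros do the byte-to-time conversion through the rate table. A minimal userspace sketch of that single-bucket algorithm, not part of the patch and with made-up constants and helper names:

#include <stdio.h>

#define BUCKET_US 10000.0		/* B/R: bucket depth in usec of line time */

/* Advance the bucket by 'delta' usec of wall-clock time, then try to
 * send a packet that costs 'need' usec of line time (S/R).  Returns 1
 * if the packet conforms, 0 if it has to be throttled. */
static int tbf_try_send(double *tokens, double delta, double need)
{
	*tokens += delta;
	if (*tokens > BUCKET_US)
		*tokens = BUCKET_US;	/* N(t+delta) = min{B/R, N(t)+delta} */
	if (need <= *tokens) {
		*tokens -= need;	/* N(t_*+0) = N(t_*-0) - S/R */
		return 1;
	}
	return 0;
}

int main(void)
{
	double tokens = BUCKET_US;	/* N(0) = B/R */
	int i;

	/* five packets, each costing 4000 usec, arriving 1000 usec apart */
	for (i = 0; i < 5; i++) {
		int sent = tbf_try_send(&tokens, 1000.0, 4000.0);
		printf("pkt %d: %s, %5.0f usec of tokens left\n",
		       i, sent ? "sent" : "throttled", tokens);
	}
	return 0;
}

The first three packets drain the 10000 usec bucket; the rest are throttled until enough time has elapsed, matching the "N(t) jumps" description in the comment.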
@@ -109,30 +133,56 @@
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
__skb_queue_tail(&sch->q, skb);
- if ((q->bytes += skb->len) <= q->max_bytes)
+ if ((sch->stats.backlog += skb->len) <= q->limit) {
+ sch->stats.bytes += skb->len;
+ sch->stats.packets++;
return 1;
+ }
/* Drop action: undo the things that we just made,
* i.e. make tail drop
*/
__skb_unlink(skb, &sch->q);
- q->bytes -= skb->len;
- kfree_skb(skb);
+ sch->stats.backlog -= skb->len;
+ sch->stats.drops++;
+#ifdef CONFIG_NET_CLS_POLICE
+ if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
+#endif
+ kfree_skb(skb);
+ return 0;
+}
+
+static int
+tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
+{
+ __skb_queue_head(&sch->q, skb);
+ sch->stats.backlog += skb->len;
+ return 1;
+}
+
+static int
+tbf_drop(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+
+ skb = __skb_dequeue_tail(&sch->q);
+ if (skb) {
+ sch->stats.backlog -= skb->len;
+ sch->stats.drops++;
+ kfree_skb(skb);
+ return 1;
+ }
return 0;
}
static void tbf_watchdog(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc*)arg;
- struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
-
- q->wd_timer.function = NULL;
qdisc_wakeup(sch->dev);
}
-
static struct sk_buff *
tbf_dequeue(struct Qdisc* sch)
{
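
The watchdog above fires with at best 1/HZ resolution, which is where the R_crit = B*HZ bound in the header comment comes from: the bucket must hold at least one tick's worth of data. A quick check of those numbers (an illustrative sketch, not anything in the patch):

#include <stdio.h>

int main(void)
{
	unsigned long hz   = 100;		/* i386 default in 2.1 kernels */
	unsigned long rate = 10000000UL / 8;	/* 10 Mbit/s in bytes/sec */
	unsigned long mtu  = 1500;		/* ethernet */

	printf("minimal burst at %lu bytes/s: %lu bytes\n", rate, rate / hz);
	printf("maximal peak rate with %lu byte MTU: %lu bytes/s\n", mtu, mtu * hz);
	return 0;
}

This prints a 12500 byte minimal burst (the comment's "~10Kbytes") and a 150000 bytes/s peak-rate ceiling (the comment's 150Kbytes/sec).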
@@ -144,19 +194,42 @@
if (skb) {
psched_time_t now;
long toks;
+ long ptoks = 0;
PSCHED_GET_TIME(now);
- toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->depth, 0)
- + q->tokens - L2T(q,skb->len);
+ toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0);
+
+ if (q->P_tab) {
+ ptoks = toks + q->ptokens;
+ if (ptoks > (long)q->mtu)
+ ptoks = q->mtu;
+ ptoks -= L2T_P(q, skb->len);
+ }
+ toks += q->tokens;
+ if (toks > (long)q->buffer)
+ toks = q->buffer;
+ toks -= L2T(q, skb->len);
- if (toks >= 0) {
+ if ((toks|ptoks) >= 0) {
q->t_c = now;
- q->tokens = toks <= q->depth ? toks : q->depth;
- q->bytes -= skb->len;
+ q->tokens = toks;
+ q->ptokens = ptoks;
+ sch->stats.backlog -= skb->len;
return skb;
}
+ if (!sch->dev->tbusy) {
+ long delay = PSCHED_US2JIFFIE(max(-toks, -ptoks));
+
+ if (delay == 0)
+ delay = 1;
+
+ del_timer(&q->wd_timer);
+ q->wd_timer.expires = jiffies + delay;
+ add_timer(&q->wd_timer);
+ }
+
/* Maybe, we have in queue a shorter packet,
which can be sent now. It sounds cool,
but, however, wrong in principle.
@@ -164,17 +237,12 @@
Really, if we splitted flow to independent
subflows, it would be very good solution.
- Look at sch_csz.c.
+ It is main idea of all FQ algorithms
+ (cf. CSZ, HPFQ, HFCS)
*/
__skb_queue_head(&sch->q, skb);
- if (!sch->dev->tbusy) {
- if (q->wd_timer.function)
- del_timer(&q->wd_timer);
- q->wd_timer.function = tbf_watchdog;
- q->wd_timer.expires = jiffies + PSCHED_US2JIFFIE(-toks);
- add_timer(&q->wd_timer);
- }
+ sch->stats.overlimits++;
}
return NULL;
}
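
The dequeue path above tests both buckets with a single (toks|ptoks) >= 0 comparison: OR-ing two signed values is negative exactly when either of them is, so the packet is released only if both token counts are non-negative, and otherwise the watchdog is armed for max(-toks, -ptoks). A standalone sketch of that conformance test, with hypothetical names and not part of the patch:

#include <stdio.h>

/* Returns 1 if a packet conforms to both buckets; otherwise returns 0
 * and reports how long (in usec) to wait before trying again. */
static int tbf_conform(long toks, long ptoks, long *delay_us)
{
	if ((toks | ptoks) >= 0) {	/* both buckets non-negative */
		*delay_us = 0;
		return 1;
	}
	/* wait until the more depleted bucket has refilled */
	*delay_us = -toks > -ptoks ? -toks : -ptoks;
	return 0;
}

int main(void)
{
	long delay;
	int ok;

	ok = tbf_conform(500, 100, &delay);
	printf("toks=500  ptoks=100  -> conform=%d\n", ok);

	ok = tbf_conform(-300, 200, &delay);
	printf("toks=-300 ptoks=200 -> conform=%d, wait %ld usec\n", ok, delay);
	return 0;
}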
@@ -184,69 +252,135 @@
tbf_reset(struct Qdisc* sch)
{
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
- struct sk_buff *skb;
- while ((skb = __skb_dequeue(&sch->q)) != NULL)
- kfree_skb(skb);
- q->bytes = 0;
+ skb_queue_purge(&sch->q);
+ sch->stats.backlog = 0;
PSCHED_GET_TIME(q->t_c);
- q->tokens = q->depth;
- if (q->wd_timer.function) {
- del_timer(&q->wd_timer);
- q->wd_timer.function = NULL;
- }
+ q->tokens = q->buffer;
+ q->ptokens = q->mtu;
+ del_timer(&q->wd_timer);
}
-static int tbf_init(struct Qdisc* sch, void *arg)
+static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
{
struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
- struct tbfctl *ctl = (struct tbfctl*)arg;
+ struct rtattr *tb[TCA_TBF_PTAB];
+ struct tc_tbf_qopt *qopt;
+
+ MOD_INC_USE_COUNT;
+
+ if (opt == NULL ||
+ rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
+ tb[TCA_TBF_PARMS-1] == NULL ||
+ RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) {
+ MOD_DEC_USE_COUNT;
+ return -EINVAL;
+ }
+
+ qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
+ q->R_tab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
+ if (q->R_tab == NULL) {
+ MOD_DEC_USE_COUNT;
+ return -EINVAL;
+ }
+
+ if (qopt->peakrate.rate) {
+ q->P_tab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
+ if (q->P_tab == NULL) {
+ MOD_DEC_USE_COUNT;
+ qdisc_put_rtab(q->R_tab);
+ return -EINVAL;
+ }
+ }
PSCHED_GET_TIME(q->t_c);
init_timer(&q->wd_timer);
- q->wd_timer.function = NULL;
+ q->wd_timer.function = tbf_watchdog;
q->wd_timer.data = (unsigned long)sch;
- if (ctl) {
- q->max_bytes = ctl->bytes;
- q->depth = ctl->depth;
- q->tokens = q->tokens;
- q->cell_log = ctl->cell_log;
- memcpy(q->L_tab, ctl->L_tab, 256*sizeof(unsigned long));
- }
+ q->limit = qopt->limit;
+ q->mtu = qopt->mtu;
+ if (q->mtu == 0)
+ q->mtu = psched_mtu(sch->dev);
+ q->buffer = qopt->buffer;
+ q->tokens = q->buffer;
+ q->ptokens = q->mtu;
return 0;
}
-struct Qdisc_ops tbf_ops =
+static void tbf_destroy(struct Qdisc *sch)
{
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+
+ del_timer(&q->wd_timer);
+
+ if (q->P_tab)
+ qdisc_put_rtab(q->P_tab);
+ if (q->R_tab)
+ qdisc_put_rtab(q->R_tab);
+
+ MOD_DEC_USE_COUNT;
+}
+
+#ifdef CONFIG_RTNETLINK
+static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+ unsigned char *b = skb->tail;
+ struct rtattr *rta;
+ struct tc_tbf_qopt opt;
+
+ rta = (struct rtattr*)b;
+ RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+
+ opt.limit = q->limit;
+ opt.rate = q->R_tab->rate;
+ if (q->P_tab)
+ opt.peakrate = q->P_tab->rate;
+ else
+ memset(&opt.peakrate, 0, sizeof(opt.peakrate));
+ opt.mtu = q->mtu;
+ opt.buffer = q->buffer;
+ RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
+ rta->rta_len = skb->tail - b;
+
+ return skb->len;
+
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+#endif
+
+struct Qdisc_ops tbf_qdisc_ops =
+{
+ NULL,
NULL,
"tbf",
- 0,
sizeof(struct tbf_sched_data),
+
tbf_enqueue,
tbf_dequeue,
- tbf_reset,
- NULL,
+ tbf_requeue,
+ tbf_drop,
+
tbf_init,
- NULL,
+ tbf_reset,
+ tbf_destroy,
+
+#ifdef CONFIG_RTNETLINK
+ tbf_dump,
+#endif
};
#ifdef MODULE
-#include <linux/module.h>
int init_module(void)
{
- int err;
-
- /* Load once and never free it. */
- MOD_INC_USE_COUNT;
-
- err = register_qdisc(&tbf_ops);
- if (err)
- MOD_DEC_USE_COUNT;
- return err;
+ return register_qdisc(&tbf_qdisc_ops);
}
void cleanup_module(void)
{
+ unregister_qdisc(&tbf_qdisc_ops);
}
#endif
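
The header comment estimates the worst-case reshaping latency as lat = max((L-B)/R, (L-M)/P), with L the backlog limit, B/R the burst bucket and M/P the peak bucket. As a rough aid for choosing limit and buffer values, an illustrative userspace calculation (not part of the patch, numbers invented):

#include <stdio.h>

static double tbf_latency(double limit, double burst, double rate,
			  double mtu, double peak)
{
	double a = (limit - burst) / rate;	/* drain time against the burst bucket */
	double b = (limit - mtu) / peak;	/* drain time against the peak bucket */
	return a > b ? a : b;
}

int main(void)
{
	/* 30 KB backlog limit, 10 KB burst at 1.25 MB/s, MTU 1500 at 2.5 MB/s peak */
	printf("worst-case latency: %.3f sec\n",
	       tbf_latency(30e3, 10e3, 1.25e6, 1500.0, 2.5e6));
	return 0;
}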