net/sched/sch_cake.c
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *                 (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
82
83 /* struct cobalt_params - contains codel and blue parameters
84  * @interval:   codel interval (initial spacing between AQM signals)
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91         u64     interval;
92         u64     target;
93         u64     mtu_time;
94         u32     p_inc;
95         u32     p_dec;
96 };
97
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:              codel dropping frequency
100  * @rec_inv_sqrt:       reciprocal value of sqrt(count) >> 1
101  * @drop_next:          time to drop next packet, or when we dropped last
102  * @blue_timer:         Blue time to next drop
103  * @p_drop:             BLUE drop probability (0.32 fxp)
104  * @dropping:           set if in dropping state
105  * @ecn_marked:         set if marked
106  */
107 struct cobalt_vars {
108         u32     count;
109         u32     rec_inv_sqrt;
110         ktime_t drop_next;
111         ktime_t blue_timer;
112         u32     p_drop;
113         bool    dropping;
114         bool    ecn_marked;
115 };
116
117 enum {
118         CAKE_SET_NONE = 0,
119         CAKE_SET_SPARSE,
120         CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121         CAKE_SET_BULK,
122         CAKE_SET_DECAYING
123 };
124
125 struct cake_flow {
126         /* this stuff is all needed per-flow at dequeue time */
127         struct sk_buff    *head;
128         struct sk_buff    *tail;
129         struct list_head  flowchain;
130         s32               deficit;
131         u32               dropped;
132         struct cobalt_vars cvars;
133         u16               srchost; /* index into cake_host table */
134         u16               dsthost;
135         u8                set;
136 }; /* please try to keep this structure <= 64 bytes */
137
138 struct cake_host {
139         u32 srchost_tag;
140         u32 dsthost_tag;
141         u16 srchost_refcnt;
142         u16 dsthost_refcnt;
143 };
144
145 struct cake_heap_entry {
146         u16 t:3, b:10;
147 };
148
149 struct cake_tin_data {
150         struct cake_flow flows[CAKE_QUEUES];
151         u32     backlogs[CAKE_QUEUES];
152         u32     tags[CAKE_QUEUES]; /* for set association */
153         u16     overflow_idx[CAKE_QUEUES];
154         struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155         u16     flow_quantum;
156
157         struct cobalt_params cparams;
158         u32     drop_overlimit;
159         u16     bulk_flow_count;
160         u16     sparse_flow_count;
161         u16     decaying_flow_count;
162         u16     unresponsive_flow_count;
163
164         u32     max_skblen;
165
166         struct list_head new_flows;
167         struct list_head old_flows;
168         struct list_head decaying_flows;
169
170         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171         ktime_t time_next_packet;
172         u64     tin_rate_ns;
173         u64     tin_rate_bps;
174         u16     tin_rate_shft;
175
176         u16     tin_quantum_prio;
177         u16     tin_quantum_band;
178         s32     tin_deficit;
179         u32     tin_backlog;
180         u32     tin_dropped;
181         u32     tin_ecn_mark;
182
183         u32     packets;
184         u64     bytes;
185
186         u32     ack_drops;
187
188         /* moving averages */
189         u64 avge_delay;
190         u64 peak_delay;
191         u64 base_delay;
192
193         /* hash function stats */
194         u32     way_directs;
195         u32     way_hits;
196         u32     way_misses;
197         u32     way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199
200 struct cake_sched_data {
201         struct tcf_proto __rcu *filter_list; /* optional external classifier */
202         struct tcf_block *block;
203         struct cake_tin_data *tins;
204
205         struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206         u16             overflow_timeout;
207
208         u16             tin_cnt;
209         u8              tin_mode;
210         u8              flow_mode;
211         u8              ack_filter;
212         u8              atm_mode;
213
214         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
215         u16             rate_shft;
216         ktime_t         time_next_packet;
217         ktime_t         failsafe_next_packet;
218         u64             rate_ns;
219         u64             rate_bps;
220         u16             rate_flags;
221         s16             rate_overhead;
222         u16             rate_mpu;
223         u64             interval;
224         u64             target;
225
226         /* resource tracking */
227         u32             buffer_used;
228         u32             buffer_max_used;
229         u32             buffer_limit;
230         u32             buffer_config_limit;
231
232         /* indices for dequeue */
233         u16             cur_tin;
234         u16             cur_flow;
235
236         struct qdisc_watchdog watchdog;
237         const u8        *tin_index;
238         const u8        *tin_order;
239
240         /* bandwidth capacity estimate */
241         ktime_t         last_packet_time;
242         ktime_t         avg_window_begin;
243         u64             avg_packet_interval;
244         u64             avg_window_bytes;
245         u64             avg_peak_bandwidth;
246         ktime_t         last_reconfig_time;
247
248         /* packet length stats */
249         u32             avg_netoff;
250         u16             max_netlen;
251         u16             max_adjlen;
252         u16             min_netlen;
253         u16             min_adjlen;
254 };
255
256 enum {
257         CAKE_FLAG_OVERHEAD         = BIT(0),
258         CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
259         CAKE_FLAG_INGRESS          = BIT(2),
260         CAKE_FLAG_WASH             = BIT(3),
261         CAKE_FLAG_SPLIT_GSO        = BIT(4)
262 };
263
264 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
265  * obtain the best features of each.  Codel is excellent on flows which
266  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
267  * unresponsive flows.
268  */
269
270 struct cobalt_skb_cb {
271         ktime_t enqueue_time;
272         u32     adjusted_len;
273 };
274
275 static u64 us_to_ns(u64 us)
276 {
277         return us * NSEC_PER_USEC;
278 }
279
280 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
281 {
282         qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
283         return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
284 }
285
286 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
287 {
288         return get_cobalt_cb(skb)->enqueue_time;
289 }
290
291 static void cobalt_set_enqueue_time(struct sk_buff *skb,
292                                     ktime_t now)
293 {
294         get_cobalt_cb(skb)->enqueue_time = now;
295 }
296
297 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
298
299 /* Diffserv lookup tables */
300
301 static const u8 precedence[] = {
302         0, 0, 0, 0, 0, 0, 0, 0,
303         1, 1, 1, 1, 1, 1, 1, 1,
304         2, 2, 2, 2, 2, 2, 2, 2,
305         3, 3, 3, 3, 3, 3, 3, 3,
306         4, 4, 4, 4, 4, 4, 4, 4,
307         5, 5, 5, 5, 5, 5, 5, 5,
308         6, 6, 6, 6, 6, 6, 6, 6,
309         7, 7, 7, 7, 7, 7, 7, 7,
310 };
311
312 static const u8 diffserv8[] = {
313         2, 5, 1, 2, 4, 2, 2, 2,
314         0, 2, 1, 2, 1, 2, 1, 2,
315         5, 2, 4, 2, 4, 2, 4, 2,
316         3, 2, 3, 2, 3, 2, 3, 2,
317         6, 2, 3, 2, 3, 2, 3, 2,
318         6, 2, 2, 2, 6, 2, 6, 2,
319         7, 2, 2, 2, 2, 2, 2, 2,
320         7, 2, 2, 2, 2, 2, 2, 2,
321 };
322
323 static const u8 diffserv4[] = {
324         0, 2, 0, 0, 2, 0, 0, 0,
325         1, 0, 0, 0, 0, 0, 0, 0,
326         2, 0, 2, 0, 2, 0, 2, 0,
327         2, 0, 2, 0, 2, 0, 2, 0,
328         3, 0, 2, 0, 2, 0, 2, 0,
329         3, 0, 0, 0, 3, 0, 3, 0,
330         3, 0, 0, 0, 0, 0, 0, 0,
331         3, 0, 0, 0, 0, 0, 0, 0,
332 };
333
334 static const u8 diffserv3[] = {
335         0, 0, 0, 0, 2, 0, 0, 0,
336         1, 0, 0, 0, 0, 0, 0, 0,
337         0, 0, 0, 0, 0, 0, 0, 0,
338         0, 0, 0, 0, 0, 0, 0, 0,
339         0, 0, 0, 0, 0, 0, 0, 0,
340         0, 0, 0, 0, 2, 0, 2, 0,
341         2, 0, 0, 0, 0, 0, 0, 0,
342         2, 0, 0, 0, 0, 0, 0, 0,
343 };
344
345 static const u8 besteffort[] = {
346         0, 0, 0, 0, 0, 0, 0, 0,
347         0, 0, 0, 0, 0, 0, 0, 0,
348         0, 0, 0, 0, 0, 0, 0, 0,
349         0, 0, 0, 0, 0, 0, 0, 0,
350         0, 0, 0, 0, 0, 0, 0, 0,
351         0, 0, 0, 0, 0, 0, 0, 0,
352         0, 0, 0, 0, 0, 0, 0, 0,
353         0, 0, 0, 0, 0, 0, 0, 0,
354 };
355
356 /* tin priority order for stats dumping */
357
358 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
359 static const u8 bulk_order[] = {1, 0, 2, 3};
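/* Worked example of the lookup tables above: the DSCP value (0-63) indexes
 * the table and yields a tin number, so with diffserv3 the default DSCP 0
 * stays in tin 0 (best effort), CS1 (DSCP 8) lands in tin 1 (bulk) and
 * EF (DSCP 46) lands in tin 2 (latency-sensitive).  bulk_order[] only
 * affects the order in which tins are reported when statistics are dumped,
 * so that the bulk tin is shown first.
 */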
360
361 #define REC_INV_SQRT_CACHE (16)
362 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
363
364 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
365  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
366  *
367  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
368  */
369
370 static void cobalt_newton_step(struct cobalt_vars *vars)
371 {
372         u32 invsqrt, invsqrt2;
373         u64 val;
374
375         invsqrt = vars->rec_inv_sqrt;
376         invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
377         val = (3LL << 32) - ((u64)vars->count * invsqrt2);
378
379         val >>= 2; /* avoid overflow in following multiply */
380         val = (val * invsqrt) >> (32 - 2 + 1);
381
382         vars->rec_inv_sqrt = val;
383 }
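/* Worked example of the Newton step, in Q0.32: for count == 4 the true
 * value of 1/sqrt(count) is 0.5 (0x80000000).  Feeding that back in gives
 * invsqrt2 = 0x40000000 (0.25), count * invsqrt2 = 1.0, and
 * (3 - 1.0) / 2 * 0.5 = 0.5 again - the iteration is stable at the root
 * and converges quickly when count only changes by one between calls.
 */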
384
385 static void cobalt_invsqrt(struct cobalt_vars *vars)
386 {
387         if (vars->count < REC_INV_SQRT_CACHE)
388                 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
389         else
390                 cobalt_newton_step(vars);
391 }
392
393 /* There is a big difference in timing between the accurate values placed in
394  * the cache and the approximations given by a single Newton step for small
395  * count values, particularly when stepping from count 1 to 2 or vice versa.
396  * Above 16, a single Newton step gives sufficient accuracy in either
397  * direction, given the precision stored.
398  *
399  * The magnitude of the error when stepping up to count 2 is such as to give
400  * the value that *should* have been produced at count 4.
401  */
402
403 static void cobalt_cache_init(void)
404 {
405         struct cobalt_vars v;
406
407         memset(&v, 0, sizeof(v));
408         v.rec_inv_sqrt = ~0U;
409         cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
410
411         for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
412                 cobalt_newton_step(&v);
413                 cobalt_newton_step(&v);
414                 cobalt_newton_step(&v);
415                 cobalt_newton_step(&v);
416
417                 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
418         }
419 }
420
421 static void cobalt_vars_init(struct cobalt_vars *vars)
422 {
423         memset(vars, 0, sizeof(*vars));
424
425         if (!cobalt_rec_inv_sqrt_cache[0]) {
426                 cobalt_cache_init();
427                 cobalt_rec_inv_sqrt_cache[0] = ~0;
428         }
429 }
430
431 /* CoDel control_law is t + interval/sqrt(count)
432  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
433  * both sqrt() and divide operation.
434  */
435 static ktime_t cobalt_control(ktime_t t,
436                               u64 interval,
437                               u32 rec_inv_sqrt)
438 {
439         return ktime_add_ns(t, reciprocal_scale(interval,
440                                                 rec_inv_sqrt));
441 }
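/* Example: reciprocal_scale(interval, rec_inv_sqrt) computes
 * (interval * rec_inv_sqrt) >> 32, so with interval = 100ms and count = 4
 * (rec_inv_sqrt ~= 0.5 in Q0.32) the next signalling event is scheduled
 * roughly 50ms after t, i.e. interval / sqrt(count).
 */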
442
443 /* Call this when a packet had to be dropped due to queue overflow.  Returns
444  * true if the BLUE state was quiescent before but active after this call.
445  */
446 static bool cobalt_queue_full(struct cobalt_vars *vars,
447                               struct cobalt_params *p,
448                               ktime_t now)
449 {
450         bool up = false;
451
452         if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
453                 up = !vars->p_drop;
454                 vars->p_drop += p->p_inc;
455                 if (vars->p_drop < p->p_inc)
456                         vars->p_drop = ~0;
457                 vars->blue_timer = now;
458         }
459         vars->dropping = true;
460         vars->drop_next = now;
461         if (!vars->count)
462                 vars->count = 1;
463
464         return up;
465 }
466
467 /* Call this when the queue was serviced but turned out to be empty.  Returns
468  * true if the BLUE state was active before but quiescent after this call.
469  */
470 static bool cobalt_queue_empty(struct cobalt_vars *vars,
471                                struct cobalt_params *p,
472                                ktime_t now)
473 {
474         bool down = false;
475
476         if (vars->p_drop &&
477             ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
478                 if (vars->p_drop < p->p_dec)
479                         vars->p_drop = 0;
480                 else
481                         vars->p_drop -= p->p_dec;
482                 vars->blue_timer = now;
483                 down = !vars->p_drop;
484         }
485         vars->dropping = false;
486
487         if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
488                 vars->count--;
489                 cobalt_invsqrt(vars);
490                 vars->drop_next = cobalt_control(vars->drop_next,
491                                                  p->interval,
492                                                  vars->rec_inv_sqrt);
493         }
494
495         return down;
496 }
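/* Together these two helpers implement the BLUE half of COBALT: p_drop
 * ramps up by p_inc each time the queue overflows (at most once per
 * 'target' period) and decays by p_dec each time the queue is found empty.
 * With the defaults configured later in this file (roughly 1/256 up and
 * 1/4096 down in 0.32 fixed point), an unresponsive flow that keeps
 * overflowing the buffer quickly accumulates a noticeable random drop
 * probability, which then takes much longer to decay away.
 */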
497
498 /* Call this with a freshly dequeued packet for possible congestion marking.
499  * Returns true as an instruction to drop the packet, false for delivery.
500  */
501 static bool cobalt_should_drop(struct cobalt_vars *vars,
502                                struct cobalt_params *p,
503                                ktime_t now,
504                                struct sk_buff *skb,
505                                u32 bulk_flows)
506 {
507         bool next_due, over_target, drop = false;
508         ktime_t schedule;
509         u64 sojourn;
510
511 /* The 'schedule' variable records, in its sign, whether 'now' is before or
512  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
513  * scheduling decision is actually branched, without destroying that
514  * information.  Similarly, the first 'schedule' value calculated is preserved
515  * in the boolean 'next_due'.
516  *
517  * As for 'drop_next', we take advantage of the fact that 'interval' is both
518  * the delay between first exceeding 'target' and the first signalling event,
519  * *and* the scaling factor for the signalling frequency.  It's therefore very
520  * natural to use a single mechanism for both purposes, and eliminates a
521  * significant amount of reference Codel's spaghetti code.  To help with this,
522  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
523  * as possible to 1.0 in fixed-point.
524  */
525
526         sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
527         schedule = ktime_sub(now, vars->drop_next);
528         over_target = sojourn > p->target &&
529                       sojourn > p->mtu_time * bulk_flows * 2 &&
530                       sojourn > p->mtu_time * 4;
531         next_due = vars->count && ktime_to_ns(schedule) >= 0;
532
533         vars->ecn_marked = false;
534
535         if (over_target) {
536                 if (!vars->dropping) {
537                         vars->dropping = true;
538                         vars->drop_next = cobalt_control(now,
539                                                          p->interval,
540                                                          vars->rec_inv_sqrt);
541                 }
542                 if (!vars->count)
543                         vars->count = 1;
544         } else if (vars->dropping) {
545                 vars->dropping = false;
546         }
547
548         if (next_due && vars->dropping) {
549                 /* Use ECN mark if possible, otherwise drop */
550                 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
551
552                 vars->count++;
553                 if (!vars->count)
554                         vars->count--;
555                 cobalt_invsqrt(vars);
556                 vars->drop_next = cobalt_control(vars->drop_next,
557                                                  p->interval,
558                                                  vars->rec_inv_sqrt);
559                 schedule = ktime_sub(now, vars->drop_next);
560         } else {
561                 while (next_due) {
562                         vars->count--;
563                         cobalt_invsqrt(vars);
564                         vars->drop_next = cobalt_control(vars->drop_next,
565                                                          p->interval,
566                                                          vars->rec_inv_sqrt);
567                         schedule = ktime_sub(now, vars->drop_next);
568                         next_due = vars->count && ktime_to_ns(schedule) >= 0;
569                 }
570         }
571
572         /* Simple BLUE implementation.  Lack of ECN is deliberate. */
573         if (vars->p_drop)
574                 drop |= (prandom_u32() < vars->p_drop);
575
576         /* Overload the drop_next field as an activity timeout */
577         if (!vars->count)
578                 vars->drop_next = ktime_add_ns(now, p->interval);
579         else if (ktime_to_ns(schedule) > 0 && !drop)
580                 vars->drop_next = now;
581
582         return drop;
583 }
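/* In summary, the Codel half above behaves like the reference algorithm:
 * nothing is signalled until the sojourn time has exceeded 'target' (plus
 * a few MTU serialisation times, to forgive unavoidable link-level delay)
 * for a full 'interval'; after that, marks/drops are spaced at
 * interval/sqrt(count), e.g. 100ms, ~70ms, ~58ms, 50ms... for the default
 * 100ms interval.
 */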
584
585 static void cake_update_flowkeys(struct flow_keys *keys,
586                                  const struct sk_buff *skb)
587 {
588 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
589         struct nf_conntrack_tuple tuple = {};
590         bool rev = !skb->_nfct;
591
592         if (skb_protocol(skb, true) != htons(ETH_P_IP))
593                 return;
594
595         if (!nf_ct_get_tuple_skb(&tuple, skb))
596                 return;
597
598         keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
599         keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
600
601         if (keys->ports.ports) {
602                 keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
603                 keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
604         }
605 #endif
606 }
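/* The conntrack lookup above replaces the dissected addresses and ports
 * with those from the connection's conntrack tuple, so that host- and
 * flow-isolation with the 'nat' option still distinguishes the individual
 * machines behind a masquerading gateway instead of lumping them all onto
 * the gateway's external address.
 */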
607
608 /* Cake's dual-host flow modes are multi-bit settings, so the checks
609  * below also match triple-isolate mode, which sets both bit patterns.
610  */
611
612 static bool cake_dsrc(int flow_mode)
613 {
614         return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
615 }
616
617 static bool cake_ddst(int flow_mode)
618 {
619         return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
620 }
621
622 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
623                      int flow_mode, u16 flow_override, u16 host_override)
624 {
625         u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
626         u16 reduced_hash, srchost_idx, dsthost_idx;
627         struct flow_keys keys, host_keys;
628
629         if (unlikely(flow_mode == CAKE_FLOW_NONE))
630                 return 0;
631
632         /* If both overrides are set we can skip packet dissection entirely */
633         if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
634             (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
635                 goto skip_hash;
636
637         skb_flow_dissect_flow_keys(skb, &keys,
638                                    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
639
640         if (flow_mode & CAKE_FLOW_NAT_FLAG)
641                 cake_update_flowkeys(&keys, skb);
642
643         /* flow_hash_from_keys() sorts the addresses by value, so we have
644          * to preserve their order in a separate data structure to treat
645          * src and dst host addresses as independently selectable.
646          */
647         host_keys = keys;
648         host_keys.ports.ports     = 0;
649         host_keys.basic.ip_proto  = 0;
650         host_keys.keyid.keyid     = 0;
651         host_keys.tags.flow_label = 0;
652
653         switch (host_keys.control.addr_type) {
654         case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
655                 host_keys.addrs.v4addrs.src = 0;
656                 dsthost_hash = flow_hash_from_keys(&host_keys);
657                 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
658                 host_keys.addrs.v4addrs.dst = 0;
659                 srchost_hash = flow_hash_from_keys(&host_keys);
660                 break;
661
662         case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
663                 memset(&host_keys.addrs.v6addrs.src, 0,
664                        sizeof(host_keys.addrs.v6addrs.src));
665                 dsthost_hash = flow_hash_from_keys(&host_keys);
666                 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
667                 memset(&host_keys.addrs.v6addrs.dst, 0,
668                        sizeof(host_keys.addrs.v6addrs.dst));
669                 srchost_hash = flow_hash_from_keys(&host_keys);
670                 break;
671
672         default:
673                 dsthost_hash = 0;
674                 srchost_hash = 0;
675         }
676
677         /* This *must* be after the above switch, since as a
678          * side-effect it sorts the src and dst addresses.
679          */
680         if (flow_mode & CAKE_FLOW_FLOWS)
681                 flow_hash = flow_hash_from_keys(&keys);
682
683 skip_hash:
684         if (flow_override)
685                 flow_hash = flow_override - 1;
686         if (host_override) {
687                 dsthost_hash = host_override - 1;
688                 srchost_hash = host_override - 1;
689         }
690
691         if (!(flow_mode & CAKE_FLOW_FLOWS)) {
692                 if (flow_mode & CAKE_FLOW_SRC_IP)
693                         flow_hash ^= srchost_hash;
694
695                 if (flow_mode & CAKE_FLOW_DST_IP)
696                         flow_hash ^= dsthost_hash;
697         }
698
699         reduced_hash = flow_hash % CAKE_QUEUES;
700
701         /* set-associative hashing */
702         /* fast path if no hash collision (direct lookup succeeds) */
703         if (likely(q->tags[reduced_hash] == flow_hash &&
704                    q->flows[reduced_hash].set)) {
705                 q->way_directs++;
706         } else {
707                 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
708                 u32 outer_hash = reduced_hash - inner_hash;
709                 bool allocate_src = false;
710                 bool allocate_dst = false;
711                 u32 i, k;
712
713                 /* check if any active queue in the set is reserved for
714                  * this flow.
715                  */
716                 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
717                      i++, k = (k + 1) % CAKE_SET_WAYS) {
718                         if (q->tags[outer_hash + k] == flow_hash) {
719                                 if (i)
720                                         q->way_hits++;
721
722                                 if (!q->flows[outer_hash + k].set) {
723                                         /* need to increment host refcnts */
724                                         allocate_src = cake_dsrc(flow_mode);
725                                         allocate_dst = cake_ddst(flow_mode);
726                                 }
727
728                                 goto found;
729                         }
730                 }
731
732                 /* no queue is reserved for this flow, look for an
733                  * empty one.
734                  */
735                 for (i = 0; i < CAKE_SET_WAYS;
736                          i++, k = (k + 1) % CAKE_SET_WAYS) {
737                         if (!q->flows[outer_hash + k].set) {
738                                 q->way_misses++;
739                                 allocate_src = cake_dsrc(flow_mode);
740                                 allocate_dst = cake_ddst(flow_mode);
741                                 goto found;
742                         }
743                 }
744
745                 /* With no empty queues, default to the original
746                  * queue, accept the collision, update the host tags.
747                  */
748                 q->way_collisions++;
749                 q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
750                 q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
751                 allocate_src = cake_dsrc(flow_mode);
752                 allocate_dst = cake_ddst(flow_mode);
753 found:
754                 /* reserve queue for future packets in same flow */
755                 reduced_hash = outer_hash + k;
756                 q->tags[reduced_hash] = flow_hash;
757
758                 if (allocate_src) {
759                         srchost_idx = srchost_hash % CAKE_QUEUES;
760                         inner_hash = srchost_idx % CAKE_SET_WAYS;
761                         outer_hash = srchost_idx - inner_hash;
762                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
763                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
764                                 if (q->hosts[outer_hash + k].srchost_tag ==
765                                     srchost_hash)
766                                         goto found_src;
767                         }
768                         for (i = 0; i < CAKE_SET_WAYS;
769                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
770                                 if (!q->hosts[outer_hash + k].srchost_refcnt)
771                                         break;
772                         }
773                         q->hosts[outer_hash + k].srchost_tag = srchost_hash;
774 found_src:
775                         srchost_idx = outer_hash + k;
776                         q->hosts[srchost_idx].srchost_refcnt++;
777                         q->flows[reduced_hash].srchost = srchost_idx;
778                 }
779
780                 if (allocate_dst) {
781                         dsthost_idx = dsthost_hash % CAKE_QUEUES;
782                         inner_hash = dsthost_idx % CAKE_SET_WAYS;
783                         outer_hash = dsthost_idx - inner_hash;
784                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
785                              i++, k = (k + 1) % CAKE_SET_WAYS) {
786                                 if (q->hosts[outer_hash + k].dsthost_tag ==
787                                     dsthost_hash)
788                                         goto found_dst;
789                         }
790                         for (i = 0; i < CAKE_SET_WAYS;
791                              i++, k = (k + 1) % CAKE_SET_WAYS) {
792                                 if (!q->hosts[outer_hash + k].dsthost_refcnt)
793                                         break;
794                         }
795                         q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
796 found_dst:
797                         dsthost_idx = outer_hash + k;
798                         q->hosts[dsthost_idx].dsthost_refcnt++;
799                         q->flows[reduced_hash].dsthost = dsthost_idx;
800                 }
801         }
802
803         return reduced_hash;
804 }
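/* Example of the set-associative lookup above: CAKE_QUEUES = 1024 and
 * CAKE_SET_WAYS = 8, so the 1024 queues form 128 sets of 8 ways.  A flow
 * whose reduced_hash is, say, 301 belongs to the set covering queues
 * 296-303; those eight slots are searched first for a matching tag, then
 * for an empty slot, and only if all eight are busy does the flow share a
 * queue with another flow (counted in way_collisions).
 */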
805
806 /* helper functions: might be changed when/if skb uses a standard list_head */
807 /* remove one skb from head of slot queue */
808
809 static struct sk_buff *dequeue_head(struct cake_flow *flow)
810 {
811         struct sk_buff *skb = flow->head;
812
813         if (skb) {
814                 flow->head = skb->next;
815                 skb->next = NULL;
816         }
817
818         return skb;
819 }
820
821 /* add skb to flow queue (tail add) */
822
823 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
824 {
825         if (!flow->head)
826                 flow->head = skb;
827         else
828                 flow->tail->next = skb;
829         flow->tail = skb;
830         skb->next = NULL;
831 }
832
833 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
834                                     struct ipv6hdr *buf)
835 {
836         unsigned int offset = skb_network_offset(skb);
837         struct iphdr *iph;
838
839         iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
840
841         if (!iph)
842                 return NULL;
843
844         if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
845                 return skb_header_pointer(skb, offset + iph->ihl * 4,
846                                           sizeof(struct ipv6hdr), buf);
847
848         else if (iph->version == 4)
849                 return iph;
850
851         else if (iph->version == 6)
852                 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
853                                           buf);
854
855         return NULL;
856 }
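/* Note: for IPv6 (and 6in4-encapsulated) packets the pointer returned
 * above actually refers to a struct ipv6hdr; callers are expected to
 * check iph->version before interpreting the rest of the header.
 */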
857
858 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
859                                       void *buf, unsigned int bufsize)
860 {
861         unsigned int offset = skb_network_offset(skb);
862         const struct ipv6hdr *ipv6h;
863         const struct tcphdr *tcph;
864         const struct iphdr *iph;
865         struct ipv6hdr _ipv6h;
866         struct tcphdr _tcph;
867
868         ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
869
870         if (!ipv6h)
871                 return NULL;
872
873         if (ipv6h->version == 4) {
874                 iph = (struct iphdr *)ipv6h;
875                 offset += iph->ihl * 4;
876
877                 /* special-case 6in4 tunnelling, as that is a common way to get
878                  * v6 connectivity in the home
879                  */
880                 if (iph->protocol == IPPROTO_IPV6) {
881                         ipv6h = skb_header_pointer(skb, offset,
882                                                    sizeof(_ipv6h), &_ipv6h);
883
884                         if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
885                                 return NULL;
886
887                         offset += sizeof(struct ipv6hdr);
888
889                 } else if (iph->protocol != IPPROTO_TCP) {
890                         return NULL;
891                 }
892
893         } else if (ipv6h->version == 6) {
894                 if (ipv6h->nexthdr != IPPROTO_TCP)
895                         return NULL;
896
897                 offset += sizeof(struct ipv6hdr);
898         } else {
899                 return NULL;
900         }
901
902         tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
903         if (!tcph || tcph->doff < 5)
904                 return NULL;
905
906         return skb_header_pointer(skb, offset,
907                                   min(__tcp_hdrlen(tcph), bufsize), buf);
908 }
909
910 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
911                                    int code, int *oplen)
912 {
913         /* inspired by tcp_parse_options in tcp_input.c */
914         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
915         const u8 *ptr = (const u8 *)(tcph + 1);
916
917         while (length > 0) {
918                 int opcode = *ptr++;
919                 int opsize;
920
921                 if (opcode == TCPOPT_EOL)
922                         break;
923                 if (opcode == TCPOPT_NOP) {
924                         length--;
925                         continue;
926                 }
927                 if (length < 2)
928                         break;
929                 opsize = *ptr++;
930                 if (opsize < 2 || opsize > length)
931                         break;
932
933                 if (opcode == code) {
934                         *oplen = opsize;
935                         return ptr;
936                 }
937
938                 ptr += opsize - 2;
939                 length -= opsize;
940         }
941
942         return NULL;
943 }
944
945 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
946  * bytes than the other. In the case where both sequences ACK bytes that the
947  * other doesn't, A is considered greater. DSACKs in A also make A be
948  * considered greater.
949  *
950  * @return -1, 0 or 1 as normal compare functions
951  */
952 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
953                                   const struct tcphdr *tcph_b)
954 {
955         const struct tcp_sack_block_wire *sack_a, *sack_b;
956         u32 ack_seq_a = ntohl(tcph_a->ack_seq);
957         u32 bytes_a = 0, bytes_b = 0;
958         int oplen_a, oplen_b;
959         bool first = true;
960
961         sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
962         sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
963
964         /* pointers point to option contents */
965         oplen_a -= TCPOLEN_SACK_BASE;
966         oplen_b -= TCPOLEN_SACK_BASE;
967
968         if (sack_a && oplen_a >= sizeof(*sack_a) &&
969             (!sack_b || oplen_b < sizeof(*sack_b)))
970                 return -1;
971         else if (sack_b && oplen_b >= sizeof(*sack_b) &&
972                  (!sack_a || oplen_a < sizeof(*sack_a)))
973                 return 1;
974         else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
975                  (!sack_b || oplen_b < sizeof(*sack_b)))
976                 return 0;
977
978         while (oplen_a >= sizeof(*sack_a)) {
979                 const struct tcp_sack_block_wire *sack_tmp = sack_b;
980                 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
981                 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
982                 int oplen_tmp = oplen_b;
983                 bool found = false;
984
985                 /* DSACK; always considered greater to prevent dropping */
986                 if (before(start_a, ack_seq_a))
987                         return -1;
988
989                 bytes_a += end_a - start_a;
990
991                 while (oplen_tmp >= sizeof(*sack_tmp)) {
992                         u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
993                         u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
994
995                         /* first time through we count the total size */
996                         if (first)
997                                 bytes_b += end_b - start_b;
998
999                         if (!after(start_b, start_a) && !before(end_b, end_a)) {
1000                                 found = true;
1001                                 if (!first)
1002                                         break;
1003                         }
1004                         oplen_tmp -= sizeof(*sack_tmp);
1005                         sack_tmp++;
1006                 }
1007
1008                 if (!found)
1009                         return -1;
1010
1011                 oplen_a -= sizeof(*sack_a);
1012                 sack_a++;
1013                 first = false;
1014         }
1015
1016         /* If we made it this far, all ranges SACKed by A are covered by B, so
1017          * either the SACKs are equal, or B SACKs more bytes.
1018          */
1019         return bytes_b > bytes_a ? 1 : 0;
1020 }
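/* Example: if ACK A carries SACK block [100, 200) and ACK B carries
 * [100, 300), every range SACKed by A is covered by B and B covers more
 * bytes, so the function returns 1; in the ack filter this means B (the
 * newer ACK) conveys at least as much SACK information and A may be
 * dropped without losing anything.
 */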
1021
1022 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1023                                  u32 *tsval, u32 *tsecr)
1024 {
1025         const u8 *ptr;
1026         int opsize;
1027
1028         ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1029
1030         if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1031                 *tsval = get_unaligned_be32(ptr);
1032                 *tsecr = get_unaligned_be32(ptr + 4);
1033         }
1034 }
1035
1036 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1037                                u32 tstamp_new, u32 tsecr_new)
1038 {
1039         /* inspired by tcp_parse_options in tcp_input.c */
1040         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1041         const u8 *ptr = (const u8 *)(tcph + 1);
1042         u32 tstamp, tsecr;
1043
1044         /* 3 reserved flags must be unset to avoid future breakage
1045          * ACK must be set
1046          * ECE/CWR are handled separately
1047          * All other flags URG/PSH/RST/SYN/FIN must be unset
1048          * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1049          * 0x00C00000 = CWR/ECE (handled separately)
1050          * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1051          */
1052         if (((tcp_flag_word(tcph) &
1053               cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1054                 return false;
1055
1056         while (length > 0) {
1057                 int opcode = *ptr++;
1058                 int opsize;
1059
1060                 if (opcode == TCPOPT_EOL)
1061                         break;
1062                 if (opcode == TCPOPT_NOP) {
1063                         length--;
1064                         continue;
1065                 }
1066                 if (length < 2)
1067                         break;
1068                 opsize = *ptr++;
1069                 if (opsize < 2 || opsize > length)
1070                         break;
1071
1072                 switch (opcode) {
1073                 case TCPOPT_MD5SIG: /* doesn't influence state */
1074                         break;
1075
1076                 case TCPOPT_SACK: /* stricter checking performed later */
1077                         if (opsize % 8 != 2)
1078                                 return false;
1079                         break;
1080
1081                 case TCPOPT_TIMESTAMP:
1082                         /* only drop timestamps lower than new */
1083                         if (opsize != TCPOLEN_TIMESTAMP)
1084                                 return false;
1085                         tstamp = get_unaligned_be32(ptr);
1086                         tsecr = get_unaligned_be32(ptr + 4);
1087                         if (after(tstamp, tstamp_new) ||
1088                             after(tsecr, tsecr_new))
1089                                 return false;
1090                         break;
1091
1092                 case TCPOPT_MSS:  /* these should only be set on SYN */
1093                 case TCPOPT_WINDOW:
1094                 case TCPOPT_SACK_PERM:
1095                 case TCPOPT_FASTOPEN:
1096                 case TCPOPT_EXP:
1097                 default: /* don't drop if any unknown options are present */
1098                         return false;
1099                 }
1100
1101                 ptr += opsize - 2;
1102                 length -= opsize;
1103         }
1104
1105         return true;
1106 }
1107
1108 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1109                                        struct cake_flow *flow)
1110 {
1111         bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1112         struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1113         struct sk_buff *skb_check, *skb_prev = NULL;
1114         const struct ipv6hdr *ipv6h, *ipv6h_check;
1115         unsigned char _tcph[64], _tcph_check[64];
1116         const struct tcphdr *tcph, *tcph_check;
1117         const struct iphdr *iph, *iph_check;
1118         struct ipv6hdr _iph, _iph_check;
1119         const struct sk_buff *skb;
1120         int seglen, num_found = 0;
1121         u32 tstamp = 0, tsecr = 0;
1122         __be32 elig_flags = 0;
1123         int sack_comp;
1124
1125         /* no other possible ACKs to filter */
1126         if (flow->head == flow->tail)
1127                 return NULL;
1128
1129         skb = flow->tail;
1130         tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1131         iph = cake_get_iphdr(skb, &_iph);
1132         if (!tcph)
1133                 return NULL;
1134
1135         cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1136
1137         /* the 'triggering' packet need only have the ACK flag set.
1138          * also check that SYN is not set, as there won't be any previous ACKs.
1139          */
1140         if ((tcp_flag_word(tcph) &
1141              (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1142                 return NULL;
1143
1144         /* the 'triggering' ACK is at the tail of the queue, we have already
1145          * returned if it is the only packet in the flow. loop through the rest
1146          * of the queue looking for pure ACKs with the same 5-tuple as the
1147          * triggering one.
1148          */
1149         for (skb_check = flow->head;
1150              skb_check && skb_check != skb;
1151              skb_prev = skb_check, skb_check = skb_check->next) {
1152                 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1153                 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1154                                              sizeof(_tcph_check));
1155
1156                 /* only TCP packets with matching 5-tuple are eligible, and only
1157                  * drop safe headers
1158                  */
1159                 if (!tcph_check || iph->version != iph_check->version ||
1160                     tcph_check->source != tcph->source ||
1161                     tcph_check->dest != tcph->dest)
1162                         continue;
1163
1164                 if (iph_check->version == 4) {
1165                         if (iph_check->saddr != iph->saddr ||
1166                             iph_check->daddr != iph->daddr)
1167                                 continue;
1168
1169                         seglen = ntohs(iph_check->tot_len) -
1170                                        (4 * iph_check->ihl);
1171                 } else if (iph_check->version == 6) {
1172                         ipv6h = (struct ipv6hdr *)iph;
1173                         ipv6h_check = (struct ipv6hdr *)iph_check;
1174
1175                         if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1176                             ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1177                                 continue;
1178
1179                         seglen = ntohs(ipv6h_check->payload_len);
1180                 } else {
1181                         WARN_ON(1);  /* shouldn't happen */
1182                         continue;
1183                 }
1184
1185                 /* If the ECE/CWR flags changed from the previous eligible
1186                  * packet in the same flow, we should no longer be dropping that
1187                  * previous packet as this would lose information.
1188                  */
1189                 if (elig_ack && (tcp_flag_word(tcph_check) &
1190                                  (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1191                         elig_ack = NULL;
1192                         elig_ack_prev = NULL;
1193                         num_found--;
1194                 }
1195
1196                 /* Check TCP options and flags, don't drop ACKs with segment
1197                  * data, and don't drop ACKs with a higher cumulative ACK
1198                  * counter than the triggering packet. Check ACK seqno here to
1199                  * avoid parsing SACK options of packets we are going to exclude
1200                  * anyway.
1201                  */
1202                 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1203                     (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1204                     after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1205                         continue;
1206
1207                 /* Check SACK options. The triggering packet must SACK more data
1208                  * than the ACK under consideration, or SACK the same range but
1209                  * have a larger cumulative ACK counter. The latter is a
1210                  * pathological case, but is contained in the following check
1211                  * anyway, just to be safe.
1212                  */
1213                 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1214
1215                 if (sack_comp < 0 ||
1216                     (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1217                      sack_comp == 0))
1218                         continue;
1219
1220                 /* At this point we have found an eligible pure ACK to drop; if
1221                  * we are in aggressive mode, we are done. Otherwise, keep
1222                  * searching unless this is the second eligible ACK we
1223                  * found.
1224                  *
1225                  * Since we want to drop ACK closest to the head of the queue,
1226                  * save the first eligible ACK we find, even if we need to loop
1227                  * again.
1228                  */
1229                 if (!elig_ack) {
1230                         elig_ack = skb_check;
1231                         elig_ack_prev = skb_prev;
1232                         elig_flags = (tcp_flag_word(tcph_check)
1233                                       & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1234                 }
1235
1236                 if (num_found++ > 0)
1237                         goto found;
1238         }
1239
1240         /* We made it through the queue without finding two eligible ACKs. If
1241          * we found a single eligible ACK we can drop it in aggressive mode if
1242          * we can guarantee that this does not interfere with ECN flag
1243          * information. We ensure this by dropping it only if the enqueued
1244          * packet is consecutive with the eligible ACK, and their flags match.
1245          */
1246         if (elig_ack && aggressive && elig_ack->next == skb &&
1247             (elig_flags == (tcp_flag_word(tcph) &
1248                             (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1249                 goto found;
1250
1251         return NULL;
1252
1253 found:
1254         if (elig_ack_prev)
1255                 elig_ack_prev->next = elig_ack->next;
1256         else
1257                 flow->head = elig_ack->next;
1258
1259         elig_ack->next = NULL;
1260
1261         return elig_ack;
1262 }
1263
1264 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1265 {
1266         avg -= avg >> shift;
1267         avg += sample >> shift;
1268         return avg;
1269 }
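/* cake_ewma() keeps an exponentially weighted moving average without a
 * division: with shift = 8 each new sample contributes 1/256 of its value
 * while the existing average decays by the same fraction, e.g.
 * avg = 1000, sample = 2000 -> 1000 - 3 + 7 = 1004 (integer arithmetic).
 */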
1270
1271 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1272 {
1273         if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1274                 len -= off;
1275
1276         if (q->max_netlen < len)
1277                 q->max_netlen = len;
1278         if (q->min_netlen > len)
1279                 q->min_netlen = len;
1280
1281         len += q->rate_overhead;
1282
1283         if (len < q->rate_mpu)
1284                 len = q->rate_mpu;
1285
1286         if (q->atm_mode == CAKE_ATM_ATM) {
1287                 len += 47;
1288                 len /= 48;
1289                 len *= 53;
1290         } else if (q->atm_mode == CAKE_ATM_PTM) {
1291                 /* Add one byte per 64 bytes or part thereof.
1292                  * This is conservative and easier to calculate than the
1293                  * precise value.
1294                  */
1295                 len += (len + 63) / 64;
1296         }
1297
1298         if (q->max_adjlen < len)
1299                 q->max_adjlen = len;
1300         if (q->min_adjlen > len)
1301                 q->min_adjlen = len;
1302
1303         return len;
1304 }
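/* Worked example for the ATM branch above: a 1518-byte frame (after the
 * configured overhead) occupies ceil(1518 / 48) = 32 ATM cells and is
 * therefore accounted as 32 * 53 = 1696 bytes on the wire.  In PTM mode a
 * 1500-byte packet is charged an extra (1500 + 63) / 64 = 24 bytes for the
 * 64b/65b encoding.
 */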
1305
1306 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1307 {
1308         const struct skb_shared_info *shinfo = skb_shinfo(skb);
1309         unsigned int hdr_len, last_len = 0;
1310         u32 off = skb_network_offset(skb);
1311         u32 len = qdisc_pkt_len(skb);
1312         u16 segs = 1;
1313
1314         q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1315
1316         if (!shinfo->gso_size)
1317                 return cake_calc_overhead(q, len, off);
1318
1319         /* borrowed from qdisc_pkt_len_init() */
1320         hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1321
1322         /* + transport layer */
1323         if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1324                                                 SKB_GSO_TCPV6))) {
1325                 const struct tcphdr *th;
1326                 struct tcphdr _tcphdr;
1327
1328                 th = skb_header_pointer(skb, skb_transport_offset(skb),
1329                                         sizeof(_tcphdr), &_tcphdr);
1330                 if (likely(th))
1331                         hdr_len += __tcp_hdrlen(th);
1332         } else {
1333                 struct udphdr _udphdr;
1334
1335                 if (skb_header_pointer(skb, skb_transport_offset(skb),
1336                                        sizeof(_udphdr), &_udphdr))
1337                         hdr_len += sizeof(struct udphdr);
1338         }
1339
1340         if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1341                 segs = DIV_ROUND_UP(skb->len - hdr_len,
1342                                     shinfo->gso_size);
1343         else
1344                 segs = shinfo->gso_segs;
1345
1346         len = shinfo->gso_size + hdr_len;
1347         last_len = skb->len - shinfo->gso_size * (segs - 1);
1348
1349         return (cake_calc_overhead(q, len, off) * (segs - 1) +
1350                 cake_calc_overhead(q, last_len, off));
1351 }
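/* GSO example for the code above (illustrative numbers): a super-packet
 * with gso_size = 1448, hdr_len = 66 and three segments, the last carrying
 * 500 payload bytes, has skb->len = 66 + 1448 + 1448 + 500 = 3462.  It is
 * charged as two full segments of 1448 + 66 = 1514 bytes plus one last
 * segment of last_len = 3462 - 2 * 1448 = 566 bytes, each run through
 * cake_calc_overhead() separately.
 */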
1352
1353 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1354 {
1355         struct cake_heap_entry ii = q->overflow_heap[i];
1356         struct cake_heap_entry jj = q->overflow_heap[j];
1357
1358         q->overflow_heap[i] = jj;
1359         q->overflow_heap[j] = ii;
1360
1361         q->tins[ii.t].overflow_idx[ii.b] = j;
1362         q->tins[jj.t].overflow_idx[jj.b] = i;
1363 }
1364
1365 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1366 {
1367         struct cake_heap_entry ii = q->overflow_heap[i];
1368
1369         return q->tins[ii.t].backlogs[ii.b];
1370 }
1371
1372 static void cake_heapify(struct cake_sched_data *q, u16 i)
1373 {
1374         static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1375         u32 mb = cake_heap_get_backlog(q, i);
1376         u32 m = i;
1377
1378         while (m < a) {
1379                 u32 l = m + m + 1;
1380                 u32 r = l + 1;
1381
1382                 if (l < a) {
1383                         u32 lb = cake_heap_get_backlog(q, l);
1384
1385                         if (lb > mb) {
1386                                 m  = l;
1387                                 mb = lb;
1388                         }
1389                 }
1390
1391                 if (r < a) {
1392                         u32 rb = cake_heap_get_backlog(q, r);
1393
1394                         if (rb > mb) {
1395                                 m  = r;
1396                                 mb = rb;
1397                         }
1398                 }
1399
1400                 if (m != i) {
1401                         cake_heap_swap(q, i, m);
1402                         i = m;
1403                 } else {
1404                         break;
1405                 }
1406         }
1407 }
1408
1409 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1410 {
1411         while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1412                 u16 p = (i - 1) >> 1;
1413                 u32 ib = cake_heap_get_backlog(q, i);
1414                 u32 pb = cake_heap_get_backlog(q, p);
1415
1416                 if (ib > pb) {
1417                         cake_heap_swap(q, i, p);
1418                         i = p;
1419                 } else {
1420                         break;
1421                 }
1422         }
1423 }
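/* The overflow heap is a classic binary max-heap laid out in an array:
 * the children of entry i live at 2i+1 and 2i+2, each entry names a
 * (tin, queue) pair, and the ordering key is that queue's backlog.  The
 * root therefore always points at the fattest queue, which is the one
 * cake_drop() prunes when the buffer limit is exceeded.
 */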
1424
1425 static int cake_advance_shaper(struct cake_sched_data *q,
1426                                struct cake_tin_data *b,
1427                                struct sk_buff *skb,
1428                                ktime_t now, bool drop)
1429 {
1430         u32 len = get_cobalt_cb(skb)->adjusted_len;
1431
1432         /* charge packet bandwidth to this tin
1433          * and to the global shaper.
1434          */
1435         if (q->rate_ns) {
1436                 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1437                 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1438                 u64 failsafe_dur = global_dur + (global_dur >> 1);
1439
1440                 if (ktime_before(b->time_next_packet, now))
1441                         b->time_next_packet = ktime_add_ns(b->time_next_packet,
1442                                                            tin_dur);
1443
1444                 else if (ktime_before(b->time_next_packet,
1445                                       ktime_add_ns(now, tin_dur)))
1446                         b->time_next_packet = ktime_add_ns(now, tin_dur);
1447
1448                 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1449                                                    global_dur);
1450                 if (!drop)
1451                         q->failsafe_next_packet = \
1452                                 ktime_add_ns(q->failsafe_next_packet,
1453                                              failsafe_dur);
1454         }
1455         return len;
1456 }
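/* Shaper arithmetic above, with illustrative numbers: rate_ns holds
 * nanoseconds-per-byte in fixed point (scaled by 2^rate_shft), so a packet's
 * transmission time is len * rate_ns >> rate_shft.  At roughly 100 Mbit/s
 * (about 80 ns per byte) a 1514-byte packet charges ~121 us to both the tin
 * and the global shaper; for delivered packets the failsafe deadline
 * advances by half as much again (~182 us), acting as a safety valve if the
 * main schedule ever runs far ahead of real time.
 */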
1457
1458 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1459 {
1460         struct cake_sched_data *q = qdisc_priv(sch);
1461         ktime_t now = ktime_get();
1462         u32 idx = 0, tin = 0, len;
1463         struct cake_heap_entry qq;
1464         struct cake_tin_data *b;
1465         struct cake_flow *flow;
1466         struct sk_buff *skb;
1467
1468         if (!q->overflow_timeout) {
1469                 int i;
1470                 /* Build fresh max-heap */
1471                 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1472                         cake_heapify(q, i);
1473         }
1474         q->overflow_timeout = 65535;
1475
1476         /* select longest queue for pruning */
1477         qq  = q->overflow_heap[0];
1478         tin = qq.t;
1479         idx = qq.b;
1480
1481         b = &q->tins[tin];
1482         flow = &b->flows[idx];
1483         skb = dequeue_head(flow);
1484         if (unlikely(!skb)) {
1485                 /* heap has gone wrong, rebuild it next time */
1486                 q->overflow_timeout = 0;
1487                 return idx + (tin << 16);
1488         }
1489
1490         if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1491                 b->unresponsive_flow_count++;
1492
1493         len = qdisc_pkt_len(skb);
1494         q->buffer_used      -= skb->truesize;
1495         b->backlogs[idx]    -= len;
1496         b->tin_backlog      -= len;
1497         sch->qstats.backlog -= len;
1498         qdisc_tree_reduce_backlog(sch, 1, len);
1499
1500         flow->dropped++;
1501         b->tin_dropped++;
1502         sch->qstats.drops++;
1503
1504         if (q->rate_flags & CAKE_FLAG_INGRESS)
1505                 cake_advance_shaper(q, b, skb, now, true);
1506
1507         __qdisc_drop(skb, to_free);
1508         sch->q.qlen--;
1509
1510         cake_heapify(q, 0);
1511
1512         return idx + (tin << 16);
1513 }
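/* cake_drop() above reports which queue it pruned by packing the flow index
 * into the low 16 bits of the return value and the tin into the high 16
 * bits (idx + (tin << 16)).
 */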
1514
1515 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1516 {
1517         const int offset = skb_network_offset(skb);
1518         u16 *buf, buf_;
1519         u8 dscp;
1520
1521         switch (skb_protocol(skb, true)) {
1522         case htons(ETH_P_IP):
1523                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1524                 if (unlikely(!buf))
1525                         return 0;
1526
1527                 /* ToS is in the second byte of iphdr */
1528                 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1529
1530                 if (wash && dscp) {
1531                         const int wlen = offset + sizeof(struct iphdr);
1532
1533                         if (!pskb_may_pull(skb, wlen) ||
1534                             skb_try_make_writable(skb, wlen))
1535                                 return 0;
1536
1537                         ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1538                 }
1539
1540                 return dscp;
1541
1542         case htons(ETH_P_IPV6):
1543                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1544                 if (unlikely(!buf))
1545                         return 0;
1546
1547                 /* Traffic class is in the first and second bytes of ipv6hdr */
1548                 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1549
1550                 if (wash && dscp) {
1551                         const int wlen = offset + sizeof(struct ipv6hdr);
1552
1553                         if (!pskb_may_pull(skb, wlen) ||
1554                             skb_try_make_writable(skb, wlen))
1555                                 return 0;
1556
1557                         ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1558                 }
1559
1560                 return dscp;
1561
1562         case htons(ETH_P_ARP):
1563                 return 0x38;  /* CS7 - Net Control */
1564
1565         default:
1566                 /* If there is no Diffserv field, treat as best-effort */
1567                 return 0;
1568         }
1569 }
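/* Example of the field handling above (values shown for illustration): the
 * IPv4 ToS / IPv6 traffic-class byte carries the DSCP in its upper six bits
 * and ECN in the lower two, hence the ">> 2".  A dsfield of 0xb8 (EF,
 * Not-ECT) therefore yields DSCP 46 (0x2e).  When washing, only the DSCP
 * bits are cleared: the INET_ECN_MASK argument tells
 * ipv4/ipv6_change_dsfield() which bits to preserve, so end-to-end ECN
 * signalling survives.
 */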
1570
1571 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1572                                              struct sk_buff *skb)
1573 {
1574         struct cake_sched_data *q = qdisc_priv(sch);
1575         u32 tin;
1576         bool wash;
1577         u8 dscp;
1578
1579         /* Tin selection: default to diffserv-based selection, but allow overriding
1580          * via firewall marks or skb->priority.  Parse the DSCP field early when
1581          * wash is enabled; otherwise defer it until it is actually needed.
1582          */
1583         wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1584         if (wash)
1585                 dscp = cake_handle_diffserv(skb, wash);
1586
1587         if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1588                 tin = 0;
1589
1590         else if (TC_H_MAJ(skb->priority) == sch->handle &&
1591                  TC_H_MIN(skb->priority) > 0 &&
1592                  TC_H_MIN(skb->priority) <= q->tin_cnt)
1593                 tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1594
1595         else {
1596                 if (!wash)
1597                         dscp = cake_handle_diffserv(skb, wash);
1598                 tin = q->tin_index[dscp];
1599
1600                 if (unlikely(tin >= q->tin_cnt))
1601                         tin = 0;
1602         }
1603
1604         return &q->tins[tin];
1605 }
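/* Selection order above: besteffort mode forces tin 0; otherwise an
 * skb->priority whose major number matches this qdisc's handle picks a tin
 * explicitly (1-based minor, remapped through tin_order); failing that, the
 * DSCP-to-tin table decides.  For illustration only: a privileged socket
 * setting SO_PRIORITY to this qdisc's handle with minor 1 would land in the
 * first tin of the configured ordering regardless of its DSCP marking.
 */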
1606
1607 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1608                          struct sk_buff *skb, int flow_mode, int *qerr)
1609 {
1610         struct cake_sched_data *q = qdisc_priv(sch);
1611         struct tcf_proto *filter;
1612         struct tcf_result res;
1613         u16 flow = 0, host = 0;
1614         int result;
1615
1616         filter = rcu_dereference_bh(q->filter_list);
1617         if (!filter)
1618                 goto hash;
1619
1620         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1621         result = tcf_classify(skb, filter, &res, false);
1622
1623         if (result >= 0) {
1624 #ifdef CONFIG_NET_CLS_ACT
1625                 switch (result) {
1626                 case TC_ACT_STOLEN:
1627                 case TC_ACT_QUEUED:
1628                 case TC_ACT_TRAP:
1629                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1630                         /* fall through */
1631                 case TC_ACT_SHOT:
1632                         return 0;
1633                 }
1634 #endif
1635                 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1636                         flow = TC_H_MIN(res.classid);
1637                 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1638                         host = TC_H_MAJ(res.classid) >> 16;
1639         }
1640 hash:
1641         *t = cake_select_tin(sch, skb);
1642         return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1643 }
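/* cake_classify() above returns the selected queue index plus one, so zero
 * can signal a drop or a packet stolen by a tc filter.  An attached filter
 * may also steer the packet: the minor part of the returned classid selects
 * a flow queue and the major part (shifted down by 16) appears to select a
 * host bucket for the set-associative hash.
 */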
1644
1645 static void cake_reconfigure(struct Qdisc *sch);
1646
1647 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1648                         struct sk_buff **to_free)
1649 {
1650         struct cake_sched_data *q = qdisc_priv(sch);
1651         int len = qdisc_pkt_len(skb);
1652         int uninitialized_var(ret);
1653         struct sk_buff *ack = NULL;
1654         ktime_t now = ktime_get();
1655         struct cake_tin_data *b;
1656         struct cake_flow *flow;
1657         u32 idx;
1658
1659         /* choose flow to insert into */
1660         idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1661         if (idx == 0) {
1662                 if (ret & __NET_XMIT_BYPASS)
1663                         qdisc_qstats_drop(sch);
1664                 __qdisc_drop(skb, to_free);
1665                 return ret;
1666         }
1667         idx--;
1668         flow = &b->flows[idx];
1669         ret = NET_XMIT_SUCCESS;
1670
1671         /* ensure shaper state isn't stale */
1672         if (!b->tin_backlog) {
1673                 if (ktime_before(b->time_next_packet, now))
1674                         b->time_next_packet = now;
1675
1676                 if (!sch->q.qlen) {
1677                         if (ktime_before(q->time_next_packet, now)) {
1678                                 q->failsafe_next_packet = now;
1679                                 q->time_next_packet = now;
1680                         } else if (ktime_after(q->time_next_packet, now) &&
1681                                    ktime_after(q->failsafe_next_packet, now)) {
1682                                 u64 next = \
1683                                         min(ktime_to_ns(q->time_next_packet),
1684                                             ktime_to_ns(
1685                                                    q->failsafe_next_packet));
1686                                 sch->qstats.overlimits++;
1687                                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1688                         }
1689                 }
1690         }
1691
1692         if (unlikely(len > b->max_skblen))
1693                 b->max_skblen = len;
1694
1695         if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1696                 struct sk_buff *segs, *nskb;
1697                 netdev_features_t features = netif_skb_features(skb);
1698                 unsigned int slen = 0, numsegs = 0;
1699
1700                 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1701                 if (IS_ERR_OR_NULL(segs))
1702                         return qdisc_drop(skb, sch, to_free);
1703
1704                 while (segs) {
1705                         nskb = segs->next;
1706                         segs->next = NULL;
1707                         qdisc_skb_cb(segs)->pkt_len = segs->len;
1708                         cobalt_set_enqueue_time(segs, now);
1709                         get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1710                                                                           segs);
1711                         flow_queue_add(flow, segs);
1712
1713                         sch->q.qlen++;
1714                         numsegs++;
1715                         slen += segs->len;
1716                         q->buffer_used += segs->truesize;
1717                         b->packets++;
1718                         segs = nskb;
1719                 }
1720
1721                 /* stats */
1722                 b->bytes            += slen;
1723                 b->backlogs[idx]    += slen;
1724                 b->tin_backlog      += slen;
1725                 sch->qstats.backlog += slen;
1726                 q->avg_window_bytes += slen;
1727
1728                 qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
1729                 consume_skb(skb);
1730                 ret |= __NET_XMIT_STOLEN;
1731         } else {
1732                 /* not splitting */
1733                 cobalt_set_enqueue_time(skb, now);
1734                 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1735                 flow_queue_add(flow, skb);
1736
1737                 if (q->ack_filter)
1738                         ack = cake_ack_filter(q, flow);
1739
1740                 if (ack) {
1741                         b->ack_drops++;
1742                         sch->qstats.drops++;
1743                         b->bytes += qdisc_pkt_len(ack);
1744                         len -= qdisc_pkt_len(ack);
1745                         q->buffer_used += skb->truesize - ack->truesize;
1746                         if (q->rate_flags & CAKE_FLAG_INGRESS)
1747                                 cake_advance_shaper(q, b, ack, now, true);
1748
1749                         qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1750                         consume_skb(ack);
1751                 } else {
1752                         sch->q.qlen++;
1753                         q->buffer_used      += skb->truesize;
1754                 }
1755
1756                 /* stats */
1757                 b->packets++;
1758                 b->bytes            += len;
1759                 b->backlogs[idx]    += len;
1760                 b->tin_backlog      += len;
1761                 sch->qstats.backlog += len;
1762                 q->avg_window_bytes += len;
1763         }
1764
1765         if (q->overflow_timeout)
1766                 cake_heapify_up(q, b->overflow_idx[idx]);
1767
1768         /* incoming bandwidth capacity estimate */
1769         if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1770                 u64 packet_interval = \
1771                         ktime_to_ns(ktime_sub(now, q->last_packet_time));
1772
1773                 if (packet_interval > NSEC_PER_SEC)
1774                         packet_interval = NSEC_PER_SEC;
1775
1776                 /* filter out short-term bursts, eg. wifi aggregation */
1777                 q->avg_packet_interval = \
1778                         cake_ewma(q->avg_packet_interval,
1779                                   packet_interval,
1780                                   (packet_interval > q->avg_packet_interval ?
1781                                           2 : 8));
1782
1783                 q->last_packet_time = now;
1784
1785                 if (packet_interval > q->avg_packet_interval) {
1786                         u64 window_interval = \
1787                                 ktime_to_ns(ktime_sub(now,
1788                                                       q->avg_window_begin));
1789                         u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1790
1791                         b = div64_u64(b, window_interval);
1792                         q->avg_peak_bandwidth =
1793                                 cake_ewma(q->avg_peak_bandwidth, b,
1794                                           b > q->avg_peak_bandwidth ? 2 : 8);
1795                         q->avg_window_bytes = 0;
1796                         q->avg_window_begin = now;
1797
1798                         if (ktime_after(now,
1799                                         ktime_add_ms(q->last_reconfig_time,
1800                                                      250))) {
1801                                 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1802                                 cake_reconfigure(sch);
1803                         }
1804                 }
1805         } else {
1806                 q->avg_window_bytes = 0;
1807                 q->last_packet_time = now;
1808         }
1809
1810         /* flowchain */
1811         if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1812                 struct cake_host *srchost = &b->hosts[flow->srchost];
1813                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1814                 u16 host_load = 1;
1815
1816                 if (!flow->set) {
1817                         list_add_tail(&flow->flowchain, &b->new_flows);
1818                 } else {
1819                         b->decaying_flow_count--;
1820                         list_move_tail(&flow->flowchain, &b->new_flows);
1821                 }
1822                 flow->set = CAKE_SET_SPARSE;
1823                 b->sparse_flow_count++;
1824
1825                 if (cake_dsrc(q->flow_mode))
1826                         host_load = max(host_load, srchost->srchost_refcnt);
1827
1828                 if (cake_ddst(q->flow_mode))
1829                         host_load = max(host_load, dsthost->dsthost_refcnt);
1830
1831                 flow->deficit = (b->flow_quantum *
1832                                  quantum_div[host_load]) >> 16;
1833         } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1834                 /* This flow was empty, accounted as a sparse flow, but is
1835                  * actually in the bulk rotation.
1836                  */
1837                 flow->set = CAKE_SET_BULK;
1838                 b->sparse_flow_count--;
1839                 b->bulk_flow_count++;
1840         }
1841
1842         if (q->buffer_used > q->buffer_max_used)
1843                 q->buffer_max_used = q->buffer_used;
1844
1845         if (q->buffer_used > q->buffer_limit) {
1846                 u32 dropped = 0;
1847
1848                 while (q->buffer_used > q->buffer_limit) {
1849                         dropped++;
1850                         cake_drop(sch, to_free);
1851                 }
1852                 b->drop_overlimit += dropped;
1853         }
1854         return ret;
1855 }
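/* A note on the autorate-ingress path in cake_enqueue() above: packet
 * inter-arrival times feed an EWMA that uses the smaller constant (2 rather
 * than 8) when a sample exceeds the running average, which - per the
 * in-line comment - filters out short wifi-style aggregation bursts.
 * Whenever an interval does exceed the average, the bytes counted since
 * avg_window_begin become a bytes-per-second sample for avg_peak_bandwidth,
 * and no more often than every 250 ms (per the last_reconfig_time check)
 * the shaper rate is reset to 15/16 of that peak estimate via
 * cake_reconfigure().
 */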
1856
1857 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1858 {
1859         struct cake_sched_data *q = qdisc_priv(sch);
1860         struct cake_tin_data *b = &q->tins[q->cur_tin];
1861         struct cake_flow *flow = &b->flows[q->cur_flow];
1862         struct sk_buff *skb = NULL;
1863         u32 len;
1864
1865         if (flow->head) {
1866                 skb = dequeue_head(flow);
1867                 len = qdisc_pkt_len(skb);
1868                 b->backlogs[q->cur_flow] -= len;
1869                 b->tin_backlog           -= len;
1870                 sch->qstats.backlog      -= len;
1871                 q->buffer_used           -= skb->truesize;
1872                 sch->q.qlen--;
1873
1874                 if (q->overflow_timeout)
1875                         cake_heapify(q, b->overflow_idx[q->cur_flow]);
1876         }
1877         return skb;
1878 }
1879
1880 /* Discard leftover packets from a tin no longer in use. */
1881 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1882 {
1883         struct cake_sched_data *q = qdisc_priv(sch);
1884         struct sk_buff *skb;
1885
1886         q->cur_tin = tin;
1887         for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1888                 while (!!(skb = cake_dequeue_one(sch)))
1889                         kfree_skb(skb);
1890 }
1891
1892 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1893 {
1894         struct cake_sched_data *q = qdisc_priv(sch);
1895         struct cake_tin_data *b = &q->tins[q->cur_tin];
1896         struct cake_host *srchost, *dsthost;
1897         ktime_t now = ktime_get();
1898         struct cake_flow *flow;
1899         struct list_head *head;
1900         bool first_flow = true;
1901         struct sk_buff *skb;
1902         u16 host_load;
1903         u64 delay;
1904         u32 len;
1905
1906 begin:
1907         if (!sch->q.qlen)
1908                 return NULL;
1909
1910         /* global hard shaper */
1911         if (ktime_after(q->time_next_packet, now) &&
1912             ktime_after(q->failsafe_next_packet, now)) {
1913                 u64 next = min(ktime_to_ns(q->time_next_packet),
1914                                ktime_to_ns(q->failsafe_next_packet));
1915
1916                 sch->qstats.overlimits++;
1917                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1918                 return NULL;
1919         }
1920
1921         /* Choose a class to work on. */
1922         if (!q->rate_ns) {
1923                 /* In unlimited mode, can't rely on shaper timings, just balance
1924                  * with DRR
1925                  */
1926                 bool wrapped = false, empty = true;
1927
1928                 while (b->tin_deficit < 0 ||
1929                        !(b->sparse_flow_count + b->bulk_flow_count)) {
1930                         if (b->tin_deficit <= 0)
1931                                 b->tin_deficit += b->tin_quantum_band;
1932                         if (b->sparse_flow_count + b->bulk_flow_count)
1933                                 empty = false;
1934
1935                         q->cur_tin++;
1936                         b++;
1937                         if (q->cur_tin >= q->tin_cnt) {
1938                                 q->cur_tin = 0;
1939                                 b = q->tins;
1940
1941                                 if (wrapped) {
1942                                         /* It's possible for q->qlen to be
1943                                          * nonzero when we actually have no
1944                                          * packets anywhere.
1945                                          */
1946                                         if (empty)
1947                                                 return NULL;
1948                                 } else {
1949                                         wrapped = true;
1950                                 }
1951                         }
1952                 }
1953         } else {
1954                 /* In shaped mode, choose:
1955                  * - Highest-priority tin with queue and meeting schedule, or
1956                  * - The earliest-scheduled tin with queue.
1957                  */
1958                 ktime_t best_time = KTIME_MAX;
1959                 int tin, best_tin = 0;
1960
1961                 for (tin = 0; tin < q->tin_cnt; tin++) {
1962                         b = q->tins + tin;
1963                         if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1964                                 ktime_t time_to_pkt = \
1965                                         ktime_sub(b->time_next_packet, now);
1966
1967                                 if (ktime_to_ns(time_to_pkt) <= 0 ||
1968                                     ktime_compare(time_to_pkt,
1969                                                   best_time) <= 0) {
1970                                         best_time = time_to_pkt;
1971                                         best_tin = tin;
1972                                 }
1973                         }
1974                 }
1975
1976                 q->cur_tin = best_tin;
1977                 b = q->tins + best_tin;
1978
1979                 /* No point in going further if no packets to deliver. */
1980                 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1981                         return NULL;
1982         }
1983
1984 retry:
1985         /* service this class */
1986         head = &b->decaying_flows;
1987         if (!first_flow || list_empty(head)) {
1988                 head = &b->new_flows;
1989                 if (list_empty(head)) {
1990                         head = &b->old_flows;
1991                         if (unlikely(list_empty(head))) {
1992                                 head = &b->decaying_flows;
1993                                 if (unlikely(list_empty(head)))
1994                                         goto begin;
1995                         }
1996                 }
1997         }
1998         flow = list_first_entry(head, struct cake_flow, flowchain);
1999         q->cur_flow = flow - b->flows;
2000         first_flow = false;
2001
2002         /* triple isolation (modified DRR++) */
2003         srchost = &b->hosts[flow->srchost];
2004         dsthost = &b->hosts[flow->dsthost];
2005         host_load = 1;
2006
2007         if (cake_dsrc(q->flow_mode))
2008                 host_load = max(host_load, srchost->srchost_refcnt);
2009
2010         if (cake_ddst(q->flow_mode))
2011                 host_load = max(host_load, dsthost->dsthost_refcnt);
2012
2013         WARN_ON(host_load > CAKE_QUEUES);
2014
2015         /* flow isolation (DRR++) */
2016         if (flow->deficit <= 0) {
2017                 /* The shifted prandom_u32() is a way to apply dithering to
2018                  * avoid accumulating roundoff errors
2019                  */
2020                 flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2021                                   (prandom_u32() >> 16)) >> 16;
2022                 list_move_tail(&flow->flowchain, &b->old_flows);
2023
2024                 /* Keep all flows with deficits out of the sparse and decaying
2025                  * rotations.  No non-empty flow can go into the decaying
2026                  * rotation, so they can't get deficits
2027                  */
2028                 if (flow->set == CAKE_SET_SPARSE) {
2029                         if (flow->head) {
2030                                 b->sparse_flow_count--;
2031                                 b->bulk_flow_count++;
2032                                 flow->set = CAKE_SET_BULK;
2033                         } else {
2034                                 /* we've moved it to the bulk rotation for
2035                                  * correct deficit accounting but we still want
2036                                  * to count it as a sparse flow, not a bulk one.
2037                                  */
2038                                 flow->set = CAKE_SET_SPARSE_WAIT;
2039                         }
2040                 }
2041                 goto retry;
2042         }
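        /* Illustrative numbers for the deficit refill above: quantum_div[h]
         * is initialised in cake_init() as 65535 / h, so
         * flow_quantum * quantum_div[host_load] >> 16 is roughly
         * flow_quantum / host_load without a division in the fast path.
         * With flow_quantum 1514 and host_load 3 the refill is about 504
         * bytes, and the 16-bit random dither keeps the truncation from
         * systematically favouring any one flow.
         */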
2043
2044         /* Retrieve a packet via the AQM */
2045         while (1) {
2046                 skb = cake_dequeue_one(sch);
2047                 if (!skb) {
2048                         /* this queue was actually empty */
2049                         if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2050                                 b->unresponsive_flow_count--;
2051
2052                         if (flow->cvars.p_drop || flow->cvars.count ||
2053                             ktime_before(now, flow->cvars.drop_next)) {
2054                                 /* keep in the flowchain until the state has
2055                                  * decayed to rest
2056                                  */
2057                                 list_move_tail(&flow->flowchain,
2058                                                &b->decaying_flows);
2059                                 if (flow->set == CAKE_SET_BULK) {
2060                                         b->bulk_flow_count--;
2061                                         b->decaying_flow_count++;
2062                                 } else if (flow->set == CAKE_SET_SPARSE ||
2063                                            flow->set == CAKE_SET_SPARSE_WAIT) {
2064                                         b->sparse_flow_count--;
2065                                         b->decaying_flow_count++;
2066                                 }
2067                                 flow->set = CAKE_SET_DECAYING;
2068                         } else {
2069                                 /* remove empty queue from the flowchain */
2070                                 list_del_init(&flow->flowchain);
2071                                 if (flow->set == CAKE_SET_SPARSE ||
2072                                     flow->set == CAKE_SET_SPARSE_WAIT)
2073                                         b->sparse_flow_count--;
2074                                 else if (flow->set == CAKE_SET_BULK)
2075                                         b->bulk_flow_count--;
2076                                 else
2077                                         b->decaying_flow_count--;
2078
2079                                 flow->set = CAKE_SET_NONE;
2080                                 srchost->srchost_refcnt--;
2081                                 dsthost->dsthost_refcnt--;
2082                         }
2083                         goto begin;
2084                 }
2085
2086                 /* Last packet in queue may be marked, shouldn't be dropped */
2087                 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2088                                         (b->bulk_flow_count *
2089                                          !!(q->rate_flags &
2090                                             CAKE_FLAG_INGRESS))) ||
2091                     !flow->head)
2092                         break;
2093
2094                 /* drop this packet, get another one */
2095                 if (q->rate_flags & CAKE_FLAG_INGRESS) {
2096                         len = cake_advance_shaper(q, b, skb,
2097                                                   now, true);
2098                         flow->deficit -= len;
2099                         b->tin_deficit -= len;
2100                 }
2101                 flow->dropped++;
2102                 b->tin_dropped++;
2103                 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2104                 qdisc_qstats_drop(sch);
2105                 kfree_skb(skb);
2106                 if (q->rate_flags & CAKE_FLAG_INGRESS)
2107                         goto retry;
2108         }
2109
2110         b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2111         qdisc_bstats_update(sch, skb);
2112
2113         /* collect delay stats */
2114         delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2115         b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2116         b->peak_delay = cake_ewma(b->peak_delay, delay,
2117                                   delay > b->peak_delay ? 2 : 8);
2118         b->base_delay = cake_ewma(b->base_delay, delay,
2119                                   delay < b->base_delay ? 2 : 8);
2120
2121         len = cake_advance_shaper(q, b, skb, now, false);
2122         flow->deficit -= len;
2123         b->tin_deficit -= len;
2124
2125         if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2126                 u64 next = min(ktime_to_ns(q->time_next_packet),
2127                                ktime_to_ns(q->failsafe_next_packet));
2128
2129                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2130         } else if (!sch->q.qlen) {
2131                 int i;
2132
2133                 for (i = 0; i < q->tin_cnt; i++) {
2134                         if (q->tins[i].decaying_flow_count) {
2135                                 ktime_t next = \
2136                                         ktime_add_ns(now,
2137                                                      q->tins[i].cparams.target);
2138
2139                                 qdisc_watchdog_schedule_ns(&q->watchdog,
2140                                                            ktime_to_ns(next));
2141                                 break;
2142                         }
2143                 }
2144         }
2145
2146         if (q->overflow_timeout)
2147                 q->overflow_timeout--;
2148
2149         return skb;
2150 }
2151
2152 static void cake_reset(struct Qdisc *sch)
2153 {
2154         u32 c;
2155
2156         for (c = 0; c < CAKE_MAX_TINS; c++)
2157                 cake_clear_tin(sch, c);
2158 }
2159
2160 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2161         [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2162         [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2163         [TCA_CAKE_ATM]           = { .type = NLA_U32 },
2164         [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2165         [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2166         [TCA_CAKE_RTT]           = { .type = NLA_U32 },
2167         [TCA_CAKE_TARGET]        = { .type = NLA_U32 },
2168         [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2169         [TCA_CAKE_MEMORY]        = { .type = NLA_U32 },
2170         [TCA_CAKE_NAT]           = { .type = NLA_U32 },
2171         [TCA_CAKE_RAW]           = { .type = NLA_U32 },
2172         [TCA_CAKE_WASH]          = { .type = NLA_U32 },
2173         [TCA_CAKE_MPU]           = { .type = NLA_U32 },
2174         [TCA_CAKE_INGRESS]       = { .type = NLA_U32 },
2175         [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
2176 };
2177
2178 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2179                           u64 target_ns, u64 rtt_est_ns)
2180 {
2181         /* convert byte-rate into time-per-byte
2182          * so it will always unwedge in reasonable time.
2183          */
2184         static const u64 MIN_RATE = 64;
2185         u32 byte_target = mtu;
2186         u64 byte_target_ns;
2187         u8  rate_shft = 0;
2188         u64 rate_ns = 0;
2189
2190         b->flow_quantum = 1514;
2191         if (rate) {
2192                 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2193                 rate_shft = 34;
2194                 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2195                 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2196                 while (!!(rate_ns >> 34)) {
2197                         rate_ns >>= 1;
2198                         rate_shft--;
2199                 }
2200         } /* else unlimited, ie. zero delay */
2201
2202         b->tin_rate_bps  = rate;
2203         b->tin_rate_ns   = rate_ns;
2204         b->tin_rate_shft = rate_shft;
2205
2206         byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2207
2208         b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2209         b->cparams.interval = max(rtt_est_ns +
2210                                      b->cparams.target - target_ns,
2211                                      b->cparams.target * 2);
2212         b->cparams.mtu_time = byte_target_ns;
2213         b->cparams.p_inc = 1 << 24; /* 1/256 */
2214         b->cparams.p_dec = 1 << 20; /* 1/4096 */
2215 }
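/* Worked example for cake_set_rate() above (illustrative rate): at
 * 2.5 MB/s (20 Mbit/s) the loop settles on roughly 400 ns per byte of
 * shaper time, so an MTU-sized packet of ~1514 bytes costs about 600 us,
 * which also becomes mtu_time.  Since 1.5 * mtu_time is still below the
 * 5 ms default target, the target is left alone; at much lower rates, for
 * example 1 Mbit/s, mtu_time grows to ~12 ms and both target and interval
 * scale up with it, which is the low-bandwidth auto-tuning the AQM relies
 * on.
 */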
2216
2217 static int cake_config_besteffort(struct Qdisc *sch)
2218 {
2219         struct cake_sched_data *q = qdisc_priv(sch);
2220         struct cake_tin_data *b = &q->tins[0];
2221         u32 mtu = psched_mtu(qdisc_dev(sch));
2222         u64 rate = q->rate_bps;
2223
2224         q->tin_cnt = 1;
2225
2226         q->tin_index = besteffort;
2227         q->tin_order = normal_order;
2228
2229         cake_set_rate(b, rate, mtu,
2230                       us_to_ns(q->target), us_to_ns(q->interval));
2231         b->tin_quantum_band = 65535;
2232         b->tin_quantum_prio = 65535;
2233
2234         return 0;
2235 }
2236
2237 static int cake_config_precedence(struct Qdisc *sch)
2238 {
2239         /* convert high-level (user visible) parameters into internal format */
2240         struct cake_sched_data *q = qdisc_priv(sch);
2241         u32 mtu = psched_mtu(qdisc_dev(sch));
2242         u64 rate = q->rate_bps;
2243         u32 quantum1 = 256;
2244         u32 quantum2 = 256;
2245         u32 i;
2246
2247         q->tin_cnt = 8;
2248         q->tin_index = precedence;
2249         q->tin_order = normal_order;
2250
2251         for (i = 0; i < q->tin_cnt; i++) {
2252                 struct cake_tin_data *b = &q->tins[i];
2253
2254                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2255                               us_to_ns(q->interval));
2256
2257                 b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2258                 b->tin_quantum_band = max_t(u16, 1U, quantum2);
2259
2260                 /* calculate next class's parameters */
2261                 rate  *= 7;
2262                 rate >>= 3;
2263
2264                 quantum1  *= 3;
2265                 quantum1 >>= 1;
2266
2267                 quantum2  *= 7;
2268                 quantum2 >>= 3;
2269         }
2270
2271         return 0;
2272 }
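/* The loop above shrinks each successive tin's threshold rate to 7/8 of the
 * previous one while growing its priority weight (quantum1) by 3/2 and
 * shrinking its bandwidth-sharing weight (quantum2) by 7/8 per step;
 * cake_config_diffserv8() below reuses the same progression over a
 * different codepoint mapping.
 */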
2273
2274 /*      List of known Diffserv codepoints:
2275  *
2276  *      Least Effort (CS1)
2277  *      Best Effort (CS0)
2278  *      Max Reliability & LLT "Lo" (TOS1)
2279  *      Max Throughput (TOS2)
2280  *      Min Delay (TOS4)
2281  *      LLT "La" (TOS5)
2282  *      Assured Forwarding 1 (AF1x) - x3
2283  *      Assured Forwarding 2 (AF2x) - x3
2284  *      Assured Forwarding 3 (AF3x) - x3
2285  *      Assured Forwarding 4 (AF4x) - x3
2286  *      Precedence Class 2 (CS2)
2287  *      Precedence Class 3 (CS3)
2288  *      Precedence Class 4 (CS4)
2289  *      Precedence Class 5 (CS5)
2290  *      Precedence Class 6 (CS6)
2291  *      Precedence Class 7 (CS7)
2292  *      Voice Admit (VA)
2293  *      Expedited Forwarding (EF)
2294  *
2295  *      Total 25 codepoints.
2296  */
2297
2298 /*      List of traffic classes in RFC 4594:
2299  *              (roughly descending order of contended priority)
2300  *              (roughly ascending order of uncontended throughput)
2301  *
2302  *      Network Control (CS6,CS7)      - routing traffic
2303  *      Telephony (EF,VA)         - aka. VoIP streams
2304  *      Signalling (CS5)               - VoIP setup
2305  *      Multimedia Conferencing (AF4x) - aka. video calls
2306  *      Realtime Interactive (CS4)     - eg. games
2307  *      Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2308  *      Broadcast Video (CS3)
2309  *      Low Latency Data (AF2x,TOS4)      - eg. database
2310  *      Ops, Admin, Management (CS2,TOS1) - eg. ssh
2311  *      Standard Service (CS0 & unrecognised codepoints)
2312  *      High Throughput Data (AF1x,TOS2)  - eg. web traffic
2313  *      Low Priority Data (CS1)           - eg. BitTorrent
2314  *
2315  *      Total 12 traffic classes.
2316  */
2317
2318 static int cake_config_diffserv8(struct Qdisc *sch)
2319 {
2320 /*      Pruned list of traffic classes for typical applications:
2321  *
2322  *              Network Control          (CS6, CS7)
2323  *              Minimum Latency          (EF, VA, CS5, CS4)
2324  *              Interactive Shell        (CS2, TOS1)
2325  *              Low Latency Transactions (AF2x, TOS4)
2326  *              Video Streaming          (AF4x, AF3x, CS3)
2327  *              Bog Standard             (CS0 etc.)
2328  *              High Throughput          (AF1x, TOS2)
2329  *              Background Traffic       (CS1)
2330  *
2331  *              Total 8 traffic classes.
2332  */
2333
2334         struct cake_sched_data *q = qdisc_priv(sch);
2335         u32 mtu = psched_mtu(qdisc_dev(sch));
2336         u64 rate = q->rate_bps;
2337         u32 quantum1 = 256;
2338         u32 quantum2 = 256;
2339         u32 i;
2340
2341         q->tin_cnt = 8;
2342
2343         /* codepoint to class mapping */
2344         q->tin_index = diffserv8;
2345         q->tin_order = normal_order;
2346
2347         /* class characteristics */
2348         for (i = 0; i < q->tin_cnt; i++) {
2349                 struct cake_tin_data *b = &q->tins[i];
2350
2351                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2352                               us_to_ns(q->interval));
2353
2354                 b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2355                 b->tin_quantum_band = max_t(u16, 1U, quantum2);
2356
2357                 /* calculate next class's parameters */
2358                 rate  *= 7;
2359                 rate >>= 3;
2360
2361                 quantum1  *= 3;
2362                 quantum1 >>= 1;
2363
2364                 quantum2  *= 7;
2365                 quantum2 >>= 3;
2366         }
2367
2368         return 0;
2369 }
2370
2371 static int cake_config_diffserv4(struct Qdisc *sch)
2372 {
2373 /*  Further pruned list of traffic classes for four-class system:
2374  *
2375  *          Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2376  *          Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2377  *          Best Effort        (CS0, AF1x, TOS2, and those not specified)
2378  *          Background Traffic (CS1)
2379  *
2380  *              Total 4 traffic classes.
2381  */
2382
2383         struct cake_sched_data *q = qdisc_priv(sch);
2384         u32 mtu = psched_mtu(qdisc_dev(sch));
2385         u64 rate = q->rate_bps;
2386         u32 quantum = 1024;
2387
2388         q->tin_cnt = 4;
2389
2390         /* codepoint to class mapping */
2391         q->tin_index = diffserv4;
2392         q->tin_order = bulk_order;
2393
2394         /* class characteristics */
2395         cake_set_rate(&q->tins[0], rate, mtu,
2396                       us_to_ns(q->target), us_to_ns(q->interval));
2397         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2398                       us_to_ns(q->target), us_to_ns(q->interval));
2399         cake_set_rate(&q->tins[2], rate >> 1, mtu,
2400                       us_to_ns(q->target), us_to_ns(q->interval));
2401         cake_set_rate(&q->tins[3], rate >> 2, mtu,
2402                       us_to_ns(q->target), us_to_ns(q->interval));
2403
2404         /* priority weights */
2405         q->tins[0].tin_quantum_prio = quantum;
2406         q->tins[1].tin_quantum_prio = quantum >> 4;
2407         q->tins[2].tin_quantum_prio = quantum << 2;
2408         q->tins[3].tin_quantum_prio = quantum << 4;
2409
2410         /* bandwidth-sharing weights */
2411         q->tins[0].tin_quantum_band = quantum;
2412         q->tins[1].tin_quantum_band = quantum >> 4;
2413         q->tins[2].tin_quantum_band = quantum >> 1;
2414         q->tins[3].tin_quantum_band = quantum >> 2;
2415
2416         return 0;
2417 }
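/* In the four-tin layout above the threshold rates work out to 100%, 1/16,
 * 1/2 and 1/4 of the shaped rate for tins 0-3 respectively, while the
 * priority weights run from quantum >> 4 up to quantum << 4, a 256:1 spread
 * between the least and most favoured tin.
 */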
2418
2419 static int cake_config_diffserv3(struct Qdisc *sch)
2420 {
2421 /*  Simplified Diffserv structure with 3 tins.
2422  *              Low Priority            (CS1)
2423  *              Best Effort
2424  *              Latency Sensitive       (TOS4, VA, EF, CS6, CS7)
2425  */
2426         struct cake_sched_data *q = qdisc_priv(sch);
2427         u32 mtu = psched_mtu(qdisc_dev(sch));
2428         u64 rate = q->rate_bps;
2429         u32 quantum = 1024;
2430
2431         q->tin_cnt = 3;
2432
2433         /* codepoint to class mapping */
2434         q->tin_index = diffserv3;
2435         q->tin_order = bulk_order;
2436
2437         /* class characteristics */
2438         cake_set_rate(&q->tins[0], rate, mtu,
2439                       us_to_ns(q->target), us_to_ns(q->interval));
2440         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2441                       us_to_ns(q->target), us_to_ns(q->interval));
2442         cake_set_rate(&q->tins[2], rate >> 2, mtu,
2443                       us_to_ns(q->target), us_to_ns(q->interval));
2444
2445         /* priority weights */
2446         q->tins[0].tin_quantum_prio = quantum;
2447         q->tins[1].tin_quantum_prio = quantum >> 4;
2448         q->tins[2].tin_quantum_prio = quantum << 4;
2449
2450         /* bandwidth-sharing weights */
2451         q->tins[0].tin_quantum_band = quantum;
2452         q->tins[1].tin_quantum_band = quantum >> 4;
2453         q->tins[2].tin_quantum_band = quantum >> 2;
2454
2455         return 0;
2456 }
2457
2458 static void cake_reconfigure(struct Qdisc *sch)
2459 {
2460         struct cake_sched_data *q = qdisc_priv(sch);
2461         int c, ft;
2462
2463         switch (q->tin_mode) {
2464         case CAKE_DIFFSERV_BESTEFFORT:
2465                 ft = cake_config_besteffort(sch);
2466                 break;
2467
2468         case CAKE_DIFFSERV_PRECEDENCE:
2469                 ft = cake_config_precedence(sch);
2470                 break;
2471
2472         case CAKE_DIFFSERV_DIFFSERV8:
2473                 ft = cake_config_diffserv8(sch);
2474                 break;
2475
2476         case CAKE_DIFFSERV_DIFFSERV4:
2477                 ft = cake_config_diffserv4(sch);
2478                 break;
2479
2480         case CAKE_DIFFSERV_DIFFSERV3:
2481         default:
2482                 ft = cake_config_diffserv3(sch);
2483                 break;
2484         }
2485
2486         for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2487                 cake_clear_tin(sch, c);
2488                 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2489         }
2490
2491         q->rate_ns   = q->tins[ft].tin_rate_ns;
2492         q->rate_shft = q->tins[ft].tin_rate_shft;
2493
2494         if (q->buffer_config_limit) {
2495                 q->buffer_limit = q->buffer_config_limit;
2496         } else if (q->rate_bps) {
2497                 u64 t = q->rate_bps * q->interval;
2498
2499                 do_div(t, USEC_PER_SEC / 4);
2500                 q->buffer_limit = max_t(u32, t, 4U << 20);
2501         } else {
2502                 q->buffer_limit = ~0;
2503         }
2504
2505         sch->flags &= ~TCQ_F_CAN_BYPASS;
2506
2507         q->buffer_limit = min(q->buffer_limit,
2508                               max(sch->limit * psched_mtu(qdisc_dev(sch)),
2509                                   q->buffer_config_limit));
2510 }
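/* Buffer sizing above, with illustrative numbers and taking the base rate
 * in bytes per second as the shaper arithmetic implies: when no explicit
 * memory limit is configured, the limit is four intervals' worth of bytes
 * at the shaped rate (rate_bps * interval / 250000, interval in
 * microseconds), floored at 4 MB.  At 50 Mbit/s with the default 100 ms
 * interval that works out to 2.5 MB, so the 4 MB floor applies; the result
 * is then capped at the larger of sch->limit MTU-sized packets and the
 * configured memory limit.
 */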
2511
2512 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2513                        struct netlink_ext_ack *extack)
2514 {
2515         struct cake_sched_data *q = qdisc_priv(sch);
2516         struct nlattr *tb[TCA_CAKE_MAX + 1];
2517         int err;
2518
2519         if (!opt)
2520                 return -EINVAL;
2521
2522         err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
2523         if (err < 0)
2524                 return err;
2525
2526         if (tb[TCA_CAKE_NAT]) {
2527 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2528                 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2529                 q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2530                         !!nla_get_u32(tb[TCA_CAKE_NAT]);
2531 #else
2532                 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2533                                     "No conntrack support in kernel");
2534                 return -EOPNOTSUPP;
2535 #endif
2536         }
2537
2538         if (tb[TCA_CAKE_BASE_RATE64])
2539                 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2540
2541         if (tb[TCA_CAKE_DIFFSERV_MODE])
2542                 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2543
2544         if (tb[TCA_CAKE_WASH]) {
2545                 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2546                         q->rate_flags |= CAKE_FLAG_WASH;
2547                 else
2548                         q->rate_flags &= ~CAKE_FLAG_WASH;
2549         }
2550
2551         if (tb[TCA_CAKE_FLOW_MODE])
2552                 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2553                                 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2554                                         CAKE_FLOW_MASK));
2555
2556         if (tb[TCA_CAKE_ATM])
2557                 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2558
2559         if (tb[TCA_CAKE_OVERHEAD]) {
2560                 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2561                 q->rate_flags |= CAKE_FLAG_OVERHEAD;
2562
2563                 q->max_netlen = 0;
2564                 q->max_adjlen = 0;
2565                 q->min_netlen = ~0;
2566                 q->min_adjlen = ~0;
2567         }
2568
2569         if (tb[TCA_CAKE_RAW]) {
2570                 q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2571
2572                 q->max_netlen = 0;
2573                 q->max_adjlen = 0;
2574                 q->min_netlen = ~0;
2575                 q->min_adjlen = ~0;
2576         }
2577
2578         if (tb[TCA_CAKE_MPU])
2579                 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2580
2581         if (tb[TCA_CAKE_RTT]) {
2582                 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2583
2584                 if (!q->interval)
2585                         q->interval = 1;
2586         }
2587
2588         if (tb[TCA_CAKE_TARGET]) {
2589                 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2590
2591                 if (!q->target)
2592                         q->target = 1;
2593         }
2594
2595         if (tb[TCA_CAKE_AUTORATE]) {
2596                 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2597                         q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2598                 else
2599                         q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2600         }
2601
2602         if (tb[TCA_CAKE_INGRESS]) {
2603                 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2604                         q->rate_flags |= CAKE_FLAG_INGRESS;
2605                 else
2606                         q->rate_flags &= ~CAKE_FLAG_INGRESS;
2607         }
2608
2609         if (tb[TCA_CAKE_ACK_FILTER])
2610                 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2611
2612         if (tb[TCA_CAKE_MEMORY])
2613                 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2614
2615         if (tb[TCA_CAKE_SPLIT_GSO]) {
2616                 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2617                         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2618                 else
2619                         q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2620         }
2621
2622         if (q->tins) {
2623                 sch_tree_lock(sch);
2624                 cake_reconfigure(sch);
2625                 sch_tree_unlock(sch);
2626         }
2627
2628         return 0;
2629 }
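/* The attributes handled above map onto tc-cake keywords; a typical
 * configuration (shown for illustration only) might be:
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat wash
 *
 * which sets TCA_CAKE_BASE_RATE64, TCA_CAKE_DIFFSERV_MODE, TCA_CAKE_NAT and
 * TCA_CAKE_WASH in a single netlink request and, once the tins exist,
 * triggers cake_reconfigure() under the qdisc tree lock.
 */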
2630
2631 static void cake_destroy(struct Qdisc *sch)
2632 {
2633         struct cake_sched_data *q = qdisc_priv(sch);
2634
2635         qdisc_watchdog_cancel(&q->watchdog);
2636         tcf_block_put(q->block);
2637         kvfree(q->tins);
2638 }
2639
2640 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2641                      struct netlink_ext_ack *extack)
2642 {
2643         struct cake_sched_data *q = qdisc_priv(sch);
2644         int i, j, err;
2645
2646         sch->limit = 10240;
2647         q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2648         q->flow_mode  = CAKE_FLOW_TRIPLE;
2649
2650         q->rate_bps = 0; /* unlimited by default */
2651
2652         q->interval = 100000; /* 100ms default */
2653         q->target   =   5000; /* 5ms: codel RFC argues
2654                                * for 5 to 10% of interval
2655                                */
2656         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2657         q->cur_tin = 0;
2658         q->cur_flow  = 0;
2659
2660         qdisc_watchdog_init(&q->watchdog, sch);
2661
2662         if (opt) {
2663                 err = cake_change(sch, opt, extack);
2664
2665                 if (err)
2666                         return err;
2667         }
2668
2669         err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2670         if (err)
2671                 return err;
2672
2673         quantum_div[0] = ~0;
2674         for (i = 1; i <= CAKE_QUEUES; i++)
2675                 quantum_div[i] = 65535 / i;
2676
2677         q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2678                            GFP_KERNEL);
2679         if (!q->tins)
2680                 return -ENOMEM;
2681
2682         for (i = 0; i < CAKE_MAX_TINS; i++) {
2683                 struct cake_tin_data *b = q->tins + i;
2684
2685                 INIT_LIST_HEAD(&b->new_flows);
2686                 INIT_LIST_HEAD(&b->old_flows);
2687                 INIT_LIST_HEAD(&b->decaying_flows);
2688                 b->sparse_flow_count = 0;
2689                 b->bulk_flow_count = 0;
2690                 b->decaying_flow_count = 0;
2691
2692                 for (j = 0; j < CAKE_QUEUES; j++) {
2693                         struct cake_flow *flow = b->flows + j;
2694                         u32 k = j * CAKE_MAX_TINS + i;
2695
2696                         INIT_LIST_HEAD(&flow->flowchain);
2697                         cobalt_vars_init(&flow->cvars);
2698
2699                         q->overflow_heap[k].t = i;
2700                         q->overflow_heap[k].b = j;
2701                         b->overflow_idx[j] = k;
2702                 }
2703         }
2704
2705         cake_reconfigure(sch);
2706         q->avg_peak_bandwidth = q->rate_bps;
2707         q->min_netlen = ~0;
2708         q->min_adjlen = ~0;
2709         return 0;
2710 }
2711
2712 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2713 {
2714         struct cake_sched_data *q = qdisc_priv(sch);
2715         struct nlattr *opts;
2716
2717         opts = nla_nest_start(skb, TCA_OPTIONS);
2718         if (!opts)
2719                 goto nla_put_failure;
2720
2721         if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2722                               TCA_CAKE_PAD))
2723                 goto nla_put_failure;
2724
2725         if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2726                         q->flow_mode & CAKE_FLOW_MASK))
2727                 goto nla_put_failure;
2728
2729         if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2730                 goto nla_put_failure;
2731
2732         if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2733                 goto nla_put_failure;
2734
2735         if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2736                 goto nla_put_failure;
2737
2738         if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2739                         !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2740                 goto nla_put_failure;
2741
2742         if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2743                         !!(q->rate_flags & CAKE_FLAG_INGRESS)))
2744                 goto nla_put_failure;
2745
2746         if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2747                 goto nla_put_failure;
2748
2749         if (nla_put_u32(skb, TCA_CAKE_NAT,
2750                         !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2751                 goto nla_put_failure;
2752
2753         if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2754                 goto nla_put_failure;
2755
2756         if (nla_put_u32(skb, TCA_CAKE_WASH,
2757                         !!(q->rate_flags & CAKE_FLAG_WASH)))
2758                 goto nla_put_failure;
2759
2760         if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2761                 goto nla_put_failure;
2762
2763         if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2764                 if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2765                         goto nla_put_failure;
2766
2767         if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2768                 goto nla_put_failure;
2769
2770         if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2771                 goto nla_put_failure;
2772
2773         if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2774                         !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2775                 goto nla_put_failure;
2776
2777         return nla_nest_end(skb, opts);
2778
2779 nla_put_failure:
2780         return -1;
2781 }
2782
2783 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2784 {
2785         struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
2786         struct cake_sched_data *q = qdisc_priv(sch);
2787         struct nlattr *tstats, *ts;
2788         int i;
2789
2790         if (!stats)
2791                 return -1;
2792
2793 #define PUT_STAT_U32(attr, data) do {                                  \
2794                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2795                         goto nla_put_failure;                          \
2796         } while (0)
2797 #define PUT_STAT_U64(attr, data) do {                                  \
2798                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2799                                         data, TCA_CAKE_STATS_PAD)) \
2800                         goto nla_put_failure;                          \
2801         } while (0)
2802
2803         PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2804         PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2805         PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2806         PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2807         PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2808         PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2809         PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2810         PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2811
2812 #undef PUT_STAT_U32
2813 #undef PUT_STAT_U64
2814
2815         tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
2816         if (!tstats)
2817                 goto nla_put_failure;
2818
2819 #define PUT_TSTAT_U32(attr, data) do {                                  \
2820                 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2821                         goto nla_put_failure;                           \
2822         } while (0)
2823 #define PUT_TSTAT_U64(attr, data) do {                                  \
2824                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2825                                         data, TCA_CAKE_TIN_STATS_PAD))  \
2826                         goto nla_put_failure;                           \
2827         } while (0)
2828
2829         for (i = 0; i < q->tin_cnt; i++) {
2830                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2831
2832                 ts = nla_nest_start(d->skb, i + 1);
2833                 if (!ts)
2834                         goto nla_put_failure;
2835
2836                 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2837                 PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2838                 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2839
2840                 PUT_TSTAT_U32(TARGET_US,
2841                               ktime_to_us(ns_to_ktime(b->cparams.target)));
2842                 PUT_TSTAT_U32(INTERVAL_US,
2843                               ktime_to_us(ns_to_ktime(b->cparams.interval)));
2844
2845                 PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2846                 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2847                 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2848                 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2849
2850                 PUT_TSTAT_U32(PEAK_DELAY_US,
2851                               ktime_to_us(ns_to_ktime(b->peak_delay)));
2852                 PUT_TSTAT_U32(AVG_DELAY_US,
2853                               ktime_to_us(ns_to_ktime(b->avge_delay)));
2854                 PUT_TSTAT_U32(BASE_DELAY_US,
2855                               ktime_to_us(ns_to_ktime(b->base_delay)));
2856
2857                 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2858                 PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2859                 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2860
2861                 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2862                                             b->decaying_flow_count);
2863                 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2864                 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2865                 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2866
2867                 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2868                 nla_nest_end(d->skb, ts);
2869         }
2870
2871 #undef PUT_TSTAT_U32
2872 #undef PUT_TSTAT_U64
2873
2874         nla_nest_end(d->skb, tstats);
2875         return nla_nest_end(d->skb, stats);
2876
2877 nla_put_failure:
2878         nla_nest_cancel(d->skb, stats);
2879         return -1;
2880 }
2881
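/* CAKE has no user-configurable classes, so the classful operations below are
 * stubs: there is nothing to graft, look up or bind filters to.  Individual
 * flow queues are still reported read-only through cake_walk() and the class
 * stats callbacks further down.
 */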
2882 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2883 {
2884         return NULL;
2885 }
2886
2887 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2888 {
2889         return 0;
2890 }
2891
2892 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2893                                u32 classid)
2894 {
2895         return 0;
2896 }
2897
2898 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2899 {
2900 }
2901
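/* Expose the qdisc's filter block so that tc filters can be attached at the
 * root of the qdisc; there are no per-class filter blocks.
 */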
2902 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2903                                         struct netlink_ext_ack *extack)
2904 {
2905         struct cake_sched_data *q = qdisc_priv(sch);
2906
2907         if (cl)
2908                 return NULL;
2909         return q->block;
2910 }
2911
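/* Report the class handle for a (tin, flow) "class": the minor part of the
 * handle is the 1-based classid used by cake_walk() below.
 */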
2912 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2913                            struct sk_buff *skb, struct tcmsg *tcm)
2914 {
2915         tcm->tcm_handle |= TC_H_MIN(cl);
2916         return 0;
2917 }
2918
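/* Per-flow ("class") statistics.  The classid encodes a (tin, flow) pair as
 * cl = tin * CAKE_QUEUES + flow + 1, so idx = cl - 1 splits into
 * idx / CAKE_QUEUES (a tin, via q->tin_order) and idx % CAKE_QUEUES (a flow).
 * The queue length is counted by walking the flow's skb list under the tree
 * lock; the backlog comes from the tin's per-queue accounting and the drop
 * count from the flow itself.  The flow's DRR deficit and COBALT state (drop
 * state, count, p_drop and the BLUE/CoDel timers) are nested under
 * TCA_STATS_APP.
 */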
2919 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2920                                  struct gnet_dump *d)
2921 {
2922         struct cake_sched_data *q = qdisc_priv(sch);
2923         const struct cake_flow *flow = NULL;
2924         struct gnet_stats_queue qs = { 0 };
2925         struct nlattr *stats;
2926         u32 idx = cl - 1;
2927
2928         if (idx < CAKE_QUEUES * q->tin_cnt) {
2929                 const struct cake_tin_data *b =
2930                         &q->tins[q->tin_order[idx / CAKE_QUEUES]];
2931                 const struct sk_buff *skb;
2932
2933                 flow = &b->flows[idx % CAKE_QUEUES];
2934
2935                 if (flow->head) {
2936                         sch_tree_lock(sch);
2937                         skb = flow->head;
2938                         while (skb) {
2939                                 qs.qlen++;
2940                                 skb = skb->next;
2941                         }
2942                         sch_tree_unlock(sch);
2943                 }
2944                 qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2945                 qs.drops = flow->dropped;
2946         }
2947         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2948                 return -1;
2949         if (flow) {
2950                 ktime_t now = ktime_get();
2951
2952                 stats = nla_nest_start(d->skb, TCA_STATS_APP);
2953                 if (!stats)
2954                         return -1;
2955
2956 #define PUT_STAT_U32(attr, data) do {                                  \
2957                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2958                         goto nla_put_failure;                          \
2959         } while (0)
2960 #define PUT_STAT_S32(attr, data) do {                                  \
2961                 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2962                         goto nla_put_failure;                          \
2963         } while (0)
2964
2965                 PUT_STAT_S32(DEFICIT, flow->deficit);
2966                 PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2967                 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2968                 PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2969                 if (flow->cvars.p_drop) {
2970                         PUT_STAT_S32(BLUE_TIMER_US,
2971                                      ktime_to_us(
2972                                              ktime_sub(now,
2973                                                        flow->cvars.blue_timer)));
2974                 }
2975                 if (flow->cvars.dropping) {
2976                         PUT_STAT_S32(DROP_NEXT_US,
2977                                      ktime_to_us(
2978                                              ktime_sub(now,
2979                                                        flow->cvars.drop_next)));
2980                 }
2981
2982                 if (nla_nest_end(d->skb, stats) < 0)
2983                         return -1;
2984         }
2985
2986         return 0;
2987
2988 nla_put_failure:
2989         nla_nest_cancel(d->skb, stats);
2990         return -1;
2991 }
2992
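/* Enumerate every (tin, flow) pair, reporting flows that are currently linked
 * into a flow list with a 1-based classid of tin * CAKE_QUEUES + flow + 1,
 * i.e. the inverse of the mapping decoded in cake_dump_class_stats() above;
 * empty flows and entries before arg->skip are counted but not reported.
 * With CAKE_QUEUES == 1024: classid 1 is tin_order[0] flow 0, classid 1024 is
 * tin_order[0] flow 1023, and classid 1025 is tin_order[1] flow 0.
 */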
2993 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2994 {
2995         struct cake_sched_data *q = qdisc_priv(sch);
2996         unsigned int i, j;
2997
2998         if (arg->stop)
2999                 return;
3000
3001         for (i = 0; i < q->tin_cnt; i++) {
3002                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3003
3004                 for (j = 0; j < CAKE_QUEUES; j++) {
3005                         if (list_empty(&b->flows[j].flowchain) ||
3006                             arg->count < arg->skip) {
3007                                 arg->count++;
3008                                 continue;
3009                         }
3010                         if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
3011                                 arg->stop = 1;
3012                                 break;
3013                         }
3014                         arg->count++;
3015                 }
3016         }
3017 }
3018
3019 static const struct Qdisc_class_ops cake_class_ops = {
3020         .leaf           =       cake_leaf,
3021         .find           =       cake_find,
3022         .tcf_block      =       cake_tcf_block,
3023         .bind_tcf       =       cake_bind,
3024         .unbind_tcf     =       cake_unbind,
3025         .dump           =       cake_dump_class,
3026         .dump_stats     =       cake_dump_class_stats,
3027         .walk           =       cake_walk,
3028 };
3029
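/* Main qdisc operations.  ->peek uses the generic qdisc_peek_dequeued(),
 * which dequeues the next packet and caches it for the following dequeue
 * call rather than inspecting the queues in place.
 */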
3030 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3031         .cl_ops         =       &cake_class_ops,
3032         .id             =       "cake",
3033         .priv_size      =       sizeof(struct cake_sched_data),
3034         .enqueue        =       cake_enqueue,
3035         .dequeue        =       cake_dequeue,
3036         .peek           =       qdisc_peek_dequeued,
3037         .init           =       cake_init,
3038         .reset          =       cake_reset,
3039         .destroy        =       cake_destroy,
3040         .change         =       cake_change,
3041         .dump           =       cake_dump,
3042         .dump_stats     =       cake_dump_stats,
3043         .owner          =       THIS_MODULE,
3044 };
3045
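/* Register the qdisc under the "cake" id so it can be attached with tc.  As a
 * usage sketch (the interface name and rate are illustrative, and a
 * cake-aware iproute2 is assumed):
 *
 *     tc qdisc replace dev eth0 root cake bandwidth 100Mbit
 */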
3046 static int __init cake_module_init(void)
3047 {
3048         return register_qdisc(&cake_qdisc_ops);
3049 }
3050
3051 static void __exit cake_module_exit(void)
3052 {
3053         unregister_qdisc(&cake_qdisc_ops);
3054 }
3055
3056 module_init(cake_module_init)
3057 module_exit(cake_module_exit)
3058 MODULE_AUTHOR("Jonathan Morton");
3059 MODULE_LICENSE("Dual BSD/GPL");
3060 MODULE_DESCRIPTION("The CAKE shaper.");