
Commit bd2c339

queueing: get rid of per-peer ring buffers
Having two ring buffers per-peer means that every peer results in two massive ring allocations. On an 8-core x86_64 machine, this commit reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which is a 90% reduction. Ninety percent! With some single-machine deployments approaching 500,000 peers, we're talking about a reduction from 7 gigs of memory down to 700 megs of memory.

In order to get rid of these per-peer allocations, this commit switches to using a list-based queueing approach. Currently GSO fragments are chained together using the skb->next pointer, so we form the per-peer queue around the unused skb->prev pointer, which makes sense because the links are pointing backwards. Multiple cores can write into the queue at any given time, because its writes occur in the start_xmit path or in the udp_recv path. But reads happen in a single workqueue item per-peer, amounting to a multi-producer, single-consumer paradigm.

The MPSC queue is implemented locklessly and never blocks. However, it is not linearizable (though it is serializable), with a very tight and unlikely race on writes, which, when hit (some tiny fraction of the 0.15% of partial adds on a fully loaded 16-core x86_64 system), causes the queue reader to terminate early. However, because every packet sent queues up the same workqueue item after it is fully added, the queue resumes again, and stopping early isn't actually a problem, since at that point the packet wouldn't have yet been added to the encryption queue. These properties allow us to avoid disabling interrupts or spinning.

Performance-wise, ordinarily list-based queues aren't preferable to ring buffers, because of cache misses when following pointers around. However, we *already* have to follow the adjacent pointers when working through fragments, so there shouldn't actually be any change there. A potential downside is that dequeueing is a bit more complicated, but the ptr_ring structure used prior had a spinlock when dequeueing, so all in all the difference appears to be a wash. Actually, from profiling, the biggest performance hit, by far, of this commit winds up being atomic_add_unless(count, 1, max) and atomic_dec(count), which account for the majority of CPU time, according to perf. In that sense, the previous ring buffer was superior in that it could check if it was full by head == tail, which the list-based approach cannot do.

Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
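To make the queue design above concrete, here is a minimal userspace sketch (not part of the commit) of the intrusive MPSC scheme the message describes, written with C11 atomics in place of the kernel's xchg_release() and smp_load_acquire(). Everything in it is invented for illustration: it links nodes through an explicit next field rather than reusing skb->prev, and it omits the atomic count that bounds the real queue at MAX_QUEUED_PACKETS. The actual kernel implementation is in src/queueing.c below.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	_Atomic(struct node *) next;
	int value;
};

struct mpsc_queue {
	_Atomic(struct node *) head;	/* producers swing this with an exchange */
	struct node *tail;		/* only the single consumer touches this */
	struct node stub;		/* sentinel node so the list is never empty */
};

static void queue_init(struct mpsc_queue *q)
{
	atomic_store(&q->stub.next, NULL);
	atomic_store(&q->head, &q->stub);
	q->tail = &q->stub;
}

/* Multi-producer: safe to call from any thread. */
static void queue_push(struct mpsc_queue *q, struct node *n)
{
	atomic_store_explicit(&n->next, NULL, memory_order_relaxed);
	/* Publish n as the new head... */
	struct node *prev = atomic_exchange_explicit(&q->head, n, memory_order_release);
	/* ...then link the old head to it. */
	atomic_store_explicit(&prev->next, n, memory_order_release);
}

/* Single consumer only.  Returns NULL when empty, or when a producer is
 * caught between the two stores in queue_push() above. */
static struct node *queue_pop(struct mpsc_queue *q)
{
	struct node *tail = q->tail;
	struct node *next = atomic_load_explicit(&tail->next, memory_order_acquire);

	if (tail == &q->stub) {		/* skip over the sentinel */
		if (!next)
			return NULL;
		q->tail = next;
		tail = next;
		next = atomic_load_explicit(&next->next, memory_order_acquire);
	}
	if (next) {
		q->tail = next;
		return tail;
	}
	/* tail is the last real element: push the stub back so we can detach it. */
	if (tail != atomic_load_explicit(&q->head, memory_order_relaxed))
		return NULL;
	queue_push(q, &q->stub);
	next = atomic_load_explicit(&tail->next, memory_order_acquire);
	if (next) {
		q->tail = next;
		return tail;
	}
	return NULL;
}

int main(void)
{
	struct mpsc_queue q;
	struct node a = { .value = 1 }, b = { .value = 2 };
	struct node *n;

	queue_init(&q);
	queue_push(&q, &a);
	queue_push(&q, &b);
	while ((n = queue_pop(&q)))
		printf("%d\n", n->value);	/* prints 1 then 2 */
	return 0;
}

The window between the two stores in queue_push() is the tight race mentioned above: the head has already moved, but the old head's next pointer is not yet written, so a concurrent queue_pop() sees NULL and returns early. Re-queueing the per-peer work item after the enqueue completes is what makes that harmless.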
1 parent f7e9853 commit bd2c339

9 files changed, +160 -93 lines changed

src/compat/compat.h

Lines changed: 16 additions & 0 deletions
@@ -1068,6 +1068,22 @@ static const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tun
 #define kfree_sensitive(a) kzfree(a)
 #endif

+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) && !defined(ISRHEL7)
+#define xchg_release xchg
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0) && !defined(ISRHEL7)
+#include <asm/barrier.h>
+#ifndef smp_load_acquire
+#define smp_load_acquire(p) \
+({ \
+	typeof(*p) ___p1 = ACCESS_ONCE(*p); \
+	smp_mb(); \
+	___p1; \
+})
+#endif
+#endif
+
 #if defined(ISUBUNTU1604) || defined(ISRHEL7)
 #include <linux/siphash.h>
 #ifndef _WG_LINUX_SIPHASH_H

src/device.c

Lines changed: 6 additions & 6 deletions
@@ -242,8 +242,8 @@ static void wg_destruct(struct net_device *dev)
 	destroy_workqueue(wg->handshake_receive_wq);
 	destroy_workqueue(wg->handshake_send_wq);
 	destroy_workqueue(wg->packet_crypt_wq);
-	wg_packet_queue_free(&wg->decrypt_queue, true);
-	wg_packet_queue_free(&wg->encrypt_queue, true);
+	wg_packet_queue_free(&wg->decrypt_queue);
+	wg_packet_queue_free(&wg->encrypt_queue);
 	rcu_barrier(); /* Wait for all the peers to be actually freed. */
 	wg_ratelimiter_uninit();
 	memzero_explicit(&wg->static_identity, sizeof(wg->static_identity));
@@ -351,12 +351,12 @@ static int wg_newlink(struct net *src_net, struct net_device *dev,
 		goto err_destroy_handshake_send;

 	ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker,
-				   true, MAX_QUEUED_PACKETS);
+				   MAX_QUEUED_PACKETS);
 	if (ret < 0)
 		goto err_destroy_packet_crypt;

 	ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker,
-				   true, MAX_QUEUED_PACKETS);
+				   MAX_QUEUED_PACKETS);
 	if (ret < 0)
 		goto err_free_encrypt_queue;

@@ -381,9 +381,9 @@ static int wg_newlink(struct net *src_net, struct net_device *dev,
 err_uninit_ratelimiter:
 	wg_ratelimiter_uninit();
 err_free_decrypt_queue:
-	wg_packet_queue_free(&wg->decrypt_queue, true);
+	wg_packet_queue_free(&wg->decrypt_queue);
 err_free_encrypt_queue:
-	wg_packet_queue_free(&wg->encrypt_queue, true);
+	wg_packet_queue_free(&wg->encrypt_queue);
 err_destroy_packet_crypt:
 	destroy_workqueue(wg->packet_crypt_wq);
 err_destroy_handshake_send:

src/device.h

Lines changed: 8 additions & 7 deletions
@@ -27,13 +27,14 @@ struct multicore_worker {

 struct crypt_queue {
 	struct ptr_ring ring;
-	union {
-		struct {
-			struct multicore_worker __percpu *worker;
-			int last_cpu;
-		};
-		struct work_struct work;
-	};
+	struct multicore_worker __percpu *worker;
+	int last_cpu;
+};
+
+struct prev_queue {
+	struct sk_buff *head, *tail, *peeked;
+	struct { struct sk_buff *next, *prev; } empty; // Match first 2 members of struct sk_buff.
+	atomic_t count;
 };

 struct wg_device {
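As an aside (not from the commit), the aliasing trick behind the empty member above can be checked in isolation: the anonymous struct only works as a stub queue node because its two pointers sit exactly where struct sk_buff keeps next and prev, which is what the BUILD_BUG_ON() in wg_prev_queue_init() enforces. A self-contained sketch of that invariant, with sk_buff_mock and prev_queue_mock standing in for the real structures:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct sk_buff_mock {
	struct sk_buff_mock *next, *prev;	/* first two members, as in struct sk_buff */
	int other_fields;			/* the real struct has many more members */
};

struct prev_queue_mock {
	struct sk_buff_mock *head, *tail, *peeked;
	struct { struct sk_buff_mock *next, *prev; } empty;
	int count;
};

int main(void)
{
	/* Same invariant the kernel enforces with BUILD_BUG_ON(): the stub's
	 * next/prev land at the same offsets as sk_buff's, so &queue->empty
	 * can safely be cast to a struct sk_buff * and used as a sentinel. */
	static_assert(offsetof(struct sk_buff_mock, next) ==
		      offsetof(struct prev_queue_mock, empty.next) -
		      offsetof(struct prev_queue_mock, empty), "next must match");
	static_assert(offsetof(struct sk_buff_mock, prev) ==
		      offsetof(struct prev_queue_mock, empty.prev) -
		      offsetof(struct prev_queue_mock, empty), "prev must match");
	printf("stub layout matches\n");
	return 0;
}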

src/peer.c

Lines changed: 9 additions & 19 deletions
@@ -32,27 +32,22 @@ struct wg_peer *wg_peer_create(struct wg_device *wg,
 	peer = kzalloc(sizeof(*peer), GFP_KERNEL);
 	if (unlikely(!peer))
 		return ERR_PTR(ret);
-	peer->device = wg;
+	if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
+		goto err;

+	peer->device = wg;
 	wg_noise_handshake_init(&peer->handshake, &wg->static_identity,
 				public_key, preshared_key, peer);
-	if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
-		goto err_1;
-	if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false,
-				 MAX_QUEUED_PACKETS))
-		goto err_2;
-	if (wg_packet_queue_init(&peer->rx_queue, NULL, false,
-				 MAX_QUEUED_PACKETS))
-		goto err_3;
-
 	peer->internal_id = atomic64_inc_return(&peer_counter);
 	peer->serial_work_cpu = nr_cpumask_bits;
 	wg_cookie_init(&peer->latest_cookie);
 	wg_timers_init(peer);
 	wg_cookie_checker_precompute_peer_keys(peer);
 	spin_lock_init(&peer->keypairs.keypair_update_lock);
-	INIT_WORK(&peer->transmit_handshake_work,
-		  wg_packet_handshake_send_worker);
+	INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker);
+	INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker);
+	wg_prev_queue_init(&peer->tx_queue);
+	wg_prev_queue_init(&peer->rx_queue);
 	rwlock_init(&peer->endpoint_lock);
 	kref_init(&peer->refcount);
 	skb_queue_head_init(&peer->staged_packet_queue);
@@ -68,11 +63,7 @@ struct wg_peer *wg_peer_create(struct wg_device *wg,
 	pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id);
 	return peer;

-err_3:
-	wg_packet_queue_free(&peer->tx_queue, false);
-err_2:
-	dst_cache_destroy(&peer->endpoint_cache);
-err_1:
+err:
 	kfree(peer);
 	return ERR_PTR(ret);
 }
@@ -197,8 +188,7 @@ static void rcu_release(struct rcu_head *rcu)
 	struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu);

 	dst_cache_destroy(&peer->endpoint_cache);
-	wg_packet_queue_free(&peer->rx_queue, false);
-	wg_packet_queue_free(&peer->tx_queue, false);
+	WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue));

 	/* The final zeroing takes care of clearing any remaining handshake key
 	 * material and other potentially sensitive information.

src/peer.h

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@ struct endpoint {

 struct wg_peer {
 	struct wg_device *device;
-	struct crypt_queue tx_queue, rx_queue;
+	struct prev_queue tx_queue, rx_queue;
 	struct sk_buff_head staged_packet_queue;
 	int serial_work_cpu;
 	bool is_dead;
@@ -46,7 +46,7 @@ struct wg_peer {
 	rwlock_t endpoint_lock;
 	struct noise_handshake handshake;
 	atomic64_t last_sent_handshake;
-	struct work_struct transmit_handshake_work, clear_peer_work;
+	struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work;
 	struct cookie latest_cookie;
 	struct hlist_node pubkey_hash;
 	u64 rx_bytes, tx_bytes;

src/queueing.c

Lines changed: 69 additions & 17 deletions
@@ -9,8 +9,7 @@ struct multicore_worker __percpu *
 wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr)
 {
 	int cpu;
-	struct multicore_worker __percpu *worker =
-		alloc_percpu(struct multicore_worker);
+	struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker);

 	if (!worker)
 		return NULL;
@@ -23,33 +22,86 @@ wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr)
 }

 int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
-			 bool multicore, unsigned int len)
+			 unsigned int len)
 {
 	int ret;

 	memset(queue, 0, sizeof(*queue));
 	ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL);
 	if (ret)
 		return ret;
-	if (function) {
-		if (multicore) {
-			queue->worker = wg_packet_percpu_multicore_worker_alloc(
-				function, queue);
-			if (!queue->worker) {
-				ptr_ring_cleanup(&queue->ring, NULL);
-				return -ENOMEM;
-			}
-		} else {
-			INIT_WORK(&queue->work, function);
-		}
+	queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue);
+	if (!queue->worker) {
+		ptr_ring_cleanup(&queue->ring, NULL);
+		return -ENOMEM;
 	}
 	return 0;
 }

-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore)
+void wg_packet_queue_free(struct crypt_queue *queue)
 {
-	if (multicore)
-		free_percpu(queue->worker);
+	free_percpu(queue->worker);
 	WARN_ON(!__ptr_ring_empty(&queue->ring));
 	ptr_ring_cleanup(&queue->ring, NULL);
 }
+
+#define NEXT(skb) ((skb)->prev)
+#define STUB(queue) ((struct sk_buff *)&queue->empty)
+
+void wg_prev_queue_init(struct prev_queue *queue)
+{
+	NEXT(STUB(queue)) = NULL;
+	queue->head = queue->tail = STUB(queue);
+	queue->peeked = NULL;
+	atomic_set(&queue->count, 0);
+	BUILD_BUG_ON(
+		offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) -
+						  offsetof(struct prev_queue, empty) ||
+		offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) -
+						  offsetof(struct prev_queue, empty));
+}
+
+static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
+{
+	WRITE_ONCE(NEXT(skb), NULL);
+	WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb);
+}
+
+bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
+{
+	if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS))
+		return false;
+	__wg_prev_queue_enqueue(queue, skb);
+	return true;
+}

+struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue)
+{
+	struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail));
+
+	if (tail == STUB(queue)) {
+		if (!next)
+			return NULL;
+		queue->tail = next;
+		tail = next;
+		next = smp_load_acquire(&NEXT(next));
+	}
+	if (next) {
+		queue->tail = next;
+		atomic_dec(&queue->count);
+		return tail;
+	}
+	if (tail != READ_ONCE(queue->head))
+		return NULL;
+	__wg_prev_queue_enqueue(queue, STUB(queue));
+	next = smp_load_acquire(&NEXT(tail));
+	if (next) {
+		queue->tail = next;
+		atomic_dec(&queue->count);
+		return tail;
+	}
+	return NULL;
+}
+
+#undef NEXT
+#undef STUB

src/queueing.h

Lines changed: 33 additions & 12 deletions
@@ -17,12 +17,13 @@ struct wg_device;
 struct wg_peer;
 struct multicore_worker;
 struct crypt_queue;
+struct prev_queue;
 struct sk_buff;

 /* queueing.c APIs: */
 int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
-			 bool multicore, unsigned int len);
-void wg_packet_queue_free(struct crypt_queue *queue, bool multicore);
+			 unsigned int len);
+void wg_packet_queue_free(struct crypt_queue *queue);
 struct multicore_worker __percpu *
 wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr);

@@ -138,8 +139,31 @@ static inline int wg_cpumask_next_online(int *next)
 	return cpu;
 }

+void wg_prev_queue_init(struct prev_queue *queue);
+
+/* Multi producer */
+bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb);
+
+/* Single consumer */
+struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue);
+
+/* Single consumer */
+static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue)
+{
+	if (queue->peeked)
+		return queue->peeked;
+	queue->peeked = wg_prev_queue_dequeue(queue);
+	return queue->peeked;
+}
+
+/* Single consumer */
+static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue)
+{
+	queue->peeked = NULL;
+}
+
 static inline int wg_queue_enqueue_per_device_and_peer(
-	struct crypt_queue *device_queue, struct crypt_queue *peer_queue,
+	struct crypt_queue *device_queue, struct prev_queue *peer_queue,
 	struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu)
 {
 	int cpu;
@@ -148,8 +172,9 @@ static inline int wg_queue_enqueue_per_device_and_peer(
 	/* We first queue this up for the peer ingestion, but the consumer
 	 * will wait for the state to change to CRYPTED or DEAD before.
 	 */
-	if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb)))
+	if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb)))
 		return -ENOSPC;
+
 	/* Then we queue it up in the device queue, which consumes the
 	 * packet as soon as it can.
 	 */
@@ -160,24 +185,20 @@ static inline int wg_queue_enqueue_per_device_and_peer(
 	return 0;
 }

-static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue,
-					     struct sk_buff *skb,
-					     enum packet_state state)
+static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state)
 {
 	/* We take a reference, because as soon as we call atomic_set, the
 	 * peer can be freed from below us.
 	 */
 	struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));

 	atomic_set_release(&PACKET_CB(skb)->state, state);
-	queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu,
-					       peer->internal_id),
-		      peer->device->packet_crypt_wq, &queue->work);
+	queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id),
+		      peer->device->packet_crypt_wq, &peer->transmit_packet_work);
 	wg_peer_put(peer);
 }

-static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb,
-						  enum packet_state state)
+static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state)
 {
 	/* We take a reference, because as soon as we call atomic_set, the
 	 * peer can be freed from below us.