Skip to content

Commit d493d9d

Browse files
Julian Anastasovummakynes
authored andcommitted
ipvs: fix the spin_lock usage for RT build
syzbot reports for sleeping function called from invalid context [1]. The recently added code for resizable hash tables uses hlist_bl bit locks in combination with spin_lock for the connection fields (cp->lock). Fix the following problems: * avoid using spin_lock(&cp->lock) under locked bit lock because it sleeps on PREEMPT_RT * as the recent changes call ip_vs_conn_hash() only for newly allocated connection, the spin_lock can be removed there because the connection is still not linked to table and does not need cp->lock protection. * the lock can be removed also from ip_vs_conn_unlink() where we are the last connection user. * the last place that is fixed is ip_vs_conn_fill_cport() where now the cp->lock is locked before the other locks to ensure other packets do not modify the cp->flags in non-atomic way. Here we make sure cport and flags are changed only once if two or more packets race to fill the cport. Also, we fill cport early, so that if we race with resizing there will be valid cport key for the hashing. Add a warning if too many hash table changes occur for our RCU read-side critical section which is error condition but minor because the connection still can expire gracefully. Still, restore the cport to 0 to allow retransmitted packet to properly fill the cport. Problems reported by Sashiko. [1]: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 16, name: ktimers/0 preempt_count: 2, expected: 0 RCU nest depth: 3, expected: 3 8 locks held by ktimers/0/16: #0: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 #1: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 svenkatr#2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:45 [inline] svenkatr#2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: timer_base_lock_expiry kernel/time/timer.c:1502 [inline] svenkatr#2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: __run_timer_base+0x120/0x9f0 kernel/time/timer.c:2384 svenkatr#3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline] svenkatr#3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] svenkatr#3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: __rt_spin_lock kernel/locking/spinlock_rt.c:50 [inline] svenkatr#3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x1e0/0x400 kernel/locking/spinlock_rt.c:57 svenkatr#4: ffffc90000157a80 ((&cp->timer)){+...}-{0:0}, at: call_timer_fn+0xd4/0x5e0 kernel/time/timer.c:1745 svenkatr#5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline] svenkatr#5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] svenkatr#5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:315 [inline] svenkatr#5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: ip_vs_conn_expire+0x257/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 svenkatr#6: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 svenkatr#7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:45 [inline] svenkatr#7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:324 [inline] svenkatr#7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 Preemption disabled at: [<ffffffff898a6358>] bit_spin_lock include/linux/bit_spinlock.h:38 [inline] [<ffffffff898a6358>] hlist_bl_lock+0x18/0x110 include/linux/list_bl.h:149 CPU: 0 UID: 0 PID: 16 Comm: ktimers/0 Tainted: G W L syzkaller #0 PREEMPT_{RT,(full)} Tainted: [W]=WARN, [L]=SOFTLOCKUP Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/18/2026 Call Trace: <TASK> dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 __might_resched+0x329/0x480 kernel/sched/core.c:9162 __rt_spin_lock kernel/locking/spinlock_rt.c:48 [inline] rt_spin_lock+0xc2/0x400 kernel/locking/spinlock_rt.c:57 spin_lock include/linux/spinlock_rt.h:45 [inline] ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:324 [inline] ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 call_timer_fn+0x192/0x5e0 kernel/time/timer.c:1748 expire_timers kernel/time/timer.c:1799 [inline] __run_timers kernel/time/timer.c:2374 [inline] __run_timer_base+0x6a3/0x9f0 kernel/time/timer.c:2386 run_timer_base kernel/time/timer.c:2395 [inline] run_timer_softirq+0xb7/0x170 kernel/time/timer.c:2405 handle_softirqs+0x1de/0x6d0 kernel/softirq.c:622 __do_softirq kernel/softirq.c:656 [inline] run_ktimerd+0x69/0x100 kernel/softirq.c:1151 smpboot_thread_fn+0x541/0xa50 kernel/smpboot.c:160 kthread+0x388/0x470 kernel/kthread.c:436 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 </TASK> Reported-by: syzbot+504e778ddaecd36fdd17@syzkaller.appspotmail.com Link: https://sashiko.dev/#/patchset/20260415200216.79699-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260420165539.85174-4-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260422135823.50489-4-ja%40ssi.bg Fixes: 2fa7cc9 ("ipvs: switch to per-net connection table") Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
1 parent f2da9a9 commit d493d9d

1 file changed

Lines changed: 41 additions & 33 deletions

File tree

net/netfilter/ipvs/ip_vs_conn.c

Lines changed: 41 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -267,27 +267,20 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
267267
hash_key2 = hash_key;
268268
use2 = false;
269269
}
270+
270271
conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */,
271272
&head, &head2);
272-
spin_lock(&cp->lock);
273-
274-
if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
275-
cp->flags |= IP_VS_CONN_F_HASHED;
276-
WRITE_ONCE(cp->hn0.hash_key, hash_key);
277-
WRITE_ONCE(cp->hn1.hash_key, hash_key2);
278-
refcount_inc(&cp->refcnt);
279-
hlist_bl_add_head_rcu(&cp->hn0.node, head);
280-
if (use2)
281-
hlist_bl_add_head_rcu(&cp->hn1.node, head2);
282-
ret = 1;
283-
} else {
284-
pr_err("%s(): request for already hashed, called from %pS\n",
285-
__func__, __builtin_return_address(0));
286-
ret = 0;
287-
}
288273

289-
spin_unlock(&cp->lock);
274+
cp->flags |= IP_VS_CONN_F_HASHED;
275+
WRITE_ONCE(cp->hn0.hash_key, hash_key);
276+
WRITE_ONCE(cp->hn1.hash_key, hash_key2);
277+
refcount_inc(&cp->refcnt);
278+
hlist_bl_add_head_rcu(&cp->hn0.node, head);
279+
if (use2)
280+
hlist_bl_add_head_rcu(&cp->hn1.node, head2);
281+
290282
conn_tab_unlock(head, head2);
283+
ret = 1;
291284

292285
/* Schedule resizing if load increases */
293286
if (atomic_read(&ipvs->conn_count) > t->u_thresh &&
@@ -321,7 +314,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
321314

322315
conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */,
323316
&head, &head2);
324-
spin_lock(&cp->lock);
325317

326318
if (cp->flags & IP_VS_CONN_F_HASHED) {
327319
/* Decrease refcnt and unlink conn only if we are last user */
@@ -334,7 +326,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
334326
}
335327
}
336328

337-
spin_unlock(&cp->lock);
338329
conn_tab_unlock(head, head2);
339330

340331
rcu_read_unlock();
@@ -637,6 +628,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
637628
struct ip_vs_conn_hnode *hn;
638629
u32 hash_key, hash_key_new;
639630
struct ip_vs_conn_param p;
631+
bool by_me = false;
640632
int ntbl;
641633
int dir;
642634

@@ -664,8 +656,16 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
664656
t = rcu_dereference(t->new_tbl);
665657
ntbl++;
666658
/* We are lost? */
667-
if (ntbl >= 2)
659+
if (ntbl >= 2) {
660+
spin_lock_bh(&cp->lock);
661+
if (cp->flags & IP_VS_CONN_F_NO_CPORT && by_me)
662+
cp->cport = 0;
663+
/* hn1 will be rehashed on next packet */
664+
spin_unlock_bh(&cp->lock);
665+
IP_VS_ERR_RL("%s(): Too many ht changes for dir %d\n",
666+
__func__, dir);
668667
return;
668+
}
669669
}
670670

671671
/* Rehashing during resize? Use the recent table for adds */
@@ -683,10 +683,13 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
683683
if (head > head2 && t == t2)
684684
swap(head, head2);
685685

686+
/* Protect the cp->flags modification */
687+
spin_lock_bh(&cp->lock);
688+
686689
/* Lock seqcount only for the old bucket, even if we are on new table
687690
* because it affects the del operation, not the adding.
688691
*/
689-
spin_lock_bh(&t->lock[hash_key & t->lock_mask].l);
692+
spin_lock(&t->lock[hash_key & t->lock_mask].l);
690693
preempt_disable_nested();
691694
write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]);
692695

@@ -704,14 +707,23 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
704707
hlist_bl_unlock(head);
705708
write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
706709
preempt_enable_nested();
707-
spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l);
710+
spin_unlock(&t->lock[hash_key & t->lock_mask].l);
711+
spin_unlock_bh(&cp->lock);
708712
hash_key = hash_key_new;
709713
goto retry;
710714
}
711715

712-
spin_lock(&cp->lock);
713-
if ((cp->flags & IP_VS_CONN_F_NO_CPORT) &&
714-
(cp->flags & IP_VS_CONN_F_HASHED)) {
716+
/* Fill cport once, even if multiple packets try to do it */
717+
if (cp->flags & IP_VS_CONN_F_NO_CPORT && (!cp->cport || by_me)) {
718+
/* If we race with resizing make sure cport is set for dir 1 */
719+
if (!cp->cport) {
720+
cp->cport = cport;
721+
by_me = true;
722+
}
723+
if (!dir) {
724+
atomic_dec(&ipvs->no_cport_conns[af_id]);
725+
cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
726+
}
715727
/* We do not recalc hash_key_r under lock, we assume the
716728
* parameters in cp do not change, i.e. cport is
717729
* the only possible change.
@@ -726,21 +738,17 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
726738
hlist_bl_del_rcu(&hn->node);
727739
hlist_bl_add_head_rcu(&hn->node, head_new);
728740
}
729-
if (!dir) {
730-
atomic_dec(&ipvs->no_cport_conns[af_id]);
731-
cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
732-
cp->cport = cport;
733-
}
734741
}
735-
spin_unlock(&cp->lock);
736742

737743
if (head != head2)
738744
hlist_bl_unlock(head2);
739745
hlist_bl_unlock(head);
740746
write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
741747
preempt_enable_nested();
742-
spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l);
743-
if (dir--)
748+
spin_unlock(&t->lock[hash_key & t->lock_mask].l);
749+
750+
spin_unlock_bh(&cp->lock);
751+
if (dir-- && by_me)
744752
goto next_dir;
745753
}
746754

0 commit comments

Comments
 (0)