-
Notifications
You must be signed in to change notification settings - Fork 54
Expand file tree
/
Copy pathhoma_impl.h
More file actions
999 lines (896 loc) · 33.1 KB
/
homa_impl.h
File metadata and controls
999 lines (896 loc) · 33.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
/* SPDX-License-Identifier: BSD-2-Clause or GPL-2.0+ */
/* This file contains definitions that are shared across the files
* that implement Homa for Linux.
*/
#ifndef _HOMA_IMPL_H
#define _HOMA_IMPL_H
#include <linux/bug.h>
#ifdef __UNIT_TEST__
#undef WARN
#define WARN(...)
#undef WARN_ON
#define WARN_ON(condition) ({ \
int __ret_warn_on = !!(condition); \
unlikely(__ret_warn_on); \
})
#undef WARN_ON_ONCE
#define WARN_ON_ONCE(condition) WARN_ON(condition)
#undef WARN_ONCE
/* This definition allows WARN_ONCE to be used both as a value and as
* a statement.
*/
#define WARN_ONCE(cond, ...) ({ bool __c = (cond); (void)__c; __c; })
#endif /* __UNIT_TEST__ */
#include <linux/audit.h>
#include <linux/icmp.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/proc_fs.h>
#include <linux/sched/signal.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/vmalloc.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/netns/generic.h>
#include <net/protocol.h>
#include <net/inet_common.h>
#include <net/gro.h>
#include <net/rps.h>
#ifndef __UPSTREAM__ /* See strip.py */
#include "homa.h"
#include <linux/version.h>
#ifdef CONFIG_ARM64
#include <clocksource/arm_arch_timer.h>
#endif
#include "homa_devel.h"
#else /* See strip.py */
#include <linux/homa.h>
#endif /* See strip.py */
#include "homa_wire.h"
#ifdef __UNIT_TEST__
#include "mock.h"
#endif /* __UNIT_TEST__ */
#ifndef __STRIP__ /* See strip.py */
/* Null out things that confuse VSCode Intellisense */
#ifdef __VSCODE__
#define smp_processor_id() 1
#define BUG()
#define BUG_ON(...)
#define set_current_state(...)
#endif
#endif /* See strip.py */
/* Forward declarations. */
struct homa;
struct homa_peer;
struct homa_rpc;
struct homa_sock;
#ifndef __STRIP__ /* See strip.py */
#include "timetrace.h"
#include "homa_metrics.h"
/* Declarations used in this file, so they can't be made at the end. */
void homa_throttle_lock_slow(struct homa *homa);
#endif /* See strip.py */
/**
 * union sockaddr_in_union - Holds either an IPv4 or IPv6 address (smaller
 * and easier to use than sockaddr_storage).
 */
union sockaddr_in_union {
	/** @sa: Used to access as a generic sockaddr. */
	struct sockaddr sa;

	/** @in4: Used to access as an IPv4 socket address. */
	struct sockaddr_in in4;

	/** @in6: Used to access as an IPv6 socket address. */
	struct sockaddr_in6 in6;
};
/**
 * struct homa - Stores overall information about the Homa transport, which
 * is shared across all Homa sockets and all network namespaces.
 */
struct homa {
	/**
	 * @next_outgoing_id: Id to use for next outgoing RPC request.
	 * This is always even: it's used only to generate client-side ids.
	 * Accessed without locks. Note: RPC ids are unique within a
	 * single client machine.
	 */
	atomic64_t next_outgoing_id;

#ifndef __UPSTREAM__ /* See strip.py */
	/** @qshared: Contains information used by homa_qdisc.c. */
	struct homa_qdisc_shared *qshared;
#endif /* See strip.py */

#ifndef __STRIP__ /* See strip.py */
	/**
	 * @pacer: Information related to the pacer; managed by homa_pacer.c.
	 */
	struct homa_pacer *pacer;

	/**
	 * @grant: Contains information used by homa_grant.c to manage
	 * grants for incoming messages.
	 */
	struct homa_grant *grant;
#endif /* See strip.py */

	/**
	 * @peertab: Info about all the other hosts we have communicated with;
	 * includes peers from all network namespaces.
	 */
	struct homa_peertab *peertab;

	/**
	 * @socktab: Information about all open sockets. Dynamically
	 * allocated; must be kfreed.
	 */
	struct homa_socktab *socktab;

#ifndef __STRIP__ /* See strip.py */
	/**
	 * @page_pool_mutex: Synchronizes access to any/all of the page_pools
	 * used for outgoing sk_buff data.
	 */
	spinlock_t page_pool_mutex ____cacheline_aligned_in_smp;

	/**
	 * @page_pools: One page pool for each NUMA node on the machine.
	 * If there are no cores for node, then this value is NULL.
	 */
	struct homa_page_pool *page_pools[MAX_NUMNODES];
#endif /* See strip.py */

	/** @max_numa: Highest NUMA node id in use by any core. */
	int max_numa;

#ifndef __STRIP__ /* See strip.py */
	/**
	 * @skb_page_frees_per_sec: Rate at which to return pages from sk_buff
	 * page pools back to Linux. This is the total rate across all pools.
	 * Set externally via sysctl.
	 */
	int skb_page_frees_per_sec;

	/**
	 * @skb_pages_to_free: Space in which to collect pages that are
	 * about to be released. Dynamically allocated.
	 */
	struct page **skb_pages_to_free;

	/**
	 * @pages_to_free_slots: Maximum number of pages that can be
	 * stored in skb_pages_to_free.
	 */
	int pages_to_free_slots;

	/**
	 * @skb_page_free_time: homa_clock() time when the next sk_buff
	 * page should be freed. Could be in the past.
	 */
	u64 skb_page_free_time;

	/**
	 * @skb_page_pool_min_kb: Don't return pages from a pool to Linux
	 * if the amount of unused space in the pool has been less than this
	 * many KBytes at any time in the recent past. Set externally via
	 * sysctl.
	 */
	int skb_page_pool_min_kb;

	/**
	 * @unsched_bytes: The number of bytes that may be sent in a
	 * new message without receiving any grants. There used to be a
	 * variable rtt_bytes that served this purpose, and was also used
	 * for window. Historically, rtt_bytes was intended to be the amount
	 * of data that can be transmitted over the wire in the time it
	 * takes to send a full-size data packet and receive back a grant.
	 * But, for fast networks that value could result in too much
	 * buffer utilization (and, we wanted to have separate values for
	 * @unsched_bytes and @window). Set externally via sysctl.
	 */
	int unsched_bytes;
#endif /* See strip.py */

	/**
	 * @link_mbps: The raw bandwidth of the network uplink, in
	 * units of 1e06 bits per second. Set externally via sysctl.
	 */
	int link_mbps;

#ifndef __STRIP__ /* See strip.py */
	/**
	 * @poll_usecs: Amount of time (in microseconds) that a thread
	 * will spend busy-waiting for an incoming messages before
	 * going to sleep. Set externally via sysctl.
	 */
	int poll_usecs;

	/** @poll_cycles: Same as poll_usecs except in homa_clock() units. */
	u64 poll_cycles;

	/**
	 * @num_priorities: The total number of priority levels available for
	 * Homa's use. Internally, Homa will use priorities from 0 to
	 * num_priorities-1, inclusive. Set externally via sysctl.
	 */
	int num_priorities;

	/**
	 * @priority_map: entry i gives the value to store in the high-order
	 * 3 bits of the DSCP field of IP headers to implement priority level
	 * i. Set externally via sysctl.
	 */
	int priority_map[HOMA_MAX_PRIORITIES];

	/**
	 * @max_sched_prio: The highest priority level currently available for
	 * scheduled packets. Levels above this are reserved for unscheduled
	 * packets. Set externally via sysctl.
	 */
	int max_sched_prio;

	/**
	 * @unsched_cutoffs: the current priority assignments for incoming
	 * unscheduled packets. The value of entry i is the largest
	 * message size that uses priority i (larger i is higher priority).
	 * If entry i has a value of HOMA_MAX_MESSAGE_SIZE or greater, then
	 * priority levels less than i will not be used for unscheduled
	 * packets. At least one entry in the array must have a value of
	 * HOMA_MAX_MESSAGE_SIZE or greater (entry 0 is usually INT_MAX).
	 * Set externally via sysctl.
	 */
	int unsched_cutoffs[HOMA_MAX_PRIORITIES];

	/**
	 * @cutoff_version: increments every time unsched_cutoffs is
	 * modified. Used to determine when we need to send updates to
	 * peers. Note: 16 bits should be fine for this: the worst
	 * that happens is a peer has a super-stale value that equals
	 * our current value, so the peer uses suboptimal cutoffs until the
	 * next version change. Can be set externally via sysctl.
	 */
	int cutoff_version;
#endif /* See strip.py */

	/**
	 * @resend_ticks: When an RPC's @silent_ticks reaches this value,
	 * start sending RESEND requests.
	 */
	int resend_ticks;

	/**
	 * @resend_interval: minimum number of homa timer ticks between
	 * RESENDs for the same RPC.
	 */
	int resend_interval;

	/**
	 * @timeout_ticks: abort an RPC if its silent_ticks reaches this value.
	 */
	int timeout_ticks;

	/**
	 * @timeout_resends: Assume that a server is dead if it has not
	 * responded after this many RESENDs have been sent to it.
	 */
	int timeout_resends;

	/**
	 * @request_ack_ticks: How many timer ticks we'll wait for the
	 * client to ack an RPC before explicitly requesting an ack.
	 * Set externally via sysctl.
	 */
	int request_ack_ticks;

	/**
	 * @reap_limit: Maximum number of packet buffers to free in a
	 * single call to homa_rpc_reap.
	 */
	int reap_limit;

	/**
	 * @dead_buffs_limit: If the number of packet buffers in dead but
	 * not yet reaped RPCs is less than this number, then Homa reaps
	 * RPCs in a way that minimizes impact on performance but may permit
	 * dead RPCs to accumulate. If the number of dead packet buffers
	 * exceeds this value, then Homa switches to a more aggressive approach
	 * to reaping RPCs. Set externally via sysctl.
	 */
	int dead_buffs_limit;

	/**
	 * @max_dead_buffs: The largest aggregate number of packet buffers
	 * in dead (but not yet reaped) RPCs that has existed so far in a
	 * single socket. Readable via sysctl, and may be reset via sysctl
	 * to begin recalculating.
	 */
	int max_dead_buffs;

#ifndef __STRIP__ /* See strip.py */
	/**
	 * @verbose: Nonzero enables additional logging. Set externally via
	 * sysctl.
	 */
	int verbose;
#endif /* See strip.py */

	/**
	 * @max_gso_size: Maximum number of bytes that will be included
	 * in a single output packet that Homa passes to Linux. Can be set
	 * externally via sysctl to lower the limit already enforced by Linux.
	 */
	int max_gso_size;

	/**
	 * @gso_force_software: A non-zero value will cause Homa to perform
	 * segmentation in software using GSO; zero means ask the NIC to
	 * perform TSO. Set externally via sysctl.
	 */
	int gso_force_software;

	/**
	 * @wmem_max: Limit on the value of sk_sndbuf for any socket. Set
	 * externally via sysctl.
	 */
	int wmem_max;

#ifndef __STRIP__ /* See strip.py */
	/**
	 * @hijack_tcp: Non-zero means encapsulate outgoing Homa packets
	 * as TCP packets (i.e. use TCP as the IP protocol). This makes TSO
	 * and RSS work better. Set externally via sysctl.
	 */
	int hijack_tcp;

	/**
	 * @max_gro_skbs: Maximum number of socket buffers that can be
	 * aggregated by the GRO mechanism. Set externally via sysctl.
	 */
	int max_gro_skbs;

	/**
	 * @gro_policy: An OR'ed together collection of bits that determine
	 * how Homa packets should be steered for SoftIRQ handling. A value
	 * of zero will eliminate any Homa-specific behaviors, reverting
	 * to the Linux defaults. Set externally via sysctl (but modifying
	 * it is almost certainly a bad idea; see below).
	 */
	int gro_policy;

	/* Bits that can be specified for gro_policy. These were created for
	 * testing, in order to evaluate various possible policies; you almost
	 * certainly should not use any value other than HOMA_GRO_NORMAL.
	 * HOMA_GRO_SAME_CORE     If isolated packets arrive (not part of a
	 *                        batch) use the GRO core for SoftIRQ also.
	 * HOMA_GRO_IDLE          Use old mechanism for selecting an idle
	 *                        core for SoftIRQ (deprecated).
	 * HOMA_GRO_NEXT          Always use the next core in circular
	 *                        order for SoftIRQ (deprecated).
	 * HOMA_GRO_GEN2          Use the new mechanism for selecting an
	 *                        idle core for SoftIRQ.
	 * HOMA_GRO_FAST_GRANTS   Pass all grants immediately to
	 *                        homa_softirq during GRO (only if the
	 *                        core isn't overloaded).
	 * HOMA_GRO_SHORT_BYPASS  Pass all single-packet messages directly
	 *                        to homa_softirq during GRO (only if the
	 *                        core isn't overloaded).
	 * HOMA_GRO_GEN3          Use the "Gen3" mechanisms for load
	 *                        balancing.
	 */
#define HOMA_GRO_SAME_CORE    2
#define HOMA_GRO_IDLE         4
#define HOMA_GRO_NEXT         8
#define HOMA_GRO_GEN2         0x10
#define HOMA_GRO_FAST_GRANTS  0x20
#define HOMA_GRO_SHORT_BYPASS 0x40
#define HOMA_GRO_GEN3         0x80
#define HOMA_GRO_NORMAL (HOMA_GRO_SAME_CORE | HOMA_GRO_GEN2 | \
			 HOMA_GRO_SHORT_BYPASS | HOMA_GRO_FAST_GRANTS)

	/**
	 * @busy_usecs: if there has been activity on a core within the
	 * last @busy_usecs, it is considered to be busy and Homa will
	 * try to avoid scheduling other activities on the core. See
	 * balance.txt for more on load balancing. Set externally via sysctl.
	 */
	int busy_usecs;

	/**
	 * @busy_cycles: Same as busy_usecs except in homa_clock() units.
	 * NOTE(review): declared int while @poll_cycles is u64 — confirm
	 * that the converted value cannot overflow an int.
	 */
	int busy_cycles;

	/**
	 * @gro_busy_usecs: if the gap between the completion of
	 * homa_gro_receive and the next call to homa_gro_receive on the same
	 * core is less than this, then GRO on that core is considered to be
	 * "busy", and optimizations such as HOMA_GRO_SHORT_BYPASS will not be
	 * done because they risk overloading the core. Set externally via
	 * sysctl.
	 */
	int gro_busy_usecs;

	/**
	 * @gro_busy_cycles: Same as gro_busy_usecs except in homa_clock()
	 * units.
	 */
	int gro_busy_cycles;
#endif /* See strip.py */

	/**
	 * @timer_ticks: number of times that homa_timer has been invoked
	 * (may wraparound, which is safe).
	 */
	u32 timer_ticks;

	/**
	 * @flags: a collection of bits that can be set using sysctl
	 * to trigger various behaviors.
	 */
	int flags;

#ifndef __STRIP__ /* See strip.py */
	/**
	 * @freeze_type: determines conditions under which the time trace
	 * should be frozen. Set externally via sysctl.
	 */
	enum homa_freeze_type freeze_type;

	/**
	 * @accept_bits: determines how many consecutive packets will be
	 * accepted before the next bunch of packets is dropped (intervals
	 * between dropped packets are chosen uniformly from the
	 * range [0..1<<accept_bits)). Zero means don't drop any packets.
	 * Set externally via sysctl.
	 */
	int accept_bits;

	/**
	 * @drop_bits: determines how many consecutive packets are dropped
	 * when drops occur (counts are chosen uniformly from the
	 * range [1..1<<drop_bits]). Set externally via sysctl.
	 */
	int drop_bits;
#endif /* See strip.py */

	/**
	 * @bpage_lease_usecs: how long a core can own a bpage (microseconds)
	 * before its ownership can be revoked to reclaim the page.
	 */
	int bpage_lease_usecs;

	/**
	 * @bpage_lease_cycles: same as bpage_lease_usecs except in
	 * homa_clock() units.
	 */
	int bpage_lease_cycles;

	/**
	 * @next_id: Set via sysctl; causes next_outgoing_id to be set to
	 * this value; always reads as zero. Typically used while debugging to
	 * ensure that different nodes use different ranges of ids.
	 */
	int next_id;

	/**
	 * @destroyed: True means that this structure is being destroyed
	 * so everyone should clean up.
	 */
	bool destroyed;

#ifndef __UPSTREAM__ /* See strip.py */
	/**
	 * @sysctl_action: This value is set by sysctl to invoke one of
	 * several actions for testing. It is normally zero.
	 */
	int sysctl_action;

	/**
	 * @temp: the values in this array can be read and written with sysctl.
	 * They have no officially defined purpose, and are available for
	 * short-term use during testing.
	 */
	int temp[4];
#endif /* See strip.py */
};
/**
 * struct homa_net - Contains Homa information that is specific to a
 * particular network namespace.
 */
struct homa_net {
	/** @homa: Global Homa information. */
	struct homa *homa;

	/**
	 * @prev_default_port: The most recent port number assigned from
	 * the range of default ports.
	 */
	u16 prev_default_port;

	/**
	 * @num_peers: The total number of struct homa_peers that exist
	 * for this namespace. Managed by homa_peer.c under the peertab lock.
	 */
	int num_peers;
};
/**
 * struct homa_skb_info - Additional information needed by Homa for each
 * outbound DATA packet. Space is allocated for this at the very end of the
 * linear part of the skb.
 */
struct homa_skb_info {
	/** @next_skb: used to link together outgoing skb's for a message. */
	struct sk_buff *next_skb;

	/**
	 * @wire_bytes: total number of bytes of network bandwidth that
	 * will be consumed by this packet. This includes everything,
	 * including additional headers added by GSO, IP header, Ethernet
	 * header, CRC, preamble, and inter-packet gap.
	 */
	int wire_bytes;

	/**
	 * @data_bytes: total bytes of message data across all of the
	 * segments in this packet.
	 */
	int data_bytes;

	/** @seg_length: maximum number of data bytes in each GSO segment. */
	int seg_length;

	/**
	 * @offset: offset within the message of the first byte of data in
	 * this packet.
	 */
	int offset;

	/**
	 * @rpc: RPC that this packet belongs to. NOTE(review): declared
	 * void * rather than struct homa_rpc *, presumably to avoid header
	 * dependencies — confirm before changing the type.
	 */
	void *rpc;
};
/**
 * homa_get_skb_info() - Return the address of Homa's private information
 * for an sk_buff.
 * @skb:    Socket buffer whose info is needed.
 * Return:  address of Homa's private information for @skb.
 */
static inline struct homa_skb_info *homa_get_skb_info(struct sk_buff *skb)
{
	struct homa_skb_info *limit;

	/* The info occupies the last sizeof(struct homa_skb_info) bytes
	 * of the skb's linear data area.
	 */
	limit = (struct homa_skb_info *)skb_end_pointer(skb);
	return limit - 1;
}
/**
* homa_set_doff() - Fills in the doff TCP header field for a packet.
* @skb: Packet whose doff field is to be set.
* @size: Size of the "header" in bytes (must be a multiple of 4). This is
* needed for two reasons. First, for TSO to work it must indicate
* the number of bytes that should be replicated in each segment.
* The bytes after this will be distributed among segments. Second,
* for TCP hijacking to work it must have a valid value (20 is a
* good choice if the packet isn't a TSO frame).
*/
static inline void homa_set_doff(struct sk_buff *skb, int size)
{
tcp_hdr(skb)->doff = size >> 2;
}
/** skb_is_ipv6() - Return true if the packet is encapsulated with IPv6,
* false otherwise (presumably it's IPv4).
*/
static inline bool skb_is_ipv6(const struct sk_buff *skb)
{
return ipv6_hdr(skb)->version == 6;
}
/**
 * ipv6_to_ipv4() - Given an IPv6 address produced by ipv4_to_ipv6, return
 * the original IPv4 address (in network byte order).
 * @ip6:   IPv6 address; assumed to be a mapped IPv4 address.
 * Return: IPv4 address stored in @ip6.
 */
static inline __be32 ipv6_to_ipv4(const struct in6_addr ip6)
{
	/* For a v4-mapped address, the IPv4 address lives in the final
	 * 32-bit word.
	 */
	return ip6.s6_addr32[3];
}
/**
 * canonical_ipv6_addr() - Convert a socket address to the "standard"
 * form used in Homa, which is always an IPv6 address; if the original address
 * was IPv4, convert it to an IPv4-mapped IPv6 address.
 * @addr:  Address to canonicalize (if NULL, "any" is returned).
 * Return: IPv6 address corresponding to @addr.
 */
static inline struct in6_addr canonical_ipv6_addr(const union sockaddr_in_union
						  *addr)
{
	struct in6_addr mapped;

	if (!addr)
		return in6addr_any;
	if (addr->sa.sa_family == AF_INET6)
		return addr->in6.sin6_addr;

	/* Must be IPv4: embed it in an IPv4-mapped IPv6 address. */
	ipv6_addr_set_v4mapped(addr->in4.sin_addr.s_addr, &mapped);
	return mapped;
}
/**
 * skb_canonical_ipv6_saddr() - Given a packet buffer, return its source
 * address in the "standard" form used in Homa, which is always an IPv6
 * address; if the original address was IPv4, convert it to an IPv4-mapped
 * IPv6 address.
 * @skb:   The source address will be extracted from this packet buffer.
 * Return: IPv6 address for @skb's source machine.
 */
static inline struct in6_addr skb_canonical_ipv6_saddr(struct sk_buff *skb)
{
	struct in6_addr mapped;

	if (!skb_is_ipv6(skb)) {
		/* IPv4 packet: map the source address into IPv6 form. */
		ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &mapped);
		return mapped;
	}
	return ipv6_hdr(skb)->saddr;
}
/**
 * homa_make_header_avl() - Invokes pskb_may_pull to make sure that all the
 * Homa header information for a packet is in the linear part of the skb
 * where it can be addressed using skb_transport_header.
 * @skb:   Packet for which header is needed.
 * Return: The result of pskb_may_pull (true for success)
 */
static inline bool homa_make_header_avl(struct sk_buff *skb)
{
	int needed = skb_transport_header(skb) - skb->data + HOMA_MAX_HEADER;

	/* Never try to pull more bytes than the packet actually holds. */
	if (needed > skb->len)
		needed = skb->len;
	return pskb_may_pull(skb, needed);
}
#ifndef __UPSTREAM__ /* See strip.py */
#ifdef __UNIT_TEST__
void unit_log_printf(const char *separator, const char *format, ...)
__printf(2, 3);
#define UNIT_LOG unit_log_printf
void unit_hook(char *id);
#define UNIT_HOOK(msg) unit_hook(msg)
#else /* __UNIT_TEST__ */
#define UNIT_LOG(...)
#define UNIT_HOOK(...)
#endif /* __UNIT_TEST__ */
#endif /* See strip.py */
extern unsigned int homa_net_id;
void homa_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
struct homa_rpc *rpc);
void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb);
int homa_bind(struct socket *sk, struct sockaddr *addr,
int addr_len);
void homa_close(struct sock *sock, long timeout);
int homa_copy_to_user(struct homa_rpc *rpc);
void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc);
void homa_destroy(struct homa *homa);
void homa_dispatch_pkts(struct sk_buff *skb);
int homa_err_handler_v4(struct sk_buff *skb, u32 info);
int homa_err_handler_v6(struct sk_buff *skb,
struct inet6_skb_parm *opt, u8 type, u8 code,
int offset, __be32 info);
int homa_fill_data_interleaved(struct homa_rpc *rpc,
struct sk_buff *skb, struct iov_iter *iter);
struct homa_gap *homa_gap_alloc(struct list_head *next, int start, int end);
int homa_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
int homa_hash(struct sock *sk);
enum hrtimer_restart homa_hrtimer(struct hrtimer *timer);
int homa_init(struct homa *homa);
int homa_ioc_info(struct socket *sock, unsigned long arg);
int homa_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int homa_load(void);
int homa_message_out_fill(struct homa_rpc *rpc,
struct iov_iter *iter, int xmit);
void homa_message_out_init(struct homa_rpc *rpc, int length);
void homa_need_ack_pkt(struct sk_buff *skb, struct homa_sock *hsk,
struct homa_rpc *rpc);
void homa_net_destroy(struct homa_net *hnet);
void homa_net_exit(struct net *net);
int homa_net_init(struct homa_net *hnet, struct net *net,
struct homa *homa);
int homa_net_start(struct net *net);
__poll_t homa_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int flags, int *addr_len);
void homa_request_retrans(struct homa_rpc *rpc);
void homa_resend_pkt(struct sk_buff *skb, struct homa_rpc *rpc,
struct homa_sock *hsk);
void homa_rpc_handoff(struct homa_rpc *rpc);
int homa_rpc_tx_end(struct homa_rpc *rpc);
int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
int homa_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen);
int homa_shutdown(struct socket *sock, int how);
int homa_socket(struct sock *sk);
int homa_softirq(struct sk_buff *skb);
void homa_spin(int ns);
void homa_timer(struct homa *homa);
void homa_timer_check_rpc(struct homa_rpc *rpc);
int homa_timer_main(void *transport);
struct sk_buff *homa_tx_data_pkt_alloc(struct homa_rpc *rpc,
struct iov_iter *iter, int offset,
int length, int max_seg_data);
void homa_unhash(struct sock *sk);
void homa_rpc_unknown_pkt(struct sk_buff *skb, struct homa_rpc *rpc);
void homa_unload(void);
int homa_wait_private(struct homa_rpc *rpc, int nonblocking);
struct homa_rpc *homa_wait_shared(struct homa_sock *hsk, int nonblocking);
int homa_xmit_control(enum homa_packet_type type, void *contents,
size_t length, struct homa_rpc *rpc);
int __homa_xmit_control(void *contents, size_t length,
struct homa_peer *peer, struct homa_sock *hsk);
void homa_xmit_unknown(struct sk_buff *skb, struct homa_sock *hsk);
#ifndef __STRIP__ /* See strip.py */
void homa_cutoffs_pkt(struct sk_buff *skb, struct homa_sock *hsk);
int homa_dointvec(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos);
void homa_incoming_sysctl_changed(struct homa *homa);
int homa_ioc_abort(struct socket *sock, unsigned long arg);
int homa_message_in_init(struct homa_rpc *rpc, int length,
int unsched);
void homa_prios_changed(struct homa *homa);
void homa_resend_data(struct homa_rpc *rpc, int start, int end,
int priority);
int homa_sysctl_softirq_cores(const struct ctl_table *table,
int write, void *buffer, size_t *lenp,
loff_t *ppos);
int homa_unsched_priority(struct homa *homa, struct homa_peer *peer,
int length);
void homa_xmit_data(struct homa_rpc *rpc, bool force);
void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc,
int priority);
#else /* See strip.py */
int homa_message_in_init(struct homa_rpc *rpc, int unsched);
void homa_resend_data(struct homa_rpc *rpc, int start, int end);
void homa_xmit_data(struct homa_rpc *rpc);
void __homa_xmit_data(struct sk_buff *skb, struct homa_rpc *rpc);
#endif /* See strip.py */
/**
 * homa_net() - Return the struct homa_net associated with a particular
 * struct net.
 * @net:   Get the Homa data for this net namespace.
 * Return: see above.
 */
static inline struct homa_net *homa_net(struct net *net)
{
	void *priv = net_generic(net, homa_net_id);

	return (struct homa_net *)priv;
}
/**
 * homa_clock() - Return a fine-grain clock value that is monotonic and
 * consistent across cores.
 * Return: see above.
 */
static inline u64 homa_clock(void)
{
	/* This function exists to make it easy to switch time sources
	 * if/when new or better sources become available.
	 */
#ifdef __UNIT_TEST__
	/* Unit tests supply a mock clock so time can be controlled. */
	u64 mock_get_clock(void);

	return mock_get_clock();
#else /* __UNIT_TEST__ */
#ifndef __UPSTREAM__ /* See strip.py */
	/* As of August 2025, get_cycles takes only about 8 ns/call, vs.
	 * 14 ns/call for ktime_get_ns. This saves about .24 core when
	 * driving a 25 Gbps network at high load (see perf.txt for details).
	 * Unfortunately, Linux reviewers will not allow get_cycles in the
	 * upstreamed version.
	 */
	return get_cycles();
#else /* See strip.py */
	return ktime_get_ns();
#endif /* See strip.py */
#endif /* __UNIT_TEST__ */
}
/**
 * homa_clock_khz() - Return the frequency of the values returned by
 * homa_clock, in units of KHz.
 * Return: see above.
 */
static inline u64 homa_clock_khz(void)
{
#ifdef __UNIT_TEST__
	/* Mock clock ticks in nanoseconds, i.e. 1e6 KHz. */
	return 1000000;
#else /* __UNIT_TEST__ */
#ifndef __UPSTREAM__ /* See strip.py */
#ifdef CONFIG_X86
	/* homa_clock uses get_cycles (TSC) on x86. */
	return tsc_khz;
#elif defined(CONFIG_ARM64)
	/* Architected timer frequency, converted from Hz to KHz. */
	return arch_timer_get_cntfrq() / 1000;
#else
	return 1000000;
#endif
#else /* See strip.py */
	/* Upstream version uses ktime_get_ns, which is 1e6 KHz. */
	return 1000000;
#endif /* See strip.py */
#endif /* __UNIT_TEST__ */
}
/**
 * homa_ns_to_cycles() - Convert from units of nanoseconds to units of
 * homa_clock().
 * @ns:    A time measurement in nanoseconds
 * Return: The time in homa_clock() units corresponding to @ns.
 */
static inline u64 homa_ns_to_cycles(u64 ns)
{
#ifdef __UNIT_TEST__
	return ns;
#else /* __UNIT_TEST__ */
	u64 scaled = ns * homa_clock_khz();

	/* Multiply first, then divide, to preserve precision;
	 * do_div modifies its first argument in place.
	 */
	do_div(scaled, 1000000);
	return scaled;
#endif /* __UNIT_TEST__ */
}
#ifndef __STRIP__ /* See strip.py */
/**
 * homa_usecs_to_cycles() - Convert from units of microseconds to units of
 * homa_clock().
 * @usecs: A time measurement in microseconds
 * Return: The time in homa_clock() units corresponding to @usecs.
 */
static inline u64 homa_usecs_to_cycles(u64 usecs)
{
#ifdef __UNIT_TEST__
	return usecs * 1000;
#else /* __UNIT_TEST__ */
	u64 scaled = usecs * homa_clock_khz();

	/* Multiply first, then divide, to preserve precision;
	 * do_div modifies its first argument in place.
	 */
	do_div(scaled, 1000);
	return scaled;
#endif /* __UNIT_TEST__ */
}
#endif /* See strip.py */
#ifndef __STRIP__ /* See strip.py */
/**
 * homa_high_priority() - Return the next-to-highest available priority
 * level. Used in situations where we want to boost the priority of
 * something but don't want to interfere with the highest priority packets
 * such as control packets.
 * @homa: Overall information about the Homa protocol.
 * Return: See above.
 */
static inline int homa_high_priority(struct homa *homa)
{
	/* With two or fewer levels there is no room to boost; use the
	 * lowest priority.
	 */
	if (homa->num_priorities <= 2)
		return 0;
	return homa->num_priorities - 2;
}
#endif /* See strip.py */
/* Homa Locking Strategy:
*
* (Note: this documentation is referenced in several other places in the
* Homa code)
*
* In the Linux TCP/IP stack the primary locking mechanism is a sleep-lock
* per socket. However, per-socket locks aren't adequate for Homa, because
* sockets are "larger" in Homa. In TCP, a socket corresponds to a single
* connection between two peers; an application can have hundreds or
* thousands of sockets open at once, so per-socket locks leave lots of
* opportunities for concurrency. With Homa, a single socket can be used for
* communicating with any number of peers, so there will typically be just
* one socket per thread. As a result, a single Homa socket must support many
* concurrent RPCs efficiently, and a per-socket lock would create a bottleneck
* (Homa tried this approach initially).
*
 * Thus, the primary locks used in Homa are spinlocks at RPC granularity. This
* allows operations on different RPCs for the same socket to proceed
* concurrently. Homa also has socket locks (which are spinlocks different
* from the official socket sleep-locks) but these are used much less
* frequently than RPC locks.
*
* Lock Ordering:
*
* There are several other locks in Homa besides RPC locks, all of which
* are spinlocks. When multiple locks are held, they must be acquired in a
* consistent order in order to prevent deadlock. Here are the rules for Homa:
* 1. Except for RPC and socket locks, all locks should be considered
* "leaf" locks: don't acquire other locks while holding them.
* 2. The lock order is:
* * RPC lock
* * Socket lock
* * Other lock
*
* It may seem surprising that RPC locks are acquired *before* socket locks,
* but this is essential for high performance. Homa has been designed so that
* many common operations (such as processing input packets) can be performed
* while holding only an RPC lock; this allows operations on different RPCs
* to proceed in parallel. Only a few operations, such as handing off an
* incoming message to a waiting thread, require the socket lock. If socket
* locks had to be acquired first, any operation that might eventually need
* the socket lock would have to acquire it before the RPC lock, which would
* severely restrict concurrency.
*
* Socket Shutdown:
*
* It is possible for socket shutdown to begin while operations are underway
* that hold RPC locks but not the socket lock. For example, a new RPC
* creation might be underway when a socket is shut down. The RPC creation
* will eventually acquire the socket lock and add the new RPC to those
* for the socket; it would be very bad if this were to happen after
 * homa_sock_shutdown has deleted all RPCs for the socket.
* In general, any operation that acquires a socket lock must check
* hsk->shutdown after acquiring the lock and abort if hsk->shutdown is set.
*
* Spinlock Implications:
*
* Homa uses spinlocks exclusively; this is needed because locks typically
* need to be acquired at atomic level, such as in SoftIRQ code.
*
* Operations that can block, such as memory allocation and copying data
* to/from user space, are not permitted while holding spinlocks (spinlocks
 * disable interrupts, so the holder must not block). This results in awkward
* code in several places to move restricted operations outside locked
* regions. Such code typically looks like this:
* - Acquire a reference on an object such as an RPC, in order to prevent
* the object from being deleted.
* - Release the object's lock.
* - Perform the restricted operation.
* - Re-acquire the lock.
* - Release the reference.
* It is possible that the object may have been modified by some other party
* while it was unlocked, so additional checks may be needed after reacquiring
* the lock. As one example, an RPC may have been terminated, in which case
* any operation in progress on that RPC should be aborted after reacquiring
* the lock.
*
* Lists of RPCs:
*
* There are a few places where Homa needs to process all of the RPCs
* associated with a socket, such as the timer. Such code must first lock
* the socket (to protect access to the link pointers) then lock
* individual RPCs on the list. However, this violates the rules for locking
* order. It isn't safe to unlock the socket before locking the individual RPCs,
* because RPCs could be deleted and their memory recycled between the unlock
* of the socket lock and the lock of the RPC; this could result in corruption.
* Homa uses two different approaches to handle this situation:
* 1. Use ``homa_protect_rpcs`` to prevent RPC reaping for a socket. RPCs can
* still be terminated, but their memory won't go away until
* homa_unprotect_rpcs is invoked. This allows the socket lock to be
* released before acquiring RPC locks; after acquiring each RPC lock,
* the RPC must be checked to see if it has been terminated; if so, skip it.
* 2. Use ``spin_trylock_bh`` to acquire the RPC lock while still holding the
* socket lock. If this fails, then release the socket lock and retry
* both the socket lock and the RPC lock. Of course, the state of both
* socket and RPC could change before the locks are finally acquired.
*/
#endif /* _HOMA_IMPL_H */