Skip to content

Commit ab54a7a

Browse files
author
zhitengqiu
committed
mm: Enhanced copy capabilities for Hygon processor
hygon inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IAQQDF CVE: NA --------------------------- The following techniques are used to improve the performance of large memory copies between kernel and user mode on Hygon processors. Prefetching reads blocks of data from main memory into the cache at very high data rates so that they can be operated on within the cache; the results are then written back out to memory, all with high efficiency. The code also employs a special kind of instruction: the non-temporal (NT) store. This is a streaming store instruction for writing data to memory; it bypasses the on-chip cache and sends data directly into a write-combining buffer. Because an NT store lets the CPU avoid reading the old data at the destination address, it can effectively improve the total write bandwidth. There are similar optimizations for reading data from memory. An interrupt may occur while a large block of memory is being copied, which may trigger a thread switch. In that case the current MMX register context must be saved so that the copy can continue when the thread is switched back in. Signed-off-by: zhuchao <zhuchao@hygon.cn> Signed-off-by: qiuzhiteng <qiuzhiteng@hygon.cn>
1 parent c85b8c8 commit ab54a7a

16 files changed

Lines changed: 1079 additions & 0 deletions

File tree

arch/x86/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -899,6 +899,7 @@ config INTEL_TDX_GUEST
899899
endif # HYPERVISOR_GUEST
900900

901901
source "arch/x86/Kconfig.cpu"
902+
source "arch/x86/Kconfig.fpu"
902903

903904
config HPET_TIMER
904905
def_bool X86_64

arch/x86/Kconfig.fpu

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# SPDX-License-Identifier: GPL-2.0
2+
3+
menuconfig USING_FPU_IN_KERNEL_NONATOMIC
4+
bool "Hygon large memory copy support"
5+
help
6+
This option enables support for optimized large memory copy operations
7+
on Hygon processors in the kernel space using SSE2 or AVX2 non-temporal (NT)
8+
copy instructions. NT instructions are streaming store instructions that bypass
9+
the on-chip cache and send data directly to a write-combining buffer.
10+
11+
When this option is enabled, you can choose the specific instruction set to use
12+
for large memory copy: SSE2 or AVX2. Using these instruction sets can improve data
13+
throughput and reduce the number of cache misses during memory copy operations.
14+
15+
if USING_FPU_IN_KERNEL_NONATOMIC
16+
17+
choice
18+
prompt "X86_HYGON_LMC"
19+
depends on X86_64 && CPU_SUP_HYGON
20+
default X86_HYGON_LMC_AVX2_ON
21+
help
22+
Select the type of non-temporal (NT) copy instructions to use for
23+
large memory copy operations between kernel and user mode. You can
24+
choose between SSE2 or AVX2 instructions based on the processor
25+
capabilities and the size of the memory being copied.
26+
27+
To use this feature, you also need to configure the minimum data copy size.
28+
The threshold is set through `/sys/c86_features/hygon_c86/nt_cpy_mini_len`.
29+
A value of 4096 or above is recommended.
30+
31+
config X86_HYGON_LMC_SSE2_ON
32+
bool "Using sse2 nt copy for large memory copy"
33+
help
34+
When this feature is enabled, the kernel will use the
35+
copy_user_sse2_opt_string function for large memory copy operations.
36+
37+
SSE2 (Streaming SIMD Extensions 2) instructions support non-temporal
38+
(NT) stores that bypass the CPU cache and write data directly to
39+
memory. This can improve performance for large memory copies by reducing
40+
cache pollution and taking advantage of the write-combining buffer.
41+
42+
However, using SSE2 NT copy may require saving and restoring MMX and
43+
SSE2 register contexts during thread switching if an interruption occurs.
44+
45+
config X86_HYGON_LMC_AVX2_ON
46+
bool "Using avx2 nt copy for large memory copy"
47+
help
48+
When this feature is enabled, the kernel will use the
49+
copy_user_avx2_pf64_nt_string function for large memory copy operations.
50+
51+
AVX2 (Advanced Vector Extensions 2) instructions provide enhanced
52+
vector processing capabilities and support for non-temporal (NT) stores,
53+
which can significantly improve memory copy performance for large blocks
54+
of data. By bypassing the cache and writing data directly to memory,
55+
AVX2 NT copy can achieve higher throughput than SSE2 NT copy.
56+
57+
Similar to SSE2, using AVX2 NT copy may require saving and restoring
58+
AVX2 register contexts if an interruption occurs during large memory
59+
copying, to ensure the process continues smoothly after resuming.
60+
61+
endchoice
62+
endif

arch/x86/configs/deepin_x86_desktop_defconfig

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5897,3 +5897,6 @@ CONFIG_IO_DELAY_0XED=y
58975897
# CONFIG_X86_DEBUG_FPU is not set
58985898
CONFIG_UNWINDER_FRAME_POINTER=y
58995899
# CONFIG_RUNTIME_TESTING_MENU is not set
5900+
CONFIG_USING_FPU_IN_KERNEL_NONATOMIC=y
5901+
# CONFIG_X86_HYGON_LMC_SSE2_ON is not set
5902+
CONFIG_X86_HYGON_LMC_AVX2_ON=y

arch/x86/include/asm/fpu/api.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,43 @@ static inline void kernel_fpu_begin(void)
4949
#endif
5050
}
5151

52+
#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
53+
defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
54+
extern int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask);
55+
extern void kernel_fpu_end_nonatomic(void);
56+
57+
/* Code that is unaware of kernel_fpu_begin_nonatomic_mask() can use this */
58+
static inline int kernel_fpu_begin_nonatomic(void)
59+
{
60+
#ifdef CONFIG_X86_64
61+
/*
62+
* Any 64-bit code that uses 387 instructions must explicitly request
63+
* KFPU_387.
64+
*/
65+
return kernel_fpu_begin_nonatomic_mask(KFPU_MXCSR);
66+
#else
67+
/*
68+
* 32-bit kernel code may use 387 operations as well as SSE2, etc,
69+
* as long as it checks that the CPU has the required capability.
70+
*/
71+
return kernel_fpu_begin_nonatomic_mask(KFPU_387 | KFPU_MXCSR);
72+
#endif
73+
}
74+
75+
/*
76+
* It means we call kernel_fpu_end after kernel_fpu_begin_nonatomic
77+
* func, but before kernel_fpu_end_nonatomic
78+
*/
79+
static inline void check_using_kernel_fpu(void)
80+
{
81+
WARN_ON_ONCE(test_thread_flag(TIF_USING_FPU_NONATOMIC));
82+
}
83+
84+
#else
85+
/* Neither Hygon LMC variant is configured: there is nothing to check. */
static inline void check_using_kernel_fpu(void)
{
}
86+
87+
#endif
88+
5289
/*
5390
* Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate.
5491
* A context switch will (and softirq might) save CPU's FPU registers to

arch/x86/include/asm/fpu/sched.h

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,4 +66,62 @@ static inline void switch_fpu_finish(void)
6666
set_thread_flag(TIF_NEED_FPU_LOAD);
6767
}
6868

69+
/*
70+
* Kernel FPU state switching for scheduling.
71+
*
72+
* This is a two-stage process:
73+
*
74+
* - switch_kernel_fpu_prepare() saves the old kernel fpu state.
75+
* This is done within the context of the old process.
76+
*
77+
* - switch_kernel_fpu_finish() restores the new kernel fpu state.
78+
*
79+
* The kernel FPU context is only stored/restored for a user task in kernel
80+
* mode and PF_KTHREAD is used to distinguish between kernel and user threads.
81+
*/
82+
#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
83+
defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
84+
extern void save_fpregs_to_fpkernelstate(struct fpu *kfpu);
85+
extern unsigned long get_fpu_registers_pos(struct fpu *fpu, unsigned int off);
86+
static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu)
87+
{
88+
struct fpu *old_fpu = &prev->thread.fpu;
89+
90+
if (!test_thread_flag(TIF_USING_FPU_NONATOMIC))
91+
return;
92+
93+
if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD))
94+
save_fpregs_to_fpkernelstate(old_fpu);
95+
}
96+
97+
/* Internal helper for switch_kernel_fpu_finish() and signal frame setup */
98+
static inline void fpregs_restore_kernelregs(struct fpu *kfpu)
99+
{
100+
kernel_fpu_states_restore(NULL, (void *)get_fpu_registers_pos(kfpu, MAX_FPU_CTX_SIZE),
101+
MAX_FPU_CTX_SIZE);
102+
}
103+
104+
/* Loading of the complete FPU state immediately. */
105+
static inline void switch_kernel_fpu_finish(struct task_struct *next)
106+
{
107+
struct fpu *new_fpu = &next->thread.fpu;
108+
109+
if (next->flags & PF_KTHREAD)
110+
return;
111+
112+
if (cpu_feature_enabled(X86_FEATURE_FPU) &&
113+
test_ti_thread_flag((struct thread_info *)next,
114+
TIF_USING_FPU_NONATOMIC))
115+
fpregs_restore_kernelregs(new_fpu);
116+
}
117+
#else
118+
/* Neither Hygon LMC variant is configured: no kernel FPU state to save. */
static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu) { }
121+
/* Neither Hygon LMC variant is configured: no kernel FPU state to restore. */
static inline void switch_kernel_fpu_finish(struct task_struct *next) { }
124+
125+
#endif
126+
69127
#endif /* _ASM_X86_FPU_SCHED_H */

arch/x86/include/asm/thread_info.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ struct thread_info {
103103
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
104104
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
105105
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
106+
#define TIF_USING_FPU_NONATOMIC 30 /* using fpu in kernel non-atomic context */
106107

107108
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
108109
#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)

arch/x86/include/asm/uaccess_64.h

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@
1111
#include <asm/alternative.h>
1212
#include <asm/cpufeatures.h>
1313
#include <asm/page.h>
14+
#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
15+
defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
16+
#include <asm/fpu/api.h>
17+
#endif
18+
19+
extern struct static_key_false hygon_lmc_key;
1420

1521
#ifdef CONFIG_ADDRESS_MASKING
1622
/*
@@ -97,13 +103,91 @@ static inline bool __access_ok(const void __user *ptr, unsigned long size)
97103
* Copy To/From Userspace
98104
*/
99105

106+
#ifdef CONFIG_X86_HYGON_LMC_SSE2_ON
107+
void fpu_save_xmm0_3(void *to, const void *from, unsigned long len);
108+
void fpu_restore_xmm0_3(void *to, const void *from, unsigned long len);
109+
110+
#define kernel_fpu_states_save fpu_save_xmm0_3
111+
#define kernel_fpu_states_restore fpu_restore_xmm0_3
112+
113+
__must_check unsigned long copy_user_sse2_opt_string(void *to, const void *from,
114+
unsigned long len);
115+
116+
#define MAX_FPU_CTX_SIZE 64
117+
#define KERNEL_FPU_NONATOMIC_SIZE (2 * (MAX_FPU_CTX_SIZE))
118+
119+
#define copy_user_large_memory_generic_string copy_user_sse2_opt_string
120+
121+
#endif
122+
123+
#ifdef CONFIG_X86_HYGON_LMC_AVX2_ON
124+
void fpu_save_ymm0_7(void *to, const void *from, unsigned long len);
125+
void fpu_restore_ymm0_7(void *to, const void *from, unsigned long len);
126+
127+
#define kernel_fpu_states_save fpu_save_ymm0_7
128+
#define kernel_fpu_states_restore fpu_restore_ymm0_7
129+
130+
__must_check unsigned long
131+
copy_user_avx2_pf64_nt_string(void *to, const void *from, unsigned long len);
132+
133+
#define MAX_FPU_CTX_SIZE 256
134+
#define KERNEL_FPU_NONATOMIC_SIZE (2 * (MAX_FPU_CTX_SIZE))
135+
136+
#define copy_user_large_memory_generic_string copy_user_avx2_pf64_nt_string
137+
#endif
138+
139+
#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
140+
defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
141+
unsigned int get_nt_block_copy_mini_len(void);
142+
static inline bool Hygon_LMC_check(unsigned long len)
143+
{
144+
unsigned int nt_blk_cpy_mini_len = get_nt_block_copy_mini_len();
145+
146+
if (((nt_blk_cpy_mini_len) && (nt_blk_cpy_mini_len <= len) &&
147+
(system_state == SYSTEM_RUNNING) &&
148+
(!kernel_fpu_begin_nonatomic())))
149+
return true;
150+
else
151+
return false;
152+
}
153+
/*
 * Run the configured large-memory copy routine and then leave the non-atomic
 * kernel FPU section with kernel_fpu_end_nonatomic().
 *
 * Returns whatever the copy routine returned.
 * NOTE(review): presumably the number of bytes left uncopied, as for the
 * other copy_user helpers - confirm against the assembly implementation.
 */
static inline unsigned long
copy_large_memory_generic_string(void *to, const void *from, unsigned long len)
{
	const unsigned long result =
		copy_user_large_memory_generic_string(to, from, len);

	kernel_fpu_end_nonatomic();

	return result;
}
162+
#else
163+
/*
 * Fallback when neither Hygon LMC variant is configured: the large-memory
 * copy path is never eligible, whatever @len is.
 */
static inline bool Hygon_LMC_check(unsigned long len)
{
	return false;
}

/*
 * Fallback when neither Hygon LMC variant is configured.
 *
 * This stub copies nothing, so it reports all @len bytes as not copied
 * instead of returning 0, which a caller would read as a fully successful
 * copy. It is not expected to be reached, because the fallback
 * Hygon_LMC_check() always returns false.
 */
static inline unsigned long
copy_large_memory_generic_string(void *to, const void *from, unsigned long len)
{
	return len;
}
172+
#endif
173+
100174
/* Handles exceptions in both to and from, but doesn't do access_ok */
101175
__must_check unsigned long
102176
rep_movs_alternative(void *to, const void *from, unsigned len);
103177

104178
static __always_inline __must_check unsigned long
105179
copy_user_generic(void *to, const void *from, unsigned long len)
106180
{
181+
/* Check if Hygon large memory copy support enabled. */
182+
if (static_branch_unlikely(&hygon_lmc_key)) {
183+
if (Hygon_LMC_check(len)) {
184+
unsigned long ret;
185+
186+
ret = copy_large_memory_generic_string(to, from, len);
187+
return ret;
188+
}
189+
}
190+
107191
stac();
108192
/*
109193
* If CPU has FSRM feature, use 'rep movs'.

arch/x86/kernel/cpu/common.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ EXPORT_SYMBOL_GPL(get_llc_id);
9292
/* L2 cache ID of each logical CPU */
9393
DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
9494

95+
DEFINE_STATIC_KEY_FALSE(hygon_lmc_key);
96+
EXPORT_SYMBOL_GPL(hygon_lmc_key);
97+
9598
static struct ppin_info {
9699
int feature;
97100
int msr_ppin_ctl;
@@ -2511,6 +2514,17 @@ void arch_smt_update(void)
25112514
apic_smt_update();
25122515
}
25132516

2517+
#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
2518+
defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
2519+
static inline void update_lmc_branch_cond(void)
2520+
{
2521+
if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
2522+
static_branch_enable(&hygon_lmc_key);
2523+
}
2524+
#else
2525+
/* Neither Hygon LMC variant is configured: the static key stays disabled. */
static inline void update_lmc_branch_cond(void)
{
}
2526+
#endif
2527+
25142528
void __init arch_cpu_finalize_init(void)
25152529
{
25162530
identify_boot_cpu();
@@ -2530,6 +2544,8 @@ void __init arch_cpu_finalize_init(void)
25302544

25312545
arch_smt_update();
25322546

2547+
update_lmc_branch_cond();
2548+
25332549
if (IS_ENABLED(CONFIG_X86_32)) {
25342550
/*
25352551
* Check whether this is a real i386 which is not longer

0 commit comments

Comments
 (0)