Skip to content

Commit 18bc0b9

Browse files
committed
Improve alignment
Signed-off-by: Heinz N. Gies <heinz@licenser.net>
1 parent 5a62202 commit 18bc0b9

8 files changed

Lines changed: 110 additions & 112 deletions

File tree

src/impls/avx2/stage1.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,14 @@ impl Stage1Parse for SimdInput {
4444
type Utf8Validator = simdutf8::basic::imp::x86::avx2::ChunkedUtf8ValidatorImp;
4545
type SimdRepresentation = __m256i;
4646
#[cfg_attr(not(feature = "no-inline"), inline)]
47-
// _mm256_loadu_si256 does not need alignment
47+
// _mm256_loadu_si256 does not need alignment we allign our input so we can use _mm256_loadu_si256
4848
#[allow(clippy::cast_ptr_alignment)]
4949
#[target_feature(enable = "avx2")]
50-
unsafe fn new(ptr: &[u8]) -> Self {
50+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
5151
unsafe {
5252
Self {
53-
v0: _mm256_loadu_si256(ptr.as_ptr().cast::<__m256i>()),
54-
v1: _mm256_loadu_si256(ptr.as_ptr().add(32).cast::<__m256i>()),
53+
v0: _mm256_load_si256(ptr.as_ptr().cast::<__m256i>()),
54+
v1: _mm256_load_si256(ptr.as_ptr().add(32).cast::<__m256i>()),
5555
}
5656
}
5757
}

src/impls/native/stage1.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -292,13 +292,13 @@ pub(crate) struct SimdInput {
292292
impl Stage1Parse for SimdInput {
293293
type Utf8Validator = super::ChunkedUtf8ValidatorImp;
294294
type SimdRepresentation = V128;
295-
unsafe fn new(ptr: &[u8]) -> Self {
295+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
296296
unsafe {
297297
SimdInput {
298-
v0: *(ptr.as_ptr().cast::<V128>()),
299-
v1: *(ptr.as_ptr().add(16).cast::<V128>()),
300-
v2: *(ptr.as_ptr().add(32).cast::<V128>()),
301-
v3: *(ptr.as_ptr().add(48).cast::<V128>()),
298+
v0: ptr.as_ptr().cast::<V128>().read(),
299+
v1: ptr.as_ptr().add(16).cast::<V128>().read(),
300+
v2: ptr.as_ptr().add(32).cast::<V128>().read(),
301+
v3: ptr.as_ptr().add(48).cast::<V128>().read(),
302302
}
303303
}
304304
}

src/impls/neon/stage1.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::Stage1Parse;
1+
use crate::{SIMDINPUT_LENGTH, Stage1Parse, macros::static_cast_i32};
22
use std::arch::aarch64::{
33
int8x16_t, int32x4_t, uint8x16_t, vaddq_s32, vandq_u8, vceqq_u8, vcleq_u8, vdupq_n_s8,
44
vgetq_lane_u64, vld1q_u8, vmovq_n_u8, vpaddq_u8, vqtbl1q_u8, vreinterpretq_u8_s8,
@@ -57,13 +57,13 @@ impl Stage1Parse for SimdInput {
5757
type Utf8Validator = simdutf8::basic::imp::aarch64::neon::ChunkedUtf8ValidatorImp;
5858
type SimdRepresentation = int8x16_t;
5959
#[cfg_attr(not(feature = "no-inline"), inline)]
60-
unsafe fn new(ptr: &[u8]) -> Self {
60+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
6161
unsafe {
6262
Self {
63-
v0: vld1q_u8(ptr.as_ptr().cast::<u8>()),
64-
v1: vld1q_u8(ptr.as_ptr().add(16).cast::<u8>()),
65-
v2: vld1q_u8(ptr.as_ptr().add(32).cast::<u8>()),
66-
v3: vld1q_u8(ptr.as_ptr().add(48).cast::<u8>()),
63+
v0: vld1q_u8(ptr.as_ptr()),
64+
v1: vld1q_u8(ptr.as_ptr().add(16)),
65+
v2: vld1q_u8(ptr.as_ptr().add(32)),
66+
v3: vld1q_u8(ptr.as_ptr().add(48)),
6767
}
6868
}
6969
}

src/impls/portable/stage1.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ impl Stage1Parse for SimdInput {
1010
type Utf8Validator = simdutf8::basic::imp::portable::ChunkedUtf8ValidatorImp;
1111
type SimdRepresentation = u8x64;
1212
#[cfg_attr(not(feature = "no-inline"), inline)]
13-
unsafe fn new(ptr: &[u8]) -> Self {
13+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
1414
Self {
15-
v: u8x64::from_array(*ptr.as_ptr().cast::<[u8; 64]>()),
15+
v: u8x64::from_array(ptr),
1616
}
1717
}
1818

src/impls/simd128/stage1.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ impl Stage1Parse for SimdInput {
1818

1919
#[cfg_attr(not(feature = "no-inline"), inline)]
2020
#[allow(clippy::cast_ptr_alignment)]
21-
unsafe fn new(ptr: &[u8]) -> Self {
21+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
2222
Self {
2323
v0: v128_load(ptr.as_ptr().cast::<v128>()),
2424
v1: v128_load(ptr.as_ptr().add(16).cast::<v128>()),

src/impls/sse42/stage1.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,13 @@ impl Stage1Parse for SimdInput {
4848
#[target_feature(enable = "sse4.2")]
4949
#[cfg_attr(not(feature = "no-inline"), inline)]
5050
#[allow(clippy::cast_ptr_alignment)]
51-
unsafe fn new(ptr: &[u8]) -> Self {
51+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self {
5252
unsafe {
5353
Self {
54-
v0: _mm_loadu_si128(ptr.as_ptr().cast::<arch::__m128i>()),
55-
v1: _mm_loadu_si128(ptr.as_ptr().add(16).cast::<arch::__m128i>()),
56-
v2: _mm_loadu_si128(ptr.as_ptr().add(32).cast::<arch::__m128i>()),
57-
v3: _mm_loadu_si128(ptr.as_ptr().add(48).cast::<arch::__m128i>()),
54+
v0: _mm_load_si128(ptr.as_ptr().cast::<arch::__m128i>()),
55+
v1: _mm_load_si128(ptr.as_ptr().add(16).cast::<arch::__m128i>()),
56+
v2: _mm_load_si128(ptr.as_ptr().add(32).cast::<arch::__m128i>()),
57+
v3: _mm_load_si128(ptr.as_ptr().add(48).cast::<arch::__m128i>()),
5858
}
5959
}
6060
}

src/lib.rs

Lines changed: 65 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ pub(crate) trait Stage1Parse {
150150
type Utf8Validator: ChunkedUtf8Validator;
151151
type SimdRepresentation;
152152

153-
unsafe fn new(ptr: &[u8]) -> Self;
153+
unsafe fn new(ptr: [u8; SIMDINPUT_LENGTH]) -> Self;
154154

155155
unsafe fn compute_quote_mask(quote_bits: u64) -> u64;
156156

@@ -665,6 +665,7 @@ impl Deserializer<'_> {
665665
#[cfg_attr(not(feature = "no-inline"), inline)]
666666
pub(crate) unsafe fn find_structural_bits(
667667
input: &[u8],
668+
len: usize,
668669
structural_indexes: &mut Vec<u32>,
669670
) -> std::result::Result<(), ErrorType> {
670671
// This is a nasty hack, we don't have a chunked implementation for native rust
@@ -675,18 +676,23 @@ impl Deserializer<'_> {
675676
};
676677
#[cfg(not(feature = "portable"))]
677678
unsafe {
678-
Self::_find_structural_bits::<impls::native::SimdInput>(input, structural_indexes)
679+
Self::_find_structural_bits::<impls::native::SimdInput>(input, len, structural_indexes)
679680
}
680681
}
681682

682683
#[cfg(all(feature = "portable", not(feature = "runtime-detection")))]
683684
#[cfg_attr(not(feature = "no-inline"), inline)]
684685
pub(crate) unsafe fn find_structural_bits(
685686
input: &[u8],
687+
len: usize,
686688
structural_indexes: &mut Vec<u32>,
687689
) -> std::result::Result<(), ErrorType> {
688690
unsafe {
689-
Self::_find_structural_bits::<impls::portable::SimdInput>(input, structural_indexes)
691+
Self::_find_structural_bits::<impls::portable::SimdInput>(
692+
input,
693+
len,
694+
structural_indexes,
695+
)
690696
}
691697
}
692698

@@ -698,9 +704,12 @@ impl Deserializer<'_> {
698704
#[cfg_attr(not(feature = "no-inline"), inline)]
699705
pub(crate) unsafe fn find_structural_bits(
700706
input: &[u8],
707+
len: usize,
701708
structural_indexes: &mut Vec<u32>,
702709
) -> std::result::Result<(), ErrorType> {
703-
unsafe { Self::_find_structural_bits::<impls::avx2::SimdInput>(input, structural_indexes) }
710+
unsafe {
711+
Self::_find_structural_bits::<impls::avx2::SimdInput>(input, len, structural_indexes)
712+
}
704713
}
705714

706715
#[cfg(all(
@@ -712,28 +721,35 @@ impl Deserializer<'_> {
712721
#[cfg_attr(not(feature = "no-inline"), inline)]
713722
pub(crate) unsafe fn find_structural_bits(
714723
input: &[u8],
724+
len: usize,
715725
structural_indexes: &mut Vec<u32>,
716726
) -> std::result::Result<(), ErrorType> {
717-
unsafe { Self::_find_structural_bits::<impls::sse42::SimdInput>(input, structural_indexes) }
727+
unsafe {
728+
Self::_find_structural_bits::<impls::sse42::SimdInput>(input, len, structural_indexes)
729+
}
718730
}
719731

720732
#[cfg(all(target_arch = "aarch64", not(feature = "portable")))]
721733
#[cfg_attr(not(feature = "no-inline"), inline)]
722734
pub(crate) unsafe fn find_structural_bits(
723-
input: &[u8],
735+
input: &AlignedBuf,
736+
len: usize,
724737
structural_indexes: &mut Vec<u32>,
725738
) -> std::result::Result<(), ErrorType> {
726-
unsafe { Self::_find_structural_bits::<impls::neon::SimdInput>(input, structural_indexes) }
739+
unsafe {
740+
Self::_find_structural_bits::<impls::neon::SimdInput>(input, len, structural_indexes)
741+
}
727742
}
728743

729744
#[cfg(all(target_feature = "simd128", not(feature = "portable")))]
730745
#[cfg_attr(not(feature = "no-inline"), inline)]
731746
pub(crate) unsafe fn find_structural_bits(
732747
input: &[u8],
748+
len: usize,
733749
structural_indexes: &mut Vec<u32>,
734750
) -> std::result::Result<(), ErrorType> {
735751
unsafe {
736-
Self::_find_structural_bits::<impls::simd128::SimdInput>(input, structural_indexes)
752+
Self::_find_structural_bits::<impls::simd128::SimdInput>(input, len, structural_indexes)
737753
}
738754
}
739755
}
@@ -795,7 +811,7 @@ impl<'de> Deserializer<'de> {
795811
buffer: &mut Buffers,
796812
tape: &mut Vec<Node<'de>>,
797813
) -> Result<()> {
798-
const LOTS_OF_ZOERS: [u8; SIMDINPUT_LENGTH] = [0; SIMDINPUT_LENGTH];
814+
const LOTS_OF_ZOERS: [u8; SIMDINPUT_LENGTH] = [0x20; SIMDINPUT_LENGTH];
799815
let len = input.len();
800816
let simd_safe_len = len + SIMDINPUT_LENGTH;
801817

@@ -830,7 +846,7 @@ impl<'de> Deserializer<'de> {
830846
// safety: all bytes are initialized
831847
input_buffer.set_len(simd_safe_len);
832848

833-
Self::find_structural_bits(input, &mut buffer.structural_indexes)
849+
Self::find_structural_bits(input_buffer, input.len(), &mut buffer.structural_indexes)
834850
.map_err(Error::generic)?;
835851
};
836852

@@ -881,10 +897,11 @@ impl<'de> Deserializer<'de> {
881897
#[cfg_attr(not(feature = "no-inline"), inline)]
882898
#[allow(clippy::cast_possible_truncation)]
883899
pub(crate) unsafe fn _find_structural_bits<S: Stage1Parse>(
884-
input: &[u8],
900+
input: &AlignedBuf,
901+
len: usize,
885902
structural_indexes: &mut Vec<u32>,
886903
) -> std::result::Result<(), ErrorType> {
887-
let len = input.len();
904+
// let len = input.len();
888905
// 8 is a heuristic number to estimate it turns out a rate of 1/8 structural characters
889906
// leads almost never to relocations.
890907
structural_indexes.clear();
@@ -916,18 +933,18 @@ impl<'de> Deserializer<'de> {
916933
// expensive carryless multiply in the previous step with this work
917934
let mut structurals: u64 = 0;
918935

919-
let lenminus64: usize = if len < 64 { 0 } else { len - 64 };
936+
// let lenminus64: usize = if len < 64 { 0 } else { len - 64 };
920937
let mut idx: usize = 0;
921938
let mut error_mask: u64 = 0; // for unescaped characters within strings (ASCII code points < 0x20)
922939

923-
while idx < lenminus64 {
940+
while idx <= len / SIMDINPUT_LENGTH {
924941
/*
925942
#ifndef _MSC_VER
926943
__builtin_prefetch(buf + idx + 128);
927944
#endif
928945
*/
929-
let chunk = unsafe { input.get_kinda_unchecked(idx..idx + 64) };
930-
unsafe { utf8_validator.update_from_chunks(chunk) };
946+
let chunk: [u8; SIMDINPUT_LENGTH] = unsafe { input.load_register(idx) };
947+
unsafe { utf8_validator.update_from_chunks(&chunk) };
931948

932949
let input = unsafe { S::new(chunk) };
933950
// detect odd sequences of backslashes
@@ -946,7 +963,7 @@ impl<'de> Deserializer<'de> {
946963

947964
// take the previous iterations structural bits, not our current iteration,
948965
// and flatten
949-
unsafe { S::flatten_bits(structural_indexes, idx as u32, structurals) };
966+
unsafe { S::flatten_bits(structural_indexes, (idx * 64) as u32, structurals) };
950967

951968
let mut whitespace: u64 = 0;
952969
unsafe { input.find_whitespace_and_structurals(&mut whitespace, &mut structurals) };
@@ -959,60 +976,15 @@ impl<'de> Deserializer<'de> {
959976
quote_bits,
960977
&mut prev_iter_ends_pseudo_pred,
961978
);
962-
idx += SIMDINPUT_LENGTH;
979+
idx += 1;
963980
}
964981

965-
// we use a giant copy-paste which is ugly.
966-
// but otherwise the string needs to be properly padded or else we
967-
// risk invalidating the UTF-8 checks.
968-
if idx < len {
969-
let mut tmpbuf: [u8; SIMDINPUT_LENGTH] = [0x20; SIMDINPUT_LENGTH];
970-
unsafe {
971-
tmpbuf
972-
.as_mut_ptr()
973-
.copy_from(input.as_ptr().add(idx), len - idx);
974-
};
975-
unsafe { utf8_validator.update_from_chunks(&tmpbuf) };
976-
977-
let input = unsafe { S::new(&tmpbuf) };
978-
979-
// detect odd sequences of backslashes
980-
let odd_ends: u64 =
981-
input.find_odd_backslash_sequences(&mut prev_iter_ends_odd_backslash);
982-
983-
// detect insides of quote pairs ("quote_mask") and also our quote_bits
984-
// themselves
985-
let mut quote_bits: u64 = 0;
986-
let quote_mask: u64 = input.find_quote_mask_and_bits(
987-
odd_ends,
988-
&mut prev_iter_inside_quote,
989-
&mut quote_bits,
990-
&mut error_mask,
991-
);
992-
993-
// take the previous iterations structural bits, not our current iteration,
994-
// and flatten
995-
unsafe { S::flatten_bits(structural_indexes, idx as u32, structurals) };
996-
997-
let mut whitespace: u64 = 0;
998-
unsafe { input.find_whitespace_and_structurals(&mut whitespace, &mut structurals) };
999-
1000-
// fixup structurals to reflect quotes and add pseudo-structural characters
1001-
structurals = S::finalize_structurals(
1002-
structurals,
1003-
whitespace,
1004-
quote_mask,
1005-
quote_bits,
1006-
&mut prev_iter_ends_pseudo_pred,
1007-
);
1008-
idx += SIMDINPUT_LENGTH;
1009-
}
1010982
// This test isn't in upstream, for some reason the error mask is et for then.
1011983
if prev_iter_inside_quote != 0 {
1012984
return Err(ErrorType::Syntax);
1013985
}
1014986
// finally, flatten out the remaining structurals from the last iteration
1015-
unsafe { S::flatten_bits(structural_indexes, idx as u32, structurals) };
987+
unsafe { S::flatten_bits(structural_indexes, (idx * 64) as u32, structurals) };
1016988

1017989
// a valid JSON file cannot have zero structural indexes - we should have
1018990
// found something (note that we compare to 1 as we always add the root!)
@@ -1051,22 +1023,40 @@ impl AlignedBuf {
10511023
/// Creates a new buffer that is aligned with the simd register size
10521024
#[must_use]
10531025
pub fn with_capacity(capacity: usize) -> Self {
1054-
let Ok(layout) = Layout::from_size_align(capacity, SIMDJSON_PADDING) else {
1055-
Self::capacity_overflow()
1026+
let offset = capacity % SIMDINPUT_LENGTH;
1027+
let capacity = if offset == 0 {
1028+
capacity
1029+
} else {
1030+
capacity + SIMDINPUT_LENGTH - offset
10561031
};
1032+
10571033
if mem::size_of::<usize>() < 8 && capacity > isize::MAX as usize {
10581034
Self::capacity_overflow()
10591035
}
1036+
let layout = match Layout::from_size_align(capacity, SIMDINPUT_LENGTH) {
1037+
Ok(layout) => layout,
1038+
Err(_) => Self::capacity_overflow(),
1039+
};
1040+
1041+
let inner = match unsafe { NonNull::new(alloc(layout)) } {
1042+
Some(ptr) => ptr,
1043+
None => handle_alloc_error(layout),
1044+
};
1045+
Self {
1046+
layout,
1047+
capacity,
1048+
len: 0,
1049+
inner,
1050+
}
1051+
}
1052+
1053+
unsafe fn load_register(&self, idx: usize) -> [u8; SIMDINPUT_LENGTH] {
10601054
unsafe {
1061-
let Some(inner) = NonNull::new(alloc(layout)) else {
1062-
handle_alloc_error(layout)
1063-
};
1064-
Self {
1065-
layout,
1066-
capacity,
1067-
len: 0,
1068-
inner,
1069-
}
1055+
self.inner
1056+
.as_ptr()
1057+
.cast::<[u8; SIMDINPUT_LENGTH]>()
1058+
.add(idx)
1059+
.read()
10701060
}
10711061
}
10721062

0 commit comments

Comments
 (0)