@@ -444,6 +444,142 @@ struct CompressTraits<SfpStream> {
444444 }
445445};
446446
447+ template <>
448+ struct CompressTraits <int8_t > {
449+ using Packed = int8_t ;
450+
451+ static size_t CompressBound (size_t num) { return num * sizeof (Packed); }
452+
453+ template <class DF , HWY_IF_F32_D (DF )>
454+ static HWY_INLINE void Compress (DF df, const float * HWY_RESTRICT raw,
455+ size_t num, CompressPerThread& /* tls*/ ,
456+ const PackedSpan<Packed>& packed,
457+ const size_t packed_ofs) {
458+ const hn::Repartition<int32_t , DF > di32;
459+ const hn::Repartition<int16_t , DF > di16;
460+ const hn::Repartition<int8_t , DF > di8;
461+ const auto di16_16 = hn::Half<decltype (di16)>();
462+ const auto di8_16 = hn::Half<decltype (di8)>();
463+ using VF = hn::Vec<DF >;
464+ const size_t NF = hn::Lanes (df);
465+
466+ size_t i = 0 ;
467+ if (num >= 2 * NF ) {
468+ for (; i <= num - 2 * NF ; i += 2 * NF ) {
469+ const VF v0 = hn::LoadU (df, raw + i);
470+ const VF v1 = hn::LoadU (df, raw + i + NF );
471+ const auto vi32_0 = hn::NearestInt (v0);
472+ const auto vi32_1 = hn::NearestInt (v1);
473+ const auto vi16 = hn::OrderedDemote2To (di16, vi32_0, vi32_1);
474+ const auto vi8 = hn::OrderedDemote2To (
475+ di8_16, hn::UpperHalf (di16_16, vi16), hn::LowerHalf (di16_16, vi16));
476+ hn::StoreU (vi8, di8_16, packed.ptr + packed_ofs + i);
477+ }
478+ }
479+ const size_t remaining = num - i;
480+ if (remaining > 0 ) {
481+ HWY_ALIGN float buf[2 * NF ];
482+ hwy::ZeroBytes (buf, 2 * NF * sizeof (float ));
483+ for (size_t j = 0 ; j < remaining; ++j) buf[j] = raw[i + j];
484+ const VF v0 = hn::LoadU (df, buf);
485+ const VF v1 = hn::LoadU (df, buf + NF );
486+ const auto vi32_0 = hn::NearestInt (v0);
487+ const auto vi32_1 = hn::NearestInt (v1);
488+ const auto vi16 = hn::OrderedDemote2To (di16, vi32_0, vi32_1);
489+ const auto vi8 = hn::OrderedDemote2To (
490+ di8_16, hn::UpperHalf (di16_16, vi16), hn::LowerHalf (di16_16, vi16));
491+ hn::StoreN (vi8, di8_16, packed.ptr + packed_ofs + i, remaining);
492+ }
493+ }
494+
495+ static float ToFloatSlow (const Packed x) { return static_cast <float >(x); }
496+
497+ template <class DF , HWY_IF_F32_D (DF )>
498+ static HWY_INLINE void Load2 (DF df, const PackedSpan<const Packed>& packed,
499+ const size_t packed_ofs, hn::Vec<DF >& raw0,
500+ hn::Vec<DF >& raw1) {
501+ const hn::Repartition<int32_t , DF > di32;
502+ const hn::Repartition<int16_t , DF > di16;
503+ const hn::Rebind<int8_t , decltype (di16)> di8_half;
504+
505+ const auto vec_i8 = hn::LoadU (di8_half, packed.ptr + packed_ofs);
506+ const auto vec_i16 = hn::PromoteTo (di16, vec_i8);
507+ const auto vec_i32_0 = hn::PromoteLowerTo (di32, vec_i16);
508+ const auto vec_i32_1 = hn::PromoteUpperTo (di32, vec_i16);
509+
510+ raw0 = hn::ConvertTo (df, vec_i32_0);
511+ raw1 = hn::ConvertTo (df, vec_i32_1);
512+ }
513+
514+ template <class DBF , HWY_IF_BF16_D (DBF )>
515+ static HWY_INLINE void Load2 (DBF dbf, const PackedSpan<const Packed>& packed,
516+ const size_t packed_ofs, hn::Vec<DBF >& raw0,
517+ hn::Vec<DBF >& raw1) {
518+ const hn::Repartition<float , DBF > df;
519+ using VF = hn::Vec<decltype (df)>;
520+ const size_t NF = hn::Lanes (df);
521+
522+ VF f0, f1, f2, f3;
523+ Load2 (df, packed, packed_ofs, f0, f1);
524+ Load2 (df, packed, packed_ofs + 2 * NF , f2, f3);
525+
526+ raw0 = hn::OrderedDemote2To (dbf, f0, f1);
527+ raw1 = hn::OrderedDemote2To (dbf, f2, f3);
528+ }
529+
530+ template <class DF , HWY_IF_F32_D (DF )>
531+ static HWY_INLINE void DecompressAndZeroPad (
532+ DF df, const PackedSpan<const Packed>& packed, const size_t packed_ofs,
533+ float * HWY_RESTRICT raw, size_t num) {
534+ using VF = hn::Vec<decltype (df)>;
535+ const size_t NF = hn::Lanes (df);
536+
537+ size_t i = 0 ;
538+ if (num >= 2 * NF ) {
539+ for (; i <= num - 2 * NF ; i += 2 * NF ) {
540+ VF raw0, raw1;
541+ Load2 (df, packed, packed_ofs + i, raw0, raw1);
542+ hn::StoreU (raw0, df, raw + i);
543+ hn::StoreU (raw1, df, raw + i + NF );
544+ }
545+ }
546+
547+ const size_t remaining = num - i;
548+ if (HWY_UNLIKELY (remaining != 0 )) {
549+ for (size_t j = 0 ; j < remaining; ++j) {
550+ raw[i + j] = static_cast <float >(packed.ptr [packed_ofs + i + j]);
551+ }
552+ }
553+ }
554+
555+ template <class DBF , HWY_IF_BF16_D (DBF )>
556+ static HWY_INLINE void DecompressAndZeroPad (
557+ DBF dbf, const PackedSpan<const Packed>& packed, const size_t packed_ofs,
558+ BF16 * HWY_RESTRICT raw, size_t num) {
559+ const hn::Repartition<float , DBF > df;
560+ const size_t NF = hn::Lanes (df);
561+ size_t i = 0 ;
562+ const size_t NBF = hn::Lanes (dbf);
563+ if (num >= NBF ) {
564+ for (; i <= num - NBF ; i += NBF ) {
565+ hn::Vec<decltype (df)> f0, f1;
566+ Load2 (df, packed, packed_ofs + i, f0, f1);
567+ auto vbf = hn::OrderedDemote2To (dbf, f0, f1);
568+ hn::StoreU (vbf, dbf, raw + i);
569+ }
570+ }
571+ const size_t remaining = num - i;
572+ if (remaining > 0 ) {
573+ HWY_ALIGN float buf[2 * hn::MaxLanes (df)];
574+ DecompressAndZeroPad (df, packed, packed_ofs + i, buf, remaining);
575+ auto f0 = hn::LoadU (df, buf);
576+ auto f1 = hn::LoadU (df, buf + NF );
577+ auto vbf = hn::OrderedDemote2To (dbf, f0, f1);
578+ hn::StoreN (vbf, dbf, raw + i, remaining);
579+ }
580+ }
581+ };
582+
447583// Integer quantization.
448584template <>
449585struct CompressTraits <I8Stream> {
0 commit comments