Skip to content

Commit b5047a6

Browse files
committed
Reduce x86_cpu_feature size on non-x86 platforms
1 parent 5a7d696 commit b5047a6

File tree

1 file changed

+180
-120
lines changed

1 file changed

+180
-120
lines changed

include/xsimd/config/xsimd_cpu_features_x86.hpp

Lines changed: 180 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -566,6 +566,99 @@ namespace xsimd
566566
xcr0_reg_t m_low {};
567567
};
568568

569+
/**
570+
* Orchestrator for `CPUID` calls.
571+
*
572+
* This class orchestrates `CPUID` and `XCR0` calls so that they are made in the appropriate
573+
* order. It also implements lazy calling and a caching mechanism around those calls.
574+
* Works on all platforms, and returns all zeros on non-`x86` platforms.
575+
*/
576+
class x86_cpu_features_backend_cpuid
577+
{
578+
public:
579+
x86_cpu_features_backend_cpuid() noexcept = default;
580+
581+
inline x86_xcr0 const& xcr0() const noexcept; // Cached XCR0: read only if leaf 1 reports OSXSAVE, otherwise a safe default is stored.
582+
inline x86_cpuid_leaf0 const& leaf0() const; // Read unconditionally on first use, then served from the cache.
583+
inline x86_cpuid_leaf80000000 const& leaf80000000() const; // Read unconditionally on first use, then served from the cache.
584+
inline x86_cpuid_leaf1 const& leaf1() const; // Read through safe_read_leaf (only if leaf 0 advertises it).
585+
inline x86_cpuid_leaf7 const& leaf7() const; // Read through safe_read_leaf (only if leaf 0 advertises it).
586+
inline x86_cpuid_leaf7sub1 const& leaf7sub1() const; // Read only if the highest-subleaf field of leaf 7 covers this subleaf.
587+
inline x86_cpuid_leaf80000001 const& leaf80000001() const; // Read through safe_read_leaf (only if leaf 0x80000000 advertises it).
588+
589+
private:
590+
enum class status // Identifies which cached value a flag in `m_status` refers to.
591+
{
592+
leaf0_valid = 0,
593+
leaf1_valid = 1,
594+
leaf7_valid = 2,
595+
leaf7sub1_valid = 3,
596+
leaf80000000_valid = 4,
597+
leaf80000001_valid = 5,
598+
xcr0_valid = 6,
599+
};
600+
601+
using status_bitset = utils::uint_bitset<status, std::uint32_t>;
602+
603+
mutable x86_cpuid_leaf0 m_leaf0 {}; // `mutable`: the const accessors populate these caches on first use.
604+
mutable x86_cpuid_leaf1 m_leaf1 {};
605+
mutable x86_cpuid_leaf7 m_leaf7 {};
606+
mutable x86_cpuid_leaf7sub1 m_leaf7sub1 {};
607+
mutable x86_cpuid_leaf80000000 m_leaf80000000 {};
608+
mutable x86_cpuid_leaf80000001 m_leaf80000001 {};
609+
mutable x86_xcr0 m_xcr0 {};
610+
mutable status_bitset m_status {}; // Records which of the caches above have already been populated.
611+
612+
inline bool osxsave() const noexcept; // Whether leaf 1 reports the OSXSAVE bit; consulted by xcr0().
613+
614+
/**
615+
* Internal utility to lazily read and cache a CPUID leaf.
616+
*
617+
* @tparam status_id The status bit tracking whether this leaf has been read and cached.
618+
* @tparam L The CPUID leaf type (e.g. x86_cpuid_leaf1, x86_cpuid_leaf7).
619+
* @param leaf_cache A non-const reference to the class member that stores the leaf
620+
* value. It must be non-const because this function may write to it on first
621+
* call. It is passed explicitly (rather than accessed via `this`) to allow
622+
* factoring the caching logic across different leaf members.
623+
* @return A const reference to `leaf_cache`. The non-const input / const-ref output
624+
* asymmetry is intentional: callers must not modify the cached value, but
625+
* this function needs write access to populate it.
626+
*
627+
* On first call, checks whether the leaf number is within the range advertised as
628+
* supported by CPUID (via leaf 0 for the standard range, leaf 0x80000000 for the
629+
* extended range). If supported, reads the leaf from the CPU; otherwise leaves
630+
* `leaf_cache` at its zero-initialized default (all feature bits false). Either
631+
* way, `status_id` is set so subsequent calls return immediately.
632+
*/
633+
template <status status_id, typename L>
634+
inline auto const& safe_read_leaf(L& leaf_cache) const;
635+
};
636+
637+
/**
638+
* No-Op orchestrator for `CPUID` calls
639+
*
640+
* This does nothing and returns zero-constructed objects on all calls.
641+
* This is meant as an optimization on non `x86` platforms as the
642+
* `x86_cpu_features_backend_cpuid` can be slightly large (hundreds of bytes).
643+
*/
644+
class x86_cpu_features_backend_noop
645+
{
646+
public: // Same accessor names as x86_cpu_features_backend_cpuid, but each returns a value-initialized object by value.
647+
constexpr x86_xcr0 xcr0() const noexcept { return {}; }
648+
constexpr x86_cpuid_leaf0 leaf0() const { return {}; }
649+
constexpr x86_cpuid_leaf80000000 leaf80000000() const { return {}; }
650+
constexpr x86_cpuid_leaf1 leaf1() const { return {}; }
651+
constexpr x86_cpuid_leaf7 leaf7() const { return {}; }
652+
constexpr x86_cpuid_leaf7sub1 leaf7sub1() const { return {}; }
653+
constexpr x86_cpuid_leaf80000001 leaf80000001() const { return {}; }
654+
};
655+
656+
#if XSIMD_TARGET_X86
657+
using x86_cpu_features_backend_default = x86_cpu_features_backend_cpuid; // Backend that x86_cpu_features privately inherits from.
658+
#else // !XSIMD_TARGET_X86
659+
using x86_cpu_features_backend_default = x86_cpu_features_backend_noop; // Stateless backend: every accessor returns a value-initialized object.
660+
#endif // XSIMD_TARGET_X86
661+
569662
/**
570663
* An opinionated CPU feature detection utility for x86.
571664
*
@@ -576,7 +669,7 @@ namespace xsimd
576669
* This is well defined on all architectures. It will always return false on
577670
* non-x86 architectures.
578671
*/
579-
class x86_cpu_features
672+
class x86_cpu_features : private x86_cpu_features_backend_default
580673
{
581674
public:
582675
x86_cpu_features() noexcept = default;
@@ -681,155 +774,122 @@ namespace xsimd
681774
inline bool avxvnni() const noexcept { return avx_enabled() && leaf7sub1().all_bits_set<x86_cpuid_leaf7sub1::eax::avxvnni>(); }
682775

683776
inline bool fma4() const noexcept { return avx_enabled() && leaf80000001().all_bits_set<x86_cpuid_leaf80000001::ecx::fma4>(); }
777+
};
684778

685-
private:
686-
enum class status
687-
{
688-
leaf0_valid = 0,
689-
leaf1_valid = 1,
690-
leaf7_valid = 2,
691-
leaf7sub1_valid = 3,
692-
leaf80000000_valid = 4,
693-
leaf80000001_valid = 5,
694-
xcr0_valid = 6,
695-
};
696-
697-
using status_bitset = utils::uint_bitset<status, std::uint32_t>;
779+
/********************
780+
* Implementation *
781+
********************/
698782

699-
mutable x86_cpuid_leaf0 m_leaf0 {};
700-
mutable x86_cpuid_leaf1 m_leaf1 {};
701-
mutable x86_cpuid_leaf7 m_leaf7 {};
702-
mutable x86_cpuid_leaf7sub1 m_leaf7sub1 {};
703-
mutable x86_cpuid_leaf80000000 m_leaf80000000 {};
704-
mutable x86_cpuid_leaf80000001 m_leaf80000001 {};
705-
mutable x86_xcr0 m_xcr0 {};
706-
mutable status_bitset m_status {};
707-
708-
inline x86_xcr0 const& xcr0() const noexcept
783+
template <x86_cpu_features_backend_cpuid::status status_id, typename L>
784+
inline auto const& x86_cpu_features_backend_cpuid::safe_read_leaf(L& leaf_cache) const
785+
{
786+
// Check if already initialized
787+
if (m_status.bit_is_set<status_id>())
709788
{
710-
if (!m_status.bit_is_set<status::xcr0_valid>())
711-
{
712-
m_xcr0 = osxsave() ? x86_xcr0::read() : x86_xcr0::safe_default();
713-
m_status.set_bit<status::xcr0_valid>();
714-
}
715-
return m_xcr0;
789+
return leaf_cache;
716790
}
717791

718-
inline x86_cpuid_leaf0 const& leaf0() const
719-
{
720-
if (!m_status.bit_is_set<status::leaf0_valid>())
721-
{
722-
m_leaf0 = x86_cpuid_leaf0::read();
723-
m_status.set_bit<status::leaf0_valid>();
724-
}
725-
return m_leaf0;
726-
}
792+
// Limit where we need to check leaf0 or leaf 80000000.
793+
constexpr auto extended_threshold = x86_cpuid_leaf80000000::leaf;
727794

728-
inline x86_cpuid_leaf80000000 const& leaf80000000() const
795+
// Check if it is safe to call CPUID with this value.
796+
// First we identify if the leaf is in the regular or extended range.
797+
// TODO(C++17): if constexpr
798+
if (L::leaf < extended_threshold)
729799
{
730-
if (!m_status.bit_is_set<status::leaf80000000_valid>())
800+
// Check leaf0 in regular range
801+
if (L::leaf <= leaf0().highest_leaf())
731802
{
732-
m_leaf80000000 = x86_cpuid_leaf80000000::read();
733-
m_status.set_bit<status::leaf80000000_valid>();
803+
leaf_cache = L::read();
734804
}
735-
return m_leaf80000000;
736805
}
737-
738-
/**
739-
* Internal utility to lazily read and cache a CPUID leaf.
740-
*
741-
* @tparam status_id The status bit tracking whether this leaf has been read and cached.
742-
* @tparam L The CPUID leaf type (e.g. x86_cpuid_leaf1, x86_cpuid_leaf7).
743-
* @param leaf_cache A non-const reference to the class member that stores the leaf
744-
* value. It must be non-const because this function may write to it on first
745-
* call. It is passed explicitly (rather than accessed via `this`) to allow
746-
* factoring the caching logic across different leaf members.
747-
* @return A const reference to `leaf_cache`. The non-const input / const-ref output
748-
* asymmetry is intentional: callers must not modify the cached value, but
749-
* this function needs write access to populate it.
750-
*
751-
* On first call, checks whether the leaf number is within the range advertised as
752-
* supported by CPUID (via leaf 0 for the standard range, leaf 0x80000000 for the
753-
* extended range). If supported, reads the leaf from the CPU; otherwise leaves
754-
* `leaf_cache` at its zero-initialized default (all feature bits false). Either
755-
* way, `status_id` is set so subsequent calls return immediately.
756-
*/
757-
template <status status_id, typename L>
758-
inline auto const& safe_read_leaf(L& leaf_cache) const
806+
else
759807
{
760-
// Check if already initialized
761-
if (m_status.bit_is_set<status_id>())
808+
// Check leaf80000000 in extended range
809+
if (L::leaf <= leaf80000000().highest_leaf())
762810
{
763-
return leaf_cache;
811+
leaf_cache = L::read();
764812
}
813+
}
765814

766-
// Limit where we need to check leaf0 or leaf 80000000.
767-
constexpr auto extended_threshold = x86_cpuid_leaf80000000::leaf;
768-
769-
// Check if it is safe to call CPUID with this value.
770-
// First we identify if the leaf is in the regular or extended range.
771-
// TODO(C++17): if constexpr
772-
if (L::leaf < extended_threshold)
773-
{
774-
// Check leaf0 in regular range
775-
if (L::leaf <= leaf0().highest_leaf())
776-
{
777-
leaf_cache = L::read();
778-
}
779-
}
780-
else
781-
{
782-
// Check leaf80000000 in extended range
783-
if (L::leaf <= leaf80000000().highest_leaf())
784-
{
785-
leaf_cache = L::read();
786-
}
787-
}
815+
// Mark as valid in all cases, including if it was not read.
816+
// In this case it will be filled with zeros (all false).
817+
m_status.set_bit<status_id>();
818+
return leaf_cache;
819+
}
788820

789-
// Mark as valid in all cases, including if it was not read.
790-
// In this case it will be filled with zeros (all false).
791-
m_status.set_bit<status_id>();
792-
return leaf_cache;
821+
inline x86_xcr0 const& x86_cpu_features_backend_cpuid::xcr0() const noexcept
822+
{
823+
if (!m_status.bit_is_set<status::xcr0_valid>())
824+
{
825+
m_xcr0 = osxsave() ? x86_xcr0::read() : x86_xcr0::safe_default();
826+
m_status.set_bit<status::xcr0_valid>();
793827
}
828+
return m_xcr0;
829+
}
794830

795-
inline x86_cpuid_leaf1 const& leaf1() const
831+
inline x86_cpuid_leaf0 const& x86_cpu_features_backend_cpuid::leaf0() const
832+
{
833+
if (!m_status.bit_is_set<status::leaf0_valid>())
796834
{
797-
return safe_read_leaf<status::leaf1_valid>(m_leaf1);
835+
m_leaf0 = x86_cpuid_leaf0::read();
836+
m_status.set_bit<status::leaf0_valid>();
798837
}
838+
return m_leaf0;
839+
}
799840

800-
inline x86_cpuid_leaf7 const& leaf7() const
841+
inline x86_cpuid_leaf80000000 const& x86_cpu_features_backend_cpuid::leaf80000000() const
842+
{
843+
if (!m_status.bit_is_set<status::leaf80000000_valid>())
801844
{
802-
return safe_read_leaf<status::leaf7_valid>(m_leaf7);
845+
m_leaf80000000 = x86_cpuid_leaf80000000::read();
846+
m_status.set_bit<status::leaf80000000_valid>();
803847
}
848+
return m_leaf80000000;
849+
}
804850

805-
inline x86_cpuid_leaf7sub1 const& leaf7sub1() const
806-
{
807-
// Check if already initialized
808-
if (m_status.bit_is_set<status::leaf7sub1_valid>())
809-
{
810-
return m_leaf7sub1;
811-
}
851+
inline x86_cpuid_leaf1 const& x86_cpu_features_backend_cpuid::leaf1() const
852+
{
853+
return safe_read_leaf<status::leaf1_valid>(m_leaf1);
854+
}
812855

813-
// Check if safe to call CPUID with this value as subleaf.
814-
constexpr auto start = x86_cpuid_leaf7::eax::highest_subleaf_start;
815-
constexpr auto end = x86_cpuid_leaf7::eax::highest_subleaf_end;
816-
const auto highest_subleaf7 = leaf7().get_range<start, end>();
817-
if (x86_cpuid_leaf7sub1::subleaf <= highest_subleaf7)
818-
{
819-
m_leaf7sub1 = x86_cpuid_leaf7sub1::read();
820-
}
856+
inline x86_cpuid_leaf7 const& x86_cpu_features_backend_cpuid::leaf7() const
857+
{
858+
return safe_read_leaf<status::leaf7_valid>(m_leaf7);
859+
}
821860

822-
// Mark as valid in all cases, including if it was not read.
823-
// In this case it will be filled with zeros (all false).
824-
m_status.set_bit<status::leaf7sub1_valid>();
861+
inline x86_cpuid_leaf7sub1 const& x86_cpu_features_backend_cpuid::leaf7sub1() const
862+
{
863+
// Check if already initialized
864+
if (m_status.bit_is_set<status::leaf7sub1_valid>())
865+
{
825866
return m_leaf7sub1;
826867
}
827868

828-
inline x86_cpuid_leaf80000001 const& leaf80000001() const
869+
// Check if safe to call CPUID with this value as subleaf.
870+
constexpr auto start = x86_cpuid_leaf7::eax::highest_subleaf_start;
871+
constexpr auto end = x86_cpuid_leaf7::eax::highest_subleaf_end;
872+
const auto highest_subleaf7 = leaf7().get_range<start, end>();
873+
if (x86_cpuid_leaf7sub1::subleaf <= highest_subleaf7)
829874
{
830-
return safe_read_leaf<status::leaf80000001_valid>(m_leaf80000001);
875+
m_leaf7sub1 = x86_cpuid_leaf7sub1::read();
831876
}
832-
};
877+
878+
// Mark as valid in all cases, including if it was not read.
879+
// In this case it will be filled with zeros (all false).
880+
m_status.set_bit<status::leaf7sub1_valid>();
881+
return m_leaf7sub1;
882+
}
883+
884+
inline x86_cpuid_leaf80000001 const& x86_cpu_features_backend_cpuid::leaf80000001() const
885+
{
886+
return safe_read_leaf<status::leaf80000001_valid>(m_leaf80000001);
887+
}
888+
889+
inline bool x86_cpu_features_backend_cpuid::osxsave() const noexcept
890+
{
891+
return leaf1().all_bits_set<x86_cpuid_leaf1::ecx::osxsave>();
892+
}
833893

834894
namespace detail
835895
{

0 commit comments

Comments
 (0)