@@ -875,8 +875,9 @@ class MMPerPackage {
875875 inner_tasks_(config.InnerTasks()),
876876 out_(config.Out()),
877877 line_bytes_(args.env->ctx.allocator.LineBytes()) {
878+ static const uint32_t zone_id = PROFILER_ADD_ZONE (" MM.DecompressA" );
878879 MMZone zone;
879- zone.MaybeEnter (" MM.DecompressA " , args_);
880+ zone.MaybeEnter (pkg_idx, zone_id , args_);
880881 A_ = DecompressA (A);
881882 }
882883
@@ -914,8 +915,7 @@ class MMPerPackage {
914915 // Single M and K ranges, parallel N. Fills all of C directly.
915916 template <typename TB, typename TC>
916917 HWY_INLINE void DoNT (const MatPtrT<TB>& B, RowPtrs<TC> C_rows) const {
917- MMZone zone;
918- zone.MaybeEnter (" MM.NT" , args_);
918+ static const uint32_t zone_id = PROFILER_ADD_ZONE (" MM.NT" );
919919 HWY_DASSERT (ranges_mc_.NumTasks () == 1 );
920920 HWY_DASSERT (ranges_kc_.NumTasks () == 1 );
921921 const IndexRange& range_M = ranges_mc_.Range (0 );
@@ -928,7 +928,10 @@ class MMPerPackage {
928928 // Similar to `loop_nc` below, but here we hoisted `A_view`.
929929 args_.env ->parallel .ForNP (
930930 range_np_, MultipleNP (sizeof (TC)), inner_tasks_, pkg_idx_,
931- [&](const IndexRange& range_nc) HWY_ATTR {
931+ [&](const IndexRange& range_nc, size_t worker) HWY_ATTR {
932+ MMZone zone;
933+ zone.MaybeEnter (worker, zone_id, args_);
934+
932935 HWY_ALIGN BF16 B_storage[B_storage_max_]; // TLS
933936 const StridedViewBF B_storage_view (B_storage, K, B_stride);
934937
@@ -947,8 +950,7 @@ class MMPerPackage {
947950 // Single M range, parallel N, sequential K. Fills all of partial.
948951 template <typename TB, typename TC>
949952 HWY_INLINE void DoNT_K (const MatPtrT<TB>& B, RowPtrs<TC> C_rows) const {
950- MMZone zone;
951- zone.MaybeEnter (" MM.NT_K" , args_);
953+ static const uint32_t zone_id = PROFILER_ADD_ZONE (" MM.NT_K" );
952954 HWY_DASSERT (ranges_mc_.NumTasks () == 1 );
953955 const IndexRange& range_mc = ranges_mc_.Range (0 );
954956
@@ -975,7 +977,10 @@ class MMPerPackage {
975977
976978 args_.env ->parallel .ForNP (
977979 range_np_, MultipleNP (sizeof (TC)), inner_tasks_, pkg_idx_,
978- [&](const IndexRange& range_nc) HWY_ATTR {
980+ [&](const IndexRange& range_nc, size_t worker) HWY_ATTR {
981+ MMZone zone;
982+ zone.MaybeEnter (worker, zone_id, args_);
983+
979984 HWY_ALIGN BF16 B_storage[B_storage_max_]; // TLS
980985
981986 // Peel off the first iteration of the kc loop: avoid
@@ -988,14 +993,17 @@ class MMPerPackage {
988993 });
989994 });
990995
991- MMZone fill_zone;
992996 if (out_ == MMOut::kCopy ) {
993- fill_zone.MaybeEnter (" MM.NT_K.FillC" , args_);
997+ static const uint32_t zone_id = PROFILER_ADD_ZONE (" MM.NT_K.FillC.Copy" );
998+ MMZone fill_zone;
999+ fill_zone.MaybeEnter (0 , zone_id, args_);
9941000 MMScaleDemoteAdd::FillC (range_mc, range_np_, args_, C_rows);
9951001 } else if (out_ == MMOut::kParM ) {
996- fill_zone. MaybeEnter (" MM.NT_K.FillC.ParM" , args_ );
1002+ static const uint32_t zone_id = PROFILER_ADD_ZONE (" MM.NT_K.FillC.ParM" );
9971003 args_.env ->parallel .ForRangeMC (
998- range_mc, pkg_idx_, [&](size_t row_a) HWY_ATTR {
1004+ range_mc, pkg_idx_, [&](size_t row_a, size_t worker) HWY_ATTR {
1005+ MMZone fill_zone;
1006+ fill_zone.MaybeEnter (worker, zone_id, args_);
9991007 MMScaleDemoteAdd::FillC (IndexRange (row_a, row_a + 1 ), range_np_,
10001008 args_, C_rows);
10011009 });
@@ -1008,8 +1016,7 @@ class MMPerPackage {
10081016 // Fills `mc x nc` sections of C directly, in parallel.
10091017 template <typename TB, typename TC>
10101018 HWY_INLINE void DoNT_MT (const MatPtrT<TB>& B, RowPtrs<TC> C_rows) const {
1011- MMZone zone;
1012- zone.MaybeEnter (" MM.NT_MT" , args_);
1019+ static const uint32_t zone_id = PROFILER_ADD_ZONE (" MM.NT_MT" );
10131020 HWY_DASSERT (ranges_kc_.NumTasks () == 1 );
10141021 const IndexRange& range_K = ranges_kc_.Range (0 );
10151022 const size_t K = range_K.Num ();
@@ -1020,7 +1027,11 @@ class MMPerPackage {
10201027 // except for the profiler strings and `out_tag`.
10211028 args_.env ->parallel .ForRangesMC_NC (
10221029 ranges_mc_, ranges_nc_, pkg_idx_,
1023- [&](const IndexRange& range_mc, const IndexRange& range_nc) HWY_ATTR {
1030+ [&](const IndexRange& range_mc, const IndexRange& range_nc,
1031+ size_t worker) HWY_ATTR {
1032+ MMZone zone;
1033+ zone.MaybeEnter (worker, zone_id, args_);
1034+
10241035 const StridedViewBF& A_view = A_.View (range_mc.begin (), 0 , K);
10251036 HWY_ALIGN BF16 B_storage[B_storage_max_]; // TLS
10261037 const StridedViewBF B_storage_view (B_storage, K, B_stride);
@@ -1041,8 +1052,8 @@ class MMPerPackage {
10411052 // Fills `mc x nc` sections of `partial`, then `C`, in parallel.
10421053 template <typename TB, typename TC>
10431054 HWY_INLINE void DoNT_MT_K (const MatPtrT<TB>& B, RowPtrs<TC> C_rows) const {
1044- MMZone zone ;
1045- zone. MaybeEnter (" MM.NT_MT_K" , args_ );
1055+ static const uint32_t zone_id = PROFILER_ADD_ZONE ( " MM.NT_MT_K " ) ;
1056+ static const uint32_t fill_zone_id = PROFILER_ADD_ZONE (" MM.NT_MT_K.FillC " );
10461057 const size_t kc_max = ranges_kc_.TaskSize ();
10471058 HWY_DASSERT (kc_max <= MMStorage::kMaxKC );
10481059 const size_t B_stride =
@@ -1068,7 +1079,11 @@ class MMPerPackage {
10681079 }; // loop_nc
10691080 args_.env ->parallel .ForRangesMC_NC (
10701081 ranges_mc_, ranges_nc_, pkg_idx_,
1071- [&](const IndexRange& range_mc, const IndexRange& range_nc) HWY_ATTR {
1082+ [&](const IndexRange& range_mc, const IndexRange& range_nc,
1083+ size_t worker) HWY_ATTR {
1084+ MMZone zone;
1085+ zone.MaybeEnter (worker, zone_id, args_);
1086+
10721087 HWY_ALIGN BF16 B_storage[B_storage_max_]; // TLS
10731088 const StridedViewBF B_storage_view (B_storage, kc_max, B_stride);
10741089
@@ -1087,7 +1102,7 @@ class MMPerPackage {
10871102 // `kDirect` is only used with `kNT_MT`.
10881103 HWY_DASSERT (out_ == MMOut::kCopy );
10891104 MMZone fill_zone;
1090- fill_zone.MaybeEnter (" MM.NT_MT_K.FillC " , args_);
1105+ fill_zone.MaybeEnter (worker, fill_zone_id , args_);
10911106 MMScaleDemoteAdd::FillC (range_mc, range_nc, args_, C_rows);
10921107 });
10931108 }
@@ -1139,13 +1154,16 @@ class MMPerPackage {
11391154
11401155 args_.env ->parallel .ForNP (
11411156 all_K, multiple_K, inner_tasks, pkg_idx_,
1142- [&](const IndexRange& range_K) { do_range (all_M, range_K); });
1157+ [&](const IndexRange& range_K, size_t /* worker*/ ) {
1158+ do_range (all_M, range_K);
1159+ });
11431160 break ;
11441161 }
11451162 case MMParA::kM :
1146- args_.env ->parallel .ForRangeMC (all_M, pkg_idx_, [&](size_t row_a) {
1147- do_range (IndexRange (row_a, row_a + 1 ), all_K);
1148- });
1163+ args_.env ->parallel .ForRangeMC (
1164+ all_M, pkg_idx_, [&](size_t row_a, size_t /* worker*/ ) {
1165+ do_range (IndexRange (row_a, row_a + 1 ), all_K);
1166+ });
11491167 break ;
11501168 }
11511169 }
@@ -1261,12 +1279,13 @@ struct MMImpl {
12611279 static HWY_NOINLINE void DoMatMul (const MatPtrT<TA>& A, const MatPtrT<TB>& B,
12621280 RowPtrs<TC> C_rows, const MMArgs& args,
12631281 const MMConfig& config) {
1264- MMZone matmul_zone;
1265- matmul_zone.MaybeEnter (" MM.DoMatMul" , args);
1282+ static const uint32_t zone_id = PROFILER_ADD_ZONE (" MM.DoMatMul" );
12661283
12671284 // Outermost loop: static NUMA-aware partition of B rows across packages.
12681285 args.env ->parallel .ForPkg (
12691286 args.per_key ->ranges_np .NumTasks (), [&](size_t pkg_idx) {
1287+ MMZone matmul_zone;
1288+ matmul_zone.MaybeEnter (pkg_idx, zone_id, args);
12701289 const IndexRange& range_np = args.per_key ->ranges_np .Range (pkg_idx);
12711290 MMPerPackage (A, args, config, pkg_idx, range_np)(B, C_rows);
12721291 });
0 commit comments