Which component has the problem?
CuTe DSL
Bug Report
Describe the bug
The build fails with the following errors (excerpt from the build log):
[ 16%] Building CUDA object tools/library/CMakeFiles/cutlass_library_gemm_sm120_bf16_gemm_grouped_e4m3_e5m2_objs.dir/generated/gemm/120/bf16_gemm_grouped_e4m3_e5m2/cutlass3x_sm120_tensorop_gemm_grouped_1x128f32xe4m3_1x128f32xe5m2_f32_bf16_bf16_64x128x128_1x1x1_0_tnn_align128_pingpong_q.cu.o
/home/janboe/cutlass/include/cutlass/epilogue/collective/builders/sm120_builder.inl(158): error: static assertion failed with "CTA tile for FP6 ElementD must have a contiguous extent that is a multiple of 128."
static_assert(!IsFP6 ||
^
detected during:
instantiation of "auto cutlass::epilogue::collective::detail::sm120_compute_tile_shape_or_override<ElementC,ElementD,EpilogueTileType,Schedule,TileShape_MNK,StrideD,FusionOp>() [with ElementC=cutlass::half_t, ElementD=cutlass::float_e3m2_t, EpilogueTileType=cutlass::epilogue::collective::EpilogueTileAuto, Schedule=cutlass::epilogue::collective::EpilogueScheduleAuto, TileShape_MNK=cute::tuple<cute::_128, cute::_32, cute::_128>, StrideD=cute::tuple<int64_t, cute::C<1>, int64_t>, FusionOp=cutlass::epilogue::fusion::LinCombBlockScaleFactor<32, cutlass::float_e3m2_t, float, cutlass::float_ue8m0_t, cutlass::layout::RowMajor, cutlass::half_t, float, cutlass::FloatRoundStyle::round_to_nearest>]" at line 492
instantiation of class "cutlass::epilogue::collective::CollectiveBuilder<cutlass::arch::Sm120, OpClass, TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute, ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD, Schedule, FusionOperation, std::enable_if_t<, void>> [with OpClass=cutlass::arch::OpClassBlockScaledTensorOp, TileShape_MNK=cute::tuple<cute::_128, cute::_32, cute::_128>, ClusterShape_MNK=cute::tuple<cute::_1, cute::_1, cute::_1>, EpilogueTileType=cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator=float, ElementCompute=float, ElementC=cutlass::half_t, GmemLayoutTagC=cutlass::layout::RowMajor, AlignmentC=8, ElementD=cutlass::float_e3m2_t, GmemLayoutTagD=cutlass::layout::RowMajor, AlignmentD=128, Schedule=cutlass::epilogue::collective::EpilogueScheduleAuto, FusionOperation=cutlass::epilogue::fusion::LinCombBlockScaleFactor<32, cutlass::float_e3m2_t, float, cutlass::float_ue8m0_t, cutlass::layout::RowMajor, cutlass::half_t, float, cutlass::FloatRoundStyle::round_to_nearest>]" at line 51 of /home/janboe/cutlass/build/tools/library/generated/gemm/120/f16_gemm_e2m1_e2m3/cutlass3x_sm120_bstensorop_gemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_ue8m0xe3m2_128x32x128_1x1x1_0_tnt_align128_cooperative_q_epiVs32t.cu
/home/janboe/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp(130): error: static assertion failed with "EPI_TILE_N must divide CTA_N"
static_assert(size<1>(CtaTileMNK{}) % size<1>(shape(EpilogueTile{})) == 0, "EPI_TILE_N must divide CTA_N");
^
detected during instantiation of class "cutlass::epilogue::collective::CollectiveEpilogue<cutlass::epilogue::Sm90TmaWarpSpecialized<StagesC_, StagesD_, FragmentSize_, ReuseSmemC_, DelayTmaStore_>, CtaTileMNK_, EpilogueTile_, ElementC_, StrideC_, ElementD_, StrideD_, FusionCallbacks_, CopyOpG2S_, SmemLayoutAtomC_, CopyOpS2R_, CopyOpS2G_, SmemLayoutAtomD_, CopyOpR2S_, CopyAtomC_, CopyOpR2R_> [with StagesC_=1, StagesD_=1, FragmentSize_=4, ReuseSmemC_=false, DelayTmaStore_=false, CtaTileMNK_=cute::tuple<cute::_128, cute::_32, cute::_128>, EpilogueTile_=cute::tuple<cute::_64, cute::_128>, ElementC_=cutlass::half_t, StrideC_=cute::tuple<int64_t, cute::C<1>, int64_t>, ElementD_=cutlass::float_e3m2_t, StrideD_=cute::tuple<int64_t, cute::C<1>, int64_t>, FusionCallbacks_=cutlass::epilogue::fusion::FusionCallbacks<cutlass::epilogue::Sm120TmaWarpSpecialized<1, 1, 4, false, false>, cutlass::epilogue::fusion::LinCombBlockScaleFactor<32, cutlass::float_e3m2_t, float, cutlass::float_ue8m0_t, cutlass::layout::RowMajor, cutlass::half_t, float, cutlass::FloatRoundStyle::round_to_nearest>, cute::tuple<cute::_128, cute::_32, cute::_128>, cute::tuple<cute::_64, cute::_128>>, CopyOpG2S_=cute::SM90_TMA_LOAD, SmemLayoutAtomC_=cute::ComposedLayout<cute::Swizzle<3, 4, 3>, cute::smem_ptr_flag_bits<16>, cute::Layout<cute::tuple<cute::_8, cute::_64>, cute::tuple<cute::_64, cute::_1>>>, CopyOpS2R_=cute::SM75_U32x2_LDSM_N, CopyOpS2G_=cute::SM90_TMA_STORE, SmemLayoutAtomD_=cute::ComposedLayout<cute::Swizzle<3, 4, 3>, cute::smem_ptr_flag_bits<8>, cute::Layout<cute::tuple<cute::_8, cute::_128>, cute::tuple<cute::_128, cute::_1>>>, CopyOpR2S_=cute::AutoVectorizingCopyWithAssumedAlignment<128>, CopyAtomC_=cute::Copy_Atom<cute::SM90_U32x2_STSM_N, cutlass::half_t>, CopyOpR2R_=void]" at line 64 of /home/janboe/cutlass/build/tools/library/generated/gemm/120/f16_gemm_e2m1_e2m3/cutlass3x_sm120_bstensorop_gemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_ue8m0xe3m2_128x32x128_1x1x1_0_tnt_align128_cooperative_q_epiVs32t.cu
2 errors detected in the compilation of "/home/janboe/cutlass/build/tools/library/generated/gemm/120/f16_gemm_e2m1_e2m3/cutlass3x_sm120_bstensorop_gemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_ue8m0xe3m2_128x32x128_1x1x1_0_tnt_align128_cooperative_q_epiVs32t.cu".
Steps/Code to reproduce bug
cmake -B build -DCUTLASS_NVCC_ARCHS="121a;121f"
cmake --build build -j10
Expected behavior
The build should complete successfully.
Environment details (please complete the following information):
NVIDIA DGX Spark
Which component has the problem?
CuTe DSL
Bug Report
Describe the bug
The build fails with the following errors (excerpt from the build log):
[ 16%] Building CUDA object tools/library/CMakeFiles/cutlass_library_gemm_sm120_bf16_gemm_grouped_e4m3_e5m2_objs.dir/generated/gemm/120/bf16_gemm_grouped_e4m3_e5m2/cutlass3x_sm120_tensorop_gemm_grouped_1x128f32xe4m3_1x128f32xe5m2_f32_bf16_bf16_64x128x128_1x1x1_0_tnn_align128_pingpong_q.cu.o
/home/janboe/cutlass/include/cutlass/epilogue/collective/builders/sm120_builder.inl(158): error: static assertion failed with "CTA tile for FP6 ElementD must have a contiguous extent that is a multiple of 128."
static_assert(!IsFP6 ||
^
detected during:
instantiation of "auto cutlass::epilogue::collective::detail::sm120_compute_tile_shape_or_override<ElementC,ElementD,EpilogueTileType,Schedule,TileShape_MNK,StrideD,FusionOp>() [with ElementC=cutlass::half_t, ElementD=cutlass::float_e3m2_t, EpilogueTileType=cutlass::epilogue::collective::EpilogueTileAuto, Schedule=cutlass::epilogue::collective::EpilogueScheduleAuto, TileShape_MNK=cute::tuple<cute::_128, cute::_32, cute::_128>, StrideD=cute::tuple<int64_t, cute::C<1>, int64_t>, FusionOp=cutlass::epilogue::fusion::LinCombBlockScaleFactor<32, cutlass::float_e3m2_t, float, cutlass::float_ue8m0_t, cutlass::layout::RowMajor, cutlass::half_t, float, cutlass::FloatRoundStyle::round_to_nearest>]" at line 492
instantiation of class "cutlass::epilogue::collective::CollectiveBuilder<cutlass::arch::Sm120, OpClass, TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute, ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD, Schedule, FusionOperation, std::enable_if_t<, void>> [with OpClass=cutlass::arch::OpClassBlockScaledTensorOp, TileShape_MNK=cute::tuple<cute::_128, cute::_32, cute::_128>, ClusterShape_MNK=cute::tuple<cute::_1, cute::_1, cute::_1>, EpilogueTileType=cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator=float, ElementCompute=float, ElementC=cutlass::half_t, GmemLayoutTagC=cutlass::layout::RowMajor, AlignmentC=8, ElementD=cutlass::float_e3m2_t, GmemLayoutTagD=cutlass::layout::RowMajor, AlignmentD=128, Schedule=cutlass::epilogue::collective::EpilogueScheduleAuto, FusionOperation=cutlass::epilogue::fusion::LinCombBlockScaleFactor<32, cutlass::float_e3m2_t, float, cutlass::float_ue8m0_t, cutlass::layout::RowMajor, cutlass::half_t, float, cutlass::FloatRoundStyle::round_to_nearest>]" at line 51 of /home/janboe/cutlass/build/tools/library/generated/gemm/120/f16_gemm_e2m1_e2m3/cutlass3x_sm120_bstensorop_gemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_ue8m0xe3m2_128x32x128_1x1x1_0_tnt_align128_cooperative_q_epiVs32t.cu
/home/janboe/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp(130): error: static assertion failed with "EPI_TILE_N must divide CTA_N"
static_assert(size<1>(CtaTileMNK{}) % size<1>(shape(EpilogueTile{})) == 0, "EPI_TILE_N must divide CTA_N");
^
detected during instantiation of class "cutlass::epilogue::collective::CollectiveEpilogue<cutlass::epilogue::Sm90TmaWarpSpecialized<StagesC_, StagesD_, FragmentSize_, ReuseSmemC_, DelayTmaStore_>, CtaTileMNK_, EpilogueTile_, ElementC_, StrideC_, ElementD_, StrideD_, FusionCallbacks_, CopyOpG2S_, SmemLayoutAtomC_, CopyOpS2R_, CopyOpS2G_, SmemLayoutAtomD_, CopyOpR2S_, CopyAtomC_, CopyOpR2R_> [with StagesC_=1, StagesD_=1, FragmentSize_=4, ReuseSmemC_=false, DelayTmaStore_=false, CtaTileMNK_=cute::tuple<cute::_128, cute::_32, cute::_128>, EpilogueTile_=cute::tuple<cute::_64, cute::_128>, ElementC_=cutlass::half_t, StrideC_=cute::tuple<int64_t, cute::C<1>, int64_t>, ElementD_=cutlass::float_e3m2_t, StrideD_=cute::tuple<int64_t, cute::C<1>, int64_t>, FusionCallbacks_=cutlass::epilogue::fusion::FusionCallbacks<cutlass::epilogue::Sm120TmaWarpSpecialized<1, 1, 4, false, false>, cutlass::epilogue::fusion::LinCombBlockScaleFactor<32, cutlass::float_e3m2_t, float, cutlass::float_ue8m0_t, cutlass::layout::RowMajor, cutlass::half_t, float, cutlass::FloatRoundStyle::round_to_nearest>, cute::tuple<cute::_128, cute::_32, cute::_128>, cute::tuple<cute::_64, cute::_128>>, CopyOpG2S_=cute::SM90_TMA_LOAD, SmemLayoutAtomC_=cute::ComposedLayout<cute::Swizzle<3, 4, 3>, cute::smem_ptr_flag_bits<16>, cute::Layout<cute::tuple<cute::_8, cute::_64>, cute::tuple<cute::_64, cute::_1>>>, CopyOpS2R_=cute::SM75_U32x2_LDSM_N, CopyOpS2G_=cute::SM90_TMA_STORE, SmemLayoutAtomD_=cute::ComposedLayout<cute::Swizzle<3, 4, 3>, cute::smem_ptr_flag_bits<8>, cute::Layout<cute::tuple<cute::_8, cute::_128>, cute::tuple<cute::_128, cute::_1>>>, CopyOpR2S_=cute::AutoVectorizingCopyWithAssumedAlignment<128>, CopyAtomC_=cute::Copy_Atom<cute::SM90_U32x2_STSM_N, cutlass::half_t>, CopyOpR2R_=void]" at line 64 of /home/janboe/cutlass/build/tools/library/generated/gemm/120/f16_gemm_e2m1_e2m3/cutlass3x_sm120_bstensorop_gemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_ue8m0xe3m2_128x32x128_1x1x1_0_tnt_align128_cooperative_q_epiVs32t.cu
2 errors detected in the compilation of "/home/janboe/cutlass/build/tools/library/generated/gemm/120/f16_gemm_e2m1_e2m3/cutlass3x_sm120_bstensorop_gemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_ue8m0xe3m2_128x32x128_1x1x1_0_tnt_align128_cooperative_q_epiVs32t.cu".
Steps/Code to reproduce bug
cmake -B build -DCUTLASS_NVCC_ARCHS="121a;121f"
cmake --build build -j10
Expected behavior
The build should complete successfully.
Environment details (please complete the following information):
NVIDIA DGX Spark