@@ -553,7 +553,7 @@ def main():
553553 print (" - Driver/CUDA-compat < 13.1" )
554554 print (" - Outdated cuda-bindings (check PyTorch warnings above)" )
555555 print ("Annotations will not be recorded, but the demo will still run." )
556- print ("Any lane changes you see are from cleanup passes , not annotations .\n " )
556+ print ("Kernels will be reassigned to the default lane , not semantic lanes .\n " )
557557
558558 output_dir = Path ("traces" )
559559
@@ -888,11 +888,6 @@ def comm_annotation_demo():
888888# - Verify that ``enable_annotations=True`` was passed to ``torch.cuda.graph()``
889889# - Ensure ``cuda-python`` is installed
890890#
891- # **Kernels still overlapping in the trace?**
892- #
893- # - The cleanup passes should handle this automatically
894- # - If issues persist, try assigning explicit stream IDs in ``mark_kernels``
895- #
896891# **Annotations not showing up in specific kernels?**
897892#
898893# - Some operations may not launch kernels (e.g., tensor views)
@@ -916,7 +911,7 @@ def comm_annotation_demo():
916911# - Annotate communication collectives to recover the NCCL metadata
917912# (collective type, message size, group, rank) that CUDA graphs drop but
918913# eager traces expose
919- # - Post-process traces with ``annotate_trace()`` and cleanup passes
914+ # - Post-process traces with ``annotate_trace()``
920915# - View results in https://ui.perfetto.dev/ for intuitive visualization
921916#
922917# This technique is especially valuable for large models with many components,
0 commit comments