1+ {
2+ "Activation Sharding Dump" : [
3+ {
4+ "deepseek/inputs: bfloat16[96,2048,2048]" : {
5+ "logic_axes" : " ('activation_batch', 'activation_norm_length', 'activation_embed')" ,
6+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
7+ }
8+ },
9+ {
10+ "deepseek/pre_attention_norm: bfloat16[96,2048,2048]" : {
11+ "logic_axes" : " ('activation_batch', 'activation_norm_length', 'activation_embed')" ,
12+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
13+ }
14+ },
15+ {
16+ "attention_mla/inputs_q: bfloat16[96,2048,2048]" : {
17+ "logic_axes" : " ('activation_batch_attn', 'activation_length', 'activation_embed')" ,
18+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
19+ }
20+ },
21+ {
22+ "attention_mla/inputs_kv: bfloat16[96,2048,2048]" : {
23+ "logic_axes" : " ('activation_batch_attn', 'activation_length', 'activation_embed')" ,
24+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
25+ }
26+ },
27+ {
28+ "attention_mla/q_nope: bfloat16[96,2048,16,128]" : {
29+ "logic_axes" : " ('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')" ,
30+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None, None)"
31+ }
32+ },
33+ {
34+ "attention_mla/q_pe: bfloat16[96,2048,16,64]" : {
35+ "logic_axes" : " ('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')" ,
36+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None, None)"
37+ }
38+ },
39+ {
40+ "attention_mla/query: bfloat16[96,2048,16,192]" : {
41+ "logic_axes" : " ('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')" ,
42+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None, None)"
43+ }
44+ },
45+ {
46+ "attention_mla/key_nope: bfloat16[96,2048,16,128]" : {
47+ "logic_axes" : " ('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')" ,
48+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None, None)"
49+ }
50+ },
51+ {
52+ "attention_mla/key_rope: bfloat16[96,2048,16,64]" : {
53+ "logic_axes" : " ('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')" ,
54+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None, None)"
55+ }
56+ },
57+ {
58+ "attention_mla/key: bfloat16[96,2048,16,192]" : {
59+ "logic_axes" : " ('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')" ,
60+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None, None)"
61+ }
62+ },
63+ {
64+ "attention_mla/value: bfloat16[96,2048,16,128]" : {
65+ "logic_axes" : " ('activation_kv_batch', 'activation_length', 'activation_kv_heads', 'activation_kv_head_dim')" ,
66+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None, None)"
67+ }
68+ },
69+ {
70+ "attention_op/arr: int8[1,4,4]" : {
71+ "logic_axes" : " Unknown" ,
72+ "PartitionSpec" : " P(None, 'context')"
73+ }
74+ },
75+ {
76+ "attention_op/arr: int32[2048]" : {
77+ "logic_axes" : " Unknown" ,
78+ "PartitionSpec" : " P('context',)"
79+ }
80+ },
81+ {
82+ "attention_op/query: bfloat16[96,16,2048,192]" : {
83+ "logic_axes" : " Unknown" ,
84+ "PartitionSpec" : " P(('fsdp', 'expert'), None, 'context', None)"
85+ }
86+ },
87+ {
88+ "attention_op/key: bfloat16[96,16,2048,192]" : {
89+ "logic_axes" : " Unknown" ,
90+ "PartitionSpec" : " P(('fsdp', 'expert'), None, None, None)"
91+ }
92+ },
93+ {
94+ "attention_op/value: bfloat16[96,16,2048,128]" : {
95+ "logic_axes" : " Unknown" ,
96+ "PartitionSpec" : " P(('fsdp', 'expert'), None, None, None)"
97+ }
98+ },
99+ {
100+ "attention_mla/out: bfloat16[96,2048,16,128]" : {
101+ "logic_axes" : " ('activation_batch_attn', 'activation_length', 'activation_heads', 'activation_kv')" ,
102+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None, None)"
103+ }
104+ },
105+ {
106+ "deepseek/attention_result: bfloat16[96,2048,2048]" : {
107+ "logic_axes" : " ('activation_batch', 'activation_norm_length', 'activation_embed')" ,
108+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
109+ }
110+ },
111+ {
112+ "deepseek/post_attention_norm: bfloat16[96,2048,2048]" : {
113+ "logic_axes" : " ('activation_batch', 'activation_norm_length', 'activation_embed')" ,
114+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
115+ }
116+ },
117+ {
118+ "linears/x: bfloat16[96,2048,10944]" : {
119+ "logic_axes" : " ('activation_batch', 'activation_length', 'activation_mlp')" ,
120+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
121+ }
122+ },
123+ {
124+ "deepseek/mlp: bfloat16[96,2048,2048]" : {
125+ "logic_axes" : " ('activation_batch', 'activation_norm_length', 'activation_embed')" ,
126+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
127+ }
128+ },
129+ {
130+ "deepseek/x: bfloat16[96,2048,2048]" : {
131+ "logic_axes" : " ('activation_batch', 'activation_norm_length', 'activation_embed')" ,
132+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
133+ }
134+ },
135+ {
136+ "moe/inputs: bfloat16[96,2048,2048]" : {
137+ "logic_axes" : " ('activation_batch', 'activation_norm_length', None)" ,
138+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
139+ }
140+ },
141+ {
142+ "moe/gate_logits: bfloat16[96,2048,64]" : {
143+ "logic_axes" : " ('activation_batch', 'activation_norm_length', None)" ,
144+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
145+ }
146+ },
147+ {
148+ "moe/w0_kernel: bfloat16[64,2048,1408]" : {
149+ "logic_axes" : " Unknown" ,
150+ "PartitionSpec" : " P(('context', 'expert'), None, None)"
151+ }
152+ },
153+ {
154+ "moe/w1_kernel: bfloat16[64,2048,1408]" : {
155+ "logic_axes" : " Unknown" ,
156+ "PartitionSpec" : " P(('context', 'expert'), None, None)"
157+ }
158+ },
159+ {
160+ "moe/wo_kernel: bfloat16[64,1408,2048]" : {
161+ "logic_axes" : " Unknown" ,
162+ "PartitionSpec" : " P(('context', 'expert'), None, None)"
163+ }
164+ },
165+ {
166+ "linears/x: bfloat16[96,2048,2816]" : {
167+ "logic_axes" : " ('activation_batch', 'activation_length', 'activation_mlp')" ,
168+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
169+ }
170+ },
171+ {
172+ "deepseek/mlp_lnx: bfloat16[96,2048,2048]" : {
173+ "logic_axes" : " ('activation_batch', 'activation_norm_length', 'activation_embed')" ,
174+ "PartitionSpec" : " P(('fsdp', 'expert'), 'context', None)"
175+ }
176+ }
177+ ]
178+ }
0 commit comments