File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -206,6 +206,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
206206adam_b2 : 0.999 # Exponential decay rate to track the second moment of past gradients.
207207adam_eps : 1.e-8 # A small constant applied to denominator outside of the square root.
208208adam_weight_decay : 1.e-2 # AdamW Weight decay
209+ opt_enable_grad_clipping : False
210+ max_grad_value : 1.0
211+ opt_enable_grad_global_norm_clipping : False
209212max_grad_norm : 1.0
210213
211214enable_profiler : False
Original file line number Diff line number Diff line change @@ -211,6 +211,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
211211adam_b2 : 0.999 # Exponential decay rate to track the second moment of past gradients.
212212adam_eps : 1.e-8 # A small constant applied to denominator outside of the square root.
213213adam_weight_decay : 1.e-2 # AdamW Weight decay
214+ opt_enable_grad_clipping : False
215+ max_grad_value : 1.0
216+ opt_enable_grad_global_norm_clipping : False
214217max_grad_norm : 1.0
215218
216219enable_profiler : False
Original file line number Diff line number Diff line change @@ -221,6 +221,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
221221adam_b2 : 0.999 # Exponential decay rate to track the second moment of past gradients.
222222adam_eps : 1.e-8 # A small constant applied to denominator outside of the square root.
223223adam_weight_decay : 1.e-2 # AdamW Weight decay
224+ opt_enable_grad_clipping : False
225+ max_grad_value : 1.0
226+ opt_enable_grad_global_norm_clipping : False
224227max_grad_norm : 1.0
225228
226229enable_profiler : False
Original file line number Diff line number Diff line change @@ -245,6 +245,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
245245adam_b2 : 0.999 # Exponential decay rate to track the second moment of past gradients.
246246adam_eps : 1.e-8 # A small constant applied to denominator outside of the square root.
247247adam_weight_decay : 0 # AdamW Weight decay
248+ opt_enable_grad_clipping : False
249+ max_grad_value : 1.0
250+ opt_enable_grad_global_norm_clipping : False
248251max_grad_norm : 1.0
249252
250253enable_profiler : False
Original file line number Diff line number Diff line change @@ -232,6 +232,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
232232adam_b2 : 0.999 # Exponential decay rate to track the second moment of past gradients.
233233adam_eps : 1.e-8 # A small constant applied to denominator outside of the square root.
234234adam_weight_decay : 1.e-2 # AdamW Weight decay
235+ opt_enable_grad_clipping : False
236+ max_grad_value : 1.0
237+ opt_enable_grad_global_norm_clipping : False
235238max_grad_norm : 1.0
236239
237240enable_profiler : False
Original file line number Diff line number Diff line change @@ -240,6 +240,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
240240adam_b2 : 0.999 # Exponential decay rate to track the second moment of past gradients.
241241adam_eps : 1.e-8 # A small constant applied to denominator outside of the square root.
242242adam_weight_decay : 1.e-2 # AdamW Weight decay
243+ opt_enable_grad_clipping : False
244+ max_grad_value : 1.0
245+ opt_enable_grad_global_norm_clipping : False
243246max_grad_norm : 1.0
244247
245248enable_profiler : False
Original file line number Diff line number Diff line change @@ -301,6 +301,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
301301adam_b2 : 0.999 # Exponential decay rate to track the second moment of past gradients.
302302adam_eps : 1.e-8 # A small constant applied to denominator outside of the square root.
303303adam_weight_decay : 0 # AdamW Weight decay
304+ opt_enable_grad_clipping : False
305+ max_grad_value : 1.0
306+ opt_enable_grad_global_norm_clipping : False
304307max_grad_norm : 1.0
305308
306309enable_profiler : False
Original file line number Diff line number Diff line change @@ -257,6 +257,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
257257adam_b2 : 0.999 # Exponential decay rate to track the second moment of past gradients.
258258adam_eps : 1.e-8 # A small constant applied to denominator outside of the square root.
259259adam_weight_decay : 0 # AdamW Weight decay
260+ opt_enable_grad_clipping : False
261+ max_grad_value : 1.0
262+ opt_enable_grad_global_norm_clipping : False
260263max_grad_norm : 1.0
261264
262265enable_profiler : False
Original file line number Diff line number Diff line change @@ -268,6 +268,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
268268adam_b2 : 0.999 # Exponential decay rate to track the second moment of past gradients.
269269adam_eps : 1.e-8 # A small constant applied to denominator outside of the square root.
270270adam_weight_decay : 0 # AdamW Weight decay
271+ opt_enable_grad_clipping : False
272+ max_grad_value : 1.0
273+ opt_enable_grad_global_norm_clipping : False
271274max_grad_norm : 1.0
272275
273276enable_profiler : False
Original file line number Diff line number Diff line change @@ -263,6 +263,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
263263adam_b2 : 0.999 # Exponential decay rate to track the second moment of past gradients.
264264adam_eps : 1.e-8 # A small constant applied to denominator outside of the square root.
265265adam_weight_decay : 0 # AdamW Weight decay
266+ opt_enable_grad_clipping : False
267+ max_grad_value : 1.0
268+ opt_enable_grad_global_norm_clipping : False
266269max_grad_norm : 1.0
267270
268271enable_profiler : False
You can’t perform that action at this time.
0 commit comments