Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions examples/experiments/deepseek_v3_pretrain/run_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,6 @@ class ModelArguments:
"help": "Pre-training from existing paddleformers model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddleformers models."
},
)
num_hidden_layers: Optional[int] = field(
default=None,
metadata={"help": "num_hidden_layers."},
)


def create_pretrained_dataset(
Expand Down Expand Up @@ -482,9 +478,6 @@ def main():
config.vocab_size = max(config.vocab_size, ((tokenizer.vocab_size - 1) // 128 + 1) * 128)
logger.info(f"Reset vocab size to {config.vocab_size} for batter amp peformance.")

config.num_hidden_layers = (
model_args.num_hidden_layers if model_args.num_hidden_layers is not None else config.num_hidden_layers
)
# Config for models that use dropout, such as GPT.
if hasattr(config, "use_dualpipev"):
# NOTE(zhangyuqin): In Paddle, the segmentation and scheduling of pipeline parallel
Expand Down
7 changes: 0 additions & 7 deletions paddleformers/cli/train/pretrain/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,6 @@ class ModelArguments:
"help": "Pre-training from existing paddleformers model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddleformers models."
},
)
num_hidden_layers: Optional[int] = field(
default=None,
metadata={"help": "num_hidden_layers."},
)


def create_pretrained_dataset(
Expand Down Expand Up @@ -418,9 +414,6 @@ def run_dsv3_pretrain(model_args, data_args, generating_args, training_args):
config.vocab_size = max(config.vocab_size, ((tokenizer.vocab_size - 1) // 128 + 1) * 128)
logger.info(f"Reset vocab size to {config.vocab_size} for batter amp peformance.")

config.num_hidden_layers = (
model_args.num_hidden_layers if model_args.num_hidden_layers is not None else config.num_hidden_layers
)
# Config for models that use dropout, such as GPT.
if hasattr(config, "use_dualpipev"):
# NOTE(zhangyuqin): In Paddle, the segmentation and scheduling of pipeline parallel
Expand Down
Loading