Skip to content

Commit a96996f

Browse files
authored
Merge pull request #196 from xming521/dev
v0.3.02
2 parents f67d8d2 + 5dcaad0 commit a96996f

19 files changed

Lines changed: 209 additions & 74 deletions

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ data/test
153153
test-scripts-my/
154154
*.csv
155155
!tests/tests_data/test_person/test_0_730.csv
156+
!tests/tests_data/test_PII/test_0_730.csv
156157
*test.*
157158
*-exp.*
158159
experiment/
@@ -185,3 +186,10 @@ dataset/**/*.ico
185186
dataset/*telegram*/*
186187
!*.gitkeep
187188
WC-exp/*
189+
190+
modeloutputs/*
191+
/tmp/*
192+
cache.pkl
193+
hfd.sh
194+
rpa_cache.pkl
195+
settings-bot8006.jsonc

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ ci:
99

1010
repos:
1111
- repo: https://github.com/pre-commit/pre-commit-hooks
12-
rev: v5.0.0
12+
rev: v6.0.0
1313
hooks:
1414
- id: check-ast # Python 语法检查
1515
- id: check-added-large-files # 防止大文件
@@ -27,7 +27,7 @@ repos:
2727
args: ["--fix=lf"]
2828

2929
- repo: https://github.com/astral-sh/ruff-pre-commit
30-
rev: v0.12.7
30+
rev: v0.12.8
3131
hooks:
3232
- id: ruff
3333
args: [--fix]

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858
> [!IMPORTANT]
5959
> - WeClone is still in a rapid iteration phase; current performance does not represent final results.
6060
> - LLM fine-tuning effectiveness largely depends on model size, quantity and quality of chat data. Theoretically, larger models with more data yield better results.
61-
> - 7B models are prone to becoming "dumb", 14B models can barely communicate, while 32B+ models perform much better.
61+
> - The performance of the 7B model is average, while models with 14B or more parameters tend to deliver better results.
6262
> - Windows environment has not been rigorously tested. You can use WSL as the runtime environment.
6363
6464
### Recent Updates

README_zh.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
> [!IMPORTANT]
5757
> - WeClone仍在快速迭代期,当前效果不代表最终效果。
5858
> - 微调LLM效果很大程度取决于模型大小、聊天数据的数量和质量,理论上模型越大,数据越多,效果越好。
59-
> - 7B模型很容易训练成为大笨蛋,14B模型勉强可以交流,32B及以上的模型效果会更好
59+
> - 7B模型效果一般,14B及以上的模型效果会更好
6060
> - Windows环境未进行严格测试,可以使用WSL作为运行环境。
6161
6262
### 近期更新

WC-exp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Subproject commit 8b2332e37cbb0b1481747639406627b77cc3573c

dataset/res_csv/sft/dataset_info.json

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,33 @@
11
{
22
"chat-sft": {
3+
"file_name": "./sft-my.json",
4+
"formatting": "sharegpt",
5+
"columns": {
6+
"messages": "messages",
7+
"system": "system"
8+
},
9+
"tags": {
10+
"role_tag": "role",
11+
"content_tag": "content",
12+
"user_tag": "user",
13+
"assistant_tag": "assistant"
14+
}
15+
},
16+
"chat-sft-cleaned": {
17+
"file_name": "./sft-my-cleaned.json",
18+
"formatting": "sharegpt",
19+
"columns": {
20+
"messages": "messages",
21+
"system": "system"
22+
},
23+
"tags": {
24+
"role_tag": "role",
25+
"content_tag": "content",
26+
"user_tag": "user",
27+
"assistant_tag": "assistant"
28+
}
29+
},
30+
"chat-sft-vl": {
331
"file_name": "./sft-my.json",
432
"formatting": "sharegpt",
533
"columns": {
@@ -14,7 +42,7 @@
1442
"assistant_tag": "assistant"
1543
}
1644
},
17-
"chat-sft-cleaned": {
45+
"chat-sft-vl-cleaned": {
1846
"file_name": "./sft-my-cleaned.json",
1947
"formatting": "sharegpt",
2048
"columns": {

examples/mllm.template.jsonc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,12 @@
6161
"freeze_multi_modal_projector": false, //MLLM 训练时是否冻结多模态投影器。
6262
"use_fast_tokenizer": true,
6363
"lora_target": "q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2",
64-
"lora_rank": 4,
64+
"lora_rank": 8,
6565
"lora_dropout": 0.25,
6666
"weight_decay": 0.1,
6767
"overwrite_cache": true,
68-
"per_device_train_batch_size": 4,
69-
"gradient_accumulation_steps": 8,
68+
"per_device_train_batch_size": 2,
69+
"gradient_accumulation_steps": 16,
7070
"lr_scheduler_type": "cosine",
7171
"cutoff_len": 4096,
7272
"logging_steps": 10,

examples/tg.template.jsonc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,12 @@
6666
"freeze_multi_modal_projector": false, // Whether to freeze the multimodal projector during MLLM training
6767
"use_fast_tokenizer": true,
6868
"lora_target": "q_proj,v_proj,visual.merger.mlp.0,visual.merger.mlp.2",
69-
"lora_rank": 4,
70-
"lora_dropout": 0.2,
69+
"lora_rank": 8,
70+
"lora_dropout": 0.25,
7171
"weight_decay": 0.1,
7272
"overwrite_cache": true,
73-
"per_device_train_batch_size": 4,
74-
"gradient_accumulation_steps": 8,
73+
"per_device_train_batch_size": 2,
74+
"gradient_accumulation_steps": 16,
7575
"lr_scheduler_type": "cosine",
7676
"cutoff_len": 4096,
7777
"logging_steps": 10,

pyproject.toml

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "WeClone"
3-
version = "0.3.01"
3+
version = "0.3.02"
44
description = "One-stop solution for creating your digital avatar from chat history"
55
authors = [{ name = "xming521" }]
66
readme = "README.md"
@@ -17,28 +17,29 @@ dependencies = [
1717
"loguru>=0.7.3",
1818
"tomli; python_version < '3.11'",
1919
"langchain",
20-
"openai==1.76.0",
20+
"openai==1.87.0",
2121
"pip"
2222
]
2323

2424
[tool.weclone]
2525
# Configuration file version number. This number should be incremented when the configuration file structure or important default values change.
26-
config_version = "0.3.0"
26+
config_version = "0.3.02"
2727

2828
config_changelog = """
29-
[0.2.22] - 2025-06-05 - Support image modality chat history fine-tuning
3029
[0.2.24] - 2025-06-19 - add test_model_args and vllm_args.
3130
[0.3.00] - 2025-06-30 - Support TG chat logs, add language parameter, add log level parameter.
31+
[0.3.02] - 2025-08-15 - Allow using enable_thinking to control offline cleaning.
3232
"""
3333

3434
[dependency-groups]
3535
main = [
3636
"llamafactory @ git+https://github.com/hiyouga/LLaMA-Factory.git",
37-
"vllm==0.9.1; platform_system == 'Linux'",
38-
"torch==2.7.0",
39-
"transformers==4.52.1",
37+
"vllm==0.10.0; platform_system == 'Linux'",
38+
"torch==2.7.1",
39+
"torchvision==0.22.1",
40+
"transformers==4.53.2",
4041
"accelerate==1.7.0",
41-
"triton==3.3.0; platform_system == 'Linux'",
42+
"triton==3.3.1; platform_system == 'Linux'",
4243
"presidio_analyzer[transformers]",
4344
"presidio_anonymizer",
4445
]

settings.template.jsonc

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
"clean_strategy": "llm",
4242
"llm": {
4343
"accept_score": 2, //可以接受的llm打分阈值,1分最差,5分最好,低于此分数的数据不会用于训练
44+
"enable_thinking": true
4445
}
4546
},
4647
"online_llm_clear": false,
@@ -63,12 +64,12 @@
6364
"dataset_dir": "./dataset/res_csv/sft",
6465
"use_fast_tokenizer": true,
6566
"lora_target": "q_proj,v_proj",
66-
"lora_rank": 4,
67-
"lora_dropout": 0.3,
67+
"lora_rank": 8,
68+
"lora_dropout": 0.25,
6869
"weight_decay": 0.1,
6970
"overwrite_cache": true,
70-
"per_device_train_batch_size": 8,
71-
"gradient_accumulation_steps": 4,
71+
"per_device_train_batch_size": 2,
72+
"gradient_accumulation_steps": 16,
7273
"lr_scheduler_type": "cosine",
7374
"cutoff_len": 2048,
7475
"logging_steps": 10,
@@ -84,7 +85,7 @@
8485
"infer_args": {
8586
"repetition_penalty": 1.2,
8687
"temperature": 0.5,
87-
"max_length": 50,
88+
"max_length": 256,
8889
"top_p": 0.65
8990
},
9091
"vllm_args": {

0 commit comments

Comments
 (0)