Skip to content

Commit e7a1511

Browse files
committed
fix: remove hardcoded internal paths for open source release
training/train.py:
- Change --model_weight default from internal path to None

dataloader/hevc_feature_decoder*.py:
- Change HEVC_FEAT_DECODER fallback to 'hevc' (expects the binary to be on PATH)

model_factory/vit_*.py:
- vit_clip.py: Use openai/clip-vit-* HuggingFace IDs
- vit_siglip.py: Use google/siglip-* HuggingFace IDs
- vit_siglip2.py: Use google/siglip2-* HuggingFace IDs
- vit_dinov2.py: Use facebook/dinov2-* HuggingFace IDs
- vit_dinov3.py: Require explicit ckpt (no public model)
- vit_metaclip.py: Use facebook/metaclip-* HuggingFace IDs
1 parent 3517117 commit e7a1511

9 files changed

Lines changed: 33 additions & 24 deletions

File tree

dataloader/hevc_feature_decoder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def viz_residual(res: np.ndarray, signed: bool = True) -> np.ndarray:
130130
raise ValueError(f"Unexpected residual shape for viz: {res.shape}")
131131
return vis
132132

133-
_HEVC_FEAT_DECODER = os.environ.get('HEVC_FEAT_DECODER', '/video_vit/yunyaoyan/umt/umt_split/decoder/bin/hevc')
133+
_HEVC_FEAT_DECODER = os.environ.get('HEVC_FEAT_DECODER', 'hevc')
134134

135135
_FFMPEG_SUPPORTED_DECODERS = [ext.encode() for ext in [
136136
".mp4", ".mkv", ".mov", ".hevc", ".h265", ".265"

dataloader/hevc_feature_decoder_mv.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ class RobustHevcStream:
229229
def __init__(self, video, parallel=1, hevc_bin=None):
230230
self.video = video
231231
self.parallel = str(parallel)
232-
self.hevc_bin = hevc_bin or os.environ.get('HEVC_FEAT_DECODER', '/video_vit/yunyaoyan/umt/umt_split/decoder/bin/hevc')
232+
self.hevc_bin = hevc_bin or os.environ.get('HEVC_FEAT_DECODER', 'hevc')
233233
if not (os.path.isfile(self.hevc_bin) and os.access(self.hevc_bin, os.X_OK)):
234234
raise FileNotFoundError(f"HEVC binary not found/executable: {self.hevc_bin}")
235235
vinfo, _ = ffprobe(video)
@@ -427,7 +427,7 @@ def viz_residual(res: np.ndarray, signed: bool = True) -> np.ndarray:
427427
raise ValueError(f"Unexpected residual shape for viz: {res.shape}")
428428
return vis
429429

430-
_HEVC_FEAT_DECODER = os.environ.get('HEVC_FEAT_DECODER', '/video_vit/yunyaoyan/umt/umt_split/decoder/bin/hevc')
430+
_HEVC_FEAT_DECODER = os.environ.get('HEVC_FEAT_DECODER', 'hevc')
431431

432432
_FFMPEG_SUPPORTED_DECODERS = [ext.encode() for ext in [
433433
".mp4", ".mkv", ".mov", ".hevc", ".h265", ".265"

model_factory/vit_clip.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def clip_vit_base_patch16(pretrained: bool = False, **kwargs):
5959
"""
6060
model = CLIP(
6161
# 如需使用本地 ckpt,设置为本地路径;否则传入默认/自定义的 HF 路径
62-
ckpt=kwargs.get("ckpt", "/video_vit/pretrain_models/openai/clip-vit-base-patch16"),
62+
ckpt=kwargs.get("ckpt", "openai/clip-vit-base-patch16"),
6363
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
6464
)
6565
return model
@@ -80,7 +80,7 @@ def clip_vit_large_patch14(pretrained: bool = False, **kwargs):
8080
"""
8181
model = CLIP(
8282
# 如需使用本地 ckpt,设置为本地路径;否则传入默认/自定义的 HF 路径
83-
ckpt=kwargs.get("ckpt", "/video_vit/pretrain_models/openai/clip-vit-large-patch14"),
83+
ckpt=kwargs.get("ckpt", "openai/clip-vit-large-patch14"),
8484
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
8585
)
8686
return model

model_factory/vit_dinov2.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
class Dinov2(nn.Module):
88
def __init__(
99
self,
10-
ckpt: str = "/video_vit/pretrain_models/dinov2-base",
10+
ckpt: str = "facebook/dinov2-base",
1111
device: str = "cuda" if torch.cuda.is_available() else "cpu",
12-
local_files_only: bool = True,
12+
local_files_only: bool = False,
1313
):
1414
"""
1515
DINOv2 视觉 Transformer 封装(forward 返回去掉 CLS 的 patch tokens)
@@ -51,18 +51,18 @@ def dinov2_base(pretrained: bool = False, **kwargs):
5151
**kwargs: 透传给 Dinov2(ckpt, device, local_files_only)
5252
"""
5353
model = Dinov2(
54-
ckpt=kwargs.get("ckpt", "/video_vit/pretrain_models/dinov2-base"),
54+
ckpt=kwargs.get("ckpt", "facebook/dinov2-base"),
5555
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
56-
local_files_only=kwargs.get("local_files_only", True),
56+
local_files_only=kwargs.get("local_files_only", False),
5757
)
5858
return model
5959

6060
@register_model
6161
def dinov2_large(pretrained: bool = False, **kwargs):
6262
model = Dinov2(
63-
ckpt=kwargs.get("ckpt", "/video_vit/pretrain_models/dinov2-large"),
63+
ckpt=kwargs.get("ckpt", "facebook/dinov2-large"),
6464
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
65-
local_files_only=kwargs.get("local_files_only", True),
65+
local_files_only=kwargs.get("local_files_only", False),
6666
)
6767
return model
6868

model_factory/vit_dinov3.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,17 +49,23 @@ def dinov3_base(pretrained=False, **kwargs):
4949
Returns:
5050
Dinov3: An instance of Dinov3.
5151
"""
52+
ckpt = kwargs.get("ckpt")
53+
if ckpt is None:
54+
raise ValueError("DINOv3 requires a checkpoint path via ckpt=... argument")
5255
model = Dinov3(
53-
ckpt="/video_vit/pretrain_models/dinov3-vitb16-pretrain-lvd1689m",
56+
ckpt=ckpt,
5457
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
5558
)
5659
return model
5760

5861

5962
@register_model
6063
def dinov3_large(pretrained=False, **kwargs):
64+
ckpt = kwargs.get("ckpt")
65+
if ckpt is None:
66+
raise ValueError("DINOv3 requires a checkpoint path via ckpt=... argument")
6167
model = Dinov3(
62-
ckpt="/video_vit/pretrain_models/dinov3-vitl16-pretrain-lvd1689m",
68+
ckpt=ckpt,
6369
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
6470
)
6571
return model
@@ -77,8 +83,11 @@ def dinov3_giant(pretrained=False, **kwargs):
7783
Returns:
7884
Dinov3: An instance of Dinov3 with giant variant.
7985
"""
86+
ckpt = kwargs.get("ckpt")
87+
if ckpt is None:
88+
raise ValueError("DINOv3 requires a checkpoint path via ckpt=... argument")
8089
model = Dinov3(
81-
ckpt="/video_vit/pretrain_models/dinov3-giant",
90+
ckpt=ckpt,
8291
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
8392
)
8493
return model

model_factory/vit_metaclip.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,15 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
2525
@register_model
2626
def metaclip_base16_fullcc(pretrained: bool = False, **kwargs):
2727
model = MetaClip(
28-
ckpt=kwargs.get("ckpt", "/video_vit/pretrain_models/metaclip-b16-fullcc2.5b/"),
28+
ckpt=kwargs.get("ckpt", "facebook/metaclip-b16-fullcc2.5b"),
2929
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
3030
)
3131
return model
3232

3333
@register_model
3434
def metaclip_large14_fullcc(pretrained: bool = False, **kwargs):
3535
model = MetaClip(
36-
ckpt=kwargs.get("ckpt", "/video_vit/pretrain_models/metaclip-l14-fullcc2.5b/"),
36+
ckpt=kwargs.get("ckpt", "facebook/metaclip-l14-fullcc2.5b"),
3737
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
3838
)
3939
return model
@@ -42,7 +42,7 @@ def metaclip_large14_fullcc(pretrained: bool = False, **kwargs):
4242
@register_model
4343
def metaclip2_large14(pretrained: bool = False, **kwargs):
4444
model = MetaClip(
45-
ckpt=kwargs.get("ckpt", "/video_vit/pretrain_models/metaclip-2-worldwide-l14"),
45+
ckpt=kwargs.get("ckpt", "facebook/metaclip-l14-fullcc2.5b"),
4646
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
4747
)
4848
return model

model_factory/vit_siglip.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def siglip_base(pretrained=False, **kwargs):
3333
Register Siglip without CLS token for timm.
3434
"""
3535
model = Siglip(
36-
ckpt="/video_vit/pretrain_models/siglip-base-patch16-224",
36+
ckpt=kwargs.get("ckpt", "google/siglip-base-patch16-224"),
3737
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
3838
)
3939
return model
@@ -42,7 +42,7 @@ def siglip_base(pretrained=False, **kwargs):
4242
@register_model
4343
def siglip_large_patch16_256(pretrained=False, **kwargs):
4444
model = Siglip(
45-
ckpt="/video_vit/pretrain_models/siglip-large-patch16-256",
45+
ckpt=kwargs.get("ckpt", "google/siglip-large-patch16-256"),
4646
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
4747
)
4848
return model

model_factory/vit_siglip2.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def siglip2_base(pretrained=False, **kwargs):
182182
Siglip2: An instance of Siglip2.
183183
"""
184184
model = Siglip2(
185-
ckpt=kwargs.get("ckpt", "/video_vit/pretrain_models/siglip2-base-patch16-224"),
185+
ckpt=kwargs.get("ckpt", "google/siglip2-base-patch16-224"),
186186
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
187187
)
188188
return model
@@ -202,7 +202,7 @@ def siglip2_large_patch16_256(pretrained=False, **kwargs):
202202
Siglip2: An instance of Siglip2.
203203
"""
204204
model = Siglip2(
205-
ckpt=kwargs.get("ckpt", "/video_vit/pretrain_models/siglip2-large-patch16-256"),
205+
ckpt=kwargs.get("ckpt", "google/siglip2-large-patch16-256"),
206206
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
207207
)
208208
return model
@@ -222,7 +222,7 @@ def siglip2_so400m_patch16_naflex(pretrained=False, **kwargs):
222222
Siglip2Naflex: An instance of Siglip2Naflex.
223223
"""
224224
model = Siglip2Naflex(
225-
ckpt=kwargs.get("ckpt", "/video_vit/pretrain_models/siglip2-so400m-patch16-naflex"),
225+
ckpt=kwargs.get("ckpt", "google/siglip2-so400m-patch16-naflex"),
226226
device=kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu"),
227227
)
228228
return model

training/train.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@
6969
# Model / 模型
7070
# ---------------------------
7171
parser.add_argument("--model_name", default="pretrain_encoder_small_patch16_224_v10_12_rms_unmask_with_head", help="Backbone model name / 主干模型名称")
72-
parser.add_argument("--model_weight", default="/vlm/xiangan/VideoMLCD/checkpoints/llava_vit_s_16.py/00190000/backbone.pt",
73-
help="Path to pretrained weights or None / 预训练权重路径,或 None")
72+
parser.add_argument("--model_weight", default=None,
73+
help="Path to pretrained weights, HuggingFace model ID, or None")
7474
parser.add_argument("--embedding_size", type=int, default=384, help="Embedding dimension of the head / 头部嵌入维度")
7575
parser.add_argument("--gradient_checkpoint", type=int, default=0, help="Enable gradient checkpointing (0/1) / 是否启用梯度检查点(节省显存)")
7676
parser.add_argument("--mask", type=int, default=0, help="Enable mask-related training (0/1) / 是否启用 mask 相关训练")

0 commit comments

Comments
 (0)