Commit 0953409

Update
1 parent 7a700de commit 0953409

2 files changed

Lines changed: 20 additions & 28 deletions

README.md

Lines changed: 7 additions & 22 deletions
@@ -194,7 +194,7 @@ with torch.no_grad():
 
 </details>
 
-### Loading from Source Code
+### Loading from Source Code
 
 <details>
 <summary>Click to expand installation and usage code</summary>
@@ -274,17 +274,6 @@ pip install -e .
 
 </details>
 
-### Single Node Dry Run To Test Setup
-
-<details>
-<summary>Click to expand dry run command</summary>
-
-```bash
-bash shells/ov_encoder_base_stage1_si_dry_run.sh
-```
-
-</details>
-
 ### Single Node Stage-1 Single Image
 
 <details>
@@ -329,6 +318,8 @@ To evaluate the OneVision Encoder as a vision backbone for LLaVA-NeXT multimodal
 
 Navigate to the llava_next directory and follow the setup instructions:
 
+For more details, refer to the [LLaVA-NeXT documentation](llava_next/README.md).
+
 <details>
 <summary>Click to expand LLaVA-NeXT evaluation setup</summary>
 
@@ -345,7 +336,9 @@ docker run -it --gpus all --ipc host --net host --privileged \
 
 </details>
 
-#### Running Evaluation
+#### LLaVA-NeXT-Video Evaluation
+
+
 
 For image benchmarks (ChartQA, DocVQA, AI2D, OCRBench, etc.):
 
@@ -374,7 +367,7 @@ TASKS="videomme" bash scripts/eval/eval_ov_encoder.sh
 
 </details>
 
-For more details, refer to the [LLaVA-NeXT documentation](llava_next/README.md).
+
 
 ### Attentive Probe Evaluation
 
@@ -398,9 +391,6 @@ bash shells_eval_ap/eval_ov_encoder_large_16frames.sh
 
 </details>
 
-**Sampling-Specific Parameters:**
-
-- `frames_token_num`: Number of tokens per frame (e.g., 256 tokens for standard sampling).
 
 #### OV-Encoder Codec Evaluation
 
@@ -427,11 +417,6 @@ bash shells_eval_ap/eval_ov_encoder_large_2kpatches_codec.sh
 
 ---
 
-## 📄 License
-
-This project is released under the Apache 2.0 License.
-
-
 
 ## 🔗 Related Projects
 

eval_encoder/attentive_probe.py

Lines changed: 13 additions & 6 deletions
@@ -56,7 +56,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--smoothing", type=float, default=0.1)
     parser.add_argument("--print_freq", type=int, default=10)
     parser.add_argument("--eval_freq", type=int, default=1)
-    parser.add_argument("--frames_token_num", type=int, default=196)
+    parser.add_argument("--frames_token_num", type=int, default=256)
 
     # Dataloader
     parser.add_argument("--dali_num_threads", type=int, default=2)
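The default bump from 196 to 256 matches the README bullet this commit removes ("256 tokens for standard sampling"). Assuming a ViT-style square patch grid (the diff itself states no patch or image sizes, so the values below are illustrative), the two defaults correspond to 224-px and 256-px frames with 16-px patches:

```python
# Tokens per frame for a square patch grid: (image_size // patch_size) ** 2.
# The image/patch sizes below are assumptions for illustration, not from the commit.
def tokens_per_frame(image_size: int, patch_size: int) -> int:
    side = image_size // patch_size  # patches along one edge
    return side * side

print(tokens_per_frame(224, 16))  # 196, the old default
print(tokens_per_frame(256, 16))  # 256, the new default
```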
@@ -454,11 +454,18 @@ def evaluate(
 
 def get_model(args: argparse.Namespace) -> nn.Module:
     if args.model_name == "ov_encoder_large":
-        model = AutoModel.from_pretrained(
-            "lmms-lab-encoder/onevision-encoder-large", trust_remote_code=True, attn_implementation="flash_attention_2"
-        )
-        model = torch.compile(model)
-        return model
+        if os.path.isdir(args.model_weight):
+            from onevision_encoder.modeling_onevision_encoder import OneVisionEncoderModel
+            model = OneVisionEncoderModel.from_pretrained(
+                args.model_weight, trust_remote_code=True, attn_implementation="flash_attention_2"
+            )
+            return model
+        else:
+            model = AutoModel.from_pretrained(
+                "lmms-lab-encoder/onevision-encoder-large", trust_remote_code=True, attn_implementation="flash_attention_2"
+            )
+            model = torch.compile(model)
+            return model
 
     model = create_model(args.model_name, pretrained=False)
     if args.model_family in ["chunk_wise_sampling"]:
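The reworked `get_model` branch first checks whether `args.model_weight` points at a local checkpoint directory, loading `OneVisionEncoderModel` directly if so, and only falls back to the Hub id otherwise. A minimal sketch of that dispatch pattern (the helper name `resolve_model_source` is hypothetical, not from the repo):

```python
import os

def resolve_model_source(model_weight: str, hub_id: str) -> str:
    """Prefer a local checkpoint directory; otherwise fall back to the hub id.

    Mirrors the branching this commit adds to get_model(): a local
    fine-tuned checkpoint directory wins over the pretrained
    "lmms-lab-encoder/onevision-encoder-large" hub checkpoint.
    """
    if os.path.isdir(model_weight):
        return model_weight  # local weights: load the model class directly
    return hub_id            # hub weights: AutoModel.from_pretrained path

# A path that does not exist falls through to the hub id:
print(resolve_model_source("/no/such/dir", "lmms-lab-encoder/onevision-encoder-large"))
```

Note that in the diff only the hub branch wraps the model in `torch.compile`; whether that asymmetry is intentional is not stated in the commit.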