Revert to DeepSeek-OCR and disable scale-to-zero

MichaelWalker-git · claude · MichaelWalker-git · commit 2caba448230d · 2026-02-06T10:53:16.000-08:00
- Reverted from DeepSeek-OCR-2 to DeepSeek-OCR (weight mismatch issue)
- Reduced IMAGE_SIZE from 768 to 512 in docker/custom_config.py
- Disabled scale-to-zero: min=1, max=20, desired=1, capacity provider base=1
- Kept g5.xlarge with bfloat16 for better performance

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -1,4 +1,4 @@
-# DeepSeek-OCR-2 vLLM Docker Image
+# DeepSeek-OCR vLLM Docker Image
 # Based on official vLLM OpenAI image for better compatibility
 # Supports BF16 inference on g5.xlarge (A10G GPU)
 
@@ -15,11 +15,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     git ca-certificates curl && \
     rm -rf /var/lib/apt/lists/*
 
-# Fetch upstream DeepSeek-OCR-2 sources at build time
+# Fetch upstream DeepSeek-OCR sources at build time
 RUN git clone --depth 1 https://github.com/deepseek-ai/DeepSeek-OCR.git /app/DeepSeek-OCR-src
 
 # Copy the DeepSeek-OCR vLLM implementation (correct nested path)
-# Note: DeepSeek-OCR-2 uses the same vLLM implementation structure
+# Upstream nests the vLLM code under DeepSeek-OCR-master/DeepSeek-OCR-vllm
 RUN cp -r /app/DeepSeek-OCR-src/DeepSeek-OCR-master/DeepSeek-OCR-vllm /app/DeepSeek-OCR-vllm
 
 # Copy custom files to replace the originals (transparent replacement approach)
@@ -35,7 +35,7 @@ COPY custom_run_dpsk_ocr_eval_batch.py ./DeepSeek-OCR-vllm/run_dpsk_ocr_eval_bat
 # Copy the startup script
 COPY start_server.py .
 
-# Upgrade pip and install core dependencies with specific versions for DeepSeek-OCR-2
+# Upgrade pip and install core dependencies
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir \
     torch==2.6.0 \
@@ -76,8 +76,8 @@ ENV HF_HOME="/app/models"
 ENV TRANSFORMERS_CACHE="/app/models"
 ENV HUGGINGFACE_HUB_CACHE="/app/models"
 
-# Default model configuration for DeepSeek-OCR-2
-ENV MODEL_PATH="deepseek-ai/DeepSeek-OCR-2"
+# Default model configuration for DeepSeek-OCR
+ENV MODEL_PATH="deepseek-ai/DeepSeek-OCR"
 ENV VLLM_TORCH_DTYPE="bfloat16"
 
 # Make the scripts executable
diff --git a/docker/custom_config.py b/docker/custom_config.py
@@ -1,4 +1,4 @@
-# Custom configuration for DeepSeek-OCR-2 vLLM
+# Custom configuration for DeepSeek-OCR vLLM
 # This file replaces the original config.py during Docker build
 # Modify the PROMPT value below to change the default prompt used by the OCR service
 
@@ -9,10 +9,10 @@
 # Small: base_size = 640, image_size = 640, crop_mode = False
 # Base: base_size = 1024, image_size = 1024, crop_mode = False
 # Large: base_size = 1280, image_size = 1280, crop_mode = False
-# Gundam: base_size = 1024, image_size = 768, crop_mode = True (recommended for OCR-2)
+# Gundam: base_size = 1024, image_size = 640, crop_mode = True
 
 BASE_SIZE = 1024
-IMAGE_SIZE = 768  # Updated for DeepSeek-OCR-2 (larger than OCR-1)
+IMAGE_SIZE = 512  # Standard size for DeepSeek-OCR
 CROP_MODE = True
 MIN_CROPS = 2
 MAX_CROPS = 6  # max:9; If your GPU memory is small, it is recommended to set it to 6.
@@ -21,9 +21,9 @@
 PRINT_NUM_VIS_TOKENS = False
 SKIP_REPEAT = True
 
-# DeepSeek-OCR-2 Model Configuration
+# DeepSeek-OCR Model Configuration
 # Use environment variables for flexibility (Golden AMI may override)
-MODEL_PATH = os.environ.get('MODEL_PATH', 'deepseek-ai/DeepSeek-OCR-2')
+MODEL_PATH = os.environ.get('MODEL_PATH', 'deepseek-ai/DeepSeek-OCR')
 VLLM_TORCH_DTYPE = os.environ.get('VLLM_TORCH_DTYPE', 'bfloat16')  # BF16 for g5 (A10G GPU)
 
 # Check for pre-cached model in Golden AMI location
diff --git a/docker/start_server.py b/docker/start_server.py
@@ -125,7 +125,7 @@ def initialize_model():
     global llm, sampling_params
 
     if llm is None:
-        print("Initializing DeepSeek-OCR-2 model...")
+        print("Initializing DeepSeek-OCR model...")
         print(f"Model path from config: {MODEL_PATH}")
 
         # Get environment variable overrides
@@ -163,7 +163,7 @@ def initialize_model():
 
         # Initialize vLLM engine with the Hugging Face repository ID
         llm = LLM(
-            model=model_path,  # Use HF repository ID: "deepseek-ai/DeepSeek-OCR-2"
+            model=model_path,  # Use HF repository ID: "deepseek-ai/DeepSeek-OCR"
             hf_overrides={"architectures": ["DeepseekOCRForCausalLM"]},
             enforce_eager=True,
             trust_remote_code=True,
diff --git a/src/constructs/deepseek-ocr-ecs.ts b/src/constructs/deepseek-ocr-ecs.ts
@@ -246,7 +246,7 @@ export class DeepSeekOcrEc2GpuConstruct extends Construct {
         {
           capacityProvider: capacityProvider.capacityProviderName,
           weight: 1,
-          base: 0, // Scale-to-zero: no base capacity required
+          base: 1, // Always keep at least 1 task running
         },
       ],
       placementStrategies: [
@@ -388,13 +388,13 @@ export class DeepSeekOcrEc2GpuConstruct extends Construct {
       // GPU configuration
       gpuCount: 1, // Request 1 GPU
 
-      // Environment variables - DeepSeek-OCR-2 with BF16 on g5.xlarge
+      // Environment variables - DeepSeek-OCR with BF16 on g5.xlarge
       environment: {
         // GPU settings
         CUDA_VISIBLE_DEVICES: '0',
 
-        // DeepSeek-OCR-2 model configuration
-        MODEL_PATH: 'deepseek-ai/DeepSeek-OCR-2', // HuggingFace repo ID for OCR-2
+        // DeepSeek-OCR model configuration
+        MODEL_PATH: 'deepseek-ai/DeepSeek-OCR', // HuggingFace repo ID
         VLLM_TORCH_DTYPE: 'bfloat16', // BF16 for A10G GPU (g5 instances)
 
         // Model caching directories - check Golden AMI cache first
diff --git a/src/stacks/ecs.stack.ts b/src/stacks/ecs.stack.ts
@@ -42,13 +42,12 @@ export class EcsStack extends cdk.Stack {
     kmsKey.grantEncryptDecrypt(taskRole);
 
     // ECS Cluster Stack - g5.xlarge for DeepSeek-OCR-2 with BF16 support
-    // Scale-to-zero enabled: min=0, max=20, desired=0
     const ecsClusterConstruct = new DeepSeekOcrEc2GpuConstruct(this, 'EcsGpuService', {
       vpc,
       securityGroups,
-      minCapacity: 0, // Scale-to-zero: no minimum instances
+      minCapacity: 1, // Always keep at least 1 instance running
       maxCapacity: 20, // Allow scaling up to 20 instances
-      desiredCapacity: 0, // Scale-to-zero: start with 0 instances
+      desiredCapacity: 1, // Start with 1 instance
       dockerBuildContext: path.join(__dirname, '../../docker'),
       kmsKey,
       instanceType: ec2.InstanceType.of(ec2.InstanceClass.G5, ec2.InstanceSize.XLARGE), // A10G GPU for BF16