fix(notebook): strip orphan lines from Step 6+ cells; verify NIM via /v1/health/ready

shadeform · shadeform · commit 928b492177ab · 2026-04-14T07:20:20.000Z
- Remove stray leading comment blocks that caused IndentationError in infrastructure/migration/users cells
- verify_local_nim_installation: probe /v1/health/ready, then /v1/health/live, then legacy /health on LLM_NIM_PORT (default 8000); retry up to 45 times at 10 s intervals (was 12)
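- compose llm-nim healthcheck (and the copy embedded in the setup notebook) uses /v1/health/ready instead of /health; sync setup notebook, including refreshed cell outputs, from workspace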

Made-with: Cursor
diff --git a/deploy/compose/docker-compose.dev.yaml b/deploy/compose/docker-compose.dev.yaml
@@ -184,7 +184,7 @@ services:
       - nim_models:/models
     restart: unless-stopped
     healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      test: ["CMD", "curl", "-f", "http://localhost:8000/v1/health/ready"]
       interval: 30s
       timeout: 10s
       retries: 3
diff --git a/notebooks/setup/complete_setup_guide.ipynb b/notebooks/setup/complete_setup_guide.ipynb
@@ -1295,7 +1295,7 @@
      ]
     },
     {
-     "name": "stdin",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
       "\n",
@@ -1820,7 +1820,7 @@
       "🔧 System Requirements:\n",
       "   ✅ Docker: Installed\n",
       "   ✅ NVIDIA Docker: Available\n",
-      "   ✅ Disk Space: /ephemeral: 6121.2 GB free, /home/shadeform/Multi-Agent-Intelligent-Warehouse: 71.2 GB free (best 6121.2 GB — need 100GB+ on one filesystem)\n",
+      "   ✅ Disk Space: /ephemeral: 6121.2 GB free, /home/shadeform/Multi-Agent-Intelligent-Warehouse: 41.2 GB free (best 6121.2 GB — need 100GB+ on one filesystem)\n",
       "   ✅ CUDA: 12.x (detected from driver)\n",
       "\n",
       "======================================================================\n",
@@ -1873,7 +1873,7 @@
        " 'nvidia_docker_available': True,\n",
        " 'disk_space_gb': 6121.189472198486,\n",
        " 'disk_space_breakdown': [('/ephemeral', 6121.189472198486),\n",
-       "  ('/home/shadeform/Multi-Agent-Intelligent-Warehouse', 71.18986511230469)],\n",
+       "  ('/home/shadeform/Multi-Agent-Intelligent-Warehouse', 41.22386932373047)],\n",
        " 'cuda_version': '12.x (detected from driver)',\n",
        " 'can_run_local': True,\n",
        " 'recommendation': 'local_ideal',\n",
@@ -2144,7 +2144,7 @@
       "🔧 System Requirements:\n",
       "   ✅ Docker: Installed\n",
       "   ✅ NVIDIA Docker: Available\n",
-      "   ✅ Disk Space: /ephemeral: 6121.2 GB free, /home/shadeform/Multi-Agent-Intelligent-Warehouse: 71.2 GB free (best 6121.2 GB — need 100GB+ on one filesystem)\n",
+      "   ✅ Disk Space: /ephemeral: 6121.2 GB free, /home/shadeform/Multi-Agent-Intelligent-Warehouse: 41.2 GB free (best 6121.2 GB — need 100GB+ on one filesystem)\n",
       "   ✅ CUDA: 12.x (detected from driver)\n",
       "\n",
       "======================================================================\n",
@@ -2157,7 +2157,7 @@
      ]
     },
     {
-     "name": "stdin",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
       "\n",
@@ -2209,7 +2209,7 @@
      ]
     },
     {
-     "name": "stdin",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
       "\n",
@@ -2362,7 +2362,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -2379,7 +2379,7 @@
      ]
     },
     {
-     "name": "stdin",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
       "   Overwrite? (y/N):  y\n"
@@ -2399,7 +2399,7 @@
      ]
     },
     {
-     "name": "stdin",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
       "   Run `docker pull` now? (Y/n):  Y\n"
@@ -2415,10 +2415,14 @@
       "✅ LLM NIM container started\n",
       "\n",
       "🧪 Verifying local NIM installation...\n",
-      "   Waiting... (1/12)\n",
-      "   Waiting... (2/12)\n",
-      "   Waiting... (3/12)\n",
-      "   Waiting... (4/12)\n"
+      "   Waiting for NIM ready... (1/45)\n",
+      "✅ Local NIM is up (checked http://localhost:8000/v1/health/live)\n",
+      "\n",
+      "✅ Configuration updated for LOCAL installation\n",
+      "   LLM_NIM_URL=http://localhost:8000/v1\n",
+      "   LLM_MODEL=nvidia/llama-3.3-nemotron-super-49b-v1.5\n",
+      "\n",
+      "✅ Local LLM installation complete!\n"
      ]
     }
    ],
@@ -2468,7 +2472,7 @@
     "      - nim_models:/models\n",
     "    restart: unless-stopped\n",
     "    healthcheck:\n",
-    "      test: [\"CMD\", \"curl\", \"-f\", \"http://localhost:8000/health\"]\n",
+    "      test: [\"CMD\", \"curl\", \"-f\", \"http://localhost:8000/v1/health/ready\"]\n",
     "      interval: 30s\n",
     "      timeout: 10s\n",
     "      retries: 3\n",
@@ -2622,27 +2626,38 @@
     "def verify_local_nim_installation():\n",
     "    \"\"\"\n",
     "    Verify local NIM is running and accessible.\n",
+    "\n",
+    "    NVIDIA NIM LLM readiness is documented at /v1/health/ready (not legacy /health).\n",
+    "    Large models can take several minutes on first start — wait longer than a quick dev server.\n",
     "    \"\"\"\n",
     "    print(\"\\n🧪 Verifying local NIM installation...\")\n",
-    "    \n",
-    "    max_retries = 12  # 2 minutes total\n",
+    "    import os\n",
+    "\n",
+    "    _nim_port = os.getenv(\"LLM_NIM_PORT\", \"8000\")\n",
+    "    _base = f\"http://localhost:{_nim_port}\"\n",
+    "    # Prefer readiness probes (NIM docs); avoid /v1/models alone (can 200 before weights are ready).\n",
+    "    _candidates = (\n",
+    "        f\"{_base}/v1/health/ready\",\n",
+    "        f\"{_base}/v1/health/live\",\n",
+    "        f\"{_base}/health\",\n",
+    "    )\n",
+    "\n",
+    "    max_retries = 45  # ~7.5 min at 10s interval (49B cold start / weight download)\n",
     "    for i in range(max_retries):\n",
-    "        try:\n",
-    "            response = requests.get(\n",
-    "                \"http://localhost:8000/health\",\n",
-    "                timeout=5\n",
-    "            )\n",
-    "            if response.status_code == 200:\n",
-    "                print(\"✅ Local NIM is running and healthy!\")\n",
-    "                return True\n",
-    "        except:\n",
-    "            pass\n",
-    "        \n",
+    "        for url in _candidates:\n",
+    "            try:\n",
+    "                response = requests.get(url, timeout=10)\n",
+    "                if response.status_code == 200:\n",
+    "                    print(f\"✅ Local NIM is up (checked {url})\")\n",
+    "                    return True\n",
+    "            except Exception:\n",
+    "                pass\n",
     "        if i < max_retries - 1:\n",
-    "            print(f\"   Waiting... ({i+1}/{max_retries})\")\n",
+    "            print(f\"   Waiting for NIM ready... ({i + 1}/{max_retries})\")\n",
     "            time.sleep(10)\n",
-    "    \n",
-    "    print(\"❌ Local NIM verification failed\")\n",
+    "\n",
+    "    print(\"❌ Local NIM verification failed (no 200 from readiness URLs)\")\n",
+    "    print(\"   Tried:\", \", \".join(_candidates))\n",
     "    print(\"   Check logs: docker logs wosa-llm-nim\")\n",
     "    return False\n",
     "\n",
@@ -2765,9 +2780,50 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "📋 Environment Variables Configuration\n",
+      "============================================================\n",
+      "\n",
+      "🔍 Current Configuration:\n",
+      "\n",
+      "  NVIDIA_API_KEY            = nvapi-9_...                    # NVIDIA API Key (for NIM services)\n",
+      "  LLM_NIM_URL               = http://localhost:8000/v1       # LLM NIM Endpoint\n",
+      "  EMBEDDING_NIM_URL         = https://integrate.api.nvidia.com/v1 # Embedding NIM Endpoint\n",
+      "  POSTGRES_PASSWORD         = ⚠️  NOT SET (using default/placeholder) # Database Password\n",
+      "  JWT_SECRET_KEY            = your-str...                    # JWT Secret Key (for authentication)\n",
+      "  DEFAULT_ADMIN_PASSWORD    = ⚠️  NOT SET (using default/placeholder) # Default Admin Password\n",
+      "  DB_HOST                   = localhost                      # Database Host\n",
+      "  DB_PORT                   = 5435                           # Database Port\n",
+      "\n",
+      "============================================================\n",
+      "\n",
+      "✅ Environment file check complete!\n",
+      "\n",
+      "💡 Important Notes:\n",
+      "   - For production, change all default passwords and secrets\n",
+      "   - NVIDIA_API_KEY is required for AI features\n",
+      "   - JWT_SECRET_KEY is required in production\n",
+      "\n",
+      "📝 To edit: nano .env  (or your preferred editor)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "from pathlib import Path\n",
     "import os\n",
@@ -2894,13 +2950,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "    # Check if already stored\n",
-    "    # Try to find project root\n",
-    "    # Check if we're already in project root\n",
-    "    # Check if we're in notebooks/setup/ (go up 2 levels)\n",
-    "    # Check if we're in notebooks/ (go up 1 level)\n",
-    "        # Try going up from current directory\n",
-    "    # Change to project root and store it\n",
     "import subprocess\n",
     "import time\n",
     "from pathlib import Path\n",
@@ -3147,13 +3196,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "    # Check if already stored\n",
-    "    # Try to find it\n",
-    "    # Check if we're already in project root\n",
-    "    # Check if we're in notebooks/setup/ (go up 2 levels)\n",
-    "    # Check if we're in notebooks/ (go up 1 level)\n",
-    "        # Try going up from current directory\n",
-    "    # Change to project root and store it\n",
     "import subprocess\n",
     "import os\n",
     "from pathlib import Path\n",
@@ -3333,13 +3375,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "    # Check if already stored\n",
-    "    # Try to find project root\n",
-    "    # Check if we're already in project root\n",
-    "    # Check if we're in notebooks/setup/ (go up 2 levels)\n",
-    "    # Check if we're in notebooks/ (go up 1 level)\n",
-    "        # Try going up from current directory\n",
-    "    # Change to project root and store it\n",
     "import subprocess\n",
     "import sys\n",
     "from pathlib import Path\n",