Skip to content

Commit 928b492

Browse files
author
shadeform
committed
fix(notebook): Step 6+ cells strip orphan lines; NIM verify /v1/health/ready
- Remove stray leading comment blocks that caused IndentationError in infrastructure/migration/users cells
- verify_local_nim_installation: probe NIM readiness URLs, longer wait, LLM_NIM_PORT
- compose llm-nim healthcheck uses /v1/health/ready; sync setup notebook from workspace

Made-with: Cursor
1 parent 0452fca commit 928b492

2 files changed

Lines changed: 89 additions & 54 deletions

File tree

deploy/compose/docker-compose.dev.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ services:
184184
- nim_models:/models
185185
restart: unless-stopped
186186
healthcheck:
187-
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
187+
test: ["CMD", "curl", "-f", "http://localhost:8000/v1/health/ready"]
188188
interval: 30s
189189
timeout: 10s
190190
retries: 3

notebooks/setup/complete_setup_guide.ipynb

Lines changed: 88 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1295,7 +1295,7 @@
12951295
]
12961296
},
12971297
{
1298-
"name": "stdin",
1298+
"name": "stdout",
12991299
"output_type": "stream",
13001300
"text": [
13011301
"\n",
@@ -1820,7 +1820,7 @@
18201820
"🔧 System Requirements:\n",
18211821
" ✅ Docker: Installed\n",
18221822
" ✅ NVIDIA Docker: Available\n",
1823-
" ✅ Disk Space: /ephemeral: 6121.2 GB free, /home/shadeform/Multi-Agent-Intelligent-Warehouse: 71.2 GB free (best 6121.2 GB — need 100GB+ on one filesystem)\n",
1823+
" ✅ Disk Space: /ephemeral: 6121.2 GB free, /home/shadeform/Multi-Agent-Intelligent-Warehouse: 41.2 GB free (best 6121.2 GB — need 100GB+ on one filesystem)\n",
18241824
" ✅ CUDA: 12.x (detected from driver)\n",
18251825
"\n",
18261826
"======================================================================\n",
@@ -1873,7 +1873,7 @@
18731873
" 'nvidia_docker_available': True,\n",
18741874
" 'disk_space_gb': 6121.189472198486,\n",
18751875
" 'disk_space_breakdown': [('/ephemeral', 6121.189472198486),\n",
1876-
" ('/home/shadeform/Multi-Agent-Intelligent-Warehouse', 71.18986511230469)],\n",
1876+
" ('/home/shadeform/Multi-Agent-Intelligent-Warehouse', 41.22386932373047)],\n",
18771877
" 'cuda_version': '12.x (detected from driver)',\n",
18781878
" 'can_run_local': True,\n",
18791879
" 'recommendation': 'local_ideal',\n",
@@ -2144,7 +2144,7 @@
21442144
"🔧 System Requirements:\n",
21452145
" ✅ Docker: Installed\n",
21462146
" ✅ NVIDIA Docker: Available\n",
2147-
" ✅ Disk Space: /ephemeral: 6121.2 GB free, /home/shadeform/Multi-Agent-Intelligent-Warehouse: 71.2 GB free (best 6121.2 GB — need 100GB+ on one filesystem)\n",
2147+
" ✅ Disk Space: /ephemeral: 6121.2 GB free, /home/shadeform/Multi-Agent-Intelligent-Warehouse: 41.2 GB free (best 6121.2 GB — need 100GB+ on one filesystem)\n",
21482148
" ✅ CUDA: 12.x (detected from driver)\n",
21492149
"\n",
21502150
"======================================================================\n",
@@ -2157,7 +2157,7 @@
21572157
]
21582158
},
21592159
{
2160-
"name": "stdin",
2160+
"name": "stdout",
21612161
"output_type": "stream",
21622162
"text": [
21632163
"\n",
@@ -2209,7 +2209,7 @@
22092209
]
22102210
},
22112211
{
2212-
"name": "stdin",
2212+
"name": "stdout",
22132213
"output_type": "stream",
22142214
"text": [
22152215
"\n",
@@ -2362,7 +2362,7 @@
23622362
},
23632363
{
23642364
"cell_type": "code",
2365-
"execution_count": null,
2365+
"execution_count": 6,
23662366
"metadata": {},
23672367
"outputs": [
23682368
{
@@ -2379,7 +2379,7 @@
23792379
]
23802380
},
23812381
{
2382-
"name": "stdin",
2382+
"name": "stdout",
23832383
"output_type": "stream",
23842384
"text": [
23852385
" Overwrite? (y/N): y\n"
@@ -2399,7 +2399,7 @@
23992399
]
24002400
},
24012401
{
2402-
"name": "stdin",
2402+
"name": "stdout",
24032403
"output_type": "stream",
24042404
"text": [
24052405
" Run `docker pull` now? (Y/n): Y\n"
@@ -2415,10 +2415,14 @@
24152415
"✅ LLM NIM container started\n",
24162416
"\n",
24172417
"🧪 Verifying local NIM installation...\n",
2418-
" Waiting... (1/12)\n",
2419-
" Waiting... (2/12)\n",
2420-
" Waiting... (3/12)\n",
2421-
" Waiting... (4/12)\n"
2418+
" Waiting for NIM ready... (1/45)\n",
2419+
"✅ Local NIM is up (checked http://localhost:8000/v1/health/live)\n",
2420+
"\n",
2421+
"✅ Configuration updated for LOCAL installation\n",
2422+
" LLM_NIM_URL=http://localhost:8000/v1\n",
2423+
" LLM_MODEL=nvidia/llama-3.3-nemotron-super-49b-v1.5\n",
2424+
"\n",
2425+
"✅ Local LLM installation complete!\n"
24222426
]
24232427
}
24242428
],
@@ -2468,7 +2472,7 @@
24682472
" - nim_models:/models\n",
24692473
" restart: unless-stopped\n",
24702474
" healthcheck:\n",
2471-
" test: [\"CMD\", \"curl\", \"-f\", \"http://localhost:8000/health\"]\n",
2475+
" test: [\"CMD\", \"curl\", \"-f\", \"http://localhost:8000/v1/health/ready\"]\n",
24722476
" interval: 30s\n",
24732477
" timeout: 10s\n",
24742478
" retries: 3\n",
@@ -2622,27 +2626,38 @@
26222626
"def verify_local_nim_installation():\n",
26232627
" \"\"\"\n",
26242628
" Verify local NIM is running and accessible.\n",
2629+
"\n",
2630+
" NVIDIA NIM LLM readiness is documented at /v1/health/ready (not legacy /health).\n",
2631+
" Large models can take several minutes on first start — wait longer than a quick dev server.\n",
26252632
" \"\"\"\n",
26262633
" print(\"\\n🧪 Verifying local NIM installation...\")\n",
2627-
" \n",
2628-
" max_retries = 12 # 2 minutes total\n",
2634+
" import os\n",
2635+
"\n",
2636+
" _nim_port = os.getenv(\"LLM_NIM_PORT\", \"8000\")\n",
2637+
" _base = f\"http://localhost:{_nim_port}\"\n",
2638+
" # Prefer readiness probes (NIM docs); avoid /v1/models alone (can 200 before weights are ready).\n",
2639+
" _candidates = (\n",
2640+
" f\"{_base}/v1/health/ready\",\n",
2641+
" f\"{_base}/v1/health/live\",\n",
2642+
" f\"{_base}/health\",\n",
2643+
" )\n",
2644+
"\n",
2645+
" max_retries = 45 # ~7.5 min at 10s interval (49B cold start / weight download)\n",
26292646
" for i in range(max_retries):\n",
2630-
" try:\n",
2631-
" response = requests.get(\n",
2632-
" \"http://localhost:8000/health\",\n",
2633-
" timeout=5\n",
2634-
" )\n",
2635-
" if response.status_code == 200:\n",
2636-
" print(\"✅ Local NIM is running and healthy!\")\n",
2637-
" return True\n",
2638-
" except:\n",
2639-
" pass\n",
2640-
" \n",
2647+
" for url in _candidates:\n",
2648+
" try:\n",
2649+
" response = requests.get(url, timeout=10)\n",
2650+
" if response.status_code == 200:\n",
2651+
" print(f\"✅ Local NIM is up (checked {url})\")\n",
2652+
" return True\n",
2653+
" except Exception:\n",
2654+
" pass\n",
26412655
" if i < max_retries - 1:\n",
2642-
" print(f\" Waiting... ({i+1}/{max_retries})\")\n",
2656+
" print(f\" Waiting for NIM ready... ({i + 1}/{max_retries})\")\n",
26432657
" time.sleep(10)\n",
2644-
" \n",
2645-
" print(\"❌ Local NIM verification failed\")\n",
2658+
"\n",
2659+
" print(\"❌ Local NIM verification failed (no 200 from readiness URLs)\")\n",
2660+
" print(\" Tried:\", \", \".join(_candidates))\n",
26462661
" print(\" Check logs: docker logs wosa-llm-nim\")\n",
26472662
" return False\n",
26482663
"\n",
@@ -2765,9 +2780,50 @@
27652780
},
27662781
{
27672782
"cell_type": "code",
2768-
"execution_count": null,
2783+
"execution_count": 7,
27692784
"metadata": {},
2770-
"outputs": [],
2785+
"outputs": [
2786+
{
2787+
"name": "stdout",
2788+
"output_type": "stream",
2789+
"text": [
2790+
"📋 Environment Variables Configuration\n",
2791+
"============================================================\n",
2792+
"\n",
2793+
"🔍 Current Configuration:\n",
2794+
"\n",
2795+
" NVIDIA_API_KEY = nvapi-9_... # NVIDIA API Key (for NIM services)\n",
2796+
" LLM_NIM_URL = http://localhost:8000/v1 # LLM NIM Endpoint\n",
2797+
" EMBEDDING_NIM_URL = https://integrate.api.nvidia.com/v1 # Embedding NIM Endpoint\n",
2798+
" POSTGRES_PASSWORD = ⚠️ NOT SET (using default/placeholder) # Database Password\n",
2799+
" JWT_SECRET_KEY = your-str... # JWT Secret Key (for authentication)\n",
2800+
" DEFAULT_ADMIN_PASSWORD = ⚠️ NOT SET (using default/placeholder) # Default Admin Password\n",
2801+
" DB_HOST = localhost # Database Host\n",
2802+
" DB_PORT = 5435 # Database Port\n",
2803+
"\n",
2804+
"============================================================\n",
2805+
"\n",
2806+
"✅ Environment file check complete!\n",
2807+
"\n",
2808+
"💡 Important Notes:\n",
2809+
" - For production, change all default passwords and secrets\n",
2810+
" - NVIDIA_API_KEY is required for AI features\n",
2811+
" - JWT_SECRET_KEY is required in production\n",
2812+
"\n",
2813+
"📝 To edit: nano .env (or your preferred editor)\n"
2814+
]
2815+
},
2816+
{
2817+
"data": {
2818+
"text/plain": [
2819+
"True"
2820+
]
2821+
},
2822+
"execution_count": 7,
2823+
"metadata": {},
2824+
"output_type": "execute_result"
2825+
}
2826+
],
27712827
"source": [
27722828
"from pathlib import Path\n",
27732829
"import os\n",
@@ -2894,13 +2950,6 @@
28942950
"metadata": {},
28952951
"outputs": [],
28962952
"source": [
2897-
" # Check if already stored\n",
2898-
" # Try to find project root\n",
2899-
" # Check if we're already in project root\n",
2900-
" # Check if we're in notebooks/setup/ (go up 2 levels)\n",
2901-
" # Check if we're in notebooks/ (go up 1 level)\n",
2902-
" # Try going up from current directory\n",
2903-
" # Change to project root and store it\n",
29042953
"import subprocess\n",
29052954
"import time\n",
29062955
"from pathlib import Path\n",
@@ -3147,13 +3196,6 @@
31473196
"metadata": {},
31483197
"outputs": [],
31493198
"source": [
3150-
" # Check if already stored\n",
3151-
" # Try to find it\n",
3152-
" # Check if we're already in project root\n",
3153-
" # Check if we're in notebooks/setup/ (go up 2 levels)\n",
3154-
" # Check if we're in notebooks/ (go up 1 level)\n",
3155-
" # Try going up from current directory\n",
3156-
" # Change to project root and store it\n",
31573199
"import subprocess\n",
31583200
"import os\n",
31593201
"from pathlib import Path\n",
@@ -3333,13 +3375,6 @@
33333375
"metadata": {},
33343376
"outputs": [],
33353377
"source": [
3336-
" # Check if already stored\n",
3337-
" # Try to find project root\n",
3338-
" # Check if we're already in project root\n",
3339-
" # Check if we're in notebooks/setup/ (go up 2 levels)\n",
3340-
" # Check if we're in notebooks/ (go up 1 level)\n",
3341-
" # Try going up from current directory\n",
3342-
" # Change to project root and store it\n",
33433378
"import subprocess\n",
33443379
"import sys\n",
33453380
"from pathlib import Path\n",

0 commit comments

Comments
 (0)