# ==============================================================================
# nomic-serve Makefile
# ==============================================================================
default: run
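# (The first target in the file is Make's default goal, so a bare `make`
# is equivalent to `make run`.)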
.PHONY: default model fmt build build-gpu check lint clean clean-imgs clean-results \
	run run-gpu run-benchmark run-full run-stats health docs openapi \
	test test-list test-dim test-query models-all test-models \
	model-txt model-txt-all model-img model-img-all check-txt check-img check-models \
	test-img test-img-batch test-multimodal test-img-stats test-img-stats-arithmetic \
	test-img-stats-validate test-rust-batch \
	test-vision-batch test-vision-variants test-text-batch-fp32 test-text-batch-onnx-fp16 \
	test-fp16-accuracy test-text-batch-transformers test-text-batch-half-precision \
	benchmark-vision-batch benchmark-vision-batch-gpu benchmark-throughput \
	benchmark-batch-performance benchmark-batch-performance-img benchmark-batch-performance-txt \
	docker-build docker-build-cpu docker-build-cpu-full docker-run-cpu docker-run-cpu-full \
	docker-build-gpu docker-build-gpu-full docker-run-gpu docker-run-gpu-full \
	docker-push docker-push-cpu docker-push-gpu remote-gpu-image
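# .PHONY marks these names as commands rather than files, so they always
# re-run even if a file or directory with the same name exists.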
# ==============================================================================
# Model Files
# ==============================================================================
# Text model (nomic-embed-text-v1.5)
model-txt:
	@bash scripts/download_text_models.sh quantized
	@bash scripts/download_text_models.sh fp32

model-txt-all:
	@bash scripts/download_text_models.sh all

# Vision model (nomic-embed-vision-v1.5)
model-img:
	@bash scripts/download_vision_models.sh quantized
	@bash scripts/download_vision_models.sh fp32

model-img-all:
	@bash scripts/download_vision_models.sh all

# Default: download text model (backward compatibility)
model: model-txt
	@echo "✓ Text model files ready"

# Download all model variants for comparison
models-all: model-txt-all model-img-all
	@echo "✓ All model variants downloaded"
# ==============================================================================
# Validity Checks
# ==============================================================================
check-txt:
	@if [ ! -f "models/txt/model_quantized.onnx" ]; then \
		echo "❌ Text model not found. Run: make model-txt"; \
		exit 1; \
	fi
	@if [ ! -f "models/txt/tokenizer.json" ]; then \
		echo "❌ Tokenizer not found. Run: make model-txt"; \
		exit 1; \
	fi
	@echo "✓ Text model files present"

check-img:
	@if [ ! -f "models/img/model_quantized.onnx" ]; then \
		echo "❌ Vision model not found. Run: make model-img"; \
		exit 1; \
	fi
	@echo "✓ Vision model files present"

check-models: check-txt check-img
	@echo "✓ All model files present"
# ==============================================================================
# Build
# ==============================================================================
fmt:
	./bin/cargo fmt

target/release/nomic-serve: src/main.rs Cargo.toml static/swagger-ui/index.html
	./bin/cargo build --release

build: fmt target/release/nomic-serve
	@echo "✓ Build complete"

build-gpu: fmt
	./bin/cargo build --release --features cuda

check:
	@./bin/cargo check
	@echo "✓ Check complete"

lint:
	@uvx black scripts/
	@uvx isort --profile black scripts/
	@echo "✓ Lint complete"

clean:
	rm -rf target Cargo.lock
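# Note: `clean` also removes Cargo.lock, so the next build re-resolves
# dependency versions from Cargo.toml.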
clean-imgs:
	@printf "Are you sure you want to delete scripts/test_images? [y/N] "; \
	read REPLY; \
	case "$$REPLY" in \
		[Yy]*) rm -rf scripts/test_images; echo "✓ Deleted scripts/test_images"; ;; \
		*) echo "Cancelled."; ;; \
	esac

clean-results:
	@printf "Are you sure you want to delete scripts/results? [y/N] "; \
	read REPLY; \
	case "$$REPLY" in \
		[Yy]*) rm -rf scripts/results; echo "✓ Deleted scripts/results"; ;; \
		*) echo "Cancelled."; ;; \
	esac
# ==============================================================================
# Run
# ==============================================================================
run: build check-models
	IMG_MAX_BATCH_SIZE=256 TXT_MAX_BATCH_SIZE=2056 ./target/release/nomic-serve

run-gpu: build-gpu check-models
	USE_GPU=true ./target/release/nomic-serve

run-benchmark: build
	@echo "Starting server with high max batch sizes for benchmarking..."
	@echo "Using FP32 models to enable batching..."
	@if [ ! -f "models/txt/model.onnx" ]; then \
		echo "❌ FP32 text model not found. Run: make model-txt"; \
		exit 1; \
	fi
	@if [ ! -f "models/img/model.onnx" ]; then \
		echo "❌ FP32 vision model not found. Run: make model-img"; \
		exit 1; \
	fi
	TXT_MODEL=models/txt/model.onnx IMG_MODEL=models/img/model.onnx TXT_MAX_BATCH_SIZE=1024 IMG_MAX_BATCH_SIZE=128 ./target/release/nomic-serve

run-full: build check-models
	AVERAGING=arithmetic TXT_MODEL=models/txt/model.onnx IMG_MODEL=models/img/model.onnx IMG_MAX_BATCH_SIZE=256 TXT_MAX_BATCH_SIZE=2056 ./target/release/nomic-serve
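# The knobs used above (TXT_MODEL, IMG_MODEL, TXT_MAX_BATCH_SIZE,
# IMG_MAX_BATCH_SIZE, AVERAGING, USE_GPU) are ordinary environment variables
# read by the server, so they can also be set ad hoc, e.g.:
#   TXT_MAX_BATCH_SIZE=512 ./target/release/nomic-serve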
# Run server (image-stats is now always included, no model files required for /img/stats)
run-stats: build
	./target/release/nomic-serve
# ==============================================================================
# Test - Text Endpoints
# ==============================================================================
health:
	@curl -s http://localhost:8080/health | jq .

docs:
	@echo "Opening docs at http://localhost:8080/docs"
	@curl -s -o /dev/null -w "HTTP %{http_code}\n" http://localhost:8080/docs

openapi:
	@curl -s http://localhost:8080/openapi.json | jq '.info'

test:
	@echo "Testing /txt/embed..."
	@curl -s -X POST localhost:8080/txt/embed \
		-H 'content-type: application/json' \
		-d '{"input": "ONNX in Rust is fast"}' | \
		jq '{tokens: .tokens, time_ms: (.time_ms | floor), sample: (.embedding[0:5] | map(. * 1000 | floor / 1000))}'
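# (In the jq filters here, `. * 1000 | floor / 1000` truncates each float to
# three decimal places for compact sample output.)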
test-list:
	@echo "Testing /txt/batch..."
	@curl -s -X POST localhost:8080/txt/batch \
		-H 'content-type: application/json' \
		-d '{"inputs": ["ONNX in Rust is fast", "Python is also great", "Embeddings are useful"]}' | \
		jq '{count: (.embeddings | length), tokens, time_ms: (.time_ms | floor), samples: [.embeddings[] | .[0:3] | map(. * 1000 | floor / 1000)]}'

test-dim:
	@echo "Testing Matryoshka embeddings (dim=128)..."
	@curl -s -X POST localhost:8080/txt/embed \
		-H 'content-type: application/json' \
		-d '{"input": "ONNX in Rust is fast", "dim": 128}' | \
		jq '{tokens: .tokens, time_ms: (.time_ms | floor), dim: (.embedding | length), sample: (.embedding[0:5] | map(. * 1000 | floor / 1000))}'
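# Other Matryoshka sizes (e.g. 64, 256, 512) should work the same way;
# just change the "dim" field in the request body.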
test-query:
	@echo "Testing /txt/query (enforced search_query prefix)..."
	@curl -s -X POST localhost:8080/txt/query \
		-H 'content-type: application/json' \
		-d '{"input": "What is machine learning?"}' | \
		jq '{tokens: .tokens, time_ms: (.time_ms | floor), sample: (.embedding[0:5] | map(. * 1000 | floor / 1000))}'
# ==============================================================================
# Test - Image Endpoints
# ==============================================================================
test-img:
	@echo "Testing /img/embed with URL..."
	@curl -s -X POST localhost:8080/img/embed \
		-H 'content-type: application/json' \
		-d '{"content": "https://picsum.photos/400/300"}' | \
		jq '{time_ms: (.time_ms | floor), dim: (.embedding | length), sample: (.embedding[0:5] | map(. * 1000 | floor / 1000))}'

test-img-batch:
	@echo "Testing /img/batch with multiple URLs..."
	@curl -s -X POST localhost:8080/img/batch \
		-H 'content-type: application/json' \
		-d '{"contents": ["https://picsum.photos/400/300", "https://picsum.photos/300/400"]}' | \
		jq '{count: (.embeddings | length), time_ms: (.time_ms | floor), samples: [.embeddings[] | .[0:3] | map(. * 1000 | floor / 1000)]}'

# Test image stats endpoint
test-img-stats:
	@echo "Testing /img/stats with URL (geometric mean)..."
	@curl -s -X POST localhost:8080/img/stats \
		-H 'content-type: application/json' \
		-d '{"content": "https://picsum.photos/400/300", "averaging_method": "geometric"}' | \
		jq '{time_ms: (.time_ms | floor), exif_fields: (.exif_data | keys | length), avg_color: .color_data.avg_color, dominant_color: .color_data.dominant_color}'

test-img-stats-arithmetic:
	@echo "Testing /img/stats with arithmetic mean..."
	@curl -s -X POST localhost:8080/img/stats \
		-H 'content-type: application/json' \
		-d '{"content": "https://picsum.photos/400/300", "averaging_method": "arithmetic"}' | \
		jq '{time_ms: (.time_ms | floor), avg_color: .color_data.avg_color, dominant_color: .color_data.dominant_color}'

# Validate Rust image-stats against Python reference
test-img-stats-validate:
	@echo "Validating Rust /img/stats against Python reference..."
	@cd scripts && time python3 test_image_stats.py --rust-url http://localhost:8080 --count 100 --seed 1231 --tidy --paged
# Test vision model batching safety (check for cross-sample interference)
test-vision-batch:
	@echo "Testing vision model batching for cross-sample interference..."
	@cd scripts && python3 test_vision_batch_interference.py

# Test text model FP32 vs quantized batching
test-text-batch-fp32:
	@echo "Testing text model batching: quantized vs FP32..."
	@cd scripts && python3 test_text_batch_fp32.py

# Test text model ONNX FP16 vs other variants
test-text-batch-onnx-fp16:
	@echo "Testing text model batching: ONNX FP16 vs INT8 vs FP32 vs Q4F16..."
	@cd scripts && python3 test_text_batch_onnx_fp16.py

# Test FP16 vs FP32 accuracy and batch sensitivity
test-fp16-accuracy:
	@echo "Testing FP16 vs FP32 accuracy and batch sensitivity..."
	@cd scripts && python3 test_fp16_vs_fp32_accuracy.py

# Test vision model variants (FP32, FP16, quantized)
test-vision-variants:
	@echo "Testing vision model variants: FP32 vs FP16 vs Quantized..."
	@cd scripts && python3 test_vision_model_variants.py

# Test text model with PyTorch/transformers
test-text-batch-transformers:
	@echo "Testing text model batching with PyTorch/transformers..."
	@cd scripts && python3 test_text_batch_transformers.py

# Test text model with PyTorch half-precision (FP16/BF16)
test-text-batch-half-precision:
	@echo "Testing text model batching with PyTorch half-precision (FP16/BF16)..."
	@cd scripts && python3 test_text_batch_half_precision.py
# Benchmark vision model batching performance
benchmark-vision-batch:
	@echo "Benchmarking vision model with different batch sizes..."
	@cd scripts && python3 benchmark_vision_batching.py

# Benchmark vision model on GPU (if available)
benchmark-vision-batch-gpu:
	@echo "Benchmarking vision model on GPU with different batch sizes..."
	@cd scripts && python3 benchmark_vision_batching.py --gpu

# Benchmark API server throughput
benchmark-throughput:
	@echo "Benchmarking API server throughput..."
	@echo "Make sure the server is running: make run"
	@cd scripts && python3 benchmark_throughput.py

# Benchmark batch size performance (comprehensive)
benchmark-batch-performance:
	@echo "Benchmarking batch size performance..."
	@echo "Make sure the server is running with a high max batch size:"
	@echo "  make run-benchmark"
	@cd scripts && python3 benchmark_batch_performance.py --max-batch-size 2056

benchmark-batch-performance-img:
	@echo "Benchmarking image batch size performance..."
	@cd scripts && python3 benchmark_batch_performance.py --endpoint img --max-batch-size 2056

benchmark-batch-performance-txt:
	@echo "Benchmarking text batch size performance..."
	@cd scripts && python3 benchmark_batch_performance.py --endpoint txt --max-batch-size 2056

# Test Rust vision batching implementation via API
test-rust-batch:
	@echo "Testing Rust vision batching implementation..."
	@echo "Make sure the server is running: make run"
	@cd scripts && python3 test_rust_batching.py
# ==============================================================================
# Test - Multimodal
# ==============================================================================
test-multimodal:
	@echo "Testing multimodal similarity (text vs image)..."
	@echo "Embedding text: 'a photo of a landscape'..."
	@TXT=$$(curl -s -X POST localhost:8080/txt/embed \
		-H 'content-type: application/json' \
		-d '{"input": "search_query: a photo of a landscape"}' | jq -c '.embedding') && \
	echo "Embedding image: random landscape..." && \
	IMG=$$(curl -s -X POST localhost:8080/img/embed \
		-H 'content-type: application/json' \
		-d '{"content": "https://picsum.photos/400/300"}' | jq -c '.embedding') && \
	echo "Computing cosine similarity..." && \
	python3 -c "import json, math; t=json.loads('$$TXT'); i=json.loads('$$IMG'); dot=sum(a*b for a,b in zip(t,i)); norm=math.sqrt(sum(a*a for a in t))*math.sqrt(sum(b*b for b in i)); print(f'Cosine similarity: {dot/norm:.4f}')"
# ==============================================================================
# Model Comparison
# ==============================================================================
# Compare model variants against the baseline (model.onnx, fp32)
# Requires: model-txt-all, build, and the Python requests library
test-models: build model-txt-all
	@echo "Starting model variant comparison..."
	@bash scripts/run_model_comparison.sh
# ==============================================================================
# Docker
# ==============================================================================
DOCKER_IMAGE = litcr.io/lit-container/mindthemath/embedding/nomic-embed-v1.5-rs
DOCKER_TAG ?= latest
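# DOCKER_TAG can be overridden per invocation, e.g.:
#   make docker-build-cpu DOCKER_TAG=v1.2.3   (tag value is illustrative)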
# Build CPU image
docker-build: docker-build-cpu
# Build CPU-only image (requires both models, defaults to quantized)
docker-build-cpu: model-txt model-img
	@echo "Building CPU Docker image (quantized models)..."
	docker build --target runtime-cpu \
		--build-arg TXT_MODEL_FILE=model_quantized.onnx \
		--build-arg IMG_MODEL_FILE=model_quantized.onnx \
		-t $(DOCKER_IMAGE):$(DOCKER_TAG)-cpu -t $(DOCKER_IMAGE):latest-cpu .

# Build CPU image with full precision models
docker-build-cpu-full: model-txt model-img
	@echo "Building CPU Docker image (full precision models)..."
	docker build --target runtime-cpu \
		--build-arg TXT_MODEL_FILE=model.onnx \
		--build-arg IMG_MODEL_FILE=model.onnx \
		-t $(DOCKER_IMAGE):$(DOCKER_TAG)-cpu-full -t $(DOCKER_IMAGE):latest-cpu-full .

docker-run-cpu: docker-build-cpu
	docker run --rm -p 8080:8080 --dns 1.1.1.1 --dns 1.0.0.1 $(DOCKER_IMAGE):$(DOCKER_TAG)-cpu

docker-run-cpu-full: docker-build-cpu-full
	docker run --rm -p 8080:8080 --dns 1.1.1.1 --dns 1.0.0.1 $(DOCKER_IMAGE):$(DOCKER_TAG)-cpu-full
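# Once a container is listening on :8080, `make health` (or
# `curl -s localhost:8080/health | jq .`) makes a quick smoke test.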
# Build GPU image with quantized models
docker-build-gpu: model-txt model-img
	@echo "Building GPU Docker image (quantized models)..."
	docker build --target runtime-gpu \
		--build-arg TXT_MODEL_FILE=model_quantized.onnx \
		--build-arg IMG_MODEL_FILE=model_quantized.onnx \
		--build-arg RUST_BUILD_FEATURES="--features cuda" \
		-t $(DOCKER_IMAGE):$(DOCKER_TAG)-gpu -t $(DOCKER_IMAGE):latest-gpu .

# Build GPU image with full precision models
docker-build-gpu-full: model-txt model-img
	@echo "Building GPU Docker image (full precision models)..."
	docker build --target runtime-gpu \
		--build-arg TXT_MODEL_FILE=model.onnx \
		--build-arg IMG_MODEL_FILE=model.onnx \
		--build-arg RUST_BUILD_FEATURES="--features cuda" \
		-t $(DOCKER_IMAGE):$(DOCKER_TAG)-gpu-full -t $(DOCKER_IMAGE):latest-gpu-full .

docker-run-gpu: docker-build-gpu
	docker run --rm -it --gpus all -p 8080:8080 --dns 1.1.1.1 --dns 1.0.0.1 \
		-e TXT_MAX_BATCH_SIZE=2056 -e IMG_MAX_BATCH_SIZE=256 \
		$(DOCKER_IMAGE):$(DOCKER_TAG)-gpu

docker-run-gpu-full: docker-build-gpu-full
	docker run --rm -it --gpus all -p 8080:8080 --dns 1.1.1.1 --dns 1.0.0.1 \
		-e TXT_MAX_BATCH_SIZE=2056 -e IMG_MAX_BATCH_SIZE=256 \
		$(DOCKER_IMAGE):$(DOCKER_TAG)-gpu-full
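# Run a prebuilt public image instead of a locally built one.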
remote-gpu-image:
	docker run --rm -it --gpus all -p 8080:8080 \
		-e TXT_MAX_BATCH_SIZE=2056 -e IMG_MAX_BATCH_SIZE=256 \
		mindthemath/nomic-embed-v1.5:gpu
# Push CPU and GPU images
docker-push: docker-push-cpu docker-push-gpu

# Push CPU image
docker-push-cpu: docker-build-cpu
	@echo "Pushing CPU image to the container registry..."
	docker push $(DOCKER_IMAGE):$(DOCKER_TAG)-cpu
	docker push $(DOCKER_IMAGE):latest-cpu

# Push GPU image
docker-push-gpu: docker-build-gpu
	@echo "Pushing GPU image to the container registry..."
	docker push $(DOCKER_IMAGE):$(DOCKER_TAG)-gpu
	docker push $(DOCKER_IMAGE):latest-gpu