Skip to content

Commit fad12da

Browse files
authored
Merge pull request #1898 from vespa-engine/kaibor737/add-scaling-tutorial
fixed python script and improved node naming
2 parents 1d8c8ec + 8b1c561 commit fad12da

2 files changed

Lines changed: 37 additions & 25 deletions

File tree

Lines changed: 34 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,36 @@
11
import json
2+
import itertools
23

3-
with (
4-
open("ext/corpus.jsonl", "r") as infile,
5-
open("ext/corpus_transformed_full.jsonl", "w") as outfile_full,
6-
open("ext/corpus_transformed_500000.jsonl", "w") as outfile_500000,
7-
open("ext/corpus_transformed_50000.jsonl", "w") as outfile_50000,
8-
open("ext/corpus_transformed_1000.jsonl", "w") as outfile_1000,
9-
):
10-
for line in infile:
11-
doc = json.loads(line)
12-
doc_id = doc["docid"]
13-
transformed = {
14-
"put": f"id:msmarco:passage::{doc_id}",
15-
"fields": {
16-
"text": doc["text"],
17-
"title": doc["title"],
18-
"id": doc_id,
19-
},
20-
}
21-
outfile_full.write(json.dumps(transformed) + "\n")
22-
outfile_500000.write(json.dumps(transformed) + "\n")
23-
outfile_50000.write(json.dumps(transformed) + "\n")
24-
outfile_1000.write(json.dumps(transformed) + "\n")
4+
# Output subsets keyed by the suffix used in the output file name. The value
# is how many leading corpus lines go into that subset; None means no limit.
SUBSETS = {
    "full": None,
    "500000": 500000,
    "50000": 50000,
    "5000": 5000,
    "1000": 1000,
}
11+
12+
def transform(doc: dict) -> dict:
    """Build the ``put`` operation dict for one corpus record.

    Args:
        doc: Parsed corpus record; must contain the keys ``"docid"`` and
            ``"text"``. Any other keys are ignored.

    Returns:
        A dict with the document id string under ``"put"`` and the record's
        text and id under ``"fields"``.

    Raises:
        KeyError: If ``"docid"`` or ``"text"`` is missing from ``doc``.
    """
    doc_id = doc["docid"]
    return {
        "put": f"id:msmarco:passage::{doc_id}",
        "fields": {
            "text": doc["text"],
            "id": doc_id,
        },
    }
21+
22+
# Stream the corpus once and write each transformed record to every subset
# file whose line limit has not been reached yet.
with open("ext/corpus.jsonl", "r", encoding="utf-8") as infile:
    outfiles = {}
    try:
        # Open inside the try so that a failing open() still closes the
        # files that were opened before it.
        for name in SUBSETS:
            outfiles[name] = open(
                f"ext/corpus_transformed_{name}.jsonl", "w", encoding="utf-8"
            )
        for i, line in enumerate(infile):
            # Serialize once per record; the same line goes to every subset.
            serialized = json.dumps(transform(json.loads(line))) + "\n"
            for name, limit in SUBSETS.items():
                if limit is None or i < limit:
                    outfiles[name].write(serialized)
    finally:
        for f in outfiles.values():
            f.close()

scaling-tutorial/services.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<!-- Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
33
<services version="1.0" xmlns:deploy="vespa" xmlns:preprocess="properties" minimum-required-vespa-version="8.311.28">
44

5-
<container id="default" version="1.0">
5+
<container id="container-default" version="1.0">
66

77
<nodes deploy:environment="dev" count="1">
88
<resources vcpu="1.0" memory="8Gb" architecture="arm64" storage-type="local" disk="59Gb"/>
@@ -34,12 +34,12 @@
3434

3535
</container>
3636

37-
<content id="msmarco" version="1.0">
37+
<content id="content-msmarco" version="1.0">
3838
<min-redundancy>1</min-redundancy>
3939
<documents>
4040
<document mode="index" type="passage"/>
4141
</documents>
42-
<nodes count="1">
42+
<nodes deploy:environment="dev" count="1">
4343
<resources vcpu="1.0" memory="8Gb" architecture="arm64" storage-type="local" disk="59Gb"/>
4444
</nodes>
4545
<engine>

0 commit comments

Comments
 (0)