import contextlib
import itertools
import json
23
# Previous implementation; the "-" markers appear to mark this as the removed
# side of the change.
# It opens the corpus and four output files in one `with` statement, builds a
# "put" record (text, title, id) for every input line, and writes that record
# to all four output files. No per-file document limit is applied here, so
# the 500000/50000/1000 files receive the same lines as the "full" file.
3- with (
4- open ("ext/corpus.jsonl" , "r" ) as infile ,
5- open ("ext/corpus_transformed_full.jsonl" , "w" ) as outfile_full ,
6- open ("ext/corpus_transformed_500000.jsonl" , "w" ) as outfile_500000 ,
7- open ("ext/corpus_transformed_50000.jsonl" , "w" ) as outfile_50000 ,
8- open ("ext/corpus_transformed_1000.jsonl" , "w" ) as outfile_1000 ,
9- ):
10- for line in infile :
11- doc = json .loads (line )
12- doc_id = doc ["docid" ]
13- transformed = {
14- "put" : f"id:msmarco:passage::{ doc_id } " ,
15- "fields" : {
16- "text" : doc ["text" ],
17- "title" : doc ["title" ],
18- "id" : doc_id ,
19- },
20- }
21- outfile_full .write (json .dumps (transformed ) + "\n " )
22- outfile_500000 .write (json .dumps (transformed ) + "\n " )
23- outfile_50000 .write (json .dumps (transformed ) + "\n " )
24- outfile_1000 .write (json .dumps (transformed ) + "\n " )
# Output subsets, keyed by the suffix used in each output filename.
# Each value is the number of leading corpus documents written to that
# file; None means no limit (every document is written).
SUBSETS = {
    "full": None,
    "500000": 500000,
    "50000": 50000,
    "5000": 5000,
    "1000": 1000,
}
11+
def transform(doc: dict) -> dict:
    """Convert one corpus record into a "put" feed operation.

    Args:
        doc: Parsed corpus record; must contain the keys "docid" and
            "text". Any other keys are ignored.

    Returns:
        A dict with a "put" key holding the document id string
        ``id:msmarco:passage::<docid>`` and a "fields" key holding the
        passage text and the original docid.

    Raises:
        KeyError: If "docid" or "text" is missing from ``doc``.
    """
    doc_id = doc["docid"]
    return {
        # The placeholder is written without surrounding whitespace so the
        # generated id carries no trailing space.
        "put": f"id:msmarco:passage::{doc_id}",
        "fields": {
            "text": doc["text"],
            "id": doc_id,
        },
    }
21+
# Stream the corpus once, writing each transformed document to every subset
# file whose document limit has not yet been reached.
# ExitStack owns all file handles, so a failure while opening one output file
# still closes the input file and the outputs opened before it.
with contextlib.ExitStack() as stack:
    infile = stack.enter_context(
        open("ext/corpus.jsonl", "r", encoding="utf-8")
    )
    outfiles = {
        name: stack.enter_context(
            open(f"ext/corpus_transformed_{name}.jsonl", "w", encoding="utf-8")
        )
        for name in SUBSETS
    }
    # Blank lines are skipped before counting, so they neither crash
    # json.loads nor consume a slot in the size-limited subsets.
    records = (line for line in infile if line.strip())
    for i, line in enumerate(records):
        # Serialize once per document and reuse the string for every file.
        serialized = json.dumps(transform(json.loads(line))) + "\n"
        for name, limit in SUBSETS.items():
            if limit is None or i < limit:
                outfiles[name].write(serialized)
0 commit comments