Skip to content

Commit b3f7fe3

Browse files
committed
feat: add JSONL importer for batch processing of JSON Lines files
1 parent bf943da commit b3f7fe3

3 files changed

Lines changed: 143 additions & 45 deletions

File tree

.github/workflows/test-python-examples.yml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -174,8 +174,11 @@ jobs:
174174
echo "⚠️ Skipping sentence-transformers on Windows ARM64 (PyTorch not available)"
175175
pip install numpy requests
176176
elif [[ "$OSTYPE" == "darwin"* ]]; then
177+
# macOS: Default wheels are CPU/MPS (no CUDA)
177178
pip install "numpy<2.0" requests sentence-transformers
178179
else
180+
# Linux & Windows x64: Install CPU-only PyTorch to save space (avoid CUDA)
181+
pip install torch --index-url https://download.pytorch.org/whl/cpu
179182
pip install numpy requests sentence-transformers
180183
fi
181184

bindings/python/examples/download_data.py

Lines changed: 51 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,7 @@ def introduce_null_values_movielens(extract_dir):
8989
# Seed for reproducibility - same NULLs every time
9090
random.seed(42)
9191

92-
print("\n🔧 Systematically introducing NULL values in MovieLens CSV files...")
92+
print("\n[WORK] Systematically introducing NULL values in MovieLens CSV files...")
9393
overall_start = time.time()
9494

9595
# Configuration: file -> {nullable_field: percentage}
@@ -155,10 +155,10 @@ def introduce_null_values_movielens(extract_dir):
155155
null_summary = ", ".join(
156156
f"{count} NULL {field}" for field, count in null_counts.items()
157157
)
158-
print(f" {filename}: {null_summary} ({elapsed:.2f}s)")
158+
print(f" [OK] {filename}: {null_summary} ({elapsed:.2f}s)")
159159

160160
overall_elapsed = time.time() - overall_start
161-
print(f"\n⏱️ Total CSV NULL injection time: {overall_elapsed:.2f}s")
161+
print(f"\n[TIME] Total CSV NULL injection time: {overall_elapsed:.2f}s")
162162

163163

164164
def download_movielens(size="large", inject_nulls=True):
@@ -200,7 +200,7 @@ def download_movielens(size="large", inject_nulls=True):
200200

201201
# Check if already downloaded
202202
if extract_dir.exists():
203-
print(f" Dataset already exists at: {extract_dir}")
203+
print(f"[OK] Dataset already exists at: {extract_dir}")
204204
print(f" Size: {config['description']} ({config['size_mb']})")
205205
print()
206206
for csv_file in ["movies.csv", "ratings.csv", "tags.csv", "links.csv"]:
@@ -210,10 +210,12 @@ def download_movielens(size="large", inject_nulls=True):
210210
print(f" - {csv_file}: {size_mb:.1f} MB")
211211

212212
# Ask if user wants to re-introduce NULL values
213-
print("\n💡 To re-introduce NULL values, delete the data directory and re-run.")
213+
print(
214+
"\n[INFO] To re-introduce NULL values, delete the data directory and re-run."
215+
)
214216
return extract_dir
215217

216-
print(f"📥 Downloading MovieLens {size} dataset")
218+
print(f"[DOWNLOAD] Downloading MovieLens {size} dataset")
217219
print(f" Description: {config['description']} ({config['size_mb']})")
218220
print(f" URL: {url}")
219221
print(" This may take a few minutes...")
@@ -237,11 +239,11 @@ def report_progress(block_num, block_size, total_size):
237239
urllib.request.urlretrieve(url, zip_path, reporthook=report_progress)
238240
print() # New line after progress
239241
download_elapsed = time.time() - download_start
240-
print(f" Downloaded to: {zip_path} " f"({download_elapsed:.2f}s)")
242+
print(f"[OK] Downloaded to: {zip_path} " f"({download_elapsed:.2f}s)")
241243

242244
# Extract
243245
extract_start = time.time()
244-
print("📦 Extracting...")
246+
print("[EXTRACT] Extracting...")
245247
with zipfile.ZipFile(zip_path, "r") as zip_ref:
246248
# Extract to temp dir (zip contains ml-latest/ml-latest-small)
247249
temp_extract = data_dir / "temp_extract"
@@ -261,28 +263,28 @@ def report_progress(block_num, block_size, total_size):
261263
raise Exception(f"Unexpected zip structure in {temp_extract}")
262264

263265
extract_elapsed = time.time() - extract_start
264-
print(f" Extracted to: {extract_dir} ({extract_elapsed:.2f}s)")
266+
print(f"[OK] Extracted to: {extract_dir} ({extract_elapsed:.2f}s)")
265267

266268
# Introduce NULL values for testing
267269
if inject_nulls:
268270
introduce_null_values_movielens(extract_dir)
269271
else:
270-
print("\n⏭️ Skipping NULL value injection (--no-nulls flag)")
272+
print("\n[SKIP] Skipping NULL value injection (--no-nulls flag)")
271273

272274
# Show file sizes
273-
print("\n📊 Dataset contents:")
275+
print("\n[STATS] Dataset contents:")
274276
for csv_file in extract_dir.glob("*.csv"):
275277
size_mb = csv_file.stat().st_size / (1024 * 1024)
276278
print(f" - {csv_file.name}: {size_mb:.1f} MB")
277279

278280
# Clean up zip file
279281
zip_path.unlink()
280-
print("\n🧹 Cleaned up zip file")
282+
print("\n[CLEAN] Cleaned up zip file")
281283

282284
return extract_dir
283285

284286
except Exception as e:
285-
print(f" Error downloading dataset: {e}")
287+
print(f"[ERROR] Error downloading dataset: {e}")
286288
print(f" You can manually download from: {url}")
287289
raise
288290

@@ -296,7 +298,7 @@ def download_stackoverflow(size="small"):
296298
try:
297299
import py7zr
298300
except ImportError:
299-
print(" Missing dependency: py7zr")
301+
print("[ERROR] Missing dependency: py7zr")
300302
print(" Install with: pip install py7zr")
301303
raise
302304

@@ -366,7 +368,7 @@ def download_stackoverflow(size="small"):
366368

367369
# Check if already downloaded
368370
if extract_dir.exists():
369-
print(f" Dataset already exists at: {extract_dir}")
371+
print(f"[OK] Dataset already exists at: {extract_dir}")
370372
print(f" Site: {config['site']}")
371373
print(f" Size: {config['description']} ({config['size_mb']})")
372374
print(f" Date: {config['date']}")
@@ -380,10 +382,10 @@ def download_stackoverflow(size="small"):
380382
size_mb = xml_file.stat().st_size / (1024 * 1024)
381383
print(f" - {xml_file.name}: {size_mb:.1f} MB")
382384

383-
print("\n💡 To re-download, delete the data directory and re-run.")
385+
print("\n[INFO] To re-download, delete the data directory and re-run.")
384386
return extract_dir
385387

386-
print(f"📥 Downloading Stack Exchange {size} dataset")
388+
print(f"[DOWNLOAD] Downloading Stack Exchange {size} dataset")
387389
print(f" Site: {config['site']}")
388390
print(f" Description: {config['description']} ({config['size_mb']})")
389391
print(f" Date: {config['date']} (pinned for reproducibility)")
@@ -411,44 +413,44 @@ def report_progress(block_num, block_size, total_size):
411413
for url in config["urls"]:
412414
filename = url.split("/")[-1]
413415
archive_path = data_dir / filename
414-
print(f"\n📥 Downloading {filename}")
416+
print(f"\n[DOWNLOAD] Downloading {filename}")
415417
file_start = time.time()
416418
urllib.request.urlretrieve(url, archive_path, reporthook=report_progress)
417419
print() # New line after progress
418420
file_elapsed = time.time() - file_start
419-
print(f" Downloaded to: {archive_path} ({file_elapsed:.2f}s)")
421+
print(f"[OK] Downloaded to: {archive_path} ({file_elapsed:.2f}s)")
420422

421423
# Extract 7z file
422424
extract_start = time.time()
423-
print(f"📦 Extracting {filename}...")
425+
print(f"[EXTRACT] Extracting {filename}...")
424426
with py7zr.SevenZipFile(archive_path, mode="r") as archive:
425427
archive.extractall(path=extract_dir)
426428

427429
extract_elapsed = time.time() - extract_start
428-
print(f" Extracted ({extract_elapsed:.2f}s)")
430+
print(f"[OK] Extracted ({extract_elapsed:.2f}s)")
429431

430432
# Clean up archive file
431433
archive_path.unlink()
432434

433435
download_elapsed = time.time() - download_start
434-
print(f"\n⏱️ Total download time: {download_elapsed:.2f}s")
436+
print(f"\n[TIME] Total download time: {download_elapsed:.2f}s")
435437

436-
print(f"\n Extracted to: {extract_dir}")
438+
print(f"\n[OK] Extracted to: {extract_dir}")
437439

438440
# Show file sizes
439-
print("\n📊 Dataset contents:")
441+
print("\n[STATS] Dataset contents:")
440442
xml_files = list(extract_dir.glob("*.xml"))
441443
if xml_files:
442444
for xml_file in sorted(xml_files):
443445
size_mb = xml_file.stat().st_size / (1024 * 1024)
444446
print(f" - {xml_file.name}: {size_mb:.1f} MB")
445447
else:
446-
print(" ⚠️ No XML files found")
448+
print(" [WARNING] No XML files found")
447449

448450
return extract_dir
449451

450452
except Exception as e:
451-
print(f" Error downloading dataset: {e}")
453+
print(f"[ERROR] Error downloading dataset: {e}")
452454
print(f" You can manually download from: {config['urls'][0]}")
453455
raise
454456

@@ -531,7 +533,7 @@ def verify_csv_nulls(extract_dir, dataset_type="movielens", sample_size=None):
531533
results[filename] = file_results
532534

533535
verification_elapsed = time.time() - verification_start
534-
print(f"\n⏱️ CSV verification time: {verification_elapsed:.2f}s")
536+
print(f"\n[TIME] CSV verification time: {verification_elapsed:.2f}s")
535537
return results
536538

537539

@@ -638,44 +640,44 @@ def verify_xml_nulls(extract_dir, sample_size=None):
638640
}
639641

640642
verification_elapsed = time.time() - verification_start
641-
print(f"\n⏱️ XML verification time: {verification_elapsed:.2f}s")
643+
print(f"\n[TIME] XML verification time: {verification_elapsed:.2f}s")
642644
return results
643645

644646

645647
def print_verification_report(csv_results, xml_results, inject_nulls):
646648
"""Print verification report."""
647649
print()
648650
print("=" * 70)
649-
print("📊 Dataset Verification Report")
651+
print("[STATS] Dataset Verification Report")
650652
print("=" * 70)
651653
print()
652654

653655
# CSV verification (MovieLens)
654656
if csv_results:
655657
if inject_nulls:
656-
print(" NULL injection was ENABLED")
658+
print("[OK] NULL injection was ENABLED")
657659
print()
658660

659661
print("CSV Files:")
660662
for filename, data in csv_results.items():
661663
sampled_note = " (sampled)" if data.get("sampled") else ""
662-
print(f" 📄 {filename}{sampled_note}:")
664+
print(f" [FILE] {filename}{sampled_note}:")
663665
print(f" Total rows: {data['total_rows']}")
664666
for field, count in data["null_counts"].items():
665667
pct = (count / data["total_rows"]) * 100
666-
status = "" if count > 0 else ""
668+
status = "[OK]" if count > 0 else "[ERROR]"
667669
print(f" {status} NULL {field}: {count} ({pct:.1f}%)")
668670
print()
669671

670672
# XML verification (Stack Exchange)
671673
if xml_results:
672674
print("XML Files:")
673675
print()
674-
print(" 📊 Stack Exchange data (original, unmodified)")
676+
print(" [STATS] Stack Exchange data (original, unmodified)")
675677
print()
676678
for filename, data in xml_results.items():
677679
sampled_note = " (sampled)" if data.get("sampled") else ""
678-
print(f" 📄 {filename}{sampled_note}:")
680+
print(f" [FILE] {filename}{sampled_note}:")
679681
print(f" Total rows: {data['total_rows']}")
680682
print(f" Unique attributes: {data['total_attributes']}")
681683
if data["null_counts"]:
@@ -690,7 +692,7 @@ def print_verification_report(csv_results, xml_results, inject_nulls):
690692
print()
691693

692694
print("=" * 70)
693-
print(" Verification Complete")
695+
print("[OK] Verification Complete")
694696
print("=" * 70)
695697

696698

@@ -761,7 +763,11 @@ def main():
761763
args = parser.parse_args()
762764

763765
print("=" * 70)
764-
print("📥 Dataset Download" if not args.verify_only else "📊 Dataset Verification")
766+
print(
767+
"[DOWNLOAD] Dataset Download"
768+
if not args.verify_only
769+
else "[STATS] Dataset Verification"
770+
)
765771
print("=" * 70)
766772
print()
767773

@@ -779,12 +785,12 @@ def main():
779785
# Verify-only mode
780786
if args.verify_only:
781787
if not extract_dir.exists():
782-
print(f" Dataset not found: {extract_dir}")
788+
print(f"[ERROR] Dataset not found: {extract_dir}")
783789
print(" Run without --verify-only to download first.")
784790
return
785791

786-
print(f"📂 Verifying existing dataset: {extract_dir}")
787-
print(" Using smart sampling (100K rows) for fast verification")
792+
print(f"[DIR] Verifying existing dataset: {extract_dir}")
793+
print("[FAST] Using smart sampling (100K rows) for fast verification")
788794
print()
789795

790796
sample_size = 100000
@@ -810,15 +816,15 @@ def main():
810816

811817
print()
812818
print("=" * 70)
813-
print(" MovieLens Dataset Ready!")
819+
print("[OK] MovieLens Dataset Ready!")
814820
print("=" * 70)
815821
print()
816-
print("💡 Use this dataset in examples:")
822+
print("[INFO] Use this dataset in examples:")
817823
print(f" data_dir = Path('{extract_dir}')")
818824
print(" movies_csv = data_dir / 'movies.csv'")
819825
print(" ratings_csv = data_dir / 'ratings.csv'")
820826
print()
821-
print("📚 Dataset info:")
827+
print("[INFO] Dataset info:")
822828
if size == "large":
823829
print(" - ~86,000 movies")
824830
print(" - ~33,000,000 ratings")
@@ -843,15 +849,15 @@ def main():
843849

844850
print()
845851
print("=" * 70)
846-
print(" Stack Exchange Dataset Ready!")
852+
print("[OK] Stack Exchange Dataset Ready!")
847853
print("=" * 70)
848854
print()
849-
print("💡 Use this dataset in examples:")
855+
print("[INFO] Use this dataset in examples:")
850856
print(f" data_dir = Path('{extract_dir}')")
851857
print(" posts_xml = data_dir / 'Posts.xml'")
852858
print(" users_xml = data_dir / 'Users.xml'")
853859
print()
854-
print("📚 Dataset info:")
860+
print("[INFO] Dataset info:")
855861
if size == "small":
856862
print(" - Site: cs.stackexchange.com")
857863
print(" - ~80,000 posts (questions + answers)")

0 commit comments

Comments
 (0)