Skip to content

Commit c267052

Browse files
committed
fix: improve dataset validation and update script paths in documentation
1 parent 788d1a0 commit c267052

4 files changed

Lines changed: 36 additions & 6 deletions

File tree

bindings/python/examples/download_data.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -724,19 +724,43 @@ def download_stackoverflow(size="small"):
724724
'large' (~10 GB subset), 'xlarge' (~50 GB subset), or
725725
'full' (~323 GB)
726726
"""
727+
required_xml_files = (
728+
"Posts.xml",
729+
"Users.xml",
730+
"Comments.xml",
731+
"Tags.xml",
732+
"Badges.xml",
733+
"PostLinks.xml",
734+
"PostHistory.xml",
735+
"Votes.xml",
736+
)
737+
738+
def has_required_xml_files(dataset_dir: Path) -> bool:
739+
return dataset_dir.exists() and all(
740+
(dataset_dir / filename).exists() for filename in required_xml_files
741+
)
742+
727743
# Create data directory
728744
data_dir = Path(__file__).parent / "data"
729745
data_dir.mkdir(exist_ok=True)
730746

731747
if size == "tiny":
732748
source_dir = data_dir / "stackoverflow-small"
733-
if not source_dir.exists():
749+
if not has_required_xml_files(source_dir):
750+
print(
751+
"[INFO] stackoverflow-small is missing required XML files; "
752+
"downloading a fresh source dataset"
753+
)
734754
download_stackoverflow(size="small")
735755
return create_stackoverflow_tiny(source_dir=source_dir)
736756

737757
if size == "large":
738758
source_dir = data_dir / "stackoverflow-full"
739-
if not source_dir.exists():
759+
if not has_required_xml_files(source_dir):
760+
print(
761+
"[INFO] stackoverflow-full is missing required XML files; "
762+
"downloading a fresh source dataset"
763+
)
740764
download_stackoverflow(size="full")
741765
return create_stackoverflow_large(
742766
source_dir=source_dir,
@@ -747,7 +771,11 @@ def download_stackoverflow(size="small"):
747771

748772
if size == "xlarge":
749773
source_dir = data_dir / "stackoverflow-full"
750-
if not source_dir.exists():
774+
if not has_required_xml_files(source_dir):
775+
print(
776+
"[INFO] stackoverflow-full is missing required XML files; "
777+
"downloading a fresh source dataset"
778+
)
751779
download_stackoverflow(size="full")
752780
return create_stackoverflow_large(
753781
source_dir=source_dir,

bindings/python/scripts/fix_markdown.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ def main() -> int:
254254
)
255255
parser.add_argument(
256256
"--docs",
257-
default="/mnt/ssd2/repos/arcadedb-embedded-python/bindings/python/docs",
257+
default="./docs",
258258
help="Path to docs directory",
259259
)
260260
args = parser.parse_args()

bindings/python/scripts/profile-python/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ Required fixes are done. Any further work is optional tuning.
3131
From `bindings/python`:
3232

3333
```bash
34-
/mnt/ssd2/repos/arcadedb-embedded-python/.venv/bin/python scripts/profile-python/profile_bindings.py --preset full --runs 3 --records 5000 --person-count 2000 --vector-records 1500 --vector-k 10 --query-runs 100 --heap-size 4g
34+
python scripts/profile-python/profile_bindings.py --preset full --runs 3 --records 5000 --person-count 2000 --vector-records 1500 --vector-k 10 --query-runs 100 --heap-size 4g
3535
```
3636

3737
## Read The Report

bindings/python/scripts/profile-python/results.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@ This directory keeps one current benchmark artifact and one short summary.
88

99
Run used to produce it:
1010

11+
From `bindings/python`:
12+
1113
```bash
12-
cd /mnt/ssd2/repos/arcadedb-embedded-python/bindings/python && /mnt/ssd2/repos/arcadedb-embedded-python/.venv/bin/python scripts/profile-python/profile_bindings.py --preset full --runs 3 --records 5000 --person-count 2000 --vector-records 1500 --vector-k 10 --query-runs 100 --heap-size 4g
14+
python scripts/profile-python/profile_bindings.py --preset full --runs 3 --records 5000 --person-count 2000 --vector-records 1500 --vector-k 10 --query-runs 100 --heap-size 4g
1315
```
1416

1517
## Current Numbers

0 commit comments

Comments (0)