Commit ebb3d65

feat: add HuggingFace nav link and export script
1 parent cea8f37 commit ebb3d65

2 files changed

Lines changed: 238 additions & 1 deletion

apps/web/index.html

Lines changed: 1 addition & 1 deletion
@@ -318,7 +318,7 @@
   <div class="links">
     <a href="#explore">Explore</a>
     <a href="https://github.com/superdoc-dev/docx-corpus">GitHub</a>
-    <a href="https://huggingface.co/datasets/superdoc/docx-corpus">Download</a>
+    <a href="https://huggingface.co/datasets/superdoc-dev/docx-corpus">HuggingFace</a>
   </div>
 </header>

scripts/export-hf.py

Lines changed: 237 additions & 0 deletions
@@ -0,0 +1,237 @@
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "psycopg2-binary>=2.9.0",
#     "pyarrow>=18.0.0",
#     "huggingface_hub>=0.27.0",
#     "python-dotenv>=1.0.0",
#     "tqdm>=4.66.0",
# ]
# ///
"""
Export docx-corpus metadata to HuggingFace as a Parquet dataset.

Usage:
    uv run scripts/export-hf.py                   # dry-run: export parquet locally
    uv run scripts/export-hf.py --push            # export and push to HuggingFace
    uv run scripts/export-hf.py --push --private  # push as private dataset
"""

import argparse
import os
import tempfile
from pathlib import Path

import psycopg2
import pyarrow as pa
import pyarrow.parquet as pq
from dotenv import load_dotenv
from tqdm import tqdm

# Load .env from project root
load_dotenv(Path(__file__).parent.parent / ".env")

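# Target dataset repo on the Hub, and the public CDN base (R2, per the
# variable name) used to build per-file download URLs.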
REPO_ID = "superdoc-dev/docx-corpus"
R2_BASE = "https://docxcorp.us/documents"

DATASET_CARD = """\
---
license: odc-by
task_categories:
- text-classification
language:
- en
- ru
- cs
- pl
- es
- zh
- lt
- sk
- fr
- pt
- de
- it
- sv
- nl
- bg
- uk
- tr
- ja
- hu
- ko
size_categories:
- 100K<n<1M
tags:
- docx
- word-documents
- document-classification
- ooxml
pretty_name: docx-corpus
---

# docx-corpus

The largest classified corpus of Word documents: 736K+ `.docx` files from the public web, classified into 10 document types and 9 topics across 76 languages.

## Dataset Description

This dataset contains metadata for publicly available `.docx` files collected from the web. Each document has been classified by document type and topic using a two-stage pipeline: LLM labeling (Claude) of a stratified sample, followed by fine-tuned XLM-RoBERTa classifiers applied at scale.

### Schema

| Column | Type | Description |
|--------|------|-------------|
| `id` | string | SHA-256 hash of the file (unique identifier) |
| `filename` | string | Original filename from the source URL |
| `type` | string | Document type (10 classes) |
| `topic` | string | Document topic (9 classes) |
| `language` | string | Detected language (ISO 639-1 code) |
| `word_count` | int | Number of words in the document |
| `confidence` | float | Classification confidence (minimum of the type and topic confidences) |
| `url` | string | Direct download URL for the `.docx` file |

### Document Types

legal, forms, reports, policies, educational, correspondence, technical, administrative, creative, reference

### Topics

government, education, healthcare, finance, legal_judicial, technology, environment, nonprofit, general

## Download Files

Each row includes a `url` column pointing to the `.docx` file on our CDN. You can download files directly:

```python
from datasets import load_dataset
import requests

ds = load_dataset("superdoc-dev/docx-corpus", split="train")

# Filter and download
legal_en = ds.filter(lambda x: x["type"] == "legal" and x["language"] == "en")
for row in legal_en:
    resp = requests.get(row["url"])
    with open(f"corpus/{row['id']}.docx", "wb") as f:
        f.write(resp.content)
```

Or use the manifest API for bulk downloads:

```bash
curl "https://api.docxcorp.us/manifest?type=legal&lang=en" -o manifest.txt
wget -i manifest.txt -P ./corpus/
```

## Links

- **Website**: [docxcorp.us](https://docxcorp.us)
- **GitHub**: [superdoc-dev/docx-corpus](https://github.com/superdoc-dev/docx-corpus)
- **Built by**: [SuperDoc](https://superdoc.dev)
"""


def export_parquet(output_path: str) -> int:
    """Query Neon and write metadata to a Parquet file. Returns row count."""
    database_url = os.getenv("DATABASE_URL")
    if not database_url:
        raise ValueError("DATABASE_URL not set — check .env file")

    conn = psycopg2.connect(database_url)
    try:
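        # Named (server-side) cursor: psycopg2 streams the result set in
        # itersize-row batches instead of loading all rows into memory at once.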
        with conn.cursor("export_cursor") as cur:
            cur.itersize = 10_000
            cur.execute("""
                SELECT id, original_filename, document_type, document_topic,
                       language, word_count, classification_confidence
                FROM documents
                WHERE document_type IS NOT NULL
                ORDER BY id
            """)

            ids, filenames, types, topics = [], [], [], []
            languages, word_counts, confidences, urls = [], [], [], []

            for row in tqdm(cur, desc="Reading rows", unit="rows"):
                ids.append(row[0])
                filenames.append(row[1] or "unknown.docx")
                types.append(row[2])
                topics.append(row[3])
                languages.append(row[4])
                word_counts.append(row[5])
                confidences.append(row[6])
                urls.append(f"{R2_BASE}/{row[0]}.docx")

        table = pa.table({
            "id": pa.array(ids, type=pa.string()),
            "filename": pa.array(filenames, type=pa.string()),
            "type": pa.array(types, type=pa.string()),
            "topic": pa.array(topics, type=pa.string()),
            "language": pa.array(languages, type=pa.string()),
            "word_count": pa.array(word_counts, type=pa.int32()),
            "confidence": pa.array(confidences, type=pa.float32()),
            "url": pa.array(urls, type=pa.string()),
        })

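        # zstd keeps the file small on these mostly-string columns while
        # staying cheap to decompress.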
        pq.write_table(table, output_path, compression="zstd")
        return len(ids)
    finally:
        conn.close()


def push_to_hub(parquet_path: str, private: bool = False):
    """Push the parquet file and dataset card to HuggingFace."""
    from huggingface_hub import HfApi

    api = HfApi()

    # Create or get repo
    api.create_repo(REPO_ID, repo_type="dataset", private=private, exist_ok=True)

    # Upload parquet
    print(f"Uploading {parquet_path} to {REPO_ID}...")
    api.upload_file(
        path_or_fileobj=parquet_path,
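        # Shard-style name follows the Hub's parquet naming convention, so the
        # file is auto-detected as the "train" split.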
path_in_repo="data/train-00000-of-00001.parquet",
197+
repo_id=REPO_ID,
198+
repo_type="dataset",
199+
)
200+
201+
# Upload dataset card
202+
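    # delete=False keeps the temp file on disk after the "with" block closes it,
    # so upload_file can read it; os.unlink removes it once the upload is done.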
    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
        f.write(DATASET_CARD)
        card_path = f.name

    api.upload_file(
        path_or_fileobj=card_path,
        path_in_repo="README.md",
        repo_id=REPO_ID,
        repo_type="dataset",
    )
    os.unlink(card_path)

    print(f"Done! Dataset available at: https://huggingface.co/datasets/{REPO_ID}")


def main():
    parser = argparse.ArgumentParser(description="Export docx-corpus to HuggingFace")
    parser.add_argument("--push", action="store_true", help="Push to HuggingFace (default: local export only)")
    parser.add_argument("--private", action="store_true", help="Create as private dataset")
    parser.add_argument("--output", default="docx-corpus.parquet", help="Local parquet output path")
    args = parser.parse_args()

    print(f"Exporting metadata to {args.output}...")
    count = export_parquet(args.output)
    size_mb = os.path.getsize(args.output) / (1024 * 1024)
    print(f"Exported {count:,} rows ({size_mb:.1f} MB)")

    if args.push:
        push_to_hub(args.output, private=args.private)
    else:
        print("Dry run — use --push to upload to HuggingFace")
        print("  uv run scripts/export-hf.py --push")


if __name__ == "__main__":
    main()
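
A quick way to sanity-check the export before pushing (a minimal sketch; it assumes only pyarrow, which the script already declares as a dependency):

```python
import pyarrow.parquet as pq

table = pq.read_table("docx-corpus.parquet")
print(table.num_rows, "rows")       # should match the "Exported N rows" output
print(table.schema)                 # expect the 8 columns from the dataset card
print(table.column("url")[0])       # spot-check one download URL
```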
