Skip to content

Commit af05724

Browse files
committed
feat(web): add SEO meta tags, structured data, and sitemap
Add canonical URL, Twitter card tags, og:image, JSON-LD Dataset structured data (schema.org), and sitemap.xml for Google indexing. Update deploy workflow to upload sitemap to R2.
1 parent bd1515a commit af05724

3 files changed

Lines changed: 53 additions & 0 deletions

File tree

.github/workflows/deploy-site.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,5 @@ jobs:
2424
R2_BUCKET: s3://${{ secrets.R2_BUCKET_NAME }}
2525
run: |
2626
aws s3 cp apps/web/index.html "$R2_BUCKET/index.html" --endpoint-url "$R2_ENDPOINT"
27+
aws s3 cp apps/web/sitemap.xml "$R2_BUCKET/sitemap.xml" --endpoint-url "$R2_ENDPOINT"
2728
aws s3 cp apps/web/public/ "$R2_BUCKET/public/" --recursive --endpoint-url "$R2_ENDPOINT"

apps/web/index.html

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,55 @@
1515
<title>docx-corpus — The largest classified corpus of Word documents</title>
1616
<meta name="description"
1717
content="736K+ .docx files from the public web, classified into 10 document types and 9 topics across 46+ languages. Open dataset for document processing research.">
18+
<link rel="canonical" href="https://docxcorp.us">
1819
<meta property="og:title" content="docx-corpus">
1920
<meta property="og:description"
2021
content="The largest classified corpus of Word documents. 736K+ files across 46+ languages.">
2122
<meta property="og:type" content="website">
2223
<meta property="og:url" content="https://docxcorp.us">
24+
<meta property="og:image" content="https://docxcorp.us/public/logo.png">
25+
<meta name="twitter:card" content="summary_large_image">
26+
<meta name="twitter:title" content="docx-corpus — The largest classified corpus of Word documents">
27+
<meta name="twitter:description"
28+
content="736K+ .docx files from the public web, classified into 10 document types and 9 topics across 46+ languages.">
29+
<meta name="twitter:image" content="https://docxcorp.us/public/logo.png">
30+
<script type="application/ld+json">
31+
{
32+
"@context": "https://schema.org",
33+
"@type": "Dataset",
34+
"name": "docx-corpus",
35+
"description": "The largest classified corpus of Word documents. 736K+ .docx files from the public web, classified into 10 document types and 9 topics across 46+ languages.",
36+
"url": "https://docxcorp.us",
37+
"license": "https://github.com/superdoc-dev/docx-corpus/blob/main/LICENSE",
38+
"creator": {
39+
"@type": "Organization",
40+
"name": "SuperDoc",
41+
"url": "https://superdoc.dev"
42+
},
43+
"distribution": [
44+
{
45+
"@type": "DataDownload",
46+
"encodingFormat": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
47+
"contentUrl": "https://docxcorp.us"
48+
}
49+
],
50+
"keywords": ["docx", "word documents", "NLP", "document processing", "corpus", "dataset", "OOXML"],
51+
"inLanguage": ["en", "de", "fr", "es", "pt", "zh", "ja", "ko", "ar"],
52+
"measurementTechnique": "Automated classification using ModernBERT",
53+
"variableMeasured": [
54+
{
55+
"@type": "PropertyValue",
56+
"name": "document_type",
57+
"description": "Document type classification (e.g., report, letter, resume)"
58+
},
59+
{
60+
"@type": "PropertyValue",
61+
"name": "document_topic",
62+
"description": "Document topic classification (e.g., business, education, legal)"
63+
}
64+
]
65+
}
66+
</script>
2367
<link rel="icon" type="image/x-icon" href="/public/favicon.ico">
2468
<link rel="icon" type="image/png" sizes="32x32" href="/public/favicon-32x32.png">
2569
<link rel="icon" type="image/png" sizes="16x16" href="/public/favicon-16x16.png">

apps/web/sitemap.xml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3+
<url>
4+
<loc>https://docxcorp.us/</loc>
5+
<changefreq>weekly</changefreq>
6+
<priority>1.0</priority>
7+
</url>
8+
</urlset>

0 commit comments

Comments
 (0)