|
15 | 15 | <title>docx-corpus — The largest classified corpus of Word documents</title> |
16 | 16 | <meta name="description" |
17 | 17 | content="736K+ .docx files from the public web, classified into 10 document types and 9 topics across 46+ languages. Open dataset for document processing research."> |
| 18 | + <link rel="canonical" href="https://docxcorp.us"> |
18 | 19 | <meta property="og:title" content="docx-corpus"> |
19 | 20 | <meta property="og:description" |
20 | 21 | content="The largest classified corpus of Word documents. 736K+ files across 46+ languages."> |
21 | 22 | <meta property="og:type" content="website"> |
22 | 23 | <meta property="og:url" content="https://docxcorp.us"> |
| 24 | + <meta property="og:image" content="https://docxcorp.us/public/logo.png"> |
| 25 | + <meta name="twitter:card" content="summary_large_image"> |
| 26 | + <meta name="twitter:title" content="docx-corpus — The largest classified corpus of Word documents"> |
| 27 | + <meta name="twitter:description" |
| 28 | + content="736K+ .docx files from the public web, classified into 10 document types and 9 topics across 46+ languages."> |
| 29 | + <meta name="twitter:image" content="https://docxcorp.us/public/logo.png"> |
| 30 | + <script type="application/ld+json"> |
| 31 | + { |
| 32 | + "@context": "https://schema.org", |
| 33 | + "@type": "Dataset", |
| 34 | + "name": "docx-corpus", |
| 35 | + "description": "The largest classified corpus of Word documents. 736K+ .docx files from the public web, classified into 10 document types and 9 topics across 46+ languages.", |
| 36 | + "url": "https://docxcorp.us", |
| 37 | + "license": "https://github.com/superdoc-dev/docx-corpus/blob/main/LICENSE", |
| 38 | + "creator": { |
| 39 | + "@type": "Organization", |
| 40 | + "name": "SuperDoc", |
| 41 | + "url": "https://superdoc.dev" |
| 42 | + }, |
| 43 | + "distribution": [ |
| 44 | + { |
| 45 | + "@type": "DataDownload", |
| 46 | + "encodingFormat": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", |
| 47 | + "contentUrl": "https://docxcorp.us" |
| 48 | + } |
| 49 | + ], |
| 50 | + "keywords": ["docx", "word documents", "NLP", "document processing", "corpus", "dataset", "OOXML"], |
| 51 | + "inLanguage": ["en", "de", "fr", "es", "pt", "zh", "ja", "ko", "ar"], |
| 52 | + "measurementTechnique": "Automated classification using ModernBERT", |
| 53 | + "variableMeasured": [ |
| 54 | + { |
| 55 | + "@type": "PropertyValue", |
| 56 | + "name": "document_type", |
| 57 | + "description": "Document type classification (e.g., report, letter, resume)" |
| 58 | + }, |
| 59 | + { |
| 60 | + "@type": "PropertyValue", |
| 61 | + "name": "document_topic", |
| 62 | + "description": "Document topic classification (e.g., business, education, legal)" |
| 63 | + } |
| 64 | + ] |
| 65 | + } |
| 66 | + </script> |
23 | 67 | <link rel="icon" type="image/x-icon" href="/public/favicon.ico"> |
24 | 68 | <link rel="icon" type="image/png" sizes="32x32" href="/public/favicon-32x32.png"> |
25 | 69 | <link rel="icon" type="image/png" sizes="16x16" href="/public/favicon-16x16.png"> |
|
0 commit comments