1+ using DocumentFormat . OpenXml . Packaging ;
2+ using DocumentFormat . OpenXml . Wordprocessing ;
3+ using System . Text ;
4+
5+ namespace MarkItDown . Core . Converters ;
6+
/// <summary>
/// Converter for Microsoft Word (.docx) files to Markdown using DocumentFormat.OpenXml.
/// </summary>
/// <remarks>
/// Only top-level paragraphs and tables of the document body are converted.
/// Paragraphs whose style id is <c>heading</c> followed by a positive number become
/// Markdown headings; bold and italic runs become <c>**</c>/<c>*</c> emphasis; tables
/// become pipe tables whose first non-empty row is used as the header row.
/// </remarks>
public sealed class DocxConverter : IDocumentConverter
{
    /// <summary>Style-id prefix (compared case-insensitively) that marks a heading paragraph.</summary>
    private const string HeadingStylePrefix = "heading";

    /// <summary>Deepest heading level Markdown can express; deeper Word headings are clamped to it.</summary>
    private const int MaxMarkdownHeadingLevel = 6;

    /// <summary>Number of leading bytes inspected when sniffing for a ZIP container.</summary>
    private const int ZipSignatureLength = 4;

    /// <summary>Number of leading non-blank lines searched for a Markdown heading to use as title.</summary>
    private const int TitleHeadingSearchLines = 10;

    /// <summary>Number of leading non-blank lines considered as a fallback title.</summary>
    private const int TitleFallbackSearchLines = 5;

    private static readonly char[] LineSeparators = { '\r', '\n' };

    private static readonly HashSet<string> AcceptedExtensions = new(StringComparer.OrdinalIgnoreCase)
    {
        ".docx"
    };

    private static readonly HashSet<string> AcceptedMimeTypes = new(StringComparer.OrdinalIgnoreCase)
    {
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    };

    public int Priority => 210; // Between PDF and plain text

    /// <summary>
    /// Determines from metadata alone whether the input looks like a DOCX file.
    /// </summary>
    /// <param name="streamInfo">Metadata whose extension and MIME type are inspected.</param>
    /// <returns>
    /// <see langword="true"/> when the extension or the MIME type is one of the accepted
    /// values (case-insensitive); otherwise <see langword="false"/>.
    /// </returns>
    public bool AcceptsInput(StreamInfo streamInfo)
    {
        // Both lookup sets use OrdinalIgnoreCase, so no case normalisation is required.
        var extension = streamInfo.Extension;
        if (extension is not null && AcceptedExtensions.Contains(extension))
        {
            return true;
        }

        var mimeType = streamInfo.MimeType;
        return mimeType is not null && AcceptedMimeTypes.Contains(mimeType);
    }

    /// <summary>
    /// Determines whether this converter should handle the stream, combining the metadata
    /// check with a best-effort sniff of the ZIP signature.
    /// </summary>
    /// <param name="stream">The candidate content. Its position is restored after sniffing.</param>
    /// <param name="streamInfo">Metadata describing the stream.</param>
    /// <param name="cancellationToken">Not used by this method.</param>
    /// <returns>
    /// <see langword="false"/> when the metadata does not match or when the first four bytes
    /// could be read and are not a ZIP signature; <see langword="true"/> otherwise, including
    /// when the stream cannot be inspected (not seekable, too short, or the read failed).
    /// </returns>
    public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
    {
        if (!AcceptsInput(streamInfo))
        {
            return false;
        }

        // The header can only be inspected non-destructively on a seekable stream.
        if (!stream.CanSeek || stream.Length <= ZipSignatureLength)
        {
            return true;
        }

        var originalPosition = stream.Position;
        try
        {
            stream.Position = 0;

            // Stream.Read may legally return fewer bytes than requested, so keep reading
            // until the signature buffer is full or the stream ends.
            var signature = new byte[ZipSignatureLength];
            var filled = 0;
            while (filled < signature.Length)
            {
                var read = stream.Read(signature, filled, signature.Length - filled);
                if (read == 0)
                {
                    break;
                }

                filled += read;
            }

            if (filled == signature.Length)
            {
                return IsZipSignature(signature);
            }
        }
        catch (Exception)
        {
            // Sniffing is best-effort: if the header cannot be read, fall back to the
            // metadata decision and let the conversion itself report any real problem.
        }
        finally
        {
            stream.Position = originalPosition;
        }

        return true;
    }

    /// <summary>
    /// Converts a DOCX stream to Markdown.
    /// </summary>
    /// <param name="stream">The DOCX content. A seekable stream is rewound to its start first.</param>
    /// <param name="streamInfo">Metadata describing the stream; not used by the conversion itself.</param>
    /// <param name="cancellationToken">Token checked between body elements while converting.</param>
    /// <returns>The Markdown text together with a title derived from it (may be <see langword="null"/>).</returns>
    /// <exception cref="FileConversionException">
    /// The document could not be opened or processed. The original exception is attached as inner exception.
    /// </exception>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled. Cancellation is deliberately not wrapped
    /// so that callers can tell it apart from a conversion failure.
    /// </exception>
    public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
    {
        try
        {
            if (stream.CanSeek)
            {
                stream.Position = 0;
            }

            var markdown = await ExtractTextFromDocxAsync(stream, cancellationToken).ConfigureAwait(false);
            return new DocumentConverterResult(markdown, ExtractTitle(markdown));
        }
        catch (Exception ex) when (ex is not MarkItDownException and not OperationCanceledException)
        {
            throw new FileConversionException($"Failed to convert DOCX file: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Returns whether the four leading bytes are one of the ZIP record signatures:
    /// <c>PK\x03\x04</c>, <c>PK\x05\x06</c> or <c>PK\x07\x08</c>.
    /// </summary>
    private static bool IsZipSignature(byte[] header)
    {
        if (header.Length < ZipSignatureLength || header[0] != 0x50 || header[1] != 0x4B)
        {
            return false;
        }

        return (header[2] == 0x03 && header[3] == 0x04)
            || (header[2] == 0x05 && header[3] == 0x06)
            || (header[2] == 0x07 && header[3] == 0x08);
    }

    /// <summary>
    /// Opens the document read-only on a thread-pool thread and renders its body to Markdown.
    /// </summary>
    /// <returns>The trimmed Markdown; empty when the document has no body.</returns>
    private static async Task<string> ExtractTextFromDocxAsync(Stream stream, CancellationToken cancellationToken)
    {
        var markdown = await Task.Run(() =>
        {
            using var wordDocument = WordprocessingDocument.Open(stream, false);
            var body = wordDocument.MainDocumentPart?.Document?.Body;

            var result = new StringBuilder();
            if (body is not null)
            {
                ProcessBodyElements(body, result, cancellationToken);
            }

            return result.ToString();
        }, cancellationToken).ConfigureAwait(false);

        return markdown.Trim();
    }

    /// <summary>
    /// Renders each direct child of the body that is a paragraph or a table; other element
    /// types are skipped. Cancellation is checked before every element.
    /// </summary>
    private static void ProcessBodyElements(Body body, StringBuilder result, CancellationToken cancellationToken)
    {
        foreach (var element in body.Elements())
        {
            cancellationToken.ThrowIfCancellationRequested();

            switch (element)
            {
                case Paragraph paragraph:
                    ProcessParagraph(paragraph, result);
                    break;
                case Table table:
                    ProcessTable(table, result);
                    break;
                // Add more element types as needed
            }
        }
    }

    /// <summary>
    /// Appends one paragraph followed by a blank line. Paragraphs without visible text are
    /// skipped. Heading paragraphs are prefixed with <c>#</c> markers and rendered without
    /// bold/italic emphasis.
    /// </summary>
    private static void ProcessParagraph(Paragraph paragraph, StringBuilder result)
    {
        var headingLevel = GetHeadingLevel(paragraph);
        var isHeading = headingLevel > 0;

        var paragraphText = new StringBuilder();
        foreach (var run in EnumerateRuns(paragraph))
        {
            AppendRunMarkdown(run, !isHeading, paragraphText);
        }

        var finalText = paragraphText.ToString().Trim();
        if (finalText.Length == 0)
        {
            return;
        }

        if (isHeading)
        {
            result.Append('#', Math.Min(headingLevel, MaxMarkdownHeadingLevel)).Append(' ');
        }

        result.AppendLine(finalText);
        result.AppendLine();
    }

    /// <summary>
    /// Returns the heading level encoded in the paragraph's style id (for example
    /// <c>Heading2</c> gives 2), or 0 when the paragraph has no style id, the id does not
    /// start with <c>heading</c>, or no number follows the prefix.
    /// </summary>
    private static int GetHeadingLevel(Paragraph paragraph)
    {
        var styleId = paragraph.ParagraphProperties?.ParagraphStyleId?.Val?.Value;
        if (styleId is null || !styleId.StartsWith(HeadingStylePrefix, StringComparison.OrdinalIgnoreCase))
        {
            return 0;
        }

        // Style ids are machine-readable, so parse with the invariant culture. Surrounding
        // white space is tolerated so that an id such as "heading 1" is still recognised.
        const System.Globalization.NumberStyles styles =
            System.Globalization.NumberStyles.AllowLeadingWhite | System.Globalization.NumberStyles.AllowTrailingWhite;

        var suffix = styleId.Substring(HeadingStylePrefix.Length);
        return int.TryParse(suffix, styles, System.Globalization.CultureInfo.InvariantCulture, out var level)
            ? level
            : 0;
    }

    /// <summary>
    /// Yields, in document order, the runs that are direct children of the paragraph and the
    /// runs directly inside its hyperlinks, so that link text is not lost.
    /// </summary>
    private static IEnumerable<Run> EnumerateRuns(Paragraph paragraph)
    {
        foreach (var child in paragraph.Elements())
        {
            switch (child)
            {
                case Run run:
                    yield return run;
                    break;
                case Hyperlink hyperlink:
                    foreach (var linkedRun in hyperlink.Elements<Run>())
                    {
                        yield return linkedRun;
                    }

                    break;
            }
        }
    }

    /// <summary>
    /// Appends the Markdown for one run: text (optionally emphasised), tabs and line breaks.
    /// </summary>
    /// <param name="run">The run to render.</param>
    /// <param name="allowEmphasis">When <see langword="false"/>, bold/italic markers are never emitted.</param>
    /// <param name="target">Builder receiving the output.</param>
    private static void AppendRunMarkdown(Run run, bool allowEmphasis, StringBuilder target)
    {
        var properties = run.RunProperties;
        var bold = allowEmphasis && IsToggleOn(properties?.Bold);
        var italic = allowEmphasis && IsToggleOn(properties?.Italic);

        foreach (var child in run.Elements())
        {
            switch (child)
            {
                case Text text:
                    target.Append(ApplyEmphasis(text.Text, bold, italic));
                    break;
                case TabChar:
                    target.Append('\t');
                    break;
                case Break:
                    target.AppendLine();
                    break;
            }
        }
    }

    /// <summary>
    /// Interprets an on/off run property: the element being present means "on" unless its
    /// <c>val</c> attribute explicitly switches it off (e.g. <c>&lt;w:b w:val="false"/&gt;</c>).
    /// </summary>
    private static bool IsToggleOn(OnOffType? toggle) =>
        toggle is not null && (toggle.Val?.Value ?? true);

    /// <summary>
    /// Wraps the visible part of <paramref name="text"/> in Markdown emphasis markers.
    /// Leading and trailing white space stays outside the markers because Markdown does not
    /// recognise emphasis delimiters that are adjacent to white space on the inside.
    /// White-space-only text is returned unchanged.
    /// </summary>
    private static string ApplyEmphasis(string text, bool bold, bool italic)
    {
        if ((!bold && !italic) || string.IsNullOrWhiteSpace(text))
        {
            return text;
        }

        var marker = bold
            ? (italic ? "***" : "**")
            : "*";

        var core = text.Trim();
        var leadingLength = text.Length - text.TrimStart().Length;
        var leading = text.Substring(0, leadingLength);
        var trailing = text.Substring(leadingLength + core.Length);

        return $"{leading}{marker}{core}{marker}{trailing}";
    }

    /// <summary>
    /// Appends the table as a Markdown pipe table surrounded by blank lines. Rows without
    /// cells are skipped; the header separator is written after the first row that has cells
    /// and has as many columns as that row.
    /// </summary>
    private static void ProcessTable(Table table, StringBuilder result)
    {
        var rows = table.Elements<TableRow>().ToList();
        if (rows.Count == 0)
        {
            return;
        }

        result.AppendLine();

        var headerSeparatorPending = true;
        foreach (var row in rows)
        {
            var cells = row.Elements<TableCell>().ToList();
            if (cells.Count == 0)
            {
                continue;
            }

            result.Append('|');
            foreach (var cell in cells)
            {
                // A literal pipe would terminate the cell, so it is escaped.
                var cellText = ExtractCellText(cell).Replace("|", "\\|").Trim();
                result.Append(' ').Append(cellText).Append(" |");
            }

            result.AppendLine();

            if (headerSeparatorPending)
            {
                result.Append('|');
                for (var i = 0; i < cells.Count; i++)
                {
                    result.Append(" --- |");
                }

                result.AppendLine();
                headerSeparatorPending = false;
            }
        }

        result.AppendLine();
    }

    /// <summary>
    /// Returns the plain text of a table cell on a single line. Paragraphs are separated by
    /// one space; tabs and line breaks inside a run are also rendered as one space so that
    /// neighbouring words do not run together and the table row is not broken.
    /// </summary>
    private static string ExtractCellText(TableCell cell)
    {
        var cellText = new StringBuilder();

        foreach (var paragraph in cell.Elements<Paragraph>())
        {
            foreach (var run in EnumerateRuns(paragraph))
            {
                foreach (var child in run.Elements())
                {
                    switch (child)
                    {
                        case Text text:
                            cellText.Append(text.Text);
                            break;
                        case TabChar:
                        case Break:
                            cellText.Append(' ');
                            break;
                    }
                }
            }

            if (cellText.Length > 0)
            {
                cellText.Append(' ');
            }
        }

        return cellText.ToString().Trim();
    }

    /// <summary>
    /// Derives a title from the generated Markdown.
    /// </summary>
    /// <param name="markdown">The converted document.</param>
    /// <returns>
    /// The text of the first line starting with <c>#</c> among the first ten non-blank lines;
    /// otherwise the first of the first five non-blank lines that is longer than 5 and shorter
    /// than 100 characters; otherwise <see langword="null"/>.
    /// </returns>
    private static string? ExtractTitle(string markdown)
    {
        if (string.IsNullOrWhiteSpace(markdown))
        {
            return null;
        }

        // Split on both CR and LF and trim the entries so that blank lines of CRLF text do
        // not survive as "\r" entries and use up the search windows below.
        var lines = markdown.Split(
            LineSeparators,
            StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

        foreach (var line in lines.Take(TitleHeadingSearchLines))
        {
            if (line.StartsWith('#'))
            {
                return line.TrimStart('#').Trim();
            }
        }

        foreach (var line in lines.Take(TitleFallbackSearchLines))
        {
            if (line.Length > 5 && line.Length < 100)
            {
                return line;
            }
        }

        return null;
    }
}
0 commit comments