Skip to content

Commit b6ded3f

Browse files
committed
test files
1 parent b63beb7 commit b6ded3f

37 files changed

+3467
-556
lines changed

src/MarkItDown/Converters/AudioConverter.cs

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
using System.Collections.Generic;
12
using System.Text;
3+
using System.Threading;
4+
using System.Threading.Tasks;
25

36
namespace MarkItDown.Converters;
47

@@ -39,13 +42,19 @@ public sealed class AudioConverter : IDocumentConverter
3942
"BitsPerSample",
4043
};
4144

42-
private readonly string? _exifToolPath;
43-
private readonly Func<byte[], StreamInfo, CancellationToken, Task<string?>>? _transcribeAsync;
45+
private readonly IAudioMetadataExtractor metadataExtractor;
46+
private readonly IAudioTranscriber transcriber;
4447

4548
/// <summary>
/// Creates a converter that reads metadata through <c>ExifToolAudioMetadataExtractor</c> and,
/// when a delegate is supplied, transcribes audio through it.
/// </summary>
/// <param name="exifToolPath">Forwarded unchanged to the ExifTool-backed metadata extractor; may be null.</param>
/// <param name="transcribeAsync">Optional transcription callback; when null the no-op transcriber is used.</param>
public AudioConverter(string? exifToolPath = null, Func<byte[], StreamInfo, CancellationToken, Task<string?>>? transcribeAsync = null)
    : this(
        new ExifToolAudioMetadataExtractor(exifToolPath),
        transcribeAsync is not null
            ? new DelegateAudioTranscriber(transcribeAsync)
            : NoOpAudioTranscriber.Instance)
{
}
53+
54+
/// <summary>
/// Creates a converter from explicit metadata-extraction and transcription components.
/// </summary>
/// <param name="metadataExtractor">Component used to read audio metadata.</param>
/// <param name="transcriber">Component used to produce a transcript.</param>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
internal AudioConverter(IAudioMetadataExtractor metadataExtractor, IAudioTranscriber transcriber)
{
    if (metadataExtractor is null)
    {
        throw new ArgumentNullException(nameof(metadataExtractor));
    }

    if (transcriber is null)
    {
        throw new ArgumentNullException(nameof(transcriber));
    }

    this.metadataExtractor = metadataExtractor;
    this.transcriber = transcriber;
}
5059

5160
public int Priority => 460;
@@ -76,7 +85,7 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf
7685
await stream.CopyToAsync(memory, cancellationToken).ConfigureAwait(false);
7786
var bytes = memory.ToArray();
7887

79-
var metadata = await ExifToolMetadataExtractor.ExtractAsync(bytes, streamInfo.Extension, _exifToolPath, cancellationToken).ConfigureAwait(false);
88+
var metadata = await metadataExtractor.ExtractAsync(bytes, streamInfo, cancellationToken).ConfigureAwait(false);
8089
var builder = new StringBuilder();
8190

8291
foreach (var field in MetadataFields)
@@ -110,18 +119,65 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf
110119

111120
/// <summary>
/// Runs the configured transcriber as a best-effort step.
/// </summary>
/// <param name="audioBytes">Audio payload handed to the transcriber.</param>
/// <param name="streamInfo">Stream description handed to the transcriber.</param>
/// <param name="cancellationToken">Token forwarded to the transcriber.</param>
/// <returns>
/// Whatever the transcriber returns, or <c>null</c> when the transcriber throws.
/// </returns>
/// <exception cref="OperationCanceledException">
/// The transcriber observed cancellation and <paramref name="cancellationToken"/> is cancelled.
/// </exception>
private async Task<string?> TryTranscribeAsync(byte[] audioBytes, StreamInfo streamInfo, CancellationToken cancellationToken)
{
    try
    {
        return await transcriber.TranscribeAsync(audioBytes, streamInfo, cancellationToken).ConfigureAwait(false);
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // The caller asked to stop: surface the cancellation instead of reporting
        // it as "no transcript available".
        throw;
    }
    catch
    {
        // Transcription is optional; any other failure degrades to "no transcript".
        return null;
    }
}
117131

118-
try
132+
/// <summary>
/// Abstraction over the component that reads metadata from an audio payload.
/// </summary>
internal interface IAudioMetadataExtractor
{
    /// <summary>
    /// Reads metadata for the supplied audio bytes.
    /// </summary>
    /// <param name="audioBytes">The audio content.</param>
    /// <param name="streamInfo">Description of the stream the bytes came from.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>A read-only map of string keys to string values.</returns>
    Task<IReadOnlyDictionary<string, string>> ExtractAsync(
        byte[] audioBytes,
        StreamInfo streamInfo,
        CancellationToken cancellationToken);
}
136+
137+
/// <summary>
/// Abstraction over the component that turns an audio payload into text.
/// </summary>
internal interface IAudioTranscriber
{
    /// <summary>
    /// Produces a transcript for the supplied audio bytes.
    /// </summary>
    /// <param name="audioBytes">The audio content.</param>
    /// <param name="streamInfo">Description of the stream the bytes came from.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The transcript text; the result may be <c>null</c>.</returns>
    Task<string?> TranscribeAsync(
        byte[] audioBytes,
        StreamInfo streamInfo,
        CancellationToken cancellationToken);
}
141+
142+
/// <summary>
/// <see cref="IAudioMetadataExtractor"/> that delegates to the static
/// <c>ExifToolMetadataExtractor.ExtractAsync</c> helper.
/// </summary>
private sealed class ExifToolAudioMetadataExtractor : IAudioMetadataExtractor
{
    // Passed through to ExifToolMetadataExtractor on every call; may be null.
    private readonly string? toolPath;

    public ExifToolAudioMetadataExtractor(string? exifToolPath) => toolPath = exifToolPath;

    /// <summary>
    /// Forwards the bytes, the stream's extension, the configured tool path and the token
    /// to <c>ExifToolMetadataExtractor.ExtractAsync</c> and returns its result.
    /// </summary>
    public async Task<IReadOnlyDictionary<string, string>> ExtractAsync(byte[] audioBytes, StreamInfo streamInfo, CancellationToken cancellationToken)
    {
        return await ExifToolMetadataExtractor
            .ExtractAsync(audioBytes, streamInfo.Extension, toolPath, cancellationToken)
            .ConfigureAwait(false);
    }
}
160+
161+
/// <summary>
/// <see cref="IAudioTranscriber"/> that forwards every call to a caller-supplied delegate.
/// </summary>
private sealed class DelegateAudioTranscriber : IAudioTranscriber
{
    // The transcription callback itself (invoked once per TranscribeAsync call).
    private readonly Func<byte[], StreamInfo, CancellationToken, Task<string?>> callback;

    public DelegateAudioTranscriber(Func<byte[], StreamInfo, CancellationToken, Task<string?>> factory)
    {
        callback = factory;
    }

    /// <summary>Invokes the wrapped delegate with the same arguments and returns its task.</summary>
    public Task<string?> TranscribeAsync(byte[] audioBytes, StreamInfo streamInfo, CancellationToken cancellationToken)
    {
        return callback(audioBytes, streamInfo, cancellationToken);
    }
}
171+
172+
/// <summary>
/// <see cref="IAudioTranscriber"/> that never produces a transcript.
/// </summary>
private sealed class NoOpAudioTranscriber : IAudioTranscriber
{
    // Private constructor: the only instance is the one exposed through Instance.
    private NoOpAudioTranscriber()
    {
    }

    /// <summary>The shared instance.</summary>
    public static NoOpAudioTranscriber Instance { get; } = new NoOpAudioTranscriber();

    /// <summary>Ignores its arguments and returns a completed task whose result is <c>null</c>.</summary>
    public Task<string?> TranscribeAsync(byte[] audioBytes, StreamInfo streamInfo, CancellationToken cancellationToken)
    {
        return Task.FromResult<string?>(null);
    }
}
127183
}

src/MarkItDown/Converters/CsvConverter.cs

Lines changed: 45 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
using System.Collections.Generic;
2+
using System.Linq;
23
using System.Text;
4+
using Sylvan.Data.Csv;
35
using ManagedCode.MimeTypes;
46

57
namespace MarkItDown.Converters;
@@ -47,21 +49,42 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf
4749
if (stream.CanSeek)
4850
stream.Position = 0;
4951

50-
// Read the content
5152
using var reader = new StreamReader(stream, streamInfo.Charset ?? Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true);
52-
var content = await reader.ReadToEndAsync(cancellationToken);
53+
using var csv = CsvDataReader.Create(reader, new CsvDataReaderOptions
54+
{
55+
HasHeaders = false,
56+
BufferSize = 64 * 1024,
57+
});
5358

54-
if (string.IsNullOrWhiteSpace(content))
59+
if (!await csv.ReadAsync(cancellationToken).ConfigureAwait(false))
60+
{
5561
return new DocumentConverterResult(string.Empty);
62+
}
63+
64+
var rows = new List<string[]>();
65+
var maxColumns = 0;
66+
67+
do
68+
{
69+
cancellationToken.ThrowIfCancellationRequested();
5670

57-
// Parse CSV content
58-
var rows = ParseCsvContent(content);
71+
var values = new string[csv.FieldCount];
72+
for (var i = 0; i < csv.FieldCount; i++)
73+
{
74+
values[i] = EscapeMarkdownTableCell(csv.IsDBNull(i) ? string.Empty : csv.GetString(i) ?? string.Empty);
75+
}
76+
77+
maxColumns = Math.Max(maxColumns, values.Length);
78+
rows.Add(values);
79+
}
80+
while (await csv.ReadAsync(cancellationToken).ConfigureAwait(false));
5981

6082
if (rows.Count == 0)
83+
{
6184
return new DocumentConverterResult(string.Empty);
85+
}
6286

63-
// Create markdown table
64-
var markdownTable = CreateMarkdownTable(rows);
87+
var markdownTable = CreateMarkdownTable(rows, maxColumns);
6588

6689
return new DocumentConverterResult(
6790
markdown: markdownTable,
@@ -74,100 +97,37 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf
7497
}
7598
}
7699

77-
private static List<List<string>> ParseCsvContent(string content)
78-
{
79-
var rows = new List<List<string>>();
80-
var lines = content.Split(new char[] { '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries);
81-
82-
foreach (var line in lines)
83-
{
84-
if (string.IsNullOrWhiteSpace(line))
85-
continue;
86-
87-
var row = ParseCsvLine(line.Trim());
88-
if (row.Count > 0)
89-
rows.Add(row);
90-
}
91-
92-
return rows;
93-
}
94-
95-
private static List<string> ParseCsvLine(string line)
96-
{
97-
var fields = new List<string>();
98-
var currentField = new StringBuilder();
99-
var inQuotes = false;
100-
var i = 0;
101-
102-
while (i < line.Length)
103-
{
104-
var c = line[i];
105-
106-
if (c == '"')
107-
{
108-
if (inQuotes && i + 1 < line.Length && line[i + 1] == '"')
109-
{
110-
// Escaped quote
111-
currentField.Append('"');
112-
i += 2;
113-
}
114-
else
115-
{
116-
// Toggle quote state
117-
inQuotes = !inQuotes;
118-
i++;
119-
}
120-
}
121-
else if (c == ',' && !inQuotes)
122-
{
123-
// Field separator
124-
fields.Add(currentField.ToString());
125-
currentField.Clear();
126-
i++;
127-
}
128-
else
129-
{
130-
currentField.Append(c);
131-
i++;
132-
}
133-
}
134-
135-
// Add the last field
136-
fields.Add(currentField.ToString());
137-
138-
return fields;
139-
}
140-
141-
private static string CreateMarkdownTable(List<List<string>> rows)
100+
/// <summary>
/// Renders the rows as a pipe-delimited Markdown table, using the first row as the header.
/// </summary>
/// <param name="rows">Table rows; each row is padded or truncated to <paramref name="maxColumns"/> cells via <c>PadRow</c>.</param>
/// <param name="maxColumns">Number of columns emitted for every line, including the separator.</param>
/// <returns>The table text with trailing whitespace trimmed, or an empty string when there are no rows.</returns>
private static string CreateMarkdownTable(List<string[]> rows, int maxColumns)
{
    if (rows.Count == 0)
    {
        return string.Empty;
    }

    static string FormatLine(IEnumerable<string> cells) => "| " + string.Join(" | ", cells) + " |";

    var table = new StringBuilder();

    for (var rowIndex = 0; rowIndex < rows.Count; rowIndex++)
    {
        table.AppendLine(FormatLine(PadRow(rows[rowIndex], maxColumns)));

        if (rowIndex == 0)
        {
            // The separator line goes directly under the header row.
            table.AppendLine(FormatLine(Enumerable.Repeat("---", maxColumns)));
        }
    }

    return table.ToString().TrimEnd();
}
170122

123+
/// <summary>
/// Lazily yields exactly <paramref name="maxColumns"/> cells for a row: the row's own values
/// while they last, then empty strings. Values beyond <paramref name="maxColumns"/> are not yielded.
/// </summary>
/// <param name="row">The source cells.</param>
/// <param name="maxColumns">Number of cells to produce.</param>
private static IEnumerable<string> PadRow(string[] row, int maxColumns)
{
    var column = 0;
    while (column < maxColumns)
    {
        var hasValue = column < row.Length;
        yield return hasValue ? row[column] : string.Empty;
        column++;
    }
}
130+
171131
private static string EscapeMarkdownTableCell(string cell)
172132
{
173133
if (string.IsNullOrEmpty(cell))

0 commit comments

Comments
 (0)