Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 15 additions & 37 deletions recipes/mojo_csv/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

<!-- ![mojo_csv_logo](./mojo_csv_logo.png) -->
<image src='./mojo_csv_logo.png' width='900'/>

Expand All @@ -17,14 +16,12 @@ Add the Modular community channel (https://repo.prefix.dev/modular-community) to
channels = ["conda-forge", "https://conda.modular.com/max", "https://repo.prefix.dev/modular-community"]
```


```sh
pixi add mojo_csv
```

## Usage


By default, the reader uses all logical cores minus 2.
```mojo
CsvReader(
Expand All @@ -50,7 +47,6 @@ fn main() raises:
print(reader[i])
```


### Delimiters

```mojo
Expand All @@ -71,7 +67,6 @@ var reader = CsvReader(
)
```


### Attributes

```mojo
Expand All @@ -93,10 +88,7 @@ reader[0] # first element
```

### Performance

- average times over 100-1k iterations
- AMD 7950x@5.8ghz
- single-threaded
__See BENCHMARK.md for expanded info__

micro file benchmark (3 rows)
mini (100 rows)
Expand Down Expand Up @@ -126,34 +118,20 @@ average time in ms for large file:
878.6 ms
```

#### CSV Reader Performance Comparison
```
Small file benchmark (1,000 rows):
Single-threaded:
Average time: 0.455 ms
Multi-threaded:
Average time: 0.3744 ms
Speedup: 1.22 x

Medium file benchmark (100,000 rows):
Single-threaded:
Average time: 37.37 ms
Multi-threaded:
Average time: 24.46 ms
Speedup: 1.53 x

Large file benchmark (2,000,000 rows):
Single-threaded:
Average time: 1210.3 ms
Multi-threaded:
Average time: 863.9 ms
Speedup: 1.4 x

Summary:
Small file speedup: 1.22 x
Medium file speedup: 1.53 x
Large file speedup: 1.4 x
```
## Experimental
DictCsvReader and CsvWriter are in Beta

=== DictCsvReader Performance ===
-----------------------------------
Small file benchmark (1,000 rows):
Small Single-threaded: 0.6154 ms
Small Threaded: 0.5044 ms
-----------------------------------
Medium file benchmark (100,000 rows):
Medium: 42.04 ms
-----------------------------------
Large file benchmark (2,000,000 rows):
Large: 1280.5 ms


## Future Improvements
Expand Down
16 changes: 9 additions & 7 deletions recipes/mojo_csv/recipe.yaml
Original file line number Diff line number Diff line change
@@ -1,33 +1,35 @@
context:
version: 1.5.0

version: 1.6.2
package:
name: "mojo_csv"
version: ${{ version }}

source:
- git: https://github.com/Phelsong/mojo_csv.git
rev: b3a9dc4422efbea7a94939e3a48ff4a3b03e3505
rev: 8e67c53570d3a9beb7c99e81a40079bc1328e692

build:
number: 0
number: 3
script:
- mojo package src -o ${{ PREFIX }}/lib/mojo/mojo_csv.mojopkg

requirements:
host:
- max >=25.4.0,<26
- max >=26.0,<27.0
run:
- ${{ pin_compatible('max') }}

tests:
- script:
- if: unix
then:
- mojo test tests
- mojo run -I $PREFIX/lib/mojo/mojo_csv.mojopkg test_pack.mojo
files:
recipe:
- mojo_csv
- test_pack.mojo
requirements:
run:
- max >=26.0,<27.0

about:
homepage: https://github.com/Phelsong/mojo_csv
Expand Down
174 changes: 174 additions & 0 deletions recipes/mojo_csv/test_pack.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
from std.pathlib import Path, cwd
from std.sys import exit
from std.testing import assert_true

from mojo_csv import CsvReader


fn test_basic_lf() raises:
    """Test basic comma-separated parsing with LF line endings.

    Covers quoted fields containing the delimiter, doubled-quote
    escapes, and preservation of surrounding whitespace.
    """
    var expected = List[String]()
    expected.append("item1")
    expected.append("item2")
    expected.append('"ite,em3"')
    expected.append('"p""ic"')
    expected.append(" pi c")
    expected.append("pic")
    expected.append("r_i_1")
    expected.append('"r_i_2"""')
    expected.append("r_i_3")

    var in_csv: Path = cwd().joinpath("test_data_lf.csv")
    in_csv.write_text(
        'item1,item2,"ite,em3"\n'
        '"p""ic", pi c,pic,\n'
        'r_i_1,"r_i_2""",r_i_3,\n'
    )
    var rd = CsvReader(in_csv)
    # Verify shape BEFORE comparing elements: if the parser yields more
    # than 9 elements, the loop below would index `expected` out of
    # bounds and mask the real miscount with an opaque error.
    assert_true(rd.col_count == 3)
    assert_true(rd.row_count == 3)
    assert_true(len(rd.elements) == 9)
    for x in range(len(rd)):
        assert_true(
            rd.elements[x] == expected[x],
            String("[{0}] != expected [{1}] at index {2}").format(
                rd.elements[x], expected[x], x
            ),
        )
    print("✅ test_basic_lf passed")


fn test_crlf_single_threaded() raises:
    """Test CRLF line endings parsed single-threaded.

    Regression test: `ord` was once called on the two-character string
    "\\r\\n" (it accepts a single character only), and after a \\r the
    column start pointed at the \\n byte, leaking it into the next
    element.
    """
    var want = List[String]()
    want.append("a")
    want.append("b")
    want.append("c")
    want.append("d")
    want.append("e")
    want.append("f")

    var csv_path: Path = cwd().joinpath("test_data_crlf.csv")
    csv_path.write_text("a,b,c\r\nd,e,f\r\n")

    var reader = CsvReader(csv_path, num_threads=1)
    assert_true(reader.col_count == 3, "crlf col_count mismatch")
    assert_true(reader.row_count == 2, "crlf row_count mismatch")
    assert_true(len(reader) == 6, "crlf element count mismatch")
    for idx in range(len(want)):
        assert_true(
            reader[idx] == want[idx],
            String("crlf [{0}] != expected [{1}] at index {2}").format(
                reader[idx], want[idx], idx
            ),
        )
    print("✅ test_crlf_single_threaded passed")


fn test_threaded_vs_single_threaded() raises:
    """Test that threaded and single-threaded parsers agree on LF input.

    Regression test: single-threaded CRLF handling once leaked the \\n
    byte into the next element, diverging from the threaded output.
    """
    # The file must exceed 1000 bytes so the reader actually goes
    # multi-threaded; 50 data rows plus a header is comfortably past that.
    var payload = String("header1,header2,header3\n")
    for row in range(50):
        payload += String("val{0},val{0},val{0}\n").format(row)

    var data_path: Path = cwd().joinpath("test_data_threaded.csv")
    data_path.write_text(payload)

    var st_reader = CsvReader(data_path, num_threads=1)
    var mt_reader = CsvReader(data_path, num_threads=4)

    assert_true(
        st_reader.row_count == mt_reader.row_count,
        String("row_count: single={0} threaded={1}").format(
            st_reader.row_count, mt_reader.row_count
        ),
    )
    assert_true(
        st_reader.col_count == mt_reader.col_count,
        String("col_count: single={0} threaded={1}").format(
            st_reader.col_count, mt_reader.col_count
        ),
    )
    assert_true(
        len(st_reader) == len(mt_reader),
        String("length: single={0} threaded={1}").format(
            len(st_reader), len(mt_reader)
        ),
    )

    # Compare element-by-element over the shorter of the two, so a
    # length mismatch above doesn't also crash this loop.
    var n_checked = min(len(st_reader), len(mt_reader))
    for idx in range(n_checked):
        assert_true(
            st_reader[idx] == mt_reader[idx],
            String("element[{0}]: single='{1}' threaded='{2}'").format(
                idx, st_reader[idx], mt_reader[idx]
            ),
        )

    # Headers must also agree between the two parse paths.
    assert_true(len(st_reader.headers) == len(mt_reader.headers))
    for idx in range(len(st_reader.headers)):
        assert_true(
            st_reader.headers[idx] == mt_reader.headers[idx],
            String("header[{0}]: single='{1}' threaded='{2}'").format(
                idx, st_reader.headers[idx], mt_reader.headers[idx]
            ),
        )
    print("✅ test_threaded_vs_single_threaded passed")


fn test_crlf_threaded_vs_single() raises:
    """Test CRLF consistency between single-threaded and threaded parsers."""
    # CRLF line endings on a file big enough (>1000 bytes) to trigger
    # the multi-threaded parse path.
    var payload = String("header1,header2,header3\r\n")
    for row in range(50):
        payload += String("val{0},val{0},val{0}\r\n").format(row)

    var data_path: Path = cwd().joinpath("test_data_crlf_threaded.csv")
    data_path.write_text(payload)

    var st_reader = CsvReader(data_path, num_threads=1)
    var mt_reader = CsvReader(data_path, num_threads=4)

    assert_true(
        st_reader.row_count == mt_reader.row_count,
        String("crlf row_count: single={0} threaded={1}").format(
            st_reader.row_count, mt_reader.row_count
        ),
    )
    assert_true(
        len(st_reader) == len(mt_reader),
        String("crlf length: single={0} threaded={1}").format(
            len(st_reader), len(mt_reader)
        ),
    )

    # Bound the loop by the shorter reader so a length mismatch above
    # doesn't also raise an out-of-bounds error here.
    var n_checked = min(len(st_reader), len(mt_reader))
    for idx in range(n_checked):
        assert_true(
            st_reader[idx] == mt_reader[idx],
            String("crlf element[{0}]: single='{1}' threaded='{2}'").format(
                idx, st_reader[idx], mt_reader[idx]
            ),
        )
    print("✅ test_crlf_threaded_vs_single passed")


fn main():
    """Entry point: run every CI test, exiting non-zero on failure.

    The recipe's test step runs this via `mojo run`; without a non-zero
    exit code a failing test would still leave the process at status 0
    and CI would report success on a regression. The caught error is
    also printed so the failing assertion message is not swallowed.
    """
    try:
        test_basic_lf()
        test_crlf_single_threaded()
        test_threaded_vs_single_threaded()
        test_crlf_threaded_vs_single()
        print("\n✅ All CI tests PASSED")
    except e:
        # Surface the actual assertion message instead of discarding it.
        print("❌ CI test FAILED:", e)
        exit(1)
Loading