diff --git a/recipes/mojo_csv/README.md b/recipes/mojo_csv/README.md index 4c0b6bce..cf6354bd 100644 --- a/recipes/mojo_csv/README.md +++ b/recipes/mojo_csv/README.md @@ -1,4 +1,3 @@ - @@ -17,14 +16,12 @@ Add the Modular community channel (https://repo.prefix.dev/modular-community) to channels = ["conda-forge", "https://conda.modular.com/max", "https://repo.prefix.dev/modular-community"] ``` - ```sh pixi add mojo_csv ``` ## Usage - By default uses all logical cores - 2 ```mojo CsvReader( @@ -50,7 +47,6 @@ fn main() raises: print(reader[i]) ``` - ### Delimiters ```mojo @@ -71,7 +67,6 @@ var reader = CsvReader( ) ``` - ### Attributes ```mojo @@ -93,10 +88,7 @@ reader[0] # first element ``` ### Performance - -- average times over 100-1k iterations -- AMD 7950x@5.8ghz -- single-threaded +__See BENCHMARK.md for expanded info__ micro file benchmark (3 rows) mini (100 rows) @@ -126,34 +118,20 @@ average time in ms for large file: 878.6 ms ``` -#### CSV Reader Performance Comparison -``` -Small file benchmark (1,000 rows): -Single-threaded: -Average time: 0.455 ms -Multi-threaded: -Average time: 0.3744 ms -Speedup: 1.22 x - -Medium file benchmark (100,000 rows): -Single-threaded: -Average time: 37.37 ms -Multi-threaded: -Average time: 24.46 ms -Speedup: 1.53 x - -Large file benchmark (2,000,000 rows): -Single-threaded: -Average time: 1210.3 ms -Multi-threaded: -Average time: 863.9 ms -Speedup: 1.4 x - -Summary: -Small file speedup: 1.22 x -Medium file speedup: 1.53 x -Large file speedup: 1.4 x -``` +## Experimental +Dict Reader and CsvWriter are in Beta + +=== DictCsvReader Performance === +----------------------------------- +Small file benchmark (1,000 rows): +Small Single-threaded: 0.6154 ms +Small Threaded: 0.5044 ms +----------------------------------- +Medium file benchmark (100,000 rows): +Medium: 42.04 ms +----------------------------------- +Large file benchmark (2,000,000 rows): +Large: 1280.5 ms ## Future Improvements diff --git a/recipes/mojo_csv/recipe.yaml 
b/recipes/mojo_csv/recipe.yaml index c780a657..2c1edd73 100644 --- a/recipes/mojo_csv/recipe.yaml +++ b/recipes/mojo_csv/recipe.yaml @@ -1,22 +1,21 @@ context: - version: 1.5.0 - + version: 1.6.2 package: name: "mojo_csv" version: ${{ version }} source: - git: https://github.com/Phelsong/mojo_csv.git - rev: b3a9dc4422efbea7a94939e3a48ff4a3b03e3505 + rev: 8e67c53570d3a9beb7c99e81a40079bc1328e692 build: - number: 0 + number: 3 script: - mojo package src -o ${{ PREFIX }}/lib/mojo/mojo_csv.mojopkg requirements: host: - - max >=25.4.0,<26 + - max >=26.0,<27.0 run: - ${{ pin_compatible('max') }} @@ -24,10 +23,13 @@ tests: - script: - if: unix then: - - mojo test tests + - mojo run -I $PREFIX/lib/mojo/mojo_csv.mojopkg test_pack.mojo files: recipe: - - mojo_csv + - test_pack.mojo + requirements: + run: + - max >=26.0,<27.0 about: homepage: https://github.com/Phelsong/mojo_csv
diff --git a/recipes/mojo_csv/test_pack.mojo b/recipes/mojo_csv/test_pack.mojo
new file mode 100755
index 00000000..f6824056
--- /dev/null
+++ b/recipes/mojo_csv/test_pack.mojo
@@ -0,0 +1,218 @@
+"""Package smoke tests for mojo_csv, run by the recipe's test step."""
+
+from std.pathlib import Path, cwd
+from std.testing import assert_true
+
+from mojo_csv import CsvReader
+
+
+fn test_basic_lf() raises:
+    """Parse an LF-terminated fixture and compare every element.
+
+    The fixture contains a quoted field holding the delimiter, doubled
+    quotes, a field with leading whitespace and rows with a trailing comma.
+    It is written to test_data_lf.csv in the current working directory.
+
+    Raises:
+        If any assertion fails.
+    """
+    var expected = List[String]()
+    expected.append("item1")
+    expected.append("item2")
+    expected.append('"ite,em3"')
+    expected.append('"p""ic"')
+    expected.append(" pi c")
+    expected.append("pic")
+    expected.append("r_i_1")
+    expected.append('"r_i_2"""')
+    expected.append("r_i_3")
+
+    var in_csv: Path = cwd().joinpath("test_data_lf.csv")
+    in_csv.write_text(
+        'item1,item2,"ite,em3"\n'
+        '"p""ic", pi c,pic,\n'
+        'r_i_1,"r_i_2""",r_i_3,\n'
+    )
+    var rd = CsvReader(in_csv)
+    assert_true(rd.col_count == 3)
+    assert_true(rd.row_count == 3)
+    # Check the element count before indexing: if the parser yields extra
+    # elements the test fails here instead of indexing past `expected`.
+    assert_true(
+        len(rd.elements) == len(expected),
+        String("element count {0} != expected {1}").format(
+            len(rd.elements), len(expected)
+        ),
+    )
+    for x in range(len(expected)):
+        assert_true(
+            rd.elements[x] == expected[x],
+            String("[{0}] != expected [{1}] at index {2}").format(
+                rd.elements[x], expected[x], x
+            ),
+        )
+    print("✅ test_basic_lf passed")
+
+
+fn test_crlf_single_threaded() raises:
+    """Parse a CRLF-terminated fixture with num_threads=1.
+
+    Guards against the bug where ord(\"\\r\\n\") was incorrectly used
+    (ord takes a single char) and where col_start after \\r pointed
+    at the \\n byte, including it in the next element string.
+
+    Raises:
+        If any assertion fails.
+    """
+    var expected = List[String]()
+    expected.append("a")
+    expected.append("b")
+    expected.append("c")
+    expected.append("d")
+    expected.append("e")
+    expected.append("f")
+
+    var in_csv: Path = cwd().joinpath("test_data_crlf.csv")
+    in_csv.write_text("a,b,c\r\nd,e,f\r\n")
+
+    var rd = CsvReader(in_csv, num_threads=1)
+    assert_true(rd.col_count == 3, "crlf col_count mismatch")
+    assert_true(rd.row_count == 2, "crlf row_count mismatch")
+    assert_true(len(rd) == 6, "crlf element count mismatch")
+    for i in range(len(expected)):
+        assert_true(
+            rd[i] == expected[i],
+            String("crlf [{0}] != expected [{1}] at index {2}").format(
+                rd[i], expected[i], i
+            ),
+        )
+    print("✅ test_crlf_single_threaded passed")
+
+
+fn test_threaded_vs_single_threaded() raises:
+    """Check that num_threads=1 and num_threads=4 give identical results.
+
+    Guards against the bug where single-threaded CRLF handling included
+    the \\n byte in the next element, causing mismatches with threaded output.
+
+    Raises:
+        If any assertion fails.
+    """
+    # Multi-threaded parsing is expected to start only above ~1000 bytes
+    # (threshold taken from the original test note; confirm in CsvReader).
+    # Header + 100 rows is 1794 bytes; the previous 50 rows gave only 894.
+    var csv_content = String("header1,header2,header3\n")
+    for i in range(100):
+        csv_content += String("val{0},val{0},val{0}\n").format(i)
+
+    var in_csv: Path = cwd().joinpath("test_data_threaded.csv")
+    in_csv.write_text(csv_content)
+
+    var single = CsvReader(in_csv, num_threads=1)
+    var threaded = CsvReader(in_csv, num_threads=4)
+
+    assert_true(
+        single.row_count == threaded.row_count,
+        String("row_count: single={0} threaded={1}").format(
+            single.row_count, threaded.row_count
+        ),
+    )
+    assert_true(
+        single.col_count == threaded.col_count,
+        String("col_count: single={0} threaded={1}").format(
+            single.col_count, threaded.col_count
+        ),
+    )
+    assert_true(
+        len(single) == len(threaded),
+        String("length: single={0} threaded={1}").format(
+            len(single), len(threaded)
+        ),
+    )
+
+    var check_count = min(len(single), len(threaded))
+    for i in range(check_count):
+        assert_true(
+            single[i] == threaded[i],
+            String("element[{0}]: single='{1}' threaded='{2}'").format(
+                i, single[i], threaded[i]
+            ),
+        )
+
+    # Verify headers match
+    assert_true(len(single.headers) == len(threaded.headers))
+    for i in range(len(single.headers)):
+        assert_true(
+            single.headers[i] == threaded.headers[i],
+            String("header[{0}]: single='{1}' threaded='{2}'").format(
+                i, single.headers[i], threaded.headers[i]
+            ),
+        )
+    print("✅ test_threaded_vs_single_threaded passed")
+
+
+fn test_crlf_threaded_vs_single() raises:
+    """Check CRLF consistency between num_threads=1 and num_threads=4.
+
+    Raises:
+        If any assertion fails.
+    """
+    # Header + 100 CRLF rows is 1895 bytes, above the ~1000-byte size assumed
+    # to trigger threading; the previous 50 rows gave only 945 bytes.
+    var csv_content = String("header1,header2,header3\r\n")
+    for i in range(100):
+        csv_content += String("val{0},val{0},val{0}\r\n").format(i)
+
+    var in_csv: Path = cwd().joinpath("test_data_crlf_threaded.csv")
+    in_csv.write_text(csv_content)
+
+    var single = CsvReader(in_csv, num_threads=1)
+    var threaded = CsvReader(in_csv, num_threads=4)
+
+    assert_true(
+        single.row_count == threaded.row_count,
+        String("crlf row_count: single={0} threaded={1}").format(
+            single.row_count, threaded.row_count
+        ),
+    )
+    assert_true(
+        single.col_count == threaded.col_count,
+        String("crlf col_count: single={0} threaded={1}").format(
+            single.col_count, threaded.col_count
+        ),
+    )
+    assert_true(
+        len(single) == len(threaded),
+        String("crlf length: single={0} threaded={1}").format(
+            len(single), len(threaded)
+        ),
+    )
+
+    var check_count = min(len(single), len(threaded))
+    for i in range(check_count):
+        assert_true(
+            single[i] == threaded[i],
+            String("crlf element[{0}]: single='{1}' threaded='{2}'").format(
+                i, single[i], threaded[i]
+            ),
+        )
+    print("✅ test_crlf_threaded_vs_single passed")
+
+
+fn main() raises:
+    """Run every test and fail the process if any of them fails.
+
+    Raises:
+        If any test raised, so the failure reaches the caller.
+    """
+    try:
+        test_basic_lf()
+        test_crlf_single_threaded()
+        test_threaded_vs_single_threaded()
+        test_crlf_threaded_vs_single()
+        print("\n✅ All CI tests PASSED")
+    except e:
+        print("❌ CI test FAILED:", e)
+        # Re-raise: swallowing the error would let the process exit 0, so
+        # the recipe's test step would pass even though a test failed.
+        raise Error("CI test FAILED")
\ No newline at end of file