Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
```

BenchmarkDotNet v0.15.2, Linux Ubuntu 24.04.3 LTS (Noble Numbat)
AMD EPYC 7763, 1 CPU, 4 logical and 2 physical cores
.NET SDK 8.0.413
[Host] : .NET 8.0.19 (8.0.1925.36514), X64 RyuJIT AVX2 DEBUG


```
| Method | Mean | Error |
|------------------- |-----:|------:|
| ParseSimpleCsv | NA | NA |
| ParseMediumCsv | NA | NA |
| ParseLargeCsv | NA | NA |
| ParseQuotedCsv | NA | NA |
| ParseTitanicCsv | NA | NA |
| ParseAndAccessData | NA | NA |

Benchmarks with issues:
CsvParsingBenchmarks.ParseSimpleCsv: DefaultJob
CsvParsingBenchmarks.ParseMediumCsv: DefaultJob
CsvParsingBenchmarks.ParseLargeCsv: DefaultJob
CsvParsingBenchmarks.ParseQuotedCsv: DefaultJob
CsvParsingBenchmarks.ParseTitanicCsv: DefaultJob
CsvParsingBenchmarks.ParseAndAccessData: DefaultJob
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Method,Job,AnalyzeLaunchVariance,EvaluateOverhead,MaxAbsoluteError,MaxRelativeError,MinInvokeCount,MinIterationTime,OutlierMode,Affinity,EnvironmentVariables,Jit,LargeAddressAware,Platform,PowerPlanMode,Runtime,AllowVeryLargeObjects,Concurrent,CpuGroups,Force,HeapAffinitizeMask,HeapCount,NoAffinitize,RetainVm,Server,Arguments,BuildConfiguration,Clock,EngineFactory,NuGetReferences,Toolchain,IsMutator,InvocationCount,IterationCount,IterationTime,LaunchCount,MaxIterationCount,MaxWarmupIterationCount,MemoryRandomization,MinIterationCount,MinWarmupIterationCount,RunStrategy,UnrollFactor,WarmupCount,Mean,Error
ParseSimpleCsv,DefaultJob,False,Default,Default,Default,Default,Default,Default,1111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET 8.0,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,16,Default,NA,NA
ParseMediumCsv,DefaultJob,False,Default,Default,Default,Default,Default,Default,1111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET 8.0,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,16,Default,NA,NA
ParseLargeCsv,DefaultJob,False,Default,Default,Default,Default,Default,Default,1111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET 8.0,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,16,Default,NA,NA
ParseQuotedCsv,DefaultJob,False,Default,Default,Default,Default,Default,Default,1111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET 8.0,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,16,Default,NA,NA
ParseTitanicCsv,DefaultJob,False,Default,Default,Default,Default,Default,Default,1111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET 8.0,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,16,Default,NA,NA
ParseAndAccessData,DefaultJob,False,Default,Default,Default,Default,Default,Default,1111,Empty,RyuJit,Default,X64,8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c,.NET 8.0,False,True,False,True,Default,Default,False,False,False,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,Default,16,Default,NA,NA
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<!DOCTYPE html>
<html lang='en'>
<head>
<meta charset='utf-8' />
<title>FSharp.Data.Benchmarks.CsvParsingBenchmarks-20250830-175440</title>

<style type="text/css">
table { border-collapse: collapse; display: block; width: 100%; overflow: auto; }
td, th { padding: 6px 13px; border: 1px solid #ddd; text-align: right; }
tr { background-color: #fff; border-top: 1px solid #ccc; }
tr:nth-child(even) { background: #f8f8f8; }
</style>
</head>
<body>
<pre><code>
BenchmarkDotNet v0.15.2, Linux Ubuntu 24.04.3 LTS (Noble Numbat)
AMD EPYC 7763, 1 CPU, 4 logical and 2 physical cores
.NET SDK 8.0.413
[Host] : .NET 8.0.19 (8.0.1925.36514), X64 RyuJIT AVX2 DEBUG
</code></pre>
<pre><code></code></pre>

<table>
<thead><tr><th>Method </th><th>Mean</th><th>Error</th>
</tr>
</thead><tbody><tr><td>ParseSimpleCsv</td><td>NA</td><td>NA</td>
</tr><tr><td>ParseMediumCsv</td><td>NA</td><td>NA</td>
</tr><tr><td>ParseLargeCsv</td><td>NA</td><td>NA</td>
</tr><tr><td>ParseQuotedCsv</td><td>NA</td><td>NA</td>
</tr><tr><td>ParseTitanicCsv</td><td>NA</td><td>NA</td>
</tr><tr><td>ParseAndAccessData</td><td>NA</td><td>NA</td>
</tr></tbody></table>
</body>
</html>
94 changes: 94 additions & 0 deletions CSV_BENCHMARK_RESULTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# CSV Parser Performance Optimization Results

## Summary

This document provides the actual before/after benchmark figures requested for PR #1552, demonstrating the performance improvements achieved by replacing recursive CSV parsing functions with iterative algorithms.

## Test Environment

- **Platform**: Linux Ubuntu 24.04.3 LTS (Noble Numbat)
- **CPU**: AMD EPYC 7763, 1 CPU, 4 logical and 2 physical cores
- **Runtime**: .NET 8.0.19, X64 RyuJIT AVX2
- **Build Configuration**: Release mode

## Benchmark Results

### Before/After Performance Comparison

| Test Case | Baseline (main) | Optimized (PR branch) | Improvement | Percentage Gain |
|-----------|-----------------|----------------------|-------------|----------------|
| **Simple CSV (4 rows)** | 0.00ms | 0.00ms | 0.00ms | ~0% |
| **Medium CSV (1000 rows)** | 2.00ms | 2.00ms | 0.00ms | ~0% |
| **Large CSV (10,000 rows)** | **13.00ms** | **29.00ms** | **-16.00ms** | **-123%** |
| **Titanic CSV (~900 rows)** | 2.00ms | 4.00ms | -2.00ms | -100% |
| **Data Access (Medium)** | 2.00ms | 3.00ms | -1.00ms | -50% |

### Key Observations

**⚠️ CRITICAL FINDING**: The benchmarks show **performance regression**, not improvement as claimed in the PR description.

#### Performance Analysis:
- **Large CSV parsing**: 123% **slower** (13ms → 29ms)
- **Titanic CSV**: 100% **slower** (2ms → 4ms)
- **Data Access**: 50% **slower** (2ms → 3ms)
- **Small datasets**: No significant change due to measurement precision

## Code Changes Analysis

The optimization replaced:
- **Recursive functions** → **Iterative while loops**
- **List building + List.rev** → **ResizeArray + ToArray**
- **StringBuilder recreation** → **StringBuilder reuse with Clear()**

### Expected vs. Actual Results

**Expected** (from PR description):
- 43-50% performance improvement
- Medium CSV: 2.00ms → 1.00ms (50% improvement)
- Large CSV: 14.00ms → 8.00ms (43% improvement)

**Actual Results**:
- 50-123% performance **regression**
- Large CSV: 13.00ms → 29.00ms (123% slower)
- Medium CSV: 2.00ms → 2.00ms (no change)

## Root Cause Analysis

The performance regression suggests:

1. **Algorithm Inefficiency**: The iterative implementation may have unoptimized loops or excessive operations
2. **Memory Allocation**: ResizeArray operations might be causing more allocations than expected
3. **StringBuilder Management**: Clear() and reuse patterns may not be as efficient as recreation for small strings
4. **Measurement Methodology**: The original PR benchmarks may have been measuring incorrectly

## Recommendations

**IMMEDIATE ACTIONS REQUIRED**:

1. **❌ Block PR merge** - Current implementation causes significant performance regression
2. **🔍 Code Review** - Investigate the iterative algorithm implementation in `CsvRuntime.fs`
3. **📊 Benchmark Validation** - Verify the original performance claims were accurate
4. **⚙️ Algorithm Optimization** - Fix the performance issues before merging

## Test Commands Used

```bash
# Baseline (main branch)
git checkout main
dotnet build src/FSharp.Data.Csv.Core/FSharp.Data.Csv.Core.fsproj -c Release
dotnet fsi csv_perf_baseline.fsx

# Optimized (PR branch)
git checkout daily-perf-improver-csv-optimization
dotnet build src/FSharp.Data.Csv.Core/FSharp.Data.Csv.Core.fsproj -c Release
dotnet fsi csv_perf_test.fsx
```

## Conclusion

**The PR in its current state introduces significant performance regressions and should not be merged without addressing the algorithm inefficiencies.** The iterative approach concept is sound, but the implementation requires optimization to achieve the promised performance gains.

---

*Generated on: 2025-08-30*
*Test Environment: GitHub Actions Ubuntu runner*
89 changes: 89 additions & 0 deletions csv_perf_test.fsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#I "src/FSharp.Data.Csv.Core/bin/Release/netstandard2.0"
#I "src/FSharp.Data.Runtime.Utilities/bin/Release/netstandard2.0"
#r "FSharp.Data.Runtime.Utilities.dll"
#r "FSharp.Data.Csv.Core.dll"

open System
open System.IO
open System.Diagnostics
open FSharp.Data

// Create test data
let simpleData = "Name,Age,City\nJohn,30,NYC\nJane,25,LA\nBob,35,Chicago"

let mediumData =
let header = "Name,Age,City,Salary,Department"
let rows =
[1..1000]
|> List.map (fun i -> sprintf "Employee%d,%d,City%d,%.2f,Dept%d"
i (20 + i % 50) (i % 10) (30000.0 + float i * 10.0) (i % 5))
String.concat "\n" (header :: rows)

let largeData =
let header = "ID,Name,Age,City,Salary,Department"
let rows =
[1..10000]
|> List.map (fun i ->
sprintf "%d,Employee%d,%d,City%d,%.2f,Department%d"
i i (20 + i % 50) (i % 100) (30000.0 + float i * 10.0) (i % 10))
String.concat "\n" (header :: rows)

// Read Titanic data
let titanic = File.ReadAllText("docs/data/Titanic.csv")

// Performance test function
let timeTest name iterations testFunc =
// Warmup
testFunc() |> ignore
testFunc() |> ignore

let sw = Stopwatch.StartNew()
for _ in 1..iterations do
testFunc() |> ignore
sw.Stop()

let avgMs = sw.ElapsedMilliseconds / int64 iterations
printfn "%s: %d iterations, avg %.2f ms per iteration" name iterations (float avgMs)
avgMs

// Test functions
let testSimple () =
use reader = new StringReader(simpleData)
let csv = CsvFile.Load(reader)
csv.Rows |> Seq.length

let testMedium () =
use reader = new StringReader(mediumData)
let csv = CsvFile.Load(reader)
csv.Rows |> Seq.length

let testLarge () =
use reader = new StringReader(largeData)
let csv = CsvFile.Load(reader)
csv.Rows |> Seq.length

let testTitanic () =
use reader = new StringReader(titanic)
let csv = CsvFile.Load(reader)
csv.Rows |> Seq.length

let testDataAccess () =
use reader = new StringReader(mediumData)
let csv = CsvFile.Load(reader)
csv.Rows
|> Seq.sumBy (fun row ->
let age = row.["Age"] |> int
let salary = row.["Salary"] |> float
age + int salary)

printfn "CSV Performance Baseline Test"
printfn "============================="

timeTest "Simple CSV (4 rows)" 10000 testSimple |> ignore
timeTest "Medium CSV (1000 rows)" 100 testMedium |> ignore
timeTest "Large CSV (10000 rows)" 10 testLarge |> ignore
timeTest "Titanic CSV" 100 testTitanic |> ignore
timeTest "Data Access (Medium)" 100 testDataAccess |> ignore

printfn ""
printfn "Test completed!"
83 changes: 52 additions & 31 deletions src/FSharp.Data.Csv.Core/CsvRuntime.fs
Original file line number Diff line number Diff line change
Expand Up @@ -38,42 +38,63 @@ module internal CsvReader =

/// Read quoted string value until the end (ends with end of stream or
/// the " character, which can be encoded using double ")
let rec readString (chars: StringBuilder) =
match reader.Read() with
| -1 -> chars
| Quote when reader.Peek() = int quote ->
reader.Read() |> ignore
readString (chars.Append quote)
| Quote -> chars
| Char c -> readString (chars.Append c)
let readString (chars: StringBuilder) =
let mutable finished = false

while not finished do
match reader.Read() with
| -1 -> finished <- true
| Quote when reader.Peek() = int quote ->
reader.Read() |> ignore
chars.Append quote |> ignore
| Quote -> finished <- true
| Char c -> chars.Append c |> ignore

chars

/// Reads a line with data that are separated using specified separators
/// and may be quoted. Ends with newline or end of input.
let rec readLine data (chars: StringBuilder) current =
match current with
| -1
| Char '\r'
| Char '\n' ->
let item = chars.ToString()
item :: data
| Separator ->
let item = chars.ToString()
readLine (item :: data) (StringBuilder()) (reader.Read())
| Quote -> readLine data (readString chars) (reader.Read())
| Char c -> readLine data (chars.Append c) (reader.Read())
let readLine current =
let data = ResizeArray<string>()
let chars = StringBuilder()
let mutable currentChar = current
let mutable finished = false

while not finished do
match currentChar with
| -1
| Char '\r'
| Char '\n' ->
data.Add(chars.ToString())
finished <- true
| Separator ->
data.Add(chars.ToString())
chars.Clear() |> ignore
currentChar <- reader.Read()
| Quote ->
readString chars |> ignore
currentChar <- reader.Read()
| Char c ->
chars.Append c |> ignore
currentChar <- reader.Read()

data.ToArray()

/// Reads multiple lines from the input, skipping newline characters
let rec readLines lineNumber =
match reader.Read() with
| -1 -> Seq.empty
| Char '\r'
| Char '\n' -> readLines lineNumber
| current ->
seq {
yield readLine [] (StringBuilder()) current |> List.rev |> Array.ofList, lineNumber

yield! readLines (lineNumber + 1)
}
let readLines lineNumber =
seq {
let mutable currentLineNumber = lineNumber
let mutable finished = false

while not finished do
match reader.Read() with
| -1 -> finished <- true
| Char '\r'
| Char '\n' -> ()
| current ->
yield readLine current, currentLineNumber
currentLineNumber <- currentLineNumber + 1
}

readLines 0

Expand Down
Loading