RNABioInfo
diff --git a/‎.github/workflows/msbuild.yml‎
Lines changed: 44 additions & 0 deletions b/‎.github/workflows/msbuild.yml‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 13 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎bench/run_benchmarks.sh‎
Lines changed: 83 additions & 0 deletions b/‎bench/run_benchmarks.sh‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎src/z_educational_guide.md‎ ‎docs/educational_guide.md‎src/z_educational_guide.md renamed to docs/educational_guide.md b/‎src/z_educational_guide.md‎ ‎docs/educational_guide.md‎src/z_educational_guide.md renamed to docs/educational_guide.md
diff --git a/‎src/z_optimization_dev_notes.md‎ ‎docs/optimization_dev_notes.md‎src/z_optimization_dev_notes.md renamed to docs/optimization_dev_notes.md b/‎src/z_optimization_dev_notes.md‎ ‎docs/optimization_dev_notes.md‎src/z_optimization_dev_notes.md renamed to docs/optimization_dev_notes.md
diff --git a/‎docs/report.html‎
Lines changed: 81 additions & 0 deletions b/‎docs/report.html‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎docs/report.md‎
Lines changed: 109 additions & 0 deletions b/‎docs/report.md‎
Lines changed: 109 additions & 0 deletions
diff --git a/‎include/cycle_finder.h‎
Lines changed: 3 additions & 1 deletion b/‎include/cycle_finder.h‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎readme.md‎
Lines changed: 1 addition & 1 deletion b/‎readme.md‎
Lines changed: 1 addition & 1 deletion
@@ -0,0 +1,44 @@
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: MSBuild
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+env:
+  # Path to the solution file relative to the root of the project.
+  SOLUTION_FILE_PATH: .
+
+  # Configuration type to build.
+  # You can convert this to a build matrix if you need coverage of multiple configuration types.
+  # https://docs.github.com/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
+  BUILD_CONFIGURATION: Release
+
+permissions:
+  contents: read
+
+jobs:
+  # Single Windows job: restore NuGet packages, then build the solution with MSBuild.
+  build:
+    runs-on: windows-latest
+
+    steps:
+    - uses: actions/checkout@v4
+
+    # v2 of the action runs on a currently supported Node runtime (v1.0.2 targets a retired one).
+    - name: Add MSBuild to PATH
+      uses: microsoft/setup-msbuild@v2
+
+    - name: Restore NuGet packages
+      # GITHUB_WORKSPACE is provided by the runner and is therefore NOT part of the `env`
+      # context (which only holds variables defined in this workflow); `env.GITHUB_WORKSPACE`
+      # expands to an empty string. The `github` context exposes the same path.
+      working-directory: ${{ github.workspace }}
+      run: nuget restore ${{env.SOLUTION_FILE_PATH}}
+
+    - name: Build
+      working-directory: ${{ github.workspace }}
+      # Add additional options to the MSBuild command line here (like platform or verbosity level).
+      # See https://docs.microsoft.com/visualstudio/msbuild/msbuild-command-line-reference
+      run: msbuild /m /p:Configuration=${{env.BUILD_CONFIGURATION}} ${{env.SOLUTION_FILE_PATH}}
@@ -0,0 +1,13 @@
+# Changelog
+
+## [Unreleased] - 0.5.1
+- Fixed crash in spacer ordering when no reads are found (guard added).
+- Improved parallel scaling:
+  - Replaced frequent `#pragma omp critical` usage with per-thread buffers and serial merges.
+  - Introduced a lock-free visited bitmap (atomic 64-bit words) to remove synchronization hot-spots.
+- Reduced allocator contention and reused per-thread containers to lower memory churn under heavy parallelism.
+- Build fixes: added missing includes and small portability fixes so CMake build succeeds on target platforms.
+- Build: CMake configure and full build completed successfully (targets `mcaat` and `runTests` built).
+
+
+*Note:* These changes focus on improving scalability and preventing serialization on large, memory-bound graphs.
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Simple benchmark runner for MCAAT cycle_finder
+# Produces CSV rows: threads,run,elapsed_seconds,max_rss_kb,cycles
+
+# usage: print the command-line help text to stdout and terminate the script.
+# Never returns; the exit status is always 1, including when reached via -h/--help.
+usage() {
+  local prog="$0"
+  cat <<HELP
+Usage: ${prog} --binary PATH --input PATH --threads LIST --runs N -o OUTCSV
+
+Options:
+  --binary PATH     Path to cycle_finder binary
+  --input PATH      Input graph file
+  --threads LIST    Comma-separated thread counts (e.g. 1,4,8,24)
+  --runs N          Number of runs per thread count (default: 3)
+  -o OUTCSV         Output CSV file (overwritten)
+
+Example:
+  ${prog} --binary ./bin/cycle_finder --input graphs/huge_graph.bin --threads 1,4,8,24 --runs 3 -o bench/results.csv
+HELP
+  exit 1
+}
+
+# Option values; only RUNS has a default, the rest are mandatory.
+BINARY="" INPUT="" THREADS="" RUNS=3 OUTCSV=""
+
+# Parse the command line. Every value-taking flag is checked for a following
+# argument first: under `set -u` a trailing flag without a value would otherwise
+# abort with an opaque "\$2: unbound variable" instead of the usage text.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --binary|--input|--threads|--runs|-o)
+      if [[ $# -lt 2 ]]; then
+        echo "Missing value for $1" >&2
+        usage
+      fi
+      case "$1" in
+        --binary) BINARY="$2";;
+        --input) INPUT="$2";;
+        --threads) THREADS="$2";;
+        --runs) RUNS="$2";;
+        -o) OUTCSV="$2";;
+      esac
+      shift 2;;
+    -h|--help) usage;;
+    *) echo "Unknown arg: $1"; usage;;
+  esac
+done
+
+# All four path/list options are required.
+if [[ -z "$BINARY" || -z "$INPUT" || -z "$THREADS" || -z "$OUTCSV" ]]; then
+  usage
+fi
+
+# RUNS is later evaluated inside an arithmetic loop condition, where a
+# non-numeric string would be interpreted as an expression; accept digits only.
+if ! [[ "$RUNS" =~ ^[0-9]+$ ]]; then
+  echo "--runs must be a non-negative integer, got: $RUNS" >&2
+  usage
+fi
+
+# Make sure the CSV's directory exists, then (re)create the file with its header row.
+mkdir -p "$(dirname "$OUTCSV")"
+
+echo "threads,run,elapsed_s,max_rss_kb,cycles" > "$OUTCSV"
+
+# Split the comma-separated --threads value into an array (IFS is changed for this read only).
+IFS=',' read -r -a THREAD_ARR <<< "$THREADS"
+
+# Per-run scratch files live in a private directory created by mktemp instead of
+# fixed, predictable /tmp names: concurrent benchmark invocations cannot clobber
+# each other's files, and the EXIT trap removes everything even when a run fails
+# and `set -e` aborts the script mid-loop.
+SCRATCH_DIR="$(mktemp -d "${TMPDIR:-/tmp}/cycle_bench.XXXXXX")"
+trap 'rm -rf -- "$SCRATCH_DIR"' EXIT
+
+for threads in "${THREAD_ARR[@]}"; do
+  for ((run=1; run<=RUNS; run++)); do
+    outtmp="${SCRATCH_DIR}/cycle_out.${threads}.${run}.json"
+    timetmp="${SCRATCH_DIR}/cycle_time.${threads}.${run}.txt"
+
+    echo "Running threads=$threads run=$run"
+
+    # Use /usr/bin/time to capture wall time (%e, seconds) and max RSS (%M, KB).
+    # A non-zero exit of the binary propagates through time and stops the script.
+    /usr/bin/time -f "%e %M" -o "$timetmp" "$BINARY" --input "$INPUT" --threads "$threads" --out "$outtmp"
+
+    read -r elapsed rss_kb < "$timetmp"
+
+    # Try to extract cycles from output JSON if possible; left empty when unknown.
+    cycles=""
+    if command -v jq >/dev/null 2>&1 && jq -e . "$outtmp" >/dev/null 2>&1; then
+      # Common fields: either cycles array or cycle_count
+      if jq -e '.cycles' "$outtmp" >/dev/null 2>&1; then
+        cycles=$(jq '.cycles | length' "$outtmp")
+      elif jq -e '.cycle_count' "$outtmp" >/dev/null 2>&1; then
+        cycles=$(jq '.cycle_count' "$outtmp")
+      fi
+    else
+      # Fallback (no jq, or output is not valid JSON): grep for a numeric
+      # "cycle_count" or "cycles" key. `|| true` keeps a miss from tripping set -e/pipefail.
+      cycles=$(grep -Eo '"cycle_count"[[:space:]]*:[[:space:]]*[0-9]+' "$outtmp" | head -n1 | grep -Eo '[0-9]+') || true
+      if [[ -z "$cycles" ]]; then
+        cycles=$(grep -Eo '"cycles"[[:space:]]*:[[:space:]]*[0-9]+' "$outtmp" | head -n1 | grep -Eo '[0-9]+') || true
+      fi
+    fi
+
+    echo "${threads},${run},${elapsed},${rss_kb},${cycles}" >> "$OUTCSV"
+
+    # Drop this run's files right away so disk usage stays bounded across many runs.
+    rm -f "$outtmp" "$timetmp"
+  done
+done
+
+echo "Done. Results at: $OUTCSV"
@@ -0,0 +1,81 @@
+<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <title>MCAAT: Cycle Finder — Algorithmic & Optimization Report</title>
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <style>
+    body { font-family: Arial, sans-serif; max-width: 900px; margin: 2rem auto; line-height: 1.6; color:#222 }
+    pre { background:#f6f6f6; padding:0.5rem; }
+    h1,h2 { color:#0b63a3 }
+    .note { background:#fffbe6; border-left:4px solid #ffd24d; padding:0.5rem; }
+  </style>
+</head>
+<body>
+  <h1>MCAAT: Cycle Finder — Algorithmic & Optimization Report ✅</h1>
+  <p><strong>Scope:</strong> This document describes <em>only</em> the algorithmic changes and optimizations introduced in the <code>optimizations</code> branch for the cycle finder logic, organized step-by-step.</p>
+  <hr/>
+  <h2>Summary</h2>
+  <ol>
+    <li>Replaced critical sections with per-thread buffers + serial merge.</li>
+    <li>Introduced a lock-free atomic bitset as the visited structure.</li>
+    <li>Reduced allocations by reusing per-thread pools (megahit-style).</li>
+    <li>Applied traversal micro-optimizations (prefetch, fixed arrays, branch hints).</li>
+  </ol>
+
+  <h2>Step-by-step changes</h2>
+  <ol>
+    <li><strong>Per-thread buffers + serial merge</strong>
+      <ul>
+        <li>Replaced concurrent writes to shared containers with <code>local_chunks[tid]</code> and <code>local_results[tid]</code>, then merged serially.</li>
+        <li>Files: <code>ChunkStartNodes</code>, <code>FindApproximateCRISPRArrays</code>.</li>
+      </ul>
+    </li>
+
+    <li><strong>Lock-free visited bitmap</strong>
+      <ul>
+        <li>Introduced <code>std::vector&lt;uint64_t&gt; s_visited_words</code> and helpers: <code>InitializeVisitedGlobal</code>, <code>IsVisitedGlobal</code>, <code>MarkVisitedGlobal</code>.</li>
+        <li>Atomic builtins (<code>__atomic_load_n</code>, <code>__atomic_fetch_or</code>) are used with relaxed ordering.</li>
+      </ul>
+    </li>
+
+    <li><strong>Per-thread pools & fewer allocations</strong>
+      <ul>
+        <li>Added <code>static thread_local</code> pools for DLS stack and visited set; reuse capacity to avoid repeated alloc/free.</li>
+        <li>File: <code>DepthLevelSearch</code>.</li>
+      </ul>
+    </li>
+
+    <li><strong>Traversal micro-optimizations</strong>
+      <ul>
+        <li>Fixed-size neighbor arrays, prefetch, branch hints, and small unrolling in hot loops.</li>
+      </ul>
+    </li>
+
+    <li><strong>Serial result merging & memory hygiene</strong>
+      <ul>
+        <li>Per-thread maps are moved into the shared results in a serial loop; call <code>malloc_trim(0)</code> intermittently.</li>
+      </ul>
+    </li>
+  </ol>
+
+  <h2>Expected impact</h2>
+  <ul>
+    <li>Better multithreaded scaling due to reduced contention and allocator pressure.</li>
+    <li>Reasonable memory usage (1 bit per node for visited bitset).</li>
+  </ul>
+
+  <h2>Limitations & future work</h2>
+  <div class="note">NUMA-aware allocation and deeper profiling are recommended next steps.</div>
+
+  <h2>Quick validation</h2>
+  <ol>
+    <li>Checkout <code>optimizations</code>, build, and run the same workload across several thread counts (1, 8, 24, 48, 128).</li>
+    <li>Use <code>perf</code> and <code>numastat</code> to verify reduced contention and memory hotspots.</li>
+  </ol>
+
+  <hr/>
+  <p><strong>Files touched:</strong> <code>src/cycle_finder.cpp</code> (+ associated header updates).</p>
+  <p>TL;DR: per-thread buffers + lock-free visited bitmap + reused pools = less contention and better parallel throughput.</p>
+</body>
+</html>
@@ -0,0 +1,109 @@
+# MCAAT: Cycle Finder — Algorithmic & Optimization Report ✅
+
+**Scope:** This document describes *only* the algorithmic changes and optimizations introduced in the `optimizations` branch for the cycle finder logic. It is organized so you can read step-by-step what changed, why it was done, and the expected impact.
+
+---
+
+## Summary
+
+1. Replaced global critical sections and shared writes with *per-thread buffers* and a single serial merge step to remove contention.  
+2. Replaced lock-based or synchronized "visited" bookkeeping with a *lock-free atomic bitset* (1 bit per node).  
+3. Reduced allocations and allocator contention by *reusing per-thread pools* (megahit-style) and preallocating where useful.  
+4. Applied traversal micro-optimizations (fixed-size arrays, prefetch, branch hints) to reduce per-edge overhead.
+
+---
+
+## Step-by-step changes (algorithmic & optimization focus)
+
+1) Remove global critical sections → Per-thread buffers + serial merge 🔧
+   - What changed:
+     - Replaced OpenMP `#pragma omp critical` style updates to shared containers with `vector<...>` of per-thread collectors (e.g., `local_chunks` and `local_results`).
+     - After the parallel loop completes, a single-threaded loop merges per-thread buffers into the shared map or results container.
+   - Files/locations:
+     - `CycleFinder::ChunkStartNodes` (collect start nodes into `local_chunks[tid]` then merge).
+     - `CycleFinder::FindApproximateCRISPRArrays` (collect per-thread `local_results`, then merge into `this->results`).
+   - Why / Benefit:
+     - Eliminates high-contention points on hot shared data structures, enabling scaling to higher core counts.
+     - Serial merge cost is amortized and avoids expensive locking in hot loops.
+
+2) Lock-free visited bitmap (1 bit per node) 🔒→⚡
+   - What changed:
+     - Introduced a global `std::vector<uint64_t> s_visited_words` as a bitset (one bit per node).
+     - Provided helper inline functions: `InitializeVisitedGlobal(n)`, `IsVisitedGlobal(node)`, and `MarkVisitedGlobal(node)` implemented using GCC/Clang atomic builtins (`__atomic_load_n`, `__atomic_fetch_or`) with `__ATOMIC_RELAXED` ordering.
+   - Files/locations:
+     - `src/cycle_finder.cpp` (static `s_visited_words` and helpers) and uses in `FindCycle`, `FindCycleUtil`, and background checks.
+   - Why / Benefit:
+     - Avoids the pitfalls of `std::vector<std::atomic<...>>` (`std::atomic` is neither copyable nor movable, so such a vector cannot be resized or copied) and the overhead of locks around visited updates.
+     - One atomic word operation per change (bit flip) is cheap and scales well.
+     - Memory is compact (1 bit per node) and predictable for large graphs.
+   - Correctness note:
+     - Using relaxed atomics is acceptable because bits only transition from 0→1 (monotonic); races among writers do not break correctness, and reads can tolerate transient states.
+
+3) Reduce allocations and reuse per-thread pools (megahit-style) ♻️
+   - What changed:
+     - Introduced `static thread_local` pools for DLS (`dls_stack_pool` and `dls_visited_pool`) used by `DepthLevelSearch`.
+     - Pools are `clear()`ed between uses but retain capacity; small initial reserve is set to avoid repeated small allocations.
+   - Files/locations:
+     - `CycleFinder::DepthLevelSearch`.
+   - Why / Benefit:
+     - Avoids heavy allocator contention when many threads create/destroy temporaries frequently.
+     - Reduced per-edge latency and improved throughput during parallel graph traversal.
+
+4) Traversal micro-optimizations (branch hints, fixed arrays, prefetch) 🧠
+   - What changed:
+     - Use of fixed-size neighbor arrays (`uint64_t neighbors[MAX_EDGE_COUNT]`) rather than heap allocations per node.
+     - Prefetching neighbor buffers and using `__builtin_expect` branch hints to optimize hot paths.
+     - Small loop unrolling where out-degree is small (de Bruijn graph pattern) to reduce loop overhead.
+   - Files/locations:
+     - `DepthLevelSearch`, `_GetOutgoings`, and `_GetIncomings` helpers.
+   - Why / Benefit:
+     - Better cache locality and fewer branch mispredictions; straightforward per-edge speedups with little code complexity.
+
+5) Results merging and memory hygiene 🧽
+   - What changed:
+     - Per-thread `local_results` (maps) are merged serially into `this->results` after each bucket processed.
+     - Call `malloc_trim(0)` occasionally after buckets to release heap fragments back to the OS (for long runs with variable memory usage).
+   - Files/locations:
+     - `FindApproximateCRISPRArrays`.
+   - Why / Benefit:
+     - Reduces concurrent unordered_map modification (expensive) and helps long-running runs avoid growing memory footprints unnecessarily.
+
+---
+
+## Expected performance and behavior improvements
+
+- Improved scalability with thread counts beyond the earlier observed plateau (~24 cores) because:
+  - Contention points are removed or drastically reduced.
+  - Allocator pressure is lowered by reusing containers.
+  - Atomic operations on compact bitmaps replace heavier locks.
+- Memory cost: the visited bitset adds ~1 bit per node (compact) and per-thread buffers increase transient memory usage proportional to thread count but only for selected nodes.
+
+---
+
+## Limitations & future work
+
+- NUMA-aware allocation and memory binding were not implemented yet — this is the natural next step for large multi-socket machines where memory bandwidth dominates.
+- Further profiling (perf/VTune) is needed to quantify the exact causes of any remaining scalability bottlenecks (cache-line bouncing, allocator hotspots, or procedural serial sections).
+
+---
+
+## How to validate quickly (recommended)
+
+1. Check out the `optimizations` branch.
+2. Build (`cmake .. && make -j`) and run the same workload used before.
+3. Compare (a) execution time vs thread count (1, 8, 24, 48, 128), (b) throughput (nodes/sec), and (c) cycles found to ensure no correctness regression.
+4. Use `perf top` / `perf record` or `numastat` to verify reduced lock/atomic time and identify remaining hotspots.
+
+---
+
+## Files touched (algorithmic/optimization only)
+
+- `src/cycle_finder.cpp` — main implementation of lock-free visited bitmap, per-thread collectors, DLS pools, traversal micro-optimizations, merging logic.
+- `include/cycle_finder.h` — updated helpers and declarations related to visited bookkeeping (if applicable).
+
+---
+
+## TL;DR
+
+- Replaced shared locks with per-thread buffers + serial merges, added a compact lock-free visited bitmap, and reduced allocation churn via per-thread pools. These changes reduce contention and allocator pressure and improve multithreaded scaling while keeping memory usage reasonable for very large graphs.
+
@@ -30,8 +30,10 @@ class CycleFinder {
         // Use SDBG pointer from settings everywhere instead of storing a separate reference
         //SDBG& sdbg;
         uint16_t cluster_bounds;
-        vector<bool> visited;
+        // visited bitset stored as 64-bit words (1 bit per node). Use atomic builtins on the words to avoid non-copyable std::atomic in vectors.
+        vector<vector<uint64_t>> per_thread_visited;
         vector<bool> look_up_table;
+
         // thread count obtained from settings
 
         //#### DEVELOPER FUNCTIONS ####
 
@@ -12,7 +12,7 @@
   - Better data structures for preprocessing, `phmap::flat_hash_set`
   - Added compiler intrinsics to guide the hardware in the right direction
   - Reserving the capacity to prevent rehashing
-In depth technical details: [educational resource](./src/z_educational_guide.md) and [optimization developer notes](./src/z_optimization_dev_notes.md). As a result of the above optimizations we achieved __17-25__ times speedup in __1billion__ node graph(from 3 <span style="color:red">days</span> to 3 <span style="color:green">hours</span>). Considering the complexity of the graphs, this is a huge improvement.
+In-depth technical details: [educational resource](./docs/educational_guide.md) and [optimization developer notes](./docs/optimization_dev_notes.md).
 
 
 ### Installation using docker