From 8367c2eed3789ea70188d901ba2d3c425d950931 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Thu, 12 Mar 2026 09:14:00 +0300
Subject: [PATCH 01/18] build: add pyproject.toml and migrate packaging from
 wheel metadata

---
 pyproject.toml | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b4f3ed5
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,50 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel>=0.42"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "krakenparser"
+version = "1.0.0"
+description = "A collection of scripts designed to process Kraken2 reports and convert them into CSV format."
+readme = {file = "README.md", content-type = "text/markdown"}
+license = {file = "LICENSE"}
+authors = [{name = "Ilia Popov", email = "iljapopov17@gmail.com"}]
+requires-python = ">=3.10,<=3.16"
+dependencies = [
+    "pandas>=2.0.0,<=2.3.3",
+    "matplotlib>=3.7.0,<=3.10.8",
+    "numpy>=1.24.0,<=2.3.5",
+    "seaborn>=0.12.0,<=0.13.2",
+    "scikit-bio>=0.6.0,<=0.7.2",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0.0",
+    "pytest-cov>=4.0.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/PopovIILab/KrakenParser"
+
+[project.scripts]
+KrakenParser = "krakenparser.krakenparser:main"
+
+[tool.setuptools.packages.find]
+where = ["."]
+
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    "ignore:This figure includes Axes that are not compatible with tight_layout:UserWarning",
+    "ignore:Tight layout not applied.*:UserWarning",
+    "ignore:Samples with zero total abundance.*:UserWarning",
+]
+
+[tool.coverage.run]
+source = ["krakenparser"]
+omit = [
+    "krakenparser/krakenparser.py",
+    "krakenparser/mpa/transform2mpa.py",
+    "krakenparser/mpa/mpa_table.py",
+]

From 3ac8555904bde5d2ead7aea5a901acab86a82655 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Mon, 16 Mar 2026 11:42:00 +0300
Subject: [PATCH 02/18] refactor(mpa): rewrite kreport2mpa as transform2mpa
 with batch mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add -i/--input dir mode alongside the existing single-file -r mode.
Remove run_kreport2mpa.sh wrapper — batch logic now lives in Python.
---
 krakenparser/mpa/__init__.py      |   0
 krakenparser/mpa/mpa_table.py     |  70 +++++++++++
 krakenparser/mpa/transform2mpa.py | 192 ++++++++++++++++++++++++++++++
 krakenparser/run_kreport2mpa.sh   |  64 ----------
 4 files changed, 262 insertions(+), 64 deletions(-)
 create mode 100644 krakenparser/mpa/__init__.py
 create mode 100644 krakenparser/mpa/mpa_table.py
 create mode 100644 krakenparser/mpa/transform2mpa.py
 delete mode 100755 krakenparser/run_kreport2mpa.sh

diff --git a/krakenparser/mpa/__init__.py b/krakenparser/mpa/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/krakenparser/mpa/mpa_table.py b/krakenparser/mpa/mpa_table.py
new file mode 100644
index 0000000..f66e63b
--- /dev/null
+++ b/krakenparser/mpa/mpa_table.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+"""Combine multiple MPA-format files into a single merged table."""
+
+import argparse
+import logging
+
+_log = logging.getLogger(__name__)
+
+
+def combine_mpa(in_files: list[str], o_file: str) -> None:
+    # Plain dict preserves insertion order (Python 3.7+).
+    taxa: dict[str, dict[int, str]] = {}
+    sample_names: list[str] = []
+
+    _log.info("Number of files to parse: %d", len(in_files))
+
+    for idx, in_path in enumerate(in_files):
+        sample_name = f"Sample #{idx + 1}"
+        with open(in_path) as fh:
+            for line in fh:
+                line = line.rstrip("\n")
+                if not line:
+                    continue
+                if line.startswith("#"):
+                    cols = line.split("\t")
+                    if len(cols) >= 2:
+                        sample_name = cols[-1]
+                    continue
+                cols = line.split("\t", 1)
+                if len(cols) < 2:
+                    continue
+                taxon, count = cols[0], cols[1]
+                if taxon not in taxa:
+                    taxa[taxon] = {}
+                taxa[taxon][idx] = count
+        sample_names.append(sample_name)
+
+    n_samples = len(sample_names)
+    n_taxa = len(taxa)
+    _log.info("Number of classifications to write: %d", n_taxa)
+
+    with open(o_file, "w") as fh:
+        fh.write("#Classification\t" + "\t".join(sample_names) + "\n")
+        for taxon, counts in taxa.items():
+            row = [counts.get(i, "0") for i in range(n_samples)]
+            fh.write(taxon + "\t" + "\t".join(row) + "\n")
+
+    _log.info("%d classifications written", n_taxa)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Combine MPA files into a single tab-delimited table."
+    )
+    parser.add_argument(
+        "-i", "--input",
+        required=True, nargs="+", dest="in_files",
+        help="Input MPA files (one per sample)",
+    )
+    parser.add_argument(
+        "-o", "--output",
+        required=True, dest="o_file",
+        help="Output merged MPA file",
+    )
+    args = parser.parse_args()
+    combine_mpa(args.in_files, args.o_file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/krakenparser/mpa/transform2mpa.py b/krakenparser/mpa/transform2mpa.py
new file mode 100644
index 0000000..480a1f9
--- /dev/null
+++ b/krakenparser/mpa/transform2mpa.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+"""Convert a Kraken2 report to MetaPhlAn (MPA) format."""
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+# Maps Kraken2 single-letter rank codes to MPA prefixes
+_RANK_PREFIX = {
+    "D": "d", "K": "k", "P": "p", "C": "c",
+    "O": "o", "F": "f", "G": "g", "S": "s",
+}
+
+
+def _parse_line(line: str):
+    """
+    Parse one Kraken2 report line.
+
+    Standard format (6 columns):
+        pct  cum_reads  direct_reads  rank  taxid  name(indented)
+
+    Returns (name, depth, rank, cum_reads, pct) or None on malformed input.
+    """
+    parts = line.rstrip("\n").split("\t")
+    if len(parts) < 5:
+        return None
+    try:
+        pct = float(parts[0])
+        cum_reads = int(parts[1])
+    except ValueError:
+        return None
+
+    rank = parts[3].strip()
+    name_field = parts[-1]  # always the last column regardless of format variant
+
+    depth = 0
+    for ch in name_field:
+        if ch == " ":
+            depth += 1
+        else:
+            break
+    name = name_field.strip()
+
+    return name, depth // 2, rank, cum_reads, pct
+
+
+def kreport_to_mpa(
+    report_path: str,
+    output_path: str,
+    display_header: bool = False,
+    include_intermediate: bool = False,
+    use_reads: bool = True,
+    remove_spaces: bool = True,
+) -> None:
+    """
+    Convert a single Kraken2 report to MPA format.
+
+    Uses a stack to track the current taxonomic path. Each entry is
+    (structural_depth, mpa_segment, is_standard_rank). When a node at
+    depth d is encountered, all stack entries with depth >= d are popped
+    before the new entry is pushed, keeping the path consistent.
+    """
+    # Stack entries: (structural_depth, mpa_segment, is_standard_rank)
+    stack: list[tuple[int, str, bool]] = []
+
+    with open(report_path) as r_fh, open(output_path, "w") as o_fh:
+        if display_header:
+            o_fh.write("#Classification\t" + os.path.basename(report_path) + "\n")
+
+        for line in r_fh:
+            parsed = _parse_line(line)
+            if parsed is None:
+                continue
+            name, depth, rank, cum_reads, pct = parsed
+
+            # Skip unclassified and root — never appear in MPA output
+            if rank in ("U", "R"):
+                continue
+
+            # Strip numeric suffix to get base rank (e.g. "S1" → "S", "G2" → "G")
+            rank_base = rank.rstrip("0123456789")
+            is_standard = rank_base in _RANK_PREFIX and rank == rank_base
+
+            if not is_standard and not include_intermediate:
+                continue
+
+            prefix = _RANK_PREFIX.get(rank_base, "x")
+            seg_name = name.replace(" ", "_") if remove_spaces else name
+            mpa_seg = f"{prefix}__{seg_name}"
+
+            # Trim stack to the current structural depth
+            while stack and stack[-1][0] >= depth:
+                stack.pop()
+            stack.append((depth, mpa_seg, is_standard))
+
+            # Build the full MPA path; omit intermediate (x__) segments when not requested
+            path = "|".join(
+                seg
+                for (_, seg, std) in stack
+                if include_intermediate or std
+            )
+
+            value = str(cum_reads) if use_reads else str(pct)
+            o_fh.write(f"{path}\t{value}\n")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Convert a Kraken2 report to MetaPhlAn (MPA) format."
+    )
+
+    mode = parser.add_mutually_exclusive_group(required=True)
+    mode.add_argument(
+        "-r", "--report-file", "--report",
+        dest="r_file",
+        help="Single input Kraken2 report file",
+    )
+    mode.add_argument(
+        "-i", "--input",
+        dest="input_dir",
+        help="Input directory containing Kraken2 report files (batch mode)",
+    )
+
+    parser.add_argument(
+        "-o", "--output",
+        required=True, dest="o_file",
+        help="Output MPA file (single mode) or output directory (batch mode)",
+    )
+    parser.add_argument(
+        "--display-header",
+        action="store_true", dest="add_header", default=False,
+        help="Write a header line with the sample name (filename)",
+    )
+    parser.add_argument(
+        "--read_count",
+        action="store_true", dest="use_reads", default=True,
+        help="Output clade read counts [default]",
+    )
+    parser.add_argument(
+        "--percentages",
+        action="store_false", dest="use_reads",
+        help="Output percentages instead of read counts",
+    )
+    parser.add_argument(
+        "--intermediate-ranks",
+        action="store_true", dest="x_include", default=False,
+        help="Include non-standard taxonomic ranks in output",
+    )
+    parser.add_argument(
+        "--no-intermediate-ranks",
+        action="store_false", dest="x_include",
+        help="Exclude non-standard taxonomic ranks [default]",
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "--remove-spaces",
+        action="store_true", dest="remove_spaces", default=True,
+        help="Replace spaces with underscores in taxon names [default]",
+    )
+    group.add_argument(
+        "--keep-spaces",
+        action="store_false", dest="remove_spaces",
+        help="Keep spaces in taxon names",
+    )
+    args = parser.parse_args()
+
+    kwargs = dict(
+        display_header=args.add_header,
+        include_intermediate=args.x_include,
+        use_reads=args.use_reads,
+        remove_spaces=args.remove_spaces,
+    )
+
+    if args.input_dir:
+        input_dir = Path(args.input_dir)
+        if not input_dir.is_dir():
+            sys.exit(f"Error: input directory not found: {input_dir}")
+        output_dir = Path(args.o_file)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        for f in sorted(input_dir.iterdir()):
+            if not f.is_file():
+                continue
+            out_name = f.name.replace(".kreport", ".MPA.TXT")
+            kreport_to_mpa(str(f), str(output_dir / out_name), **kwargs)
+        print(f"Converted to MPA successfully. Output stored in {output_dir}")
+    else:
+        kreport_to_mpa(args.r_file, args.o_file, **kwargs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/krakenparser/run_kreport2mpa.sh b/krakenparser/run_kreport2mpa.sh
deleted file mode 100755
index 7b40ee9..0000000
--- a/krakenparser/run_kreport2mpa.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-# Function to display usage information
-usage() {
-    echo "Usage: $(basename "$0") -i PATH_TO_SOURCE -o PATH_TO_DESTINATION"
-    echo
-    echo "  -i, --input PATH_TO_SOURCE       Path to the source directory containing files to convert."
-    echo "  -o, --output PATH_TO_DESTINATION  Path to the destination directory where converted files will be stored."
-    echo "  -h, --help                       Display this help message and exit."
-    echo
-    echo "This script converts files in the source directory with the .kreport extension to .MPA.TXT format."
-    echo "It uses KrakenTools' \`kreport2mpa.py\` Python script to perform the conversion."
-    echo "The converted files will be saved in the specified destination directory."
-    exit 0
-}
-
-# Initialize variables
-SOURCE_DIR=""
-DESTINATION_DIR=""
-
-# Parse command-line arguments
-while [[ "$#" -gt 0 ]]; do
-    case "$1" in
-        -i|--input) SOURCE_DIR="$2"; shift 2 ;;
-        -o|--output) DESTINATION_DIR="$2"; shift 2 ;;
-        -h|--help) usage ;;
-        *) echo "Unknown option: $1"; usage ;;
-    esac
-done
-
-# Check if required arguments are provided
-if [ -z "$SOURCE_DIR" ] || [ -z "$DESTINATION_DIR" ]; then
-    echo "Error: Both input and output paths are required."
-    usage
-fi
-
-# Determine the directory of this script
-SCRIPT_DIR=$(dirname "$(realpath "$0")")
-
-# Ensure KrakenTools script exists
-KREPORT_SCRIPT="$SCRIPT_DIR/kreport2mpa.py"
-if [ ! -f "$KREPORT_SCRIPT" ]; then
-    echo "Error: kreport2mpa.py not found in $SCRIPT_DIR"
-    exit 1
-fi
-
-# Create the destination folder if it does not exist
-mkdir -p "${DESTINATION_DIR}"
-
-# Convert files in the source directory
-for file in "${SOURCE_DIR}"/*.*; do
-    # Get the file name without path
-    filename=$(basename "${file}")
-    # Form the command to process the file
-    python "$SCRIPT_DIR/kreport2mpa.py" -r "${file}" -o "${DESTINATION_DIR}/${filename/.kreport/.MPA.TXT}" --display-header
-done
-
-# Check exit status
-if [ $? -ne 0 ]; then
-    echo "Error: Conversion process failed."
-    exit 1
-fi
-
-echo "Converted to MPA successfully. Output stored in $DESTINATION_DIR"
\ No newline at end of file

From 3ccdab86772f0402d5a883790cb0af400a3c47c2 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Tue, 24 Mar 2026 14:08:00 +0300
Subject: [PATCH 03/18] refactor(counts): replace bash decombine scripts with
 Python split_mpa

Merge decombine.sh + decombine_viruses.sh into a single split_mpa.py.
Add --viruses-only and --keep-human flags.
Move convert2csv and processing_script into counts/ subpackage.
---
 krakenparser/convert2csv.py                   |  54 --------
 krakenparser/counts/__init__.py               |   0
 krakenparser/counts/convert2csv.py            |  45 +++++++
 .../{ => counts}/processing_script.py         |  37 +++---
 krakenparser/counts/split_mpa.py              | 112 ++++++++++++++++
 krakenparser/decombine.sh                     | 123 ------------------
 krakenparser/decombine_viruses.sh             | 111 ----------------
 7 files changed, 178 insertions(+), 304 deletions(-)
 delete mode 100755 krakenparser/convert2csv.py
 create mode 100644 krakenparser/counts/__init__.py
 create mode 100755 krakenparser/counts/convert2csv.py
 rename krakenparser/{ => counts}/processing_script.py (66%)
 create mode 100644 krakenparser/counts/split_mpa.py
 delete mode 100755 krakenparser/decombine.sh
 delete mode 100755 krakenparser/decombine_viruses.sh

diff --git a/krakenparser/convert2csv.py b/krakenparser/convert2csv.py
deleted file mode 100755
index 2b05823..0000000
--- a/krakenparser/convert2csv.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python
-
-import shutil
-from pathlib import Path
-import argparse
-import pandas as pd
-
-
-def convert_to_csv(input_file, output_file):
-    # Read the entire file into a DataFrame
-    data = pd.read_csv(input_file, sep="\t", header=None)
-
-    # Set the first row as the header
-    data.columns = data.iloc[0]
-    data = data.drop(data.index[0])
-
-    # Transpose the DataFrame so that sample names become rows and microbial taxa with their abundance become columns
-    data_transposed = data.T
-    data_transposed.columns = data_transposed.iloc[0]
-    data_transposed = data_transposed.drop(data_transposed.index[0])
-
-    # Save the transposed data to a new CSV file
-    data_transposed.to_csv(output_file, index_label="Sample_id")
-    print(f"Data has been successfully converted and saved as '{output_file}'.")
-
-    # Remove pycache if it exists
-    current_dir = Path(__file__).resolve().parent
-    pycache_dir = current_dir / "__pycache__"
-    if pycache_dir.exists() and pycache_dir.is_dir():
-        shutil.rmtree(pycache_dir)
-
-
-if __name__ == "__main__":
-    # Use argparse to handle command-line arguments
-    parser = argparse.ArgumentParser(
-        description="Reads a TXT file, reorganizes the data, and converts it into a CSV file."
-    )
-    parser.add_argument(
-        "-i",
-        "--input",
-        required=True,
-        help="Path to the input TXT file. This file should contain sample names in columns and microbial taxa in rows.",
-    )
-    parser.add_argument(
-        "-o",
-        "--output",
-        required=True,
-        help="Path to the output CSV file. The script will restructure the data and save it here.",
-    )
-
-    args = parser.parse_args()
-
-    # Call function with parsed arguments
-    convert_to_csv(args.input, args.output)
diff --git a/krakenparser/counts/__init__.py b/krakenparser/counts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/krakenparser/counts/convert2csv.py b/krakenparser/counts/convert2csv.py
new file mode 100755
index 0000000..835ffc3
--- /dev/null
+++ b/krakenparser/counts/convert2csv.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+import argparse
+import logging
+from pathlib import Path
+import pandas as pd
+
+_log = logging.getLogger(__name__)
+
+
+def convert_to_csv(input_file, output_file):
+    in_path = Path(input_file)
+    if not in_path.is_file():
+        raise FileNotFoundError(f"Input file not found: {in_path}")
+    out_path = Path(output_file)
+    if not out_path.parent.exists():
+        raise FileNotFoundError(f"Output directory does not exist: {out_path.parent}")
+
+    data = pd.read_csv(in_path, sep="\t", index_col=0)
+    data.T.to_csv(out_path, index_label="Sample_id")
+    _log.info("Data converted and saved as '%s'.", output_file)
+
+
+if __name__ == "__main__":
+    # Use argparse to handle command-line arguments
+    parser = argparse.ArgumentParser(
+        description="Reads a TXT file, reorganizes the data, and converts it into a CSV file."
+    )
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        help="Path to the input TXT file. This file should contain sample names in columns and microbial taxa in rows.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        required=True,
+        help="Path to the output CSV file. The script will restructure the data and save it here.",
+    )
+
+    args = parser.parse_args()
+
+    # Call function with parsed arguments
+    convert_to_csv(args.input, args.output)
diff --git a/krakenparser/processing_script.py b/krakenparser/counts/processing_script.py
similarity index 66%
rename from krakenparser/processing_script.py
rename to krakenparser/counts/processing_script.py
index 911d306..b2b28ff 100755
--- a/krakenparser/processing_script.py
+++ b/krakenparser/counts/processing_script.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 
+import os
 import sys
-import shutil
+import tempfile
 import argparse
 from pathlib import Path
 
@@ -10,41 +11,45 @@ def modify_taxa_names(line):
     prefixes = ["s__", "g__", "f__", "o__", "c__", "p__"]
     for prefix in prefixes:
         if line.startswith(prefix):
-            # Remove the prefix and replace underscores with spaces
-            return line[len(prefix) :].replace("_", " ")
+            parts = line[len(prefix):].split("\t")
+            parts[0] = parts[0].replace("_", " ")
+            return "\t".join(parts)
     return line
 
 
 def process_files(source_file, destination_file):
+    src_path = Path(source_file)
+    if not src_path.is_file():
+        raise FileNotFoundError(f"Source file not found: {src_path}")
+    dest_path = Path(destination_file)
+    if not dest_path.is_file():
+        raise FileNotFoundError(f"Destination file not found: {dest_path}")
+
     # Read the first line from the source file and modify it
-    with open(source_file, "r") as file:
+    with open(src_path, "r") as file:
         first_line_source = file.readline()
     modified_first_line = "\t".join(
         word.split(".")[0] for word in first_line_source.split()
     )
 
     # Read all content from the destination file and modify taxa names
-    with open(destination_file, "r") as file:
+    with open(dest_path, "r") as file:
         lines = file.readlines()
     modified_lines = [modify_taxa_names(line.strip()) for line in lines]
 
     # Combine the modified first line with the modified content of the destination file
     updated_content = modified_first_line + "\n" + "\n".join(modified_lines)
 
-    # Write the updated content back to the destination file
-    with open(destination_file, "w") as file:
-        file.write(updated_content)
+    # Write atomically: write to a temp file in the same directory, then replace
+    with tempfile.NamedTemporaryFile(
+        mode="w", dir=dest_path.parent, delete=False, suffix=".tmp"
+    ) as tmp:
+        tmp.write(updated_content)
+        tmp_path = tmp.name
+    os.replace(tmp_path, dest_path)
 
     print(f"Processed {destination_file} successfully.")
 
-    # Get the path to the current directory (same location as the script)
-    current_dir = Path(__file__).resolve().parent
-    pycache_dir = current_dir / "__pycache__"
-
-    # Check if __pycache__ exists and remove it
-    if pycache_dir.exists() and pycache_dir.is_dir():
-        shutil.rmtree(pycache_dir)
-
 
 if __name__ == "__main__":
     # Use argparse to parse command-line arguments
diff --git a/krakenparser/counts/split_mpa.py b/krakenparser/counts/split_mpa.py
new file mode 100644
index 0000000..72b4e07
--- /dev/null
+++ b/krakenparser/counts/split_mpa.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+"""Split a combined MPA table into per-rank TXT files.
+
+Replaces decombine.sh and decombine_viruses.sh.
+"""
+
+import argparse
+import logging
+import re
+import sys
+from pathlib import Path
+
+_log = logging.getLogger(__name__)
+
+
+_RANKS = [
+    ("species", "s__", []),
+    ("genus",   "g__", ["s__"]),
+    ("family",  "f__", ["s__", "g__"]),
+    ("order",   "o__", ["s__", "g__", "f__"]),
+    ("class",   "c__", ["s__", "g__", "f__", "o__"]),
+    ("phylum",  "p__", ["s__", "g__", "f__", "o__", "c__"]),
+]
+
+_HUMAN_TAXA = {
+    "species": "s__Homo_sapiens",
+    "genus":   "g__Homo",
+    "family":  "f__Hominidae",
+    "order":   "o__Primates",
+    "class":   "c__Mammalia",
+    "phylum":  "p__Chordata",
+}
+
+_ACCESSION_RE = re.compile(r"(SRS|SRR|SRX|ERS|ERR|ERX|DRS|DRR|DRX)\d*-")
+
+
+def _strip_path_prefix(line: str) -> str:
+    """'d__X|p__Y|s__Z\t10\t20' → 's__Z\t10\t20'"""
+    tab = line.find("\t")
+    if tab == -1:
+        return line
+    path, rest = line[:tab], line[tab:]
+    pipe = path.rfind("|")
+    segment = path[pipe + 1:] if pipe != -1 else path
+    return _ACCESSION_RE.sub("", segment + rest)
+
+
+def split_mpa(
+    input_file: str,
+    output_dir: str,
+    viruses_only: bool = False,
+    keep_human: bool = False,
+) -> None:
+    in_path = Path(input_file)
+    if not in_path.is_file():
+        raise FileNotFoundError(f"Input file not found: {in_path}")
+    out_path = Path(output_dir)
+    (out_path / "txt").mkdir(parents=True, exist_ok=True)
+
+    lines = in_path.read_text().splitlines()
+    data_lines = [ln for ln in lines if not ln.startswith("#") and ln.strip()]
+
+    if viruses_only:
+        data_lines = [ln for ln in data_lines if "d__Viruses" in ln]
+
+    filter_human = not keep_human and not viruses_only
+
+    for rank_name, rank_prefix, exclude_prefixes in _RANKS:
+        result = []
+        human_pattern = _HUMAN_TAXA[rank_name]
+
+        for line in data_lines:
+            if rank_prefix not in line:
+                continue
+            if "t__" in line:
+                continue
+            if any(ep in line for ep in exclude_prefixes):
+                continue
+            if filter_human and human_pattern in line:
+                continue
+            result.append(_strip_path_prefix(line))
+
+        out_file = out_path / "txt" / f"counts_{rank_name}.txt"
+        out_file.write_text("\n".join(result) + ("\n" if result else ""))
+
+    _log.info("MPA file split successfully. Output stored in %s", output_dir)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Split a combined MPA table into per-rank TXT files."
+    )
+    parser.add_argument("-i", "--input", required=True, help="Input combined MPA file")
+    parser.add_argument("-o", "--output", required=True, help="Output directory")
+    parser.add_argument(
+        "--viruses-only",
+        action="store_true",
+        default=False,
+        help="Extract only Viruses domain taxa",
+    )
+    parser.add_argument(
+        "--keep-human",
+        action="store_true",
+        default=False,
+        help="Do not filter human-related taxa (default: filtered)",
+    )
+    args = parser.parse_args()
+    split_mpa(args.input, args.output, viruses_only=args.viruses_only, keep_human=args.keep_human)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/krakenparser/decombine.sh b/krakenparser/decombine.sh
deleted file mode 100755
index 0832018..0000000
--- a/krakenparser/decombine.sh
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/bin/bash
-
-# Function to display detailed usage information
-usage() {
-    echo "Usage: $(basename "$0") -i PATH_TO_SOURCE_FILE -o PATH_TO_DESTINATION"
-    echo
-    echo "  -i, --input    PATH_TO_SOURCE_FILE    Path to the Combined MPA input file to be processed."
-    echo "  -o, --output   PATH_TO_DESTINATION    Path to the directory where processed output files will be stored."
-    echo "  -h, --help                           Display this help message and exit."
-    echo
-    echo "Description:"
-    echo "  This script processes a combined mpa file by extracting different taxonomic levels"
-    echo "  (species, genus, family, order, class, and phylum) and saving the results as separate text files."
-    echo "  Additionally, it removes human-related sequences to improve data accuracy."
-    echo
-    echo "Processing Details:"
-    echo "  - Extracts taxonomic levels using 'grep' with specific patterns."
-    echo "  - Filters out undesired taxonomic entries such as:"
-    echo "    - Species: Homo sapiens"
-    echo "    - Genus: Homo"
-    echo "    - Family: Hominidae"
-    echo "    - Order: Primates"
-    echo "    - Class: Mammalia"
-    echo "    - Phylum: Chordata"
-    echo "  - Outputs are stored as text files inside the specified destination directory."
-    exit 0
-}
-
-# Initialize variables
-SOURCE_FILE=""
-DESTINATION_DIR=""
-
-# Parse command-line arguments
-while [[ "$#" -gt 0 ]]; do
-    case "$1" in
-        -i|--input) SOURCE_FILE="$2"; shift 2 ;;
-        -o|--output) DESTINATION_DIR="$2"; shift 2 ;;
-        -h|--help) usage ;;
-        *) echo "Error: Unknown option $1"; usage ;;
-    esac
-done
-
-# Check if required arguments are provided
-if [[ -z "$SOURCE_FILE" || -z "$DESTINATION_DIR" ]]; then
-    echo "Error: Both input (-i) and output (-o) paths are required."
-    usage
-fi
-
-# Check if source file exists
-if [[ ! -f "$SOURCE_FILE" ]]; then
-    echo "Error: Input file '$SOURCE_FILE' not found!"
-    exit 1
-fi
-
-# Create destination directories
-mkdir -p "${DESTINATION_DIR}/txt"
-mkdir -p "${DESTINATION_DIR}/csv"
-
-# Process input file and generate output files
-grep -E "s__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__Homo_sapiens" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_species.txt"
-
-grep -E "g__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__" \
-| grep -v "g__Homo" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_genus.txt"
-
-grep -E "f__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__" \
-| grep -v "g__" \
-| grep -v "f__Hominidae" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_family.txt"
-
-grep -E "o__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__" \
-| grep -v "g__" \
-| grep -v "f__" \
-| grep -v "o__Primates" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_order.txt"
-
-grep -E "c__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__" \
-| grep -v "g__" \
-| grep -v "f__" \
-| grep -v "o__" \
-| grep -v "c__Mammalia" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_class.txt"
-
-grep -E "p__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__" \
-| grep -v "g__" \
-| grep -v "f__" \
-| grep -v "o__" \
-| grep -v "c__" \
-| grep -v "p__Chordata" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_phylum.txt"
-
-# Check for errors
-if [ $? -ne 0 ]; then
-    echo "Error: Failed to run decombine.sh"
-    exit 1
-fi
-
-echo "MPA file decombined successfully. Output stored in $DESTINATION_DIR"
\ No newline at end of file
diff --git a/krakenparser/decombine_viruses.sh b/krakenparser/decombine_viruses.sh
deleted file mode 100755
index 99c8260..0000000
--- a/krakenparser/decombine_viruses.sh
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/bin/bash
-
-# Function to display detailed usage information
-usage() {
-    echo "Usage: $(basename "$0") -i PATH_TO_SOURCE_FILE -o PATH_TO_DESTINATION"
-    echo
-    echo "  -i, --input    PATH_TO_SOURCE_FILE    Path to the Combined MPA input file to be processed."
-    echo "  -o, --output   PATH_TO_DESTINATION    Path to the directory where processed output files will be stored."
-    echo "  -h, --help                           Display this help message and exit."
-    echo
-    echo "Description:"
-    echo "  This script processes a combined mpa file by extracting different taxonomic levels using only VIRUSES domain"
-    echo "  (species, genus, family, order, class, and phylum) and saving the results as separate text files."
-    echo
-    echo "Processing Details:"
-    echo "  - Extracts taxonomic levels only on VIRUSES domain using 'grep' with specific patterns."
-    echo "  - Outputs are stored as text files inside the specified destination directory."
-    exit 0
-}
-
-# Initialize variables
-SOURCE_FILE=""
-DESTINATION_DIR=""
-
-# Parse command-line arguments
-while [[ "$#" -gt 0 ]]; do
-    case "$1" in
-        -i|--input) SOURCE_FILE="$2"; shift 2 ;;
-        -o|--output) DESTINATION_DIR="$2"; shift 2 ;;
-        -h|--help) usage ;;
-        *) echo "Error: Unknown option $1"; usage ;;
-    esac
-done
-
-# Check if required arguments are provided
-if [[ -z "$SOURCE_FILE" || -z "$DESTINATION_DIR" ]]; then
-    echo "Error: Both input (-i) and output (-o) paths are required."
-    usage
-fi
-
-# Check if source file exists
-if [[ ! -f "$SOURCE_FILE" ]]; then
-    echo "Error: Input file '$SOURCE_FILE' not found!"
-    exit 1
-fi
-
-# Create destination directories
-mkdir -p "${DESTINATION_DIR}/txt"
-mkdir -p "${DESTINATION_DIR}/csv"
-
-VIRUSES_BUFFER=$(grep -E "d__Viruses" "${SOURCE_FILE}")
-
-# Process input file and generate output files
-echo "$VIRUSES_BUFFER" | grep -E "s__" \
-    | grep -v "t__" \
-    | sed "s/^.*|//g" \
-    | sed "s/SRS[0-9]*-//g" \
-    > "${DESTINATION_DIR}/txt/counts_species.txt"
-
-echo "$VIRUSES_BUFFER" | grep -E "g__" \
-    | grep -v "t__" \
-    | grep -v "s__" \
-    | sed "s/^.*|//g" \
-    | sed "s/SRS[0-9]*-//g" \
-    > "${DESTINATION_DIR}/txt/counts_genus.txt"
-
-echo "$VIRUSES_BUFFER" | grep -E "f__" \
-    | grep -v "t__" \
-    | grep -v "s__" \
-    | grep -v "g__" \
-    | sed "s/^.*|//g" \
-    | sed "s/SRS[0-9]*-//g" \
-    > "${DESTINATION_DIR}/txt/counts_family.txt"
-
-echo "$VIRUSES_BUFFER" | grep -E "o__" \
-    | grep -v "t__" \
-    | grep -v "s__" \
-    | grep -v "g__" \
-    | grep -v "f__" \
-    | sed "s/^.*|//g" \
-    | sed "s/SRS[0-9]*-//g" \
-    > "${DESTINATION_DIR}/txt/counts_order.txt"
-
-echo "$VIRUSES_BUFFER" | grep -E "c__" \
-    | grep -v "t__" \
-    | grep -v "s__" \
-    | grep -v "g__" \
-    | grep -v "f__" \
-    | grep -v "o__" \
-    | sed "s/^.*|//g" \
-    | sed "s/SRS[0-9]*-//g" \
-    > "${DESTINATION_DIR}/txt/counts_class.txt"
-
-echo "$VIRUSES_BUFFER" | grep -E "p__" \
-    | grep -v "t__" \
-    | grep -v "s__" \
-    | grep -v "g__" \
-    | grep -v "f__" \
-    | grep -v "o__" \
-    | grep -v "c__" \
-    | sed "s/^.*|//g" \
-    | sed "s/SRS[0-9]*-//g" \
-    > "${DESTINATION_DIR}/txt/counts_phylum.txt"
-
-# Check for errors
-if [ $? -ne 0 ]; then
-    echo "Error: Failed to run decombine_viruses.sh"
-    exit 1
-fi
-
-echo "MPA file decombined successfully. Output stored in $DESTINATION_DIR"
\ No newline at end of file

From eb92428cea0e4cd5e530cee523c65e0dbb946543 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Wed, 1 Apr 2026 10:22:00 +0300
Subject: [PATCH 04/18] refactor(stats): move relabund and diversity into
 stats/ subpackage

---
 krakenparser/stats/__init__.py        |  0
 krakenparser/{ => stats}/diversity.py | 19 ++++++---------
 krakenparser/{ => stats}/relabund.py  | 35 ++++++++++++++++++---------
 3 files changed, 32 insertions(+), 22 deletions(-)
 create mode 100644 krakenparser/stats/__init__.py
 rename krakenparser/{ => stats}/diversity.py (88%)
 rename krakenparser/{ => stats}/relabund.py (71%)

diff --git a/krakenparser/stats/__init__.py b/krakenparser/stats/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/krakenparser/diversity.py b/krakenparser/stats/diversity.py
similarity index 88%
rename from krakenparser/diversity.py
rename to krakenparser/stats/diversity.py
index 2fe68f5..da3b336 100644
--- a/krakenparser/diversity.py
+++ b/krakenparser/stats/diversity.py
@@ -3,7 +3,6 @@
 import pandas as pd
 import numpy as np
 import sys
-import shutil
 import argparse
 from pathlib import Path
 from skbio.diversity import beta_diversity
@@ -20,11 +19,11 @@ def shannon_index(counts):
 
 # Define Pielou's evenness
 def pielou_evenness(counts):
-    counts = np.array(counts)
-    counts = counts[counts > 0]
-    H = shannon_index(counts)
-    S = len(counts)
-    return H / np.log(S) if S > 1 else 0
+    counts = np.asarray(counts)
+    S = int(np.sum(counts > 0))
+    if S <= 1:
+        return np.nan
+    return shannon_index(counts) / np.log(S)
 
 
 # Define Chao1 richness estimator
@@ -59,7 +58,7 @@ def calc_beta_div(df, output_path, rarefaction_depth):
     sample_ids = []
 
     for sample, row in df.iterrows():
-        counts = row.values.astype(int)
+        counts = np.round(row.values).astype(int)
         if counts.sum() >= rarefaction_depth:
             rarefied = subsample_counts(counts, n=rarefaction_depth)
             rarefied_counts.append(rarefied)
@@ -98,6 +97,8 @@ def calc_beta_div(df, output_path, rarefaction_depth):
     args = parser.parse_args()
 
     input_file = Path(args.input)
+    if not input_file.is_file():
+        sys.exit(f"Error: input file not found: {input_file}")
     output_dir = Path(args.output)
     output_dir.mkdir(parents=True, exist_ok=True)
 
@@ -108,7 +109,3 @@ def calc_beta_div(df, output_path, rarefaction_depth):
     print(
         f"α & β-diversities have been successfully calculated and saved to '{output_dir}'."
     )
-
-    pycache_dir = Path(__file__).resolve().parent / "__pycache__"
-    if pycache_dir.exists() and pycache_dir.is_dir():
-        shutil.rmtree(pycache_dir)
diff --git a/krakenparser/relabund.py b/krakenparser/stats/relabund.py
similarity index 71%
rename from krakenparser/relabund.py
rename to krakenparser/stats/relabund.py
index d7a7143..171f280 100644
--- a/krakenparser/relabund.py
+++ b/krakenparser/stats/relabund.py
@@ -1,14 +1,24 @@
 #!/usr/bin/env python
 
-import shutil
-from pathlib import Path
+import logging
+import warnings
 import argparse
+from pathlib import Path
 import pandas as pd
 
+_log = logging.getLogger(__name__)
+
 
 def calculate_rel_abund(input_file, output_file, other_threshold=None):
+    in_path = Path(input_file)
+    if not in_path.is_file():
+        raise FileNotFoundError(f"Input file not found: {in_path}")
+    out_path = Path(output_file)
+    if not out_path.parent.exists():
+        raise FileNotFoundError(f"Output directory does not exist: {out_path.parent}")
+
     # Load counts table
-    df = pd.read_csv(input_file)
+    df = pd.read_csv(in_path)
 
     # Reshape to long format: Sample_id, taxon, abundance
     long_df = df.melt(id_vars=["Sample_id"], var_name="taxon", value_name="abundance")
@@ -16,6 +26,15 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None):
     # Summarize total abundance per sample (used for percentage calculation)
     total_abundance = long_df.groupby("Sample_id")["abundance"].transform("sum")
 
+    zero_samples = long_df.groupby("Sample_id")["abundance"].sum()
+    zero_samples = zero_samples[zero_samples == 0].index.tolist()
+    if zero_samples:
+        warnings.warn(
+            f"Samples with zero total abundance were excluded from output: {zero_samples}",
+            UserWarning,
+            stacklevel=2,
+        )
+
     # Calculate relative abundance (%)
     long_df["rel_abund_perc"] = (long_df["abundance"] / total_abundance) * 100
 
@@ -40,16 +59,10 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None):
 
     # Save to CSV
     result.to_csv(output_file, index=False)
-    print(
-        f"Relative abundance has been successfully calculated and saved as '{output_file}'."
+    _log.info(
+        "Relative abundance saved as '%s'.", output_file
     )
 
-    # Remove __pycache__
-    current_dir = Path(__file__).resolve().parent
-    pycache_dir = current_dir / "__pycache__"
-    if pycache_dir.exists():
-        shutil.rmtree(pycache_dir)
-
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(

From 58c1d8a82c8970e22609f90b3646a8f06b5754df Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Sun, 5 Apr 2026 16:55:00 +0300
Subject: [PATCH 05/18] feat(pipeline): port full pipeline orchestration from
 bash to Python

Replace kraken2csv.sh with pipeline.py. Add run_pipeline() as a
callable library function. Add -o/--output and --overwrite flags.
Skip binary and hidden files automatically (_is_processable check).
---
 krakenparser/kraken2csv.sh |  90 ---------------------
 krakenparser/pipeline.py   | 159 +++++++++++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+), 90 deletions(-)
 delete mode 100755 krakenparser/kraken2csv.sh
 create mode 100644 krakenparser/pipeline.py

diff --git a/krakenparser/kraken2csv.sh b/krakenparser/kraken2csv.sh
deleted file mode 100755
index 5b53f75..0000000
--- a/krakenparser/kraken2csv.sh
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/bin/bash
-
-# Function to display usage
-usage() {
-    echo "Usage: $(basename "$0") -i|--input PATH_TO_SOURCE"
-    echo
-    echo "  -i, --input   Path to the source directory — Kraken2 reports must be inside a subdirectory (e.g., data/kreports)"
-    echo "  -h            Show this help message"
-    exit 0
-}
-
-# Parse command-line arguments
-SOURCE_DIR=""
-
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        -i|--input) SOURCE_DIR="$2"; shift 2 ;;
-        -h|--help) usage ;;
-        *) echo "Unknown option: $1"; usage ;;
-    esac
-done
-
-# Ensure SOURCE_DIR is set
-if [ -z "$SOURCE_DIR" ]; then
-    echo "Error: input is required."
-    usage
-fi
-
-# Determine the directory of this script
-SCRIPT_DIR=$(dirname "$(realpath "$0")")
-
-# PART 1: CONVERT KRAKEN2 TO MPA
-
-# Setting the path to the source file directory and destination directory
-PARENT_DIR=$(dirname "$SOURCE_DIR")
-MPA_DIR="$PARENT_DIR/mpa"
-
-# Run the old script with the correct paths
-"$SCRIPT_DIR/run_kreport2mpa.sh" -i "$SOURCE_DIR" -o "$MPA_DIR"
-
-# PART 2: COMBINING MPAs
-
-COMBINED_FILE="$PARENT_DIR/COMBINED.txt"
-python "$SCRIPT_DIR/combine_mpa.py" -i "$MPA_DIR"/* -o "$COMBINED_FILE"
-if [ $? -ne 0 ]; then
-    echo "Error: Failed to run combine_mpa.py"
-    exit 1
-fi
-echo "MPA files combined successfully. Output stored in $COMBINED_FILE"
-
-# PART 3: DECOMBINING MPAs
-
-COUNTS_DIR="$PARENT_DIR/counts"
-
-"$SCRIPT_DIR/decombine.sh" -i "$COMBINED_FILE" -o "$COUNTS_DIR"
-
-# PART 4: PROCESS COUNTS TXT FILES
-
-for file in "$COUNTS_DIR"/txt/counts_*.txt; do
-    python "$SCRIPT_DIR/processing_script.py" -i "$COMBINED_FILE" -o "$file"
-    if [ $? -ne 0 ]; then
-        echo "Error: Failed to process $file"
-        exit 1
-    fi
-done
-
-# PART 5: CONVERT TXT FILES TO CSV
-
-for file in "$COUNTS_DIR"/txt/counts_*.txt; do
-    CSV_FILE="$COUNTS_DIR/csv/$(basename "$file" .txt).csv"
-    python "$SCRIPT_DIR/convert2csv.py" -i "$file" -o "$CSV_FILE"
-done
-
-# PART 6: CALCULATE RELATIVE ABUNDANCE
-
-mkdir -p "$PARENT_DIR/rel_abund"
-
-for file in "$COUNTS_DIR"/csv/counts_*.csv; do
-    base=$(basename "$file")
-    new_name="${base/counts_/ra_}"
-    CSV_RA_FILE="$PARENT_DIR/rel_abund/$new_name"
-    python "$SCRIPT_DIR/relabund.py" -i "$file" -o "$CSV_RA_FILE"
-done
-
-# PART 7: CALCULATE α & β-DIVERSITIES
-
-CSV_SPECIES_FILE="$COUNTS_DIR/csv/counts_species.csv"
-python "$SCRIPT_DIR/diversity.py" -i "$CSV_SPECIES_FILE" -o "$PARENT_DIR/diversity"
-
-echo "All steps completed successfully!"
\ No newline at end of file
diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py
new file mode 100644
index 0000000..f0e29c6
--- /dev/null
+++ b/krakenparser/pipeline.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+"""Full KrakenParser pipeline: kreport → MPA → combined → counts → rel_abund → diversity."""
+
+import argparse
+import logging
+import shutil
+import sys
+from pathlib import Path
+
+_log = logging.getLogger(__name__)
+
+import pandas as pd
+
+from krakenparser.mpa.transform2mpa import kreport_to_mpa
+from krakenparser.mpa.mpa_table import combine_mpa
+from krakenparser.counts.split_mpa import split_mpa
+from krakenparser.counts.processing_script import process_files
+from krakenparser.counts.convert2csv import convert_to_csv
+from krakenparser.stats.relabund import calculate_rel_abund
+from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div
+
+
+def _is_processable(path: Path) -> bool:
+    """Return False for hidden files, files with null bytes, or non-UTF-8 files."""
+    if path.name.startswith("."):
+        return False
+    try:
+        chunk = path.read_bytes()[:1024]
+        if b"\x00" in chunk:
+            return False
+        chunk.decode("utf-8")
+        return True
+    except (UnicodeDecodeError, OSError):
+        return False
+
+
+_OUTPUT_SUBDIRS = ("intermediate", "counts", "rel_abund", "diversity")
+
+
+def run_pipeline(
+    input_dir: str,
+    output_dir: str | None = None,
+    keep_human: bool = False,
+    rarefaction_depth: int = 1000,
+    overwrite: bool = False,
+) -> None:
+    source_dir = Path(input_dir)
+    if not source_dir.is_dir():
+        sys.exit(f"Error: input directory not found: {source_dir}")
+
+    out_dir = Path(output_dir) if output_dir else source_dir.parent
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    existing = [out_dir / d for d in _OUTPUT_SUBDIRS if (out_dir / d).exists()]
+    if existing and not overwrite:
+        names = ", ".join(d.name for d in existing)
+        sys.exit(
+            f"Error: output already exists in '{out_dir}' ({names}).\n"
+            "Use --overwrite to overwrite it."
+        )
+    if overwrite:
+        for d in existing:
+            shutil.rmtree(d)
+            _log.info("Removed existing directory: %s", d)
+
+    intermediate_dir = out_dir / "intermediate"
+    intermediate_dir.mkdir(exist_ok=True)
+
+    # Part 1: kreport → MPA
+    mpa_dir = intermediate_dir / "mpa"
+    mpa_dir.mkdir(exist_ok=True)
+    for f in sorted(source_dir.iterdir()):
+        if not f.is_file():
+            continue
+        if not _is_processable(f):
+            _log.info("Skipping: %s", f.name)
+            continue
+        out_name = f.stem + ".MPA.TXT"
+        kreport_to_mpa(str(f), str(mpa_dir / out_name), display_header=True)
+
+    # Part 2: combine MPAs
+    mpa_files = sorted(mpa_dir.glob("*.MPA.TXT"))
+    if not mpa_files:
+        sys.exit("Error: no MPA files found after conversion.")
+    combined_file = intermediate_dir / "COMBINED.txt"
+    combine_mpa([str(f) for f in mpa_files], str(combined_file))
+    _log.info("MPA files combined. Output: %s", combined_file)
+
+    # Part 3: split combined MPA by rank
+    split_mpa(str(combined_file), str(intermediate_dir), keep_human=keep_human)
+    txt_dir = intermediate_dir / "txt"
+
+    # Part 4: clean taxa names and add sample header
+    for txt_file in sorted(txt_dir.glob("counts_*.txt")):
+        process_files(str(combined_file), str(txt_file))
+
+    # Part 5: TXT → CSV
+    counts_dir = out_dir / "counts"
+    counts_dir.mkdir(exist_ok=True)
+    for txt_file in sorted(txt_dir.glob("counts_*.txt")):
+        csv_file = counts_dir / txt_file.with_suffix(".csv").name
+        convert_to_csv(str(txt_file), str(csv_file))
+
+    # Part 6: relative abundance
+    rel_abund_dir = out_dir / "rel_abund"
+    rel_abund_dir.mkdir(exist_ok=True)
+    for csv_file in sorted(counts_dir.glob("counts_*.csv")):
+        ra_file = rel_abund_dir / csv_file.name.replace("counts_", "ra_")
+        calculate_rel_abund(str(csv_file), str(ra_file))
+
+    # Part 7: α & β-diversities
+    species_csv = counts_dir / "counts_species.csv"
+    if not species_csv.exists():
+        sys.exit(f"Error: species counts not found: {species_csv}")
+    diversity_dir = out_dir / "diversity"
+    diversity_dir.mkdir(exist_ok=True)
+    df = pd.read_csv(species_csv, index_col=0)
+    calc_alpha_div(df, diversity_dir)
+    calc_beta_div(df, diversity_dir, rarefaction_depth=rarefaction_depth)
+
+    _log.info("All steps completed successfully!")
+
+
+def main() -> None:
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+    parser = argparse.ArgumentParser(
+        description="Run the full KrakenParser pipeline."
+    )
+    parser.add_argument(
+        "-i", "--input", required=True,
+        help="Directory containing Kraken2 report files",
+    )
+    parser.add_argument(
+        "-o", "--output", default=None,
+        help="Output directory (default: parent of input)",
+    )
+    parser.add_argument(
+        "--keep-human", action="store_true", default=False,
+        help="Do not filter human-related taxa (default: filtered)",
+    )
+    parser.add_argument(
+        "-d", "--depth", type=int, default=1000,
+        help="Rarefaction depth for β-diversity (default: 1000)",
+    )
+    parser.add_argument(
+        "--overwrite", action="store_true", default=False,
+        help="Overwrite the output directory if it already exists",
+    )
+    args = parser.parse_args()
+    run_pipeline(
+        args.input, args.output,
+        keep_human=args.keep_human,
+        rarefaction_depth=args.depth,
+        overwrite=args.overwrite,
+    )
+
+
+if __name__ == "__main__":
+    main()

From 21047684ff3c57baf2be6fbc88400e555c6d2090 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Fri, 10 Apr 2026 13:30:00 +0300
Subject: [PATCH 06/18] feat(kpplot): add aggregate_by_metadata, fix grid axis,
 add cmap validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add aggregate_by_metadata() to base for metadata grouping
- Fix streamgraph grid axis (x→y)
- Raise ValueError when cmap list is shorter than taxa count
- Add missing-sample validation on sample_order
---
 krakenparser/kpplot/base.py        | 25 ++++++++++++++++++++
 krakenparser/kpplot/clustermap.py  | 28 +++-------------------
 krakenparser/kpplot/stackedbar.py  | 34 ++++++++-------------------
 krakenparser/kpplot/streamgraph.py | 37 +++++++++---------------------
 4 files changed, 49 insertions(+), 75 deletions(-)

diff --git a/krakenparser/kpplot/base.py b/krakenparser/kpplot/base.py
index 97f5447..2df3f0c 100644
--- a/krakenparser/kpplot/base.py
+++ b/krakenparser/kpplot/base.py
@@ -1,4 +1,5 @@
 import matplotlib.pyplot as plt
+import pandas as pd
 from typing import Optional
 
 
@@ -31,3 +32,27 @@ def savefig(
         self.fig.savefig(
             path, dpi=dpi, transparent=transparent, bbox_inches=bbox_inches
         )
+
+
+def aggregate_by_metadata(
+    df: pd.DataFrame,
+    metadata: pd.DataFrame,
+    metadata_group: str,
+) -> pd.DataFrame:
+    """Merge df with metadata and re-normalise rel_abund_perc per group."""
+    if "Sample_id" not in metadata.columns:
+        raise ValueError("metadata must contain 'Sample_id' column")
+    if metadata_group not in metadata.columns:
+        raise ValueError(f"'{metadata_group}' column not found in metadata")
+    df = df.merge(
+        metadata[["Sample_id", metadata_group]], on="Sample_id", how="left"
+    )
+    df = (
+        df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"]
+        .mean()
+        .rename(columns={metadata_group: "Sample_id"})
+    )
+    df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform(
+        lambda x: (x / x.sum()) * 100
+    )
+    return df
diff --git a/krakenparser/kpplot/clustermap.py b/krakenparser/kpplot/clustermap.py
index 1a18ef8..f0ad5bb 100644
--- a/krakenparser/kpplot/clustermap.py
+++ b/krakenparser/kpplot/clustermap.py
@@ -2,9 +2,7 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
 from typing import Optional, Tuple, Union, List
-from pathlib import Path
-import shutil
-from .base import KpPlotBase
+from .base import KpPlotBase, aggregate_by_metadata
 
 
 class KpClustermap(KpPlotBase):
@@ -81,28 +79,13 @@ def clustermap(
     - background_color: Background color of the figure.
 
     Returns:
-    - KpStackedBarplot: An object containing the clustermap figure and axis for customization or saving.
+    - KpClustermap: An object containing the clustermap figure and axis for customization or saving.
     """
 
     df = df.copy()
 
     if metadata is not None and metadata_group is not None:
-        if "Sample_id" not in metadata.columns:
-            raise ValueError("metadata must contain 'Sample_id' column")
-        if metadata_group not in metadata.columns:
-            raise ValueError(f"'{metadata_group}' column not found in metadata")
-
-        df = df.merge(
-            metadata[["Sample_id", metadata_group]], on="Sample_id", how="left"
-        )
-        df = (
-            df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"]
-            .mean()
-            .rename(columns={metadata_group: "Sample_id"})
-        )
-        df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform(
-            lambda x: (x / x.sum()) * 100
-        )
+        df = aggregate_by_metadata(df, metadata, metadata_group)
 
     if df[y_axis].dtype == object:
         other_mask = df[y_axis].str.startswith("Other")
@@ -187,10 +170,5 @@ def clustermap(
         style=yticks_style,
     )
 
-    current_dir = Path(__file__).resolve().parent
-    pycache_dir = current_dir / "__pycache__"
-    if pycache_dir.exists() and pycache_dir.is_dir():
-        shutil.rmtree(pycache_dir)
-
     plt.close(fig)
     return KpClustermap(fig, ax)
diff --git a/krakenparser/kpplot/stackedbar.py b/krakenparser/kpplot/stackedbar.py
index 44f3308..8140437 100644
--- a/krakenparser/kpplot/stackedbar.py
+++ b/krakenparser/kpplot/stackedbar.py
@@ -3,9 +3,7 @@
 import seaborn as sns
 import numpy as np
 from typing import Optional, Tuple, Union, List
-from pathlib import Path
-import shutil
-from .base import KpPlotBase
+from .base import KpPlotBase, aggregate_by_metadata
 
 
 class KpStackedBarplot(KpPlotBase):
@@ -89,24 +87,12 @@ def stacked_barplot(
     df = df.copy()
 
     if metadata is not None and metadata_group is not None:
-        if "Sample_id" not in metadata.columns:
-            raise ValueError("metadata must contain 'Sample_id' column")
-        if metadata_group not in metadata.columns:
-            raise ValueError(f"'{metadata_group}' column not found in metadata")
-
-        df = df.merge(
-            metadata[["Sample_id", metadata_group]], on="Sample_id", how="left"
-        )
-        df = (
-            df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"]
-            .mean()
-            .rename(columns={metadata_group: "Sample_id"})
-        )
-        df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform(
-            lambda x: (x / x.sum()) * 100
-        )
+        df = aggregate_by_metadata(df, metadata, metadata_group)
 
     if sample_order is not None:
+        missing = set(sample_order) - set(df["Sample_id"].unique())
+        if missing:
+            raise ValueError(f"Samples missing from data: {missing}")
         df = df[df["Sample_id"].isin(sample_order)]
         df["Sample_id"] = pd.Categorical(
             df["Sample_id"], categories=sample_order, ordered=True
@@ -127,6 +113,11 @@ def stacked_barplot(
             zip(df_plot.columns, sns.color_palette(cmap, n_colors=len(df_plot.columns)))
         )
     elif isinstance(cmap, list):
+        if len(cmap) < len(df_plot.columns):
+            raise ValueError(
+                f"cmap has {len(cmap)} colors but the data has {len(df_plot.columns)} "
+                "taxa; provide at least as many colors as taxa."
+            )
         color_dict = dict(zip(df_plot.columns, cmap))
     else:
         raise ValueError("cmap must be a str or a list of colors")
@@ -199,10 +190,5 @@ def stacked_barplot(
 
     ax.set_xlim(-0.5, len(df_plot.index) - 0.5)
 
-    current_dir = Path(__file__).resolve().parent
-    pycache_dir = current_dir / "__pycache__"
-    if pycache_dir.exists() and pycache_dir.is_dir():
-        shutil.rmtree(pycache_dir)
-
     plt.close(fig)
     return KpStackedBarplot(fig, ax)
diff --git a/krakenparser/kpplot/streamgraph.py b/krakenparser/kpplot/streamgraph.py
index ac82e46..6c389a2 100644
--- a/krakenparser/kpplot/streamgraph.py
+++ b/krakenparser/kpplot/streamgraph.py
@@ -3,9 +3,7 @@
 import seaborn as sns
 import numpy as np
 from typing import Optional, Tuple, Union, List
-from pathlib import Path
-import shutil
-from .base import KpPlotBase
+from .base import KpPlotBase, aggregate_by_metadata
 
 
 class KpStreamgraph(KpPlotBase):
@@ -90,24 +88,12 @@ def streamgraph(
     df = df.copy()
 
     if metadata is not None and metadata_group is not None:
-        if "Sample_id" not in metadata.columns:
-            raise ValueError("metadata must contain 'Sample_id' column")
-        if metadata_group not in metadata.columns:
-            raise ValueError(f"'{metadata_group}' column not found in metadata")
-
-        df = df.merge(
-            metadata[["Sample_id", metadata_group]], on="Sample_id", how="left"
-        )
-        df = (
-            df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"]
-            .mean()
-            .rename(columns={metadata_group: "Sample_id"})
-        )
-        df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform(
-            lambda x: (x / x.sum()) * 100
-        )
+        df = aggregate_by_metadata(df, metadata, metadata_group)
 
     if sample_order is not None:
+        missing = set(sample_order) - set(df["Sample_id"].unique())
+        if missing:
+            raise ValueError(f"Samples missing from data: {missing}")
         df = df[df["Sample_id"].isin(sample_order)]
         df["Sample_id"] = pd.Categorical(
             df["Sample_id"], categories=sample_order, ordered=True
@@ -128,6 +114,11 @@ def streamgraph(
             zip(df_plot.columns, sns.color_palette(cmap, n_colors=len(df_plot.columns)))
         )
     elif isinstance(cmap, list):
+        if len(cmap) < len(df_plot.columns):
+            raise ValueError(
+                f"cmap has {len(cmap)} colors but the data has {len(df_plot.columns)} "
+                "taxa; provide at least as many colors as taxa."
+            )
         color_dict = dict(zip(df_plot.columns, cmap))
     else:
         raise ValueError("cmap must be a str or a list of colors")
@@ -190,7 +181,7 @@ def streamgraph(
             text.set_fontstyle(legend_fontstyle)
 
     if grid:
-        ax.grid(axis="x", linestyle=grid_linestyle, alpha=grid_alpha, zorder=0)
+        ax.grid(axis="y", linestyle=grid_linestyle, alpha=grid_alpha, zorder=0)
 
     labels = df_plot.index.tolist()
     plt.xticks(
@@ -206,12 +197,6 @@ def streamgraph(
 
     ax.set_xlim(-0.5, len(df_plot.index) - 0.5)
 
-    current_dir = Path(__file__).resolve().parent
-    pycache_dir = current_dir / "__pycache__"
-
-    if pycache_dir.exists() and pycache_dir.is_dir():
-        shutil.rmtree(pycache_dir)
-
     plt.close(fig)
 
     return KpStreamgraph(fig, ax)

From ee330e85739a7dcd36162da448ea862d77c5e256 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Wed, 15 Apr 2026 11:17:00 +0300
Subject: [PATCH 07/18] refactor(cli): use importlib.metadata for version, run
 scripts as -m modules
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove version.py — version now sourced from package metadata.
Dispatcher uses python -m package.module to preserve import context.
Default to full pipeline when -i is given without a subcommand.
---
 krakenparser/krakenparser.py | 63 +++++++++++++++++++++++-------------
 krakenparser/version.py      |  1 -
 2 files changed, 40 insertions(+), 24 deletions(-)
 delete mode 100755 krakenparser/version.py

diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py
index 5eb9c71..6a560ce 100755
--- a/krakenparser/krakenparser.py
+++ b/krakenparser/krakenparser.py
@@ -1,13 +1,18 @@
 import argparse
+import logging
 import subprocess
-import shutil
 from pathlib import Path
 import sys
-from .version import __version__
+from importlib.metadata import version as _pkg_version, PackageNotFoundError as _PNF
+try:
+    __version__ = _pkg_version("krakenparser")
+except _PNF:
+    __version__ = "unknown"
 
 
 # Main function to run the tool
 def main():
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
     print("KrakenParser by Ilia V. Popov")
     # Set up argument parser
     parser = argparse.ArgumentParser(
@@ -17,7 +22,7 @@ def main():
     parser.add_argument(
         "--complete",
         action="store_true",
-        help="Run the full pipeline automated",
+        help="Run the full pipeline (also the default when -i is given)",
     )
     parser.add_argument(
         "--kreport2mpa",
@@ -65,19 +70,23 @@ def main():
 
     args, extra_args = parser.parse_known_args()
 
+    for _a in extra_args:
+        if "\x00" in _a:
+            sys.exit("Error: argument contains invalid null byte.")
+
     package_dir = Path(__file__).resolve().parent  # Directory of the current script
 
-    # Map flags to corresponding scripts
+    # Map flags to (script_path, base_args_to_prepend)
     command_map = {
-        "complete": package_dir / "kraken2csv.sh",
-        "kreport2mpa": package_dir / "run_kreport2mpa.sh",
-        "combine_mpa": package_dir / "combine_mpa.py",
-        "deconstruct": package_dir / "decombine.sh",
-        "deconstruct_viruses": package_dir / "decombine_viruses.sh",
-        "process": package_dir / "processing_script.py",
-        "txt2csv": package_dir / "convert2csv.py",
-        "relabund": package_dir / "relabund.py",
-        "diversity": package_dir / "diversity.py",
+        "complete":           (package_dir / "pipeline.py",                      []),
+        "kreport2mpa":        (package_dir / "mpa" / "transform2mpa.py",         []),
+        "combine_mpa":        (package_dir / "mpa" / "mpa_table.py",             []),
+        "deconstruct":        (package_dir / "counts" / "split_mpa.py",          []),
+        "deconstruct_viruses":(package_dir / "counts" / "split_mpa.py",          ["--viruses-only"]),
+        "process":            (package_dir / "counts" / "processing_script.py",  []),
+        "txt2csv":            (package_dir / "counts" / "convert2csv.py",        []),
+        "relabund":           (package_dir / "stats" / "relabund.py",            []),
+        "diversity":          (package_dir / "stats" / "diversity.py",           []),
     }
 
     if "-h" in sys.argv or "--help" in sys.argv:
@@ -85,21 +94,29 @@ def main():
             parser.print_help()
             return
 
+    def _build_cmd(script: Path, base_args: list[str], user_args: list[str]) -> list[str]:
+        if script.suffix == ".py":
+            # Run as module (-m) so the krakenparser package stays importable.
+            # Derive dotted module name from path relative to the package root.
+            module = ".".join(
+                script.relative_to(package_dir.parent).with_suffix("").parts
+            )
+            return [sys.executable, "-m", module] + base_args + user_args
+        return [str(script)] + base_args + user_args
+
     # Find which argument was given and run the corresponding script
-    for arg, script in command_map.items():
+    for arg, (script, base_args) in command_map.items():
         if getattr(args, arg):
-            subprocess.run(
-                [script] + extra_args, check=True
-            )  # Pass extra arguments to the script
+            subprocess.run(_build_cmd(script, base_args, extra_args), check=True)
             return
 
-    parser.print_help()
+    # Default to full pipeline when -i/--input is given without a subcommand
+    if "-i" in extra_args or "--input" in extra_args:
+        complete_script, complete_base = command_map["complete"]
+        subprocess.run(_build_cmd(complete_script, complete_base, extra_args), check=True)
+        return
 
-    pycache_dir = package_dir / "__pycache__"
-
-    # Check if __pycache__ exists and remove it
-    if pycache_dir.exists() and pycache_dir.is_dir():
-        shutil.rmtree(pycache_dir)
+    parser.print_help()
 
 
 if __name__ == "__main__":
diff --git a/krakenparser/version.py b/krakenparser/version.py
deleted file mode 100755
index 43c4ab0..0000000
--- a/krakenparser/version.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = "0.6.1"

From 7dd691497e9947a582f6441832ef4f0be0bbc3eb Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Wed, 22 Apr 2026 09:44:00 +0300
Subject: [PATCH 08/18] =?UTF-8?q?ci:=20rename=20workflow=20to=20python-pac?=
 =?UTF-8?q?kage.yml,=20expand=20matrix=20to=203.10=E2=80=933.14?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add fail-fast: false, flake8 lint step, --no-build-isolation,
branch-scoped triggers (dev, main), codecov-action@v5.
---
 .github/workflows/ci.yml             | 23 --------------
 .github/workflows/python-package.yml | 46 ++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 23 deletions(-)
 delete mode 100644 .github/workflows/ci.yml
 create mode 100644 .github/workflows/python-package.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
deleted file mode 100644
index 9a19b2a..0000000
--- a/.github/workflows/ci.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: pytest
-on:
-  push:
-    branches: [ dev ]
-  pull_request:
-    branches: [ dev ]
-
-jobs:
-  build-test:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: [3.12]
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-      - run: |
-          python -m pip install --upgrade pip
-          pip install krakenparser
-          pip install pytest
-      - run: pytest -q
\ No newline at end of file
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 0000000..553faa6
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,46 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python package
+
+on:
+  push:
+    branches: [ "dev", "main" ]
+  pull_request:
+    branches: [ "dev", "main" ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install flake8 setuptools wheel
+        pip install -e ".[dev]" --no-build-isolation
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest --cov=krakenparser --cov-report=xml
+    - name: Upload coverage to Codecov
+      if: matrix.python-version == '3.12'
+      uses: codecov/codecov-action@v5
+      with:
+        files: coverage.xml
+        token: ${{ secrets.CODECOV_TOKEN }}

From c37f5cb7857bb6635dda0d69698c1d51b8622384 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Sun, 26 Apr 2026 15:03:00 +0300
Subject: [PATCH 09/18] ci: add publish.yml with Trusted Publishing for PyPI

Triggers on v* tags. Strips GitHub-specific markup from README
before build to ensure clean PyPI rendering.
---
 .github/workflows/publish.yml | 53 +++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 .github/workflows/publish.yml

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..3763435
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,53 @@
+name: Publish to PyPI
+
+on:
+  push:
+    tags:
+      - "v*"
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install build tools
+        run: pip install build twine
+      - name: Strip GitHub-specific markup from README
+        run: |
+          python - <<'EOF'
+          import re, pathlib
+          text = pathlib.Path("README.md").read_text()
+          text = re.sub(
+              r'<img src="[^"]*#gh-(?:light|dark)-mode-only"[^\n]*/>\n',
+              '', text,
+          )
+          text = re.sub(r'\[!\[Downloads\]\([^\)]+\)\]\([^\)]+\)\n', '', text)
+          pathlib.Path("README.md").write_text(text)
+          EOF
+      - name: Build package
+        run: python -m build
+      - name: Check distribution
+        run: twine check dist/*
+      - name: Store distribution packages
+        uses: actions/upload-artifact@v4
+        with:
+          name: dist
+          path: dist/
+
+  publish:
+    needs: build
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    runs-on: ubuntu-latest
+    environment: release
+    permissions:
+      id-token: write
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          name: dist
+          path: dist/
+      - uses: pypa/gh-action-pypi-publish@release/v1

From 6a5c80f0584383fdf7580f192b86c1fdb5feb34f Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Fri, 1 May 2026 10:58:00 +0300
Subject: [PATCH 10/18] test: add unit and integration test suite (62 tests)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unit tests: _parse_line, shannon/pielou/chao1 indices, modify_taxa_names.
Integration: kreport_to_mpa, convert_to_csv, relabund, alpha/beta div,
split_mpa — all with reproducibility (SHA-256) checks.
---
 tests/conftest.py         |  66 +++++++
 tests/test_integration.py | 356 ++++++++++++++++++++++++++++++++++++++
 tests/test_units.py       | 146 ++++++++++++++++
 3 files changed, 568 insertions(+)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/test_integration.py
 create mode 100644 tests/test_units.py

diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..8100a3e
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,66 @@
+import matplotlib
+matplotlib.use("Agg")
+
+import pandas as pd
+import pytest
+
+
+SAMPLE_KREPORT = (
+    "99.98\t999980\t0\tR\t1\troot\n"
+    "99.98\t999980\t0\tD\t2\t  Bacteria\n"
+    "85.00\t850000\t0\tP\t1224\t    Pseudomonadota\n"
+    "50.00\t500000\t200000\tG\t286\t      Pseudomonas\n"
+    "30.00\t300000\t300000\tS\t287\t        Pseudomonas aeruginosa\n"
+    "20.00\t200000\t200000\tG\t561\t      Escherichia\n"
+    "20.00\t200000\t200000\tS\t562\t        Escherichia coli\n"
+    "10.00\t100000\t0\tP\t976\t    Bacteroidota\n"
+    "10.00\t100000\t100000\tS\t817\t      Bacteroides fragilis\n"
+    "0.02\t200\t200\tU\t0\tunclassified\n"
+)
+
+# Tab-delimited TXT as produced by processing_script.py
+SAMPLE_COUNTS_TXT = (
+    "#Classification\tsample1\tsample2\n"
+    "Pseudomonas aeruginosa\t300000\t100000\n"
+    "Escherichia coli\t200000\t50000\n"
+    "Bacteroides fragilis\t100000\t200000\n"
+)
+
+
+@pytest.fixture
+def kreport_file(tmp_path):
+    f = tmp_path / "sample.kreport"
+    f.write_text(SAMPLE_KREPORT)
+    return f
+
+
+@pytest.fixture
+def counts_txt_file(tmp_path):
+    f = tmp_path / "counts_species.txt"
+    f.write_text(SAMPLE_COUNTS_TXT)
+    return f
+
+
+@pytest.fixture
+def counts_csv_file(tmp_path):
+    df = pd.DataFrame({
+        "Sample_id": ["S1", "S2"],
+        "Pseudomonas aeruginosa": [300000, 100000],
+        "Escherichia coli": [200000, 50000],
+        "Bacteroides fragilis": [100000, 200000],
+    })
+    f = tmp_path / "counts_species.csv"
+    df.to_csv(f, index=False)
+    return f
+
+
+@pytest.fixture
+def relabund_df():
+    return pd.DataFrame({
+        "Sample_id": ["S1", "S1", "S1", "S2", "S2", "S2"],
+        "taxon": [
+            "Pseudomonadota", "Bacillota", "Other (<4.0%)",
+            "Pseudomonadota", "Bacillota", "Other (<4.0%)",
+        ],
+        "rel_abund_perc": [70.0, 20.0, 10.0, 50.0, 35.0, 15.0],
+    })
diff --git a/tests/test_integration.py b/tests/test_integration.py
new file mode 100644
index 0000000..7fed2fb
--- /dev/null
+++ b/tests/test_integration.py
@@ -0,0 +1,356 @@
+"""Characterization / integration tests — file I/O, reproducibility."""
+
+import hashlib
+import itertools
+
+import pandas as pd
+import pytest
+
+from krakenparser.mpa.transform2mpa import kreport_to_mpa
+from krakenparser.counts.convert2csv import convert_to_csv
+from krakenparser.counts.processing_script import process_files
+from krakenparser.counts.split_mpa import split_mpa
+from krakenparser.stats.relabund import calculate_rel_abund
+from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div
+
+
+SAMPLE_COMBINED_MPA = (
+    "#Classification\tsample1\tsample2\n"
+    "d__Bacteria|p__Pseudomonadota|g__Pseudomonas|s__Pseudomonas_aeruginosa\t300\t100\n"
+    "d__Bacteria|p__Pseudomonadota|g__Pseudomonas\t500\t200\n"
+    "d__Bacteria|p__Pseudomonadota|g__Homo|s__Homo_sapiens\t50\t30\n"
+    "d__Bacteria|p__Pseudomonadota|g__Homo\t60\t35\n"
+    "d__Bacteria|p__Pseudomonadota\t850\t350\n"
+    "d__Bacteria|p__Bacteroidota\t100\t80\n"
+    "d__Viruses|p__Uroviricota|s__Virus_alpha\t10\t5\n"
+)
+
+
+@pytest.fixture
+def combined_mpa_file(tmp_path):
+    f = tmp_path / "COMBINED.txt"
+    f.write_text(SAMPLE_COMBINED_MPA)
+    return f
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _sha256(path) -> str:
+    return hashlib.sha256(path.read_bytes()).hexdigest()
+
+
+# ---------------------------------------------------------------------------
+# kreport_to_mpa
+# ---------------------------------------------------------------------------
+
+def test_kreport_to_mpa_reproducible(kreport_file, tmp_path):
+    counter = itertools.count()
+
+    def run():
+        out = tmp_path / f"out_{next(counter)}.MPA.TXT"
+        kreport_to_mpa(str(kreport_file), str(out), display_header=True)
+        return _sha256(out)
+
+    assert run() == run()
+
+
+def test_kreport_to_mpa_standard_ranks_only(kreport_file, tmp_path):
+    out = tmp_path / "out.MPA.TXT"
+    kreport_to_mpa(str(kreport_file), str(out))
+    lines = out.read_text().splitlines()
+    paths = [ln.split("\t")[0] for ln in lines]
+
+    assert any("p__Pseudomonadota" in p for p in paths)
+    assert any("s__Pseudomonas_aeruginosa" in p for p in paths)
+    assert any(p.startswith("d__Bacteria|") for p in paths)
+
+
+def test_kreport_to_mpa_excludes_unclassified_and_root(kreport_file, tmp_path):
+    out = tmp_path / "out.MPA.TXT"
+    kreport_to_mpa(str(kreport_file), str(out))
+    content = out.read_text()
+    assert "unclassified" not in content
+    assert "root" not in content
+
+
+def test_kreport_to_mpa_display_header(kreport_file, tmp_path):
+    out = tmp_path / "out.MPA.TXT"
+    kreport_to_mpa(str(kreport_file), str(out), display_header=True)
+    first_line = out.read_text().splitlines()[0]
+    assert first_line.startswith("#Classification")
+    assert kreport_file.name in first_line
+
+
+def test_kreport_to_mpa_paths_are_hierarchical(kreport_file, tmp_path):
+    out = tmp_path / "out.MPA.TXT"
+    kreport_to_mpa(str(kreport_file), str(out))
+    lines = out.read_text().splitlines()
+    for ln in lines:
+        path = ln.split("\t")[0]
+        segments = path.split("|")
+        # Each segment must have a known two-letter prefix
+        for seg in segments:
+            assert "__" in seg, f"Unexpected MPA segment format: {seg!r}"
+
+
+# ---------------------------------------------------------------------------
+# convert_to_csv
+# ---------------------------------------------------------------------------
+
+def test_convert_to_csv_reproducible(counts_txt_file, tmp_path):
+    counter = itertools.count()
+
+    def run():
+        out = tmp_path / f"out_{next(counter)}.csv"
+        convert_to_csv(str(counts_txt_file), str(out))
+        return _sha256(out)
+
+    assert run() == run()
+
+
+def test_convert_to_csv_transposes_correctly(counts_txt_file, tmp_path):
+    out = tmp_path / "counts.csv"
+    convert_to_csv(str(counts_txt_file), str(out))
+    df = pd.read_csv(out)
+
+    assert "Sample_id" in df.columns
+    assert set(df["Sample_id"]) == {"sample1", "sample2"}
+    assert "Pseudomonas aeruginosa" in df.columns
+
+
+# ---------------------------------------------------------------------------
+# process_files
+# ---------------------------------------------------------------------------
+
+def test_process_files_adds_header_and_cleans_names(tmp_path):
+    source = tmp_path / "COMBINED.txt"
+    source.write_text(
+        "#Classification\tsample1.kreport\tsample2.kreport\n"
+        "d__Bacteria|p__X|s__Pseudomonas_aeruginosa\t300\t100\n"
+    )
+    dest = tmp_path / "counts_species.txt"
+    dest.write_text(
+        "s__Pseudomonas_aeruginosa\t300\t100\n"
+        "s__Escherichia_coli\t200\t50\n"
+    )
+    process_files(str(source), str(dest))
+    result = dest.read_text()
+    lines = result.splitlines()
+    assert lines[0] == "#Classification\tsample1\tsample2"
+    assert "Pseudomonas aeruginosa" in result
+    assert "Escherichia coli" in result
+    assert "s__" not in result
+
+
+def test_process_files_reproducible(tmp_path):
+    source = tmp_path / "COMBINED.txt"
+    source.write_text("#Classification\tsample1.kreport\n")
+    for i in range(2):
+        dest = tmp_path / f"counts_{i}.txt"
+        dest.write_text("s__Some_species\t10\n")
+        process_files(str(source), str(dest))
+    assert (tmp_path / "counts_0.txt").read_text() == (tmp_path / "counts_1.txt").read_text()
+
+
+def test_process_files_missing_source_raises(tmp_path):
+    dest = tmp_path / "counts.txt"
+    dest.write_text("s__X\t10\n")
+    with pytest.raises(FileNotFoundError):
+        process_files(str(tmp_path / "ghost.txt"), str(dest))
+
+
+def test_process_files_missing_dest_raises(tmp_path):
+    source = tmp_path / "COMBINED.txt"
+    source.write_text("#Classification\tsample1.kreport\n")
+    with pytest.raises(FileNotFoundError):
+        process_files(str(source), str(tmp_path / "ghost.txt"))
+
+
+def test_convert_to_csv_missing_input_raises(tmp_path):
+    with pytest.raises(FileNotFoundError):
+        convert_to_csv(str(tmp_path / "ghost.txt"), str(tmp_path / "out.csv"))
+
+
+# ---------------------------------------------------------------------------
+# calculate_rel_abund
+# ---------------------------------------------------------------------------
+
+def test_relabund_reproducible(counts_csv_file, tmp_path):
+    counter = itertools.count()
+
+    def run():
+        out = tmp_path / f"ra_{next(counter)}.csv"
+        calculate_rel_abund(str(counts_csv_file), str(out))
+        return _sha256(out)
+
+    assert run() == run()
+
+
+def test_relabund_sums_to_100_per_sample(counts_csv_file, tmp_path):
+    out = tmp_path / "ra.csv"
+    calculate_rel_abund(str(counts_csv_file), str(out))
+    df = pd.read_csv(out)
+    for sample, grp in df.groupby("Sample_id"):
+        total = grp["rel_abund_perc"].sum()
+        assert total == pytest.approx(100.0, abs=1e-6), f"{sample}: sum={total}"
+
+
+def test_relabund_other_threshold_creates_other_group(counts_csv_file, tmp_path):
+    out = tmp_path / "ra.csv"
+    calculate_rel_abund(str(counts_csv_file), str(out), other_threshold=99.0)
+    df = pd.read_csv(out)
+    assert df["taxon"].str.startswith("Other").any()
+
+
+def test_relabund_no_zero_rows(counts_csv_file, tmp_path):
+    out = tmp_path / "ra.csv"
+    calculate_rel_abund(str(counts_csv_file), str(out))
+    df = pd.read_csv(out)
+    assert (df["rel_abund_perc"] > 0).all()
+
+
+def test_relabund_missing_input_raises(tmp_path):
+    with pytest.raises(FileNotFoundError):
+        calculate_rel_abund(str(tmp_path / "ghost.csv"), str(tmp_path / "out.csv"))
+
+
+# ---------------------------------------------------------------------------
+# calc_alpha_div
+# ---------------------------------------------------------------------------
+
+def test_alpha_div_reproducible(counts_csv_file, tmp_path):
+    df = pd.read_csv(counts_csv_file, index_col=0)
+    counter = itertools.count()
+
+    def run():
+        out_dir = tmp_path / f"div_{next(counter)}"
+        out_dir.mkdir()
+        calc_alpha_div(df, out_dir)
+        return _sha256(out_dir / "alpha_div.csv")
+
+    assert run() == run()
+
+
+def test_alpha_div_output_columns(counts_csv_file, tmp_path):
+    df = pd.read_csv(counts_csv_file, index_col=0)
+    out_dir = tmp_path / "diversity"
+    out_dir.mkdir()
+    calc_alpha_div(df, out_dir)
+    result = pd.read_csv(out_dir / "alpha_div.csv")
+    assert set(result.columns) == {"Sample", "Shannon", "Pielou", "Chao1"}
+    assert len(result) == len(df)
+
+
+def test_alpha_div_shannon_non_negative(counts_csv_file, tmp_path):
+    df = pd.read_csv(counts_csv_file, index_col=0)
+    out_dir = tmp_path / "diversity"
+    out_dir.mkdir()
+    calc_alpha_div(df, out_dir)
+    result = pd.read_csv(out_dir / "alpha_div.csv")
+    assert (result["Shannon"] >= 0).all()
+
+
+# ---------------------------------------------------------------------------
+# calc_beta_div
+# ---------------------------------------------------------------------------
+
+def test_beta_div_output_files_exist(counts_csv_file, tmp_path):
+    df = pd.read_csv(counts_csv_file, index_col=0)
+    out_dir = tmp_path / "diversity"
+    out_dir.mkdir()
+    calc_beta_div(df, out_dir, rarefaction_depth=1000)
+    assert (out_dir / "beta_div_bray.csv").exists()
+    assert (out_dir / "beta_div_jaccard.csv").exists()
+
+
+def test_beta_div_matrix_is_square(counts_csv_file, tmp_path):
+    df = pd.read_csv(counts_csv_file, index_col=0)
+    out_dir = tmp_path / "diversity"
+    out_dir.mkdir()
+    calc_beta_div(df, out_dir, rarefaction_depth=1000)
+    bray = pd.read_csv(out_dir / "beta_div_bray.csv", index_col=0)
+    assert bray.shape[0] == bray.shape[1]
+
+
+def test_beta_div_diagonal_is_zero(counts_csv_file, tmp_path):
+    df = pd.read_csv(counts_csv_file, index_col=0)
+    out_dir = tmp_path / "diversity"
+    out_dir.mkdir()
+    calc_beta_div(df, out_dir, rarefaction_depth=1000)
+    bray = pd.read_csv(out_dir / "beta_div_bray.csv", index_col=0)
+    import numpy as np
+    assert np.allclose(np.diag(bray.values), 0.0)
+
+
+def test_beta_div_too_few_samples_raises(tmp_path):
+    df = pd.DataFrame(
+        {"Taxon_A": [100], "Taxon_B": [200]}, index=["S1"]
+    )
+    out_dir = tmp_path / "diversity"
+    out_dir.mkdir()
+    with pytest.raises(ValueError, match="rarefaction"):
+        calc_beta_div(df, out_dir, rarefaction_depth=1000)
+
+
+# ---------------------------------------------------------------------------
+# split_mpa
+# ---------------------------------------------------------------------------
+
+def test_split_mpa_creates_all_rank_files(combined_mpa_file, tmp_path):
+    split_mpa(str(combined_mpa_file), str(tmp_path))
+    for rank in ("species", "genus", "family", "order", "class", "phylum"):
+        assert (tmp_path / "txt" / f"counts_{rank}.txt").exists()
+
+
+def test_split_mpa_reproducible(combined_mpa_file, tmp_path):
+    counter = itertools.count()
+
+    def run():
+        out = tmp_path / f"out_{next(counter)}"
+        split_mpa(str(combined_mpa_file), str(out))
+        return _sha256(out / "txt" / "counts_species.txt")
+
+    assert run() == run()
+
+
+def test_split_mpa_filters_human_by_default(combined_mpa_file, tmp_path):
+    split_mpa(str(combined_mpa_file), str(tmp_path))
+    species = (tmp_path / "txt" / "counts_species.txt").read_text()
+    assert "Homo_sapiens" not in species
+    genus = (tmp_path / "txt" / "counts_genus.txt").read_text()
+    assert "g__Homo" not in genus
+
+
+def test_split_mpa_keep_human_retains_homo(combined_mpa_file, tmp_path):
+    split_mpa(str(combined_mpa_file), str(tmp_path), keep_human=True)
+    species = (tmp_path / "txt" / "counts_species.txt").read_text()
+    assert "Homo_sapiens" in species
+
+
+def test_split_mpa_viruses_only(combined_mpa_file, tmp_path):
+    split_mpa(str(combined_mpa_file), str(tmp_path), viruses_only=True)
+    species = (tmp_path / "txt" / "counts_species.txt").read_text()
+    assert "Virus_alpha" in species
+    assert "Pseudomonas_aeruginosa" not in species
+
+
+def test_split_mpa_strips_path_prefix(combined_mpa_file, tmp_path):
+    split_mpa(str(combined_mpa_file), str(tmp_path))
+    species = (tmp_path / "txt" / "counts_species.txt").read_text()
+    # No MPA path separators should remain
+    assert "|" not in species
+    # The rank prefix should remain (processing_script strips it later)
+    assert "s__" in species
+
+
+def test_split_mpa_genus_excludes_species_lines(combined_mpa_file, tmp_path):
+    split_mpa(str(combined_mpa_file), str(tmp_path))
+    genus = (tmp_path / "txt" / "counts_genus.txt").read_text()
+    assert "s__" not in genus
+
+
+def test_split_mpa_missing_input_raises(tmp_path):
+    with pytest.raises(FileNotFoundError):
+        split_mpa(str(tmp_path / "ghost.txt"), str(tmp_path / "out"))
diff --git a/tests/test_units.py b/tests/test_units.py
new file mode 100644
index 0000000..57618de
--- /dev/null
+++ b/tests/test_units.py
@@ -0,0 +1,146 @@
+"""Pure-function unit tests — no I/O, fully deterministic."""
+
+import math
+
+import numpy as np
+import pytest
+
+from krakenparser.mpa.transform2mpa import _parse_line
+from krakenparser.stats.diversity import chao1_index, pielou_evenness, shannon_index
+from krakenparser.counts.processing_script import modify_taxa_names
+
+
+# ---------------------------------------------------------------------------
+# _parse_line
+# ---------------------------------------------------------------------------
+
+def test_parse_line_standard_rank():
+    line = "50.00\t500000\t100000\tP\t1224\t    Pseudomonadota\n"
+    name, depth, rank, cum_reads, pct = _parse_line(line)
+    assert name == "Pseudomonadota"
+    assert depth == 2       # 4 leading spaces // 2
+    assert rank == "P"
+    assert cum_reads == 500000
+    assert pct == 50.0
+
+
+def test_parse_line_root_no_indent():
+    line = "99.98\t999980\t0\tR\t1\troot\n"
+    name, depth, rank, cum_reads, pct = _parse_line(line)
+    assert name == "root"
+    assert depth == 0
+    assert rank == "R"
+
+
+def test_parse_line_intermediate_rank():
+    line = "5.00\t50000\t0\tS1\t12345\t          Some subspecies\n"
+    name, depth, rank, _, _ = _parse_line(line)
+    assert name == "Some subspecies"
+    assert rank == "S1"
+    assert depth == 5       # 10 spaces // 2
+
+
+def test_parse_line_too_few_columns():
+    assert _parse_line("50.00\t500000\n") is None
+
+
+def test_parse_line_non_numeric_pct():
+    assert _parse_line("not_a_float\t500000\t0\tP\t1224\tBacteria\n") is None
+
+
+def test_parse_line_non_numeric_reads():
+    assert _parse_line("50.00\tnot_int\t0\tP\t1224\tBacteria\n") is None
+
+
+# ---------------------------------------------------------------------------
+# shannon_index
+# ---------------------------------------------------------------------------
+
+def test_shannon_uniform_four_species():
+    assert abs(shannon_index([1, 1, 1, 1]) - math.log(4)) < 1e-10
+
+
+def test_shannon_single_species_is_zero():
+    assert shannon_index([100]) == pytest.approx(0.0)
+
+
+def test_shannon_ignores_zero_counts():
+    assert abs(shannon_index([1, 1, 1, 1, 0, 0]) - math.log(4)) < 1e-10
+
+
+def test_shannon_two_equal_species():
+    assert abs(shannon_index([50, 50]) - math.log(2)) < 1e-10
+
+
+# ---------------------------------------------------------------------------
+# pielou_evenness
+# ---------------------------------------------------------------------------
+
+def test_pielou_single_species_returns_nan():
+    assert math.isnan(pielou_evenness([100]))
+
+
+def test_pielou_all_zeros_returns_nan():
+    assert math.isnan(pielou_evenness([0, 0, 0]))
+
+
+def test_pielou_uniform_equals_one():
+    assert pielou_evenness([1, 1, 1, 1]) == pytest.approx(1.0)
+
+
+def test_pielou_range_zero_to_one():
+    result = pielou_evenness([10, 2, 1, 1])
+    assert 0.0 < result < 1.0
+
+
+# ---------------------------------------------------------------------------
+# chao1_index
+# ---------------------------------------------------------------------------
+
+def test_chao1_f2_zero_uses_f1_formula():
+    # F1=3, F2=0 → S_obs + F1*(F1-1)/2
+    counts = [1, 1, 1, 5, 10]  # F1=3, F2=0, S_obs=5
+    expected = 5 + 3 * (3 - 1) / 2  # 5 + 3 = 8
+    assert chao1_index(counts) == pytest.approx(expected)
+
+
+def test_chao1_normal():
+    # F1=2, F2=2, S_obs=5 → S_obs + F1²/(2*F2)
+    counts = [1, 1, 2, 2, 5]
+    expected = 5 + (2 * 2) / (2 * 2)  # 5 + 1 = 6
+    assert chao1_index(counts) == pytest.approx(expected)
+
+
+def test_chao1_no_singletons():
+    counts = [5, 10, 15, 20]
+    # F1=0, F2=0 → uses F2==0 branch: S_obs + 0
+    assert chao1_index(counts) == pytest.approx(4.0)
+
+
+# ---------------------------------------------------------------------------
+# modify_taxa_names
+# ---------------------------------------------------------------------------
+
+def test_modify_taxa_names_strips_prefix_and_replaces_underscores():
+    assert modify_taxa_names("s__Homo_sapiens\t100\t200") == "Homo sapiens\t100\t200"
+
+
+def test_modify_taxa_names_genus():
+    assert modify_taxa_names("g__Escherichia_coli\t50") == "Escherichia coli\t50"
+
+
+def test_modify_taxa_names_all_prefixes():
+    for prefix in ["s__", "g__", "f__", "o__", "c__", "p__"]:
+        result = modify_taxa_names(f"{prefix}Some_name\t10")
+        assert result == "Some name\t10"
+
+
+def test_modify_taxa_names_no_prefix_unchanged():
+    line = "unclassified_reads\t100"
+    assert modify_taxa_names(line) == line
+
+
+def test_modify_taxa_names_count_fields_not_modified():
+    # Underscores in tab-separated count fields must be preserved
+    result = modify_taxa_names("s__My_taxon\t1_000\t2_000")
+    assert result == "My taxon\t1_000\t2_000"

From 00efa56aacc66f0610ea7ebc27a575f3c4fe1165 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Wed, 6 May 2026 14:22:00 +0300
Subject: [PATCH 11/18] test: add kpplot tests and raise coverage to 83%

Add smoke tests for stacked_barplot, streamgraph, clustermap.
Switch test_full_pipeline to direct run_pipeline() call so pipeline.py
is included in coverage. Add overwrite protection tests.
---
 tests/test_full_pipeline.py |  43 +++++++++++----
 tests/test_kpplot.py        | 105 ++++++++++++++++++++++++++++++++++++
 2 files changed, 139 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_kpplot.py

diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py
index 360c781..599f625 100644
--- a/tests/test_full_pipeline.py
+++ b/tests/test_full_pipeline.py
@@ -1,9 +1,10 @@
 import zipfile
 import shutil
-import subprocess
 import pytest
 from pathlib import Path
 
+from krakenparser.pipeline import run_pipeline
+
 
 @pytest.fixture
 def demo_run(tmp_path):
@@ -29,21 +30,45 @@ def test_full_pipeline_end_to_end(demo_run):
     run_dir = demo_run["run_dir"]
     kreports_path = run_dir / "kreports"
 
-    # 1. Run KrakenParser
-    cmd = ["KrakenParser", "--complete", "-i", str(kreports_path)]
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    assert result.returncode == 0, f"Pipeline failed:\n{result.stderr}"
+    run_pipeline(str(kreports_path))
 
-    # 2. Assert each rank‐level CSV exists and is non‐empty
+    # Assert each rank-level CSV exists and is non-empty
     ranks = ["phylum", "class", "order", "family", "genus", "species"]
     for rank in ranks:
-        csv_path = run_dir / "counts" / "csv" / f"counts_{rank}.csv"
+        csv_path = run_dir / "counts" / f"counts_{rank}.csv"
         assert csv_path.exists(), f"Missing counts_{rank}.csv"
         assert csv_path.stat().st_size > 0, f"counts_{rank}.csv is empty"
 
-    # 3. Assert relative‐abundance CSVs exist and are non‐empty
+    # Assert relative-abundance CSVs exist and are non-empty
     rel_dir = run_dir / "rel_abund"
     assert rel_dir.exists(), "rel_abund directory is missing"
     rel_species = rel_dir / "ra_species.csv"
-    assert rel_species.exists(), "Missing ra_species.csv in csv_relabund"
+    assert rel_species.exists(), "Missing ra_species.csv"
     assert rel_species.stat().st_size > 0, "ra_species.csv is empty"
+
+    # Assert diversity outputs exist
+    diversity_dir = run_dir / "diversity"
+    assert (diversity_dir / "alpha_div.csv").exists()
+
+    # Assert intermediate files exist
+    assert (run_dir / "intermediate" / "COMBINED.txt").exists()
+
+
+def test_pipeline_overwrite_protection(demo_run):
+    run_dir = demo_run["run_dir"]
+    kreports_path = run_dir / "kreports"
+
+    run_pipeline(str(kreports_path))
+
+    # Second run without --overwrite must exit
+    with pytest.raises(SystemExit):
+        run_pipeline(str(kreports_path))
+
+
+def test_pipeline_overwrite_flag(demo_run):
+    run_dir = demo_run["run_dir"]
+    kreports_path = run_dir / "kreports"
+
+    run_pipeline(str(kreports_path))
+    # Second run with overwrite=True must succeed
+    run_pipeline(str(kreports_path), overwrite=True)
diff --git a/tests/test_kpplot.py b/tests/test_kpplot.py
new file mode 100644
index 0000000..51cb5e1
--- /dev/null
+++ b/tests/test_kpplot.py
@@ -0,0 +1,105 @@
+"""kpplot smoke tests and parameter-validation tests."""
+
+import pytest
+
+from krakenparser.kpplot.stackedbar import stacked_barplot
+from krakenparser.kpplot.streamgraph import streamgraph
+from krakenparser.kpplot.clustermap import clustermap
+from krakenparser.kpplot.base import KpPlotBase, aggregate_by_metadata
+
+
+# ---------------------------------------------------------------------------
+# Smoke tests — verify each plot function returns without error
+# ---------------------------------------------------------------------------
+
+def test_stackedbar_returns_kpplotbase(relabund_df):
+    result = stacked_barplot(relabund_df)
+    assert isinstance(result, KpPlotBase)
+
+
+def test_streamgraph_returns_kpplotbase(relabund_df):
+    result = streamgraph(relabund_df)
+    assert isinstance(result, KpPlotBase)
+
+
+def test_clustermap_returns_kpplotbase(relabund_df):
+    result = clustermap(relabund_df)
+    assert isinstance(result, KpPlotBase)
+
+
+# ---------------------------------------------------------------------------
+# sample_order validation
+# ---------------------------------------------------------------------------
+
+def test_stackedbar_sample_order_missing_raises(relabund_df):
+    with pytest.raises(ValueError, match="Samples missing"):
+        stacked_barplot(relabund_df, sample_order=["S1", "S2", "GHOST"])
+
+
+def test_streamgraph_sample_order_missing_raises(relabund_df):
+    with pytest.raises(ValueError, match="Samples missing"):
+        streamgraph(relabund_df, sample_order=["S1", "GHOST"])
+
+
+def test_clustermap_sample_order_missing_raises(relabund_df):
+    with pytest.raises(ValueError, match="Samples missing"):
+        clustermap(relabund_df, sample_order=["S1", "GHOST"])
+
+
+# ---------------------------------------------------------------------------
+# cmap validation (stackedbar / streamgraph)
+# ---------------------------------------------------------------------------
+
+def test_stackedbar_cmap_too_short_raises(relabund_df):
+    with pytest.raises(ValueError, match="cmap"):
+        stacked_barplot(relabund_df, cmap=["red"])
+
+
+def test_streamgraph_cmap_too_short_raises(relabund_df):
+    with pytest.raises(ValueError, match="cmap"):
+        streamgraph(relabund_df, cmap=["red"])
+
+
+def test_stackedbar_cmap_invalid_type_raises(relabund_df):
+    with pytest.raises(ValueError, match="cmap"):
+        stacked_barplot(relabund_df, cmap=123)
+
+
+def test_streamgraph_cmap_invalid_type_raises(relabund_df):
+    with pytest.raises(ValueError, match="cmap"):
+        streamgraph(relabund_df, cmap=123)
+
+
+# ---------------------------------------------------------------------------
+# aggregate_by_metadata
+# ---------------------------------------------------------------------------
+
+def test_aggregate_by_metadata_basic(relabund_df):
+    import pandas as pd
+
+    metadata = pd.DataFrame({
+        "Sample_id": ["S1", "S2"],
+        "Group": ["A", "A"],
+    })
+    result = aggregate_by_metadata(relabund_df, metadata, "Group")
+    assert "Sample_id" in result.columns
+    assert set(result["Sample_id"]) == {"A"}
+    # Relative abundance should still sum to 100 per group
+    total = result["rel_abund_perc"].sum()
+    assert total == pytest.approx(100.0, abs=1e-6)
+
+
+def test_aggregate_by_metadata_missing_sample_id_column_raises(relabund_df):
+    import pandas as pd
+
+    bad_meta = pd.DataFrame({"Group": ["A", "B"], "X": [1, 2]})
+    with pytest.raises(ValueError, match="Sample_id"):
+        aggregate_by_metadata(relabund_df, bad_meta, "Group")
+
+
+def test_aggregate_by_metadata_missing_group_column_raises(relabund_df):
+    import pandas as pd
+
+    meta = pd.DataFrame({"Sample_id": ["S1", "S2"]})
+    with pytest.raises(ValueError, match="Group"):
+        aggregate_by_metadata(relabund_df, meta, "Group")

From 0aa7b8d74a65d345ac0a6df77cf2308d1fb108c0 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Tue, 12 May 2026 08:31:00 +0300
Subject: [PATCH 12/18] docs: update README for v1.0.0

Add codecov badge. Document new -o/--output and --overwrite flags.
Add Before Visualization section highlighting --relabund -O.
Move step-by-step modules to collapsible <details> block.
Update Example Output Structure with intermediate/ directory.
---
 README.md | 172 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 94 insertions(+), 78 deletions(-)

diff --git a/README.md b/README.md
index 8a848c7..0349c7e 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,8 @@
 
 ![License](https://img.shields.io/badge/License-MIT-steelblue)
 [![Downloads](https://static.pepy.tech/badge/krakenparser)](https://pepy.tech/project/krakenparser)
-[![CI](https://github.com/PopovIILab/KrakenParser/actions/workflows/ci.yml/badge.svg)](https://github.com/PopovIILab/KrakenParser/actions/workflows/ci.yml)
+[![CI](https://github.com/PopovIILab/KrakenParser/actions/workflows/python-package.yml/badge.svg)](https://github.com/PopovIILab/KrakenParser/actions/workflows/python-package.yml)
+[![codecov](https://codecov.io/gh/PopovIILab/KrakenParser/graph/badge.svg)](https://codecov.io/gh/PopovIILab/KrakenParser)
 
 <img src="https://github.com/PopovIILab/KrakenParser/blob/main/imgs/KrakenParser_logo_light.png#gh-light-mode-only" align="left"/>
 <img src="https://github.com/PopovIILab/KrakenParser/blob/main/imgs/KrakenParser_logo_dark.png#gh-dark-mode-only" align="left"/>
@@ -108,9 +109,10 @@ X9,0.7232472324723247,0.7352941176470589,...,0.8066914498141264,0.0
 ## Quick Start (Full Pipeline)
 To run the full pipeline, use the following command:
 ```bash
-KrakenParser --complete -i data/kreports
+KrakenParser --complete -i data/kreports -o results/
 #Having troubles? Run KrakenParser --complete -h
 ```
+
 This will:
 1. Convert Kraken2 reports to MPA format
 2. Combine MPA files into a single file
@@ -120,162 +122,176 @@ This will:
 6. Calculate relative abundance
 7. Calculate α & β-diversities
 
-### **Input Requirements**
-- The Kraken2 reports must be inside a **subdirectory** (e.g., `data/kreports`).
-- The script automatically creates output directories and processes the data.
-
 ## Installation
 
 ```
 pip install krakenparser
 ```
 
-## Using Individual Modules
-You can also run each step manually if needed.
+## Before Visualization: Grouping Low-Abundance Taxa
+
+The full pipeline automatically calculates relative abundance. Before passing data to visualization, it is strongly recommended to re-run `--relabund` with the `-O` flag — this collapses all taxa below the chosen threshold into a single **"Other"** group, producing much cleaner and more readable plots.
+
+```bash
+KrakenParser --relabund -i data/counts/counts_species.csv -o data/rel_abund/ra_species.csv -O 4
+```
+
+This groups every taxon with relative abundance **< 4 %** into `Other (<4.0%)`. Adjust the threshold to your data.
+
+> **Note:** The pipeline-generated `rel_abund/ra_*.csv` files (no `-O`) preserve the full unfiltered data — use them for statistical analysis. Use the `-O` variant specifically for visualization.
+
+---
+
+<details>
+<summary><b>Using Individual Modules (Advanced)</b></summary>
+<br>
+
+Each step of the pipeline can also be run individually. This is useful for re-running a single step, debugging, or integrating KrakenParser into a custom workflow.
 
 ### **Step 1: Convert Kraken2 Reports to MPA Format**
 ```bash
-KrakenParser --kreport2mpa -i data/kreports -o data/mpa
+# Batch mode (directory)
+KrakenParser --kreport2mpa -i data/kreports -o data/intermediate/mpa
+# Single file
+KrakenParser --kreport2mpa -r data/kreports/sample.kreport -o data/intermediate/mpa/sample.MPA.TXT
 #Having troubles? Run KrakenParser --kreport2mpa -h
 ```
-This script converts Kraken2 `.kreport` files into **MPA format** using KrakenTools.
+Converts Kraken2 `.kreport` files into **MPA format**.
 
 ### **Step 2: Combine MPA Files**
 ```bash
-KrakenParser --combine_mpa -i data/mpa/* -o data/COMBINED.txt
+KrakenParser --combine_mpa -i data/intermediate/mpa/* -o data/intermediate/COMBINED.txt
 #Having troubles? Run KrakenParser --combine_mpa -h
 ```
-This merges multiple MPA files into a single combined file.
+Merges multiple MPA files into a single combined table.
 
 ### **Step 3: Extract Taxonomic Levels**
 ```bash
-KrakenParser --deconstruct -i data/COMBINED.txt -o data/counts
+KrakenParser --deconstruct -i data/intermediate/COMBINED.txt -o data/intermediate
 #Having troubles? Run KrakenParser --deconstruct -h
 ```
 
-If user wants to inspect **Viruses** domain separately:
+By default, human-related taxa (Homo sapiens, Hominidae, Primates, Mammalia, Chordata) are removed. To keep them:
 ```bash
-KrakenParser --deconstruct_viruses -i data/COMBINED.txt -o data/counts_viruses
-#Having troubles? Run KrakenParser --deconstruct_viruses -h
+KrakenParser --deconstruct -i data/intermediate/COMBINED.txt -o data/intermediate --keep-human
 ```
 
-This step extracts only species-level data (excluding human reads).
+To inspect the **Viruses** domain separately:
+```bash
+KrakenParser --deconstruct_viruses -i data/intermediate/COMBINED.txt -o data/counts_viruses
+#Having troubles? Run KrakenParser --deconstruct_viruses -h
+```
 
 ### **Step 4: Process Extracted Taxonomic Data**
 ```bash
-KrakenParser --process -i data/COMBINED.txt -o data/counts/txt/counts_phylum.txt
+KrakenParser --process -i data/intermediate/COMBINED.txt -o data/intermediate/txt/counts_phylum.txt
 #Having troubles? Run KrakenParser --process -h
 ```
 
-Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --process` to a loop!
+Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --process` in a loop.
 
-This script cleans up taxonomic names (removes prefixes, replaces underscores with spaces).
+Cleans up taxonomic names: removes prefixes (`s__`, `g__`, etc.) and replaces underscores with spaces.
 
 ### **Step 5: Convert TXT to CSV**
 ```bash
-KrakenParser --txt2csv -i data/counts/txt/counts_phylum.txt -o data/counts/csv/counts_phylum.csv
+KrakenParser --txt2csv -i data/intermediate/txt/counts_phylum.txt -o data/counts/counts_phylum.csv
 #Having troubles? Run KrakenParser --txt2csv -h
 ```
-Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --txt2csv` to a loop!
+Repeat on other 5 taxonomical levels or wrap in a loop. Transposes data so that sample names become rows.
 
-This converts the processed text files into structured CSV format.
-
-### **Step 6: Calculate relative abundance**
+### **Step 6: Calculate Relative Abundance**
 ```bash
-KrakenParser --relabund -i data/counts/csv/counts_phylum.csv -o data/counts/csv_relabund/counts_phylum.csv
+KrakenParser --relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv
 #Having troubles? Run KrakenParser --relabund -h
 ```
-Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --relabund` to a loop!
-
-This calculates relative abundance and saves as CSV format.
+Repeat on other 5 taxonomical levels or wrap in a loop.
 
-If user wants to group low abundant taxa in "Other" group:
+With "Other" grouping:
 ```bash
-KrakenParser --relabund -i data/counts/csv/counts_phylum.csv -o data/counts/csv_relabund/counts_phylum.csv --other 3.5
-#Having troubles? Run KrakenParser --relabund -h
+KrakenParser --relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv -O 3.5
 ```
+Groups all taxa with abundance < 3.5 % into `Other (<3.5%)`.
 
-This will group all the taxa that have abundance <3.5 into "Other <3.5%" group. Other parameters are welcome!
-
-### **Step 7: Calculate α & β-diversities**
+### **Step 7: Calculate α & β-Diversities**
 ```bash
-KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity
+KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity
 #Having troubles? Run KrakenParser --diversity -h
 ```
 
-This calculates α & β-diversities and saves them as CSV format to directory provided in the output.
-
-If user wants to use another depth for β-diversity calculations:
+With a custom rarefaction depth for β-diversity:
 ```bash
-KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity --depth 750
-#Having troubles? Run KrakenParser --diversity -h
+KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity --depth 750
 ```
 
-Other parameters are welcome!
+---
 
 ## Arguments Breakdown
-### **KrakenParser** (Main Pipeline)
-- Automates the entire workflow.
-- Takes **one argument**: the path to Kraken2 reports (`data/kreports`).
-- Runs all the scripts in sequence.
+
+### **--complete** (Full Pipeline)
+- Requires `-i`: path to the Kraken2 reports directory (e.g., `data/kreports`).
+- Optional `-o`: output directory (default: parent of `-i`).
+- Optional `--keep-human`: retain human-related taxa (default: filtered out).
 
 ### **--kreport2mpa** (Step 1)
-- Converts Kraken2 reports to MPA format.
-- Uses `KrakenTools/kreport2mpa.py`.
+- Batch mode: `-i DIR -o DIR` — converts all files in a directory.
+- Single-file mode: `-r FILE -o FILE`.
 
 ### **--combine_mpa** (Step 2)
-- Combines multiple MPA files into one.
-- Uses `KrakenTools/combine_mpa.py`.
+- `-i FILE [FILE ...]`: one or more MPA files.
+- `-o FILE`: output merged table.
 
 ### **--deconstruct** & **--deconstruct_viruses** (Step 3)
 - Extracts **phylum, class, order, family, genus, species** into separate text files.
-- Removes human-related reads (**--deconstruct** only).
+- `--deconstruct` removes human-related reads by default; use `--keep-human` to retain them.
+- `--deconstruct_viruses` extracts only the Viruses domain.
 
 ### **--process** (Step 4)
-- Cleans and formats extracted taxonomic data.
 - Removes prefixes (`s__`, `g__`, etc.), replaces underscores with spaces.
+- `-i`: COMBINED.txt (source for sample-name header); `-o`: target txt file.
 
 ### **--txt2csv** (Step 5)
-- Converts cleaned text files to CSV.
-- Transposes data so that sample names become rows.
+- Transposes a processed txt file into a CSV with sample names as rows.
 
 ### **--relabund** (Step 6)
-- Calculates relative abundance based on total abundance CSV.
-- Optionally can group low abundant taxa.
+- Calculates relative abundance from a total-counts CSV.
+- `-O FLOAT`: group taxa below FLOAT % into `Other (<FLOAT%)`.
 
 ### **--diversity** (Step 7)
-- Calculates α & β-diversities based on total species abundance CSV.
-- Shannon, Pielou & Chao1 indices for α-diversity
-- Bray-Curtis & Jaccard indices for β-diversity
-- Uses 1000 depth for β-diversity as default (can be adjusted with -d)
+- Shannon, Pielou & Chao1 for α-diversity.
+- Bray-Curtis & Jaccard for β-diversity.
+- `-d INT`: rarefaction depth for β-diversity (default: 1000).
+
+</details>
 
 ## Example Output Structure
 After running the full pipeline, the output directory will look like this:
 ```
 data/
 ├─ kreports/               # Input Kraken2 reports
-├─ mpa/                    # Converted MPA files
-├─ COMBINED.txt            # Merged MPA file
-├─ counts/
-│  ├─ txt/                 # Extracted taxonomic levels in TXT
-│  │  ├─ counts_species.txt
-│  │  ├─ counts_genus.txt
-│  │  ├─ counts_family.txt
-│  │  ├─ ...
-│  └─ csv/                 # Total abundance CSV output
-│     ├─ counts_species.csv
-│     ├─ counts_genus.csv
-│     ├─ counts_family.csv
-│     ├─ ...
+├─ counts/                 # Total abundance CSV output
+│  ├─ counts_species.csv
+│  ├─ counts_genus.csv
+│  ├─ ...
+│  └─ counts_phylum.csv
 ├─ rel_abund/              # Relative abundance CSV output
 │  ├─ ra_species.csv
 │  ├─ ra_genus.csv
-│  ├─ ra_family.csv
 │  ├─ ...
-└─ diversity/
-   ├─ alpha_div.csv
-   ├─ beta_div_bray.csv
-   └─ beta_div_jaccard.csv
+│  └─ ra_phylum.csv
+├─ diversity/              # Diversity metrics
+│  ├─ alpha_div.csv
+│  ├─ beta_div_bray.csv
+│  └─ beta_div_jaccard.csv
+└─ intermediate/           # Intermediate files
+   ├─ mpa/                 # Converted MPA files
+   │  ├─ sample.MPA.TXT
+   │  ├─ ...
+   ├─ COMBINED.txt         # Merged MPA table
+   └─ txt/                 # Extracted taxonomic levels in TXT
+      ├─ counts_species.txt
+      ├─ counts_genus.txt
+      ├─ ...
+      └─ counts_phylum.txt
 ```
 
 ## Conclusion

From 9664a56954e4346f687897215511a6c5ee4f3c0c Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Tue, 12 May 2026 09:15:00 +0300
Subject: [PATCH 13/18] ci: pre-install numpy to fix biom-format build on
 Python 3.14

biom-format uses a legacy setup.py that imports numpy at metadata
generation time. Pre-installing numpy ensures it is available before
pip resolves scikit-bio's transitive dependencies.
---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 553faa6..4f77e5b 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -27,7 +27,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install flake8 setuptools wheel
+        python -m pip install flake8 setuptools wheel numpy
         pip install -e ".[dev]" --no-build-isolation
     - name: Lint with flake8
       run: |

From 8d5d457b192c939d1c0a0ed9b02c35807d24d50e Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Tue, 12 May 2026 09:31:00 +0300
Subject: [PATCH 14/18] ci: drop Python 3.14 from matrix; cap requires-python
 to <=3.13
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

biom-format (transitive dep of scikit-bio) has no cp314 wheel and
its legacy setup.py has undeclared build-time deps (numpy, Cython).
Pin support to 3.10–3.13 until scikit-bio ecosystem catches up.
---
 .github/workflows/python-package.yml | 4 ++--
 pyproject.toml                       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 4f77e5b..85b8411 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
 
     steps:
     - uses: actions/checkout@v4
@@ -27,7 +27,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install flake8 setuptools wheel numpy
+        python -m pip install flake8 setuptools wheel
         pip install -e ".[dev]" --no-build-isolation
     - name: Lint with flake8
       run: |
diff --git a/pyproject.toml b/pyproject.toml
index b4f3ed5..3d508f6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ description = "A collection of scripts designed to process Kraken2 reports and c
 readme = {file = "README.md", content-type = "text/markdown"}
 license = {file = "LICENSE"}
 authors = [{name = "Ilia Popov", email = "iljapopov17@gmail.com"}]
-requires-python = ">=3.10,<=3.16"
+requires-python = ">=3.10,<=3.13"
 dependencies = [
     "pandas>=2.0.0,<=2.3.3",
     "matplotlib>=3.7.0,<=3.10.8",

From 06ddafb71cc8d343a44ad2e478307bce56265039 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Tue, 12 May 2026 09:48:00 +0300
Subject: [PATCH 15/18] refactor(stats): replace scikit-bio with scipy for beta
 diversity

scikit-bio pulls in biom-format which has no cp314 wheel and
brittle build-time deps. Reimplement subsample_counts in numpy
and use scipy.spatial.distance for Bray-Curtis and Jaccard.
Restore Python 3.14 support and bump requires-python to <=3.16.
---
 .github/workflows/python-package.yml |  2 +-
 krakenparser/stats/diversity.py      | 58 ++++++++++++++--------------
 pyproject.toml                       |  4 +-
 3 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 85b8411..553faa6 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
 
     steps:
     - uses: actions/checkout@v4
diff --git a/krakenparser/stats/diversity.py b/krakenparser/stats/diversity.py
index da3b336..c5ea216 100644
--- a/krakenparser/stats/diversity.py
+++ b/krakenparser/stats/diversity.py
@@ -1,15 +1,14 @@
 #!/usr/bin/env python
 
-import pandas as pd
-import numpy as np
-import sys
 import argparse
+import sys
 from pathlib import Path
-from skbio.diversity import beta_diversity
-from skbio.stats import subsample_counts
+
+import numpy as np
+import pandas as pd
+from scipy.spatial.distance import pdist, squareform
 
 
-# Define Shannon index
 def shannon_index(counts):
     counts = np.array(counts)
     counts = counts[counts > 0]
@@ -17,7 +16,6 @@ def shannon_index(counts):
     return -np.sum(proportions * np.log(proportions))
 
 
-# Define Pielou's evenness
 def pielou_evenness(counts):
     counts = np.asarray(counts)
     S = int(np.sum(counts > 0))
@@ -26,7 +24,6 @@ def pielou_evenness(counts):
     return shannon_index(counts) / np.log(S)
 
 
-# Define Chao1 richness estimator
 def chao1_index(counts):
     counts = np.array(counts)
     S_obs = np.sum(counts > 0)
@@ -37,6 +34,13 @@ def chao1_index(counts):
     return S_obs + (F1 * F1) / (2 * F2)
 
 
+def _subsample_counts(counts: np.ndarray, n: int) -> np.ndarray:
+    """Rarefy counts to n reads by sampling without replacement."""
+    indices = np.repeat(np.arange(len(counts)), counts)
+    sampled = np.random.choice(indices, size=n, replace=False)
+    return np.bincount(sampled, minlength=len(counts)).astype(int)
+
+
 def calc_alpha_div(df, output_path):
     results = []
     for sample_id, row in df.iterrows():
@@ -60,19 +64,23 @@ def calc_beta_div(df, output_path, rarefaction_depth):
     for sample, row in df.iterrows():
         counts = np.round(row.values).astype(int)
         if counts.sum() >= rarefaction_depth:
-            rarefied = subsample_counts(counts, n=rarefaction_depth)
+            rarefied = _subsample_counts(counts, n=rarefaction_depth)
             rarefied_counts.append(rarefied)
             sample_ids.append(sample)
 
     if len(rarefied_counts) < 2:
         raise ValueError("Not enough samples passed the rarefaction threshold.")
 
-    bray_df = beta_diversity(
-        "braycurtis", rarefied_counts, ids=sample_ids
-    ).to_data_frame()
-    jaccard_df = beta_diversity(
-        "jaccard", rarefied_counts, ids=sample_ids
-    ).to_data_frame()
+    X = np.array(rarefied_counts, dtype=float)
+
+    bray_df = pd.DataFrame(
+        squareform(pdist(X, metric="braycurtis")),
+        index=sample_ids, columns=sample_ids,
+    )
+    jaccard_df = pd.DataFrame(
+        squareform(pdist(X.astype(bool).astype(float), metric="jaccard")),
+        index=sample_ids, columns=sample_ids,
+    )
 
     bray_df.to_csv(output_path / "beta_div_bray.csv")
     jaccard_df.to_csv(output_path / "beta_div_jaccard.csv")
@@ -80,20 +88,12 @@ def calc_beta_div(df, output_path, rarefaction_depth):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Calculate α & β-diversities.")
-    parser.add_argument(
-        "-i",
-        "--input",
-        required=True,
-        help="Input total count table CSV (species level).",
-    )
-    parser.add_argument("-o", "--output", required=True, help="Output directory path.")
-    parser.add_argument(
-        "-d",
-        "--depth",
-        type=int,
-        default=1000,
-        help="Rarefaction depth for β diversity (default: 1000).",
-    )
+    parser.add_argument("-i", "--input", required=True,
+                        help="Input total count table CSV (species level).")
+    parser.add_argument("-o", "--output", required=True,
+                        help="Output directory path.")
+    parser.add_argument("-d", "--depth", type=int, default=1000,
+                        help="Rarefaction depth for β diversity (default: 1000).")
     args = parser.parse_args()
 
     input_file = Path(args.input)
diff --git a/pyproject.toml b/pyproject.toml
index 3d508f6..7af15d5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,13 +9,13 @@ description = "A collection of scripts designed to process Kraken2 reports and c
 readme = {file = "README.md", content-type = "text/markdown"}
 license = {file = "LICENSE"}
 authors = [{name = "Ilia Popov", email = "iljapopov17@gmail.com"}]
-requires-python = ">=3.10,<=3.13"
+requires-python = ">=3.10,<=3.16"
 dependencies = [
     "pandas>=2.0.0,<=2.3.3",
     "matplotlib>=3.7.0,<=3.10.8",
     "numpy>=1.24.0,<=2.3.5",
     "seaborn>=0.12.0,<=0.13.2",
-    "scikit-bio>=0.6.0,<=0.7.2",
+    "scipy>=1.9.0,<=1.16.3",
 ]
 
 [project.optional-dependencies]

From 4ce87509c71d3722a2e8b604831b066eca091840 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Tue, 12 May 2026 10:05:00 +0300
Subject: [PATCH 16/18] feat(diversity): add -s/--seed for reproducible
 rarefaction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rarefaction is stochastic — without a fixed seed results differ
between runs. Replace np.random.choice with np.random.default_rng
and propagate the seed through calc_beta_div and run_pipeline.
---
 krakenparser/pipeline.py        |  8 +++++++-
 krakenparser/stats/diversity.py | 13 ++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py
index f0e29c6..02ca6a7 100644
--- a/krakenparser/pipeline.py
+++ b/krakenparser/pipeline.py
@@ -42,6 +42,7 @@ def run_pipeline(
     output_dir: str | None = None,
     keep_human: bool = False,
     rarefaction_depth: int = 1000,
+    seed: int | None = None,
     overwrite: bool = False,
 ) -> None:
     source_dir = Path(input_dir)
@@ -116,7 +117,7 @@ def run_pipeline(
     diversity_dir.mkdir(exist_ok=True)
     df = pd.read_csv(species_csv, index_col=0)
     calc_alpha_div(df, diversity_dir)
-    calc_beta_div(df, diversity_dir, rarefaction_depth=rarefaction_depth)
+    calc_beta_div(df, diversity_dir, rarefaction_depth=rarefaction_depth, seed=seed)
 
     _log.info("All steps completed successfully!")
 
@@ -142,6 +143,10 @@ def main() -> None:
         "-d", "--depth", type=int, default=1000,
         help="Rarefaction depth for β-diversity (default: 1000)",
     )
+    parser.add_argument(
+        "-s", "--seed", type=int, default=None,
+        help="Random seed for reproducible rarefaction (default: random)",
+    )
     parser.add_argument(
         "--overwrite", action="store_true", default=False,
         help="Overwrite the output directory if it already exists",
@@ -151,6 +156,7 @@ def main() -> None:
         args.input, args.output,
         keep_human=args.keep_human,
         rarefaction_depth=args.depth,
+        seed=args.seed,
         overwrite=args.overwrite,
     )
 
diff --git a/krakenparser/stats/diversity.py b/krakenparser/stats/diversity.py
index c5ea216..ac3c87f 100644
--- a/krakenparser/stats/diversity.py
+++ b/krakenparser/stats/diversity.py
@@ -34,10 +34,10 @@ def chao1_index(counts):
     return S_obs + (F1 * F1) / (2 * F2)
 
 
-def _subsample_counts(counts: np.ndarray, n: int) -> np.ndarray:
+def _subsample_counts(counts: np.ndarray, n: int, rng: np.random.Generator) -> np.ndarray:
     """Rarefy counts to n reads by sampling without replacement."""
     indices = np.repeat(np.arange(len(counts)), counts)
-    sampled = np.random.choice(indices, size=n, replace=False)
+    sampled = rng.choice(indices, size=n, replace=False)
     return np.bincount(sampled, minlength=len(counts)).astype(int)
 
 
@@ -57,14 +57,15 @@ def calc_alpha_div(df, output_path):
     alpha_df.to_csv(output_path / "alpha_div.csv")
 
 
-def calc_beta_div(df, output_path, rarefaction_depth):
+def calc_beta_div(df, output_path, rarefaction_depth, seed=None):
+    rng = np.random.default_rng(seed)
     rarefied_counts = []
     sample_ids = []
 
     for sample, row in df.iterrows():
         counts = np.round(row.values).astype(int)
         if counts.sum() >= rarefaction_depth:
-            rarefied = _subsample_counts(counts, n=rarefaction_depth)
+            rarefied = _subsample_counts(counts, n=rarefaction_depth, rng=rng)
             rarefied_counts.append(rarefied)
             sample_ids.append(sample)
 
@@ -94,6 +95,8 @@ def calc_beta_div(df, output_path, rarefaction_depth):
                         help="Output directory path.")
     parser.add_argument("-d", "--depth", type=int, default=1000,
                         help="Rarefaction depth for β diversity (default: 1000).")
+    parser.add_argument("-s", "--seed", type=int, default=None,
+                        help="Random seed for reproducible rarefaction (default: random).")
     args = parser.parse_args()
 
     input_file = Path(args.input)
@@ -105,7 +108,7 @@ def calc_beta_div(df, output_path, rarefaction_depth):
     df = pd.read_csv(input_file, index_col=0)
 
     calc_alpha_div(df, output_dir)
-    calc_beta_div(df, output_dir, args.depth)
+    calc_beta_div(df, output_dir, args.depth, seed=args.seed)
     print(
         f"α & β-diversities have been successfully calculated and saved to '{output_dir}'."
     )

From 185e8d4831e6e1710e6d67b274ac2b5afe5b3007 Mon Sep 17 00:00:00 2001
From: Ilia Popov <iljapopov17@gmail.com>
Date: Tue, 12 May 2026 10:08:00 +0300
Subject: [PATCH 17/18] docs: document -s/--seed flag in Quick Start and
 diversity step

---
 README.md | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0349c7e..f8235f2 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,11 @@ KrakenParser --complete -i data/kreports -o results/
 #Having troubles? Run KrakenParser --complete -h
 ```
 
+For **reproducible** β-diversity (rarefaction is stochastic by default):
+```bash
+KrakenParser -i data/kreports -o results/ -s 42
+```
+
 This will:
 1. Convert Kraken2 reports to MPA format
 2. Combine MPA files into a single file
@@ -218,9 +223,14 @@ KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity
 #Having troubles? Run KrakenParser --diversity -h
 ```
 
-With a custom rarefaction depth for β-diversity:
+With a custom rarefaction depth:
+```bash
+KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -d 750
+```
+
+For reproducible results (rarefaction uses random subsampling — fix the seed to get the same matrix every run):
 ```bash
-KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity --depth 750
+KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -s 42
 ```
 
 ---
@@ -231,6 +241,7 @@ KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity --d
 - Requires `-i`: path to the Kraken2 reports directory (e.g., `data/kreports`).
 - Optional `-o`: output directory (default: parent of `-i`).
 - Optional `--keep-human`: retain human-related taxa (default: filtered out).
+- Optional `-s INT`: random seed for reproducible β-diversity rarefaction (default: random).
 
 ### **--kreport2mpa** (Step 1)
 - Batch mode: `-i DIR -o DIR` — converts all files in a directory.
@@ -260,6 +271,7 @@ KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity --d
 - Shannon, Pielou & Chao1 for α-diversity.
 - Bray-Curtis & Jaccard for β-diversity.
 - `-d INT`: rarefaction depth for β-diversity (default: 1000).
+- `-s INT`: random seed for reproducible rarefaction (default: random — results vary between runs).
 
 </details>
 

From 14a21e2a23e5bcfcff1d614ee5b4b558b0651864 Mon Sep 17 00:00:00 2001
From: Ilya Popov <iljapopov17@gmail.com>
Date: Tue, 12 May 2026 15:49:12 +0200
Subject: [PATCH 18/18] Update README.md

---
 README.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f8235f2..898b34e 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ You can run the entire pipeline with **a single command**, or use the scripts **
 
 ### Total abundance output
 
-`counts_phylum.csv` parsed from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`counts_phylum.csv` parsed from 9 kraken2 reports of metagenomic samples using `KrakenParser`:
 
 ```
 Sample_id,Calditrichota,Caldisericota,Thermosulfidibacterota,Elusimicrobiota,Candidatus Fervidibacterota,Lentisphaerota,Kiritimatiellota,Vulcanimicrobiota,Thermodesulfobiota,Atribacterota,Dictyoglomota,Nitrospinota,Chrysiogenota,Coprothermobacterota,Aquificota,Thermotogota,Bdellovibrionota,Nitrospirota,Deferribacterota,Synergistota,Myxococcota,Acidobacteriota,Candidatus Bipolaricaulota,Candidatus Saccharibacteria,Candidatus Absconditabacteria,Fusobacteriota,Spirochaetota,Candidatus Omnitrophota,Chlamydiota,Verrucomicrobiota,Planctomycetota,Thermodesulfobacteriota,Campylobacterota,Candidatus Cloacimonadota,Fibrobacterota,Gemmatimonadota,Balneolota,Rhodothermota,Ignavibacteriota,Chlorobiota,Bacteroidota,Deinococcota,Thermomicrobiota,Armatimonadota,Chloroflexota,Cyanobacteriota,Mycoplasmatota,Actinomycetota,Bacillota,Pseudomonadota,Heterolobosea,Parabasalia,Fornicata,Evosea,Bacillariophyta,Cercozoa,Euglenozoa,Apicomplexa,Microsporidia,Basidiomycota,Ascomycota,Nanoarchaeota,Candidatus Micrarchaeota,Candidatus Thermoplasmatota,Candidatus Lokiarchaeota,Nitrososphaerota,Euryarchaeota,Thermoproteota,Hofneiviricota,Artverviricota,Nucleocytoviricota,Cossaviricota,Kitrinoviricota,Negarnaviricota,Lenarviricota,Pisuviricota,Peploviricota,Uroviricota
@@ -39,7 +39,7 @@ X9,0,3,2,16,7,1,23,12,10,9,1,2,134,40,390,289,29,372,27,81,150,90,9,88,32,287,88
 
 ### Relative abundance output
 
-`ra_phylum.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`ra_phylum.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`:
 
 ```
 Sample_id,taxon,rel_abund_perc
@@ -61,7 +61,7 @@ X9,Other (<4.0%),1.8979510606688494
 
 ### α-diversity output
 
-`alpha_div.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`alpha_div.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`:
 
 ```
 Sample,Shannon,Pielou,Chao1
@@ -74,7 +74,7 @@ X9,4.033664950188261,0.5050385978575492,3492.16
 
 ### β-diversity output
 
-`beta_div_bray.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`beta_div_bray.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`:
 
 ```
 ,X1,X2,...,X8,X9
@@ -85,7 +85,7 @@ X8,0.61,0.723,...,0.0,0.665
 X9,0.353,0.388,...,0.665,0.0
 ```
 
-`beta_div_jaccard.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`beta_div_jaccard.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`:
 
 ```
 ,X1,X2,...,X8,X9
@@ -278,8 +278,7 @@ KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -s
 ## Example Output Structure
 After running the full pipeline, the output directory will look like this:
 ```
-data/
-├─ kreports/               # Input Kraken2 reports
+results/
 ├─ counts/                 # Total abundance CSV output
 │  ├─ counts_species.csv
 │  ├─ counts_genus.csv
@@ -296,7 +295,7 @@ data/
 │  └─ beta_div_jaccard.csv
 └─ intermediate/           # Intermediate files
    ├─ mpa/                 # Converted MPA files
-   │  ├─ sample.MPA.TXT
+   │  ├─ {sample}.txt
    │  ├─ ...
    ├─ COMBINED.txt         # Merged MPA table
    └─ txt/                 # Extracted taxonomic levels in TXT