diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
deleted file mode 100644
index 9a19b2a..0000000
--- a/.github/workflows/ci.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: pytest
-on:
- push:
- branches: [ dev ]
- pull_request:
- branches: [ dev ]
-
-jobs:
- build-test:
- runs-on: ubuntu-latest
- strategy:
- matrix:
- python-version: [3.12]
- steps:
- - uses: actions/checkout@v3
- - uses: actions/setup-python@v4
- with:
- python-version: ${{ matrix.python-version }}
- - run: |
- python -m pip install --upgrade pip
- pip install krakenparser
- pip install pytest
- - run: pytest -q
\ No newline at end of file
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..3763435
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,53 @@
+name: Publish to PyPI
+
+on:
+ push:
+ tags:
+ - "v*"
+ workflow_dispatch:
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ - name: Install build tools
+ run: pip install build twine
+ - name: Strip GitHub-specific markup from README
+ run: |
+ python - <<'EOF'
+ import re, pathlib
+ text = pathlib.Path("README.md").read_text()
+ text = re.sub(
+ r'
\n',
+ '', text,
+ )
+ text = re.sub(r'\[!\[Downloads\]\([^\)]+\)\]\([^\)]+\)\n', '', text)
+ pathlib.Path("README.md").write_text(text)
+ EOF
+ - name: Build package
+ run: python -m build
+ - name: Check distribution
+ run: twine check dist/*
+ - name: Store distribution packages
+ uses: actions/upload-artifact@v4
+ with:
+ name: dist
+ path: dist/
+
+ publish:
+ needs: build
+ if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+ runs-on: ubuntu-latest
+ environment: release
+ permissions:
+ id-token: write
+ steps:
+ - uses: actions/download-artifact@v4
+ with:
+ name: dist
+ path: dist/
+ - uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 0000000..553faa6
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,46 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python package
+
+on:
+ push:
+ branches: [ "dev", "main" ]
+ pull_request:
+ branches: [ "dev", "main" ]
+
+jobs:
+ build:
+
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install flake8 setuptools wheel
+ pip install -e ".[dev]" --no-build-isolation
+ - name: Lint with flake8
+ run: |
+ # stop the build if there are Python syntax errors or undefined names
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+ - name: Test with pytest
+ run: |
+ pytest --cov=krakenparser --cov-report=xml
+ - name: Upload coverage to Codecov
+ if: matrix.python-version == '3.12'
+ uses: codecov/codecov-action@v5
+ with:
+ files: coverage.xml
+ token: ${{ secrets.CODECOV_TOKEN }}
diff --git a/README.md b/README.md
index 8a848c7..898b34e 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,8 @@

[](https://pepy.tech/project/krakenparser)
-[](https://github.com/PopovIILab/KrakenParser/actions/workflows/ci.yml)
+[](https://github.com/PopovIILab/KrakenParser/actions/workflows/python-package.yml)
+[](https://codecov.io/gh/PopovIILab/KrakenParser)
@@ -24,7 +25,7 @@ You can run the entire pipeline with **a single command**, or use the scripts **
### Total abundance output
-`counts_phylum.csv` parsed from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`counts_phylum.csv` parsed from 9 kraken2 reports of metagenomic samples using `KrakenParser`:
```
Sample_id,Calditrichota,Caldisericota,Thermosulfidibacterota,Elusimicrobiota,Candidatus Fervidibacterota,Lentisphaerota,Kiritimatiellota,Vulcanimicrobiota,Thermodesulfobiota,Atribacterota,Dictyoglomota,Nitrospinota,Chrysiogenota,Coprothermobacterota,Aquificota,Thermotogota,Bdellovibrionota,Nitrospirota,Deferribacterota,Synergistota,Myxococcota,Acidobacteriota,Candidatus Bipolaricaulota,Candidatus Saccharibacteria,Candidatus Absconditabacteria,Fusobacteriota,Spirochaetota,Candidatus Omnitrophota,Chlamydiota,Verrucomicrobiota,Planctomycetota,Thermodesulfobacteriota,Campylobacterota,Candidatus Cloacimonadota,Fibrobacterota,Gemmatimonadota,Balneolota,Rhodothermota,Ignavibacteriota,Chlorobiota,Bacteroidota,Deinococcota,Thermomicrobiota,Armatimonadota,Chloroflexota,Cyanobacteriota,Mycoplasmatota,Actinomycetota,Bacillota,Pseudomonadota,Heterolobosea,Parabasalia,Fornicata,Evosea,Bacillariophyta,Cercozoa,Euglenozoa,Apicomplexa,Microsporidia,Basidiomycota,Ascomycota,Nanoarchaeota,Candidatus Micrarchaeota,Candidatus Thermoplasmatota,Candidatus Lokiarchaeota,Nitrososphaerota,Euryarchaeota,Thermoproteota,Hofneiviricota,Artverviricota,Nucleocytoviricota,Cossaviricota,Kitrinoviricota,Negarnaviricota,Lenarviricota,Pisuviricota,Peploviricota,Uroviricota
@@ -38,7 +39,7 @@ X9,0,3,2,16,7,1,23,12,10,9,1,2,134,40,390,289,29,372,27,81,150,90,9,88,32,287,88
### Relative abundance output
-`ra_phylum.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`ra_phylum.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`:
```
Sample_id,taxon,rel_abund_perc
@@ -60,7 +61,7 @@ X9,Other (<4.0%),1.8979510606688494
### α-diversity output
-`alpha_div.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`alpha_div.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`:
```
Sample,Shannon,Pielou,Chao1
@@ -73,7 +74,7 @@ X9,4.033664950188261,0.5050385978575492,3492.16
### β-diversity output
-`beta_div_bray.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`beta_div_bray.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`:
```
,X1,X2,...,X8,X9
@@ -84,7 +85,7 @@ X8,0.61,0.723,...,0.0,0.665
X9,0.353,0.388,...,0.665,0.0
```
-`beta_div_jaccard.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`beta_div_jaccard.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`:
```
,X1,X2,...,X8,X9
@@ -108,9 +109,15 @@ X9,0.7232472324723247,0.7352941176470589,...,0.8066914498141264,0.0
## Quick Start (Full Pipeline)
To run the full pipeline, use the following command:
```bash
-KrakenParser --complete -i data/kreports
+KrakenParser --complete -i data/kreports -o results/
#Having troubles? Run KrakenParser --complete -h
```
+
+For **reproducible** β-diversity (rarefaction is stochastic by default):
+```bash
+KrakenParser -i data/kreports -o results/ -s 42
+```
+
This will:
1. Convert Kraken2 reports to MPA format
2. Combine MPA files into a single file
@@ -120,162 +127,182 @@ This will:
6. Calculate relative abundance
7. Calculate α & β-diversities
-### **Input Requirements**
-- The Kraken2 reports must be inside a **subdirectory** (e.g., `data/kreports`).
-- The script automatically creates output directories and processes the data.
-
## Installation
```
pip install krakenparser
```
-## Using Individual Modules
-You can also run each step manually if needed.
+## Before Visualization: Grouping Low-Abundance Taxa
+
+The full pipeline automatically calculates relative abundance. Before passing data to visualization, it is strongly recommended to re-run `--relabund` with the `-O` flag — this collapses all taxa below the chosen threshold into a single **"Other"** group, producing much cleaner and more readable plots.
+
+```bash
+KrakenParser --relabund -i data/counts/counts_species.csv -o data/rel_abund/ra_species.csv -O 4
+```
+
+This groups every taxon with relative abundance **< 4 %** into `Other (<4.0%)`. Adjust the threshold to your data.
+
+> **Note:** The pipeline-generated `rel_abund/ra_*.csv` files (no `-O`) preserve the full unfiltered data — use them for statistical analysis. Use the `-O` variant specifically for visualization.
+
+---
+
+
+Using Individual Modules (Advanced)
+
+
+Each step of the pipeline can also be run individually. This is useful for re-running a single step, debugging, or integrating KrakenParser into a custom workflow.
### **Step 1: Convert Kraken2 Reports to MPA Format**
```bash
-KrakenParser --kreport2mpa -i data/kreports -o data/mpa
+# Batch mode (directory)
+KrakenParser --kreport2mpa -i data/kreports -o data/intermediate/mpa
+# Single file
+KrakenParser --kreport2mpa -r data/kreports/sample.kreport -o data/intermediate/mpa/sample.MPA.TXT
#Having troubles? Run KrakenParser --kreport2mpa -h
```
-This script converts Kraken2 `.kreport` files into **MPA format** using KrakenTools.
+Converts Kraken2 `.kreport` files into **MPA format**.
### **Step 2: Combine MPA Files**
```bash
-KrakenParser --combine_mpa -i data/mpa/* -o data/COMBINED.txt
+KrakenParser --combine_mpa -i data/intermediate/mpa/* -o data/intermediate/COMBINED.txt
#Having troubles? Run KrakenParser --combine_mpa -h
```
-This merges multiple MPA files into a single combined file.
+Merges multiple MPA files into a single combined table.
### **Step 3: Extract Taxonomic Levels**
```bash
-KrakenParser --deconstruct -i data/COMBINED.txt -o data/counts
+KrakenParser --deconstruct -i data/intermediate/COMBINED.txt -o data/intermediate
#Having troubles? Run KrakenParser --deconstruct -h
```
-If user wants to inspect **Viruses** domain separately:
+By default, human-related taxa (Homo sapiens, Hominidae, Primates, Mammalia, Chordata) are removed. To keep them:
```bash
-KrakenParser --deconstruct_viruses -i data/COMBINED.txt -o data/counts_viruses
-#Having troubles? Run KrakenParser --deconstruct_viruses -h
+KrakenParser --deconstruct -i data/intermediate/COMBINED.txt -o data/intermediate --keep-human
```
-This step extracts only species-level data (excluding human reads).
+To inspect the **Viruses** domain separately:
+```bash
+KrakenParser --deconstruct_viruses -i data/intermediate/COMBINED.txt -o data/counts_viruses
+#Having troubles? Run KrakenParser --deconstruct_viruses -h
+```
### **Step 4: Process Extracted Taxonomic Data**
```bash
-KrakenParser --process -i data/COMBINED.txt -o data/counts/txt/counts_phylum.txt
+KrakenParser --process -i data/intermediate/COMBINED.txt -o data/intermediate/txt/counts_phylum.txt
#Having troubles? Run KrakenParser --process -h
```
-Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --process` to a loop!
+Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --process` in a loop.
-This script cleans up taxonomic names (removes prefixes, replaces underscores with spaces).
+Cleans up taxonomic names: removes prefixes (`s__`, `g__`, etc.) and replaces underscores with spaces.
### **Step 5: Convert TXT to CSV**
```bash
-KrakenParser --txt2csv -i data/counts/txt/counts_phylum.txt -o data/counts/csv/counts_phylum.csv
+KrakenParser --txt2csv -i data/intermediate/txt/counts_phylum.txt -o data/counts/counts_phylum.csv
#Having troubles? Run KrakenParser --txt2csv -h
```
-Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --txt2csv` to a loop!
+Repeat on other 5 taxonomical levels or wrap in a loop. Transposes data so that sample names become rows.
-This converts the processed text files into structured CSV format.
-
-### **Step 6: Calculate relative abundance**
+### **Step 6: Calculate Relative Abundance**
```bash
-KrakenParser --relabund -i data/counts/csv/counts_phylum.csv -o data/counts/csv_relabund/counts_phylum.csv
+KrakenParser --relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv
#Having troubles? Run KrakenParser --relabund -h
```
-Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --relabund` to a loop!
-
-This calculates relative abundance and saves as CSV format.
+Repeat on other 5 taxonomical levels or wrap in a loop.
-If user wants to group low abundant taxa in "Other" group:
+With "Other" grouping:
```bash
-KrakenParser --relabund -i data/counts/csv/counts_phylum.csv -o data/counts/csv_relabund/counts_phylum.csv --other 3.5
-#Having troubles? Run KrakenParser --relabund -h
+KrakenParser --relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv -O 3.5
```
+Groups all taxa with abundance < 3.5 % into `Other (<3.5%)`.
-This will group all the taxa that have abundance <3.5 into "Other <3.5%" group. Other parameters are welcome!
-
-### **Step 7: Calculate α & β-diversities**
+### **Step 7: Calculate α & β-Diversities**
```bash
-KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity
+KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity
#Having troubles? Run KrakenParser --diversity -h
```
-This calculates α & β-diversities and saves them as CSV format to directory provided in the output.
+With a custom rarefaction depth:
+```bash
+KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -d 750
+```
-If user wants to use another depth for β-diversity calculations:
+For reproducible results (rarefaction uses random subsampling — fix the seed to get the same matrix every run):
```bash
-KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity --depth 750
-#Having troubles? Run KrakenParser --diversity -h
+KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -s 42
```
-Other parameters are welcome!
+---
## Arguments Breakdown
-### **KrakenParser** (Main Pipeline)
-- Automates the entire workflow.
-- Takes **one argument**: the path to Kraken2 reports (`data/kreports`).
-- Runs all the scripts in sequence.
+
+### **--complete** (Full Pipeline)
+- Requires `-i`: path to the Kraken2 reports directory (e.g., `data/kreports`).
+- Optional `-o`: output directory (default: parent of `-i`).
+- Optional `--keep-human`: retain human-related taxa (default: filtered out).
+- Optional `-s INT`: random seed for reproducible β-diversity rarefaction (default: random).
### **--kreport2mpa** (Step 1)
-- Converts Kraken2 reports to MPA format.
-- Uses `KrakenTools/kreport2mpa.py`.
+- Batch mode: `-i DIR -o DIR` — converts all files in a directory.
+- Single-file mode: `-r FILE -o FILE`.
### **--combine_mpa** (Step 2)
-- Combines multiple MPA files into one.
-- Uses `KrakenTools/combine_mpa.py`.
+- `-i FILE [FILE ...]`: one or more MPA files.
+- `-o FILE`: output merged table.
### **--deconstruct** & **--deconstruct_viruses** (Step 3)
- Extracts **phylum, class, order, family, genus, species** into separate text files.
-- Removes human-related reads (**--deconstruct** only).
+- `--deconstruct` removes human-related reads by default; use `--keep-human` to retain them.
+- `--deconstruct_viruses` extracts only the Viruses domain.
### **--process** (Step 4)
-- Cleans and formats extracted taxonomic data.
- Removes prefixes (`s__`, `g__`, etc.), replaces underscores with spaces.
+- `-i`: COMBINED.txt (source for sample-name header); `-o`: target txt file.
### **--txt2csv** (Step 5)
-- Converts cleaned text files to CSV.
-- Transposes data so that sample names become rows.
+- Transposes a processed txt file into a CSV with sample names as rows.
### **--relabund** (Step 6)
-- Calculates relative abundance based on total abundance CSV.
-- Optionally can group low abundant taxa.
+- Calculates relative abundance from a total-counts CSV.
+- `-O FLOAT`: group taxa below FLOAT % into `Other (
## Example Output Structure
After running the full pipeline, the output directory will look like this:
```
-data/
-├─ kreports/ # Input Kraken2 reports
-├─ mpa/ # Converted MPA files
-├─ COMBINED.txt # Merged MPA file
-├─ counts/
-│ ├─ txt/ # Extracted taxonomic levels in TXT
-│ │ ├─ counts_species.txt
-│ │ ├─ counts_genus.txt
-│ │ ├─ counts_family.txt
-│ │ ├─ ...
-│ └─ csv/ # Total abundance CSV output
-│ ├─ counts_species.csv
-│ ├─ counts_genus.csv
-│ ├─ counts_family.csv
-│ ├─ ...
+results/
+├─ counts/ # Total abundance CSV output
+│ ├─ counts_species.csv
+│ ├─ counts_genus.csv
+│ ├─ ...
+│ └─ counts_phylum.csv
├─ rel_abund/ # Relative abundance CSV output
│ ├─ ra_species.csv
│ ├─ ra_genus.csv
-│ ├─ ra_family.csv
│ ├─ ...
-└─ diversity/
- ├─ alpha_div.csv
- ├─ beta_div_bray.csv
- └─ beta_div_jaccard.csv
+│ └─ ra_phylum.csv
+├─ diversity/ # Diversity metrics
+│ ├─ alpha_div.csv
+│ ├─ beta_div_bray.csv
+│ └─ beta_div_jaccard.csv
+└─ intermediate/ # Intermediate files
+ ├─ mpa/ # Converted MPA files
+ │ ├─ {sample}.txt
+ │ ├─ ...
+ ├─ COMBINED.txt # Merged MPA table
+ └─ txt/ # Extracted taxonomic levels in TXT
+ ├─ counts_species.txt
+ ├─ counts_genus.txt
+ ├─ ...
+ └─ counts_phylum.txt
```
## Conclusion
diff --git a/krakenparser/convert2csv.py b/krakenparser/convert2csv.py
deleted file mode 100755
index 2b05823..0000000
--- a/krakenparser/convert2csv.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env python
-
-import shutil
-from pathlib import Path
-import argparse
-import pandas as pd
-
-
-def convert_to_csv(input_file, output_file):
- # Read the entire file into a DataFrame
- data = pd.read_csv(input_file, sep="\t", header=None)
-
- # Set the first row as the header
- data.columns = data.iloc[0]
- data = data.drop(data.index[0])
-
- # Transpose the DataFrame so that sample names become rows and microbial taxa with their abundance become columns
- data_transposed = data.T
- data_transposed.columns = data_transposed.iloc[0]
- data_transposed = data_transposed.drop(data_transposed.index[0])
-
- # Save the transposed data to a new CSV file
- data_transposed.to_csv(output_file, index_label="Sample_id")
- print(f"Data has been successfully converted and saved as '{output_file}'.")
-
- # Remove pycache if it exists
- current_dir = Path(__file__).resolve().parent
- pycache_dir = current_dir / "__pycache__"
- if pycache_dir.exists() and pycache_dir.is_dir():
- shutil.rmtree(pycache_dir)
-
-
-if __name__ == "__main__":
- # Use argparse to handle command-line arguments
- parser = argparse.ArgumentParser(
- description="Reads a TXT file, reorganizes the data, and converts it into a CSV file."
- )
- parser.add_argument(
- "-i",
- "--input",
- required=True,
- help="Path to the input TXT file. This file should contain sample names in columns and microbial taxa in rows.",
- )
- parser.add_argument(
- "-o",
- "--output",
- required=True,
- help="Path to the output CSV file. The script will restructure the data and save it here.",
- )
-
- args = parser.parse_args()
-
- # Call function with parsed arguments
- convert_to_csv(args.input, args.output)
diff --git a/krakenparser/counts/__init__.py b/krakenparser/counts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/krakenparser/counts/convert2csv.py b/krakenparser/counts/convert2csv.py
new file mode 100755
index 0000000..835ffc3
--- /dev/null
+++ b/krakenparser/counts/convert2csv.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+import argparse
+import logging
+from pathlib import Path
+import pandas as pd
+
+_log = logging.getLogger(__name__)
+
+
+def convert_to_csv(input_file, output_file):
+ in_path = Path(input_file)
+ if not in_path.is_file():
+ raise FileNotFoundError(f"Input file not found: {in_path}")
+ out_path = Path(output_file)
+ if not out_path.parent.exists():
+ raise FileNotFoundError(f"Output directory does not exist: {out_path.parent}")
+
+ data = pd.read_csv(in_path, sep="\t", index_col=0)
+ data.T.to_csv(out_path, index_label="Sample_id")
+ _log.info("Data converted and saved as '%s'.", output_file)
+
+
+if __name__ == "__main__":
+ # Use argparse to handle command-line arguments
+ parser = argparse.ArgumentParser(
+ description="Reads a TXT file, reorganizes the data, and converts it into a CSV file."
+ )
+ parser.add_argument(
+ "-i",
+ "--input",
+ required=True,
+ help="Path to the input TXT file. This file should contain sample names in columns and microbial taxa in rows.",
+ )
+ parser.add_argument(
+ "-o",
+ "--output",
+ required=True,
+ help="Path to the output CSV file. The script will restructure the data and save it here.",
+ )
+
+ args = parser.parse_args()
+
+ # Call function with parsed arguments
+ convert_to_csv(args.input, args.output)
diff --git a/krakenparser/processing_script.py b/krakenparser/counts/processing_script.py
similarity index 66%
rename from krakenparser/processing_script.py
rename to krakenparser/counts/processing_script.py
index 911d306..b2b28ff 100755
--- a/krakenparser/processing_script.py
+++ b/krakenparser/counts/processing_script.py
@@ -1,7 +1,8 @@
#!/usr/bin/env python
+import os
import sys
-import shutil
+import tempfile
import argparse
from pathlib import Path
@@ -10,41 +11,45 @@ def modify_taxa_names(line):
prefixes = ["s__", "g__", "f__", "o__", "c__", "p__"]
for prefix in prefixes:
if line.startswith(prefix):
- # Remove the prefix and replace underscores with spaces
- return line[len(prefix) :].replace("_", " ")
+ parts = line[len(prefix):].split("\t")
+ parts[0] = parts[0].replace("_", " ")
+ return "\t".join(parts)
return line
def process_files(source_file, destination_file):
+ src_path = Path(source_file)
+ if not src_path.is_file():
+ raise FileNotFoundError(f"Source file not found: {src_path}")
+ dest_path = Path(destination_file)
+ if not dest_path.is_file():
+ raise FileNotFoundError(f"Destination file not found: {dest_path}")
+
# Read the first line from the source file and modify it
- with open(source_file, "r") as file:
+ with open(src_path, "r") as file:
first_line_source = file.readline()
modified_first_line = "\t".join(
word.split(".")[0] for word in first_line_source.split()
)
# Read all content from the destination file and modify taxa names
- with open(destination_file, "r") as file:
+ with open(dest_path, "r") as file:
lines = file.readlines()
modified_lines = [modify_taxa_names(line.strip()) for line in lines]
# Combine the modified first line with the modified content of the destination file
updated_content = modified_first_line + "\n" + "\n".join(modified_lines)
- # Write the updated content back to the destination file
- with open(destination_file, "w") as file:
- file.write(updated_content)
+ # Write atomically: write to a temp file in the same directory, then replace
+ with tempfile.NamedTemporaryFile(
+ mode="w", dir=dest_path.parent, delete=False, suffix=".tmp"
+ ) as tmp:
+ tmp.write(updated_content)
+ tmp_path = tmp.name
+ os.replace(tmp_path, dest_path)
print(f"Processed {destination_file} successfully.")
- # Get the path to the current directory (same location as the script)
- current_dir = Path(__file__).resolve().parent
- pycache_dir = current_dir / "__pycache__"
-
- # Check if __pycache__ exists and remove it
- if pycache_dir.exists() and pycache_dir.is_dir():
- shutil.rmtree(pycache_dir)
-
if __name__ == "__main__":
# Use argparse to parse command-line arguments
diff --git a/krakenparser/counts/split_mpa.py b/krakenparser/counts/split_mpa.py
new file mode 100644
index 0000000..72b4e07
--- /dev/null
+++ b/krakenparser/counts/split_mpa.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+"""Split a combined MPA table into per-rank TXT files.
+
+Replaces decombine.sh and decombine_viruses.sh.
+"""
+
+import argparse
+import logging
+import re
+import sys
+from pathlib import Path
+
+_log = logging.getLogger(__name__)
+
+
+_RANKS = [
+ ("species", "s__", []),
+ ("genus", "g__", ["s__"]),
+ ("family", "f__", ["s__", "g__"]),
+ ("order", "o__", ["s__", "g__", "f__"]),
+ ("class", "c__", ["s__", "g__", "f__", "o__"]),
+ ("phylum", "p__", ["s__", "g__", "f__", "o__", "c__"]),
+]
+
+_HUMAN_TAXA = {
+ "species": "s__Homo_sapiens",
+ "genus": "g__Homo",
+ "family": "f__Hominidae",
+ "order": "o__Primates",
+ "class": "c__Mammalia",
+ "phylum": "p__Chordata",
+}
+
+_ACCESSION_RE = re.compile(r"(SRS|SRR|SRX|ERS|ERR|ERX|DRS|DRR|DRX)\d*-")
+
+
+def _strip_path_prefix(line: str) -> str:
+ """'d__X|p__Y|s__Z\t10\t20' → 's__Z\t10\t20'"""
+ tab = line.find("\t")
+ if tab == -1:
+ return line
+ path, rest = line[:tab], line[tab:]
+ pipe = path.rfind("|")
+ segment = path[pipe + 1:] if pipe != -1 else path
+ return _ACCESSION_RE.sub("", segment + rest)
+
+
+def split_mpa(
+ input_file: str,
+ output_dir: str,
+ viruses_only: bool = False,
+ keep_human: bool = False,
+) -> None:
+ in_path = Path(input_file)
+ if not in_path.is_file():
+ raise FileNotFoundError(f"Input file not found: {in_path}")
+ out_path = Path(output_dir)
+ (out_path / "txt").mkdir(parents=True, exist_ok=True)
+
+ lines = in_path.read_text().splitlines()
+ data_lines = [ln for ln in lines if not ln.startswith("#") and ln.strip()]
+
+ if viruses_only:
+ data_lines = [ln for ln in data_lines if "d__Viruses" in ln]
+
+ filter_human = not keep_human and not viruses_only
+
+ for rank_name, rank_prefix, exclude_prefixes in _RANKS:
+ result = []
+ human_pattern = _HUMAN_TAXA[rank_name]
+
+ for line in data_lines:
+ if rank_prefix not in line:
+ continue
+ if "t__" in line:
+ continue
+ if any(ep in line for ep in exclude_prefixes):
+ continue
+ if filter_human and human_pattern in line:
+ continue
+ result.append(_strip_path_prefix(line))
+
+ out_file = out_path / "txt" / f"counts_{rank_name}.txt"
+ out_file.write_text("\n".join(result) + ("\n" if result else ""))
+
+ _log.info("MPA file split successfully. Output stored in %s", output_dir)
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(
+ description="Split a combined MPA table into per-rank TXT files."
+ )
+ parser.add_argument("-i", "--input", required=True, help="Input combined MPA file")
+ parser.add_argument("-o", "--output", required=True, help="Output directory")
+ parser.add_argument(
+ "--viruses-only",
+ action="store_true",
+ default=False,
+ help="Extract only Viruses domain taxa",
+ )
+ parser.add_argument(
+ "--keep-human",
+ action="store_true",
+ default=False,
+ help="Do not filter human-related taxa (default: filtered)",
+ )
+ args = parser.parse_args()
+ split_mpa(args.input, args.output, viruses_only=args.viruses_only, keep_human=args.keep_human)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/krakenparser/decombine.sh b/krakenparser/decombine.sh
deleted file mode 100755
index 0832018..0000000
--- a/krakenparser/decombine.sh
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/bin/bash
-
-# Function to display detailed usage information
-usage() {
- echo "Usage: $(basename "$0") -i PATH_TO_SOURCE_FILE -o PATH_TO_DESTINATION"
- echo
- echo " -i, --input PATH_TO_SOURCE_FILE Path to the Combined MPA input file to be processed."
- echo " -o, --output PATH_TO_DESTINATION Path to the directory where processed output files will be stored."
- echo " -h, --help Display this help message and exit."
- echo
- echo "Description:"
- echo " This script processes a combined mpa file by extracting different taxonomic levels"
- echo " (species, genus, family, order, class, and phylum) and saving the results as separate text files."
- echo " Additionally, it removes human-related sequences to improve data accuracy."
- echo
- echo "Processing Details:"
- echo " - Extracts taxonomic levels using 'grep' with specific patterns."
- echo " - Filters out undesired taxonomic entries such as:"
- echo " - Species: Homo sapiens"
- echo " - Genus: Homo"
- echo " - Family: Hominidae"
- echo " - Order: Primates"
- echo " - Class: Mammalia"
- echo " - Phylum: Chordata"
- echo " - Outputs are stored as text files inside the specified destination directory."
- exit 0
-}
-
-# Initialize variables
-SOURCE_FILE=""
-DESTINATION_DIR=""
-
-# Parse command-line arguments
-while [[ "$#" -gt 0 ]]; do
- case "$1" in
- -i|--input) SOURCE_FILE="$2"; shift 2 ;;
- -o|--output) DESTINATION_DIR="$2"; shift 2 ;;
- -h|--help) usage ;;
- *) echo "Error: Unknown option $1"; usage ;;
- esac
-done
-
-# Check if required arguments are provided
-if [[ -z "$SOURCE_FILE" || -z "$DESTINATION_DIR" ]]; then
- echo "Error: Both input (-i) and output (-o) paths are required."
- usage
-fi
-
-# Check if source file exists
-if [[ ! -f "$SOURCE_FILE" ]]; then
- echo "Error: Input file '$SOURCE_FILE' not found!"
- exit 1
-fi
-
-# Create destination directories
-mkdir -p "${DESTINATION_DIR}/txt"
-mkdir -p "${DESTINATION_DIR}/csv"
-
-# Process input file and generate output files
-grep -E "s__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__Homo_sapiens" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_species.txt"
-
-grep -E "g__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__" \
-| grep -v "g__Homo" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_genus.txt"
-
-grep -E "f__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__" \
-| grep -v "g__" \
-| grep -v "f__Hominidae" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_family.txt"
-
-grep -E "o__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__" \
-| grep -v "g__" \
-| grep -v "f__" \
-| grep -v "o__Primates" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_order.txt"
-
-grep -E "c__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__" \
-| grep -v "g__" \
-| grep -v "f__" \
-| grep -v "o__" \
-| grep -v "c__Mammalia" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_class.txt"
-
-grep -E "p__" "${SOURCE_FILE}" \
-| grep -v "t__" \
-| grep -v "s__" \
-| grep -v "g__" \
-| grep -v "f__" \
-| grep -v "o__" \
-| grep -v "c__" \
-| grep -v "p__Chordata" \
-| sed "s/^.*|//g" \
-| sed "s/SRS[0-9]*-//g" \
-> "${DESTINATION_DIR}/txt/counts_phylum.txt"
-
-# Check for errors
-if [ $? -ne 0 ]; then
- echo "Error: Failed to run decombine.sh"
- exit 1
-fi
-
-echo "MPA file decombined successfully. Output stored in $DESTINATION_DIR"
\ No newline at end of file
diff --git a/krakenparser/decombine_viruses.sh b/krakenparser/decombine_viruses.sh
deleted file mode 100755
index 99c8260..0000000
--- a/krakenparser/decombine_viruses.sh
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/bin/bash
-
-# Function to display detailed usage information
-usage() {
- echo "Usage: $(basename "$0") -i PATH_TO_SOURCE_FILE -o PATH_TO_DESTINATION"
- echo
- echo " -i, --input PATH_TO_SOURCE_FILE Path to the Combined MPA input file to be processed."
- echo " -o, --output PATH_TO_DESTINATION Path to the directory where processed output files will be stored."
- echo " -h, --help Display this help message and exit."
- echo
- echo "Description:"
- echo " This script processes a combined mpa file by extracting different taxonomic levels using only VIRUSES domain"
- echo " (species, genus, family, order, class, and phylum) and saving the results as separate text files."
- echo
- echo "Processing Details:"
- echo " - Extracts taxonomic levels only on VIRUSES domain using 'grep' with specific patterns."
- echo " - Outputs are stored as text files inside the specified destination directory."
- exit 0
-}
-
-# Initialize variables
-SOURCE_FILE=""
-DESTINATION_DIR=""
-
-# Parse command-line arguments
-while [[ "$#" -gt 0 ]]; do
- case "$1" in
- -i|--input) SOURCE_FILE="$2"; shift 2 ;;
- -o|--output) DESTINATION_DIR="$2"; shift 2 ;;
- -h|--help) usage ;;
- *) echo "Error: Unknown option $1"; usage ;;
- esac
-done
-
-# Check if required arguments are provided
-if [[ -z "$SOURCE_FILE" || -z "$DESTINATION_DIR" ]]; then
- echo "Error: Both input (-i) and output (-o) paths are required."
- usage
-fi
-
-# Check if source file exists
-if [[ ! -f "$SOURCE_FILE" ]]; then
- echo "Error: Input file '$SOURCE_FILE' not found!"
- exit 1
-fi
-
-# Create destination directories
-mkdir -p "${DESTINATION_DIR}/txt"
-mkdir -p "${DESTINATION_DIR}/csv"
-
-VIRUSES_BUFFER=$(grep -E "d__Viruses" "${SOURCE_FILE}")
-
-# Process input file and generate output files
-echo "$VIRUSES_BUFFER" | grep -E "s__" \
- | grep -v "t__" \
- | sed "s/^.*|//g" \
- | sed "s/SRS[0-9]*-//g" \
- > "${DESTINATION_DIR}/txt/counts_species.txt"
-
-echo "$VIRUSES_BUFFER" | grep -E "g__" \
- | grep -v "t__" \
- | grep -v "s__" \
- | sed "s/^.*|//g" \
- | sed "s/SRS[0-9]*-//g" \
- > "${DESTINATION_DIR}/txt/counts_genus.txt"
-
-echo "$VIRUSES_BUFFER" | grep -E "f__" \
- | grep -v "t__" \
- | grep -v "s__" \
- | grep -v "g__" \
- | sed "s/^.*|//g" \
- | sed "s/SRS[0-9]*-//g" \
- > "${DESTINATION_DIR}/txt/counts_family.txt"
-
-echo "$VIRUSES_BUFFER" | grep -E "o__" \
- | grep -v "t__" \
- | grep -v "s__" \
- | grep -v "g__" \
- | grep -v "f__" \
- | sed "s/^.*|//g" \
- | sed "s/SRS[0-9]*-//g" \
- > "${DESTINATION_DIR}/txt/counts_order.txt"
-
-echo "$VIRUSES_BUFFER" | grep -E "c__" \
- | grep -v "t__" \
- | grep -v "s__" \
- | grep -v "g__" \
- | grep -v "f__" \
- | grep -v "o__" \
- | sed "s/^.*|//g" \
- | sed "s/SRS[0-9]*-//g" \
- > "${DESTINATION_DIR}/txt/counts_class.txt"
-
-echo "$VIRUSES_BUFFER" | grep -E "p__" \
- | grep -v "t__" \
- | grep -v "s__" \
- | grep -v "g__" \
- | grep -v "f__" \
- | grep -v "o__" \
- | grep -v "c__" \
- | sed "s/^.*|//g" \
- | sed "s/SRS[0-9]*-//g" \
- > "${DESTINATION_DIR}/txt/counts_phylum.txt"
-
-# Check for errors
-if [ $? -ne 0 ]; then
- echo "Error: Failed to run decombine_viruses.sh"
- exit 1
-fi
-
-echo "MPA file decombined successfully. Output stored in $DESTINATION_DIR"
\ No newline at end of file
diff --git a/krakenparser/kpplot/base.py b/krakenparser/kpplot/base.py
index 97f5447..2df3f0c 100644
--- a/krakenparser/kpplot/base.py
+++ b/krakenparser/kpplot/base.py
@@ -1,4 +1,5 @@
import matplotlib.pyplot as plt
+import pandas as pd
from typing import Optional
@@ -31,3 +32,27 @@ def savefig(
self.fig.savefig(
path, dpi=dpi, transparent=transparent, bbox_inches=bbox_inches
)
+
+
+def aggregate_by_metadata(
+ df: pd.DataFrame,
+ metadata: pd.DataFrame,
+ metadata_group: str,
+) -> pd.DataFrame:
+ """Merge df with metadata and re-normalise rel_abund_perc per group."""
+ if "Sample_id" not in metadata.columns:
+ raise ValueError("metadata must contain 'Sample_id' column")
+ if metadata_group not in metadata.columns:
+ raise ValueError(f"'{metadata_group}' column not found in metadata")
+ df = df.merge(
+ metadata[["Sample_id", metadata_group]], on="Sample_id", how="left"
+ )
+ df = (
+ df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"]
+ .mean()
+ .rename(columns={metadata_group: "Sample_id"})
+ )
+ df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform(
+ lambda x: (x / x.sum()) * 100
+ )
+ return df
diff --git a/krakenparser/kpplot/clustermap.py b/krakenparser/kpplot/clustermap.py
index 1a18ef8..f0ad5bb 100644
--- a/krakenparser/kpplot/clustermap.py
+++ b/krakenparser/kpplot/clustermap.py
@@ -2,9 +2,7 @@
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Tuple, Union, List
-from pathlib import Path
-import shutil
-from .base import KpPlotBase
+from .base import KpPlotBase, aggregate_by_metadata
class KpClustermap(KpPlotBase):
@@ -81,28 +79,13 @@ def clustermap(
- background_color: Background color of the figure.
Returns:
- - KpStackedBarplot: An object containing the clustermap figure and axis for customization or saving.
+ - KpClustermap: An object containing the clustermap figure and axis for customization or saving.
"""
df = df.copy()
if metadata is not None and metadata_group is not None:
- if "Sample_id" not in metadata.columns:
- raise ValueError("metadata must contain 'Sample_id' column")
- if metadata_group not in metadata.columns:
- raise ValueError(f"'{metadata_group}' column not found in metadata")
-
- df = df.merge(
- metadata[["Sample_id", metadata_group]], on="Sample_id", how="left"
- )
- df = (
- df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"]
- .mean()
- .rename(columns={metadata_group: "Sample_id"})
- )
- df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform(
- lambda x: (x / x.sum()) * 100
- )
+ df = aggregate_by_metadata(df, metadata, metadata_group)
if df[y_axis].dtype == object:
other_mask = df[y_axis].str.startswith("Other")
@@ -187,10 +170,5 @@ def clustermap(
style=yticks_style,
)
- current_dir = Path(__file__).resolve().parent
- pycache_dir = current_dir / "__pycache__"
- if pycache_dir.exists() and pycache_dir.is_dir():
- shutil.rmtree(pycache_dir)
-
plt.close(fig)
return KpClustermap(fig, ax)
diff --git a/krakenparser/kpplot/stackedbar.py b/krakenparser/kpplot/stackedbar.py
index 44f3308..8140437 100644
--- a/krakenparser/kpplot/stackedbar.py
+++ b/krakenparser/kpplot/stackedbar.py
@@ -3,9 +3,7 @@
import seaborn as sns
import numpy as np
from typing import Optional, Tuple, Union, List
-from pathlib import Path
-import shutil
-from .base import KpPlotBase
+from .base import KpPlotBase, aggregate_by_metadata
class KpStackedBarplot(KpPlotBase):
@@ -89,24 +87,12 @@ def stacked_barplot(
df = df.copy()
if metadata is not None and metadata_group is not None:
- if "Sample_id" not in metadata.columns:
- raise ValueError("metadata must contain 'Sample_id' column")
- if metadata_group not in metadata.columns:
- raise ValueError(f"'{metadata_group}' column not found in metadata")
-
- df = df.merge(
- metadata[["Sample_id", metadata_group]], on="Sample_id", how="left"
- )
- df = (
- df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"]
- .mean()
- .rename(columns={metadata_group: "Sample_id"})
- )
- df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform(
- lambda x: (x / x.sum()) * 100
- )
+ df = aggregate_by_metadata(df, metadata, metadata_group)
if sample_order is not None:
+ missing = set(sample_order) - set(df["Sample_id"].unique())
+ if missing:
+ raise ValueError(f"Samples missing from data: {missing}")
df = df[df["Sample_id"].isin(sample_order)]
df["Sample_id"] = pd.Categorical(
df["Sample_id"], categories=sample_order, ordered=True
@@ -127,6 +113,11 @@ def stacked_barplot(
zip(df_plot.columns, sns.color_palette(cmap, n_colors=len(df_plot.columns)))
)
elif isinstance(cmap, list):
+ if len(cmap) < len(df_plot.columns):
+ raise ValueError(
+ f"cmap has {len(cmap)} colors but the data has {len(df_plot.columns)} "
+ "taxa; provide at least as many colors as taxa."
+ )
color_dict = dict(zip(df_plot.columns, cmap))
else:
raise ValueError("cmap must be a str or a list of colors")
@@ -199,10 +190,5 @@ def stacked_barplot(
ax.set_xlim(-0.5, len(df_plot.index) - 0.5)
- current_dir = Path(__file__).resolve().parent
- pycache_dir = current_dir / "__pycache__"
- if pycache_dir.exists() and pycache_dir.is_dir():
- shutil.rmtree(pycache_dir)
-
plt.close(fig)
return KpStackedBarplot(fig, ax)
diff --git a/krakenparser/kpplot/streamgraph.py b/krakenparser/kpplot/streamgraph.py
index ac82e46..6c389a2 100644
--- a/krakenparser/kpplot/streamgraph.py
+++ b/krakenparser/kpplot/streamgraph.py
@@ -3,9 +3,7 @@
import seaborn as sns
import numpy as np
from typing import Optional, Tuple, Union, List
-from pathlib import Path
-import shutil
-from .base import KpPlotBase
+from .base import KpPlotBase, aggregate_by_metadata
class KpStreamgraph(KpPlotBase):
@@ -90,24 +88,12 @@ def streamgraph(
df = df.copy()
if metadata is not None and metadata_group is not None:
- if "Sample_id" not in metadata.columns:
- raise ValueError("metadata must contain 'Sample_id' column")
- if metadata_group not in metadata.columns:
- raise ValueError(f"'{metadata_group}' column not found in metadata")
-
- df = df.merge(
- metadata[["Sample_id", metadata_group]], on="Sample_id", how="left"
- )
- df = (
- df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"]
- .mean()
- .rename(columns={metadata_group: "Sample_id"})
- )
- df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform(
- lambda x: (x / x.sum()) * 100
- )
+ df = aggregate_by_metadata(df, metadata, metadata_group)
if sample_order is not None:
+ missing = set(sample_order) - set(df["Sample_id"].unique())
+ if missing:
+ raise ValueError(f"Samples missing from data: {missing}")
df = df[df["Sample_id"].isin(sample_order)]
df["Sample_id"] = pd.Categorical(
df["Sample_id"], categories=sample_order, ordered=True
@@ -128,6 +114,11 @@ def streamgraph(
zip(df_plot.columns, sns.color_palette(cmap, n_colors=len(df_plot.columns)))
)
elif isinstance(cmap, list):
+ if len(cmap) < len(df_plot.columns):
+ raise ValueError(
+ f"cmap has {len(cmap)} colors but the data has {len(df_plot.columns)} "
+ "taxa; provide at least as many colors as taxa."
+ )
color_dict = dict(zip(df_plot.columns, cmap))
else:
raise ValueError("cmap must be a str or a list of colors")
@@ -190,7 +181,7 @@ def streamgraph(
text.set_fontstyle(legend_fontstyle)
if grid:
- ax.grid(axis="x", linestyle=grid_linestyle, alpha=grid_alpha, zorder=0)
+ ax.grid(axis="y", linestyle=grid_linestyle, alpha=grid_alpha, zorder=0)
labels = df_plot.index.tolist()
plt.xticks(
@@ -206,12 +197,6 @@ def streamgraph(
ax.set_xlim(-0.5, len(df_plot.index) - 0.5)
- current_dir = Path(__file__).resolve().parent
- pycache_dir = current_dir / "__pycache__"
-
- if pycache_dir.exists() and pycache_dir.is_dir():
- shutil.rmtree(pycache_dir)
-
plt.close(fig)
return KpStreamgraph(fig, ax)
diff --git a/krakenparser/kraken2csv.sh b/krakenparser/kraken2csv.sh
deleted file mode 100755
index 5b53f75..0000000
--- a/krakenparser/kraken2csv.sh
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/bin/bash
-
-# Function to display usage
-usage() {
- echo "Usage: $(basename "$0") -i|--input PATH_TO_SOURCE"
- echo
- echo " -i, --input Path to the source directory — Kraken2 reports must be inside a subdirectory (e.g., data/kreports)"
- echo " -h Show this help message"
- exit 0
-}
-
-# Parse command-line arguments
-SOURCE_DIR=""
-
-while [[ $# -gt 0 ]]; do
- case "$1" in
- -i|--input) SOURCE_DIR="$2"; shift 2 ;;
- -h|--help) usage ;;
- *) echo "Unknown option: $1"; usage ;;
- esac
-done
-
-# Ensure SOURCE_DIR is set
-if [ -z "$SOURCE_DIR" ]; then
- echo "Error: input is required."
- usage
-fi
-
-# Determine the directory of this script
-SCRIPT_DIR=$(dirname "$(realpath "$0")")
-
-# PART 1: CONVERT KRAKEN2 TO MPA
-
-# Setting the path to the source file directory and destination directory
-PARENT_DIR=$(dirname "$SOURCE_DIR")
-MPA_DIR="$PARENT_DIR/mpa"
-
-# Run the old script with the correct paths
-"$SCRIPT_DIR/run_kreport2mpa.sh" -i "$SOURCE_DIR" -o "$MPA_DIR"
-
-# PART 2: COMBINING MPAs
-
-COMBINED_FILE="$PARENT_DIR/COMBINED.txt"
-python "$SCRIPT_DIR/combine_mpa.py" -i "$MPA_DIR"/* -o "$COMBINED_FILE"
-if [ $? -ne 0 ]; then
- echo "Error: Failed to run combine_mpa.py"
- exit 1
-fi
-echo "MPA files combined successfully. Output stored in $COMBINED_FILE"
-
-# PART 3: DECOMBINING MPAs
-
-COUNTS_DIR="$PARENT_DIR/counts"
-
-"$SCRIPT_DIR/decombine.sh" -i "$COMBINED_FILE" -o "$COUNTS_DIR"
-
-# PART 4: PROCESS COUNTS TXT FILES
-
-for file in "$COUNTS_DIR"/txt/counts_*.txt; do
- python "$SCRIPT_DIR/processing_script.py" -i "$COMBINED_FILE" -o "$file"
- if [ $? -ne 0 ]; then
- echo "Error: Failed to process $file"
- exit 1
- fi
-done
-
-# PART 5: CONVERT TXT FILES TO CSV
-
-for file in "$COUNTS_DIR"/txt/counts_*.txt; do
- CSV_FILE="$COUNTS_DIR/csv/$(basename "$file" .txt).csv"
- python "$SCRIPT_DIR/convert2csv.py" -i "$file" -o "$CSV_FILE"
-done
-
-# PART 6: CALCULATE RELATIVE ABUNDANCE
-
-mkdir -p "$PARENT_DIR/rel_abund"
-
-for file in "$COUNTS_DIR"/csv/counts_*.csv; do
- base=$(basename "$file")
- new_name="${base/counts_/ra_}"
- CSV_RA_FILE="$PARENT_DIR/rel_abund/$new_name"
- python "$SCRIPT_DIR/relabund.py" -i "$file" -o "$CSV_RA_FILE"
-done
-
-# PART 7: CALCULATE α & β-DIVERSITIES
-
-CSV_SPECIES_FILE="$COUNTS_DIR/csv/counts_species.csv"
-python "$SCRIPT_DIR/diversity.py" -i "$CSV_SPECIES_FILE" -o "$PARENT_DIR/diversity"
-
-echo "All steps completed successfully!"
\ No newline at end of file
diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py
index 5eb9c71..6a560ce 100755
--- a/krakenparser/krakenparser.py
+++ b/krakenparser/krakenparser.py
@@ -1,13 +1,18 @@
import argparse
+import logging
import subprocess
-import shutil
from pathlib import Path
import sys
-from .version import __version__
+from importlib.metadata import version as _pkg_version, PackageNotFoundError as _PNF
+try:
+ __version__ = _pkg_version("krakenparser")
+except _PNF:
+ __version__ = "unknown"
# Main function to run the tool
def main():
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
print("KrakenParser by Ilia V. Popov")
# Set up argument parser
parser = argparse.ArgumentParser(
@@ -17,7 +22,7 @@ def main():
parser.add_argument(
"--complete",
action="store_true",
- help="Run the full pipeline automated",
+ help="Run the full pipeline (also the default when -i is given)",
)
parser.add_argument(
"--kreport2mpa",
@@ -65,19 +70,23 @@ def main():
args, extra_args = parser.parse_known_args()
+ for _a in extra_args:
+ if "\x00" in _a:
+ sys.exit("Error: argument contains invalid null byte.")
+
package_dir = Path(__file__).resolve().parent # Directory of the current script
- # Map flags to corresponding scripts
+ # Map flags to (script_path, base_args_to_prepend)
command_map = {
- "complete": package_dir / "kraken2csv.sh",
- "kreport2mpa": package_dir / "run_kreport2mpa.sh",
- "combine_mpa": package_dir / "combine_mpa.py",
- "deconstruct": package_dir / "decombine.sh",
- "deconstruct_viruses": package_dir / "decombine_viruses.sh",
- "process": package_dir / "processing_script.py",
- "txt2csv": package_dir / "convert2csv.py",
- "relabund": package_dir / "relabund.py",
- "diversity": package_dir / "diversity.py",
+ "complete": (package_dir / "pipeline.py", []),
+ "kreport2mpa": (package_dir / "mpa" / "transform2mpa.py", []),
+ "combine_mpa": (package_dir / "mpa" / "mpa_table.py", []),
+ "deconstruct": (package_dir / "counts" / "split_mpa.py", []),
+ "deconstruct_viruses":(package_dir / "counts" / "split_mpa.py", ["--viruses-only"]),
+ "process": (package_dir / "counts" / "processing_script.py", []),
+ "txt2csv": (package_dir / "counts" / "convert2csv.py", []),
+ "relabund": (package_dir / "stats" / "relabund.py", []),
+ "diversity": (package_dir / "stats" / "diversity.py", []),
}
if "-h" in sys.argv or "--help" in sys.argv:
@@ -85,21 +94,29 @@ def main():
parser.print_help()
return
+ def _build_cmd(script: Path, base_args: list[str], user_args: list[str]) -> list[str]:
+ if script.suffix == ".py":
+ # Run as module (-m) so the krakenparser package stays importable.
+ # Derive dotted module name from path relative to the package root.
+ module = ".".join(
+ script.relative_to(package_dir.parent).with_suffix("").parts
+ )
+ return [sys.executable, "-m", module] + base_args + user_args
+ return [str(script)] + base_args + user_args
+
# Find which argument was given and run the corresponding script
- for arg, script in command_map.items():
+ for arg, (script, base_args) in command_map.items():
if getattr(args, arg):
- subprocess.run(
- [script] + extra_args, check=True
- ) # Pass extra arguments to the script
+ subprocess.run(_build_cmd(script, base_args, extra_args), check=True)
return
- parser.print_help()
+ # Default to full pipeline when -i/--input is given without a subcommand
+ if "-i" in extra_args or "--input" in extra_args:
+ complete_script, complete_base = command_map["complete"]
+ subprocess.run(_build_cmd(complete_script, complete_base, extra_args), check=True)
+ return
- pycache_dir = package_dir / "__pycache__"
-
- # Check if __pycache__ exists and remove it
- if pycache_dir.exists() and pycache_dir.is_dir():
- shutil.rmtree(pycache_dir)
+ parser.print_help()
if __name__ == "__main__":
diff --git a/krakenparser/mpa/__init__.py b/krakenparser/mpa/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/krakenparser/mpa/mpa_table.py b/krakenparser/mpa/mpa_table.py
new file mode 100644
index 0000000..f66e63b
--- /dev/null
+++ b/krakenparser/mpa/mpa_table.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+"""Combine multiple MPA-format files into a single merged table."""
+
+import argparse
+import logging
+
+_log = logging.getLogger(__name__)
+
+
+def combine_mpa(in_files: list[str], o_file: str) -> None:
+ # Plain dict preserves insertion order (Python 3.7+).
+ taxa: dict[str, dict[int, str]] = {}
+ sample_names: list[str] = []
+
+ _log.info("Number of files to parse: %d", len(in_files))
+
+ for idx, in_path in enumerate(in_files):
+ sample_name = f"Sample #{idx + 1}"
+ with open(in_path) as fh:
+ for line in fh:
+ line = line.rstrip("\n")
+ if not line:
+ continue
+ if line.startswith("#"):
+ cols = line.split("\t")
+ if len(cols) >= 2:
+ sample_name = cols[-1]
+ continue
+ cols = line.split("\t", 1)
+ if len(cols) < 2:
+ continue
+ taxon, count = cols[0], cols[1]
+ if taxon not in taxa:
+ taxa[taxon] = {}
+ taxa[taxon][idx] = count
+ sample_names.append(sample_name)
+
+ n_samples = len(sample_names)
+ n_taxa = len(taxa)
+ _log.info("Number of classifications to write: %d", n_taxa)
+
+ with open(o_file, "w") as fh:
+ fh.write("#Classification\t" + "\t".join(sample_names) + "\n")
+ for taxon, counts in taxa.items():
+ row = [counts.get(i, "0") for i in range(n_samples)]
+ fh.write(taxon + "\t" + "\t".join(row) + "\n")
+
+ _log.info("%d classifications written", n_taxa)
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(
+ description="Combine MPA files into a single tab-delimited table."
+ )
+ parser.add_argument(
+ "-i", "--input",
+ required=True, nargs="+", dest="in_files",
+ help="Input MPA files (one per sample)",
+ )
+ parser.add_argument(
+ "-o", "--output",
+ required=True, dest="o_file",
+ help="Output merged MPA file",
+ )
+ args = parser.parse_args()
+ combine_mpa(args.in_files, args.o_file)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/krakenparser/mpa/transform2mpa.py b/krakenparser/mpa/transform2mpa.py
new file mode 100644
index 0000000..480a1f9
--- /dev/null
+++ b/krakenparser/mpa/transform2mpa.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+"""Convert a Kraken2 report to MetaPhlAn (MPA) format."""
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+# Maps Kraken2 single-letter rank codes to MPA prefixes
+_RANK_PREFIX = {
+ "D": "d", "K": "k", "P": "p", "C": "c",
+ "O": "o", "F": "f", "G": "g", "S": "s",
+}
+
+
+def _parse_line(line: str):
+ """
+ Parse one Kraken2 report line.
+
+ Standard format (6 columns):
+ pct cum_reads direct_reads rank taxid name(indented)
+
+ Returns (name, depth, rank, cum_reads, pct) or None on malformed input.
+ """
+ parts = line.rstrip("\n").split("\t")
+ if len(parts) < 5:
+ return None
+ try:
+ pct = float(parts[0])
+ cum_reads = int(parts[1])
+ except ValueError:
+ return None
+
+ rank = parts[3].strip()
+ name_field = parts[-1] # always the last column regardless of format variant
+
+ depth = 0
+ for ch in name_field:
+ if ch == " ":
+ depth += 1
+ else:
+ break
+ name = name_field.strip()
+
+ return name, depth // 2, rank, cum_reads, pct
+
+
+def kreport_to_mpa(
+ report_path: str,
+ output_path: str,
+ display_header: bool = False,
+ include_intermediate: bool = False,
+ use_reads: bool = True,
+ remove_spaces: bool = True,
+) -> None:
+ """
+ Convert a single Kraken2 report to MPA format.
+
+ Uses a stack to track the current taxonomic path. Each entry is
+ (structural_depth, mpa_segment, is_standard_rank). When a node at
+ depth d is encountered, all stack entries with depth >= d are popped
+ before the new entry is pushed, keeping the path consistent.
+ """
+ # Stack entries: (structural_depth, mpa_segment, is_standard_rank)
+ stack: list[tuple[int, str, bool]] = []
+
+ with open(report_path) as r_fh, open(output_path, "w") as o_fh:
+ if display_header:
+ o_fh.write("#Classification\t" + os.path.basename(report_path) + "\n")
+
+ for line in r_fh:
+ parsed = _parse_line(line)
+ if parsed is None:
+ continue
+ name, depth, rank, cum_reads, pct = parsed
+
+ # Skip unclassified and root — never appear in MPA output
+ if rank in ("U", "R"):
+ continue
+
+ # Strip numeric suffix to get base rank (e.g. "S1" → "S", "G2" → "G")
+ rank_base = rank.rstrip("0123456789")
+ is_standard = rank_base in _RANK_PREFIX and rank == rank_base
+
+ if not is_standard and not include_intermediate:
+ continue
+
+ prefix = _RANK_PREFIX.get(rank_base, "x")
+ seg_name = name.replace(" ", "_") if remove_spaces else name
+ mpa_seg = f"{prefix}__{seg_name}"
+
+ # Trim stack to the current structural depth
+ while stack and stack[-1][0] >= depth:
+ stack.pop()
+ stack.append((depth, mpa_seg, is_standard))
+
+ # Build the full MPA path; omit intermediate (x__) segments when not requested
+ path = "|".join(
+ seg
+ for (_, seg, std) in stack
+ if include_intermediate or std
+ )
+
+ value = str(cum_reads) if use_reads else str(pct)
+ o_fh.write(f"{path}\t{value}\n")
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(
+ description="Convert a Kraken2 report to MetaPhlAn (MPA) format."
+ )
+
+ mode = parser.add_mutually_exclusive_group(required=True)
+ mode.add_argument(
+ "-r", "--report-file", "--report",
+ dest="r_file",
+ help="Single input Kraken2 report file",
+ )
+ mode.add_argument(
+ "-i", "--input",
+ dest="input_dir",
+ help="Input directory containing Kraken2 report files (batch mode)",
+ )
+
+ parser.add_argument(
+ "-o", "--output",
+ required=True, dest="o_file",
+ help="Output MPA file (single mode) or output directory (batch mode)",
+ )
+ parser.add_argument(
+ "--display-header",
+ action="store_true", dest="add_header", default=False,
+ help="Write a header line with the sample name (filename)",
+ )
+ parser.add_argument(
+ "--read_count",
+ action="store_true", dest="use_reads", default=True,
+ help="Output clade read counts [default]",
+ )
+ parser.add_argument(
+ "--percentages",
+ action="store_false", dest="use_reads",
+ help="Output percentages instead of read counts",
+ )
+ parser.add_argument(
+ "--intermediate-ranks",
+ action="store_true", dest="x_include", default=False,
+ help="Include non-standard taxonomic ranks in output",
+ )
+ parser.add_argument(
+ "--no-intermediate-ranks",
+ action="store_false", dest="x_include",
+ help="Exclude non-standard taxonomic ranks [default]",
+ )
+ group = parser.add_mutually_exclusive_group()
+ group.add_argument(
+ "--remove-spaces",
+ action="store_true", dest="remove_spaces", default=True,
+ help="Replace spaces with underscores in taxon names [default]",
+ )
+ group.add_argument(
+ "--keep-spaces",
+ action="store_false", dest="remove_spaces",
+ help="Keep spaces in taxon names",
+ )
+ args = parser.parse_args()
+
+ kwargs = dict(
+ display_header=args.add_header,
+ include_intermediate=args.x_include,
+ use_reads=args.use_reads,
+ remove_spaces=args.remove_spaces,
+ )
+
+ if args.input_dir:
+ input_dir = Path(args.input_dir)
+ if not input_dir.is_dir():
+ sys.exit(f"Error: input directory not found: {input_dir}")
+ output_dir = Path(args.o_file)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ for f in sorted(input_dir.iterdir()):
+ if not f.is_file():
+ continue
+ out_name = f.name.replace(".kreport", ".MPA.TXT")
+ kreport_to_mpa(str(f), str(output_dir / out_name), **kwargs)
+ print(f"Converted to MPA successfully. Output stored in {output_dir}")
+ else:
+ kreport_to_mpa(args.r_file, args.o_file, **kwargs)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py
new file mode 100644
index 0000000..02ca6a7
--- /dev/null
+++ b/krakenparser/pipeline.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python
+"""Full KrakenParser pipeline: kreport → MPA → combined → counts → rel_abund → diversity."""
+
+import argparse
+import logging
+import shutil
+import sys
+from pathlib import Path
+
+_log = logging.getLogger(__name__)
+
+import pandas as pd
+
+from krakenparser.mpa.transform2mpa import kreport_to_mpa
+from krakenparser.mpa.mpa_table import combine_mpa
+from krakenparser.counts.split_mpa import split_mpa
+from krakenparser.counts.processing_script import process_files
+from krakenparser.counts.convert2csv import convert_to_csv
+from krakenparser.stats.relabund import calculate_rel_abund
+from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div
+
+
+def _is_processable(path: Path) -> bool:
+ """Return False for hidden files, files with null bytes, or non-UTF-8 files."""
+ if path.name.startswith("."):
+ return False
+ try:
+ chunk = path.read_bytes()[:1024]
+ if b"\x00" in chunk:
+ return False
+ chunk.decode("utf-8")
+ return True
+ except (UnicodeDecodeError, OSError):
+ return False
+
+
+_OUTPUT_SUBDIRS = ("intermediate", "counts", "rel_abund", "diversity")
+
+
+def run_pipeline(
+ input_dir: str,
+ output_dir: str | None = None,
+ keep_human: bool = False,
+ rarefaction_depth: int = 1000,
+ seed: int | None = None,
+ overwrite: bool = False,
+) -> None:
+ source_dir = Path(input_dir)
+ if not source_dir.is_dir():
+ sys.exit(f"Error: input directory not found: {source_dir}")
+
+ out_dir = Path(output_dir) if output_dir else source_dir.parent
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ existing = [out_dir / d for d in _OUTPUT_SUBDIRS if (out_dir / d).exists()]
+ if existing and not overwrite:
+ names = ", ".join(d.name for d in existing)
+ sys.exit(
+ f"Error: output already exists in '{out_dir}' ({names}).\n"
+ "Use --overwrite to overwrite it."
+ )
+ if overwrite:
+ for d in existing:
+ shutil.rmtree(d)
+ _log.info("Removed existing directory: %s", d)
+
+ intermediate_dir = out_dir / "intermediate"
+ intermediate_dir.mkdir(exist_ok=True)
+
+ # Part 1: kreport → MPA
+ mpa_dir = intermediate_dir / "mpa"
+ mpa_dir.mkdir(exist_ok=True)
+ for f in sorted(source_dir.iterdir()):
+ if not f.is_file():
+ continue
+ if not _is_processable(f):
+ _log.info("Skipping: %s", f.name)
+ continue
+ out_name = f.stem + ".MPA.TXT"
+ kreport_to_mpa(str(f), str(mpa_dir / out_name), display_header=True)
+
+ # Part 2: combine MPAs
+ mpa_files = sorted(mpa_dir.glob("*.MPA.TXT"))
+ if not mpa_files:
+ sys.exit("Error: no MPA files found after conversion.")
+ combined_file = intermediate_dir / "COMBINED.txt"
+ combine_mpa([str(f) for f in mpa_files], str(combined_file))
+ _log.info("MPA files combined. Output: %s", combined_file)
+
+ # Part 3: split combined MPA by rank
+ split_mpa(str(combined_file), str(intermediate_dir), keep_human=keep_human)
+ txt_dir = intermediate_dir / "txt"
+
+ # Part 4: clean taxa names and add sample header
+ for txt_file in sorted(txt_dir.glob("counts_*.txt")):
+ process_files(str(combined_file), str(txt_file))
+
+ # Part 5: TXT → CSV
+ counts_dir = out_dir / "counts"
+ counts_dir.mkdir(exist_ok=True)
+ for txt_file in sorted(txt_dir.glob("counts_*.txt")):
+ csv_file = counts_dir / txt_file.with_suffix(".csv").name
+ convert_to_csv(str(txt_file), str(csv_file))
+
+ # Part 6: relative abundance
+ rel_abund_dir = out_dir / "rel_abund"
+ rel_abund_dir.mkdir(exist_ok=True)
+ for csv_file in sorted(counts_dir.glob("counts_*.csv")):
+ ra_file = rel_abund_dir / csv_file.name.replace("counts_", "ra_")
+ calculate_rel_abund(str(csv_file), str(ra_file))
+
+ # Part 7: α & β-diversities
+ species_csv = counts_dir / "counts_species.csv"
+ if not species_csv.exists():
+ sys.exit(f"Error: species counts not found: {species_csv}")
+ diversity_dir = out_dir / "diversity"
+ diversity_dir.mkdir(exist_ok=True)
+ df = pd.read_csv(species_csv, index_col=0)
+ calc_alpha_div(df, diversity_dir)
+ calc_beta_div(df, diversity_dir, rarefaction_depth=rarefaction_depth, seed=seed)
+
+ _log.info("All steps completed successfully!")
+
+
+def main() -> None:
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
+ parser = argparse.ArgumentParser(
+ description="Run the full KrakenParser pipeline."
+ )
+ parser.add_argument(
+ "-i", "--input", required=True,
+ help="Directory containing Kraken2 report files",
+ )
+ parser.add_argument(
+ "-o", "--output", default=None,
+ help="Output directory (default: parent of input)",
+ )
+ parser.add_argument(
+ "--keep-human", action="store_true", default=False,
+ help="Do not filter human-related taxa (default: filtered)",
+ )
+ parser.add_argument(
+ "-d", "--depth", type=int, default=1000,
+ help="Rarefaction depth for β-diversity (default: 1000)",
+ )
+ parser.add_argument(
+ "-s", "--seed", type=int, default=None,
+ help="Random seed for reproducible rarefaction (default: random)",
+ )
+ parser.add_argument(
+ "--overwrite", action="store_true", default=False,
+ help="Overwrite the output directory if it already exists",
+ )
+ args = parser.parse_args()
+ run_pipeline(
+ args.input, args.output,
+ keep_human=args.keep_human,
+ rarefaction_depth=args.depth,
+ seed=args.seed,
+ overwrite=args.overwrite,
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/krakenparser/run_kreport2mpa.sh b/krakenparser/run_kreport2mpa.sh
deleted file mode 100755
index 7b40ee9..0000000
--- a/krakenparser/run_kreport2mpa.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-# Function to display usage information
-usage() {
- echo "Usage: $(basename "$0") -i PATH_TO_SOURCE -o PATH_TO_DESTINATION"
- echo
- echo " -i, --input PATH_TO_SOURCE Path to the source directory containing files to convert."
- echo " -o, --output PATH_TO_DESTINATION Path to the destination directory where converted files will be stored."
- echo " -h, --help Display this help message and exit."
- echo
- echo "This script converts files in the source directory with the .kreport extension to .MPA.TXT format."
- echo "It uses KrakenTools' \`kreport2mpa.py\` Python script to perform the conversion."
- echo "The converted files will be saved in the specified destination directory."
- exit 0
-}
-
-# Initialize variables
-SOURCE_DIR=""
-DESTINATION_DIR=""
-
-# Parse command-line arguments
-while [[ "$#" -gt 0 ]]; do
- case "$1" in
- -i|--input) SOURCE_DIR="$2"; shift 2 ;;
- -o|--output) DESTINATION_DIR="$2"; shift 2 ;;
- -h|--help) usage ;;
- *) echo "Unknown option: $1"; usage ;;
- esac
-done
-
-# Check if required arguments are provided
-if [ -z "$SOURCE_DIR" ] || [ -z "$DESTINATION_DIR" ]; then
- echo "Error: Both input and output paths are required."
- usage
-fi
-
-# Determine the directory of this script
-SCRIPT_DIR=$(dirname "$(realpath "$0")")
-
-# Ensure KrakenTools script exists
-KREPORT_SCRIPT="$SCRIPT_DIR/kreport2mpa.py"
-if [ ! -f "$KREPORT_SCRIPT" ]; then
- echo "Error: kreport2mpa.py not found in $SCRIPT_DIR"
- exit 1
-fi
-
-# Create the destination folder if it does not exist
-mkdir -p "${DESTINATION_DIR}"
-
-# Convert files in the source directory
-for file in "${SOURCE_DIR}"/*.*; do
- # Get the file name without path
- filename=$(basename "${file}")
- # Form the command to process the file
- python "$SCRIPT_DIR/kreport2mpa.py" -r "${file}" -o "${DESTINATION_DIR}/${filename/.kreport/.MPA.TXT}" --display-header
-done
-
-# Check exit status
-if [ $? -ne 0 ]; then
- echo "Error: Conversion process failed."
- exit 1
-fi
-
-echo "Converted to MPA successfully. Output stored in $DESTINATION_DIR"
\ No newline at end of file
diff --git a/krakenparser/stats/__init__.py b/krakenparser/stats/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/krakenparser/diversity.py b/krakenparser/stats/diversity.py
similarity index 52%
rename from krakenparser/diversity.py
rename to krakenparser/stats/diversity.py
index 2fe68f5..ac3c87f 100644
--- a/krakenparser/diversity.py
+++ b/krakenparser/stats/diversity.py
@@ -1,16 +1,14 @@
#!/usr/bin/env python
-import pandas as pd
-import numpy as np
-import sys
-import shutil
import argparse
+import sys
from pathlib import Path
-from skbio.diversity import beta_diversity
-from skbio.stats import subsample_counts
+
+import numpy as np
+import pandas as pd
+from scipy.spatial.distance import pdist, squareform
-# Define Shannon index
def shannon_index(counts):
counts = np.array(counts)
counts = counts[counts > 0]
@@ -18,16 +16,14 @@ def shannon_index(counts):
return -np.sum(proportions * np.log(proportions))
-# Define Pielou's evenness
def pielou_evenness(counts):
- counts = np.array(counts)
- counts = counts[counts > 0]
- H = shannon_index(counts)
- S = len(counts)
- return H / np.log(S) if S > 1 else 0
+ counts = np.asarray(counts)
+ S = int(np.sum(counts > 0))
+ if S <= 1:
+ return np.nan
+ return shannon_index(counts) / np.log(S)
-# Define Chao1 richness estimator
def chao1_index(counts):
counts = np.array(counts)
S_obs = np.sum(counts > 0)
@@ -38,6 +34,13 @@ def chao1_index(counts):
return S_obs + (F1 * F1) / (2 * F2)
+def _subsample_counts(counts: np.ndarray, n: int, rng: np.random.Generator) -> np.ndarray:
+ """Rarefy counts to n reads by sampling without replacement."""
+ indices = np.repeat(np.arange(len(counts)), counts)
+ sampled = rng.choice(indices, size=n, replace=False)
+ return np.bincount(sampled, minlength=len(counts)).astype(int)
+
+
def calc_alpha_div(df, output_path):
results = []
for sample_id, row in df.iterrows():
@@ -54,26 +57,31 @@ def calc_alpha_div(df, output_path):
alpha_df.to_csv(output_path / "alpha_div.csv")
-def calc_beta_div(df, output_path, rarefaction_depth):
+def calc_beta_div(df, output_path, rarefaction_depth, seed=None):
+ rng = np.random.default_rng(seed)
rarefied_counts = []
sample_ids = []
for sample, row in df.iterrows():
- counts = row.values.astype(int)
+ counts = np.round(row.values).astype(int)
if counts.sum() >= rarefaction_depth:
- rarefied = subsample_counts(counts, n=rarefaction_depth)
+ rarefied = _subsample_counts(counts, n=rarefaction_depth, rng=rng)
rarefied_counts.append(rarefied)
sample_ids.append(sample)
if len(rarefied_counts) < 2:
raise ValueError("Not enough samples passed the rarefaction threshold.")
- bray_df = beta_diversity(
- "braycurtis", rarefied_counts, ids=sample_ids
- ).to_data_frame()
- jaccard_df = beta_diversity(
- "jaccard", rarefied_counts, ids=sample_ids
- ).to_data_frame()
+ X = np.array(rarefied_counts, dtype=float)
+
+ bray_df = pd.DataFrame(
+ squareform(pdist(X, metric="braycurtis")),
+ index=sample_ids, columns=sample_ids,
+ )
+ jaccard_df = pd.DataFrame(
+ squareform(pdist(X.astype(bool).astype(float), metric="jaccard")),
+ index=sample_ids, columns=sample_ids,
+ )
bray_df.to_csv(output_path / "beta_div_bray.csv")
jaccard_df.to_csv(output_path / "beta_div_jaccard.csv")
@@ -81,34 +89,26 @@ def calc_beta_div(df, output_path, rarefaction_depth):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Calculate α & β-diversities.")
- parser.add_argument(
- "-i",
- "--input",
- required=True,
- help="Input total count table CSV (species level).",
- )
- parser.add_argument("-o", "--output", required=True, help="Output directory path.")
- parser.add_argument(
- "-d",
- "--depth",
- type=int,
- default=1000,
- help="Rarefaction depth for β diversity (default: 1000).",
- )
+ parser.add_argument("-i", "--input", required=True,
+ help="Input total count table CSV (species level).")
+ parser.add_argument("-o", "--output", required=True,
+ help="Output directory path.")
+ parser.add_argument("-d", "--depth", type=int, default=1000,
+ help="Rarefaction depth for β diversity (default: 1000).")
+ parser.add_argument("-s", "--seed", type=int, default=None,
+ help="Random seed for reproducible rarefaction (default: random).")
args = parser.parse_args()
input_file = Path(args.input)
+ if not input_file.is_file():
+ sys.exit(f"Error: input file not found: {input_file}")
output_dir = Path(args.output)
output_dir.mkdir(parents=True, exist_ok=True)
df = pd.read_csv(input_file, index_col=0)
calc_alpha_div(df, output_dir)
- calc_beta_div(df, output_dir, args.depth)
+ calc_beta_div(df, output_dir, args.depth, seed=args.seed)
print(
f"α & β-diversities have been successfully calculated and saved to '{output_dir}'."
)
-
- pycache_dir = Path(__file__).resolve().parent / "__pycache__"
- if pycache_dir.exists() and pycache_dir.is_dir():
- shutil.rmtree(pycache_dir)
diff --git a/krakenparser/relabund.py b/krakenparser/stats/relabund.py
similarity index 71%
rename from krakenparser/relabund.py
rename to krakenparser/stats/relabund.py
index d7a7143..171f280 100644
--- a/krakenparser/relabund.py
+++ b/krakenparser/stats/relabund.py
@@ -1,14 +1,24 @@
#!/usr/bin/env python
-import shutil
-from pathlib import Path
+import logging
+import warnings
import argparse
+from pathlib import Path
import pandas as pd
+_log = logging.getLogger(__name__)
+
def calculate_rel_abund(input_file, output_file, other_threshold=None):
+ in_path = Path(input_file)
+ if not in_path.is_file():
+ raise FileNotFoundError(f"Input file not found: {in_path}")
+ out_path = Path(output_file)
+ if not out_path.parent.exists():
+ raise FileNotFoundError(f"Output directory does not exist: {out_path.parent}")
+
# Load counts table
- df = pd.read_csv(input_file)
+ df = pd.read_csv(in_path)
# Reshape to long format: Sample_id, taxon, abundance
long_df = df.melt(id_vars=["Sample_id"], var_name="taxon", value_name="abundance")
@@ -16,6 +26,15 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None):
# Summarize total abundance per sample (used for percentage calculation)
total_abundance = long_df.groupby("Sample_id")["abundance"].transform("sum")
+ zero_samples = long_df.groupby("Sample_id")["abundance"].sum()
+ zero_samples = zero_samples[zero_samples == 0].index.tolist()
+ if zero_samples:
+ warnings.warn(
+ f"Samples with zero total abundance were excluded from output: {zero_samples}",
+ UserWarning,
+ stacklevel=2,
+ )
+
# Calculate relative abundance (%)
long_df["rel_abund_perc"] = (long_df["abundance"] / total_abundance) * 100
@@ -40,16 +59,10 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None):
# Save to CSV
result.to_csv(output_file, index=False)
- print(
- f"Relative abundance has been successfully calculated and saved as '{output_file}'."
+ _log.info(
+ "Relative abundance saved as '%s'.", output_file
)
- # Remove __pycache__
- current_dir = Path(__file__).resolve().parent
- pycache_dir = current_dir / "__pycache__"
- if pycache_dir.exists():
- shutil.rmtree(pycache_dir)
-
if __name__ == "__main__":
parser = argparse.ArgumentParser(
diff --git a/krakenparser/version.py b/krakenparser/version.py
deleted file mode 100755
index 43c4ab0..0000000
--- a/krakenparser/version.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = "0.6.1"
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..7af15d5
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,50 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel>=0.42"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "krakenparser"
+version = "1.0.0"
+description = "A collection of scripts designed to process Kraken2 reports and convert them into CSV format."
+readme = {file = "README.md", content-type = "text/markdown"}
+license = {file = "LICENSE"}
+authors = [{name = "Ilia Popov", email = "iljapopov17@gmail.com"}]
+requires-python = ">=3.10,<=3.16"
+dependencies = [
+ "pandas>=2.0.0,<=2.3.3",
+ "matplotlib>=3.7.0,<=3.10.8",
+ "numpy>=1.24.0,<=2.3.5",
+ "seaborn>=0.12.0,<=0.13.2",
+ "scipy>=1.9.0,<=1.16.3",
+]
+
+[project.optional-dependencies]
+dev = [
+ "pytest>=7.0.0",
+ "pytest-cov>=4.0.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/PopovIILab/KrakenParser"
+
+[project.scripts]
+KrakenParser = "krakenparser.krakenparser:main"
+
+[tool.setuptools.packages.find]
+where = ["."]
+
+
+[tool.pytest.ini_options]
+filterwarnings = [
+ "ignore:This figure includes Axes that are not compatible with tight_layout:UserWarning",
+ "ignore:Tight layout not applied.*:UserWarning",
+ "ignore:Samples with zero total abundance.*:UserWarning",
+]
+
+[tool.coverage.run]
+source = ["krakenparser"]
+omit = [
+ "krakenparser/krakenparser.py",
+ "krakenparser/mpa/transform2mpa.py",
+ "krakenparser/mpa/mpa_table.py",
+]
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..8100a3e
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,66 @@
+import matplotlib
+matplotlib.use("Agg")
+
+import pandas as pd
+import pytest
+
+
+SAMPLE_KREPORT = (
+ "99.98\t999980\t0\tR\t1\troot\n"
+ "99.98\t999980\t0\tD\t2\t Bacteria\n"
+ "85.00\t850000\t0\tP\t1224\t Pseudomonadota\n"
+ "50.00\t500000\t200000\tG\t286\t Pseudomonas\n"
+ "30.00\t300000\t300000\tS\t287\t Pseudomonas aeruginosa\n"
+ "20.00\t200000\t200000\tG\t561\t Escherichia\n"
+ "20.00\t200000\t200000\tS\t562\t Escherichia coli\n"
+ "10.00\t100000\t0\tP\t976\t Bacteroidota\n"
+ "10.00\t100000\t100000\tS\t817\t Bacteroides fragilis\n"
+ "0.02\t200\t200\tU\t0\tunclassified\n"
+)
+
+# Tab-delimited TXT as produced by processing_script.py
+SAMPLE_COUNTS_TXT = (
+ "#Classification\tsample1\tsample2\n"
+ "Pseudomonas aeruginosa\t300000\t100000\n"
+ "Escherichia coli\t200000\t50000\n"
+ "Bacteroides fragilis\t100000\t200000\n"
+)
+
+
+@pytest.fixture
+def kreport_file(tmp_path):
+ f = tmp_path / "sample.kreport"
+ f.write_text(SAMPLE_KREPORT)
+ return f
+
+
+@pytest.fixture
+def counts_txt_file(tmp_path):
+ f = tmp_path / "counts_species.txt"
+ f.write_text(SAMPLE_COUNTS_TXT)
+ return f
+
+
+@pytest.fixture
+def counts_csv_file(tmp_path):
+ df = pd.DataFrame({
+ "Sample_id": ["S1", "S2"],
+ "Pseudomonas aeruginosa": [300000, 100000],
+ "Escherichia coli": [200000, 50000],
+ "Bacteroides fragilis": [100000, 200000],
+ })
+ f = tmp_path / "counts_species.csv"
+ df.to_csv(f, index=False)
+ return f
+
+
+@pytest.fixture
+def relabund_df():
+ return pd.DataFrame({
+ "Sample_id": ["S1", "S1", "S1", "S2", "S2", "S2"],
+ "taxon": [
+ "Pseudomonadota", "Bacillota", "Other (<4.0%)",
+ "Pseudomonadota", "Bacillota", "Other (<4.0%)",
+ ],
+ "rel_abund_perc": [70.0, 20.0, 10.0, 50.0, 35.0, 15.0],
+ })
diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py
index 360c781..599f625 100644
--- a/tests/test_full_pipeline.py
+++ b/tests/test_full_pipeline.py
@@ -1,9 +1,10 @@
import zipfile
import shutil
-import subprocess
import pytest
from pathlib import Path
+from krakenparser.pipeline import run_pipeline
+
@pytest.fixture
def demo_run(tmp_path):
@@ -29,21 +30,45 @@ def test_full_pipeline_end_to_end(demo_run):
run_dir = demo_run["run_dir"]
kreports_path = run_dir / "kreports"
- # 1. Run KrakenParser
- cmd = ["KrakenParser", "--complete", "-i", str(kreports_path)]
- result = subprocess.run(cmd, capture_output=True, text=True)
- assert result.returncode == 0, f"Pipeline failed:\n{result.stderr}"
+ run_pipeline(str(kreports_path))
- # 2. Assert each rank‐level CSV exists and is non‐empty
+ # Assert each rank-level CSV exists and is non-empty
ranks = ["phylum", "class", "order", "family", "genus", "species"]
for rank in ranks:
- csv_path = run_dir / "counts" / "csv" / f"counts_{rank}.csv"
+ csv_path = run_dir / "counts" / f"counts_{rank}.csv"
assert csv_path.exists(), f"Missing counts_{rank}.csv"
assert csv_path.stat().st_size > 0, f"counts_{rank}.csv is empty"
- # 3. Assert relative‐abundance CSVs exist and are non‐empty
+ # Assert relative-abundance CSVs exist and are non-empty
rel_dir = run_dir / "rel_abund"
assert rel_dir.exists(), "rel_abund directory is missing"
rel_species = rel_dir / "ra_species.csv"
- assert rel_species.exists(), "Missing ra_species.csv in csv_relabund"
+ assert rel_species.exists(), "Missing ra_species.csv"
assert rel_species.stat().st_size > 0, "ra_species.csv is empty"
+
+ # Assert diversity outputs exist
+ diversity_dir = run_dir / "diversity"
+ assert (diversity_dir / "alpha_div.csv").exists()
+
+ # Assert intermediate files exist
+ assert (run_dir / "intermediate" / "COMBINED.txt").exists()
+
+
+def test_pipeline_overwrite_protection(demo_run):
+ run_dir = demo_run["run_dir"]
+ kreports_path = run_dir / "kreports"
+
+ run_pipeline(str(kreports_path))
+
+ # Second run without --overwrite must exit
+ with pytest.raises(SystemExit):
+ run_pipeline(str(kreports_path))
+
+
+def test_pipeline_overwrite_flag(demo_run):
+ run_dir = demo_run["run_dir"]
+ kreports_path = run_dir / "kreports"
+
+ run_pipeline(str(kreports_path))
+ # Second run with overwrite=True must succeed
+ run_pipeline(str(kreports_path), overwrite=True)
diff --git a/tests/test_integration.py b/tests/test_integration.py
new file mode 100644
index 0000000..7fed2fb
--- /dev/null
+++ b/tests/test_integration.py
@@ -0,0 +1,356 @@
+"""Characterization / integration tests — file I/O, reproducibility."""
+
+import hashlib
+import itertools
+
+import pandas as pd
+import pytest
+
+from krakenparser.mpa.transform2mpa import kreport_to_mpa
+from krakenparser.counts.convert2csv import convert_to_csv
+from krakenparser.counts.processing_script import process_files
+from krakenparser.counts.split_mpa import split_mpa
+from krakenparser.stats.relabund import calculate_rel_abund
+from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div
+
+
+SAMPLE_COMBINED_MPA = (
+ "#Classification\tsample1\tsample2\n"
+ "d__Bacteria|p__Pseudomonadota|g__Pseudomonas|s__Pseudomonas_aeruginosa\t300\t100\n"
+ "d__Bacteria|p__Pseudomonadota|g__Pseudomonas\t500\t200\n"
+ "d__Bacteria|p__Pseudomonadota|g__Homo|s__Homo_sapiens\t50\t30\n"
+ "d__Bacteria|p__Pseudomonadota|g__Homo\t60\t35\n"
+ "d__Bacteria|p__Pseudomonadota\t850\t350\n"
+ "d__Bacteria|p__Bacteroidota\t100\t80\n"
+ "d__Viruses|p__Uroviricota|s__Virus_alpha\t10\t5\n"
+)
+
+
+@pytest.fixture
+def combined_mpa_file(tmp_path):
+ f = tmp_path / "COMBINED.txt"
+ f.write_text(SAMPLE_COMBINED_MPA)
+ return f
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _sha256(path) -> str:
+ return hashlib.sha256(path.read_bytes()).hexdigest()
+
+
+# ---------------------------------------------------------------------------
+# kreport_to_mpa
+# ---------------------------------------------------------------------------
+
+def test_kreport_to_mpa_reproducible(kreport_file, tmp_path):
+ counter = itertools.count()
+
+ def run():
+ out = tmp_path / f"out_{next(counter)}.MPA.TXT"
+ kreport_to_mpa(str(kreport_file), str(out), display_header=True)
+ return _sha256(out)
+
+ assert run() == run()
+
+
+def test_kreport_to_mpa_standard_ranks_only(kreport_file, tmp_path):
+ out = tmp_path / "out.MPA.TXT"
+ kreport_to_mpa(str(kreport_file), str(out))
+ lines = out.read_text().splitlines()
+ paths = [ln.split("\t")[0] for ln in lines]
+
+ assert any("p__Pseudomonadota" in p for p in paths)
+ assert any("s__Pseudomonas_aeruginosa" in p for p in paths)
+ assert any(p.startswith("d__Bacteria|") for p in paths)
+
+
+def test_kreport_to_mpa_excludes_unclassified_and_root(kreport_file, tmp_path):
+ out = tmp_path / "out.MPA.TXT"
+ kreport_to_mpa(str(kreport_file), str(out))
+ content = out.read_text()
+ assert "unclassified" not in content
+ assert "root" not in content
+
+
+def test_kreport_to_mpa_display_header(kreport_file, tmp_path):
+ out = tmp_path / "out.MPA.TXT"
+ kreport_to_mpa(str(kreport_file), str(out), display_header=True)
+ first_line = out.read_text().splitlines()[0]
+ assert first_line.startswith("#Classification")
+ assert kreport_file.name in first_line
+
+
+def test_kreport_to_mpa_paths_are_hierarchical(kreport_file, tmp_path):
+ out = tmp_path / "out.MPA.TXT"
+ kreport_to_mpa(str(kreport_file), str(out))
+ lines = out.read_text().splitlines()
+ for ln in lines:
+ path = ln.split("\t")[0]
+ segments = path.split("|")
+ # Each segment must have a known two-letter prefix
+ for seg in segments:
+ assert "__" in seg, f"Unexpected MPA segment format: {seg!r}"
+
+
+# ---------------------------------------------------------------------------
+# convert_to_csv
+# ---------------------------------------------------------------------------
+
+def test_convert_to_csv_reproducible(counts_txt_file, tmp_path):
+ counter = itertools.count()
+
+ def run():
+ out = tmp_path / f"out_{next(counter)}.csv"
+ convert_to_csv(str(counts_txt_file), str(out))
+ return _sha256(out)
+
+ assert run() == run()
+
+
+def test_convert_to_csv_transposes_correctly(counts_txt_file, tmp_path):
+ out = tmp_path / "counts.csv"
+ convert_to_csv(str(counts_txt_file), str(out))
+ df = pd.read_csv(out)
+
+ assert "Sample_id" in df.columns
+ assert set(df["Sample_id"]) == {"sample1", "sample2"}
+ assert "Pseudomonas aeruginosa" in df.columns
+
+
+# ---------------------------------------------------------------------------
+# process_files
+# ---------------------------------------------------------------------------
+
+def test_process_files_adds_header_and_cleans_names(tmp_path):
+ source = tmp_path / "COMBINED.txt"
+ source.write_text(
+ "#Classification\tsample1.kreport\tsample2.kreport\n"
+ "d__Bacteria|p__X|s__Pseudomonas_aeruginosa\t300\t100\n"
+ )
+ dest = tmp_path / "counts_species.txt"
+ dest.write_text(
+ "s__Pseudomonas_aeruginosa\t300\t100\n"
+ "s__Escherichia_coli\t200\t50\n"
+ )
+ process_files(str(source), str(dest))
+ result = dest.read_text()
+ lines = result.splitlines()
+ assert lines[0] == "#Classification\tsample1\tsample2"
+ assert "Pseudomonas aeruginosa" in result
+ assert "Escherichia coli" in result
+ assert "s__" not in result
+
+
+def test_process_files_reproducible(tmp_path):
+ source = tmp_path / "COMBINED.txt"
+ source.write_text("#Classification\tsample1.kreport\n")
+ for i in range(2):
+ dest = tmp_path / f"counts_{i}.txt"
+ dest.write_text("s__Some_species\t10\n")
+ process_files(str(source), str(dest))
+ assert (tmp_path / "counts_0.txt").read_text() == (tmp_path / "counts_1.txt").read_text()
+
+
+def test_process_files_missing_source_raises(tmp_path):
+ dest = tmp_path / "counts.txt"
+ dest.write_text("s__X\t10\n")
+ with pytest.raises(FileNotFoundError):
+ process_files(str(tmp_path / "ghost.txt"), str(dest))
+
+
+def test_process_files_missing_dest_raises(tmp_path):
+ source = tmp_path / "COMBINED.txt"
+ source.write_text("#Classification\tsample1.kreport\n")
+ with pytest.raises(FileNotFoundError):
+ process_files(str(source), str(tmp_path / "ghost.txt"))
+
+
+def test_convert_to_csv_missing_input_raises(tmp_path):
+ with pytest.raises(FileNotFoundError):
+ convert_to_csv(str(tmp_path / "ghost.txt"), str(tmp_path / "out.csv"))
+
+
+# ---------------------------------------------------------------------------
+# calculate_rel_abund
+# ---------------------------------------------------------------------------
+
+def test_relabund_reproducible(counts_csv_file, tmp_path):
+ counter = itertools.count()
+
+ def run():
+ out = tmp_path / f"ra_{next(counter)}.csv"
+ calculate_rel_abund(str(counts_csv_file), str(out))
+ return _sha256(out)
+
+ assert run() == run()
+
+
+def test_relabund_sums_to_100_per_sample(counts_csv_file, tmp_path):
+ out = tmp_path / "ra.csv"
+ calculate_rel_abund(str(counts_csv_file), str(out))
+ df = pd.read_csv(out)
+ for sample, grp in df.groupby("Sample_id"):
+ total = grp["rel_abund_perc"].sum()
+ assert total == pytest.approx(100.0, abs=1e-6), f"{sample}: sum={total}"
+
+
+def test_relabund_other_threshold_creates_other_group(counts_csv_file, tmp_path):
+ out = tmp_path / "ra.csv"
+ calculate_rel_abund(str(counts_csv_file), str(out), other_threshold=99.0)
+ df = pd.read_csv(out)
+ assert df["taxon"].str.startswith("Other").any()
+
+
+def test_relabund_no_zero_rows(counts_csv_file, tmp_path):
+ out = tmp_path / "ra.csv"
+ calculate_rel_abund(str(counts_csv_file), str(out))
+ df = pd.read_csv(out)
+ assert (df["rel_abund_perc"] > 0).all()
+
+
+def test_relabund_missing_input_raises(tmp_path):
+ with pytest.raises(FileNotFoundError):
+ calculate_rel_abund(str(tmp_path / "ghost.csv"), str(tmp_path / "out.csv"))
+
+
+# ---------------------------------------------------------------------------
+# calc_alpha_div
+# ---------------------------------------------------------------------------
+
+def test_alpha_div_reproducible(counts_csv_file, tmp_path):
+ df = pd.read_csv(counts_csv_file, index_col=0)
+ counter = itertools.count()
+
+ def run():
+ out_dir = tmp_path / f"div_{next(counter)}"
+ out_dir.mkdir()
+ calc_alpha_div(df, out_dir)
+ return _sha256(out_dir / "alpha_div.csv")
+
+ assert run() == run()
+
+
+def test_alpha_div_output_columns(counts_csv_file, tmp_path):
+ df = pd.read_csv(counts_csv_file, index_col=0)
+ out_dir = tmp_path / "diversity"
+ out_dir.mkdir()
+ calc_alpha_div(df, out_dir)
+ result = pd.read_csv(out_dir / "alpha_div.csv")
+ assert set(result.columns) == {"Sample", "Shannon", "Pielou", "Chao1"}
+ assert len(result) == len(df)
+
+
+def test_alpha_div_shannon_non_negative(counts_csv_file, tmp_path):
+ df = pd.read_csv(counts_csv_file, index_col=0)
+ out_dir = tmp_path / "diversity"
+ out_dir.mkdir()
+ calc_alpha_div(df, out_dir)
+ result = pd.read_csv(out_dir / "alpha_div.csv")
+ assert (result["Shannon"] >= 0).all()
+
+
+# ---------------------------------------------------------------------------
+# calc_beta_div
+# ---------------------------------------------------------------------------
+
+def test_beta_div_output_files_exist(counts_csv_file, tmp_path):
+ df = pd.read_csv(counts_csv_file, index_col=0)
+ out_dir = tmp_path / "diversity"
+ out_dir.mkdir()
+ calc_beta_div(df, out_dir, rarefaction_depth=1000)
+ assert (out_dir / "beta_div_bray.csv").exists()
+ assert (out_dir / "beta_div_jaccard.csv").exists()
+
+
+def test_beta_div_matrix_is_square(counts_csv_file, tmp_path):
+ df = pd.read_csv(counts_csv_file, index_col=0)
+ out_dir = tmp_path / "diversity"
+ out_dir.mkdir()
+ calc_beta_div(df, out_dir, rarefaction_depth=1000)
+ bray = pd.read_csv(out_dir / "beta_div_bray.csv", index_col=0)
+ assert bray.shape[0] == bray.shape[1]
+
+
+def test_beta_div_diagonal_is_zero(counts_csv_file, tmp_path):
+ df = pd.read_csv(counts_csv_file, index_col=0)
+ out_dir = tmp_path / "diversity"
+ out_dir.mkdir()
+ calc_beta_div(df, out_dir, rarefaction_depth=1000)
+ bray = pd.read_csv(out_dir / "beta_div_bray.csv", index_col=0)
+ import numpy as np
+ assert np.allclose(np.diag(bray.values), 0.0)
+
+
+def test_beta_div_too_few_samples_raises(tmp_path):
+ df = pd.DataFrame(
+ {"Taxon_A": [100], "Taxon_B": [200]}, index=["S1"]
+ )
+ out_dir = tmp_path / "diversity"
+ out_dir.mkdir()
+ with pytest.raises(ValueError, match="rarefaction"):
+ calc_beta_div(df, out_dir, rarefaction_depth=1000)
+
+
+# ---------------------------------------------------------------------------
+# split_mpa
+# ---------------------------------------------------------------------------
+
+def test_split_mpa_creates_all_rank_files(combined_mpa_file, tmp_path):
+ split_mpa(str(combined_mpa_file), str(tmp_path))
+ for rank in ("species", "genus", "family", "order", "class", "phylum"):
+ assert (tmp_path / "txt" / f"counts_{rank}.txt").exists()
+
+
+def test_split_mpa_reproducible(combined_mpa_file, tmp_path):
+ counter = itertools.count()
+
+ def run():
+ out = tmp_path / f"out_{next(counter)}"
+ split_mpa(str(combined_mpa_file), str(out))
+ return _sha256(out / "txt" / "counts_species.txt")
+
+ assert run() == run()
+
+
+def test_split_mpa_filters_human_by_default(combined_mpa_file, tmp_path):
+ split_mpa(str(combined_mpa_file), str(tmp_path))
+ species = (tmp_path / "txt" / "counts_species.txt").read_text()
+ assert "Homo_sapiens" not in species
+ genus = (tmp_path / "txt" / "counts_genus.txt").read_text()
+ assert "g__Homo" not in genus
+
+
+def test_split_mpa_keep_human_retains_homo(combined_mpa_file, tmp_path):
+ split_mpa(str(combined_mpa_file), str(tmp_path), keep_human=True)
+ species = (tmp_path / "txt" / "counts_species.txt").read_text()
+ assert "Homo_sapiens" in species
+
+
+def test_split_mpa_viruses_only(combined_mpa_file, tmp_path):
+ split_mpa(str(combined_mpa_file), str(tmp_path), viruses_only=True)
+ species = (tmp_path / "txt" / "counts_species.txt").read_text()
+ assert "Virus_alpha" in species
+ assert "Pseudomonas_aeruginosa" not in species
+
+
+def test_split_mpa_strips_path_prefix(combined_mpa_file, tmp_path):
+ split_mpa(str(combined_mpa_file), str(tmp_path))
+ species = (tmp_path / "txt" / "counts_species.txt").read_text()
+ # No MPA path separators should remain
+ assert "|" not in species
+ # The rank prefix should remain (processing_script strips it later)
+ assert "s__" in species
+
+
+def test_split_mpa_genus_excludes_species_lines(combined_mpa_file, tmp_path):
+ split_mpa(str(combined_mpa_file), str(tmp_path))
+ genus = (tmp_path / "txt" / "counts_genus.txt").read_text()
+ assert "s__" not in genus
+
+
+def test_split_mpa_missing_input_raises(tmp_path):
+ with pytest.raises(FileNotFoundError):
+ split_mpa(str(tmp_path / "ghost.txt"), str(tmp_path / "out"))
diff --git a/tests/test_kpplot.py b/tests/test_kpplot.py
new file mode 100644
index 0000000..51cb5e1
--- /dev/null
+++ b/tests/test_kpplot.py
@@ -0,0 +1,105 @@
+"""kpplot smoke tests and parameter-validation tests."""
+
+import pytest
+
+from krakenparser.kpplot.stackedbar import stacked_barplot
+from krakenparser.kpplot.streamgraph import streamgraph
+from krakenparser.kpplot.clustermap import clustermap
+from krakenparser.kpplot.base import KpPlotBase, aggregate_by_metadata
+
+
+# ---------------------------------------------------------------------------
+# Smoke tests — verify each plot function returns without error
+# ---------------------------------------------------------------------------
+
+def test_stackedbar_returns_kpplotbase(relabund_df):
+ result = stacked_barplot(relabund_df)
+ assert isinstance(result, KpPlotBase)
+
+
+def test_streamgraph_returns_kpplotbase(relabund_df):
+ result = streamgraph(relabund_df)
+ assert isinstance(result, KpPlotBase)
+
+
+def test_clustermap_returns_kpplotbase(relabund_df):
+ result = clustermap(relabund_df)
+ assert isinstance(result, KpPlotBase)
+
+
+# ---------------------------------------------------------------------------
+# sample_order validation
+# ---------------------------------------------------------------------------
+
+def test_stackedbar_sample_order_missing_raises(relabund_df):
+ with pytest.raises(ValueError, match="Samples missing"):
+ stacked_barplot(relabund_df, sample_order=["S1", "S2", "GHOST"])
+
+
+def test_streamgraph_sample_order_missing_raises(relabund_df):
+ with pytest.raises(ValueError, match="Samples missing"):
+ streamgraph(relabund_df, sample_order=["S1", "GHOST"])
+
+
+def test_clustermap_sample_order_missing_raises(relabund_df):
+ with pytest.raises(ValueError, match="Samples missing"):
+ clustermap(relabund_df, sample_order=["S1", "GHOST"])
+
+
+# ---------------------------------------------------------------------------
+# cmap validation (stackedbar / streamgraph)
+# ---------------------------------------------------------------------------
+
+def test_stackedbar_cmap_too_short_raises(relabund_df):
+ with pytest.raises(ValueError, match="cmap"):
+ stacked_barplot(relabund_df, cmap=["red"])
+
+
+def test_streamgraph_cmap_too_short_raises(relabund_df):
+ with pytest.raises(ValueError, match="cmap"):
+ streamgraph(relabund_df, cmap=["red"])
+
+
+def test_stackedbar_cmap_invalid_type_raises(relabund_df):
+ with pytest.raises(ValueError, match="cmap"):
+ stacked_barplot(relabund_df, cmap=123)
+
+
+def test_streamgraph_cmap_invalid_type_raises(relabund_df):
+ with pytest.raises(ValueError, match="cmap"):
+ streamgraph(relabund_df, cmap=123)
+
+
+# ---------------------------------------------------------------------------
+# aggregate_by_metadata
+# ---------------------------------------------------------------------------
+
+def test_aggregate_by_metadata_basic(relabund_df):
+ import pandas as pd
+
+ metadata = pd.DataFrame({
+ "Sample_id": ["S1", "S2"],
+ "Group": ["A", "A"],
+ })
+ result = aggregate_by_metadata(relabund_df, metadata, "Group")
+ assert "Sample_id" in result.columns
+ assert set(result["Sample_id"]) == {"A"}
+ # Relative abundance should still sum to 100 per group
+ total = result["rel_abund_perc"].sum()
+ assert total == pytest.approx(100.0, abs=1e-6)
+
+
+def test_aggregate_by_metadata_missing_sample_id_column_raises(relabund_df):
+ import pandas as pd
+
+ bad_meta = pd.DataFrame({"Group": ["A", "B"], "X": [1, 2]})
+ with pytest.raises(ValueError, match="Sample_id"):
+ aggregate_by_metadata(relabund_df, bad_meta, "Group")
+
+
+def test_aggregate_by_metadata_missing_group_column_raises(relabund_df):
+ import pandas as pd
+
+ meta = pd.DataFrame({"Sample_id": ["S1", "S2"]})
+ with pytest.raises(ValueError, match="Group"):
+ aggregate_by_metadata(relabund_df, meta, "Group")
diff --git a/tests/test_units.py b/tests/test_units.py
new file mode 100644
index 0000000..57618de
--- /dev/null
+++ b/tests/test_units.py
@@ -0,0 +1,146 @@
+"""Pure-function unit tests — no I/O, fully deterministic."""
+
+import math
+
+import numpy as np
+import pytest
+
+from krakenparser.mpa.transform2mpa import _parse_line
+from krakenparser.stats.diversity import chao1_index, pielou_evenness, shannon_index
+from krakenparser.counts.processing_script import modify_taxa_names
+
+
+# ---------------------------------------------------------------------------
+# _parse_line
+# ---------------------------------------------------------------------------
+
+def test_parse_line_standard_rank():
+ line = "50.00\t500000\t100000\tP\t1224\t Pseudomonadota\n"
+ name, depth, rank, cum_reads, pct = _parse_line(line)
+ assert name == "Pseudomonadota"
+ assert depth == 2 # 4 leading spaces // 2
+ assert rank == "P"
+ assert cum_reads == 500000
+ assert pct == 50.0
+
+
+def test_parse_line_root_no_indent():
+ line = "99.98\t999980\t0\tR\t1\troot\n"
+ name, depth, rank, cum_reads, pct = _parse_line(line)
+ assert name == "root"
+ assert depth == 0
+ assert rank == "R"
+
+
+def test_parse_line_intermediate_rank():
+ line = "5.00\t50000\t0\tS1\t12345\t Some subspecies\n"
+ name, depth, rank, _, _ = _parse_line(line)
+ assert name == "Some subspecies"
+ assert rank == "S1"
+ assert depth == 5 # 10 spaces // 2
+
+
+def test_parse_line_too_few_columns():
+ assert _parse_line("50.00\t500000\n") is None
+
+
+def test_parse_line_non_numeric_pct():
+ assert _parse_line("not_a_float\t500000\t0\tP\t1224\tBacteria\n") is None
+
+
+def test_parse_line_non_numeric_reads():
+ assert _parse_line("50.00\tnot_int\t0\tP\t1224\tBacteria\n") is None
+
+
+# ---------------------------------------------------------------------------
+# shannon_index
+# ---------------------------------------------------------------------------
+
+def test_shannon_uniform_four_species():
+ assert abs(shannon_index([1, 1, 1, 1]) - math.log(4)) < 1e-10
+
+
+def test_shannon_single_species_is_zero():
+ assert shannon_index([100]) == pytest.approx(0.0)
+
+
+def test_shannon_ignores_zero_counts():
+ assert abs(shannon_index([1, 1, 1, 1, 0, 0]) - math.log(4)) < 1e-10
+
+
+def test_shannon_two_equal_species():
+ assert abs(shannon_index([50, 50]) - math.log(2)) < 1e-10
+
+
+# ---------------------------------------------------------------------------
+# pielou_evenness
+# ---------------------------------------------------------------------------
+
+def test_pielou_single_species_returns_nan():
+ assert math.isnan(pielou_evenness([100]))
+
+
+def test_pielou_all_zeros_returns_nan():
+ assert math.isnan(pielou_evenness([0, 0, 0]))
+
+
+def test_pielou_uniform_equals_one():
+ assert pielou_evenness([1, 1, 1, 1]) == pytest.approx(1.0)
+
+
+def test_pielou_range_zero_to_one():
+ result = pielou_evenness([10, 2, 1, 1])
+ assert 0.0 < result < 1.0
+
+
+# ---------------------------------------------------------------------------
+# chao1_index
+# ---------------------------------------------------------------------------
+
+def test_chao1_f2_zero_uses_f1_formula():
+ # F1=3, F2=0 → S_obs + F1*(F1-1)/2
+ counts = [1, 1, 1, 5, 10] # F1=3, F2=0, S_obs=5
+ expected = 5 + 3 * (3 - 1) / 2 # 5 + 3 = 8
+ assert chao1_index(counts) == pytest.approx(expected)
+
+
+def test_chao1_normal():
+ # F1=2, F2=2, S_obs=5 → S_obs + F1²/(2*F2)
+ counts = [1, 1, 2, 2, 5]
+ expected = 5 + (2 * 2) / (2 * 2) # 5 + 1 = 6
+ assert chao1_index(counts) == pytest.approx(expected)
+
+
+def test_chao1_no_singletons():
+ counts = [5, 10, 15, 20]
+ # F1=0, F2=0 → uses F2==0 branch: S_obs + 0
+ assert chao1_index(counts) == pytest.approx(4.0)
+
+
+# ---------------------------------------------------------------------------
+# modify_taxa_names
+# ---------------------------------------------------------------------------
+
+def test_modify_taxa_names_strips_prefix_and_replaces_underscores():
+ assert modify_taxa_names("s__Homo_sapiens\t100\t200") == "Homo sapiens\t100\t200"
+
+
+def test_modify_taxa_names_genus():
+ assert modify_taxa_names("g__Escherichia_coli\t50") == "Escherichia coli\t50"
+
+
+def test_modify_taxa_names_all_prefixes():
+ for prefix in ["s__", "g__", "f__", "o__", "c__", "p__"]:
+ result = modify_taxa_names(f"{prefix}Some_name\t10")
+ assert result == "Some name\t10"
+
+
+def test_modify_taxa_names_no_prefix_unchanged():
+ line = "unclassified_reads\t100"
+ assert modify_taxa_names(line) == line
+
+
+def test_modify_taxa_names_count_fields_not_modified():
+ # Underscores in tab-separated count fields must be preserved
+ result = modify_taxa_names("s__My_taxon\t1_000\t2_000")
+ assert result == "My taxon\t1_000\t2_000"