From 8367c2eed3789ea70188d901ba2d3c425d950931 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Thu, 12 Mar 2026 09:14:00 +0300 Subject: [PATCH 01/18] build: add pyproject.toml and migrate packaging from wheel metadata --- pyproject.toml | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b4f3ed5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,50 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel>=0.42"] +build-backend = "setuptools.build_meta" + +[project] +name = "krakenparser" +version = "1.0.0" +description = "A collection of scripts designed to process Kraken2 reports and convert them into CSV format." +readme = {file = "README.md", content-type = "text/markdown"} +license = {file = "LICENSE"} +authors = [{name = "Ilia Popov", email = "iljapopov17@gmail.com"}] +requires-python = ">=3.10,<=3.16" +dependencies = [ + "pandas>=2.0.0,<=2.3.3", + "matplotlib>=3.7.0,<=3.10.8", + "numpy>=1.24.0,<=2.3.5", + "seaborn>=0.12.0,<=0.13.2", + "scikit-bio>=0.6.0,<=0.7.2", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", +] + +[project.urls] +Homepage = "https://github.com/PopovIILab/KrakenParser" + +[project.scripts] +KrakenParser = "krakenparser.krakenparser:main" + +[tool.setuptools.packages.find] +where = ["."] + + +[tool.pytest.ini_options] +filterwarnings = [ + "ignore:This figure includes Axes that are not compatible with tight_layout:UserWarning", + "ignore:Tight layout not applied.*:UserWarning", + "ignore:Samples with zero total abundance.*:UserWarning", +] + +[tool.coverage.run] +source = ["krakenparser"] +omit = [ + "krakenparser/krakenparser.py", + "krakenparser/mpa/transform2mpa.py", + "krakenparser/mpa/mpa_table.py", +] From 3ac8555904bde5d2ead7aea5a901acab86a82655 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Mon, 16 Mar 2026 11:42:00 +0300 Subject: [PATCH 02/18] refactor(mpa): rewrite kreport2mpa as transform2mpa with batch mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add -i/--input dir mode alongside the existing single-file -r mode. Remove run_kreport2mpa.sh wrapper — batch logic now lives in Python. --- krakenparser/mpa/__init__.py | 0 krakenparser/mpa/mpa_table.py | 70 +++++++++++ krakenparser/mpa/transform2mpa.py | 192 ++++++++++++++++++++++++++++++ krakenparser/run_kreport2mpa.sh | 64 ---------- 4 files changed, 262 insertions(+), 64 deletions(-) create mode 100644 krakenparser/mpa/__init__.py create mode 100644 krakenparser/mpa/mpa_table.py create mode 100644 krakenparser/mpa/transform2mpa.py delete mode 100755 krakenparser/run_kreport2mpa.sh diff --git a/krakenparser/mpa/__init__.py b/krakenparser/mpa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/krakenparser/mpa/mpa_table.py b/krakenparser/mpa/mpa_table.py new file mode 100644 index 0000000..f66e63b --- /dev/null +++ b/krakenparser/mpa/mpa_table.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +"""Combine multiple MPA-format files into a single merged table.""" + +import argparse +import logging + +_log = logging.getLogger(__name__) + + +def combine_mpa(in_files: list[str], o_file: str) -> None: + # Plain dict preserves insertion order (Python 3.7+). + taxa: dict[str, dict[int, str]] = {} + sample_names: list[str] = [] + + _log.info("Number of files to parse: %d", len(in_files)) + + for idx, in_path in enumerate(in_files): + sample_name = f"Sample #{idx + 1}" + with open(in_path) as fh: + for line in fh: + line = line.rstrip("\n") + if not line: + continue + if line.startswith("#"): + cols = line.split("\t") + if len(cols) >= 2: + sample_name = cols[-1] + continue + cols = line.split("\t", 1) + if len(cols) < 2: + continue + taxon, count = cols[0], cols[1] + if taxon not in taxa: + taxa[taxon] = {} + taxa[taxon][idx] = count + sample_names.append(sample_name) + + n_samples = len(sample_names) + n_taxa = len(taxa) + _log.info("Number of classifications to write: %d", n_taxa) + + with open(o_file, "w") as fh: + fh.write("#Classification\t" + "\t".join(sample_names) + "\n") + for taxon, counts in taxa.items(): + row = [counts.get(i, "0") for i in range(n_samples)] + fh.write(taxon + "\t" + "\t".join(row) + "\n") + + _log.info("%d classifications written", n_taxa) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Combine MPA files into a single tab-delimited table." + ) + parser.add_argument( + "-i", "--input", + required=True, nargs="+", dest="in_files", + help="Input MPA files (one per sample)", + ) + parser.add_argument( + "-o", "--output", + required=True, dest="o_file", + help="Output merged MPA file", + ) + args = parser.parse_args() + combine_mpa(args.in_files, args.o_file) + + +if __name__ == "__main__": + main() diff --git a/krakenparser/mpa/transform2mpa.py b/krakenparser/mpa/transform2mpa.py new file mode 100644 index 0000000..480a1f9 --- /dev/null +++ b/krakenparser/mpa/transform2mpa.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python +"""Convert a Kraken2 report to MetaPhlAn (MPA) format.""" + +import argparse +import os +import sys +from pathlib import Path + +# Maps Kraken2 single-letter rank codes to MPA prefixes +_RANK_PREFIX = { + "D": "d", "K": "k", "P": "p", "C": "c", + "O": "o", "F": "f", "G": "g", "S": "s", +} + + +def _parse_line(line: str): + """ + Parse one Kraken2 report line. + + Standard format (6 columns): + pct cum_reads direct_reads rank taxid name(indented) + + Returns (name, depth, rank, cum_reads, pct) or None on malformed input. + """ + parts = line.rstrip("\n").split("\t") + if len(parts) < 5: + return None + try: + pct = float(parts[0]) + cum_reads = int(parts[1]) + except ValueError: + return None + + rank = parts[3].strip() + name_field = parts[-1] # always the last column regardless of format variant + + depth = 0 + for ch in name_field: + if ch == " ": + depth += 1 + else: + break + name = name_field.strip() + + return name, depth // 2, rank, cum_reads, pct + + +def kreport_to_mpa( + report_path: str, + output_path: str, + display_header: bool = False, + include_intermediate: bool = False, + use_reads: bool = True, + remove_spaces: bool = True, +) -> None: + """ + Convert a single Kraken2 report to MPA format. + + Uses a stack to track the current taxonomic path. Each entry is + (structural_depth, mpa_segment, is_standard_rank). When a node at + depth d is encountered, all stack entries with depth >= d are popped + before the new entry is pushed, keeping the path consistent. + """ + # Stack entries: (structural_depth, mpa_segment, is_standard_rank) + stack: list[tuple[int, str, bool]] = [] + + with open(report_path) as r_fh, open(output_path, "w") as o_fh: + if display_header: + o_fh.write("#Classification\t" + os.path.basename(report_path) + "\n") + + for line in r_fh: + parsed = _parse_line(line) + if parsed is None: + continue + name, depth, rank, cum_reads, pct = parsed + + # Skip unclassified and root — never appear in MPA output + if rank in ("U", "R"): + continue + + # Strip numeric suffix to get base rank (e.g. "S1" → "S", "G2" → "G") + rank_base = rank.rstrip("0123456789") + is_standard = rank_base in _RANK_PREFIX and rank == rank_base + + if not is_standard and not include_intermediate: + continue + + prefix = _RANK_PREFIX.get(rank_base, "x") + seg_name = name.replace(" ", "_") if remove_spaces else name + mpa_seg = f"{prefix}__{seg_name}" + + # Trim stack to the current structural depth + while stack and stack[-1][0] >= depth: + stack.pop() + stack.append((depth, mpa_seg, is_standard)) + + # Build the full MPA path; omit intermediate (x__) segments when not requested + path = "|".join( + seg + for (_, seg, std) in stack + if include_intermediate or std + ) + + value = str(cum_reads) if use_reads else str(pct) + o_fh.write(f"{path}\t{value}\n") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Convert a Kraken2 report to MetaPhlAn (MPA) format." + ) + + mode = parser.add_mutually_exclusive_group(required=True) + mode.add_argument( + "-r", "--report-file", "--report", + dest="r_file", + help="Single input Kraken2 report file", + ) + mode.add_argument( + "-i", "--input", + dest="input_dir", + help="Input directory containing Kraken2 report files (batch mode)", + ) + + parser.add_argument( + "-o", "--output", + required=True, dest="o_file", + help="Output MPA file (single mode) or output directory (batch mode)", + ) + parser.add_argument( + "--display-header", + action="store_true", dest="add_header", default=False, + help="Write a header line with the sample name (filename)", + ) + parser.add_argument( + "--read_count", + action="store_true", dest="use_reads", default=True, + help="Output clade read counts [default]", + ) + parser.add_argument( + "--percentages", + action="store_false", dest="use_reads", + help="Output percentages instead of read counts", + ) + parser.add_argument( + "--intermediate-ranks", + action="store_true", dest="x_include", default=False, + help="Include non-standard taxonomic ranks in output", + ) + parser.add_argument( + "--no-intermediate-ranks", + action="store_false", dest="x_include", + help="Exclude non-standard taxonomic ranks [default]", + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--remove-spaces", + action="store_true", dest="remove_spaces", default=True, + help="Replace spaces with underscores in taxon names [default]", + ) + group.add_argument( + "--keep-spaces", + action="store_false", dest="remove_spaces", + help="Keep spaces in taxon names", + ) + args = parser.parse_args() + + kwargs = dict( + display_header=args.add_header, + include_intermediate=args.x_include, + use_reads=args.use_reads, + remove_spaces=args.remove_spaces, + ) + + if args.input_dir: + input_dir = Path(args.input_dir) + if not input_dir.is_dir(): + sys.exit(f"Error: input directory not found: {input_dir}") + output_dir = Path(args.o_file) + output_dir.mkdir(parents=True, exist_ok=True) + for f in sorted(input_dir.iterdir()): + if not f.is_file(): + continue + out_name = f.name.replace(".kreport", ".MPA.TXT") + kreport_to_mpa(str(f), str(output_dir / out_name), **kwargs) + print(f"Converted to MPA successfully. Output stored in {output_dir}") + else: + kreport_to_mpa(args.r_file, args.o_file, **kwargs) + + +if __name__ == "__main__": + main() diff --git a/krakenparser/run_kreport2mpa.sh b/krakenparser/run_kreport2mpa.sh deleted file mode 100755 index 7b40ee9..0000000 --- a/krakenparser/run_kreport2mpa.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -# Function to display usage information -usage() { - echo "Usage: $(basename "$0") -i PATH_TO_SOURCE -o PATH_TO_DESTINATION" - echo - echo " -i, --input PATH_TO_SOURCE Path to the source directory containing files to convert." - echo " -o, --output PATH_TO_DESTINATION Path to the destination directory where converted files will be stored." - echo " -h, --help Display this help message and exit." - echo - echo "This script converts files in the source directory with the .kreport extension to .MPA.TXT format." - echo "It uses KrakenTools' \`kreport2mpa.py\` Python script to perform the conversion." - echo "The converted files will be saved in the specified destination directory." - exit 0 -} - -# Initialize variables -SOURCE_DIR="" -DESTINATION_DIR="" - -# Parse command-line arguments -while [[ "$#" -gt 0 ]]; do - case "$1" in - -i|--input) SOURCE_DIR="$2"; shift 2 ;; - -o|--output) DESTINATION_DIR="$2"; shift 2 ;; - -h|--help) usage ;; - *) echo "Unknown option: $1"; usage ;; - esac -done - -# Check if required arguments are provided -if [ -z "$SOURCE_DIR" ] || [ -z "$DESTINATION_DIR" ]; then - echo "Error: Both input and output paths are required." - usage -fi - -# Determine the directory of this script -SCRIPT_DIR=$(dirname "$(realpath "$0")") - -# Ensure KrakenTools script exists -KREPORT_SCRIPT="$SCRIPT_DIR/kreport2mpa.py" -if [ ! -f "$KREPORT_SCRIPT" ]; then - echo "Error: kreport2mpa.py not found in $SCRIPT_DIR" - exit 1 -fi - -# Create the destination folder if it does not exist -mkdir -p "${DESTINATION_DIR}" - -# Convert files in the source directory -for file in "${SOURCE_DIR}"/*.*; do - # Get the file name without path - filename=$(basename "${file}") - # Form the command to process the file - python "$SCRIPT_DIR/kreport2mpa.py" -r "${file}" -o "${DESTINATION_DIR}/${filename/.kreport/.MPA.TXT}" --display-header -done - -# Check exit status -if [ $? -ne 0 ]; then - echo "Error: Conversion process failed." - exit 1 -fi - -echo "Converted to MPA successfully. Output stored in $DESTINATION_DIR" \ No newline at end of file From 3ccdab86772f0402d5a883790cb0af400a3c47c2 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Tue, 24 Mar 2026 14:08:00 +0300 Subject: [PATCH 03/18] refactor(counts): replace bash decombine scripts with Python split_mpa Merge decombine.sh + decombine_viruses.sh into a single split_mpa.py. Add --viruses-only and --keep-human flags. Move convert2csv and processing_script into counts/ subpackage. --- krakenparser/convert2csv.py | 54 -------- krakenparser/counts/__init__.py | 0 krakenparser/counts/convert2csv.py | 45 +++++++ .../{ => counts}/processing_script.py | 37 +++--- krakenparser/counts/split_mpa.py | 112 ++++++++++++++++ krakenparser/decombine.sh | 123 ------------------ krakenparser/decombine_viruses.sh | 111 ---------------- 7 files changed, 178 insertions(+), 304 deletions(-) delete mode 100755 krakenparser/convert2csv.py create mode 100644 krakenparser/counts/__init__.py create mode 100755 krakenparser/counts/convert2csv.py rename krakenparser/{ => counts}/processing_script.py (66%) create mode 100644 krakenparser/counts/split_mpa.py delete mode 100755 krakenparser/decombine.sh delete mode 100755 krakenparser/decombine_viruses.sh diff --git a/krakenparser/convert2csv.py b/krakenparser/convert2csv.py deleted file mode 100755 index 2b05823..0000000 --- a/krakenparser/convert2csv.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python - -import shutil -from pathlib import Path -import argparse -import pandas as pd - - -def convert_to_csv(input_file, output_file): - # Read the entire file into a DataFrame - data = pd.read_csv(input_file, sep="\t", header=None) - - # Set the first row as the header - data.columns = data.iloc[0] - data = data.drop(data.index[0]) - - # Transpose the DataFrame so that sample names become rows and microbial taxa with their abundance become columns - data_transposed = data.T - data_transposed.columns = data_transposed.iloc[0] - data_transposed = data_transposed.drop(data_transposed.index[0]) - - # Save the transposed data to a new CSV file - data_transposed.to_csv(output_file, index_label="Sample_id") - print(f"Data has been successfully converted and saved as '{output_file}'.") - - # Remove pycache if it exists - current_dir = Path(__file__).resolve().parent - pycache_dir = current_dir / "__pycache__" - if pycache_dir.exists() and pycache_dir.is_dir(): - shutil.rmtree(pycache_dir) - - -if __name__ == "__main__": - # Use argparse to handle command-line arguments - parser = argparse.ArgumentParser( - description="Reads a TXT file, reorganizes the data, and converts it into a CSV file." - ) - parser.add_argument( - "-i", - "--input", - required=True, - help="Path to the input TXT file. This file should contain sample names in columns and microbial taxa in rows.", - ) - parser.add_argument( - "-o", - "--output", - required=True, - help="Path to the output CSV file. The script will restructure the data and save it here.", - ) - - args = parser.parse_args() - - # Call function with parsed arguments - convert_to_csv(args.input, args.output) diff --git a/krakenparser/counts/__init__.py b/krakenparser/counts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/krakenparser/counts/convert2csv.py b/krakenparser/counts/convert2csv.py new file mode 100755 index 0000000..835ffc3 --- /dev/null +++ b/krakenparser/counts/convert2csv.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python + +import argparse +import logging +from pathlib import Path +import pandas as pd + +_log = logging.getLogger(__name__) + + +def convert_to_csv(input_file, output_file): + in_path = Path(input_file) + if not in_path.is_file(): + raise FileNotFoundError(f"Input file not found: {in_path}") + out_path = Path(output_file) + if not out_path.parent.exists(): + raise FileNotFoundError(f"Output directory does not exist: {out_path.parent}") + + data = pd.read_csv(in_path, sep="\t", index_col=0) + data.T.to_csv(out_path, index_label="Sample_id") + _log.info("Data converted and saved as '%s'.", output_file) + + +if __name__ == "__main__": + # Use argparse to handle command-line arguments + parser = argparse.ArgumentParser( + description="Reads a TXT file, reorganizes the data, and converts it into a CSV file." + ) + parser.add_argument( + "-i", + "--input", + required=True, + help="Path to the input TXT file. This file should contain sample names in columns and microbial taxa in rows.", + ) + parser.add_argument( + "-o", + "--output", + required=True, + help="Path to the output CSV file. The script will restructure the data and save it here.", + ) + + args = parser.parse_args() + + # Call function with parsed arguments + convert_to_csv(args.input, args.output) diff --git a/krakenparser/processing_script.py b/krakenparser/counts/processing_script.py similarity index 66% rename from krakenparser/processing_script.py rename to krakenparser/counts/processing_script.py index 911d306..b2b28ff 100755 --- a/krakenparser/processing_script.py +++ b/krakenparser/counts/processing_script.py @@ -1,7 +1,8 @@ #!/usr/bin/env python +import os import sys -import shutil +import tempfile import argparse from pathlib import Path @@ -10,41 +11,45 @@ def modify_taxa_names(line): prefixes = ["s__", "g__", "f__", "o__", "c__", "p__"] for prefix in prefixes: if line.startswith(prefix): - # Remove the prefix and replace underscores with spaces - return line[len(prefix) :].replace("_", " ") + parts = line[len(prefix):].split("\t") + parts[0] = parts[0].replace("_", " ") + return "\t".join(parts) return line def process_files(source_file, destination_file): + src_path = Path(source_file) + if not src_path.is_file(): + raise FileNotFoundError(f"Source file not found: {src_path}") + dest_path = Path(destination_file) + if not dest_path.is_file(): + raise FileNotFoundError(f"Destination file not found: {dest_path}") + # Read the first line from the source file and modify it - with open(source_file, "r") as file: + with open(src_path, "r") as file: first_line_source = file.readline() modified_first_line = "\t".join( word.split(".")[0] for word in first_line_source.split() ) # Read all content from the destination file and modify taxa names - with open(destination_file, "r") as file: + with open(dest_path, "r") as file: lines = file.readlines() modified_lines = [modify_taxa_names(line.strip()) for line in lines] # Combine the modified first line with the modified content of the destination file updated_content = modified_first_line + "\n" + "\n".join(modified_lines) - # Write the updated content back to the destination file - with open(destination_file, "w") as file: - file.write(updated_content) + # Write atomically: write to a temp file in the same directory, then replace + with tempfile.NamedTemporaryFile( + mode="w", dir=dest_path.parent, delete=False, suffix=".tmp" + ) as tmp: + tmp.write(updated_content) + tmp_path = tmp.name + os.replace(tmp_path, dest_path) print(f"Processed {destination_file} successfully.") - # Get the path to the current directory (same location as the script) - current_dir = Path(__file__).resolve().parent - pycache_dir = current_dir / "__pycache__" - - # Check if __pycache__ exists and remove it - if pycache_dir.exists() and pycache_dir.is_dir(): - shutil.rmtree(pycache_dir) - if __name__ == "__main__": # Use argparse to parse command-line arguments diff --git a/krakenparser/counts/split_mpa.py b/krakenparser/counts/split_mpa.py new file mode 100644 index 0000000..72b4e07 --- /dev/null +++ b/krakenparser/counts/split_mpa.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +"""Split a combined MPA table into per-rank TXT files. + +Replaces decombine.sh and decombine_viruses.sh. +""" + +import argparse +import logging +import re +import sys +from pathlib import Path + +_log = logging.getLogger(__name__) + + +_RANKS = [ + ("species", "s__", []), + ("genus", "g__", ["s__"]), + ("family", "f__", ["s__", "g__"]), + ("order", "o__", ["s__", "g__", "f__"]), + ("class", "c__", ["s__", "g__", "f__", "o__"]), + ("phylum", "p__", ["s__", "g__", "f__", "o__", "c__"]), +] + +_HUMAN_TAXA = { + "species": "s__Homo_sapiens", + "genus": "g__Homo", + "family": "f__Hominidae", + "order": "o__Primates", + "class": "c__Mammalia", + "phylum": "p__Chordata", +} + +_ACCESSION_RE = re.compile(r"(SRS|SRR|SRX|ERS|ERR|ERX|DRS|DRR|DRX)\d*-") + + +def _strip_path_prefix(line: str) -> str: + """'d__X|p__Y|s__Z\t10\t20' → 's__Z\t10\t20'""" + tab = line.find("\t") + if tab == -1: + return line + path, rest = line[:tab], line[tab:] + pipe = path.rfind("|") + segment = path[pipe + 1:] if pipe != -1 else path + return _ACCESSION_RE.sub("", segment + rest) + + +def split_mpa( + input_file: str, + output_dir: str, + viruses_only: bool = False, + keep_human: bool = False, +) -> None: + in_path = Path(input_file) + if not in_path.is_file(): + raise FileNotFoundError(f"Input file not found: {in_path}") + out_path = Path(output_dir) + (out_path / "txt").mkdir(parents=True, exist_ok=True) + + lines = in_path.read_text().splitlines() + data_lines = [ln for ln in lines if not ln.startswith("#") and ln.strip()] + + if viruses_only: + data_lines = [ln for ln in data_lines if "d__Viruses" in ln] + + filter_human = not keep_human and not viruses_only + + for rank_name, rank_prefix, exclude_prefixes in _RANKS: + result = [] + human_pattern = _HUMAN_TAXA[rank_name] + + for line in data_lines: + if rank_prefix not in line: + continue + if "t__" in line: + continue + if any(ep in line for ep in exclude_prefixes): + continue + if filter_human and human_pattern in line: + continue + result.append(_strip_path_prefix(line)) + + out_file = out_path / "txt" / f"counts_{rank_name}.txt" + out_file.write_text("\n".join(result) + ("\n" if result else "")) + + _log.info("MPA file split successfully. Output stored in %s", output_dir) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Split a combined MPA table into per-rank TXT files." + ) + parser.add_argument("-i", "--input", required=True, help="Input combined MPA file") + parser.add_argument("-o", "--output", required=True, help="Output directory") + parser.add_argument( + "--viruses-only", + action="store_true", + default=False, + help="Extract only Viruses domain taxa", + ) + parser.add_argument( + "--keep-human", + action="store_true", + default=False, + help="Do not filter human-related taxa (default: filtered)", + ) + args = parser.parse_args() + split_mpa(args.input, args.output, viruses_only=args.viruses_only, keep_human=args.keep_human) + + +if __name__ == "__main__": + main() diff --git a/krakenparser/decombine.sh b/krakenparser/decombine.sh deleted file mode 100755 index 0832018..0000000 --- a/krakenparser/decombine.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/bin/bash - -# Function to display detailed usage information -usage() { - echo "Usage: $(basename "$0") -i PATH_TO_SOURCE_FILE -o PATH_TO_DESTINATION" - echo - echo " -i, --input PATH_TO_SOURCE_FILE Path to the Combined MPA input file to be processed." - echo " -o, --output PATH_TO_DESTINATION Path to the directory where processed output files will be stored." - echo " -h, --help Display this help message and exit." - echo - echo "Description:" - echo " This script processes a combined mpa file by extracting different taxonomic levels" - echo " (species, genus, family, order, class, and phylum) and saving the results as separate text files." - echo " Additionally, it removes human-related sequences to improve data accuracy." - echo - echo "Processing Details:" - echo " - Extracts taxonomic levels using 'grep' with specific patterns." - echo " - Filters out undesired taxonomic entries such as:" - echo " - Species: Homo sapiens" - echo " - Genus: Homo" - echo " - Family: Hominidae" - echo " - Order: Primates" - echo " - Class: Mammalia" - echo " - Phylum: Chordata" - echo " - Outputs are stored as text files inside the specified destination directory." - exit 0 -} - -# Initialize variables -SOURCE_FILE="" -DESTINATION_DIR="" - -# Parse command-line arguments -while [[ "$#" -gt 0 ]]; do - case "$1" in - -i|--input) SOURCE_FILE="$2"; shift 2 ;; - -o|--output) DESTINATION_DIR="$2"; shift 2 ;; - -h|--help) usage ;; - *) echo "Error: Unknown option $1"; usage ;; - esac -done - -# Check if required arguments are provided -if [[ -z "$SOURCE_FILE" || -z "$DESTINATION_DIR" ]]; then - echo "Error: Both input (-i) and output (-o) paths are required." - usage -fi - -# Check if source file exists -if [[ ! -f "$SOURCE_FILE" ]]; then - echo "Error: Input file '$SOURCE_FILE' not found!" - exit 1 -fi - -# Create destination directories -mkdir -p "${DESTINATION_DIR}/txt" -mkdir -p "${DESTINATION_DIR}/csv" - -# Process input file and generate output files -grep -E "s__" "${SOURCE_FILE}" \ -| grep -v "t__" \ -| grep -v "s__Homo_sapiens" \ -| sed "s/^.*|//g" \ -| sed "s/SRS[0-9]*-//g" \ -> "${DESTINATION_DIR}/txt/counts_species.txt" - -grep -E "g__" "${SOURCE_FILE}" \ -| grep -v "t__" \ -| grep -v "s__" \ -| grep -v "g__Homo" \ -| sed "s/^.*|//g" \ -| sed "s/SRS[0-9]*-//g" \ -> "${DESTINATION_DIR}/txt/counts_genus.txt" - -grep -E "f__" "${SOURCE_FILE}" \ -| grep -v "t__" \ -| grep -v "s__" \ -| grep -v "g__" \ -| grep -v "f__Hominidae" \ -| sed "s/^.*|//g" \ -| sed "s/SRS[0-9]*-//g" \ -> "${DESTINATION_DIR}/txt/counts_family.txt" - -grep -E "o__" "${SOURCE_FILE}" \ -| grep -v "t__" \ -| grep -v "s__" \ -| grep -v "g__" \ -| grep -v "f__" \ -| grep -v "o__Primates" \ -| sed "s/^.*|//g" \ -| sed "s/SRS[0-9]*-//g" \ -> "${DESTINATION_DIR}/txt/counts_order.txt" - -grep -E "c__" "${SOURCE_FILE}" \ -| grep -v "t__" \ -| grep -v "s__" \ -| grep -v "g__" \ -| grep -v "f__" \ -| grep -v "o__" \ -| grep -v "c__Mammalia" \ -| sed "s/^.*|//g" \ -| sed "s/SRS[0-9]*-//g" \ -> "${DESTINATION_DIR}/txt/counts_class.txt" - -grep -E "p__" "${SOURCE_FILE}" \ -| grep -v "t__" \ -| grep -v "s__" \ -| grep -v "g__" \ -| grep -v "f__" \ -| grep -v "o__" \ -| grep -v "c__" \ -| grep -v "p__Chordata" \ -| sed "s/^.*|//g" \ -| sed "s/SRS[0-9]*-//g" \ -> "${DESTINATION_DIR}/txt/counts_phylum.txt" - -# Check for errors -if [ $? -ne 0 ]; then - echo "Error: Failed to run decombine.sh" - exit 1 -fi - -echo "MPA file decombined successfully. Output stored in $DESTINATION_DIR" \ No newline at end of file diff --git a/krakenparser/decombine_viruses.sh b/krakenparser/decombine_viruses.sh deleted file mode 100755 index 99c8260..0000000 --- a/krakenparser/decombine_viruses.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash - -# Function to display detailed usage information -usage() { - echo "Usage: $(basename "$0") -i PATH_TO_SOURCE_FILE -o PATH_TO_DESTINATION" - echo - echo " -i, --input PATH_TO_SOURCE_FILE Path to the Combined MPA input file to be processed." - echo " -o, --output PATH_TO_DESTINATION Path to the directory where processed output files will be stored." - echo " -h, --help Display this help message and exit." - echo - echo "Description:" - echo " This script processes a combined mpa file by extracting different taxonomic levels using only VIRUSES domain" - echo " (species, genus, family, order, class, and phylum) and saving the results as separate text files." - echo - echo "Processing Details:" - echo " - Extracts taxonomic levels only on VIRUSES domain using 'grep' with specific patterns." - echo " - Outputs are stored as text files inside the specified destination directory." - exit 0 -} - -# Initialize variables -SOURCE_FILE="" -DESTINATION_DIR="" - -# Parse command-line arguments -while [[ "$#" -gt 0 ]]; do - case "$1" in - -i|--input) SOURCE_FILE="$2"; shift 2 ;; - -o|--output) DESTINATION_DIR="$2"; shift 2 ;; - -h|--help) usage ;; - *) echo "Error: Unknown option $1"; usage ;; - esac -done - -# Check if required arguments are provided -if [[ -z "$SOURCE_FILE" || -z "$DESTINATION_DIR" ]]; then - echo "Error: Both input (-i) and output (-o) paths are required." - usage -fi - -# Check if source file exists -if [[ ! -f "$SOURCE_FILE" ]]; then - echo "Error: Input file '$SOURCE_FILE' not found!" - exit 1 -fi - -# Create destination directories -mkdir -p "${DESTINATION_DIR}/txt" -mkdir -p "${DESTINATION_DIR}/csv" - -VIRUSES_BUFFER=$(grep -E "d__Viruses" "${SOURCE_FILE}") - -# Process input file and generate output files -echo "$VIRUSES_BUFFER" | grep -E "s__" \ - | grep -v "t__" \ - | sed "s/^.*|//g" \ - | sed "s/SRS[0-9]*-//g" \ - > "${DESTINATION_DIR}/txt/counts_species.txt" - -echo "$VIRUSES_BUFFER" | grep -E "g__" \ - | grep -v "t__" \ - | grep -v "s__" \ - | sed "s/^.*|//g" \ - | sed "s/SRS[0-9]*-//g" \ - > "${DESTINATION_DIR}/txt/counts_genus.txt" - -echo "$VIRUSES_BUFFER" | grep -E "f__" \ - | grep -v "t__" \ - | grep -v "s__" \ - | grep -v "g__" \ - | sed "s/^.*|//g" \ - | sed "s/SRS[0-9]*-//g" \ - > "${DESTINATION_DIR}/txt/counts_family.txt" - -echo "$VIRUSES_BUFFER" | grep -E "o__" \ - | grep -v "t__" \ - | grep -v "s__" \ - | grep -v "g__" \ - | grep -v "f__" \ - | sed "s/^.*|//g" \ - | sed "s/SRS[0-9]*-//g" \ - > "${DESTINATION_DIR}/txt/counts_order.txt" - -echo "$VIRUSES_BUFFER" | grep -E "c__" \ - | grep -v "t__" \ - | grep -v "s__" \ - | grep -v "g__" \ - | grep -v "f__" \ - | grep -v "o__" \ - | sed "s/^.*|//g" \ - | sed "s/SRS[0-9]*-//g" \ - > "${DESTINATION_DIR}/txt/counts_class.txt" - -echo "$VIRUSES_BUFFER" | grep -E "p__" \ - | grep -v "t__" \ - | grep -v "s__" \ - | grep -v "g__" \ - | grep -v "f__" \ - | grep -v "o__" \ - | grep -v "c__" \ - | sed "s/^.*|//g" \ - | sed "s/SRS[0-9]*-//g" \ - > "${DESTINATION_DIR}/txt/counts_phylum.txt" - -# Check for errors -if [ $? -ne 0 ]; then - echo "Error: Failed to run decombine_viruses.sh" - exit 1 -fi - -echo "MPA file decombined successfully. Output stored in $DESTINATION_DIR" \ No newline at end of file From eb92428cea0e4cd5e530cee523c65e0dbb946543 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Wed, 1 Apr 2026 10:22:00 +0300 Subject: [PATCH 04/18] refactor(stats): move relabund and diversity into stats/ subpackage --- krakenparser/stats/__init__.py | 0 krakenparser/{ => stats}/diversity.py | 19 ++++++--------- krakenparser/{ => stats}/relabund.py | 35 ++++++++++++++++++--------- 3 files changed, 32 insertions(+), 22 deletions(-) create mode 100644 krakenparser/stats/__init__.py rename krakenparser/{ => stats}/diversity.py (88%) rename krakenparser/{ => stats}/relabund.py (71%) diff --git a/krakenparser/stats/__init__.py b/krakenparser/stats/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/krakenparser/diversity.py b/krakenparser/stats/diversity.py similarity index 88% rename from krakenparser/diversity.py rename to krakenparser/stats/diversity.py index 2fe68f5..da3b336 100644 --- a/krakenparser/diversity.py +++ b/krakenparser/stats/diversity.py @@ -3,7 +3,6 @@ import pandas as pd import numpy as np import sys -import shutil import argparse from pathlib import Path from skbio.diversity import beta_diversity @@ -20,11 +19,11 @@ def shannon_index(counts): # Define Pielou's evenness def pielou_evenness(counts): - counts = np.array(counts) - counts = counts[counts > 0] - H = shannon_index(counts) - S = len(counts) - return H / np.log(S) if S > 1 else 0 + counts = np.asarray(counts) + S = int(np.sum(counts > 0)) + if S <= 1: + return np.nan + return shannon_index(counts) / np.log(S) # Define Chao1 richness estimator @@ -59,7 +58,7 @@ def calc_beta_div(df, output_path, rarefaction_depth): sample_ids = [] for sample, row in df.iterrows(): - counts = row.values.astype(int) + counts = np.round(row.values).astype(int) if counts.sum() >= rarefaction_depth: rarefied = subsample_counts(counts, n=rarefaction_depth) rarefied_counts.append(rarefied) @@ -98,6 +97,8 @@ def calc_beta_div(df, output_path, rarefaction_depth): args = parser.parse_args() input_file = Path(args.input) + if not input_file.is_file(): + sys.exit(f"Error: input file not found: {input_file}") output_dir = Path(args.output) output_dir.mkdir(parents=True, exist_ok=True) @@ -108,7 +109,3 @@ def calc_beta_div(df, output_path, rarefaction_depth): print( f"α & β-diversities have been successfully calculated and saved to '{output_dir}'." ) - - pycache_dir = Path(__file__).resolve().parent / "__pycache__" - if pycache_dir.exists() and pycache_dir.is_dir(): - shutil.rmtree(pycache_dir) diff --git a/krakenparser/relabund.py b/krakenparser/stats/relabund.py similarity index 71% rename from krakenparser/relabund.py rename to krakenparser/stats/relabund.py index d7a7143..171f280 100644 --- a/krakenparser/relabund.py +++ b/krakenparser/stats/relabund.py @@ -1,14 +1,24 @@ #!/usr/bin/env python -import shutil -from pathlib import Path +import logging +import warnings import argparse +from pathlib import Path import pandas as pd +_log = logging.getLogger(__name__) + def calculate_rel_abund(input_file, output_file, other_threshold=None): + in_path = Path(input_file) + if not in_path.is_file(): + raise FileNotFoundError(f"Input file not found: {in_path}") + out_path = Path(output_file) + if not out_path.parent.exists(): + raise FileNotFoundError(f"Output directory does not exist: {out_path.parent}") + # Load counts table - df = pd.read_csv(input_file) + df = pd.read_csv(in_path) # Reshape to long format: Sample_id, taxon, abundance long_df = df.melt(id_vars=["Sample_id"], var_name="taxon", value_name="abundance") @@ -16,6 +26,15 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None): # Summarize total abundance per sample (used for percentage calculation) total_abundance = long_df.groupby("Sample_id")["abundance"].transform("sum") + zero_samples = long_df.groupby("Sample_id")["abundance"].sum() + zero_samples = zero_samples[zero_samples == 0].index.tolist() + if zero_samples: + warnings.warn( + f"Samples with zero total abundance were excluded from output: {zero_samples}", + UserWarning, + stacklevel=2, + ) + # Calculate relative abundance (%) long_df["rel_abund_perc"] = (long_df["abundance"] / total_abundance) * 100 @@ -40,16 +59,10 @@ def calculate_rel_abund(input_file, output_file, other_threshold=None): # Save to CSV result.to_csv(output_file, index=False) - print( - f"Relative abundance has been successfully calculated and saved as '{output_file}'." + _log.info( + "Relative abundance saved as '%s'.", output_file ) - # Remove __pycache__ - current_dir = Path(__file__).resolve().parent - pycache_dir = current_dir / "__pycache__" - if pycache_dir.exists(): - shutil.rmtree(pycache_dir) - if __name__ == "__main__": parser = argparse.ArgumentParser( From 58c1d8a82c8970e22609f90b3646a8f06b5754df Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Sun, 5 Apr 2026 16:55:00 +0300 Subject: [PATCH 05/18] feat(pipeline): port full pipeline orchestration from bash to Python Replace kraken2csv.sh with pipeline.py. Add run_pipeline() as a callable library function. Add -o/--output and --overwrite flags. Skip binary and hidden files automatically (_is_processable check). --- krakenparser/kraken2csv.sh | 90 --------------------- krakenparser/pipeline.py | 159 +++++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 90 deletions(-) delete mode 100755 krakenparser/kraken2csv.sh create mode 100644 krakenparser/pipeline.py diff --git a/krakenparser/kraken2csv.sh b/krakenparser/kraken2csv.sh deleted file mode 100755 index 5b53f75..0000000 --- a/krakenparser/kraken2csv.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# Function to display usage -usage() { - echo "Usage: $(basename "$0") -i|--input PATH_TO_SOURCE" - echo - echo " -i, --input Path to the source directory — Kraken2 reports must be inside a subdirectory (e.g., data/kreports)" - echo " -h Show this help message" - exit 0 -} - -# Parse command-line arguments -SOURCE_DIR="" - -while [[ $# -gt 0 ]]; do - case "$1" in - -i|--input) SOURCE_DIR="$2"; shift 2 ;; - -h|--help) usage ;; - *) echo "Unknown option: $1"; usage ;; - esac -done - -# Ensure SOURCE_DIR is set -if [ -z "$SOURCE_DIR" ]; then - echo "Error: input is required." - usage -fi - -# Determine the directory of this script -SCRIPT_DIR=$(dirname "$(realpath "$0")") - -# PART 1: CONVERT KRAKEN2 TO MPA - -# Setting the path to the source file directory and destination directory -PARENT_DIR=$(dirname "$SOURCE_DIR") -MPA_DIR="$PARENT_DIR/mpa" - -# Run the old script with the correct paths -"$SCRIPT_DIR/run_kreport2mpa.sh" -i "$SOURCE_DIR" -o "$MPA_DIR" - -# PART 2: COMBINING MPAs - -COMBINED_FILE="$PARENT_DIR/COMBINED.txt" -python "$SCRIPT_DIR/combine_mpa.py" -i "$MPA_DIR"/* -o "$COMBINED_FILE" -if [ $? -ne 0 ]; then - echo "Error: Failed to run combine_mpa.py" - exit 1 -fi -echo "MPA files combined successfully. Output stored in $COMBINED_FILE" - -# PART 3: DECOMBINING MPAs - -COUNTS_DIR="$PARENT_DIR/counts" - -"$SCRIPT_DIR/decombine.sh" -i "$COMBINED_FILE" -o "$COUNTS_DIR" - -# PART 4: PROCESS COUNTS TXT FILES - -for file in "$COUNTS_DIR"/txt/counts_*.txt; do - python "$SCRIPT_DIR/processing_script.py" -i "$COMBINED_FILE" -o "$file" - if [ $? -ne 0 ]; then - echo "Error: Failed to process $file" - exit 1 - fi -done - -# PART 5: CONVERT TXT FILES TO CSV - -for file in "$COUNTS_DIR"/txt/counts_*.txt; do - CSV_FILE="$COUNTS_DIR/csv/$(basename "$file" .txt).csv" - python "$SCRIPT_DIR/convert2csv.py" -i "$file" -o "$CSV_FILE" -done - -# PART 6: CALCULATE RELATIVE ABUNDANCE - -mkdir -p "$PARENT_DIR/rel_abund" - -for file in "$COUNTS_DIR"/csv/counts_*.csv; do - base=$(basename "$file") - new_name="${base/counts_/ra_}" - CSV_RA_FILE="$PARENT_DIR/rel_abund/$new_name" - python "$SCRIPT_DIR/relabund.py" -i "$file" -o "$CSV_RA_FILE" -done - -# PART 7: CALCULATE α & β-DIVERSITIES - -CSV_SPECIES_FILE="$COUNTS_DIR/csv/counts_species.csv" -python "$SCRIPT_DIR/diversity.py" -i "$CSV_SPECIES_FILE" -o "$PARENT_DIR/diversity" - -echo "All steps completed successfully!" \ No newline at end of file diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py new file mode 100644 index 0000000..f0e29c6 --- /dev/null +++ b/krakenparser/pipeline.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python +"""Full KrakenParser pipeline: kreport → MPA → combined → counts → rel_abund → diversity.""" + +import argparse +import logging +import shutil +import sys +from pathlib import Path + +_log = logging.getLogger(__name__) + +import pandas as pd + +from krakenparser.mpa.transform2mpa import kreport_to_mpa +from krakenparser.mpa.mpa_table import combine_mpa +from krakenparser.counts.split_mpa import split_mpa +from krakenparser.counts.processing_script import process_files +from krakenparser.counts.convert2csv import convert_to_csv +from krakenparser.stats.relabund import calculate_rel_abund +from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div + + +def _is_processable(path: Path) -> bool: + """Return False for hidden files, files with null bytes, or non-UTF-8 files.""" + if path.name.startswith("."): + return False + try: + chunk = path.read_bytes()[:1024] + if b"\x00" in chunk: + return False + chunk.decode("utf-8") + return True + except (UnicodeDecodeError, OSError): + return False + + +_OUTPUT_SUBDIRS = ("intermediate", "counts", "rel_abund", "diversity") + + +def run_pipeline( + input_dir: str, + output_dir: str | None = None, + keep_human: bool = False, + rarefaction_depth: int = 1000, + overwrite: bool = False, +) -> None: + source_dir = Path(input_dir) + if not source_dir.is_dir(): + sys.exit(f"Error: input directory not found: {source_dir}") + + out_dir = Path(output_dir) if output_dir else source_dir.parent + out_dir.mkdir(parents=True, exist_ok=True) + + existing = [out_dir / d for d in _OUTPUT_SUBDIRS if (out_dir / d).exists()] + if existing and not overwrite: + names = ", ".join(d.name for d in existing) + sys.exit( + f"Error: output already exists in '{out_dir}' ({names}).\n" + "Use --overwrite to overwrite it." + ) + if overwrite: + for d in existing: + shutil.rmtree(d) + _log.info("Removed existing directory: %s", d) + + intermediate_dir = out_dir / "intermediate" + intermediate_dir.mkdir(exist_ok=True) + + # Part 1: kreport → MPA + mpa_dir = intermediate_dir / "mpa" + mpa_dir.mkdir(exist_ok=True) + for f in sorted(source_dir.iterdir()): + if not f.is_file(): + continue + if not _is_processable(f): + _log.info("Skipping: %s", f.name) + continue + out_name = f.stem + ".MPA.TXT" + kreport_to_mpa(str(f), str(mpa_dir / out_name), display_header=True) + + # Part 2: combine MPAs + mpa_files = sorted(mpa_dir.glob("*.MPA.TXT")) + if not mpa_files: + sys.exit("Error: no MPA files found after conversion.") + combined_file = intermediate_dir / "COMBINED.txt" + combine_mpa([str(f) for f in mpa_files], str(combined_file)) + _log.info("MPA files combined. Output: %s", combined_file) + + # Part 3: split combined MPA by rank + split_mpa(str(combined_file), str(intermediate_dir), keep_human=keep_human) + txt_dir = intermediate_dir / "txt" + + # Part 4: clean taxa names and add sample header + for txt_file in sorted(txt_dir.glob("counts_*.txt")): + process_files(str(combined_file), str(txt_file)) + + # Part 5: TXT → CSV + counts_dir = out_dir / "counts" + counts_dir.mkdir(exist_ok=True) + for txt_file in sorted(txt_dir.glob("counts_*.txt")): + csv_file = counts_dir / txt_file.with_suffix(".csv").name + convert_to_csv(str(txt_file), str(csv_file)) + + # Part 6: relative abundance + rel_abund_dir = out_dir / "rel_abund" + rel_abund_dir.mkdir(exist_ok=True) + for csv_file in sorted(counts_dir.glob("counts_*.csv")): + ra_file = rel_abund_dir / csv_file.name.replace("counts_", "ra_") + calculate_rel_abund(str(csv_file), str(ra_file)) + + # Part 7: α & β-diversities + species_csv = counts_dir / "counts_species.csv" + if not species_csv.exists(): + sys.exit(f"Error: species counts not found: {species_csv}") + diversity_dir = out_dir / "diversity" + diversity_dir.mkdir(exist_ok=True) + df = pd.read_csv(species_csv, index_col=0) + calc_alpha_div(df, diversity_dir) + calc_beta_div(df, diversity_dir, rarefaction_depth=rarefaction_depth) + + _log.info("All steps completed successfully!") + + +def main() -> None: + logging.basicConfig(level=logging.INFO, format="%(message)s") + parser = argparse.ArgumentParser( + description="Run the full KrakenParser pipeline." + ) + parser.add_argument( + "-i", "--input", required=True, + help="Directory containing Kraken2 report files", + ) + parser.add_argument( + "-o", "--output", default=None, + help="Output directory (default: parent of input)", + ) + parser.add_argument( + "--keep-human", action="store_true", default=False, + help="Do not filter human-related taxa (default: filtered)", + ) + parser.add_argument( + "-d", "--depth", type=int, default=1000, + help="Rarefaction depth for β-diversity (default: 1000)", + ) + parser.add_argument( + "--overwrite", action="store_true", default=False, + help="Overwrite the output directory if it already exists", + ) + args = parser.parse_args() + run_pipeline( + args.input, args.output, + keep_human=args.keep_human, + rarefaction_depth=args.depth, + overwrite=args.overwrite, + ) + + +if __name__ == "__main__": + main() From 21047684ff3c57baf2be6fbc88400e555c6d2090 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 10 Apr 2026 13:30:00 +0300 Subject: [PATCH 06/18] feat(kpplot): add aggregate_by_metadata, fix grid axis, add cmap validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add aggregate_by_metadata() to base for metadata grouping - Fix streamgraph grid axis (x→y) - Raise ValueError when cmap list is shorter than taxa count - Add missing-sample validation on sample_order --- krakenparser/kpplot/base.py | 25 ++++++++++++++++++++ krakenparser/kpplot/clustermap.py | 28 +++------------------- krakenparser/kpplot/stackedbar.py | 34 ++++++++------------------- krakenparser/kpplot/streamgraph.py | 37 +++++++++--------------------- 4 files changed, 49 insertions(+), 75 deletions(-) diff --git a/krakenparser/kpplot/base.py b/krakenparser/kpplot/base.py index 97f5447..2df3f0c 100644 --- a/krakenparser/kpplot/base.py +++ b/krakenparser/kpplot/base.py @@ -1,4 +1,5 @@ import matplotlib.pyplot as plt +import pandas as pd from typing import Optional @@ -31,3 +32,27 @@ def savefig( self.fig.savefig( path, dpi=dpi, transparent=transparent, bbox_inches=bbox_inches ) + + +def aggregate_by_metadata( + df: pd.DataFrame, + metadata: pd.DataFrame, + metadata_group: str, +) -> pd.DataFrame: + """Merge df with metadata and re-normalise rel_abund_perc per group.""" + if "Sample_id" not in metadata.columns: + raise ValueError("metadata must contain 'Sample_id' column") + if metadata_group not in metadata.columns: + raise ValueError(f"'{metadata_group}' column not found in metadata") + df = df.merge( + metadata[["Sample_id", metadata_group]], on="Sample_id", how="left" + ) + df = ( + df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"] + .mean() + .rename(columns={metadata_group: "Sample_id"}) + ) + df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform( + lambda x: (x / x.sum()) * 100 + ) + return df diff --git a/krakenparser/kpplot/clustermap.py b/krakenparser/kpplot/clustermap.py index 1a18ef8..f0ad5bb 100644 --- a/krakenparser/kpplot/clustermap.py +++ b/krakenparser/kpplot/clustermap.py @@ -2,9 +2,7 @@ import matplotlib.pyplot as plt import seaborn as sns from typing import Optional, Tuple, Union, List -from pathlib import Path -import shutil -from .base import KpPlotBase +from .base import KpPlotBase, aggregate_by_metadata class KpClustermap(KpPlotBase): @@ -81,28 +79,13 @@ def clustermap( - background_color: Background color of the figure. Returns: - - KpStackedBarplot: An object containing the clustermap figure and axis for customization or saving. + - KpClustermap: An object containing the clustermap figure and axis for customization or saving. """ df = df.copy() if metadata is not None and metadata_group is not None: - if "Sample_id" not in metadata.columns: - raise ValueError("metadata must contain 'Sample_id' column") - if metadata_group not in metadata.columns: - raise ValueError(f"'{metadata_group}' column not found in metadata") - - df = df.merge( - metadata[["Sample_id", metadata_group]], on="Sample_id", how="left" - ) - df = ( - df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"] - .mean() - .rename(columns={metadata_group: "Sample_id"}) - ) - df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform( - lambda x: (x / x.sum()) * 100 - ) + df = aggregate_by_metadata(df, metadata, metadata_group) if df[y_axis].dtype == object: other_mask = df[y_axis].str.startswith("Other") @@ -187,10 +170,5 @@ def clustermap( style=yticks_style, ) - current_dir = Path(__file__).resolve().parent - pycache_dir = current_dir / "__pycache__" - if pycache_dir.exists() and pycache_dir.is_dir(): - shutil.rmtree(pycache_dir) - plt.close(fig) return KpClustermap(fig, ax) diff --git a/krakenparser/kpplot/stackedbar.py b/krakenparser/kpplot/stackedbar.py index 44f3308..8140437 100644 --- a/krakenparser/kpplot/stackedbar.py +++ b/krakenparser/kpplot/stackedbar.py @@ -3,9 +3,7 @@ import seaborn as sns import numpy as np from typing import Optional, Tuple, Union, List -from pathlib import Path -import shutil -from .base import KpPlotBase +from .base import KpPlotBase, aggregate_by_metadata class KpStackedBarplot(KpPlotBase): @@ -89,24 +87,12 @@ def stacked_barplot( df = df.copy() if metadata is not None and metadata_group is not None: - if "Sample_id" not in metadata.columns: - raise ValueError("metadata must contain 'Sample_id' column") - if metadata_group not in metadata.columns: - raise ValueError(f"'{metadata_group}' column not found in metadata") - - df = df.merge( - metadata[["Sample_id", metadata_group]], on="Sample_id", how="left" - ) - df = ( - df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"] - .mean() - .rename(columns={metadata_group: "Sample_id"}) - ) - df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform( - lambda x: (x / x.sum()) * 100 - ) + df = aggregate_by_metadata(df, metadata, metadata_group) if sample_order is not None: + missing = set(sample_order) - set(df["Sample_id"].unique()) + if missing: + raise ValueError(f"Samples missing from data: {missing}") df = df[df["Sample_id"].isin(sample_order)] df["Sample_id"] = pd.Categorical( df["Sample_id"], categories=sample_order, ordered=True @@ -127,6 +113,11 @@ def stacked_barplot( zip(df_plot.columns, sns.color_palette(cmap, n_colors=len(df_plot.columns))) ) elif isinstance(cmap, list): + if len(cmap) < len(df_plot.columns): + raise ValueError( + f"cmap has {len(cmap)} colors but the data has {len(df_plot.columns)} " + "taxa; provide at least as many colors as taxa." + ) color_dict = dict(zip(df_plot.columns, cmap)) else: raise ValueError("cmap must be a str or a list of colors") @@ -199,10 +190,5 @@ def stacked_barplot( ax.set_xlim(-0.5, len(df_plot.index) - 0.5) - current_dir = Path(__file__).resolve().parent - pycache_dir = current_dir / "__pycache__" - if pycache_dir.exists() and pycache_dir.is_dir(): - shutil.rmtree(pycache_dir) - plt.close(fig) return KpStackedBarplot(fig, ax) diff --git a/krakenparser/kpplot/streamgraph.py b/krakenparser/kpplot/streamgraph.py index ac82e46..6c389a2 100644 --- a/krakenparser/kpplot/streamgraph.py +++ b/krakenparser/kpplot/streamgraph.py @@ -3,9 +3,7 @@ import seaborn as sns import numpy as np from typing import Optional, Tuple, Union, List -from pathlib import Path -import shutil -from .base import KpPlotBase +from .base import KpPlotBase, aggregate_by_metadata class KpStreamgraph(KpPlotBase): @@ -90,24 +88,12 @@ def streamgraph( df = df.copy() if metadata is not None and metadata_group is not None: - if "Sample_id" not in metadata.columns: - raise ValueError("metadata must contain 'Sample_id' column") - if metadata_group not in metadata.columns: - raise ValueError(f"'{metadata_group}' column not found in metadata") - - df = df.merge( - metadata[["Sample_id", metadata_group]], on="Sample_id", how="left" - ) - df = ( - df.groupby([metadata_group, "taxon"], as_index=False)["rel_abund_perc"] - .mean() - .rename(columns={metadata_group: "Sample_id"}) - ) - df["rel_abund_perc"] = df.groupby("Sample_id")["rel_abund_perc"].transform( - lambda x: (x / x.sum()) * 100 - ) + df = aggregate_by_metadata(df, metadata, metadata_group) if sample_order is not None: + missing = set(sample_order) - set(df["Sample_id"].unique()) + if missing: + raise ValueError(f"Samples missing from data: {missing}") df = df[df["Sample_id"].isin(sample_order)] df["Sample_id"] = pd.Categorical( df["Sample_id"], categories=sample_order, ordered=True @@ -128,6 +114,11 @@ def streamgraph( zip(df_plot.columns, sns.color_palette(cmap, n_colors=len(df_plot.columns))) ) elif isinstance(cmap, list): + if len(cmap) < len(df_plot.columns): + raise ValueError( + f"cmap has {len(cmap)} colors but the data has {len(df_plot.columns)} " + "taxa; provide at least as many colors as taxa." + ) color_dict = dict(zip(df_plot.columns, cmap)) else: raise ValueError("cmap must be a str or a list of colors") @@ -190,7 +181,7 @@ def streamgraph( text.set_fontstyle(legend_fontstyle) if grid: - ax.grid(axis="x", linestyle=grid_linestyle, alpha=grid_alpha, zorder=0) + ax.grid(axis="y", linestyle=grid_linestyle, alpha=grid_alpha, zorder=0) labels = df_plot.index.tolist() plt.xticks( @@ -206,12 +197,6 @@ def streamgraph( ax.set_xlim(-0.5, len(df_plot.index) - 0.5) - current_dir = Path(__file__).resolve().parent - pycache_dir = current_dir / "__pycache__" - - if pycache_dir.exists() and pycache_dir.is_dir(): - shutil.rmtree(pycache_dir) - plt.close(fig) return KpStreamgraph(fig, ax) From ee330e85739a7dcd36162da448ea862d77c5e256 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Wed, 15 Apr 2026 11:17:00 +0300 Subject: [PATCH 07/18] refactor(cli): use importlib.metadata for version, run scripts as -m modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove version.py — version now sourced from package metadata. Dispatcher uses python -m package.module to preserve import context. Default to full pipeline when -i is given without a subcommand. --- krakenparser/krakenparser.py | 63 +++++++++++++++++++++++------------- krakenparser/version.py | 1 - 2 files changed, 40 insertions(+), 24 deletions(-) delete mode 100755 krakenparser/version.py diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py index 5eb9c71..6a560ce 100755 --- a/krakenparser/krakenparser.py +++ b/krakenparser/krakenparser.py @@ -1,13 +1,18 @@ import argparse +import logging import subprocess -import shutil from pathlib import Path import sys -from .version import __version__ +from importlib.metadata import version as _pkg_version, PackageNotFoundError as _PNF +try: + __version__ = _pkg_version("krakenparser") +except _PNF: + __version__ = "unknown" # Main function to run the tool def main(): + logging.basicConfig(level=logging.INFO, format="%(message)s") print("KrakenParser by Ilia V. Popov") # Set up argument parser parser = argparse.ArgumentParser( @@ -17,7 +22,7 @@ def main(): parser.add_argument( "--complete", action="store_true", - help="Run the full pipeline automated", + help="Run the full pipeline (also the default when -i is given)", ) parser.add_argument( "--kreport2mpa", @@ -65,19 +70,23 @@ def main(): args, extra_args = parser.parse_known_args() + for _a in extra_args: + if "\x00" in _a: + sys.exit("Error: argument contains invalid null byte.") + package_dir = Path(__file__).resolve().parent # Directory of the current script - # Map flags to corresponding scripts + # Map flags to (script_path, base_args_to_prepend) command_map = { - "complete": package_dir / "kraken2csv.sh", - "kreport2mpa": package_dir / "run_kreport2mpa.sh", - "combine_mpa": package_dir / "combine_mpa.py", - "deconstruct": package_dir / "decombine.sh", - "deconstruct_viruses": package_dir / "decombine_viruses.sh", - "process": package_dir / "processing_script.py", - "txt2csv": package_dir / "convert2csv.py", - "relabund": package_dir / "relabund.py", - "diversity": package_dir / "diversity.py", + "complete": (package_dir / "pipeline.py", []), + "kreport2mpa": (package_dir / "mpa" / "transform2mpa.py", []), + "combine_mpa": (package_dir / "mpa" / "mpa_table.py", []), + "deconstruct": (package_dir / "counts" / "split_mpa.py", []), + "deconstruct_viruses":(package_dir / "counts" / "split_mpa.py", ["--viruses-only"]), + "process": (package_dir / "counts" / "processing_script.py", []), + "txt2csv": (package_dir / "counts" / "convert2csv.py", []), + "relabund": (package_dir / "stats" / "relabund.py", []), + "diversity": (package_dir / "stats" / "diversity.py", []), } if "-h" in sys.argv or "--help" in sys.argv: @@ -85,21 +94,29 @@ def main(): parser.print_help() return + def _build_cmd(script: Path, base_args: list[str], user_args: list[str]) -> list[str]: + if script.suffix == ".py": + # Run as module (-m) so the krakenparser package stays importable. + # Derive dotted module name from path relative to the package root. + module = ".".join( + script.relative_to(package_dir.parent).with_suffix("").parts + ) + return [sys.executable, "-m", module] + base_args + user_args + return [str(script)] + base_args + user_args + # Find which argument was given and run the corresponding script - for arg, script in command_map.items(): + for arg, (script, base_args) in command_map.items(): if getattr(args, arg): - subprocess.run( - [script] + extra_args, check=True - ) # Pass extra arguments to the script + subprocess.run(_build_cmd(script, base_args, extra_args), check=True) return - parser.print_help() + # Default to full pipeline when -i/--input is given without a subcommand + if "-i" in extra_args or "--input" in extra_args: + complete_script, complete_base = command_map["complete"] + subprocess.run(_build_cmd(complete_script, complete_base, extra_args), check=True) + return - pycache_dir = package_dir / "__pycache__" - - # Check if __pycache__ exists and remove it - if pycache_dir.exists() and pycache_dir.is_dir(): - shutil.rmtree(pycache_dir) + parser.print_help() if __name__ == "__main__": diff --git a/krakenparser/version.py b/krakenparser/version.py deleted file mode 100755 index 43c4ab0..0000000 --- a/krakenparser/version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "0.6.1" From 7dd691497e9947a582f6441832ef4f0be0bbc3eb Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Wed, 22 Apr 2026 09:44:00 +0300 Subject: [PATCH 08/18] =?UTF-8?q?ci:=20rename=20workflow=20to=20python-pac?= =?UTF-8?q?kage.yml,=20expand=20matrix=20to=203.10=E2=80=933.14?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add fail-fast: false, flake8 lint step, --no-build-isolation, branch-scoped triggers (dev, main), codecov-action@v5. --- .github/workflows/ci.yml | 23 -------------- .github/workflows/python-package.yml | 46 ++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 23 deletions(-) delete mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/python-package.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 9a19b2a..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: pytest -on: - push: - branches: [ dev ] - pull_request: - branches: [ dev ] - -jobs: - build-test: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [3.12] - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - run: | - python -m pip install --upgrade pip - pip install krakenparser - pip install pytest - - run: pytest -q \ No newline at end of file diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..553faa6 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,46 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "dev", "main" ] + pull_request: + branches: [ "dev", "main" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 setuptools wheel + pip install -e ".[dev]" --no-build-isolation + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest --cov=krakenparser --cov-report=xml + - name: Upload coverage to Codecov + if: matrix.python-version == '3.12' + uses: codecov/codecov-action@v5 + with: + files: coverage.xml + token: ${{ secrets.CODECOV_TOKEN }} From c37f5cb7857bb6635dda0d69698c1d51b8622384 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Sun, 26 Apr 2026 15:03:00 +0300 Subject: [PATCH 09/18] ci: add publish.yml with Trusted Publishing for PyPI Triggers on v* tags. Strips GitHub-specific markup from README before build to ensure clean PyPI rendering. --- .github/workflows/publish.yml | 53 +++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 .github/workflows/publish.yml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..3763435 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,53 @@ +name: Publish to PyPI + +on: + push: + tags: + - "v*" + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Install build tools + run: pip install build twine + - name: Strip GitHub-specific markup from README + run: | + python - <<'EOF' + import re, pathlib + text = pathlib.Path("README.md").read_text() + text = re.sub( + r'\n', + '', text, + ) + text = re.sub(r'\[!\[Downloads\]\([^\)]+\)\]\([^\)]+\)\n', '', text) + pathlib.Path("README.md").write_text(text) + EOF + - name: Build package + run: python -m build + - name: Check distribution + run: twine check dist/* + - name: Store distribution packages + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + publish: + needs: build + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: ubuntu-latest + environment: release + permissions: + id-token: write + steps: + - uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + - uses: pypa/gh-action-pypi-publish@release/v1 From 6a5c80f0584383fdf7580f192b86c1fdb5feb34f Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Fri, 1 May 2026 10:58:00 +0300 Subject: [PATCH 10/18] test: add unit and integration test suite (62 tests) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unit tests: _parse_line, shannon/pielou/chao1 indices, modify_taxa_names. Integration: kreport_to_mpa, convert_to_csv, relabund, alpha/beta div, split_mpa — all with reproducibility (SHA-256) checks. --- tests/conftest.py | 66 +++++++ tests/test_integration.py | 356 ++++++++++++++++++++++++++++++++++++++ tests/test_units.py | 146 ++++++++++++++++ 3 files changed, 568 insertions(+) create mode 100644 tests/conftest.py create mode 100644 tests/test_integration.py create mode 100644 tests/test_units.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..8100a3e --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,66 @@ +import matplotlib +matplotlib.use("Agg") + +import pandas as pd +import pytest + + +SAMPLE_KREPORT = ( + "99.98\t999980\t0\tR\t1\troot\n" + "99.98\t999980\t0\tD\t2\t Bacteria\n" + "85.00\t850000\t0\tP\t1224\t Pseudomonadota\n" + "50.00\t500000\t200000\tG\t286\t Pseudomonas\n" + "30.00\t300000\t300000\tS\t287\t Pseudomonas aeruginosa\n" + "20.00\t200000\t200000\tG\t561\t Escherichia\n" + "20.00\t200000\t200000\tS\t562\t Escherichia coli\n" + "10.00\t100000\t0\tP\t976\t Bacteroidota\n" + "10.00\t100000\t100000\tS\t817\t Bacteroides fragilis\n" + "0.02\t200\t200\tU\t0\tunclassified\n" +) + +# Tab-delimited TXT as produced by processing_script.py +SAMPLE_COUNTS_TXT = ( + "#Classification\tsample1\tsample2\n" + "Pseudomonas aeruginosa\t300000\t100000\n" + "Escherichia coli\t200000\t50000\n" + "Bacteroides fragilis\t100000\t200000\n" +) + + +@pytest.fixture +def kreport_file(tmp_path): + f = tmp_path / "sample.kreport" + f.write_text(SAMPLE_KREPORT) + return f + + +@pytest.fixture +def counts_txt_file(tmp_path): + f = tmp_path / "counts_species.txt" + f.write_text(SAMPLE_COUNTS_TXT) + return f + + +@pytest.fixture +def counts_csv_file(tmp_path): + df = pd.DataFrame({ + "Sample_id": ["S1", "S2"], + "Pseudomonas aeruginosa": [300000, 100000], + "Escherichia coli": [200000, 50000], + "Bacteroides fragilis": [100000, 200000], + }) + f = tmp_path / "counts_species.csv" + df.to_csv(f, index=False) + return f + + +@pytest.fixture +def relabund_df(): + return pd.DataFrame({ + "Sample_id": ["S1", "S1", "S1", "S2", "S2", "S2"], + "taxon": [ + "Pseudomonadota", "Bacillota", "Other (<4.0%)", + "Pseudomonadota", "Bacillota", "Other (<4.0%)", + ], + "rel_abund_perc": [70.0, 20.0, 10.0, 50.0, 35.0, 15.0], + }) diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..7fed2fb --- /dev/null +++ b/tests/test_integration.py @@ -0,0 +1,356 @@ +"""Characterization / integration tests — file I/O, reproducibility.""" + +import hashlib +import itertools + +import pandas as pd +import pytest + +from krakenparser.mpa.transform2mpa import kreport_to_mpa +from krakenparser.counts.convert2csv import convert_to_csv +from krakenparser.counts.processing_script import process_files +from krakenparser.counts.split_mpa import split_mpa +from krakenparser.stats.relabund import calculate_rel_abund +from krakenparser.stats.diversity import calc_alpha_div, calc_beta_div + + +SAMPLE_COMBINED_MPA = ( + "#Classification\tsample1\tsample2\n" + "d__Bacteria|p__Pseudomonadota|g__Pseudomonas|s__Pseudomonas_aeruginosa\t300\t100\n" + "d__Bacteria|p__Pseudomonadota|g__Pseudomonas\t500\t200\n" + "d__Bacteria|p__Pseudomonadota|g__Homo|s__Homo_sapiens\t50\t30\n" + "d__Bacteria|p__Pseudomonadota|g__Homo\t60\t35\n" + "d__Bacteria|p__Pseudomonadota\t850\t350\n" + "d__Bacteria|p__Bacteroidota\t100\t80\n" + "d__Viruses|p__Uroviricota|s__Virus_alpha\t10\t5\n" +) + + +@pytest.fixture +def combined_mpa_file(tmp_path): + f = tmp_path / "COMBINED.txt" + f.write_text(SAMPLE_COMBINED_MPA) + return f + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _sha256(path) -> str: + return hashlib.sha256(path.read_bytes()).hexdigest() + + +# --------------------------------------------------------------------------- +# kreport_to_mpa +# --------------------------------------------------------------------------- + +def test_kreport_to_mpa_reproducible(kreport_file, tmp_path): + counter = itertools.count() + + def run(): + out = tmp_path / f"out_{next(counter)}.MPA.TXT" + kreport_to_mpa(str(kreport_file), str(out), display_header=True) + return _sha256(out) + + assert run() == run() + + +def test_kreport_to_mpa_standard_ranks_only(kreport_file, tmp_path): + out = tmp_path / "out.MPA.TXT" + kreport_to_mpa(str(kreport_file), str(out)) + lines = out.read_text().splitlines() + paths = [ln.split("\t")[0] for ln in lines] + + assert any("p__Pseudomonadota" in p for p in paths) + assert any("s__Pseudomonas_aeruginosa" in p for p in paths) + assert any(p.startswith("d__Bacteria|") for p in paths) + + +def test_kreport_to_mpa_excludes_unclassified_and_root(kreport_file, tmp_path): + out = tmp_path / "out.MPA.TXT" + kreport_to_mpa(str(kreport_file), str(out)) + content = out.read_text() + assert "unclassified" not in content + assert "root" not in content + + +def test_kreport_to_mpa_display_header(kreport_file, tmp_path): + out = tmp_path / "out.MPA.TXT" + kreport_to_mpa(str(kreport_file), str(out), display_header=True) + first_line = out.read_text().splitlines()[0] + assert first_line.startswith("#Classification") + assert kreport_file.name in first_line + + +def test_kreport_to_mpa_paths_are_hierarchical(kreport_file, tmp_path): + out = tmp_path / "out.MPA.TXT" + kreport_to_mpa(str(kreport_file), str(out)) + lines = out.read_text().splitlines() + for ln in lines: + path = ln.split("\t")[0] + segments = path.split("|") + # Each segment must have a known two-letter prefix + for seg in segments: + assert "__" in seg, f"Unexpected MPA segment format: {seg!r}" + + +# --------------------------------------------------------------------------- +# convert_to_csv +# --------------------------------------------------------------------------- + +def test_convert_to_csv_reproducible(counts_txt_file, tmp_path): + counter = itertools.count() + + def run(): + out = tmp_path / f"out_{next(counter)}.csv" + convert_to_csv(str(counts_txt_file), str(out)) + return _sha256(out) + + assert run() == run() + + +def test_convert_to_csv_transposes_correctly(counts_txt_file, tmp_path): + out = tmp_path / "counts.csv" + convert_to_csv(str(counts_txt_file), str(out)) + df = pd.read_csv(out) + + assert "Sample_id" in df.columns + assert set(df["Sample_id"]) == {"sample1", "sample2"} + assert "Pseudomonas aeruginosa" in df.columns + + +# --------------------------------------------------------------------------- +# process_files +# --------------------------------------------------------------------------- + +def test_process_files_adds_header_and_cleans_names(tmp_path): + source = tmp_path / "COMBINED.txt" + source.write_text( + "#Classification\tsample1.kreport\tsample2.kreport\n" + "d__Bacteria|p__X|s__Pseudomonas_aeruginosa\t300\t100\n" + ) + dest = tmp_path / "counts_species.txt" + dest.write_text( + "s__Pseudomonas_aeruginosa\t300\t100\n" + "s__Escherichia_coli\t200\t50\n" + ) + process_files(str(source), str(dest)) + result = dest.read_text() + lines = result.splitlines() + assert lines[0] == "#Classification\tsample1\tsample2" + assert "Pseudomonas aeruginosa" in result + assert "Escherichia coli" in result + assert "s__" not in result + + +def test_process_files_reproducible(tmp_path): + source = tmp_path / "COMBINED.txt" + source.write_text("#Classification\tsample1.kreport\n") + for i in range(2): + dest = tmp_path / f"counts_{i}.txt" + dest.write_text("s__Some_species\t10\n") + process_files(str(source), str(dest)) + assert (tmp_path / "counts_0.txt").read_text() == (tmp_path / "counts_1.txt").read_text() + + +def test_process_files_missing_source_raises(tmp_path): + dest = tmp_path / "counts.txt" + dest.write_text("s__X\t10\n") + with pytest.raises(FileNotFoundError): + process_files(str(tmp_path / "ghost.txt"), str(dest)) + + +def test_process_files_missing_dest_raises(tmp_path): + source = tmp_path / "COMBINED.txt" + source.write_text("#Classification\tsample1.kreport\n") + with pytest.raises(FileNotFoundError): + process_files(str(source), str(tmp_path / "ghost.txt")) + + +def test_convert_to_csv_missing_input_raises(tmp_path): + with pytest.raises(FileNotFoundError): + convert_to_csv(str(tmp_path / "ghost.txt"), str(tmp_path / "out.csv")) + + +# --------------------------------------------------------------------------- +# calculate_rel_abund +# --------------------------------------------------------------------------- + +def test_relabund_reproducible(counts_csv_file, tmp_path): + counter = itertools.count() + + def run(): + out = tmp_path / f"ra_{next(counter)}.csv" + calculate_rel_abund(str(counts_csv_file), str(out)) + return _sha256(out) + + assert run() == run() + + +def test_relabund_sums_to_100_per_sample(counts_csv_file, tmp_path): + out = tmp_path / "ra.csv" + calculate_rel_abund(str(counts_csv_file), str(out)) + df = pd.read_csv(out) + for sample, grp in df.groupby("Sample_id"): + total = grp["rel_abund_perc"].sum() + assert total == pytest.approx(100.0, abs=1e-6), f"{sample}: sum={total}" + + +def test_relabund_other_threshold_creates_other_group(counts_csv_file, tmp_path): + out = tmp_path / "ra.csv" + calculate_rel_abund(str(counts_csv_file), str(out), other_threshold=99.0) + df = pd.read_csv(out) + assert df["taxon"].str.startswith("Other").any() + + +def test_relabund_no_zero_rows(counts_csv_file, tmp_path): + out = tmp_path / "ra.csv" + calculate_rel_abund(str(counts_csv_file), str(out)) + df = pd.read_csv(out) + assert (df["rel_abund_perc"] > 0).all() + + +def test_relabund_missing_input_raises(tmp_path): + with pytest.raises(FileNotFoundError): + calculate_rel_abund(str(tmp_path / "ghost.csv"), str(tmp_path / "out.csv")) + + +# --------------------------------------------------------------------------- +# calc_alpha_div +# --------------------------------------------------------------------------- + +def test_alpha_div_reproducible(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + counter = itertools.count() + + def run(): + out_dir = tmp_path / f"div_{next(counter)}" + out_dir.mkdir() + calc_alpha_div(df, out_dir) + return _sha256(out_dir / "alpha_div.csv") + + assert run() == run() + + +def test_alpha_div_output_columns(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + out_dir = tmp_path / "diversity" + out_dir.mkdir() + calc_alpha_div(df, out_dir) + result = pd.read_csv(out_dir / "alpha_div.csv") + assert set(result.columns) == {"Sample", "Shannon", "Pielou", "Chao1"} + assert len(result) == len(df) + + +def test_alpha_div_shannon_non_negative(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + out_dir = tmp_path / "diversity" + out_dir.mkdir() + calc_alpha_div(df, out_dir) + result = pd.read_csv(out_dir / "alpha_div.csv") + assert (result["Shannon"] >= 0).all() + + +# --------------------------------------------------------------------------- +# calc_beta_div +# --------------------------------------------------------------------------- + +def test_beta_div_output_files_exist(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + out_dir = tmp_path / "diversity" + out_dir.mkdir() + calc_beta_div(df, out_dir, rarefaction_depth=1000) + assert (out_dir / "beta_div_bray.csv").exists() + assert (out_dir / "beta_div_jaccard.csv").exists() + + +def test_beta_div_matrix_is_square(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + out_dir = tmp_path / "diversity" + out_dir.mkdir() + calc_beta_div(df, out_dir, rarefaction_depth=1000) + bray = pd.read_csv(out_dir / "beta_div_bray.csv", index_col=0) + assert bray.shape[0] == bray.shape[1] + + +def test_beta_div_diagonal_is_zero(counts_csv_file, tmp_path): + df = pd.read_csv(counts_csv_file, index_col=0) + out_dir = tmp_path / "diversity" + out_dir.mkdir() + calc_beta_div(df, out_dir, rarefaction_depth=1000) + bray = pd.read_csv(out_dir / "beta_div_bray.csv", index_col=0) + import numpy as np + assert np.allclose(np.diag(bray.values), 0.0) + + +def test_beta_div_too_few_samples_raises(tmp_path): + df = pd.DataFrame( + {"Taxon_A": [100], "Taxon_B": [200]}, index=["S1"] + ) + out_dir = tmp_path / "diversity" + out_dir.mkdir() + with pytest.raises(ValueError, match="rarefaction"): + calc_beta_div(df, out_dir, rarefaction_depth=1000) + + +# --------------------------------------------------------------------------- +# split_mpa +# --------------------------------------------------------------------------- + +def test_split_mpa_creates_all_rank_files(combined_mpa_file, tmp_path): + split_mpa(str(combined_mpa_file), str(tmp_path)) + for rank in ("species", "genus", "family", "order", "class", "phylum"): + assert (tmp_path / "txt" / f"counts_{rank}.txt").exists() + + +def test_split_mpa_reproducible(combined_mpa_file, tmp_path): + counter = itertools.count() + + def run(): + out = tmp_path / f"out_{next(counter)}" + split_mpa(str(combined_mpa_file), str(out)) + return _sha256(out / "txt" / "counts_species.txt") + + assert run() == run() + + +def test_split_mpa_filters_human_by_default(combined_mpa_file, tmp_path): + split_mpa(str(combined_mpa_file), str(tmp_path)) + species = (tmp_path / "txt" / "counts_species.txt").read_text() + assert "Homo_sapiens" not in species + genus = (tmp_path / "txt" / "counts_genus.txt").read_text() + assert "g__Homo" not in genus + + +def test_split_mpa_keep_human_retains_homo(combined_mpa_file, tmp_path): + split_mpa(str(combined_mpa_file), str(tmp_path), keep_human=True) + species = (tmp_path / "txt" / "counts_species.txt").read_text() + assert "Homo_sapiens" in species + + +def test_split_mpa_viruses_only(combined_mpa_file, tmp_path): + split_mpa(str(combined_mpa_file), str(tmp_path), viruses_only=True) + species = (tmp_path / "txt" / "counts_species.txt").read_text() + assert "Virus_alpha" in species + assert "Pseudomonas_aeruginosa" not in species + + +def test_split_mpa_strips_path_prefix(combined_mpa_file, tmp_path): + split_mpa(str(combined_mpa_file), str(tmp_path)) + species = (tmp_path / "txt" / "counts_species.txt").read_text() + # No MPA path separators should remain + assert "|" not in species + # The rank prefix should remain (processing_script strips it later) + assert "s__" in species + + +def test_split_mpa_genus_excludes_species_lines(combined_mpa_file, tmp_path): + split_mpa(str(combined_mpa_file), str(tmp_path)) + genus = (tmp_path / "txt" / "counts_genus.txt").read_text() + assert "s__" not in genus + + +def test_split_mpa_missing_input_raises(tmp_path): + with pytest.raises(FileNotFoundError): + split_mpa(str(tmp_path / "ghost.txt"), str(tmp_path / "out")) diff --git a/tests/test_units.py b/tests/test_units.py new file mode 100644 index 0000000..57618de --- /dev/null +++ b/tests/test_units.py @@ -0,0 +1,146 @@ +"""Pure-function unit tests — no I/O, fully deterministic.""" + +import math + +import numpy as np +import pytest + +from krakenparser.mpa.transform2mpa import _parse_line +from krakenparser.stats.diversity import chao1_index, pielou_evenness, shannon_index +from krakenparser.counts.processing_script import modify_taxa_names + + +# --------------------------------------------------------------------------- +# _parse_line +# --------------------------------------------------------------------------- + +def test_parse_line_standard_rank(): + line = "50.00\t500000\t100000\tP\t1224\t Pseudomonadota\n" + name, depth, rank, cum_reads, pct = _parse_line(line) + assert name == "Pseudomonadota" + assert depth == 2 # 4 leading spaces // 2 + assert rank == "P" + assert cum_reads == 500000 + assert pct == 50.0 + + +def test_parse_line_root_no_indent(): + line = "99.98\t999980\t0\tR\t1\troot\n" + name, depth, rank, cum_reads, pct = _parse_line(line) + assert name == "root" + assert depth == 0 + assert rank == "R" + + +def test_parse_line_intermediate_rank(): + line = "5.00\t50000\t0\tS1\t12345\t Some subspecies\n" + name, depth, rank, _, _ = _parse_line(line) + assert name == "Some subspecies" + assert rank == "S1" + assert depth == 5 # 10 spaces // 2 + + +def test_parse_line_too_few_columns(): + assert _parse_line("50.00\t500000\n") is None + + +def test_parse_line_non_numeric_pct(): + assert _parse_line("not_a_float\t500000\t0\tP\t1224\tBacteria\n") is None + + +def test_parse_line_non_numeric_reads(): + assert _parse_line("50.00\tnot_int\t0\tP\t1224\tBacteria\n") is None + + +# --------------------------------------------------------------------------- +# shannon_index +# --------------------------------------------------------------------------- + +def test_shannon_uniform_four_species(): + assert abs(shannon_index([1, 1, 1, 1]) - math.log(4)) < 1e-10 + + +def test_shannon_single_species_is_zero(): + assert shannon_index([100]) == pytest.approx(0.0) + + +def test_shannon_ignores_zero_counts(): + assert abs(shannon_index([1, 1, 1, 1, 0, 0]) - math.log(4)) < 1e-10 + + +def test_shannon_two_equal_species(): + assert abs(shannon_index([50, 50]) - math.log(2)) < 1e-10 + + +# --------------------------------------------------------------------------- +# pielou_evenness +# --------------------------------------------------------------------------- + +def test_pielou_single_species_returns_nan(): + assert math.isnan(pielou_evenness([100])) + + +def test_pielou_all_zeros_returns_nan(): + assert math.isnan(pielou_evenness([0, 0, 0])) + + +def test_pielou_uniform_equals_one(): + assert pielou_evenness([1, 1, 1, 1]) == pytest.approx(1.0) + + +def test_pielou_range_zero_to_one(): + result = pielou_evenness([10, 2, 1, 1]) + assert 0.0 < result < 1.0 + + +# --------------------------------------------------------------------------- +# chao1_index +# --------------------------------------------------------------------------- + +def test_chao1_f2_zero_uses_f1_formula(): + # F1=3, F2=0 → S_obs + F1*(F1-1)/2 + counts = [1, 1, 1, 5, 10] # F1=3, F2=0, S_obs=5 + expected = 5 + 3 * (3 - 1) / 2 # 5 + 3 = 8 + assert chao1_index(counts) == pytest.approx(expected) + + +def test_chao1_normal(): + # F1=2, F2=2, S_obs=5 → S_obs + F1²/(2*F2) + counts = [1, 1, 2, 2, 5] + expected = 5 + (2 * 2) / (2 * 2) # 5 + 1 = 6 + assert chao1_index(counts) == pytest.approx(expected) + + +def test_chao1_no_singletons(): + counts = [5, 10, 15, 20] + # F1=0, F2=0 → uses F2==0 branch: S_obs + 0 + assert chao1_index(counts) == pytest.approx(4.0) + + +# --------------------------------------------------------------------------- +# modify_taxa_names +# --------------------------------------------------------------------------- + +def test_modify_taxa_names_strips_prefix_and_replaces_underscores(): + assert modify_taxa_names("s__Homo_sapiens\t100\t200") == "Homo sapiens\t100\t200" + + +def test_modify_taxa_names_genus(): + assert modify_taxa_names("g__Escherichia_coli\t50") == "Escherichia coli\t50" + + +def test_modify_taxa_names_all_prefixes(): + for prefix in ["s__", "g__", "f__", "o__", "c__", "p__"]: + result = modify_taxa_names(f"{prefix}Some_name\t10") + assert result == "Some name\t10" + + +def test_modify_taxa_names_no_prefix_unchanged(): + line = "unclassified_reads\t100" + assert modify_taxa_names(line) == line + + +def test_modify_taxa_names_count_fields_not_modified(): + # Underscores in tab-separated count fields must be preserved + result = modify_taxa_names("s__My_taxon\t1_000\t2_000") + assert result == "My taxon\t1_000\t2_000" From 00efa56aacc66f0610ea7ebc27a575f3c4fe1165 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Wed, 6 May 2026 14:22:00 +0300 Subject: [PATCH 11/18] test: add kpplot tests and raise coverage to 83% Add smoke tests for stacked_barplot, streamgraph, clustermap. Switch test_full_pipeline to direct run_pipeline() call so pipeline.py is included in coverage. Add overwrite protection tests. --- tests/test_full_pipeline.py | 43 +++++++++++---- tests/test_kpplot.py | 105 ++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 9 deletions(-) create mode 100644 tests/test_kpplot.py diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py index 360c781..599f625 100644 --- a/tests/test_full_pipeline.py +++ b/tests/test_full_pipeline.py @@ -1,9 +1,10 @@ import zipfile import shutil -import subprocess import pytest from pathlib import Path +from krakenparser.pipeline import run_pipeline + @pytest.fixture def demo_run(tmp_path): @@ -29,21 +30,45 @@ def test_full_pipeline_end_to_end(demo_run): run_dir = demo_run["run_dir"] kreports_path = run_dir / "kreports" - # 1. Run KrakenParser - cmd = ["KrakenParser", "--complete", "-i", str(kreports_path)] - result = subprocess.run(cmd, capture_output=True, text=True) - assert result.returncode == 0, f"Pipeline failed:\n{result.stderr}" + run_pipeline(str(kreports_path)) - # 2. Assert each rank‐level CSV exists and is non‐empty + # Assert each rank-level CSV exists and is non-empty ranks = ["phylum", "class", "order", "family", "genus", "species"] for rank in ranks: - csv_path = run_dir / "counts" / "csv" / f"counts_{rank}.csv" + csv_path = run_dir / "counts" / f"counts_{rank}.csv" assert csv_path.exists(), f"Missing counts_{rank}.csv" assert csv_path.stat().st_size > 0, f"counts_{rank}.csv is empty" - # 3. Assert relative‐abundance CSVs exist and are non‐empty + # Assert relative-abundance CSVs exist and are non-empty rel_dir = run_dir / "rel_abund" assert rel_dir.exists(), "rel_abund directory is missing" rel_species = rel_dir / "ra_species.csv" - assert rel_species.exists(), "Missing ra_species.csv in csv_relabund" + assert rel_species.exists(), "Missing ra_species.csv" assert rel_species.stat().st_size > 0, "ra_species.csv is empty" + + # Assert diversity outputs exist + diversity_dir = run_dir / "diversity" + assert (diversity_dir / "alpha_div.csv").exists() + + # Assert intermediate files exist + assert (run_dir / "intermediate" / "COMBINED.txt").exists() + + +def test_pipeline_overwrite_protection(demo_run): + run_dir = demo_run["run_dir"] + kreports_path = run_dir / "kreports" + + run_pipeline(str(kreports_path)) + + # Second run without --overwrite must exit + with pytest.raises(SystemExit): + run_pipeline(str(kreports_path)) + + +def test_pipeline_overwrite_flag(demo_run): + run_dir = demo_run["run_dir"] + kreports_path = run_dir / "kreports" + + run_pipeline(str(kreports_path)) + # Second run with overwrite=True must succeed + run_pipeline(str(kreports_path), overwrite=True) diff --git a/tests/test_kpplot.py b/tests/test_kpplot.py new file mode 100644 index 0000000..51cb5e1 --- /dev/null +++ b/tests/test_kpplot.py @@ -0,0 +1,105 @@ +"""kpplot smoke tests and parameter-validation tests.""" + +import pytest + +from krakenparser.kpplot.stackedbar import stacked_barplot +from krakenparser.kpplot.streamgraph import streamgraph +from krakenparser.kpplot.clustermap import clustermap +from krakenparser.kpplot.base import KpPlotBase, aggregate_by_metadata + + +# --------------------------------------------------------------------------- +# Smoke tests — verify each plot function returns without error +# --------------------------------------------------------------------------- + +def test_stackedbar_returns_kpplotbase(relabund_df): + result = stacked_barplot(relabund_df) + assert isinstance(result, KpPlotBase) + + +def test_streamgraph_returns_kpplotbase(relabund_df): + result = streamgraph(relabund_df) + assert isinstance(result, KpPlotBase) + + +def test_clustermap_returns_kpplotbase(relabund_df): + result = clustermap(relabund_df) + assert isinstance(result, KpPlotBase) + + +# --------------------------------------------------------------------------- +# sample_order validation +# --------------------------------------------------------------------------- + +def test_stackedbar_sample_order_missing_raises(relabund_df): + with pytest.raises(ValueError, match="Samples missing"): + stacked_barplot(relabund_df, sample_order=["S1", "S2", "GHOST"]) + + +def test_streamgraph_sample_order_missing_raises(relabund_df): + with pytest.raises(ValueError, match="Samples missing"): + streamgraph(relabund_df, sample_order=["S1", "GHOST"]) + + +def test_clustermap_sample_order_missing_raises(relabund_df): + with pytest.raises(ValueError, match="Samples missing"): + clustermap(relabund_df, sample_order=["S1", "GHOST"]) + + +# --------------------------------------------------------------------------- +# cmap validation (stackedbar / streamgraph) +# --------------------------------------------------------------------------- + +def test_stackedbar_cmap_too_short_raises(relabund_df): + with pytest.raises(ValueError, match="cmap"): + stacked_barplot(relabund_df, cmap=["red"]) + + +def test_streamgraph_cmap_too_short_raises(relabund_df): + with pytest.raises(ValueError, match="cmap"): + streamgraph(relabund_df, cmap=["red"]) + + +def test_stackedbar_cmap_invalid_type_raises(relabund_df): + with pytest.raises(ValueError, match="cmap"): + stacked_barplot(relabund_df, cmap=123) + + +def test_streamgraph_cmap_invalid_type_raises(relabund_df): + with pytest.raises(ValueError, match="cmap"): + streamgraph(relabund_df, cmap=123) + + +# --------------------------------------------------------------------------- +# aggregate_by_metadata +# --------------------------------------------------------------------------- + +def test_aggregate_by_metadata_basic(relabund_df): + import pandas as pd + + metadata = pd.DataFrame({ + "Sample_id": ["S1", "S2"], + "Group": ["A", "A"], + }) + result = aggregate_by_metadata(relabund_df, metadata, "Group") + assert "Sample_id" in result.columns + assert set(result["Sample_id"]) == {"A"} + # Relative abundance should still sum to 100 per group + total = result["rel_abund_perc"].sum() + assert total == pytest.approx(100.0, abs=1e-6) + + +def test_aggregate_by_metadata_missing_sample_id_column_raises(relabund_df): + import pandas as pd + + bad_meta = pd.DataFrame({"Group": ["A", "B"], "X": [1, 2]}) + with pytest.raises(ValueError, match="Sample_id"): + aggregate_by_metadata(relabund_df, bad_meta, "Group") + + +def test_aggregate_by_metadata_missing_group_column_raises(relabund_df): + import pandas as pd + + meta = pd.DataFrame({"Sample_id": ["S1", "S2"]}) + with pytest.raises(ValueError, match="Group"): + aggregate_by_metadata(relabund_df, meta, "Group") From 0aa7b8d74a65d345ac0a6df77cf2308d1fb108c0 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Tue, 12 May 2026 08:31:00 +0300 Subject: [PATCH 12/18] docs: update README for v1.0.0 Add codecov badge. Document new -o/--output and --overwrite flags. Add Before Visualization section highlighting --relabund -O. Move step-by-step modules to collapsible
block. Update Example Output Structure with intermediate/ directory. --- README.md | 172 +++++++++++++++++++++++++++++------------------------- 1 file changed, 94 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index 8a848c7..0349c7e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ ![License](https://img.shields.io/badge/License-MIT-steelblue) [![Downloads](https://static.pepy.tech/badge/krakenparser)](https://pepy.tech/project/krakenparser) -[![CI](https://github.com/PopovIILab/KrakenParser/actions/workflows/ci.yml/badge.svg)](https://github.com/PopovIILab/KrakenParser/actions/workflows/ci.yml) +[![CI](https://github.com/PopovIILab/KrakenParser/actions/workflows/python-package.yml/badge.svg)](https://github.com/PopovIILab/KrakenParser/actions/workflows/python-package.yml) +[![codecov](https://codecov.io/gh/PopovIILab/KrakenParser/graph/badge.svg)](https://codecov.io/gh/PopovIILab/KrakenParser) @@ -108,9 +109,10 @@ X9,0.7232472324723247,0.7352941176470589,...,0.8066914498141264,0.0 ## Quick Start (Full Pipeline) To run the full pipeline, use the following command: ```bash -KrakenParser --complete -i data/kreports +KrakenParser --complete -i data/kreports -o results/ #Having troubles? Run KrakenParser --complete -h ``` + This will: 1. Convert Kraken2 reports to MPA format 2. Combine MPA files into a single file @@ -120,162 +122,176 @@ This will: 6. Calculate relative abundance 7. Calculate α & β-diversities -### **Input Requirements** -- The Kraken2 reports must be inside a **subdirectory** (e.g., `data/kreports`). -- The script automatically creates output directories and processes the data. - ## Installation ``` pip install krakenparser ``` -## Using Individual Modules -You can also run each step manually if needed. +## Before Visualization: Grouping Low-Abundance Taxa + +The full pipeline automatically calculates relative abundance. Before passing data to visualization, it is strongly recommended to re-run `--relabund` with the `-O` flag — this collapses all taxa below the chosen threshold into a single **"Other"** group, producing much cleaner and more readable plots. + +```bash +KrakenParser --relabund -i data/counts/counts_species.csv -o data/rel_abund/ra_species.csv -O 4 +``` + +This groups every taxon with relative abundance **< 4 %** into `Other (<4.0%)`. Adjust the threshold to your data. + +> **Note:** The pipeline-generated `rel_abund/ra_*.csv` files (no `-O`) preserve the full unfiltered data — use them for statistical analysis. Use the `-O` variant specifically for visualization. + +--- + +
+Using Individual Modules (Advanced) +
+ +Each step of the pipeline can also be run individually. This is useful for re-running a single step, debugging, or integrating KrakenParser into a custom workflow. ### **Step 1: Convert Kraken2 Reports to MPA Format** ```bash -KrakenParser --kreport2mpa -i data/kreports -o data/mpa +# Batch mode (directory) +KrakenParser --kreport2mpa -i data/kreports -o data/intermediate/mpa +# Single file +KrakenParser --kreport2mpa -r data/kreports/sample.kreport -o data/intermediate/mpa/sample.MPA.TXT #Having troubles? Run KrakenParser --kreport2mpa -h ``` -This script converts Kraken2 `.kreport` files into **MPA format** using KrakenTools. +Converts Kraken2 `.kreport` files into **MPA format**. ### **Step 2: Combine MPA Files** ```bash -KrakenParser --combine_mpa -i data/mpa/* -o data/COMBINED.txt +KrakenParser --combine_mpa -i data/intermediate/mpa/* -o data/intermediate/COMBINED.txt #Having troubles? Run KrakenParser --combine_mpa -h ``` -This merges multiple MPA files into a single combined file. +Merges multiple MPA files into a single combined table. ### **Step 3: Extract Taxonomic Levels** ```bash -KrakenParser --deconstruct -i data/COMBINED.txt -o data/counts +KrakenParser --deconstruct -i data/intermediate/COMBINED.txt -o data/intermediate #Having troubles? Run KrakenParser --deconstruct -h ``` -If user wants to inspect **Viruses** domain separately: +By default, human-related taxa (Homo sapiens, Hominidae, Primates, Mammalia, Chordata) are removed. To keep them: ```bash -KrakenParser --deconstruct_viruses -i data/COMBINED.txt -o data/counts_viruses -#Having troubles? Run KrakenParser --deconstruct_viruses -h +KrakenParser --deconstruct -i data/intermediate/COMBINED.txt -o data/intermediate --keep-human ``` -This step extracts only species-level data (excluding human reads). +To inspect the **Viruses** domain separately: +```bash +KrakenParser --deconstruct_viruses -i data/intermediate/COMBINED.txt -o data/counts_viruses +#Having troubles? Run KrakenParser --deconstruct_viruses -h +``` ### **Step 4: Process Extracted Taxonomic Data** ```bash -KrakenParser --process -i data/COMBINED.txt -o data/counts/txt/counts_phylum.txt +KrakenParser --process -i data/intermediate/COMBINED.txt -o data/intermediate/txt/counts_phylum.txt #Having troubles? Run KrakenParser --process -h ``` -Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --process` to a loop! +Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --process` in a loop. -This script cleans up taxonomic names (removes prefixes, replaces underscores with spaces). +Cleans up taxonomic names: removes prefixes (`s__`, `g__`, etc.) and replaces underscores with spaces. ### **Step 5: Convert TXT to CSV** ```bash -KrakenParser --txt2csv -i data/counts/txt/counts_phylum.txt -o data/counts/csv/counts_phylum.csv +KrakenParser --txt2csv -i data/intermediate/txt/counts_phylum.txt -o data/counts/counts_phylum.csv #Having troubles? Run KrakenParser --txt2csv -h ``` -Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --txt2csv` to a loop! +Repeat on other 5 taxonomical levels or wrap in a loop. Transposes data so that sample names become rows. -This converts the processed text files into structured CSV format. - -### **Step 6: Calculate relative abundance** +### **Step 6: Calculate Relative Abundance** ```bash -KrakenParser --relabund -i data/counts/csv/counts_phylum.csv -o data/counts/csv_relabund/counts_phylum.csv +KrakenParser --relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv #Having troubles? Run KrakenParser --relabund -h ``` -Repeat on other 5 taxonomical levels (class, order, family, genus, species) or wrap up `KrakenParser --relabund` to a loop! - -This calculates relative abundance and saves as CSV format. +Repeat on other 5 taxonomical levels or wrap in a loop. -If user wants to group low abundant taxa in "Other" group: +With "Other" grouping: ```bash -KrakenParser --relabund -i data/counts/csv/counts_phylum.csv -o data/counts/csv_relabund/counts_phylum.csv --other 3.5 -#Having troubles? Run KrakenParser --relabund -h +KrakenParser --relabund -i data/counts/counts_phylum.csv -o data/rel_abund/ra_phylum.csv -O 3.5 ``` +Groups all taxa with abundance < 3.5 % into `Other (<3.5%)`. -This will group all the taxa that have abundance <3.5 into "Other <3.5%" group. Other parameters are welcome! - -### **Step 7: Calculate α & β-diversities** +### **Step 7: Calculate α & β-Diversities** ```bash -KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity +KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity #Having troubles? Run KrakenParser --diversity -h ``` -This calculates α & β-diversities and saves them as CSV format to directory provided in the output. - -If user wants to use another depth for β-diversity calculations: +With a custom rarefaction depth for β-diversity: ```bash -KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity --depth 750 -#Having troubles? Run KrakenParser --diversity -h +KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity --depth 750 ``` -Other parameters are welcome! +--- ## Arguments Breakdown -### **KrakenParser** (Main Pipeline) -- Automates the entire workflow. -- Takes **one argument**: the path to Kraken2 reports (`data/kreports`). -- Runs all the scripts in sequence. + +### **--complete** (Full Pipeline) +- Requires `-i`: path to the Kraken2 reports directory (e.g., `data/kreports`). +- Optional `-o`: output directory (default: parent of `-i`). +- Optional `--keep-human`: retain human-related taxa (default: filtered out). ### **--kreport2mpa** (Step 1) -- Converts Kraken2 reports to MPA format. -- Uses `KrakenTools/kreport2mpa.py`. +- Batch mode: `-i DIR -o DIR` — converts all files in a directory. +- Single-file mode: `-r FILE -o FILE`. ### **--combine_mpa** (Step 2) -- Combines multiple MPA files into one. -- Uses `KrakenTools/combine_mpa.py`. +- `-i FILE [FILE ...]`: one or more MPA files. +- `-o FILE`: output merged table. ### **--deconstruct** & **--deconstruct_viruses** (Step 3) - Extracts **phylum, class, order, family, genus, species** into separate text files. -- Removes human-related reads (**--deconstruct** only). +- `--deconstruct` removes human-related reads by default; use `--keep-human` to retain them. +- `--deconstruct_viruses` extracts only the Viruses domain. ### **--process** (Step 4) -- Cleans and formats extracted taxonomic data. - Removes prefixes (`s__`, `g__`, etc.), replaces underscores with spaces. +- `-i`: COMBINED.txt (source for sample-name header); `-o`: target txt file. ### **--txt2csv** (Step 5) -- Converts cleaned text files to CSV. -- Transposes data so that sample names become rows. +- Transposes a processed txt file into a CSV with sample names as rows. ### **--relabund** (Step 6) -- Calculates relative abundance based on total abundance CSV. -- Optionally can group low abundant taxa. +- Calculates relative abundance from a total-counts CSV. +- `-O FLOAT`: group taxa below FLOAT % into `Other ( ## Example Output Structure After running the full pipeline, the output directory will look like this: ``` data/ ├─ kreports/ # Input Kraken2 reports -├─ mpa/ # Converted MPA files -├─ COMBINED.txt # Merged MPA file -├─ counts/ -│ ├─ txt/ # Extracted taxonomic levels in TXT -│ │ ├─ counts_species.txt -│ │ ├─ counts_genus.txt -│ │ ├─ counts_family.txt -│ │ ├─ ... -│ └─ csv/ # Total abundance CSV output -│ ├─ counts_species.csv -│ ├─ counts_genus.csv -│ ├─ counts_family.csv -│ ├─ ... +├─ counts/ # Total abundance CSV output +│ ├─ counts_species.csv +│ ├─ counts_genus.csv +│ ├─ ... +│ └─ counts_phylum.csv ├─ rel_abund/ # Relative abundance CSV output │ ├─ ra_species.csv │ ├─ ra_genus.csv -│ ├─ ra_family.csv │ ├─ ... -└─ diversity/ - ├─ alpha_div.csv - ├─ beta_div_bray.csv - └─ beta_div_jaccard.csv +│ └─ ra_phylum.csv +├─ diversity/ # Diversity metrics +│ ├─ alpha_div.csv +│ ├─ beta_div_bray.csv +│ └─ beta_div_jaccard.csv +└─ intermediate/ # Intermediate files + ├─ mpa/ # Converted MPA files + │ ├─ sample.MPA.TXT + │ ├─ ... + ├─ COMBINED.txt # Merged MPA table + └─ txt/ # Extracted taxonomic levels in TXT + ├─ counts_species.txt + ├─ counts_genus.txt + ├─ ... + └─ counts_phylum.txt ``` ## Conclusion From 9664a56954e4346f687897215511a6c5ee4f3c0c Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Tue, 12 May 2026 09:15:00 +0300 Subject: [PATCH 13/18] ci: pre-install numpy to fix biom-format build on Python 3.14 biom-format uses a legacy setup.py that imports numpy at metadata generation time. Pre-installing numpy ensures it is available before pip resolves scikit-bio's transitive dependencies. --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 553faa6..4f77e5b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 setuptools wheel + python -m pip install flake8 setuptools wheel numpy pip install -e ".[dev]" --no-build-isolation - name: Lint with flake8 run: | From 8d5d457b192c939d1c0a0ed9b02c35807d24d50e Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Tue, 12 May 2026 09:31:00 +0300 Subject: [PATCH 14/18] ci: drop Python 3.14 from matrix; cap requires-python to <=3.13 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit biom-format (transitive dep of scikit-bio) has no cp314 wheel and its legacy setup.py has undeclared build-time deps (numpy, Cython). Pin support to 3.10–3.13 until scikit-bio ecosystem catches up. --- .github/workflows/python-package.yml | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4f77e5b..85b8411 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 setuptools wheel numpy + python -m pip install flake8 setuptools wheel pip install -e ".[dev]" --no-build-isolation - name: Lint with flake8 run: | diff --git a/pyproject.toml b/pyproject.toml index b4f3ed5..3d508f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ description = "A collection of scripts designed to process Kraken2 reports and c readme = {file = "README.md", content-type = "text/markdown"} license = {file = "LICENSE"} authors = [{name = "Ilia Popov", email = "iljapopov17@gmail.com"}] -requires-python = ">=3.10,<=3.16" +requires-python = ">=3.10,<=3.13" dependencies = [ "pandas>=2.0.0,<=2.3.3", "matplotlib>=3.7.0,<=3.10.8", From 06ddafb71cc8d343a44ad2e478307bce56265039 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Tue, 12 May 2026 09:48:00 +0300 Subject: [PATCH 15/18] refactor(stats): replace scikit-bio with scipy for beta diversity scikit-bio pulls in biom-format which has no cp314 wheel and brittle build-time deps. Reimplement subsample_counts in numpy and use scipy.spatial.distance for Bray-Curtis and Jaccard. Restore Python 3.14 support and bump requires-python to <=3.16. --- .github/workflows/python-package.yml | 2 +- krakenparser/stats/diversity.py | 58 ++++++++++++++-------------- pyproject.toml | 4 +- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 85b8411..553faa6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v4 diff --git a/krakenparser/stats/diversity.py b/krakenparser/stats/diversity.py index da3b336..c5ea216 100644 --- a/krakenparser/stats/diversity.py +++ b/krakenparser/stats/diversity.py @@ -1,15 +1,14 @@ #!/usr/bin/env python -import pandas as pd -import numpy as np -import sys import argparse +import sys from pathlib import Path -from skbio.diversity import beta_diversity -from skbio.stats import subsample_counts + +import numpy as np +import pandas as pd +from scipy.spatial.distance import pdist, squareform -# Define Shannon index def shannon_index(counts): counts = np.array(counts) counts = counts[counts > 0] @@ -17,7 +16,6 @@ def shannon_index(counts): return -np.sum(proportions * np.log(proportions)) -# Define Pielou's evenness def pielou_evenness(counts): counts = np.asarray(counts) S = int(np.sum(counts > 0)) @@ -26,7 +24,6 @@ def pielou_evenness(counts): return shannon_index(counts) / np.log(S) -# Define Chao1 richness estimator def chao1_index(counts): counts = np.array(counts) S_obs = np.sum(counts > 0) @@ -37,6 +34,13 @@ def chao1_index(counts): return S_obs + (F1 * F1) / (2 * F2) +def _subsample_counts(counts: np.ndarray, n: int) -> np.ndarray: + """Rarefy counts to n reads by sampling without replacement.""" + indices = np.repeat(np.arange(len(counts)), counts) + sampled = np.random.choice(indices, size=n, replace=False) + return np.bincount(sampled, minlength=len(counts)).astype(int) + + def calc_alpha_div(df, output_path): results = [] for sample_id, row in df.iterrows(): @@ -60,19 +64,23 @@ def calc_beta_div(df, output_path, rarefaction_depth): for sample, row in df.iterrows(): counts = np.round(row.values).astype(int) if counts.sum() >= rarefaction_depth: - rarefied = subsample_counts(counts, n=rarefaction_depth) + rarefied = _subsample_counts(counts, n=rarefaction_depth) rarefied_counts.append(rarefied) sample_ids.append(sample) if len(rarefied_counts) < 2: raise ValueError("Not enough samples passed the rarefaction threshold.") - bray_df = beta_diversity( - "braycurtis", rarefied_counts, ids=sample_ids - ).to_data_frame() - jaccard_df = beta_diversity( - "jaccard", rarefied_counts, ids=sample_ids - ).to_data_frame() + X = np.array(rarefied_counts, dtype=float) + + bray_df = pd.DataFrame( + squareform(pdist(X, metric="braycurtis")), + index=sample_ids, columns=sample_ids, + ) + jaccard_df = pd.DataFrame( + squareform(pdist(X.astype(bool).astype(float), metric="jaccard")), + index=sample_ids, columns=sample_ids, + ) bray_df.to_csv(output_path / "beta_div_bray.csv") jaccard_df.to_csv(output_path / "beta_div_jaccard.csv") @@ -80,20 +88,12 @@ def calc_beta_div(df, output_path, rarefaction_depth): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Calculate α & β-diversities.") - parser.add_argument( - "-i", - "--input", - required=True, - help="Input total count table CSV (species level).", - ) - parser.add_argument("-o", "--output", required=True, help="Output directory path.") - parser.add_argument( - "-d", - "--depth", - type=int, - default=1000, - help="Rarefaction depth for β diversity (default: 1000).", - ) + parser.add_argument("-i", "--input", required=True, + help="Input total count table CSV (species level).") + parser.add_argument("-o", "--output", required=True, + help="Output directory path.") + parser.add_argument("-d", "--depth", type=int, default=1000, + help="Rarefaction depth for β diversity (default: 1000).") args = parser.parse_args() input_file = Path(args.input) diff --git a/pyproject.toml b/pyproject.toml index 3d508f6..7af15d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,13 +9,13 @@ description = "A collection of scripts designed to process Kraken2 reports and c readme = {file = "README.md", content-type = "text/markdown"} license = {file = "LICENSE"} authors = [{name = "Ilia Popov", email = "iljapopov17@gmail.com"}] -requires-python = ">=3.10,<=3.13" +requires-python = ">=3.10,<=3.16" dependencies = [ "pandas>=2.0.0,<=2.3.3", "matplotlib>=3.7.0,<=3.10.8", "numpy>=1.24.0,<=2.3.5", "seaborn>=0.12.0,<=0.13.2", - "scikit-bio>=0.6.0,<=0.7.2", + "scipy>=1.9.0,<=1.16.3", ] [project.optional-dependencies] From 4ce87509c71d3722a2e8b604831b066eca091840 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Tue, 12 May 2026 10:05:00 +0300 Subject: [PATCH 16/18] feat(diversity): add -s/--seed for reproducible rarefaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rarefaction is stochastic — without a fixed seed results differ between runs. Replace np.random.choice with np.random.default_rng and propagate the seed through calc_beta_div and run_pipeline. --- krakenparser/pipeline.py | 8 +++++++- krakenparser/stats/diversity.py | 13 ++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/krakenparser/pipeline.py b/krakenparser/pipeline.py index f0e29c6..02ca6a7 100644 --- a/krakenparser/pipeline.py +++ b/krakenparser/pipeline.py @@ -42,6 +42,7 @@ def run_pipeline( output_dir: str | None = None, keep_human: bool = False, rarefaction_depth: int = 1000, + seed: int | None = None, overwrite: bool = False, ) -> None: source_dir = Path(input_dir) @@ -116,7 +117,7 @@ def run_pipeline( diversity_dir.mkdir(exist_ok=True) df = pd.read_csv(species_csv, index_col=0) calc_alpha_div(df, diversity_dir) - calc_beta_div(df, diversity_dir, rarefaction_depth=rarefaction_depth) + calc_beta_div(df, diversity_dir, rarefaction_depth=rarefaction_depth, seed=seed) _log.info("All steps completed successfully!") @@ -142,6 +143,10 @@ def main() -> None: "-d", "--depth", type=int, default=1000, help="Rarefaction depth for β-diversity (default: 1000)", ) + parser.add_argument( + "-s", "--seed", type=int, default=None, + help="Random seed for reproducible rarefaction (default: random)", + ) parser.add_argument( "--overwrite", action="store_true", default=False, help="Overwrite the output directory if it already exists", @@ -151,6 +156,7 @@ def main() -> None: args.input, args.output, keep_human=args.keep_human, rarefaction_depth=args.depth, + seed=args.seed, overwrite=args.overwrite, ) diff --git a/krakenparser/stats/diversity.py b/krakenparser/stats/diversity.py index c5ea216..ac3c87f 100644 --- a/krakenparser/stats/diversity.py +++ b/krakenparser/stats/diversity.py @@ -34,10 +34,10 @@ def chao1_index(counts): return S_obs + (F1 * F1) / (2 * F2) -def _subsample_counts(counts: np.ndarray, n: int) -> np.ndarray: +def _subsample_counts(counts: np.ndarray, n: int, rng: np.random.Generator) -> np.ndarray: """Rarefy counts to n reads by sampling without replacement.""" indices = np.repeat(np.arange(len(counts)), counts) - sampled = np.random.choice(indices, size=n, replace=False) + sampled = rng.choice(indices, size=n, replace=False) return np.bincount(sampled, minlength=len(counts)).astype(int) @@ -57,14 +57,15 @@ def calc_alpha_div(df, output_path): alpha_df.to_csv(output_path / "alpha_div.csv") -def calc_beta_div(df, output_path, rarefaction_depth): +def calc_beta_div(df, output_path, rarefaction_depth, seed=None): + rng = np.random.default_rng(seed) rarefied_counts = [] sample_ids = [] for sample, row in df.iterrows(): counts = np.round(row.values).astype(int) if counts.sum() >= rarefaction_depth: - rarefied = _subsample_counts(counts, n=rarefaction_depth) + rarefied = _subsample_counts(counts, n=rarefaction_depth, rng=rng) rarefied_counts.append(rarefied) sample_ids.append(sample) @@ -94,6 +95,8 @@ def calc_beta_div(df, output_path, rarefaction_depth): help="Output directory path.") parser.add_argument("-d", "--depth", type=int, default=1000, help="Rarefaction depth for β diversity (default: 1000).") + parser.add_argument("-s", "--seed", type=int, default=None, + help="Random seed for reproducible rarefaction (default: random).") args = parser.parse_args() input_file = Path(args.input) @@ -105,7 +108,7 @@ def calc_beta_div(df, output_path, rarefaction_depth): df = pd.read_csv(input_file, index_col=0) calc_alpha_div(df, output_dir) - calc_beta_div(df, output_dir, args.depth) + calc_beta_div(df, output_dir, args.depth, seed=args.seed) print( f"α & β-diversities have been successfully calculated and saved to '{output_dir}'." ) From 185e8d4831e6e1710e6d67b274ac2b5afe5b3007 Mon Sep 17 00:00:00 2001 From: Ilia Popov Date: Tue, 12 May 2026 10:08:00 +0300 Subject: [PATCH 17/18] docs: document -s/--seed flag in Quick Start and diversity step --- README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0349c7e..f8235f2 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,11 @@ KrakenParser --complete -i data/kreports -o results/ #Having troubles? Run KrakenParser --complete -h ``` +For **reproducible** β-diversity (rarefaction is stochastic by default): +```bash +KrakenParser -i data/kreports -o results/ -s 42 +``` + This will: 1. Convert Kraken2 reports to MPA format 2. Combine MPA files into a single file @@ -218,9 +223,14 @@ KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity #Having troubles? Run KrakenParser --diversity -h ``` -With a custom rarefaction depth for β-diversity: +With a custom rarefaction depth: +```bash +KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -d 750 +``` + +For reproducible results (rarefaction uses random subsampling — fix the seed to get the same matrix every run): ```bash -KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity --depth 750 +KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -s 42 ``` --- @@ -231,6 +241,7 @@ KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity --d - Requires `-i`: path to the Kraken2 reports directory (e.g., `data/kreports`). - Optional `-o`: output directory (default: parent of `-i`). - Optional `--keep-human`: retain human-related taxa (default: filtered out). +- Optional `-s INT`: random seed for reproducible β-diversity rarefaction (default: random). ### **--kreport2mpa** (Step 1) - Batch mode: `-i DIR -o DIR` — converts all files in a directory. @@ -260,6 +271,7 @@ KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity --d - Shannon, Pielou & Chao1 for α-diversity. - Bray-Curtis & Jaccard for β-diversity. - `-d INT`: rarefaction depth for β-diversity (default: 1000). +- `-s INT`: random seed for reproducible rarefaction (default: random — results vary between runs).
From 14a21e2a23e5bcfcff1d614ee5b4b558b0651864 Mon Sep 17 00:00:00 2001 From: Ilya Popov Date: Tue, 12 May 2026 15:49:12 +0200 Subject: [PATCH 18/18] Update README.md --- README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f8235f2..898b34e 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ You can run the entire pipeline with **a single command**, or use the scripts ** ### Total abundance output -`counts_phylum.csv` parsed from 7 kraken2 reports of metagenomic samples using `KrakenParser`: +`counts_phylum.csv` parsed from 9 kraken2 reports of metagenomic samples using `KrakenParser`: ``` Sample_id,Calditrichota,Caldisericota,Thermosulfidibacterota,Elusimicrobiota,Candidatus Fervidibacterota,Lentisphaerota,Kiritimatiellota,Vulcanimicrobiota,Thermodesulfobiota,Atribacterota,Dictyoglomota,Nitrospinota,Chrysiogenota,Coprothermobacterota,Aquificota,Thermotogota,Bdellovibrionota,Nitrospirota,Deferribacterota,Synergistota,Myxococcota,Acidobacteriota,Candidatus Bipolaricaulota,Candidatus Saccharibacteria,Candidatus Absconditabacteria,Fusobacteriota,Spirochaetota,Candidatus Omnitrophota,Chlamydiota,Verrucomicrobiota,Planctomycetota,Thermodesulfobacteriota,Campylobacterota,Candidatus Cloacimonadota,Fibrobacterota,Gemmatimonadota,Balneolota,Rhodothermota,Ignavibacteriota,Chlorobiota,Bacteroidota,Deinococcota,Thermomicrobiota,Armatimonadota,Chloroflexota,Cyanobacteriota,Mycoplasmatota,Actinomycetota,Bacillota,Pseudomonadota,Heterolobosea,Parabasalia,Fornicata,Evosea,Bacillariophyta,Cercozoa,Euglenozoa,Apicomplexa,Microsporidia,Basidiomycota,Ascomycota,Nanoarchaeota,Candidatus Micrarchaeota,Candidatus Thermoplasmatota,Candidatus Lokiarchaeota,Nitrososphaerota,Euryarchaeota,Thermoproteota,Hofneiviricota,Artverviricota,Nucleocytoviricota,Cossaviricota,Kitrinoviricota,Negarnaviricota,Lenarviricota,Pisuviricota,Peploviricota,Uroviricota @@ -39,7 +39,7 @@ X9,0,3,2,16,7,1,23,12,10,9,1,2,134,40,390,289,29,372,27,81,150,90,9,88,32,287,88 ### Relative abundance output -`ra_phylum.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`: +`ra_phylum.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: ``` Sample_id,taxon,rel_abund_perc @@ -61,7 +61,7 @@ X9,Other (<4.0%),1.8979510606688494 ### α-diversity output -`alpha_div.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`: +`alpha_div.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: ``` Sample,Shannon,Pielou,Chao1 @@ -74,7 +74,7 @@ X9,4.033664950188261,0.5050385978575492,3492.16 ### β-diversity output -`beta_div_bray.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`: +`beta_div_bray.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: ``` ,X1,X2,...,X8,X9 @@ -85,7 +85,7 @@ X8,0.61,0.723,...,0.0,0.665 X9,0.353,0.388,...,0.665,0.0 ``` -`beta_div_jaccard.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`: +`beta_div_jaccard.csv` calculated from 9 kraken2 reports of metagenomic samples using `KrakenParser`: ``` ,X1,X2,...,X8,X9 @@ -278,8 +278,7 @@ KrakenParser --diversity -i data/counts/counts_species.csv -o data/diversity -s ## Example Output Structure After running the full pipeline, the output directory will look like this: ``` -data/ -├─ kreports/ # Input Kraken2 reports +results/ ├─ counts/ # Total abundance CSV output │ ├─ counts_species.csv │ ├─ counts_genus.csv @@ -296,7 +295,7 @@ data/ │ └─ beta_div_jaccard.csv └─ intermediate/ # Intermediate files ├─ mpa/ # Converted MPA files - │ ├─ sample.MPA.TXT + │ ├─ {sample}.txt │ ├─ ... ├─ COMBINED.txt # Merged MPA table └─ txt/ # Extracted taxonomic levels in TXT