Merge pull request #22 from RIVM-bioinformatics/dev

florianzwagemaker · web-flow · commit f631d7227dc5 · 2025-11-03T14:43:57.000+01:00
Use AminoExtract 0.4.1
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -4,26 +4,32 @@ on:
     branches:
       - main
 
+permissions:
+  contents: write
+  pull-requests: write
+
 jobs:
   bump-version:
     name: Release version
     runs-on: ubuntu-latest
 
     steps:
-      - uses: GoogleCloudPlatform/release-please-action@v2
+      - uses: GoogleCloudPlatform/release-please-action@v4
         id: release
         with:
           release-type: python
-          package-name: TrueConsense
+          token: ${{ secrets.RELEASE_TOKEN }}
+          config-file: release-please-config.json
+          manifest-file: .release-please-manifest.json
 
   update-docs:
     needs: bump-version
     name: Update docs
     runs-on: ubuntu-latest
-    if: "contains(github.event.head_commit.message, 'chore:')"
+    if: contains(github.event.head_commit.message, 'chore(main):')
     steps:
       - name: Checkout main
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -38,17 +44,15 @@ jobs:
         run: |
           git config --global user.name "Github Actions"
           git config --global user.email '41898282+github-actions[bot]@users.noreply.github.com'
-
       - name: Setup Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
         with:
-          python-version: '3.7'
+          python-version: '3.11'
 
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install -r docs-requirements.txt
-
       - name: Publish docs
         run: |
-          mike deploy --config-file mkdocs.yml --push --force --update-aliases $(git tag --sort=committerdate | tail -1 | sed 's/v//') latest
+          mike deploy --config-file mkdocs.yml --push --update-aliases $(git tag --sort=committerdate | tail -1 | sed 's/v//') latest
diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml
@@ -4,26 +4,32 @@ on:
     branches:
       - main
 
+permissions:
+  contents: write
+  pull-requests: write
+
 jobs:
   sync-branches:
     runs-on: ubuntu-latest
     name: Syncing branches
-    if: "contains(github.event.head_commit.message, 'chore:')"
+    # if: contains(github.event.head_commit.message, 'chore(main):')
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
 
       - name: Set up Node
-        uses: actions/setup-node@v1
+        uses: actions/setup-node@v4
         with:
-          node-version: 12
+          node-version: 20
 
+# NOTE: jdtx0/branch-sync is a fork of the original tretuna/sync-branches action, which appears to be abandoned. We may need a better long-term solution.
       - name: Opening pull request
         id: pull
-        uses: tretuna/sync-branches@1.4.0
+        uses: jdtx0/branch-sync@v1.5.1
         with:
           GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
           FROM_BRANCH: "main"
           TO_BRANCH: "dev"
           PULL_REQUEST_TITLE: "chore: sync main to dev"
+          PULL_REQUEST_AUTO_MERGE_METHOD: "squash"
diff --git a/.release-please-manifest.json b/.release-please-manifest.json
@@ -0,0 +1,3 @@
+{
+  ".": "0.5.1"
+}
diff --git a/TrueConsense/Outputs.py b/TrueConsense/Outputs.py
@@ -8,7 +8,7 @@
 from .Events import ListInserts
 from .indexing import Readbam
 from .Sequences import BuildConsensus
-
+from AminoExtract.gff_data import GFFColumns
 
 def WriteGFF(gffheader, gffdict, output_gff, name):
     """Function takes a GFF header, a dictionary of GFF features, an output directory, and a name for
@@ -26,16 +26,49 @@ def WriteGFF(gffheader, gffdict, output_gff, name):
         the name of the file you want to write
 
     """
+    cols = GFFColumns.get_names()
+    cols_without_attr = [col for col in cols if col != "attributes"]
+
+    def combine_dict_into_attributes(input_dict: dict[str, str]) -> str:
+        attribute_dict = {}
+        for k, v in input_dict.items():
+            if k == "attributes":
+                for attribute in v.split(";"):
+                    if attribute == "":
+                        continue
+                    key, value = attribute.split("=")
+                    attribute_dict[key] = value
+            else:
+                attribute_dict[k] = v
+
+        return ";".join(f"{k}={v}" for k, v in attribute_dict.items())
+
+    def clean_dict(input_dict: dict[str, str]) -> dict[str, str]:
+        clean_dict = {}
+        attribute_dict = {}
+        for k, v in input_dict.items():
+            if str(k).lower() not in cols_without_attr:
+                attribute_dict[str(k).lower()] = str(v)
+            else:
+                clean_dict[str(k).lower()] = str(v)
+
+        attribute_str = combine_dict_into_attributes(attribute_dict)
+        clean_dict["attributes"] = attribute_str
+        assert list(clean_dict.keys()) == cols
+        return clean_dict
+
+
+    # gffdict is keyed by 0, 1, 2, etc., with one entry per line in the GFF file.
+    # Each value is a dictionary holding the GFF columns for that line, plus many additional columns.
+    # clean_dict() folds all of those additional columns into the "attributes" column.
+
     with open(output_gff, "w") as out:
-        out.write(gffheader)
+        out.write(gffheader.raw_text)
+
 
-        for k, v in gffdict.items():
-            for nk, nv in v.items():
-                if str(nk) == str(list(v.keys())[-1]):
-                    out.write(str(nv))
-                else:
-                    out.write(str(nv) + "\t")
-            out.write("\n")
+        for line_number, gff_data in gffdict.items():
+            cleaned_data = clean_dict(gff_data)
+            out.write("\t".join([str(v) for v in cleaned_data.values()]) + "\n")
 
 
 def WriteOutputs(
@@ -65,6 +98,7 @@ def WriteOutputs(
     )[0]
 
     if output_gff is not None:
+        
         WriteGFF(gffheader, newgff, output_gff, name)
 
     if output_vcf is not None:
diff --git a/TrueConsense/Sequences.py b/TrueConsense/Sequences.py
@@ -102,8 +102,15 @@ def _orf_codonposition(gffdict, p):
         end = gffdict[k].get("end") + 1
 
         if p in range(start, end):
-            a.append(str(gffdict[k].get("attributes").split(";")[1].split("=")[-1]))
-            a.append((p - start) % 3)
+            # "attributes" may be missing or an empty string; the feature is skipped in both cases
+            attr = gffdict[k].get("attributes", "")
+            if not attr:
+                continue
+            split_attr = attr.split(";")
+            if len(split_attr) < 2:
+                a.extend((str(attr.split(";")[0].split("=")[-1]), (p - start) % 3))
+                continue
+            a.extend((str(attr.split(";")[1].split("=")[-1]), (p - start) % 3))
     if a:
         return tuple(a)
     return None, None
diff --git a/TrueConsense/TrueConsense.py b/TrueConsense/TrueConsense.py
@@ -209,53 +209,56 @@ def check_index_override(fname):
     return args
 
 
-def main():
-    if len(sys.argv[1:]) < 1:
+def main(args: list[str] | None = None):
+    if not args:
+        args = sys.argv[1:]
+
+    if len(args) < 1:
         print(
             "TrueConsense was called but no arguments were given, please try again.\nUse 'TrueConsense -h' to see the help document"
         )
         sys.exit(1)
-    args = GetArgs(sys.argv[1:])
+    parsed_args = GetArgs(args)
 
-    bam = Readbam(args.input)
+    bam = Readbam(parsed_args.input)
 
-    with cf.ThreadPoolExecutor(max_workers=args.threads) as xc:
-        IndexDF = xc.submit(BuildIndex, args.input, args.reference)
-        IndexGff = xc.submit(Gffindex, args.features)
+    with cf.ThreadPoolExecutor(max_workers=parsed_args.threads) as xc:
+        IndexDF = xc.submit(BuildIndex, parsed_args.input, parsed_args.reference)
+        IndexGff = xc.submit(Gffindex, parsed_args.features)
 
         IndexDF = IndexDF.result()
         IndexGff = IndexGff.result()
 
-    if args.index_override:
+    if parsed_args.index_override:
         IndexDF = Override_index_positions(
-            IndexDF, read_override_index(args.index_override)
+            IndexDF, read_override_index(parsed_args.index_override)
         )
 
     indexDict = IndexDF.to_dict("index")
     GffHeader = IndexGff.header
     GffDF = IndexGff.df
-    GffDF["seq_id"] = args.samplename
+    GffDF["seqid"] = parsed_args.samplename
     GffDict = GffDF.to_dict("index")
 
-    with cf.ThreadPoolExecutor(max_workers=args.threads) as xc:
-        if args.depth_of_coverage is not None:
-            xc.submit(BuildCoverage, indexDict, args.depth_of_coverage)
+    with cf.ThreadPoolExecutor(max_workers=parsed_args.threads) as xc:
+        if parsed_args.depth_of_coverage is not None:
+            xc.submit(BuildCoverage, indexDict, parsed_args.depth_of_coverage)
 
-    if args.noambiguity is False:
+    if parsed_args.noambiguity is False:
         IncludeAmbig = True
-    elif args.noambiguity is True:
+    elif parsed_args.noambiguity is True:
         IncludeAmbig = False
 
     WriteOutputs(
-        args.coverage_level,
+        parsed_args.coverage_level,
         indexDict,
         GffDict,
-        args.input,
+        parsed_args.input,
         IncludeAmbig,
-        args.variants,
-        args.samplename,
-        args.reference,
-        args.output_gff,
+        parsed_args.variants,
+        parsed_args.samplename,
+        parsed_args.reference,
+        parsed_args.output_gff,
         GffHeader,
-        args.output,
+        parsed_args.output,
     )
diff --git a/TrueConsense/indexing.py b/TrueConsense/indexing.py
@@ -1,4 +1,4 @@
-import gffpandas.gffpandas as gffpd
+from AminoExtract import SequenceReader, GFFDataFrame
 import pandas as pd
 import pysam
 
@@ -19,7 +19,7 @@ def Readbam(f):
     return pysam.AlignmentFile(f, "rb")
 
 
-def Gffindex(file):
+def Gffindex(file: str) -> GFFDataFrame:
     """Reads in a GFF3 file and returns a pandas dataframe
 
     Parameters
@@ -32,7 +32,8 @@ def Gffindex(file):
         A dataframe
 
     """
-    return gffpd.read_gff3(file)
+    reader = SequenceReader(logger=None)
+    return reader.read_gff(file)
 
 
 def read_override_index(f):
diff --git a/docs/installation.md b/docs/installation.md
@@ -5,18 +5,17 @@ TrueConsense is only available on Linux (or Linux-based) operating systems. MacO
 TrueConsense will be made available for installation through Conda and Pip. However, this is currently not yet available.
 We will update these docs when installation through Conda and/or pip is available.
 
-## Prerequisistes
+## Prerequisites
 
-TrueConsense requires Python 3.7 or later to be installed on your system (or in an environment).
+TrueConsense requires Python 3.10 or later to be installed on your system (or in an environment).
 
 Other dependencies will be installed during the installation, you don't have to install them manually. These extra dependencies are as follows:
 
-* pysam<0.16
-* pandas>=1.2.3
-* gffpandas>=1.2.0
-* parmap>=1.5.2
-* tqdm>=4.59.0
-* biopython>=1.78
+* pysam==0.23.3
+* pandas==2.3.*
+* tqdm==4.59.*
+* biopython==1.85
+* aminoextract==0.4.1
 
 We strongly advise you to use a conda environment (or similar) to make sure there won't be any conflicts in package dependencies.
 
diff --git a/env.yml b/env.yml
@@ -2,16 +2,11 @@ name: TrueConsense
 channels:
   - bioconda
   - conda-forge
-  - intel
-  - anaconda
-  - defaults
 dependencies:
-  - python=3.7
-  - pandas>=1.2.3
-  - pysam<0.16
-  - biopython>=1.78
+  - python>=3.10
+  - pandas==2.3.*
+  - pysam==0.23.3
+  - biopython==1.85
   - tqdm==4.59.0
-  - biopython>=1.78
-  - pip
-  - pip:
-    - gffpandas>=1.2
+  - aminoextract==0.4.1
+  - pytest==8.4.*
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/release-please-config.json b/release-please-config.json
diff --git a/setup.py b/setup.py