Skip to content

Commit d9f0316

Browse files
authored
Merge pull request #48 from gbouras13/dev
v1.6.1
2 parents e4b5e81 + 04cf15b commit d9f0316

10 files changed

Lines changed: 120 additions & 75 deletions

File tree

.github/workflows/release.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,12 @@ jobs:
2626
shell: bash -l {0}
2727
run: |
2828
python -m pip install -U pip
29-
just install
29+
pip install -e .
30+
pip install black
31+
pip install isort
32+
pip install pytest
33+
pip install pytest-cov
34+
pip install git+https://github.com/rrwick/Unicycler.git
3035
- name: Build a binary wheel and a source tarball
3136
shell: bash -l {0}
3237
run: just build

HISTORY.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
# History
22

3+
1.6.1 (2024-03-02)
4+
------------------
5+
6+
* Bug fixes and added tests for `--depth_filter` which would crash in some scenarios.
7+
38
1.6.0 (2024-03-01)
49
------------------
510

6-
* Addes `--depth_filter`. This will filter out all contigs that have long- (and short-read for `plassembler run`) read copy numbers that are less than the specified depth filter. Defaults to 0.25x.
11+
* Adds `--depth_filter`. This will filter out all contigs that have long- (and short-read for `plassembler run`) read copy numbers that are less than the specified depth filter. Defaults to 0.25x.
712
* Adds `--unicycler_options` and `--spades_options` which allows passing extra Unicycler options (#46)
813

914
1.5.1 (2024-02-01)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "plassembler"
3-
version = "1.6.0" # change VERSION too
3+
version = "1.6.1" # change VERSION too
44
description = "Quickly and accurately assemble plasmids in hybrid sequenced bacterial isolates"
55
authors = ["George Bouras <george.bouras@adelaide.edu.au>"]
66
license = "MIT"

src/plassembler/utils/VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.6.0
1+
1.6.1

src/plassembler/utils/plass_class.py

Lines changed: 76 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -705,62 +705,51 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
705705
combined_depth_mash_df["contig"].str.contains("chromosome"), "PLSDB_hit"
706706
] = ""
707707

708-
# filter the dataframe by depth filter
709-
all_contig_ids = combined_depth_mash_df["contig"].astype(str).tolist()
708+
# get chroms and plasmids
710709

711-
if "mean_depth_short" in combined_depth_mash_df.columns:
712-
combined_depth_mash_df = combined_depth_mash_df[
710+
combined_chrom_df = combined_depth_mash_df[
711+
combined_depth_mash_df["contig"].str.contains("chromosome")
712+
]
713+
combined_plasmid_df = combined_depth_mash_df[
714+
~combined_depth_mash_df["contig"].str.contains("chromosome")
715+
]
716+
717+
# get all plasmid contig ids and then filter
718+
all_plasmid_contig_ids = combined_plasmid_df["contig"].astype(str).tolist()
719+
720+
if "mean_depth_short" in combined_plasmid_df.columns:
721+
combined_plasmid_df = combined_plasmid_df[
713722
(
714-
combined_depth_mash_df["plasmid_copy_number_long"].astype(float)
723+
combined_plasmid_df["plasmid_copy_number_long"].astype(float)
715724
> depth_filter
716725
)
717726
| (
718-
combined_depth_mash_df["plasmid_copy_number_short"].astype(float)
727+
combined_plasmid_df["plasmid_copy_number_short"].astype(float)
719728
> depth_filter
720729
)
721-
| (
722-
combined_depth_mash_df["contig"].str.contains(
723-
"chromosome", case=False
724-
)
725-
)
726730
]
727731
else: # plassembler long (long only)
728-
combined_depth_mash_df = combined_depth_mash_df[
732+
combined_plasmid_df = combined_plasmid_df[
729733
(
730-
combined_depth_mash_df["plasmid_copy_number_long"].astype(float)
734+
combined_plasmid_df["plasmid_copy_number_long"].astype(float)
731735
> depth_filter
732736
)
733-
| (
734-
combined_depth_mash_df["contig"].str.contains(
735-
"chromosome", case=False
736-
)
737-
)
738737
]
739738

740-
# get list of all ids that were kept above threshold
741-
kept_contig_ids = combined_depth_mash_df["contig"].astype(str).tolist()
739+
# get list of all ids that were kept above depth threshold
740+
kept_plasmid_contig_ids = combined_plasmid_df["contig"].astype(str).tolist()
742741

743742
# List of 'contig_ids' that were filtered out
744743
filtered_out_contig_ids = [
745744
contig_id
746-
for contig_id in all_contig_ids
747-
if contig_id not in kept_contig_ids
748-
]
749-
750-
# Identify the index of the first row that doesn't contain 'chromosome' in 'contig_id' - first plasmid index
751-
# None otherwise
752-
filtered_indices = combined_depth_mash_df.index[
753-
~combined_depth_mash_df["contig"].str.contains("chromosome")
745+
for contig_id in all_plasmid_contig_ids
746+
if contig_id not in kept_plasmid_contig_ids
754747
]
755748

756-
if not filtered_indices.empty:
757-
p1_idx = filtered_indices.min()
758-
else:
759-
p1_idx = None
760-
749+
# logging
761750
# needs to be at least 1 filtered out id if the filtering did anything
762751
logger.info(f"Filtering contigs below depth filter: {depth_filter}.")
763-
if "mean_depth_short" in combined_depth_mash_df.columns:
752+
if "mean_depth_short" in combined_plasmid_df.columns:
764753
logger.info(
765754
f"All plasmids whose short and long read copy numbers are both below {depth_filter} will be removed."
766755
)
@@ -770,24 +759,24 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
770759
)
771760

772761
if len(filtered_out_contig_ids) > 0:
773-
if p1_idx is None:
762+
if len(kept_plasmid_contig_ids) == 0:
774763
logger.warning(f"There are 0 plasmids left after depth filtering.")
775764
else:
776765
logger.info(
777-
f"{len(filtered_indices)} plasmids were filtered as they were below the depth filter."
766+
f"{len(filtered_out_contig_ids)} plasmids were filtered as they were below the depth filter."
778767
)
779-
780-
# Updating 'contig_id' names starting from 1 from the identified index
781-
# if it is None then there are only chromosome contigs left so no need for this
782-
if p1_idx is not None:
783-
combined_depth_mash_df.loc[p1_idx:, "contig"] = range(
784-
1, len(combined_depth_mash_df) - p1_idx + 2
785-
)
786-
# Reset index after renaming
787-
combined_depth_mash_df.reset_index(drop=True, inplace=True)
788768
else:
789769
logger.info(f"No plasmids were filtered due to low depth.")
790770

771+
# concat dfs back
772+
# there is 1+ plasmid
773+
if len(kept_plasmid_contig_ids) > 0:
774+
combined_depth_mash_df = pd.concat(
775+
[combined_chrom_df, combined_plasmid_df], axis=0
776+
)
777+
else: # only chroms
778+
combined_depth_mash_df = combined_chrom_df
779+
791780
combined_depth_mash_df.to_csv(
792781
os.path.join(outdir, prefix + "_summary.tsv"), sep="\t", index=False
793782
)
@@ -796,7 +785,7 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
796785

797786
def finalise_contigs(self, prefix):
798787
"""
799-
Renames the contigs of unicycler with the new plasmid copy numbers and outputs finalised file
788+
Filters contigs plassembler run
800789
"""
801790
outdir = self.outdir
802791

@@ -808,23 +797,42 @@ def finalise_contigs(self, prefix):
808797
].reset_index(drop=True)
809798
# get contigs only
810799
plasmid_fasta = os.path.join(outdir, "unicycler_output", "assembly.fasta")
811-
i = 0
812800
with open(os.path.join(outdir, prefix + "_plasmids.fasta"), "w") as dna_fa:
813801
for dna_record in SeqIO.parse(plasmid_fasta, "fasta"):
814802
split_desc = dna_record.description.split(" ")
803+
contig_id = str(split_desc[0])
815804
# only keep the contigs that passed the depth threshold
816-
if str(split_desc[0]) not in filtered_out_contig_ids:
805+
if contig_id not in filtered_out_contig_ids:
806+
sr_copy_number_list = combined_depth_mash_df.loc[
807+
combined_depth_mash_df["contig"] == contig_id,
808+
"plasmid_copy_number_short",
809+
].values
810+
lr_copy_number_list = combined_depth_mash_df.loc[
811+
combined_depth_mash_df["contig"] == contig_id,
812+
"plasmid_copy_number_long",
813+
].values
814+
if len(sr_copy_number_list) > 0:
815+
sr_copy_number = sr_copy_number_list[0]
816+
else:
817+
logger.error("Plassembler failed")
818+
819+
if len(lr_copy_number_list) > 0:
820+
lr_copy_number = lr_copy_number_list[0]
821+
else:
822+
logger.error("Plassembler failed")
823+
824+
# will be ordered so new_contig_count will be the index of the df
825+
# but 1 index for the output
817826
if "circular" in dna_record.description: # circular contigs
818-
id_updated = f"{i} {split_desc[1]} plasmid_copy_number_short={combined_depth_mash_df.plasmid_copy_number_short[i]}x plasmid_copy_number_long={combined_depth_mash_df.plasmid_copy_number_long[i]}x circular=true"
827+
id_updated = f"{contig_id} {split_desc[1]} plasmid_copy_number_short={sr_copy_number}x plasmid_copy_number_long={lr_copy_number}x circular=true"
819828
else: # non circular contigs
820-
id_updated = f"{i} {split_desc[1]} plasmid_copy_number_short={combined_depth_mash_df.plasmid_copy_number_short[i]}x plasmid_copy_number_long={combined_depth_mash_df.plasmid_copy_number_long[i]}x"
821-
i += 1
829+
id_updated = f"{contig_id} {split_desc[1]} plasmid_copy_number_short={sr_copy_number}x plasmid_copy_number_long={lr_copy_number}x"
822830
record = SeqRecord(dna_record.seq, id=id_updated, description="")
823831
SeqIO.write(record, dna_fa, "fasta")
824832

825833
def finalise_contigs_long(self, prefix):
826834
"""
827-
Renames the contigs of assembly with new ones
835+
Filters contigs plassembler long
828836
"""
829837
outdir = self.outdir
830838

@@ -836,21 +844,30 @@ def finalise_contigs_long(self, prefix):
836844
].reset_index(drop=True)
837845
# get contigs only
838846
plasmid_fasta = os.path.join(outdir, "plasmids.fasta")
839-
i = 0
840847
with open(os.path.join(outdir, prefix + "_plasmids.fasta"), "w") as dna_fa:
841848
for dna_record in SeqIO.parse(plasmid_fasta, "fasta"):
842849
# only keep the contigs that passed the depth threshold
843-
if str(dna_record.id) not in filtered_out_contig_ids:
850+
contig_id = str(dna_record.id)
851+
if contig_id not in filtered_out_contig_ids:
844852
length = len(dna_record.seq)
845-
copy_number = combined_depth_mash_df.plasmid_copy_number_long[i]
853+
lr_copy_number_list = combined_depth_mash_df.loc[
854+
combined_depth_mash_df["contig"] == contig_id,
855+
"plasmid_copy_number_long",
856+
].values
857+
if len(lr_copy_number_list) > 0:
858+
lr_copy_number = lr_copy_number_list[0]
859+
else:
860+
logger.error("Plassembler failed")
861+
846862
if (
847863
"circular" in dna_record.description
848864
): # circular contigs from canu
849-
desc = f"len={length} plasmid_copy_number_long={copy_number}x circular=True"
865+
desc = f"len={length} plasmid_copy_number_long={lr_copy_number}x circular=True"
850866
else:
851-
desc = f"len={length} plasmid_copy_number_long={copy_number}x"
852-
i += 1
853-
record = SeqRecord(dna_record.seq, id=str(i), description=desc)
867+
desc = (
868+
f"len={length} plasmid_copy_number_long={lr_copy_number}x"
869+
)
870+
record = SeqRecord(dna_record.seq, id=contig_id, description=desc)
854871
SeqIO.write(record, dna_fa, "fasta")
855872

856873

4.7 KB
Binary file not shown.
6.5 KB
Binary file not shown.
7.75 MB
Binary file not shown.
3.24 MB
Binary file not shown.

tests/test_end_to_end.py

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -124,17 +124,6 @@ def test_plassembler_case_3(self):
124124
exec_command(cmd)
125125
remove_directory(outdir)
126126

127-
def test_plassembler_case_3(self):
128-
"""test plassembler run - chromosome and plasmids assembled with Flye"""
129-
longreads: Path = f"{end_to_end}/input_fastq.gz"
130-
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
131-
s2: Path = f"{end_to_end}/input_R2.fastq.gz"
132-
chromosome = 50000
133-
outdir: Path = f"{end_to_end}/test_out"
134-
cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir} -t 8 -f"
135-
exec_command(cmd)
136-
remove_directory(outdir)
137-
138127
def test_plassembler_case_4(self):
139128
"""test plassembler run case 4. Only chromosome assembled with flye, no plasmid in recovery."""
140129
longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
@@ -159,7 +148,7 @@ def test_plassembler_multiple_chromosomes_no_plasmids(self):
159148

160149
# copy number
161150

162-
def test_plassembler_case_depth_filter(self):
151+
def test_plassembler_case_depth_filter_all(self):
163152
"""test plassembler run depth_filter 1.2 - will have no plasmids left"""
164153
longreads: Path = f"{end_to_end}/input_fastq.gz"
165154
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
@@ -170,6 +159,17 @@ def test_plassembler_case_depth_filter(self):
170159
exec_command(cmd)
171160
remove_directory(outdir)
172161

162+
def test_plassembler_case_depth_filter_some(self):
163+
"""test plassembler run depth_filter 0.6 with input_half lr - will have only 1 plasmid"""
164+
longreads: Path = f"{end_to_end}/input_half.fastq.gz"
165+
s1: Path = f"{end_to_end}/input_R1.fastq.gz"
166+
s2: Path = f"{end_to_end}/input_R2.fastq.gz"
167+
chromosome = 50000
168+
outdir: Path = f"{end_to_end}/test_out"
169+
cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir} -t 8 -f --depth_filter 0.6"
170+
exec_command(cmd)
171+
remove_directory(outdir)
172+
173173
def test_plassembler_case_extra_unicycler_spades_opts(self):
174174
"""test plassembler with extra unicycler and spades opts"""
175175
longreads: Path = f"{end_to_end}/input_fastq.gz"
@@ -263,6 +263,24 @@ def test_plassembler_long(self):
263263
exec_command(cmd)
264264
remove_directory(outdir)
265265

266+
def test_plassembler_depth_all(self):
267+
"""test plassembler long depth filter will all plasmids filtered"""
268+
longreads: Path = f"{end_to_end}/input_fastq.gz"
269+
chromosome = 50000
270+
outdir: Path = f"{end_to_end}/test_out"
271+
cmd = f"plassembler long -l {longreads} -c {chromosome} -d {plassembler_db_dir} -o {outdir} --depth_filter 10 -t 8 -f"
272+
exec_command(cmd)
273+
remove_directory(outdir)
274+
275+
def test_plassembler_depth_some(self):
276+
"""test plassembler long depth filter will some plasmids filtered"""
277+
longreads: Path = f"{end_to_end}/input_depth_filter.fastq.gz"
278+
chromosome = 50000
279+
outdir: Path = f"{end_to_end}/test_out"
280+
cmd = f"plassembler long -l {longreads} -c {chromosome} -d {plassembler_db_dir} -o {outdir} --depth_filter 2 -t 8 -f"
281+
exec_command(cmd)
282+
remove_directory(outdir)
283+
266284
# def test_plassembler_long_canu(self):
267285
# """test plassembler long canu"""
268286
# longreads: Path = f"{end_to_end}/input_fastq.gz"

0 commit comments

Comments
 (0)