@@ -705,62 +705,51 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
705705 combined_depth_mash_df ["contig" ].str .contains ("chromosome" ), "PLSDB_hit"
706706 ] = ""
707707
708- # filter the dataframe by depth filter
709- all_contig_ids = combined_depth_mash_df ["contig" ].astype (str ).tolist ()
708+ # get chroms and plasmids
710709
711- if "mean_depth_short" in combined_depth_mash_df .columns :
712- combined_depth_mash_df = combined_depth_mash_df [
710+ combined_chrom_df = combined_depth_mash_df [
711+ combined_depth_mash_df ["contig" ].str .contains ("chromosome" )
712+ ]
713+ combined_plasmid_df = combined_depth_mash_df [
714+ ~ combined_depth_mash_df ["contig" ].str .contains ("chromosome" )
715+ ]
716+
717+ # get all plasmid contig ids and then filter
718+ all_plasmid_contig_ids = combined_plasmid_df ["contig" ].astype (str ).tolist ()
719+
720+ if "mean_depth_short" in combined_plasmid_df .columns :
721+ combined_plasmid_df = combined_plasmid_df [
713722 (
714- combined_depth_mash_df ["plasmid_copy_number_long" ].astype (float )
723+ combined_plasmid_df ["plasmid_copy_number_long" ].astype (float )
715724 > depth_filter
716725 )
717726 | (
718- combined_depth_mash_df ["plasmid_copy_number_short" ].astype (float )
727+ combined_plasmid_df ["plasmid_copy_number_short" ].astype (float )
719728 > depth_filter
720729 )
721- | (
722- combined_depth_mash_df ["contig" ].str .contains (
723- "chromosome" , case = False
724- )
725- )
726730 ]
727731 else : # plassembler long (long only)
728- combined_depth_mash_df = combined_depth_mash_df [
732+ combined_plasmid_df = combined_plasmid_df [
729733 (
730- combined_depth_mash_df ["plasmid_copy_number_long" ].astype (float )
734+ combined_plasmid_df ["plasmid_copy_number_long" ].astype (float )
731735 > depth_filter
732736 )
733- | (
734- combined_depth_mash_df ["contig" ].str .contains (
735- "chromosome" , case = False
736- )
737- )
738737 ]
739738
740- # get list of all ids that were kept above threshold
741- kept_contig_ids = combined_depth_mash_df ["contig" ].astype (str ).tolist ()
739+ # get list of all ids that were kept above depth threshold
740+ kept_plasmid_contig_ids = combined_plasmid_df ["contig" ].astype (str ).tolist ()
742741
743742 # List of 'contig_ids' that were filtered out
744743 filtered_out_contig_ids = [
745744 contig_id
746- for contig_id in all_contig_ids
747- if contig_id not in kept_contig_ids
748- ]
749-
750- # Identify the index of the first row that doesn't contain 'chromosome' in 'contig_id' - first plasmid index
751- # None otherwise
752- filtered_indices = combined_depth_mash_df .index [
753- ~ combined_depth_mash_df ["contig" ].str .contains ("chromosome" )
745+ for contig_id in all_plasmid_contig_ids
746+ if contig_id not in kept_plasmid_contig_ids
754747 ]
755748
756- if not filtered_indices .empty :
757- p1_idx = filtered_indices .min ()
758- else :
759- p1_idx = None
760-
749+ # logging
761750 # needs to be at least 1 filtered out id if the filtering did anything
762751 logger .info (f"Filtering contigs below depth filter: { depth_filter } ." )
763- if "mean_depth_short" in combined_depth_mash_df .columns :
752+ if "mean_depth_short" in combined_plasmid_df .columns :
764753 logger .info (
765754 f"All plasmids whose short and long read copy numbers are both below { depth_filter } will be removed."
766755 )
@@ -770,24 +759,24 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
770759 )
771760
772761 if len (filtered_out_contig_ids ) > 0 :
773- if p1_idx is None :
762+ if len ( kept_plasmid_contig_ids ) == 0 :
774763 logger .warning (f"There are 0 plasmids left after depth filtering." )
775764 else :
776765 logger .info (
777- f"{ len (filtered_indices )} plasmids were filtered as they were below the depth filter."
766+ f"{ len (filtered_out_contig_ids )} plasmids were filtered as they were below the depth filter."
778767 )
779-
780- # Updating 'contig_id' names starting from 1 from the identified index
781- # if it is None then there are only chromosome contigs left so no need for this
782- if p1_idx is not None :
783- combined_depth_mash_df .loc [p1_idx :, "contig" ] = range (
784- 1 , len (combined_depth_mash_df ) - p1_idx + 2
785- )
786- # Reset index after renaming
787- combined_depth_mash_df .reset_index (drop = True , inplace = True )
788768 else :
789769 logger .info (f"No plasmids were filtered due to low depth." )
790770
771+ # concat dfs back
772+ # there is 1+ plasmid
773+ if len (kept_plasmid_contig_ids ) > 0 :
774+ combined_depth_mash_df = pd .concat (
775+ [combined_chrom_df , combined_plasmid_df ], axis = 0
776+ )
777+ else : # only chroms
778+ combined_depth_mash_df = combined_chrom_df
779+
791780 combined_depth_mash_df .to_csv (
792781 os .path .join (outdir , prefix + "_summary.tsv" ), sep = "\t " , index = False
793782 )
@@ -796,7 +785,7 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
796785
797786 def finalise_contigs (self , prefix ):
798787 """
799- Renames the contigs of unicycler with the new plasmid copy numbers and outputs finalised file
788+ Filters contigs plassembler run
800789 """
801790 outdir = self .outdir
802791
@@ -808,23 +797,42 @@ def finalise_contigs(self, prefix):
808797 ].reset_index (drop = True )
809798 # get contigs only
810799 plasmid_fasta = os .path .join (outdir , "unicycler_output" , "assembly.fasta" )
811- i = 0
812800 with open (os .path .join (outdir , prefix + "_plasmids.fasta" ), "w" ) as dna_fa :
813801 for dna_record in SeqIO .parse (plasmid_fasta , "fasta" ):
814802 split_desc = dna_record .description .split (" " )
803+ contig_id = str (split_desc [0 ])
815804 # only keep the contigs that passed the depth threshold
816- if str (split_desc [0 ]) not in filtered_out_contig_ids :
805+ if contig_id not in filtered_out_contig_ids :
806+ sr_copy_number_list = combined_depth_mash_df .loc [
807+ combined_depth_mash_df ["contig" ] == contig_id ,
808+ "plasmid_copy_number_short" ,
809+ ].values
810+ lr_copy_number_list = combined_depth_mash_df .loc [
811+ combined_depth_mash_df ["contig" ] == contig_id ,
812+ "plasmid_copy_number_long" ,
813+ ].values
814+ if len (sr_copy_number_list ) > 0 :
815+ sr_copy_number = sr_copy_number_list [0 ]
816+ else :
817+ logger .error ("Plassembler failed" )
818+
819+ if len (lr_copy_number_list ) > 0 :
820+ lr_copy_number = lr_copy_number_list [0 ]
821+ else :
822+ logger .error ("Plassembler failed" )
823+
824+ # will be ordered so new_contig_count will be the index of the df
825+ # but 1 index for the output
817826 if "circular" in dna_record .description : # circular contigs
818- id_updated = f"{ i } { split_desc [1 ]} plasmid_copy_number_short={ combined_depth_mash_df . plasmid_copy_number_short [ i ] } x plasmid_copy_number_long={ combined_depth_mash_df . plasmid_copy_number_long [ i ] } x circular=true"
827+ id_updated = f"{ contig_id } { split_desc [1 ]} plasmid_copy_number_short={ sr_copy_number } x plasmid_copy_number_long={ lr_copy_number } x circular=true"
819828 else : # non circular contigs
820- id_updated = f"{ i } { split_desc [1 ]} plasmid_copy_number_short={ combined_depth_mash_df .plasmid_copy_number_short [i ]} x plasmid_copy_number_long={ combined_depth_mash_df .plasmid_copy_number_long [i ]} x"
821- i += 1
829+ id_updated = f"{ contig_id } { split_desc [1 ]} plasmid_copy_number_short={ sr_copy_number } x plasmid_copy_number_long={ lr_copy_number } x"
822830 record = SeqRecord (dna_record .seq , id = id_updated , description = "" )
823831 SeqIO .write (record , dna_fa , "fasta" )
824832
825833 def finalise_contigs_long (self , prefix ):
826834 """
827- Renames the contigs of assembly with new ones
835+ Filters contigs plassembler long
828836 """
829837 outdir = self .outdir
830838
@@ -836,21 +844,30 @@ def finalise_contigs_long(self, prefix):
836844 ].reset_index (drop = True )
837845 # get contigs only
838846 plasmid_fasta = os .path .join (outdir , "plasmids.fasta" )
839- i = 0
840847 with open (os .path .join (outdir , prefix + "_plasmids.fasta" ), "w" ) as dna_fa :
841848 for dna_record in SeqIO .parse (plasmid_fasta , "fasta" ):
842849 # only keep the contigs that passed the depth threshold
843- if str (dna_record .id ) not in filtered_out_contig_ids :
850+ contig_id = str (dna_record .id )
851+ if contig_id not in filtered_out_contig_ids :
844852 length = len (dna_record .seq )
845- copy_number = combined_depth_mash_df .plasmid_copy_number_long [i ]
853+ lr_copy_number_list = combined_depth_mash_df .loc [
854+ combined_depth_mash_df ["contig" ] == contig_id ,
855+ "plasmid_copy_number_long" ,
856+ ].values
857+ if len (lr_copy_number_list ) > 0 :
858+ lr_copy_number = lr_copy_number_list [0 ]
859+ else :
860+ logger .error ("Plassembler failed" )
861+
846862 if (
847863 "circular" in dna_record .description
848864 ): # circular contigs from canu
849- desc = f"len={ length } plasmid_copy_number_long={ copy_number } x circular=True"
865+ desc = f"len={ length } plasmid_copy_number_long={ lr_copy_number } x circular=True"
850866 else :
851- desc = f"len={ length } plasmid_copy_number_long={ copy_number } x"
852- i += 1
853- record = SeqRecord (dna_record .seq , id = str (i ), description = desc )
867+ desc = (
868+ f"len={ length } plasmid_copy_number_long={ lr_copy_number } x"
869+ )
870+ record = SeqRecord (dna_record .seq , id = contig_id , description = desc )
854871 SeqIO .write (record , dna_fa , "fasta" )
855872
856873
0 commit comments