From 091409750b92ffab68789179f29f619872fe998b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Garc=C3=ADa=20Ruano?= Date: Wed, 28 Jan 2026 11:53:31 +0100 Subject: [PATCH 1/2] Handle pre-calculated structures - Add optional parameter to extract secondary structure from input files. - Add optional parameter to ssf_extract functions. - Avoid running RNAfold if secondary structures are read --- bin/SSF_extraction.py | 10 +++++--- bin/lncDC-train.py | 28 ++++++++++++++++++---- bin/lncDC.py | 15 ++++++++++-- bin/train_SSF_extraction.py | 11 ++++++--- bin/utils.py | 48 +++++++++++++++++++++++++++++++++++++ 5 files changed, 100 insertions(+), 12 deletions(-) create mode 100644 bin/utils.py diff --git a/bin/SSF_extraction.py b/bin/SSF_extraction.py index 672c270..038d2ee 100644 --- a/bin/SSF_extraction.py +++ b/bin/SSF_extraction.py @@ -183,10 +183,14 @@ def feature_extract_kmer(dataset, thread, feature, mrna_mer, lncrna_mer): return dataset def ssf_extract(dataset, thread, mrna_1mer, lncrna_1mer, mrna_2mer, lncrna_2mer, - mrna_3mer, lncrna_3mer, mrna_4mer, lncrna_4mer, mrna_5mer, lncrna_5mer): + mrna_3mer, lncrna_3mer, mrna_4mer, lncrna_4mer, mrna_5mer, lncrna_5mer, secondary_structure=None): # extract the secondary structure of a transcript - print("Calculating secondary structures of the transcripts by RNAfold ... (This process may take a long time!)") - dataset = feature_extract_ss(dataset, thread, run_rnafold_parallel) + if secondary_structure is None: + print("Calculating secondary structures of the transcripts by RNAfold ... 
(This process may take a long time!)") + dataset = feature_extract_ss(dataset, thread, run_rnafold_parallel) + else: + print("Using pre-calculated secondary structures") + dataset['Secondary_structure'] = secondary_structure print("Extracting SSF features ...") # feature 1: Secondary structure score of kmer 1 diff --git a/bin/lncDC-train.py b/bin/lncDC-train.py index ca07e35..186f357 100644 --- a/bin/lncDC-train.py +++ b/bin/lncDC-train.py @@ -24,6 +24,8 @@ from imblearn.under_sampling import RandomUnderSampler import train_SIF_PF_extraction +from utils import load_precomputed_ss + seed = 666 def file_exist(filename,parser): @@ -108,6 +110,8 @@ def under_over_process(x, y, njobs): x_resampled, y_resampled = smote.fit_resample(x_underSampled, y_underSampled) return x_resampled, y_resampled + + def main(): parser = argparse.ArgumentParser(description='LncDC: a machine learning based tool for long non-coding RNA detection from RNA-Seq data') parser.add_argument('-v','--version', action = 'version', version = '%(prog)s version:1.3.5') @@ -122,6 +126,7 @@ def main(): action = "store_true") parser.add_argument('-t','--thread', help = '(Optional) The number of threads assigned to use. Set -1 to use all cpus. 
Default value: -1.', type = int, default = -1)
+    parser.add_argument('-s','--ss_file', action = 'store_true', help = '(Optional) Flag to indicate that input mRNA and lncRNA files are SS files', required = False, default = False)
 
     args = parser.parse_args()
     mrna = args.mrna
@@ -129,6 +134,7 @@
     lncrna = args.lncrna
     ss_feature = args.secondary
     output_prefix = args.output
+    ss_file = args.ss_file
     thread = args.thread
 
     if thread == -1:
@@ -163,8 +169,15 @@
     if not os.path.isfile(output_prefix):
         os.makedirs(os.path.dirname(output_prefix), exist_ok = True)
 
-    # load mrna data
-    mrna_data = load_fasta(mrna)
+    if not ss_file:
+        # load mrna data
+        mrna_data = load_fasta(mrna)
+    else:
+        # Load pre-calculated secondary structures
+        print("Loading pre-calculated secondary structures for mRNA ...")
+        mrna_data, mrna_ss = load_precomputed_ss(mrna)
+        pass
+
     # load cds data
     cds_data = load_fasta(cds)
 
@@ -176,9 +189,16 @@
         mrna_dataset.loc[i, 'Sequence'] = mrna_data[i]
         mrna_dataset.loc[i,'type'] = 'mrna'
         mrna_dataset.loc[i, 'CDS_seq'] = cds_data[i]
+        # TODO: If SS data is loaded, cds may not match. We need to make sure they do match beforehand. 
- # load lncrna data - lncrna_data = load_fasta(lncrna) + if not ss_file: + # load lncrna data + lncrna_data = load_fasta(lncrna) + else: + # Load pre-calculated secondary structures + print("Loading pre-calculated secondary structures for lncRNA ...") + lncrna_data, lncrna_ss = load_precomputed_ss(lncrna) + pass # initialize a lncrna dataframe lncrna_dataset = pd.DataFrame(index=range(len(lncrna_data)), columns=['Sequence', 'type', 'CDS_seq']) diff --git a/bin/lncDC.py b/bin/lncDC.py index f40a83b..42f971f 100644 --- a/bin/lncDC.py +++ b/bin/lncDC.py @@ -19,6 +19,8 @@ import SIF_PF_extraction import argparse +from utils import load_precomputed_ss + def file_check(filename, rule, filetype): if not filename.endswith(rule): sys.stderr.write("ERROR: Please use the "+filetype+ " file, which ends with: "+ rule +" \n") @@ -108,6 +110,7 @@ def main(): type = str, default = default_data_path+'train_ss_table') parser.add_argument('-t','--thread', help = '(Optional) The number of threads assigned to use. Set -1 to use all cpus. 
Default value: -1.', type = int, default = -1)
+    parser.add_argument('-s','--ss_file', action = 'store_true', help = '(Optional) Flag to indicate that input mRNA and lncRNA files are SS files', required = False, default = False)
 
     args = parser.parse_args()
     inputfile = args.input
@@ -118,6 +121,7 @@
     scaler = args.scaler
     ss_feature = args.secondary
     ss_kmer_file = args.kmer
+    ss_file = args.ss_file
     thread = args.thread
 
     if thread == -1:
@@ -204,7 +208,13 @@
     if not os.path.isfile(outputfile):
         os.makedirs(os.path.dirname(outputfile), exist_ok = True)
 
-    test_data = load_fasta(inputfile)
+    if not ss_file:
+        print("Loading input fasta file ...")
+        test_data = load_fasta(inputfile)
+    else:
+        print("Loading input secondary structure file ...")
+        # Load pre-calculated secondary structures
+        test_data, test_ss = load_precomputed_ss(inputfile)
 
     print()
     print("Initializing dataframe ...")
@@ -387,7 +397,8 @@
     # extract SSF features
     dataset = SSF_extraction.ssf_extract(dataset, thread, mrna_1mer, lncrna_1mer,
                                          mrna_2mer, lncrna_2mer, mrna_3mer, lncrna_3mer,
-                                         mrna_4mer, lncrna_4mer, mrna_5mer, lncrna_5mer)
+                                         mrna_4mer, lncrna_4mer, mrna_5mer, lncrna_5mer,
+                                         secondary_structure=test_ss)
 
     full_columns = ['Description', 'Transcript_length', 'GC_content', 'Fickett_score', 'ORF_T0_length',
                     'ORF_T1_length','ORF_T2_length', 'ORF_T0_coverage', 'ORF_T1_coverage', 'ORF_T3_coverage',
diff --git a/bin/train_SSF_extraction.py b/bin/train_SSF_extraction.py
index a54f090..8bd12ef 100644
--- a/bin/train_SSF_extraction.py
+++ b/bin/train_SSF_extraction.py
@@ -213,10 +213,15 @@
         dataset = pd.concat(pool.map(parallel_function, df_chunks), ignore_index = False)
     return dataset
 
-def ssf_extract(dataset, thread, output_table):
+def ssf_extract(dataset, thread, output_table, secondary_structure=None):
     # extract the secondary structure of a transcript
-    print("Calculating secondary structures of the transcripts 
by RNAfold ... (This process may take a long time!)") - dataset = feature_extract_ss(dataset, thread, run_rnafold_parallel) + if secondary_structure is None: + print("Calculating secondary structures of the transcripts by RNAfold ... (This process may take a long time!)") + dataset = feature_extract_ss(dataset, thread, run_rnafold_parallel) + else: + print("Loading pre-calculated secondary structures ...") + # TODO: Check if we should merge in index + dataset['Secondary_structure'] = secondary_structure print("Generating sequence and secondary structure k-mer tables ...") # make ss kmer tables diff --git a/bin/utils.py b/bin/utils.py new file mode 100644 index 0000000..f734ac8 --- /dev/null +++ b/bin/utils.py @@ -0,0 +1,48 @@ +import re +import sys +import gzip + +def load_fasta(filename): + ''' + load_fasta + Inputs: + filename - name of FASTA file to load + Returns: + a list of sequences + ''' + sequences = [] + seq = '' + + if filename.endswith((".gz", ".Z", ".z")): + fd = gzip.open(filename, 'rt') + else: + fd = open(filename, 'r') + + for line in fd: + if line.startswith('>'): + if seq != '': + sequences.append(seq.replace('a','A').replace('t','T').replace('g','G').replace('c','C')) + seq = '' + else: + seq += line.strip() + if seq != '': + sequences.append(seq.replace('a','A').replace('t','T').replace('g','G').replace('c','C')) + + fd.close() + return sequences + + +def load_precomputed_ss(ss_file): + seqs_plus_ss = load_fasta(ss_file) + pattern = r"([acgtnACGTN]+)([\.()]+)(-?\d+(\.\d+)?)$" + sequences = [] + secondary_structures = [] + for entry in seqs_plus_ss: + match = re.match(pattern, entry) + if match: + sequences.append(match.group(1).replace('a','A').replace('t','T').replace('g','G').replace('c','C')) + secondary_structures.append(match.group(2)) + else: + sys.stderr.write("ERROR: The format of the secondary structure file is incorrect! 
\n") + sys.exit(1) + return sequences, secondary_structures \ No newline at end of file From ae59b2d4a286a28b9ce79c221065fb389e6ae925 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Garc=C3=ADa-Ruano?= Date: Thu, 29 Jan 2026 18:32:17 +0100 Subject: [PATCH 2/2] Filter out secondary structures by length and unsupported characters --- bin/lncDC-train.py | 40 +++++++++++++++++++++++++--------------- bin/lncDC.py | 17 +++++++++++++---- bin/utils.py | 24 ++++++++++++++++++------ 3 files changed, 56 insertions(+), 25 deletions(-) diff --git a/bin/lncDC-train.py b/bin/lncDC-train.py index 186f357..e016ead 100644 --- a/bin/lncDC-train.py +++ b/bin/lncDC-train.py @@ -169,17 +169,17 @@ def main(): if not os.path.isfile(output_prefix): os.makedirs(os.path.dirname(output_prefix), exist_ok = True) - if not ss_file: - # load mrna data - mrna_data = load_fasta(mrna) - else: + # load cds data + cds_data = load_fasta(cds) + + if ss_file: # Load pre-calculated secondary structures print("Loading pre-calculated secondary structures for mRNA ...") mrna_data, mrna_ss = load_precomputed_ss(mrna) - pass - - # load cds data - cds_data = load_fasta(cds) + cds_data = [cds.upper().replace('U', 'T') for cds in cds_data] + else: + # load mrna data + mrna_data = load_fasta(mrna) print() print("Initializing dataframe ...") @@ -189,16 +189,18 @@ def main(): mrna_dataset.loc[i, 'Sequence'] = mrna_data[i] mrna_dataset.loc[i,'type'] = 'mrna' mrna_dataset.loc[i, 'CDS_seq'] = cds_data[i] - # TODO: If SS data is loaded, cds may not match. We need to make sure they do match beforehand. 
- if not ss_file: - # load lncrna data - lncrna_data = load_fasta(lncrna) - else: + if ss_file: # Load pre-calculated secondary structures print("Loading pre-calculated secondary structures for lncRNA ...") lncrna_data, lncrna_ss = load_precomputed_ss(lncrna) - pass + ss_data = pd.concat([pd.Series(mrna_ss), pd.Series(lncrna_ss)], ignore_index=True) + print("Total Number of secondary structures loaded: " + str(ss_data.index.size)) + print("NOTE: Secondary structures will be filtered to match valid sequences (i.e., only sequences with A,T,G,C and length between 200 and 20000 nt are kept).") + else: + # load lncrna data + lncrna_data = load_fasta(lncrna) + ss_data = None # initialize a lncrna dataframe lncrna_dataset = pd.DataFrame(index=range(len(lncrna_data)), columns=['Sequence', 'type', 'CDS_seq']) @@ -218,12 +220,15 @@ def main(): for i in range(dataset.index.size): if len(re.findall(r'[^ATGC]',dataset.loc[i,'Sequence'])) > 0: dataset.loc[i,'Sequence'] = float('NaN') + if ss_data is not None: + ss_data.loc[i] = float('NaN') dataset.dropna(how = 'any', inplace = True) # reset the index of the dataframe dataset.reset_index(drop = True, inplace = True) print("Calculating transcript lengths ...") print() + print(dataset.head()) # Calculate the length of the transcripts for i in range(dataset.index.size): dataset.loc[i,'Transcript_length'] = len(dataset.loc[i, 'Sequence']) @@ -236,6 +241,7 @@ def main(): print("Removing Non-valid transcripts (sequence that have non-ATGCatgc letters & sequence length less than 200 nt) ...") print("Number of valid transcripts for training: " + str(dataset.index.size)) + if dataset.index.size == 0: sys.stderr.write("No valid transcripts detected! 
\n")
@@ -309,6 +315,10 @@
     dataset = dataset[(dataset['Transcript_length'] >= 200) & (dataset['Transcript_length'] <= 20000)]
     dataset = dataset.reset_index(drop=True)
+    if ss_data is not None:
+        ss_data = ss_data[ss_data.str.len().between(200, 20000)]
+        ss_data = ss_data.reset_index(drop=True)
+
     print("Removing Non-valid transcripts (sequence that have non-ATGCatgc" +
           " letters & sequence length less than 200 nt) ...")
     print("Filtering out transcripts with sequence length greater than 20,000" +
           "nt due to the limited addressable range of the RNAfold program ...")
@@ -332,7 +342,7 @@
     dataset = dataset[columns]
 
     # extract Secondary Structure Features
-    dataset = train_SSF_extraction.ssf_extract(dataset, thread, output_prefix)
+    dataset = train_SSF_extraction.ssf_extract(dataset, thread, output_prefix, secondary_structure=ss_data)
     full_columns = ['type', 'Transcript_length', 'GC_content', 'Fickett_score', 'ORF_T0_length',
                     'ORF_T1_length','ORF_T2_length', 'ORF_T0_coverage', 'ORF_T1_coverage', 'ORF_T3_coverage',
                     'Hexamer_score_ORF_T0','Hexamer_score_ORF_T1', 'Hexamer_score_ORF_T2', 'Hexamer_score_ORF_T3',
diff --git a/bin/lncDC.py b/bin/lncDC.py
index 42f971f..2d3a182 100644
--- a/bin/lncDC.py
+++ b/bin/lncDC.py
@@ -208,13 +208,15 @@
     if not os.path.isfile(outputfile):
         os.makedirs(os.path.dirname(outputfile), exist_ok = True)
 
-    if not ss_file:
-        print("Loading input fasta file ...")
-        test_data = load_fasta(inputfile)
-    else:
+    if ss_file:
         print("Loading input secondary structure file ...")
         # Load pre-calculated secondary structures
         test_data, test_ss = load_precomputed_ss(inputfile)
+        print("Total Number of secondary structures loaded: " + str(len(test_ss)))
+        print("NOTE: Secondary structures will be filtered to match valid sequences (i.e., only sequences with A,T,G,C and length between 200 and 20000 nt are kept).")
+    if not ss_file:
+        print("Loading input fasta file ...")
+        test_data = load_fasta(inputfile); test_ss = None  # keep test_ss bound so the SSF step below never hits a NameError
 
     print()
     print("Initializing dataframe ...")
@@ -230,6 +232,8 @@
     for i in range(dataset.index.size):
         if len(re.findall(r'[^ATGC]',dataset.loc[i,'Sequence'])) > 0:
             dataset.loc[i,'Sequence'] = float('NaN')
+            if ss_file:
+                test_ss[i] = float('NaN')
     dataset.dropna(how = 'any', inplace = True)
     # reset the index of the dataframe
     dataset.reset_index(drop = True, inplace = True)
@@ -258,6 +262,7 @@
     # print()
     dataset = dataset[dataset['Transcript_length'] >= 200]
     dataset = dataset.reset_index(drop=True)
+
     print("Removing Non-valid transcripts (sequence that have non-ATGCatgc letters & sequence length less than 200 nt) ...")
     print("Number of valid transcripts: " + str(dataset.index.size))
 
@@ -314,6 +319,10 @@
     # Filter out sequence length less than 200nt or more than 20000nt
     dataset = dataset[(dataset['Transcript_length'] >= 200) & (dataset['Transcript_length'] <= 20000)]
     dataset = dataset.reset_index(drop=True)
+
+    if test_ss is not None:
+        test_ss = pd.Series(test_ss)
+        test_ss = test_ss[test_ss.str.len().between(200, 20000)].reset_index(drop=True)
     print("Removing Non-valid transcripts (sequence that have non-ATGCatgc" +
           " letters & sequence length less than 200 nt) ...")
     print("Filtering out transcripts with sequence length greater than 20,000" +
           "nt due to the limited addressable range of the RNAfold program ...")
diff --git a/bin/utils.py b/bin/utils.py
index f734ac8..eb62812 100644
--- a/bin/utils.py
+++ b/bin/utils.py
@@ -34,15 +34,27 @@
 
 def load_precomputed_ss(ss_file):
     seqs_plus_ss = load_fasta(ss_file)
-    pattern = r"([acgtnACGTN]+)([\.()]+)(-?\d+(\.\d+)?)$"
+    pattern = r"([acgtunACGTUN]+)([\.()]+)(-?\d+(\.\d+)?)$"
     sequences = []
     secondary_structures = []
-    for entry in seqs_plus_ss:
+    invalid_count = 0
+    for idx, entry in enumerate(seqs_plus_ss):
         match = re.match(pattern, entry)
         if match:
-            sequences.append(match.group(1).replace('a','A').replace('t','T').replace('g','G').replace('c','C'))
-            secondary_structures.append(match.group(2))
+            seq = match.group(1).upper().replace('U', 'T')
+            ss = match.group(2)
+            if len(seq) != len(ss):
+                print(f"[load_precomputed_ss] WARNING: Sequence/structure length mismatch at entry {idx}: seq_len={len(seq)}, ss_len={len(ss)}\nSequence: {seq[:50]}...\nStructure: {ss[:50]}...")
+                invalid_count += 1
+                continue
+            sequences.append(seq)
+            secondary_structures.append(ss)
         else:
-            sys.stderr.write("ERROR: The format of the secondary structure file is incorrect! \n")
-            sys.exit(1)
+            print(f"[load_precomputed_ss] ERROR: Format error at entry {idx}: {entry}")
+            invalid_count += 1
+            continue
+    print(f"[load_precomputed_ss] Loaded {len(sequences)} valid entries, {invalid_count} invalid entries skipped.")
+    if len(sequences) == 0:
+        sys.stderr.write("ERROR: No valid sequence/structure pairs loaded from file!\n")
+        sys.exit(1)
    return sequences, secondary_structures
\ No newline at end of file