Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions bin/SSF_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,14 @@ def feature_extract_kmer(dataset, thread, feature, mrna_mer, lncrna_mer):
return dataset

def ssf_extract(dataset, thread, mrna_1mer, lncrna_1mer, mrna_2mer, lncrna_2mer,
mrna_3mer, lncrna_3mer, mrna_4mer, lncrna_4mer, mrna_5mer, lncrna_5mer):
mrna_3mer, lncrna_3mer, mrna_4mer, lncrna_4mer, mrna_5mer, lncrna_5mer, secondary_structure=None):
# extract the secondary structure of a transcript
print("Calculating secondary structures of the transcripts by RNAfold ... (This process may take a long time!)")
dataset = feature_extract_ss(dataset, thread, run_rnafold_parallel)
if secondary_structure is None:
print("Calculating secondary structures of the transcripts by RNAfold ... (This process may take a long time!)")
dataset = feature_extract_ss(dataset, thread, run_rnafold_parallel)
else:
print("Using pre-calculated secondary structures")
dataset['Secondary_structure'] = secondary_structure

print("Extracting SSF features ...")
# feature 1: Secondary structure score of kmer 1
Expand Down
40 changes: 35 additions & 5 deletions bin/lncDC-train.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
from imblearn.under_sampling import RandomUnderSampler
import train_SIF_PF_extraction

from utils import load_precomputed_ss

seed = 666

def file_exist(filename,parser):
Expand Down Expand Up @@ -108,6 +110,8 @@ def under_over_process(x, y, njobs):
x_resampled, y_resampled = smote.fit_resample(x_underSampled, y_underSampled)
return x_resampled, y_resampled



def main():
parser = argparse.ArgumentParser(description='LncDC: a machine learning based tool for long non-coding RNA detection from RNA-Seq data')
parser.add_argument('-v','--version', action = 'version', version = '%(prog)s version:1.3.5')
Expand All @@ -122,13 +126,15 @@ def main():
action = "store_true")
parser.add_argument('-t','--thread', help = '(Optional) The number of threads assigned to use. Set -1 to use all cpus. Default value: -1.',
type = int, default = -1)
parser.add_argument('-s','--ss_file', type = bool, help = '(Optional) Flag to indicate that input mRNA and lncRNA files are SS files', required = False, default = False)

args = parser.parse_args()
mrna = args.mrna
cds = args.cds
lncrna = args.lncrna
ss_feature = args.secondary
output_prefix = args.output
ss_file = args.ss_file

thread = args.thread
if thread == -1:
Expand Down Expand Up @@ -163,10 +169,17 @@ def main():
if not os.path.isfile(output_prefix):
os.makedirs(os.path.dirname(output_prefix), exist_ok = True)

# load mrna data
mrna_data = load_fasta(mrna)
# load cds data
cds_data = load_fasta(cds)

if ss_file:
# Load pre-calculated secondary structures
print("Loading pre-calculated secondary structures for mRNA ...")
mrna_data, mrna_ss = load_precomputed_ss(mrna)
cds_data = [cds.upper().replace('U', 'T') for cds in cds_data]
else:
# load mrna data
mrna_data = load_fasta(mrna)

print()
print("Initializing dataframe ...")
Expand All @@ -177,8 +190,17 @@ def main():
mrna_dataset.loc[i,'type'] = 'mrna'
mrna_dataset.loc[i, 'CDS_seq'] = cds_data[i]

# load lncrna data
lncrna_data = load_fasta(lncrna)
if ss_file:
# Load pre-calculated secondary structures
print("Loading pre-calculated secondary structures for lncRNA ...")
lncrna_data, lncrna_ss = load_precomputed_ss(lncrna)
ss_data = pd.concat([pd.Series(mrna_ss), pd.Series(lncrna_ss)], ignore_index=True)
print("Total Number of secondary structures loaded: " + str(ss_data.index.size))
print("NOTE: Secondary structures will be filtered to match valid sequences (i.e., only sequences with A,T,G,C and length between 200 and 20000 nt are kept).")
else:
# load lncrna data
lncrna_data = load_fasta(lncrna)
ss_data = None

# initialize a lncrna dataframe
lncrna_dataset = pd.DataFrame(index=range(len(lncrna_data)), columns=['Sequence', 'type', 'CDS_seq'])
Expand All @@ -198,12 +220,15 @@ def main():
for i in range(dataset.index.size):
if len(re.findall(r'[^ATGC]',dataset.loc[i,'Sequence'])) > 0:
dataset.loc[i,'Sequence'] = float('NaN')
if ss_data is not None:
ss_data.loc[i] = float('NaN')
dataset.dropna(how = 'any', inplace = True)
# reset the index of the dataframe
dataset.reset_index(drop = True, inplace = True)

print("Calculating transcript lengths ...")
print()
print(dataset.head())
# Calculate the length of the transcripts
for i in range(dataset.index.size):
dataset.loc[i,'Transcript_length'] = len(dataset.loc[i, 'Sequence'])
Expand All @@ -216,6 +241,7 @@ def main():

print("Removing Non-valid transcripts (sequence that have non-ATGCatgc letters & sequence length less than 200 nt) ...")
print("Number of valid transcripts for training: " + str(dataset.index.size))


if dataset.index.size == 0:
sys.stderr.write("No valid transcripts detected! \n")
Expand Down Expand Up @@ -289,6 +315,10 @@ def main():
dataset = dataset[(dataset['Transcript_length'] >= 200) & (dataset['Transcript_length'] <= 20000)]
dataset = dataset.reset_index(drop=True)

if ss_data is not None:
ss_data = ss_data[ss_data.str.len().between(200, 20000)]
ss_data = ss_data.reset_index(drop=True)

print("Removing Non-valid transcripts (sequence that have non-ATGCatgc" + " letters & sequence length less than 200 nt) ...")
print("Filtering out transcripts with sequence length greater than 20,000" + "nt due to the limited addressable range of the RNAfold program ...")

Expand All @@ -312,7 +342,7 @@ def main():
dataset = dataset[columns]

# extract Secondary Structure Features
dataset = train_SSF_extraction.ssf_extract(dataset, thread, output_prefix)
dataset = train_SSF_extraction.ssf_extract(dataset, thread, output_prefix, secondary_structure=ss_data)
full_columns = ['type', 'Transcript_length', 'GC_content', 'Fickett_score', 'ORF_T0_length',
'ORF_T1_length','ORF_T2_length', 'ORF_T0_coverage', 'ORF_T1_coverage', 'ORF_T3_coverage',
'Hexamer_score_ORF_T0','Hexamer_score_ORF_T1', 'Hexamer_score_ORF_T2', 'Hexamer_score_ORF_T3',
Expand Down
24 changes: 22 additions & 2 deletions bin/lncDC.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import SIF_PF_extraction
import argparse

from utils import load_precomputed_ss

def file_check(filename, rule, filetype):
if not filename.endswith(rule):
sys.stderr.write("ERROR: Please use the "+filetype+ " file, which ends with: "+ rule +" \n")
Expand Down Expand Up @@ -108,6 +110,7 @@ def main():
type = str, default = default_data_path+'train_ss_table')
parser.add_argument('-t','--thread', help = '(Optional) The number of threads assigned to use. Set -1 to use all cpus. Default value: -1.',
type = int, default = -1)
parser.add_argument('-s','--ss_file', type = bool, help = '(Optional) Flag to indicate that input mRNA and lncRNA files are SS files', required = False, default = False)

args = parser.parse_args()
inputfile = args.input
Expand All @@ -118,6 +121,7 @@ def main():
scaler = args.scaler
ss_feature = args.secondary
ss_kmer_file = args.kmer
ss_file = args.ss_file

thread = args.thread
if thread == -1:
Expand Down Expand Up @@ -204,7 +208,15 @@ def main():
if not os.path.isfile(outputfile):
os.makedirs(os.path.dirname(outputfile), exist_ok = True)

test_data = load_fasta(inputfile)
if ss_file:
print("Loading input secondary structure file ...")
# Load pre-calculated secondary structures
test_data, test_ss = load_precomputed_ss(inputfile)
print("Total Number of secondary structures loaded: " + str(test_ss.index.size))
print("NOTE: Secondary structures will be filtered to match valid sequences (i.e., only sequences with A,T,G,C and length between 200 and 20000 nt are kept).")
if not ss_file:
print("Loading input fasta file ...")
test_data = load_fasta(inputfile)

print()
print("Initializing dataframe ...")
Expand All @@ -220,6 +232,8 @@ def main():
for i in range(dataset.index.size):
if len(re.findall(r'[^ATGC]',dataset.loc[i,'Sequence'])) > 0:
dataset.loc[i,'Sequence'] = float('NaN')
if ss_file:
test_ss.loc[i] = float('NaN')
dataset.dropna(how = 'any', inplace = True)
# reset the index of the dataframe
dataset.reset_index(drop = True, inplace = True)
Expand Down Expand Up @@ -248,6 +262,7 @@ def main():
# print()
dataset = dataset[dataset['Transcript_length'] >= 200]
dataset = dataset.reset_index(drop=True)


print("Removing Non-valid transcripts (sequence that have non-ATGCatgc letters & sequence length less than 200 nt) ...")
print("Number of valid transcripts: " + str(dataset.index.size))
Expand Down Expand Up @@ -304,6 +319,10 @@ def main():
# Filter out sequence length less than 200nt or more than 20000nt
dataset = dataset[(dataset['Transcript_length'] >= 200) & (dataset['Transcript_length'] <= 20000)]
dataset = dataset.reset_index(drop=True)

if ss_data is not None:
ss_data = ss_data[ss_data.str.len().between(200, 20000)]
ss_data = ss_data.reset_index(drop=True)

print("Removing Non-valid transcripts (sequence that have non-ATGCatgc" + " letters & sequence length less than 200 nt) ...")
print("Filtering out transcripts with sequence length greater than 20,000" + "nt due to the limited addressable range of the RNAfold program ...")
Expand Down Expand Up @@ -387,7 +406,8 @@ def main():
# extract SSF features
dataset = SSF_extraction.ssf_extract(dataset, thread, mrna_1mer, lncrna_1mer,
mrna_2mer, lncrna_2mer, mrna_3mer, lncrna_3mer,
mrna_4mer, lncrna_4mer, mrna_5mer, lncrna_5mer)
mrna_4mer, lncrna_4mer, mrna_5mer, lncrna_5mer,
secondary_structure=test_ss)

full_columns = ['Description', 'Transcript_length', 'GC_content', 'Fickett_score', 'ORF_T0_length',
'ORF_T1_length','ORF_T2_length', 'ORF_T0_coverage', 'ORF_T1_coverage', 'ORF_T3_coverage',
Expand Down
11 changes: 8 additions & 3 deletions bin/train_SSF_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,15 @@ def feature_extract_kmer(dataset, thread, feature, mrna_mer, lncrna_mer):
dataset = pd.concat(pool.map(parallel_function, df_chunks), ignore_index = False)
return dataset

def ssf_extract(dataset, thread, output_table):
def ssf_extract(dataset, thread, output_table, secondary_structure=None):
# extract the secondary structure of a transcript
print("Calculating secondary structures of the transcripts by RNAfold ... (This process may take a long time!)")
dataset = feature_extract_ss(dataset, thread, run_rnafold_parallel)
if secondary_structure is None:
print("Calculating secondary structures of the transcripts by RNAfold ... (This process may take a long time!)")
dataset = feature_extract_ss(dataset, thread, run_rnafold_parallel)
else:
print("Loading pre-calculated secondary structures ...")
# TODO: Check if we should merge in index
dataset['Secondary_structure'] = secondary_structure

print("Generating sequence and secondary structure k-mer tables ...")
# make ss kmer tables
Expand Down
60 changes: 60 additions & 0 deletions bin/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import re
import sys
import gzip

def load_fasta(filename):
    '''
    load_fasta
    Load sequences from a FASTA file, transparently decompressing
    gzip-compressed files (names ending in .gz/.Z/.z).

    Inputs:
        filename - name of the FASTA file to load
    Returns:
        a list of sequences (str) in file order; lowercase a/t/g/c are
        uppercased, all other characters are left untouched
    '''
    sequences = []
    seq = ''
    # Uppercase only a/t/g/c (equivalent to the chained str.replace calls);
    # other lowercase letters (e.g. 'n') are deliberately left as-is so that
    # downstream non-ATGC filtering still sees them.
    to_upper = str.maketrans('atgc', 'ATGC')

    opener = gzip.open if filename.endswith((".gz", ".Z", ".z")) else open
    # 'rt' gives text mode for both plain and gzip files; the context manager
    # guarantees the handle is closed even if iteration raises (the original
    # bare open/close leaked the descriptor on error).
    with opener(filename, 'rt') as fd:
        for line in fd:
            if line.startswith('>'):
                # Header line: flush the sequence accumulated so far.
                if seq != '':
                    sequences.append(seq.translate(to_upper))
                    seq = ''
            else:
                seq += line.strip()
    # Flush the final record (files do not end with a header line).
    if seq != '':
        sequences.append(seq.translate(to_upper))

    return sequences


def load_precomputed_ss(ss_file):
    '''
    load_precomputed_ss
    Load pre-computed secondary structures from a FASTA-like file in which
    each record concatenates the nucleotide sequence, its dot-bracket
    structure and a folding energy (RNAfold-style output).

    Inputs:
        ss_file - path to the structure file (may be gzip-compressed)
    Returns:
        (sequences, secondary_structures) - two parallel lists of equal
        length; sequences are uppercased with U converted to T
    Exits with status 1 if no valid sequence/structure pair could be parsed.
    '''
    seqs_plus_ss = load_fasta(ss_file)
    # Two accepted tails for the folding energy:
    #  - bare number ("-12.30"): the historically accepted form;
    #  - parenthesized with optional spaces (" (-12.30)" / "( -12.30)"),
    #    which is what RNAfold actually prints after the structure.
    # NOTE(review): kept both for backward compatibility -- confirm which
    # format the upstream SS files really use.
    bare_energy = re.compile(r"([acgtunACGTUN]+)([.()]+)(-?\d+(?:\.\d+)?)$")
    paren_energy = re.compile(r"([acgtunACGTUN]+)([.()]+)\s*\(\s*(-?\d+(?:\.\d+)?)\s*\)$")
    sequences = []
    secondary_structures = []
    invalid_count = 0
    for idx, entry in enumerate(seqs_plus_ss):
        match = bare_energy.match(entry) or paren_energy.match(entry)
        if not match:
            print(f"[load_precomputed_ss] ERROR: Format error at entry {idx}: {entry}")
            invalid_count += 1
            continue
        # Downstream feature extraction works on the DNA alphabet, so U -> T.
        seq = match.group(1).upper().replace('U', 'T')
        ss = match.group(2)
        if len(seq) != len(ss):
            # A structure must annotate every base; skip inconsistent records.
            print(f"[load_precomputed_ss] WARNING: Sequence/structure length mismatch at entry {idx}: seq_len={len(seq)}, ss_len={len(ss)}\nSequence: {seq[:50]}...\nStructure: {ss[:50]}...")
            invalid_count += 1
            continue
        sequences.append(seq)
        secondary_structures.append(ss)
    print(f"[load_precomputed_ss] Loaded {len(sequences)} valid entries, {invalid_count} invalid entries skipped.")
    if len(sequences) == 0:
        sys.stderr.write("ERROR: No valid sequence/structure pairs loaded from file!\n")
        sys.exit(1)
    return sequences, secondary_structures