Skip to content

Commit dbbb94f

Browse files
Merge pull request #3 from arnald-alonso/spyder_error_zero_reads
Spyder error zero reads
2 parents 650fc32 + 152b0af commit dbbb94f

10 files changed

Lines changed: 810 additions & 53 deletions

R/stats_algs.R

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,35 @@
44
# INPUT DATA
55
args <- commandArgs(trailingOnly = TRUE)
66
path <- args[1] # path to the project outputs/ folder
7-
alg <- args[2] # algorithm: star or htseq-gene or htseq-exon
7+
alg <- args[2] # algorithm: star or htseq-gene or htseq-exon or kallisto
88
# READS RPKM AND COUNT MATRICES
9-
rpkms <- read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
10-
snames1 <- as.character(read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
11-
counts <- read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
12-
snames2 <- as.character(read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
9+
if (alg != 'kallisto'){
10+
rpkms <- read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
11+
snames1 <- as.character(read.table(paste(path, alg, "_rpkms.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
12+
counts <- read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
13+
snames2 <- as.character(read.table(paste(path, alg, "_counts.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
14+
labs <- c("RPKM","COUNTS")
15+
} else{
16+
rpkms <- read.table(paste(path, alg, "_tpm.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F, na.strings = '-nan')
17+
rpkms <- rpkms[,2:ncol(rpkms)]
18+
snames1 <- as.character(read.table(paste(path, alg, "_tpm.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
19+
snames1 <- snames1[2:length(snames1)]
20+
counts <- read.table(paste(path, alg, "_est_counts.txt", sep=""), sep = "\t", header = T, stringsAsFactors = F)
21+
counts <- counts[,2:ncol(counts)]
22+
snames2 <- as.character(read.table(paste(path, alg, "_est_counts.txt", sep=""), sep = "\t", header = F, stringsAsFactors = F, nrows = 1))
23+
snames2 <- snames2[2:length(snames2)]
24+
labs <- c("TPM","EST_COUNTS")
25+
}
1326
# ONLY USES FEATURES WITH AT LEAST TWO RAW COUNTS IN ONE LIBRARY
1427
inc <- which(rowSums(counts[2:ncol(counts)], na.rm = T) > 1)
1528
NT <- c()
1629
for (i in 1:2){
1730
N <- matrix(NA,nrow=0,ncol=0)
18-
if (i == 1) {G <- rpkms[inc,];lab <- "RPKM"; snames <- snames1[2:length(snames1)]}
19-
if (i == 2) {G <- counts[inc,];lab <- "COUNTS"; snames <- snames2[2:length(snames2)]}
31+
if (i == 1) {G <- rpkms[inc,];lab <- labs[1]; snames <- snames1[2:length(snames1)]}
32+
if (i == 2) {G <- counts[inc,];lab <- labs[2]; snames <- snames2[2:length(snames2)]}
2033
if (ncol(G)>2){ # at least two samples available
2134
lcounts <- log2(G[, 2:ncol(G)]+1)
22-
s_ok <- (which(colSums(is.na(lcounts)) == 0)) # remove samples with NA (not processed yet)
35+
s_ok <- (which((colSums(G[, 2:ncol(G)]) > 10000)&(colSums(is.na(lcounts)) == 0))) # remove samples with NA (not processed yet)
2336
if (length(s_ok) > 1){
2437
## QUANTILES
2538
tp <- round(t(apply(lcounts[, s_ok], 2, quantile, probs = c(0.05,0.25,0.5,0.75,0.95), na.rm = T)), 2)
@@ -49,3 +62,4 @@ for (i in 1:2){
4962
NT <- cbind(snames[s_ok], NT)
5063
write.table(NT, file = paste(path, alg, "_pca", ".txt", sep=""), quote = F, row.names = F, sep = "\t")
5164

65+

lib/html_lib.py

Lines changed: 73 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -33,18 +33,24 @@ def get_menu(config, ns):
3333
menu.append('<h1>Alignment QC:</h1>')
3434
if enabled.has_key("picard"):
3535
menu.append('<h2><a #highlight="" href="./picard.html">- Picard</a></h2>')
36+
if enabled.has_key("picard_IS"):
37+
menu.append('<h2><a #highlight="" href="./picard-is.html">- Picard Insert Size</a></h2>')
3638
if enabled.has_key("star"):
3739
menu.append('<h2><a #highlight="" href="./star.html">- STAR</a></h2>')
40+
if enabled.has_key("kallisto"):
41+
menu.append('<h2><a #highlight="" href="./kallisto.html">- KALLISTO</a></h2>')
3842
if enabled.has_key("htseq-gene"):
3943
menu.append('<h2><a #highlight="" href="./htseq-gene.html">- HTseq-Gene</a></h2>')
4044
if enabled.has_key("htseq-exon"):
4145
menu.append('<h2><a #highlight="" href="./htseq-exon.html">- HTseq-Exon</a></h2>')
4246
if ns > 1:
43-
if enabled.has_key("star") or enabled.has_key("htseq-gene") or enabled.has_key("htseq-exon"):
47+
if enabled.has_key("star") or enabled.has_key("htseq-gene") or enabled.has_key("htseq-exon") or enabled.has_key("kallisto"):
4448
menu.append('<h1>Count statistics:</h1>')
4549
menu.append('<h2><a #highlight="" href="./downloads.html">- DOWNLOADS</a></h2>')
4650
if enabled.has_key("star"):
4751
menu.append('<h2><a #highlight="" href="./star2.html">- STAR</a></h2>')
52+
if enabled.has_key("kallisto"):
53+
menu.append('<h2><a #highlight="" href="./kallisto2.html">- KALLISTO</a></h2>')
4854
if enabled.has_key("htseq-gene"):
4955
menu.append('<h2><a href="./htseq-gene2.html">- HTseq-Gene</a></h2>')
5056
if enabled.has_key("htseq-exon"):
@@ -62,8 +68,8 @@ def get_menu(config, ns):
6268
return menu
6369

6470
def print_samples(path,config):
65-
analysis = ['trimgalore', 'fastqc', 'kallisto', 'star', 'star-fusion', 'picard', "htseq-gene", "htseq-exon", "varscan", 'gatk']
66-
sta= {"trimgalore":"TrimGalore", "fastqc":"FastQC","star":"STAR","star-fusion":"STAR-Fusion","picard":"PicardQC","kallisto":"Kallisto","htseq-gene":"HTseq-gene","htseq-exon":"HTseq-exon", "varscan":"VARSCAN", "gatk":"GATK"}
71+
analysis = ['trimgalore', 'fastqc', 'kallisto', 'star', 'star-fusion', 'picard', "htseq-gene", "htseq-exon", "picard_IS", "varscan", 'gatk']
72+
sta= {"trimgalore":"TrimGalore", "fastqc":"FastQC","star":"STAR","star-fusion":"STAR-Fusion","picard":"PicardQC","kallisto":"Kallisto","htseq-gene":"HTseq-gene","htseq-exon":"HTseq-exon", "picard_IS":"Picard-InsertSize", "varscan":"VARSCAN", "gatk":"GATK"}
6773
# SAMPLES LIST
6874
samples = dict()
6975
f = open(path + "/samples.list",'r')
@@ -148,6 +154,16 @@ def print_samples(path,config):
148154
else:
149155
res.append("FAIL")
150156
results["star-fusion"][x] = " / ".join(res)
157+
elif i=="picard_IS":
158+
for x, y in sorted(samples.iteritems()):
159+
res = []
160+
if sok.has_key(x):
161+
link = "../results_picard_IS/" + x + ".txt"
162+
link = '<a href="LINK" target="_blank">OK</a>'.replace("LINK", link)
163+
res.append(link)
164+
else:
165+
res.append("FAIL")
166+
results["picard_IS"][x] = " / ".join(res)
151167
elif i=="picard":
152168
for x, y in sorted(samples.iteritems()):
153169
res = []
@@ -330,6 +346,7 @@ def stats_picard(path,samples,config):
330346
if i+n[k] in files:
331347
f = open(path+"/results_picard"+"/"+i+n[k],'r')
332348
nx = 0
349+
kdiff0 = 0
333350
for ii in f:
334351
if ii.startswith("PF_BASES"):
335352
head = ii.rstrip().split("\t")
@@ -341,17 +358,26 @@ def stats_picard(path,samples,config):
341358
vals = ii.strip("\n").split("\t")
342359
for kk in range(len(head)):
343360
stats[k][head[kk]] = vals[kk]
361+
if vals[kk] != "":
362+
if float(vals[kk]) > 0:
363+
kdiff0 += 1
344364
nx = 0
365+
if kdiff0 == 0:
366+
ex = 1
367+
break
345368
f.close()
346369
else:
347370
ex = 1
371+
break
348372
if ex ==1:
349373
tr = "<td bgcolor='#CC3300'>"+i+"</td>"
350374
o = i
351375
for ii in range(18):
352376
tr += "<td bgcolor='#CC3300'>NA</td>"
353377
o += "\tNA"
354378
tr = "<tr>"+tr+"</tr>"
379+
table.append(tr)
380+
s = ["NA" for ii in range(10)]
355381
else:
356382
align = int(stats[0]["PF_ALIGNED_BASES"])
357383
cod = int(stats[0]["CODING_BASES"])
@@ -389,27 +415,56 @@ def stats_picard(path,samples,config):
389415
tr +="<td bgcolor='#99CC66'>0</td>"
390416
o += "\t0"
391417
tr = "<tr>"+tr+"</tr>"
392-
table.append(tr)
393-
o = o.split("\t")
394-
st= 0
395-
s = []
396-
for ind in [10, 11, 12, 5, 7, 3]:
397-
s.append(str(round(float(o[ind]) * float(o[2])/float(o[1]),3)))
398-
if ind != 3:
399-
st += float(o[ind]) * float(o[2])/float(o[1])
400-
for ind in [15,16,17,18]:
401-
if o[ind] != "0":
402-
s.append(str(round(float(o[ind]) * 100,3)))
403-
else:
404-
s.append("0")
405-
s[5] = str(round(100 - st,3))
418+
table.append(tr)
419+
o = o.split("\t")
420+
st= 0
421+
s = []
422+
for ind in [10, 11, 12, 5, 7, 3]:
423+
s.append(str(round(float(o[ind]) * float(o[2])/float(o[1]),3)))
424+
if ind != 3:
425+
st += float(o[ind]) * float(o[2])/float(o[1])
426+
for ind in [15,16,17,18]:
427+
if o[ind] != "0":
428+
s.append(str(round(float(o[ind]) * 100,3)))
429+
else:
430+
s.append("0")
431+
s[5] = str(round(100 - st,3))
406432
print >> out, i + "\t" + "\t".join(s)
407433
out.close()
408434
return "<table>"+"\n".join(table)+"</table>"
409435
else:
410436
return ""
411437

412438

439+
def stats_picard_2(path,samples,config):
440+
n = os.listdir(path)
441+
hh = "\t".join(['sample_id','MEDIAN_INSERT_SIZE','MEDIAN_ABSOLUTE_DEVIATION','MIN_INSERT_SIZE',
442+
'MAX_INSERT_SIZE','MEAN_INSERT_SIZE','STANDARD_DEVIATION','READ_PAIRS', 'LINK_TXT', 'LINK_PDF'])
443+
if config.has_key("picard_IS") and ("results_picard_IS" in n):
444+
files = os.listdir(path+"/results_picard_IS")
445+
out = open(path + "/outputs/stats_picard2.txt",'w')
446+
print >> out, hh
447+
data = list()
448+
for i in sorted(samples.keys()):
449+
if i + ".txt" in files:
450+
f = open(path+"/results_picard_IS"+"/"+i+".txt",'r')
451+
k = 0
452+
while (1):
453+
j = f.readline()
454+
if j.startswith("MEDIAN_INSERT_SIZE"):
455+
j = f.readline().strip("\n").split("\t")
456+
break
457+
k += 1
458+
if k > 10 or len(j) == 0:
459+
j = ['NA' for i in range(7)]
460+
break
461+
print >> out, "\t".join([i] + j + ['<a href="../results_picard_IS/' + i + '.txt" target="_blank">+</a>', '<a href="../results_picard_IS/' + i + '.pdf" target="_blank">+</a>'])
462+
f.close()
463+
else:
464+
print >> out, "\t".join([i] + ['NA' for i in range(9)])
465+
out.close()
466+
return 1
467+
413468
def skeleton(path, path2html):
414469
print "> Building HTML and OUTPUT folders skeletons..."
415470
print " - Path: " + path
@@ -507,7 +562,7 @@ def check_config(path):
507562
# Parses the configuration file
508563
print "> Parsing configuration file..."
509564
try:
510-
z = ["trimgalore", "fastqc", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "kallisto", "varscan", "gatk"]
565+
z = ["trimgalore", "fastqc", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "kallisto", "picard_IS", "varscan", "gatk"]
511566
f = open(path + "/config.txt", 'r')
512567
analysis = dict()
513568
analysis["cluster"] = dict()

lib/programs.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,28 @@ def sam2sortbam(timestamp, path_base, folder, samples, nproc, wt, q):
366366
return submit_job_super("sam2sortbam", path_base + folder, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
367367

368368

369+
def picard_IS(timestamp, path_base, folder, samples, nproc, wt, q):
370+
output = "results_picard_IS"
371+
secure_mkdir(path_base + folder, output)
372+
print "## Picard-InsertSize"
373+
print "> Writing jobs for Picard InsertSize..."
374+
nproc, nchild, bsub_suffix = manager.get_bsub_arg(nproc, len(samples))
375+
commands = list()
376+
ksamp = sortbysize(samples)
377+
proc_files = os.listdir(path_base + folder + "/results_sam2sortbam/")
378+
for sample in ksamp:
379+
in_file = path_base + folder + "/results_sam2sortbam/" + sample + ".sorted.bam"
380+
if sample + ".sorted.bam" in proc_files:
381+
for i in range(len(config.nannots)):
382+
out_file = in_file.replace("results_sam2sortbam/", "results_picard_IS/").replace(".sorted.bam", "")
383+
call = "java -jar " + config.path_picard + "/CollectInsertSizeMetrics.jar I="+in_file+" O="+out_file+".txt H="+out_file+".pdf"
384+
commands.append(call + sample_checker.replace("#FOLDER", path_base + folder + "/results_picard_IS").replace("#SAMPLE", sample))
385+
else:
386+
print "Warning: [Picard] Sorted BAM file not found -> " + in_file
387+
create_scripts(nchild, commands, path_base, folder, output)
388+
return submit_job_super("picard_IS", path_base + folder, wt, str(nproc), q, len(samples), bsub_suffix, nchild, timestamp)
389+
390+
369391
def varscan(timestamp, path_base, folder, samples, nproc, wt, q, genome_build, args):
370392
ref = config.path_fasta.replace("#LABEL",genome_build)
371393
output = "results_varscan"

lib/spider_stats.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import os
2-
import sys
32
import config
43

54
head_star_log = ["Sample", "Links","Started job","Started mapping","Finished","Mapping speed [Mr/h]","Input reads [n]","Input read length (mean)","Uniquely mapped [n]","Uniquely mapped [%]","Mapped length","Splices [n]","Splices annotated [n]","Splices GT/AG [n]","Splices: GC/AG [n]","Splices: AT/AC [n]","Splices: Non-canonical [n]","Mismatch rate per base [%]","Deletion rate per base [%]","Deletion average length","Insertion rate per base [%]","Insertion average length","Multimapping reads [n]","Multimapping reads [%]","Multimapping reads (+) [n]","Multimapping reads (+) [%]","Unmapped reads: too many mismatches [%]","Unmapped reads: too short [%]","Unmapped reads: other [%]"]
@@ -523,7 +522,7 @@ def stats_star(path, samples):
523522
r = list()
524523
for sample in samples:
525524
if N[i].has_key(sample):
526-
if len(N[i][sample]) > j:
525+
if (len(N[i][sample]) > j) and (MT[sample] > 0):
527526
s = str(round(float(N[i][sample][j]) * 1000000000 / (float(L[ng[j]]) * MT[sample]),4))
528527
else:
529528
s = "NA"

lib/vcrparser.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,14 @@ def project_process(path_base, folder):
5555
for i in f:
5656
if not i.startswith("%"):
5757
i = i.strip("\n").split("\t")
58-
if i[0] in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "varscan", "gatk"]:
58+
if i[0] in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "picard_IS", "varscan", "gatk"]:
5959
i[1] = i[1].split("/")[0]
6060
if i[1] != "0":
6161
config[i[0]] = i[1]
62-
if config.has_key("varscan") or config.has_key("gatk"):
62+
if config.has_key("varscan") or config.has_key("gatk") or config.has_key("picard_IS") :
6363
config["sam2sortbam"] = 1
6464
if len(config) > 0:
65-
for pg in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "sam2sortbam", "varscan", "gatk"]:
65+
for pg in ["trimgalore", "fastqc", "kallisto", "star", "star-fusion", "picard", "htseq-gene", "htseq-exon", "sam2sortbam", "picard_IS", "varscan", "gatk"]:
6666
if config.has_key(pg):
6767
print "Process: " + pg
6868
if not pids.has_key(pg):

0 commit comments

Comments
 (0)