Skip to content

Commit 1388a97

Browse files
authored
Merge pull request #146 from RBL-NCI/activeDev
Active dev
2 parents fe78ad6 + 1be71e4 commit 1388a97

7 files changed

Lines changed: 116 additions & 99 deletions

File tree

config/cluster_config.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ __default__:
99
error: .%j.{wildcards}.err
1010

1111
qc_barcode:
12-
threads: 4
13-
mem: 100g
12+
threads: 8
13+
mem: 75g
1414
time: 00-04:00:00
1515

1616
demultiplex:
@@ -34,8 +34,8 @@ qc_screen_validator:
3434
star:
3535
time: 04-00:00:00
3636
gres: lscratch:800
37-
threads: 32
38-
mem: 85g
37+
threads: 16
38+
mem: 120g
3939

4040
index_stats:
4141
threads: 8

config/snakemake_config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ contrastManifest: "OUTPUT_DIR/manifests/contrasts.tsv"
2222
multiplexflag: "Y" #whether samples are multiplexed ["Y","N"]
2323
umiSeparator: "rbc:" #required for nondemultiplexed samples to determine delimiter for deduplication [":", "_", "rbc:"]
2424
mismatch: 1 #required for multiplexed samples, number of bp mismatches allowed in demultiplexing [1,2,3]
25+
barcode_qc_flag: "PROCESS" #barcodes will undergo QC to ensure uniformity within samples; if set to IGNORE this qc step will be skipped
2526
reference: "" #reference organism ["hg38","mm10"]
2627
filterlength: 20 #minimum read length to include in analysis [any int >20]
2728
phredQuality: 20 #minimum quality score for 3’ end trimming

run_snakemake.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ fi
3535
# functions
3636
#########################################################
3737
load_modules(){
38-
if [[ $1 =~ "python" ]]; then module load python/3.8; fi
38+
if [[ $1 =~ "python" ]]; then module load python/3.7; fi
3939
if [[ $1 =~ "snakemake" ]]; then module load snakemake/7.19.1; fi
4040
if [[ $1 =~ "graphviz" ]]; then module load graphviz/2.40; fi
4141
}
@@ -194,7 +194,7 @@ check_manifest_qc(){
194194
echo "The barcode manifest check FAILED. Check "${bc_file}" for more information."
195195
exit 1
196196
else
197-
echo "-- barcode check completed successfully (sample $mp_id)"
197+
echo "-- barcode manifest check completed successfully (sample $mp_id)"
198198
fi
199199
done
200200
fi

workflow/Snakefile

Lines changed: 86 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ contrast_manifest = config['contrastManifest']
4141
# user parameters
4242
multiplex_flag = config['multiplexflag'].capitalize()
4343
mismatch = config['mismatch']
44+
barcode_qc_flag = config['barcode_qc_flag']
4445
species_ref = config['reference']
4546
filter_length = config['filterlength']
4647
phredQuality = config['phredQuality']
@@ -219,6 +220,28 @@ def rename_cmd(wildcards):
219220
cmd = 'mv ' + source + ' ' + destination + '; ' + cmd
220221
return(cmd)
221222

223+
def get_input_qc_troubleshoot(wildcards):
224+
print("HERE")
225+
d=dict()
226+
227+
l = "log_list"
228+
t = "txt_bc"
229+
p = "png_bc"
230+
231+
if (multiplex_flag == 'Y'):
232+
print("m flag on")
233+
d[l] = expand(join(out_dir,'log','STAR','{sp}.log'),sp=sp_list),
234+
d[t] = expand(join(out_dir,'{mp}', '01_qc_post','{mp}_barcode_passed.txt'),mp=samp_dict.keys())
235+
if (barcode_qc_flag == "PROCESS"):
236+
print("b flag on")
237+
d[p] = expand(join(out_dir,'{mp}', '01_qc_post','{mp}_barcode.png'),mp=samp_dict.keys())
238+
else:
239+
print("m flag off")
240+
d[l] = expand(join(out_dir,'log','STAR','{sp}.log'),sp=sp_list),
241+
print(d)
242+
d="YO"
243+
return (d)
244+
222245
#read in contrast list for manorm
223246
def get_de_list():
224247

@@ -381,7 +404,7 @@ rule all:
381404
# multiqc
382405
join(out_dir,'qc','multiqc_report.html'),
383406

384-
# qc_troubleshoot
407+
# # qc_troubleshoot
385408
join(out_dir,'qc','qc_report_troubleshooting.html'),
386409

387410
# dedup
@@ -408,7 +431,7 @@ rule all:
408431
# # qc barcode
409432
# expand(join(out_dir,'{mp}','01_qc_post','{mp}_barcode_counts.txt'),mp=mp_list),
410433
# expand(join(out_dir,'{mp}','01_qc_post','{mp}_barcode.png'),mp=mp_list),
411-
# expand(join(out_dir,'{mp}','01_qc_post','{mp}_barcode.txt'),mp=mp_list),
434+
# expand(join(out_dir,'{mp}','01_qc_post','{mp}_barcode_passed.txt'),mp=mp_list),
412435

413436
# # demux OR nondemux
414437
# expand(join(out_dir,'{mp}', '02_preprocess','ultraplex_demux_5bc_no_match.fastq.gz'), mp=samp_dict.keys()),
@@ -462,9 +485,11 @@ else:
462485
if (multiplex_flag == 'Y'):
463486
rule qc_barcode:
464487
"""
465-
generate counts of barcodes and output to text file
466-
will run python script that determines barcode expected and generates mismatches based on input
467-
output barplot with top barcode counts
488+
if the flag in the config file is set to process, barcodes will be reviewed to ensure uniformtiy amongst samples.
489+
- generate counts of barcodes and output to text file
490+
- will run python script that determines barcode expected and generates mismatches based on input
491+
- output barplot with top barcode counts
492+
If it's set to IGNORE, then it will skip this processing and create the needed QC file
468493
"""
469494
input:
470495
fq = get_fq_names,
@@ -473,30 +498,40 @@ if (multiplex_flag == 'Y'):
473498
memG = getmemG_80perc("qc_barcode"),
474499
R = join(source_dir,'workflow', 'scripts', '02_barcode_qc.R'),
475500
base = join(out_dir,'{mp}','01_qc_post'),
501+
qc_dir=join(out_dir,'qc'),
476502
mm = mismatch,
477503
bc_len = barcode_length,
478-
start_pos = 6 if barcode_length==6 else 4
504+
start_pos = 6 if barcode_length==6 else 4,
505+
b_qc_flag=barcode_qc_flag
479506
threads: getthreads("qc_barcode")
480507
envmodules:
481508
config['R'],
482509
output:
483510
counts = temp(join(out_dir,'{mp}','01_qc_post','{mp}_barcode_counts.txt')),
484511
png = temp(join(out_dir,'{mp}','01_qc_post','{mp}_barcode.png')),
485-
txt = temp(join(out_dir,'{mp}','01_qc_post','{mp}_barcode.txt'))
512+
txt = join(out_dir,'{mp}','01_qc_post','{mp}_barcode_passed.txt')
486513
shell:
487514
"""
488515
set -exo pipefail
489-
gunzip -c {input.fq} \\
490-
| awk 'NR%4==2 {{print substr($0, {params.start_pos}, {params.bc_len});}}' \\
491-
| LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=/lscratch/${{SLURM_JOB_ID}} -n \\
492-
| uniq -c > {output.counts};
493-
494-
Rscript {params.R} --sample_manifest {sample_manifest} \\
495-
--multiplex_manifest {multiplex_manifest} \\
496-
--barcode_input {output.counts} \\
497-
--mismatch {params.mm} \\
498-
--mpid {wildcards.mp} \\
499-
--output_dir {params.base}
516+
517+
if [[ {params.b_qc_flag} == "PROCESS" ]]; then
518+
gunzip -c {input.fq} \\
519+
| awk 'NR%4==2 {{print substr($0, {params.start_pos}, {params.bc_len});}}' \\
520+
| LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=/lscratch/${{SLURM_JOB_ID}} -n \\
521+
| uniq -c > {output.counts};
522+
523+
Rscript {params.R} --sample_manifest {sample_manifest} \\
524+
--multiplex_manifest {multiplex_manifest} \\
525+
--barcode_input {output.counts} \\
526+
--mismatch {params.mm} \\
527+
--mpid {wildcards.mp} \\
528+
--output_dir {params.base} \\
529+
--qc_dir {params.base}
530+
else
531+
echo "Barcode QC checking was skipped for this run" > {output.txt}
532+
touch {output.png}
533+
touch {output.counts}
534+
fi
500535
"""
501536

502537
rule demultiplex:
@@ -785,7 +820,7 @@ rule star:
785820
--sjdbScore {params.s_sj} \
786821
--winAnchorMultimapNmax {params.s_anchor}
787822
788-
# sort file\
823+
# sort file
789824
samtools sort -m 80G -T $tmp_dir $tmp_dir/{params.out_prefix}Aligned.out.bam -o $tmp_dir/{params.out_prefix}Aligned.sortedByCoord.out.bam
790825
791826
# move STAR files and final log file to output
@@ -861,66 +896,44 @@ rule multiqc:
861896
-o {params.out_dir}
862897
"""
863898

864-
# Pipeline splits to handle multiplexed QC vs non-multiplexed QC
865-
if (multiplex_flag == 'Y'):
866-
rule qc_troubleshoot:
867-
"""
868-
generates a PDF of barcode plots and alignment plots for qc troubleshooting
899+
rule qc_troubleshoot:
900+
"""
901+
generates a PDF of barcode plots and alignment plots for qc troubleshooting
902+
"""
903+
input:
904+
log_list=expand(join(out_dir,'log','STAR','{sp}.log'),sp=sp_list),
905+
params:
906+
rname = "08_qc_troubleshoot",
907+
R = join(source_dir,'workflow','scripts','03_qc_report.Rmd'),
908+
qc = join(source_dir,'workflow','scripts','03_qc_unmapped_check.sh'),
909+
log_dir = join(out_dir,'log','STAR'),
910+
single_qc = single_qc_threshold,
911+
proj_qc = project_qc_threshold,
912+
m_flag=multiplex_flag,
913+
b_flag=barcode_qc_flag,
914+
txt_bc=expand(join(out_dir,'{mp}', '01_qc_post','{mp}_barcode_passed.txt'),mp=samp_dict.keys())
915+
envmodules:
916+
config['R']
917+
output:
918+
html = join(out_dir,'qc','qc_report_troubleshooting.html'),
919+
qc = join(out_dir, 'log','STAR','qc_pass.txt')
920+
shell:
869921
"""
870-
input:
871-
log_list = expand(join(out_dir,'log','STAR','{sp}.log'),sp=sp_list),
872-
txt_bc = expand(join(out_dir,'{mp}', '01_qc_post','{mp}_barcode.txt'),mp=samp_dict.keys()),
873-
png_bc = expand(join(out_dir,'{mp}', '01_qc_post','{mp}_barcode.png'),mp=samp_dict.keys())
874-
params:
875-
rname = "08_qc_troubleshoot",
876-
R = join(source_dir,'workflow','scripts','03_qc_report.Rmd'),
877-
qc = join(source_dir,'workflow','scripts','03_qc_unmapped_check.sh'),
878-
log_dir = join(out_dir,'log','STAR'),
879-
single_qc = single_qc_threshold,
880-
proj_qc = project_qc_threshold
881-
envmodules:
882-
config['R']
883-
output:
884-
html = join(out_dir,'qc','qc_report_troubleshooting.html'),
885-
qc = join(out_dir, 'log','STAR','qc_pass.txt')
886-
shell:
887-
"""
922+
if [[ {params.m_flag} == "Y" ]] && [[ {params.b_flag} == "PROCESS" ]] ; then
888923
Rscript -e 'library(rmarkdown); \
889924
rmarkdown::render("{params.R}",
890925
output_file = "{output.html}", \
891-
params= list(log_list = "{input.log_list}", \
892-
b_txt = "{input.txt_bc}"))'
893-
894-
sh {params.qc} {params.log_dir} {params.single_qc} {params.proj_qc}
895-
"""
896-
else:
897-
rule qc_troubleshoot:
898-
"""
899-
generates a PDF of barcode plots and alignment plots for qc troubleshooting
900-
"""
901-
input:
902-
log_list = expand(join(out_dir,'log','STAR','{sp}.log'),sp=sp_list),
903-
params:
904-
rname = "08_qc_troubleshoot",
905-
R = join(source_dir,'workflow','scripts','03_qc_report.Rmd'),
906-
qc = join(source_dir,'workflow','scripts','03_qc_unmapped_check.sh'),
907-
log_dir = join(out_dir,'log','STAR'),
908-
single_qc = single_qc_threshold,
909-
proj_qc = project_qc_threshold
910-
envmodules:
911-
config['R']
912-
output:
913-
html = join(out_dir,'qc','qc_report_troubleshooting.html'),
914-
qc = join(out_dir, 'log','STAR','qc_pass.txt')
915-
shell:
916-
"""
926+
params= list(log_list = "{input.log_list}", \
927+
b_txt = "{params.txt_bc}"))'
928+
else
917929
Rscript -e 'library(rmarkdown); \
918-
rmarkdown::render("{params.R}",
930+
rmarkdown::render("{params.R}",
919931
output_file = "{output.html}", \
920-
params= list(log_list = "{input.log_list}"))'
921-
922-
sh {params.qc} {params.log_dir} {params.single_qc} {params.proj_qc}
923-
"""
932+
params= list(log_list = "{input.log_list}"))'
933+
fi
934+
935+
sh {params.qc} {params.log_dir} {params.single_qc} {params.proj_qc}
936+
"""
924937

925938
rule dedup:
926939
"""

workflow/scripts/02_barcode_qc.R

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ parser$add_argument("-bu","--barcode_input", dest="barcode_input", required=TRUE
2020
parser$add_argument("-o","--output_dir", dest="output_dir", required=TRUE, help="dir for barcode plots and summary")
2121
parser$add_argument("-m","--mismatch", dest="mismatch", required=TRUE, help="number of mismatches allowed")
2222
parser$add_argument("-i","--mpid", dest="mpid", required=TRUE, help="sampleid")
23+
parser$add_argument("-q","--qc_dir", dest="qc_dir", required=TRUE, help="dir for failed QC manifests")
2324

2425
args <- parser$parse_args()
2526
sample_manifest = args$sample_manifest
@@ -32,12 +33,12 @@ mpid = args$mpid
3233
#test input
3334
testing="N"
3435
if(testing=="Y"){
35-
sample_manifest = "~/../../Volumes/CCBR_Pipeliner/iCLIP/test/sample_manifests/sample_mm10_one.tsv"
36-
multiplex_manifest = "~/../../Volumes/CCBR_Pipeliner/iCLIP/test/multiplex_manifests/multiplex_mm10_one.tsv"
37-
barcode_input = "~/../../Volumes/sevillas2-1/test.barcode_counts.txt"
38-
output_dir = "~/../../Volumes/sevillas2-1"
36+
sample_manifest = "~/../../Volumes/RBL_NCI/Wolin/mov10_par_Y_r2_01062023/manifests/samples.tsv"
37+
multiplex_manifest = "~/../../Volumes/RBL_NCI/Wolin/mov10_par_Y_r2_01062023/manifests/multiplex.tsv"
38+
barcode_input = "~/../../Volumes/RBL_NCI/Wolin/mov10_par_Y_r2_01062023/sam_testing/barcode_counts.txt"
39+
output_dir = "~/../../Volumes/RBL_NCI/Wolin/mov10_par_Y_r2_01062023/sam_testing"
3940
mismatch = 1
40-
mpid="test_1"
41+
mpid="CLIP"
4142
}
4243

4344
#Read sample/multiplex manifests
@@ -90,9 +91,7 @@ for (rowid in rownames(barcode_combo)){
9091

9192
#if there were errors, print message and exit
9293
if(error_message!=""){
93-
cat(paste0("The number of differences between barocdes is less than, or equal to, the number of mismatches
94-
in the following barcodes:\n",error_message))
95-
#exit()
94+
cat(paste0("The number of differences between barocdes is less than, or equal to, the number of mismatches in the following barcodes:\n",error_message))
9695
} else{
9796
print("Barcode schema passes QC")
9897
}
@@ -154,7 +153,7 @@ diff_lis=setdiff(paste0("*",bc_exp),df_counts_merged$barcode_id)
154153
if(length(diff_lis)==0){
155154
print("Barcodes observed pass QC")
156155
#print text file
157-
file_save = paste0(output_dir,'/',mpid,'_barcode.txt')
156+
file_save = paste0(output_dir,'/',mpid,'_barcode_passed.txt')
158157
fileConn<-file(file_save)
159158
line1 = paste0("\n* SampleID ",mpid, " \n")
160159
line2 = paste0("\t + Number of mismatches allowed ", mismatch, " \n")
@@ -179,7 +178,7 @@ if(length(diff_lis)==0){
179178
ggsave(file_save,p)
180179

181180
} else{
182-
file_save = paste0(output_dir,'/',mpid,'_barcode_errors.txt')
181+
file_save = paste0(qc_dir,'/',mpid,'_barcode_errors.txt')
183182
fileConn<-file(file_save)
184183
line1 = paste0("\n* SampleID ",mpid, " \n")
185184
line2 = paste0("\t + Number of mismatches allowed ", mismatch, " \n")
@@ -194,4 +193,8 @@ if(length(diff_lis)==0){
194193

195194
writeLines(paste(line1,line2,line3,line4,line5,sep=""), fileConn)
196195
close(fileConn)
196+
197+
print(paste0("Barcodes failed QC check. Review file located here:",file_save,"."))
198+
print(paste0("To check the reads go to the /project/01_preprocess/01_fastq/ dir and run `zcat {name_of_sample} | wc -l. Divide this number by 4 to get the total number of reads"))
199+
print(paste0("If satisfied with this read count change the config/snakemake_config file barcode_qc_flag to IGNORE"))
197200
}

workflow/scripts/06_annotation.Rmd

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
title: ""
33
output: html_document
44
params:
5-
samplename: "Ro_Clip_u"
6-
peak_in: "/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mESC_clip_3_clip3_u/04_annotation/02_peaks/Ro_Clip_u_ALLreadPeaks_annotation_complete.txt"
7-
output_table: "/Users/homanpj/OneDrive - National Institutes of Health/Loaner/Wolin/CLIP/mESC_clip_3_clip3_u/04_annotation/Ro_Clip_u_peakannotation_finalx.txt"
5+
samplename: "YDKO_IgG_500"
6+
peak_in: "~/../../Volumes/RBL_NCI/Wolin/mov10_par_Y_r2_01062023/04_annotation/02_peaks/YDKO_IgG_500_ALLreadPeaks_annotation_complete.txt"
7+
output_table: "~/../../Volumes/RBL_NCI/Wolin/mov10_par_Y_r2_01062023/04_annotation/YDKO_IgG_500_annotation_ALLreadPeaks_final_table.txt"
88
readdepth: 3
99
PeakIdnt: "ALL" #MultiMap, Unique, all
1010
# params:
@@ -20,7 +20,7 @@ params:
2020
# readdepth: 5
2121
# PeakIdnt: "ALL" #MultiMap, Unique, all
2222
---
23-
23+
2424
<!-- # Load libraries, set theme -->
2525
```{r envirment, include=F ,echo=F,warning=F,message=FALSE}
2626
rm(list=setdiff(ls(), "params"))

0 commit comments

Comments
 (0)