@@ -4,7 +4,7 @@ S. Sevilla
44P. Homan
55
66* Overview *
7- - Multiplexed samples are split based on provided barcodes and named using provide manifests
7+ - Multiplexed samples are split based on provided barcodes and named using provide manifests, maximum 10 samples
88- Adaptors are stripped from samples
99- Samples are unzipped and split into smaller fastq files to increase speed
1010- Samples are aligned using NovaAlign
@@ -14,8 +14,6 @@ P. Homan
1414* Requirements *
1515- Read specific input requirements, and execution information on the Wikipage
1616located at: https://github.com/RBL-NCI/iCLIP.git
17-
18- Pipeline info: activeDev 05272021
1917'''
2018
2119report : "report/workflow.rst"
8987if (splice_aware == 'N' ):
9088 align_list = ["unaware" ]
9189else :
92- align_list = ["unaware" , "masked" , " unmasked" ]
90+ align_list = ["unmasked" ]
9391
9492###############################################################
9593# create sample lists
@@ -279,14 +277,6 @@ def get_align_input(wildcards):
279277 f1 = join (out_dir ,'04_sam' ,'03_genomic' ,'{sp}.{al}.split.{n}.sam' ),
280278 return (f1 )
281279
282- #determine dedup input based on splcie_aware flag
283- def input_mapq_corrected_bam (wildcards ):
284- if (splice_aware == "N" ):
285- f1 = join (out_dir ,'09_dedup' ,'01_bam' ,'{sp}.unaware.dedup.bam' ),
286- else :
287- f1 = join (out_dir ,'10_mapq_score' ,'{sp}.mapq_recalculated.bam' ),
288- return (f1 )
289-
290280###############################################################
291281# main code
292282###############################################################
@@ -318,10 +308,14 @@ else:
318308
319309## if samples are spliced
320310if splice_aware == 'Y' :
311+ input_unmapped = expand (join (out_dir ,'04_sam' ,'04_unmapped' ,'{sp}.{al}.complete.bam' ), sp = sp_list , al = align_list )
312+
321313 input_splice = [expand (join (out_dir ,'10_mapq_score' ,'{sp}.readids.txt' ), sp = sp_list ),
322314 expand (join (out_dir ,'10_mapq_score' ,'{sp}.unaware.subset.bam' ),sp = sp_list ),
323315 expand (join (out_dir ,'10_mapq_score' ,'{sp}.mapq_recalculated.bam' ),sp = sp_list )]
324316else :
317+ input_unmapped = expand (join (out_dir ,'09_dedup' ,'01_bam' ,'{sp}.{al}.dedup.bam' ), sp = sp_list , al = align_list ),
318+
325319 input_splice = [expand (join (out_dir ,'09_dedup' ,'01_bam' ,'{sp}.{al}.dedup.bam' ),sp = sp_list , al = align_list )]
326320
327321#local rules
@@ -384,30 +378,27 @@ rule all:
384378 join (out_dir ,'qc' ,'qc_report.html' ),
385379
386380 #Unmapped read output
387- expand ( join ( out_dir , '04_sam' , '04_unmapped' , '{sp}.{al}.complete.bam' ), sp = sp_list , al = align_list ) ,
381+ input_unmapped ,
388382
389383 #Deduplicate
390- expand (join (out_dir ,'09_dedup' ,'01_bam' ,'{sp}.unmasked .dedup.bam' ), sp = sp_list ),
384+ expand (join (out_dir ,'09_dedup' ,'01_bam' ,'{sp}.{al} .dedup.bam' ), sp = sp_list , al = align_list ),
391385
392- #MapQ recalculation
393- #input_splice,
394-
395- # #Bam processing
396- # expand(join(out_dir,'09_dedup','03_unique','{sp}.dedup.unique.i.bam'), sp=sp_list),
386+ #Bam processing
387+ expand (join (out_dir ,'09_dedup' ,'03_unique' ,'{sp}.dedup.unique.i.bam' ), sp = sp_list ),
397388
398- # # Bed files
399- # expand(join(out_dir,'11_bed','{sp}_all.bed'), sp=sp_list),
400- # expand(join(out_dir,'11_bed','{sp}_unique.bed'), sp=sp_list),
389+ #Bed files
390+ expand (join (out_dir ,'11_bed' ,'{sp}_all.bed' ), sp = sp_list ),
391+ expand (join (out_dir ,'11_bed' ,'{sp}_unique.bed' ), sp = sp_list ),
401392
402- # # SAF
403- # expand(join(out_dir,'12_SAF/{sp}_'+ str(nt_merge) +'_all.SAF'), sp=sp_list),
404- # expand(join(out_dir,'12_SAF/{sp}_'+ str(nt_merge) +'_unique.SAF'), sp=sp_list),
393+ #SAF
394+ expand (join (out_dir ,'12_SAF/{sp}_' + str (nt_merge ) + '_all.SAF' ), sp = sp_list ),
395+ expand (join (out_dir ,'12_SAF/{sp}_' + str (nt_merge ) + '_unique.SAF' ), sp = sp_list ),
405396
406- # # Count features
407- # expand(join(out_dir,'14_counts ','uniquereadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'), sp=sp_list),
408- # expand(join(out_dir,'14_counts ','uniquereadpeaks','{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'), sp=sp_list),
409- # expand(join(out_dir,'14_counts ','allreadpeaks','{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'), sp=sp_list),
410- # expand(join(out_dir,'14_counts ','allreadpeaks','{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'), sp=sp_list),
397+ #Count features
398+ expand (join (out_dir ,'13_counts ' ,'uniquereadpeaks' ,'{sp}_' + str (nt_merge ) + '_uniqueCounts.txt' ), sp = sp_list ),
399+ expand (join (out_dir ,'13_counts ' ,'uniquereadpeaks' ,'{sp}_' + str (nt_merge ) + '_allFracMMCounts.txt' ), sp = sp_list ),
400+ expand (join (out_dir ,'13_counts ' ,'allreadpeaks' ,'{sp}_' + str (nt_merge ) + '_uniqueCounts.txt' ), sp = sp_list ),
401+ expand (join (out_dir ,'13_counts ' ,'allreadpeaks' ,'{sp}_' + str (nt_merge ) + '_allFracMMCounts.txt' ), sp = sp_list ),
411402
412403 # #In progress
413404 # join(out_dir,'15_annotation', 'project',,'annotations.txt'),
@@ -418,8 +409,13 @@ rule all:
418409 # #expand(join(out_dir,'14_annotation','{sp}_'+ str(nt_merge) +'_MD15.txt'),zip, mp=mp_list, sp=sp_list),
419410 #expand(join(out_dir,'14_annotation','{sp}_'+ str(nt_merge) +'_MD15.html'),zip, mp=mp_list, sp=sp_list),
420411
421- include : join (source_dir ,"workflow/rules/common.smk" )
422- include : join (source_dir ,"workflow/rules/other.smk" )
412+ #common and other SMK
413+ if source_dir == "" :
414+ include : "rules/common.smk"
415+ include : "rules/other.smk"
416+ else :
417+ include : join (source_dir ,"workflow/rules/common.smk" )
418+ include : join (source_dir ,"workflow/rules/other.smk" )
423419
424420###############################################################
425421# snakemake rules
@@ -1203,18 +1199,18 @@ else:
12031199
12041200rule dedup :
12051201 """
1206- deduplicate merged.i.bam files
1202+ deduplicate
12071203 """
12081204 input :
1209- f1 = join (out_dir ,'08_bam_merged' ,'{sp}.unmasked .merged.si.bam' ),
1205+ f1 = join (out_dir ,'08_bam_merged' ,'{sp}.{al} .merged.si.bam' ),
12101206 params :
12111207 rname = '23_dedup' ,
12121208 umi = umi_parameter
12131209 envmodules :
12141210 config ['umitools' ]
12151211 output :
1216- o1 = join (out_dir ,'09_dedup' ,'01_bam' ,'{sp}.unmasked .dedup.bam' ),
1217- o2 = join (out_dir ,'09_dedup' ,'01_bam' ,'{sp}.unmasked .dedup.log' ),
1212+ o1 = join (out_dir ,'09_dedup' ,'01_bam' ,'{sp}.{al} .dedup.bam' ),
1213+ o2 = join (out_dir ,'09_dedup' ,'01_bam' ,'{sp}.{al} .dedup.log' ),
12181214 shell :
12191215 """
12201216 umi_tools dedup \
@@ -1224,102 +1220,12 @@ rule dedup:
12241220 --log2stderr -L {output.o2};
12251221 """
12261222
1227- #pipeline splits for mapq score recalculation on splice_aware samples
1228- if (splice_aware == 'Y' ):
1229- rule generate_readids :
1230- """
1231- generate readids from unmasked files
1232- """
1233- input :
1234- f1 = join (out_dir ,'09_dedup' ,'01_bam' ,'{sp}.unmasked.dedup.bam' )
1235- params :
1236- rname = '24a_readids' ,
1237- envmodules :
1238- config ['samtools' ]
1239- output :
1240- o1 = join (out_dir ,'10_mapq_score' ,'{sp}.readids.txt' ),
1241- shell :
1242- """
1243- samtools view {input.f1}|cut -f1|sort|uniq > {output.o1}
1244- """
1245-
1246- rule subsample_reads :
1247- """
1248- subsample:
1249- A) splice unaware BAM
1250- B) splice aware (masked exon) BAM
1251-
1252- using the readids from splice aware (unmasked exon) in rule generate_readids
1253- """
1254- input :
1255- id = join (out_dir ,'10_mapq_score' ,'{sp}.readids.txt' ),
1256- unaware = join (out_dir ,'08_bam_merged' ,'{sp}.unaware.merged.si.bam' ),
1257- masked = join (out_dir ,'08_bam_merged' ,'{sp}.masked.merged.si.bam' ),
1258- params :
1259- rname = '24b_subsample' ,
1260- script = join (source_dir ,'workflow' ,'scripts' ,'06_filter_bam_by_readids.py' ),
1261- envmodules :
1262- config ['python' ]
1263- output :
1264- unaware = join (out_dir ,'10_mapq_score' ,'{sp}.unaware.subset.bam' ),
1265- masked = join (out_dir ,'10_mapq_score' ,'{sp}.masked.subset.bam' ),
1266- shell :
1267- """
1268- python {params.script} --inputBAM {input.unaware} --outputBAM {output.unaware} --readids {input.id};
1269- python {params.script} --inputBAM {input.masked} --outputBAM {output.masked} --readids {input.id}
1270- """
1271-
1272- rule mapq_recalc :
1273- """
1274- input deduplicate (C) BAM file and the 2 BAMs subset A) and subset B) pysam script for MAPQ
1275- correction - outputs D) updated mapq score bam file
1276- """
1277- input :
1278- unaware = join (out_dir ,'10_mapq_score' ,'{sp}.unaware.subset.bam' ),
1279- masked = join (out_dir ,'10_mapq_score' ,'{sp}.masked.subset.bam' ),
1280- unmasked = join (out_dir ,'09_dedup' ,'01_bam' ,'{sp}.unmasked.dedup.bam' ),
1281- params :
1282- rname = '24c_recalc' ,
1283- script = join (source_dir ,'workflow' ,'scripts' ,'07_correct_mapq.py' ),
1284- unaware = join (out_dir ,'10_mapq_score' ,'{sp}.unaware.subset.bam' ),
1285- masked = join (out_dir ,'10_mapq_score' ,'{sp}.masked.subset.bam' ),
1286- envmodules :
1287- config ['python' ]
1288- output :
1289- bam = join (out_dir ,'10_mapq_score' ,'{sp}.mapq_recalculated.bam' ),
1290- tsv = join (out_dir ,'10_mapq_score' ,'{sp}.mapq_recalculated.tsv' ),
1291- shell :
1292- """
1293- python {params.script} \
1294- --inputBAM1 {input.unaware} --inputBAM2 {input.masked} --inputBAM3 {input.unmasked} \
1295- --outBAM {output.bam} --out {output.tsv}
1296- """
1297-
1298- rule mapq_stats :
1299- """
1300- TODO
1301- **Use D) for visualization
1302- """
1303- input :
1304- f1 = expand (join (out_dir ,'10_mapq_score' ,'{sp}.mapq_recalculated.bam' ),sp = sp_list ),
1305- params :
1306- rname = '24c_mapq_stats' ,
1307- script = join (source_dir ,'workflow' ,'scripts' ,'.py' ),
1308- envmodules :
1309- config ['R' ]
1310- output :
1311- o1 = join (out_dir ,'10_mapq_score' ,'report.pdf' ),
1312- shell :
1313- """
1314- Rscript {params.script} {input.f1} {output.o1}
1315- """
1316-
13171223rule sort_index_dedup :
13181224 """
13191225 sort dedup.bam file
13201226 """
13211227 input :
1322- f1 = input_mapq_corrected_bam
1228+ f1 = expand ( join ( out_dir , '09_dedup' , '01_bam' , '{{sp}}.{al}.dedup.bam' ), al = align_list ),
13231229 params :
13241230 rname = '25_si_dedup' ,
13251231 envmodules :
@@ -1456,8 +1362,8 @@ rule feature_counts_allreads:
14561362 envmodules :
14571363 config ['subread' ]
14581364 output :
1459- out_unique = join (out_dir ,'14_counts ' ,'allreadpeaks' ,'{sp}_' + str (nt_merge ) + '_uniqueCounts.txt' ),
1460- out_all = join (out_dir ,'14_counts ' ,'allreadpeaks' ,'{sp}_' + str (nt_merge ) + '_allFracMMCounts.txt' )
1365+ out_unique = join (out_dir ,'13_counts ' ,'allreadpeaks' ,'{sp}_' + str (nt_merge ) + '_uniqueCounts.txt' ),
1366+ out_all = join (out_dir ,'13_counts ' ,'allreadpeaks' ,'{sp}_' + str (nt_merge ) + '_allFracMMCounts.txt' )
14611367 shell :
14621368 """
14631369 featureCounts -F SAF \
@@ -1498,8 +1404,8 @@ rule feature_counts_uniquereads:
14981404 envmodules :
14991405 config ['subread' ]
15001406 output :
1501- out_unique = join (out_dir ,'14_counts ' ,'uniquereadpeaks' ,'{sp}_' + str (nt_merge ) + '_uniqueCounts.txt' ),
1502- out_all = join (out_dir ,'14_counts ' ,'uniquereadpeaks' ,'{sp}_' + str (nt_merge ) + '_allFracMMCounts.txt' )
1407+ out_unique = join (out_dir ,'13_counts ' ,'uniquereadpeaks' ,'{sp}_' + str (nt_merge ) + '_uniqueCounts.txt' ),
1408+ out_all = join (out_dir ,'13_counts ' ,'uniquereadpeaks' ,'{sp}_' + str (nt_merge ) + '_allFracMMCounts.txt' )
15031409 shell :
15041410 """
15051411 featureCounts -F SAF \
@@ -1530,7 +1436,7 @@ rule feature_counts_uniquereads:
15301436# generate annotation table once per project
15311437# """
15321438# input:
1533- # expand(join(out_dir,'14_counts ', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'), sp=sp_list),
1439+ # expand(join(out_dir,'13_counts ', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'), sp=sp_list),
15341440# params:
15351441# rname='36_proj_anno',
15361442# script = join(source_dir,'workflow','scripts','06_annotation.R'),
@@ -1578,8 +1484,8 @@ rule feature_counts_uniquereads:
15781484
15791485# '''
15801486# input:
1581- # unique = join(out_dir,'14_counts ', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'),
1582- # all = join(out_dir,'14_counts ', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'),
1487+ # unique = join(out_dir,'13_counts ', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'),
1488+ # all = join(out_dir,'13_counts ', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'),
15831489# anno = join(out_dir,'15_annotation', 'project','annotations.txt'),
15841490# params:
15851491# rname = '37_peak_anno',
0 commit comments