Skip to content

Commit fb473c1

Browse files
authored
Merge pull request #79 from RBL-NCI/activeDev
v0.6
2 parents 65365f0 + 7efe557 commit fb473c1

4 files changed

Lines changed: 54 additions & 149 deletions

File tree

.tests/snakemake_config.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,9 @@ nt_merge: 50 #minimum distance of nucleotides to merge peaks [10,20,30,40,50,60]
2424
peak_id: "all" #report peaks for unique peaks only or unique and fractional mm ["unique","all"]
2525
DE_method: "manorm" #choose DE method ["manorm","none"]
2626
splice_junction: "Y" #include splice junctions in peak calls
27-
SY_flag: "Y" #if mm10, flag to run additional annotations with Soyeong's BED files
2827

2928
#modules, container parameters
30-
container_dir: "/data/RBL_NCI/iCLIP/container"
29+
container_dir: "/data/CCBR_Pipeliner/iCLIP/container"
3130
bedtools: "bedtools/2.29.2"
3231
bowtie2: "bowtie/2-2.3.4"
3332
fastq_screen: "fastq_screen/0.14.0"
@@ -37,8 +36,8 @@ multiqc: "multiqc/1.9"
3736
novocraft: "novocraft/4.03.01"
3837
perl: "perl/5.24.3"
3938
python: "python/3.7"
40-
Qt: "Qt/5.14.2"
41-
singularity: "singularity/3.7.0"
39+
Qt: "Qt/5.13.2"
40+
singularity: "singularity"
4241
samtools: "samtools/1.11"
4342
umitools: "umitools/1.1.1"
4443
subread: "subread/2.0.1"

config/index_config.yaml

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
hg38:
2-
std: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/hg38/06_final/hg38_final_nosplicing.nix'
2+
std: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/hg38/hg38.nix'
33
spliceaware_unmasked:
4-
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/hg38/06_final/hg38_final_unmaskedexon_46.nix'
5-
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/hg38/06_final/hg38_final_unmaskedexon_71.nix'
4+
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/hg38/gencode.v32.chr_patch_hapl_scaff.annotation.gtf.SplicedTransc_46.nix'
5+
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/hg38/gencode.v32.chr_patch_hapl_scaff.annotation.gtf.SplicedTransc_71.nix'
66
spliceaware_masked:
7-
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/hg38/06_final/hg38_final_maskedexon_46.nix'
8-
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/hg38/06_final/hg38_final_maskedexon_71.nix'
7+
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/hg38/gencode.v32.chr_patch_hapl_scaff.annotation.gtf.SplicedTransc.maskedexon_46.nix'
8+
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/hg38/gencode.v32.chr_patch_hapl_scaff.annotation.gtf.SplicedTransc.maskedexon_71.nix'
99
gencode_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/hg38/Gencode_V32/fromGencode/gencode.v32.annotation.gtf.txt'
1010
refseq_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/hg38/NCBI_RefSeq/GCF_000001405.39_GRCh38.p13_genomic.gtf.txt'
1111
canonical_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/hg38/Gencode_V32/fromUCSC/KnownCanonical/KnownCanonical_GencodeM32_GRCh38.txt'
@@ -14,13 +14,13 @@ hg38:
1414
sy_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/mm10/additional_anno/'
1515
alias_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/hg38/hg38.chromAlias.txt'
1616
mm10:
17-
std: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/mm10/06_final/mm10_final_nosplicing.nix'
17+
std: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/mm10/mm10.nix'
1818
spliceaware_unmasked:
19-
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/mm10/06_final/mm10_final_unmaskedexon_46.nix'
20-
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/mm10/06_final/mm10_final_unmaskedexon_71.nix'
19+
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/mm10/mm10_splice50bp_unmasked.nix'
20+
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/mm10/mm10_splice75bp_unmasked.nix'
2121
spliceaware_masked:
22-
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/mm10/06_final/mm10_final_maskedexon_46.nix'
23-
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/2021_0607/mm10/06_final/mm10_final_maskedexon_71.nix'
22+
50bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/mm10/mm10_splice50bp_masked.nix'
23+
75bp: '/data/CCBR_Pipeliner/iCLIP/index/active/phil/mm10/mm10_splice75bp_masked.nix'
2424
gencode_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/mm10/Gencode_VM23/fromGencode/gencode.vM23.annotation.gtf.txt'
2525
refseq_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/mm10/NCBI_RefSeq/GCF_000001635.26_GRCm38.p6_genomic.gtf.txt'
2626
canonical_path: '/data/CCBR_Pipeliner/iCLIP/ref/annotations/mm10/Gencode_VM23/fromUCSC/KnownCanonical/KnownCanonical_GencodeM23_GRCm38.txt'

run_snakemake.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ if [[ $pipeline = "cluster" ]] || [[ $pipeline = "local" ]]; then
7171

7272
#submit jobs to cluster
7373
if [[ $pipeline = "cluster" ]]; then
74-
sbatch --job-name="iCLIP" --gres=lscratch:200 --time=120:00:00 --output=${output_dir}/log/${log_time}_00_%j_%x.out --mail-type=BEGIN,END,FAIL \
74+
sbatch --job-name="iCLIP" --gres=lscratch:200 --time=24:00:00 --output=${output_dir}/log/${log_time}_00_%j_%x.out --mail-type=BEGIN,END,FAIL \
7575
snakemake --use-envmodules --latency-wait 120 -s ${output_dir}/workflow/${log_time}_Snakefile --configfile ${output_dir}/log/${log_time}_00_snakemake_config.yaml \
7676
--printshellcmds --cluster-config ${output_dir}/log/${log_time}_00_cluster_config.yml --keep-going \
7777
--restart-times 1 --cluster "sbatch --gres {cluster.gres} --cpus-per-task {cluster.threads} \

workflow/Snakefile

Lines changed: 40 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ S. Sevilla
44
P. Homan
55
66
* Overview *
7-
- Multiplexed samples are split based on provided barcodes and named using provide manifests
7+
- Multiplexed samples are split based on provided barcodes and named using the provided manifests (maximum 10 samples)
88
- Adaptors are stripped from samples
99
- Samples are unzipped and split into smaller fastq files to increase speed
1010
- Samples are aligned using Novoalign
@@ -14,8 +14,6 @@ P. Homan
1414
* Requirements *
1515
- Read specific input requirements, and execution information on the Wikipage
1616
located at: https://github.com/RBL-NCI/iCLIP.git
17-
18-
Pipeline info: activeDev 05272021
1917
'''
2018

2119
report: "report/workflow.rst"
@@ -89,7 +87,7 @@ else:
8987
if (splice_aware == 'N'):
9088
align_list = ["unaware"]
9189
else:
92-
align_list = ["unaware","masked","unmasked"]
90+
align_list = ["unmasked"]
9391

9492
###############################################################
9593
# create sample lists
@@ -279,14 +277,6 @@ def get_align_input(wildcards):
279277
f1 = join(out_dir,'04_sam','03_genomic','{sp}.{al}.split.{n}.sam'),
280278
return(f1)
281279

282-
#determine dedup input based on splcie_aware flag
283-
def input_mapq_corrected_bam(wildcards):
284-
if (splice_aware=="N"):
285-
f1 = join(out_dir,'09_dedup','01_bam','{sp}.unaware.dedup.bam'),
286-
else:
287-
f1 = join(out_dir,'10_mapq_score','{sp}.mapq_recalculated.bam'),
288-
return(f1)
289-
290280
###############################################################
291281
# main code
292282
###############################################################
@@ -318,10 +308,14 @@ else:
318308

319309
## if samples are spliced
320310
if splice_aware == 'Y':
311+
input_unmapped = expand(join(out_dir,'04_sam','04_unmapped','{sp}.{al}.complete.bam'), sp=sp_list, al=align_list)
312+
321313
input_splice = [expand(join(out_dir,'10_mapq_score','{sp}.readids.txt'), sp=sp_list),
322314
expand(join(out_dir,'10_mapq_score','{sp}.unaware.subset.bam'),sp=sp_list),
323315
expand(join(out_dir,'10_mapq_score','{sp}.mapq_recalculated.bam'),sp=sp_list)]
324316
else:
317+
input_unmapped = expand(join(out_dir,'09_dedup','01_bam','{sp}.{al}.dedup.bam'), sp=sp_list, al=align_list),
318+
325319
input_splice = [expand(join(out_dir,'09_dedup','01_bam','{sp}.{al}.dedup.bam'),sp=sp_list, al=align_list)]
326320

327321
#local rules
@@ -384,30 +378,27 @@ rule all:
384378
join(out_dir,'qc','qc_report.html'),
385379

386380
#Unmapped read output
387-
expand(join(out_dir,'04_sam','04_unmapped','{sp}.{al}.complete.bam'), sp=sp_list, al=align_list),
381+
input_unmapped,
388382

389383
#Deduplicate
390-
expand(join(out_dir,'09_dedup','01_bam','{sp}.unmasked.dedup.bam'), sp=sp_list),
384+
expand(join(out_dir,'09_dedup','01_bam','{sp}.{al}.dedup.bam'), sp=sp_list, al=align_list),
391385

392-
#MapQ recalculation
393-
#input_splice,
394-
395-
# #Bam processing
396-
# expand(join(out_dir,'09_dedup','03_unique','{sp}.dedup.unique.i.bam'), sp=sp_list),
386+
#Bam processing
387+
expand(join(out_dir,'09_dedup','03_unique','{sp}.dedup.unique.i.bam'), sp=sp_list),
397388

398-
# #Bed files
399-
# expand(join(out_dir,'11_bed','{sp}_all.bed'), sp=sp_list),
400-
# expand(join(out_dir,'11_bed','{sp}_unique.bed'), sp=sp_list),
389+
#Bed files
390+
expand(join(out_dir,'11_bed','{sp}_all.bed'), sp=sp_list),
391+
expand(join(out_dir,'11_bed','{sp}_unique.bed'), sp=sp_list),
401392

402-
# #SAF
403-
# expand(join(out_dir,'12_SAF/{sp}_'+ str(nt_merge) +'_all.SAF'), sp=sp_list),
404-
# expand(join(out_dir,'12_SAF/{sp}_'+ str(nt_merge) +'_unique.SAF'), sp=sp_list),
393+
#SAF
394+
expand(join(out_dir,'12_SAF/{sp}_'+ str(nt_merge) +'_all.SAF'), sp=sp_list),
395+
expand(join(out_dir,'12_SAF/{sp}_'+ str(nt_merge) +'_unique.SAF'), sp=sp_list),
405396

406-
# #Count features
407-
# expand(join(out_dir,'14_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'), sp=sp_list),
408-
# expand(join(out_dir,'14_counts','uniquereadpeaks','{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'), sp=sp_list),
409-
# expand(join(out_dir,'14_counts','allreadpeaks','{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'), sp=sp_list),
410-
# expand(join(out_dir,'14_counts','allreadpeaks','{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'), sp=sp_list),
397+
#Count features
398+
expand(join(out_dir,'13_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'), sp=sp_list),
399+
expand(join(out_dir,'13_counts','uniquereadpeaks','{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'), sp=sp_list),
400+
expand(join(out_dir,'13_counts','allreadpeaks','{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'), sp=sp_list),
401+
expand(join(out_dir,'13_counts','allreadpeaks','{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'), sp=sp_list),
411402

412403
# #In progress
413404
# join(out_dir,'15_annotation', 'project',,'annotations.txt'),
@@ -418,8 +409,13 @@ rule all:
418409
# #expand(join(out_dir,'14_annotation','{sp}_'+ str(nt_merge) +'_MD15.txt'),zip, mp=mp_list, sp=sp_list),
419410
#expand(join(out_dir,'14_annotation','{sp}_'+ str(nt_merge) +'_MD15.html'),zip, mp=mp_list, sp=sp_list),
420411

421-
include: join(source_dir,"workflow/rules/common.smk")
422-
include: join(source_dir,"workflow/rules/other.smk")
412+
#common and other SMK
413+
if source_dir == "":
414+
include: "rules/common.smk"
415+
include: "rules/other.smk"
416+
else:
417+
include: join(source_dir,"workflow/rules/common.smk")
418+
include: join(source_dir,"workflow/rules/other.smk")
423419

424420
###############################################################
425421
# snakemake rules
@@ -1203,18 +1199,18 @@ else:
12031199

12041200
rule dedup:
12051201
"""
1206-
deduplicate merged.i.bam files
1202+
deduplicate
12071203
"""
12081204
input:
1209-
f1 = join(out_dir,'08_bam_merged','{sp}.unmasked.merged.si.bam'),
1205+
f1 = join(out_dir,'08_bam_merged','{sp}.{al}.merged.si.bam'),
12101206
params:
12111207
rname='23_dedup',
12121208
umi = umi_parameter
12131209
envmodules:
12141210
config['umitools']
12151211
output:
1216-
o1 = join(out_dir,'09_dedup','01_bam','{sp}.unmasked.dedup.bam'),
1217-
o2 = join(out_dir,'09_dedup','01_bam','{sp}.unmasked.dedup.log'),
1212+
o1 = join(out_dir,'09_dedup','01_bam','{sp}.{al}.dedup.bam'),
1213+
o2 = join(out_dir,'09_dedup','01_bam','{sp}.{al}.dedup.log'),
12181214
shell:
12191215
"""
12201216
umi_tools dedup \
@@ -1224,102 +1220,12 @@ rule dedup:
12241220
--log2stderr -L {output.o2};
12251221
"""
12261222

1227-
#pipeline splits for mapq score recalculation on splice_aware samples
1228-
if (splice_aware == 'Y'):
1229-
rule generate_readids:
1230-
"""
1231-
generate readids from unmasked files
1232-
"""
1233-
input:
1234-
f1 = join(out_dir,'09_dedup','01_bam','{sp}.unmasked.dedup.bam')
1235-
params:
1236-
rname='24a_readids',
1237-
envmodules:
1238-
config['samtools']
1239-
output:
1240-
o1 = join(out_dir,'10_mapq_score','{sp}.readids.txt'),
1241-
shell:
1242-
"""
1243-
samtools view {input.f1}|cut -f1|sort|uniq > {output.o1}
1244-
"""
1245-
1246-
rule subsample_reads:
1247-
"""
1248-
subsample:
1249-
A) splice unaware BAM
1250-
B) splice aware (masked exon) BAM
1251-
1252-
using the readids from splice aware (unmasked exon) in rule generate_readids
1253-
"""
1254-
input:
1255-
id = join(out_dir,'10_mapq_score','{sp}.readids.txt'),
1256-
unaware = join(out_dir,'08_bam_merged','{sp}.unaware.merged.si.bam'),
1257-
masked = join(out_dir,'08_bam_merged','{sp}.masked.merged.si.bam'),
1258-
params:
1259-
rname='24b_subsample',
1260-
script = join(source_dir,'workflow','scripts','06_filter_bam_by_readids.py'),
1261-
envmodules:
1262-
config['python']
1263-
output:
1264-
unaware = join(out_dir,'10_mapq_score','{sp}.unaware.subset.bam'),
1265-
masked = join(out_dir,'10_mapq_score','{sp}.masked.subset.bam'),
1266-
shell:
1267-
"""
1268-
python {params.script} --inputBAM {input.unaware} --outputBAM {output.unaware} --readids {input.id};
1269-
python {params.script} --inputBAM {input.masked} --outputBAM {output.masked} --readids {input.id}
1270-
"""
1271-
1272-
rule mapq_recalc:
1273-
"""
1274-
input deduplicate (C) BAM file and the 2 BAMs subset A) and subset B) pysam script for MAPQ
1275-
correction - outputs D) updated mapq score bam file
1276-
"""
1277-
input:
1278-
unaware = join(out_dir,'10_mapq_score','{sp}.unaware.subset.bam'),
1279-
masked = join(out_dir,'10_mapq_score','{sp}.masked.subset.bam'),
1280-
unmasked = join(out_dir,'09_dedup','01_bam','{sp}.unmasked.dedup.bam'),
1281-
params:
1282-
rname='24c_recalc',
1283-
script = join(source_dir,'workflow','scripts','07_correct_mapq.py'),
1284-
unaware = join(out_dir,'10_mapq_score','{sp}.unaware.subset.bam'),
1285-
masked = join(out_dir,'10_mapq_score','{sp}.masked.subset.bam'),
1286-
envmodules:
1287-
config['python']
1288-
output:
1289-
bam = join(out_dir,'10_mapq_score','{sp}.mapq_recalculated.bam'),
1290-
tsv = join(out_dir,'10_mapq_score','{sp}.mapq_recalculated.tsv'),
1291-
shell:
1292-
"""
1293-
python {params.script} \
1294-
--inputBAM1 {input.unaware} --inputBAM2 {input.masked} --inputBAM3 {input.unmasked} \
1295-
--outBAM {output.bam} --out {output.tsv}
1296-
"""
1297-
1298-
rule mapq_stats:
1299-
"""
1300-
TODO
1301-
**Use D) for visualization
1302-
"""
1303-
input:
1304-
f1 = expand(join(out_dir,'10_mapq_score','{sp}.mapq_recalculated.bam'),sp=sp_list),
1305-
params:
1306-
rname='24c_mapq_stats',
1307-
script = join(source_dir,'workflow','scripts','.py'),
1308-
envmodules:
1309-
config['R']
1310-
output:
1311-
o1 = join(out_dir,'10_mapq_score','report.pdf'),
1312-
shell:
1313-
"""
1314-
Rscript {params.script} {input.f1} {output.o1}
1315-
"""
1316-
13171223
rule sort_index_dedup:
13181224
"""
13191225
sort dedup.bam file
13201226
"""
13211227
input:
1322-
f1 = input_mapq_corrected_bam
1228+
f1 = expand(join(out_dir,'09_dedup','01_bam','{{sp}}.{al}.dedup.bam'), al=align_list),
13231229
params:
13241230
rname='25_si_dedup',
13251231
envmodules:
@@ -1456,8 +1362,8 @@ rule feature_counts_allreads:
14561362
envmodules:
14571363
config['subread']
14581364
output:
1459-
out_unique = join(out_dir,'14_counts','allreadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'),
1460-
out_all = join(out_dir,'14_counts','allreadpeaks','{sp}_' + str(nt_merge) + '_allFracMMCounts.txt')
1365+
out_unique = join(out_dir,'13_counts','allreadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'),
1366+
out_all = join(out_dir,'13_counts','allreadpeaks','{sp}_' + str(nt_merge) + '_allFracMMCounts.txt')
14611367
shell:
14621368
"""
14631369
featureCounts -F SAF \
@@ -1498,8 +1404,8 @@ rule feature_counts_uniquereads:
14981404
envmodules:
14991405
config['subread']
15001406
output:
1501-
out_unique = join(out_dir,'14_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'),
1502-
out_all = join(out_dir,'14_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_allFracMMCounts.txt')
1407+
out_unique = join(out_dir,'13_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_uniqueCounts.txt'),
1408+
out_all = join(out_dir,'13_counts','uniquereadpeaks','{sp}_' + str(nt_merge) + '_allFracMMCounts.txt')
15031409
shell:
15041410
"""
15051411
featureCounts -F SAF \
@@ -1530,7 +1436,7 @@ rule feature_counts_uniquereads:
15301436
# generate annotation table once per project
15311437
# """
15321438
# input:
1533-
# expand(join(out_dir,'14_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'), sp=sp_list),
1439+
# expand(join(out_dir,'13_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'), sp=sp_list),
15341440
# params:
15351441
# rname='36_proj_anno',
15361442
# script = join(source_dir,'workflow','scripts','06_annotation.R'),
@@ -1578,8 +1484,8 @@ rule feature_counts_uniquereads:
15781484

15791485
# '''
15801486
# input:
1581-
# unique = join(out_dir,'14_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'),
1582-
# all = join(out_dir,'14_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'),
1487+
# unique = join(out_dir,'13_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_uniqueCounts.txt'),
1488+
# all = join(out_dir,'13_counts', 'allreadpeaks', '{sp}_'+ str(nt_merge) +'_allFracMMCounts.txt'),
15831489
# anno = join(out_dir,'15_annotation', 'project','annotations.txt'),
15841490
# params:
15851491
# rname = '37_peak_anno',

0 commit comments

Comments
 (0)