Skip to content

Latest commit

 

History

History
832 lines (658 loc) · 28.7 KB

File metadata and controls

832 lines (658 loc) · 28.7 KB

eCLIP

Wilfried Guiblet 2025-09-25

Libraries

Environment

srun --pty -p norm --ntasks=64 bash
cd /mnt/gridftp/guibletwm/CCBRRBL16

module load mamba
mamba create -n eclipV2
mamba activate eclipV2

mamba install bioconda::seqkit
mamba install bioconda::umi_tools

module load singularity
module load python

ZAP

YeoLab eCLIP

Prep STAR

# STAR index hg38

singularity \
    --quiet \
    run \
    --contain \
    --ipc \
    --cleanenv \
    --no-eval \
    --userns \
    --home \
    /mnt/gridftp/guibletwm/CCBRRBL16/:/vyjYpl/ \
    --pwd \
    /vyjYpl \
    /mnt/gridftp/guibletwm/CCBRRBL16/brianyee_star:2.7.6a.sif \
    STAR \
      --runThreadN 64 \
      --runMode genomeGenerate \
      --genomeDir STAR_hg38/ \
      --genomeFastaFiles STAR_hg38/hg38.fa \
      --sjdbGTFfile STAR_hg38/gencode.v47.annotation.gtf \
      --sjdbOverhang 100 \
      --limitSjdbInsertNsj 4807600 

# STAR index repeatmasker hg38

singularity \
    --quiet \
    run \
    --contain \
    --ipc \
    --cleanenv \
    --no-eval \
    --userns \
    --home \
    /mnt/gridftp/guibletwm/CCBRRBL16/:/vyjYpl/ \
    --pwd \
    /vyjYpl \
    /mnt/gridftp/guibletwm/CCBRRBL16/brianyee_star:2.7.6a.sif \
    STAR \
      --runThreadN 64 \
      --runMode genomeGenerate \
      --genomeDir STAR_hg38_repeat/ \
      --genomeFastaFiles STAR_hg38_repeat/hg38.fa \
      --sjdbGTFfile STAR_hg38_repeat/rmsk.hg38.gtf \
      --sjdbOverhang 100 \
      --limitSjdbInsertNsj 4807600 

Run eCLIP

cwltool --parallel --singularity --tmpdir-prefix /scratch/cluster_scratch/guibletwm/ --cachedir  /scratch/cluster_scratch/guibletwm/ --leave-tmpdir /mnt/gridftp/guibletwm/CCBRRBL16/eCLIP/cwl/wf_get_peaks_scatter_se.cwl /mnt/gridftp/guibletwm/CCBRRBL16/manifests/ZAP.yaml

cwltool --parallel --singularity --tmpdir-prefix /scratch/cluster_scratch/guibletwm/ --cachedir  /scratch/cluster_scratch/guibletwm/ --leave-tmpdir /mnt/gridftp/guibletwm/CCBRRBL16/merge_peaks/cwl/wf_get_reproducible_eclip_peaks.cwl /mnt/gridftp/guibletwm/CCBRRBL16/manifests/ZAP.merge.yaml
python post_process_eCLIP.py --yaml_file manifests/ZAP.yaml --directory ZAP > ZAP.report.txt

Problem: target is a repetitive element. Let’s realign and keep repeats.

ReAlign

Run STAR

for sample in 'rep1_IP' 'rep2_IP' 'rep1_INPUT' 'rep2_INPUT' ; do

  singularity \
      --quiet \
      run \
      --contain \
      --ipc \
      --cleanenv \
      --no-eval \
      --userns \
      --home \
      /mnt/gridftp/guibletwm/CCBRRBL16/:/vyjYpl/ \
      --pwd \
      /vyjYpl \
      /mnt/gridftp/guibletwm/CCBRRBL16/brianyee_star:2.7.6a.sif \
      STAR \
        --runMode alignReads \
        --runThreadN 64 \
        --genomeDir STAR_hg38 \
        --genomeLoad NoSharedMemory \
        --readFilesIn \
        ZAP/ZAP.${sample}.umi.r1.fqTrTr.sorted.fq.gz \
        --readFilesCommand zcat \
        --outSAMunmapped None \
        --outFilterMultimapNmax 10000 \
        --outFilterMultimapScoreRange 5 \
        --outFileNamePrefix Realign/ZAP.${sample} \
        --outSAMattributes All \
        --outSAMtype BAM Unsorted \
        --outFilterType Normal \
        --outReadsUnmapped Fastx \
        --outFilterScoreMin 0 \
        --outSAMattrRGline ID:foo \
        --outStd Log \
        --alignEndsType Local \
        --outBAMcompression 10 \
        --outSAMmode Full
  
  samtools sort --threads 64 -o Realign/ZAP.${sample}_Aligned.sorted.bam Realign/ZAP.${sample}_Aligned.out.bam
  samtools index --threads 64 Realign/ZAP.${sample}_Aligned.sorted.bam 
  umi_tools dedup -I Realign/ZAP.${sample}_Aligned.sorted.bam -S Realign/ZAP.${sample}.deduped.bam -L dedup.log --method=unique
  samtools index --threads 64 -M Realign/ZAP.*.deduped.bam
  
done

Peak Calling

singularity shell -B $(pwd)/:/data2/ ctk.sif
export PERL5LIB=/opt/conda/lib/czplib

for sample in 'rep1_IP' 'rep2_IP' 'rep1_INPUT' 'rep2_INPUT' ; do

  bedtools bamtobed -split -i ${sample}.deduped.bam > ${sample}.deduped.bed

  /opt/conda/lib/ctk/tag2peak.pl \
    -big -ss \
    -p 0.001 --multi-test \
    --valley-seeking \
    --valley-depth 0.90 \
    Realign/ZAP.${sample}.deduped.bed Realign/ZAP.${sample}.CTKpeaks.bed \
    --out-boundary Realign/ZAP.${sample}.CTKpeaks.boundary.bed \
    --out-half-PH Realign/ZAP.${sample}.CTKpeaks.halfPH.bed \
    -minPH 100
    
done

cat Realign/ZAP.*.CTKpeaks.boundary.bed | bedtools sort -i - | bedtools merge -s -i - -c 6 -o distinct | awk -v FS='\t' -v OFS='\t' '{print $1,$2,$3,"merged_peak","0",$4}' > Realign/ZAP.CTKpeaks.merged.formatted.bed

Annotate

# Same Strand

bedtools intersect -s -wao \
          -a Realign/ZAP.CTKpeaks.merged.formatted.bed \
          -b Realign/rmsk.hg38.custom.bed \
            > Realign/ZAP.CTKpeaks.merged.rmsk.SameStrand.bed

bedtools intersect -s -wao \
          -a Realign/ZAP.CTKpeaks.merged.formatted.bed \
          -b Realign/gencode.v44.annotation.bed \
            > Realign/ZAP.CTKpeaks.merged.gencode.SameStrand.bed

bedtools intersect -s -wao \
          -a Realign/ZAP.CTKpeaks.merged.formatted.bed \
          -b Realign/hg38.GENCODE_V44.UCSC.introns.bed \
          | awk 'BEGIN {FS = "\t"; OFS = "\t"} $10 != "." {split($10, arr, "_"); $14 = arr[3]} $10 == "." { $14 = "." } 1' \
            > Realign/ZAP.CTKpeaks.merged.introns.SameStrand.bed

bedtools intersect -s -wao \
          -a Realign/ZAP.CTKpeaks.merged.formatted.bed \
          -b Realign/ncRNA.bed \
            > Realign/ZAP.CTKpeaks.merged.ncRNA.SameStrand.bed

bedtools intersect -s -wao \
          -a Realign/ZAP.CTKpeaks.merged.formatted.bed \
          -b Realign/hg38.GENCODE_V44.UCSC.5UTR.bed Realign/hg38.GENCODE_V44.UCSC.3UTR.bed \
          -names 5UTR 3UTR \
            > Realign/ZAP.CTKpeaks.merged.UTRs.SameStrand.bed
            
     
# Opposite Strand
     
bedtools intersect -S -wao \
          -a Realign/ZAP.CTKpeaks.merged.formatted.bed \
          -b Realign/rmsk.hg38.custom.bed \
            > Realign/ZAP.CTKpeaks.merged.rmsk.OppoStrand.bed

bedtools intersect -S -wao \
          -a Realign/ZAP.CTKpeaks.merged.formatted.bed \
          -b Realign/gencode.v44.annotation.bed \
            > Realign/ZAP.CTKpeaks.merged.gencode.OppoStrand.bed

bedtools intersect -S -wao \
          -a Realign/ZAP.CTKpeaks.merged.formatted.bed \
          -b Realign/hg38.GENCODE_V44.UCSC.introns.bed \
          | awk 'BEGIN {FS = "\t"; OFS = "\t"} $10 != "." {split($10, arr, "_"); $14 = arr[3]} $10 == "." { $14 = "." } 1' \
            > Realign/ZAP.CTKpeaks.merged.introns.OppoStrand.bed

bedtools intersect -S -wao \
          -a Realign/ZAP.CTKpeaks.merged.formatted.bed \
          -b Realign/ncRNA.bed \
            > Realign/ZAP.CTKpeaks.merged.ncRNA.OppoStrand.bed

bedtools intersect -S -wao \
          -a Realign/ZAP.CTKpeaks.merged.formatted.bed \
          -b Realign/hg38.GENCODE_V44.UCSC.5UTR.bed Realign/hg38.GENCODE_V44.UCSC.3UTR.bed \
          -names 5UTR 3UTR \
            > Realign/ZAP.CTKpeaks.merged.UTRs.OppoStrand.bed       
            

python MergeAnnotations.py \
  --SameStrandRMSK Realign/ZAP.CTKpeaks.merged.rmsk.SameStrand.bed \
  --SameStrandGenCode Realign/ZAP.CTKpeaks.merged.gencode.SameStrand.bed \
  --SameStrandIntrons Realign/ZAP.CTKpeaks.merged.introns.SameStrand.bed \
  --SameStrandncRNA Realign/ZAP.CTKpeaks.merged.ncRNA.SameStrand.bed \
  --SameStrandUTRs Realign/ZAP.CTKpeaks.merged.UTRs.SameStrand.bed \
  --OppoStrandRMSK Realign/ZAP.CTKpeaks.merged.rmsk.OppoStrand.bed \
  --OppoStrandGenCode Realign/ZAP.CTKpeaks.merged.gencode.OppoStrand.bed \
  --OppoStrandIntrons Realign/ZAP.CTKpeaks.merged.introns.OppoStrand.bed \
  --OppoStrandncRNA Realign/ZAP.CTKpeaks.merged.ncRNA.OppoStrand.bed \
  --OppoStrandUTRs Realign/ZAP.CTKpeaks.merged.UTRs.OppoStrand.bed \
  --Output Realign/ZAP.CTKpeaks.merged.annotation_complete.txt            
awk -v FS='\t' -v OFS='\t' 'NR>1 {gsub(/[ \t]/,"",$8); gsub(/[ \t]/,"",$10); gsub(/[ \t]/,"",$17) ; print $2,$3,$4,$8,$10,$6,$17}' Realign/ZAP.CTKpeaks.merged.annotation_complete.txt  | bedtools multicov -s -bed - -bams Realign/ZAP.rep1_IP.deduped.bam Realign/ZAP.rep2_IP.deduped.bam Realign/ZAP.rep1_INPUT.deduped.bam Realign/ZAP.rep2_INPUT.deduped.bam > Realign/ZAP.CTKpeaks.merged.annotation_complete.counts

DE Analysis

input <- read.table('/Users/guibletwm/Lab_Work/CCRRBL-16/ZAP.CTKpeaks.merged.annotation_complete.counts',header=FALSE,sep="\t")

colnames(input) <- c('chrom', 'start', 'end','gene_name','gene_type', 'strand', 'ncRNA','IP_1', 'IP_2', 'INPUT_1', 'INPUT_2')
rownames(input) <- paste(input$chrom, input$start, input$end, input$strand, input$gene_name, input$gene_type, sep='|')

counts <- input[,c('IP_1', 'IP_2', 'INPUT_1', 'INPUT_2')]
#rownames(counts) <- paste(input$chrom, input$start, input$end, input$strand, sep='|')
counts <- data.frame(counts)
conditions <- c(rep('IP', 2), rep('INPUT', 2))

dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = data.frame(condition = conditions),
                              design = ~ condition)

dds$condition <- relevel(dds$condition, ref = "INPUT")

smallestGroupSize <- 3
keep <- rowSums(counts(dds) >= 50) >= smallestGroupSize
dds <- dds[keep,]

dds <- DESeq(dds)
res <- results(dds)
summary(res)
## 
## out of 1728 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 699, 40%
## LFC < 0 (down)     : 712, 41%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 48)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
sum(res$padj < 0.05, na.rm=TRUE)
## [1] 1351
resOrdered <- res[order(res$padj),]
resOrdered <- resOrdered[which(resOrdered$padj < 0.05),]

write.table(as.data.frame(resOrdered), file='/Users/guibletwm/Lab_Work/CCRRBL-16/CTKpeaks.deseq.txt', sep='\t')

norm_counts <- counts(dds, normalized = TRUE)
toplot <- merge(input[,c('chrom', 'start', 'end','gene_name','gene_type', 'strand', 'ncRNA')], norm_counts, by = 0)

# Find which rows are 7S
toplot <- toplot %>%
  mutate(is_7S = grepl("RN7SL1|RN7SL2|RN7SL3", gene_name))


ggplot(data = toplot) +
  geom_point(aes(y=IP_1, x=INPUT_1, color = is_7S)) +
  scale_color_manual(values = c("FALSE" = "darkgrey", "TRUE" = "red")) +
  labs(title = 'ZAP rep1', x = "INPUT_1 (log norm count)", y = "IP_1 (log norm count)") +
  #coord_cartesian(xlim = c(1e1, 10e3), ylim = c(1e1,10e3)) +
  scale_x_log10() +
  scale_y_log10() +
  geom_text_repel(
    data = subset(toplot, is_7S),
    aes(label = gene_name, y=IP_1, x=INPUT_1),
    size = 3,
    max.overlaps = Inf
  ) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  theme_minimal()

ggplot(data = toplot) +
  geom_point(aes(y=IP_2, x=INPUT_2, color = is_7S)) +
  scale_color_manual(values = c("FALSE" = "darkgrey", "TRUE" = "red")) +
  labs(title = 'ZAP rep2', x = "INPUT_2 (log norm count)", y = "IP_2 (log norm count)") +
  #coord_cartesian(xlim = c(1e1, 10e3), ylim = c(1e1,10e3)) +
  scale_x_log10() +
  scale_y_log10() +
  geom_text_repel(
    data = subset(toplot, is_7S),
    aes(label = gene_name, y=IP_1, x=INPUT_1),
    size = 3,
    max.overlaps = Inf
  ) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  theme_minimal()

# Extract row names of significant genes
sig_genes <- rownames(resOrdered)
# Subset normalized counts for significant genes only
norm_counts_sig <- norm_counts[sig_genes, ]

toplot <- merge(input[,c('chrom', 'start', 'end','gene_name','gene_type', 'strand', 'ncRNA')], norm_counts_sig, by = 0)

# Find which rows are tRNA
toplot <- toplot %>%
  mutate(is_7S = grepl("RN7SL1|RN7SL2|RN7SL3", gene_name))


ggplot(data = toplot) +
  geom_point(aes(y=IP_1, x=INPUT_1, color = is_7S)) +
  scale_color_manual(values = c("FALSE" = "darkgrey", "TRUE" = "red")) +
  labs(title = 'ZAP significant only', x = "INPUT_1 (log norm count)", y = "IP_1 (log norm count)") +
  scale_x_log10() +
  scale_y_log10() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  #coord_cartesian(xlim = c(1, 5e3), ylim = c(1,5e3)) +
  geom_text_repel(
    data = subset(toplot, is_7S),
    aes(label = gene_name, y=IP_1, x=INPUT_1),
    size = 3,
    max.overlaps = Inf
  ) +
  theme_minimal()

Count Read IDs in regions

grep 7SL Realign/rmsk.hg38.custom.bed > Realign/7SL.bed
grep -w 5S Realign/rmsk.hg38.custom.bed > Realign/5S.bed
grep -w LSU-rRNA_Hsa Realign/rmsk.hg38.custom.bed > Realign/LLU.bed
grep -w SSU-rRNA_Hsa Realign/rmsk.hg38.custom.bed > Realign/SSU.bed
awk -v FS='\t' -v OFS='\t' '{if ($5 == "tRNA") print $0}' Realign/rmsk.hg38.custom.bed > Realign/tRNA.bed
cp Realign/hg38.GENCODE_V44.UCSC.protein_coding.bed Realign/mRNA.bed
cp Realign/hg38.GENCODE_V44.UCSC.5UTR.bed Realign/UTR5.bed
cp Realign/hg38.GENCODE_V44.UCSC.3UTR.bed Realign/UTR3.bed

bedtools intersect -v -a Realign/mRNA.bed -b Realign/UTR5.bed Realign/UTR3.bed > Realign/CDS.bed

cat Realign/7SL.bed Realign/5S.bed Realign/LLU.bed Realign/SSU.bed Realign/tRNA.bed Realign/mRNA.bed > Realign/Features.hg38.bed
# COUNT UNIQ IDS IN IDENTIFIED PEAKS

rm Realign/ZAP.*.peaks.readcounts.txt


for sample in 'rep1_IP' 'rep2_IP' 'rep1_INPUT' 'rep2_INPUT' ; do
  for region in '7SL' '5S' 'LLU' 'SSU' 'tRNA' 'mRNA'; do
    samtools view -bS -L Realign/ZAP.CTKpeaks.merged.formatted.bed Realign/ZAP.${sample}.deduped.bam | samtools view  -L Realign/${region}.bed - | cut -f1 | sort -u | wc -l | awk '{print $1}' >> Realign/ZAP.${sample}.peaks.readcounts.txt
  done;
  samtools view -bS -L Realign/ZAP.CTKpeaks.merged.formatted.bed Realign/ZAP.${sample}.deduped.bam | samtools view -L Realign/Features.hg38.bed - | cut -f1 | sort -u | wc -l | awk '{print $1}'>> Realign/ZAP.${sample}.peaks.readcounts.txt
  samtools view -L Realign/ZAP.CTKpeaks.merged.formatted.bed Realign/ZAP.${sample}.deduped.bam | cut -f1 | sort -u | wc -l | awk '{print $1}'>> Realign/ZAP.${sample}.peaks.readcounts.txt
done


# COUNT UNIQ IDS IN UTRS AND CDS


rm Realign/ZAP.*.readcounts.gene.txt

for sample in 'rep1_IP' 'rep2_IP' 'rep1_INPUT' 'rep2_INPUT' ; do
  for region in 'UTR5' 'UTR3' 'CDS' ; do
    echo ${region} >> Realign/ZAP.${sample}.readcounts.gene.txt
    samtools view -L Realign/${region}.bed Realign/ZAP.${sample}.deduped.bam | cut -f1 | sort -u | wc -l | awk '{print $1}' >> Realign/ZAP.${sample}.readcounts.gene.txt
  done;
  #echo 'Total Read Count' >> Realign/ZAP.${sample}.readcounts.gene.txt
  #samtools view -c Realign/ZAP.${sample}.deduped.bam >> Realign/ZAP.${sample}.readcounts.gene.txt
done
library(dplyr)
library(tidyr)
library(tibble)

ZAP.rep1.IP.counts <- read.delim ("~/Lab_Work/CCRRBL-16/ZAP.rep1_IP.peaks.readcounts.txt", header = F)
ZAP.rep2.IP.counts <- read.delim ("~/Lab_Work/CCRRBL-16/ZAP.rep2_IP.peaks.readcounts.txt", header = F)
ZAP.rep1.INPUT.counts <- read.delim ("~/Lab_Work/CCRRBL-16/ZAP.rep1_INPUT.peaks.readcounts.txt", header = F)
ZAP.rep2.INPUT.counts <- read.delim ("~/Lab_Work/CCRRBL-16/ZAP.rep2_INPUT.peaks.readcounts.txt", header = F)

ZAP.counts <- cbind(ZAP.rep1.IP.counts, ZAP.rep2.IP.counts, ZAP.rep1.INPUT.counts, ZAP.rep2.INPUT.counts)
rownames(ZAP.counts) <- c('7SL', '5S', 'LLU', 'SSU', 'tRNA', 'mRNA', 'All', 'Total')
colnames(ZAP.counts) <- c('IP_1', 'IP_2', 'INPUT_1', 'INPUT_2')

as.numeric(ZAP.counts['All',]) / as.numeric(ZAP.counts['Total',]) 
## [1] 0.9159303 0.9484168 0.9540938 0.9690679
ZAP.counts
##          IP_1    IP_2  INPUT_1  INPUT_2
## 7SL     15284   10788   289073   300135
## 5S     414224  575022   241937   497249
## LLU    412905  567606  8466904 16766740
## SSU    475438  436818  3277038  6946845
## tRNA     7114   12121    72534   145041
## mRNA   613952  487279  4552228  7128831
## All   1583573 1739200 13165937 25056683
## Total 1728923 1833793 13799416 25856479
# Divide each numeric column by its last value in that column

ZAP.counts_norm <- ZAP.counts %>%
  tibble::as_tibble() %>%
  dplyr::mutate(across(where(is.numeric), ~ .x / dplyr::last(.x))) %>%
  utils::head(-2)

# Reshape dataframe for ggplot
toplot <- ZAP.counts_norm %>%
  rownames_to_column(var = "Feature") %>% 
  pivot_longer(
    cols = -Feature,,            # pivot all *except* row_id
    names_to = "Sample",          
    values_to = "Perc_Counts"             
  )

# Plot
ggplot(toplot, aes(fill=Feature, y=Perc_Counts, x=Sample)) + 
    geom_bar(position="stack", stat="identity") +
  theme_classic(base_size = 20) 

# Normalize by colsum instead
ZAP.counts_norm2 <- ZAP.counts %>%
  utils::head(-2)  %>%
  dplyr::mutate(across(
    where(is.numeric), 
    ~ .x / sum(.x, na.rm = TRUE)  # divide by column sum
  ))

ZAP.counts_norm2
##             IP_1        IP_2     INPUT_1     INPUT_2
## 7SL  0.007882751 0.005162627 0.017105201 0.009442709
## 5S   0.213636788 0.275178333 0.014316041 0.015644219
## LLU  0.212956511 0.271629386 0.501008715 0.527507437
## SSU  0.245208021 0.209040435 0.193910856 0.218558432
## tRNA 0.003669059 0.005800537 0.004292025 0.004563213
## mRNA 0.316646870 0.233188683 0.269367162 0.224283991
# Reshape dataframe for ggplot
toplot2 <- ZAP.counts_norm2 %>%
  rownames_to_column(var = "Feature") %>% 
  pivot_longer(
    cols = -Feature,,            # pivot all *except* row_id
    names_to = "Sample",          
    values_to = "Perc_Counts"             
  )

ggplot(toplot2, aes(fill=Feature, y=Perc_Counts, x=Sample)) + 
    geom_bar(position="stack", stat="identity") +
  theme_classic(base_size = 20) 

ZAP.counts_norm3 <- ZAP.counts[c('7SL', 'tRNA', 'mRNA'),] %>%
  mutate(across(
    where(is.numeric), 
    ~ .x / sum(.x, na.rm = TRUE)  # divide by column sum
  ))

# Reshape dataframe for ggplot
toplot3 <- ZAP.counts_norm3 %>%
  rownames_to_column(var = "Feature") %>% 
  pivot_longer(
    cols = -Feature,,            # pivot all *except* row_id
    names_to = "Sample",          
    values_to = "Perc_Counts"             
  )

ggplot(toplot3, aes(fill=Feature, y=Perc_Counts, x=Sample)) + 
    geom_bar(position="stack", stat="identity") +
  theme_classic(base_size = 20) 

Tet

YeoLab eCLIP

cwltool --parallel --singularity --tmpdir-prefix /scratch/cluster_scratch/guibletwm/ --cachedir  /scratch/cluster_scratch/guibletwm/ --leave-tmpdir /mnt/gridftp/guibletwm/CCBRRBL16/eCLIP/cwl/wf_get_peaks_scatter_se.cwl /mnt/gridftp/guibletwm/CCBRRBL16/manifests/Tet.yaml

cwltool --parallel --singularity --tmpdir-prefix /scratch/cluster_scratch/guibletwm/ --cachedir  /scratch/cluster_scratch/guibletwm/ --leave-tmpdir /mnt/gridftp/guibletwm/CCBRRBL16/merge_peaks/cwl/wf_get_reproducible_eclip_peaks.cwl /mnt/gridftp/guibletwm/CCBRRBL16/manifests/Tet.merge.yaml
python post_process_eCLIP.py --yaml_file manifests/Tet.yaml --directory Tet > Tet.report.txt

ReAlign

for sample in 'rep1_IP' 'rep2_IP' 'rep1_INPUT' 'rep2_INPUT' ; do

  singularity \
      --quiet \
      run \
      --contain \
      --ipc \
      --cleanenv \
      --no-eval \
      --userns \
      --home \
      /mnt/gridftp/guibletwm/CCBRRBL16/:/vyjYpl/ \
      --pwd \
      /vyjYpl \
      /mnt/gridftp/guibletwm/CCBRRBL16/brianyee_star:2.7.6a.sif \
      STAR \
        --runMode alignReads \
        --runThreadN 64 \
        --genomeDir STAR_hg38 \
        --genomeLoad NoSharedMemory \
        --readFilesIn \
        Tet/Tet${sample}.umi.r1.fqTrTr.sorted.fq.gz \
        --readFilesCommand zcat \
        --outSAMunmapped None \
        --outFilterMultimapNmax 10000 \
        --outFilterMultimapScoreRange 5 \
        --outFileNamePrefix Realign/ZAP.${sample} \
        --outSAMattributes All \
        --outSAMtype BAM Unsorted \
        --outFilterType Normal \
        --outReadsUnmapped Fastx \
        --outFilterScoreMin 0 \
        --outSAMattrRGline ID:foo \
        --outStd Log \
        --alignEndsType Local \
        --outBAMcompression 10 \
        --outSAMmode Full
  
  samtools sort --threads 64 -o Realign/Tet.${sample}_Aligned.sorted.bam Realign/Tet.${sample}_Aligned.out.bam
  samtools index --threads 64 Realign/Tet.${sample}_Aligned.sorted.bam 
  umi_tools dedup -I Realign/Tet.${sample}_Aligned.sorted.bam -S Realign/Tet.${sample}.deduped.bam -L dedup.log --method=unique
  samtools index --threads 64 -M Realign/Tet.*.deduped.bam
  
done

Peak Calling

singularity shell -B $(pwd)/:/data2/ ctk.sif
export PERL5LIB=/opt/conda/lib/czplib

for sample in 'rep1_IP' 'rep2_IP' 'rep1_INPUT' 'rep2_INPUT' ; do

  bedtools bamtobed -split -i ${sample}.deduped.bam > ${sample}.deduped.bed

  /opt/conda/lib/ctk/tag2peak.pl \
    -big -ss \
    -p 0.001 --multi-test \
    --valley-seeking \
    --valley-depth 0.90 \
   Realign/Tet.${sample}.deduped.bed Realign/Tet.${sample}.CTKpeaks.bed \
    --out-boundary Realign/Tet.${sample}.CTKpeaks.boundary.bed \
    --out-half-PH Realign/Tet.${sample}.CTKpeaks.halfPH.bed \
    -minPH 100
    
done

cat Realign/Tet.*.CTKpeaks.boundary.bed | bedtools sort -i - | bedtools merge -s -i - -c 6 -o distinct | awk -v FS='\t' -v OFS='\t' '{print $1,$2,$3,"merged_peak","0",$4}' > Realign/Tet.CTKpeaks.merged.formatted.bed

Annotate

# Same Strand

bedtools intersect -s -wao \
          -a Realign/Tet.CTKpeaks.merged.formatted.bed \
          -b Realign/rmsk.hg38.custom.bed \
            > Realign/Tet.CTKpeaks.merged.rmsk.SameStrand.bed

bedtools intersect -s -wao \
          -a Realign/Tet.CTKpeaks.merged.formatted.bed \
          -b Realign/gencode.v44.annotation.bed \
            > Realign/Tet.CTKpeaks.merged.gencode.SameStrand.bed

bedtools intersect -s -wao \
          -a Realign/Tet.CTKpeaks.merged.formatted.bed \
          -b Realign/hg38.GENCODE_V44.UCSC.introns.bed \
          | awk 'BEGIN {FS = "\t"; OFS = "\t"} $10 != "." {split($10, arr, "_"); $14 = arr[3]} $10 == "." { $14 = "." } 1' \
            > Realign/Tet.CTKpeaks.merged.introns.SameStrand.bed

bedtools intersect -s -wao \
          -a Realign/Tet.CTKpeaks.merged.formatted.bed \
          -b Realign/ncRNA.bed \
            > Realign/Tet.CTKpeaks.merged.ncRNA.SameStrand.bed

bedtools intersect -s -wao \
          -a Realign/Tet.CTKpeaks.merged.formatted.bed \
          -b Realign/hg38.GENCODE_V44.UCSC.5UTR.bed Realign/hg38.GENCODE_V44.UCSC.3UTR.bed \
          -names 5UTR 3UTR \
            > Realign/Tet.CTKpeaks.merged.UTRs.SameStrand.bed
            
     
# Opposite Strand
     
bedtools intersect -S -wao \
          -a Realign/Tet.CTKpeaks.merged.formatted.bed \
          -b Realign/rmsk.hg38.custom.bed \
            > Realign/Tet.CTKpeaks.merged.rmsk.OppoStrand.bed

bedtools intersect -S -wao \
          -a Realign/Tet.CTKpeaks.merged.formatted.bed \
          -b Realign/gencode.v44.annotation.bed \
            > Realign/Tet.CTKpeaks.merged.gencode.OppoStrand.bed

bedtools intersect -S -wao \
          -a Realign/Tet.CTKpeaks.merged.formatted.bed \
          -b Realign/hg38.GENCODE_V44.UCSC.introns.bed \
          | awk 'BEGIN {FS = "\t"; OFS = "\t"} $10 != "." {split($10, arr, "_"); $14 = arr[3]} $10 == "." { $14 = "." } 1' \
            > Realign/Tet.CTKpeaks.merged.introns.OppoStrand.bed

bedtools intersect -S -wao \
          -a Realign/Tet.CTKpeaks.merged.formatted.bed \
          -b Realign/ncRNA.bed \
            > Realign/Tet.CTKpeaks.merged.ncRNA.OppoStrand.bed

bedtools intersect -S -wao \
          -a Realign/Tet.CTKpeaks.merged.formatted.bed \
          -b Realign/hg38.GENCODE_V44.UCSC.5UTR.bed Realign/hg38.GENCODE_V44.UCSC.3UTR.bed \
          -names 5UTR 3UTR \
            > Realign/Tet.CTKpeaks.merged.UTRs.OppoStrand.bed       
            

python MergeAnnotations.py \
  --SameStrandRMSK Realign/Tet.CTKpeaks.merged.rmsk.SameStrand.bed \
  --SameStrandGenCode Realign/Tet.CTKpeaks.merged.gencode.SameStrand.bed \
  --SameStrandIntrons Realign/Tet.CTKpeaks.merged.introns.SameStrand.bed \
  --SameStrandncRNA Realign/Tet.CTKpeaks.merged.ncRNA.SameStrand.bed \
  --SameStrandUTRs Realign/Tet.CTKpeaks.merged.UTRs.SameStrand.bed \
  --OppoStrandRMSK Realign/Tet.CTKpeaks.merged.rmsk.OppoStrand.bed \
  --OppoStrandGenCode Realign/Tet.CTKpeaks.merged.gencode.OppoStrand.bed \
  --OppoStrandIntrons Realign/Tet.CTKpeaks.merged.introns.OppoStrand.bed \
  --OppoStrandncRNA Realign/Tet.CTKpeaks.merged.ncRNA.OppoStrand.bed \
  --OppoStrandUTRs Realign/Tet.CTKpeaks.merged.UTRs.OppoStrand.bed \
  --Output Realign/Tet.CTKpeaks.merged.annotation_complete.txt            
awk -v FS='\t' -v OFS='\t' 'NR>1 {gsub(/[ \t]/,"",$8); gsub(/[ \t]/,"",$10); gsub(/[ \t]/,"",$17) ; print $2,$3,$4,$8,$10,$6,$17}' Realign/Tet.CTKpeaks.merged.annotation_complete.txt  | bedtools multicov -s -bed - -bams Realign/Tet.rep1_IP.deduped.bam Realign/Tet.rep2_IP.deduped.bam Realign/Tet.rep1_INPUT.deduped.bam Realign/Tet.rep2_INPUT.deduped.bam > Realign/Tet.CTKpeaks.merged.annotation_complete.counts

DE Analysis

library(DESeq2)
library(dplyr)
library(ggplot2)

input <- read.table('/Users/guibletwm/Lab_Work/CCRRBL-16/Tet.CTKpeaks.merged.annotation_complete.counts',header=FALSE,sep="\t")

colnames(input) <- c('chrom', 'start', 'end','gene_name','gene_type', 'strand', 'ncRNA','IP_1', 'IP_2', 'INPUT_1', 'INPUT_2')
rownames(input) <- paste(input$chrom, input$start, input$end, input$strand, input$gene_name, input$gene_type, sep='|')

counts <- input[,c('IP_1', 'IP_2', 'INPUT_1', 'INPUT_2')]
#rownames(counts) <- paste(input$chrom, input$start, input$end, input$strand, sep='|')
counts <- data.frame(counts)
conditions <- c(rep('IP', 2), rep('INPUT', 2))

dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = data.frame(condition = conditions),
                              design = ~ condition)

dds$condition <- relevel(dds$condition, ref = "INPUT")

smallestGroupSize <- 3
keep <- rowSums(counts(dds) >= 50) >= smallestGroupSize
dds <- dds[keep,]

dds <- DESeq(dds)
res <- results(dds)
summary(res)
## 
## out of 520 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 0, 0%
## LFC < 0 (down)     : 26, 5%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 59)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
sum(res$padj < 0.05, na.rm=TRUE)
## [1] 0
resOrdered <- res[order(res$padj),]
resOrdered <- resOrdered[which(resOrdered$padj < 0.05),]

write.table(as.data.frame(resOrdered), file='/Users/guibletwm/Lab_Work/CCRRBL-16/Tet.CTKpeaks.merged.deseq.txt', sep='\t')

norm_counts <- counts(dds, normalized = TRUE)
toplot <- merge(input[,c('chrom', 'start', 'end','gene_name','gene_type', 'strand', 'ncRNA')], norm_counts, by = 0)

# Find which rows are tRNA
tRNA_rows <- toplot %>% filter(ncRNA == "tRNA")

ggplot(data = toplot) +
  geom_point(aes(y=IP_1, x=INPUT_1, color = ncRNA == "tRNA")) +
  scale_color_manual(values = c("darkgrey", "red")) +
  labs(title = 'Tet rep1', x = "INPUT_1 (log norm count)", y = "IP_1 (log norm count)") +
  coord_cartesian(xlim = c(1e1, 10e3), ylim = c(1e1,10e3)) +
  scale_x_log10() +
  scale_y_log10() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  theme_minimal()

ggplot(data = toplot) +
  geom_point(aes(y=IP_2, x=INPUT_2, color = ncRNA == "tRNA")) +
  scale_color_manual(values = c("darkgrey", "red")) +
  labs(title = 'Tet rep2', x = "INPUT_2 (log norm count)", y = "IP_2 (log norm count)") +
  coord_cartesian(xlim = c(1e1, 10e3), ylim = c(1e1,10e3)) +
  scale_x_log10() +
  scale_y_log10() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  theme_minimal()

# Extract row names of significant genes
sig_genes <- rownames(resOrdered)
# Subset normalized counts for significant genes only
norm_counts_sig <- norm_counts[sig_genes, ]

toplot <- merge(input[,c('chrom', 'start', 'end','gene_name','gene_type', 'strand', 'ncRNA')], norm_counts_sig, by = 0)

# Find which rows are tRNA
tRNA_rows <- toplot %>% filter(ncRNA == "tRNA")

ggplot(data = toplot) +
  geom_point(aes(y=IP_1, x=INPUT_1, color = ncRNA == "tRNA")) +
  scale_color_manual(values = c("darkgrey", "red")) +
  labs(title = 'Tet significant only', x = "INPUT_1 (log norm count)", y = "IP_1 (log norm count)") +
  scale_x_log10() +
  scale_y_log10() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  coord_cartesian(xlim = c(1, 5e3), ylim = c(1,5e3)) +
  theme_minimal()