Skip to content

Commit cd867fd

Browse files
committed
Fixed bug for trueA and updated vignette
1 parent 38c98f4 commit cd867fd

8 files changed

Lines changed: 487 additions & 194 deletions

File tree

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@ Depends:
1616
R.utils
1717
VignetteBuilder: knitr
1818
License: GPL-3
19-
RoxygenNote: 6.1.0
19+
RoxygenNote: 6.1.1

R/data_preparation.R

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ prepare_data<-function(
176176
if(!is.na(TRUE_A_TOKEN)){
177177

178178
trueA<-t(na.omit(pd[subs,grep(TRUE_A_TOKEN, colnames(pd))]))
179+
trueA <- apply(trueA,c(1,2),as.numeric)
179180
rownames(trueA)<-gsub(TRUE_A_TOKEN, "", colnames(pd)[grep(TRUE_A_TOKEN, colnames(pd))])
180181

181182
save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
@@ -184,10 +185,11 @@ prepare_data<-function(
184185

185186
if(!is.na(HOUSEMAN_A_TOKEN)){
186187

187-
Ahouseman2012<-t(na.omit(pd[,grep(HOUSEMAN_A_TOKEN, colnames(pd))]))
188-
rownames(Ahouseman2012)<-NULL
188+
trueA<-t(na.omit(pd[,grep(HOUSEMAN_A_TOKEN, colnames(pd))]))
189+
trueA <- apply(trueA,c(1,2),as.numeric)
190+
rownames(trueA)<-gsub(HOUSEMAN_A_TOKEN, "", colnames(pd)[grep(HOUSEMAN_A_TOKEN, colnames(pd))])
189191

190-
save(Ahouseman2012, file=sprintf("%s/Ahouseman2012.RData", OUTPUTDIR))
192+
save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
191193

192194
}else if(ESTIMATE_HOUSEMAN_PROP){
193195

@@ -199,12 +201,12 @@ prepare_data<-function(
199201
print("Estimating proportions using the Houseman et al, 2012 method")
200202
res<-estimateProportionsCP(rnb.set, REF_CT_COLUMN, NA, 2000, full.output = TRUE)
201203

202-
Ahouseman2012<-t(res$contributions.nonneg)
203-
Ahouseman2012[Ahouseman2012<1e-5]<-0
204+
trueA<-t(res$contributions.nonneg)
205+
trueA[trueA<1e-5]<-0
204206

205-
Ahouseman2012<-sweep(Ahouseman2012, 2, colSums(Ahouseman2012),"/")
207+
trueA<-sweep(trueA, 2, colSums(trueA),"/")
206208

207-
save(Ahouseman2012, file=sprintf("%s/Ahouseman2012.RData", OUTPUTDIR))
209+
save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
208210
}
209211
}
210212

@@ -587,8 +589,12 @@ filter.annotation.biseq<-function(
587589
#' @param REF_CT_COLUMN Column name in \code{RNB_SET} used to extract methylation information on the reference cell types.
588590
#' @param PHENO_COLUMNS Vector of column names in the phenotypic table of \code{RNB_SET} that is kept and exported for further
589591
#' exploration.
592+
#' @param PREPARE_TRUE_PROPORTIONS Flag indicating if true proportions are either available in \code{RNB_SET} or to be estimated
593+
#' with Houseman's reference-based deconvolution approach.
590594
#' @param TRUE_A_TOKEN String present in the column names of \code{RNB_SET} used for selecting the true proportions of the corresponding
591595
#' cell types.
596+
#' @param HOUSEMAN_A_TOKEN Similar to \code{TRUE_A_TOKEN}, but not containing the true proportions, rather the estimated proportions
597+
#' by Houseman's method.
592598
#' @param ID_COLUMN Sample-specific ID column name in \code{RNB_SET}
593599
#' @param FILTER_COVERAGE Flag indicating if site-filtering based on coverage is to be conducted.
594600
#' @param MIN_COVERAGE Minimum number of reads required in each sample for the site to be considered for adding to MeDeCom.
@@ -612,7 +618,9 @@ prepare_data_BS <- function(
612618
SAMPLE_SELECTION_GREP=NA,
613619
REF_CT_COLUMN=NA,
614620
PHENO_COLUMNS=NA,
621+
PREPARE_TRUE_PROPORTIONS=FALSE,
615622
TRUE_A_TOKEN=NA,
623+
HOUSEMAN_A_TOKEN=NA,
616624
ID_COLUMN=rnb.getOption("identifiers.column"),
617625
FILTER_COVERAGE = hasCovg(RNB_SET),
618626
MIN_COVERAGE=5,
@@ -659,13 +667,26 @@ prepare_data_BS <- function(
659667
sample_ids<-pd[,ID_COLUMN]
660668
saveRDS(sample_ids, file=sprintf("%s/sample_ids.RDS", OUTPUTDIR))
661669
}
662-
if(!is.na(TRUE_A_TOKEN)){
663-
664-
trueA<-t(na.omit(pd[subs,grep(TRUE_A_TOKEN, colnames(pd))]))
665-
rownames(trueA)<-gsub(TRUE_A_TOKEN, "", colnames(pd)[grep(TRUE_A_TOKEN, colnames(pd))])
666-
667-
save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
670+
if(PREPARE_TRUE_PROPORTIONS){
671+
if(!is.na(TRUE_A_TOKEN)){
672+
673+
trueA<-t(na.omit(pd[subs,grep(TRUE_A_TOKEN, colnames(pd))]))
674+
trueA <- apply(trueA,c(1,2),as.numeric)
675+
rownames(trueA)<-gsub(TRUE_A_TOKEN, "", colnames(pd)[grep(TRUE_A_TOKEN, colnames(pd))])
676+
677+
save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
678+
679+
}
668680

681+
if(!is.na(HOUSEMAN_A_TOKEN)){
682+
683+
trueA<-t(na.omit(pd[,grep(HOUSEMAN_A_TOKEN, colnames(pd))]))
684+
trueA <- apply(trueA,c(1,2),as.numeric)
685+
rownames(trueA)<-gsub(HOUSEMAN_A_TOKEN, "", colnames(pd)[grep(HOUSEMAN_A_TOKEN, colnames(pd))])
686+
687+
save(trueA, file=sprintf("%s/trueA.RData", OUTPUTDIR))
688+
689+
}
669690
}
670691
if(!is.na(REF_CT_COLUMN)){
671692
ct<-pd[[REF_CT_COLUMN]]

R/start_analysis.R

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,8 +357,11 @@ start_medecom_analysis<-function(
357357
ref.meth <- trueT
358358
save(ref.meth,file=file.path(store.path,"ref_meth.RData"))
359359
}
360+
if(!is.null(trueA)){
361+
ref.props <- trueA
362+
save(ref.props,file=file.path(store.path,"ref_props.RData"))
363+
}
360364
}
361-
362365
return(result)
363366
}
364367

@@ -544,7 +547,9 @@ start_decomp_pipeline <- function(rnb.set,
544547
SAMPLE_SELECTION_GREP = sample.selection.grep,
545548
REF_CT_COLUMN=ref.ct.column,
546549
PHENO_COLUMNS=pheno.cols,
550+
PREPARE_TRUE_PROPORTIONS=prepare.true.proportions,
547551
TRUE_A_TOKEN=true.A.token,
552+
HOUSEMAN_A_TOKEN=houseman.A.token,
548553
ID_COLUMN=id.column,
549554
FILTER_COVERAGE = filter.coverage,
550555
MIN_COVERAGE=min.coverage,

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Large automated pipeline for running MeDeCom
33

44
# Using Decomp
5-
*DecompPipeline* includes three major steps, all of them are extensively documented. A more detailed introduction into *DecompPipeline* can be found in the package vignette (https://github.com/lutsik/DecompPipeline/blob/master/vignettes/DecompPipeline.Rmd).
5+
*DecompPipeline* includes three major steps, all of which are extensively documented. A more detailed introduction to *DecompPipeline* can be found in the package vignette (https://github.com/lutsik/DecompPipeline/blob/master/vignettes/DecompPipeline.md).
66

77
## CpG filtering
88
There are dedicated preprocessing steps for both array-based data sets (```prepare_data```) and sequencing-based data sets (```prepare_data_BS```).

man/prepare_data_BS.Rd

Lines changed: 9 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vignettes/DecompPipeline.Rmd

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,17 @@
11
---
2-
title: "DecompPipeline: Preprocessing of DNA Methylation data for MeDeCom"
2+
title: 'DecompPipeline: Preprocessing of DNA Methylation data for MeDeCom'
33
author: "Michael Scherer, Pavlo Lutsik"
4-
date: "`r Sys.Date()`"
4+
date: '`r Sys.Date()`'
55
output:
6-
rmarkdown::html_document:
7-
mathjax: default
8-
toc: true
9-
number_sections: false
10-
fig_width: 5
6+
html_document:
117
fig_height: 5
12-
vignette: >
13-
%\VignetteIndexEntry{MeDeCom}
14-
%\VignetteEngine{knitr::rmarkdown}
15-
\usepackage[utf8]{inputenc}
8+
fig_width: 5
9+
keep_md: yes
10+
mathjax: default
11+
number_sections: no
12+
toc: yes
13+
pdf_document:
14+
toc: yes
1615
bibliography: biblio.bib
1716
---
1817

@@ -66,7 +65,7 @@ names(data.prep)
6665

6766
For bisulfite sequencing data sets, different filtering criteria apply. First, an absolute coverage threshold can be specified with ```MIN_COVERAGE``` to remove all sites with lower coverage. Similar to array-based data sets, the upper and lower quantiles of coverage can be omitted using ```MIN_COVG_QUANT``` and ```MAX_COVG_QUANT```. In complete accordance with array-based data sets, sites that have missing values, are located at annotated SNPs, or are located on sex chromosomes can be removed.
6867

69-
```{r}
68+
```{r, eval=T}
7069
rnb.set <- load.rnb.set(system.file("extdata/small_rnbSet.zip",package="DecompPipeline"))
7170
data.prep.bs <- prepare_data_BS(RNB_SET = rnb.set,
7271
MIN_COVERAGE = 5,
@@ -96,7 +95,7 @@ Since performing MeDeCom on complete 450k/EPIC or BS datasets is still computati
9695

9796
For most of the options (except for **houseman2012**, **jaffe2014**, and **range**) the number of selected sites can be specified using the parameter ```N_MARKERS```. In contrast to CpG filtering, subset selection is independent of the data type (array-based and BS). The function returns a list, with each entry containing row indices of the selected sites:
9897

99-
```{r}
98+
```{r, eval=T}
10099
cg_subsets <- prepare_CG_subsets(rnb.set=data.prep$rnb.set.filtered,
101100
MARKER_SELECTION = c("houseman2012","var"),
102101
N_MARKERS = 1000
@@ -108,7 +107,7 @@ lengths(cg_subsets)
108107

109108
After these preprocessing steps, you are ready to perform the actual MeDeCom analysis using the ```start_medecom_analysis``` function. To store output in a format that can later be read by FactorViz, you need to set the flag ```factorviz.outputs```. Further parameters are described in detail in the reference manual.
110109

111-
```{r}
110+
```{r, eval=F}
112111
md.res <- start_medecom_analysis(rnb.set=data.prep$rnb.set.filtered,
113112
cg_groups = cg_subsets,
114113
Ks=2:5,
@@ -120,7 +119,7 @@ md.res <- start_medecom_analysis(rnb.set=data.prep$rnb.set.filtered,
120119

121120
You can also perform all the steps above by calling a single function:
122121

123-
```{r}
122+
```{r, eval=F}
124123
md.res <- start_decomp_pipeline(rnb.set=rnb.set,
125124
Ks=2:5,
126125
lambda.grid = c(0.01,0.001),

0 commit comments

Comments
 (0)