pfizer-opensource
diff --git a/‎DESCRIPTION‎
Lines changed: 6 additions & 5 deletions b/‎DESCRIPTION‎
Lines changed: 6 additions & 5 deletions
diff --git a/‎NEWS‎
Lines changed: 6 additions & 0 deletions b/‎NEWS‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎R/GSVA.R‎
Lines changed: 10 additions & 9 deletions b/‎R/GSVA.R‎
Lines changed: 10 additions & 9 deletions
diff --git a/‎README.md‎
Lines changed: 57 additions & 167 deletions b/‎README.md‎
Lines changed: 57 additions & 167 deletions
diff --git a/‎data-raw/Build_DILI_Hotgenes.R‎
Lines changed: 174 additions & 0 deletions b/‎data-raw/Build_DILI_Hotgenes.R‎
Lines changed: 174 additions & 0 deletions
diff --git a/‎examples/Hotgeneslimma_Example.R‎
Lines changed: 0 additions & 2 deletions b/‎examples/Hotgeneslimma_Example.R‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎inst/extdata/dili_raw.RDS‎
839 KB b/‎inst/extdata/dili_raw.RDS‎
839 KB
@@ -1,14 +1,14 @@
 Package: Hotgenes
 Type: Package
-Title: Tools to simplify Omics DE analysis
-Version: 0.0.52
+Title: Tools to Simplify Omics DE Analysis
+Version: 0.0.53
 Author: Richard Virgen-Slane <Richard.Virgen-Slane@pfizer.com>
 Maintainer: Richard Virgen-Slane <Richard.Virgen-Slane@pfizer.com>
 Description: Converts outputs from DESeq2, limma, or a method of your
     choice into a Hotgenes object to simplify downstream analysis. 
     Recall all results, specific genes, or identify genes of interest with 
     hierarchical clustering on principal components.
-License: Apache License (>= 2)
+License: Apache License (>= 2) | file LICENSE
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.3.3
@@ -31,7 +31,6 @@ Imports:
     grid,
     GSVA (>= 1.46.0),
     janitor,
-    knitr,
     limma,
     methods,
     msigdbr (>= 7.5.1),
@@ -61,13 +60,15 @@ Suggests:
     apeglm,
     edgeR,
     devtools,
+    knitr,
     roxygen2,
     testthat,
     org.Hs.eg.db,
     org.Mm.eg.db,
     plotly,
     DelayedArray (>= 0.24.0),
     DBI,
-    rmarkdown
+    rmarkdown,
+    vsn
 VignetteBuilder: knitr
 Depends: R (>= 4.2.0)
@@ -1,3 +1,9 @@
+# version 0.0.53:
+    - Added Build_DILI_Hotgenes.R to data-raw — builds a Hotgenes object
+      from the publicly available DILI discovery proteomics dataset
+      (Federspiel et al., MassIVE MSV000089782).
+    - Other cleanup
+
 # version 0.0.52:
     - Exported make_stat_frame()
     - Cleaned up documentation in Venn methods.
 
@@ -22,7 +22,7 @@
 #' @param method string for method to use for GSVA package. Options
 #' include: c("ssgsea","gsva", "zscore", "plage").
 #' @export
-#' @example examples/Hotgeneslimma_Example.R
+#' @example examples/GSVA_Example.R
 
 HotgeneSets <- function(Hotgenes = NULL,
 ExpressionSlots = NULL,
@@ -58,7 +58,7 @@ geneset_weights <- gsva_outList$geneset_weights
 
 if(use_vooma){
 
-glue::glue("using vooma") %>% message()
+  cli::cli_inform("using vooma")
 
 # limma method
 if(is.null(voomaGroup)){
@@ -87,9 +87,8 @@ vooma_plot <- NULL
 }
 
 if(use_weights){
-glue::glue("using geneset weights") %>% message()
-
 
+cli::cli_inform("using geneset weights")
 trend <- geneset_weights
 
 } else {
@@ -103,7 +102,8 @@ fit <- limma::lmFit(vm_exp)
 
 # using contrast matrix
 if(!is.null(contrast_matrix)){
-glue::glue("using contrast matrix") %>% message()
+  
+cli::cli_inform("using contrast matrix")  
 
 fit_final <- limma::contrasts.fit(fit = fit,
 contrasts = contrast_matrix) 
@@ -162,17 +162,18 @@ Matched_ExpSel <- match.arg(ExpressionSlots, ExprOptions)
 
 # prepare to remap
 if (MapperCol == "Feature") {
-print("using Feature col")
 
+cli::cli_inform("using Feature col")
+  
 NormalizedData <- Normalized_Data_(
 Hotgenes,
 slot = Matched_ExpSel
 ) %>%
 as.matrix()
 } else if (MapperCol != "Feature") {
-paste0("using ", MapperCol, " col") %>%
-print()
 
+cli::cli_inform("using {MapperCol} col")  
+  
 # This converts ids from expression data to
 # ids supplied in original mapper slot
 
@@ -198,7 +199,7 @@ as.matrix()
 
 # build a new mapper for geneset names
 
-print("building mapper")
+cli::cli_inform("building mapper")
 
 Featur_s <- rownames(NormalizedData)
 
 
@@ -0,0 +1,174 @@
+# Build DILI Discovery Proteomics Hotgenes Object -------------------------
+# Source: Federspiel et al. (2023), J Hepatol
+# Data:   MassIVE MSV000089782 (public deposit)
+#
+# Differences from the original analysis:
+#   The original analysis used two variables from private metadata
+#   (1219_p1-p5_protein_KEY.xlsx) that are not available in the public
+#   MassIVE deposit:
+#
+#   (1) subject: patient ID — used with duplicateCorrelation() to control
+#       for repeated measures (DO and DF samples from the same patient).
+#       Without subject IDs this block cannot be reproduced and is omitted.
+#
+#   (2) channel: exact TMT channel label (e.g. 126C, 127N) within each
+#       pool — used with voomaByGroup() to compensate for channel-specific
+#       variance. Pool (P1-P5) is used as a proxy here, as it captures
+#       the dominant source of TMT batch variance.
+#
+#   All other analytical steps — VSN normalization, voomaByGroup, robust
+#   lmFit, and BH-adjusted p-value threshold of 0.1 — reproduce those
+#   described in Federspiel et al.
+
+
+# 1. Load data ------------------------------------------------------------
+# dili_raw.RDS is a pre-parsed snapshot of the public MassIVE deposit
+# (MSV000089782), saved to inst/extdata to avoid a runtime download.
+# To refresh it, run the body of the `if (FALSE)` block below by hand.
+
+# Read the bundled snapshot from the installed package; mustWork = TRUE turns
+# a missing file into an immediate error instead of an empty path.
+raw_df <- readRDS(
+  system.file(
+    "extdata", "dili_raw.RDS",
+    package = "Hotgenes", mustWork = TRUE
+  )
+)
+
+# Manual refresh of dili_raw.RDS from MassIVE. The if (FALSE) guard keeps this
+# from running when the script is executed top to bottom; run the body by hand.
+# The snapshot is written to inst/extdata under the current working directory.
+if (FALSE) {
+  massive_url <- paste0(
+    "https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?",
+    "file=f.MSV000089782%2Fupdates%2F2023-02-28_jfederspiel_1bc96582",
+    "%2Fother%2FDILI_discovery_data.xlsx&forceDownload=true"
+  )
+  xlsx_path <- tempfile(fileext = ".xlsx")
+  download.file(massive_url, xlsx_path, mode = "wb")
+
+  # Parse the workbook and normalise its column names in one step.
+  raw_df <- janitor::clean_names(readxl::read_excel(xlsx_path))
+
+  snapshot_path <- file.path(getwd(), "inst", "extdata", "dili_raw.RDS")
+  saveRDS(raw_df, file = snapshot_path)
+}
+
+
+# 2. Sample columns -------------------------------------------------------
+
+# Sample columns are the ones whose name starts with a condition prefix
+# followed by an underscore.
+sample_cols <- grep(
+  "^(do|df|hv|nafld|ndo|ndf)_",
+  colnames(raw_df),
+  value = TRUE
+)
+
+
+# 3. Filter proteins ------------------------------------------------------
+# - remove contaminants
+# - keep reviewed UniProt entries (^sp|)
+# - keep min_peps > 0
+# - generate unique gene symbols as Feature (required for GSEA)
+
+# filter() ANDs its conditions, so a single call applies all three row
+# filters; rows where a condition evaluates to NA are dropped as well.
+# make.names() also rewrites characters that are not valid in R names
+# (e.g. "-" becomes "."), so Feature can differ from gene_symbol.
+filtered_exps <- dplyr::mutate(
+  dplyr::filter(
+    raw_df,
+    !grepl("contaminant", .data$protein),
+    grepl("^sp[|]", .data$protein),
+    .data$min_peps > 0
+  ),
+  Feature = make.names(.data$gene_symbol, unique = TRUE)
+)
+
+
+# 4. Expression matrix ----------------------------------------------------
+
+# Keep Feature plus the sample columns, move Feature into the row names, and
+# coerce the result to a matrix (features in rows, samples in columns).
+expr_matrix <- as.matrix(
+  tibble::column_to_rownames(
+    dplyr::select(filtered_exps, "Feature", dplyr::any_of(sample_cols)),
+    var = "Feature"
+  )
+)
+
+
+# 5. Protein ID mapper ----------------------------------------------------
+
+# One row per Feature; the identifier columns are renamed as they are
+# selected (new name on the left, source column on the right).
+mapper_df <- dplyr::select(
+  filtered_exps,
+  "Feature",
+  Gene        = "gene_symbol",
+  Protein     = "protein",
+  Description = "description"
+)
+
+
+# 6. Sample metadata (coldata) --------------------------------------------
+
+# Sample names are expected to look like <condition>_p<pool>_<n>, as implied
+# by the patterns below: Condition is the name with the trailing
+# _p<pool>_<n> removed, Pool is the p<pool> token; both are upper-cased.
+coldata <- data.frame(
+  Sample    = sample_cols,
+  Condition = toupper(sub("_p[0-9]+_[0-9]+$", "", sample_cols)),
+  Pool      = toupper(sub(".*_(p[0-9]+)_.*", "\\1", sample_cols)),
+  row.names = sample_cols,
+  stringsAsFactors = TRUE
+)
+
+# Fix the level order explicitly (HV first) instead of relying on the
+# alphabetical default.
+coldata[["Condition"]] <- factor(
+  coldata[["Condition"]],
+  levels = c("HV", "DO", "DF", "NDO", "NDF", "NAFLD")
+)
+
+# sub() returns its input unchanged when the pattern does not match, which
+# would give an NA Condition (unknown level) or a Pool equal to the whole
+# sample name. Fail here, naming the offending columns, rather than letting
+# the model fit break or samples be grouped incorrectly later on.
+bad_samples <- sample_cols[
+  is.na(coldata[["Condition"]]) |
+    !grepl("^P[0-9]+$", as.character(coldata[["Pool"]]))
+]
+if (length(bad_samples) > 0) {
+  stop(
+    "Could not parse condition/pool from sample column(s): ",
+    paste(bad_samples, collapse = ", "),
+    call. = FALSE
+  )
+}
+
+
+# 7. VSN normalization ----------------------------------------------------
+
+# Zero-fill missing intensities, then VSN-normalise. expr_matrix itself is
+# overwritten, so every later use of expr_matrix sees the zero-filled values.
+# NOTE(review): zero-filling before VSN treats missing values as real
+# intensities; confirm this matches the intended analysis.
+expr_matrix <- replace(expr_matrix, is.na(expr_matrix), 0)
+expr_matrix_vsn <- limma::normalizeVSN(expr_matrix)
+
+
+# 8. Design matrix --------------------------------------------------------
+
+# Cell-means design: no intercept, one indicator column per Condition level.
+design <- stats::model.matrix(~ 0 + Condition, data = coldata)
+
+# model.matrix() names the columns "Condition<level>"; drop the prefix so the
+# columns are named by level alone.
+colnames(design) <- gsub(
+  pattern     = "Condition",
+  replacement = "",
+  x           = colnames(design)
+)
+
+
+# 9. voomaByGroup ---------------------------------------------------------
+
+# Pool is the grouping variable passed to voomaByGroup(); the diagnostic
+# plot is switched off.
+vm_exp <- limma::voomaByGroup(
+  y = expr_matrix_vsn,
+  group = coldata[["Pool"]],
+  design = design,
+  plot = FALSE
+)
+
+
+# 10. Robust lmFit --------------------------------------------------------
+# duplicateCorrelation() is not used: the subject IDs needed to model the
+# DO/DF repeated measures are not available in the public data.
+
+fit <- limma::lmFit(object = vm_exp, design = design, method = "robust")
+
+
+# 11. Contrasts -----------------------------------------------------------
+
+# Each contrast is named <numerator>_vs_<denominator> and is written in terms
+# of the design-matrix column names.
+contrasts_mat <- limma::makeContrasts(
+  DO_vs_HV = DO - HV,
+  DF_vs_HV = DF - HV,
+  NDO_vs_HV = NDO - HV,
+  NDF_vs_HV = NDF - HV,
+  NAFLD_vs_HV = NAFLD - HV,
+  DF_vs_DO = DF - DO,
+  NDO_vs_DO = NDO - DO,
+  levels = design
+)
+
+# Re-express the fit in terms of the contrasts, then apply eBayes().
+fit2 <- limma::eBayes(limma::contrasts.fit(fit, contrasts_mat))
+
+
+# 12. Hotgenes object -----------------------------------------------------
+
+# Assemble the Hotgenes object from the eBayes fit, the sample metadata, the
+# voomaByGroup output (stored under the name "VSN"), an additional
+# log2(x + 1) expression matrix, and the Feature-to-identifier mapper.
+dili_hotgenes <- Hotgenes::Hotgeneslimma(
+  limmafit        = fit2,
+  coldata         = coldata,
+  Expression      = vm_exp,
+  Expression_name = "VSN",
+  Exps_list       = list(log2 = log2(expr_matrix + 1)),
+  Mapper          = mapper_df
+)
+
+# Explicit print(): a bare object name is not auto-printed under source().
+print(dili_hotgenes)
+
+# Deliberately no rm(list = ls()) here: it would clear the caller's whole
+# workspace, including the dili_hotgenes object that was just built.
@@ -15,8 +15,6 @@ if(interactive()) {
 
   # Hotgeneslimma -----------------------------------------------------------
   require(DESeq2)
-  #require(limma)
-  #require(edgeR)
 
   dds_con_dir <- system.file("extdata",
                              "dds_con.Rdata",