1+ # Build DILI Discovery Proteomics Hotgenes Object -------------------------
2+ # Source: Federspiel et al. (2023), J Hepatol
3+ # Data: MassIVE MSV000089782 (public deposit)
4+ #
5+ # Differences from the original analysis:
6+ # The original analysis used two variables from private metadata
7+ # (1219_p1-p5_protein_KEY.xlsx) that are not available in the public
8+ # MassIVE deposit:
9+ #
10+ # (1) subject: patient ID — used with duplicateCorrelation() to control
11+ # for repeated measures (DO and DF samples from the same patient).
12+ # Without subject IDs this block cannot be reproduced and is omitted.
13+ #
14+ # (2) channel: exact TMT channel label (e.g. 126C, 127N) within each
15+ # pool — used with voomaByGroup() to compensate for channel-specific
16+ # variance. Pool (P1-P5) is used as a proxy here, as it captures
17+ # the dominant source of TMT batch variance.
18+ #
19+ # All other analytical steps — VSN normalization, voomaByGroup, robust
20+ # lmFit, and BH-adjusted p-value threshold of 0.1 — reproduce those
21+ # described in Federspiel et al.
22+
23+
24+ # 1. Load data ------------------------------------------------------------
25+ # dili_raw.RDS is a pre-parsed snapshot of the public MassIVE deposit
26+ # (MSV000089782), saved to inst/extdata to avoid a runtime download.
27+ # To refresh it, run the commented block below.
28+
29+ raw_df <- readRDS(
30+ system.file(" extdata" , " dili_raw.RDS" ,
31+ package = " Hotgenes" ,
32+ mustWork = TRUE )
33+ )
34+
35+ # To re-download and refresh dili_raw.RDS from MassIVE:
36+ if (FALSE ) {
37+ url <- paste0(
38+ " https://massive.ucsd.edu/ProteoSAFe/DownloadResultFile?" ,
39+ " file=f.MSV000089782%2Fupdates%2F2023-02-28_jfederspiel_1bc96582" ,
40+ " %2Fother%2FDILI_discovery_data.xlsx&forceDownload=true"
41+ )
42+
43+ tmp <- tempfile(fileext = " .xlsx" )
44+ download.file(url , tmp , mode = " wb" )
45+
46+ raw_df <- readxl :: read_excel(tmp ) | >
47+ janitor :: clean_names()
48+
49+ saveRDS(
50+ raw_df ,
51+ file = file.path(getwd(), " inst" , " extdata" , " dili_raw.RDS" )
52+ )
53+ }
54+
55+
56+ # 2. Sample columns -------------------------------------------------------
57+
58+ sample_cols <- colnames(raw_df )[
59+ grepl(" ^(do|df|hv|nafld|ndo|ndf)_" , colnames(raw_df ))
60+ ]
61+
62+
63+ # 3. Filter proteins ------------------------------------------------------
64+ # - remove contaminants
65+ # - keep reviewed UniProt entries (^sp|)
66+ # - keep min_peps > 0
67+ # - generate unique gene symbols as Feature (required for GSEA)
68+
69+ filtered_exps <- raw_df | >
70+ dplyr :: filter(! grepl(" contaminant" , .data $ protein )) | >
71+ dplyr :: filter(grepl(" ^sp[|]" , .data $ protein )) | >
72+ dplyr :: filter(.data $ min_peps > 0 ) | >
73+ dplyr :: mutate(Feature = make.names(.data $ gene_symbol , unique = TRUE ))
74+
75+
76+ # 4. Expression matrix ----------------------------------------------------
77+
78+ expr_matrix <- filtered_exps | >
79+ dplyr :: select(" Feature" , dplyr :: any_of(sample_cols )) | >
80+ tibble :: column_to_rownames(" Feature" ) | >
81+ as.matrix()
82+
83+
84+ # 5. Protein ID mapper ----------------------------------------------------
85+
86+ mapper_df <- filtered_exps | >
87+ dplyr :: select(
88+ " Feature" ,
89+ " Gene" = " gene_symbol" ,
90+ " Protein" = " protein" ,
91+ " Description" = " description"
92+ )
93+
94+
95+ # 6. Sample metadata (coldata) --------------------------------------------
96+
97+ coldata <- data.frame (
98+ Sample = sample_cols ,
99+ Condition = toupper(sub(" _p[0-9]+_[0-9]+$" , " " , sample_cols )),
100+ Pool = toupper(sub(" .*_(p[0-9]+)_.*" , " \\ 1" , sample_cols )),
101+ row.names = sample_cols ,
102+ stringsAsFactors = TRUE
103+ )
104+
105+ coldata [[" Condition" ]] <- factor (
106+ coldata [[" Condition" ]],
107+ levels = c(" HV" , " DO" , " DF" , " NDO" , " NDF" , " NAFLD" )
108+ )
109+
110+
111+ # 7. VSN normalization ----------------------------------------------------
112+
113+ expr_matrix [is.na(expr_matrix )] <- 0
114+ expr_matrix_vsn <- limma :: normalizeVSN(expr_matrix )
115+
116+
117+ # 8. Design matrix --------------------------------------------------------
118+
119+ design <- model.matrix(~ 0 + Condition , data = coldata )
120+ colnames(design ) <- gsub(" Condition" , " " , colnames(design ))
121+
122+
123+ # 9. voomaByGroup ---------------------------------------------------------
124+
125+ vm_exp <- limma :: voomaByGroup(
126+ y = expr_matrix_vsn ,
127+ group = coldata [[" Pool" ]],
128+ design = design ,
129+ plot = FALSE
130+ )
131+
132+
133+ # 10. Robust lmFit --------------------------------------------------------
134+ # Note: duplicateCorrelation() is omitted — subject IDs required to model
135+ # repeated measures (DO/DF pairing) are not available in the public data.
136+
137+ fit <- limma :: lmFit(
138+ vm_exp ,
139+ design ,
140+ method = " robust"
141+ )
142+
143+
144+ # 11. Contrasts -----------------------------------------------------------
145+
146+ contrasts_mat <- limma :: makeContrasts(
147+ DO_vs_HV = DO - HV ,
148+ DF_vs_HV = DF - HV ,
149+ NDO_vs_HV = NDO - HV ,
150+ NDF_vs_HV = NDF - HV ,
151+ NAFLD_vs_HV = NAFLD - HV ,
152+ DF_vs_DO = DF - DO ,
153+ NDO_vs_DO = NDO - DO ,
154+ levels = design
155+ )
156+
157+ fit2 <- limma :: contrasts.fit(fit , contrasts_mat )
158+ fit2 <- limma :: eBayes(fit2 )
159+
160+
161+ # 12. Hotgenes object -----------------------------------------------------
162+
163+ dili_hotgenes <- Hotgenes :: Hotgeneslimma(
164+ limmafit = fit2 ,
165+ coldata = coldata ,
166+ Expression = vm_exp ,
167+ Expression_name = " VSN" ,
168+ Exps_list = list (log2 = log2(expr_matrix + 1 )),
169+ Mapper = mapper_df
170+ )
171+
172+ dili_hotgenes
173+
174+ rm(list = ls())
0 commit comments