ClinicalNetDynamics/cnd_project_clustering_checks_pipe.Rmd at main · HolmesLab/ClinicalNetDynamics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
---
author: Carrisa V. Cocuzza, PhD
Purpose: CND project, TCP dataset, Train/Test subset, dimensional phenotyping, hierarchical clustering checks.
output: html_notebook
---
```{r}
#install.packages("NbClust")
```

```{r}
# LIBRARIES
library(dplyr)
library(bestNormalize)
library(parallel)
library(tibble)
library(tidyr)
library(Hmisc)
# Attach the remaining packages by name. library() is used instead of
# require() so that a missing package stops the notebook right here with a
# clear error, rather than returning FALSE and failing in a later chunk.
# invisible() suppresses the list that lapply() would otherwise print.
packages <- c("here", "psych", "NbClust", "igraph", "qgraph")
invisible(lapply(packages, library, character.only = TRUE))
```


```{r}
# LOAD DATA. NOTE: CHANGE TO YOUR DIRECTORY.
# dirHere must end with a path separator: the file name is appended to it
# directly, without inserting one.
dirHere <- "/gpfs/milgram/project/holmes/cvc23/ClinicalNetDynamics/data/results/misc_results_CND/"
#fileHere <- paste0(dirHere, "dataHere_Imputed_TrainTest.csv")
fileHere <- paste0(dirHere, "dataHere_Imputed_TrainTest_select.csv")

# Fail early with an actionable message when the hard-coded path does not
# exist on this machine.
if (!file.exists(fileHere)) {
  stop("Data file not found: ", fileHere,
       " (edit 'dirHere' to point at your copy of the data).",
       call. = FALSE)
}

dataHere <- read.csv(fileHere)
class(dataHere)
nMetricsHere <- ncol(dataHere)
nSubjsHere <- nrow(dataHere)
# The first two columns are not counted as scales (presumably identifier
# columns -- confirm against the CSV).
print(paste("There are", nSubjsHere, "subjects and", nMetricsHere - 2, "scales."))
```


```{r}
# Individual differences correlations between all measures
# Columns 3..nMetricsHere are used as the measures. Guard the lower bound:
# with fewer than 3 columns, 3:nMetricsHere would count backwards and select
# the wrong columns.
stopifnot(nMetricsHere >= 3)
# drop = FALSE keeps a single selected column as a data frame, so its column
# name survives the conversion to a matrix.
dataHere_Matrix <- as.matrix(dataHere[, 3:nMetricsHere, drop = FALSE])
#indivCorrs <- rcorr(dataHere_Matrix,type="spearman")
# Spearman rank correlation between every pair of columns.
indivCorrs <- cor(dataHere_Matrix, method = "spearman")
# cor() yields NA for columns containing missing values or with zero
# variance; report the affected columns here instead of letting the
# clustering steps fail on a non-finite distance.
if (anyNA(indivCorrs)) {
  stop("Correlation matrix contains NA for column(s): ",
       paste(colnames(indivCorrs)[colSums(is.na(indivCorrs)) > 0],
             collapse = ", "),
       " (check for missing values or constant columns).",
       call. = FALSE)
}
```


```{r}
# Sanity check that hierarchical clustering runs: Ward (ward.D2) linkage on
# the dissimilarity 1 - correlation, cut at a provisional number of clusters.
nClusters <- 4
h <- hclust(d = as.dist(1 - indivCorrs), method = "ward.D2")
# Cluster label assigned to each measure.
clusters <- cutree(tree = h, k = nClusters)
print(clusters)
```


```{r}
# nbclust: find ideal clusters
#
# Runs NbClust once per validity index on the correlation matrix, using
# 1 - correlation as the dissimilarity and ward.D2 linkage. The number of
# clusters preferred by each index (searched between min.nc and max.nc) is
# stored in nbPartition, and the most frequently chosen value is kept in
# nb_cluster.

# "NbClust" criteria, to select number of clusters:
nom_methode <- c("kl", "ch", "hartigan", "cindex", "db", "silhouette",
                 "ratkowsky", "ball", "ptbiserial", "gap", "frey", "mcclain",
                 "gamma", "gplus", "dunn", "sdindex", "sdbw",
                 "tau")

# NOTE: other indices:
# "ccc", "scott", "marriot", "trcovw", "tracew", "friedman", "rubin" -->  (cannot handle NaN/NULL)
# "duda", "pseudot2", "beale" -- > (for some reason cannot find best (best.nc object))
# "hubert", "dindex" -- (throws error "replacement has length zero")

# NOTE(review): the "gap" index appears to rely on randomly simulated
# reference data, so the seed is fixed to make the vote reproducible between
# runs -- confirm against the NbClust documentation.
set.seed(1)

# The dissimilarity is the same for every index, so build it once.
nbDiss <- as.dist(1 - indivCorrs, diag = FALSE)

# One slot per index; NA marks an index that failed and takes no part in the vote.
nbPartition <- rep(NA_real_, length(nom_methode))

for (i in seq_along(nom_methode)) {
  # A failing index is reported and skipped instead of aborting the loop.
  nbPartition[i] <- tryCatch(
    NbClust(indivCorrs,
            diss = nbDiss,
            distance = NULL,
            min.nc = 4,
            max.nc = 6,
            method = "ward.D2",
            index = nom_methode[i])$Best.nc[1],
    error = function(e) {
      warning("NbClust index '", nom_methode[i],
              "' failed and is excluded from the vote: ",
              conditionMessage(e), call. = FALSE)
      NA_real_
    }
  )
  print(paste0("Number of Clusters: ", nbPartition[i], " (criteria: ",
               nom_methode[i], ")"))
}

if (all(is.na(nbPartition))) {
  stop("No NbClust index returned a number of clusters.", call. = FALSE)
}

# table() ignores NA entries, so only successful indices are counted.
nb_cluster_table <- table(nbPartition)
if (sum(nb_cluster_table == max(nb_cluster_table)) > 1) {
  # which.max() returns the first maximum, i.e. the smallest tied value.
  warning("Several cluster counts are tied for most votes; ",
          "using the smallest of them.", call. = FALSE)
}
nb_cluster <- as.integer(names(nb_cluster_table)[which.max(nb_cluster_table)])
print(paste0("Number of clusters chosen = ", nb_cluster))

```

```{r}
# Final clustering: rebuild the ward.D2 dendrogram on 1 - correlation and cut
# it at the number of clusters selected by the NbClust vote (nb_cluster).
nClusters <- nb_cluster
h <- hclust(d = as.dist(1 - indivCorrs), method = "ward.D2")
clusters <- cutree(tree = h, k = nClusters)
# Show which measures were assigned to the first cluster.
which(clusters == 1)
```