This repository was archived by the owner on May 21, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclusteringPart2.R
More file actions
69 lines (59 loc) · 1.92 KB
/
clusteringPart2.R
File metadata and controls
69 lines (59 loc) · 1.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
## This part deals with discovering the
## correct number of clusters.
library(kernlab)
library(ggplot2)
require(plot3D)
library(tm)
source("connect.R")
source("gapkernel.R")
## Sweep kkmeans() over k = 2 .. limit + 1 on a kernel matrix and chart the
## total within-cluster sum of squares obtained for each k.
##
## Arguments:
##   K      kernel matrix handed to kkmeans().
##   limit  number of candidate cluster counts to try (k = 2 .. limit + 1).
##   name   kernel label; used in the chart title and the output file name.
##   kReal  real number of categories; only shown in the chart title.
##
## Side effects: reseeds the RNG with set.seed(20) and writes
## charts/<name>.jpg (the charts directory is created when missing).
##
## Returns (invisibly) the candidate k with the smallest total, or
## NA_integer_ (with a warning, and no chart) when every fit failed.
findBestK <- function(K, limit, name, kReal) {
  candidates <- seq_len(limit) + 1L
  ## Start from NA so a failed fit is never mistaken for a total of 0.
  ss <- rep(NA_real_, length(candidates))
  set.seed(20)
  for (idx in seq_along(candidates)) {
    centers <- candidates[idx]
    ss[idx] <- tryCatch(
      sum(withinss(kkmeans(K, centers = centers, nstart = 10))),
      error = function(e) {
        message("Param combination does not work (k = ", centers, "): ",
                conditionMessage(e))
        NA_real_
      }
    )
  }

  if (all(is.na(ss))) {
    warning("No kkmeans fit succeeded for ", name, "; no chart written",
            call. = FALSE)
    return(invisible(NA_integer_))
  }

  dir.create("charts", showWarnings = FALSE)
  ## file.path/paste0 rather than paste(): paste() would put spaces around
  ## the kernel name inside the file name.
  jpeg(file.path("charts", paste0(name, ".jpg")))
  ## Close the device even when plot() fails.
  on.exit(dev.off(), add = TRUE)
  plot(candidates, ss, type = "o",
       main = paste(name, "-kernel ", "k real = ", as.character(kReal)),
       xlab = "different k", ylab = "sum of ss")

  ## which.min() skips NA entries, so failed fits cannot win.
  invisible(candidates[which.min(ss)])
}
## Driver: sample documents from the Reuters table, strip them down with tm,
## then run findBestK() once per string kernel.

## Size of the samples
N <- 30

## Read table
reuters <- read.table("reuters.txt.gz", header = TRUE)
reuters$Content <- as.character(reuters$Content)
reuters$Topic <- factor(reuters$Topic)

## Sample a number cat of categories
cat <- 15
set.seed(6)
topics <- factor(sample(unique(reuters$Topic), cat))
## %in% is vectorised, so the whole column is tested in one call.
## NOTE(review): column 1 is presumably Topic -- confirm against the data file.
reuters <- reuters[reuters[, 1] %in% topics, ]
set.seed(6)
reuters <- reuters[sample(seq_len(nrow(reuters)), N), ]

## Preprocess the data: drop punctuation, collapse whitespace, drop numbers
docs <- Corpus(VectorSource(reuters$Content))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, removeNumbers)
reuters$Content <- sapply(docs, function(x) {
  x$content
})

## Correct number of categories (distinct values left in column 1)
kReal <- length(unique(reuters[, 1]))
limit <- 11

## Spectrum kernel. type is named explicitly: stringdot()'s formals are
## (length, lambda, type, normalized), so an unnamed string next to a named
## length would be matched to lambda instead of type.
k <- stringdot(type = "spectrum", length = 2)
K <- kernelMatrix(k, reuters$Content)
findBestK(K, limit, "SPECTRUM", kReal)

## Custom kernel wrapping connect (presumably defined in connect.R)
k <- new("kernel", .Data = connect, kpar = list())
K <- kernelMatrix(k, reuters$Content)
findBestK(K, limit, "CONNECT", kReal)

## Gap kernel built by makeGapKernel (presumably defined in gapkernel.R)
k <- makeGapKernel(0.1, 8)
K <- kernelMatrix(k, reuters$Content)
findBestK(K, limit, "GAP", kReal)

## Exponential string kernel
k <- stringdot(type = "exponential", length = 4, lambda = 1.4)
K <- kernelMatrix(k, reuters$Content)
findBestK(K, limit, "EXPONENTIAL", kReal)