Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 3 additions & 141 deletions randomForest.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@


setwd("~/PycharmProjects/titanic")


Expand All @@ -25,7 +24,8 @@ exp<- t[,c(3,5:8,10:12)]
colnames(exp)

x<- data.frame(target,exp)
x<-x[sapply(x, function(x) !any(is.na(x)))]
#x<-x[sapply(x, function(x) !any(is.na(x)))]
#vamos a atender el asunto de los NA

str(x)

Expand All @@ -34,141 +34,8 @@ str(x)
x$Sex<-ifelse(x$Sex=="female",0,1)
CABIN<- lapply(x$Cabin, substring, 1, 1)
x$Cabin<-unlist(CABIN)
table(x$Embarked)
empty <- x$Embarked == ""
x$Embarked[empty] <- "S"
x$Embarked<-ifelse(x$Embarked=="C",1,ifelse(x$Embarked=="Q",2,3))
x$Cabin<-as.factor(x$Cabin)
table(x$Cabin)
x$Cabin<-ifelse(x$Cabin=="A",1,ifelse(x$Cabin=="B",
2,ifelse(x$Cabin=="C",3,
ifelse(x$Cabin=="D",
4,ifelse(x$Cabin=="E",
5,ifelse(x$Cabin=="F",
6,ifelse(x$Cabin=="G",
7,ifelse(x$Cabin=="T",8,9))))))))



str(x)


x<-lapply(x, normalize)

x<-as.data.frame(x)

set.seed(7)

rand <- sample(1:nrow(x), nrow(x))

# train is our training sample.
train = x[rand[1:600], ]

# Create a holdout set for evaluating model performance.
# Note: cross-validation is even better than a single holdout sample.

cv = x[601:nrow(x), ]

# Review the outcome variable distribution.CLASIFICACIOON
table(Y_train, useNA = "ifany")

# Set the seed for reproducibility.
set.seed(1)

RF<-randomForest(as.factor(train$target)~.,
data = train[,-1],importance=TRUE,proximity=T,ntree=500)

p<-predict(RF,cv[,-1])

Accuracy(p,cv$target)

########################### intentemos otros modelos que tengo a mano

set.seed(1)

RF<-svm(as.factor(train$target)~.,data = train[,-1],
scale = TRUE,kernel="polynomial",type="C-classification",degree=3)

p<-predict(RF,cv[,-1])

Accuracy(p,cv$target)


#################################################

model <- keras_model_sequential()

model %>%
layer_dense(units = ncol(train[,-1]), activation = 'relu',
input_shape = ncol(train[,-1]), ) %>%
layer_dropout(rate = 0.99) %>%
layer_dense(units = 10, activation = 'relu') %>%
layer_dropout(rate = 0.95) %>%
layer_dense(units = 1, activation = 'sigmoid')

history <- model %>% compile(
loss = 'binary_crossentropy',
optimizer = optimizer_adam(lr=0.001),
metrics = c('accuracy')
)

model %>% fit(
as.matrix(train[,-1],dimname=NULL), train$target,
epochs =5,
batch_size = 20,
validation_split = 0.0
)

#####################################################

#intentemos sacar mas informacion de la data


View(t$Name)
View(t$Ticket)
TT<- lapply(t$Ticket, substring, 1, 1)
x$Ticket<-as.factor(unlist(TT))
table(x$Ticket)



#################con la data nueva

setwd("~/PycharmProjects/titanic")


library(caret)
library(randomForest)
library(MLmetrics)
library(e1071)
library (keras)


normalize <- function(x) {
return((x - min(x)) / (max(x) - min(x)))
}


# read data.frame with target and paste it to explanatory variables data.frame

t <- read.csv("train.csv")
target <- t$Survived

colnames(t)
exp<- t[,c(3,5:8,10:12)]
colnames(exp)

x<- data.frame(target,exp)
x<-x[sapply(x, function(x) !any(is.na(x)))]

str(x)

#tenemos tres variables con factores, vamos a cambiarlas a numericas

x$Sex<-ifelse(x$Sex=="female",0,1)
CABIN<- lapply(x$Cabin, substring, 1, 1)
x$Cabin<-unlist(CABIN)


table(x$Embarked)
empty <- x$Embarked == ""
Expand All @@ -184,11 +51,10 @@ x$Cabin<-ifelse(x$Cabin=="A",1,ifelse(x$Cabin=="B",
6,ifelse(x$Cabin=="G",
7,ifelse(x$Cabin=="T",8,9))))))))


x<-na.aggregate(x)

str(x)


x<-lapply(x, normalize)

x<-as.data.frame(x)
Expand Down Expand Up @@ -224,7 +90,3 @@ Accuracy(p,cv$target)