is-vaje/v7/lab7_code.R

################################################################
#
# Combining machine learning algorithms
#
################################################################

#install.packages("CORElearn")
library(CORElearn)

vehicle <- read.table("vehicle.txt", sep=",", header = T)
summary(vehicle)

set.seed(8678686)
sel <- sample(1:nrow(vehicle), size=as.integer(nrow(vehicle)*0.7), replace=F)
learn <- vehicle[sel,]
test <- vehicle[-sel,]

learn$Class <- as.factor(learn$Class)
test$Class <- as.factor(test$Class)

table(learn$Class)
table(test$Class)

CA <- function(observed, predicted)
{
  t <- table(observed, predicted)

  sum(diag(t)) / sum(t)
}

#
# A simple tree baseline.
#


#
# Voting
#

modelDT <- CoreModel(Class ~ ., learn, model="tree")
modelNB <- CoreModel(Class ~ ., learn, model="bayes")
modelKNN <- CoreModel(Class ~ ., learn, model="knn", kInNN = 2)

predDT <- predict(modelDT, test, type = "class")
result.caDT <- CA(test$Class, predDT)
result.caDT

predNB <- predict(modelNB, test, type="class")
result.caNB <- CA(test$Class, predNB)
result.caNB

predKNN <- predict(modelKNN, test, type="class")
result.caKNN <- CA(test$Class, predKNN)
result.caKNN

# combine predictions into a data frame
pred <- data.frame(predDT, predNB, predKNN)
pred

# the class with the most votes wins
voting <- function(predictions)
{
  res <- vector()

  for (i in 1 : nrow(predictions))
  {
    vec <- unlist(predictions[i,])
    res[i] <- names(which.max(table(vec)))
  }

  factor(res, levels=levels(predictions[,1]))
}

predicted <- voting(pred)
result.voting <- CA(test$Class, predicted)
result.voting

#
# Weighted voting
#

predDT.prob <- predict(modelDT, test, type="probability")
predNB.prob <- predict(modelNB, test, type="probability")
predKNN.prob <- predict(modelKNN, test, type="probability")

# combine predictions into a data frame
pred.prob <- result.caDT * predDT.prob + result.caNB * predNB.prob + result.caKNN * predKNN.prob
pred.prob

# We can visualize the joint output space!
heatmap(pred.prob, col=c('red','green','blue'))
legend(x="right", legend=c("min", "med", "max"),
       fill=c('red','green','blue'))

# pick the class with the highest score
highest <- apply(pred.prob, 1, which.max)
classes <- levels(learn$Class)
predicted <- classes[highest]

result.wvoting <- CA(test$Class, predicted)
result.wvoting


#
# Bagging
#

#install.packages("ipred")
library(ipred)

bag <- bagging(Class ~ ., learn, nbagg=14)
bag.pred <- predict(bag, test, type="class")
result.bagging <- CA(test$Class, bag.pred)
result.bagging


#
# Random forest as a variation of bagging
#

# install.packages("randomForest")
library(randomForest)

rf <- randomForest(Class ~ ., learn)
rf.predicted <- predict(rf, test, type = "class")
result.rf <- CA(test$Class, rf.predicted)
result.rf


#
# Boosting
#

# install.packages("adabag")
library(adabag)

bm <- boosting(Class ~ ., learn)
predictions.bm <- predict(bm, test)
names(predictions.bm)

predicted.bm <- predictions.bm$class
result.boosting <- CA(test$Class, predicted.bm)
result.boosting

#
# Caret package
#

# install.packages("caret")
library(caret)
caretModel <- train(Class ~ ., learn, method="xgbLinear", eta=1, verbose=1)
pred <- predict(caretModel, test)
result.xgb <- CA(test$Class, pred)
result.xgb

# Let's visualize currently considered ensemble methods.
performances <- c(result.rf,
                  result.bagging,
                  result.wvoting,
                  result.boosting,
                  result.caDT,
                  result.caKNN,
                  result.caNB,
                  result.xgb)

algo.names <- c("RF","Bagging","Weighted vote","Boosting","DT","KNN","NB","XGB")
ensemble.model <- as.factor(c(1,1,1,1,0,0,0,1))
result.vec <- data.frame(performances, algo.names,ensemble.model)
reordering <- order(result.vec$performances)
result.vec <- result.vec[reordering,]
rownames(result.vec) <- NULL
result.vec

library(ggplot2)
positions <- as.vector(result.vec$algo.names)
ggplot(data=result.vec, aes(x=algo.names, y=performances, color = ensemble.model)) +
  geom_point(size = 10, shape = 4) +
  scale_x_discrete(limits = positions) +
  ylim(0.5,0.8) +
  xlab("Ensemble type") +
  ylab("Accuracy") +
  geom_hline(yintercept=max(performances), color = "darkgreen") +
  geom_hline(yintercept=min(performances), color = "black") +
  title("Performance comparison") +
  geom_text(label = positions,nudge_x = 0.2, nudge_y = -0.01) +
  theme_bw() +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())


#
# Cross-validation
#

# the library ipred is needed to perform cross-validation
library(ipred)

# Tell the cross-validation which model to use
mymodel.coremodel <- function(formula, data, target.model){CoreModel(formula, data, model=target.model)}
# Tell the cross-validation how to obtain predictions
mypredict.generic <- function(object, newdata){predict(object, newdata, type = "class")}
# force the predict function to return class labels only and also destroy the internal representation of a given model
mypredict.coremodel <- function(object, newdata) {pred <- predict(object, newdata)$class; destroyModels(object); pred}
cvError <- errorest(Class~., data=learn, model = mymodel.coremodel, predict = mypredict.coremodel, target.model = "tree")
1 - cvError$error