# is-vaje/v7/lab7_code.R  (208 lines, 5.2 KiB, R)

################################################################
#
# Combining machine learning algorithms
#
################################################################
#install.packages("CORElearn")
library(CORElearn)

# Load the vehicle silhouette data set (comma-separated, with a header row).
vehicle <- read.table("vehicle.txt", sep=",", header = TRUE)
summary(vehicle)

# Reproducible 70/30 train/test split (sampling without replacement).
# NOTE: TRUE/FALSE are used instead of T/F — T and F are ordinary
# variables that can be reassigned, so relying on them is fragile.
set.seed(8678686)
sel <- sample(seq_len(nrow(vehicle)), size=as.integer(nrow(vehicle)*0.7), replace=FALSE)
learn <- vehicle[sel,]
test <- vehicle[-sel,]

# The target variable must be a factor for classification models.
learn$Class <- as.factor(learn$Class)
test$Class <- as.factor(test$Class)

# Inspect the class distribution in both subsets.
table(learn$Class)
table(test$Class)
# Classification accuracy: the proportion of predictions that agree
# with the observed labels, computed from the confusion matrix.
# (NA values are excluded by table()'s default behaviour.)
CA <- function(observed, predicted)
{
  confusion <- table(observed, predicted)
  correct <- sum(diag(confusion))
  correct / sum(confusion)
}
#
# A simple tree baseline.
#
#
# Voting: train several different base classifiers and combine their
# predictions by majority vote.
#
# Three base models from CORElearn: a decision tree, naive Bayes,
# and a 2-nearest-neighbour classifier.
modelDT <- CoreModel(Class ~ ., learn, model="tree")
modelNB <- CoreModel(Class ~ ., learn, model="bayes")
modelKNN <- CoreModel(Class ~ ., learn, model="knn", kInNN = 2)
# Evaluate each base model individually on the held-out test set.
predDT <- predict(modelDT, test, type = "class")
result.caDT <- CA(test$Class, predDT)
result.caDT
predNB <- predict(modelNB, test, type="class")
result.caNB <- CA(test$Class, predNB)
result.caNB
predKNN <- predict(modelKNN, test, type="class")
result.caKNN <- CA(test$Class, predKNN)
result.caKNN
# combine predictions into a data frame (one column per model,
# one row per test example)
pred <- data.frame(predDT, predNB, predKNN)
pred
# the class with the most votes wins
# Majority vote over classifier predictions.
#
# predictions: a data frame with one column per classifier and one row
#   per example; every column holds factor predictions over the same
#   set of class levels.
# Returns a factor of winning classes, one per row, with the levels of
#   the first column. Ties are broken in favour of the class that comes
#   first in table()'s (level) ordering, as in the original code.
#
# Fixes: seq_len() instead of 1:nrow() (the latter iterates over c(1, 0)
# when the data frame has zero rows), and the result vector is
# preallocated instead of grown element by element.
voting <- function(predictions)
{
  n <- nrow(predictions)
  res <- character(n)
  for (i in seq_len(n))
  {
    # unlist() of a row of factors yields a combined factor, so the
    # vote table is keyed by class label.
    votes <- table(unlist(predictions[i,]))
    res[i] <- names(which.max(votes))
  }
  factor(res, levels=levels(predictions[,1]))
}
# Plain majority voting over the three base models.
predicted <- voting(pred)
result.voting <- CA(test$Class, predicted)
result.voting
#
# Weighted voting: each model contributes its class-probability matrix,
# weighted by the model's own test-set accuracy.
#
predDT.prob <- predict(modelDT, test, type="probability")
predNB.prob <- predict(modelNB, test, type="probability")
predKNN.prob <- predict(modelKNN, test, type="probability")
# combine predictions into a single accuracy-weighted score matrix
# (rows = test examples, columns = classes)
pred.prob <- result.caDT * predDT.prob + result.caNB * predNB.prob + result.caKNN * predKNN.prob
pred.prob
# We can visualize the joint output space!
heatmap(pred.prob, col=c('red','green','blue'))
legend(x="right", legend=c("min", "med", "max"),
fill=c('red','green','blue'))
# pick the class with the highest score in each row; which.max returns
# the column index, which is mapped back to a class label via levels()
highest <- apply(pred.prob, 1, which.max)
classes <- levels(learn$Class)
predicted <- classes[highest]
result.wvoting <- CA(test$Class, predicted)
result.wvoting
#
# Bagging: train many trees on bootstrap resamples of the learning set
# and aggregate their predictions.
#
#install.packages("ipred")
library(ipred)
# nbagg = number of bootstrap replications (bagged trees)
bag <- bagging(Class ~ ., learn, nbagg=14)
bag.pred <- predict(bag, test, type="class")
result.bagging <- CA(test$Class, bag.pred)
result.bagging
#
# Random forest as a variation of bagging: additionally decorrelates
# the trees by sampling a random subset of attributes at each split.
#
# install.packages("randomForest")
library(randomForest)
rf <- randomForest(Class ~ ., learn)
rf.predicted <- predict(rf, test, type = "class")
result.rf <- CA(test$Class, rf.predicted)
result.rf
#
# Boosting: sequentially reweights training examples so that later
# models focus on the examples earlier models got wrong.
#
# install.packages("adabag")
library(adabag)
bm <- boosting(Class ~ ., learn)
# adabag's predict returns a list; $class holds the predicted labels
predictions.bm <- predict(bm, test)
names(predictions.bm)
predicted.bm <- predictions.bm$class
result.boosting <- CA(test$Class, predicted.bm)
result.boosting
#
# Caret package: a unified interface to many learners; here a linear
# gradient-boosting model (xgbLinear) with learning rate eta = 1.
#
# install.packages("caret")
library(caret)
caretModel <- train(Class ~ ., learn, method="xgbLinear", eta=1, verbose=1)
pred <- predict(caretModel, test)
result.xgb <- CA(test$Class, pred)
result.xgb
# Let's visualize currently considered ensemble methods.
# Collect the accuracy of every model evaluated above.
performances <- c(result.rf,
result.bagging,
result.wvoting,
result.boosting,
result.caDT,
result.caKNN,
result.caNB,
result.xgb)
algo.names <- c("RF","Bagging","Weighted vote","Boosting","DT","KNN","NB","XGB")
# 1 = ensemble method, 0 = single base model (used for plot colouring)
ensemble.model <- as.factor(c(1,1,1,1,0,0,0,1))
result.vec <- data.frame(performances, algo.names,ensemble.model)
# Sort the table by accuracy (ascending) for the plot below.
reordering <- order(result.vec$performances)
result.vec <- result.vec[reordering,]
rownames(result.vec) <- NULL
result.vec
library(ggplot2)
# Keep the accuracy-sorted ordering of algorithms on the x axis.
positions <- as.vector(result.vec$algo.names)
ggplot(data=result.vec, aes(x=algo.names, y=performances, color = ensemble.model)) +
  geom_point(size = 10, shape = 4) +
  scale_x_discrete(limits = positions) +
  ylim(0.5,0.8) +
  xlab("Ensemble type") +
  ylab("Accuracy") +
  # Reference lines for the best and worst performer.
  geom_hline(yintercept=max(performances), color = "darkgreen") +
  geom_hline(yintercept=min(performances), color = "black") +
  # BUG FIX: title() is a base-graphics function and cannot be added to a
  # ggplot chain with `+`; ggtitle() is the ggplot2 equivalent.
  ggtitle("Performance comparison") +
  # Label each point directly, since the axis text is hidden below.
  geom_text(label = positions,nudge_x = 0.2, nudge_y = -0.01) +
  theme_bw() +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
#
# Cross-validation
#
# the library ipred is needed to perform cross-validation
library(ipred)
# Tell the cross-validation which model to use: a wrapper that forwards
# the extra target.model argument to CORElearn's CoreModel.
mymodel.coremodel <- function(formula, data, target.model){CoreModel(formula, data, model=target.model)}
# Tell the cross-validation how to obtain predictions (generic version
# for models whose predict() accepts type = "class"; defined for
# reference, the CORElearn-specific version below is used instead)
mypredict.generic <- function(object, newdata){predict(object, newdata, type = "class")}
# force the predict function to return class labels only and also destroy the internal representation of a given model
# (destroyModels releases the model's CORElearn-side resources after each fold)
mypredict.coremodel <- function(object, newdata) {pred <- predict(object, newdata)$class; destroyModels(object); pred}
# errorest() runs cross-validation (10-fold by default) and returns the
# estimated error rate in $error.
cvError <- errorest(Class~., data=learn, model = mymodel.coremodel, predict = mypredict.coremodel, target.model = "tree")
# Cross-validated accuracy = 1 - error rate.
1 - cvError$error