208 lines
5.2 KiB
R
208 lines
5.2 KiB
R
################################################################
|
|
#
|
|
# Combining machine learning algorithms
|
|
#
|
|
################################################################
|
|
|
|
#install.packages("CORElearn")
|
|
library(CORElearn)
|
|
|
|
vehicle <- read.table("vehicle.txt", sep=",", header = T)
|
|
summary(vehicle)
|
|
|
|
set.seed(8678686)
|
|
sel <- sample(1:nrow(vehicle), size=as.integer(nrow(vehicle)*0.7), replace=F)
|
|
learn <- vehicle[sel,]
|
|
test <- vehicle[-sel,]
|
|
|
|
learn$Class <- as.factor(learn$Class)
|
|
test$Class <- as.factor(test$Class)
|
|
|
|
table(learn$Class)
|
|
table(test$Class)
|
|
|
|
CA <- function(observed, predicted)
|
|
{
|
|
t <- table(observed, predicted)
|
|
|
|
sum(diag(t)) / sum(t)
|
|
}
|
|
|
|
#
|
|
# A simple tree baseline.
|
|
#
|
|
|
|
|
|
#
|
|
# Voting
|
|
#
|
|
|
|
modelDT <- CoreModel(Class ~ ., learn, model="tree")
|
|
modelNB <- CoreModel(Class ~ ., learn, model="bayes")
|
|
modelKNN <- CoreModel(Class ~ ., learn, model="knn", kInNN = 2)
|
|
|
|
predDT <- predict(modelDT, test, type = "class")
|
|
result.caDT <- CA(test$Class, predDT)
|
|
result.caDT
|
|
|
|
predNB <- predict(modelNB, test, type="class")
|
|
result.caNB <- CA(test$Class, predNB)
|
|
result.caNB
|
|
|
|
predKNN <- predict(modelKNN, test, type="class")
|
|
result.caKNN <- CA(test$Class, predKNN)
|
|
result.caKNN
|
|
|
|
# combine predictions into a data frame
|
|
pred <- data.frame(predDT, predNB, predKNN)
|
|
pred
|
|
|
|
# the class with the most votes wins
|
|
voting <- function(predictions)
|
|
{
|
|
res <- vector()
|
|
|
|
for (i in 1 : nrow(predictions))
|
|
{
|
|
vec <- unlist(predictions[i,])
|
|
res[i] <- names(which.max(table(vec)))
|
|
}
|
|
|
|
factor(res, levels=levels(predictions[,1]))
|
|
}
|
|
|
|
predicted <- voting(pred)
|
|
result.voting <- CA(test$Class, predicted)
|
|
result.voting
|
|
|
|
#
|
|
# Weighted voting
|
|
#
|
|
|
|
predDT.prob <- predict(modelDT, test, type="probability")
|
|
predNB.prob <- predict(modelNB, test, type="probability")
|
|
predKNN.prob <- predict(modelKNN, test, type="probability")
|
|
|
|
# combine predictions into a data frame
|
|
pred.prob <- result.caDT * predDT.prob + result.caNB * predNB.prob + result.caKNN * predKNN.prob
|
|
pred.prob
|
|
|
|
# We can visualize the joint output space!
|
|
heatmap(pred.prob, col=c('red','green','blue'))
|
|
legend(x="right", legend=c("min", "med", "max"),
|
|
fill=c('red','green','blue'))
|
|
|
|
# pick the class with the highest score
|
|
highest <- apply(pred.prob, 1, which.max)
|
|
classes <- levels(learn$Class)
|
|
predicted <- classes[highest]
|
|
|
|
result.wvoting <- CA(test$Class, predicted)
|
|
result.wvoting
|
|
|
|
|
|
|
|
#
|
|
# Bagging
|
|
#
|
|
|
|
#install.packages("ipred")
|
|
library(ipred)
|
|
|
|
bag <- bagging(Class ~ ., learn, nbagg=14)
|
|
bag.pred <- predict(bag, test, type="class")
|
|
result.bagging <- CA(test$Class, bag.pred)
|
|
result.bagging
|
|
|
|
|
|
#
|
|
# Random forest as a variation of bagging
|
|
#
|
|
|
|
# install.packages("randomForest")
|
|
library(randomForest)
|
|
|
|
rf <- randomForest(Class ~ ., learn)
|
|
rf.predicted <- predict(rf, test, type = "class")
|
|
result.rf <- CA(test$Class, rf.predicted)
|
|
result.rf
|
|
|
|
|
|
#
|
|
# Boosting
|
|
#
|
|
|
|
# install.packages("adabag")
|
|
library(adabag)
|
|
|
|
bm <- boosting(Class ~ ., learn)
|
|
predictions.bm <- predict(bm, test)
|
|
names(predictions.bm)
|
|
|
|
predicted.bm <- predictions.bm$class
|
|
result.boosting <- CA(test$Class, predicted.bm)
|
|
result.boosting
|
|
|
|
#
|
|
# Caret package
|
|
#
|
|
|
|
# install.packages("caret")
|
|
library(caret)
|
|
caretModel <- train(Class ~ ., learn, method="xgbLinear", eta=1, verbose=1)
|
|
pred <- predict(caretModel, test)
|
|
result.xgb <- CA(test$Class, pred)
|
|
result.xgb
|
|
|
|
# Let's visualize currently considered ensemble methods.
|
|
performances <- c(result.rf,
|
|
result.bagging,
|
|
result.wvoting,
|
|
result.boosting,
|
|
result.caDT,
|
|
result.caKNN,
|
|
result.caNB,
|
|
result.xgb)
|
|
|
|
algo.names <- c("RF","Bagging","Weighted vote","Boosting","DT","KNN","NB","XGB")
|
|
ensemble.model <- as.factor(c(1,1,1,1,0,0,0,1))
|
|
result.vec <- data.frame(performances, algo.names,ensemble.model)
|
|
reordering <- order(result.vec$performances)
|
|
result.vec <- result.vec[reordering,]
|
|
rownames(result.vec) <- NULL
|
|
result.vec
|
|
|
|
library(ggplot2)
|
|
positions <- as.vector(result.vec$algo.names)
|
|
ggplot(data=result.vec, aes(x=algo.names, y=performances, color = ensemble.model)) +
|
|
geom_point(size = 10, shape = 4) +
|
|
scale_x_discrete(limits = positions) +
|
|
ylim(0.5,0.8) +
|
|
xlab("Ensemble type") +
|
|
ylab("Accuracy") +
|
|
geom_hline(yintercept=max(performances), color = "darkgreen") +
|
|
geom_hline(yintercept=min(performances), color = "black") +
|
|
title("Performance comparison") +
|
|
geom_text(label = positions,nudge_x = 0.2, nudge_y = -0.01) +
|
|
theme_bw() +
|
|
theme(axis.title.x=element_blank(),
|
|
axis.text.x=element_blank(),
|
|
axis.ticks.x=element_blank())
|
|
|
|
|
|
#
|
|
# Cross-validation
|
|
#
|
|
|
|
# the library ipred is needed to perform cross-validation
|
|
library(ipred)
|
|
|
|
# Tell the cross-validation which model to use
|
|
mymodel.coremodel <- function(formula, data, target.model){CoreModel(formula, data, model=target.model)}
|
|
# Tell the cross-validation how to obtain predictions
|
|
mypredict.generic <- function(object, newdata){predict(object, newdata, type = "class")}
|
|
# force the predict function to return class labels only and also destroy the internal representation of a given model
|
|
mypredict.coremodel <- function(object, newdata) {pred <- predict(object, newdata)$class; destroyModels(object); pred}
|
|
cvError <- errorest(Class~., data=learn, model = mymodel.coremodel, predict = mypredict.coremodel, target.model = "tree")
|
|
1 - cvError$error
|