diff --git a/v4/lab4_code.R b/v4/lab4_code.R
new file mode 100644
index 0000000..06bcb59
--- /dev/null
+++ b/v4/lab4_code.R
@@ -0,0 +1,397 @@
+#
+#
+# INTRODUCTION TO CLASSIFICATION
+#
+#
+
+#
+# Work plan:
+#
+# - Read in the data
+# - Split the data into training and test sets
+# - Build a model using the training set
+# - Evaluate the model using the test set
+#
+
+
+
+# Please download the data file "players.txt" into a local directory,
+# then set that directory as the current working directory of R.
+# You can achieve this using the "setwd" command or by selecting "File -> Change dir..."
+
+
+# We are going to use the rpart and pROC packages (and, for the optional
+# plots, ggplot2, dplyr, and ggrepel).
+# Make sure that the packages are installed.
+#
+# You install a package in R with the function install.packages():
+#
+# install.packages("pROC")
+# library(pROC)
+#
+# To install packages without root access:
+#
+# install.packages("pROC", lib="/mylibs/Rpackages/")
+# library(pROC, lib.loc="/mylibs/Rpackages/")
+#
+
+# Read in the dataset
+players <- read.table("players.txt", header = TRUE, sep = ",")
+
+# Get the summary statistics of the data
+summary(players)
+
+
+
+############################################################################
+#
+# ATTENTION!!!!!!!!!
+#
+# The "id" attribute uniquely identifies a player within our data set.
+# This attribute cannot be used to classify new players, as they will
+# have different id numbers.
+#
+# Before we continue, we have to remove the "id" attribute from our data set!
+#
+############################################################################
+
+
+players$id <- NULL
+names(players)
+
+
+############################################################################
+#
+#
+# Please make sure that the "id" attribute has been removed!!!!!!!!!!
+#
+#
+############################################################################
+
+
+
+
+
+
+############################################################################
+#
+# Example 1
+#
+# Prediction of playing position
+#
+############################################################################
+
+
+#
+# We want to build a model that predicts a player's playing position
+# from the player's statistics.
+#
+# The target variable "position" is discrete - we term this a classification task.
+#
+# We aim to verify whether or not it is possible to use historical data to predict
+# playing positions for new players.
+#
+
+# We are going to split the data into a training and a test data set.
+# The training data set consists of players who ended their careers in or before 1999.
+# The test data set consists of players who began their careers after 1999.
+
+learn <- players[players$lastseason <= 1999,]
+test <- players[players$firstseason > 1999,]
+
+# We used the "firstseason" and "lastseason" attributes to split the data.
+# These attributes will not contribute to the modelling task, so
+# we remove them.
+
+learn$firstseason <- NULL
+learn$lastseason <- NULL
+
+test$firstseason <- NULL
+test$lastseason <- NULL
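+
+# Note: the two sets are disjoint by construction - players whose careers
+# straddle 1999 (firstseason <= 1999 but lastseason > 1999) fall into
+# neither set. A quick sketch counting how many players are left out
+# (it uses the original "players" data frame, which still has both attributes):
+sum(players$firstseason <= 1999 & players$lastseason > 1999)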
+
+# We inspect the distribution of the target values in both data sets
+# (number of examples and frequency of individual classes).
+
+nrow(learn)
+table(learn$position)
+
+nrow(test)
+table(test$position)
+
+
+
+
+#
+#
+# MAJORITY CLASSIFIER
+#
+#
+
+
+# The majority class is the class with the highest number of training examples
+which.max(table(learn$position))
+
+majority.class <- names(which.max(table(learn$position)))
+majority.class
+
+# The majority classifier classifies all test instances into the majority class.
+# The classification accuracy of the majority classifier:
+sum(test$position == majority.class) / length(test$position)
+
+
+
+
+#
+#
+# DECISION TREES
+#
+#
+
+
+# load the "rpart" package that implements decision trees in R
+library(rpart)
+
+# Fit a decision tree
+dt <- rpart(position ~ ., data = learn)
+
+#
+# The first argument of the rpart function is a model formula that indicates
+# the functional form of the model. The "~" symbol stands for "modeled as".
+#
+# The formula "position ~ ." states that we want a model that predicts the
+# target attribute "position" using all other attributes present in the data
+# (which is the meaning of the "." character).
+#
+# If we wanted, for example, a model for the "position" attribute as a function
+# of the attributes "height", "weight", and "pts", we would specify
+# the formula as "position ~ height + weight + pts".
+#
+# The second argument of the rpart function provides the training examples.
+#
+
+
+# The content of the dt object
+dt
+
+# A graphical representation of our tree
+plot(dt)
+text(dt, pretty = 1)
+
+# To navigate the tree, start from the root node. If the observation satisfies
+# the logical test in the node, then navigate down the left branch.
+# Otherwise navigate down the right branch. Continue until a leaf node is reached.
+# A leaf node represents a class label (or a class label distribution).
+
+
+# Observed values in the test samples (the ground truth)
+observed <- test$position
+observed
+
+
+# The "predict" function returns a vector of predicted responses from a fitted tree.
+# The "type" argument denotes the form of the predicted value. The setting
+# type = "class" makes the function return a vector of predicted
+# class labels.
+
+predicted <- predict(dt, test, type = "class")
+predicted
+
+# confusion matrix
+t <- table(observed, predicted)
+t
+
+# The diagonal elements in the confusion matrix represent the number of correctly
+# classified test instances
+
+# The classification accuracy is the fraction of correctly classified test instances
+sum(diag(t)) / sum(t)
+
+
+# Let's write a function that calculates the classification accuracy
+CA <- function(observed, predicted)
+{
+    t <- table(observed, predicted)
+
+    sum(diag(t)) / sum(t)
+}
+
+# Function call
+CA(observed, predicted)
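+
+# From the same confusion matrix we can also read the class-wise recall
+# (a hypothetical extension of CA; the rows of t hold the observed classes):
+diag(t) / rowSums(t)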
+
+
+
+# The setting type = "prob" makes the predict function return
+# a matrix of class probabilities.
+
+predMat <- predict(dt, test, type = "prob")
+predMat
+
+# A matrix of the observed class probabilities (the ground truth)
+# (the true class has probability 1, all others have 0)
+obsMat <- model.matrix( ~ position-1, test)
+obsMat
+
+
+# IMPORTANT: check whether the columns refer to the same attributes
+# (if not, change the order of the columns)
+colnames(predMat)
+colnames(obsMat)
+
+
+# The Brier score
+brier.score <- function(observedMatrix, predictedMatrix)
+{
+    sum((observedMatrix - predictedMatrix) ^ 2) / nrow(predictedMatrix)
+}
+
+brier.score(obsMat, predMat)
+
+
+
+
+############################################################################
+#
+# Example 2
+#
+# Does a player make at least 80% of the free throws attempted?
+#
+############################################################################
+
+#
+# This is a binary classification problem (the target variable is discrete
+# with values YES and NO).
+#
+
+#
+# We are going to create a data set using the players data.
+#
+
+# Get the players who have attempted at least one free throw during their careers.
+bin.players <- players[players$fta > 0,]
+
+# Free-throw success rate
+rate <- bin.players$ftm / bin.players$fta
+
+# Create a discrete attribute "ftexpert" that will be our target variable.
+ftexpert <- vector()
+ftexpert[rate >= 0.8] <- "YES"
+ftexpert[rate < 0.8] <- "NO"
+bin.players$ftexpert <- as.factor(ftexpert)
+
+# Remove the "fta" and "ftm" attributes (they would give the target away).
+bin.players$fta <- NULL
+bin.players$ftm <- NULL
+
+summary(bin.players)
+
+
+
+# Split the data into two disjoint sets, one for training and one for testing.
+bin.learn <- bin.players[1:1500,]
+bin.test <- bin.players[-(1:1500),]
+
+# Inspect the distribution of the class values in the data sets.
+table(bin.learn$ftexpert)
+table(bin.test$ftexpert)
+
+
+# Train a decision tree
+dt2 <- rpart(ftexpert ~ ., data = bin.learn)
+plot(dt2)
+text(dt2, pretty = 1)
+
+
+bin.observed <- bin.test$ftexpert
+bin.predicted <- predict(dt2, bin.test, type="class")
+
+
+table(bin.observed, bin.predicted)
+
+
+# Classification accuracy of the decision tree
+CA(bin.observed, bin.predicted)
+
+# Exercise -> compare this against the majority classifier!
+table(bin.observed)
+sum(bin.observed == "NO") / length(bin.observed)
+
+
+# The sensitivity of a model (recall)
+Sensitivity <- function(observed, predicted, pos.class)
+{
+    t <- table(observed, predicted)
+
+    t[pos.class, pos.class] / sum(t[pos.class,])
+}
+
+# The specificity of a model
+Specificity <- function(observed, predicted, pos.class)
+{
+    t <- table(observed, predicted)
+
+    # identify the row index of the negative class
+    neg.class <- which(row.names(t) != pos.class)
+
+    t[neg.class, neg.class] / sum(t[neg.class,])
+}
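+
+# A related measure is precision (a hypothetical addition, using the same
+# convention as above: table rows = observed, columns = predicted):
+Precision <- function(observed, predicted, pos.class)
+{
+    t <- table(observed, predicted)
+
+    t[pos.class, pos.class] / sum(t[, pos.class])
+}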
+
+# let's say that the value "YES" is our positive class...
+
+Sensitivity(bin.observed, bin.predicted, "YES")
+Specificity(bin.observed, bin.predicted, "YES")
+
+#
+# ROC curve
+#
+
+library(pROC)
+library(ggplot2)
+library(dplyr)
+library(ggrepel) # For a nicer ROC visualization
+
+bin.predMat <- predict(dt2, bin.test, type = "prob")
+
+# Remember, we treat the value "YES" as the positive class
+rocobj <- roc(bin.observed, bin.predMat[,"YES"])
+plot(rocobj)
+
+# This part below is optional -> a nicer ROC plot
+# (note: the second column holds 1 - specificity, i.e., the false positive rate)
+tmpDf <- data.frame(rocobj$sensitivities, 1-rocobj$specificities, rocobj$thresholds)
+colnames(tmpDf) <- c("Sensitivity", "FPR", "Thresholds")
+
+tmpDf %>% ggplot(aes(x=FPR, y=Sensitivity, color=Thresholds)) +
+    geom_point() +
+    geom_line() +
+    geom_abline(intercept = 0, slope = 1, alpha=0.2) +
+    theme_classic() +
+    ylab("Sensitivity (TPR)") +
+    xlab("1-Specificity (FPR)") +
+    geom_label_repel(aes(label = round(Thresholds, 2)),
+                     box.padding = 0.35,
+                     point.padding = 0.5,
+                     segment.color = 'grey50')
+
+names(rocobj)
+
+cutoffs <- rocobj$thresholds
+cutoffs
+
+# identify the best cut-off value (for example, by minimizing the squared
+# distance from the ideal point (0,1) in ROC space)
+tp <- rocobj$sensitivities
+fp <- 1 - rocobj$specificities
+
+dist <- (1-tp)^2 + fp^2
+dist
+which.min(dist)
+
+best.cutoff <- cutoffs[which.min(dist)]
+best.cutoff
+
+# the selected cut-off value has an impact on the model's performance
+predicted.label <- factor(ifelse(bin.predMat[,"YES"] >= best.cutoff, "YES", "NO"))
+
+table(bin.observed, predicted.label)
+CA(bin.observed, predicted.label)
+Sensitivity(bin.observed, predicted.label, "YES")
+Specificity(bin.observed, predicted.label, "YES")
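+
+# A single-number summary of the ROC curve is the area under it (AUC);
+# a quick sketch reading it off the pROC object built above:
+auc(rocobj)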
diff --git a/v6/lab6_code.R b/v6/lab6_code.R
new file mode 100644
index 0000000..ba8bf0b
--- /dev/null
+++ b/v6/lab6_code.R
@@ -0,0 +1,229 @@
+###
+#
+# CLUSTERING
+#
+###
+
+# Load the dataset
+cities <- read.table("worldcities.csv", sep=",", header = TRUE)
+summary(cities)
+x <- cities$lng
+y <- cities$lat
+# a rough rescaling of the coordinates (simple division by the maxima)
+x <- x / max(x)
+y <- y / max(y)
+plot(x, y)
+df <- data.frame(x, y)
+
+# KMeans clustering
+results <- kmeans(df, centers=6)
+results
+plot(x, y, col=results$cluster)
+
+# Are the clusters aligned with the classes?
+library(ggplot2)
+data(iris)
+subset <- iris[,1:2]
+
+# classes
+p1 <- qplot(subset$Sepal.Length, subset$Sepal.Width, color = iris$Species, main = "Ground truth") + theme_bw()
+
+resultsIris <- kmeans(subset, centers = 3)
+assignments <- as.factor(resultsIris$cluster)
+p2 <- qplot(subset$Sepal.Length, subset$Sepal.Width, color = assignments, main = "Clustering") + theme_bw()
+gridExtra::grid.arrange(p1, p2, nrow = 2)
+
+# Exercise -> try to cluster the iris data according to the other Length and
+# Width attributes (e.g., Petal.Length and Petal.Width)!
+
+# Model-based clustering -> Mclust fits a family of Gaussian mixture models
+# and selects the one that fits the data best
+library(mclust)
+fit <- Mclust(df)
+# summarize the fits
+plot(fit)
+summary(fit)
+
+# Hierarchical clustering
+distMat <- dist(df)
+resultsH <- hclust(distMat)
+
+plot(resultsH)
+clusters <- cutree(resultsH, 6)
+plot(x, y, col=clusters)
+
+
+###
+#
+# ATTRIBUTE EVALUATION
+#
+###
+
+
+# Attribute evaluation functions are implemented in the package "CORElearn"
+
+# Load the library
+#install.packages("CORElearn")
+library(CORElearn)
+
+
+#
+# Attribute estimation in classification
+#
+
+
+
+#
+# Example 1
+#
+
+mr <- read.table("mushroom.txt", sep=",", header=TRUE)
+mr$edibility <- as.factor(mr$edibility)
+summary(mr)
+
+qplot(mr$edibility, ylab="Number of species", main="Edibility of mushrooms", geom = c("bar"))
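+
+# A quick numeric check of the class balance (a small sketch using base R's
+# prop.table on top of table):
+prop.table(table(mr$edibility))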
+
+# The attrEval function evaluates the quality of the attributes (features)
+# specified by the formula. The formula parameter is used as a mechanism to
+# select the attributes and the prediction variable (class). The simplest way
+# is to specify just the response variable: "Class ~ .". In that case all
+# other attributes in the data set are evaluated with the selected heuristic
+# method.
+
+# attribute evaluation using information gain
+sort(attrEval(edibility ~ ., mr, "InfGain"), decreasing = TRUE)
+
+# build a decision tree using information gain as the splitting criterion
+dt <- CoreModel(edibility ~ ., mr, model="tree", selectionEstimator="InfGain")
+plot(dt, mr)
+
+
+#
+# Example 2
+#
+
+quadrant <- read.table("quadrant.txt", sep=",", header=TRUE)
+summary(quadrant)
+
+quadrant$Class <- as.factor(quadrant$Class)
+
+plot(quadrant, col=quadrant$Class)
+plot(quadrant$a1, quadrant$a2, col=quadrant$Class)
+
+# attribute evaluation using information gain
+sort(attrEval(Class ~ ., quadrant, "InfGain"), decreasing = TRUE)
+
+# information gain is a myopic measure
+# (it assumes that the attributes are conditionally independent given the class)
+
+# using information gain to construct a decision tree will produce a poor result
+dt2 <- CoreModel(Class ~ ., quadrant, model="tree", selectionEstimator="InfGain")
+plot(dt2, quadrant)
+
+# non-myopic measures (Relief and ReliefF)
+sort(attrEval(Class ~ ., quadrant, "Relief"), decreasing = TRUE)
+sort(attrEval(Class ~ ., quadrant, "ReliefFequalK"), decreasing = TRUE)
+sort(attrEval(Class ~ ., quadrant, "ReliefFexpRank"), decreasing = TRUE)
+
+# use ?attrEval to find out about the different variations of the ReliefF measure...
+
+# build a decision tree using ReliefFequalK as the splitting criterion
+dt3 <- CoreModel(Class ~ ., quadrant, model="tree", selectionEstimator = "ReliefFequalK")
+plot(dt3, quadrant)
+
+
+#
+# Example 3
+#
+
+players <- read.table("players.txt", sep=",", header=TRUE)
+summary(players)
+players$position <- as.factor(players$position)
+
+sort(attrEval(position ~ ., players, "InfGain"), decreasing = TRUE)
+sort(attrEval(position ~ ., players, "Gini"), decreasing = TRUE)
+
+# the "id" attribute is overestimated,
+# although it does not carry any useful information
+
+# GainRatio moderates the overestimation of the "id" attribute
+sort(attrEval(position ~ ., players, "GainRatio"), decreasing = TRUE)
+
+# Both the ReliefF and MDL measures identify the "id" attribute as irrelevant
+sort(attrEval(position ~ ., players, "ReliefFequalK"), decreasing = TRUE)
+sort(attrEval(position ~ ., players, "MDL"), decreasing = TRUE)
+
+#
+#
+# Attribute selection
+#
+#
+
+student <- read.table("student.txt", sep=",", header=TRUE)
+student$G1 <- cut(student$G1, c(-Inf, 9, 20), labels=c("fail", "pass"))
+student$G2 <- cut(student$G2, c(-Inf, 9, 20), labels=c("fail", "pass"))
+student$G3 <- cut(student$G3, c(-Inf, 9, 20), labels=c("fail", "pass"))
+student$studytime <- cut(student$studytime, c(-Inf, 1, 2, 3, 4), labels=c("none", "few", "hefty", "endless"))
+
+# use floor and n.train + 1 so that the training and test sets do not share a row
+n.train <- floor(nrow(student) * 0.7)
+train <- student[1:n.train,]
+test <- student[(n.train + 1):nrow(student),]
+
+#
+# Feature selection using a filter method
+#
+
+# evaluate the quality of the attributes
+sort(attrEval(studytime ~ ., student, "MDL"), decreasing = TRUE)
+
+# train a model using everything
+model <- CoreModel(studytime ~ ., data=train, model='tree')
+predictions <- predict(model, test, type='class')
+sum(predictions == test$studytime) / length(predictions)
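+
+# A hypothetical helper: build the formula for the top-k attributes
+# programmatically instead of copying names by hand (k = 6 is an arbitrary
+# choice; ranking on "train" rather than the full data avoids leaking test
+# information into the selection):
+mdl.scores <- sort(attrEval(studytime ~ ., train, "MDL"), decreasing = TRUE)
+top.formula <- as.formula(paste("studytime ~", paste(names(mdl.scores)[1:6], collapse=" + ")))
+top.formula # can be passed to CoreModel just like a hand-written formula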
+
+# train a model using the best evaluated attributes
+model <- CoreModel(studytime ~ sex + Walc + Dalc + freetime + higher + paid, data=train, model='tree')
+predictions <- predict(model, test, type='class')
+sum(predictions == test$studytime) / length(predictions)
+
+
+# using the appropriate combination of attributes...
+model <- CoreModel(studytime ~ Dalc + paid + G2, data=train, model='tree')
+predictions <- predict(model, test, type='class')
+sum(predictions == test$studytime) / length(predictions)
+
+# What about some model-based feature importances?
+library(caret)
+set.seed(123780)
+# the number of randomly selected variables per split (mtry) is tuned by caret
+control <- trainControl(method='repeatedcv', number=10, repeats=1)
+
+rf.model <- train(studytime ~ .,
+                  data=train,
+                  method='rf',
+                  metric='Accuracy',
+                  trControl=control)
+
+## varImp extracts the feature relevances
+rf.importances <- varImp(rf.model, scale = FALSE)
+plot(rf.importances, top = 20)
+importance.df.values <- rf.importances$importance
+importance.df.names <- rownames(rf.importances$importance)
+importance.rf.whole <- data.frame(score = importance.df.values, cnames = importance.df.names)
+importance.rf.whole <- importance.rf.whole[order(importance.rf.whole$Overall, decreasing = TRUE),]
+feature.names.rf <- importance.rf.whole$cnames[1:20]
+
+# note: caret reports dummy-coded factor levels (e.g., "sexM"), which can
+# lower the apparent overlap with the MDL ranking of the raw attributes
+mdl.importances <- sort(attrEval(studytime ~ ., student, "MDL"), decreasing = TRUE)[1:20]
+feature.names.mdl <- names(mdl.importances)
+paste0(length(intersect(feature.names.rf, feature.names.mdl))*100/20, "% overlap between top ranked features!")
+
+scaleVector <- function(x) {(x - min(x))/(max(x) - min(x))}
+
+# Let's visualize how similar the distributions of the two rankings are
+# (take the top 20 scores from both so that the vectors have equal length)
+rf.all <- scaleVector(importance.rf.whole$Overall[1:20])
+mdl.all <- scaleVector(as.vector(mdl.importances))
+
+rank.frame <- data.frame(rf.all, mdl.all)
+
+library(dplyr)
+library(tidyr)
+
+# How do the two distributions differ?
+summary(rank.frame)
+rank.frame %>% gather() %>% ggplot(aes(x=value, fill=key)) + geom_density(alpha=0.4) + theme_bw()
diff --git a/v7/lab7_code.R b/v7/lab7_code.R
new file mode 100644
index 0000000..128f364
--- /dev/null
+++ b/v7/lab7_code.R
@@ -0,0 +1,207 @@
+################################################################
+#
+# Combining machine learning algorithms
+#
+################################################################
+
+#install.packages("CORElearn")
+library(CORElearn)
+
+vehicle <- read.table("vehicle.txt", sep=",", header = TRUE)
+summary(vehicle)
+
+set.seed(8678686)
+sel <- sample(1:nrow(vehicle), size=as.integer(nrow(vehicle)*0.7), replace=FALSE)
+learn <- vehicle[sel,]
+test <- vehicle[-sel,]
+
+learn$Class <- as.factor(learn$Class)
+test$Class <- as.factor(test$Class)
+
+table(learn$Class)
+table(test$Class)
+
+CA <- function(observed, predicted)
+{
+    t <- table(observed, predicted)
+
+    sum(diag(t)) / sum(t)
+}
+
+#
+# A simple tree baseline (a quick sketch follows below).
+#
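+
+# A minimal sketch of the baseline itself (it assumes CoreModel from
+# CORElearn, as used throughout this script; the same tree reappears as
+# modelDT in the voting ensemble):
+baseline <- CoreModel(Class ~ ., learn, model="tree")
+CA(test$Class, predict(baseline, test, type="class"))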
+
+
+#
+# Voting
+#
+
+modelDT <- CoreModel(Class ~ ., learn, model="tree")
+modelNB <- CoreModel(Class ~ ., learn, model="bayes")
+modelKNN <- CoreModel(Class ~ ., learn, model="knn", kInNN = 2)
+
+predDT <- predict(modelDT, test, type = "class")
+result.caDT <- CA(test$Class, predDT)
+result.caDT
+
+predNB <- predict(modelNB, test, type="class")
+result.caNB <- CA(test$Class, predNB)
+result.caNB
+
+predKNN <- predict(modelKNN, test, type="class")
+result.caKNN <- CA(test$Class, predKNN)
+result.caKNN
+
+# combine the predictions into a data frame
+pred <- data.frame(predDT, predNB, predKNN)
+pred
+
+# the class with the most votes wins
+voting <- function(predictions)
+{
+    res <- vector()
+
+    for (i in 1 : nrow(predictions))
+    {
+        vec <- unlist(predictions[i,])
+        res[i] <- names(which.max(table(vec)))
+    }
+
+    factor(res, levels=levels(predictions[,1]))
+}
+
+predicted <- voting(pred)
+result.voting <- CA(test$Class, predicted)
+result.voting
+
+#
+# Weighted voting
+#
+
+predDT.prob <- predict(modelDT, test, type="probability")
+predNB.prob <- predict(modelNB, test, type="probability")
+predKNN.prob <- predict(modelKNN, test, type="probability")
+
+# weight each model's probability matrix by its accuracy and sum them up
+pred.prob <- result.caDT * predDT.prob + result.caNB * predNB.prob + result.caKNN * predKNN.prob
+pred.prob
+
+# We can visualize the joint output space!
+heatmap(pred.prob, col=c('red','green','blue'))
+legend(x="right", legend=c("min", "med", "max"),
+       fill=c('red','green','blue'))
+
+# pick the class with the highest score
+highest <- apply(pred.prob, 1, which.max)
+classes <- levels(learn$Class)
+predicted <- classes[highest]
+
+result.wvoting <- CA(test$Class, predicted)
+result.wvoting
+
+
+
+#
+# Bagging
+#
+
+#install.packages("ipred")
+library(ipred)
+
+bag <- bagging(Class ~ ., learn, nbagg=14)
+bag.pred <- predict(bag, test, type="class")
+result.bagging <- CA(test$Class, bag.pred)
+result.bagging
+
+
+#
+# Random forest as a variation of bagging
+#
+
+# install.packages("randomForest")
+library(randomForest)
+
+rf <- randomForest(Class ~ ., learn)
+rf.predicted <- predict(rf, test, type = "class")
+result.rf <- CA(test$Class, rf.predicted)
+result.rf
+
+
+#
+# Boosting
+#
+
+# install.packages("adabag")
+library(adabag)
+
+bm <- boosting(Class ~ ., learn)
+predictions.bm <- predict(bm, test)
+names(predictions.bm)
+
+predicted.bm <- predictions.bm$class
+result.boosting <- CA(test$Class, predicted.bm)
+result.boosting
+
+#
+# Caret package
+#
+
+# install.packages("caret")
+library(caret)
+caretModel <- train(Class ~ ., learn, method="xgbLinear", eta=1, verbose=1)
+pred <- predict(caretModel, test)
+result.xgb <- CA(test$Class, pred)
+result.xgb
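+
+# caret keeps the tuning results inside the fitted object; a quick look at
+# two standard components of a caret train object:
+caretModel$bestTune
+caretModel$results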
+
+# Let's visualize the ensemble methods considered so far.
+performances <- c(result.rf,
+                  result.bagging,
+                  result.wvoting,
+                  result.boosting,
+                  result.caDT,
+                  result.caKNN,
+                  result.caNB,
+                  result.xgb)
+
+algo.names <- c("RF","Bagging","Weighted vote","Boosting","DT","KNN","NB","XGB")
+ensemble.model <- as.factor(c(1,1,1,1,0,0,0,1))
+result.vec <- data.frame(performances, algo.names, ensemble.model)
+reordering <- order(result.vec$performances)
+result.vec <- result.vec[reordering,]
+rownames(result.vec) <- NULL
+result.vec
+
+library(ggplot2)
+positions <- as.vector(result.vec$algo.names)
+ggplot(data=result.vec, aes(x=algo.names, y=performances, color = ensemble.model)) +
+    geom_point(size = 10, shape = 4) +
+    scale_x_discrete(limits = positions) +
+    ylim(0.5, 0.8) +
+    xlab("Ensemble type") +
+    ylab("Accuracy") +
+    geom_hline(yintercept=max(performances), color = "darkgreen") +
+    geom_hline(yintercept=min(performances), color = "black") +
+    ggtitle("Performance comparison") +
+    geom_text(label = positions, nudge_x = 0.2, nudge_y = -0.01) +
+    theme_bw() +
+    theme(axis.title.x=element_blank(),
+          axis.text.x=element_blank(),
+          axis.ticks.x=element_blank())
+
+
+#
+# Cross-validation
+#
+
+# the library ipred is needed to perform cross-validation
+library(ipred)
+
+# Tell the cross-validation which model to use
+mymodel.coremodel <- function(formula, data, target.model){CoreModel(formula, data, model=target.model)}
+# Tell the cross-validation how to obtain predictions
+mypredict.generic <- function(object, newdata){predict(object, newdata, type = "class")}
+# Force the predict function to return class labels only, and free the
+# internal representation of the model once it has been used
+mypredict.coremodel <- function(object, newdata) {pred <- predict(object, newdata)$class; destroyModels(object); pred}
+cvError <- errorest(Class~., data=learn, model = mymodel.coremodel, predict = mypredict.coremodel, target.model = "tree")
+1 - cvError$error
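+
+# The same machinery works for the other CoreModel learners; for example,
+# a hypothetical cross-validated accuracy estimate for naive Bayes:
+cvErrorNB <- errorest(Class~., data=learn, model = mymodel.coremodel, predict = mypredict.coremodel, target.model = "bayes")
+1 - cvErrorNB$error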