Push vaje
parent
7d334030e2
commit
15f1e5b996
|
@ -0,0 +1,397 @@
|
|||
#
|
||||
#
|
||||
# INTRODUCTION TO CLASSIFICATION
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# Work plan:
|
||||
#
|
||||
# - Read in the data
|
||||
# - Split the data into training and test sets
|
||||
# - Build a model using the training set
|
||||
# - Evaluate the model using the test set
|
||||
#
|
||||
|
||||
|
||||
|
||||
# Please download the data file "players.txt" into a local directory
|
||||
# then set that directory as the current working directory of R.
|
||||
# You can achive this using the "setwd" command or by selecting "File -> Change dir..."
|
||||
|
||||
|
||||
# We are going to use the packages: rpart, and pROC.
|
||||
# Make sure that the packages are installed.
|
||||
#
|
||||
# You install a package in R with the function install.packages():
|
||||
#
|
||||
# install.packages("pROC")
|
||||
# library(pROC)
|
||||
#
|
||||
# To install packages without root access:
|
||||
#
|
||||
# install.packages("pROC", lib="/mylibs/Rpackages/")
|
||||
# library(pROC, lib.loc="/mylibs/Rpackages/")
|
||||
#
|
||||
|
||||
# Read in the dataset
|
||||
players <- read.table("players.txt", header = T, sep = ",")
|
||||
|
||||
# Get the summary statistics of the data
|
||||
summary(players)
|
||||
|
||||
|
||||
|
||||
############################################################################
|
||||
#
|
||||
# ATTENTION!!!!!!!!!
|
||||
#
|
||||
# The "id" attribute uniquely identifies a player within our data set.
|
||||
# This attribute cannot be used to classify new players as they will
|
||||
# have different id numbers.
|
||||
#
|
||||
# Before we continue, we have to remove the "id" attribute from our data set!
|
||||
#
|
||||
############################################################################
|
||||
|
||||
|
||||
players$id <- NULL
|
||||
names(players)
|
||||
|
||||
|
||||
############################################################################
|
||||
#
|
||||
#
|
||||
# Please make sure that the "id" attribute has been removed!!!!!!!!!!
|
||||
#
|
||||
#
|
||||
############################################################################
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
############################################################################
|
||||
#
|
||||
# Example 1
|
||||
#
|
||||
# Prediction of playing position
|
||||
#
|
||||
############################################################################
|
||||
|
||||
|
||||
#
|
||||
# We want to build a model to predict a player's playing position
|
||||
# with respect to the given player's statistics.
|
||||
#
|
||||
# The target variable "position" is discrete - we term this a classification task.
|
||||
#
|
||||
# We aim to verify whether or not it is possible to use historical data to predict
|
||||
# playing positions for new players.
|
||||
#
|
||||
|
||||
# We are going to split the data into a training and testing data set.
|
||||
# The training data set consists of players that ended their careers before 1999.
|
||||
# The test data set consists of players that began their careers after 1999.
|
||||
|
||||
learn <- players[players$lastseason <= 1999,]
|
||||
test <- players[players$firstseason > 1999,]
|
||||
|
||||
# We used the "firstseason" and "lastseason" attributes to split the data.
|
||||
# Therefore the attributes are not going to contribute to the modelling task, so
|
||||
# we will remove them.
|
||||
|
||||
learn$firstseason <- NULL
|
||||
learn$lastseason <- NULL
|
||||
|
||||
test$firstseason <- NULL
|
||||
test$lastseason <- NULL
|
||||
|
||||
# We inspect the target values distribution in the defined data sets.
|
||||
# (number of examples and frequency of individual classes)
|
||||
|
||||
nrow(learn)
|
||||
table(learn$position)
|
||||
|
||||
nrow(test)
|
||||
table(test$position)
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
#
|
||||
# MAJORITY CLASSIFIER
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
# The majority class is the class with the highest number of training examples
|
||||
which.max(table(learn$position))
|
||||
|
||||
majority.class <- names(which.max(table(learn$position)))
|
||||
majority.class
|
||||
|
||||
# The majority classifier classifies all test instances into the majority class.
|
||||
# The accuracy of the majority class
|
||||
sum(test$position == majority.class) / length(test$position)
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
#
|
||||
# DECISION TREES
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
# load the "rpart" package that implements decision trees in R
|
||||
library(rpart)
|
||||
|
||||
# Fit a decision tree
|
||||
dt <- rpart(position ~ ., data = learn)
|
||||
|
||||
#
|
||||
# The first argument of the rpart function is a model formula that indicates
|
||||
# the functional form of the model. The "~" symbol stands for "modeled as".
|
||||
#
|
||||
# The formula "position ~ . " states that we want a model that predicts the
|
||||
# target attribute "position" using all other attributes present in the data
|
||||
# (which is the meaning of the "." character).
|
||||
#
|
||||
# If we wanted, for example, a model for the "position" attribute as a function
|
||||
# of the attributes "height", "width", and "pts", we should have
|
||||
# indicated the formula as "position ~ height + weight + pts".
|
||||
#
|
||||
# The second argument of the rpart function provides the training examples.
|
||||
#
|
||||
|
||||
|
||||
# The content of the dt object
|
||||
dt
|
||||
|
||||
# A graphical representation of our tree
|
||||
plot(dt)
|
||||
text(dt, pretty = 1)
|
||||
|
||||
# To navigate the tree, start from the root node. If the attribute of the
|
||||
# observation satisfies the logical test, then navigate down the left branch.
|
||||
# Otherwise navigate down the right branch. Continue until a leaf node is reached.
|
||||
# A leaf node represents a class label (or class label distribution).
|
||||
|
||||
|
||||
# Observed values in the test samples (the ground truth)
|
||||
observed <- test$position
|
||||
observed
|
||||
|
||||
|
||||
# The "predict" function returns a vector of predicted responses from a fitted tree.
|
||||
# The "type" argument denotes the form of predicted value returned. The setting
|
||||
# type = "class" will cause the response in a form of a vector of predicted
|
||||
# class labels.
|
||||
|
||||
predicted <- predict(dt, test, type = "class")
|
||||
predicted
|
||||
|
||||
# confusion matrix
|
||||
t <- table(observed, predicted)
|
||||
t
|
||||
|
||||
# The diagonal elements in the confusion matrix represent the number of correctly
|
||||
# classified test instances
|
||||
|
||||
# The classification accuracy is the fraction of correctly classified test instances
|
||||
sum(diag(t)) / sum(t)
|
||||
|
||||
|
||||
# Let's write a function that calculates the classification accuracy
|
||||
CA <- function(observed, predicted)
|
||||
{
|
||||
t <- table(observed, predicted)
|
||||
|
||||
sum(diag(t)) / sum(t)
|
||||
}
|
||||
|
||||
# Function call
|
||||
CA(observed, predicted)
|
||||
|
||||
|
||||
|
||||
|
||||
# The setting type = "prob" will cause the predict function to
|
||||
# provide its answer in the form of a matrix with class probabilities.
|
||||
|
||||
predMat <- predict(dt, test, type = "prob")
|
||||
predMat
|
||||
|
||||
# A matrix of the observed class probabilities (the ground truth)
|
||||
# (the real class has the probability of 1, others have 0)
|
||||
obsMat <- model.matrix( ~ position-1, test)
|
||||
obsMat
|
||||
|
||||
|
||||
# IMPORTANT: check whether the columns refer to the same attributes
|
||||
# (if not, change the order of columns)
|
||||
colnames(predMat)
|
||||
colnames(obsMat)
|
||||
|
||||
|
||||
# The Brier score
|
||||
brier.score <- function(observedMatrix, predictedMatrix)
|
||||
{
|
||||
sum((observedMatrix - predictedMatrix) ^ 2) / nrow(predictedMatrix)
|
||||
}
|
||||
|
||||
brier.score(obsMat, predMat)
|
||||
|
||||
|
||||
|
||||
|
||||
############################################################################
|
||||
#
|
||||
# Example 2
|
||||
#
|
||||
# Does a player make at least 80% of free-throws attempted?
|
||||
#
|
||||
############################################################################
|
||||
|
||||
#
|
||||
# It is a binary problem (the target variable is discrete with values YES and NO).
|
||||
#
|
||||
|
||||
#
|
||||
# We are going to create a data set using the players data.
|
||||
#
|
||||
|
||||
# Get the players who have attempted at least one free throw during the career.
|
||||
bin.players <- players[players$fta > 0,]
|
||||
|
||||
# Free-throw success rate
|
||||
rate <- bin.players$ftm / bin.players$fta
|
||||
|
||||
# Create a discrete attribute "ftexpert" that will be our target variable.
|
||||
ftexpert <- vector()
|
||||
ftexpert[rate >= 0.8] <- "YES"
|
||||
ftexpert[rate < 0.8] <- "NO"
|
||||
bin.players$ftexpert <- as.factor(ftexpert)
|
||||
|
||||
# Remove the "fta" and "ftm" attributes.
|
||||
bin.players$fta <- NULL
|
||||
bin.players$ftm <- NULL
|
||||
|
||||
summary(bin.players)
|
||||
|
||||
|
||||
|
||||
# Split the data into two disjoint sets, one for training and one for testing.
|
||||
bin.learn <- bin.players[1:1500,]
|
||||
bin.test <- bin.players[-(1:1500),]
|
||||
|
||||
# Inspect the distribution of the class values in the data sets.
|
||||
table(bin.learn$ftexpert)
|
||||
table(bin.test$ftexpert)
|
||||
|
||||
|
||||
# Train a decision tree
|
||||
dt2 <- rpart(ftexpert ~ ., data = bin.learn)
|
||||
plot(dt2)
|
||||
text(dt2, pretty = 1)
|
||||
|
||||
|
||||
bin.observed <- bin.test$ftexpert
|
||||
bin.predicted <- predict(dt2, bin.test, type="class")
|
||||
|
||||
|
||||
table(bin.observed, bin.predicted)
|
||||
|
||||
|
||||
# Classification accuracy of the decision tree
|
||||
CA(bin.observed, bin.predicted)
|
||||
|
||||
# Exercise -> majority is important!
|
||||
table(bin.observed)
|
||||
sum(bin.observed == "NO") / length(bin.observed)
|
||||
|
||||
|
||||
# The sensitivity of a model (recall)
|
||||
Sensitivity <- function(observed, predicted, pos.class)
|
||||
{
|
||||
t <- table(observed, predicted)
|
||||
|
||||
t[pos.class, pos.class] / sum(t[pos.class,])
|
||||
}
|
||||
|
||||
# The specificity of a model
|
||||
Specificity <- function(observed, predicted, pos.class)
|
||||
{
|
||||
t <- table(observed, predicted)
|
||||
|
||||
# identify the negative class name
|
||||
neg.class <- which(row.names(t) != pos.class)
|
||||
|
||||
t[neg.class, neg.class] / sum(t[neg.class,])
|
||||
}
|
||||
|
||||
# let's say that the value "YES" is our positive class...
|
||||
|
||||
Sensitivity(bin.observed, bin.predicted, "YES")
|
||||
Specificity(bin.observed, bin.predicted, "YES")
|
||||
|
||||
#
|
||||
# ROC curve
|
||||
#
|
||||
|
||||
library(pROC)
|
||||
library(ggplot2)
|
||||
library(dplyr)
|
||||
library(ggrepel) # For nicer ROC visualization
|
||||
|
||||
bin.predMat <- predict(dt2, bin.test, type = "prob")
|
||||
|
||||
# Remember, we treat the value "YES" as the positive class
|
||||
rocobj <- roc(bin.observed, bin.predMat[,"YES"])
|
||||
plot(rocobj)
|
||||
|
||||
# This part below is optional -> nicer ROC
|
||||
tmpDf <- data.frame(rocobj$sensitivities, 1-rocobj$specificities, rocobj$thresholds)
|
||||
colnames(tmpDf) <- c("Sensitivity","Specificity", "Thresholds")
|
||||
|
||||
tmpDf %>% ggplot(aes(x=Specificity,y=Sensitivity,color=Thresholds))+
|
||||
geom_point()+
|
||||
geom_line()+
|
||||
geom_abline(intercept = 0, slope = 1, alpha=0.2)+
|
||||
theme_classic()+
|
||||
ylab("Sensitivity (TPR)")+
|
||||
xlab("1-Specificity (FPR)")+
|
||||
geom_label_repel(aes(label = round(Thresholds,2)),
|
||||
box.padding = 0.35,
|
||||
point.padding = 0.5,
|
||||
segment.color = 'grey50')
|
||||
|
||||
names(rocobj)
|
||||
|
||||
cutoffs <- rocobj$thresholds
|
||||
cutoffs
|
||||
|
||||
# identify the best cut-off value (for example, by minimazing the distance from the point (0,1))
|
||||
tp = rocobj$sensitivities
|
||||
fp = 1 - rocobj$specificities
|
||||
|
||||
dist <- (1-tp)^2 + fp^2
|
||||
dist
|
||||
which.min(dist)
|
||||
|
||||
best.cutoff <- cutoffs[which.min(dist)]
|
||||
best.cutoff
|
||||
|
||||
# the selected cut-off value has impact on the model's performance
|
||||
predicted.label <- factor(ifelse(bin.predMat[,"YES"] >= best.cutoff, "YES", "NO"))
|
||||
|
||||
table(bin.observed, predicted.label)
|
||||
CA(bin.observed, predicted.label)
|
||||
Sensitivity(bin.observed, predicted.label, "YES")
|
||||
Specificity(bin.observed, predicted.label, "YES")
|
||||
|
||||
|
|
@ -0,0 +1,397 @@
|
|||
#
|
||||
#
|
||||
# INTRODUCTION TO CLASSIFICATION
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# Work plan:
|
||||
#
|
||||
# - Read in the data
|
||||
# - Split the data into training and test sets
|
||||
# - Build a model using the training set
|
||||
# - Evaluate the model using the test set
|
||||
#
|
||||
|
||||
|
||||
|
||||
# Please download the data file "players.txt" into a local directory
|
||||
# then set that directory as the current working directory of R.
|
||||
# You can achive this using the "setwd" command or by selecting "File -> Change dir..."
|
||||
|
||||
|
||||
# We are going to use the packages: rpart, and pROC.
|
||||
# Make sure that the packages are installed.
|
||||
#
|
||||
# You install a package in R with the function install.packages():
|
||||
#
|
||||
# install.packages("pROC")
|
||||
# library(pROC)
|
||||
#
|
||||
# To install packages without root access:
|
||||
#
|
||||
# install.packages("pROC", lib="/mylibs/Rpackages/")
|
||||
# library(pROC, lib.loc="/mylibs/Rpackages/")
|
||||
#
|
||||
|
||||
# Read in the dataset
|
||||
players <- read.table("players.txt", header = T, sep = ",")
|
||||
|
||||
# Get the summary statistics of the data
|
||||
summary(players)
|
||||
|
||||
|
||||
|
||||
############################################################################
|
||||
#
|
||||
# ATTENTION!!!!!!!!!
|
||||
#
|
||||
# The "id" attribute uniquely identifies a player within our data set.
|
||||
# This attribute cannot be used to classify new players as they will
|
||||
# have different id numbers.
|
||||
#
|
||||
# Before we continue, we have to remove the "id" attribute from our data set!
|
||||
#
|
||||
############################################################################
|
||||
|
||||
|
||||
players$id <- NULL
|
||||
names(players)
|
||||
|
||||
|
||||
############################################################################
|
||||
#
|
||||
#
|
||||
# Please make sure that the "id" attribute has been removed!!!!!!!!!!
|
||||
#
|
||||
#
|
||||
############################################################################
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
############################################################################
|
||||
#
|
||||
# Example 1
|
||||
#
|
||||
# Prediction of playing position
|
||||
#
|
||||
############################################################################
|
||||
|
||||
|
||||
#
|
||||
# We want to build a model to predict a player's playing position
|
||||
# with respect to the given player's statistics.
|
||||
#
|
||||
# The target variable "position" is discrete - we term this a classification task.
|
||||
#
|
||||
# We aim to verify whether or not it is possible to use historical data to predict
|
||||
# playing positions for new players.
|
||||
#
|
||||
|
||||
# We are going to split the data into a training and testing data set.
|
||||
# The training data set consists of players that ended their careers before 1999.
|
||||
# The test data set consists of players that began their careers after 1999.
|
||||
|
||||
learn <- players[players$lastseason <= 1999,]
|
||||
test <- players[players$firstseason > 1999,]
|
||||
|
||||
# We used the "firstseason" and "lastseason" attributes to split the data.
|
||||
# Therefore the attributes are not going to contribute to the modelling task, so
|
||||
# we will remove them.
|
||||
|
||||
learn$firstseason <- NULL
|
||||
learn$lastseason <- NULL
|
||||
|
||||
test$firstseason <- NULL
|
||||
test$lastseason <- NULL
|
||||
|
||||
# We inspect the target values distribution in the defined data sets.
|
||||
# (number of examples and frequency of individual classes)
|
||||
|
||||
nrow(learn)
|
||||
table(learn$position)
|
||||
|
||||
nrow(test)
|
||||
table(test$position)
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
#
|
||||
# MAJORITY CLASSIFIER
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
# The majority class is the class with the highest number of training examples
|
||||
which.max(table(learn$position))
|
||||
|
||||
majority.class <- names(which.max(table(learn$position)))
|
||||
majority.class
|
||||
|
||||
# The majority classifier classifies all test instances into the majority class.
|
||||
# The accuracy of the majority class
|
||||
sum(test$position == majority.class) / length(test$position)
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
#
|
||||
# DECISION TREES
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
# load the "rpart" package that implements decision trees in R
|
||||
library(rpart)
|
||||
|
||||
# Fit a decision tree
|
||||
dt <- rpart(position ~ ., data = learn)
|
||||
|
||||
#
|
||||
# The first argument of the rpart function is a model formula that indicates
|
||||
# the functional form of the model. The "~" symbol stands for "modeled as".
|
||||
#
|
||||
# The formula "position ~ . " states that we want a model that predicts the
|
||||
# target attribute "position" using all other attributes present in the data
|
||||
# (which is the meaning of the "." character).
|
||||
#
|
||||
# If we wanted, for example, a model for the "position" attribute as a function
|
||||
# of the attributes "height", "width", and "pts", we should have
|
||||
# indicated the formula as "position ~ height + weight + pts".
|
||||
#
|
||||
# The second argument of the rpart function provides the training examples.
|
||||
#
|
||||
|
||||
|
||||
# The content of the dt object
|
||||
dt
|
||||
|
||||
# A graphical representation of our tree
|
||||
plot(dt)
|
||||
text(dt, pretty = 1)
|
||||
|
||||
# To navigate the tree, start from the root node. If the attribute of the
|
||||
# observation satisfies the logical test, then navigate down the left branch.
|
||||
# Otherwise navigate down the right branch. Continue until a leaf node is reached.
|
||||
# A leaf node represents a class label (or class label distribution).
|
||||
|
||||
|
||||
# Observed values in the test samples (the ground truth)
|
||||
observed <- test$position
|
||||
observed
|
||||
|
||||
|
||||
# The "predict" function returns a vector of predicted responses from a fitted tree.
|
||||
# The "type" argument denotes the form of predicted value returned. The setting
|
||||
# type = "class" will cause the response in a form of a vector of predicted
|
||||
# class labels.
|
||||
|
||||
predicted <- predict(dt, test, type = "class")
|
||||
predicted
|
||||
|
||||
# confusion matrix
|
||||
t <- table(observed, predicted)
|
||||
t
|
||||
|
||||
# The diagonal elements in the confusion matrix represent the number of correctly
|
||||
# classified test instances
|
||||
|
||||
# The classification accuracy is the fraction of correctly classified test instances
|
||||
sum(diag(t)) / sum(t)
|
||||
|
||||
|
||||
# Let's write a function that calculates the classification accuracy
|
||||
CA <- function(observed, predicted)
|
||||
{
|
||||
t <- table(observed, predicted)
|
||||
|
||||
sum(diag(t)) / sum(t)
|
||||
}
|
||||
|
||||
# Function call
|
||||
CA(observed, predicted)
|
||||
|
||||
|
||||
|
||||
|
||||
# The setting type = "prob" will cause the predict function to
|
||||
# provide its answer in the form of a matrix with class probabilities.
|
||||
|
||||
predMat <- predict(dt, test, type = "prob")
|
||||
predMat
|
||||
|
||||
# A matrix of the observed class probabilities (the ground truth)
|
||||
# (the real class has the probability of 1, others have 0)
|
||||
obsMat <- model.matrix( ~ position-1, test)
|
||||
obsMat
|
||||
|
||||
|
||||
# IMPORTANT: check whether the columns refer to the same attributes
|
||||
# (if not, change the order of columns)
|
||||
colnames(predMat)
|
||||
colnames(obsMat)
|
||||
|
||||
|
||||
# The Brier score
|
||||
brier.score <- function(observedMatrix, predictedMatrix)
|
||||
{
|
||||
sum((observedMatrix - predictedMatrix) ^ 2) / nrow(predictedMatrix)
|
||||
}
|
||||
|
||||
brier.score(obsMat, predMat)
|
||||
|
||||
|
||||
|
||||
|
||||
############################################################################
|
||||
#
|
||||
# Example 2
|
||||
#
|
||||
# Does a player make at least 80% of free-throws attempted?
|
||||
#
|
||||
############################################################################
|
||||
|
||||
#
|
||||
# It is a binary problem (the target variable is discrete with values YES and NO).
|
||||
#
|
||||
|
||||
#
|
||||
# We are going to create a data set using the players data.
|
||||
#
|
||||
|
||||
# Get the players who have attempted at least one free throw during the career.
|
||||
bin.players <- players[players$fta > 0,]
|
||||
|
||||
# Free-throw success rate
|
||||
rate <- bin.players$ftm / bin.players$fta
|
||||
|
||||
# Create a discrete attribute "ftexpert" that will be our target variable.
|
||||
ftexpert <- vector()
|
||||
ftexpert[rate >= 0.8] <- "YES"
|
||||
ftexpert[rate < 0.8] <- "NO"
|
||||
bin.players$ftexpert <- as.factor(ftexpert)
|
||||
|
||||
# Remove the "fta" and "ftm" attributes.
|
||||
bin.players$fta <- NULL
|
||||
bin.players$ftm <- NULL
|
||||
|
||||
summary(bin.players)
|
||||
|
||||
|
||||
|
||||
# Split the data into two disjoint sets, one for training and one for testing.
|
||||
bin.learn <- bin.players[1:1500,]
|
||||
bin.test <- bin.players[-(1:1500),]
|
||||
|
||||
# Inspect the distribution of the class values in the data sets.
|
||||
table(bin.learn$ftexpert)
|
||||
table(bin.test$ftexpert)
|
||||
|
||||
|
||||
# Train a decision tree
|
||||
dt2 <- rpart(ftexpert ~ ., data = bin.learn)
|
||||
plot(dt2)
|
||||
text(dt2, pretty = 1)
|
||||
|
||||
|
||||
bin.observed <- bin.test$ftexpert
|
||||
bin.predicted <- predict(dt2, bin.test, type="class")
|
||||
|
||||
|
||||
table(bin.observed, bin.predicted)
|
||||
|
||||
|
||||
# Classification accuracy of the decision tree
|
||||
CA(bin.observed, bin.predicted)
|
||||
|
||||
# Exercise -> majority is important!
|
||||
table(bin.observed)
|
||||
sum(bin.observed == "NO") / length(bin.observed)
|
||||
|
||||
|
||||
# The sensitivity of a model (recall)
|
||||
Sensitivity <- function(observed, predicted, pos.class)
|
||||
{
|
||||
t <- table(observed, predicted)
|
||||
|
||||
t[pos.class, pos.class] / sum(t[pos.class,])
|
||||
}
|
||||
|
||||
# The specificity of a model
|
||||
Specificity <- function(observed, predicted, pos.class)
|
||||
{
|
||||
t <- table(observed, predicted)
|
||||
|
||||
# identify the negative class name
|
||||
neg.class <- which(row.names(t) != pos.class)
|
||||
|
||||
t[neg.class, neg.class] / sum(t[neg.class,])
|
||||
}
|
||||
|
||||
# let's say that the value "YES" is our positive class...
|
||||
|
||||
Sensitivity(bin.observed, bin.predicted, "YES")
|
||||
Specificity(bin.observed, bin.predicted, "YES")
|
||||
|
||||
#
|
||||
# ROC curve
|
||||
#
|
||||
|
||||
library(pROC)
|
||||
library(ggplot2)
|
||||
library(dplyr)
|
||||
library(ggrepel) # For nicer ROC visualization
|
||||
|
||||
bin.predMat <- predict(dt2, bin.test, type = "prob")
|
||||
|
||||
# Remember, we treat the value "YES" as the positive class
|
||||
rocobj <- roc(bin.observed, bin.predMat[,"YES"])
|
||||
plot(rocobj)
|
||||
|
||||
# This part below is optional -> nicer ROC
|
||||
tmpDf <- data.frame(rocobj$sensitivities, 1-rocobj$specificities, rocobj$thresholds)
|
||||
colnames(tmpDf) <- c("Sensitivity","Specificity", "Thresholds")
|
||||
|
||||
tmpDf %>% ggplot(aes(x=Specificity,y=Sensitivity,color=Thresholds))+
|
||||
geom_point()+
|
||||
geom_line()+
|
||||
geom_abline(intercept = 0, slope = 1, alpha=0.2)+
|
||||
theme_classic()+
|
||||
ylab("Sensitivity (TPR)")+
|
||||
xlab("1-Specificity (FPR)")+
|
||||
geom_label_repel(aes(label = round(Thresholds,2)),
|
||||
box.padding = 0.35,
|
||||
point.padding = 0.5,
|
||||
segment.color = 'grey50')
|
||||
|
||||
names(rocobj)
|
||||
|
||||
cutoffs <- rocobj$thresholds
|
||||
cutoffs
|
||||
|
||||
# identify the best cut-off value (for example, by minimazing the distance from the point (0,1))
|
||||
tp = rocobj$sensitivities
|
||||
fp = 1 - rocobj$specificities
|
||||
|
||||
dist <- (1-tp)^2 + fp^2
|
||||
dist
|
||||
which.min(dist)
|
||||
|
||||
best.cutoff <- cutoffs[which.min(dist)]
|
||||
best.cutoff
|
||||
|
||||
# the selected cut-off value has impact on the model's performance
|
||||
predicted.label <- factor(ifelse(bin.predMat[,"YES"] >= best.cutoff, "YES", "NO"))
|
||||
|
||||
table(bin.observed, predicted.label)
|
||||
CA(bin.observed, predicted.label)
|
||||
Sensitivity(bin.observed, predicted.label, "YES")
|
||||
Specificity(bin.observed, predicted.label, "YES")
|
||||
|
||||
|
|
@ -0,0 +1,229 @@
|
|||
###
|
||||
#
|
||||
# CLUSTERING
|
||||
#
|
||||
###
|
||||
|
||||
# Load the dataset
|
||||
cities <- read.table("worldcities.csv", sep=",", header = T)
|
||||
summary(cities)
|
||||
x <- cities$lng
|
||||
y <- cities$lat
|
||||
x <- x / max(x)
|
||||
y <- y / max(y)
|
||||
plot(x,y)
|
||||
df = data.frame(x, y)
|
||||
|
||||
# KMeans clustering
|
||||
results <- kmeans(df, centers=6)
|
||||
results
|
||||
plot(x, y, col=results$cluster)
|
||||
|
||||
# Are clusters aligned with classes?
|
||||
library(ggplot2)
|
||||
data(iris)
|
||||
subset <- iris[,1:2]
|
||||
|
||||
# classes
|
||||
p1<-qplot(subset$Sepal.Length,subset$Sepal.Width,color = iris$Species, main = "Ground truth") + theme_bw()
|
||||
|
||||
resultsIris <-kmeans(subset, centers = 3)
|
||||
assignments <- as.factor(resultsIris$cluster)
|
||||
p2<-qplot(subset$Sepal.Length,subset$Sepal.Width,color = assignments, main = "Clustering") + theme_bw()
|
||||
gridExtra::grid.arrange(p1, p2, nrow = 2)
|
||||
|
||||
# Exercise -> try to cluster the iris data according to Length and Width!
|
||||
|
||||
# Try to fit models and see what can be learnt best!
|
||||
library(mclust)
|
||||
fit <- Mclust(df)
|
||||
# summarize the fits
|
||||
plot(fit)
|
||||
summary(fit)
|
||||
|
||||
#Hierarchical clustering
|
||||
distMat = dist(df)
|
||||
resultsH <- hclust(distMat)
|
||||
|
||||
plot(resultsH)
|
||||
clusters <- cutree(resultsH, 6)
|
||||
plot(x, y, col=clusters)
|
||||
|
||||
|
||||
###
|
||||
#
|
||||
# ATTRIBUTE EVALUATION
|
||||
#
|
||||
###
|
||||
|
||||
|
||||
# Attribute evaluation functions are implemented in the package "CORElearn"
|
||||
|
||||
# Load the library
|
||||
#install.packages("CORElearn")
|
||||
library(CORElearn)
|
||||
|
||||
|
||||
#
|
||||
# Attribute estimation in classification
|
||||
#
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Example 1
|
||||
#
|
||||
|
||||
mr <- read.table("mushroom.txt", sep=",", header=T)
|
||||
mr$edibility <- as.factor(mr$edibility)
|
||||
summary(mr)
|
||||
|
||||
qplot(mr$edibility, ylab="Number of species", main="Edibility of mushrooms", geom = c("bar"))
|
||||
|
||||
|
||||
# the attrEval function evaluates the quality of the attributes/dependent variables
|
||||
# specified by the formula. Parameter formula is used as a mechanism to select
|
||||
# attributes and prediction variable(class). The simplest way is to specify just
|
||||
# response variable: "Class ~ .". In this case all other attributes in the data set
|
||||
# are evaluated with the selected heuristic method.
|
||||
|
||||
# attribute evaluation using information gain
|
||||
sort(attrEval(edibility ~ ., mr, "InfGain"), decreasing = TRUE)
|
||||
|
||||
# build a decision tree using information gain as a splitting criterion
|
||||
dt <- CoreModel(edibility ~ ., mr, model="tree", selectionEstimator="InfGain")
|
||||
plot(dt, mr)
|
||||
|
||||
|
||||
#
|
||||
# Example 2
|
||||
#
|
||||
|
||||
quadrant <- read.table("quadrant.txt", sep=",", header=T)
|
||||
summary(quadrant)
|
||||
|
||||
quadrant$Class <- as.factor(quadrant$Class)
|
||||
|
||||
plot(quadrant, col=quadrant$Class)
|
||||
plot(quadrant$a1, quadrant$a2, col=quadrant$Class)
|
||||
|
||||
# attribute evaluation using information gain
|
||||
sort(attrEval(Class ~ ., quadrant, "InfGain"), decreasing = TRUE)
|
||||
|
||||
# information gain is a myopic measure
|
||||
# (it assumes that attributes are conditionally independent given the class)
|
||||
|
||||
# using information gain to construct a decision tree will produce a poor result
|
||||
dt2 <- CoreModel(Class ~ ., quadrant, model="tree", selectionEstimator="InfGain")
|
||||
plot(dt2, quadrant)
|
||||
|
||||
# non-myopic measures (Relief and ReleifF)
|
||||
sort(attrEval(Class ~ ., quadrant, "Relief"), decreasing = TRUE)
|
||||
sort(attrEval(Class ~ ., quadrant, "ReliefFequalK"), decreasing = TRUE)
|
||||
sort(attrEval(Class ~ ., quadrant, "ReliefFexpRank"), decreasing = TRUE)
|
||||
|
||||
# use ?attrEval to find out different variations of ReliefF measure...
|
||||
|
||||
# build a decision tree using ReliefFequalK as a splitting criterion
|
||||
dt3 <- CoreModel(Class ~ ., quadrant, model="tree", selectionEstimator = "ReliefFequalK")
|
||||
plot(dt3, quadrant)
|
||||
|
||||
|
||||
#
|
||||
# Example 3
|
||||
#
|
||||
|
||||
players <- read.table("players.txt", sep=",", header=TRUE)
|
||||
summary(players)
|
||||
players$position <- as.factor(players$position)
|
||||
|
||||
sort(attrEval(position ~ ., players, "InfGain"), decreasing = TRUE)
|
||||
sort(attrEval(position ~ ., players, "Gini"), decreasing = TRUE)
|
||||
|
||||
# the "id" attribute is overestimated,
|
||||
# although it does not carry any useful information
|
||||
|
||||
# GainRatio moderates the overestimation of the "id" attribute
|
||||
sort(attrEval(position ~ ., players, "GainRatio"), decreasing = TRUE)
|
||||
|
||||
# Both ReliefF and MDL measures identify the "id" attribute as irrelevant
|
||||
sort(attrEval(position ~ ., players, "ReliefFequalK"), decreasing = TRUE)
|
||||
sort(attrEval(position ~ ., players, "MDL"), decreasing = TRUE)
|
||||
|
||||
#
|
||||
#
|
||||
# Attribute selection
|
||||
#
|
||||
#
|
||||
|
||||
student <- read.table("student.txt", sep=",", header=T)
|
||||
student$G1 <- cut(student$G1, c(-Inf, 9, 20), labels=c("fail", "pass"))
|
||||
student$G2 <- cut(student$G2, c(-Inf, 9, 20), labels=c("fail", "pass"))
|
||||
student$G3 <- cut(student$G3, c(-Inf, 9, 20), labels=c("fail", "pass"))
|
||||
student$studytime <- cut(student$studytime, c(-Inf, 1, 2, 3, 4), labels=c("none", "few", "hefty", "endless"))
|
||||
|
||||
train = student[1:(nrow(student) * 0.7),]
|
||||
test = student[(nrow(student) * 0.7):nrow(student),]
|
||||
|
||||
#
|
||||
# Feature selection using a filter method
|
||||
#
|
||||
|
||||
# evaluate the quality of attributes
|
||||
sort(attrEval(studytime ~ ., student, "MDL"), decreasing = TRUE)
|
||||
|
||||
# train a model using everything
|
||||
model <- CoreModel(studytime ~ ., data=train, model='tree')
|
||||
predictions <- predict(model, test, type='class')
|
||||
sum(predictions == test$studytime) / length(predictions)
|
||||
|
||||
# train a model using the best evaluated attributes
|
||||
model <- CoreModel(studytime ~ sex + Walc + Dalc + freetime + higher + paid, data=train, model='tree')
|
||||
predictions <- predict(model, test, type='class')
|
||||
sum(predictions == test$studytime) / length(predictions)
|
||||
|
||||
|
||||
# using the appropriate combination of attributes...
|
||||
model <- CoreModel(studytime ~ Dalc + paid + G2, data=train, model='tree')
|
||||
predictions <- predict(model, test, type='class')
|
||||
sum(predictions == test$studytime) / length(predictions)
|
||||
|
||||
# What about some model-based feature importances?
|
||||
library(caret)
|
||||
set.seed(123780)
|
||||
#Number randomely variable selected is mtry
|
||||
control <- trainControl(method='repeatedcv',number=10, repeats=1)
|
||||
|
||||
rf.model <- train(studytime ~ .,
|
||||
data=train,
|
||||
method='rf',
|
||||
metric='Accuracy',
|
||||
trControl=control)
|
||||
|
||||
## varImp extracts the feature relevances
|
||||
rf.importances<- varImp(rf.model, scale = FALSE)
|
||||
plot(rf.importances, top = 20)
|
||||
importance.df.values <- rf.importances$importance
|
||||
importance.df.names <- rownames(rf.importances$importance)
|
||||
importance.rf.whole <- data.frame(score = importance.df.values,cnames = importance.df.names)
|
||||
importance.rf.whole <- importance.rf.whole[order(importance.rf.whole$Overall, decreasing = T),]
|
||||
feature.names.rf <- importance.rf.whole$cnames[1:20]
|
||||
|
||||
mdl.importances <- sort(attrEval(studytime ~ ., student, "MDL"), decreasing = TRUE)[1:20]
|
||||
feature.names.mdl <- names(mdl.importances)
|
||||
paste0(length(intersect(feature.names.rf, feature.names.mdl))*100/20,"% overlap between top ranked features!")
|
||||
|
||||
scaleVector <- function(x) {(x - min(x))/(max(x) - min(x))}
|
||||
|
||||
# Let's visualize how similar the rankings' distributions are
|
||||
rf.all <- scaleVector(importance.rf.whole$Overall)
|
||||
mdl.all <- scaleVector(as.vector(mdl.importances))
|
||||
|
||||
rank.frame <- data.frame(rf.all, mdl.all)
|
||||
|
||||
library(dplyr)
|
||||
library(tidyr)
|
||||
|
||||
# How do the two distributions differ?
|
||||
summary(rank.frame)
|
||||
rank.frame %>% gather() %>% ggplot(aes(x=value,fill=key))+geom_density(alpha=0.4)+theme_bw()
|
|
@ -0,0 +1,207 @@
|
|||
################################################################
|
||||
#
|
||||
# Combining machine learning algorithms
|
||||
#
|
||||
################################################################
|
||||
|
||||
#install.packages("CORElearn")
|
||||
library(CORElearn)
|
||||
|
||||
vehicle <- read.table("vehicle.txt", sep=",", header = T)
|
||||
summary(vehicle)
|
||||
|
||||
set.seed(8678686)
|
||||
sel <- sample(1:nrow(vehicle), size=as.integer(nrow(vehicle)*0.7), replace=F)
|
||||
learn <- vehicle[sel,]
|
||||
test <- vehicle[-sel,]
|
||||
|
||||
learn$Class <- as.factor(learn$Class)
|
||||
test$Class <- as.factor(test$Class)
|
||||
|
||||
table(learn$Class)
|
||||
table(test$Class)
|
||||
|
||||
CA <- function(observed, predicted)
|
||||
{
|
||||
t <- table(observed, predicted)
|
||||
|
||||
sum(diag(t)) / sum(t)
|
||||
}
|
||||
|
||||
#
|
||||
# A simple tree baseline.
|
||||
#
|
||||
|
||||
|
||||
#
|
||||
# Voting
|
||||
#
|
||||
|
||||
modelDT <- CoreModel(Class ~ ., learn, model="tree")
|
||||
modelNB <- CoreModel(Class ~ ., learn, model="bayes")
|
||||
modelKNN <- CoreModel(Class ~ ., learn, model="knn", kInNN = 2)
|
||||
|
||||
predDT <- predict(modelDT, test, type = "class")
|
||||
result.caDT <- CA(test$Class, predDT)
|
||||
result.caDT
|
||||
|
||||
predNB <- predict(modelNB, test, type="class")
|
||||
result.caNB <- CA(test$Class, predNB)
|
||||
result.caNB
|
||||
|
||||
predKNN <- predict(modelKNN, test, type="class")
|
||||
result.caKNN <- CA(test$Class, predKNN)
|
||||
result.caKNN
|
||||
|
||||
# combine predictions into a data frame
|
||||
pred <- data.frame(predDT, predNB, predKNN)
|
||||
pred
|
||||
|
||||
# the class with the most votes wins
|
||||
voting <- function(predictions)
|
||||
{
|
||||
res <- vector()
|
||||
|
||||
for (i in 1 : nrow(predictions))
|
||||
{
|
||||
vec <- unlist(predictions[i,])
|
||||
res[i] <- names(which.max(table(vec)))
|
||||
}
|
||||
|
||||
factor(res, levels=levels(predictions[,1]))
|
||||
}
|
||||
|
||||
predicted <- voting(pred)
|
||||
result.voting <- CA(test$Class, predicted)
|
||||
result.voting
|
||||
|
||||
#
|
||||
# Weighted voting
|
||||
#
|
||||
|
||||
predDT.prob <- predict(modelDT, test, type="probability")
|
||||
predNB.prob <- predict(modelNB, test, type="probability")
|
||||
predKNN.prob <- predict(modelKNN, test, type="probability")
|
||||
|
||||
# combine predictions into a data frame
|
||||
pred.prob <- result.caDT * predDT.prob + result.caNB * predNB.prob + result.caKNN * predKNN.prob
|
||||
pred.prob
|
||||
|
||||
# We can visualize the joint output space!
|
||||
heatmap(pred.prob, col=c('red','green','blue'))
|
||||
legend(x="right", legend=c("min", "med", "max"),
|
||||
fill=c('red','green','blue'))
|
||||
|
||||
# pick the class with the highest score
|
||||
highest <- apply(pred.prob, 1, which.max)
|
||||
classes <- levels(learn$Class)
|
||||
predicted <- classes[highest]
|
||||
|
||||
result.wvoting <- CA(test$Class, predicted)
|
||||
result.wvoting
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Bagging
|
||||
#
|
||||
|
||||
#install.packages("ipred")
|
||||
library(ipred)
|
||||
|
||||
bag <- bagging(Class ~ ., learn, nbagg=14)
|
||||
bag.pred <- predict(bag, test, type="class")
|
||||
result.bagging <- CA(test$Class, bag.pred)
|
||||
result.bagging
|
||||
|
||||
|
||||
#
|
||||
# Random forest as a variation of bagging
|
||||
#
|
||||
|
||||
# install.packages("randomForest")
|
||||
library(randomForest)
|
||||
|
||||
rf <- randomForest(Class ~ ., learn)
|
||||
rf.predicted <- predict(rf, test, type = "class")
|
||||
result.rf <- CA(test$Class, rf.predicted)
|
||||
result.rf
|
||||
|
||||
|
||||
#
|
||||
# Boosting
|
||||
#
|
||||
|
||||
# install.packages("adabag")
|
||||
library(adabag)
|
||||
|
||||
bm <- boosting(Class ~ ., learn)
|
||||
predictions.bm <- predict(bm, test)
|
||||
names(predictions.bm)
|
||||
|
||||
predicted.bm <- predictions.bm$class
|
||||
result.boosting <- CA(test$Class, predicted.bm)
|
||||
result.boosting
|
||||
|
||||
#
|
||||
# Caret package
|
||||
#
|
||||
|
||||
# install.packages("caret")
|
||||
library(caret)
|
||||
caretModel <- train(Class ~ ., learn, method="xgbLinear", eta=1, verbose=1)
|
||||
pred <- predict(caretModel, test)
|
||||
result.xgb <- CA(test$Class, pred)
|
||||
result.xgb
|
||||
|
||||
# Let's visualize currently considered ensemble methods.
|
||||
performances <- c(result.rf,
|
||||
result.bagging,
|
||||
result.wvoting,
|
||||
result.boosting,
|
||||
result.caDT,
|
||||
result.caKNN,
|
||||
result.caNB,
|
||||
result.xgb)
|
||||
|
||||
algo.names <- c("RF","Bagging","Weighted vote","Boosting","DT","KNN","NB","XGB")
|
||||
ensemble.model <- as.factor(c(1,1,1,1,0,0,0,1))
|
||||
result.vec <- data.frame(performances, algo.names,ensemble.model)
|
||||
reordering <- order(result.vec$performances)
|
||||
result.vec <- result.vec[reordering,]
|
||||
rownames(result.vec) <- NULL
|
||||
result.vec
|
||||
|
||||
library(ggplot2)
|
||||
positions <- as.vector(result.vec$algo.names)
|
||||
ggplot(data=result.vec, aes(x=algo.names, y=performances, color = ensemble.model)) +
|
||||
geom_point(size = 10, shape = 4) +
|
||||
scale_x_discrete(limits = positions) +
|
||||
ylim(0.5,0.8) +
|
||||
xlab("Ensemble type") +
|
||||
ylab("Accuracy") +
|
||||
geom_hline(yintercept=max(performances), color = "darkgreen") +
|
||||
geom_hline(yintercept=min(performances), color = "black") +
|
||||
title("Performance comparison") +
|
||||
geom_text(label = positions,nudge_x = 0.2, nudge_y = -0.01) +
|
||||
theme_bw() +
|
||||
theme(axis.title.x=element_blank(),
|
||||
axis.text.x=element_blank(),
|
||||
axis.ticks.x=element_blank())
|
||||
|
||||
|
||||
#
|
||||
# Cross-validation
|
||||
#
|
||||
|
||||
# the library ipred is needed to perform cross-validation
|
||||
library(ipred)
|
||||
|
||||
# Tell the cross-validation which model to use
|
||||
mymodel.coremodel <- function(formula, data, target.model){CoreModel(formula, data, model=target.model)}
|
||||
# Tell the cross-validation how to obtain predictions
|
||||
mypredict.generic <- function(object, newdata){predict(object, newdata, type = "class")}
|
||||
# force the predict function to return class labels only and also destroy the internal representation of a given model
|
||||
mypredict.coremodel <- function(object, newdata) {pred <- predict(object, newdata)$class; destroyModels(object); pred}
|
||||
cvError <- errorest(Class~., data=learn, model = mymodel.coremodel, predict = mypredict.coremodel, target.model = "tree")
|
||||
1 - cvError$error
|
Loading…
Reference in New Issue