398 lines
9.7 KiB
R
398 lines
9.7 KiB
R
|
#
|
||
|
#
|
||
|
# INTRODUCTION TO CLASSIFICATION
|
||
|
#
|
||
|
#
|
||
|
|
||
|
#
|
||
|
# Work plan:
|
||
|
#
|
||
|
# - Read in the data
|
||
|
# - Split the data into training and test sets
|
||
|
# - Build a model using the training set
|
||
|
# - Evaluate the model using the test set
|
||
|
#
|
||
|
|
||
|
|
||
|
|
||
|
# Please download the data file "players.txt" into a local directory
|
||
|
# then set that directory as the current working directory of R.
|
||
|
# You can achive this using the "setwd" command or by selecting "File -> Change dir..."
|
||
|
|
||
|
|
||
|
# We are going to use the packages: rpart, and pROC.
|
||
|
# Make sure that the packages are installed.
|
||
|
#
|
||
|
# You install a package in R with the function install.packages():
|
||
|
#
|
||
|
# install.packages("pROC")
|
||
|
# library(pROC)
|
||
|
#
|
||
|
# To install packages without root access:
|
||
|
#
|
||
|
# install.packages("pROC", lib="/mylibs/Rpackages/")
|
||
|
# library(pROC, lib.loc="/mylibs/Rpackages/")
|
||
|
#
|
||
|
|
||
|
# Read in the dataset
|
||
|
players <- read.table("players.txt", header = T, sep = ",")
|
||
|
|
||
|
# Get the summary statistics of the data
|
||
|
summary(players)
|
||
|
|
||
|
|
||
|
|
||
|
############################################################################
|
||
|
#
|
||
|
# ATTENTION!!!!!!!!!
|
||
|
#
|
||
|
# The "id" attribute uniquely identifies a player within our data set.
|
||
|
# This attribute cannot be used to classify new players as they will
|
||
|
# have different id numbers.
|
||
|
#
|
||
|
# Before we continue, we have to remove the "id" attribute from our data set!
|
||
|
#
|
||
|
############################################################################
|
||
|
|
||
|
|
||
|
players$id <- NULL
|
||
|
names(players)
|
||
|
|
||
|
|
||
|
############################################################################
|
||
|
#
|
||
|
#
|
||
|
# Please make sure that the "id" attribute has been removed!!!!!!!!!!
|
||
|
#
|
||
|
#
|
||
|
############################################################################
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
############################################################################
|
||
|
#
|
||
|
# Example 1
|
||
|
#
|
||
|
# Prediction of playing position
|
||
|
#
|
||
|
############################################################################
|
||
|
|
||
|
|
||
|
#
|
||
|
# We want to build a model to predict a player's playing position
|
||
|
# with respect to the given player's statistics.
|
||
|
#
|
||
|
# The target variable "position" is discrete - we term this a classification task.
|
||
|
#
|
||
|
# We aim to verify whether or not it is possible to use historical data to predict
|
||
|
# playing positions for new players.
|
||
|
#
|
||
|
|
||
|
# We are going to split the data into a training and testing data set.
|
||
|
# The training data set consists of players that ended their careers before 1999.
|
||
|
# The test data set consists of players that began their careers after 1999.
|
||
|
|
||
|
learn <- players[players$lastseason <= 1999,]
|
||
|
test <- players[players$firstseason > 1999,]
|
||
|
|
||
|
# We used the "firstseason" and "lastseason" attributes to split the data.
|
||
|
# Therefore the attributes are not going to contribute to the modelling task, so
|
||
|
# we will remove them.
|
||
|
|
||
|
learn$firstseason <- NULL
|
||
|
learn$lastseason <- NULL
|
||
|
|
||
|
test$firstseason <- NULL
|
||
|
test$lastseason <- NULL
|
||
|
|
||
|
# We inspect the target values distribution in the defined data sets.
|
||
|
# (number of examples and frequency of individual classes)
|
||
|
|
||
|
nrow(learn)
|
||
|
table(learn$position)
|
||
|
|
||
|
nrow(test)
|
||
|
table(test$position)
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
#
|
||
|
#
|
||
|
# MAJORITY CLASSIFIER
|
||
|
#
|
||
|
#
|
||
|
|
||
|
|
||
|
# The majority class is the class with the highest number of training examples
|
||
|
which.max(table(learn$position))
|
||
|
|
||
|
majority.class <- names(which.max(table(learn$position)))
|
||
|
majority.class
|
||
|
|
||
|
# The majority classifier classifies all test instances into the majority class.
|
||
|
# The accuracy of the majority class
|
||
|
sum(test$position == majority.class) / length(test$position)
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
#
|
||
|
#
|
||
|
# DECISION TREES
|
||
|
#
|
||
|
#
|
||
|
|
||
|
|
||
|
# load the "rpart" package that implements decision trees in R
|
||
|
library(rpart)
|
||
|
|
||
|
# Fit a decision tree
|
||
|
dt <- rpart(position ~ ., data = learn)
|
||
|
|
||
|
#
|
||
|
# The first argument of the rpart function is a model formula that indicates
|
||
|
# the functional form of the model. The "~" symbol stands for "modeled as".
|
||
|
#
|
||
|
# The formula "position ~ . " states that we want a model that predicts the
|
||
|
# target attribute "position" using all other attributes present in the data
|
||
|
# (which is the meaning of the "." character).
|
||
|
#
|
||
|
# If we wanted, for example, a model for the "position" attribute as a function
|
||
|
# of the attributes "height", "width", and "pts", we should have
|
||
|
# indicated the formula as "position ~ height + weight + pts".
|
||
|
#
|
||
|
# The second argument of the rpart function provides the training examples.
|
||
|
#
|
||
|
|
||
|
|
||
|
# The content of the dt object
|
||
|
dt
|
||
|
|
||
|
# A graphical representation of our tree
|
||
|
plot(dt)
|
||
|
text(dt, pretty = 1)
|
||
|
|
||
|
# To navigate the tree, start from the root node. If the attribute of the
|
||
|
# observation satisfies the logical test, then navigate down the left branch.
|
||
|
# Otherwise navigate down the right branch. Continue until a leaf node is reached.
|
||
|
# A leaf node represents a class label (or class label distribution).
|
||
|
|
||
|
|
||
|
# Observed values in the test samples (the ground truth)
|
||
|
observed <- test$position
|
||
|
observed
|
||
|
|
||
|
|
||
|
# The "predict" function returns a vector of predicted responses from a fitted tree.
|
||
|
# The "type" argument denotes the form of predicted value returned. The setting
|
||
|
# type = "class" will cause the response in a form of a vector of predicted
|
||
|
# class labels.
|
||
|
|
||
|
predicted <- predict(dt, test, type = "class")
|
||
|
predicted
|
||
|
|
||
|
# confusion matrix
|
||
|
t <- table(observed, predicted)
|
||
|
t
|
||
|
|
||
|
# The diagonal elements in the confusion matrix represent the number of correctly
|
||
|
# classified test instances
|
||
|
|
||
|
# The classification accuracy is the fraction of correctly classified test instances
|
||
|
sum(diag(t)) / sum(t)
|
||
|
|
||
|
|
||
|
# Let's write a function that calculates the classification accuracy
|
||
|
CA <- function(observed, predicted)
|
||
|
{
|
||
|
t <- table(observed, predicted)
|
||
|
|
||
|
sum(diag(t)) / sum(t)
|
||
|
}
|
||
|
|
||
|
# Function call
|
||
|
CA(observed, predicted)
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# The setting type = "prob" will cause the predict function to
|
||
|
# provide its answer in the form of a matrix with class probabilities.
|
||
|
|
||
|
predMat <- predict(dt, test, type = "prob")
|
||
|
predMat
|
||
|
|
||
|
# A matrix of the observed class probabilities (the ground truth)
|
||
|
# (the real class has the probability of 1, others have 0)
|
||
|
obsMat <- model.matrix( ~ position-1, test)
|
||
|
obsMat
|
||
|
|
||
|
|
||
|
# IMPORTANT: check whether the columns refer to the same attributes
|
||
|
# (if not, change the order of columns)
|
||
|
colnames(predMat)
|
||
|
colnames(obsMat)
|
||
|
|
||
|
|
||
|
# The Brier score
|
||
|
brier.score <- function(observedMatrix, predictedMatrix)
|
||
|
{
|
||
|
sum((observedMatrix - predictedMatrix) ^ 2) / nrow(predictedMatrix)
|
||
|
}
|
||
|
|
||
|
brier.score(obsMat, predMat)
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
############################################################################
|
||
|
#
|
||
|
# Example 2
|
||
|
#
|
||
|
# Does a player make at least 80% of free-throws attempted?
|
||
|
#
|
||
|
############################################################################
|
||
|
|
||
|
#
|
||
|
# It is a binary problem (the target variable is discrete with values YES and NO).
|
||
|
#
|
||
|
|
||
|
#
|
||
|
# We are going to create a data set using the players data.
|
||
|
#
|
||
|
|
||
|
# Get the players who have attempted at least one free throw during the career.
|
||
|
bin.players <- players[players$fta > 0,]
|
||
|
|
||
|
# Free-throw success rate
|
||
|
rate <- bin.players$ftm / bin.players$fta
|
||
|
|
||
|
# Create a discrete attribute "ftexpert" that will be our target variable.
|
||
|
ftexpert <- vector()
|
||
|
ftexpert[rate >= 0.8] <- "YES"
|
||
|
ftexpert[rate < 0.8] <- "NO"
|
||
|
bin.players$ftexpert <- as.factor(ftexpert)
|
||
|
|
||
|
# Remove the "fta" and "ftm" attributes.
|
||
|
bin.players$fta <- NULL
|
||
|
bin.players$ftm <- NULL
|
||
|
|
||
|
summary(bin.players)
|
||
|
|
||
|
|
||
|
|
||
|
# Split the data into two disjoint sets, one for training and one for testing.
|
||
|
bin.learn <- bin.players[1:1500,]
|
||
|
bin.test <- bin.players[-(1:1500),]
|
||
|
|
||
|
# Inspect the distribution of the class values in the data sets.
|
||
|
table(bin.learn$ftexpert)
|
||
|
table(bin.test$ftexpert)
|
||
|
|
||
|
|
||
|
# Train a decision tree
|
||
|
dt2 <- rpart(ftexpert ~ ., data = bin.learn)
|
||
|
plot(dt2)
|
||
|
text(dt2, pretty = 1)
|
||
|
|
||
|
|
||
|
bin.observed <- bin.test$ftexpert
|
||
|
bin.predicted <- predict(dt2, bin.test, type="class")
|
||
|
|
||
|
|
||
|
table(bin.observed, bin.predicted)
|
||
|
|
||
|
|
||
|
# Classification accuracy of the decision tree
|
||
|
CA(bin.observed, bin.predicted)
|
||
|
|
||
|
# Exercise -> majority is important!
|
||
|
table(bin.observed)
|
||
|
sum(bin.observed == "NO") / length(bin.observed)
|
||
|
|
||
|
|
||
|
# The sensitivity of a model (recall)
|
||
|
Sensitivity <- function(observed, predicted, pos.class)
|
||
|
{
|
||
|
t <- table(observed, predicted)
|
||
|
|
||
|
t[pos.class, pos.class] / sum(t[pos.class,])
|
||
|
}
|
||
|
|
||
|
# The specificity of a model
|
||
|
Specificity <- function(observed, predicted, pos.class)
|
||
|
{
|
||
|
t <- table(observed, predicted)
|
||
|
|
||
|
# identify the negative class name
|
||
|
neg.class <- which(row.names(t) != pos.class)
|
||
|
|
||
|
t[neg.class, neg.class] / sum(t[neg.class,])
|
||
|
}
|
||
|
|
||
|
# let's say that the value "YES" is our positive class...
|
||
|
|
||
|
Sensitivity(bin.observed, bin.predicted, "YES")
|
||
|
Specificity(bin.observed, bin.predicted, "YES")
|
||
|
|
||
|
#
|
||
|
# ROC curve
|
||
|
#
|
||
|
|
||
|
library(pROC)
|
||
|
library(ggplot2)
|
||
|
library(dplyr)
|
||
|
library(ggrepel) # For nicer ROC visualization
|
||
|
|
||
|
bin.predMat <- predict(dt2, bin.test, type = "prob")
|
||
|
|
||
|
# Remember, we treat the value "YES" as the positive class
|
||
|
rocobj <- roc(bin.observed, bin.predMat[,"YES"])
|
||
|
plot(rocobj)
|
||
|
|
||
|
# This part below is optional -> nicer ROC
|
||
|
tmpDf <- data.frame(rocobj$sensitivities, 1-rocobj$specificities, rocobj$thresholds)
|
||
|
colnames(tmpDf) <- c("Sensitivity","Specificity", "Thresholds")
|
||
|
|
||
|
tmpDf %>% ggplot(aes(x=Specificity,y=Sensitivity,color=Thresholds))+
|
||
|
geom_point()+
|
||
|
geom_line()+
|
||
|
geom_abline(intercept = 0, slope = 1, alpha=0.2)+
|
||
|
theme_classic()+
|
||
|
ylab("Sensitivity (TPR)")+
|
||
|
xlab("1-Specificity (FPR)")+
|
||
|
geom_label_repel(aes(label = round(Thresholds,2)),
|
||
|
box.padding = 0.35,
|
||
|
point.padding = 0.5,
|
||
|
segment.color = 'grey50')
|
||
|
|
||
|
names(rocobj)
|
||
|
|
||
|
cutoffs <- rocobj$thresholds
|
||
|
cutoffs
|
||
|
|
||
|
# identify the best cut-off value (for example, by minimazing the distance from the point (0,1))
|
||
|
tp = rocobj$sensitivities
|
||
|
fp = 1 - rocobj$specificities
|
||
|
|
||
|
dist <- (1-tp)^2 + fp^2
|
||
|
dist
|
||
|
which.min(dist)
|
||
|
|
||
|
best.cutoff <- cutoffs[which.min(dist)]
|
||
|
best.cutoff
|
||
|
|
||
|
# the selected cut-off value has impact on the model's performance
|
||
|
predicted.label <- factor(ifelse(bin.predMat[,"YES"] >= best.cutoff, "YES", "NO"))
|
||
|
|
||
|
table(bin.observed, predicted.label)
|
||
|
CA(bin.observed, predicted.label)
|
||
|
Sensitivity(bin.observed, predicted.label, "YES")
|
||
|
Specificity(bin.observed, predicted.label, "YES")
|
||
|
|
||
|
|