is-vaje/v5/lab4_code.R

#
#
# INTRODUCTION TO CLASSIFICATION
#
#

#
# Work plan:
#
# - Read in the data
# - Split the data into training and test sets
# - Build a model using the training set
# - Evaluate the model using the test set
#


# Please download the data file "players.txt" into a local directory
# then set that directory as the current working directory of R.
# You can achive this using the "setwd" command or by selecting "File -> Change dir..."


# We are going to use the packages: rpart, and pROC. 
# Make sure that the packages are installed.
#
# You install a package in R with the function install.packages():
#
#     install.packages("pROC")
#     library(pROC)
#
# To install packages without root access:
#
#     install.packages("pROC", lib="/mylibs/Rpackages/")
#     library(pROC, lib.loc="/mylibs/Rpackages/")
#

# Read in the dataset 
players <- read.table("players.txt", header = T, sep = ",")

# Get the summary statistics of the data
summary(players)


############################################################################
#
# ATTENTION!!!!!!!!!
#
# The "id" attribute uniquely identifies a player within our data set. 
# This attribute cannot be used to classify new players as they will 
# have different id numbers.
# 
# Before we continue, we have to remove the "id" attribute from our data set!
#
############################################################################


players$id <- NULL
names(players)


############################################################################
#
#
# Please make sure that the "id" attribute has been removed!!!!!!!!!!
#
#
############################################################################


############################################################################
#
# Example 1
#
# Prediction of playing position
#
############################################################################


#
# We want to build a model to predict a player's playing position 
# with respect to the given player's statistics.
#
# The target variable "position" is discrete - we term this a classification task.   
#
# We aim to verify whether or not it is possible to use historical data to predict
# playing positions for new players. 
#

# We are going to split the data into a training and testing data set.
# The training data set consists of players that ended their careers before 1999.
# The test data set consists of players that began their careers after 1999.

learn <- players[players$lastseason <= 1999,]
test <- players[players$firstseason > 1999,]

# We used the "firstseason" and "lastseason" attributes to split the data.
# Therefore the attributes are not going to contribute to the modelling task, so
# we will remove them.

learn$firstseason <- NULL
learn$lastseason <- NULL

test$firstseason <- NULL
test$lastseason <- NULL

# We inspect the target values distribution in the defined data sets.
# (number of examples and frequency of individual classes)

nrow(learn)
table(learn$position)

nrow(test)
table(test$position)


#
#
# MAJORITY CLASSIFIER
#
#


# The majority class is the class with the highest number of training examples
which.max(table(learn$position))

majority.class <- names(which.max(table(learn$position)))
majority.class

# The majority classifier classifies all test instances into the majority class.
# The accuracy of the majority class
sum(test$position == majority.class) / length(test$position)


#
#
# DECISION TREES
#
#


# load the "rpart" package that implements decision trees in R
library(rpart)

# Fit a decision tree
dt <- rpart(position ~ ., data = learn)

#
# The first argument of the rpart function is a model formula that indicates 
# the functional form of the model. The "~" symbol stands for "modeled as".
#
# The formula "position ~ . " states that we want a model that predicts the 
# target attribute "position" using all other attributes present in the data 
# (which is the meaning of the "." character).
# 
# If we wanted, for example, a model for the "position" attribute as a function 
# of the attributes "height", "width", and "pts", we should have 
# indicated the formula as "position ~ height + weight + pts".  
# 
# The second argument of the rpart function provides the training examples.
#


# The content of the dt object 
dt

# A graphical representation of our tree
plot(dt)
text(dt, pretty = 1)

# To navigate the tree, start from the root node. If the attribute of the 
# observation satisfies the logical test, then navigate down the left branch.
# Otherwise navigate down the right branch. Continue until a leaf node is reached.
# A leaf node represents a class label (or class label distribution). 


# Observed values in the test samples (the ground truth)
observed <- test$position
observed


# The "predict" function returns a vector of predicted responses from a fitted tree.
# The "type" argument denotes the form of predicted value returned. The setting
# type = "class" will cause the response in a form of a vector of predicted 
# class labels.

predicted <- predict(dt, test, type = "class")
predicted

# confusion matrix
t <- table(observed, predicted)
t

# The diagonal elements in the confusion matrix represent the number of correctly 
# classified test instances

# The classification accuracy is the fraction of correctly classified test instances
sum(diag(t)) / sum(t)


# Let's write a function that calculates the classification accuracy
CA <- function(observed, predicted)
{
  t <- table(observed, predicted)
  
  sum(diag(t)) / sum(t)
}

# Function call
CA(observed, predicted)


# The setting type = "prob" will cause the predict function to 
# provide its answer in the form of a matrix with class probabilities.

predMat <- predict(dt, test, type = "prob")
predMat

# A matrix of the observed class probabilities (the ground truth) 
# (the real class has the probability of 1, others have 0)
obsMat <- model.matrix( ~ position-1, test)
obsMat


# IMPORTANT: check whether the columns refer to the same attributes
# (if not, change the order of columns)
colnames(predMat)
colnames(obsMat)


# The Brier score
brier.score <- function(observedMatrix, predictedMatrix)
{
  sum((observedMatrix - predictedMatrix) ^ 2) / nrow(predictedMatrix)
}

brier.score(obsMat, predMat)


############################################################################
#
# Example 2
#
# Does a player make at least 80% of free-throws attempted?
#
############################################################################

#
# It is a binary problem (the target variable is discrete with values YES and NO). 
#

#
# We are going to create a data set using the players data.
#

# Get the players who have attempted at least one free throw during the career.
bin.players <- players[players$fta > 0,]

# Free-throw success rate
rate <- bin.players$ftm / bin.players$fta

# Create a discrete attribute "ftexpert" that will be our target variable.
ftexpert <- vector()
ftexpert[rate >= 0.8] <- "YES"
ftexpert[rate < 0.8] <- "NO"
bin.players$ftexpert <- as.factor(ftexpert)

# Remove the "fta" and "ftm" attributes.
bin.players$fta <- NULL
bin.players$ftm <- NULL

summary(bin.players)


# Split the data into two disjoint sets, one for training and one for testing.
bin.learn <- bin.players[1:1500,]
bin.test <- bin.players[-(1:1500),]

# Inspect the distribution of the class values in the data sets.
table(bin.learn$ftexpert)
table(bin.test$ftexpert)


# Train a decision tree
dt2 <- rpart(ftexpert ~ ., data = bin.learn)
plot(dt2)
text(dt2, pretty = 1)


bin.observed <- bin.test$ftexpert
bin.predicted <- predict(dt2, bin.test, type="class")


table(bin.observed, bin.predicted)


# Classification accuracy of the decision tree
CA(bin.observed, bin.predicted)

# Exercise -> majority is important!
table(bin.observed)
sum(bin.observed == "NO") / length(bin.observed)


# The sensitivity of a model (recall)
Sensitivity <- function(observed, predicted, pos.class)
{
  t <- table(observed, predicted)
  
  t[pos.class, pos.class] / sum(t[pos.class,])
}

# The specificity of a model
Specificity <- function(observed, predicted, pos.class)
{
  t <- table(observed, predicted)
  
  # identify the negative class name
  neg.class <- which(row.names(t) != pos.class)

  t[neg.class, neg.class] / sum(t[neg.class,])
}

# let's say that the value "YES" is our positive class...

Sensitivity(bin.observed, bin.predicted, "YES")
Specificity(bin.observed, bin.predicted, "YES")

#
# ROC curve
#

library(pROC)
library(ggplot2)
library(dplyr)
library(ggrepel) # For nicer ROC visualization

bin.predMat <- predict(dt2, bin.test, type = "prob")

# Remember, we treat the value "YES" as the positive class
rocobj <- roc(bin.observed, bin.predMat[,"YES"])
plot(rocobj)

# This part below is optional -> nicer ROC
tmpDf <- data.frame(rocobj$sensitivities, 1-rocobj$specificities, rocobj$thresholds)
colnames(tmpDf) <- c("Sensitivity","Specificity", "Thresholds")

tmpDf %>% ggplot(aes(x=Specificity,y=Sensitivity,color=Thresholds))+
  geom_point()+
  geom_line()+
  geom_abline(intercept = 0, slope = 1, alpha=0.2)+
  theme_classic()+
  ylab("Sensitivity (TPR)")+
  xlab("1-Specificity (FPR)")+
  geom_label_repel(aes(label = round(Thresholds,2)),
                   box.padding   = 0.35, 
                   point.padding = 0.5,
                   segment.color = 'grey50')

names(rocobj)

cutoffs <- rocobj$thresholds
cutoffs

# identify the best cut-off value (for example, by minimazing the distance from the point (0,1))
tp = rocobj$sensitivities
fp = 1 - rocobj$specificities

dist <- (1-tp)^2 + fp^2
dist
which.min(dist)

best.cutoff <- cutoffs[which.min(dist)]
best.cutoff

# the selected cut-off value has impact on the model's performance
predicted.label <- factor(ifelse(bin.predMat[,"YES"] >= best.cutoff, "YES", "NO"))

table(bin.observed, predicted.label)
CA(bin.observed, predicted.label)
Sensitivity(bin.observed, predicted.label, "YES")
Specificity(bin.observed, predicted.label, "YES")
Push vaje 2022-12-18 15:30:24 +01:00			`#`
			`#`
			`# INTRODUCTION TO CLASSIFICATION`
			`#`
			`#`

			`#`
			`# Work plan:`
			`#`
			`# - Read in the data`
			`# - Split the data into training and test sets`
			`# - Build a model using the training set`
			`# - Evaluate the model using the test set`
			`#`



			`# Please download the data file "players.txt" into a local directory`
			`# then set that directory as the current working directory of R.`
			`# You can achive this using the "setwd" command or by selecting "File -> Change dir..."`


			`# We are going to use the packages: rpart, and pROC.`
			`# Make sure that the packages are installed.`
			`#`
			`# You install a package in R with the function install.packages():`
			`#`
			`# install.packages("pROC")`
			`# library(pROC)`
			`#`
			`# To install packages without root access:`
			`#`
			`# install.packages("pROC", lib="/mylibs/Rpackages/")`
			`# library(pROC, lib.loc="/mylibs/Rpackages/")`
			`#`

			`# Read in the dataset`
			`players <- read.table("players.txt", header = T, sep = ",")`

			`# Get the summary statistics of the data`
			`summary(players)`



			`############################################################################`
			`#`
			`# ATTENTION!!!!!!!!!`
			`#`
			`# The "id" attribute uniquely identifies a player within our data set.`
			`# This attribute cannot be used to classify new players as they will`
			`# have different id numbers.`
			`#`
			`# Before we continue, we have to remove the "id" attribute from our data set!`
			`#`
			`############################################################################`


			`players$id <- NULL`
			`names(players)`


			`############################################################################`
			`#`
			`#`
			`# Please make sure that the "id" attribute has been removed!!!!!!!!!!`
			`#`
			`#`
			`############################################################################`






			`############################################################################`
			`#`
			`# Example 1`
			`#`
			`# Prediction of playing position`
			`#`
			`############################################################################`


			`#`
			`# We want to build a model to predict a player's playing position`
			`# with respect to the given player's statistics.`
			`#`
			`# The target variable "position" is discrete - we term this a classification task.`
			`#`
			`# We aim to verify whether or not it is possible to use historical data to predict`
			`# playing positions for new players.`
			`#`

			`# We are going to split the data into a training and testing data set.`
			`# The training data set consists of players that ended their careers before 1999.`
			`# The test data set consists of players that began their careers after 1999.`

			`learn <- players[players$lastseason <= 1999,]`
			`test <- players[players$firstseason > 1999,]`

			`# We used the "firstseason" and "lastseason" attributes to split the data.`
			`# Therefore the attributes are not going to contribute to the modelling task, so`
			`# we will remove them.`

			`learn$firstseason <- NULL`
			`learn$lastseason <- NULL`

			`test$firstseason <- NULL`
			`test$lastseason <- NULL`

			`# We inspect the target values distribution in the defined data sets.`
			`# (number of examples and frequency of individual classes)`

			`nrow(learn)`
			`table(learn$position)`

			`nrow(test)`
			`table(test$position)`




			`#`
			`#`
			`# MAJORITY CLASSIFIER`
			`#`
			`#`


			`# The majority class is the class with the highest number of training examples`
			`which.max(table(learn$position))`

			`majority.class <- names(which.max(table(learn$position)))`
			`majority.class`

			`# The majority classifier classifies all test instances into the majority class.`
			`# The accuracy of the majority class`
			`sum(test$position == majority.class) / length(test$position)`




			`#`
			`#`
			`# DECISION TREES`
			`#`
			`#`


			`# load the "rpart" package that implements decision trees in R`
			`library(rpart)`

			`# Fit a decision tree`
			`dt <- rpart(position ~ ., data = learn)`

			`#`
			`# The first argument of the rpart function is a model formula that indicates`
			`# the functional form of the model. The "~" symbol stands for "modeled as".`
			`#`
			`# The formula "position ~ . " states that we want a model that predicts the`
			`# target attribute "position" using all other attributes present in the data`
			`# (which is the meaning of the "." character).`
			`#`
			`# If we wanted, for example, a model for the "position" attribute as a function`
			`# of the attributes "height", "width", and "pts", we should have`
			`# indicated the formula as "position ~ height + weight + pts".`
			`#`
			`# The second argument of the rpart function provides the training examples.`
			`#`


			`# The content of the dt object`
			`dt`

			`# A graphical representation of our tree`
			`plot(dt)`
			`text(dt, pretty = 1)`

			`# To navigate the tree, start from the root node. If the attribute of the`
			`# observation satisfies the logical test, then navigate down the left branch.`
			`# Otherwise navigate down the right branch. Continue until a leaf node is reached.`
			`# A leaf node represents a class label (or class label distribution).`


			`# Observed values in the test samples (the ground truth)`
			`observed <- test$position`
			`observed`


			`# The "predict" function returns a vector of predicted responses from a fitted tree.`
			`# The "type" argument denotes the form of predicted value returned. The setting`
			`# type = "class" will cause the response in a form of a vector of predicted`
			`# class labels.`

			`predicted <- predict(dt, test, type = "class")`
			`predicted`

			`# confusion matrix`
			`t <- table(observed, predicted)`
			`t`

			`# The diagonal elements in the confusion matrix represent the number of correctly`
			`# classified test instances`

			`# The classification accuracy is the fraction of correctly classified test instances`
			`sum(diag(t)) / sum(t)`


			`# Let's write a function that calculates the classification accuracy`
			`CA <- function(observed, predicted)`
			`{`
			`t <- table(observed, predicted)`

			`sum(diag(t)) / sum(t)`
			`}`

			`# Function call`
			`CA(observed, predicted)`




			`# The setting type = "prob" will cause the predict function to`
			`# provide its answer in the form of a matrix with class probabilities.`

			`predMat <- predict(dt, test, type = "prob")`
			`predMat`

			`# A matrix of the observed class probabilities (the ground truth)`
			`# (the real class has the probability of 1, others have 0)`
			`obsMat <- model.matrix( ~ position-1, test)`
			`obsMat`


			`# IMPORTANT: check whether the columns refer to the same attributes`
			`# (if not, change the order of columns)`
			`colnames(predMat)`
			`colnames(obsMat)`


			`# The Brier score`
			`brier.score <- function(observedMatrix, predictedMatrix)`
			`{`
			`sum((observedMatrix - predictedMatrix) ^ 2) / nrow(predictedMatrix)`
			`}`

			`brier.score(obsMat, predMat)`




			`############################################################################`
			`#`
			`# Example 2`
			`#`
			`# Does a player make at least 80% of free-throws attempted?`
			`#`
			`############################################################################`

			`#`
			`# It is a binary problem (the target variable is discrete with values YES and NO).`
			`#`

			`#`
			`# We are going to create a data set using the players data.`
			`#`

			`# Get the players who have attempted at least one free throw during the career.`
			`bin.players <- players[players$fta > 0,]`

			`# Free-throw success rate`
			`rate <- bin.players$ftm / bin.players$fta`

			`# Create a discrete attribute "ftexpert" that will be our target variable.`
			`ftexpert <- vector()`
			`ftexpert[rate >= 0.8] <- "YES"`
			`ftexpert[rate < 0.8] <- "NO"`
			`bin.players$ftexpert <- as.factor(ftexpert)`

			`# Remove the "fta" and "ftm" attributes.`
			`bin.players$fta <- NULL`
			`bin.players$ftm <- NULL`

			`summary(bin.players)`



			`# Split the data into two disjoint sets, one for training and one for testing.`
			`bin.learn <- bin.players[1:1500,]`
			`bin.test <- bin.players[-(1:1500),]`

			`# Inspect the distribution of the class values in the data sets.`
			`table(bin.learn$ftexpert)`
			`table(bin.test$ftexpert)`


			`# Train a decision tree`
			`dt2 <- rpart(ftexpert ~ ., data = bin.learn)`
			`plot(dt2)`
			`text(dt2, pretty = 1)`


			`bin.observed <- bin.test$ftexpert`
			`bin.predicted <- predict(dt2, bin.test, type="class")`


			`table(bin.observed, bin.predicted)`


			`# Classification accuracy of the decision tree`
			`CA(bin.observed, bin.predicted)`

			`# Exercise -> majority is important!`
			`table(bin.observed)`
			`sum(bin.observed == "NO") / length(bin.observed)`


			`# The sensitivity of a model (recall)`
			`Sensitivity <- function(observed, predicted, pos.class)`
			`{`
			`t <- table(observed, predicted)`

			`t[pos.class, pos.class] / sum(t[pos.class,])`
			`}`

			`# The specificity of a model`
			`Specificity <- function(observed, predicted, pos.class)`
			`{`
			`t <- table(observed, predicted)`

			`# identify the negative class name`
			`neg.class <- which(row.names(t) != pos.class)`

			`t[neg.class, neg.class] / sum(t[neg.class,])`
			`}`

			`# let's say that the value "YES" is our positive class...`

			`Sensitivity(bin.observed, bin.predicted, "YES")`
			`Specificity(bin.observed, bin.predicted, "YES")`

			`#`
			`# ROC curve`
			`#`

			`library(pROC)`
			`library(ggplot2)`
			`library(dplyr)`
			`library(ggrepel) # For nicer ROC visualization`

			`bin.predMat <- predict(dt2, bin.test, type = "prob")`

			`# Remember, we treat the value "YES" as the positive class`
			`rocobj <- roc(bin.observed, bin.predMat[,"YES"])`
			`plot(rocobj)`

			`# This part below is optional -> nicer ROC`
			`tmpDf <- data.frame(rocobj$sensitivities, 1-rocobj$specificities, rocobj$thresholds)`
			`colnames(tmpDf) <- c("Sensitivity","Specificity", "Thresholds")`

			`tmpDf %>% ggplot(aes(x=Specificity,y=Sensitivity,color=Thresholds))+`
			`geom_point()+`
			`geom_line()+`
			`geom_abline(intercept = 0, slope = 1, alpha=0.2)+`
			`theme_classic()+`
			`ylab("Sensitivity (TPR)")+`
			`xlab("1-Specificity (FPR)")+`
			`geom_label_repel(aes(label = round(Thresholds,2)),`
			`box.padding = 0.35,`
			`point.padding = 0.5,`
			`segment.color = 'grey50')`

			`names(rocobj)`

			`cutoffs <- rocobj$thresholds`
			`cutoffs`

			`# identify the best cut-off value (for example, by minimazing the distance from the point (0,1))`
			`tp = rocobj$sensitivities`
			`fp = 1 - rocobj$specificities`

			`dist <- (1-tp)^2 + fp^2`
			`dist`
			`which.min(dist)`

			`best.cutoff <- cutoffs[which.min(dist)]`
			`best.cutoff`

			`# the selected cut-off value has impact on the model's performance`
			`predicted.label <- factor(ifelse(bin.predMat[,"YES"] >= best.cutoff, "YES", "NO"))`

			`table(bin.observed, predicted.label)`
			`CA(bin.observed, predicted.label)`
			`Sensitivity(bin.observed, predicted.label, "YES")`
			`Specificity(bin.observed, predicted.label, "YES")`