Push vaje

2022-12-18 09:30:24 -05:00 · 2022-12-18 09:30:24 -05:00 · 15f1e5b996
parent 7d334030e2
commit 15f1e5b996
4 changed files with 1230 additions and 0 deletions
--- a/v4/lab4_code.R
+++ b/v4/lab4_code.R
@ -0,0 +1,397 @@
+#
+#
+# INTRODUCTION TO CLASSIFICATION
+#
+#
+
+#
+# Work plan:
+#
+# - Read in the data
+# - Split the data into training and test sets
+# - Build a model using the training set
+# - Evaluate the model using the test set
+#
+
+
+
+# Please download the data file "players.txt" into a local directory
+# then set that directory as the current working directory of R.
+# You can achive this using the "setwd" command or by selecting "File -> Change dir..."
+
+
+# We are going to use the packages: rpart, and pROC. 
+# Make sure that the packages are installed.
+#
+# You install a package in R with the function install.packages():
+#
+#     install.packages("pROC")
+#     library(pROC)
+#
+# To install packages without root access:
+#
+#     install.packages("pROC", lib="/mylibs/Rpackages/")
+#     library(pROC, lib.loc="/mylibs/Rpackages/")
+#
+
+# Read in the dataset 
+players <- read.table("players.txt", header = T, sep = ",")
+
+# Get the summary statistics of the data
+summary(players)
+
+
+
+############################################################################
+#
+# ATTENTION!!!!!!!!!
+#
+# The "id" attribute uniquely identifies a player within our data set. 
+# This attribute cannot be used to classify new players as they will 
+# have different id numbers.
+# 
+# Before we continue, we have to remove the "id" attribute from our data set!
+#
+############################################################################
+
+
+players$id <- NULL
+names(players)
+
+
+############################################################################
+#
+#
+# Please make sure that the "id" attribute has been removed!!!!!!!!!!
+#
+#
+############################################################################
+
+
+
+
+
+
+############################################################################
+#
+# Example 1
+#
+# Prediction of playing position
+#
+############################################################################
+
+
+#
+# We want to build a model to predict a player's playing position 
+# with respect to the given player's statistics.
+#
+# The target variable "position" is discrete - we term this a classification task.   
+#
+# We aim to verify whether or not it is possible to use historical data to predict
+# playing positions for new players. 
+#
+
+# We are going to split the data into a training and testing data set.
+# The training data set consists of players that ended their careers before 1999.
+# The test data set consists of players that began their careers after 1999.
+
+learn <- players[players$lastseason <= 1999,]
+test <- players[players$firstseason > 1999,]
+
+# We used the "firstseason" and "lastseason" attributes to split the data.
+# Therefore the attributes are not going to contribute to the modelling task, so
+# we will remove them.
+
+learn$firstseason <- NULL
+learn$lastseason <- NULL
+
+test$firstseason <- NULL
+test$lastseason <- NULL
+
+# We inspect the target values distribution in the defined data sets.
+# (number of examples and frequency of individual classes)
+
+nrow(learn)
+table(learn$position)
+
+nrow(test)
+table(test$position)
+
+
+
+
+#
+#
+# MAJORITY CLASSIFIER
+#
+#
+
+
+# The majority class is the class with the highest number of training examples
+which.max(table(learn$position))
+
+majority.class <- names(which.max(table(learn$position)))
+majority.class
+
+# The majority classifier classifies all test instances into the majority class.
+# The accuracy of the majority class
+sum(test$position == majority.class) / length(test$position)
+
+
+
+
+#
+#
+# DECISION TREES
+#
+#
+
+
+# load the "rpart" package that implements decision trees in R
+library(rpart)
+
+# Fit a decision tree
+dt <- rpart(position ~ ., data = learn)
+
+#
+# The first argument of the rpart function is a model formula that indicates 
+# the functional form of the model. The "~" symbol stands for "modeled as".
+#
+# The formula "position ~ . " states that we want a model that predicts the 
+# target attribute "position" using all other attributes present in the data 
+# (which is the meaning of the "." character).
+# 
+# If we wanted, for example, a model for the "position" attribute as a function 
+# of the attributes "height", "width", and "pts", we should have 
+# indicated the formula as "position ~ height + weight + pts".  
+# 
+# The second argument of the rpart function provides the training examples.
+#
+
+
+# The content of the dt object 
+dt
+
+# A graphical representation of our tree
+plot(dt)
+text(dt, pretty = 1)
+
+# To navigate the tree, start from the root node. If the attribute of the 
+# observation satisfies the logical test, then navigate down the left branch.
+# Otherwise navigate down the right branch. Continue until a leaf node is reached.
+# A leaf node represents a class label (or class label distribution). 
+
+
+# Observed values in the test samples (the ground truth)
+observed <- test$position
+observed
+
+
+# The "predict" function returns a vector of predicted responses from a fitted tree.
+# The "type" argument denotes the form of predicted value returned. The setting
+# type = "class" will cause the response in a form of a vector of predicted 
+# class labels.
+
+predicted <- predict(dt, test, type = "class")
+predicted
+
+# confusion matrix
+t <- table(observed, predicted)
+t
+
+# The diagonal elements in the confusion matrix represent the number of correctly 
+# classified test instances
+
+# The classification accuracy is the fraction of correctly classified test instances
+sum(diag(t)) / sum(t)
+
+
+# Let's write a function that calculates the classification accuracy
+CA <- function(observed, predicted)
+{
+  t <- table(observed, predicted)
+  
+  sum(diag(t)) / sum(t)
+}
+
+# Function call
+CA(observed, predicted)
+
+
+
+
+# The setting type = "prob" will cause the predict function to 
+# provide its answer in the form of a matrix with class probabilities.
+
+predMat <- predict(dt, test, type = "prob")
+predMat
+
+# A matrix of the observed class probabilities (the ground truth) 
+# (the real class has the probability of 1, others have 0)
+obsMat <- model.matrix( ~ position-1, test)
+obsMat
+
+
+# IMPORTANT: check whether the columns refer to the same attributes
+# (if not, change the order of columns)
+colnames(predMat)
+colnames(obsMat)
+
+
+# The Brier score
+brier.score <- function(observedMatrix, predictedMatrix)
+{
+  sum((observedMatrix - predictedMatrix) ^ 2) / nrow(predictedMatrix)
+}
+
+brier.score(obsMat, predMat)
+
+
+
+
+############################################################################
+#
+# Example 2
+#
+# Does a player make at least 80% of free-throws attempted?
+#
+############################################################################
+
+#
+# It is a binary problem (the target variable is discrete with values YES and NO). 
+#
+
+#
+# We are going to create a data set using the players data.
+#
+
+# Get the players who have attempted at least one free throw during the career.
+bin.players <- players[players$fta > 0,]
+
+# Free-throw success rate
+rate <- bin.players$ftm / bin.players$fta
+
+# Create a discrete attribute "ftexpert" that will be our target variable.
+ftexpert <- vector()
+ftexpert[rate >= 0.8] <- "YES"
+ftexpert[rate < 0.8] <- "NO"
+bin.players$ftexpert <- as.factor(ftexpert)
+
+# Remove the "fta" and "ftm" attributes.
+bin.players$fta <- NULL
+bin.players$ftm <- NULL
+
+summary(bin.players)
+
+
+
+# Split the data into two disjoint sets, one for training and one for testing.
+bin.learn <- bin.players[1:1500,]
+bin.test <- bin.players[-(1:1500),]
+
+# Inspect the distribution of the class values in the data sets.
+table(bin.learn$ftexpert)
+table(bin.test$ftexpert)
+
+
+# Train a decision tree
+dt2 <- rpart(ftexpert ~ ., data = bin.learn)
+plot(dt2)
+text(dt2, pretty = 1)
+
+
+bin.observed <- bin.test$ftexpert
+bin.predicted <- predict(dt2, bin.test, type="class")
+
+
+table(bin.observed, bin.predicted)
+
+
+# Classification accuracy of the decision tree
+CA(bin.observed, bin.predicted)
+
+# Exercise -> majority is important!
+table(bin.observed)
+sum(bin.observed == "NO") / length(bin.observed)
+
+
+# The sensitivity of a model (recall)
+Sensitivity <- function(observed, predicted, pos.class)
+{
+  t <- table(observed, predicted)
+  
+  t[pos.class, pos.class] / sum(t[pos.class,])
+}
+
+# The specificity of a model
+Specificity <- function(observed, predicted, pos.class)
+{
+  t <- table(observed, predicted)
+  
+  # identify the negative class name
+  neg.class <- which(row.names(t) != pos.class)
+
+  t[neg.class, neg.class] / sum(t[neg.class,])
+}
+
+# let's say that the value "YES" is our positive class...
+
+Sensitivity(bin.observed, bin.predicted, "YES")
+Specificity(bin.observed, bin.predicted, "YES")
+
+#
+# ROC curve
+#
+
+library(pROC)
+library(ggplot2)
+library(dplyr)
+library(ggrepel) # For nicer ROC visualization
+
+bin.predMat <- predict(dt2, bin.test, type = "prob")
+
+# Remember, we treat the value "YES" as the positive class
+rocobj <- roc(bin.observed, bin.predMat[,"YES"])
+plot(rocobj)
+
+# This part below is optional -> nicer ROC
+tmpDf <- data.frame(rocobj$sensitivities, 1-rocobj$specificities, rocobj$thresholds)
+colnames(tmpDf) <- c("Sensitivity","Specificity", "Thresholds")
+
+tmpDf %>% ggplot(aes(x=Specificity,y=Sensitivity,color=Thresholds))+
+  geom_point()+
+  geom_line()+
+  geom_abline(intercept = 0, slope = 1, alpha=0.2)+
+  theme_classic()+
+  ylab("Sensitivity (TPR)")+
+  xlab("1-Specificity (FPR)")+
+  geom_label_repel(aes(label = round(Thresholds,2)),
+                   box.padding   = 0.35, 
+                   point.padding = 0.5,
+                   segment.color = 'grey50')
+
+names(rocobj)
+
+cutoffs <- rocobj$thresholds
+cutoffs
+
+# identify the best cut-off value (for example, by minimazing the distance from the point (0,1))
+tp = rocobj$sensitivities
+fp = 1 - rocobj$specificities
+
+dist <- (1-tp)^2 + fp^2
+dist
+which.min(dist)
+
+best.cutoff <- cutoffs[which.min(dist)]
+best.cutoff
+
+# the selected cut-off value has impact on the model's performance
+predicted.label <- factor(ifelse(bin.predMat[,"YES"] >= best.cutoff, "YES", "NO"))
+
+table(bin.observed, predicted.label)
+CA(bin.observed, predicted.label)
+Sensitivity(bin.observed, predicted.label, "YES")
+Specificity(bin.observed, predicted.label, "YES")
+
+
--- a/v5/lab4_code.R
+++ b/v5/lab4_code.R
@ -0,0 +1,397 @@
+#
+#
+# INTRODUCTION TO CLASSIFICATION
+#
+#
+
+#
+# Work plan:
+#
+# - Read in the data
+# - Split the data into training and test sets
+# - Build a model using the training set
+# - Evaluate the model using the test set
+#
+
+
+
+# Please download the data file "players.txt" into a local directory
+# then set that directory as the current working directory of R.
+# You can achive this using the "setwd" command or by selecting "File -> Change dir..."
+
+
+# We are going to use the packages: rpart, and pROC. 
+# Make sure that the packages are installed.
+#
+# You install a package in R with the function install.packages():
+#
+#     install.packages("pROC")
+#     library(pROC)
+#
+# To install packages without root access:
+#
+#     install.packages("pROC", lib="/mylibs/Rpackages/")
+#     library(pROC, lib.loc="/mylibs/Rpackages/")
+#
+
+# Read in the dataset 
+players <- read.table("players.txt", header = T, sep = ",")
+
+# Get the summary statistics of the data
+summary(players)
+
+
+
+############################################################################
+#
+# ATTENTION!!!!!!!!!
+#
+# The "id" attribute uniquely identifies a player within our data set. 
+# This attribute cannot be used to classify new players as they will 
+# have different id numbers.
+# 
+# Before we continue, we have to remove the "id" attribute from our data set!
+#
+############################################################################
+
+
+players$id <- NULL
+names(players)
+
+
+############################################################################
+#
+#
+# Please make sure that the "id" attribute has been removed!!!!!!!!!!
+#
+#
+############################################################################
+
+
+
+
+
+
+############################################################################
+#
+# Example 1
+#
+# Prediction of playing position
+#
+############################################################################
+
+
+#
+# We want to build a model to predict a player's playing position 
+# with respect to the given player's statistics.
+#
+# The target variable "position" is discrete - we term this a classification task.   
+#
+# We aim to verify whether or not it is possible to use historical data to predict
+# playing positions for new players. 
+#
+
+# We are going to split the data into a training and testing data set.
+# The training data set consists of players that ended their careers before 1999.
+# The test data set consists of players that began their careers after 1999.
+
+learn <- players[players$lastseason <= 1999,]
+test <- players[players$firstseason > 1999,]
+
+# We used the "firstseason" and "lastseason" attributes to split the data.
+# Therefore the attributes are not going to contribute to the modelling task, so
+# we will remove them.
+
+learn$firstseason <- NULL
+learn$lastseason <- NULL
+
+test$firstseason <- NULL
+test$lastseason <- NULL
+
+# We inspect the target values distribution in the defined data sets.
+# (number of examples and frequency of individual classes)
+
+nrow(learn)
+table(learn$position)
+
+nrow(test)
+table(test$position)
+
+
+
+
+#
+#
+# MAJORITY CLASSIFIER
+#
+#
+
+
+# The majority class is the class with the highest number of training examples
+which.max(table(learn$position))
+
+majority.class <- names(which.max(table(learn$position)))
+majority.class
+
+# The majority classifier classifies all test instances into the majority class.
+# The accuracy of the majority class
+sum(test$position == majority.class) / length(test$position)
+
+
+
+
+#
+#
+# DECISION TREES
+#
+#
+
+
+# load the "rpart" package that implements decision trees in R
+library(rpart)
+
+# Fit a decision tree
+dt <- rpart(position ~ ., data = learn)
+
+#
+# The first argument of the rpart function is a model formula that indicates 
+# the functional form of the model. The "~" symbol stands for "modeled as".
+#
+# The formula "position ~ . " states that we want a model that predicts the 
+# target attribute "position" using all other attributes present in the data 
+# (which is the meaning of the "." character).
+# 
+# If we wanted, for example, a model for the "position" attribute as a function 
+# of the attributes "height", "width", and "pts", we should have 
+# indicated the formula as "position ~ height + weight + pts".  
+# 
+# The second argument of the rpart function provides the training examples.
+#
+
+
+# The content of the dt object 
+dt
+
+# A graphical representation of our tree
+plot(dt)
+text(dt, pretty = 1)
+
+# To navigate the tree, start from the root node. If the attribute of the 
+# observation satisfies the logical test, then navigate down the left branch.
+# Otherwise navigate down the right branch. Continue until a leaf node is reached.
+# A leaf node represents a class label (or class label distribution). 
+
+
+# Observed values in the test samples (the ground truth)
+observed <- test$position
+observed
+
+
+# The "predict" function returns a vector of predicted responses from a fitted tree.
+# The "type" argument denotes the form of predicted value returned. The setting
+# type = "class" will cause the response in a form of a vector of predicted 
+# class labels.
+
+predicted <- predict(dt, test, type = "class")
+predicted
+
+# confusion matrix
+t <- table(observed, predicted)
+t
+
+# The diagonal elements in the confusion matrix represent the number of correctly 
+# classified test instances
+
+# The classification accuracy is the fraction of correctly classified test instances
+sum(diag(t)) / sum(t)
+
+
+# Let's write a function that calculates the classification accuracy
+CA <- function(observed, predicted)
+{
+  t <- table(observed, predicted)
+  
+  sum(diag(t)) / sum(t)
+}
+
+# Function call
+CA(observed, predicted)
+
+
+
+
+# The setting type = "prob" will cause the predict function to 
+# provide its answer in the form of a matrix with class probabilities.
+
+predMat <- predict(dt, test, type = "prob")
+predMat
+
+# A matrix of the observed class probabilities (the ground truth) 
+# (the real class has the probability of 1, others have 0)
+obsMat <- model.matrix( ~ position-1, test)
+obsMat
+
+
+# IMPORTANT: check whether the columns refer to the same attributes
+# (if not, change the order of columns)
+colnames(predMat)
+colnames(obsMat)
+
+
+# The Brier score
+brier.score <- function(observedMatrix, predictedMatrix)
+{
+  sum((observedMatrix - predictedMatrix) ^ 2) / nrow(predictedMatrix)
+}
+
+brier.score(obsMat, predMat)
+
+
+
+
+############################################################################
+#
+# Example 2
+#
+# Does a player make at least 80% of free-throws attempted?
+#
+############################################################################
+
+#
+# It is a binary problem (the target variable is discrete with values YES and NO). 
+#
+
+#
+# We are going to create a data set using the players data.
+#
+
+# Get the players who have attempted at least one free throw during the career.
+bin.players <- players[players$fta > 0,]
+
+# Free-throw success rate
+rate <- bin.players$ftm / bin.players$fta
+
+# Create a discrete attribute "ftexpert" that will be our target variable.
+ftexpert <- vector()
+ftexpert[rate >= 0.8] <- "YES"
+ftexpert[rate < 0.8] <- "NO"
+bin.players$ftexpert <- as.factor(ftexpert)
+
+# Remove the "fta" and "ftm" attributes.
+bin.players$fta <- NULL
+bin.players$ftm <- NULL
+
+summary(bin.players)
+
+
+
+# Split the data into two disjoint sets, one for training and one for testing.
+bin.learn <- bin.players[1:1500,]
+bin.test <- bin.players[-(1:1500),]
+
+# Inspect the distribution of the class values in the data sets.
+table(bin.learn$ftexpert)
+table(bin.test$ftexpert)
+
+
+# Train a decision tree
+dt2 <- rpart(ftexpert ~ ., data = bin.learn)
+plot(dt2)
+text(dt2, pretty = 1)
+
+
+bin.observed <- bin.test$ftexpert
+bin.predicted <- predict(dt2, bin.test, type="class")
+
+
+table(bin.observed, bin.predicted)
+
+
+# Classification accuracy of the decision tree
+CA(bin.observed, bin.predicted)
+
+# Exercise -> majority is important!
+table(bin.observed)
+sum(bin.observed == "NO") / length(bin.observed)
+
+
+# The sensitivity of a model (recall)
+Sensitivity <- function(observed, predicted, pos.class)
+{
+  t <- table(observed, predicted)
+  
+  t[pos.class, pos.class] / sum(t[pos.class,])
+}
+
+# The specificity of a model
+Specificity <- function(observed, predicted, pos.class)
+{
+  t <- table(observed, predicted)
+  
+  # identify the negative class name
+  neg.class <- which(row.names(t) != pos.class)
+
+  t[neg.class, neg.class] / sum(t[neg.class,])
+}
+
+# let's say that the value "YES" is our positive class...
+
+Sensitivity(bin.observed, bin.predicted, "YES")
+Specificity(bin.observed, bin.predicted, "YES")
+
+#
+# ROC curve
+#
+
+library(pROC)
+library(ggplot2)
+library(dplyr)
+library(ggrepel) # For nicer ROC visualization
+
+bin.predMat <- predict(dt2, bin.test, type = "prob")
+
+# Remember, we treat the value "YES" as the positive class
+rocobj <- roc(bin.observed, bin.predMat[,"YES"])
+plot(rocobj)
+
+# This part below is optional -> nicer ROC
+tmpDf <- data.frame(rocobj$sensitivities, 1-rocobj$specificities, rocobj$thresholds)
+colnames(tmpDf) <- c("Sensitivity","Specificity", "Thresholds")
+
+tmpDf %>% ggplot(aes(x=Specificity,y=Sensitivity,color=Thresholds))+
+  geom_point()+
+  geom_line()+
+  geom_abline(intercept = 0, slope = 1, alpha=0.2)+
+  theme_classic()+
+  ylab("Sensitivity (TPR)")+
+  xlab("1-Specificity (FPR)")+
+  geom_label_repel(aes(label = round(Thresholds,2)),
+                   box.padding   = 0.35, 
+                   point.padding = 0.5,
+                   segment.color = 'grey50')
+
+names(rocobj)
+
+cutoffs <- rocobj$thresholds
+cutoffs
+
+# identify the best cut-off value (for example, by minimazing the distance from the point (0,1))
+tp = rocobj$sensitivities
+fp = 1 - rocobj$specificities
+
+dist <- (1-tp)^2 + fp^2
+dist
+which.min(dist)
+
+best.cutoff <- cutoffs[which.min(dist)]
+best.cutoff
+
+# the selected cut-off value has impact on the model's performance
+predicted.label <- factor(ifelse(bin.predMat[,"YES"] >= best.cutoff, "YES", "NO"))
+
+table(bin.observed, predicted.label)
+CA(bin.observed, predicted.label)
+Sensitivity(bin.observed, predicted.label, "YES")
+Specificity(bin.observed, predicted.label, "YES")
+
+
--- a/v6/lab6_code.R
+++ b/v6/lab6_code.R
@ -0,0 +1,229 @@
+###
+#
+# CLUSTERING
+#
+###
+
+# Load the dataset
+cities <- read.table("worldcities.csv", sep=",", header = T)
+summary(cities)
+x <- cities$lng
+y <- cities$lat
+x <- x / max(x)
+y <- y / max(y)
+plot(x,y)
+df = data.frame(x, y)
+
+# KMeans clustering
+results <- kmeans(df, centers=6)
+results
+plot(x, y, col=results$cluster)
+
+# Are clusters aligned with classes?
+library(ggplot2)
+data(iris)
+subset <- iris[,1:2]
+
+# classes
+p1<-qplot(subset$Sepal.Length,subset$Sepal.Width,color = iris$Species, main = "Ground truth") + theme_bw()
+
+resultsIris <-kmeans(subset, centers = 3)
+assignments <- as.factor(resultsIris$cluster)
+p2<-qplot(subset$Sepal.Length,subset$Sepal.Width,color = assignments, main = "Clustering") + theme_bw()
+gridExtra::grid.arrange(p1, p2, nrow = 2)
+
+# Exercise -> try to cluster the iris data according to Length and Width!
+
+# Try to fit models and see what can be learnt best!
+library(mclust)
+fit <- Mclust(df)
+# summarize the fits
+plot(fit)
+summary(fit)
+
+#Hierarchical clustering
+distMat = dist(df)
+resultsH <- hclust(distMat)
+
+plot(resultsH)
+clusters <- cutree(resultsH, 6)
+plot(x, y, col=clusters)
+
+
+###
+#
+# ATTRIBUTE EVALUATION
+#
+###
+
+
+# Attribute evaluation functions are implemented in the package "CORElearn"
+
+# Load the library
+#install.packages("CORElearn")
+library(CORElearn)
+
+
+#
+# Attribute estimation in classification
+#
+
+
+
+#
+# Example 1
+#
+
+mr <- read.table("mushroom.txt", sep=",", header=T)
+mr$edibility <- as.factor(mr$edibility)
+summary(mr)
+
+qplot(mr$edibility, ylab="Number of species", main="Edibility of mushrooms", geom = c("bar"))
+
+
+# the attrEval function evaluates the quality of the attributes/dependent variables 
+# specified by the formula. Parameter formula is used as a mechanism to select
+# attributes and prediction variable(class). The simplest way is to specify just 
+# response variable: "Class ~ .". In this case all other attributes in the data set 
+# are evaluated with the selected heuristic method.
+
+# attribute evaluation using information gain
+sort(attrEval(edibility ~ ., mr, "InfGain"), decreasing = TRUE)
+
+# build a decision tree using information gain as a splitting criterion
+dt <- CoreModel(edibility ~ ., mr, model="tree", selectionEstimator="InfGain")
+plot(dt, mr)
+
+
+#
+# Example 2
+#
+
+quadrant <- read.table("quadrant.txt", sep=",", header=T)
+summary(quadrant)
+
+quadrant$Class <- as.factor(quadrant$Class)
+
+plot(quadrant, col=quadrant$Class)
+plot(quadrant$a1, quadrant$a2, col=quadrant$Class)
+
+# attribute evaluation using information gain
+sort(attrEval(Class ~ ., quadrant, "InfGain"), decreasing = TRUE)
+
+# information gain is a myopic measure 
+# (it assumes that attributes are conditionally independent given the class)
+
+# using information gain to construct a decision tree will produce a poor result
+dt2 <- CoreModel(Class ~ ., quadrant, model="tree", selectionEstimator="InfGain")
+plot(dt2, quadrant)
+
+# non-myopic measures (Relief and ReleifF)
+sort(attrEval(Class ~ ., quadrant, "Relief"), decreasing = TRUE)
+sort(attrEval(Class ~ ., quadrant, "ReliefFequalK"), decreasing = TRUE)
+sort(attrEval(Class ~ ., quadrant, "ReliefFexpRank"), decreasing = TRUE)
+
+# use ?attrEval to find out different variations of ReliefF measure...
+
+# build a decision tree using ReliefFequalK as a splitting criterion
+dt3 <- CoreModel(Class ~ ., quadrant, model="tree", selectionEstimator = "ReliefFequalK")
+plot(dt3, quadrant)
+
+
+#
+# Example 3
+#
+
+players <- read.table("players.txt", sep=",", header=TRUE)
+summary(players)
+players$position <- as.factor(players$position)
+
+sort(attrEval(position ~ ., players, "InfGain"), decreasing = TRUE)
+sort(attrEval(position ~ ., players, "Gini"), decreasing = TRUE)
+
+# the "id" attribute is overestimated, 
+# although it does not carry any useful information
+
+# GainRatio moderates the overestimation of the "id" attribute
+sort(attrEval(position ~ ., players, "GainRatio"), decreasing = TRUE)
+
+# Both ReliefF and MDL measures identify the "id" attribute as irrelevant
+sort(attrEval(position ~ ., players, "ReliefFequalK"), decreasing = TRUE)
+sort(attrEval(position ~ ., players, "MDL"), decreasing = TRUE)
+
+#
+#
+# Attribute selection
+#
+#
+
+student <- read.table("student.txt", sep=",", header=T)
+student$G1 <- cut(student$G1, c(-Inf, 9, 20), labels=c("fail", "pass"))
+student$G2 <- cut(student$G2, c(-Inf, 9, 20), labels=c("fail", "pass"))
+student$G3 <- cut(student$G3, c(-Inf, 9, 20), labels=c("fail", "pass"))
+student$studytime <- cut(student$studytime, c(-Inf, 1, 2, 3, 4), labels=c("none", "few", "hefty", "endless"))
+
+train = student[1:(nrow(student) * 0.7),]
+test = student[(nrow(student) * 0.7):nrow(student),]
+
+#
+# Feature selection using a filter method
+#
+
+# evaluate the quality of attributes
+sort(attrEval(studytime ~ ., student, "MDL"), decreasing = TRUE)
+
+# train a model using everything
+model <- CoreModel(studytime ~ ., data=train, model='tree')
+predictions <- predict(model, test, type='class')
+sum(predictions == test$studytime) / length(predictions)
+
+# train a model using the best evaluated attributes
+model <- CoreModel(studytime ~ sex + Walc + Dalc + freetime + higher + paid, data=train, model='tree')
+predictions <- predict(model, test, type='class')
+sum(predictions == test$studytime) / length(predictions)
+
+
+# using the appropriate combination of attributes...
+model <- CoreModel(studytime ~ Dalc + paid + G2, data=train, model='tree')
+predictions <- predict(model, test, type='class')
+sum(predictions == test$studytime) / length(predictions)
+
+# What about some model-based feature importances?
+library(caret)
+set.seed(123780)
+#Number randomely variable selected is mtry
+control <- trainControl(method='repeatedcv',number=10, repeats=1)
+
+rf.model <- train(studytime ~ ., 
+                  data=train, 
+                  method='rf', 
+                  metric='Accuracy', 
+                  trControl=control)
+
+## varImp extracts the feature relevances
+rf.importances<- varImp(rf.model, scale = FALSE)
+plot(rf.importances, top = 20)
+importance.df.values <- rf.importances$importance
+importance.df.names <- rownames(rf.importances$importance)
+importance.rf.whole <- data.frame(score = importance.df.values,cnames = importance.df.names)
+importance.rf.whole <- importance.rf.whole[order(importance.rf.whole$Overall, decreasing = T),]
+feature.names.rf <- importance.rf.whole$cnames[1:20]
+
+mdl.importances <- sort(attrEval(studytime ~ ., student, "MDL"), decreasing = TRUE)[1:20]
+feature.names.mdl <- names(mdl.importances)
+paste0(length(intersect(feature.names.rf, feature.names.mdl))*100/20,"% overlap between top ranked features!")
+
+scaleVector <- function(x) {(x - min(x))/(max(x) - min(x))}
+
+# Let's visualize how similar the rankings' distributions are
+rf.all <- scaleVector(importance.rf.whole$Overall)
+mdl.all <- scaleVector(as.vector(mdl.importances))
+
+rank.frame <- data.frame(rf.all, mdl.all)
+
+library(dplyr)
+library(tidyr)
+
+# How do the two distributions differ?
+summary(rank.frame)
+rank.frame %>% gather() %>% ggplot(aes(x=value,fill=key))+geom_density(alpha=0.4)+theme_bw()
--- a/v7/lab7_code.R
+++ b/v7/lab7_code.R
@ -0,0 +1,207 @@
+################################################################
+#
+# Combining machine learning algorithms
+#
+################################################################
+
+#install.packages("CORElearn")
+library(CORElearn)
+
+vehicle <- read.table("vehicle.txt", sep=",", header = T)
+summary(vehicle)
+
+set.seed(8678686)
+sel <- sample(1:nrow(vehicle), size=as.integer(nrow(vehicle)*0.7), replace=F)
+learn <- vehicle[sel,]
+test <- vehicle[-sel,]
+
+learn$Class <- as.factor(learn$Class)
+test$Class <- as.factor(test$Class)
+
+table(learn$Class)
+table(test$Class)
+
+CA <- function(observed, predicted)
+{
+  t <- table(observed, predicted)
+  
+  sum(diag(t)) / sum(t)
+}
+
+#
+# A simple tree baseline.
+#
+
+
+#
+# Voting
+#
+
+modelDT <- CoreModel(Class ~ ., learn, model="tree")
+modelNB <- CoreModel(Class ~ ., learn, model="bayes")
+modelKNN <- CoreModel(Class ~ ., learn, model="knn", kInNN = 2)
+
+predDT <- predict(modelDT, test, type = "class")
+result.caDT <- CA(test$Class, predDT)
+result.caDT
+
+predNB <- predict(modelNB, test, type="class")
+result.caNB <- CA(test$Class, predNB)
+result.caNB
+
+predKNN <- predict(modelKNN, test, type="class")
+result.caKNN <- CA(test$Class, predKNN)
+result.caKNN
+
+# combine predictions into a data frame
+pred <- data.frame(predDT, predNB, predKNN)
+pred
+
+# the class with the most votes wins
+voting <- function(predictions)
+{
+  res <- vector()
+  
+  for (i in 1 : nrow(predictions))  	
+  {
+    vec <- unlist(predictions[i,])
+    res[i] <- names(which.max(table(vec)))
+  }
+  
+  factor(res, levels=levels(predictions[,1]))
+}
+
+predicted <- voting(pred)
+result.voting <- CA(test$Class, predicted)
+result.voting
+
+#
+# Weighted voting
+#
+
+predDT.prob <- predict(modelDT, test, type="probability")
+predNB.prob <- predict(modelNB, test, type="probability")
+predKNN.prob <- predict(modelKNN, test, type="probability")
+
+# combine predictions into a data frame
+pred.prob <- result.caDT * predDT.prob + result.caNB * predNB.prob + result.caKNN * predKNN.prob
+pred.prob
+
+# We can visualize the joint output space!
+heatmap(pred.prob, col=c('red','green','blue'))
+legend(x="right", legend=c("min", "med", "max"),
+       fill=c('red','green','blue'))
+
+# pick the class with the highest score
+highest <- apply(pred.prob, 1, which.max)
+classes <- levels(learn$Class)
+predicted <- classes[highest]
+
+result.wvoting <- CA(test$Class, predicted)
+result.wvoting
+
+
+
+#
+# Bagging
+#
+
+#install.packages("ipred")
+library(ipred)
+
+bag <- bagging(Class ~ ., learn, nbagg=14)
+bag.pred <- predict(bag, test, type="class")
+result.bagging <- CA(test$Class, bag.pred)
+result.bagging
+
+
+#
+# Random forest as a variation of bagging
+#
+
+# install.packages("randomForest")
+library(randomForest)
+
+rf <- randomForest(Class ~ ., learn)
+rf.predicted <- predict(rf, test, type = "class")
+result.rf <- CA(test$Class, rf.predicted)
+result.rf
+
+
+#
+# Boosting
+#
+
+# install.packages("adabag")
+library(adabag)
+
+bm <- boosting(Class ~ ., learn)
+predictions.bm <- predict(bm, test)
+names(predictions.bm)
+
+predicted.bm <- predictions.bm$class
+result.boosting <- CA(test$Class, predicted.bm)
+result.boosting
+
+#
+# Caret package
+#
+
+# install.packages("caret")
+library(caret)
+caretModel <- train(Class ~ ., learn, method="xgbLinear", eta=1, verbose=1)
+pred <- predict(caretModel, test)
+result.xgb <- CA(test$Class, pred)
+result.xgb
+
+# Let's visualize currently considered ensemble methods.
+performances <- c(result.rf,
+                  result.bagging,
+                  result.wvoting,
+                  result.boosting,
+                  result.caDT,
+                  result.caKNN,
+                  result.caNB,
+                  result.xgb)
+
+algo.names <- c("RF","Bagging","Weighted vote","Boosting","DT","KNN","NB","XGB")
+ensemble.model <- as.factor(c(1,1,1,1,0,0,0,1))
+result.vec <- data.frame(performances, algo.names,ensemble.model)
+reordering <- order(result.vec$performances)
+result.vec <- result.vec[reordering,]
+rownames(result.vec) <- NULL
+result.vec
+
+library(ggplot2)
+positions <- as.vector(result.vec$algo.names)
+ggplot(data=result.vec, aes(x=algo.names, y=performances, color = ensemble.model)) +
+  geom_point(size = 10, shape = 4) +
+  scale_x_discrete(limits = positions) +
+  ylim(0.5,0.8) +
+  xlab("Ensemble type") +
+  ylab("Accuracy") +
+  geom_hline(yintercept=max(performances), color = "darkgreen") +
+  geom_hline(yintercept=min(performances), color = "black") +
+  title("Performance comparison") + 
+  geom_text(label = positions,nudge_x = 0.2, nudge_y = -0.01) +
+  theme_bw() +
+  theme(axis.title.x=element_blank(),
+        axis.text.x=element_blank(),
+        axis.ticks.x=element_blank())
+
+
+#
+# Cross-validation
+#
+
+# the library ipred is needed to perform cross-validation
+library(ipred)
+
+# Tell the cross-validation which model to use
+mymodel.coremodel <- function(formula, data, target.model){CoreModel(formula, data, model=target.model)}
+# Tell the cross-validation how to obtain predictions
+mypredict.generic <- function(object, newdata){predict(object, newdata, type = "class")}
+# force the predict function to return class labels only and also destroy the internal representation of a given model
+mypredict.coremodel <- function(object, newdata) {pred <- predict(object, newdata)$class; destroyModels(object); pred}
+cvError <- errorest(Class~., data=learn, model = mymodel.coremodel, predict = mypredict.coremodel, target.model = "tree")
+1 - cvError$error