###
#
# CLUSTERING
#
###
|
# Load the dataset of world cities (one row per city, with lng/lat columns)
cities <- read.table("worldcities.csv", sep = ",", header = TRUE)

summary(cities)

# Use longitude/latitude as the two clustering dimensions
x <- cities$lng
y <- cities$lat

# Scale each coordinate by its maximum so both share a comparable range.
# NOTE(review): this assumes max(x) and max(y) are positive (true for a
# world-wide city set); full min-max scaling would be more robust in general.
x <- x / max(x)
y <- y / max(y)

plot(x, y)

df <- data.frame(x, y)

# KMeans clustering into 6 groups
results <- kmeans(df, centers = 6)
results

# Color each city by its assigned cluster
plot(x, y, col = results$cluster)
|
||
|
|
||
|
# Are clusters aligned with classes?
library(ggplot2)

data(iris)
# Keep only Sepal.Length and Sepal.Width; avoid calling the variable
# "subset", which shadows base::subset()
iris_sub <- iris[, 1:2]

# Ground-truth species labels
p1 <- qplot(iris_sub$Sepal.Length, iris_sub$Sepal.Width,
            color = iris$Species, main = "Ground truth") + theme_bw()

# Cluster the same two attributes into as many groups as there are species
resultsIris <- kmeans(iris_sub, centers = 3)
assignments <- as.factor(resultsIris$cluster)

p2 <- qplot(iris_sub$Sepal.Length, iris_sub$Sepal.Width,
            color = assignments, main = "Clustering") + theme_bw()

# Stack ground truth above the clustering for a visual comparison
gridExtra::grid.arrange(p1, p2, nrow = 2)
|
||
|
|
||
|
# Exercise -> try to cluster the iris data according to Length and Width!

# Try to fit models and see what can be learnt best!
# Model-based clustering: Mclust chooses the number of components and the
# covariance structure that maximize BIC.
library(mclust)

fit <- Mclust(df)

# Summarize the fit (the plot is interactive: BIC, classification, ...)
plot(fit)
summary(fit)
|
||
|
|
||
|
# Hierarchical clustering

# Full pairwise distance matrix (O(n^2) memory -- fine at this dataset size)
distMat <- dist(df)
resultsH <- hclust(distMat)

# Dendrogram of the agglomeration sequence
plot(resultsH)

# Cut the tree into 6 clusters, matching the k-means run above
clusters <- cutree(resultsH, 6)
plot(x, y, col = clusters)
|
||
|
|
||
|
|
||
|
###
#
# ATTRIBUTE EVALUATION
#
###

# Attribute evaluation functions are implemented in the package "CORElearn"

# Load the library
# install.packages("CORElearn")
library(CORElearn)

#
# Attribute estimation in classification
#

#
# Example 1
#

mr <- read.table("mushroom.txt", sep = ",", header = TRUE)
mr$edibility <- as.factor(mr$edibility)
summary(mr)

qplot(mr$edibility, ylab = "Number of species",
      main = "Edibility of mushrooms", geom = c("bar"))

# The attrEval function evaluates the quality of the attributes (independent
# variables) specified by the formula. The formula selects the attributes and
# the prediction variable (class). The simplest form names just the response
# variable, "Class ~ .", in which case all other attributes in the data set
# are evaluated with the selected heuristic method.

# Attribute evaluation using information gain
sort(attrEval(edibility ~ ., mr, "InfGain"), decreasing = TRUE)

# Build a decision tree using information gain as a splitting criterion
dt <- CoreModel(edibility ~ ., mr, model = "tree",
                selectionEstimator = "InfGain")
plot(dt, mr)
|
||
|
|
||
|
|
||
|
#
# Example 2
#

quadrant <- read.table("quadrant.txt", sep = ",", header = TRUE)
summary(quadrant)

quadrant$Class <- as.factor(quadrant$Class)

plot(quadrant, col = quadrant$Class)
plot(quadrant$a1, quadrant$a2, col = quadrant$Class)

# Attribute evaluation using information gain
sort(attrEval(Class ~ ., quadrant, "InfGain"), decreasing = TRUE)

# Information gain is a myopic measure
# (it assumes that attributes are conditionally independent given the class)

# Using information gain to construct a decision tree will produce a poor result
dt2 <- CoreModel(Class ~ ., quadrant, model = "tree",
                 selectionEstimator = "InfGain")
plot(dt2, quadrant)

# Non-myopic measures (Relief and ReliefF)
sort(attrEval(Class ~ ., quadrant, "Relief"), decreasing = TRUE)
sort(attrEval(Class ~ ., quadrant, "ReliefFequalK"), decreasing = TRUE)
sort(attrEval(Class ~ ., quadrant, "ReliefFexpRank"), decreasing = TRUE)

# Use ?attrEval to find out different variations of the ReliefF measure...

# Build a decision tree using ReliefFequalK as a splitting criterion
dt3 <- CoreModel(Class ~ ., quadrant, model = "tree",
                 selectionEstimator = "ReliefFequalK")
plot(dt3, quadrant)
|
||
|
|
||
|
|
||
|
#
# Example 3
#

players <- read.table("players.txt", sep = ",", header = TRUE)
summary(players)

players$position <- as.factor(players$position)

# Helper: rank all attributes of `players` by the given CORElearn estimator
rank_players_attrs <- function(estimator) {
  sort(attrEval(position ~ ., players, estimator), decreasing = TRUE)
}

rank_players_attrs("InfGain")
rank_players_attrs("Gini")

# The "id" attribute is overestimated,
# although it does not carry any useful information

# GainRatio moderates the overestimation of the "id" attribute
rank_players_attrs("GainRatio")

# Both ReliefF and MDL measures identify the "id" attribute as irrelevant
rank_players_attrs("ReliefFequalK")
rank_players_attrs("MDL")
|
||
|
|
||
|
#
#
# Attribute selection
#
#

student <- read.table("student.txt", sep = ",", header = TRUE)

# Discretize the grades into fail (<= 9) / pass (10-20)
student$G1 <- cut(student$G1, c(-Inf, 9, 20), labels = c("fail", "pass"))
student$G2 <- cut(student$G2, c(-Inf, 9, 20), labels = c("fail", "pass"))
student$G3 <- cut(student$G3, c(-Inf, 9, 20), labels = c("fail", "pass"))
student$studytime <- cut(student$studytime, c(-Inf, 1, 2, 3, 4),
                         labels = c("none", "few", "hefty", "endless"))

# 70/30 train/test split.
# BUG FIX: the original used rows 1:(n*0.7) for train and (n*0.7):n for test,
# so the boundary row was in BOTH sets (and the index could be fractional).
# Use an explicit integer cut point so the two sets are disjoint.
split.idx <- floor(nrow(student) * 0.7)
train <- student[1:split.idx, ]
test <- student[(split.idx + 1):nrow(student), ]

#
# Feature selection using a filter method
#

# Evaluate the quality of attributes
sort(attrEval(studytime ~ ., student, "MDL"), decreasing = TRUE)

# Train a model using everything, then report test-set accuracy
model <- CoreModel(studytime ~ ., data = train, model = "tree")
predictions <- predict(model, test, type = "class")
sum(predictions == test$studytime) / length(predictions)

# Train a model using the best evaluated attributes
model <- CoreModel(studytime ~ sex + Walc + Dalc + freetime + higher + paid,
                   data = train, model = "tree")
predictions <- predict(model, test, type = "class")
sum(predictions == test$studytime) / length(predictions)

# Using the appropriate combination of attributes...
model <- CoreModel(studytime ~ Dalc + paid + G2, data = train, model = "tree")
predictions <- predict(model, test, type = "class")
sum(predictions == test$studytime) / length(predictions)
|
||
|
|
||
|
# What about some model-based feature importances?
library(caret)

# Fix the RNG so the CV folds and the random forest are reproducible
set.seed(123780)

# mtry (the number of randomly selected candidate variables per split)
# is tuned automatically by caret
control <- trainControl(method = "repeatedcv", number = 10, repeats = 1)

rf.model <- train(studytime ~ .,
                  data = train,
                  method = "rf",
                  metric = "Accuracy",
                  trControl = control)

# varImp extracts the feature relevances
rf.importances <- varImp(rf.model, scale = FALSE)
plot(rf.importances, top = 20)

# Collect the importances into a data frame sorted by score (descending)
importance.df.values <- rf.importances$importance
importance.df.names <- rownames(rf.importances$importance)
importance.rf.whole <- data.frame(score = importance.df.values,
                                  cnames = importance.df.names)
importance.rf.whole <- importance.rf.whole[order(importance.rf.whole$Overall,
                                                 decreasing = TRUE), ]
feature.names.rf <- importance.rf.whole$cnames[1:20]

# Compare the RF ranking against the filter-based MDL ranking
mdl.importances <- sort(attrEval(studytime ~ ., student, "MDL"),
                        decreasing = TRUE)[1:20]
feature.names.mdl <- names(mdl.importances)
paste0(length(intersect(feature.names.rf, feature.names.mdl)) * 100 / 20,
       "% overlap between top ranked features!")
|
||
|
|
||
|
# Min-max scale a numeric vector to the [0, 1] range.
# Robustness fix: the original formula divided by zero for a constant input,
# producing a vector of NaN; return all zeros in that degenerate case.
scaleVector <- function(x) {
  rng <- range(x)
  if (rng[1] == rng[2]) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / (rng[2] - rng[1])
}
|
||
|
|
||
|
# Let's visualize how similar the rankings' distributions are
rf.all <- scaleVector(importance.rf.whole$Overall)
mdl.all <- scaleVector(as.vector(mdl.importances))

rank.frame <- data.frame(rf.all, mdl.all)

library(dplyr)
library(tidyr)

# How do the two distributions differ?
summary(rank.frame)

# gather() is superseded; pivot_longer() is the maintained replacement.
# Keep the old "key"/"value" column names so the aesthetics are unchanged.
rank.frame %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = value, fill = key)) +
  geom_density(alpha = 0.4) +
  theme_bw()
|