###
#
# CLUSTERING
#
###
|
# Load the dataset of world cities (one row per city, with lng/lat columns)
cities <- read.table("worldcities.csv", sep = ",", header = TRUE)

summary(cities)

# Use longitude/latitude as the two clustering dimensions
x <- cities$lng
y <- cities$lat

# Scale each coordinate by its maximum so both share a comparable range.
# NOTE(review): this assumes max(x) and max(y) are positive (true for a
# world-wide city set); full min-max scaling would be more robust in general.
x <- x / max(x)
y <- y / max(y)

plot(x, y)

df <- data.frame(x, y)

# KMeans clustering into 6 groups
results <- kmeans(df, centers = 6)
results

# Color each city by its assigned cluster
plot(x, y, col = results$cluster)
|
||
|
|
||
|
# Are clusters aligned with classes?
library(ggplot2)

data(iris)
# Keep only Sepal.Length and Sepal.Width; avoid calling the variable
# "subset", which shadows base::subset()
iris_sub <- iris[, 1:2]

# Ground-truth species labels
p1 <- qplot(iris_sub$Sepal.Length, iris_sub$Sepal.Width,
            color = iris$Species, main = "Ground truth") + theme_bw()

# Cluster the same two attributes into as many groups as there are species
resultsIris <- kmeans(iris_sub, centers = 3)
assignments <- as.factor(resultsIris$cluster)

p2 <- qplot(iris_sub$Sepal.Length, iris_sub$Sepal.Width,
            color = assignments, main = "Clustering") + theme_bw()

# Stack ground truth above the clustering for a visual comparison
gridExtra::grid.arrange(p1, p2, nrow = 2)
|
||
|
|
||
|
# Exercise -> try to cluster the iris data according to Length and Width!

# Try to fit models and see what can be learnt best!
# Model-based clustering: Mclust chooses the number of components and the
# covariance structure that maximize BIC.
library(mclust)

fit <- Mclust(df)

# Summarize the fit (the plot is interactive: BIC, classification, ...)
plot(fit)
summary(fit)
|
||
|
|
||
|
# Hierarchical clustering

# Full pairwise distance matrix (O(n^2) memory -- fine at this dataset size)
distMat <- dist(df)
resultsH <- hclust(distMat)

# Dendrogram of the agglomeration sequence
plot(resultsH)

# Cut the tree into 6 clusters, matching the k-means run above
clusters <- cutree(resultsH, 6)
plot(x, y, col = clusters)
|
||
|
|
||
|
|
||
|
###
#
# ATTRIBUTE EVALUATION
#
###

# Attribute evaluation functions are implemented in the package "CORElearn"

# Load the library
# install.packages("CORElearn")
library(CORElearn)

#
# Attribute estimation in classification
#

#
# Example 1
#

mr <- read.table("mushroom.txt", sep = ",", header = TRUE)
mr$edibility <- as.factor(mr$edibility)
summary(mr)

qplot(mr$edibility, ylab = "Number of species",
      main = "Edibility of mushrooms", geom = c("bar"))

# The attrEval function evaluates the quality of the attributes (independent
# variables) specified by the formula. The formula selects the attributes and
# the prediction variable (class). The simplest form names just the response
# variable, "Class ~ .", in which case all other attributes in the data set
# are evaluated with the selected heuristic method.

# Attribute evaluation using information gain
sort(attrEval(edibility ~ ., mr, "InfGain"), decreasing = TRUE)

# Build a decision tree using information gain as a splitting criterion
dt <- CoreModel(edibility ~ ., mr, model = "tree",
                selectionEstimator = "InfGain")
plot(dt, mr)
|
||
|
|
||
|
|
||
|
#
# Example 2
#

quadrant <- read.table("quadrant.txt", sep = ",", header = TRUE)
summary(quadrant)

quadrant$Class <- as.factor(quadrant$Class)

plot(quadrant, col = quadrant$Class)
plot(quadrant$a1, quadrant$a2, col = quadrant$Class)

# Attribute evaluation using information gain
sort(attrEval(Class ~ ., quadrant, "InfGain"), decreasing = TRUE)

# Information gain is a myopic measure
# (it assumes that attributes are conditionally independent given the class)

# Using information gain to construct a decision tree will produce a poor result
dt2 <- CoreModel(Class ~ ., quadrant, model = "tree",
                 selectionEstimator = "InfGain")
plot(dt2, quadrant)

# Non-myopic measures (Relief and ReliefF)
sort(attrEval(Class ~ ., quadrant, "Relief"), decreasing = TRUE)
sort(attrEval(Class ~ ., quadrant, "ReliefFequalK"), decreasing = TRUE)
sort(attrEval(Class ~ ., quadrant, "ReliefFexpRank"), decreasing = TRUE)

# Use ?attrEval to find out different variations of the ReliefF measure...

# Build a decision tree using ReliefFequalK as a splitting criterion
dt3 <- CoreModel(Class ~ ., quadrant, model = "tree",
                 selectionEstimator = "ReliefFequalK")
plot(dt3, quadrant)
|
||
|
|
||
|
|
||
|
#
# Example 3
#

players <- read.table("players.txt", sep = ",", header = TRUE)
summary(players)

players$position <- as.factor(players$position)

# Helper: rank all attributes of `players` by the given CORElearn estimator
rank_players_attrs <- function(estimator) {
  sort(attrEval(position ~ ., players, estimator), decreasing = TRUE)
}

rank_players_attrs("InfGain")
rank_players_attrs("Gini")

# The "id" attribute is overestimated,
# although it does not carry any useful information

# GainRatio moderates the overestimation of the "id" attribute
rank_players_attrs("GainRatio")

# Both ReliefF and MDL measures identify the "id" attribute as irrelevant
rank_players_attrs("ReliefFequalK")
rank_players_attrs("MDL")
|
||
|
|
||
|
#
#
# Attribute selection
#
#

student <- read.table("student.txt", sep = ",", header = TRUE)

# Discretize the grades into fail (<= 9) / pass (10-20)
student$G1 <- cut(student$G1, c(-Inf, 9, 20), labels = c("fail", "pass"))
student$G2 <- cut(student$G2, c(-Inf, 9, 20), labels = c("fail", "pass"))
student$G3 <- cut(student$G3, c(-Inf, 9, 20), labels = c("fail", "pass"))
student$studytime <- cut(student$studytime, c(-Inf, 1, 2, 3, 4),
                         labels = c("none", "few", "hefty", "endless"))

# 70/30 train/test split.
# BUG FIX: the original used rows 1:(n*0.7) for train and (n*0.7):n for test,
# so the boundary row was in BOTH sets (and the index could be fractional).
# Use an explicit integer cut point so the two sets are disjoint.
split.idx <- floor(nrow(student) * 0.7)
train <- student[1:split.idx, ]
test <- student[(split.idx + 1):nrow(student), ]

#
# Feature selection using a filter method
#

# Evaluate the quality of attributes
sort(attrEval(studytime ~ ., student, "MDL"), decreasing = TRUE)

# Train a model using everything, then report test-set accuracy
model <- CoreModel(studytime ~ ., data = train, model = "tree")
predictions <- predict(model, test, type = "class")
sum(predictions == test$studytime) / length(predictions)

# Train a model using the best evaluated attributes
model <- CoreModel(studytime ~ sex + Walc + Dalc + freetime + higher + paid,
                   data = train, model = "tree")
predictions <- predict(model, test, type = "class")
sum(predictions == test$studytime) / length(predictions)

# Using the appropriate combination of attributes...
model <- CoreModel(studytime ~ Dalc + paid + G2, data = train, model = "tree")
predictions <- predict(model, test, type = "class")
sum(predictions == test$studytime) / length(predictions)
|
||
|
|
||
|
# What about some model-based feature importances?
library(caret)

# Fix the RNG so the CV folds and the random forest are reproducible
set.seed(123780)

# mtry (the number of randomly selected candidate variables per split)
# is tuned automatically by caret
control <- trainControl(method = "repeatedcv", number = 10, repeats = 1)

rf.model <- train(studytime ~ .,
                  data = train,
                  method = "rf",
                  metric = "Accuracy",
                  trControl = control)

# varImp extracts the feature relevances
rf.importances <- varImp(rf.model, scale = FALSE)
plot(rf.importances, top = 20)

# Collect the importances into a data frame sorted by score (descending)
importance.df.values <- rf.importances$importance
importance.df.names <- rownames(rf.importances$importance)
importance.rf.whole <- data.frame(score = importance.df.values,
                                  cnames = importance.df.names)
importance.rf.whole <- importance.rf.whole[order(importance.rf.whole$Overall,
                                                 decreasing = TRUE), ]
feature.names.rf <- importance.rf.whole$cnames[1:20]

# Compare the RF ranking against the filter-based MDL ranking
mdl.importances <- sort(attrEval(studytime ~ ., student, "MDL"),
                        decreasing = TRUE)[1:20]
feature.names.mdl <- names(mdl.importances)
paste0(length(intersect(feature.names.rf, feature.names.mdl)) * 100 / 20,
       "% overlap between top ranked features!")
|
||
|
|
||
|
# Min-max scale a numeric vector to the [0, 1] range.
# Robustness fix: the original formula divided by zero for a constant input,
# producing a vector of NaN; return all zeros in that degenerate case.
scaleVector <- function(x) {
  rng <- range(x)
  if (rng[1] == rng[2]) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / (rng[2] - rng[1])
}
|
||
|
|
||
|
# Let's visualize how similar the rankings' distributions are
rf.all <- scaleVector(importance.rf.whole$Overall)
mdl.all <- scaleVector(as.vector(mdl.importances))

rank.frame <- data.frame(rf.all, mdl.all)

library(dplyr)
library(tidyr)

# How do the two distributions differ?
summary(rank.frame)

# gather() is superseded; pivot_longer() is the maintained replacement.
# Keep the old "key"/"value" column names so the aesthetics are unchanged.
rank.frame %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = value, fill = key)) +
  geom_density(alpha = 0.4) +
  theme_bw()
|