is-vaje/v3/lab3_visualization.R

##############################################################################
#
# DATA VISUALIZATION
#
##############################################################################

# Please download data files "movies.txt" and "players.txt" into a local directory
# then set that directory as the current working directory of R.
# You can achive this using the "setwd" command or by selecting "File -> Change dir..."

# for example:
# setwd("c:\\labs\\data\\")
library(ggplot2)
library(dplyr)

# To read data from a text file, use the "read.table" command.
# The parameter header=TRUE indicates that the file to be read includes a first line with the column names
md <- read.table(file="movies.txt", sep=",", header=TRUE)

# To get more information on any specific named function, type "?" followed by the function name
?read.table

# Useful functions
head(md)
summary(md)
str(md)
names(md)


# We will transform binary attributes into nominal variables with a fixed number of possible values (factors)
md$Action <- as.factor(md$Action)
md$Animation <- as.factor(md$Animation)

# The remaining columns will be transformed using the for loop
for (i in 20:24)
  md[,i] <- as.factor(md[,i])

#
# Type conversion functions:
#
# as.numeric
# as.integer
# as.character
# as.logical
# as.factor
# as.ordered
#
# values that cannot be converted to the specified type will be converted to a NA value
#

# Binary attributes are now represented as factors
summary(md)

# Accessing data frame elements...
md[30,]
md[30,3]
md[30,"length"]
md[,3]
md$length

# Useful data visualization functions
plot(md$length)
hist(md$length)
plot(density(md$length))
boxplot(md$length)
barplot(table(md$Drama))
pie(table(md$mpaa))

## nicer plots with ggplot2 + dplyr
md %>% ggplot(aes(length)) + geom_histogram(bins = 40) + ggtitle("A genetic histogram") + xlab("Length")

## plotting w.r.t. multiple mpaa categories
md %>% ggplot(aes(length,fill = mpaa)) + geom_density(alpha = 0.2)

## What about a nicer boxplot w.r.t mpaa?
## theme_bw() is more neutral theme
md %>% ggplot(aes(Drama, rating, color = mpaa)) + geom_boxplot() + theme_bw()

## show table view

###############################################################################
#
# EXAMPLE 1: What is the proportion of comedies to other genres in our data set?
#
###############################################################################

# the table() command gives the frequency of values in the vector
table(md$Comedy)

# the proportion of comedies can be plotted
barplot(table(md$Comedy))
pie(table(md$Comedy))


#  it is important to always label graphs ...

tab <- table(md$Comedy)
names(tab) <- c("Other genres", "Comedies")
tab
pie(tab)

sum(tab)

barplot(tab, ylab="Number of titles", main="Proportion of comedies to other genres")
barplot(tab / sum(tab) * 100, ylab="Percentage of titles", main="The proportion of comedies to other genres")

pie(tab, main = "Proportion of comedies to other genres")

###############################################################################
#
# EXAMPLE 2: How are ratings distributed for comedies?
#
###############################################################################

# Plot the rating distribution for comedies
hist(md[md$Comedy == "1", "rating"], xlab="Rating", ylab="Frequency", main="Histogram of ratings for comedies")

# Box plots provide a visual display of the range and potential skewness of the data
boxplot(md[md$Comedy == "1", "rating"], ylab="Rating", main="Boxplot of ratings for comedies")

quantile(md$rating[md$Comedy == 1])


###############################################################################
# 
# EXAMPLE 3: Are comedies on average better rated than non-comedies?
#
###############################################################################

# Select comedies
comedy <- md$Comedy == "1"

# Calculate the mean rating value for comedies and non-comedies
mean(md[comedy,"rating"])
mean(md[!comedy,"rating"])

# Comedies have, on average, higher ratings than non-comedies

# Side-by-side boxplots of ratings grouped by values of the attribute "Comedy" 
boxplot(rating ~ Comedy, data=md)
boxplot(rating ~ Comedy, data=md, names=c("Other genres", "Comedies"), ylab="Rating", main="Comparison of ratings between comedies and non-comedies")

## or with dplyr directly
md %>% group_by(Comedy) %>% select(rating) %>% summarise(mean(rating))

###############################################################################
#
# EXAMPLE 4: What is the proportion of comedies (per year) from 1990 onwards?
#
###############################################################################

sel <- md$year >= 1990

# the table() command can be used to get a two-way contigency table 
table(md$Comedy[sel], md$year[sel])

table(md$year[sel])

tabcomedy <- table(md$Comedy[sel], md$year[sel])
tabyear <- table(md$year[sel])
tabcomedy[2,]/tabyear

ratio <- tabcomedy[2,]/tabyear
barplot(ratio, xlab="Year", ylab="Relative frequency", main="Proportion of comedies")

plot(x=names(ratio), y=as.vector(ratio), type="l", xlab="Year", ylab="Relative frequency", main="Proportion of comedies, 1990-2005")

## or with dplyr directly
md %>% filter(year >= 1990) %>% 
  group_by(year, Comedy) %>%  
  summarise(n = n()) %>% mutate(freq = n / sum(n)) %>% 
  filter(Comedy == 1) %>% select(year, freq) %>% 
  ggplot(aes(year, freq)) + geom_point() + ggtitle("Frequency of comedies") + ylab("Frequency") + xlab("Year") + geom_line() + theme_bw()

###############################################################################
# 
# EXAMPLE 5: Are there more movies above or below the average rating?
#
###############################################################################

# the average rating
mean(md$rating)

# how many movies are above the average rating?
tab <- table(md$rating > mean(md$rating))
tab

names(tab) <- c("below", "above")
barplot(tab, ylab="Number of titles", main="Proportion of movies above and below the average rating")
pie(tab, main="Proportion of movies above and below the average rating")


# Box plots provide a summarization of the variable distribution
boxplot(md$rating, ylab="Rating", main="Boxplot of movie ratings")

# The horizontal line inside the box represents the median rating value

# Let's plot the mean value...
abline(h=mean(md$rating))

## or with dplyr + ggplot
md %>% mutate(mRate = mean(rating)) %>% 
  mutate(indicator = ifelse(rating - mRate > 0, "above", "below")) %>% 
  group_by(indicator) %>% 
  summarise(counts = n()) %>% 
  ggplot(aes(indicator, counts, fill = indicator))+ geom_bar(stat = "identity")

# The mean differs from the median so the distribution is skewed.
# We can conclude that there are more cases above the mean value. 


###############################################################################
#
# EXAMPLE 6: Do movies with bigger budgets get higher ratings?
#
###############################################################################

# there are missing values in the budget attribute
summary(md$budget)

is.na(md$budget)
table(is.na(md$budget))
which(is.na(md$budget))

# select complete observations only
sel <- is.na(md$budget)
mdsub <- md[!sel,]

nrow(mdsub)
summary(mdsub$budget)


plot(mdsub$budget, mdsub$rating, xlab="Budget in $", ylab="Rating", main="Movie rating vs budget")

# Plotted points are mostly located in the upper left part of the diagram, 
# which means that a higher budget usually leads to a higher rating

# Utilization of the budget in terms of rating  
ratio <- mdsub$budget/mdsub$rating
hist(ratio)

# Which movie has the worst budget utilization?
mdsub[which.max(ratio),]


# Let's discretize these budgets to:
# low (less than 1M), mid (between 1M and 50M) and big (more than 50M)

disbudget <- cut(mdsub$budget, c(0, 1000000, 50000000, 500000000), labels=c("low", "mid", "big"))
barplot(table(disbudget)/length(disbudget), xlab="Budget", ylab="Relative frequency", main="Proportion of movies vs budget")

# Side-by-side boxplots of ratings grouped by budget values
boxplot(mdsub$rating ~ disbudget, xlab="Budget", ylab="Rating", main="Boxplot of movie rating vs budget")

## Is this dependent on the mpaa?
## or with dplyr + ggplot + adding votes
md %>% select(budget, rating, votes, mpaa) %>% 
  na.omit() %>% 
  ggplot(aes(budget, rating, color = votes, fill = mpaa)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x) + theme_bw()

###############################################################################
#
# EXAMPLE 7: 
# What is the cumulative movie budget for each year from 1990 to 2000?
# What is the average movie budget for each year from 1990 to 2000?
# (consider only those movies for witch information on the budget is available!)
#
###############################################################################

# Select the movies that contain information on their budgets
sel <- !is.na(md$budget) & md$year >= 1990 & md$year <= 2000

# We can calculate cumulative budget for each year using the "aggregate" function

# Data overflow problem!
aggregate(budget ~ year, data = md[sel,], sum)

# The budget values are represented as integers
typeof(md$budget)

# In order to avoid the overflow problem we have to convert 
# the budget values into a double-precision representation (using the as.double() command)
aggregate(as.double(budget) ~ year, data = md[sel,], sum)

sum.budget <- aggregate(as.double(budget) ~ year, data = md[sel,], sum) 
plot(sum.budget, type="l", xlab="Year", ylab="Cumulative budget in $", main="Cumulative movie budget per year")

avg.budget <- aggregate(as.double(budget) ~ year, data = md[sel,], mean)
plot(avg.budget, type="l", xlab="Year", ylab="Average budget in $", main="Average movie budget per year")

## or with dplyr
md %>% select(budget, year) %>% na.omit() %>% 
  group_by(year) %>% summarise(budget2 = sum(as.numeric(budget))) %>% 
  arrange(year) %>% mutate(csum = cumsum(budget2)) %>%
  ggplot(aes(year, csum)) + geom_bar(stat = "identity") + theme_bw()

##############################################################################
#
# EXAMPLE 8: (players dataset)
# What is the average height for each season in the period from 1970 to 2000?
#
##############################################################################

# Load the Players dataset
players <- read.table("players.txt", sep=",", header = T)
summary(players)

# Create an empty vector
h <- vector()

# Use a for loop to go through each year in the period from 1970 to 2000
for (y in 1970:2000)
{
  # Select active players in that year
  sel <- players$firstseason <= y & players$lastseason >= y
  
  # Append the resulting vector with the mean height for the current year
  h <- c(h, mean(players$height[sel]))
}

# plot the resulting vector (use type="l" for lines)
plot(1970:2000, h, type="l", xlab="Year", ylab="Height in cm", main="Average height in NBA")

## or with dplyr
dfx <- data.frame(year = 1970:2000,mh = h)
dfx %>% ggplot(aes(year, mh)) + geom_point() + geom_smooth(method = "loess") + theme_bw() + xlab("Year") + ylab("Mean height")
Zacetek na v3 2022-10-24 15:50:54 +02:00			`##############################################################################`
			`#`
			`# DATA VISUALIZATION`
			`#`
			`##############################################################################`

			`# Please download data files "movies.txt" and "players.txt" into a local directory`
			`# then set that directory as the current working directory of R.`
			`# You can achive this using the "setwd" command or by selecting "File -> Change dir..."`

			`# for example:`
			`# setwd("c:\\labs\\data\\")`
			`library(ggplot2)`
			`library(dplyr)`

			`# To read data from a text file, use the "read.table" command.`
			`# The parameter header=TRUE indicates that the file to be read includes a first line with the column names`
			`md <- read.table(file="movies.txt", sep=",", header=TRUE)`

			`# To get more information on any specific named function, type "?" followed by the function name`
			`?read.table`

			`# Useful functions`
			`head(md)`
			`summary(md)`
			`str(md)`
			`names(md)`


			`# We will transform binary attributes into nominal variables with a fixed number of possible values (factors)`
			`md$Action <- as.factor(md$Action)`
			`md$Animation <- as.factor(md$Animation)`

			`# The remaining columns will be transformed using the for loop`
			`for (i in 20:24)`
			`md[,i] <- as.factor(md[,i])`

			`#`
			`# Type conversion functions:`
			`#`
			`# as.numeric`
			`# as.integer`
			`# as.character`
			`# as.logical`
			`# as.factor`
			`# as.ordered`
			`#`
			`# values that cannot be converted to the specified type will be converted to a NA value`
			`#`

			`# Binary attributes are now represented as factors`
			`summary(md)`

			`# Accessing data frame elements...`
			`md[30,]`
			`md[30,3]`
			`md[30,"length"]`
			`md[,3]`
			`md$length`

			`# Useful data visualization functions`
			`plot(md$length)`
			`hist(md$length)`
			`plot(density(md$length))`
			`boxplot(md$length)`
			`barplot(table(md$Drama))`
			`pie(table(md$mpaa))`

			`## nicer plots with ggplot2 + dplyr`
			`md %>% ggplot(aes(length)) + geom_histogram(bins = 40) + ggtitle("A genetic histogram") + xlab("Length")`

			`## plotting w.r.t. multiple mpaa categories`
			`md %>% ggplot(aes(length,fill = mpaa)) + geom_density(alpha = 0.2)`

			`## What about a nicer boxplot w.r.t mpaa?`
			`## theme_bw() is more neutral theme`
			`md %>% ggplot(aes(Drama, rating, color = mpaa)) + geom_boxplot() + theme_bw()`

			`## show table view`

			`###############################################################################`
			`#`
			`# EXAMPLE 1: What is the proportion of comedies to other genres in our data set?`
			`#`
			`###############################################################################`

			`# the table() command gives the frequency of values in the vector`
			`table(md$Comedy)`

			`# the proportion of comedies can be plotted`
			`barplot(table(md$Comedy))`
			`pie(table(md$Comedy))`


			`# it is important to always label graphs ...`

			`tab <- table(md$Comedy)`
			`names(tab) <- c("Other genres", "Comedies")`
			`tab`
			`pie(tab)`

			`sum(tab)`

			`barplot(tab, ylab="Number of titles", main="Proportion of comedies to other genres")`
			`barplot(tab / sum(tab) * 100, ylab="Percentage of titles", main="The proportion of comedies to other genres")`

			`pie(tab, main = "Proportion of comedies to other genres")`

			`###############################################################################`
			`#`
			`# EXAMPLE 2: How are ratings distributed for comedies?`
			`#`
			`###############################################################################`

			`# Plot the rating distribution for comedies`
			`hist(md[md$Comedy == "1", "rating"], xlab="Rating", ylab="Frequency", main="Histogram of ratings for comedies")`

			`# Box plots provide a visual display of the range and potential skewness of the data`
			`boxplot(md[md$Comedy == "1", "rating"], ylab="Rating", main="Boxplot of ratings for comedies")`

			`quantile(md$rating[md$Comedy == 1])`


			`###############################################################################`
			`#`
			`# EXAMPLE 3: Are comedies on average better rated than non-comedies?`
			`#`
			`###############################################################################`

			`# Select comedies`
			`comedy <- md$Comedy == "1"`

			`# Calculate the mean rating value for comedies and non-comedies`
			`mean(md[comedy,"rating"])`
			`mean(md[!comedy,"rating"])`

			`# Comedies have, on average, higher ratings than non-comedies`

			`# Side-by-side boxplots of ratings grouped by values of the attribute "Comedy"`
			`boxplot(rating ~ Comedy, data=md)`
			`boxplot(rating ~ Comedy, data=md, names=c("Other genres", "Comedies"), ylab="Rating", main="Comparison of ratings between comedies and non-comedies")`

			`## or with dplyr directly`
			`md %>% group_by(Comedy) %>% select(rating) %>% summarise(mean(rating))`

			`###############################################################################`
			`#`
			`# EXAMPLE 4: What is the proportion of comedies (per year) from 1990 onwards?`
			`#`
			`###############################################################################`

			`sel <- md$year >= 1990`

			`# the table() command can be used to get a two-way contigency table`
			`table(md$Comedy[sel], md$year[sel])`

			`table(md$year[sel])`

			`tabcomedy <- table(md$Comedy[sel], md$year[sel])`
			`tabyear <- table(md$year[sel])`
			`tabcomedy[2,]/tabyear`

			`ratio <- tabcomedy[2,]/tabyear`
			`barplot(ratio, xlab="Year", ylab="Relative frequency", main="Proportion of comedies")`

			`plot(x=names(ratio), y=as.vector(ratio), type="l", xlab="Year", ylab="Relative frequency", main="Proportion of comedies, 1990-2005")`

			`## or with dplyr directly`
			`md %>% filter(year >= 1990) %>%`
			`group_by(year, Comedy) %>%`
			`summarise(n = n()) %>% mutate(freq = n / sum(n)) %>%`
			`filter(Comedy == 1) %>% select(year, freq) %>%`
			`ggplot(aes(year, freq)) + geom_point() + ggtitle("Frequency of comedies") + ylab("Frequency") + xlab("Year") + geom_line() + theme_bw()`

			`###############################################################################`
			`#`
			`# EXAMPLE 5: Are there more movies above or below the average rating?`
			`#`
			`###############################################################################`

			`# the average rating`
			`mean(md$rating)`

			`# how many movies are above the average rating?`
			`tab <- table(md$rating > mean(md$rating))`
			`tab`

			`names(tab) <- c("below", "above")`
			`barplot(tab, ylab="Number of titles", main="Proportion of movies above and below the average rating")`
			`pie(tab, main="Proportion of movies above and below the average rating")`


			`# Box plots provide a summarization of the variable distribution`
			`boxplot(md$rating, ylab="Rating", main="Boxplot of movie ratings")`

			`# The horizontal line inside the box represents the median rating value`

			`# Let's plot the mean value...`
			`abline(h=mean(md$rating))`

			`## or with dplyr + ggplot`
			`md %>% mutate(mRate = mean(rating)) %>%`
			`mutate(indicator = ifelse(rating - mRate > 0, "above", "below")) %>%`
			`group_by(indicator) %>%`
			`summarise(counts = n()) %>%`
			`ggplot(aes(indicator, counts, fill = indicator))+ geom_bar(stat = "identity")`

			`# The mean differs from the median so the distribution is skewed.`
			`# We can conclude that there are more cases above the mean value.`


			`###############################################################################`
			`#`
			`# EXAMPLE 6: Do movies with bigger budgets get higher ratings?`
			`#`
			`###############################################################################`

			`# there are missing values in the budget attribute`
			`summary(md$budget)`

			`is.na(md$budget)`
			`table(is.na(md$budget))`
			`which(is.na(md$budget))`

			`# select complete observations only`
			`sel <- is.na(md$budget)`
			`mdsub <- md[!sel,]`

			`nrow(mdsub)`
			`summary(mdsub$budget)`


			`plot(mdsub$budget, mdsub$rating, xlab="Budget in $", ylab="Rating", main="Movie rating vs budget")`

			`# Plotted points are mostly located in the upper left part of the diagram,`
			`# which means that a higher budget usually leads to a higher rating`

			`# Utilization of the budget in terms of rating`
			`ratio <- mdsub$budget/mdsub$rating`
			`hist(ratio)`

			`# Which movie has the worst budget utilization?`
			`mdsub[which.max(ratio),]`


			`# Let's discretize these budgets to:`
			`# low (less than 1M), mid (between 1M and 50M) and big (more than 50M)`

			`disbudget <- cut(mdsub$budget, c(0, 1000000, 50000000, 500000000), labels=c("low", "mid", "big"))`
			`barplot(table(disbudget)/length(disbudget), xlab="Budget", ylab="Relative frequency", main="Proportion of movies vs budget")`

			`# Side-by-side boxplots of ratings grouped by budget values`
			`boxplot(mdsub$rating ~ disbudget, xlab="Budget", ylab="Rating", main="Boxplot of movie rating vs budget")`

			`## Is this dependent on the mpaa?`
			`## or with dplyr + ggplot + adding votes`
			`md %>% select(budget, rating, votes, mpaa) %>%`
			`na.omit() %>%`
			`ggplot(aes(budget, rating, color = votes, fill = mpaa)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x) + theme_bw()`

			`###############################################################################`
			`#`
			`# EXAMPLE 7:`
			`# What is the cumulative movie budget for each year from 1990 to 2000?`
			`# What is the average movie budget for each year from 1990 to 2000?`
			`# (consider only those movies for witch information on the budget is available!)`
			`#`
			`###############################################################################`

			`# Select the movies that contain information on their budgets`
			`sel <- !is.na(md$budget) & md$year >= 1990 & md$year <= 2000`

			`# We can calculate cumulative budget for each year using the "aggregate" function`

			`# Data overflow problem!`
			`aggregate(budget ~ year, data = md[sel,], sum)`

			`# The budget values are represented as integers`
			`typeof(md$budget)`

			`# In order to avoid the overflow problem we have to convert`
			`# the budget values into a double-precision representation (using the as.double() command)`
			`aggregate(as.double(budget) ~ year, data = md[sel,], sum)`

			`sum.budget <- aggregate(as.double(budget) ~ year, data = md[sel,], sum)`
			`plot(sum.budget, type="l", xlab="Year", ylab="Cumulative budget in $", main="Cumulative movie budget per year")`

			`avg.budget <- aggregate(as.double(budget) ~ year, data = md[sel,], mean)`
			`plot(avg.budget, type="l", xlab="Year", ylab="Average budget in $", main="Average movie budget per year")`

			`## or with dplyr`
			`md %>% select(budget, year) %>% na.omit() %>%`
			`group_by(year) %>% summarise(budget2 = sum(as.numeric(budget))) %>%`
			`arrange(year) %>% mutate(csum = cumsum(budget2)) %>%`
			`ggplot(aes(year, csum)) + geom_bar(stat = "identity") + theme_bw()`

			`##############################################################################`
			`#`
			`# EXAMPLE 8: (players dataset)`
			`# What is the average height for each season in the period from 1970 to 2000?`
			`#`
			`##############################################################################`

			`# Load the Players dataset`
			`players <- read.table("players.txt", sep=",", header = T)`
			`summary(players)`

			`# Create an empty vector`
			`h <- vector()`

			`# Use a for loop to go through each year in the period from 1970 to 2000`
			`for (y in 1970:2000)`
			`{`
			`# Select active players in that year`
			`sel <- players$firstseason <= y & players$lastseason >= y`

			`# Append the resulting vector with the mean height for the current year`
			`h <- c(h, mean(players$height[sel]))`
			`}`

			`# plot the resulting vector (use type="l" for lines)`
			`plot(1970:2000, h, type="l", xlab="Year", ylab="Height in cm", main="Average height in NBA")`

			`## or with dplyr`
			`dfx <- data.frame(year = 1970:2000,mh = h)`
			`dfx %>% ggplot(aes(year, mh)) + geom_point() + geom_smooth(method = "loess") + theme_bw() + xlab("Year") + ylab("Mean height")`