is-vaje/v3/lab3_visualization.R

##############################################################################
#
# DATA VISUALIZATION
#
##############################################################################

# Please download data files "movies.txt" and "players.txt" into a local directory
# then set that directory as the current working directory of R.
# You can achive this using the "setwd" command or by selecting "File -> Change dir..."

# for example:
# setwd("c:\\labs\\data\\")
library(ggplot2)
library(dplyr)

# To read data from a text file, use the "read.table" command.
# The parameter header=TRUE indicates that the file to be read includes a first line with the column names
md <- read.table(file="movies.txt", sep=",", header=TRUE)

# To get more information on any specific named function, type "?" followed by the function name
?read.table

# Useful functions
head(md)
summary(md)
str(md)
names(md)


# We will transform binary attributes into nominal variables with a fixed number of possible values (factors)
md$Action <- as.factor(md$Action)
md$Animation <- as.factor(md$Animation)

# The remaining columns will be transformed using the for loop
for (i in 20:24)
  md[,i] <- as.factor(md[,i])

#
# Type conversion functions:
#
# as.numeric
# as.integer
# as.character
# as.logical
# as.factor
# as.ordered
#
# values that cannot be converted to the specified type will be converted to a NA value
#

# Binary attributes are now represented as factors
summary(md)

# Accessing data frame elements...
md[30,]
md[30,3]
md[30,"length"]
md[,3]
md$length

# Useful data visualization functions
plot(md$length)
hist(md$length)
plot(density(md$length))
boxplot(md$length)
barplot(table(md$Drama))
pie(table(md$mpaa))

## nicer plots with ggplot2 + dplyr
md %>% ggplot(aes(length)) + geom_histogram(bins = 40) + ggtitle("A genetic histogram") + xlab("Length")

## plotting w.r.t. multiple mpaa categories
md %>% ggplot(aes(length,fill = mpaa)) + geom_density(alpha = 0.2)

## What about a nicer boxplot w.r.t mpaa?
## theme_bw() is more neutral theme
md %>% ggplot(aes(Drama, rating, color = mpaa)) + geom_boxplot() + theme_bw()

## show table view

###############################################################################
#
# EXAMPLE 1: What is the proportion of comedies to other genres in our data set?
#
###############################################################################

# the table() command gives the frequency of values in the vector
table(md$Comedy)

# the proportion of comedies can be plotted
barplot(table(md$Comedy))
pie(table(md$Comedy))


#  it is important to always label graphs ...

tab <- table(md$Comedy)
names(tab) <- c("Other genres", "Comedies")
tab
pie(tab)

sum(tab)

barplot(tab, ylab="Number of titles", main="Proportion of comedies to other genres")
barplot(tab / sum(tab) * 100, ylab="Percentage of titles", main="The proportion of comedies to other genres")

pie(tab, main = "Proportion of comedies to other genres")

###############################################################################
#
# EXAMPLE 2: How are ratings distributed for comedies?
#
###############################################################################

# Plot the rating distribution for comedies
hist(md[md$Comedy == "1", "rating"], xlab="Rating", ylab="Frequency", main="Histogram of ratings for comedies")

# Box plots provide a visual display of the range and potential skewness of the data
boxplot(md[md$Comedy == "1", "rating"], ylab="Rating", main="Boxplot of ratings for comedies")

quantile(md$rating[md$Comedy == 1])


###############################################################################
#
# EXAMPLE 3: Are comedies on average better rated than non-comedies?
#
###############################################################################

# Select comedies
comedy <- md$Comedy == "1"

# Calculate the mean rating value for comedies and non-comedies
mean(md[comedy,"rating"])
mean(md[!comedy,"rating"])

# Comedies have, on average, higher ratings than non-comedies

# Side-by-side boxplots of ratings grouped by values of the attribute "Comedy"
boxplot(rating ~ Comedy, data=md)
boxplot(rating ~ Comedy, data=md, names=c("Other genres", "Comedies"), ylab="Rating", main="Comparison of ratings between comedies and non-comedies")

## or with dplyr directly
md %>% group_by(Comedy) %>% select(rating) %>% summarise(mean(rating))

###############################################################################
#
# EXAMPLE 4: What is the proportion of comedies (per year) from 1990 onwards?
#
###############################################################################

sel <- md$year >= 1990

# the table() command can be used to get a two-way contigency table
table(md$Comedy[sel], md$year[sel])

table(md$year[sel])

tabcomedy <- table(md$Comedy[sel], md$year[sel])
tabyear <- table(md$year[sel])
tabcomedy[2,]/tabyear

ratio <- tabcomedy[2,]/tabyear
barplot(ratio, xlab="Year", ylab="Relative frequency", main="Proportion of comedies")

plot(x=names(ratio), y=as.vector(ratio), type="l", xlab="Year", ylab="Relative frequency", main="Proportion of comedies, 1990-2005")

## or with dplyr directly
md %>% filter(year >= 1990) %>%
  group_by(year, Comedy) %>%
  summarise(n = n()) %>% mutate(freq = n / sum(n)) %>%
  filter(Comedy == 1) %>% select(year, freq) %>%
  ggplot(aes(year, freq)) + geom_point() + ggtitle("Frequency of comedies") + ylab("Frequency") + xlab("Year") + geom_line() + theme_bw()

###############################################################################
#
# EXAMPLE 5: Are there more movies above or below the average rating?
#
###############################################################################

# the average rating
mean(md$rating)

# how many movies are above the average rating?
tab <- table(md$rating > mean(md$rating))
tab

names(tab) <- c("below", "above")
barplot(tab, ylab="Number of titles", main="Proportion of movies above and below the average rating")
pie(tab, main="Proportion of movies above and below the average rating")


# Box plots provide a summarization of the variable distribution
boxplot(md$rating, ylab="Rating", main="Boxplot of movie ratings")

# The horizontal line inside the box represents the median rating value

# Let's plot the mean value...
abline(h=mean(md$rating))

## or with dplyr + ggplot
md %>% mutate(mRate = mean(rating)) %>%
  mutate(indicator = ifelse(rating - mRate > 0, "above", "below")) %>%
  group_by(indicator) %>%
  summarise(counts = n()) %>%
  ggplot(aes(indicator, counts, fill = indicator))+ geom_bar(stat = "identity")

# The mean differs from the median so the distribution is skewed.
# We can conclude that there are more cases above the mean value.


###############################################################################
#
# EXAMPLE 6: Do movies with bigger budgets get higher ratings?
#
###############################################################################

# there are missing values in the budget attribute
summary(md$budget)

is.na(md$budget)
table(is.na(md$budget))
which(is.na(md$budget))

# select complete observations only
sel <- is.na(md$budget)
mdsub <- md[!sel,]

nrow(mdsub)
summary(mdsub$budget)


plot(mdsub$budget, mdsub$rating, xlab="Budget in $", ylab="Rating", main="Movie rating vs budget")

# Plotted points are mostly located in the upper left part of the diagram,
# which means that a higher budget usually leads to a higher rating

# Utilization of the budget in terms of rating
ratio <- mdsub$budget/mdsub$rating
hist(ratio)

# Which movie has the worst budget utilization?
mdsub[which.max(ratio),]


# Let's discretize these budgets to:
# low (less than 1M), mid (between 1M and 50M) and big (more than 50M)

disbudget <- cut(mdsub$budget, c(0, 1000000, 50000000, 500000000), labels=c("low", "mid", "big"))
barplot(table(disbudget)/length(disbudget), xlab="Budget", ylab="Relative frequency", main="Proportion of movies vs budget")

# Side-by-side boxplots of ratings grouped by budget values
boxplot(mdsub$rating ~ disbudget, xlab="Budget", ylab="Rating", main="Boxplot of movie rating vs budget")

## Is this dependent on the mpaa?
## or with dplyr + ggplot + adding votes
md %>% select(budget, rating, votes, mpaa) %>%
  na.omit() %>%
  ggplot(aes(budget, rating, color = votes, fill = mpaa)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x) + theme_bw()

###############################################################################
#
# EXAMPLE 7:
# What is the cumulative movie budget for each year from 1990 to 2000?
# What is the average movie budget for each year from 1990 to 2000?
# (consider only those movies for witch information on the budget is available!)
#
###############################################################################

# Select the movies that contain information on their budgets
sel <- !is.na(md$budget) & md$year >= 1990 & md$year <= 2000

# We can calculate cumulative budget for each year using the "aggregate" function

# Data overflow problem!
aggregate(budget ~ year, data = md[sel,], sum)

# The budget values are represented as integers
typeof(md$budget)

# In order to avoid the overflow problem we have to convert
# the budget values into a double-precision representation (using the as.double() command)
aggregate(as.double(budget) ~ year, data = md[sel,], sum)

sum.budget <- aggregate(as.double(budget) ~ year, data = md[sel,], sum)
plot(sum.budget, type="l", xlab="Year", ylab="Cumulative budget in $", main="Cumulative movie budget per year")

avg.budget <- aggregate(as.double(budget) ~ year, data = md[sel,], mean)
plot(avg.budget, type="l", xlab="Year", ylab="Average budget in $", main="Average movie budget per year")

## or with dplyr
md %>% select(budget, year) %>% na.omit() %>%
  group_by(year) %>% summarise(budget2 = sum(as.numeric(budget))) %>%
  arrange(year) %>% mutate(csum = cumsum(budget2)) %>%
  ggplot(aes(year, csum)) + geom_bar(stat = "identity") + theme_bw()

##############################################################################
#
# EXAMPLE 8: (players dataset)
# What is the average height for each season in the period from 1970 to 2000?
#
##############################################################################

# Load the Players dataset
players <- read.table("players.txt", sep=",", header = T)
summary(players)

# Create an empty vector
h <- vector()

# Use a for loop to go through each year in the period from 1970 to 2000
for (y in 1970:2000)
{
  # Select active players in that year
  sel <- players$firstseason <= y & players$lastseason >= y

  # Append the resulting vector with the mean height for the current year
  h <- c(h, mean(players$height[sel]))
}

# plot the resulting vector (use type="l" for lines)
plot(1970:2000, h, type="l", xlab="Year", ylab="Height in cm", main="Average height in NBA")

## or with dplyr
dfx <- data.frame(year = 1970:2000,mh = h)
dfx %>% ggplot(aes(year, mh)) + geom_point() + geom_smooth(method = "loess") + theme_bw() + xlab("Year") + ylab("Mean height")