jebesh dva
parent
02a2f06e03
commit
7d334030e2
|
@ -0,0 +1,512 @@
|
||||||
|
l <- (TRUE, TRUE, FALSE, F, F, F T)
|
||||||
|
l <- (TRUE, TRUE, FALSE, F, F, F, T)
|
||||||
|
a <- (TRUE, TRUE, FALSE, F, F, F, T)
|
||||||
|
a <- c(TRUE, TRUE, FALSE, F, F, F, T)
|
||||||
|
mode(b)
|
||||||
|
mode(a)
|
||||||
|
install.packages('ggplot2')
|
||||||
|
install.packages('dplyr')
|
||||||
|
ls
|
||||||
|
ls
|
||||||
|
setcwd('/home/gasperspagnolo/Documents/faks/3-letnik/1sem/is/vaje/v3')
|
||||||
|
setwd('/home/gasperspagnolo/Documents/faks/3-letnik/1sem/is/vaje/v3')
|
||||||
|
ls
|
||||||
|
ls
|
||||||
|
# for example:
|
||||||
|
# setwd("c:\\labs\\data\\")
|
||||||
|
library(ggplot2)
|
||||||
|
library(dplyr)
|
||||||
|
# To read data from a text file, use the "read.table" command.
|
||||||
|
# The parameter header=TRUE indicates that the file to be read includes a first line with the column names
|
||||||
|
md <- read.table(file="movies.txt", sep=",", header=TRUE)
|
||||||
|
head(md)
|
||||||
|
summary(md)
|
||||||
|
summary(md)
|
||||||
|
str(md)
|
||||||
|
names(md)
|
||||||
|
md$Action <- as.factor(md$Action)
|
||||||
|
md$Animation <- as.factor(md$Animation)
|
||||||
|
str(md)
|
||||||
|
# The remaining columns will be transformed using the for loop
|
||||||
|
for (i in 20:24)
|
||||||
|
md[,i] <- as.factor(md[,i])
|
||||||
|
md[30,]
|
||||||
|
md[30,3]
|
||||||
|
md[30,"length"]
|
||||||
|
md[,3]
|
||||||
|
md$length
|
||||||
|
plot(md$length)
|
||||||
|
hist(md$length)
|
||||||
|
plot(density(md$length))
|
||||||
|
boxplot(md$length)
|
||||||
|
barplot(table(md$Drama))
|
||||||
|
pie(table(md$mpaa))
|
||||||
|
## nicer plots with ggplot2 + dplyr
|
||||||
|
md %>% ggplot(aes(length)) + geom_histogram(bins = 40) + ggtitle("A genetic histogram") + xlab("Length"
|
||||||
|
## nicer plots with ggplot2 + dplyr
|
||||||
|
md %>% ggplot(aes(length)) + geom_histogram(bins = 40) + ggtitle("A genetic histogram") + xlab("Length")
|
||||||
|
## nicer plots with ggplot2 + dplyr
|
||||||
|
md %>% ggplot(aes(length)) + geom_histogram(bins = 40) + ggtitle("A genetic histogram") + xlab("Length")
|
||||||
|
## plotting w.r.t. multiple mpaa categories
|
||||||
|
md %>% ggplot(aes(length,fill = mpaa)) + geom_density(alpha = 0.2)
|
||||||
|
## What about a nicer boxplot w.r.t mpaa?
|
||||||
|
## theme_bw() is more neutral theme
|
||||||
|
md %>% ggplot(aes(Drama, rating, color = mpaa)) + geom_boxplot() + theme_bw()
|
||||||
|
barplot(table(md$Comedy))
|
||||||
|
barplot(table(md$Comedy))
|
||||||
|
pie(table(md$Comedy))
|
||||||
|
tab <- table(md$Comedy)
|
||||||
|
names(tab) <- c("Other genres", "Comedies")
|
||||||
|
tab
|
||||||
|
barplot(table(md$Comedy))
|
||||||
|
pie(table(md$Comedy))
|
||||||
|
tab <- table(md$Comedy)
|
||||||
|
names(tab) <- c("Other genres", "Comedies")
|
||||||
|
tab
|
||||||
|
tab <- table(md$Comedy)
|
||||||
|
names(tab) <- c("Other genres", "Comedies")
|
||||||
|
tab
|
||||||
|
pie(tab)
|
||||||
|
sum(tab)
|
||||||
|
barplot(tab, ylab="Number of titles", main="Proportion of comedies to other genres")
|
||||||
|
barplot(tab / sum(tab) * 100, ylab="Percentage of titles", main="The proportion of comedies to other genres")
|
||||||
|
pie(tab, main = "Proportion of comedies to other genres")
|
||||||
|
# Plot the rating distribution for comedies
|
||||||
|
hist(md[md$Comedy == "1", "rating"], xlab="Rating", ylab="Frequency", main="Histogram of ratings for comedies")
|
||||||
|
boxplot(md[md$Comedy == "1", "rating"], ylab="Rating", main="Boxplot of ratings for comedies")
|
||||||
|
quantile(md$rating[md$Comedy == 1])
|
||||||
|
comedy <- md$Comedy == "1"
|
||||||
|
# Calculate the mean rating value for comedies and non-comedies
|
||||||
|
mean(md[comedy,"rating"])
|
||||||
|
mean(md[!comedy,"rating"])
|
||||||
|
boxplot(rating ~ Comedy, data=md)
|
||||||
|
boxplot(rating ~ Comedy, data=md, names=c("Other genres", "Comedies"), ylab="Rating", main="Comparison of ratings between comedies and non-comedies")
|
||||||
|
md %>% group_by(Comedy) %>% select(rating) %>% summarise(mean(rating))
|
||||||
|
md %>% group_by(Comedy)
|
||||||
|
sel <- md$year >= 1990
|
||||||
|
table(md$Comedy[sel], md$year[sel])
|
||||||
|
table(md$year[sel])
|
||||||
|
tabcomedy <- table(md$Comedy[sel], md$year[sel])
|
||||||
|
tabyear <- table(md$year[sel])
|
||||||
|
tabcomedy[2,]/tabyear
|
||||||
|
ratio <- tabcomedy[2,]/tabyear
|
||||||
|
barplot(ratio, xlab="Year", ylab="Relative frequency", main="Proportion of comedies")
|
||||||
|
plot(x=names(ratio), y=as.vector(ratio), type="l", xlab="Year", ylab="Relative frequency", main="Proportion of comedies, 1990-2005")
|
||||||
|
## Is this dependent on the mpaa?
|
||||||
|
## or with dplyr + ggplot + adding votes
|
||||||
|
md %>% select(budget, rating, votes, mpaa) %>%
|
||||||
|
na.omit() %>%
|
||||||
|
ggplot(aes(budget, rating, color = votes, fill = mpaa)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x) + theme_bw()
|
||||||
|
## or with dplyr directly
|
||||||
|
md %>% filter(year >= 1990) %>%
|
||||||
|
group_by(year, Comedy) %>%
|
||||||
|
summarise(n = n()) %>% mutate(freq = n / sum(n)) %>%
|
||||||
|
filter(Comedy == 1) %>% select(year, freq) %>%
|
||||||
|
ggplot(aes(year, freq)) + geom_point() + ggtitle("Frequency of comedies") + ylab("Frequency") + xlab("Year")
|
||||||
|
md %>% filter(year >= 1990)
|
||||||
|
summary(md$budget)
|
||||||
|
is.na(md$budget)
|
||||||
|
table(is.na(md$budget))
|
||||||
|
which(is.na(md$budget))
|
||||||
|
# select complete observations only
|
||||||
|
sel <- is.na(md$budget)
|
||||||
|
mdsub <- md[!sel,]
|
||||||
|
nrow(mdsub)
|
||||||
|
summary(mdsub$budget)
|
||||||
|
plot(mdsub$budget, mdsub$rating, xlab="Budget in $", ylab="Rating", main="Movie rating vs budget")
|
||||||
|
# Plotted points are mostly located in the upper left part of the diagram,
|
||||||
|
# which means that a higher budget usually leads to a higher rating
|
||||||
|
# Utilization of the budget in terms of rating
|
||||||
|
ratio <- mdsub$budget/mdsub$rating
|
||||||
|
hist(ratio)
|
||||||
|
# Which movie has the worst budget utilization?
|
||||||
|
mdsub[which.max(ratio),]
|
||||||
|
# Let's discretize these budgets to:
|
||||||
|
# low (less than 1M), mid (between 1M and 50M) and big (more than 50M)
|
||||||
|
disbudget <- cut(mdsub$budget, c(0, 1000000, 50000000, 500000000), labels=c("low", "mid", "big"))
|
||||||
|
barplot(table(disbudget)/length(disbudget), xlab="Budget", ylab="Relative frequency", main="Proportion of movies vs budget")
|
||||||
|
# Side-by-side boxplots of ratings grouped by budget values
|
||||||
|
boxplot(mdsub$rating ~ disbudget, xlab="Budget", ylab="Rating", main="Boxplot of movie rating vs budget")
|
||||||
|
## Is this dependent
|
||||||
|
## or with dplyr + ggplot + adding votes
|
||||||
|
md %>% select(budget, rating, votes, mpaa) %>%
|
||||||
|
na.omit() %>%
|
||||||
|
ggplot(aes(budget, rating, color = votes, fill = mpaa)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x) + theme_bw()
|
||||||
|
## or with dplyr
|
||||||
|
md %>% select(budget, year) %>% na.omit() %>%
|
||||||
|
group_by(year) %>% summarise(budget2 = sum(as.numeric(budget))) %>%
|
||||||
|
arrange(year) %>% mutate(csum = cumsum(budget2)) %>%
|
||||||
|
ggplot(aes(year, csum)) + geom_bar(stat = "identity") + theme_bw()
|
||||||
|
## Is this dependent on the mpaa?
|
||||||
|
## or with dplyr + ggplot + adding votes
|
||||||
|
md %>% select(budget, rating, votes, mpaa) %>%
|
||||||
|
na.omit() %>%
|
||||||
|
ggplot(aes(budget, rating, color = votes, fill = mpaa)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x) + theme_bw()
|
||||||
|
#######################################
|
||||||
|
md <- read.table("movies.txt", sep=",", header=TRUE)
|
||||||
|
# - Are there more movies shorter than 100 min or longer than (or equal to) 100 minutes?
|
||||||
|
# (show your answer numerically and graphically)
|
||||||
|
md
|
||||||
|
md <- read.table("movies.txt", sep=",", header=TRUE)
|
||||||
|
# - Are there more movies shorter than 100 min or longer than (or equal to) 100 minutes?
|
||||||
|
# (show your answer numerically and graphically)
|
||||||
|
movies_shorter_or_equal_100_min <- md$length >= 100
|
||||||
|
movies_shorter_or_equal_100_min
|
||||||
|
barplot(movies, xlab="Year", ylab="Relative frequency", main="Proportion of comedies")
|
||||||
|
barplot(movies_shorter_or_equal_100_min, xlab="Year", ylab="Relative frequency", main="Proportion of comedies")
|
||||||
|
md[30,]
|
||||||
|
md[30,3]
|
||||||
|
md[30,"length"]
|
||||||
|
md[,3]
|
||||||
|
md$length
|
||||||
|
# Useful data visualization functions
|
||||||
|
plot(md$length)
|
||||||
|
hist(md$length)
|
||||||
|
plot(density(md$length))
|
||||||
|
boxplot(md$length)
|
||||||
|
barplot(table(md$Drama))
|
||||||
|
pie(table(md$mpaa))
|
||||||
|
## nicer plots with
|
||||||
|
## nicer plots with ggplot2 + dplyr
|
||||||
|
md %>% ggplot(aes(length)) + geom_histogram(bins = 40) + ggtitle("A genetic histogram") + xlab("Length")
|
||||||
|
tab <- table(md$Comedy)
|
||||||
|
names(tab) <- c("Other genres", "Comedies")
|
||||||
|
tab
|
||||||
|
pie(tab)
|
||||||
|
tab
|
||||||
|
tab <- table(md$Comedy)
|
||||||
|
tab
|
||||||
|
hist(md[md$Comedy == "1", "rating"], xlab="Rating", ylab="Frequency", main="Histogram of ratings for comedies")
|
||||||
|
pie(table(md$Comedy))
|
||||||
|
md$Comedy
|
||||||
|
md$length
|
||||||
|
names(tab) <- c("Other genres", "Comedies")
|
||||||
|
tab
|
||||||
|
pie(tab)
|
||||||
|
barplot(table(md$length))
|
||||||
|
movies_shorter_or_equal_100_min <- md$length >= 100
|
||||||
|
table(movies_shorter_or_equal_100_min)
|
||||||
|
movies_shorter_or_equal_100_min <- md$length >= 100
|
||||||
|
tab <- table(movies_shorter_or_equal_100_min)
|
||||||
|
names(tab) <- c("Less than 100", "More or equal to 100")
|
||||||
|
pie(tab)
|
||||||
|
df
|
||||||
|
md
|
||||||
|
md$Comedy
|
||||||
|
md$Comedy
|
||||||
|
md <- read.table("movies.txt", sep=",", header=TRUE)
|
||||||
|
# We will transform binary attributes into nominal variables with a fixed number of possible values (factors)
|
||||||
|
md$Action <- as.factor(md$Action)
|
||||||
|
md$Animation <- as.factor(md$Animation)
|
||||||
|
# The remaining columns will be transformed using the for loop
|
||||||
|
for (i in 20:24)
|
||||||
|
md[,i] <- as.factor(md[,i])
|
||||||
|
movies_shorter_or_equal_100_min <- md$length >= 100
|
||||||
|
tab <- table(movies_shorter_or_equal_100_min)
|
||||||
|
names(tab) <- c("Less than 100", "More or equal to 100")
|
||||||
|
pie(tab)
|
||||||
|
<- md$Romance
|
||||||
|
romantic_comedies
|
||||||
|
romantic_comedies <- md$Romance
|
||||||
|
romantic_comedies
|
||||||
|
romantic_comedies <- sum(md$Romance == 1)
|
||||||
|
romantic_comedies
|
||||||
|
## or with dplyr directly
|
||||||
|
md %>% filter(year >= 1990) %>%
|
||||||
|
group_by(year, Comedy) %>%
|
||||||
|
summarise(n = n()) %>% mutate(freq = n / sum(n)) %>%
|
||||||
|
filter(Comedy == 1) %>% select(year, freq) %>%
|
||||||
|
ggplot(aes(year, freq)) + geom_point() + ggtitle("Frequency of comedies") + ylab("Frequency") + xlab("Year") + geom_line() + theme_bw()
|
||||||
|
######################################################
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% group_by(Comedy, Action)
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% group_by(Comedy, Action) %>% summarise(n = n())
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% group_by(Action, Comedy) %>% summarise(n = n())
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% group_by(Action == 1, Comedy) %>% summarise(n = n())
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% group_by(Action == 1, Comedy == 1) %>% summarise(n = n())
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% filter(Action == 1, Comedy == 1) %>% summarise(n = n())
|
||||||
|
md %>% filter(Action == 1, Comedy == 1) %>% summarise(n = n())
|
||||||
|
md %>% filter(Action == 1, Comedy == 1) %>% summarise(n = n())
|
||||||
|
md %>% filter(Romantic == 1, Comedy == 1) %>% summarise(n = n())
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% filter(Comedy == 1) %>% summarise(n = n())
|
||||||
|
md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_actions = sum(Romance == 1))
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_action_movies = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_Romances = sum(Romance == 1)) %>%
|
||||||
|
select(number_of_action_movies, number_of_Romances)
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_action_movies = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_Romances = sum(Romance == 1)) %>%
|
||||||
|
select(number_of_action_movies, number_of_Romances)
|
||||||
|
md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_action_movies = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_Romances = sum(Romance == 1)) %>%
|
||||||
|
select(number_of_action_movies, number_of_Romances) >%>%
|
||||||
|
unique()
|
||||||
|
md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_action_movies = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_Romances = sum(Romance == 1)) %>%
|
||||||
|
select(number_of_action_movies, number_of_Romances) %>%
|
||||||
|
unique()
|
||||||
|
md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_action_movies = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_Romances = sum(Romance == 1)) %>%
|
||||||
|
select(number_of_action_movies, number_of_Romances) %>%
|
||||||
|
unique() %>% pie()
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_action_movies = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_Romances = sum(Romance == 1)) %>%
|
||||||
|
select(number_of_action_movies, number_of_Romances) %>%
|
||||||
|
unique()
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_action_movies = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_Romances = sum(Romance == 1)) %>%
|
||||||
|
select(number_of_action_movies, number_of_Romances) %>%
|
||||||
|
unique()
|
||||||
|
# - Are there more action comedies or romantic comedies?
|
||||||
|
md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_action_movies = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_Romances = sum(Romance == 1)) %>%
|
||||||
|
unique() %>%
|
||||||
|
select(number_of_action_movies, number_of_Romances)
|
||||||
|
md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_action_movies = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_Romances = sum(Romance == 1)) %>%
|
||||||
|
unique() %>%
|
||||||
|
select(number_of_action_movies, number_of_Romances)
|
||||||
|
md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_action_movies = sum(Action == 1)) %>%
|
||||||
|
mutate(number_of_Romances = sum(Romance == 1)) %>%
|
||||||
|
select(number_of_action_movies, number_of_Romances) %>%
|
||||||
|
unique()
|
||||||
|
# - Plot a histogram of the ratings for drama movies.
|
||||||
|
head(md)
|
||||||
|
ratio <- mdsub$budget/mdsub$rating
|
||||||
|
hist(ratio)
|
||||||
|
plot(mdsub$budget, mdsub$rating, xlab="Budget in $", ylab="Rating", main="Movie rating vs budget")
|
||||||
|
ratio <- mdsub$budget/mdsub$rating
|
||||||
|
hist(ratio)
|
||||||
|
hist(md$mpaa)
|
||||||
|
hist(md$mpaa)
|
||||||
|
hist(md$r1)
|
||||||
|
(md$r1)
|
||||||
|
hist(md$r1)
|
||||||
|
md %>% filter(Drama == 1) %>% hist()
|
||||||
|
md %>% filter(Drama == 1
|
||||||
|
md %>% filter(Drama == 1)
|
||||||
|
md %>% filter(Drama == 1)
|
||||||
|
md %>% filter(Drama == 1) %>% select(raiting)
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating)
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>% hist()
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>% class()
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating)
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>% hist(rating)
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>% hist(.$rating)
|
||||||
|
md %>% filter(Drama == 1) %>% tidyr::gather(rating, value) %>% ggplot(aes(value)
|
||||||
|
md %>% filter(Drama == 1) %>% tidyr::gather(rating, value) %>% ggplot(aes(value))
|
||||||
|
md %>% filter(Drama == 1) %>% tidyr::gather(rating, value) %>% ggplot(aes(value))
|
||||||
|
md %>% filter(Drama == 1) %>% ggplot(aes(rating))
|
||||||
|
md %>% filter(Drama == 1) %>% ggplot(rating)
|
||||||
|
md %>% filter(Drama == 1) %>% ggplot(rating)
|
||||||
|
aes(
|
||||||
|
md %>% filter(Drama == 1) %>% ggplot(aes(rating)) geom_histogram(bins = 10))
|
||||||
|
md %>% filter(Drama == 1) %>% ggplot(aes(rating)) geom_histogram(bins = 10))
|
||||||
|
md %>% filter(Drama == 1) %>% ggplot(aes(rating)) + eom_histogram(bins = 10))
|
||||||
|
md %>% filter(Drama == 1) %>% ggplot(aes(rating)) + geom_histogram(bins = 10))
|
||||||
|
md %>% filter(Drama == 1) %>% ggplot(aes(rating)) + geom_histogram(bins = 10)
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>% mutate(higher_than_avg_rating = rating > mean(rating))
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>% mutate(higher_than_avg_rating = rating > mean(rating))%>%
|
||||||
|
pie(table(aes(sum(rating == TRUE), aes(sum(rating == FALSE))
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>% mutate(higher_than_avg_rating = rating > mean(rating))%>%
|
||||||
|
pie(table(aes(sum(rating == TRUE)), aes(sum(rating == FALSE)))
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>% mutate(higher_than_avg_rating = rating > mean(rating))%>%
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating))
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating)
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(counts = n())
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE))
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>%
|
||||||
|
pie()
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>%
|
||||||
|
pie()
|
||||||
|
ggplot(aes(.))) +
|
||||||
|
geom_bar(stat="identity", width=1) +
|
||||||
|
coord_polar("y", start=0)
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>%
|
||||||
|
ggplot(aes(.)) +
|
||||||
|
geom_bar(stat="identity", width=1) +
|
||||||
|
coord_polar("y", start=0)
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>%
|
||||||
|
ggplot(aes(.)) +
|
||||||
|
coord_polar("y", start=0)
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>%
|
||||||
|
ggplot(aes(.)) +
|
||||||
|
coord_polar("y", start=0)
|
||||||
|
summary <- md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>%
|
||||||
|
summary
|
||||||
|
summary <- md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>%
|
||||||
|
summary
|
||||||
|
summary
|
||||||
|
h <- md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>%
|
||||||
|
h
|
||||||
|
pie(h)
|
||||||
|
h <- md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>%
|
||||||
|
h <- md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE))
|
||||||
|
md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE))
|
||||||
|
lol <- md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating) %>%
|
||||||
|
summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE))
|
||||||
|
lol
|
||||||
|
pie(lol)
|
||||||
|
lol <- md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating)# %>%
|
||||||
|
# summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE))
|
||||||
|
pie(lol)
|
||||||
|
# summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE))
|
||||||
|
lol
|
||||||
|
pie(table(lol))
|
||||||
|
lol <- md %>% filter(Drama == 1) %>% select(rating) %>%
|
||||||
|
mutate(higher_than_avg_rating = rating > mean(rating)) %>%
|
||||||
|
select(higher_than_avg_rating)# %>%
|
||||||
|
# summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE))
|
||||||
|
pie(table(lol))
|
||||||
|
pie(table(lol))
|
||||||
|
sel <- md$year >= 1990
|
||||||
|
# the table() command can be used to get a two-way contigency table
|
||||||
|
table(md$Comedy[sel], md$year[sel])
|
||||||
|
sel <- md$year >= 1999 && md$year <= 2005
|
||||||
|
table(md$Animation[sel])
|
||||||
|
sel <- md$year >= 1999 && md$year <= 2005
|
||||||
|
table(md$Animation[sel], md$year[sel])
|
||||||
|
table(md$Animation[sel] == 1, md$year[sel])
|
||||||
|
sel <- md$year >= 1999 && md$year <= 2005
|
||||||
|
table(md$Animation[sel], md$year[sel])
|
||||||
|
tabcomedy <- table(md$Comedy[sel], md$year[sel])
|
||||||
|
tabyear <- table(md$year[sel])
|
||||||
|
tabcomedy[2,]/tabyear
|
||||||
|
tabyear <- table(md$year[sel])
|
||||||
|
tabyear
|
||||||
|
tabcomedy <- table(md$Comedy[sel], md$year[sel])
|
||||||
|
tabcomedy <- table(md$Comedy[sel], md$year[sel])
|
||||||
|
tabcomedy
|
||||||
|
table(md$year[sel])
|
||||||
|
table(md$Comedy[sel], md$year[sel])
|
||||||
|
table(md$year[sel])
|
||||||
|
tabcomedy <- table(md$Comedy[sel], md$year[sel])
|
||||||
|
table(md$Comedy[sel], md$year[sel])
|
||||||
|
tabcomedy <- table(md$Comedy[sel], md$year[sel])
|
||||||
|
tabcomedy
|
||||||
|
tabyear <- table(md$year[sel])
|
||||||
|
tabyear
|
||||||
|
ratio <- tabcomedy[2,]/tabyear
|
||||||
|
## or with dplyr directly
|
||||||
|
md %>% filter(year >= 1990) %>%
|
||||||
|
group_by(year, Comedy) %>%
|
||||||
|
summarise(n = n()) %>% mutate(freq = n / sum(n)) %>%
|
||||||
|
filter(Comedy == 1) %>% select(year, freq) %>%
|
||||||
|
ggplot(aes(year, freq)) + geom_point() + ggtitle("Frequency of comedies") + ylab("Frequency") + xlab("Year") + geom_line() + theme_bw()
|
||||||
|
ratio <- tabcomedy[2,]/tabyear
|
||||||
|
barplot(ratio, xlab="Year", ylab="Relative frequency", main="Proportion of comedies")
|
||||||
|
ratio <- tabcomedy[2,]/tabyear
|
||||||
|
sel <- md$year >= 1999 && md$year <= 2005
|
||||||
|
table(md$Animation[sel], md$year[sel])
|
||||||
|
tt[,2]
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
tt[,2]
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
tt[2,]
|
||||||
|
sel <- md$year >= 1999 && md$year <= 2005
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
sel <- md$year >= 1999 && md$year <= 2005
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
tt[2,]
|
||||||
|
sel <- md$year >= 1999 && md$year <= 2005
|
||||||
|
sel
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
tt[2,]
|
||||||
|
sel <- md$year >= 1999 && md$year <= 2005
|
||||||
|
sel
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
tt[2,]
|
||||||
|
# - Plot the number of animated movies being produced every year for the period 1995-2005.
|
||||||
|
sel <- md$year >= 1999 && md$year <= 2005
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
tt[2,]
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
tt[3,]
|
||||||
|
sel <- md$year >= 1999 && md$year <= 2005
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
tt[1,]
|
||||||
|
sel <- md$year >= 1999 && md$year <= 2005
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
tt[2,]
|
||||||
|
#####################
|
||||||
|
barplot(tt[2,], xlab="Year", ylab="Relative frequency", main="Proportion of comedies")
|
||||||
|
sel <- md$year >= 1999 and md$year <= 2005
|
||||||
|
sel <- md[md$year >= 1999 && md$year <= 2005]
|
||||||
|
sel <- md[md$year >= 1999 & md$year <= 2005]
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
sel <- md[md$year >= 1999 & md$year <= 2005]
|
||||||
|
sel <- md$year >= 1999 & md$year <= 2005
|
||||||
|
sel
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
barplot(tt[2,], xlab="Year", ylab="Relative frequency", main="Proportion of animated movies")
|
||||||
|
sel <- md$year >= 1999 & md$year <= 2005
|
||||||
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
|
barplot(tt[2,], xlab="Year", ylab="Relative frequency", main="Proportion of animated movies")
|
||||||
|
#######################
|
|
@ -70,7 +70,7 @@ pie(table(lol))
|
||||||
# - Plot the number of animated movies being produced every year for the period 1995-2005.
|
# - Plot the number of animated movies being produced every year for the period 1995-2005.
|
||||||
sel <- md$year >= 1999 & md$year <= 2005
|
sel <- md$year >= 1999 & md$year <= 2005
|
||||||
tt <- table(md$Animation[sel], md$year[sel])
|
tt <- table(md$Animation[sel], md$year[sel])
|
||||||
barplot(tt[2,], xlab="Year", ylab="Relative frequency", main="Proportion of animated movies")
|
wbarplot(tt[2,], xlab="Year", ylab="Relative frequency", main="Proportion of animated movies")
|
||||||
|
|
||||||
# - Is there a clear boundary between short and feature movies (according to their length)?
|
# - Is there a clear boundary between short and feature movies (according to their length)?
|
||||||
|
|
||||||
|
@ -93,3 +93,15 @@ barplot(tt[2,], xlab="Year", ylab="Relative frequency", main="Proportion of anim
|
||||||
#
|
#
|
||||||
#######################################################################################################################
|
#######################################################################################################################
|
||||||
|
|
||||||
|
players <- read.table("players.txt", sep=",", header = T)
|
||||||
|
|
||||||
|
# - Plot the proportion of players according to playing positions.
|
||||||
|
position_proportion <- players %>% group_by(position) %>% select(position) %>% table()
|
||||||
|
pie(position_proportion)
|
||||||
|
|
||||||
|
# - Compare career rebounds (the "reb" attribute) with respect to playing position
|
||||||
|
boxplot(reb ~ position, data=players, ylab="Carrer rebounds", main="Comparison of carrer rebounds with respect to playing position")
|
||||||
|
|
||||||
|
# - Show the distribution of free throw percentages.
|
||||||
|
# The percentage is determined by dividing the number of shots made ("ftm") by the total number of shots attempted ("fta").
|
||||||
|
ftp <- players %>% mutate(free_throw_percentage = ftm/fta *100) %>% select(free_throw_percentage)
|
||||||
|
|
Loading…
Reference in New Issue