#######################################################################################################################
#
# PROBLEMS
#
#######################################################################################################################
#
# Load the Movies dataset using the command:
#
# md <- read.table("movies.txt", sep=",", header=TRUE)
#
# Answer the following questions:
#
# - Are there more movies shorter than 100 min or longer than (or equal to) 100 minutes?
#   (show your answer numerically and graphically)
#
# - Are there more action comedies or romantic comedies?
#
# - Plot a histogram of the ratings for drama movies.
#
# - Is the average rating of dramas higher than the average rating of non-dramas?
#   (show your answer numerically and graphically)
#
# - Plot the number of animated movies being produced every year for the period 1995-2005.
#
# - Is there a clear boundary between short and feature movies (according to their length)?
#
#
#######################################################################################################################

library(ggplot2)
library(dplyr)

md <- read.table("movies.txt", sep = ",", header = TRUE)

# Binary 0/1 attributes are turned into factors so that they are treated as
# nominal variables with a fixed set of values.
md$Action <- as.factor(md$Action)
md$Animation <- as.factor(md$Animation)

# Columns 20 to 24 hold the remaining binary attributes.
# NOTE(review): the positions are taken as-is from the original script;
# confirm them against the column order of movies.txt.
md[20:24] <- lapply(md[20:24], as.factor)


# - Are there more movies shorter than 100 min or longer than (or equal to) 100 minutes?
#   (show your answer numerically and graphically)

# The labels are attached to the logical values themselves (FALSE / TRUE), so
# they stay correct even if one of the two groups happens to be empty.
length_group <- factor(
  md$length >= 100,
  levels = c(FALSE, TRUE),
  labels = c("Less than 100", "More or equal to 100")
)
length_counts <- table(length_group)
print(length_counts)
pie(length_counts)


# - Are there more action comedies or romantic comedies?

# One row with both counts, computed among comedies only.
comedy_counts <- md %>%
  filter(Comedy == 1) %>%
  summarise(
    number_of_action_movies = sum(Action == 1),
    number_of_Romances = sum(Romance == 1)
  )
print(comedy_counts)


# - Plot a histogram of the ratings for drama movies.

# print() is explicit so the plot is also drawn when the script is source()d.
drama_histogram <- md %>%
  filter(Drama == 1) %>%
  ggplot(aes(x = rating)) +
  geom_histogram(bins = 10)
print(drama_histogram)


# - Is the average rating of dramas higher than the average rating of non-dramas?
#   (show your answer numerically and graphically)

# Numerically: mean rating for each value of Drama (0 = non-drama, 1 = drama).
rating_by_drama <- md %>%
  group_by(Drama) %>%
  summarise(mean_rating = mean(rating, na.rm = TRUE))
print(rating_by_drama)

# Graphically: side-by-side boxplots of the two rating distributions.
# factor() keeps the x axis discrete regardless of how Drama is stored.
drama_boxplot <- ggplot(md, aes(x = factor(Drama), y = rating)) +
  geom_boxplot() +
  labs(x = "Drama", y = "Rating")
print(drama_boxplot)


# - Plot the number of animated movies being produced every year for the period 1995-2005.

period <- 1995:2005
# which() drops rows where year or Animation is NA.
animated_in_period <- which(md$Animation == 1 & md$year %in% period)
# Fixing the levels keeps a (zero-height) bar for years without animated movies.
animated_per_year <- table(factor(md$year[animated_in_period], levels = period))
barplot(
  animated_per_year,
  xlab = "Year",
  ylab = "Number of movies",
  main = "Animated movies produced per year, 1995-2005"
)


# - Is there a clear boundary between short and feature movies (according to their length)?

# TODO: this question has not been answered yet.


#######################################################################################################################
#
# Load the Players dataset using the command:
#
# players <- read.table("players.txt", sep=",", header = TRUE)
#
# - Plot the proportion of players according to playing positions.
#
# - Compare career rebounds (the "reb" attribute) with respect to playing position.
#
# - Show the distribution of free throw percentages.
#   The percentage is determined by dividing the number of shots made ("ftm") by the total number of shots attempted ("fta").
#
# - Compare career 3-pointers made for the players active between 1990 and 2007, with respect to playing position.
#
# - How does the average career length of retired players vary from year to year?
#
#######################################################################################################################