diff --git a/v3/.Rhistory b/v3/.Rhistory new file mode 100644 index 0000000..6fed280 --- /dev/null +++ b/v3/.Rhistory @@ -0,0 +1,512 @@ +l <- (TRUE, TRUE, FALSE, F, F, F T) +l <- (TRUE, TRUE, FALSE, F, F, F, T) +a <- (TRUE, TRUE, FALSE, F, F, F, T) +a <- c(TRUE, TRUE, FALSE, F, F, F, T) +mode(b) +mode(a) +install.packages('ggplot2') +install.packages('dplyr') +ls +ls +setcwd('/home/gasperspagnolo/Documents/faks/3-letnik/1sem/is/vaje/v3') +setwd('/home/gasperspagnolo/Documents/faks/3-letnik/1sem/is/vaje/v3') +ls +ls +# for example: +# setwd("c:\\labs\\data\\") +library(ggplot2) +library(dplyr) +# To read data from a text file, use the "read.table" command. +# The parameter header=TRUE indicates that the file to be read includes a first line with the column names +md <- read.table(file="movies.txt", sep=",", header=TRUE) +head(md) +summary(md) +summary(md) +str(md) +names(md) +md$Action <- as.factor(md$Action) +md$Animation <- as.factor(md$Animation) +str(md) +# The remaining columns will be transformed using the for loop +for (i in 20:24) +md[,i] <- as.factor(md[,i]) +md[30,] +md[30,3] +md[30,"length"] +md[,3] +md$length +plot(md$length) +hist(md$length) +plot(density(md$length)) +boxplot(md$length) +barplot(table(md$Drama)) +pie(table(md$mpaa)) +## nicer plots with ggplot2 + dplyr +md %>% ggplot(aes(length)) + geom_histogram(bins = 40) + ggtitle("A genetic histogram") + xlab("Length" +## nicer plots with ggplot2 + dplyr +md %>% ggplot(aes(length)) + geom_histogram(bins = 40) + ggtitle("A genetic histogram") + xlab("Length") +## nicer plots with ggplot2 + dplyr +md %>% ggplot(aes(length)) + geom_histogram(bins = 40) + ggtitle("A genetic histogram") + xlab("Length") +## plotting w.r.t. multiple mpaa categories +md %>% ggplot(aes(length,fill = mpaa)) + geom_density(alpha = 0.2) +## What about a nicer boxplot w.r.t mpaa? 
+## theme_bw() is more neutral theme +md %>% ggplot(aes(Drama, rating, color = mpaa)) + geom_boxplot() + theme_bw() +barplot(table(md$Comedy)) +barplot(table(md$Comedy)) +pie(table(md$Comedy)) +tab <- table(md$Comedy) +names(tab) <- c("Other genres", "Comedies") +tab +barplot(table(md$Comedy)) +pie(table(md$Comedy)) +tab <- table(md$Comedy) +names(tab) <- c("Other genres", "Comedies") +tab +tab <- table(md$Comedy) +names(tab) <- c("Other genres", "Comedies") +tab +pie(tab) +sum(tab) +barplot(tab, ylab="Number of titles", main="Proportion of comedies to other genres") +barplot(tab / sum(tab) * 100, ylab="Percentage of titles", main="The proportion of comedies to other genres") +pie(tab, main = "Proportion of comedies to other genres") +# Plot the rating distribution for comedies +hist(md[md$Comedy == "1", "rating"], xlab="Rating", ylab="Frequency", main="Histogram of ratings for comedies") +boxplot(md[md$Comedy == "1", "rating"], ylab="Rating", main="Boxplot of ratings for comedies") +quantile(md$rating[md$Comedy == 1]) +comedy <- md$Comedy == "1" +# Calculate the mean rating value for comedies and non-comedies +mean(md[comedy,"rating"]) +mean(md[!comedy,"rating"]) +boxplot(rating ~ Comedy, data=md) +boxplot(rating ~ Comedy, data=md, names=c("Other genres", "Comedies"), ylab="Rating", main="Comparison of ratings between comedies and non-comedies") +md %>% group_by(Comedy) %>% select(rating) %>% summarise(mean(rating)) +md %>% group_by(Comedy) +sel <- md$year >= 1990 +table(md$Comedy[sel], md$year[sel]) +table(md$year[sel]) +tabcomedy <- table(md$Comedy[sel], md$year[sel]) +tabyear <- table(md$year[sel]) +tabcomedy[2,]/tabyear +ratio <- tabcomedy[2,]/tabyear +barplot(ratio, xlab="Year", ylab="Relative frequency", main="Proportion of comedies") +plot(x=names(ratio), y=as.vector(ratio), type="l", xlab="Year", ylab="Relative frequency", main="Proportion of comedies, 1990-2005") +## Is this dependent on the mpaa? 
+## or with dplyr + ggplot + adding votes +md %>% select(budget, rating, votes, mpaa) %>% +na.omit() %>% +ggplot(aes(budget, rating, color = votes, fill = mpaa)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x) + theme_bw() +## or with dplyr directly +md %>% filter(year >= 1990) %>% +group_by(year, Comedy) %>% +summarise(n = n()) %>% mutate(freq = n / sum(n)) %>% +filter(Comedy == 1) %>% select(year, freq) %>% +ggplot(aes(year, freq)) + geom_point() + ggtitle("Frequency of comedies") + ylab("Frequency") + xlab("Year") +md %>% filter(year >= 1990) +summary(md$budget) +is.na(md$budget) +table(is.na(md$budget)) +which(is.na(md$budget)) +# select complete observations only +sel <- is.na(md$budget) +mdsub <- md[!sel,] +nrow(mdsub) +summary(mdsub$budget) +plot(mdsub$budget, mdsub$rating, xlab="Budget in $", ylab="Rating", main="Movie rating vs budget") +# Plotted points are mostly located in the upper left part of the diagram, +# which means that a higher budget usually leads to a higher rating +# Utilization of the budget in terms of rating +ratio <- mdsub$budget/mdsub$rating +hist(ratio) +# Which movie has the worst budget utilization? 
+mdsub[which.max(ratio),] +# Let's discretize these budgets to: +# low (less than 1M), mid (between 1M and 50M) and big (more than 50M) +disbudget <- cut(mdsub$budget, c(0, 1000000, 50000000, 500000000), labels=c("low", "mid", "big")) +barplot(table(disbudget)/length(disbudget), xlab="Budget", ylab="Relative frequency", main="Proportion of movies vs budget") +# Side-by-side boxplots of ratings grouped by budget values +boxplot(mdsub$rating ~ disbudget, xlab="Budget", ylab="Rating", main="Boxplot of movie rating vs budget") +## Is this dependent +## or with dplyr + ggplot + adding votes +md %>% select(budget, rating, votes, mpaa) %>% +na.omit() %>% +ggplot(aes(budget, rating, color = votes, fill = mpaa)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x) + theme_bw() +## or with dplyr +md %>% select(budget, year) %>% na.omit() %>% +group_by(year) %>% summarise(budget2 = sum(as.numeric(budget))) %>% +arrange(year) %>% mutate(csum = cumsum(budget2)) %>% +ggplot(aes(year, csum)) + geom_bar(stat = "identity") + theme_bw() +## Is this dependent on the mpaa? +## or with dplyr + ggplot + adding votes +md %>% select(budget, rating, votes, mpaa) %>% +na.omit() %>% +ggplot(aes(budget, rating, color = votes, fill = mpaa)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x) + theme_bw() +####################################### +md <- read.table("movies.txt", sep=",", header=TRUE) +# - Are there more movies shorter than 100 min or longer than (or equal to) 100 minutes? +# (show your answer numerically and graphically) +md +md <- read.table("movies.txt", sep=",", header=TRUE) +# - Are there more movies shorter than 100 min or longer than (or equal to) 100 minutes? 
+# (show your answer numerically and graphically) +movies_shorter_or_equal_100_min <- md$length >= 100 +movies_shorter_or_equal_100_min +barplot(movies, xlab="Year", ylab="Relative frequency", main="Proportion of comedies") +barplot(movies_shorter_or_equal_100_min, xlab="Year", ylab="Relative frequency", main="Proportion of comedies") +md[30,] +md[30,3] +md[30,"length"] +md[,3] +md$length +# Useful data visualization functions +plot(md$length) +hist(md$length) +plot(density(md$length)) +boxplot(md$length) +barplot(table(md$Drama)) +pie(table(md$mpaa)) +## nicer plots with +## nicer plots with ggplot2 + dplyr +md %>% ggplot(aes(length)) + geom_histogram(bins = 40) + ggtitle("A genetic histogram") + xlab("Length") +tab <- table(md$Comedy) +names(tab) <- c("Other genres", "Comedies") +tab +pie(tab) +tab +tab <- table(md$Comedy) +tab +hist(md[md$Comedy == "1", "rating"], xlab="Rating", ylab="Frequency", main="Histogram of ratings for comedies") +pie(table(md$Comedy)) +md$Comedy +md$length +names(tab) <- c("Other genres", "Comedies") +tab +pie(tab) +barplot(table(md$length)) +movies_shorter_or_equal_100_min <- md$length >= 100 +table(movies_shorter_or_equal_100_min) +movies_shorter_or_equal_100_min <- md$length >= 100 +tab <- table(movies_shorter_or_equal_100_min) +names(tab) <- c("Less than 100", "More or equal to 100") +pie(tab) +df +md +md$Comedy +md$Comedy +md <- read.table("movies.txt", sep=",", header=TRUE) +# We will transform binary attributes into nominal variables with a fixed number of possible values (factors) +md$Action <- as.factor(md$Action) +md$Animation <- as.factor(md$Animation) +# The remaining columns will be transformed using the for loop +for (i in 20:24) +md[,i] <- as.factor(md[,i]) +movies_shorter_or_equal_100_min <- md$length >= 100 +tab <- table(movies_shorter_or_equal_100_min) +names(tab) <- c("Less than 100", "More or equal to 100") +pie(tab) +<- md$Romance +romantic_comedies +romantic_comedies <- md$Romance +romantic_comedies 
+romantic_comedies <- sum(md$Romance == 1) +romantic_comedies +## or with dplyr directly +md %>% filter(year >= 1990) %>% +group_by(year, Comedy) %>% +summarise(n = n()) %>% mutate(freq = n / sum(n)) %>% +filter(Comedy == 1) %>% select(year, freq) %>% +ggplot(aes(year, freq)) + geom_point() + ggtitle("Frequency of comedies") + ylab("Frequency") + xlab("Year") + geom_line() + theme_bw() +###################################################### +# - Are there more action comedies or romantic comedies? +md %>% group_by(Comedy, Action) +# - Are there more action comedies or romantic comedies? +md %>% group_by(Comedy, Action) %>% summarise(n = n()) +# - Are there more action comedies or romantic comedies? +md %>% group_by(Action, Comedy) %>% summarise(n = n()) +# - Are there more action comedies or romantic comedies? +md %>% group_by(Action == 1, Comedy) %>% summarise(n = n()) +# - Are there more action comedies or romantic comedies? +md %>% group_by(Action == 1, Comedy == 1) %>% summarise(n = n()) +# - Are there more action comedies or romantic comedies? +md %>% filter(Action == 1, Comedy == 1) %>% summarise(n = n()) +md %>% filter(Action == 1, Comedy == 1) %>% summarise(n = n()) +md %>% filter(Action == 1, Comedy == 1) %>% summarise(n = n()) +md %>% filter(Romantic == 1, Comedy == 1) %>% summarise(n = n()) +# - Are there more action comedies or romantic comedies? +md %>% filter(Comedy == 1) %>% summarise(n = n()) +md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_actions = sum(Romance == 1)) +# - Are there more action comedies or romantic comedies? +md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_action_movies = sum(Action == 1)) %>% +mutate(number_of_Romances = sum(Romance == 1)) %>% +select(number_of_action_movies, number_of_Romances) +# - Are there more action comedies or romantic comedies? 
+md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_action_movies = sum(Action == 1)) %>% +mutate(number_of_Romances = sum(Romance == 1)) %>% +select(number_of_action_movies, number_of_Romances) +md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_action_movies = sum(Action == 1)) %>% +mutate(number_of_Romances = sum(Romance == 1)) %>% +select(number_of_action_movies, number_of_Romances) >%>% +unique() +md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_action_movies = sum(Action == 1)) %>% +mutate(number_of_Romances = sum(Romance == 1)) %>% +select(number_of_action_movies, number_of_Romances) %>% +unique() +md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_action_movies = sum(Action == 1)) %>% +mutate(number_of_Romances = sum(Romance == 1)) %>% +select(number_of_action_movies, number_of_Romances) %>% +unique() %>% pie() +# - Are there more action comedies or romantic comedies? +md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_action_movies = sum(Action == 1)) %>% +mutate(number_of_Romances = sum(Romance == 1)) %>% +select(number_of_action_movies, number_of_Romances) %>% +unique() +# - Are there more action comedies or romantic comedies? +md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_action_movies = sum(Action == 1)) %>% +mutate(number_of_Romances = sum(Romance == 1)) %>% +select(number_of_action_movies, number_of_Romances) %>% +unique() +# - Are there more action comedies or romantic comedies? 
+md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_action_movies = sum(Action == 1)) %>% +mutate(number_of_Romances = sum(Romance == 1)) %>% +unique() %>% +select(number_of_action_movies, number_of_Romances) +md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_action_movies = sum(Action == 1)) %>% +mutate(number_of_Romances = sum(Romance == 1)) %>% +unique() %>% +select(number_of_action_movies, number_of_Romances) +md %>% filter(Comedy == 1) %>% mutate(number_of_actions = sum(Action == 1)) %>% +mutate(number_of_action_movies = sum(Action == 1)) %>% +mutate(number_of_Romances = sum(Romance == 1)) %>% +select(number_of_action_movies, number_of_Romances) %>% +unique() +# - Plot a histogram of the ratings for drama movies. +head(md) +ratio <- mdsub$budget/mdsub$rating +hist(ratio) +plot(mdsub$budget, mdsub$rating, xlab="Budget in $", ylab="Rating", main="Movie rating vs budget") +ratio <- mdsub$budget/mdsub$rating +hist(ratio) +hist(md$mpaa) +hist(md$mpaa) +hist(md$r1) +(md$r1) +hist(md$r1) +md %>% filter(Drama == 1) %>% hist() +md %>% filter(Drama == 1 +md %>% filter(Drama == 1) +md %>% filter(Drama == 1) +md %>% filter(Drama == 1) %>% select(raiting) +md %>% filter(Drama == 1) %>% select(rating) +md %>% filter(Drama == 1) %>% select(rating) %>% hist() +md %>% filter(Drama == 1) %>% select(rating) %>% class() +md %>% filter(Drama == 1) %>% select(rating) +md %>% filter(Drama == 1) %>% select(rating) %>% hist(rating) +md %>% filter(Drama == 1) %>% select(rating) %>% hist(.$rating) +md %>% filter(Drama == 1) %>% tidyr::gather(rating, value) %>% ggplot(aes(value) +md %>% filter(Drama == 1) %>% tidyr::gather(rating, value) %>% ggplot(aes(value)) +md %>% filter(Drama == 1) %>% tidyr::gather(rating, value) %>% ggplot(aes(value)) +md %>% filter(Drama == 1) %>% ggplot(aes(rating)) +md %>% filter(Drama == 1) %>% ggplot(rating) +md %>% filter(Drama == 1) %>% ggplot(rating) +aes( +md 
%>% filter(Drama == 1) %>% ggplot(aes(rating)) geom_histogram(bins = 10)) +md %>% filter(Drama == 1) %>% ggplot(aes(rating)) geom_histogram(bins = 10)) +md %>% filter(Drama == 1) %>% ggplot(aes(rating)) + eom_histogram(bins = 10)) +md %>% filter(Drama == 1) %>% ggplot(aes(rating)) + geom_histogram(bins = 10)) +md %>% filter(Drama == 1) %>% ggplot(aes(rating)) + geom_histogram(bins = 10) +md %>% filter(Drama == 1) %>% select(rating) %>% mutate(higher_than_avg_rating = rating > mean(rating)) +md %>% filter(Drama == 1) %>% select(rating) %>% mutate(higher_than_avg_rating = rating > mean(rating))%>% +pie(table(aes(sum(rating == TRUE), aes(sum(rating == FALSE)) +md %>% filter(Drama == 1) %>% select(rating) %>% mutate(higher_than_avg_rating = rating > mean(rating))%>% +pie(table(aes(sum(rating == TRUE)), aes(sum(rating == FALSE))) +md %>% filter(Drama == 1) %>% select(rating) %>% mutate(higher_than_avg_rating = rating > mean(rating))%>% +md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) +md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) +md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(counts = n()) +md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>% +pie() +md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% 
+summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>% +pie() +ggplot(aes(.))) + +geom_bar(stat="identity", width=1) + +coord_polar("y", start=0) +md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>% +ggplot(aes(.)) + +geom_bar(stat="identity", width=1) + +coord_polar("y", start=0) +md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>% +ggplot(aes(.)) + +coord_polar("y", start=0) +md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>% +ggplot(aes(.)) + +coord_polar("y", start=0) +summary <- md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>% +summary +summary <- md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>% +summary +summary +h <- md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>% +h +pie(h) +h 
<- md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) %>% +h <- md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) +md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) +lol <- md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating) %>% +summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) +lol +pie(lol) +lol <- md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating)# %>% +# summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) +pie(lol) +# summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) +lol +pie(table(lol)) +lol <- md %>% filter(Drama == 1) %>% select(rating) %>% +mutate(higher_than_avg_rating = rating > mean(rating)) %>% +select(higher_than_avg_rating)# %>% +# summarise(is_higher = sum(higher_than_avg_rating == TRUE), is_lower = sum(higher_than_avg_rating == FALSE)) +pie(table(lol)) +pie(table(lol)) +sel <- md$year >= 1990 +# the table() command can be used to get a two-way contigency table +table(md$Comedy[sel], md$year[sel]) +sel <- md$year >= 1999 && md$year <= 2005 +table(md$Animation[sel]) +sel <- md$year >= 1999 && md$year <= 
2005 +table(md$Animation[sel], md$year[sel]) +table(md$Animation[sel] == 1, md$year[sel]) +sel <- md$year >= 1999 && md$year <= 2005 +table(md$Animation[sel], md$year[sel]) +tabcomedy <- table(md$Comedy[sel], md$year[sel]) +tabyear <- table(md$year[sel]) +tabcomedy[2,]/tabyear +tabyear <- table(md$year[sel]) +tabyear +tabcomedy <- table(md$Comedy[sel], md$year[sel]) +tabcomedy <- table(md$Comedy[sel], md$year[sel]) +tabcomedy +table(md$year[sel]) +table(md$Comedy[sel], md$year[sel]) +table(md$year[sel]) +tabcomedy <- table(md$Comedy[sel], md$year[sel]) +table(md$Comedy[sel], md$year[sel]) +tabcomedy <- table(md$Comedy[sel], md$year[sel]) +tabcomedy +tabyear <- table(md$year[sel]) +tabyear +ratio <- tabcomedy[2,]/tabyear +## or with dplyr directly +md %>% filter(year >= 1990) %>% +group_by(year, Comedy) %>% +summarise(n = n()) %>% mutate(freq = n / sum(n)) %>% +filter(Comedy == 1) %>% select(year, freq) %>% +ggplot(aes(year, freq)) + geom_point() + ggtitle("Frequency of comedies") + ylab("Frequency") + xlab("Year") + geom_line() + theme_bw() +ratio <- tabcomedy[2,]/tabyear +barplot(ratio, xlab="Year", ylab="Relative frequency", main="Proportion of comedies") +ratio <- tabcomedy[2,]/tabyear +sel <- md$year >= 1999 && md$year <= 2005 +table(md$Animation[sel], md$year[sel]) +tt[,2] +tt <- table(md$Animation[sel], md$year[sel]) +tt[,2] +tt <- table(md$Animation[sel], md$year[sel]) +tt[2,] +sel <- md$year >= 1999 && md$year <= 2005 +tt <- table(md$Animation[sel], md$year[sel]) +sel <- md$year >= 1999 && md$year <= 2005 +tt <- table(md$Animation[sel], md$year[sel]) +tt[2,] +sel <- md$year >= 1999 && md$year <= 2005 +sel +tt <- table(md$Animation[sel], md$year[sel]) +tt[2,] +sel <- md$year >= 1999 && md$year <= 2005 +sel +tt <- table(md$Animation[sel], md$year[sel]) +tt <- table(md$Animation[sel], md$year[sel]) +tt[2,] +# - Plot the number of animated movies being produced every year for the period 1995-2005. 
+sel <- md$year >= 1999 && md$year <= 2005
+tt <- table(md$Animation[sel], md$year[sel])
+tt[2,]
+tt <- table(md$Animation[sel], md$year[sel])
+tt[3,]
+sel <- md$year >= 1999 && md$year <= 2005
+tt <- table(md$Animation[sel], md$year[sel])
+tt[1,]
+sel <- md$year >= 1999 && md$year <= 2005
+tt <- table(md$Animation[sel], md$year[sel])
+tt[2,]
+#####################
+barplot(tt[2,], xlab="Year", ylab="Relative frequency", main="Proportion of comedies")
+sel <- md$year >= 1999 and md$year <= 2005
+sel <- md[md$year >= 1999 && md$year <= 2005]
+sel <- md[md$year >= 1999 & md$year <= 2005]
+tt <- table(md$Animation[sel], md$year[sel])
+sel <- md[md$year >= 1999 & md$year <= 2005]
+sel <- md$year >= 1999 & md$year <= 2005
+sel
+tt <- table(md$Animation[sel], md$year[sel])
+barplot(tt[2,], xlab="Year", ylab="Relative frequency", main="Proportion of animated movies")
+sel <- md$year >= 1999 & md$year <= 2005
+tt <- table(md$Animation[sel], md$year[sel])
+barplot(tt[2,], xlab="Year", ylab="Relative frequency", main="Proportion of animated movies")
+#######################
diff --git a/v3/lab 2 - problems.R b/v3/lab 2 - problems.R
index 1c2e67a..87e16dd 100644
--- a/v3/lab 2 - problems.R
+++ b/v3/lab 2 - problems.R
@@ -70,7 +70,7 @@ pie(table(lol))
 # - Plot the number of animated movies being produced every year for the period 1995-2005.
-sel <- md$year >= 1999 & md$year <= 2005
+sel <- md$year >= 1995 & md$year <= 2005
 tt <- table(md$Animation[sel], md$year[sel])
-barplot(tt[2,], xlab="Year", ylab="Relative frequency", main="Proportion of animated movies")
+barplot(tt[2,], xlab="Year", ylab="Number of movies", main="Animated movies per year")

 # - Is there a clear boundary between short and feature movies (according to their length)?

@@ -93,3 +93,25 @@ barplot(tt[2,], xlab="Year", ylab="Relative frequency", main="Proportion of anim
 #
 #######################################################################################################################

+# Load the players data set; header = TRUE takes column names from the first row.
+players <- read.table("players.txt", sep = ",", header = TRUE)
+
+# - Plot the proportion of players according to playing positions.
+# table() counts players per position; prop.table() rescales the counts to sum to 1.
+position_proportion <- prop.table(table(players$position))
+pie(position_proportion)
+
+# - Compare career rebounds (the "reb" attribute) with respect to playing position
+boxplot(reb ~ position, data = players, ylab = "Career rebounds",
+        main = "Comparison of career rebounds with respect to playing position")
+
+# - Show the distribution of free throw percentages.
+#   The percentage is determined by dividing the number of shots made ("ftm") by the total number of shots attempted ("fta").
+ftp <- players %>%
+  mutate(free_throw_percentage = ftm / fta * 100) %>%
+  select(free_throw_percentage)
+# A row with ftm == 0 and fta == 0 evaluates to NaN (0 / 0), so only finite
+# percentages are passed on to the histogram.
+ftp_values <- ftp$free_throw_percentage
+hist(ftp_values[is.finite(ftp_values)], xlab = "Free throw percentage",
+     main = "Distribution of free throw percentages")