ggplot2
package ggplot2
ggplot2
cheat sheet
http://rpubs.com/hadley/ggplot2-layers by H Wickham (short and easy)
http://rpubs.com/hadley/ggplot2-toolbox by H Wickham longer
http://www.ling.upenn.edu/~joseff/avml2012/ more advanced but still feasible
http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3/ Long with lots of info on how to modify the aspect of the plot
http://rpubs.com/RobinLovelace/intro-spatial ggplot2:ggmap to plot SPATIAL DATA
ggplot2
in action (movie)
library(ggplot2)
# Print summary statistics for every column of the movies data set.
# NOTE(review): recent ggplot2 releases no longer bundle `movies` (it moved to
# the ggplot2movies package) -- confirm which version this transcript targets.
summary(movies)
## title year length budget
## Length:58788 Min. :1893 Min. : 1.00 Min. : 0
## Class :character 1st Qu.:1958 1st Qu.: 74.00 1st Qu.: 250000
## Mode :character Median :1983 Median : 90.00 Median : 3000000
## Mean :1976 Mean : 82.34 Mean : 13412513
## 3rd Qu.:1997 3rd Qu.: 100.00 3rd Qu.: 15000000
## Max. :2005 Max. :5220.00 Max. :200000000
## NA's :53573
## rating votes r1 r2
## Min. : 1.000 Min. : 5.0 Min. : 0.000 Min. : 0.000
## 1st Qu.: 5.000 1st Qu.: 11.0 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 6.100 Median : 30.0 Median : 4.500 Median : 4.500
## Mean : 5.933 Mean : 632.1 Mean : 7.014 Mean : 4.022
## 3rd Qu.: 7.000 3rd Qu.: 112.0 3rd Qu.: 4.500 3rd Qu.: 4.500
## Max. :10.000 Max. :157608.0 Max. :100.000 Max. :84.500
##
## r3 r4 r5 r6
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 4.500 1st Qu.: 4.50
## Median : 4.500 Median : 4.500 Median : 4.500 Median :14.50
## Mean : 4.721 Mean : 6.375 Mean : 9.797 Mean :13.04
## 3rd Qu.: 4.500 3rd Qu.: 4.500 3rd Qu.: 14.500 3rd Qu.:14.50
## Max. :84.500 Max. :100.000 Max. :100.000 Max. :84.50
##
## r7 r8 r9 r10
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 4.50 1st Qu.: 4.50 1st Qu.: 4.500 1st Qu.: 4.50
## Median : 14.50 Median : 14.50 Median : 4.500 Median : 14.50
## Mean : 15.55 Mean : 13.88 Mean : 8.954 Mean : 16.85
## 3rd Qu.: 24.50 3rd Qu.: 24.50 3rd Qu.: 14.500 3rd Qu.: 24.50
## Max. :100.00 Max. :100.00 Max. :100.000 Max. :100.00
##
## mpaa Action Animation Comedy
## :53864 Min. :0.00000 Min. :0.00000 Min. :0.0000
## NC-17: 16 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## PG : 528 Median :0.00000 Median :0.00000 Median :0.0000
## PG-13: 1003 Mean :0.07974 Mean :0.06277 Mean :0.2938
## R : 3377 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000
##
## Drama Documentary Romance Short
## Min. :0.000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.371 Mean :0.05906 Mean :0.0807 Mean :0.1609
## 3rd Qu.:1.000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## Work on a local copy of the movies data, extended with two derived columns:
##   budgetKnown - TRUE when a budget is recorded for the movie
##   ratingCat   - rating cut into categories at "pretty" break points
# Both expressions only read original columns, so a single transform() call
# builds the copy and adds the two columns at once.
myMov <- transform(
  movies,
  budgetKnown = !is.na(budget),
  ratingCat = cut(rating, breaks = pretty(rating, n = 5))
)
# Histogram of the number of movies listed per year.
# `year` is continuous, so it is binned with geom_histogram(): `binwidth` is a
# binning (stat_bin) parameter that geom_bar() only honoured in ggplot2 < 2.0.
p <- ggplot(data = myMov, aes(x = year))
p + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# As with any other expression, assigning a plot means NOTHING is drawn on the
# graphics device.
myHist <- p + geom_histogram(binwidth = 10)
# Printing the object is what displays it.
print(myHist)
# A ggplot2 graphic is first of all an OBJECT: inspect its structure.
str(myHist)
## List of 9
## $ data :'data.frame': 58788 obs. of 26 variables:
## ..$ title : chr [1:58788] "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
## ..$ year : int [1:58788] 1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
## ..$ length : int [1:58788] 121 71 7 70 71 91 93 25 97 61 ...
## ..$ budget : int [1:58788] NA NA NA NA NA NA NA NA NA NA ...
## ..$ rating : num [1:58788] 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
## ..$ votes : int [1:58788] 348 20 5 6 17 45 200 24 18 51 ...
## ..$ r1 : num [1:58788] 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
## ..$ r2 : num [1:58788] 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
## ..$ r3 : num [1:58788] 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
## ..$ r4 : num [1:58788] 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
## ..$ r5 : num [1:58788] 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
## ..$ r6 : num [1:58788] 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
## ..$ r7 : num [1:58788] 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
## ..$ r8 : num [1:58788] 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
## ..$ r9 : num [1:58788] 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
## ..$ r10 : num [1:58788] 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
## ..$ mpaa : Factor w/ 5 levels "","NC-17","PG",..: 1 1 1 1 1 1 5 1 1 1 ...
## ..$ Action : int [1:58788] 0 0 0 0 0 0 1 0 0 0 ...
## ..$ Animation : int [1:58788] 0 0 1 0 0 0 0 0 0 0 ...
## ..$ Comedy : int [1:58788] 1 1 0 1 0 0 0 0 0 0 ...
## ..$ Drama : int [1:58788] 1 0 0 0 0 1 1 0 1 0 ...
## ..$ Documentary: int [1:58788] 0 0 0 0 0 0 0 1 0 0 ...
## ..$ Romance : int [1:58788] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ Short : int [1:58788] 0 0 1 0 0 0 0 1 0 0 ...
## ..$ budgetKnown: logi [1:58788] FALSE FALSE FALSE FALSE FALSE FALSE ...
## ..$ ratingCat : Factor w/ 5 levels "(0,2]","(2,4]",..: 4 3 5 5 2 3 3 4 4 3 ...
## $ layers :List of 1
## ..$ :Classes 'proto', 'environment' <environment: 0x7e603c8>
## $ scales :Reference class 'Scales' [package "ggplot2"] with 1 field
## ..$ scales: list()
## ..and 23 methods, of which 9 are possibly relevant:
## .. add, clone, find, get_scales, has_scale, initialize, input, n,
## .. non_position_scales
## $ mapping :List of 1
## ..$ x: symbol year
## $ theme : list()
## $ coordinates:List of 1
## ..$ limits:List of 2
## .. ..$ x: NULL
## .. ..$ y: NULL
## ..- attr(*, "class")= chr [1:2] "cartesian" "coord"
## $ facet :List of 1
## ..$ shrink: logi TRUE
## ..- attr(*, "class")= chr [1:2] "null" "facet"
## $ plot_env :<environment: R_GlobalEnv>
## $ labels :List of 2
## ..$ x: chr "year"
## ..$ y: chr "count"
## - attr(*, "class")= chr [1:2] "gg" "ggplot"
# Change the fill colour of the bars.
# geom_histogram() is used because `year` is continuous and the recorded output
# shows binned counts (stat_bin); geom_bar() only bins in ggplot2 < 2.0.
h1 <- p + geom_histogram(fill = "yellow")
h1
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Show the two distributions next to each other, split by whether the budget
# is known: fill is mapped to budgetKnown and the bars are dodged.
h2 <- p + geom_histogram(aes(fill = budgetKnown), position = "dodge")
h2
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Two plots side by side.
# install.packages("gridExtra")
library("gridExtra")
# With two plots grid.arrange() defaults to a 2 x 1 layout (one above the
# other), so ncol = 2 is required to really place them next to each other.
# grid.arrange() draws immediately; it is not wrapped in print(), which would
# only echo the layout table of the returned object.
grid.arrange(h1, h2, ncol = 2)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Bar chart counting the movies in each mpaa class.
bp <- ggplot(myMov, aes(x = mpaa))
bp + geom_bar()
# Write the chart to a PNG file; with no `plot` argument ggsave() saves the
# most recent plot.
ggsave("barPlotMPAA.png")
## Saving 7 x 5 in image
## Scatter (x-y) plot of votes against rating.
xy <- ggplot(myMov, aes(x = rating, y = votes))
xy + geom_point()
# Do movies with a known budget receive more votes?
xy + geom_point(mapping = aes(colour = budgetKnown))
# The points overplot each other, so draw them semi-transparent.
xy + geom_point(mapping = aes(colour = budgetKnown), alpha = 0.2)
# Now look at the relation between rating and budget.
xy <- ggplot(myMov, aes(x = rating, y = budget))
xy + geom_point()
## Warning: Removed 53573 rows containing missing values (geom_point).
# Is a high budget a guarantee of high ratings? Meh...
# Bar plots whose colour depends on a third variable.
# Mean movie length for every mpaa x ratingCat combination.
# The formula is passed positionally: the first argument of aggregate()'s
# formula method is named `formula` before R 4.2.0 and `x` from 4.2.0 on, so
# naming it is not portable across R versions.
aggregMov <- aggregate(length ~ mpaa + ratingCat, data = myMov, FUN = mean)
bpf <- ggplot(data = aggregMov, aes(x = ratingCat, y = length))
# stat = "identity" plots the aggregated values as they are instead of
# counting rows.
bpf + geom_bar(stat = "identity")
# Bars are "dodged": one bar per mpaa class, placed next to each other.
bpf + geom_bar(stat = "identity", aes(fill = mpaa), position = position_dodge())
# Relative length of movies by rating and by mpaa.
bpf + geom_bar(stat = "identity", aes(fill = mpaa), position = position_fill())
# Box plot of budget for each mpaa class.
bxp <- ggplot(myMov, aes(x = mpaa, y = budget))
# On the raw scale the boxes are hard to read ...
bxp + geom_boxplot()
## Warning: Removed 53573 rows containing non-finite values (stat_boxplot).
# ... so draw them again with a log10 y scale.
bxp +
  geom_boxplot() +
  scale_y_log10()
## Warning: Removed 53605 rows containing non-finite values (stat_boxplot).
# Adult movies have very low budgets?!
# Line plots.
# Let's look at the evolution of budgets as time goes by.
bt <- ggplot(data = myMov, aes(x = year, y = budget))
bt + geom_point()
## Warning: Removed 53573 rows containing missing values (geom_point).
# Same points with a smoothed trend on top.
bt + geom_point() + geom_smooth()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## Warning: Removed 53573 rows containing missing values (stat_smooth).
## Warning: Removed 53573 rows containing missing values (geom_point).
## Or something similar: one aggregated budget value per year.
# Despite its name, meanBudgByYear holds the MEDIAN budget of each year.
# The formula is passed positionally: the first argument of aggregate()'s
# formula method is named `formula` before R 4.2.0 and `x` from 4.2.0 on, so
# naming it is not portable across R versions.
meanBudgByYear <- aggregate(
  budget ~ year,
  data = myMov,
  FUN = median,
  na.action = na.omit
)
bt <- ggplot(data = meanBudgByYear, aes(x = year, y = budget))
bt + geom_line() +
  labs(title = "Movie budget trends", x = "YEAR", y = "Budget ($)")
# There are lots of data points, so work on a random sample of them.
# Keep only the rows with a known budget and drop the columns r1 to r10.
someMov <- subset(myMov, !is.na(budget), -(r1:r10))
# Draw a random subset of rows: 200 of them, or every row if fewer than 200
# are available. seq_len() stays correct when there are no rows at all, where
# 1:nrow() would yield c(1, 0). Call set.seed() beforehand if the sample has
# to be reproducible.
sampledRows <- sample(
  seq_len(nrow(someMov)),
  size = min(200L, nrow(someMov))
)
someMov <- someMov[sampledRows, ]
fac <- ggplot(data = someMov, aes(x = length, y = votes))
fac + geom_point()
# One panel per mpaa class.
fac + geom_point() + facet_wrap(~ mpaa)
# Grid of panels: ratingCat in rows, mpaa in columns. The formula is passed
# positionally because the `facets` argument name of facet_grid() is
# deprecated in current ggplot2, while the positional form works in old and
# new versions alike.
gr <- fac + geom_point() + facet_grid(ratingCat ~ mpaa)
gr
# Same plot with two alternative themes.
gr + theme_bw()
gr + theme_minimal()