ggplot2 package

Brief intro to ggplot2

  • R in a Nutshell, 2nd edition: “ggplot2 has become one of the most popular R packages. ggplot2 is a great tool for producing readable charts. But more importantly, ggplot2 uses a language for describing how to plot data called the grammar of graphics.”

ggplot2 cheat sheet

  • ggplot2 tutorials:

http://rpubs.com/hadley/ggplot2-layers by H. Wickham (short and easy)

http://rpubs.com/hadley/ggplot2-toolbox by H. Wickham (longer)

http://www.ling.upenn.edu/~joseff/avml2012/ more advanced but still feasible

http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3/ long, with lots of information on how to modify the appearance of a plot

http://rpubs.com/RobinLovelace/intro-spatial ggplot2 and ggmap for plotting SPATIAL DATA

ggplot2 in action (movie)

# Load the plotting package, then print a per-column overview of the
# `movies` data set used throughout this walkthrough.
# NOTE(review): recent ggplot2 releases appear to ship `movies` in the
# separate ggplot2movies package -- confirm against the installed version.
library("ggplot2")
summary(movies)
##     title                year          length            budget         
##  Length:58788       Min.   :1893   Min.   :   1.00   Min.   :        0  
##  Class :character   1st Qu.:1958   1st Qu.:  74.00   1st Qu.:   250000  
##  Mode  :character   Median :1983   Median :  90.00   Median :  3000000  
##                     Mean   :1976   Mean   :  82.34   Mean   : 13412513  
##                     3rd Qu.:1997   3rd Qu.: 100.00   3rd Qu.: 15000000  
##                     Max.   :2005   Max.   :5220.00   Max.   :200000000  
##                                                      NA's   :53573      
##      rating           votes                r1                r2        
##  Min.   : 1.000   Min.   :     5.0   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.: 5.000   1st Qu.:    11.0   1st Qu.:  0.000   1st Qu.: 0.000  
##  Median : 6.100   Median :    30.0   Median :  4.500   Median : 4.500  
##  Mean   : 5.933   Mean   :   632.1   Mean   :  7.014   Mean   : 4.022  
##  3rd Qu.: 7.000   3rd Qu.:   112.0   3rd Qu.:  4.500   3rd Qu.: 4.500  
##  Max.   :10.000   Max.   :157608.0   Max.   :100.000   Max.   :84.500  
##                                                                        
##        r3               r4                r5                r6       
##  Min.   : 0.000   Min.   :  0.000   Min.   :  0.000   Min.   : 0.00  
##  1st Qu.: 0.000   1st Qu.:  0.000   1st Qu.:  4.500   1st Qu.: 4.50  
##  Median : 4.500   Median :  4.500   Median :  4.500   Median :14.50  
##  Mean   : 4.721   Mean   :  6.375   Mean   :  9.797   Mean   :13.04  
##  3rd Qu.: 4.500   3rd Qu.:  4.500   3rd Qu.: 14.500   3rd Qu.:14.50  
##  Max.   :84.500   Max.   :100.000   Max.   :100.000   Max.   :84.50  
##                                                                      
##        r7               r8               r9               r10        
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Min.   :  0.00  
##  1st Qu.:  4.50   1st Qu.:  4.50   1st Qu.:  4.500   1st Qu.:  4.50  
##  Median : 14.50   Median : 14.50   Median :  4.500   Median : 14.50  
##  Mean   : 15.55   Mean   : 13.88   Mean   :  8.954   Mean   : 16.85  
##  3rd Qu.: 24.50   3rd Qu.: 24.50   3rd Qu.: 14.500   3rd Qu.: 24.50  
##  Max.   :100.00   Max.   :100.00   Max.   :100.000   Max.   :100.00  
##                                                                      
##     mpaa           Action          Animation           Comedy      
##       :53864   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  NC-17:   16   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  PG   :  528   Median :0.00000   Median :0.00000   Median :0.0000  
##  PG-13: 1003   Mean   :0.07974   Mean   :0.06277   Mean   :0.2938  
##  R    : 3377   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##                Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##                                                                    
##      Drama        Documentary         Romance           Short       
##  Min.   :0.000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.371   Mean   :0.05906   Mean   :0.0807   Mean   :0.1609  
##  3rd Qu.:1.000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
## 

# Work on a private copy of the data, `myMov`, extended with two derived
# columns (both added in a single transform() call):
#   budgetKnown - logical flag, TRUE when a budget is recorded (not NA)
#   ratingCat   - factor obtained by cutting `rating` into intervals whose
#                 break points come from pretty(rating, 5)
myMov <- transform(
    movies,
    budgetKnown = !is.na(budget),
    ratingCat = cut(rating, breaks = pretty(rating, 5))
)

Some histograms

# Histogram of the number of movies listed per year.
# `p` only holds the data and the x mapping; layers are added to it afterwards.
p <- ggplot(data = myMov, aes(x = year))

# geom_histogram() bins `year` explicitly (stat_bin). geom_bar() only bins a
# continuous x in old ggplot2 releases; current ones count distinct values.
p + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.


# As with any other R expression, assigning a plot to a name does not draw it:
# NOTHING is sent to the graphics device yet.
# `binwidth` is a histogram (stat_bin) parameter, so geom_histogram() is used;
# current ggplot2 releases no longer accept it in geom_bar().
myHist <- p + geom_histogram(binwidth = 10)

# Drawing happens when the object is printed
print(myHist)


# A ggplot2 graphic is first of all an OBJECT: inspect its structure
str(myHist)
## List of 9
##  $ data       :'data.frame': 58788 obs. of  26 variables:
##   ..$ title      : chr [1:58788] "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
##   ..$ year       : int [1:58788] 1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
##   ..$ length     : int [1:58788] 121 71 7 70 71 91 93 25 97 61 ...
##   ..$ budget     : int [1:58788] NA NA NA NA NA NA NA NA NA NA ...
##   ..$ rating     : num [1:58788] 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
##   ..$ votes      : int [1:58788] 348 20 5 6 17 45 200 24 18 51 ...
##   ..$ r1         : num [1:58788] 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
##   ..$ r2         : num [1:58788] 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
##   ..$ r3         : num [1:58788] 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
##   ..$ r4         : num [1:58788] 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
##   ..$ r5         : num [1:58788] 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
##   ..$ r6         : num [1:58788] 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
##   ..$ r7         : num [1:58788] 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
##   ..$ r8         : num [1:58788] 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
##   ..$ r9         : num [1:58788] 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
##   ..$ r10        : num [1:58788] 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
##   ..$ mpaa       : Factor w/ 5 levels "","NC-17","PG",..: 1 1 1 1 1 1 5 1 1 1 ...
##   ..$ Action     : int [1:58788] 0 0 0 0 0 0 1 0 0 0 ...
##   ..$ Animation  : int [1:58788] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ Comedy     : int [1:58788] 1 1 0 1 0 0 0 0 0 0 ...
##   ..$ Drama      : int [1:58788] 1 0 0 0 0 1 1 0 1 0 ...
##   ..$ Documentary: int [1:58788] 0 0 0 0 0 0 0 1 0 0 ...
##   ..$ Romance    : int [1:58788] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ Short      : int [1:58788] 0 0 1 0 0 0 0 1 0 0 ...
##   ..$ budgetKnown: logi [1:58788] FALSE FALSE FALSE FALSE FALSE FALSE ...
##   ..$ ratingCat  : Factor w/ 5 levels "(0,2]","(2,4]",..: 4 3 5 5 2 3 3 4 4 3 ...
##  $ layers     :List of 1
##   ..$ :Classes 'proto', 'environment' <environment: 0x7e603c8> 
##  $ scales     :Reference class 'Scales' [package "ggplot2"] with 1 field
##   ..$ scales: list()
##   ..and 23 methods, of which 9 are  possibly relevant:
##   ..  add, clone, find, get_scales, has_scale, initialize, input, n,
##   ..  non_position_scales
##  $ mapping    :List of 1
##   ..$ x: symbol year
##  $ theme      : list()
##  $ coordinates:List of 1
##   ..$ limits:List of 2
##   .. ..$ x: NULL
##   .. ..$ y: NULL
##   ..- attr(*, "class")= chr [1:2] "cartesian" "coord"
##  $ facet      :List of 1
##   ..$ shrink: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "null" "facet"
##  $ plot_env   :<environment: R_GlobalEnv> 
##  $ labels     :List of 2
##   ..$ x: chr "year"
##   ..$ y: chr "count"
##  - attr(*, "class")= chr [1:2] "gg" "ggplot"


# Same histogram with the bars filled in yellow. `fill` is given outside
# aes(), so it is a constant colour rather than a mapping to a variable.
# geom_histogram() keeps the binning explicit (geom_bar() no longer bins a
# continuous x in current ggplot2 releases).
h1 <- p + geom_histogram(fill = "yellow")
h1
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.


# Distribution per year split by whether the budget is known: fill is mapped
# to budgetKnown and position = "dodge" draws the two groups next to each
# other instead of stacking them.
# geom_histogram() keeps the binning explicit (geom_bar() no longer bins a
# continuous x in current ggplot2 releases).
h2 <- p + geom_histogram(aes(fill = budgetKnown), position = "dodge")
h2
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.


# Two plots in one figure. With no layout arguments grid.arrange() places
# h1 above h2 (the TableGrob summary printed below reports a 2 x 1 layout).
# NOTE(review): pass ncol = 2 if a true side-by-side layout is wanted.
#install.packages("gridExtra")
library("gridExtra")
print(grid.arrange(h1, h2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

## TableGrob (2 x 1) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (2-2,1-1) arrange gtable[layout]

A bar plot with a categorical variable


# Bar chart counting the movies in each MPAA rating category
bp <- ggplot(myMov, aes(x = mpaa))
bp + geom_bar()


# Write the plot that was just displayed to a PNG file in the working
# directory (ggsave() saves the last plot when no `plot` is supplied)
ggsave("barPlotMPAA.png")
## Saving 7 x 5 in image

Bivariate plots

# Scatter plot of the number of votes against the rating
xy <- ggplot(myMov, aes(x = rating, y = votes))
xy + geom_point()


# Do movies with a known budget get more votes? Colour points by budgetKnown
xy + geom_point(aes(colour = budgetKnown))

# The points overlap heavily, so make them mostly transparent
xy + geom_point(alpha = 0.2, aes(colour = budgetKnown))


# Now the relation between rating and budget (xy is reassigned here)
xy <- ggplot(myMov, aes(x = rating, y = budget))
xy + geom_point()
## Warning: Removed 53573 rows containing missing values (geom_point).

# Is a high budget a guarantee of high ratings? Not really...

# Bar plots whose colour depends on a third variable.
# Mean movie length for every (mpaa, ratingCat) combination. The formula is
# passed positionally: the first argument of aggregate()'s formula method is
# called `x` since R 4.2.0, so naming it `formula =` fails on current R.
aggregMov <- aggregate(length ~ mpaa + ratingCat, data = myMov, FUN = mean)

bpf <- ggplot(data = aggregMov, aes(x = ratingCat, y = length))
# stat = "identity": bar heights are the y values themselves, not counts
bpf + geom_bar(stat = "identity")


# One bar per mpaa category, placed next to each other ("dodged")
bpf + geom_bar(stat = "identity", aes(fill = mpaa), position = position_dodge())


# Relative length of movies by rating and by mpaa: position_fill() stacks the
# bars and rescales every stack to the same total height
bpf + geom_bar(stat = "identity", aes(fill = mpaa), position = position_fill())

# Box plot of the budget for each MPAA category
bxp <- ggplot(myMov, aes(x = mpaa, y = budget))
bxp + geom_boxplot() # hard to read on a linear axis, so rescale y next
## Warning: Removed 53573 rows containing non-finite values (stat_boxplot).


# Same box plot drawn on a log10 y axis
bxp + geom_boxplot() + scale_y_log10()
## Warning: Removed 53605 rows containing non-finite values (stat_boxplot).


# Adult movies have very low budgets?!

# Line plots
# Let's look at the evolution of budgets as time goes by: one point per movie
bt <- ggplot(myMov, aes(x = year, y = budget))
bt + geom_point()
## Warning: Removed 53573 rows containing missing values (geom_point).

# Same scatter plot with a smoothed trend curve drawn on top
bt + geom_point() + geom_smooth()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## Warning: Removed 53573 rows containing missing values (stat_smooth).
## Warning: Removed 53573 rows containing missing values (geom_point).



## Or something similar: a single line through the yearly budgets

# Despite its name, meanBudgByYear holds the MEDIAN budget per year
# (FUN = median). The formula is passed positionally because the first
# argument of aggregate()'s formula method is named `x` since R 4.2.0, so
# `formula =` fails on current R.
meanBudgByYear <- aggregate(
    budget ~ year,
    data = myMov, FUN = median, na.action = na.omit
)

bt <- ggplot(data = meanBudgByYear, aes(x = year, y = budget))
bt + geom_line() +
    labs(title = "Movie budget trends", x = "YEAR", y = "Budget ($)")

Some faceting

# There are a lot of data points, so work on a random sample of them.

# Keep only the rows with a known budget and drop the columns r1 to r10
someMov <- subset(myMov, !is.na(budget), -(r1:r10))

# Draw (at most) 200 rows at random. seq_len() stays correct for an empty
# data frame, where 1:nrow() would yield c(1, 0); min() avoids an error from
# sample() when fewer than 200 rows are available.
someMov <- someMov[
    sample(seq_len(nrow(someMov)), size = min(200, nrow(someMov))),
]

# Votes against movie length for the sampled movies
fac <- ggplot(data = someMov, aes(x = length, y = votes))

fac + geom_point()


# One panel per MPAA category
fac + geom_point() + facet_wrap(facets = ~ mpaa)


# Grid of panels: rating classes in rows, MPAA categories in columns.
# The `facets` argument name of facet_grid() is deprecated in current ggplot2
# (replaced by `rows`/`cols`); an unnamed formula is accepted by old and new
# releases alike.
gr <- fac + geom_point() + facet_grid(ratingCat ~ mpaa)
gr


# Same grid drawn with two alternative built-in themes
gr + theme_bw()


gr + theme_minimal()