Exploratory mapping with R

Reuben Fischer-Baum

Pros and cons

Why you should do this

Much better than QGIS at data handling (no more csvts or field calculcators!)
Reuseable code 😍
Much faster to set up than d3 mapping
You already know R and want everything in one place

Why you shouldn't do this

Limited projections
Few “geoprocessing” tools
Complicated maps become very big files (issue for speed and exporting)
Code can't be reused in the front-end

Let's make a map!

Mapping data

maps package has some basic map data, but this example will start with a shapefile (census boundary)
I've removed Alaska/Hawaii for simplicity 😬

External data

Language spoken at home by county, ACS 2015 5-year
R works best with “long” data
The tidyr package is great for converting wide to long

Creating an R Studio project

Starting a script

Go to File -> New File -> R Script
Save it in your folder

The R Studio interface

Installing our packages

Install packages once, in the console

install.packages('dplyr', 'ggplot2', 'rgdal', 'readr', 'rgeos', 'maptools', 'mapproj')

Add to the top of you actual script

require('dplyr')
require('ggplot2')
require('rgdal')
require('readr')
require('rgeos')
require('mapproj')

What do these libraries do?

dplyr handles data manipulation
ggplot2 makes graphics
rgdal reads shapefiles and handles geographic data, with the help of rgeos and mapproj
readr reads in csvs and other external data nicely (optional)

Reading in the shapefile

require('dplyr')
require('ggplot2')
require('rgdal')
require('readr')
require('rgeos')
require('mapproj')

counties <- readOGR(dsn = 'shp', layer = 'counties', verbose = FALSE)

print(head(counties@data))

  STATEFP COUNTYFP COUNTYNS       AFFGEOID GEOID       NAME LSAD
0      16      017 00395158 0500000US16017 16017     Bonner   06
1      18      087 00450368 0500000US18087 18087   LaGrange   06
2      41      027 01155131 0500000US41027 41027 Hood River   06
3      48      273 01383922 0500000US48273 48273    Kleberg   06
4      47      085 01639757 0500000US47085 47085  Humphreys   06
5      40      137 01101856 0500000US40137 40137   Stephens   06
       ALAND    AWATER
0 4492522387 478947473
1  983230790  18324388
2 1351717731  29106847
3 2282585635 541039916
4 1374665964  67186875
5 2253926810  54694872

Turning the shapefile into a data frame

counties <- readOGR(dsn = 'shp', layer = 'counties', verbose = FALSE) %>%
  fortify(region = 'AFFGEOID')

print(head(counties))

       long      lat order  hole piece             id            group
1 -86.91760 32.66417     1 FALSE     1 0500000US01001 0500000US01001.1
2 -86.71339 32.66173     2 FALSE     1 0500000US01001 0500000US01001.1
3 -86.71422 32.70569     3 FALSE     1 0500000US01001 0500000US01001.1
4 -86.41312 32.70739     4 FALSE     1 0500000US01001 0500000US01001.1
5 -86.41117 32.40994     5 FALSE     1 0500000US01001 0500000US01001.1
6 -86.49677 32.34444     6 FALSE     1 0500000US01001 0500000US01001.1

Mapping our counties

counties <- readOGR(dsn = 'shp', layer = 'counties', verbose = FALSE) %>%
  fortify(region = 'AFFGEOID')

print(head(counties, 1))

      long      lat order  hole piece             id            group
1 -86.9176 32.66417     1 FALSE     1 0500000US01001 0500000US01001.1

map <- ggplot(counties, aes(x = long, y = lat, group = group)) +
  geom_polygon()

print(map)

plot of chunk unnamed-chunk-5

Adding a projection

map <- ggplot(counties, aes(x = long, y = lat, group = group)) +
  geom_polygon() +
  coord_map("albers", lat0 = 29.5, lat1 = 45.5)

print(map)

plot of chunk unnamed-chunk-7

Cleaning up the map

map <- ggplot(counties, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "#cccccc", color = '#ffffff', size = 0.2) +
  coord_map("albers", lat0 = 29.5, lat1 = 45.5) +
  theme_void()

print(map)

plot of chunk unnamed-chunk-9

PAUSE: Here's our entire script so far

require('dplyr')
require('ggplot2')
require('rgdal')
require('readr')
require('rgeos')
require('mapproj')


# load in our shapefile and convert to a data frame

counties <- readOGR(dsn = 'shp', layer = 'counties', verbose = FALSE) %>%
  fortify(region = 'AFFGEOID')


# plot the data frame

map <- ggplot(counties, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "#cccccc", color = '#ffffff', size = 0.2) +
  coord_map("albers", lat0 = 29.5, lat1 = 45.5) +
  theme_void()

print(map)

Let's plot some real data

counties <- readOGR(dsn = 'shp', layer = 'counties', verbose = FALSE) %>%
  fortify(region = 'AFFGEOID')

languages <- read_csv('languages.csv')

print(head(languages))

# A tibble: 6 x 5
              Id               Geography language    pop count
           <chr>                   <chr>    <chr>  <int> <int>
1 0500000US01001 Autauga County, Alabama  spanish  51979   950
2 0500000US01003 Baldwin County, Alabama  spanish 184627  7587
3 0500000US01005 Barbour County, Alabama  spanish  25456  1118
4 0500000US01007    Bibb County, Alabama  spanish  21471   390
5 0500000US01009  Blount County, Alabama  spanish  54165  3520
6 0500000US01011 Bullock County, Alabama  spanish  10003   316

print(unique(languages$language))

 [1] "spanish"       "french"        "french_creole" "italian"      
 [5] "portuguese"    "german"        "yiddish"       "scandinavian" 
 [9] "greek"         "russian"       "polish"        "armenian"     
[13] "persian"       "hindi"         "chinese"       "japanese"     
[17] "korean"        "hmong"         "thai"          "vietnemese"   
[21] "tagalog"       "navajo"        "hungarian"     "arabic"

Let's focus on just one language, and find a percentage

languages <- read_csv('languages.csv') %>%
  filter(language == 'french') %>%
  mutate(share = count/pop) %>%
  select(Id, Geography, language, share)

print(head(languages))

# A tibble: 6 x 4
              Id               Geography language        share
           <chr>                   <chr>    <chr>        <dbl>
1 0500000US01001 Autauga County, Alabama   french 0.0014044133
2 0500000US01003 Baldwin County, Alabama   french 0.0040730771
3 0500000US01005 Barbour County, Alabama   french 0.0007071025
4 0500000US01007    Bibb County, Alabama   french 0.0009780634
5 0500000US01009  Blount County, Alabama   french 0.0005169390
6 0500000US01011 Bullock County, Alabama   french 0.0035989203

Merging our external dataset with our counties

counties <- readOGR(dsn = 'shp', layer = 'counties', verbose = FALSE) %>%
  fortify(region = 'AFFGEOID')

languages <- read_csv('languages.csv') %>%
  filter(language == 'french') %>%
  mutate(share = count/pop)

mapData <- left_join(counties, languages, by = c('id' = 'Id'))

print(head(mapData))

       long      lat order  hole piece             id            group
1 -86.91760 32.66417     1 FALSE     1 0500000US01001 0500000US01001.1
2 -86.71339 32.66173     2 FALSE     1 0500000US01001 0500000US01001.1
3 -86.71422 32.70569     3 FALSE     1 0500000US01001 0500000US01001.1
4 -86.41312 32.70739     4 FALSE     1 0500000US01001 0500000US01001.1
5 -86.41117 32.40994     5 FALSE     1 0500000US01001 0500000US01001.1
6 -86.49677 32.34444     6 FALSE     1 0500000US01001 0500000US01001.1
                Geography language   pop count       share
1 Autauga County, Alabama   french 51979    73 0.001404413
2 Autauga County, Alabama   french 51979    73 0.001404413
3 Autauga County, Alabama   french 51979    73 0.001404413
4 Autauga County, Alabama   french 51979    73 0.001404413
5 Autauga County, Alabama   french 51979    73 0.001404413
6 Autauga County, Alabama   french 51979    73 0.001404413

And we map it!

mapData <- left_join(counties, languages, by = c('id' = 'Id'))

map <- ggplot(mapData, aes(x = long, y = lat, group = group, fill = share)) +
  geom_polygon(color = '#ffffff', size = 0.2) +
  coord_map("albers", lat0 = 29.5, lat1 = 45.5) +
  theme_void()

print(map)

plot of chunk unnamed-chunk-17

Adding a better color scale

map <- ggplot(mapData, aes(x = long, y = lat, group = group, fill = share)) +
  geom_polygon(color = '#ffffff', size = 0.2) +
  coord_map("albers", lat0 = 29.5, lat1 = 45.5) +
  theme_void() +
  scale_fill_continuous(low = '#cccccc', high = '#ff0000')

print(map)

plot of chunk unnamed-chunk-19

PAUSE: It didn't take a ton of code to make a real (custom!) map

require('dplyr')
require('ggplot2')
require('rgdal')
require('readr')
require('rgeos')
require('mapproj')


# load in our shapefile and convert to a data frame

counties <- readOGR(dsn = 'shp', layer = 'counties', verbose = FALSE) %>%
  fortify(region = 'AFFGEOID')


# load in our external data and do a calculcation

languages <- read_csv('languages.csv') %>%
  filter(language == 'french') %>%
  mutate(share = count/pop)


# join the map data and external data

mapData <- left_join(counties, languages, by = c('id' = 'Id'))


# plot the joined data

map <- ggplot(mapData, aes(x = long, y = lat, group = group, fill = share)) +
  geom_polygon(color = '#ffffff', size = 0.2) +
  coord_map("albers", lat0 = 29.5, lat1 = 45.5) +
  theme_void() +
  scale_fill_continuous(low = '#cccccc', high = '#ff0000')

print(map)

Let's get fancy - every language at once

languages <- read_csv('languages.csv') %>%
  #filter(language == 'french') %>%
  mutate(share = count/pop)

The magic of facet_wrap

map <- ggplot(mapData, aes(x = long, y = lat, group = group, fill = share)) +
  geom_polygon() +
  coord_map("albers", lat0 = 29.5, lat1 = 45.5) +
  theme_void() +
  scale_fill_continuous(low = '#cccccc', high = '#ff2700') +
  facet_wrap(~language, ncol = 5)

plot of chunk unnamed-chunk-24

Let's calculate standard deviations from the mean instead

languages <- read_csv('languages.csv') %>%
  mutate(share = count/pop) %>%
  group_by(language) %>%
  mutate(sds = (share - mean(share))/sd(share)) %>%
  select(Id, Geography, share, sds)

print(head(languages))

Source: local data frame [6 x 5]
Groups: language [1]

# A tibble: 6 x 5
  language             Id               Geography      share         sds
     <chr>          <chr>                   <chr>      <dbl>       <dbl>
1  spanish 0500000US01001 Autauga County, Alabama 0.01827661 -0.43601673
2  spanish 0500000US01003 Baldwin County, Alabama 0.04109366 -0.21737052
3  spanish 0500000US01005 Barbour County, Alabama 0.04391892 -0.19029730
4  spanish 0500000US01007    Bibb County, Alabama 0.01816404 -0.43709551
5  spanish 0500000US01009  Blount County, Alabama 0.06498661  0.01158556
6  spanish 0500000US01011 Bullock County, Alabama 0.03159052 -0.30843513

A cooler map

map <- ggplot(mapData, aes(x = long, y = lat, group = group, fill = sds >= 1.5)) +
  geom_polygon() +
  coord_map("albers", lat0 = 29.5, lat1 = 45.5) +
  theme_void() +
  #scale_fill_continuous(low = '#cccccc', high = '#ff0000') +
  facet_wrap(~language, ncol = 5)

plot of chunk unnamed-chunk-29

Better colors

map <- ggplot(mapData, aes(x = long, y = lat, group = group, fill = sds >= 1.5)) +
  geom_polygon() +
  coord_map("albers", lat0 = 29.5, lat1 = 45.5) +
  theme_void() +
  scale_fill_manual(values=c('#cccccc', '#ff0000')) +
  facet_wrap(~language, ncol = 5)

plot of chunk unnamed-chunk-32

FINAL PAUSE: Our completed script

require('dplyr')
require('ggplot2')
require('rgdal')
require('readr')
require('rgeos')
require('mapproj')


# load in our shapefile and convert to a data frame

counties <- readOGR(dsn = 'shp', layer = 'counties', verbose = FALSE) %>%
  fortify(region = 'AFFGEOID')


# load in our external data and do some calculcations

languages <- read_csv('languages.csv') %>%
  mutate(share = count/pop) %>%
  group_by(language) %>%
  mutate(sds = (share - mean(share))/sd(share))


# join the map data and external data

mapData <- left_join(counties, languages, by = c('id' = 'Id'))


# plot the joined data

map <- ggplot(mapData, aes(x = long, y = lat, group = group, fill = sds >= 1.5)) +
  geom_polygon(color = 'NA') +
  coord_map("albers", lat0 = 29.5, lat1 = 45.5) +
  theme_void() +
  scale_fill_manual(values=c('#cccccc', '#ff0000')) +
  facet_wrap(~language, ncol = 5)

print(map)