# Note: If a chunk does not run, you may need to uncomment the package-installation line (or other commented-out lines) first
# Load Libraries
#install.packages("tidyverse")
library(tidyverse)
# Read the metadata table from metadata.csv in the working directory
metadata <- read_csv(file = "metadata.csv")
# airquality dataset
# ggplot2 lingo
# - The tidyverse chains steps with the pipe, %>%; in ggplot we use + to add layers
# - aes() = "aesthetics"; this is how we map variables/other aesthetics
# - geom_ = "geometry"; these are built-in types of mapping for plots
#   (e.g. geom_bar() = bar chart, geom_histogram() = histogram)
# - Long data generally works better with ggplot2 than wide data
# - tidyr::pivot_longer() will become your best friend
# - There are many ggplot2 resources online
# View the dataset to understand its structure (wide or long?), variables, data types
# Open airquality for inspection (presumably tibble's view(), attached via tidyverse)
view(airquality)
# ggpmisc supplies stat_poly_eq() and use_label(), used in the next plot to
# annotate the trendline; attaching it also attaches ggpp
library(ggpmisc)
## Warning: package 'ggpmisc' was built under R version 4.1.2
## Loading required package: ggpp
## Warning: package 'ggpp' was built under R version 4.1.2
## 
## Attaching package: 'ggpp'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# Scatter plot of Temp against Wind with a fitted linear trendline.
# The x/y mappings given to ggplot() are shared by every layer below.
ggplot(data = airquality, mapping = aes(x = Wind, y = Temp)) +
  # Point layer; the colour mapping is set on this layer only, so the
  # trendline is still fitted to all points together.
  # (Simplest version: geom_point() with no extra mapping.)
  geom_point(mapping = aes(color = as.factor(Month))) +
  # Trendline fitted with lm instead of the default smoother.
  # (Simplest version: geom_smooth() with no arguments.)
  geom_smooth(method = "lm") +
  # Label the fit with R2, p-value and n, placed via label.x / label.y
  stat_poly_eq(
    mapping = use_label(c("R2", "p", "n")),
    label.x = 20,
    label.y = 90
  )
## `geom_smooth()` using formula = 'y ~ x'

# Line chart of Temp by Day for a single month.
# The data is narrowed to Month == 5 first, then handed to ggplot().
airquality %>%
  filter(Month == 5) %>%
  ggplot(mapping = aes(x = Day, y = Temp)) +
  # One line connecting the daily temperature readings
  geom_line()

#library(lubridate)
# Manipulate the data before piping it into the ggplot() call to create a new date column
#airquality %>% mutate(date = make_date(2000, Month, Day)) %>% 
#  ggplot(aes(x = date, y = Temp)) +
#  geom_line()
library(lubridate)
# Plot every measurement as its own time series, one facet row per variable.
airquality %>%
  # Build a real date from Month and Day (the year is fixed at 2000),
  # then drop the two columns it was built from
  mutate(date = make_date(year = 2000, month = Month, day = Day)) %>%
  select(-Month, -Day) %>%
  # Stack the four measurement columns into name/value pairs (long format)
  pivot_longer(cols = c(Ozone, Solar.R, Wind, Temp)) %>%
  ggplot(mapping = aes(x = date, y = value)) +
  geom_line() +
  # One row of panels per variable name
  facet_grid(rows = vars(name))

# View the dataset to understand its structure (wide or long?), variables, data types
view(metadata)
# Bar chart of how often each top-level element occurs, split by collection.
# Pipe the dataset into a ggplot() call
metadata %>%
# Set aesthetic mapping for just one variable
  ggplot(aes(x = element)) +
# Create a bar layer. geom_bar() counts the rows per category itself, so no
# stat override is needed (geom_histogram(stat = "count") only works by
# accident and warns about ignored binning parameters).
#  geom_bar() #+
# Change the color of geoms by mapping fill to a variable; this layer
# replaces the plain one above rather than being drawn on top of it
  geom_bar(aes(fill = collection)) +
# Change the angle of axis labels
# NOTE(review): once the axes are flipped this presumably applies to the
# horizontal (count) axis - confirm the rotation is still wanted
  theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust = 1)) +
# Flip x and y axis (must be chained with `+`; on its own line it is never
# applied to the plot and just prints the coord object)
  coord_flip()
# The plots we've just created are what I would consider "bare-minimum" visualizations. Here's how we take them to the next level:
# - ggplot2 text sizes are generally too small. Increasing the text size is an easy
#   way to make more accessible, professional-looking plots. Read more on accessible
#   text here (link not preserved in this copy).
# Create a custom color palette
# Two contrasting colours (positions 2 and 9) from the 11-class PRGn palette
my_pal <- RColorBrewer::brewer.pal(11, "PRGn")[c(2, 9)]
library(ggtext)
## Warning: package 'ggtext' was built under R version 4.1.2
library(RColorBrewer)
# Polished, horizontal bar chart comparing element frequency across collections.
# Manipulate data before piping it into the ggplot() call:
# count() gives one row per element/collection pair with its frequency and
# returns an ungrouped table
metadata %>%
  count(element, collection, name = "count") %>%
# Set aesthetic mapping for all layers
# Reorder a variable by its value
  ggplot(aes(x = reorder(element, count), y = count, fill = collection)) +
# Create a column layer, set the columns to equal width
  geom_col(position = position_dodge(preserve = "single")) +
# Flip the axes
  coord_flip() +
# Add a default theme before theme alterations
  theme_minimal() +
# Alter the legend position and the text size
  theme(
    legend.position = "bottom",
    text = element_text(size = 20)
  ) +
# Add annotations, specify their position and color
# (x / y are hard-coded plot coordinates tuned to this dataset)
  annotate(
    "text",
    y = 440, x = 16.25,
    label = "In the Elections Web Archive, subject repeats a mean \n average of 12.8 times per metadata record",
    color = my_pal[2]
  ) +
  annotate(
    "text",
    y = 120, x = 1.8,
    label = "The Digital Library of the Caribbean metadata contains three \n unique top-level elements: note, abstract, and classification",
    color = my_pal[1]
  ) +
# Modify the titles, axes labels, and caption
  labs(
    title = "Frequency of Top-Level Elements",
    # subtitle = "Frequency of Top-Level Elements",
    y = "Number of Observations",
    x = "Top-Level Element",
    # Build the caption line by line with paste0(): a string literal that
    # spans several source lines would carry its real newlines and
    # indentation into the rendered caption
    caption = paste0(
      "Based on 40 random records collected from each repository (80 records total).\n",
      "Data Sources: https://www.dloc.com/ and\n",
      "https://www.loc.gov/collections/united-states-elections-web-archive/"
    )
  ) +
# Add a custom color palette, alter the names for the legend and variables
# NOTE(review): labels are matched to the fill levels by position - confirm
# the collection values sort with the Caribbean library first
  scale_fill_manual(
    values = my_pal,
    name = "Collection",
    labels = c("Digital Library of the Caribbean", "Elections Web Archive")
  )

# Write the most recently displayed plot to a PNG file.
# getwd() shows the folder the relative file name will be resolved against.
getwd()
## [1] "/Users/joannaschroeder/Documents/R/intro-rmd-websites/web"
ggsave(
  filename = "metadata_exploration-element_comparison_bar.png",
  plot = last_plot(),
  width = 14,
  height = 10,
  units = "in",
  bg = "white"
)
library(tidyverse)
library(tidycensus)
# Step 1: download the tract-level ACS data with tidycensus.
# Variable IDs to request from table S1702.
# NOTE(review): some IDs carry a trailing "E" and some do not - confirm
# this mix is intentional.
vars <- c(
  "S1702_C01_001", "S1702_C02_001",
  "S1702_C01_043E", "S1702_C01_044E", "S1702_C01_045E", "S1702_C01_046E",
  "S1702_C01_047E", "S1702_C01_048E", "S1702_C01_049E", "S1702_C01_050E"
)
fairfax_family_poverty <- get_acs(
  geography = "tract",
  variables = vars,
  state = "VA",
  county = "Fairfax County",
  year = 2020,
  survey = "acs5",
  # geometry = TRUE is what makes the result mappable with geom_sf() -
  # do not leave it out
  geometry = TRUE,
  cache_table = TRUE
)
library(tidyverse)
library(tidycensus)
# Second use ggplot to map the data (default fill scale)
fairfax_family_poverty %>%
  # If more than one variable, filter for the name of the variable you want to map
  filter(variable == "S1702_C01_001") %>%
  ggplot() +
  geom_sf(aes(fill = estimate)) + # fill = name of column with values to map
  labs(
    fill = "Population (Count)", # Legend title
    title = "Fairfax County Population (2020)", # Graph title
    subtitle = "Measured at the Census Tract",
    # Graph caption. Built with paste0() so each line break is explicit; a
    # string literal that spans source lines would carry its real newline
    # and indentation into the rendered caption.
    caption = paste0(
      "A great phrase explaining my figure\n",
      "Data Source: American Community Survey, 5-Year Estimates, Table S1702"
    )
  ) +
  theme_void() # Takes out x and y axis, axis labels

# You can also make whatever changes you want to the map (color palettes, geography outlines, adding points etc)
library(viridis)
# Same map as before, now with a viridis fill palette
fairfax_family_poverty %>%
  # If more than one variable, filter for the name of the variable you want to map
  filter(variable == "S1702_C01_001") %>%
  ggplot() +
  geom_sf(aes(fill = estimate)) + # fill = name of column with values to map
  labs(
    fill = "Population (Count)", # Legend title
    title = "Fairfax County Population (2020)", # Graph title
    subtitle = "Measured at the Census Tract",
    # Graph caption. Built with paste0() so each line break is explicit; a
    # string literal that spans source lines would carry its real newline
    # and indentation into the rendered caption.
    caption = paste0(
      "A great phrase explaining my figure\n",
      "Data Source: American Community Survey, 5-Year Estimates, Table S1702"
    )
  ) +
  theme_void() + # Takes out x and y axis, axis labels
  scale_fill_viridis(option = "viridis") # or option = "magma"

# You can also make whatever changes you want to the map (color palettes, geography outlines, adding points etc)
# These are the palettes used for the data commons
#devtools::install_github("thomasp85/scico")
library(scico)
# Same map as before, now with a scico fill palette
fairfax_family_poverty %>%
  # If more than one variable, filter for the name of the variable you want to map
  filter(variable == "S1702_C01_001") %>%
  ggplot() +
  geom_sf(aes(fill = estimate)) + # fill = name of column with values to map
  labs(
    fill = "Population (Count)", # Legend title
    title = "Fairfax County Population (2020)", # Graph title
    subtitle = "Measured at the Census Tract",
    # Graph caption. Built with paste0() so each line break is explicit; a
    # string literal that spans source lines would carry its real newline
    # and indentation into the rendered caption.
    caption = paste0(
      "A great phrase explaining my figure\n",
      "Data Source: American Community Survey, 5-Year Estimates, Table S1702"
    )
  ) +
  theme_void() + # Takes out x and y axis, axis labels
  scale_fill_scico(palette = 'lajolla')  # or palette = "vik" (divergent)

# You can also make whatever changes you want to the map (color palettes, geography outlines, adding points etc)
# Choose one of the example datasets (or another base R dataset if you know of one)
# Use `ggplot2` to explore and visualize the data
# Create a bare-minimum visualization or two
# If we have time, take your bare-minimum visualization to the next level
# Report out your data story