Cleaning Data

Sarah Cassie Burnett

September 30, 2025

Scenario 1: NOAA weather station data

Load Libraries and data

library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
# NOAA GHCN Monthly (local subset for speed)
# https://www.ncei.noaa.gov/products/land-based-station/global-historical-climatology-network-monthly
temps <- read_csv("temps.csv", show_col_types = FALSE)
# show_col_types = FALSE just suppresses the column-type message;
# this keeps the output tidy when including code in Quarto documents


# Station metadata
# https://www.philchodrow.prof/
stations <- read_csv("https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/noaa-ghcn/station-metadata.csv",
                     show_col_types = FALSE)

# FIPS 10-4 to ISO + Names
# https://www.mysociety.org/
countries <- read_csv("https://raw.githubusercontent.com/mysociety/gaze/master/data/fips-10-4-to-iso-country-codes.csv",
                      show_col_types = FALSE)

Check out the data


glimpse(temps)
dim(temps)          # rows, columns
colnames(temps)     # column names
head(temps)         # preview

colnames(temps)
colnames(stations)
colnames(countries)

temps |> arrange(Year)
temps |> arrange(desc(Year))

Join Plan

1. Keys

  • Do they match between tables? (checked in the sketch below)
    • temps.ID and stations.ID
    • substr(ID, 1, 2) and countries."FIPS 10-4"

2. Join Type

  • What rows do we want to keep?
    • Keep all rows from temps
    • Use left joins
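
Before joining, it's worth confirming that the keys really line up. A quick sketch, assuming the three tables loaded above:

# station IDs in temps with no match in stations (ideally zero)
temps |> distinct(ID) |> anti_join(stations, by = "ID") |> nrow()

# ID prefixes with no match in the countries table
setdiff(substr(unique(temps$ID), 1, 2), countries$`FIPS 10-4`)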

Join: temps + stations


temps_new <- temps %>%
  left_join(stations,  by = "ID")
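
A quick sanity check: a left join keeps every row of temps, so the row count should be unchanged unless stations has duplicate IDs.

nrow(temps) == nrow(temps_new)   # TRUE if stations has one row per ID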

Create FIPS from ID


substr("AFM00040911", 1, 2)
substr("AFM00040911", 1, 7)

substr("AFM00040911", 1, 2) == "AF"

temps %>%
  left_join(stations,  by = "ID") %>%
  # match country code: ID prefix matches FIPS 10-4 (first 2 chars of ID)?
  mutate(`FIPS 10-4` = substr(ID, 1, 2))

Join: add countries


temps %>%
  left_join(stations,  by = "ID") %>%
  # match country code: ID prefix matches FIPS 10-4 (first 2 chars of ID)?
  mutate(`FIPS 10-4` = substr(ID, 1, 2)) %>%
  left_join(countries, by = "FIPS 10-4")

Issues from duplicate FIPS codes

Remove duplicates from country dataset


# Count duplicates by FIPS and filter to see them
countries |> add_count(`FIPS 10-4`) |> filter(n > 1)

# Deduplicate by FIPS code (keep first in the tibble)
countries <- countries |> distinct(`FIPS 10-4`, .keep_all = TRUE)
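
And a quick check that the deduplication worked:

# should now return zero rows
countries |> add_count(`FIPS 10-4`) |> filter(n > 1)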

Clean up the names to remove whitespace


library(janitor)

countries |> clean_names() 
# Here I'll keep working with the original column names.
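
To see what clean_names() would change, compare the names before and after; for instance, a column like FIPS 10-4 becomes snake_case:

colnames(countries)                        # original names, e.g. "FIPS 10-4"
countries |> clean_names() |> colnames()   # cleaned names, e.g. "fips_10_4"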

Clean missing values, filter, and reorder


mytemps <- temps %>%
  left_join(stations,  by = "ID") %>%
  # match country code: ID prefix matches FIPS 10-4 (first 2 chars of ID)?
  mutate(`FIPS 10-4` = substr(ID, 1, 2)) %>%
  left_join(countries, by = "FIPS 10-4") %>%
  mutate(
    Temp = na_if(Temp, -9999)    # turn the -9999 sentinel into NA
  ) %>%
  filter(LATITUDE >= 40 | LATITUDE <= -40) %>%   # keep stations poleward of 40°
  arrange(desc(Year))

mytemps |> head(7)
mytemps |> tail(7)
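
A quick spot check that the -9999 sentinel really became NA:

sum(mytemps$Temp == -9999, na.rm = TRUE)   # should be 0 after na_if()
sum(is.na(mytemps$Temp))                   # how many temps are now missing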

Plot: Temperature Over Time

library(ggplot2)

# sample 1% for plotting, not always necessary
temps_small <- mytemps %>% sample_frac(0.01) 
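
If you want the same 1% sample on every render, set a seed first. Here is a small sketch using slice_sample(), the current dplyr replacement for the superseded sample_frac():

set.seed(42)   # any fixed seed makes the sample reproducible
temps_small <- mytemps |> slice_sample(prop = 0.01)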


# simple line plot
p <- ggplot(temps_small, aes(x = Year, y = Temp)) +
  geom_line(color = "chartreuse4") + 
  labs(
    title = "Temperature over time",
    x = "Year",
    y = "Temperature (hundredths of °C)",
  ) +
  theme_bw()

Plot: Temperature Over Time

[Figure: Temperatures at weather stations over time]

Save Plot


ggsave("temperature_over_time.png", plot = p, 
       width = 8, height = 4, dpi = 300)

Plot Temperature at the Locations of the Data


ggplot(temps_small, aes(x = LONGITUDE, y = LATITUDE, color = Temp/100)) +
  geom_point(alpha = 0.6, size = 1.2, na.rm = TRUE) +
  scale_color_viridis_c(option = "C", na.value = "grey80") +
  coord_quickmap() +
  labs(
    title = "Observed Temperature by Location",
    x = "Longitude", y = "Latitude",
    color = "Temp (°C)"
  ) +
  theme_bw()

Plot Temperature at the Locations of the Data

[Figure: Temperatures at weather stations on a map]

Scenario 2: Merge World Bank and V-Dem


See last week’s Merging Data slides for another example.

Treating Missing Values

Drop all NAs? Not always the best idea…


library(tidyr)

glimpse(airquality)

# default drops any row that has a NA
all_dropped <- airquality |> drop_na()
nrow(all_dropped)   # far fewer rows remain!

Focus on what’s important

  • Sometimes only some of the columns matter for your analysis.
  • Maybe the important cols here are Ozone and Temp (for a health study).
  • If we drop ALL rows with any NA, we also lose rows where Ozone is recorded but Solar.R is missing — that’s throwing away useful information!
ozone_dropped <- airquality |> drop_na(Ozone)
nrow(ozone_dropped)   # more rows remain

nrow(airquality)

Summarize what you need

Better to extract the information you want — whatever you plan to analyze or plot — and then handle the NAs in those columns.

library(dplyr)

airquality |>
  drop_na(Ozone) |>
  group_by(Month) |>
  summarize(mean_ozone = mean(Ozone))
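
An equivalent route keeps every row and handles the NAs inside the summary instead:

airquality |>
  group_by(Month) |>
  summarize(mean_ozone = mean(Ozone, na.rm = TRUE))   # same group means, no rows dropped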

Heuristics / Rules of Thumb

  1. Select columns of interest (so you don’t have to worry about the rest)
  2. Use drop_na(col1, col2) to handle specifically col1 and col2
  3. Replace missing values with something that makes sense (imputation)
  4. Filter out rows given your assumptions

Sometimes missing things don’t come as actual NAs!
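
For example, sentinel codes like NOAA's -9999, or strings like "N/A", look like ordinary values to R until you convert them. A sketch with hypothetical column names:

# df, temp, and status are hypothetical; adapt to your own columns
df |> mutate(
  temp   = na_if(temp, -9999),    # numeric sentinel, as in the NOAA data earlier
  status = na_if(status, "N/A")   # string sentinel
)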

Alternatives to dropping NAs

  • imputation = replacing missing values with a reasonable substitute (sketch below)
  • recoding = treating missing values as their own category or code
df |> mutate(education = if_else(is.na(education), "Unknown", education))
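
For imputation, tidyr's replace_na() swaps in a value you choose. Here is a sketch that fills missing Ozone with the column median (one simple, debatable choice); Ozone is stored as integer, so convert it to double first:

library(tidyr)

airquality |>
  mutate(
    Ozone = as.numeric(Ozone),   # integer -> double, so the median (31.5) can be assigned
    Ozone = replace_na(Ozone, median(Ozone, na.rm = TRUE))
  )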

Other useful tidyr functions

pivot_longer() — why would you need this?

  • Maybe you get data where some of the columns are years
  • Variables like time should be a single column so you can use tidy data functions
  • Three basic arguments (see the sketch after this list):
cols      # which columns you want to pivot
names_to  # the name of the column where the old column names are going
values_to # the name of the column where the values are going
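
A minimal sketch with made-up data, where the year columns collapse into a single Year variable:

# hypothetical wide data: one column per year
wide <- tibble(
  country = c("A", "B"),
  `2020`  = c(1.1, 2.2),
  `2021`  = c(1.3, 2.4)
)

wide |>
  pivot_longer(
    cols      = `2020`:`2021`,  # which columns to pivot
    names_to  = "Year",         # old column names land here
    values_to = "Value"         # cell values land here
  )

Note that Year comes out as character; add names_transform = list(Year = as.integer) if you need it numeric.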