library(DBI)
library(tidyverse)
library(dbplyr)
library(knitr)
8 Creating Complex Data Sets for Analysis
8.1 When to Use SQL for Complex Data Sets
8.1.1 Advantages of Using SQL
8.1.2 When to Build into ETL Instead
8.1.3 When to Put Logic in Other Tools
8.2 Code Organization
db <- dbConnect(duckdb::duckdb())
earthquakes <- tbl(db, "read_csv_auto('data/earthquakes*.csv')")
legislators_terms <-
tbl(db, "read_csv_auto('data/legislators_terms.csv')") |>
compute(name = "legislators_terms")
videogame_sales <-
tbl(db, "read_csv_auto('data/videogame_sales.csv')") |>
compute(name = "videogame_sales")earthquakes |>
filter(date_part('year', time) >= 2019,
between(mag, 0, 1)) |>
mutate(place = case_when(grepl('CA', place) ~ 'California',
grepl('AK', place) ~ 'Alaska',
TRUE ~ trim(split_part(place, ',', 2L)))) |>
count(place, type, mag) |>
arrange(desc(n)) |>
collect(n = 10) |>
kable()
| place | type | mag | n |
|---|---|---|---|
| Alaska | earthquake | 1.00 | 4824 |
| Alaska | earthquake | 0.90 | 4153 |
| Alaska | earthquake | 0.80 | 3219 |
| California | earthquake | 0.56 | 2631 |
| Alaska | earthquake | 0.70 | 2061 |
| Nevada | earthquake | 1.00 | 1688 |
| Nevada | earthquake | 0.80 | 1681 |
| Nevada | earthquake | 0.90 | 1669 |
| Alaska | earthquake | 0.60 | 1464 |
| California | earthquake | 0.55 | 1444 |
8.2.1 Commenting
8.2.2 Capitalization, Indentation, Parentheses, and Other Formatting Tricks
8.2.3 Storing Code
8.3 Organizing Computations
8.3.1 Understanding Order of SQL Clause Evaluation
In general, I think the order of evaluation is more intuitive when writing SQL using dbplyr. It is perhaps more confusing to someone who has written a lot of SQL without thinking about the order in which clauses are evaluated (e.g., WHERE is evaluated early, after FROM but before GROUP BY, HAVING, SELECT, and ORDER BY, yet it appears relatively late in the written query).
In dplyr the meaning of filter() is usually clear from where it is placed. Sometimes it is translated into WHERE and sometimes into HAVING. In the example below, a WHERE-like filter would be placed just after legislators_terms (much as it is evaluated by the SQL query engine), whereas here filter() is translated into HAVING because it uses the result of the GROUP BY. One hardly needs to give this much thought.
terms_by_states <-
legislators_terms |>
group_by(state) |>
summarize(terms = n()) |>
filter(terms >= 1000) |>
arrange(desc(terms))
terms_by_states |>
show_query()
<SQL>
SELECT state, COUNT(*) AS terms
FROM legislators_terms
GROUP BY state
HAVING (COUNT(*) >= 1000.0)
ORDER BY terms DESC
terms_by_states |>
collect(n = 10) |>
kable()
| state | terms |
|---|---|
| NY | 4159 |
| PA | 3252 |
| OH | 2239 |
| CA | 2121 |
| IL | 2011 |
| TX | 1692 |
| MA | 1667 |
| VA | 1648 |
| NC | 1351 |
| MI | 1284 |
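For contrast, here is a minimal sketch (the state filter is just an illustration) in which filter() is placed before group_by(); in that position dbplyr translates it into a WHERE clause, as show_query() confirms.
legislators_terms |>
  filter(state == 'NY') |>
  group_by(state) |>
  summarize(terms = n()) |>
  show_query()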
The way dplyr code is written makes it easy to look at the output of each step in the series of pipes. In the query below, we can simply highlight the code up to the end of any pipe and evaluate it to see whether it is doing what we want and expect it to be doing. In this specific case, I find the dplyr code to be more intuitive than the SQL provided in the book.
legislators_terms |>
group_by(state) |>
summarize(terms = n(), .groups = "drop") |>
mutate(avg_terms = mean(terms, na.rm = TRUE)) |>
collect(n = 10) |>
kable(digits = 2)
| state | terms | avg_terms |
|---|---|---|
| OH | 2239 | 746.83 |
| WA | 489 | 746.83 |
| MD | 961 | 746.83 |
| DE | 227 | 746.83 |
| PA | 3252 | 746.83 |
| CA | 2121 | 746.83 |
| ID | 188 | 746.83 |
| MN | 687 | 746.83 |
| NJ | 1255 | 746.83 |
| VT | 379 | 746.83 |
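As a quick sketch of that workflow, evaluating just the pipeline up to the summarize() (i.e., stopping before the mutate()) prints the intermediate result, making it easy to verify each step before adding the next.
legislators_terms |>
  group_by(state) |>
  summarize(terms = n(), .groups = "drop")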
legislators_terms |>
group_by(state) |>
summarize(terms = n(), .groups = "drop") |>
window_order(desc(terms)) |>
mutate(rank = row_number()) |>
arrange(rank) |>
collect(n = 10) |>
kable()
| state | terms | rank |
|---|---|---|
| NY | 4159 | 1 |
| PA | 3252 | 2 |
| OH | 2239 | 3 |
| CA | 2121 | 4 |
| IL | 2011 | 5 |
| TX | 1692 | 6 |
| MA | 1667 | 7 |
| VA | 1648 | 8 |
| NC | 1351 | 9 |
| MI | 1284 | 10 |
8.3.2 Subqueries
In writing dbplyr code, it is more natural to think in terms of CTEs, even though the code you write will generally be translated into SQL using subqueries.
The query in the book written with LATERAL seems much more confusing to me than the following. (Also, prepending EXPLAIN to each query suggests that the LATERAL version is more complicated for PostgreSQL to execute.) I rewrote the LATERAL query using CTEs and got the following, which seems closer to the second SQL query included in the book.
WITH
current_legislators AS (
SELECT distinct id_bioguide, party
FROM legislators_terms
WHERE term_end > '2020-06-01'),
party_changers AS (
SELECT b.id_bioguide, min(term_start) as first_term
FROM legislators_terms b
INNER JOIN current_legislators AS a
ON b.id_bioguide = a.id_bioguide AND b.party <> a.party
GROUP BY 1)
SELECT date_part('year', first_term) as first_year, party,
count(id_bioguide) as legislators
FROM current_legislators
INNER JOIN party_changers
USING (id_bioguide)
GROUP BY 1, 2;
| first_year | party | legislators |
|---|---|---|
| 2011 | Libertarian | 1 |
| 1979 | Republican | 1 |
| 2015 | Democrat | 1 |
Translating the CTE version into dbplyr is a piece of cake.
current_legislators <-
legislators_terms |>
filter(term_end > '2020-06-01') |>
distinct(id_bioguide, party)
party_changers <-
legislators_terms |>
inner_join(current_legislators,
join_by(id_bioguide)) |>
filter(party.x != party.y) |>
group_by(id_bioguide) |>
summarize(first_term = min(term_start, na.rm = TRUE), .groups = "drop")
current_legislators |>
inner_join(party_changers, by = "id_bioguide") |>
mutate(first_year = date_part('year', first_term)) |>
group_by(first_year, party) |>
summarize(legislators = n(), .groups = "drop") |>
collect() |>
kable()
| first_year | party | legislators |
|---|---|---|
| 2011 | Libertarian | 1 |
| 1979 | Republican | 1 |
| 2015 | Democrat | 1 |
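To confirm that dbplyr stitches these pieces together as nested subqueries rather than CTEs, we can inspect the generated SQL for the final query:
current_legislators |>
  inner_join(party_changers, by = "id_bioguide") |>
  mutate(first_year = date_part('year', first_term)) |>
  group_by(first_year, party) |>
  summarize(legislators = n(), .groups = "drop") |>
  show_query()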
8.3.3 Temporary Tables
Creating temporary tables with dbplyr is easy: simply append compute() to the end of the table definition. Generally, dbplyr takes care of details that rarely matter, such as choosing a name for the table (we don’t care, because we can refer to the table below as current_legislators regardless of the name chosen for it).
current_legislators |>
show_query()
<SQL>
SELECT DISTINCT id_bioguide, party
FROM legislators_terms
WHERE (term_end > '2020-06-01')
current_legislators <-
legislators_terms |>
filter(term_end > '2020-06-01') |>
distinct(id_bioguide, party) |>
compute()
current_legislators |>
show_query()
<SQL>
SELECT *
FROM dbplyr_001
Creating temporary tables can lead to significant performance gains in some situations, as the query optimizer has a simpler object to work with. Note that dbplyr allows you to create indexes on temporary tables, which can improve performance even further.
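As a sketch of that option (assuming the backend supports index creation on temporary tables; the current_legislators_indexed name is made up here), an index can be requested directly in compute():
current_legislators_indexed <-
  legislators_terms |>
  filter(term_end > '2020-06-01') |>
  distinct(id_bioguide, party) |>
  compute(indexes = list("id_bioguide"))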
Note that some database administrators do not allow users to create temporary tables. In such cases, you can often use collect() followed by copy_inline() to similar effect. (This approach is also useful when you have data outside the database, so collect() is not relevant, but you want to merge it with data in the database.1)
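Here is a sketch of that pattern (the local_legislators and copied_legislators names are made up): the lazy table is collected into R and then sent back as an inline query with copy_inline(), so no temporary table is created.
# Bring the lazy table into R (or start from any local data frame)
local_legislators <- current_legislators |> collect()

# Send it back to the database as an inline query; no temporary table is created
copied_legislators <- copy_inline(db, local_legislators)

copied_legislators |>
  inner_join(legislators_terms, by = "id_bioguide") |>
  count(state) |>
  arrange(desc(n))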
8.3.4 Common Table Expressions
We have been using CTEs throughout the book already. For the sake of completeness, I rewrite the query given in the book here.
first_term <-
legislators_terms |>
group_by(id_bioguide) |>
summarize(first_term = min(term_start, na.rm = TRUE),
.groups = "drop")
first_term |>
inner_join(legislators_terms, by = "id_bioguide") |>
mutate(periods = date_part('year', age(term_start, first_term))) |>
group_by(periods) |>
summarize(cohort_retained = n_distinct(id_bioguide)) |>
collect(n = 3) |>
kable()
| periods | cohort_retained |
|---|---|
| 21 | 280 |
| 3 | 1831 |
| 6 | 2385 |
8.3.5 GROUPING SETS
I don’t think there is a “pure” dbplyr way of doing these. However, not all is lost for the dedicated dbplyr user, as the following examples demonstrate. Note that DuckDB requires a more explicit statement of the GROUPING SETS.
global_sales <-
tbl(db, sql("
(SELECT platform, genre, publisher,
sum(global_sales) as global_sales
FROM videogame_sales
GROUP BY GROUPING SETS ((platform, genre, publisher),
platform, genre, publisher))"))
global_sales |>
arrange(desc(global_sales)) |>
collect(n = 10) |>
kable()
| platform | genre | publisher | global_sales |
|---|---|---|---|
| NA | NA | Nintendo | 1786.56 |
| NA | Action | NA | 1751.18 |
| NA | Sports | NA | 1330.93 |
| PS2 | NA | NA | 1255.64 |
| NA | NA | Electronic Arts | 1110.32 |
| NA | Shooter | NA | 1037.37 |
| X360 | NA | NA | 979.96 |
| PS3 | NA | NA | 957.84 |
| NA | Role-Playing | NA | 927.37 |
| Wii | NA | NA | 926.71 |
global_sales_cube <-
tbl(db, sql("
(SELECT coalesce(platform, 'All') as platform,
coalesce(genre,'All') AS genre,
coalesce(publisher,'All') AS publisher,
sum(global_sales) AS global_sales
FROM videogame_sales
GROUP BY cube (platform, genre, publisher))"))
global_sales_cube |>
arrange(platform, genre, publisher) |>
collect(n = 10) |>
kable()
| platform | genre | publisher | global_sales |
|---|---|---|---|
| 2600 | Action | 20th Century Fox Video Games | 1.72 |
| 2600 | Action | Activision | 4.64 |
| 2600 | Action | All | 29.34 |
| 2600 | Action | Answer Software | 0.50 |
| 2600 | Action | Atari | 7.68 |
| 2600 | Action | Avalon Interactive | 0.17 |
| 2600 | Action | Bomb | 0.22 |
| 2600 | Action | CBS Electronics | 0.31 |
| 2600 | Action | CPG Products | 0.54 |
| 2600 | Action | Coleco | 1.26 |
8.4 Managing Data Set Size and Privacy Concerns
8.4.1 Sampling with %, mod
One can also use random() to similar effect.
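As a hedged sketch of both approaches against legislators_terms (the 10% rate and the hash-bucket choice are made-up illustrations; hash() is DuckDB-specific, and both functions are passed through to the database untranslated):
# Roughly 10% random sample; random() is evaluated by the database
legislators_terms |>
  filter(random() < 0.1) |>
  count(name = "sampled_terms")

# Deterministic sample: keep rows whose id falls into one of ten hash buckets
legislators_terms |>
  filter(hash(id_bioguide) %% 10L == 3L) |>
  count(name = "sampled_terms")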
8.4.2 Reducing Dimensionality
legislators_terms |>
mutate(state_group =
case_when(state %in% c('CA', 'TX', 'FL', 'NY', 'PA') ~ state,
TRUE ~ 'Other')) |>
group_by(state_group) |>
summarize(terms = n()) |>
arrange(desc(terms)) |>
collect(n = 6) |>
kable()
| state_group | terms |
|---|---|
| Other | 31980 |
| NY | 4159 |
| PA | 3252 |
| CA | 2121 |
| TX | 1692 |
| FL | 859 |
top_states <-
legislators_terms |>
group_by(state) |>
summarize(n_reps = n_distinct(id_bioguide), .groups = "drop") |>
window_order(desc(n_reps)) |>
mutate(rank = row_number())
legislators_terms |>
inner_join(top_states, by = "state") |>
mutate(state_group = case_when(rank <= 5 ~ state,
TRUE ~ 'Other')) |>
group_by(state_group) |>
summarize(terms = n_distinct(id_bioguide)) |>
arrange(desc(terms)) |>
collect(n = 6) |>
kable()
| state_group | terms |
|---|---|
| Other | 8317 |
| NY | 1494 |
| PA | 1075 |
| OH | 694 |
| IL | 509 |
| VA | 451 |
Note that the CASE WHEN in the SQL in the book can be significantly simplified.
WITH num_terms AS (
SELECT id_bioguide, count(term_id) as terms
FROM legislators_terms
GROUP BY 1)
SELECT terms >= 2 AS two_terms_flag,
count(*) as legislators
FROM num_terms
GROUP BY 1;
| two_terms_flag | legislators |
|---|---|
| TRUE | 8379 |
| FALSE | 4139 |
num_terms <-
legislators_terms |>
group_by(id_bioguide) |>
summarize(terms = n(), .groups = "drop")
num_terms |>
mutate(two_terms_flag = terms >= 2) |>
count(two_terms_flag) |>
collect() |>
kable()
| two_terms_flag | n |
|---|---|
| TRUE | 8379 |
| FALSE | 4139 |
Now we can reuse the num_terms lazy table we created above.
num_terms |>
mutate(terms_level = case_when(terms >= 10 ~ '10+',
terms >= 2 ~ '2 - 9',
TRUE ~ '1')) |>
count(terms_level) |>
collect() |>
kable()
| terms_level | n |
|---|---|
| 2 - 9 | 7496 |
| 10+ | 883 |
| 1 | 4139 |