7 Size Sorts and p-Hacking

We do not need the furr package for the dbplyr version of the code, but we do load the dbplyr package so that we can access the window_order() function below.

dbplyr
Original

library(tidyverse)
library(DBI)
library(scales)
library(sandwich)
library(lmtest)
library(dbplyr)
library(rlang)

library(tidyverse)
library(DBI)
library(scales)
library(sandwich)
library(lmtest)
library(furrr)
library(rlang)

7.1 Data Preparation

dbplyr
Original

tidy_finance <- dbConnect(
  duckdb::duckdb(),
  "data/tidy_finance.duckdb",
  read_only = TRUE)

crsp_monthly <- 
  tbl(tidy_finance, "crsp_monthly")


factors_ff_monthly <- 
  tbl(tidy_finance, "factors_ff_monthly")

tidy_finance <- dbConnect(
  duckdb::duckdb(),
  "data/tidy_finance.duckdb",
  read_only = TRUE)

crsp_monthly <- 
  tbl(tidy_finance, "crsp_monthly") |> 
  collect()

factors_ff_monthly <- 
  tbl(tidy_finance, "factors_ff_monthly") |> 
  collect()

7.2 Size Distribution

In the following code, we need to make a few tweaks in translating it to dbplyr.

First, we want to extract the calculations of quantiles by month into a separate query. These calculations are grouped aggregates, so should happen inside a group_by()/summarize() query in SQL. We also include the calculation of total_market_cap in this query.

Second, I simplify the creation of the quantile indicators (e.g., top01). There is no need to write if_else(condition == TRUE, 1, 0) when condition will effectively yield the same result. This means that we do not need to write mktcap[top01 == 1], but can use mktcap[top01].

Third, the original approach has calculations such as sum(mktcap[top01 == 1]) / total_market_cap inside a summarize() call. However, SQL implementations are normally strict in requiring variables to appear either inside an aggregate function (e.g., sum()) or as part of the grouping (group_by or GROUP BY). In the original calculation is outside the sum() aggregate and not part of the grouping set (we have group_by(month)).

Fourth, because we want to use factors for the plot, we need to collect() before constructing the factor.

Otherwise the two approaches are the same.

dbplyr
Original

monthly_quantiles <-
  crsp_monthly |>
  group_by(month) |>
  summarize(
    total_market_cap = sum(mktcap, na.rm = TRUE),
    q_top01 = quantile(mktcap, 0.99),
    q_top05 = quantile(mktcap, 0.95),
    q_top10 = quantile(mktcap, 0.90),
    q_top25 = quantile(mktcap, 0.75),
    .groups = "drop")

crsp_monthly |>
  inner_join(monthly_quantiles, by = "month") |>
  mutate(
    top01 = mktcap >= q_top01,
    top05 = mktcap >= q_top05,
    top10 = mktcap >= q_top10,
    top25 = mktcap >= q_top25,
    .groups = "drop") |>
  group_by(month) |>
  summarize(
    `Largest 1% of stocks` = sum(mktcap[top01]/total_market_cap),
    `Largest 5% of stocks` = sum(mktcap[top05]/total_market_cap),
    `Largest 10% of stocks` = sum(mktcap[top10]/total_market_cap),
    `Largest 25% of stocks` = sum(mktcap[top25]/total_market_cap),
    .groups = "drop") |>
  pivot_longer(cols = -month) |>
  collect() |>
  mutate(name = factor(name, levels = c(
    "Largest 1% of stocks", "Largest 5% of stocks",
    "Largest 10% of stocks", "Largest 25% of stocks"
  ))) |>
  ggplot(aes(
    x = month, 
    y = value, 
    color = name,
    linetype = name)) +
  geom_line() +
  scale_y_continuous(labels = percent, limits = c(0, 1)) +
  labs(
    x = NULL, y = NULL, color = NULL, linetype = NULL,
    title = "Percentage of total market capitalization in largest stocks")

crsp_monthly |>
  group_by(month) |>
  mutate(
    top01 = if_else(mktcap >= quantile(mktcap, 0.99), 1, 0),
    top05 = if_else(mktcap >= quantile(mktcap, 0.95), 1, 0),
    top10 = if_else(mktcap >= quantile(mktcap, 0.90), 1, 0),
    top25 = if_else(mktcap >= quantile(mktcap, 0.75), 1, 0)
  ) |>
  summarize(
    total_market_cap =  sum(mktcap),
    `Largest 1% of stocks` = sum(mktcap[top01 == 1]) / total_market_cap,
    `Largest 5% of stocks` = sum(mktcap[top05 == 1]) / total_market_cap,
    `Largest 10% of stocks` = sum(mktcap[top10 == 1]) / total_market_cap,
    `Largest 25% of stocks` = sum(mktcap[top25 == 1]) / total_market_cap,
    .groups = "drop"
  ) |>
  select(-total_market_cap) |> 
  pivot_longer(cols = -month) |>
  mutate(name = factor(name, levels = c(
    "Largest 1% of stocks", "Largest 5% of stocks",
    "Largest 10% of stocks", "Largest 25% of stocks"
  ))) |>
  ggplot(aes(
    x = month, 
    y = value, 
    color = name,
    linetype = name)) +
  geom_line() +
  scale_y_continuous(labels = percent, limits = c(0, 1)) +
  labs(
    x = NULL, y = NULL, color = NULL, linetype = NULL,
    title = "Percentage of total market capitalization in largest stocks"
  )

The next code chunk is the same.

crsp_monthly |>
  group_by(month, exchange) |>
  summarize(mktcap = sum(mktcap),
            .groups = "drop_last") |>
  mutate(share = mktcap / sum(mktcap)) |>
  ggplot(aes(
    x = month, 
    y = share, 
    fill = exchange, 
    color = exchange)) +
  geom_area(
    position = "stack",
    stat = "identity",
    alpha = 0.5
  ) +
  geom_line(position = "stack") +
  scale_y_continuous(labels = percent) +
  labs(
    x = NULL, y = NULL, fill = NULL, color = NULL,
    title = "Share of total market capitalization per listing exchange"
  )

dbplyr
Original

create_summary <- function(data, column_name) {
  data |>
    select(value = {{ column_name }}) |>
    summarize(
      mean = mean(value),
      sd = sd(value),
      min = min(value),
      q05 = quantile(value, 0.05),
      q50 = quantile(value, 0.50),
      q95 = quantile(value, 0.95),
      max = max(value),
      n = n()
    )
}

crsp_monthly |>
  filter(month == max(month)) |>
  group_by(exchange) |>
  create_summary(mktcap) |>
  arrange(exchange) |>
  union_all(crsp_monthly |>
            filter(month == max(month)) |>
            create_summary(mktcap) |>
            mutate(exchange = "Overall")) |>
  collect()

Adding missing grouping variables: `exchange`

exchange	mean	sd	min	q05	q50	q95	max	n
AMEX	414.7685	2180.64	7.56557	12.55276	75.83695	1218.314	25718.89	145
NASDAQ	8650.8801	90037.57	7.00553	29.30835	429.09850	18780.584	2902368.14	2779
NYSE	17857.7021	48618.73	23.91300	194.98000	3433.58503	80748.492	472941.07	1395
Other	13906.2461	NA	13906.24613	13906.24613	13906.24613	13906.246	13906.25	1
Overall	11348.6893	77458.26	7.00553	34.31473	795.59922	40647.106	2902368.14	4320

create_summary <- function(data, column_name) {
  data |>
    select(value = {{ column_name }}) |>
    summarize(
      mean = mean(value),
      sd = sd(value),
      min = min(value),
      q05 = quantile(value, 0.05),
      q50 = quantile(value, 0.50),
      q95 = quantile(value, 0.95),
      max = max(value),
      n = n()
    )
}

crsp_monthly |>
  filter(month == max(month)) |>
  group_by(exchange) |>
  create_summary(mktcap) |>
  add_row(crsp_monthly |>
            filter(month == max(month)) |>
            create_summary(mktcap) |>
            mutate(exchange = "Overall"))

dbplyr
Original

get_breaks <- function(data, n_portfolios, exchanges) {
  
  ports <- 0:n_portfolios
  ports_sql <- sql(paste0("[", paste(ports, collapse = ", "), "]"))
  breaks <- seq(0, 1, 1/n_portfolios)
  breaks_sql <- sql(paste0("[", paste(breaks, collapse = ", "), "]"))
  
  data |>
    filter(exchange %in% exchanges) |>
    summarize(portfolio = ports_sql,
              breaks = quantile_cont(mktcap_lag, breaks_sql),
              .groups = "keep") %>%
    mutate(portfolio = unnest(portfolio),
           breaks = unnest(breaks)) |>
    group_by(month) |>
    window_order(portfolio) |>
    mutate(port_max = if_else(portfolio == n_portfolios, Inf, breaks),
           port_min = if_else(portfolio == 1, -Inf, lag(breaks))) |>
    filter(portfolio > 0) |>
    select(month, portfolio, port_max, port_min) |>
    ungroup() |>
    compute()
}

assign_portfolio <- function(data, n_portfolios, exchanges) {
  breaks <- get_breaks(data, n_portfolios, exchanges)
  
  data |>
    inner_join(breaks, 
               join_by(month,
                       mktcap_lag >= port_min,
                       mktcap_lag < port_max)) |>
    select(-port_min, -port_max) 
}

assign_portfolio <- function(n_portfolios,
                             exchanges,
                             data) {
  breakpoints <- data |>
    filter(exchange %in% exchanges) |>
    reframe(breakpoint = quantile(
      mktcap_lag,
      probs = seq(0, 1, length.out = n_portfolios + 1),
      na.rm = TRUE
    )) |>
    pull(breakpoint) |>
    as.numeric()

  assigned_portfolios <- data |>
    mutate(portfolio = findInterval(mktcap_lag,
      breakpoints,
      all.inside = TRUE
    )) |>
    pull(portfolio)
  return(assigned_portfolios)
}

Here I tweak the code so that the data argument comes first in compute_portfolio_returns(). This facilitates a pipe-based approach, which I use below in the dbplyr code.

With the dbplyr approach we have no weighted.mean() function to use. Instead, we calculate the weighted mean “by hand” and set weight = if_else(value_weighted, mktcap_lag, 1) (i.e., weight is mktcap_lag for value-weighted returns, but one otherwise).

dbplyr
Original

compute_portfolio_returns <- 
  function(data = crsp_monthly,
           n_portfolios = 10,
           exchanges = c("NYSE", "NASDAQ", "AMEX"),
           value_weighted = TRUE) {
  
  exchanges_str <- str_flatten(exchanges, ", ")

  data |>
    group_by(month) |>
    assign_portfolio(n_portfolios, exchanges) |>
    group_by(month, portfolio) |>
    filter(!is.na(ret), !is.na(mktcap_lag)) |>
    mutate(weight = if_else(value_weighted, mktcap_lag, 1)) |>
    summarize(
       ret = sum(ret_excess * weight, na.rm = TRUE)/
                    sum(weight, na.rm = TRUE),
      .groups = "drop") |>
    mutate(portfolio = case_when(portfolio == min(portfolio) ~ "min",
                                 portfolio == max(portfolio) ~ "max",
                                 TRUE ~ "other")) |>
    filter(portfolio %in% c("min", "max")) |>
    pivot_wider(names_from = portfolio, values_from = ret) |>
    mutate(size_premium = min - max) |>
    summarize(size_premium = mean(size_premium)) |>
    mutate(exchanges = exchanges_str) |>
    select(exchanges, size_premium) |>
    collect()
}

compute_portfolio_returns <- 
  function(data = crsp_monthly,
           n_portfolios = 10,
           exchanges = c("NYSE", "NASDAQ", "AMEX"),
           value_weighted = TRUE) {
  
  data |>
    group_by(month) |>
    mutate(portfolio = assign_portfolio(
      n_portfolios = n_portfolios,
      exchanges = exchanges,
      data = pick(everything())
    )) |>
    group_by(month, portfolio) |>
    summarize(
      ret = if_else(value_weighted,
        weighted.mean(ret_excess, mktcap_lag),
        mean(ret_excess)
      ),
      .groups = "drop_last"
    ) |>
    summarize(size_premium = ret[portfolio == min(portfolio)] -
      ret[portfolio == max(portfolio)]) |>
    summarize(size_premium = mean(size_premium))
}

dbplyr
Original

ret_all <- 
  crsp_monthly |>
  compute_portfolio_returns(n_portfolios = 2,
                            exchanges = c("NYSE", "NASDAQ", "AMEX"),
                            value_weighted = TRUE) 

ret_nyse <- 
  crsp_monthly |>
  compute_portfolio_returns(n_portfolios = 2,
                            exchanges = "NYSE",
                            value_weighted = TRUE)

bind_rows(ret_all, ret_nyse)

exchanges	size_premium
NYSE, NASDAQ, AMEX	0.000975
NYSE	0.001664

ret_all <- compute_portfolio_returns(
  n_portfolios = 2,
  exchanges = c("NYSE", "NASDAQ", "AMEX"),
  value_weighted = TRUE,
  data = crsp_monthly
)

ret_nyse <- compute_portfolio_returns(
  n_portfolios = 2,
  exchanges = "NYSE",
  value_weighted = TRUE,
  data = crsp_monthly
)

tibble(
  Exchanges = c("NYSE, NASDAQ & AMEX", "NYSE"),
  Premium = as.numeric(c(ret_all, ret_nyse)) * 100
)

The code for p_hacking_setup is common across the two approaches.

p_hacking_setup <- 
  expand_grid(
    n_portfolios = c(2, 5, 10),
    exchanges = list("NYSE", c("NYSE", "NASDAQ", "AMEX")),
    value_weighted = c(TRUE, FALSE),
    data = parse_exprs(
     'crsp_monthly; 
      crsp_monthly |> filter(industry != "Finance");
      crsp_monthly |> filter(month < "1990-06-01");
      crsp_monthly |> filter(month >="1990-06-01")'))

Because DuckDB will use multiple cores without explicit instructions, I don’t need to set up multisession with the dbplyr approach.

dbplyr
Original

p_hacking_setup <- 
  p_hacking_setup |>
  mutate(size_premium = 
           pmap(.l = list(data, n_portfolios, exchanges, value_weighted),
                .f = ~ compute_portfolio_returns(
                  data = eval_tidy(..1),
                  n_portfolios = ..2,
                  exchanges = ..3,
                  value_weighted = ..4)))

plan(multisession, workers = availableCores())

p_hacking_setup <- p_hacking_setup |>
  mutate(size_premium = future_pmap(
    .l = list(
      n_portfolios,
      exchanges,
      value_weighted,
      data
    ),
    .f = ~ compute_portfolio_returns(
      n_portfolios = ..1,
      exchanges = ..2,
      value_weighted = ..3,
      data = eval_tidy(..4)
    )
  ))

dbplyr
Original

p_hacking_results <- 
  p_hacking_setup |>
  select(-exchanges) |>
  mutate(data = map_chr(data, deparse)) |>
  unnest(size_premium) |>
  arrange(desc(size_premium)) |>
  filter(!is.na(size_premium))

p_hacking_results

n_portfolios	value_weighted	data	exchanges	size_premium
10	FALSE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0186423
10	FALSE	filter(crsp_monthly, industry != “Finance”)	NYSE, NASDAQ, AMEX	0.0182132
10	FALSE	crsp_monthly	NYSE, NASDAQ, AMEX	0.0163249
10	FALSE	filter(crsp_monthly, month < “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0139120
10	TRUE	filter(crsp_monthly, industry != “Finance”)	NYSE, NASDAQ, AMEX	0.0115089
10	TRUE	filter(crsp_monthly, month < “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0109079
10	TRUE	crsp_monthly	NYSE, NASDAQ, AMEX	0.0102892
10	TRUE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0096950
5	FALSE	filter(crsp_monthly, industry != “Finance”)	NYSE, NASDAQ, AMEX	0.0092400
5	FALSE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0089425
5	FALSE	crsp_monthly	NYSE, NASDAQ, AMEX	0.0081521
5	FALSE	filter(crsp_monthly, month < “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0073290
10	FALSE	filter(crsp_monthly, industry != “Finance”)	NYSE	0.0053042
10	FALSE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE	0.0049717
10	FALSE	crsp_monthly	NYSE	0.0048490
5	TRUE	filter(crsp_monthly, month < “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0048375
10	FALSE	filter(crsp_monthly, month < “1990-06-01”)	NYSE	0.0047213
5	TRUE	filter(crsp_monthly, industry != “Finance”)	NYSE, NASDAQ, AMEX	0.0040888
5	FALSE	filter(crsp_monthly, industry != “Finance”)	NYSE	0.0039463
5	FALSE	filter(crsp_monthly, month < “1990-06-01”)	NYSE	0.0037026
5	TRUE	crsp_monthly	NYSE, NASDAQ, AMEX	0.0036889
5	FALSE	crsp_monthly	NYSE	0.0036439
5	FALSE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE	0.0035875
2	FALSE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0032007
2	FALSE	filter(crsp_monthly, industry != “Finance”)	NYSE, NASDAQ, AMEX	0.0031721
2	FALSE	crsp_monthly	NYSE, NASDAQ, AMEX	0.0027854
2	FALSE	filter(crsp_monthly, industry != “Finance”)	NYSE	0.0026137
5	TRUE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0025858
10	TRUE	filter(crsp_monthly, month < “1990-06-01”)	NYSE	0.0025788
2	FALSE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE	0.0025544
2	FALSE	filter(crsp_monthly, month < “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0023529
2	FALSE	crsp_monthly	NYSE	0.0023461
5	TRUE	filter(crsp_monthly, month < “1990-06-01”)	NYSE	0.0021397
2	FALSE	filter(crsp_monthly, month < “1990-06-01”)	NYSE	0.0021292
2	TRUE	filter(crsp_monthly, month < “1990-06-01”)	NYSE	0.0021229
10	TRUE	crsp_monthly	NYSE	0.0018741
10	TRUE	filter(crsp_monthly, industry != “Finance”)	NYSE	0.0018642
2	TRUE	filter(crsp_monthly, industry != “Finance”)	NYSE	0.0016826
2	TRUE	crsp_monthly	NYSE	0.0016640
5	TRUE	crsp_monthly	NYSE	0.0016331
5	TRUE	filter(crsp_monthly, industry != “Finance”)	NYSE	0.0015835
2	TRUE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE	0.0012233
10	TRUE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE	0.0011973
5	TRUE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE	0.0011465
2	TRUE	filter(crsp_monthly, month < “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0011415
2	TRUE	crsp_monthly	NYSE, NASDAQ, AMEX	0.0009750
2	TRUE	filter(crsp_monthly, industry != “Finance”)	NYSE, NASDAQ, AMEX	0.0008569
2	TRUE	filter(crsp_monthly, month >= “1990-06-01”)	NYSE, NASDAQ, AMEX	0.0008150

p_hacking_results <- p_hacking_setup |>
  mutate(data = map_chr(data, deparse)) |>
  unnest(size_premium) |>
  arrange(desc(size_premium))

p_hacking_results

smb_mean <- 
  factors_ff_monthly |> 
  summarize(mean(smb, na.rm = TRUE)) |>
  pull()

p_hacking_results |>
  ggplot(aes(x = size_premium)) +
  geom_histogram(bins = nrow(p_hacking_results)) +
  labs(
    x = NULL, y = NULL,
    title = "Distribution of size premiums for different sorting choices"
  ) +
  geom_vline(aes(xintercept = smb_mean),
    linetype = "dashed"
  ) +
  scale_x_continuous(labels = percent)

Finally, we close the database.

dbDisconnect(tidy_finance, shutdown = TRUE)