5 Graph Tables, Add Labels, Make Notes

5.1 Use Pipes to Summarize Data

import polars as pl
import plotnine_polars as p9
from plotnine_polars import aes, position_jitter
from socviz_pl import load_data, theme_socviz
from plotnine._mpl.gridspec import p9GridSpec

# Compatibility shim: when p9GridSpec has no attribute named
# `locally_modified_subplot_params`, install a stub method that returns an
# empty list.
# NOTE(review): presumably works around a plotnine/matplotlib version
# mismatch -- confirm which versions need it and remove once unnecessary.
if not hasattr(p9GridSpec, "locally_modified_subplot_params"):
    p9GridSpec.locally_modified_subplot_params = lambda self: []

# Register theme_socviz() through theme_set() before any plot is built.
p9.theme_set(theme_socviz())

# GSS sample used for the examples in section 5.1.
gss_sm = load_data("gss_sm")

# Religion categories in the order used for the faceted plot's axis limits;
# "NA" is the label substituted for missing religion values below.
religion_levels = ["Protestant", "Catholic", "Jewish", "None", "Other", "NA"]

# Count respondents for each region/religion pair, then express every count
# as a share of its region (freq) and as a rounded percentage (pct).
rel_by_region = (
    gss_sm
    .with_columns(pl.col("religion").fill_null("NA"))
    .group_by(["bigregion", "religion"])
    .agg(pl.len().alias("n"))
    .with_columns(
        (pl.col("n") / pl.col("n").sum().over("bigregion")).alias("freq")
    )
    .with_columns(
        (pl.col("freq") * 100).round(0).alias("pct")
    )
    .sort(["bigregion", "religion"])
)

rel_by_region

shape: (24, 5)

bigregion	religion	n	freq	pct
str	str	u32	f64	f64
"Midwest"	"Catholic"	172	0.247482	25.0
"Midwest"	"Jewish"	3	0.004317	0.0
"Midwest"	"NA"	5	0.007194	1.0
"Midwest"	"None"	157	0.225899	23.0
"Midwest"	"Other"	33	0.047482	5.0
…	…	…	…	…
"West"	"Jewish"	10	0.015823	2.0
"West"	"NA"	1	0.001582	0.0
"West"	"None"	180	0.28481	28.0
"West"	"Other"	48	0.075949	8.0
"West"	"Protestant"	238	0.376582	38.0

# Sanity check: the rounded percentages should add up to roughly 100 within
# each region. group_by() does not guarantee the order of the groups it
# returns, so sort explicitly to keep the displayed table reproducible.
(
    rel_by_region
    .group_by("bigregion")
    .agg(sum=pl.col("pct").sum())
    .sort("bigregion")
)

shape: (4, 2)

bigregion	sum
str	f64
"Midwest"	101.0
"Northeast"	100.0
"South"	100.0
"West"	101.0

# Dodged bars: regions on the x axis, one bar per religion within each.
(
    rel_by_region
    .ggplot(
        aes(
            x="bigregion",
            y="pct",
            fill="religion",
        )
    )
    .geom_col(position="dodge2")
    .labs(
        fill="Religion",
        x="Region",
        y="Percent",
    )
    .add_theme(legend_position="top")
)

# Faceted alternative: one panel per region, flipped so religions are listed
# down the side. The fill legend is dropped and the axis title for religion is
# blanked; religion_levels supplies the limits of the discrete scale.
(
    rel_by_region
    .ggplot(
        aes(
            y="pct",
            x="religion",
            fill="religion",
        )
    )
    .geom_col()
    .coord_flip()
    .labs(
        fill="Religion",
        x=None,
        y="Percent",
    )
    .add_guides(fill="none")
    .scale_x_discrete(limits=religion_levels)
    .facet_grid(cols="bigregion")
)

5.2 More Geoms

organdata = load_data("organdata")

# Country names ranked by their median donor rate (ascending sort on "med").
country_order = (
    organdata
    .group_by("country")
    .agg(med=pl.col("donors").median())
    .sort("med")["country"]
    .to_list()
)

# Re-type country as an Enum whose categories follow the ranking above.
organdata = organdata.with_columns(
    country=pl.col("country").cast(pl.Enum(country_order))
)

# Preview ten sampled rows (fixed seed), leaving out the "pop" column.
organdata.select(pl.exclude("pop")).sample(n=10, seed=123)

shape: (10, 20)

country	year	donors	pop_dens	gdp	gdp_lag	health	health_lag	pubhealth	roads	cerebvas	assault	external	txp_pop	world	opt	consent_law	consent_practice	consistent	ccode
enum	date	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	str	str	str	str	str	str
"Netherlands"	1997-01-01	14.4	37.589694	23753.0	22541.0	1936.0	1878.0	5.5	74.498751	563.0	13.0	282.0	0.704631	"SocDem"	"In"	"Informed"	"Informed"	"Yes"	"Neth"
"Sweden"	2000-01-01	10.9	1.971731	26574.0	25099.0	2243.0	2119.0	7.2	65.374211	555.0	10.0	352.0	0.676285	"SocDem"	"Out"	"Presumed"	"Informed"	"No"	"Swe"
"Netherlands"	null	null	null	null	28983.0	2831.0	2643.0	null	null	null	null	null	null	"SocDem"	"In"	"Informed"	"Informed"	"Yes"	"Neth"
"Italy"	1999-01-01	13.7	19.129887	23729.0	23291.0	1853.0	1800.0	5.6	115.064358	627.0	11.0	343.0	0.451029	"Corporatist"	"In"	"Presumed"	"Informed"	"No"	"Ita"
"Australia"	null	null	null	null	28168.0	2754.0	2629.0	null	null	null	null	null	null	"Liberal"	"In"	"Informed"	"Informed"	"Yes"	"Oz"
"Netherlands"	null	null	36.002889	17707.0	16580.0	1419.0	1320.0	5.4	92.027822	649.0	9.0	310.0	0.735688	"SocDem"	"In"	"Informed"	"Informed"	"Yes"	"Neth"
"Spain"	2002-01-01	33.7	8.275658	21592.0	20864.0	1646.0	1567.0	5.4	127.692602	416.0	11.0	345.0	0.668673	null	"Out"	"Presumed"	"Informed"	"No"	"Spa"
"United Kingdom"	1995-01-01	14.4	23.879215	19998.0	18994.0	1393.0	1331.0	5.8	64.908198	718.0	10.0	279.0	0.706836	"Liberal"	"In"	"Informed"	"Informed"	"Yes"	"UK"
"United States"	1991-01-01	17.89	2.627258	23443.0	23038.0	2957.0	2738.0	5.2	164.075563	457.0	103.0	565.0	1.083085	"Liberal"	"In"	"Informed"	"Informed"	"Yes"	"USA"
"Netherlands"	2002-01-01	12.6	38.885143	28983.0	28756.0	2643.0	2455.0	5.5	61.118336	500.0	9.0	258.0	0.681157	"SocDem"	"In"	"Informed"	"Informed"	"Yes"	"Neth"

# Scatter of donor rate against year, all countries pooled together.
(
    organdata
    .ggplot(
        aes(
            x="year",
            y="donors",
        )
    )
    .geom_point()
)

# One line per country, each country in its own panel (four columns).
(
    organdata
    .ggplot(
        aes(
            x="year",
            y="donors",
        )
    )
    .geom_line(
        aes(group="country")
    )
    .facet_wrap("country", ncol=4)
)

# Boxplot of donor rates for each country, countries along the x axis.
(
    organdata
    .ggplot(
        aes(
            x="country",
            y="donors",
        )
    )
    .geom_boxplot()
)

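In R, the horizontal version of this plot is made by simply swapping the x and y mappings in aes(). That approach does not work in plotnine, so the examples below keep country on x and add .coord_flip() to get the same horizontal layout.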

# Same boxplot as before, with the coordinates flipped so the country names
# run down the side of the plot.
(
    organdata
    .ggplot(
        aes(
            x="country",
            y="donors",
        )
    )
    .geom_boxplot()
    .coord_flip()
)

# Flipped boxplot with the redundant category-axis title removed. Country is
# mapped to x (and then flipped), so it is the x label that must be blanked;
# the donors axis keeps its title. This matches the other flipped plot in this
# file, which blanks x for its category axis.
# NOTE(review): the row sort is kept from the original; the axis order
# presumably comes from the Enum cast of "country" rather than from row
# order -- confirm.
(
    organdata
    .sort("donors")
    .ggplot(aes(y="donors", x="country"))
    .geom_boxplot()
    .labs(x=None)
    .coord_flip()
)

# Same layout with the boxes filled by the "world" column; the legend sits
# above the panel and, again, only the country axis title is removed.
(
    organdata
    .ggplot(aes(y="donors", x="country", fill="world"))
    .geom_boxplot()
    .labs(x=None, fill="World")
    .add_theme(legend_position="top")
    .coord_flip()
)

# Points instead of boxes: donor rate on x, country on y, coloured by "world".
(
    organdata
    .ggplot(
        aes(
            x="donors",
            y="country",
            color="world",
        )
    )
    .geom_point()
    .labs(y=None)
    .add_theme(legend_position="top")
)

# The same data drawn with geom_jitter() at its default settings.
(
    organdata
    .ggplot(
        aes(
            x="donors",
            y="country",
            color="world",
        )
    )
    .geom_jitter()
    .labs(y=None)
    .add_theme(legend_position="top")
)

# Jitter again, this time with an explicit position_jitter(height=0.15).
(
    organdata
    .ggplot(
        aes(
            x="donors",
            y="country",
            color="world",
        )
    )
    .geom_jitter(
        position=position_jitter(height=0.15)
    )
    .labs(y=None)
    .add_theme(legend_position="top")
)

5.3 Grouped Summaries

# Variables summarised per country.
cols = ["donors", "gdp", "health", "roads", "cerebvas"]

# Mean and standard deviation of each variable for every
# consent_law/country pair, ordered by mean donor rate.
by_country = (
    organdata
    .group_by(["consent_law", "country"])
    .agg(
        [
            pl.col(cols).mean().name.suffix("_mean"),
            pl.col(cols).std().name.suffix("_sd"),
        ]
    )
    .sort("donors_mean")
)

# Rebuild the country Enum so its categories follow the sorted row order.
by_country = by_country.with_columns(
    country=pl.col("country").cast(
        pl.Enum(by_country.get_column("country").to_list())
    )
)

by_country

shape: (17, 12)

consent_law	country	donors_mean	gdp_mean	health_mean	roads_mean	cerebvas_mean	donors_sd	gdp_sd	health_sd	roads_sd	cerebvas_sd
str	enum	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64
"Informed"	"Australia"	10.635	22178.538462	1957.5	104.875728	557.692308	1.142808	3958.505665	481.627649	14.327316	82.698634
"Presumed"	"Italy"	11.1	21554.153846	1757.0	121.942937	712.153846	4.277	2781.30898	271.237903	10.157891	118.032373
"Informed"	"Germany"	13.041667	22163.230769	2348.75	112.788734	706.769231	0.611196	2501.344177	377.227474	25.911094	126.03515
"Informed"	"Denmark"	13.091667	23722.307692	2054.071429	101.636346	640.692308	1.468121	3895.685292	371.361417	12.421001	46.271634
"Presumed"	"Sweden"	13.125	22415.461538	1951.357143	72.345753	595.307692	1.753503	3213.468391	372.978986	13.246919	49.684647
…	…	…	…	…	…	…	…	…	…	…	…
"Informed"	"Ireland"	19.791667	20824.384615	1479.928571	117.774245	704.692308	2.478437	6669.580078	565.552618	10.761587	87.203196
"Informed"	"United States"	19.981667	29211.769231	3988.285714	155.167832	444.384615	1.325367	4571.159958	864.931961	8.35381	16.049603
"Presumed"	"Belgium"	21.9	22499.615385	1958.357143	154.695038	593.846154	1.935787	3170.583636	405.114154	20.556129	55.249202
"Presumed"	"Austria"	23.525	23875.846154	1875.357143	149.865413	768.846154	2.415904	3342.88944	296.897964	30.281692	119.642416
"Presumed"	"Spain"	28.108333	16933.0	1289.071429	161.1143	654.769231	4.963038	2888.342547	265.896008	35.251103	138.650132

# Dot plot of mean donor rate by country, coloured by consent law.
(
    by_country
    .ggplot(
        aes(x="donors_mean", y="country", color="consent_law")
    )
    .geom_point(size=3)
    .labs(
        color="Consent Law",
        x="Donor Procurement Rate",
        y=None,
    )
    .add_theme(legend_position="top")
)

# The same summary split into one panel per consent law, stacked in a single
# column with free y scales.
(
    by_country
    .ggplot(
        aes(x="donors_mean", y="country")
    )
    .geom_point(size=3)
    .facet_wrap(
        "consent_law",
        scales="free_y",
        ncol=1,
    )
    .labs(
        x="Donor Procurement Rate",
        y=None,
    )
)

Plotnine’s geom_pointrange() expects vertical ranges, so the horizontal version uses geom_errorbarh() plus points.

# Mean donor rate per country with a horizontal bar spanning one standard
# deviation either side; the points are drawn after (on top of) the bars.
(
    by_country
    .ggplot(
        aes(x="donors_mean", y="country")
    )
    .geom_errorbarh(
        aes(
            xmin="donors_mean - donors_sd",
            xmax="donors_mean + donors_sd",
        ),
        height=0.2,
    )
    .geom_point(size=2)
    .labs(
        x="Donor Procurement Rate",
        y=None,
    )
)

5.4 Label Outliers

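Plotnine has no direct equivalent of geom_text_repel(), but geom_text() accepts an adjust_text argument that hands label placement to the adjustText package. This gives us a close approximation. The first two plots below use plain geom_text(); the later ones pass adjust_text.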

# GDP against health spending, with text labels only for the countries whose
# mean GDP exceeds 25,000.
(
    by_country
    .ggplot(
        aes(x="gdp_mean", y="health_mean")
    )
    .geom_point()
    .geom_text(
        mapping=aes(label="country"),
        data=by_country.filter(pl.col("gdp_mean").gt(25_000)),
    )
)

# Label only the outlying countries: mean GDP above 25,000, mean health
# spending below 1,500, or Belgium. The health condition must be "<": with
# ">" almost every country would be labelled, and the adjust_text version of
# this plot later in the section already uses "< 1_500".
(
    by_country
    .ggplot(aes(x="gdp_mean", y="health_mean"))
    .geom_point()
    .geom_text(
        data=by_country.filter(
            (pl.col("gdp_mean") > 25_000) |
            (pl.col("health_mean") < 1_500) |
            (pl.col("country").is_in(["Belgium"]))
        ),
        mapping=aes(label="country"),
    )
)

# Road deaths against donor rate with every country labelled; adjust_text is
# given arrow properties with a plain "-" arrow style.
(
    by_country
    .ggplot(
        aes(x="roads_mean", y="donors_mean")
    )
    .geom_point()
    .geom_text(
        mapping=aes(label="country"),
        adjust_text=dict(arrowprops=dict(arrowstyle="-")),
    )
)

# Same plot again (this cell repeats the previous one).
(
    by_country
    .ggplot(
        aes(x="roads_mean", y="donors_mean")
    )
    .geom_point()
    .geom_text(
        mapping=aes(label="country"),
        adjust_text=dict(arrowprops=dict(arrowstyle="-")),
    )
)

# High-GDP labels, this time placed via adjust_text.
(
    by_country
    .ggplot(
        aes(x="gdp_mean", y="health_mean")
    )
    .geom_point()
    .geom_text(
        mapping=aes(label="country"),
        data=by_country.filter(pl.col("gdp_mean").gt(25_000)),
        adjust_text=dict(arrowprops=dict(arrowstyle="-")),
    )
)

# Labels for countries with high GDP, low health spending, or named Belgium,
# again placed via adjust_text.
(
    by_country
    .ggplot(
        aes(x="gdp_mean", y="health_mean")
    )
    .geom_point()
    .geom_text(
        mapping=aes(label="country"),
        data=by_country.filter(
            pl.col("gdp_mean").gt(25_000)
            | pl.col("health_mean").lt(1_500)
            | pl.col("country").eq("Belgium")
        ),
        adjust_text=dict(arrowprops=dict(arrowstyle="-")),
    )
)

# Flag the observations to highlight: country code "Ita" or "Spa" with a year
# later than 1998.
my_organdata = organdata.with_columns(
    (
        pl.col("ccode").is_in(["Ita", "Spa"])
        & pl.col("year").dt.year().gt(1998)
    ).alias("ind")
)

# Colour the points by the flag, label only the flagged rows with their
# country code, and drop the colour legend.
(
    my_organdata
    .ggplot(
        aes(x="roads", y="donors", color="ind")
    )
    .geom_point()
    .geom_text(
        mapping=aes(label="ccode"),
        data=my_organdata.filter(pl.col("ind")),
        adjust_text=dict(arrowprops=dict(arrowstyle="-")),
    )
    .add_guides(color="none")
)

elections_historic = load_data("elections_historic")

# Text used for the plot's title, subtitle and axis labels.
p_title = "Popular Vote and Electoral College Shares"
p_subtitle = "1824-2024"
x_label = "Winner's share of Popular Vote"
y_label = "Winner's share of Electoral College Votes"

# Winner's popular-vote share against electoral-college share. Grey reference
# lines mark 0.5 on each axis, every point is labelled via adjust_text, and
# both axes format their breaks as whole-number percentages.
(
    elections_historic
    .ggplot(
        aes(x="popular_pct", y="ec_pct")
    )
    .geom_hline(yintercept=0.5, color="#CCCCCC", size=1.4)
    .geom_vline(xintercept=0.5, color="#CCCCCC", size=1.4)
    .geom_point(size=0.4)
    .geom_text(
        mapping=aes(label="winner_label"),
        size=4,
        adjust_text=dict(
            arrowprops=dict(arrowstyle="-", color="gray"),
            min_arrow_len=1,
            force_static=(0.01, 0.01),
            force_text=(0.01, 0.01),
        ),
    )
    .scale_x_continuous(labels=lambda breaks: [f"{b:.0%}" for b in breaks])
    .scale_y_continuous(labels=lambda breaks: [f"{b:.0%}" for b in breaks])
    .labs(
        title=p_title,
        subtitle=p_subtitle,
        x=x_label,
        y=y_label,
    )
)

5.5 Add Annotations

The original R example also shows I() for placing annotations with relative coordinates inside the plotting area. There does not appear to be a direct I() equivalent for annotate() in plotnine, so these examples use data coordinates.

# Text annotation positioned in data coordinates (x=91, y=33), left-aligned.
(
    organdata
    .ggplot(
        aes(x="roads", y="donors")
    )
    .geom_point()
    .annotate(
        "text",
        label="A surprisingly high\nrecovery rate.",
        x=91,
        y=33,
        ha="left",
        size=8,
        lineheight=0.9,
    )
)

# A translucent red rectangle over x 125-155 / y 30-35, with the text note
# moved to x=157.
(
    organdata
    .ggplot(
        aes(x="roads", y="donors")
    )
    .geom_point()
    .annotate(
        "rect",
        xmin=125, xmax=155,
        ymin=30, ymax=35,
        fill="red",
        alpha=0.2,
    )
    .annotate(
        "text",
        label="A surprisingly high\nrecovery rate.",
        x=157,
        y=33,
        ha="left",
        size=8,
        lineheight=0.9,
    )
)

5.6 Understanding Scales, Guides, and Themes

# Baseline: road deaths against donor rate, coloured by "world".
(
    organdata
    .ggplot(
        aes(x="roads", y="donors", color="world")
    )
    .geom_point()
)

# Log10 x scale, plus hand-picked y breaks with spelled-out labels.
(
    organdata
    .ggplot(
        aes(x="roads", y="donors", color="world")
    )
    .geom_point()
    .scale_x_log10()
    .scale_y_continuous(
        labels=["Five", "Fifteen", "Twenty Five"],
        breaks=[5, 15, 25],
    )
)

# Custom legend entries for the colour scale, plus axis and legend titles.
(
    organdata
    .ggplot(
        aes(x="roads", y="donors", color="world")
    )
    .geom_point()
    .scale_color_discrete(
        labels=[
            "Corporatist",
            "Liberal",
            "Social Democratic",
            "Unclassified",
        ]
    )
    .labs(
        color="Welfare State",
        x="Road Deaths",
        y="Donor Procurement",
    )
)

# Same axis titles with the colour legend switched off.
(
    organdata
    .ggplot(
        aes(x="roads", y="donors", color="world")
    )
    .geom_point()
    .labs(
        x="Road Deaths",
        y="Donor Procurement",
    )
    .add_guides(color="none")
)