"""Loudness & imperative-density: per-metric 2+1 panel charts.
Four metrics (ALL CAPS density, CAPS-imperative density, imperative-marker density,
imperative-sentence share). Each metric gets its own chart with a 2+1 layout:
[snapshot | cumulative count-weighted] side-by-side on top, cumulative absolute
count full-width below.
Snapshot panel: per-version mean of per-file rates (a within-version
descriptive statistic — fine even when individual versions have few files).
Cumulative density panel: count-weighted running rate
(`cumulative_count_by_version(num_count, n_tokens|n_sents)` — `Σ feature /
Σ document_size` ×100). The latest-version endpoint equals the corpus-wide
rate published in the canonical `HEADLINE` sheet by construction.
Cumulative absolute count: running sum of the underlying counts. A flat
cumulative percentage alongside a steeply rising absolute count therefore
means the feature is becoming more prevalent in absolute terms even though
its rate relative to document size is steady.
Both cumulative panels suppress data before the cumulative pool reaches
20 files (v2.1.18 in the current corpus) — below that threshold the running
ratio is dominated by single-file outliers and is not a defensible corpus
claim. Earlier versions exist and contribute to the running state; they
just aren't plotted.
"""
# Cumulative rows are only kept once at least this many files are pooled.
SMALL_N_THRESHOLD = 20

# One tuple per metric, always unpacked positionally in this order:
#   (pct_col, count_col, denom_col, label, slug, unit, color)
LOUDNESS_METRICS = [
    (
        "all_caps_pct", "all_caps_count", "n_tokens",
        "ALL CAPS density", "all-caps", "% of file tokens", "#e15759",
    ),
    (
        "caps_imp_pct", "caps_imp_count", "n_tokens",
        "CAPS imperative density", "caps-imperative", "% of file tokens", "#af7aa1",
    ),
    (
        "mood_marker_pct", "mood_marker_count", "n_tokens",
        "Imperative-marker density (per word)", "imperative-marker",
        "% of file tokens", "#4e79a7",
    ),
    (
        "imperative_sent_pct", "imperative_sent_count", "n_sents",
        "Imperative sentences (per sentence)", "imperative-sentences",
        "% of sentences", "#f28e2c",
    ),
]
# Work on a copy of the rows whose ccVersion is not the empty string.
_has_version = alt_df["ccVersion"] != ""
df_ver = alt_df[_has_version].copy()

# Plot order for the x axis: one row per version, ordered by ccVersion_sort.
_one_row_per_version = df_ver.drop_duplicates("ccVersion")
ver_order_cum = _one_row_per_version.sort_values("ccVersion_sort")["ccVersion"].tolist()
# Snapshot per ccVersion: plain mean of the per-file rate within each version
# (a per-version statistic, not a cumulative one).
snap_frames = [
    df_ver.groupby(["ccVersion", "ccVersion_sort"])[pct_col]
    .mean()
    .reset_index()
    .rename(columns={pct_col: "value"})
    .assign(metric=label, unit=unit)
    for pct_col, _count_col, _denom_col, label, _slug, unit, _color in LOUDNESS_METRICS
]
# Long format: one row per (version, metric).
snap_df = pd.concat(snap_frames, ignore_index=True)
# Cumulative count-weighted rate, one frame per metric, tagged with its label.
cum_mean_frames = [
    cumulative_count_by_version(
        df_ver, count_col, denom_col, pct=True, metric_label=pct_col
    ).assign(label=label)
    for pct_col, count_col, denom_col, label, _slug, _unit, _color in LOUDNESS_METRICS
]
_cum_mean_all = pd.concat(cum_mean_frames, ignore_index=True)
# Keep only rows whose running file pool has reached the threshold.
cum_mean_df = _cum_mean_all[_cum_mean_all["n_files_so_far"] >= SMALL_N_THRESHOLD]
# Cumulative absolute count: running sum of each metric's count column.
count_cols = [count_col for _pct_col, count_col, *_rest in LOUDNESS_METRICS]
count_to_label = {
    count_col: label
    for _pct_col, count_col, _denom_col, label, *_rest in LOUDNESS_METRICS
}
_cum_abs_all = cumulative_by_version(df_ver, count_cols, agg="sum")
# The "metric" column holds count-column names; map them to display labels.
_cum_abs_all["label"] = _cum_abs_all["metric"].map(count_to_label)
# Keep only rows whose running file pool has reached the threshold.
cum_abs_df = _cum_abs_all[_cum_abs_all["n_files_so_far"] >= SMALL_N_THRESHOLD]
def _loudness_version_x(title):
    """Return the x encoding shared by all three panels.

    Versions are nominal categories sorted by ``ver_order_cum``; labels are
    rotated to vertical, set in a small font, and drawn with overlap
    resolution switched off. ``title`` is passed through as the axis title.
    """
    return alt.X(
        "ccVersion:N",
        sort=ver_order_cum,
        title=title,
        axis=alt.Axis(labelAngle=-90, labelLimit=60,
                      labelOverlap=False, labelFontSize=8),
    )


def _loudness_line(frame, color, point_size):
    """Return a line chart over ``frame`` with filled point markers."""
    return alt.Chart(frame).mark_line(
        point=alt.OverlayMarkDef(filled=True, size=point_size),
        strokeWidth=2.0,
        color=color,
    )


def _loudness_block(pct_col, count_col, label, unit, color):
    """Build a single metric's 2+1 panel chart.

    Top row: the per-version snapshot mean next to the cumulative
    count-weighted density. Bottom row: the cumulative absolute count at full
    width. Every panel keeps its own y scale.

    Args:
        pct_col: ``metric`` value that selects this metric's rows in
            ``cum_mean_df``.
        count_col: ``metric`` value that selects this metric's rows in
            ``cum_abs_df``.
        label: Display label; selects this metric's rows in ``snap_df`` and is
            used as the overall chart title.
        unit: y-axis title of the snapshot panel; the density panel appends
            " (count-weighted)" to it.
        color: Line and point colour used in all three panels.

    Returns:
        The vertically concatenated Altair chart.
    """
    # Derive the title annotation from the constant that filters the
    # cumulative frames, so the two cannot drift apart.
    threshold_note = f"n≥{SMALL_N_THRESHOLD}"

    snap_panel = (
        _loudness_line(snap_df[snap_df["metric"] == label], color, point_size=40)
        .encode(
            x=_loudness_version_x(None),
            y=alt.Y("value:Q", title=unit),
            tooltip=[
                alt.Tooltip("ccVersion:N"),
                alt.Tooltip("value:Q", format=".3f", title="snapshot mean"),
                alt.Tooltip("unit:N"),
            ],
        )
        .properties(width=400, height=160, title="snapshot — per-version mean")
    )

    cum_density_panel = (
        _loudness_line(cum_mean_df[cum_mean_df["metric"] == pct_col], color, point_size=30)
        .encode(
            x=_loudness_version_x(None),
            y=alt.Y("value:Q", title=f"{unit} (count-weighted)"),
            tooltip=[
                alt.Tooltip("ccVersion:N"),
                alt.Tooltip("value:Q", format=".3f", title="count-weighted %"),
                alt.Tooltip("num_so_far:Q", format=",.0f", title="Σ count"),
                alt.Tooltip("den_so_far:Q", format=",.0f", title="Σ tokens/sents"),
                alt.Tooltip("n_files_so_far:Q", title="files ≤ V"),
            ],
        )
        .properties(
            width=400,
            height=160,
            title=f"cumulative density (count-weighted, {threshold_note})",
        )
    )

    cum_abs_panel = (
        _loudness_line(cum_abs_df[cum_abs_df["metric"] == count_col], color, point_size=30)
        .encode(
            # Only the bottom panel carries the x-axis title.
            x=_loudness_version_x("ccVersion"),
            y=alt.Y("value:Q", title="cumulative count"),
            tooltip=[
                alt.Tooltip("ccVersion:N"),
                alt.Tooltip("value:Q", format=",.0f", title="cumulative count"),
                alt.Tooltip("n_files_so_far:Q", title="files ≤ V"),
            ],
        )
        .properties(
            width=820,
            height=210,
            title=f"cumulative absolute count ({threshold_note})",
        )
    )

    # The panels plot different quantities (means, rates, raw counts), so the
    # y scale is resolved independently at both concat levels.
    top = alt.hconcat(snap_panel, cum_density_panel).resolve_scale(y="independent")
    return alt.vconcat(top, cum_abs_panel).resolve_scale(y="independent").properties(
        title=alt.TitleParams(label, anchor="start", fontSize=14)
    )
# Render one 2+1 chart per metric: save it under its slug, then show it inline.
for _metric_spec in LOUDNESS_METRICS:
    pct_col, count_col, _denom_col, label, slug, unit, color = _metric_spec
    block = _loudness_block(
        pct_col=pct_col,
        count_col=count_col,
        label=label,
        unit=unit,
        color=color,
    )
    save_chart(block, f"14-loudness-{slug}")
    display(block)