"""Sentence-register per category × class + addressee drilldown — vconcat composite."""
# --- top panel: per-category × per-class distribution ---
# Long-format table: one row per (category, register class) pair. The values
# are read from each category's ["metrics"]["sentence_register"] mapping under
# the keys "<class>_sent_pct" and "<class>_sent_count".
sr_long = pd.DataFrame([
    {
        "category": cat,
        "class": cls,
        "sent_pct": cat_entry["metrics"]["sentence_register"][f"{cls}_sent_pct"],
        "sent_count": cat_entry["metrics"]["sentence_register"][f"{cls}_sent_count"],
    }
    for cat, cat_entry in by_category.items()
    for cls in SENT_REGISTER_CLASSES
])

# Class order drives both the bar order on the y axis and the colour scale;
# the colour range is looked up in the same order so the two stay aligned.
sr_class_domain = SENT_REGISTER_CLASSES
sr_class_range = [SR_CLASS_COLORS[name] for name in sr_class_domain]

# Encoding channels are named up front so the chart assembly below stays short.
_sr_x = alt.X("sent_pct:Q", title="% of sentences")
_sr_y = alt.Y("class:N", sort=sr_class_domain, title=None)
_sr_color = alt.Color(
    "class:N",
    scale=alt.Scale(domain=sr_class_domain, range=sr_class_range),
    legend=None,  # the y axis already labels each class
)
# One facet row per category, with horizontal, left-aligned row labels.
_sr_row = alt.Row(
    "category:N",
    title=None,
    header=alt.Header(labelAngle=0, labelAlign="left"),
)
_sr_tooltip = [
    alt.Tooltip("category:N"),
    alt.Tooltip("class:N"),
    alt.Tooltip("sent_pct:Q", format=".2f", title="sent %"),
    alt.Tooltip("sent_count:Q", title="sentences"),
]

sr_chart = (
    alt.Chart(sr_long)
    .mark_bar()
    .encode(x=_sr_x, y=_sr_y, color=_sr_color, row=_sr_row, tooltip=_sr_tooltip)
    .properties(
        width=520,
        height=140,
        title="Sentence-register per category × class (multi-label, % of sentences)",
    )
)
# --- bottom panel: addressee breakdown for the two near-zero classes ---
# The class and addressee orders are declared once and reused for the row
# construction, the printed table's column order, the y-axis sort and the
# colour-scale domain.
_ADDR_CLASSES = ["appreciative", "collaborative"]
_ADDRESSEES = ["claude", "user", "unknown"]

# Corpus-level sentence_register metrics; counts are stored under the keys
# "<class>_addressee_<who>_count".
block = corpus_block["metrics"]["sentence_register"]
addr_rows = [
    {
        "class": cls,
        "addressee": who,
        "count": block[f"{cls}_addressee_{who}_count"],
    }
    for cls in _ADDR_CLASSES
    for who in _ADDRESSEES
]
addr_df = pd.DataFrame(addr_rows)

# Console summary: classes as rows, addressees as columns in a fixed order.
print("Corpus-wide addressee distribution for the two near-zero classes:")
_addr_table = addr_df.pivot(index="class", columns="addressee", values="count")
print(_addr_table[_ADDRESSEES].to_string())

_addr_color = alt.Color(
    "addressee:N",
    scale=alt.Scale(
        domain=_ADDRESSEES,
        range=["#4e79a7", "#e15759", "#bab0ab"],
    ),
    legend=alt.Legend(title="addressee", orient="bottom"),
)
_addr_tooltip = [
    alt.Tooltip("class:N"),
    alt.Tooltip("addressee:N"),
    alt.Tooltip("count:Q", format=","),
]

addr_chart = (
    alt.Chart(addr_df)
    .mark_bar()
    .encode(
        y=alt.Y("class:N", sort=_ADDR_CLASSES, title=None),
        x=alt.X("count:Q", title="sentences"),
        color=_addr_color,
        # yOffset places the addressee bars side by side within each class band.
        yOffset="addressee:N",
        tooltip=_addr_tooltip,
    )
    .properties(
        width=520,
        height=160,
        title="Addressee distribution for the two near-zero pragmatic classes",
    )
)
# --- forensic inspection: actual sentences from the parquet ---
import pathlib

parquet_path = pathlib.Path("sentences_classified.parquet")
# Best-effort step: when the parquet file is absent nothing is read or printed.
if parquet_path.exists():
    sentences_df = pd.read_parquet(parquet_path)
    APPR_KEYWORDS = [
        "thank you", "thanks", "appreciate", "great job",
        "well done", "kudos", "much appreciated",
    ]
    # The keywords are joined into a single regex alternation and matched as
    # substrings of the lowercased text; rows with missing text count as
    # non-matches (na=False).
    pat = "|".join(APPR_KEYWORDS)
    _lowered_text = sentences_df["text"].str.lower()
    _has_keyword = _lowered_text.str.contains(pat, regex=True, na=False)
    # Keep only the first ten matching rows, in the parquet's row order.
    appr_sample = sentences_df.loc[
        _has_keyword, ["file_path", "text", "addressee"]
    ].head(10)
    print("\nForensic inspection — sentences containing appreciative keywords:")
    print(appr_sample.to_string(index=False, max_colwidth=80))
# Stack the per-category panel above the addressee panel. The colour scales
# are resolved independently because the two panels colour by different
# fields (register class vs. addressee).
_sr_composite_title = alt.TitleParams(
    "Sentence-register per category + addressee drilldown",
    subtitle=[
        "Top: 4-class distribution per category. "
        "Bottom: who is being addressed in the two near-zero classes."
    ],
    anchor="start",
)
sr_composite = (
    alt.vconcat(sr_chart, addr_chart)
    .resolve_scale(color="independent")
    .properties(title=_sr_composite_title)
)
save_chart(sr_composite, "10-sentence-register-by-category")