Assemble + aggregate + write final artifacts

This is stage 4 of the 6-stage producer chain. It reads the three partial JSON outputs from stages 01–03, the per-sentence parquet from stage 03, and the corpus DocBin + metadata from stage 00, then:

  1. Assembles per-file metric trees via prompt_pipeline.build_file_record.
  2. Aggregates corpus-wide and per-category blocks via prompt_pipeline.aggregate.
  3. Writes the final prompt_linguistic_analysis.yaml (~1.8 MiB) — the artifact every consumer reads.
  4. Writes the final sentences_classified.parquet (~395 KiB) — read by 15_* and 21_*.
  5. Round-trips the YAML to verify schema integrity.

Stage 05 then reads these final artifacts and prints the canonical HEADLINE sheet.

Code
"""Reload everything: corpus_meta + DocBin + 3 partial JSONs + per-sentence parquet."""
import os, pathlib, json, datetime as dt, importlib
import pandas as pd
import yaml
from spacy.tokens import DocBin

_here = pathlib.Path.cwd().resolve()
PROJECT_ROOT = next(
    (p for p in [_here, *_here.parents] if (p / "prompt_pipeline.py").is_file()),
    None,
)
if PROJECT_ROOT is None:
    raise RuntimeError(
        f"Could not find prompt_pipeline.py walking up from {_here}. "
        "Run from inside the claude-prompts-analysis repo."
    )
if pathlib.Path.cwd() != PROJECT_ROOT:
    os.chdir(PROJECT_ROOT)

CACHE_DIR     = PROJECT_ROOT / "_pipeline_cache"
DOCBIN_IN     = CACHE_DIR / "corpus_docs.spacy"
META_IN       = CACHE_DIR / "corpus_meta.parquet"
PR_IN         = CACHE_DIR / "partial_register.json"
PV_IN         = CACHE_DIR / "partial_vocab_emphasis.json"
PRW_IN        = CACHE_DIR / "partial_rules_welfare.json"
SENT_PART_IN  = CACHE_DIR / "sentences_partial.parquet"

YAML_OUT     = PROJECT_ROOT / "prompt_linguistic_analysis.yaml"
PARQUET_OUT  = PROJECT_ROOT / "sentences_classified.parquet"
CORPUS_DIR   = PROJECT_ROOT / "claude-code-system-prompts" / "system-prompts"

for p in (DOCBIN_IN, META_IN, PR_IN, PV_IN, PRW_IN, SENT_PART_IN):
    assert p.exists(), f"missing {p} — run stages 00–03 first"

import prompt_pipeline
importlib.reload(prompt_pipeline)
from prompt_pipeline import NLP

df = pd.read_parquet(META_IN)
docs = list(DocBin().from_disk(DOCBIN_IN).get_docs(NLP.vocab))
assert len(docs) == len(df)

with open(PR_IN)  as f: partial_register      = json.load(f)
with open(PV_IN)  as f: partial_vocab_empha   = json.load(f)
with open(PRW_IN) as f: partial_rules_welfare = json.load(f)
sentences_df = pd.read_parquet(SENT_PART_IN)

assert len(partial_register)      == len(df)
assert len(partial_vocab_empha)   == len(df)
assert len(partial_rules_welfare) == len(df)

print(f"reloaded {len(df)} files, {len(docs)} docs, {len(sentences_df)} sentence rows")
print(f"partials: register={len(partial_register)}, vocab_emphasis={len(partial_vocab_empha)}, rules_welfare={len(partial_rules_welfare)}")
reloaded 290 files, 290 docs, 5881 sentence rows
partials: register=290, vocab_emphasis=290, rules_welfare=290

9. Assemble per-file metrics

Stitches the three partials into the unified per-block dict expected by build_file_record. Field-naming contract:

- count — raw integer
- pct — % of word tokens (0–100)
- per_sent — average matches per sentence (rate)
- sent_pct — % of sentences (used only by sentence_register)

The rule_explanation block uses n_* for raw counts and pct_* for percentages because its denominator is rule sentences, not all word tokens.

Code
from prompt_pipeline import build_file_record

# Stitch the three partials into the per-block lists expected by build_file_record.
# Each metric block lives in exactly one partial; the table below records which.
n = len(df)
_block_source = {
    "mood":                partial_register,
    "register":            partial_register,
    "stance":              partial_register,
    "sentence_register":   partial_register,
    "modality":            partial_vocab_empha,
    "vocab":               partial_vocab_empha,
    "all_caps":            partial_vocab_empha,
    "caps_imperative":     partial_vocab_empha,
    "justification":       partial_vocab_empha,
    "rule_explanation":    partial_rules_welfare,
    "judgment_procedural": partial_rules_welfare,
    "consequence_framing": partial_rules_welfare,
    "socratic":            partial_rules_welfare,
    "address_form":        partial_rules_welfare,
    "imperative_streaks":  partial_rules_welfare,
    "rules_section":       partial_rules_welfare,
}
# JSON keys are stringified file indices; preserve file order 0..n-1.
per_file_metrics = {
    block: [source[str(i)][block] for i in range(n)]
    for block, source in _block_source.items()
}

per_file_records = [build_file_record(i, df, per_file_metrics) for i in range(n)]
print(f"built {len(per_file_records)} per-file records")
print()
print("--- sample (file 0):")
import pprint
pprint.pp(per_file_records[0], depth=3, sort_dicts=False)
built 290 per-file records

--- sample (file 0):
{'path': 'agent-prompt-agent-creation-architect.md',
 'category': 'Agent prompt',
 'name': 'Agent Prompt: Agent creation architect',
 'description': 'System prompt for creating custom AI agents with detailed '
                'specifications',
 'ccVersion': '2.0.77',
 'agentType': None,
 'n_tokens': 936,
 'n_sents': 22,
 'metrics': {'mood': {'marker_count': 6,
                      'marker_pct': 0.641,
                      'marker_per_sent': 0.2727},
             'register': {'ttr': 0.5407,
                          'mean_sent_len': 42.55,
                          'dep_depth': 3.91,
                          'f_score': 71.98,
                          'frozen_count': 0,
                          'formal_count': 0,
                          'consultative_count': 5,
                          'casual_count': 1,
                          'frozen_pct': 0.0,
                          'formal_pct': 0.0,
                          'consultative_pct': 0.5342,
                          'casual_pct': 0.1068,
                          'frozen_per_sent': 0.0,
                          'formal_per_sent': 0.0,
                          'consultative_per_sent': 0.2273,
                          'casual_per_sent': 0.0455,
                          'dominant_register': 'consultative'},
             'stance': {'directive_count': 18,
                        'directive_pct': 1.9231,
                        'directive_per_sent': 0.8182,
                        'expository_count': 15,
                        'expository_pct': 1.6026,
                        'expository_per_sent': 0.6818,
                        'positive_evaluative_count': 6,
                        'positive_evaluative_pct': 0.641,
                        'positive_evaluative_per_sent': 0.2727,
                        'negative_evaluative_count': 0,
                        'negative_evaluative_pct': 0.0,
                        'negative_evaluative_per_sent': 0.0,
                        'dialogic_count': 3,
                        'dialogic_pct': 0.3205,
                        'dialogic_per_sent': 0.1364,
                        'pronouns_1p_count': 2,
                        'pronouns_1p_pct': 0.2137,
                        'pronouns_1p_per_sent': 0.0909,
                        'pronouns_2p_count': 16,
                        'pronouns_2p_pct': 1.7094,
                        'pronouns_2p_per_sent': 0.7273,
                        'dominant_stance': 'directive',
                        'positive_evaluative_quality_count': 3,
                        'positive_evaluative_quality_pct': 0.3205,
                        'positive_evaluative_quality_per_sent': 0.1364,
                        'positive_evaluative_emphasis_count': 3,
                        'positive_evaluative_emphasis_pct': 0.3205,
                        'positive_evaluative_emphasis_per_sent': 0.1364},
             'sentence_register': {'collaborative_sent_count': 0,
                                   'collaborative_sent_pct': 0.0,
                                   'permissive_sent_count': 3,
                                   'permissive_sent_pct': 13.6364,
                                   'appreciative_sent_count': 0,
                                   'appreciative_sent_pct': 0.0,
                                   'imperative_sent_count': 7,
                                   'imperative_sent_pct': 31.8182,
                                   'directive_sent_count': 9,
                                   'directive_sent_pct': 40.9091,
                                   'configuring_sent_count': 5,
                                   'configuring_sent_pct': 22.7273,
                                   'none_sent_count': 7,
                                   'none_sent_pct': 31.8182,
                                   'appreciative_addressee_claude_count': 0,
                                   'appreciative_addressee_user_count': 0,
                                   'appreciative_addressee_unknown_count': 0,
                                   'collaborative_addressee_claude_count': 0,
                                   'collaborative_addressee_user_count': 0,
                                   'collaborative_addressee_unknown_count': 0,
                                   'dominant': 'directive'},
             'modality': {'deontic_count': 5,
                          'deontic_pct': 0.5342,
                          'deontic_per_sent': 0.2273,
                          'epistemic_count': 8,
                          'epistemic_pct': 0.8547,
                          'epistemic_per_sent': 0.3636,
                          'dynamic_count': 4,
                          'dynamic_pct': 0.4274,
                          'dynamic_per_sent': 0.1818,
                          'top_construction': 'should'},
             'vocab': {'hard_prohibitions': {...},
                       'hard_prescriptions': {...},
                       'soft_prescriptions': {...},
                       'politeness_direct': {...},
                       'politeness_softening': {...},
                       'warmth_encouragement': {...},
                       'hedging': {...},
                       'structural_markers': {...},
                       'profanity': {...},
                       'pronouns_2p': {...},
                       'pronouns_1p': {...}},
             'all_caps': {'count': 6,
                          'distinct': 3,
                          'pct': 0.641,
                          'per_sent': 0.2727,
                          'top': [...]},
             'caps_imperative': {'count': 1,
                                 'pct': 0.1068,
                                 'per_sent': 0.0455,
                                 'hits': {...}},
             'justification': {'count': 6,
                               'pct': 0.641,
                               'per_sent': 0.2727,
                               'ratio': 0.857},
             'rule_explanation': {'n_paragraphs': 12,
                                  'n_paragraphs_with_rules': 6,
                                  'n_paragraphs_with_rules_unexplained': 2,
                                  'n_rule_sentences': 9,
                                  'n_imperative_sentences': 9,
                                  'n_prohibition_sentences': 0,
                                  'n_explained_same': 3,
                                  'n_explained_para': 5,
                                  'n_imperative_explained_para': 5,
                                  'n_prohibition_explained_para': 0,
                                  'pct_explained_same': 33.3333,
                                  'pct_explained_para': 55.5556,
                                  'pct_imperative_explained_para': 55.5556,
                                  'pct_prohibition_explained_para': None,
                                  'pct_paragraphs_with_rules_unexplained': 33.3333},
             'judgment_procedural': {'judgment_count': 2,
                                     'procedural_count': 3,
                                     'judgment_per_sent': 0.0909,
                                     'procedural_per_sent': 0.1364,
                                     'judgment_to_procedural_ratio': 0.667},
             'consequence_framing': {'threat_count': 0,
                                     'causal_count': 5,
                                     'soft_conditional_count': 1,
                                     'threat_per_sent': 0.0,
                                     'causal_per_sent': 0.2273,
                                     'soft_conditional_per_sent': 0.0455,
                                     'threat_share': 0.0},
             'socratic': {'question_count': 0,
                          'question_pct': 0.0,
                          'question_per_sent': 0.0,
                          'apology_count': 0,
                          'apology_pct': 0.0,
                          'apology_per_sent': 0.0},
             'address_form': {'selfref_claude': 3,
                              'selfref_assistant': 1,
                              'selfref_model': 10,
                              'selfref_2p': 9,
                              'selfref_we': 0,
                              'pct_anthropomorphic': 0.2143,
                              'pct_artifact': 0.7143,
                              'pct_role': 0.0714},
             'imperative_streaks': {'n_imperative_sentences': 9,
                                    'n_streaks': 5,
                                    'streak_max': 4,
                                    'streak_mean': 1.8,
                                    'n_streaks_ge3': 1,
                                    'n_streaks_ge5': 0},
             'rules_section': {'n_paragraphs': 12,
                               'n_paragraphs_in_rules_section': 0,
                               'n_paragraphs_outside_rules_section': 12,
                               'n_rule_paragraphs_in_rules_section': 0,
                               'n_rule_paragraphs_outside_rules_section': 6,
                               'n_rule_paragraphs_in_rules_section_explained': 0,
                               'n_rule_paragraphs_outside_rules_section_explained': 4,
                               'n_rule_sentences_in_rules_section': 0,
                               'n_rule_sentences_outside_rules_section': 9,
                               'n_rule_sentences_in_rules_section_explained': 0,
                               'n_rule_sentences_outside_rules_section_explained': 5,
                               'pct_rule_paragraphs_explained_in_rules_section': None,
                               'pct_rule_paragraphs_explained_outside_rules_section': 66.6667,
                               'pct_rule_sentences_explained_in_rules_section': None,
                               'pct_rule_sentences_explained_outside_rules_section': 55.5556}}}

10. Corpus + per-category aggregation

aggregate(per_file_records, indices, df, docs) sums per-file counts for every density block and recomputes pct / per_sent from totals. TTR + dep-depth are recomputed by walking the union of docs (set semantics + per-token traversal don’t compose as simple sums).

Code
from prompt_pipeline import aggregate

# Corpus-wide aggregate over every file index.
corpus_block = aggregate(per_file_records, list(range(len(df))), df, docs)
print("corpus n_tokens:", corpus_block["n_tokens"], "n_sents:", corpus_block["n_sents"])
print("corpus mood:", corpus_block["metrics"]["mood"])
print()
_cm = corpus_block["metrics"]
print(f"corpus rule_explanation pct_explained_para: {_cm['rule_explanation']['pct_explained_para']:.2f}%  ← Tier-1 headline")
print(f"corpus judgment_procedural ratio: {_cm['judgment_procedural']['judgment_to_procedural_ratio']}  ← Tier-3 v1")
print(f"corpus consequence_framing threat_share: {_cm['consequence_framing']['threat_share']}")
print(f"corpus address_form pct_anthropomorphic: {_cm['address_form']['pct_anthropomorphic']}")
_streaks = _cm["imperative_streaks"]
print(f"corpus imperative_streaks: streak_max={_streaks['streak_max']} n_streaks_ge5={_streaks['n_streaks_ge5']}")


def _category_block(indices):
    """Aggregate one category's files and tag the block with its file count."""
    block = aggregate(per_file_records, indices, df, docs)
    block["n_files"] = len(indices)
    return block


by_category = {
    cat: _category_block(sub.index.tolist())
    for cat, sub in df.groupby("category")
}

print()
print("per-category summary:")
for cat, b in by_category.items():
    m  = b["metrics"]["mood"]
    sr = b["metrics"]["sentence_register"]
    print(f"{cat:18s} files={b['n_files']:3d}  tokens={b['n_tokens']:6d}  "
          f"imp_sent%={sr['imperative_sent_pct']:5.1f}  "
          f"dir_sent%={sr['directive_sent_pct']:5.1f}  "
          f"cfg_sent%={sr['configuring_sent_pct']:5.1f}  "
          f"marker%={m['marker_pct']:5.2f}")
corpus n_tokens: 133611 n_sents: 5881
corpus mood: {'marker_count': 1026, 'marker_pct': 0.7679, 'marker_per_sent': 0.1745}

corpus rule_explanation pct_explained_para: 24.34%  ← Tier-1 headline
corpus judgment_procedural ratio: 0.131  ← Tier-3 v1
corpus consequence_framing threat_share: 0.0552
corpus address_form pct_anthropomorphic: 0.6456
corpus imperative_streaks: streak_max=12 n_streaks_ge5=52

per-category summary:
Agent prompt       files= 37  tokens= 27522  imp_sent%= 29.7  dir_sent%= 15.1  cfg_sent%=  5.4  marker%= 0.73
Data / template    files= 39  tokens= 32931  imp_sent%= 22.4  dir_sent%=  6.5  cfg_sent%=  5.4  marker%= 0.55
Skill              files= 30  tokens= 38530  imp_sent%= 32.9  dir_sent%= 11.6  cfg_sent%=  4.8  marker%= 0.65
System prompt      files= 64  tokens= 16156  imp_sent%= 40.4  dir_sent%= 21.2  cfg_sent%=  4.9  marker%= 0.92
System reminder    files= 40  tokens=  3793  imp_sent%= 46.2  dir_sent%= 28.3  cfg_sent%=  2.9  marker%= 1.79
Tool description   files= 79  tokens= 14500  imp_sent%= 37.1  dir_sent%= 23.4  cfg_sent%=  5.4  marker%= 1.19
Tool parameter     files=  1  tokens=   179  imp_sent%= 78.6  dir_sent%=  0.0  cfg_sent%= 50.0  marker%= 0.00

11. Write the final artifacts

Composes the YAML output dict (metadata + lexicons + corpus + by_category + files), writes it, copies the per-sentence parquet from _pipeline_cache/, and round-trips the YAML to verify schema integrity.

Code
from prompt_pipeline import yaml_lexicons_block

# Final YAML layout: metadata header, lexicon definitions, corpus aggregate,
# per-category aggregates, then the 290 per-file records.
output = {
    "metadata": {
        "generated_at": dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"),
        "spacy_model": f"{NLP.meta['name']}-{NLP.meta['version']}",
        "source_dir": str(CORPUS_DIR.relative_to(PROJECT_ROOT)),
        "n_files": len(df),
        "total_tokens": int(df["n_tokens"].sum()),
        "total_sents":  int(df["n_sents"].sum()),
    },
    "lexicons": yaml_lexicons_block(),
    "corpus": corpus_block,
    "by_category": by_category,
    "files": per_file_records,
}

# allow_unicode=True emits raw non-ASCII characters, so the stream must be
# UTF-8 explicitly — the platform default encoding is not guaranteed to be.
with open(YAML_OUT, "w", encoding="utf-8") as f:
    yaml.safe_dump(output, f, sort_keys=False, allow_unicode=True, width=120)

size = YAML_OUT.stat().st_size
print(f"wrote {YAML_OUT.name}")
print(f"      {size:,} bytes ({size/1024:.1f} KiB)")

# Promote per-sentence partial to the final parquet path.
sentences_df.to_parquet(PARQUET_OUT, index=False)
psize = PARQUET_OUT.stat().st_size
print(f"wrote {PARQUET_OUT.name}")
print(f"      {psize:,} bytes ({psize/1024:.1f} KiB), {len(sentences_df):,} rows")

# Round-trip integrity check: re-parse the YAML we just wrote and verify the
# schema every downstream consumer depends on.
with open(YAML_OUT, encoding="utf-8") as f:
    reloaded = yaml.safe_load(f)

assert set(reloaded.keys()) == {"metadata", "lexicons", "corpus", "by_category", "files"}
assert len(reloaded["files"]) == len(df)

lex = reloaded["lexicons"]
# Compiled-regex lookup must not leak into the serialized lexicons.
assert "MODALITY_VERBS_REGEX" not in lex
for key in ("SIMPLE_MODAL_LOOKUP", "SENTENCE_REGISTER_MARKERS", "JUDGMENT_VERBS",
            "THREAT_PATTERNS", "ADDRESS_FORM_PATTERNS", "RULES_HEADING_RE"):
    assert key in lex, f"lexicon {key} missing"
assert set(lex["SENTENCE_REGISTER_MARKERS"].keys()) == {
    "collaborative", "permissive", "appreciative", "configuring"}
# Stance lexicon was split into positive/negative; the old merged key must be gone.
assert "evaluative" not in lex["STANCE_MARKERS"]
assert "positive_evaluative" in lex["STANCE_MARKERS"]
assert "negative_evaluative" in lex["STANCE_MARKERS"]

m0 = reloaded["files"][0]["metrics"]
sr0 = m0["sentence_register"]
for key in ("collaborative_sent_pct", "imperative_sent_pct",
            "appreciative_addressee_user_count", "collaborative_addressee_unknown_count"):
    assert key in sr0, f"sentence_register {key} missing"
# imperative_sent_pct moved out of mood into sentence_register.
assert "imperative_sent_pct" not in m0["mood"]
assert "pct_explained_para" in m0.get("rule_explanation", {}), "Tier-1 rule_explanation missing"
assert "judgment_to_procedural_ratio" in m0.get("judgment_procedural", {}), "Tier-3 v1 judgment_procedural missing"
assert "selfref_claude" in reloaded["corpus"]["metrics"].get("address_form", {}), "Tier-3 v1 address_form missing"
assert "streak_max" in m0.get("imperative_streaks", {}), "Tier-3 v2 imperative_streaks missing"
assert "n_rule_paragraphs_in_rules_section" in m0.get("rules_section", {}), "Tier-3 v2 rules_section missing"
stance0 = m0["stance"]
assert "positive_evaluative_quality_count" in stance0
assert "positive_evaluative_emphasis_count" in stance0

print("round-trip OK — top-level keys:", list(reloaded.keys()))
print("round-trip OK — files:", len(reloaded["files"]))
print("round-trip OK — Tier-1, Tier-3 v1, and Tier-3 v2 blocks all present")
wrote prompt_linguistic_analysis.yaml
      1,911,705 bytes (1866.9 KiB)
wrote sentences_classified.parquet
      414,101 bytes (404.4 KiB), 5,881 rows
round-trip OK — top-level keys: ['metadata', 'lexicons', 'corpus', 'by_category', 'files']
round-trip OK — files: 290
round-trip OK — Tier-1, Tier-3 v1, and Tier-3 v2 blocks all present
Back to top