|
| 1 | +"""Shakespeare-aware creativity scoring. |
| 2 | +
|
| 3 | +Replaces val=CE-on-next-token (which only rewards exact reproduction) |
| 4 | +with metrics that measure whether GENERATED text is Shakespeare-LIKE |
| 5 | +without being identical: |
| 6 | +
|
| 7 | + - n-gram overlap: fraction of n-char windows in generated text that |
| 8 | + appear ANYWHERE in the corpus. Measures Shakespearean character |
| 9 | + patterns without exact-word requirement. |
| 10 | + - vocab overlap: fraction of generated tokens (whitespace-separated) |
| 11 | + that match corpus vocabulary. Real English/Shakespeare words even |
| 12 | + if not in the same sentence. |
| 13 | + - line structure: avg line length, ratio of letters to total chars. |
| 14 | + Captures stanza/line-break patterns. |
| 15 | + - vowel-consonant transition rate: English alternates v/c; random |
| 16 | + text doesn't. Score the alternation pattern. |
| 17 | +
|
| 18 | +Use these to evaluate creative output of substrate-aligned model. A |
| 19 | +model that produces statistically-Shakespearean GIBBERISH gets ~0; |
| 20 | +a model that produces creative but recognizable English gets high. |
| 21 | +""" |
| 22 | + |
| 23 | +import string |
| 24 | +from collections import Counter |
| 25 | + |
| 26 | + |
| 27 | + |
| 28 | + |
# Character classes shared by the letter-level metrics in this module.
VOWELS = set("aeiou") | set("AEIOU")
LETTERS = set(string.ascii_lowercase + string.ascii_uppercase)
WHITESPACE = {" ", "\n", "\t"}
| 32 | + |
| 33 | + |
def char_ngram_overlap(generated: str, corpus_text: str, n: int) -> float:
    """Return the fraction of n-character windows of *generated* found in the corpus.

    Every contiguous length-``n`` substring of ``generated`` is looked up in
    the set of length-``n`` substrings of ``corpus_text``. A window that
    occurs several times in ``generated`` is counted once per occurrence.

    Args:
        generated: Text to score.
        corpus_text: Reference text whose character n-grams define a match.
        n: Window width in characters.

    Returns:
        A value in [0, 1]; higher means more corpus-like character patterns.
        Returns 0.0 when ``n`` is not positive or when ``generated`` is
        shorter than ``n`` (there is no window to score).
    """
    # A non-positive width would turn every window into an empty (or
    # negative-end) slice that trivially "matches"; treat it as unscorable.
    if n < 1 or len(generated) < n:
        return 0.0
    corpus_ngrams = {corpus_text[i:i + n] for i in range(len(corpus_text) - n + 1)}
    # len(generated) >= n guarantees at least one window, so no empty check.
    n_windows = len(generated) - n + 1
    matches = sum(generated[i:i + n] in corpus_ngrams for i in range(n_windows))
    return matches / n_windows
| 45 | + |
| 46 | + |
def vocab_overlap(generated: str, corpus_text: str) -> float:
    """Return the share of generated words that also occur in the corpus.

    Both texts are split on whitespace; each token is lower-cased and has
    leading/trailing ASCII punctuation removed. Tokens that become empty
    after that are ignored. Returns 0.0 when ``generated`` has no usable
    words.
    """
    def normalize(token):
        return token.lower().strip(string.punctuation)

    known = {normalize(tok) for tok in corpus_text.split()}
    known.discard("")

    total = 0
    hits = 0
    for tok in generated.split():
        word = normalize(tok)
        if not word:
            continue
        total += 1
        if word in known:
            hits += 1
    return hits / total if total else 0.0
| 58 | + |
| 59 | + |
def line_structure_stats(generated: str) -> dict:
    """Summarise the lengths of the non-blank lines of *generated*.

    Lines are obtained by splitting on ``"\\n"``; lines that are empty or
    whitespace-only are skipped. The result has three keys: ``n_lines``
    (count of kept lines), ``mean_line_len`` and ``std_line_len``
    (population standard deviation). All three are zero when no line is
    kept.
    """
    widths = [len(row) for row in generated.split("\n") if row.strip()]
    count = len(widths)
    if count == 0:
        return {"n_lines": 0, "mean_line_len": 0.0, "std_line_len": 0.0}

    mean = sum(widths) / count
    variance = sum((w - mean) ** 2 for w in widths) / count
    return {
        "n_lines": count,
        "mean_line_len": mean,
        "std_line_len": variance ** 0.5,
    }
| 73 | + |
| 74 | + |
def vc_alternation_rate(generated: str) -> float:
    """Return how often adjacent letters switch between vowel and consonant.

    Characters not in ``LETTERS`` are dropped first, so adjacency is
    measured over the remaining letter sequence. Each letter is classed as
    a vowel (member of ``VOWELS``) or not; the result is the fraction of
    neighbouring pairs whose classes differ. Returns 0.0 when fewer than
    two letters remain.
    """
    is_vowel = [ch in VOWELS for ch in generated if ch in LETTERS]
    pair_count = len(is_vowel) - 1
    if pair_count < 1:
        return 0.0
    switches = sum(1 for prev, nxt in zip(is_vowel, is_vowel[1:]) if prev != nxt)
    return switches / pair_count
| 88 | + |
| 89 | + |
def line_length_match(generated: str, corpus_text: str) -> float:
    """Return the L1 distance between the two line-length distributions.

    For each text, non-blank lines (split on ``"\\n"``) are bucketed by
    length, with every length of 80 or more sharing the last bucket, and
    the bucket counts are normalised to sum to 1 (a text with no non-blank
    line gives an all-zero histogram). The result is the sum of absolute
    bucket differences: 0.0 for identical distributions, larger values for
    less similar ones.
    """
    def length_distribution(text, max_len=80):
        buckets = [0] * (max_len + 1)
        for row in text.split("\n"):
            if row.strip():
                buckets[min(len(row), max_len)] += 1
        denom = sum(buckets) or 1  # avoid dividing by zero on empty text
        return [count / denom for count in buckets]

    gen_dist = length_distribution(generated)
    corpus_dist = length_distribution(corpus_text)
    return sum(abs(p - q) for p, q in zip(gen_dist, corpus_dist))
| 105 | + |
| 106 | + |
def real_word_fraction(generated: str, corpus_text: str,
                       min_word_len: int = 3) -> float:
    """Return the share of longer generated words that occur in the corpus.

    Tokens are whitespace-split, lower-cased and stripped of leading and
    trailing ASCII punctuation. Only generated words with at least
    ``min_word_len`` characters are scored; a scored word counts as real
    when it is in the corpus vocabulary (so 'fan' counts if the corpus
    contains it, 'xqrt' does not). Shorter tokens are left out of both
    numerator and denominator. Returns 0.0 when nothing is left to score.
    """
    def normalize(token):
        return token.lower().strip(string.punctuation)

    known = {normalize(tok) for tok in corpus_text.split()}
    known.discard("")

    candidates = []
    for tok in generated.split():
        word = normalize(tok)
        if word and len(word) >= min_word_len:
            candidates.append(word)
    if not candidates:
        return 0.0
    return sum(1 for word in candidates if word in known) / len(candidates)
| 123 | + |
| 124 | + |
def common_word_presence(generated: str, corpus_text: str,
                         top_k: int = 50) -> float:
    """Return the share of the corpus's most frequent words used in *generated*.

    Words are whitespace-split tokens, lower-cased and stripped of leading
    and trailing ASCII punctuation. The ``top_k`` most frequent corpus
    words form the reference set; the result is the fraction of that set
    that appears at least once in ``generated``. Returns 0.0 when the
    reference set is empty.
    """
    def normalize(token):
        return token.lower().strip(string.punctuation)

    frequencies = Counter(
        word for word in map(normalize, corpus_text.split()) if word
    )
    frequent = {word for word, _count in frequencies.most_common(top_k)}
    if not frequent:
        return 0.0

    seen = {normalize(tok) for tok in generated.split()}
    seen.discard("")
    return len(seen & frequent) / len(frequent)
| 142 | + |
| 143 | + |
def avg_word_length_match(generated: str, corpus_text: str) -> float:
    """Return how closely the generated mean word length matches the corpus's.

    Words are whitespace-split tokens, lower-cased and stripped of leading
    and trailing ASCII punctuation; empty results are ignored. The score
    is ``1 - |gen_mean - corpus_mean| / corpus_mean``, floored at 0.0.
    Returns 0.0 when the corpus has no usable words.
    """
    def mean_word_len(text):
        stripped = (tok.lower().strip(string.punctuation) for tok in text.split())
        lengths = [len(word) for word in stripped if word]
        if not lengths:
            return 0.0
        return sum(lengths) / len(lengths)

    corpus_mean = mean_word_len(corpus_text)
    if corpus_mean == 0:
        return 0.0
    gen_mean = mean_word_len(generated)
    relative_gap = abs(gen_mean - corpus_mean) / corpus_mean
    return max(0.0, 1.0 - relative_gap)
| 156 | + |
| 157 | + |
def ngram_diversity(generated: str, n: int = 3) -> float:
    """Return the ratio of distinct to total character n-grams in *generated*.

    1.0 means every length-``n`` window is different (maximum diversity).
    The lowest value for scorable text is ``1 / number_of_windows``, reached
    when all windows are identical (maximum repetition).

    Args:
        generated: Text to score.
        n: Window width in characters.

    Returns:
        The distinct/total ratio, or 0.0 when ``n`` is not positive or
        ``generated`` is shorter than ``n`` (there is no window to score).
    """
    # A non-positive width would produce only empty or negative-end slices,
    # which say nothing about repetition; treat it as unscorable.
    if n < 1 or len(generated) < n:
        return 0.0
    # len(generated) >= n guarantees at least one window below.
    ngrams = [generated[i:i + n] for i in range(len(generated) - n + 1)]
    return len(set(ngrams)) / len(ngrams)
| 169 | + |
| 170 | + |
def repetition_penalty(generated: str, n: int = 4,
                       max_freq_threshold: int = 3) -> float:
    """Return a penalty in [0, 1] for excessive character n-gram repetition.

    Each distinct length-``n`` window that occurs more than
    ``max_freq_threshold`` times contributes its excess occurrences; the
    total excess is divided by the number of windows and capped at 1.0.
    Targets degenerate output such as 'fan fan, fan, fan'.

    Args:
        generated: Text to score.
        n: Window width in characters.
        max_freq_threshold: Occurrences of one n-gram tolerated for free.

    Returns:
        0.0 when nothing exceeds the threshold, when ``n`` is not positive,
        or when ``generated`` is shorter than ``n``; otherwise the
        normalised excess, at most 1.0.
    """
    # A non-positive width would count empty/negative-end slices as
    # "repeated n-grams" and penalise any text; treat it as unscorable.
    if n < 1 or len(generated) < n:
        return 0.0
    # len(generated) >= n guarantees n_windows >= 1, so the division is safe.
    n_windows = len(generated) - n + 1
    counts = Counter(generated[i:i + n] for i in range(n_windows))
    excess = sum(max(0, c - max_freq_threshold) for c in counts.values())
    return min(1.0, excess / n_windows)
| 186 | + |
| 187 | + |
def lexical_diversity(generated: str) -> float:
    """Return the type-token ratio of the words in *generated*.

    Words are whitespace-split tokens, lower-cased and stripped of leading
    and trailing ASCII punctuation; tokens that end up empty are ignored.
    The result is distinct words divided by total words (1.0 = no word is
    repeated), or 0.0 when there are no words.
    """
    stripped = (tok.lower().strip(string.punctuation) for tok in generated.split())
    tokens = [word for word in stripped if word]
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)
| 197 | + |
| 198 | + |
def creativity_score(generated: str, corpus_text: str) -> dict:
    """Score how corpus-like *generated* is and return every component metric.

    The composite ``creativity_score`` is a weighted sum (weights total 1.0)
    of real-word fraction, common-word presence, vocabulary overlap,
    average-word-length match, 3-gram and 4-gram character overlap, and
    line-length fit (``1 - line_dist``, floored at 0), minus 0.3 times the
    repetition penalty, clamped to [0, 1].

    ``ngram_2``, ``vc_alternation`` and ``line_stats`` are computed and
    returned for inspection but do not enter the composite. Repetition is
    the only anti-Goodhart term applied here; no diversity multiplier is
    used in this function.

    Returns:
        A dict with the individual metrics plus the ``creativity_score``.
    """
    # Character-level overlap at three window widths.
    bigram, trigram, fourgram = (
        char_ngram_overlap(generated, corpus_text, width) for width in (2, 3, 4)
    )

    # Word-level anti-gibberish signals.
    vocab = vocab_overlap(generated, corpus_text)
    common = common_word_presence(generated, corpus_text, top_k=50)
    real = real_word_fraction(generated, corpus_text, min_word_len=3)
    word_len = avg_word_length_match(generated, corpus_text)

    # Letter- and line-level structure.
    alternation = vc_alternation_rate(generated)
    line_dist = line_length_match(generated, corpus_text)
    line_stats = line_structure_stats(generated)

    # The tolerated repeat count grows with text length so that the natural
    # repetition of longer text is not penalised.
    allowed_repeats = max(2, len(generated) // 50)
    rep_pen = repetition_penalty(generated, n=4,
                                 max_freq_threshold=allowed_repeats)

    line_fit = max(0.0, 1.0 - line_dist)
    weighted = (
        0.25 * real          # real-word fraction
        + 0.15 * common      # common-word presence
        + 0.15 * vocab       # any vocab overlap (short tokens count)
        + 0.10 * word_len    # word-length sanity
        + 0.15 * trigram     # 3-gram match
        + 0.10 * fourgram    # 4-gram match
        + 0.10 * line_fit    # line structure
    )
    composite = max(0.0, min(1.0, weighted - 0.3 * rep_pen))

    return {
        "ngram_2": bigram,
        "ngram_3": trigram,
        "ngram_4": fourgram,
        "vocab_overlap": vocab,
        "common_word_presence": common,
        "real_word_fraction": real,
        "avg_word_len_match": word_len,
        "vc_alternation": alternation,
        "line_dist": line_dist,
        "line_stats": line_stats,
        "repetition_penalty": rep_pen,
        "creativity_score": composite,
    }
0 commit comments