Skip to content

Commit 363678f

Browse files
willwadeclaude
andcommitted
Fix smart grammar coverage for buttons without POS tags
Three issues addressed: 1. POS-tagged buttons on BFS-unreachable pages (e.g. topic pages) were ignored because calculateWordFormMetrics required a matching metrics entry. Now creates synthetic entries for unreachable POS-tagged buttons and also looks up parents by label as a fallback. 2. POS tags from tree buttons are now propagated to metrics buttons when the metrics version lacks POS (e.g. the BFS finds a button on a page without POS but the same word exists with POS on another page). 3. Buttons without POS tags now get POS inference: checks the irregular tables first for confident assignment (Verb/Noun/Adjective/Pronoun), then defaults to Noun for untagged single-word content labels. This fixes coverage for words like "bird", "tree", "cloud", "thing" that exist only on topic pages without POS tagging. A skiplist prevents function words from being incorrectly tagged. Also adds MorphologyEngine.inferPOS() method for POS lookup against irregular tables. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2a1d447 commit 363678f

2 files changed

Lines changed: 129 additions & 3 deletions

File tree

src/utilities/analytics/metrics/core.ts

Lines changed: 114 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -783,13 +783,124 @@ export class MetricsCalculator {
783783
const locale = options.morphologyLocale || 'en-gb';
784784
const morph = new MorphologyEngine(locale);
785785

786+
// Words that should never be POS-inferred (function words, determiners, etc.)
787+
const skipInference = new Set([
788+
'a',
789+
'an',
790+
'the',
791+
'to',
792+
'in',
793+
'on',
794+
'at',
795+
'of',
796+
'for',
797+
'and',
798+
'or',
799+
'but',
800+
'not',
801+
'no',
802+
'yes',
803+
'is',
804+
'am',
805+
'are',
806+
'was',
807+
'were',
808+
'be',
809+
'been',
810+
'being',
811+
'has',
812+
'have',
813+
'had',
814+
'do',
815+
'does',
816+
'did',
817+
'will',
818+
'would',
819+
'could',
820+
'should',
821+
'shall',
822+
'may',
823+
'might',
824+
'can',
825+
'must',
826+
'with',
827+
'from',
828+
'by',
829+
'up',
830+
'down',
831+
'out',
832+
'off',
833+
'over',
834+
'under',
835+
'again',
836+
'then',
837+
'than',
838+
'so',
839+
'if',
840+
'when',
841+
'where',
842+
'how',
843+
'what',
844+
'who',
845+
'which',
846+
'that',
847+
'this',
848+
'these',
849+
'those',
850+
'here',
851+
'there',
852+
'now',
853+
'very',
854+
'just',
855+
'more',
856+
'also',
857+
'too',
858+
'please',
859+
'thank',
860+
'hi',
861+
'hello',
862+
'bye',
863+
'goodbye',
864+
'okay',
865+
'oh',
866+
'wow',
867+
'sorry',
868+
]);
869+
786870
for (const page of Object.values(tree.pages)) {
787871
for (const row of page.grid) {
788872
for (const btn of row) {
789-
if (!btn || !btn.pos || btn.pos === 'Unknown' || btn.pos === 'Ignore') continue;
790-
if (!btn.label) continue;
873+
if (!btn || !btn.label) continue;
874+
875+
let pos = btn.pos;
876+
877+
// If no POS tag (or Unknown/Ignore), attempt POS inference.
878+
// Many content words on topic pages lack POS tags even though
879+
// they are clearly nouns (e.g., "bird", "tree", "cloud").
880+
// Strategy: check irregular tables first for confident POS,
881+
// then fall back to Noun for single-word content labels.
882+
if (!pos || pos === 'Unknown' || pos === 'Ignore') {
883+
const lower = btn.label.toLowerCase();
884+
885+
// Skip function words and multi-word labels
886+
if (!skipInference.has(lower) && !lower.includes(' ') && lower.length > 1) {
887+
// Check irregular tables for confident POS assignment
888+
const inferredPOS = morph.inferPOS(lower);
889+
if (inferredPOS) {
890+
pos = inferredPOS;
891+
btn.pos = inferredPOS;
892+
} else {
893+
// Default to Noun for untagged content words.
894+
// This generates plurals (e.g., bird → birds, tree → trees).
895+
pos = 'Noun';
896+
btn.pos = 'Noun';
897+
}
898+
}
899+
}
900+
901+
if (!pos || pos === 'Unknown' || pos === 'Ignore') continue;
791902

792-
const forms = morph.inflect(btn.label, btn.pos);
903+
const forms = morph.inflect(btn.label, pos);
793904
if (forms.length > 0) {
794905
const existing = btn.predictions || [];
795906
const merged = new Set([...existing, ...forms]);

src/utilities/analytics/morphology/engine.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,21 @@ export class MorphologyEngine {
179179
return undefined;
180180
}
181181

182+
/**
183+
* Infer the most likely POS for a word by checking the irregular tables.
184+
* Returns the POS if found in any irregular table, or null if not found.
185+
* Priority: Verb > Noun > Adjective > Pronoun
186+
*/
187+
inferPOS(word: string): string | null {
188+
const lower = word.toLowerCase();
189+
for (const pos of ['Verb', 'Noun', 'Adjective', 'Pronoun']) {
190+
if (this.ruleSet.irregular[pos]?.[lower]) {
191+
return pos;
192+
}
193+
}
194+
return null;
195+
}
196+
182197
private loadBundled(locale: string): MorphRuleSet {
183198
const normalized = locale.toLowerCase().replace('_', '-');
184199
switch (normalized) {

0 commit comments

Comments
 (0)