Skip to content

Commit b47b3db

Browse files
author
Maarten
committed
Make scanning more efficient, resolves #4
1 parent 8eccc20 commit b47b3db

File tree

9 files changed

+137
-124
lines changed

9 files changed

+137
-124
lines changed

package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
{
22
"name": "probabilistic-earley-parser",
3-
"version": "0.9.2",
3+
"version": "0.9.3",
44
"description": "A parser for parsing Probabilistic Context Free Grammars",
55
"main": "dist/index.js",
66
"author": "Maarten Trompper",
77
"license": "MIT",
8-
"repository": "git@github.com:digitalheir/java-probabilistic-earley-parser.git",
8+
"repository": "git@github.com:digitalheir/probabilistic-earley-parser-javascript.git",
99
"scripts": {
1010
"build": "npm run build:clean && npm run build:npm && npm run build:cp && npm run build:min",
1111
"build:clean": "rimraf dist",

src/earley/chart/chart.ts

Lines changed: 18 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ import {StateToObjectMap} from "./state-to-object-map";
1313
export class Chart<T, S> {
1414
readonly grammar: Grammar<T, S>;
1515

16-
private states: StateIndex<S, T>;
17-
private byIndex: Map<number, Set<State<S, T>>>;
16+
private states = new StateIndex<S, T>();
17+
private byIndex = new Map<number, Set<State<S, T>>>();
1818

1919
/**
2020
* The forward probability <code>α_i</code> of a chart is
@@ -23,7 +23,7 @@ export class Chart<T, S> {
2323
* paths from start to position i. So this includes multiple
2424
* instances of the same history, which may happen because of recursion.
2525
*/
26-
private forwardScores: StateToObjectMap<T, S>;
26+
private forwardScores = new StateToObjectMap<T, S>();
2727

2828
/**
2929
* The inner probability <code>γ_{i}</code> of a chart
@@ -33,36 +33,22 @@ export class Chart<T, S> {
3333
* Note that this is conditional on the chart happening at position k with
3434
* a certain non-terminal X
3535
*/
36-
private innerScores: StateToObjectMap<T, S>;
37-
private viterbiScores: StateToObjectMap<T, ViterbiScore<S, T>>;
36+
private innerScores = new StateToObjectMap<T, S>();
37+
private viterbiScores = new StateToObjectMap<T, ViterbiScore<S, T>>();
3838

39-
completedStates: Map<number, Set<State<S, T>>>;
40-
completedStatesFor: Map<number, Map<NonTerminal, Set<State<S, T>>>>;
41-
completedStatesThatAreNotUnitProductions: Map<number, Set<State<S, T>>>;
42-
statesActiveOnNonTerminals: Map<number, Set<State<S, T>>>;
39+
completedStates = new Map<number, Set<State<S, T>>>();
40+
completedStatesFor = new Map<number, Map<NonTerminal, Set<State<S, T>>>>();
41+
completedStatesThatAreNotUnitProductions = new Map<number, Set<State<S, T>>>();
42+
statesActiveOnNonTerminals = new Map<number, Set<State<S, T>>>();
4343

44-
nonTerminalActiveAtIWithNonZeroUnitStarToY: Map<number, Map<NonTerminal, Set<State<S, T>>>>;
45-
statesActiveOnTerminals: Map<number, Set<State<S, T>>>;
46-
statesActiveOnNonTerminal: Map<NonTerminal, Map<number, Set<State<S, T>>>>;
44+
nonTerminalActiveAtIWithNonZeroUnitStarToY = new Map<number, Map<NonTerminal, Set<State<S, T>>>>();
45+
statesActiveOnTerminals = new Map<number, Map<Terminal<T>, Set<State<S, T>>>>();
46+
statesActiveOnNonTerminal = new Map<NonTerminal, Map<number, Set<State<S, T>>>>();
4747
private EMPTY_SET: Set<State<S, T>> = new Set<State<S, T>>();
4848

4949

5050
constructor(grammar: Grammar<T, S>) {
51-
this.states = new StateIndex<S, T>();
5251
this.grammar = grammar;
53-
54-
this.forwardScores = new StateToObjectMap<T, S>();
55-
this.innerScores = new StateToObjectMap<T, S>();
56-
this.viterbiScores = new StateToObjectMap<T, ViterbiScore<S, T>>();
57-
this.byIndex = new Map<number, Set<State<S, T>>>();
58-
this.completedStates = new Map<number, Set<State<S, T>>>();
59-
this.completedStatesFor = new Map<number, Map<NonTerminal, Set<State<S, T>>>>();
60-
this.completedStatesThatAreNotUnitProductions = new Map<number, Set<State<S, T>>>();
61-
this.statesActiveOnNonTerminals = new Map<number, Set<State<S, T>>>();
62-
63-
this.nonTerminalActiveAtIWithNonZeroUnitStarToY = new Map<number, Map<NonTerminal, Set<State<S, T>>>>();
64-
this.statesActiveOnTerminals = new Map<number, Set<State<S, T>>>();
65-
this.statesActiveOnNonTerminal = new Map<NonTerminal, Map<number, Set<State<S, T>>>>();
6652
}
6753

6854
// getCompletedStates(int i, NonTerminal s):Set<State<SemiringType, T>> {
@@ -220,7 +206,7 @@ export class Chart<T, S> {
220206
});
221207
} else {
222208
// activeCategory MUST be terminal
223-
getOrCreateSet(this.statesActiveOnTerminals, position).add(state);
209+
getOrCreateSet(getOrCreateMap(this.statesActiveOnTerminals, position), activeCategory).add(state);
224210
}
225211
}
226212
}
@@ -274,8 +260,11 @@ export class Chart<T, S> {
274260
return this.statesActiveOnNonTerminals.get(index);
275261
}
276262

277-
public getStatesActiveOnTerminals(index: number) {
278-
return this.statesActiveOnTerminals.get(index);
263+
public getStatesActiveOnTerminals(index: number, terminal: Terminal<T>) {
264+
if (this.statesActiveOnTerminals.has(index))
265+
return this.statesActiveOnTerminals.get(index).get(terminal);
266+
else
267+
return undefined;
279268
}
280269

281270
// public hasInnerScore(s: State<S, T>): boolean {

src/earley/parser.ts

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import {Grammar} from "../grammar/grammar";
2-
import {NonTerminal, Category, isNonTerminal} from "../grammar/category";
2+
import {NonTerminal, Category, isNonTerminal, Terminal, WordWithTypes} from "../grammar/category";
33
import {Rule} from "../grammar/rule";
44
import {State} from "./chart/state";
55
import {setViterbiScores, ViterbiScore} from "./chart/viterbi-score";
@@ -10,12 +10,12 @@ import {complete} from "./complete";
1010
import {ParseTree, addRightMost} from "./parsetree";
1111

1212
export function addState<S, T>(stateSets: Chart<T, S>,
13-
index: number,
14-
ruleStartPosition: number,
15-
ruleDotPosition: number,
16-
rule: Rule<T>,
17-
forward: S,
18-
inner: S): State<S, T> {
13+
index: number,
14+
ruleStartPosition: number,
15+
ruleDotPosition: number,
16+
rule: Rule<T>,
17+
forward: S,
18+
inner: S): State<S, T> {
1919
const state = stateSets.getOrCreate(index, ruleStartPosition, ruleDotPosition, rule);
2020
stateSets.setInnerScore(state, inner);
2121
stateSets.setForwardScore(state, forward);
@@ -87,9 +87,11 @@ export function getViterbiParseFromChart<S, T>(state: State<S, T>, chart: Chart<
8787
}
8888
}
8989

90+
91+
9092
export function parseSentenceIntoChart<S, T>(Start: NonTerminal,
91-
grammar: Grammar<T, S>,
92-
tokens: T[]): [Chart<T, S>, number, State<S, T>] {
93+
grammar: Grammar<T, S>,
94+
tokens: T[]): [Chart<T, S>, number, State<S, T>] {
9395
// ScanProbability scanProbability//TODO
9496

9597
const stateSets: Chart<T, S> = new Chart(grammar);
@@ -99,6 +101,18 @@ export function parseSentenceIntoChart<S, T>(Start: NonTerminal,
99101
// Rule.create(sr, 1.0, Category.START, S), 0
100102
// );
101103

104+
// Index words to their applicable terminals
105+
const wordToTypesMap = new Map<T, Terminal<T>[]>();
106+
const tokensWithWords: WordWithTypes<T>[] = tokens.map(word => {
107+
if (wordToTypesMap.has(word))
108+
return {types: wordToTypesMap.get(word), word};
109+
else {
110+
const types: Terminal<T>[] = grammar.terminals.filter((isOfType: Terminal<T>) => isOfType(word));
111+
wordToTypesMap.set(word, types);
112+
return {types, word};
113+
}
114+
});
115+
102116
const init = addState(
103117
stateSets, 0, 0, 0,
104118
{left: "<start>", right: [Start], probability: 1.0},
@@ -108,8 +122,8 @@ export function parseSentenceIntoChart<S, T>(Start: NonTerminal,
108122

109123
// Cycle through input
110124
let i = 0;
111-
tokens.forEach(
112-
(token: T) => {
125+
tokensWithWords.forEach(
126+
(token: WordWithTypes<T>) => {
113127
predict(i, grammar, stateSets);
114128
scan(i, token, grammar.probabilityMapping.semiring, stateSets);
115129
complete(i + 1, stateSets, grammar);
@@ -138,8 +152,8 @@ export interface ParseTreeWithScore<T> {
138152
}
139153

140154
export function getViterbiParse<S, T>(Start: NonTerminal,
141-
grammar: Grammar<T, S>,
142-
tokens: T[]): ParseTreeWithScore<T> {
155+
grammar: Grammar<T, S>,
156+
tokens: T[]): ParseTreeWithScore<T> {
143157
const [chart, ignored, init] = parseSentenceIntoChart(Start, grammar, tokens);
144158

145159
const finalState = chart.getOrCreate(

src/earley/scan.ts

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import {isNonTerminal} from "../grammar/category";
1+
import {isNonTerminal, WordWithTypes} from "../grammar/category";
22
import {Semiring} from "semiring";
33
import {Chart} from "./chart/chart";
44
import {getActiveCategory, State, advanceDot} from "./chart/state";
@@ -8,13 +8,14 @@ import {getActiveCategory, State, advanceDot} from "./chart/state";
88
* Handles a token scanned from the input string.
99
*
1010
* @param tokenPosition The start index of the scan.
11-
* @param token The token that was scanned.
11+
* @param word The token that was scanned.
12+
* @param types The grammar terminals whose predicate accepts the scanned word.
1213
* //@param scanProbability Function that provides the probability of scanning the given token at this position. Might be null for a probability of 1.0.
1314
* @param sr
1415
* @param stateSets
1516
*/
1617
export function scan<S, T>(tokenPosition: number,
17-
token: T,
18+
{word, types}: WordWithTypes<T>,
1819
// scanProbability:(x:T)=>number,//TODO
1920
sr: Semiring<S>,
2021
stateSets: Chart<T, S>) {
@@ -27,13 +28,14 @@ export function scan<S, T>(tokenPosition: number,
2728
* Get all states that are active on a terminal
2829
* O(|stateset(i)|) = O(|grammar|): For all states <code>i: X<sub>k</sub> → λ·tμ</code>, where t is a terminal that matches the given token...
2930
*/
30-
31-
const statesActiveOnTerminals: Set<State<S, T>> = stateSets.getStatesActiveOnTerminals(tokenPosition);
32-
if (statesActiveOnTerminals) statesActiveOnTerminals.forEach((preScanState: State<S, T>) => {
33-
const activeCategory = getActiveCategory(preScanState);
34-
if (isNonTerminal(activeCategory)) throw new Error("this is a bug");
35-
else {
36-
if (activeCategory(token)) { // TODO can this be more efficient, ie have tokens make their category be explicit? (Do we want to maintain the possibility of such "fluid" categories?)
31+
types.forEach(terminal => {
32+
const statesActiveOnTerminals: Set<State<S, T>> = stateSets.getStatesActiveOnTerminals(tokenPosition, terminal);
33+
if (statesActiveOnTerminals) statesActiveOnTerminals.forEach((preScanState: State<S, T>) => {
34+
const activeCategory = getActiveCategory(preScanState);
35+
if (isNonTerminal(activeCategory)) throw new Error("this is a bug");
36+
else {
37+
if (!activeCategory(word)) throw new Error("Index failed");
38+
// TODO can this be more efficient, i.e. have tokens make their category explicit? (Do we want to maintain the possibility of such "fluid" categories?)
3739
// Create the chart <code>i+1: X<sub>k</sub> → λt·μ</code>
3840
const preScanForward: S = stateSets.getForwardScore(preScanState);
3941
const preScanInner: S = stateSets.getInnerScore(preScanState);
@@ -42,7 +44,7 @@ export function scan<S, T>(tokenPosition: number,
4244
tokenPosition + 1, preScanState.ruleStartPosition,
4345
advanceDot(preScanState),
4446
preScanState.rule,
45-
token
47+
word
4648
);
4749

4850
const postScanForward = calculateForwardScore(sr, preScanForward, scanProb);
@@ -76,7 +78,7 @@ export function scan<S, T>(tokenPosition: number,
7678
forward: postScanForward
7779
});
7880
}
79-
}
81+
});
8082
});
8183
return changes;
8284
}

src/grammar/category.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@ export type Category<T> = Terminal<T> | NonTerminal;
22
export type Terminal<T> = (t: T) => boolean;
33
export type NonTerminal = string;
44

5+
export interface WordWithTypes<T> {
6+
types: Terminal<T>[];
7+
word: T;
8+
}
9+
510
export function isNonTerminal(element: any): element is NonTerminal {
611
return typeof element === "string";
712
}

src/grammar/grammar.ts

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import {Set, Map} from "core-js";
2-
import {NonTerminal, Category, isNonTerminal} from "./category";
2+
import {NonTerminal, Category, isNonTerminal, Terminal} from "./category";
33
import {Rule} from "./rule";
44
import {
55
getLeftCorners,
@@ -32,8 +32,9 @@ export interface ProbabilitySemiringMapping<Y> {
3232
export class Grammar<T, SemiringType> {
3333
readonly name: string;
3434
readonly ruleMap: Map<NonTerminal, Set<Rule<T>>>;
35-
readonly rules: Set<Rule<T>>;
36-
readonly nonTerminals: Set<NonTerminal>;
35+
readonly rules: Set<Rule<T>> = new Set<Rule<T>>();
36+
readonly nonTerminals: Set<NonTerminal> = new Set<NonTerminal>();
37+
readonly terminals: Terminal<T>[];
3738

3839
//
3940
// pre-compute some scores for efficient earley parsing
@@ -49,17 +50,16 @@ export class Grammar<T, SemiringType> {
4950
ruleMap: Map<NonTerminal, Set<Rule<T>>>,
5051
probabilityMapping: ProbabilitySemiringMapping<SemiringType>) {
5152
this.name = name;
52-
5353
this.ruleMap = ruleMap;
54-
this.nonTerminals = new Set<NonTerminal>();
55-
this.rules = new Set<Rule<T>>();
5654

5755
this.probabilityMapping = probabilityMapping;
5856
this.deferrableSemiring = makeDeferrable(probabilityMapping.semiring);
5957

6058
const values: IterableIterator<Set<Rule<T>>> = ruleMap.values();
6159

60+
6261
let done = false;
62+
const terminals = new Set<Terminal<T>>();
6363
while (!done) {
6464
const next: IteratorResult<Set<Rule<T>>> = values.next();
6565
done = next.done;
@@ -68,13 +68,17 @@ export class Grammar<T, SemiringType> {
6868
rulez.forEach(rule => {
6969
this.rules.add(rule);
7070
this.nonTerminals.add(rule.left);
71-
rule.right.filter(isNonTerminal).forEach((a: NonTerminal) =>
72-
this.nonTerminals.add(a)
73-
);
71+
rule.right.forEach((a: Category<T>) => {
72+
if (isNonTerminal(a))
73+
this.nonTerminals.add(a);
74+
else
75+
terminals.add(a);
76+
});
7477
}
7578
);
7679
}
7780
}
81+
this.terminals = Array.from(terminals);
7882

7983
const zero = 0.0;
8084
this.leftCorners = getLeftCorners(this.rules, zero);

0 commit comments

Comments
 (0)