-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathusjBookExtractor.ts
More file actions
291 lines (269 loc) · 10.6 KB
/
usjBookExtractor.ts
File metadata and controls
291 lines (269 loc) · 10.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
/** @file Extracts {@link RawBook} from a papi USJ book response. */
/**
* Plain text of a single verse extracted from a USJ document, ready to be tokenized.
*
* One entry is produced per USJ `verse` marker encountered during extraction.
*/
export interface RawVerse {
/**
* SID from the USJ verse marker, e.g. `"GEN 1:1"`. Parsed into `Segment.startRef` / `endRef`.
* Extraction rejects a document that repeats a SID, so each SID occurs at most once per book.
*/
sid: string;
/**
* Accumulated plain-text content of the verse. Note and footnote content is excluded, as is the
* text of heading-class paragraphs, and trailing whitespace is trimmed when the verse is closed.
* Becomes `Segment.baselineText`; token `charStart` / `charEnd` are expressed relative to this
* string. May be the empty string when the verse marker is followed by no text.
*/
text: string;
}
/**
* Raw book data captured from a papi USJ response. Self-contained — everything the tokenizer needs
* to produce `Book → Segment → Token`.
*/
export interface RawBook {
/** 3-letter book code, e.g. `"GEN"`, taken from the `code` attribute of the USJ `book` marker. */
bookCode: string;
/**
* BCP 47 writing system tag for the baseline text, from `platform.languageTag`. Supplied by the
* caller of the extractor and stored unchanged.
*/
writingSystem: string;
/**
* 32-bit FNV-1a hash of the USJ content, serialized with object keys in sorted order, rendered as
* an 8-character lowercase hex string. Becomes `Book.textVersion`.
*/
contentHash: string;
/** Verse entries in document order, one per USJ `verse` marker. */
verses: RawVerse[];
}
// ---------------------------------------------------------------------------
// Minimal local types for USJ traversal.
// @eten-tech-foundation/scripture-utilities is not a direct dependency of this
// extension, so we define the subset we need here.
// ---------------------------------------------------------------------------
/**
* A USJ content item: either a plain text string or a marker node. Strings carry literal text;
* marker nodes may themselves hold further content items, so the structure is recursive.
*/
type MarkerContent = string | UsjNode;
/**
* A USJ marker node. Only the fields used during extraction are declared; every field except `type`
* is optional because its presence depends on the node type.
*/
interface UsjNode {
/** Node type string (e.g. `"book"`, `"chapter"`, `"verse"`, `"para"`, `"note"`). */
type: string;
/**
* USFM marker (e.g. `"p"`, `"s1"`, `"q"`). Present on `para` and `note` nodes. This module only
* reads it on `para` nodes, to decide whether the paragraph is a heading.
* NOTE(review): other node types presumably carry a marker as well — confirm against the USJ spec.
*/
marker?: string;
/**
* Chapter or verse number string. Present on `chapter` nodes. Not read by this module.
* NOTE(review): presumably also present on `verse` nodes — confirm against the USJ spec.
*/
number?: string;
/** 3-letter book code. Present on `book` nodes. */
code?: string;
/**
* Verse or chapter SID. Present on `verse` nodes (e.g. `"GEN 1:1"`) and `chapter` nodes (e.g.
* `"GEN 1"`). Only the verse SID is consumed during extraction.
*/
sid?: string;
/** Child content items (strings or nested nodes). Absent on nodes that have no children. */
content?: MarkerContent[];
}
/**
* Minimal shape of a USJ document as returned by the papi `platformScripture.USJ_Book` provider.
* Only the top-level content array is declared because it is the only part read during extraction.
*/
export interface UsjDocument {
/** Top-level content items of the document, in document order. */
content: MarkerContent[];
}
// ---------------------------------------------------------------------------
// Implementation
// ---------------------------------------------------------------------------
/**
 * Para markers whose content is not part of the verse baseline text (headings, titles, spacing,
 * speaker IDs, acrostic headings, etc.). Verse-content para markers (p, m, pi, q*, etc.) are absent
 * from this set and have their text accumulated as usual.
 *
 * The main-title markers (`mt*`, `mte*`) are listed alongside their introduction counterparts
 * (`imt*`, `imte*`). A `\mte` "major title at ending" paragraph follows the last verse of a book,
 * so leaving it out would append the closing title to that verse's text.
 */
const HEADING_PARA_MARKERS = new Set([
  // Main titles, at the start (mt) and at the ending (mte) of a book
  'mt',
  'mt1',
  'mt2',
  'mt3',
  'mte',
  'mte1',
  'mte2',
  // Major section headings and reference ranges
  'ms',
  'ms1',
  'ms2',
  'ms3',
  'mr',
  // Section headings, reference ranges, and descriptive titles
  's',
  's1',
  's2',
  's3',
  's4',
  'sr',
  'r',
  'd',
  // Speaker, acrostic heading, blank lines
  'sp',
  'qa',
  'b',
  'ib',
  // Introduction headings
  'imt',
  'imt1',
  'imt2',
  'imt3',
  'imte',
  'imte1',
  'imte2',
  'is',
  'is1',
  'is2',
]);
/**
* Mutable state threaded through the recursive USJ traversal. One instance is created per
* extraction and updated in place by the node handlers.
*/
interface TraversalState {
/**
* 3-letter book code captured from the `book` marker (e.g. `"GEN"`). Starts as the empty string,
* which the extractor treats as "no book marker found".
*/
bookCode: string;
/** Verse SIDs seen so far; used to reject duplicates. */
seenVerseIds: Set<string>;
/**
* The verse currently being accumulated; `undefined` when outside a verse scope. Text strings
* encountered while this is `undefined` are discarded.
*/
currentVerse: { sid: string; text: string } | undefined;
/** Completed verses in document order. A verse is appended here when it is closed. */
verses: RawVerse[];
}
/**
 * Handler for `book` nodes: records the book code and then walks the node's children.
 *
 * A node without a (non-empty) `code` leaves any previously recorded book code untouched.
 *
 * @param node - The `book` USJ node; `node.code` is the 3-letter book code.
 * @param state - Shared traversal state updated in place.
 */
function handleBookNode(node: UsjNode, state: TraversalState): void {
  const { code, content } = node;
  if (code) {
    state.bookCode = code;
  }
  if (content) {
    traverse(content, state);
  }
}
/**
 * Handler for `chapter` nodes. A chapter boundary ends whatever verse is open: its text is
 * right-trimmed, it is appended to the completed verses, and the traversal returns to the
 * "outside a verse" state. The chapter's own content (if any) is then walked.
 *
 * @param node - The `chapter` USJ node.
 * @param state - Shared traversal state updated in place.
 */
function handleChapterNode(node: UsjNode, state: TraversalState): void {
  const openVerse = state.currentVerse;
  if (openVerse !== undefined) {
    openVerse.text = openVerse.text.trimEnd();
    state.verses.push(openVerse);
    state.currentVerse = undefined;
  }

  const { content } = node;
  if (content) {
    traverse(content, state);
  }
}
/**
 * Handler for `verse` nodes: finishes the verse that was open (if any) and starts accumulating a
 * new one keyed by the node's SID.
 *
 * The previous verse is closed before the new marker is validated, so the order of operations is:
 * close previous, validate SID, register SID, open new verse, walk children.
 *
 * @param node - The `verse` USJ node; must carry a `sid` attribute (e.g. `"GEN 1:1"`).
 * @param state - Shared traversal state updated in place.
 * @throws {SyntaxError} If the `verse` node is missing its required `sid` attribute.
 * @throws {SyntaxError} If the `verse` SID has already been seen (duplicate verse SID).
 */
function handleVerseNode(node: UsjNode, state: TraversalState): void {
  const previousVerse = state.currentVerse;
  if (previousVerse !== undefined) {
    previousVerse.text = previousVerse.text.trimEnd();
    state.verses.push(previousVerse);
  }

  const { sid, content } = node;
  if (!sid) {
    throw new SyntaxError('Invalid USJ: verse marker missing required sid attribute');
  }
  if (state.seenVerseIds.has(sid)) {
    throw new SyntaxError(`Invalid USJ: duplicate verse SID "${sid}"`);
  }

  state.seenVerseIds.add(sid);
  state.currentVerse = { sid, text: '' };

  if (content) {
    traverse(content, state);
  }
}
/**
 * Handler for `para` nodes.
 *
 * Paragraphs whose marker is listed in {@link HEADING_PARA_MARKERS} contribute nothing and are not
 * descended into. For every other paragraph, a single space is appended to the open verse first
 * when that verse already has text that does not end in a space, so that text from consecutive
 * paragraphs does not run together; the paragraph's children are then walked.
 *
 * @param node - The `para` USJ node; `node.marker` determines whether to skip or recurse.
 * @param state - Shared traversal state updated in place.
 */
function handleParaNode(node: UsjNode, state: TraversalState): void {
  const { marker, content } = node;
  if (marker && HEADING_PARA_MARKERS.has(marker)) {
    return;
  }

  const openVerse = state.currentVerse;
  if (openVerse !== undefined) {
    const soFar = openVerse.text;
    if (soFar.length > 0 && !soFar.endsWith(' ')) {
      openVerse.text = `${soFar} `;
    }
  }

  if (content) {
    traverse(content, state);
  }
}
/**
 * Lookup from a USJ node `type` to the function that processes nodes of that type. Node types
 * without an entry have no dedicated handler.
 */
const NODE_HANDLERS: Partial<Record<string, (node: UsjNode, state: TraversalState) => void>> = {
  // Structural markers that open or close verse scopes
  book: handleBookNode,
  chapter: handleChapterNode,
  verse: handleVerseNode,
  // Paragraphs: heading-class ones are dropped, the rest are descended into
  para: handleParaNode,
  // Notes/footnotes are not part of the baseline text, so the handler deliberately does nothing
  note: () => {},
};
/**
 * Walks a USJ content array depth-first, feeding text into the verse that is currently open.
 *
 * - A string item is appended to the open verse, or dropped when no verse is open.
 * - A node whose `type` has an own entry in `NODE_HANDLERS` is passed to that handler, which
 *   decides whether to descend further.
 * - Any other node is transparent: its children (if any) are walked as if they were inline.
 *
 * @param nodes - Content items to walk (`string` or {@link UsjNode}).
 * @param state - Shared mutable state updated in place during traversal.
 * @throws {SyntaxError} If any verse node encountered during traversal is missing its `sid`
 *   attribute or contains a duplicate SID (propagated from {@link handleVerseNode}).
 */
function traverse(nodes: MarkerContent[], state: TraversalState): void {
  nodes.forEach((item) => {
    if (typeof item !== 'string') {
      // Own-property check keeps inherited keys such as "constructor" from matching a handler.
      const handler = Object.hasOwn(NODE_HANDLERS, item.type)
        ? NODE_HANDLERS[item.type]
        : undefined;
      if (handler) {
        handler(item, state);
        return;
      }
      if (item.content) {
        traverse(item.content, state);
      }
      return;
    }

    const openVerse = state.currentVerse;
    if (openVerse !== undefined) {
      openVerse.text += item;
    }
  });
}
/**
 * Deterministic JSON serialization with keys sorted by UTF-16 code-unit order.
 *
 * Produces the same output regardless of engine locale, making the result safe to feed into a hash
 * function. Arrays preserve their original order; only object keys are sorted. Object properties
 * whose value is `undefined` are omitted; `undefined` array elements and array holes serialize as
 * `null`, matching `JSON.stringify`.
 *
 * Objects are detected with `typeof` rather than `instanceof Object`, so objects without a
 * prototype (`Object.create(null)`) and objects created in another realm also get sorted keys
 * instead of falling through to insertion-ordered `JSON.stringify`.
 *
 * Intended for plain JSON-shaped structures only; does not special-case Date, Map, Set, or RegExp.
 *
 * @param value - Any JSON-serializable value.
 * @returns A stable JSON string with object keys in UTF-16 code-unit order.
 */
function stableStringify(value: unknown): string {
  /* v8 ignore next -- defensive guard; production callers never pass undefined directly */
  if (value === undefined) return 'null';
  // Primitives are delegated to JSON.stringify for correct quoting and number formatting.
  if (typeof value !== 'object' && typeof value !== 'function') return JSON.stringify(value);
  if (value === null) return 'null';
  if (Array.isArray(value)) {
    // Array.from visits holes as undefined (unlike map, which skips them), keeping the output
    // valid JSON for sparse arrays.
    return `[${Array.from(value, (item) => stableStringify(item)).join(',')}]`;
  }
  const sorted = Object.entries(value)
    .filter(([, v]) => v !== undefined)
    // Relational comparison of strings is by UTF-16 code unit, hence locale-independent.
    .sort(([a], [b]) => +(a > b) - +(a < b))
    .map(([k, v]) => `${JSON.stringify(k)}:${stableStringify(v)}`);
  return `{${sorted.join(',')}}`;
}
/**
 * 32-bit FNV-1a digest of a string — sufficient for one-way internal content versioning.
 *
 * Each round folds in one Unicode code point (not one byte), so a surrogate pair contributes a
 * single round while a lone surrogate contributes its own code unit.
 *
 * @param s - String to hash.
 * @returns Lowercase hex string of the unsigned 32-bit FNV-1a digest, zero-padded to 8 characters.
 */
function fnv1a32(s: string): string {
  const offsetBasis = 2166136261;
  const prime = 16777619;

  let digest = offsetBasis;
  let index = 0;
  while (index < s.length) {
    /* v8 ignore next -- codePointAt is always defined for an in-range index */
    const codePoint = s.codePointAt(index) ?? 0;
    // eslint-disable-next-line no-bitwise
    digest = Math.imul(digest ^ codePoint, prime);
    // Code points above U+FFFF occupy two UTF-16 code units; all others occupy one.
    index += codePoint > 0xffff ? 2 : 1;
  }

  // Math.imul yields a signed 32-bit value; the unsigned shift reinterprets it as unsigned.
  // eslint-disable-next-line no-bitwise
  return (digest >>> 0).toString(16).padStart(8, '0');
}
/**
 * Builds a {@link RawBook} from the USJ document of a single book.
 *
 * The content hash is computed over `usj.content` first. The document is then walked once: every
 * `verse` marker yields one {@link RawVerse} whose text is the concatenation of the strings found
 * in that verse's scope, with `note` nodes ignored. A verse marker followed by no text yields an
 * entry with `text: ""`. The verse still open at the end of the document is closed (right-trimmed
 * and appended) before the result is assembled.
 *
 * @param usj - USJ document returned by `useProjectData('platformScripture.USJ_Book', ...)`.
 * @param writingSystem - BCP 47 tag for the baseline, from `platform.languageTag`.
 * @returns A `RawBook` with `bookCode`, `writingSystem`, `contentHash`, and `verses` populated.
 * @throws {SyntaxError} If no `book` marker with a `code` attribute is found in the document.
 */
export function extractBookFromUsj(usj: UsjDocument, writingSystem: string): RawBook {
  const { content } = usj;
  const contentHash = fnv1a32(stableStringify(content));

  const state: TraversalState = {
    bookCode: '',
    seenVerseIds: new Set<string>(),
    currentVerse: undefined,
    verses: [],
  };
  traverse(content, state);

  // Flush the verse left open when the document ends.
  const { currentVerse: lastVerse, verses, bookCode } = state;
  if (lastVerse !== undefined) {
    lastVerse.text = lastVerse.text.trimEnd();
    verses.push(lastVerse);
  }

  if (!bookCode) {
    throw new SyntaxError('Invalid USJ: no book marker with a code attribute found');
  }

  return { bookCode, writingSystem, contentHash, verses };
}