Skip to content

Commit 35996aa

Browse files
committed
Skip audio files when extracting symbol references
Added checks to ignore audio file references in symbol extraction logic, preventing audio files from being treated as symbols in various fields and wordlists.
1 parent 53dcbe1 commit 35996aa

2 files changed

Lines changed: 350 additions & 0 deletions

File tree

Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
#!/usr/bin/env node
2+
3+
const fs = require('fs');
4+
const path = require('path');
5+
6+
/**
 * Create a master spreadsheet of all unique symbols across vocabularies.
 * Aggregates data from the individual vocabulary CSV files.
 *
 * Output columns:
 * - symbol-id
 * - total-count: Total times the symbol is used across all vocabs
 * - vocab-count: Number of vocabularies using this symbol
 * - vocabs: List of vocabularies using this symbol
 * - page-count: Number of distinct pages where the symbol appears
 * - pages: List of pages where the symbol appears (with vocab prefix; first 50 listed)
 * - unique-labels: List of unique cell labels for this symbol (first 20 listed)
 * - action-types: List of action types (SPEAK, NAVIGATE, OTHER)
 */
18+
19+
/**
 * Read a CSV file and turn its rows into objects keyed by header name.
 *
 * The first non-blank line is treated as the header row. Blank lines are
 * ignored, and any row whose field count differs from the header count is
 * skipped. Fields are split with `parseCSVLine`, so quoted fields may contain
 * commas and doubled quotes, but not line breaks.
 *
 * @param {string} filePath - Path of the UTF-8 CSV file to read.
 * @returns {{headers: string[], data: Object[]}} The header names and one
 *   object per accepted row. Both are empty when the file has no content.
 * @throws {Error} Whatever `fs.readFileSync` throws if the file cannot be read.
 */
function parseCSV(filePath) {
  // Drop a leading UTF-8 BOM; otherwise the first header name would start
  // with \uFEFF and lookups by that column name would silently miss.
  const content = fs.readFileSync(filePath, 'utf8').replace(/^\uFEFF/, '');

  // Accept both LF and CRLF line endings so a trailing "\r" never ends up
  // inside the last header name or the last value of a row.
  const lines = content.split(/\r?\n/).filter((line) => line.trim());

  // An empty or whitespace-only file has no header row to parse.
  if (lines.length === 0) {
    return { headers: [], data: [] };
  }

  const headers = parseCSVLine(lines[0]);
  const data = [];

  for (const line of lines.slice(1)) {
    const values = parseCSVLine(line);
    // Skip malformed rows rather than mis-aligning values against headers.
    if (values.length !== headers.length) continue;

    const row = {};
    headers.forEach((header, idx) => {
      row[header] = values[idx];
    });
    data.push(row);
  }

  return { headers, data };
}
38+
39+
/**
 * Split a single CSV line into its field values.
 *
 * A comma separates fields unless it appears inside double quotes. Inside a
 * quoted section, two consecutive double quotes stand for one literal quote.
 * The surrounding quotes themselves are not included in the returned values.
 *
 * @param {string} line - One line of CSV text (without its line terminator).
 * @returns {string[]} The field values in order. An empty string yields a
 *   single empty field; `null`/`undefined` yields an empty array.
 */
function parseCSVLine(line) {
  // A missing line (e.g. the header of an empty file) has no fields at all.
  if (line == null) return [];

  // Remove the carriage return left behind when a CRLF file is split on "\n",
  // so it does not become part of the last field.
  const text = String(line).replace(/\r$/, '');

  const values = [];
  let current = '';
  let inQuotes = false;

  for (let i = 0; i < text.length; i++) {
    const char = text[i];

    if (char === '"') {
      if (inQuotes && text[i + 1] === '"') {
        // Escaped quote: emit one quote and skip its twin.
        current += '"';
        i++;
      } else {
        // Opening or closing quote of a quoted section.
        inQuotes = !inQuotes;
      }
    } else if (char === ',' && !inQuotes) {
      // Field separator outside quotes ends the current field.
      values.push(current);
      current = '';
    } else {
      current += char;
    }
  }

  // The last field has no trailing comma, so flush it explicitly.
  values.push(current);
  return values;
}
69+
70+
/**
 * Build a per-symbol usage index from one or more vocabulary CSV files.
 *
 * Each file is read with `parseCSV`. Rows without a `symbol-id` value are
 * ignored. For every remaining row the columns `vocab`, `page`, `cell-label`
 * and `cell-actions` are folded into the entry for that symbol.
 *
 * @param {string[]} csvFiles - Paths of the CSV files to aggregate.
 * @returns {Map<string, Object>} Map from symbol id to an entry with:
 *   `symbolId`, `totalCount` (number of rows), `vocabs` (Set of vocab names),
 *   `pages` (Set of `"vocab:page"` strings), `labels` (Set of non-empty cell
 *   labels), `actions` (Set of `'SPEAK'` / `'NAVIGATE'` / `'OTHER'`) and
 *   `byVocab` (vocab name -> `{ count, pages: Set }`).
 */
function aggregateSymbols(csvFiles) {
  const symbolMap = new Map();

  for (const csvFile of csvFiles) {
    console.log(`Processing: ${csvFile}`);
    const { data } = parseCSV(csvFile);

    for (const row of data) {
      const symbolId = row['symbol-id'];
      if (!symbolId) continue;

      let entry = symbolMap.get(symbolId);
      if (!entry) {
        entry = {
          symbolId,
          totalCount: 0,
          vocabs: new Set(),
          pages: new Set(),
          labels: new Set(),
          actions: new Set(),
          // Null-prototype dictionary: vocab names come from the CSV data, so
          // a name such as "toString" or "constructor" must not resolve to an
          // inherited Object.prototype member.
          byVocab: Object.create(null),
        };
        symbolMap.set(symbolId, entry);
      }

      const vocab = row['vocab'] || 'Unknown';
      const page = row['page'] || 'Unknown';
      const label = row['cell-label'] || '';
      const action = row['cell-actions'] || '';

      entry.totalCount++;
      entry.vocabs.add(vocab);

      // Prefix the page with its vocab so equal page names stay distinct.
      entry.pages.add(`${vocab}:${page}`);

      if (label) entry.labels.add(label);

      // Reduce the action string to its type (SPEAK, NAVIGATE, OTHER);
      // an empty value or the literal "(none)" records no action.
      if (action.startsWith('SPEAK:')) {
        entry.actions.add('SPEAK');
      } else if (action.startsWith('NAVIGATE:')) {
        entry.actions.add('NAVIGATE');
      } else if (action && action !== '(none)') {
        entry.actions.add('OTHER');
      }

      // Per-vocab tally; `page` is never empty here because it defaults to 'Unknown'.
      if (!entry.byVocab[vocab]) {
        entry.byVocab[vocab] = { count: 0, pages: new Set() };
      }
      entry.byVocab[vocab].count++;
      entry.byVocab[vocab].pages.add(page);
    }
  }

  return symbolMap;
}
127+
128+
/**
 * Write the master CSV: one row per symbol with its usage totals and the
 * vocabularies, pages, labels and action types it was seen with.
 *
 * Rows are ordered by `totalCount` descending, ties broken by `symbolId`
 * (via `localeCompare`). The page list shows at most 50 entries and the label
 * list at most 20; when entries are cut off, " ... (+N more)" is appended.
 *
 * @param {Map<string, Object>} symbolMap - Entries with `symbolId`,
 *   `totalCount` and the Sets `vocabs`, `pages`, `labels`, `actions`.
 * @param {string} outputFile - Path of the CSV file to write (UTF-8).
 * @returns {void}
 */
function generateMasterCSV(symbolMap, outputFile) {
  const PAGE_LIMIT = 50;
  const LABEL_LIMIT = 20;

  // Wrap a value in double quotes, doubling embedded quotes; falsy -> "".
  const quote = (value) => {
    if (!value) return '""';
    return `"${String(value).replace(/"/g, '""')}"`;
  };

  // Sorted "; "-joined list of a Set's members.
  const joinSorted = (set) => Array.from(set).sort().join('; ');

  // Like joinSorted but capped at `limit` items, noting how many were dropped.
  const joinCapped = (set, limit) => {
    const shown = Array.from(set).sort().slice(0, limit).join('; ');
    return set.size > limit ? `${shown} ... (+${set.size - limit} more)` : shown;
  };

  const headerLine = [
    'symbol-id',
    'total-count',
    'vocab-count',
    'vocabs',
    'page-count',
    'pages',
    'unique-labels',
    'action-types',
  ].join(',');

  // Most-used symbols first; equal counts fall back to symbol-id order.
  const ordered = Array.from(symbolMap.values()).sort((a, b) =>
    b.totalCount !== a.totalCount
      ? b.totalCount - a.totalCount
      : a.symbolId.localeCompare(b.symbolId)
  );

  const dataLines = ordered.map((entry) =>
    [
      quote(entry.symbolId),
      entry.totalCount,
      entry.vocabs.size,
      quote(joinSorted(entry.vocabs)),
      entry.pages.size,
      quote(joinCapped(entry.pages, PAGE_LIMIT)),
      quote(joinCapped(entry.labels, LABEL_LIMIT)),
      quote(joinSorted(entry.actions)),
    ].join(',')
  );

  fs.writeFileSync(outputFile, [headerLine, ...dataLines].join('\n'), 'utf8');
  console.log(`\nWrote ${ordered.length} unique symbols to ${outputFile}`);
}
187+
188+
/**
 * Write a companion CSV showing, for every symbol, how often it is used in
 * each vocabulary.
 *
 * Columns are `symbol-id`, `total-count`, one count column per vocabulary
 * (as returned by `getUniqueVocabs`), and `all-vocabs` (a ", "-joined list).
 * Rows are ordered by `totalCount` descending, ties broken by `symbolId`, the
 * same ordering the master sheet uses.
 *
 * The file is written next to `outputFile` with "-by-vocab" inserted before
 * the extension, e.g. `dir/master.csv` -> `dir/master-by-vocab.csv`.
 *
 * @param {Map<string, Object>} symbolMap - Entries with `symbolId`,
 *   `totalCount`, `vocabs` (Set) and `byVocab` (vocab -> `{ count }`).
 * @param {string} outputFile - Path of the master CSV the name is derived from.
 * @returns {void}
 */
function generateByVocabCSV(symbolMap, outputFile) {
  // Wrap a value in double quotes, doubling embedded quotes; falsy -> "".
  const quote = (value) => {
    if (!value) return '""';
    return `"${String(value).replace(/"/g, '""')}"`;
  };

  // Header cells are left bare (as before) unless the name contains a
  // character that would otherwise break the CSV structure.
  const headerCell = (name) => (/[",\r\n]/.test(name) ? quote(name) : name);

  const vocabNames = getUniqueVocabs(symbolMap);
  const headers = ['symbol-id', 'total-count', ...vocabNames, 'all-vocabs'];
  const rows = [headers.map(headerCell).join(',')];

  const sortedSymbols = Array.from(symbolMap.values()).sort((a, b) =>
    b.totalCount !== a.totalCount
      ? b.totalCount - a.totalCount
      : a.symbolId.localeCompare(b.symbolId)
  );

  for (const entry of sortedSymbols) {
    // One count per vocab column; vocabs that never use the symbol get 0.
    const counts = vocabNames.map((vocab) => entry.byVocab[vocab]?.count || 0);

    rows.push(
      [
        quote(entry.symbolId),
        entry.totalCount,
        ...counts,
        quote(Array.from(entry.vocabs).sort().join(', ')),
      ].join(',')
    );
  }

  // Derive the output name from the parsed path instead of a plain string
  // replace: that replaced the first ".csv" anywhere in the path and, when
  // no ".csv" was present, left the name unchanged and overwrote the master file.
  const { dir, name, ext } = path.parse(outputFile);
  const byVocabFile = path.format({
    dir,
    name: `${name}-by-vocab`,
    ext: ext || '.csv',
  });

  fs.writeFileSync(byVocabFile, rows.join('\n'), 'utf8');
  console.log(`Wrote by-vocab breakdown to ${byVocabFile}`);
}
231+
232+
/**
 * Collect every vocabulary name that appears as a key of any entry's
 * `byVocab` object.
 *
 * @param {Map<string, Object>} symbolMap - Entries each carrying a `byVocab` object.
 * @returns {string[]} The distinct vocabulary names in default sort order.
 */
function getUniqueVocabs(symbolMap) {
  const allNames = Array.from(symbolMap.values()).flatMap((entry) =>
    Object.keys(entry.byVocab)
  );
  // A Set removes the duplicates contributed by symbols that share a vocab.
  return [...new Set(allNames)].sort();
}
241+
242+
/**
 * Compute headline statistics for an aggregated symbol map.
 *
 * Each symbol is attributed to a library derived from its id:
 * `[name]...` -> `name`, `embedded:...` -> `'embedded'`,
 * `image:...` -> `'image-ref'`, anything else -> `'unknown'`.
 *
 * @param {Map<string, Object>} symbolMap - Entries with `symbolId`,
 *   `totalCount` and `vocabs` (iterable of vocab names).
 * @returns {{totalSymbols: number, totalUsages: number,
 *   libraryCounts: Object<string, number>, vocabCounts: Object<string, number>,
 *   avgUsagesPerSymbol: string}} Counts of unique symbols per library and per
 *   vocabulary, plus the average usages per symbol formatted to two decimals
 *   (`'0.00'` for an empty map).
 */
function generateSummary(symbolMap) {
  const entries = Array.from(symbolMap.values());
  const totalSymbols = symbolMap.size;
  const totalUsages = entries.reduce((sum, e) => sum + e.totalCount, 0);

  // Tally in Maps: library and vocab names come from data, and a name such
  // as "constructor" would collide with an inherited member of a plain object.
  const libraryTally = new Map();
  const vocabTally = new Map();
  const bump = (tally, key) => tally.set(key, (tally.get(key) || 0) + 1);

  for (const entry of entries) {
    let library = 'unknown';
    const bracketed = entry.symbolId.match(/^\[([^\]]+)\]/);
    if (bracketed) {
      library = bracketed[1];
    } else if (entry.symbolId.startsWith('embedded:')) {
      library = 'embedded';
    } else if (entry.symbolId.startsWith('image:')) {
      library = 'image-ref';
    }
    bump(libraryTally, library);

    for (const vocab of entry.vocabs) {
      bump(vocabTally, vocab);
    }
  }

  return {
    totalSymbols,
    totalUsages,
    // Callers receive plain objects, exactly as before.
    libraryCounts: Object.fromEntries(libraryTally),
    vocabCounts: Object.fromEntries(vocabTally),
    // Avoid 0 / 0 -> "NaN" when there are no symbols at all.
    avgUsagesPerSymbol:
      totalSymbols === 0 ? '0.00' : (totalUsages / totalSymbols).toFixed(2),
  };
}
276+
277+
/**
 * Command-line entry point: aggregate the known vocabulary symbol sheets in
 * `sheets-for-daisy`, write the master and by-vocab CSVs there, and print a
 * summary to the console.
 *
 * Only sheets from the fixed list below that exist on disk are used. If none
 * exist, an error is printed and the process exits with status 1.
 *
 * @returns {void}
 */
function main() {
  const csvDir = 'sheets-for-daisy';
  const expectedSheets = [
    'Super Core 30-symbols-cleaned.csv',
    'Super Core 50-symbols-cleaned.csv',
    'Aphasia Duo 16-symbols-cleaned.csv',
    'Aphasia Duo 9-symbols-cleaned.csv',
    'Voco Chat-symbols-cleaned.csv',
  ];

  // Keep only the sheets the extract script has actually produced.
  const csvFiles = [];
  for (const sheet of expectedSheets) {
    const candidate = path.join(csvDir, sheet);
    if (fs.existsSync(candidate)) {
      csvFiles.push(candidate);
    }
  }

  if (csvFiles.length === 0) {
    console.error('No symbol CSV files found. Run extract-symbols-with-context.js first.');
    process.exit(1);
  }

  console.log(`Found ${csvFiles.length} vocabulary CSV files to aggregate\n`);

  const symbolMap = aggregateSymbols(csvFiles);

  // Both output sheets derive their location from the master file path.
  const masterFile = path.join(csvDir, 'master-symbols-all-vocabs.csv');
  generateMasterCSV(symbolMap, masterFile);
  generateByVocabCSV(symbolMap, masterFile);

  const { totalSymbols, totalUsages, avgUsagesPerSymbol, libraryCounts, vocabCounts } =
    generateSummary(symbolMap);

  // Print a name -> count table, largest count first.
  const printRanked = (counts) => {
    const ranked = Object.entries(counts).sort((a, b) => b[1] - a[1]);
    ranked.forEach(([name, count]) => {
      console.log(` ${name}: ${count} symbols`);
    });
  };

  console.log('\n=== Master Summary ===');
  console.log(`Total unique symbols across all vocabs: ${totalSymbols}`);
  console.log(`Total symbol usages: ${totalUsages}`);
  console.log(`Average usages per symbol: ${avgUsagesPerSymbol}`);

  console.log('\nBy symbol library:');
  printRanked(libraryCounts);

  console.log('\nBy vocabulary:');
  printRanked(vocabCounts);
}
322+
323+
if (require.main === module) {
324+
main();
325+
}
326+
327+
module.exports = { aggregateSymbols, generateMasterCSV, generateSummary };

scripts/utilities/extract-symbols-with-context.js

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,38 @@ const { GridsetProcessor } = require('../../dist/processors');
1515
* node extract-symbols-with-context.js "/path/to/Super Core.gridset" "super-core-symbols.csv"
1616
*/
1717

18+
/**
 * Report whether a file reference ends with a known audio file extension.
 * The comparison is case-insensitive.
 *
 * @param {*} path - The reference to inspect; expected to be a string.
 * @returns {boolean} `true` when `path` is a non-empty string ending in one
 *   of the listed audio extensions, otherwise `false`.
 */
function isAudioFile(path) {
  // Anything that is not a non-empty string cannot be an audio path; guarding
  // on the type also prevents a TypeError from calling toLowerCase on a
  // truthy non-string value.
  if (typeof path !== 'string' || path === '') return false;

  const lower = path.toLowerCase();
  const audioExtensions = ['.mp3', '.wav', '.m4a', '.ogg', '.aac', '.wma', '.flac', '.mp4a', '.aiff', '.au'];
  return audioExtensions.some((ext) => lower.endsWith(ext));
}
24+
1825
function getSymbolId(button) {
1926
// Construct symbol reference from symbol library and path
2027
if (button.symbolLibrary) {
2128
const lib = button.symbolLibrary;
2229
const symPath = button.symbolPath || '';
30+
// Skip if the symbol path is actually an audio file
31+
if (isAudioFile(symPath)) {
32+
return '';
33+
}
2334
return `[${lib}]${symPath}`;
2435
}
2536
// For embedded images, use the resolved entry path
2637
if (button.resolvedImageEntry) {
38+
// Skip audio files
39+
if (isAudioFile(button.resolvedImageEntry)) {
40+
return '';
41+
}
2742
return `embedded:${button.resolvedImageEntry}`;
2843
}
2944
// Fallback to image field
3045
if (button.image) {
46+
// Skip if it's actually an audio file reference
47+
if (isAudioFile(button.image)) {
48+
return '';
49+
}
3150
return `image:${button.image}`;
3251
}
3352
return '';
@@ -96,6 +115,10 @@ function extractSymbolUsage(gridsetFile, vocabName) {
96115

97116
while ((match = wordListRegex.exec(content)) !== null) {
98117
const symbolRef = match[1];
118+
// Skip audio files in wordlists
119+
if (isAudioFile(symbolRef)) {
120+
continue;
121+
}
99122
const symbolId = symbolRef.startsWith('[') ? symbolRef : `[${symbolRef}]`;
100123

101124
// Extract the word from the Text element

0 commit comments

Comments
 (0)