-
Notifications
You must be signed in to change notification settings - Fork 67.1k
Expand file tree
/
Copy pathget-data.ts
More file actions
396 lines (363 loc) · 14.5 KB
/
get-data.ts
File metadata and controls
396 lines (363 loc) · 14.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
import fs from 'fs'
import path from 'path'
import yaml from 'js-yaml'
import matter from '@gr2m/gray-matter'
import { merge, get } from 'lodash-es'
import languages from '@/languages/lib/languages-server'
import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content'
import { createLogger } from '@/observability/logger'
// Module-level logger, created from this module's URL (`import.meta.url`).
const logger = createLogger(import.meta.url)
// An error that carries a `mark` property. This module treats the presence
// of a truthy `mark` (together with a message) as the sign of a YAML
// parsing failure.
// `mark` is only ever checked for truthiness here, so it is typed `unknown`
// instead of `any`: nothing can reach into it without narrowing first.
interface YAMLException extends Error {
  mark?: unknown
}
// An error that may carry a string `code`. This module compares that code
// against 'ENOENT' to recognize reads of files that don't exist.
interface FileSystemError extends Error {
  code?: string
}
// Opt-in debugging aid: run `export DEBUG_JIT_DATA_READS=true` in your
// terminal and, from the next start on, every file this module reads from
// disk gets logged.
// The environment value is parsed as JSON and coerced to a boolean; when it
// is unset or empty, 'false' is parsed instead.
const DEBUG_JIT_DATA_READS = !!JSON.parse(process.env.DEBUG_JIT_DATA_READS || 'false')
// Data files that must never be taken from a translation. Reads of these
// paths are pointed at the English root right away, which is safer than
// trying to wrangle the translations into leaving them untranslated.
const ALWAYS_ENGLISH_YAML_FILES: Set<string> = new Set(['data/variables/product.yml'])
const ALWAYS_ENGLISH_MD_FILES: Set<string> = new Set([
  'data/reusables/ssh/fingerprints.md',
  'data/reusables/ssh/known_hosts.md',
])
// Loads everything found under a dotted path inside `data/` for one
// language and returns it as a nested object (see getDeepDataByDir()).
//
// Throws if `langCode` isn't a key of `languages`.
//
// `dir` exists for tests only, e.g. unit tests that run against a fixtures
// root. Without the override they would be stuck with whatever directory
// was in place the first time `languages.ts` was imported. When it is left
// as `null`, the directory configured for `langCode` is used.
export const getDeepDataByLanguage = memoize(
  (dottedPath: string, langCode: string, dir: string | null = null): any => {
    const isKnownLanguage = langCode in languages
    if (!isKnownLanguage) {
      throw new Error(`langCode '${langCode}' not a recognized language code`)
    }
    const root = dir === null ? languages[langCode].dir : dir
    return getDeepDataByDir(dottedPath, root)
  },
)
// Recursively reads the directory `data/<dottedPath with dots as separators>`
// below `dir` and returns its content as a nested object:
//
//   - sub-directory -> nested object, keyed by the directory name
//   - `*.yml` file  -> parsed YAML, keyed by the file name without `.yml`
//                      (e.g. '3-5' or '0-rc2')
//   - `*.md` file   -> Markdown content, keyed by the full file name
//                      (the `.md` extension stays in the key)
//   - `README.md`   -> skipped
//   - anything else -> throws
//
// Not memoized itself: apart from its own recursion it is only called from
// getDeepDataByLanguage(), which already is.
function getDeepDataByDir(dottedPath: string, dir: string): any {
  const relPath = ['data', ...dottedPath.split(/\./g)].join(path.sep)
  const things: any = {}
  for (const entry of getDirents(dir, relPath)) {
    const { name } = entry
    if (name === 'README.md') continue
    if (entry.isDirectory()) {
      things[name] = getDeepDataByDir(`${dottedPath}.${name}`, dir)
      continue
    }
    const entryPath = path.join(relPath, name)
    if (name.endsWith('.yml')) {
      things[name.replace(/\.yml$/, '')] = getYamlContent(dir, entryPath)
    } else if (name.endsWith('.md')) {
      things[name] = getMarkdownContent(dir, entryPath)
    } else {
      throw new Error(`don't know how to read '${name}'`)
    }
  }
  return things
}
// Lists the entries of a directory as `fs.Dirent` objects.
// The directory is `relPath` resolved below `root`; when `root` is empty,
// `relPath` is used as given.
function getDirents(root: string, relPath: string): fs.Dirent[] {
  let target = relPath
  if (root) {
    target = path.join(root, relPath)
  }
  return fs.readdirSync(target, { withFileTypes: true })
}
// Returns the `data/ui.yml` strings for `langCode`, with English filling in
// whatever the translation lacks. English is the base layer and the
// translation is merged on top of it, e.g.
//
//   english  = {food: "Food", drink: "Drink"}
//   swedish  = {food: "Mat"}
//   =>
//   combined = {food: "Mat", drink: "Drink"}
//
// For 'en' the English data is returned as is, without merging.
export const getUIDataMerged = memoize((langCode: string): any => {
  const english = getUIData('en')
  if (langCode === 'en') {
    return english
  }
  const translated = getUIData(langCode)
  return merge({}, english, translated)
})
// Reads and parses `data/ui.yml` from the directory of the given language.
// Not memoized itself because its caller, getUIDataMerged(), already is.
const getUIData = (langCode: string): any => {
  const { dir } = languages[langCode]
  const relPath = path.join('data', 'ui.yml')
  return getYamlContent(dir, relPath)
}
// Looks up a single dotted data key (e.g. `variables.product.foo`) for the
// given language, falling back to English when the translation can't
// supply it. Throws if `langCode` isn't a key of `languages`.
//
// What happens after the first lookup:
//
//   - It returns a value: that value is returned.
//   - It returns `undefined` for a non-English language: the lookup is
//     started over against the English data. This covers a key that was
//     added to the English file (say `data/ui.yml`) but not yet to the
//     translated one. The translated file exists, so there is no
//     `ENOENT` to react to; the key simply isn't there.
//   - It throws an error that has a `mark` and a message, i.e. one produced
//     by yaml.load(). The file may have been a .yml file or a .md file with
//     corrupt front-matter. For a translation the lookup gets one more
//     chance using English; for English the error is rethrown, because
//     staff writers need to know early and explicitly about corrupt files.
//   - It throws an error whose code is 'ENOENT': `undefined` is returned.
//   - Any other error is rethrown.
//
// NOTE(review): both English retries call getDataByDir() without
// `englishRoot` and `langCode`, so inside it `dir !== englishRoot` holds
// even though `dir` is the English root. Confirm that this is intended.
export const getDataByLanguage = memoize((dottedPath: string, langCode: string): any => {
  if (!(langCode in languages)) {
    throw new Error(`langCode '${langCode}' not a recognized language code`)
  }
  const { dir } = languages[langCode]
  const isEnglish = langCode === 'en'
  try {
    const value = getDataByDir(dottedPath, dir, languages.en.dir, langCode)
    const needsEnglish = value === undefined && !isEnglish
    return needsEnglish ? getDataByDir(dottedPath, languages.en.dir) : value
  } catch (error) {
    const isYamlError =
      error instanceof Error && Boolean((error as YAMLException).mark) && Boolean(error.message)
    if (!isYamlError) {
      if ((error as FileSystemError).code === 'ENOENT') return undefined
      throw error
    }
    if (isEnglish) throw error
    if (DEBUG_JIT_DATA_READS) {
      logger.warn('Unable to parse Yaml in translation', { langCode, dottedPath, error })
    }
    return getDataByDir(dottedPath, languages.en.dir)
  }
})
// Resolves one dotted data key (e.g. `variables.product.foo`,
// `reusables.foo.bar`, `ui.pages.foo.bar`) against the `data/` directory
// below `dir` and returns its value.
//
//   dottedPath   the key, optionally prefixed with `early-access.`
//   dir          root directory to read from
//   englishRoot  optional English root; it is forwarded to the file readers
//                and compared with `dir` to decide whether the content is
//                run through correctTranslatedContentStrings()
//   langCode     handed to correctTranslatedContentStrings() as `code`
//
// Returns `undefined` for a `variables` key whose file content or value is
// missing (or falsy). Throws when the first segment (after an optional
// `early-access`) is none of `variables`, `reusables`, `ui`, `glossaries`
// or `release-notes`. Errors from reading or parsing the main file are not
// caught here and reach the caller.
function getDataByDir(
  dottedPath: string,
  dir: string,
  englishRoot?: string,
  langCode?: string,
): any {
  const fullPath = ['data']
  // Turn `foo.version-3.4.deeper.key` into
  // `['foo', 'version-3.4', 'deeper', 'key']`.
  // We need getSmartSplit() as long as there's a chance that a
  // directory or file inside data/ might contain a dot in the name.
  // The exception is data/release-notes/**/*.yml: it contains files whose
  // names are just numbers, like 3-7/0.yml, and that can cause problems
  // inside getSmartSplit(), so those keys are split on every dot.
  const split = dottedPath.startsWith('release-notes')
    ? dottedPath.split('.')
    : getSmartSplit(dottedPath)
  // For early-access data stuff, they're referred to as...
  //
  // {% data early-access.reusables.foo.bar %}
  //
  // When we "merge" in the early-access data, we put the whole directory
  // within the root `data/` so it exists, on disk, as
  //
  // data/early-access/reusables/foo/bar.md
  //
  // So move that leading segment over to the file path and dispatch on
  // the segment that follows it.
  if (split[0] === 'early-access') {
    fullPath.push(split.shift()!)
  }
  const first = split[0]
  if (first === 'variables') {
    // The last segment is the key inside the Yaml file, the one before it
    // is the file's basename, and whatever remains are directories.
    const key = split.pop()!
    const basename = split.pop()!
    fullPath.push(...split)
    fullPath.push(`${basename}.yml`)
    const allData = getYamlContent(dir, fullPath.join(path.sep), englishRoot)
    if (allData && key) {
      const value = allData[key]
      if (value) {
        let content = matter(value).content
        if (dir !== englishRoot) {
          // Reading a translation: look up the same key in the English
          // file so the translated string can be corrected against it.
          // If that can't be found, the translated content stands in
          // for the English content.
          let englishContent = content
          try {
            const englishData = getYamlContent(englishRoot, fullPath.join(path.sep), englishRoot)
            if (englishData?.[key]) {
              englishContent = matter(englishData[key]).content
            }
          } catch (error) {
            // A missing English file is tolerated; anything else is not.
            if ((error as FileSystemError).code !== 'ENOENT') {
              throw error
            }
          }
          content = correctTranslatedContentStrings(content, englishContent, {
            dottedPath,
            code: langCode,
          })
        }
        return content
      }
    } else {
      // Reached when the Yaml file produced no data (or the key is empty).
      logger.warn('Unable to find variables Yaml file', { filePath: fullPath.join(path.sep) })
    }
    return undefined
  }
  if (first === 'reusables') {
    // The last segment names the Markdown file (without its extension);
    // the segments before it are directories.
    const nakedname = split.pop()!
    fullPath.push(...split)
    fullPath.push(`${nakedname}.md`)
    const markdown = getMarkdownContent(dir, fullPath.join(path.sep), englishRoot)
    let { content } = matter(markdown)
    if (dir !== englishRoot) {
      // If we're reading a translation, we need to replace the possible
      // corruptions. For example `[AUTOTITLE"을](/foo/bar)`.
      // To do this we'll need the English equivalent
      let englishContent = content
      try {
        englishContent = getMarkdownContent(englishRoot, fullPath.join(path.sep), englishRoot)
      } catch (error) {
        // In some real but rare cases a reusable doesn't exist in English.
        // At all.
        // This can happen when the translation is really out of date.
        // You might have an old `docs-internal.locale/content/**/*.md`
        // file that mentions `{% data reusables.foo.bar %}`. And it's
        // working fine, except none of that exists in English.
        // If this is the case, we still want to execute the
        // correctTranslatedContentStrings() function, but we can't
        // genuinely give it the English equivalent content, which it
        // sometimes uses to correct some Liquid tags. At least other
        // good corrections might happen.
        if ((error as FileSystemError).code !== 'ENOENT') {
          throw error
        }
      }
      content = correctTranslatedContentStrings(content, englishContent, {
        dottedPath,
        code: langCode,
      })
    }
    return content
  }
  // E.g. {% data ui.pages.foo.bar %}
  if (first === 'ui') {
    const basename = split.shift() // i.e. 'ui'
    fullPath.push(`${basename}.yml`)
    const allData = getYamlContent(dir, fullPath.join(path.sep), englishRoot)
    // The remaining segments are a path into the parsed Yaml object.
    return get(allData, split.join('.'))
  }
  if (first === 'glossaries' || first === 'release-notes') {
    // Here the whole parsed Yaml file is the value; the last segment is
    // the file's basename.
    const basename = split.pop()!
    fullPath.push(...split)
    fullPath.push(`${basename}.yml`)
    return getYamlContent(dir, fullPath.join(path.sep), englishRoot)
  }
  throw new Error(`Can't find the key '${dottedPath}' in the scope.`)
}
// Splits a dotted path on `.`, except that two neighbouring pieces are kept
// together (dot included) when the left one ends with a digit and the
// right one starts with a digit. That keeps a name such as `version-3.4`
// in one piece:
//
//   'foo.version-3.4.deeper.key' -> ['foo', 'version-3.4', 'deeper', 'key']
//
// Only pairs are joined; a piece that was glued to its left neighbour is
// not considered again for joining with the piece on its right.
function getSmartSplit(dottedPath: string): string[] {
  const parts = dottedPath.split('.')
  const merged: string[] = []
  let index = 0
  while (index < parts.length) {
    const current = parts[index]
    const following = parts[index + 1]
    const hasFollowing = index < parts.length - 1
    if (hasFollowing && /\d$/.test(current) && /^\d/.test(following)) {
      merged.push(`${current}.${following}`)
      index += 2 // the following piece has been consumed as well
    } else {
      merged.push(current)
      index += 1
    }
  }
  return merged
}
// Reads a Yaml file below `root` and returns the parsed result.
//
// This is memoized even though its caller getDataByLanguage() is memoized
// too, because two different keys can live in the same file. E.g.
//
//   getDataByLanguage('variables.product.prodname_ghe_server', 'en')
//   getDataByLanguage('variables.product.company_short', 'en')
//
// Both depend on `data/variables/product.yml`, but thanks to the cache
// here the file is read and parsed only once:
//
//   1. getDataByLanguage('variables.product.prodname_ghe_server', 'en')
//      -> cache MISS
//      1.1. read and parse data/variables/product.yml
//           -> cache MISS
//   2. getDataByLanguage('variables.product.company_short', 'en')
//      -> cache MISS
//      2.1. read and parse data/variables/product.yml
//           -> cache HIT (Yay!)
//
const getYamlContent = memoize(
  (root: string | undefined, relPath: string, englishRoot?: string): any => {
    // Certain Yaml files we always want in English, no matter what
    // language was asked for. For example, `data/variables/product.yml`
    // should never be translated. For those, read from `englishRoot`
    // directly. getFileContent() then sees the same value for its root and
    // its English root, so it has no fallback left to try.
    const readRoot = ALWAYS_ENGLISH_YAML_FILES.has(relPath) ? englishRoot : root
    const raw = getFileContent(readRoot, relPath, englishRoot)
    return yaml.load(raw, { filename: relPath })
  },
)
// Reads a Markdown file below `root` and returns its content without the
// front-matter and without trailing whitespace.
// Memoized for the same reason as getYamlContent(): several lookups can
// end up needing the same file.
const getMarkdownContent = memoize(
  (root: string | undefined, relPath: string, englishRoot?: string): string => {
    // Certain reusables should never be pulled from the translations,
    // for example those that contain no English prose at all, just facts
    // like numbers or hardcoded key words. Those are always drawn from
    // the English files.
    const readRoot = ALWAYS_ENGLISH_MD_FILES.has(relPath) ? englishRoot : root
    const raw = getFileContent(readRoot, relPath, englishRoot)
    const { content } = matter(raw)
    return content.trimEnd()
  },
)
// Reads `relPath` below `root` (or `relPath` as given when `root` is empty)
// as UTF-8 text.
//
// If the file doesn't exist (`ENOENT`) and an `englishRoot` other than
// `root` was given, the read is retried once below `englishRoot`; a data
// entry might simply not exist in a translation yet. Every other failure,
// including a missing English file, is thrown to the caller.
const getFileContent = (
  root: string | undefined,
  relPath: string,
  englishRoot?: string,
): string => {
  const filePath = root ? path.join(root, relPath) : relPath
  if (DEBUG_JIT_DATA_READS) logger.info('READ', { filePath })
  try {
    return fs.readFileSync(filePath, 'utf-8')
  } catch (err) {
    const isMissing = (err as FileSystemError).code === 'ENOENT'
    const canRetryInEnglish = Boolean(englishRoot) && root !== englishRoot
    if (isMissing && canRetryInEnglish) {
      // One more chance, this time using the English files.
      return getFileContent(englishRoot, relPath, englishRoot)
    }
    throw err
  }
}
// Wraps `func` so that repeated calls with the same arguments return the
// result of the first call instead of running `func` again. The cache is
// private to the returned wrapper and is never evicted. A result of
// `undefined` is cached like any other value.
//
// The cache key is the JSON serialization of the argument list. Unlike
// joining the arguments with a separator, this keeps calls apart whose
// arguments merely look alike once glued together, e.g. ('a:b', 'c') and
// ('a', 'b:c'), or an empty string and `null`. Inside the serialized
// list `undefined` is written as `null`, so those two share a key.
// Arguments therefore have to be JSON-serializable; the functions memoized
// in this file only take strings, `null` or `undefined`.
//
// When NODE_ENV is 'development' the cache is bypassed entirely and `func`
// runs on every call.
function memoize<T extends (...args: any[]) => any>(func: T): T {
  const cache = new Map<string, any>()
  return ((...args: any[]) => {
    if (process.env.NODE_ENV === 'development') {
      // It is very possible that certain files, when caching is disabled,
      // are read multiple times in short succession. E.g. `product.yml`.
      // So how expensive is it to read these files excessively?
      // To answer that, we benchmarked it by sampling 10 files from the
      // most common files that are used from `data/`. In fact, we ran 100
      // runs of 10 *different* files. About 80% of them were `.yml` files.
      // As a median, it takes **0.5ms to read 10 files from disk**
      // all in a sync manner.
      // Since most files coming through here are `.yml` files (e.g.
      // product.yml and ui.yml) if you also do the `yaml.load()` of the
      // read content, that number becomes **2.1ms to read and parse 10 files**.
      // So in conclusion, not a lot of time.
      return func(...args)
    }
    const key = JSON.stringify(args)
    if (!cache.has(key)) {
      cache.set(key, func(...args))
    }
    return cache.get(key)
  }) as T
}