docs/src/data-directory/scripts/find-orphaned-features/find.ts at e9c6874d5e2157cc44d0f43bdff33ae3d27a7a15 · github/docs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
/**
 * This script will loop over all pages, in all languages, and look at
 * the following:
 *
 *    1. `title` in frontmatter
 *    2. `intro` in frontmatter
 *    3. `shortTitle` in frontmatter (if present)
 *    4. the markdown body itself
 *    5. The `versions:` frontmatter key (if the page is in English)
 *
 * Then it will search out the features mentioned based on `data/features/*.yml`
 * It will make a Set of these (e.g. `dependabot-grouped-dependencies` and
 * `ghas-enablement-webhook`) and one by one pluck them away.
 *
 * After the pages, it will loop over the English variables files and the
 * English reusables, and do the same search there. Once it's done with
 * English, it loops over the reusables in the translations (if they exist)
 * and does the same search.
 *
 * Lastly, it will output the remaining features, as relative file paths.
 * For example, `data/features/havent-been-used-in-years.yml` so now you
 * know that file can be deleted.
 *
 * NOTE: A lot of translations have corrupted Liquid. So if we can't parse
 * the Liquid we fall back to string search. A regex will try to find
 * all `{% ifversion ... %}` (and `elsif`) and search for any features
 * mentioned inside that as a string.
 *
 */

import { strictEqual } from 'node:assert'
import fs from 'fs'
import path from 'path'

import chalk from 'chalk'
import { TokenizationError, TokenKind } from 'liquidjs'
import type { TagToken } from 'liquidjs'

import type { Page } from '@/types'
import warmServer from '@/frame/lib/warm-server'
import { getDeepDataByLanguage } from '@/data-directory/lib/get-data'
import { getLiquidTokens } from '@/content-linter/lib/helpers/liquid-utils'
import languages from '@/languages/lib/languages-server'
import { correctTranslatedContentStrings } from '@/languages/lib/correct-translation-content'

// Feature names that are filtered out of the candidate set before any
// searching happens, so they are never reported as orphans.
const EXCEPTIONS = new Set([
  // From data/features/placeholder.yml. Used by tests.
  'placeholder',
])

// Options accepted by `find()`.
type Options = {
  // Directory that each orphaned `<feature>.yml` file name is joined onto
  // when building the reported paths.
  sourceDirectory: string
  // Optional file to write the orphan list to. A path ending in `.json`
  // gets a JSON array (written only when there is at least one orphan);
  // any other path gets the orphan paths joined by newlines.
  output?: string
  // Log progress while searching. When `output` is set and `verbose` is
  // not, the orphan list is not printed to stdout.
  verbose?: boolean
}

/**
 * Finds feature files that nothing references any more.
 *
 * Builds the set of English feature names (minus `EXCEPTIONS`), lets
 * `searchAndRemove` pluck away every feature that is still in use, and
 * reports whatever is left as paths under `options.sourceDirectory`.
 *
 * @param options - See `Options`. With `output` set the result is written to
 *   that file; the list is printed to stdout unless `output` is set and
 *   `verbose` is not.
 */
export async function find(options: Options) {
  const { sourceDirectory, output, verbose } = options

  if (process.env.ENABLED_LANGUAGES === 'en') {
    const warning = `Only English is enabled. Be careful with the output.
    To include all translations make sure they're available and that
    ENABLED_LANGUAGES is not set or set to 'all'.`.replaceAll(/\s\s+/g, ' ')
    console.warn(chalk.yellow(warning))
  }

  const site = await warmServer([])

  // Every known feature starts out as a candidate orphan.
  const candidateNames = Object.keys(getDeepDataByLanguage('features', 'en'))
  const features = new Set(candidateNames.filter((name) => !EXCEPTIONS.has(name)))
  if (verbose) {
    console.log(`Found ${features.size} features`)
  }

  const pageList: Page[] = site.pageList
  if (verbose) {
    console.log(`Searching ${pageList.length.toLocaleString()} pages`)
  }

  const t0 = new Date()
  searchAndRemove(features, pageList, Boolean(verbose))
  const t1 = new Date()

  if (verbose) {
    const paint = features.size > 0 ? chalk.yellow : chalk.green
    const summary = `Searched ${pageList.length.toLocaleString()} pages in ${formatDelta(t0, t1)}.
      And found ${features.size} features remaining (i.e. orphans).`.replace(/\s\s+/, ' ')
    console.log(paint(summary))
  }

  // Whatever survived the search is unused; turn the names into file paths.
  const remaining = Array.from(features, (feature) => path.join(sourceDirectory, `${feature}.yml`))

  if (output) {
    if (!output.endsWith('.json')) {
      fs.writeFileSync(output, remaining.join('\n'))
    } else if (remaining.length) {
      // The JSON file is only written when there is something to report.
      fs.writeFileSync(output, JSON.stringify(remaining, null, 2))
    }
    if (!verbose) {
      return
    }
  }

  console.log(chalk.bold(`Orphans found (${remaining.length}):`))
  remaining.forEach((orphanPath) => {
    console.log(chalk.green(orphanPath))
  })
}

/**
 * Formats the time elapsed between two instants as seconds with one decimal.
 *
 * @param t0 - Start time.
 * @param t1 - End time.
 * @returns A string such as `1.5 seconds`.
 */
function formatDelta(t0: Date, t1: Date) {
  const elapsedSeconds = (t1.getTime() - t0.getTime()) / 1000
  return `${elapsedSeconds.toFixed(1)} seconds`
}

/**
 * Removes from `features` every feature that is referenced somewhere.
 *
 * Looks, in order, at: all pages (English `versions:` frontmatter plus the
 * Liquid in body/title/shortTitle/intro), the English variables files, the
 * English reusables, and finally the translated copies of those reusables.
 *
 * @param features - Candidate orphans; mutated in place.
 * @param pages - Pages to inspect.
 * @param verbose - Forwarded to `checkString` for its logging.
 */
function searchAndRemove(features: Set<string>, pages: Page[], verbose = false) {
  for (const page of pages) {
    const body = page.markdown

    // The `versions:` frontmatter key is only ever consulted for English
    // pages, so it doesn't matter if translated frontmatter still says
    // `versions: some-old-feature`.
    if (page.languageCode === 'en') {
      for (const [versionKey, versionValue] of Object.entries(page.versions)) {
        if (versionKey !== 'feature') continue
        // Deleting an entry that isn't in the set is a no-op.
        features.delete(versionValue)
      }
    }

    const searchable = `
      ${body}
      ${page.title || ''}
      ${page.shortTitle || ''}
      ${page.intro || ''}
    `

    checkString(searchable, features, { page, verbose, languageCode: page.languageCode })
  }

  // Check variables files
  const variablesRoot = path.join(languages.en.dir, 'data', 'variables')
  for (const filePath of getVariableFiles(variablesRoot)) {
    const variablesContent = fs.readFileSync(filePath, 'utf-8')
    checkString(variablesContent, features, { filePath, verbose, languageCode: 'en' })
  }

  // Reusables are a bit special, as they are shared between languages.
  // There'll always be a slight mismatch between files present on disk
  // in English vs. translations.
  // The translations never delete files, so there's often excess reusables
  // on disk in translations. And the English might be ahead, meaning a file
  // has been introduced in English but not yet translated.
  // So: loop over the English reusables first and take note of their
  // relative paths and content. The keys of that map then tell us which
  // files to check in each translation, and the English content is what
  // correctTranslatedContentStrings needs alongside the translated text.
  const englishReusables = new Map<string, string>()
  const reusablesRoot = path.join(languages.en.dir, 'data', 'reusables')
  for (const filePath of getReusableFiles(reusablesRoot)) {
    const englishContent = fs.readFileSync(filePath, 'utf-8')
    checkString(englishContent, features, { filePath, verbose, languageCode: 'en' })
    englishReusables.set(path.relative(languages.en.dir, filePath), englishContent)
  }

  for (const language of Object.values(languages)) {
    // English was handled in the loop above.
    if (language.code === 'en') continue

    for (const [relativePath, englishFileContent] of Array.from(englishReusables.entries())) {
      const filePath = path.join(language.dir, relativePath)
      try {
        const translatedContent = fs.readFileSync(filePath, 'utf-8')
        const correctedContent = correctTranslatedContentStrings(
          translatedContent,
          englishFileContent,
          { code: language.code, relativePath },
        )
        checkString(correctedContent, features, {
          filePath,
          verbose,
          languageCode: language.code,
        })
      } catch (error) {
        // That a reusable does *not* exist in a translation is perfectly
        // expected. It means the English reusable was most likely added
        // recently and hasn't been translated yet. Anything else is a
        // real problem and is re-thrown.
        const isMissingFile = error instanceof Error && 'code' in error && error.code === 'ENOENT'
        if (!isMissingFile) throw error
      }
    }
  }
}

/**
 * Recursively lists the reusable Markdown files under `root`.
 *
 * @param root - Directory to walk.
 * @returns Paths (built as `<dir>/<name>`) of every `.md` file except
 *   `README.md`, in directory-listing order with subdirectories expanded
 *   in place.
 */
export function getReusableFiles(root: string): string[] {
  return fs.readdirSync(root).flatMap((entry) => {
    const entryPath = `${root}/${entry}`
    if (fs.statSync(entryPath).isDirectory()) {
      return getReusableFiles(entryPath)
    }
    const isReusable = entry.endsWith('.md') && entry !== 'README.md'
    return isReusable ? [entryPath] : []
  })
}

/**
 * Recursively lists the variables YAML files under `root`.
 *
 * @param root - Directory to walk.
 * @returns Paths (built as `<dir>/<name>`) of every `.yml` file except
 *   `README.yml`, in directory-listing order with subdirectories expanded
 *   in place.
 */
export function getVariableFiles(root: string): string[] {
  const collected: string[] = []

  const walk = (dir: string): void => {
    for (const entry of fs.readdirSync(dir)) {
      const entryPath = `${dir}/${entry}`
      if (fs.statSync(entryPath).isDirectory()) {
        walk(entryPath)
        continue
      }
      if (entry !== 'README.yml' && entry.endsWith('.yml')) {
        collected.push(entryPath)
      }
    }
  }

  walk(root)
  return collected
}

// Words inside `{% ifversion ... %}` / `{% elsif ... %}` arguments that
// `checkString` skips without looking them up in the feature set: logical
// and comparison operators, plus what are presumably the short names of
// non-feature versions (`ghes`, `fpt`, `ghec`).
const IGNORE_ARGS = new Set(['or', 'and', 'not', '<', '>', 'ghes', 'fpt', 'ghec', '!=', '='])

/**
 * Removes from `features` every feature mentioned in an `ifversion` or
 * `elsif` Liquid tag inside `string`.
 *
 * The string is tokenized as Liquid. If tokenizing fails for non-English
 * content, the tags are found with a regex instead (see `findByRegex`).
 *
 * @param string - Content to search.
 * @param features - Candidate orphans; mutated in place.
 * @param context - `page` or `filePath` identify the source in log output,
 *   `languageCode` decides whether a tokenization failure is fatal, and
 *   `verbose` enables logging of the regex fallback.
 * @throws Re-throws any `TokenizationError` for English content, and any
 *   error that is not a `TokenizationError`.
 */
function checkString(
  string: string,
  features: Set<string>,
  {
    page,
    filePath,
    languageCode,
    verbose = false,
  }: { page?: Page; filePath?: string; languageCode?: string; verbose?: boolean } = {},
) {
  try {
    // `noCache: true` because a LOT of different strings pass through here
    // (every page, in every language) and caching them all would just fill
    // the cache up rapidly.
    const tagTokens = getLiquidTokens(string, { noCache: true }).filter(
      (token): token is TagToken => token.kind === TokenKind.Tag,
    )
    for (const token of tagTokens) {
      if (token.name !== 'ifversion' && token.name !== 'elsif') continue

      for (const word of token.args.split(/\s+/)) {
        // Operators, plain version names and version numbers can't be features.
        if (IGNORE_ARGS.has(word) || isFloat(word)) continue
        // Deleting an entry that isn't in the set is a no-op.
        features.delete(word)
      }
    }
  } catch (error) {
    // Broken Liquid in English is a serious error, and anything that isn't
    // a tokenization problem is not ours to handle.
    if (!(error instanceof TokenizationError) || languageCode === 'en') throw error

    // The translation might, currently, have corrupted Liquid, so treat
    // the content as a plain string.
    if (verbose) {
      const location = page ? page.fullPath : filePath
      console.log(
        `TokenizationError in ${location}. Treating ${location} as a string and using regex`,
      )
    }

    findByRegex(features, string).forEach((feature) => {
      features.delete(feature)
    })
  }
}

/**
 * String-based fallback for content whose Liquid can't be tokenized.
 *
 * Finds every `{% ifversion ... %}` / `{% elsif ... %}` tag on a single line,
 * including the whitespace-control forms `{%- ... -%}`, and reports which of
 * the given features appear as a whole word inside any of them. Matching of
 * feature names is case-insensitive.
 *
 * @param features - Feature names to look for. Not modified.
 * @param string - Raw content to search.
 * @returns A new set with the features that were found.
 */
function findByRegex(features: Set<string>, string: string): Set<string> {
  const found = new Set<string>()

  // `-?` on both ends accepts Liquid's whitespace-control delimiters.
  const tags = string.match(/\{%-?\s*(ifversion|elsif)\s*(.*?)\s*-?%\}/g)
  if (!tags) return found

  // Compile one matcher per feature up front instead of once per
  // (tag, feature) pair. A feature counts only when it's preceded by
  // whitespace and followed by whitespace or the closing delimiter, so
  // `placeholder` does not match `placeholder-foo` or `not-placeholder`.
  const matchers = Array.from(features, (feature) => ({
    feature,
    regex: new RegExp(`\\s${escapeRegex(feature)}(\\s|-?%)`, 'i'),
  }))

  for (const tag of tags) {
    for (const { feature, regex } of matchers) {
      if (regex.test(tag)) {
        found.add(feature)
      }
    }
  }
  return found
}

// Import-time sanity check of the regex fallback. `console.assert` only
// logs on failure; it does not throw.
const regexSelfCheck = findByRegex(
  new Set(['placeholder', 'foo-bar', 'trim-marked']),
  `
  placeholder

  {%ifversion placeholder-foo or fpt%}
  {%   elsif   not-placeholder   %}
  {%   elsif   foo-bar%}
  {%- elsif trim-marked -%}
  {%endif %}

  {% data reusables.enterprise-migration-tool.placeholder-table %}
  {% data placeholder %}
`,
)
// A Set stringifies as "[object Set]", so spell out its members instead.
const regexSelfCheckFound = `found: [${Array.from(regexSelfCheck).join(', ')}]`
console.assert(regexSelfCheck.has('foo-bar'), regexSelfCheckFound)
console.assert(regexSelfCheck.has('trim-marked'), regexSelfCheckFound)
console.assert(!regexSelfCheck.has('placeholder'), regexSelfCheckFound)

/**
 * Escapes the characters that are special in a regular expression so that
 * `string` can be embedded in a `RegExp` source and matched literally.
 */
function escapeRegex(string: string) {
  return string.replace(/[/\-\\^$*+?.()|[\]{}]/g, '\\$&')
}

/**
 * Tells whether `x` is a number or a string that `Number()` can parse.
 *
 * Used to skip version numbers (e.g. the `3.9` in `ghes > 3.9`) when
 * scanning `ifversion` arguments.
 *
 * @param x - Value to test.
 * @returns `true` for numeric values, including `0` and negative numbers;
 *   `false` for `NaN`, for empty or whitespace-only strings, and for
 *   anything `Number()` can't parse.
 */
function isFloat(x: string | number) {
  // `Number('')` and `Number('  ')` are 0, but a blank string is not a number.
  if (typeof x === 'string' && x.trim() === '') return false
  // Checking for NaN directly (rather than the truthiness of `Number(x) + 1`)
  // keeps -1 from being misreported as non-numeric.
  return !Number.isNaN(Number(x))
}
strictEqual(isFloat('1.2'), true)
strictEqual(isFloat('10'), true)
strictEqual(isFloat('-1'), true)
strictEqual(isFloat('notatall'), false)
strictEqual(isFloat('2fa'), false)