-
Notifications
You must be signed in to change notification settings - Fork 66.7k
Expand file tree
/
Copy pathanalyze-text.ts
More file actions
executable file
·165 lines (140 loc) · 4.98 KB
/
analyze-text.ts
File metadata and controls
executable file
·165 lines (140 loc) · 4.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
// See how a piece of text gets turned into tokens by the different analyzers.
// Requires that the index exists in Elasticsearch.
//
// Example:
//
// npm run analyze-text -- -V dotcom -l en "The name of the wind"
import { Client } from '@elastic/elasticsearch'
import { Command, Option } from 'commander'
import chalk from 'chalk'
import dotenv from 'dotenv'
import { languageKeys } from '@/languages/lib/languages-server'
import { allVersions } from '@/versions/lib/all-versions'
import type { estypes } from '@elastic/elasticsearch'
// Load variables from a local .env file, so ELASTICSEARCH_URL may optionally be set there.
dotenv.config()
// Lookup table from a version's "short name" to its full record from
// `allVersions`. The short name is `miscBaseName`, followed by
// `currentRelease` when the version has numbered releases. E.g.
//
//   {
//     'ghes-3.5': {
//       hasNumberedReleases: true,
//       currentRelease: '3.5',
//       version: 'enterprise-server@3.5',
//       miscBaseName: 'ghes-'
//       ...
//     },
//     ...
//   }
//
// The short names are what the `--version` CLI option accepts, and they
// end up as part of the Elasticsearch index name.
const shortNames: Record<string, (typeof allVersions)[keyof typeof allVersions]> =
  Object.fromEntries(
    Object.values(allVersions).map((versionInfo) => {
      const { hasNumberedReleases, miscBaseName, currentRelease } = versionInfo
      // Only versions with numbered releases get a release suffix.
      const releaseSuffix = hasNumberedReleases ? currentRelease : ''
      return [`${miscBaseName}${releaseSuffix}`, versionInfo]
    }),
  )

// Every valid short name; used as the allowed choices for `--version`.
const allVersionKeys = Object.keys(shortNames)
/**
 * Parsed command-line options, as returned by `program.opts<Options>()`.
 * Every field is optional because every flag may be omitted.
 */
interface Options {
/** `-v, --verbose`: print extra progress lines (connection URL, version, language). */
verbose?: boolean
/** `-V, --version`: short version name; `main` falls back to 'dotcom' when unset. */
version?: string
/** `-l, --language`: language code; `main` falls back to 'en' when unset. */
language?: string
/** `--not-language`: language to exclude; in this script it is only checked for conflict with `language`. */
notLanguage?: string
/** `-u, --elasticsearch-url`: node URL; `main` falls back to the ELASTICSEARCH_URL environment variable. */
elasticsearchUrl?: string
/** `--index-prefix`: when set, prepended to the index name followed by an underscore. */
indexPrefix?: string
}
// Declare the command-line interface. `--version` and `--language` are
// restricted to the known version short names and language keys.
const program = new Command()
  .description('Analyze text into tokens')
  .option('-v, --verbose', 'Verbose outputs')
  .addOption(new Option('-V, --version <VERSION>', 'Specific version').choices(allVersionKeys))
  .addOption(
    new Option('-l, --language <LANGUAGE>', 'Which language to focus on').choices(languageKeys),
  )
  .option('--not-language <LANGUAGE>', 'Exclude a specific language')
  .option('-u, --elasticsearch-url <url>', 'If different from $ELASTICSEARCH_URL')
  .option('--index-prefix <PREFIX>', 'Prefix for the index name')
  .argument('<text>', 'text to tokenize')

program.parse(process.argv)

const options = program.opts<Options>()
const args = program.args

// Run the script; any error thrown by `main` is printed and turned into a
// non-zero exit code.
try {
  await main(options, args)
} catch (err) {
  console.error(chalk.red('Error:'), err)
  process.exit(1)
}
/**
 * Resolve the Elasticsearch node URL, check that the server responds, work
 * out which index to query and print how each analyzer tokenizes the text.
 *
 * @param opts - Parsed command-line options.
 * @param textArgs - Positional arguments; joined with single spaces into the
 *   one text that gets analyzed.
 * @throws Error when neither `--elasticsearch-url` nor ELASTICSEARCH_URL is
 *   set, or when `--language` and `--not-language` are both given. Failures
 *   from `client.ping()` and from the analysis itself propagate to the caller.
 */
async function main(opts: Options, textArgs: string[]): Promise<void> {
  const texts = [textArgs.join(' ')]

  // The CLI option wins over the environment variable.
  const configuredUrl = opts.elasticsearchUrl || process.env.ELASTICSEARCH_URL
  if (!configuredUrl) {
    throw new Error(
      'Must pass the elasticsearch URL option or ' +
        'set the environment variable ELASTICSEARCH_URL',
    )
  }

  // Be forgiving about a bare `host:port` value such as `localhost:9200`
  // and assume plain HTTP for it.
  const hasSchemeMarker = configuredUrl.startsWith('http') || configuredUrl.startsWith('://')
  const isBareHostPort = !hasSchemeMarker && configuredUrl.split(':').length === 2
  const node = isBareHostPort ? `http://${configuredUrl}` : configuredUrl

  try {
    const { hostname } = new URL(node)
    if (!hostname) throw new Error('No valid hostname')
  } catch (err) {
    console.error(chalk.bold('URL for Elasticsearch not a valid URL'), err)
    // NOTE(review): returning here (rather than throwing) means the caller
    // sees a normal completion for an invalid URL. Confirm this is intended.
    return
  }

  // Selecting one language and excluding one at the same time is contradictory.
  if (opts.language && opts.notLanguage) {
    throw new Error("Can't combine --language and --not-language")
  }

  if (opts.verbose) {
    console.log(`Connecting to ${chalk.bold(safeUrlDisplay(node))}`)
  }

  const client = new Client({ node })
  // Fail early: the ping throws when the server can't be reached.
  await client.ping()

  const versionKey = opts.version || 'dotcom'
  const languageKey = opts.language || 'en'
  if (opts.verbose) {
    console.log(`Analyzing on version ${chalk.bold(versionKey)}`)
    console.log(`Analyzing on language ${chalk.bold(languageKey)}`)
  }

  // Index names look like `[<prefix>_]github-docs-<version>-<language>`.
  const baseName = ['github-docs', versionKey, languageKey].join('-')
  const indexName = opts.indexPrefix ? `${opts.indexPrefix}_${baseName}` : baseName
  console.log(chalk.yellow(`Analyzing in ${chalk.bold(indexName)}`))

  await analyzeVersion(client, texts, indexName)
}
/**
 * Render a URL for log output with its credentials masked.
 *
 * A password, if present, is replaced by `***`. A username, if present, is
 * cut down to its first four characters followed by `***`.
 *
 * @param url - An absolute URL string.
 * @returns The serialized URL with masked credentials.
 * @throws TypeError when `url` cannot be parsed by the `URL` constructor.
 */
function safeUrlDisplay(url: string): string {
  const display = new URL(url)
  const { username, password } = display
  if (password) display.password = '***'
  if (username) display.username = `${username.slice(0, 4)}***`
  return display.href
}
/**
 * Print, for every text, the tokens that each analyzer produces when run
 * against the given index.
 *
 * Requests are issued one at a time so the output stays grouped per text
 * and per analyzer.
 *
 * @param client - Connected Elasticsearch client.
 * @param texts - Raw texts to tokenize.
 * @param indexName - Index whose analyzer configuration is used.
 */
async function analyzeVersion(client: Client, texts: string[], indexName: string): Promise<void> {
  // NOTE(review): the first two are presumably custom analyzers defined in
  // the index settings; 'standard' is Elasticsearch's built-in one.
  const analyzers = ['text_analyzer_explicit', 'text_analyzer', 'standard']

  for (const text of texts) {
    console.log(`RAW TEXT: 〝${chalk.italic(text)}〞`)

    for (const analyzer of analyzers) {
      console.log('ANALYZER:', chalk.bold(analyzer))
      const { tokens } = await client.indices.analyze({
        index: indexName,
        body: { analyzer, text },
      })
      // A response without tokens is shown as an empty list.
      console.log((tokens ?? []).map(({ token }) => token))
    }
  }
}