ops-codegraph-tool/src/domain/search/models.ts at 607cb462adfed97c77d63ddb44e3e5cea42e6b84 · optave/ops-codegraph-tool · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
import { execFileSync } from 'node:child_process';
import { createInterface } from 'node:readline';
import { info } from '../../infrastructure/logger.js';
import { ConfigError, EngineError } from '../../shared/errors.js';

/** Static description of one embedding model that can be selected by key. */
export interface ModelConfig {
  /** Model identifier passed to the transformers `pipeline` factory when the model is loaded. */
  name: string;
  /** Number of floats per embedding vector; `embed` uses it as the stride when slicing the flat output buffer. */
  dim: number;
  /** Context window size — presumably in tokens; not read anywhere in this file (TODO confirm with callers). */
  contextWindow: number;
  /** Human-readable summary of the model. */
  desc: string;
  /** When true, the pipeline is created with `{ dtype: 'q8' }`; otherwise it is created with no extra options. */
  quantized: boolean;
}

// Lazy-load transformers (heavy, optional module)
// Module-level state shared by loadModel() and disposeModel().

// The `pipeline` export of the dynamically imported transformers module;
// null until loadModel() has imported it.
let pipeline: unknown = null;
// The currently loaded feature-extraction pipeline instance, or null when no
// model is loaded. It is callable (batch of texts + options -> flat data
// buffer) and exposes dispose() so it can be released.
let extractor: null | {
  dispose(): Promise<void>;
  (batch: string[], opts: Record<string, unknown>): Promise<{ data: number[] }>;
} = null;
// `ModelConfig.name` of the model held in `extractor`; loadModel() compares
// against it to decide whether the cached extractor can be reused.
let activeModel: string | null = null;

/**
 * Registry of the embedding models this module knows how to load, keyed by the
 * short model key accepted by `getModelConfig`. The same keys are used by
 * `BATCH_SIZE_MAP` to pick a per-model batch size.
 */
export const MODELS: Record<string, ModelConfig> = {
  minilm: {
    name: 'Xenova/all-MiniLM-L6-v2',
    dim: 384,
    contextWindow: 256,
    desc: 'Smallest, fastest (~23MB). General text.',
    // The only entry loaded with the quantized ('q8') dtype.
    quantized: true,
  },
  'jina-small': {
    name: 'Xenova/jina-embeddings-v2-small-en',
    dim: 512,
    contextWindow: 8192,
    desc: 'Small, good quality (~33MB). General text.',
    quantized: false,
  },
  'jina-base': {
    name: 'Xenova/jina-embeddings-v2-base-en',
    dim: 768,
    contextWindow: 8192,
    desc: 'Good quality (~137MB). General text, 8192 token context.',
    quantized: false,
  },
  'jina-code': {
    name: 'jinaai/jina-embeddings-v2-base-code',
    dim: 768,
    contextWindow: 8192,
    desc: 'Code-aware (~137MB). Trained on code+text, best for code search.',
    quantized: false,
  },
  nomic: {
    name: 'Xenova/nomic-embed-text-v1',
    dim: 768,
    contextWindow: 8192,
    desc: 'Good local quality (~137MB). 8192 context.',
    quantized: false,
  },
  // Also the DEFAULT_MODEL key.
  'nomic-v1.5': {
    name: 'nomic-ai/nomic-embed-text-v1.5',
    dim: 768,
    contextWindow: 8192,
    desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.',
    quantized: false,
  },
  'bge-large': {
    name: 'Xenova/bge-large-en-v1.5',
    dim: 1024,
    contextWindow: 512,
    desc: 'Best general retrieval (~335MB). Top MTEB scores.',
    quantized: false,
  },
};

/** Names of the available embedding strategies (not referenced elsewhere in this file). */
export const EMBEDDING_STRATEGIES: readonly string[] = ['structured', 'source'];

/** Key into `MODELS` used whenever a caller does not supply a model key. */
export const DEFAULT_MODEL: string = 'nomic-v1.5';
// npm executable name; on Windows the launcher is `npm.cmd`.
const NPM_BIN = process.platform === 'win32' ? 'npm.cmd' : 'npm';
// Number of texts passed to the extractor per call in embed(), keyed by model key.
const BATCH_SIZE_MAP: Record<string, number> = {
  minilm: 32,
  'jina-small': 16,
  'jina-base': 8,
  'jina-code': 8,
  nomic: 8,
  'nomic-v1.5': 8,
  'bge-large': 4,
};
// Batch size used by embed() when the model key has no BATCH_SIZE_MAP entry.
const DEFAULT_BATCH_SIZE = 32;

/**
 * Resolve a model key to its entry in `MODELS`.
 *
 * @param modelKey - Key into `MODELS`; when omitted or empty, `DEFAULT_MODEL` is used.
 * @returns The matching model configuration.
 * @throws ConfigError when the key is not one of `MODELS`' own entries.
 * @internal Used by generator.js — not part of the public barrel.
 */
export function getModelConfig(modelKey?: string): ModelConfig {
  const key = modelKey || DEFAULT_MODEL;
  // Own-property check: a bare `MODELS[key]` lookup on a plain object also
  // resolves inherited Object.prototype members ("constructor", "toString",
  // "__proto__", …), which are truthy and would be returned as if they were a
  // model configuration instead of raising ConfigError.
  const config = Object.prototype.hasOwnProperty.call(MODELS, key) ? MODELS[key] : undefined;
  if (!config) {
    throw new ConfigError(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`);
  }
  return config;
}

/**
 * Run `npm install` synchronously for a single package, inheriting stdio so
 * npm's own output reaches the user. Gives up after five minutes.
 *
 * @param packageName - Package to install. NOTE(review): on Windows the command
 *   goes through a shell, so this must be a trusted package name, never raw user input.
 * @param extraArgs - Extra npm flags placed between `install` and the package name.
 * @param failureLabel - Leading word(s) of the failure log line ("Install", "Auto-install").
 * @returns true when npm exited successfully; false after logging the failure
 *   together with a manual-install hint.
 */
function runNpmInstall(packageName: string, extraArgs: string[], failureLabel: string): boolean {
  try {
    execFileSync(NPM_BIN, ['install', ...extraArgs, packageName], {
      stdio: 'inherit',
      timeout: 300_000,
      // npm.cmd is a batch script; .cmd/.bat files cannot be launched by
      // execFile directly on Windows, so a shell is required there.
      shell: process.platform === 'win32',
    });
    return true;
  } catch (err) {
    info(
      `${failureLabel} of ${packageName} failed (${err instanceof Error ? err.message : String(err)}). Install it manually with:\n  npm install ${packageName}`,
    );
    return false;
  }
}

/**
 * Attempt to install a missing package.
 * In TTY environments, prompts the user for confirmation first; anything other
 * than "y" (case-insensitive), or stdin closing before an answer, declines.
 * In non-TTY environments (CI, piped stdin), installs automatically with a log
 * message and `--no-save`.
 * Never rejects: install failures are logged and reported as false.
 *
 * @param packageName - npm package to install.
 * @returns true if the package was installed, false otherwise.
 * @internal Not part of the public barrel.
 */
export function promptInstall(packageName: string): Promise<boolean> {
  if (!process.stdin.isTTY) {
    info(`Installing ${packageName} (optional dependency for semantic search)…`);
    return Promise.resolve(runNpmInstall(packageName, ['--no-save'], 'Auto-install'));
  }

  return new Promise((resolve) => {
    const rl = createInterface({ input: process.stdin, output: process.stderr });
    let answered = false;

    // If stdin ends before the user answers (e.g. Ctrl+D), the question
    // callback never runs; treat that as "no" so the promise always settles.
    rl.once('close', () => {
      if (!answered) resolve(false);
    });

    rl.question(
      `Semantic search requires ${packageName}. Install it now? [y/N] `,
      (answer: string) => {
        // Set before close() so the 'close' handler above does not pre-empt the result.
        answered = true;
        rl.close();
        if (answer.trim().toLowerCase() !== 'y') {
          resolve(false);
          return;
        }
        resolve(runNpmInstall(packageName, [], 'Install'));
      },
    );
  });
}

/**
 * Import @huggingface/transformers on demand.
 *
 * The first import attempt is tried as-is. If it fails, `promptInstall` is
 * asked to install the package (interactive prompt on a TTY, automatic
 * otherwise) and the import is retried once.
 *
 * @returns The imported transformers module namespace.
 * @throws EngineError when the package was not installed, or when it was
 *   installed but still cannot be imported.
 * @internal Not part of the public barrel.
 */
export async function loadTransformers(): Promise<unknown> {
  const pkg = '@huggingface/transformers';

  try {
    return await import('@huggingface/transformers');
  } catch {
    // Not importable yet — fall through to the install-and-retry path.
  }

  const installed = await promptInstall(pkg);
  if (!installed) {
    throw new EngineError(`Semantic search requires ${pkg}.\nInstall it with: npm install ${pkg}`);
  }

  try {
    return await import(pkg);
  } catch (loadErr) {
    const cause = loadErr instanceof Error ? loadErr : undefined;
    throw new EngineError(
      `${pkg} was installed but failed to load. Please check your environment.`,
      { cause },
    );
  }
}

/**
 * Dispose the current ONNX session and free memory.
 * Safe to call when no model is loaded (no-op).
 *
 * The cached extractor and active model name are cleared even when
 * `dispose()` rejects, so a failed dispose can never leave a half-released
 * extractor behind for `loadModel` to hand out as a cache hit. The rejection
 * itself still propagates to the caller.
 */
export async function disposeModel(): Promise<void> {
  try {
    if (extractor) {
      await extractor.dispose();
    }
  } finally {
    extractor = null;
    activeModel = null;
  }
}

/**
 * Return a ready feature-extraction pipeline for the requested model.
 *
 * Reuses the cached extractor when it already holds the requested model;
 * otherwise disposes whatever is loaded, imports transformers, and builds a
 * new pipeline (with `dtype: 'q8'` for quantized models).
 *
 * @param modelKey - Key understood by `getModelConfig`; optional.
 * @returns The extractor together with the resolved model configuration.
 * @throws EngineError when the pipeline cannot be created; failures whose
 *   message mentions "Unauthorized", "401" or "gated" get an authentication hint.
 */
async function loadModel(modelKey?: string): Promise<{ extractor: unknown; config: ModelConfig }> {
  const config = getModelConfig(modelKey);

  const cacheHit = extractor !== null && activeModel === config.name;
  if (cacheHit) {
    return { extractor, config };
  }

  // A different (or no) model is loaded: release it before loading the new one.
  await disposeModel();

  const { pipeline: pipelineFactory } = (await loadTransformers()) as { pipeline: unknown };
  pipeline = pipelineFactory;

  info(`Loading embedding model: ${config.name} (${config.dim}d)...`);
  const pipelineOpts: { dtype?: string } = {};
  if (config.quantized) {
    pipelineOpts.dtype = 'q8';
  }

  try {
    // biome-ignore lint/complexity/noBannedTypes: dynamically loaded transformers pipeline is untyped
    const createPipeline = pipeline as Function;
    extractor = await createPipeline('feature-extraction', config.name, pipelineOpts);
  } catch (err: unknown) {
    const cause = err instanceof Error ? err : undefined;
    const msg = cause?.message || String(err);
    const looksLikeAuthFailure = ['Unauthorized', '401', 'gated'].some((marker) =>
      msg.includes(marker),
    );

    const explanation = looksLikeAuthFailure
      ? `Model "${config.name}" requires authentication.\n` +
        `This model is gated on HuggingFace and needs an access token.\n\n` +
        `Options:\n` +
        `  1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` +
        `  2. Use a public model instead: codegraph embed --model minilm`
      : `Failed to load model "${config.name}": ${msg}\n` +
        `Try a different model: codegraph embed --model minilm`;

    throw new EngineError(explanation, { cause });
  }

  activeModel = config.name;
  info('Model loaded.');
  return { extractor, config };
}

/**
 * Generate embeddings for an array of texts.
 *
 * Texts are sent to the extractor in batches (size chosen per model key, with
 * a fallback default) using mean pooling and normalization. The extractor's
 * flat output buffer is cut into one `Float32Array` of `dim` floats per text;
 * positions missing from the buffer are filled with 0. When more than one
 * batch is needed, a carriage-return progress line is written to stderr.
 *
 * @param texts - Texts to embed, in order.
 * @param modelKey - Optional model key; the default model is used when omitted.
 * @returns One vector per input text (same order) and the vector dimension.
 */
export async function embed(
  texts: string[],
  modelKey?: string,
): Promise<{ vectors: Float32Array[]; dim: number }> {
  const loaded = await loadModel(modelKey);
  // biome-ignore lint/complexity/noBannedTypes: dynamically loaded extractor is untyped
  const runExtractor = loaded.extractor as Function;
  const { dim } = loaded.config;
  const vectors: Float32Array[] = [];
  const batchSize = BATCH_SIZE_MAP[modelKey || DEFAULT_MODEL] ?? DEFAULT_BATCH_SIZE;

  for (let offset = 0; offset < texts.length; offset += batchSize) {
    const chunk = texts.slice(offset, offset + batchSize);
    const { data } = (await runExtractor(chunk, { pooling: 'mean', normalize: true })) as {
      data: number[];
    };

    // Row r of the batch occupies data[r * dim .. r * dim + dim - 1].
    chunk.forEach((_text, row) => {
      const base = row * dim;
      vectors.push(Float32Array.from({ length: dim }, (_unused, col) => data[base + col] ?? 0));
    });

    if (texts.length > batchSize) {
      const done = Math.min(offset + batchSize, texts.length);
      process.stderr.write(`  Embedded ${done}/${texts.length}\r`);
    }
  }

  return { vectors, dim };
}
}