|
| 1 | +import {writeFileSync, mkdtempSync, rmSync} from 'node:fs'; |
| 2 | +import {tmpdir} from 'node:os'; |
| 3 | +import {join} from 'node:path'; |
1 | 4 | import {describe, it, expect, beforeEach, afterEach, vi} from 'vitest'; |
2 | 5 | import {Command} from 'commander'; |
3 | 6 | import type {Scraper_create_opts} from '../../types/scraper'; |
@@ -68,6 +71,10 @@ import { |
68 | 71 | AI_TRIGGER_DEFAULT_RETRIES, |
69 | 72 | AI_TRIGGER_RETRY_BASE_MS, |
70 | 73 | AI_TRIGGER_RETRY_MAX_MS, |
| 74 | + parse_urls_arg, |
| 75 | + read_input_file, |
| 76 | + resolve_run_inputs, |
| 77 | + is_valid_url, |
71 | 78 | } from '../../commands/scraper'; |
72 | 79 |
|
73 | 80 | describe('commands/scraper', ()=>{ |
@@ -1163,4 +1170,285 @@ describe('commands/scraper', ()=>{ |
1163 | 1170 | error.mockRestore(); |
1164 | 1171 | }); |
1165 | 1172 | }); |
| 1173 | + |
// is_valid_url: expects `true` for the http/https samples and `false` for
// free text, the empty string and a whitespace-only string.
describe('is_valid_url', ()=>{
    it('accepts http/https URLs', ()=>{
        const accepted = [
            'https://example.com',
            'http://example.com/a/b?c=1',
        ];
        for (const candidate of accepted)
            expect(is_valid_url(candidate)).toBe(true);
    });

    it('rejects garbage', ()=>{
        const rejected = ['not a url', '', ' '];
        for (const candidate of rejected)
            expect(is_valid_url(candidate)).toBe(false);
    });
});
| 1186 | + |
// parse_urls_arg: a comma-separated string goes in, a list of trimmed,
// non-empty entries is expected out.
describe('parse_urls_arg', ()=>{
    it('splits, trims, and drops empties', ()=>{
        const raw = ' https://a.com , https://b.com ,, https://c.com';
        const parsed = parse_urls_arg(raw);
        expect(parsed).toEqual([
            'https://a.com',
            'https://b.com',
            'https://c.com',
        ]);
    });

    it('returns single URL for a non-comma input', ()=>{
        const only = 'https://only.example.com';
        expect(parse_urls_arg(only)).toEqual([only]);
    });

    it('returns empty array for blank input', ()=>{
        // Both the empty string and separators-only input yield no entries.
        for (const blank of ['', ' , , '])
            expect(parse_urls_arg(blank)).toEqual([]);
    });
});
| 1204 | + |
// read_input_file: each case writes a fixture into a fresh temporary
// directory (created before and removed after every test) and checks either
// the returned URL list or the thrown error message.
describe('read_input_file', ()=>{
    let scratch_dir: string;

    beforeEach(()=>{
        scratch_dir = mkdtempSync(join(tmpdir(), 'bdata-test-'));
    });

    afterEach(()=>{
        rmSync(scratch_dir, {recursive: true, force: true});
    });

    // Writes `content` as UTF-8 to `name` inside the scratch directory and
    // returns the resulting path.
    const write_fixture = (name: string, content: string): string=>{
        const file_path = join(scratch_dir, name);
        writeFileSync(file_path, content, 'utf8');
        return file_path;
    };

    it('reads newline-separated URLs', ()=>{
        const urls = ['https://a.com', 'https://b.com', 'https://c.com'];
        const file_path = write_fixture('urls.txt', urls.join('\n'));
        expect(read_input_file(file_path)).toEqual(urls);
    });

    it('skips blank lines and # comments', ()=>{
        // Covers full-line comments, an empty line, a whitespace-only line
        // and a trailing inline comment after a URL.
        const lines = [
            '# top comment',
            'https://a.com',
            '',
            ' ',
            '# section',
            'https://b.com # inline comment ok',
            'https://c.com',
        ];
        const file_path = write_fixture('urls.txt', lines.join('\n'));
        expect(read_input_file(file_path)).toEqual([
            'https://a.com',
            'https://b.com',
            'https://c.com',
        ]);
    });

    it('reads JSON array of strings', ()=>{
        const urls = ['https://a.com', 'https://b.com'];
        const file_path = write_fixture('urls.json', JSON.stringify(urls));
        expect(read_input_file(file_path)).toEqual(urls);
    });

    it('reads JSON array of {url} objects', ()=>{
        const entries = [
            {url: 'https://a.com'},
            {url: 'https://b.com', extra: 'ignored'},
        ];
        const file_path = write_fixture('urls.json',
            JSON.stringify(entries));
        expect(read_input_file(file_path)).toEqual([
            'https://a.com',
            'https://b.com',
        ]);
    });

    it('throws on missing file', ()=>{
        const missing = join(scratch_dir, 'missing.txt');
        const attempt = ()=>read_input_file(missing);
        expect(attempt).toThrow(/Cannot read --input-file/);
    });

    it('throws on malformed JSON', ()=>{
        const file_path = write_fixture('bad.json', '[{not valid');
        const attempt = ()=>read_input_file(file_path);
        expect(attempt).toThrow(/failed to parse/);
    });

    it('throws on non-array JSON', ()=>{
        const file_path = write_fixture('obj.json',
            '{"url": "https://a.com"}');
        const attempt = ()=>read_input_file(file_path);
        expect(attempt).toThrow(/must be an array/);
    });

    it('throws on JSON entry with neither string nor {url}', ()=>{
        const entries = ['https://a.com', {wrong: 'field'}];
        const file_path = write_fixture('mixed.json',
            JSON.stringify(entries));
        const attempt = ()=>read_input_file(file_path);
        expect(attempt).toThrow(
            /must be a string or an object with a "url"/);
    });

    it('returns empty array for an empty file', ()=>{
        const file_path = write_fixture('empty.txt', ' \n\n ');
        expect(read_input_file(file_path)).toEqual([]);
    });
});
| 1287 | + |
// resolve_run_inputs: takes the positional URL plus an options object
// (`urls`, `inputFile`) and is expected to return the URL list or throw with
// the messages asserted below. A temporary directory backs the file case.
describe('resolve_run_inputs', ()=>{
    let scratch_dir: string;

    beforeEach(()=>{
        scratch_dir = mkdtempSync(join(tmpdir(), 'bdata-test-'));
    });

    afterEach(()=>{
        rmSync(scratch_dir, {recursive: true, force: true});
    });

    it('returns the positional URL as a single-element list', ()=>{
        const resolved = resolve_run_inputs('https://a.com', {});
        expect(resolved).toEqual(['https://a.com']);
    });

    it('parses --urls', ()=>{
        const opts = {urls: 'https://a.com,https://b.com'};
        const resolved = resolve_run_inputs(undefined, opts);
        expect(resolved).toEqual(['https://a.com', 'https://b.com']);
    });

    it('reads --input-file', ()=>{
        const list_path = join(scratch_dir, 'urls.txt');
        writeFileSync(list_path, 'https://a.com\nhttps://b.com', 'utf8');
        const resolved = resolve_run_inputs(undefined,
            {inputFile: list_path});
        expect(resolved).toEqual(['https://a.com', 'https://b.com']);
    });

    it('rejects when no source is provided', ()=>{
        const attempt = ()=>resolve_run_inputs(undefined, {});
        expect(attempt)
            .toThrow(/requires one of: <url> positional, --urls/);
    });

    it('rejects when multiple sources are provided', ()=>{
        // Positional URL together with --urls.
        const positional_and_urls = ()=>resolve_run_inputs(
            'https://a.com', {urls: 'https://b.com'});
        expect(positional_and_urls).toThrow(/only one input source/);
        // --urls together with --input-file.
        const urls_and_file = ()=>resolve_run_inputs(undefined,
            {urls: 'https://a.com', inputFile: '/tmp/x'});
        expect(urls_and_file).toThrow(/only one input source/);
    });

    it('rejects when parsed list is empty', ()=>{
        const attempt = ()=>resolve_run_inputs(undefined,
            {urls: ' , , '});
        expect(attempt).toThrow(/No URLs to scrape/);
    });

    it('rejects invalid URLs and names them', ()=>{
        const opts = {urls: 'https://a.com,not-a-url,also bad'};
        const attempt = ()=>resolve_run_inputs(undefined, opts);
        expect(attempt).toThrow(/Invalid URL\(s\):.*not-a-url/);
    });
});
| 1342 | + |
// handle_run_scraper with multiple input sources (--urls / --input-file).
// Each test spies on global `fetch` and gets a fresh temporary directory;
// both are torn down after the test. The fetch and poll_until stubs that
// were previously repeated in three tests live in the two helpers below.
describe('handle_run_scraper multi-URL', ()=>{
    let fetch_spy: ReturnType<typeof vi.spyOn>;
    let tmp_dir: string;

    beforeEach(()=>{
        fetch_spy = vi.spyOn(global, 'fetch') as never;
        tmp_dir = mkdtempSync(join(tmpdir(), 'bdata-test-'));
    });

    afterEach(()=>{
        fetch_spy.mockRestore();
        rmSync(tmp_dir, {recursive: true, force: true});
    });

    // Makes every `fetch` call resolve to a status-200 response-like object
    // whose text() resolves to `body`.
    const stub_fetch_text = (body: string): void=>{
        fetch_spy.mockImplementation(()=>Promise.resolve({
            status: 200,
            text: ()=>Promise.resolve(body),
        } as unknown as Response));
    };

    // Makes the next poll_until call invoke the supplied fetch_once exactly
    // once and report its value as the result of a single attempt.
    const stub_poll_once = (): void=>{
        mocks.poll_until.mockImplementationOnce(async(o: never)=>{
            const cfg = o as {fetch_once: ()=>Promise<unknown>};
            const r = await cfg.fetch_once();
            return {result: r, attempts: 1, last_status: '__ready__'};
        });
    };

    it('--urls posts an array body to /dca/trigger and polls /dca/dataset',
        async()=>{
            mocks.post.mockResolvedValueOnce({collection_id: 'd_batch'});
            stub_fetch_text('[{"title":"A"},{"title":"B"},{"title":"C"}]');
            stub_poll_once();
            await handle_run_scraper('c_abc', undefined, {
                urls: 'https://a.com,https://b.com,https://c.com',
            });
            expect(mocks.post).toHaveBeenCalledTimes(1);
            const call = mocks.post.mock.calls[0];
            expect(String(call[1]))
                .toMatch(/\/dca\/trigger\?collector=c_abc/);
            expect(call[2]).toEqual([
                {url: 'https://a.com'},
                {url: 'https://b.com'},
                {url: 'https://c.com'},
            ]);
            expect(mocks.print).toHaveBeenCalledWith(
                [{title: 'A'}, {title: 'B'}, {title: 'C'}],
                {json: undefined, pretty: undefined, output: undefined}
            );
        });

    it('--input-file routes to the same batch path', async()=>{
        const p = join(tmp_dir, 'urls.txt');
        writeFileSync(p, 'https://a.com\nhttps://b.com', 'utf8');
        mocks.post.mockResolvedValueOnce({collection_id: 'd_batch'});
        stub_fetch_text('[{"ok":1},{"ok":2}]');
        stub_poll_once();
        await handle_run_scraper('c_abc', undefined, {inputFile: p});
        expect(mocks.post.mock.calls[0][2]).toEqual([
            {url: 'https://a.com'},
            {url: 'https://b.com'},
        ]);
    });

    it('rejects --sync combined with --urls', async()=>{
        await expect(
            handle_run_scraper('c_abc', undefined, {
                sync: true,
                urls: 'https://a.com,https://b.com',
            })
        ).rejects.toThrow(/--sync cannot be combined with --urls/);
        expect(mocks.fail).toHaveBeenCalledWith(
            expect.stringContaining(
                '--sync cannot be combined with --urls'));
        // The request must not be sent when the flags conflict.
        expect(mocks.post).not.toHaveBeenCalled();
    });

    it('rejects when no URL source is provided', async()=>{
        await expect(
            handle_run_scraper('c_abc', undefined, {})
        ).rejects.toThrow(
            /requires one of: <url> positional, --urls, or --input-file/);
    });

    it('rejects when positional and --urls are both set', async()=>{
        await expect(
            handle_run_scraper('c_abc', 'https://a.com',
                {urls: 'https://b.com'})
        ).rejects.toThrow(/only one input source/);
    });

    it('single URL via --urls still takes the legacy single path',
        async()=>{
            mocks.post.mockResolvedValueOnce({response_id: 'r_xyz'});
            stub_fetch_text('{"title":"only"}');
            stub_poll_once();
            await handle_run_scraper('c_abc', undefined,
                {urls: 'https://only.com'});
            // A one-element --urls list is posted as a single object to
            // trigger_immediate rather than as an array to trigger.
            expect(String(mocks.post.mock.calls[0][1])).toMatch(
                /\/dca\/trigger_immediate\?collector=c_abc/);
            expect(mocks.post.mock.calls[0][2]).toEqual(
                {url: 'https://only.com'});
        });
});
1166 | 1454 | }); |
0 commit comments