|
1 | | -/** @file Unit tests for {@link convertParatext9ToInterlinearization}. */ |
| 1 | +/** @file Unit tests for {@link convertParatext9ToInterlinearization} and {@link createAnalyses}. */ |
2 | 2 | /// <reference types="jest" /> |
3 | 3 |
|
4 | 4 | import type { InterlinearData } from 'paratext-9-types'; |
5 | | -import { convertParatext9ToInterlinearization } from 'parsers/paratext-9/paratext9Converter'; |
| 5 | +import { |
| 6 | + convertParatext9ToInterlinearization, |
| 7 | + createAnalyses, |
| 8 | +} from 'parsers/paratext-9/paratext9Converter'; |
6 | 9 |
|
7 | 10 | describe('convertParatext9ToInterlinearization', () => { |
8 | 11 | describe('top-level structure', () => { |
@@ -428,4 +431,215 @@ describe('convertParatext9ToInterlinearization', () => { |
428 | 431 | expect(result.books[0].segments[0].occurrences[0].id).toBe(`${segId}-occ-0-Word:word/0-4`); |
429 | 432 | }); |
430 | 433 | }); |
| 434 | + |
describe('createAnalyses', () => {
  // Fixture types are derived from InterlinearData so the helpers below stay in
  // sync with the real input shape instead of re-declaring it.
  type VerseFixture = InterlinearData['verses'][string];
  type ClusterFixture = VerseFixture['clusters'][number];
  type LexemeFixture = ClusterFixture['lexemes'][number];

  /** Builds a non-excluded cluster covering `length` characters starting at `index`. */
  const cluster = (
    id: string,
    index: number,
    length: number,
    lexemesId: string,
    lexemes: LexemeFixture[],
  ): ClusterFixture => ({
    textRange: { index, length },
    lexemes,
    lexemesId,
    id,
    excluded: false,
  });

  /** Wraps the given clusters in a verse with an empty hash and no punctuation. */
  const verse = (...clusters: ClusterFixture[]): VerseFixture => ({
    hash: '',
    clusters,
    punctuations: [],
  });

  /** Assembles the top-level input object handed to `createAnalyses`. */
  const interlinear = (
    glossLanguage: InterlinearData['glossLanguage'],
    bookId: InterlinearData['bookId'],
    verses: InterlinearData['verses'],
  ): InterlinearData => ({ glossLanguage, bookId, verses });

  it('returns empty Map when verses is empty', () => {
    const analyses = createAnalyses(interlinear('en', 'MAT', {}));

    expect(analyses).toBeInstanceOf(Map);
    expect(analyses.size).toBe(0);
  });

  it('returns one Analysis for one verse with one cluster and one lexeme', () => {
    const expectedId = 'analysis-en-Word:hello-g1';
    const input = interlinear('en', 'MAT', {
      'MAT 1:1': verse(
        cluster('Word:hello/0-4', 0, 4, 'Word:hello', [
          { lexemeId: 'Word:hello', senseId: 'g1' },
        ]),
      ),
    });

    const analyses = createAnalyses(input);

    expect(analyses.size).toBe(1);
    const entry = analyses.get(expectedId);
    expect(entry).toBeDefined();
    expect(entry?.id).toBe(expectedId);
    expect(entry?.analysisLanguage).toBe('en');
    expect(entry?.analysisType).toBe('gloss');
    expect(entry?.confidence).toBe('medium');
    expect(entry?.sourceSystem).toBe('paratext-9');
    expect(entry?.sourceUser).toBe('paratext-9-parser');
    expect(entry?.glossText).toBe('g1');
  });

  it('deduplicates: same lexeme in multiple clusters yields one analysis', () => {
    // Two clusters at different offsets that carry the identical lexeme/sense pair.
    const input = interlinear('en', 'MAT', {
      'MAT 1:1': verse(
        cluster('c1', 0, 3, 'Word:the', [{ lexemeId: 'Word:the', senseId: 'def' }]),
        cluster('c2', 4, 3, 'Word:the', [{ lexemeId: 'Word:the', senseId: 'def' }]),
      ),
    });

    const analyses = createAnalyses(input);

    expect(analyses.size).toBe(1);
    expect(analyses.has('analysis-en-Word:the-def')).toBe(true);
  });

  it('returns multiple analyses for different lexemes (lexemeId or senseId)', () => {
    const stemId = 'analysis-en-Stem:run-g1';
    const suffixId = 'analysis-en-Suffix:ing-g2';
    // A single cluster holding two distinct lexemes.
    const input = interlinear('en', 'MAT', {
      'MAT 1:1': verse(
        cluster('cluster1', 0, 4, 'Stem:run', [
          { lexemeId: 'Stem:run', senseId: 'g1' },
          { lexemeId: 'Suffix:ing', senseId: 'g2' },
        ]),
      ),
    });

    const analyses = createAnalyses(input);

    expect(analyses.size).toBe(2);
    expect(analyses.has(stemId)).toBe(true);
    expect(analyses.has(suffixId)).toBe(true);
    expect(analyses.get(stemId)?.glossText).toBe('g1');
    expect(analyses.get(suffixId)?.glossText).toBe('g2');
  });

  it('sets glossText to undefined when senseId is empty', () => {
    // With an empty senseId the expected id carries no sense suffix.
    const expectedId = 'analysis-en-Word:a';
    const input = interlinear('en', 'MAT', {
      'MAT 1:1': verse(
        cluster('Word:a/0-1', 0, 1, 'Word:a', [{ lexemeId: 'Word:a', senseId: '' }]),
      ),
    });

    const analyses = createAnalyses(input);

    expect(analyses.size).toBe(1);
    const entry = analyses.get(expectedId);
    expect(entry).toBeDefined();
    expect(entry?.glossText).toBeUndefined();
    expect(entry?.id).toBe(expectedId);
  });

  it('uses glossLanguage from interlinearData for analysisLanguage and id prefix', () => {
    const expectedId = 'analysis-fr-Word:au-sens1';
    const input = interlinear('fr', 'GEN', {
      'GEN 1:1': verse(
        cluster('c1', 0, 2, 'Word:au', [{ lexemeId: 'Word:au', senseId: 'sens1' }]),
      ),
    });

    const analyses = createAnalyses(input);

    expect(analyses.size).toBe(1);
    const entry = analyses.get(expectedId);
    expect(entry).toBeDefined();
    expect(entry?.analysisLanguage).toBe('fr');
    expect(entry?.id).toBe(expectedId);
  });

  it('includes analyses from all verses', () => {
    // One distinct lexeme per verse; both must appear in the result.
    const input = interlinear('en', 'MAT', {
      'MAT 1:1': verse(
        cluster('c1', 0, 3, 'Word:one', [{ lexemeId: 'Word:one', senseId: 's1' }]),
      ),
      'MAT 1:2': verse(
        cluster('c2', 0, 3, 'Word:two', [{ lexemeId: 'Word:two', senseId: 's2' }]),
      ),
    });

    const analyses = createAnalyses(input);

    expect(analyses.size).toBe(2);
    expect(analyses.has('analysis-en-Word:one-s1')).toBe(true);
    expect(analyses.has('analysis-en-Word:two-s2')).toBe(true);
  });
});
431 | 645 | }); |
0 commit comments