|
/**
 * Report whether `string` uses Unix ("\n") line endings exclusively.
 *
 * Returns true only when the string contains at least one "\n" and no
 * "\r\n" sequence anywhere. A string with no line breaks at all therefore
 * returns false.
 *
 * @param {string} string - text to inspect
 * @returns {boolean}
 */
function hasOnlyUnixLineEndings(string) {
    return !string.includes('\r\n') && string.includes('\n');
}
371 | | - function trailingWs(string) { |
/**
 * Split a string into segments using a word segmenter, merging consecutive
 * segments if they are both whitespace segments. Whitespace segments can
 * appear adjacent to one another for two reasons:
 * - newlines always get their own segment
 * - where a diacritic is attached to a whitespace character in the text, the
 *   segment ends after the diacritic, so e.g. " \u0300 " becomes two segments.
 * This function therefore runs the segmenter's .segment() method and then
 * merges consecutive segments of whitespace into a single part.
 *
 * A segment counts as whitespace when it contains at least one character
 * matched by /\s/.
 *
 * @param {string} string - text to split
 * @param {{segment: function(string): Iterable<{segment: string}>}} segmenter -
 *   object whose .segment() yields items with a string `segment` property
 *   (presumably a word-granularity Intl.Segmenter; granularity is not checked here)
 * @returns {string[]} the parts, in order; empty when the segmenter yields nothing
 */
function segment(string, segmenter) {
    // No g/y flag, so .test() keeps no state between calls and can be reused.
    const whitespace = /\s/;
    const parts = [];
    // Whether the most recently pushed/extended part is whitespace. A part only
    // ever grows by appending whitespace to whitespace, so this is always equal
    // to re-testing the last part itself.
    let lastIsWhitespace = false;
    // The segmenter's result is consumed directly by for...of; no intermediate
    // array copy is needed.
    for (const { segment: text } of segmenter.segment(string)) {
        const isWhitespace = whitespace.test(text);
        if (parts.length > 0 && lastIsWhitespace && isWhitespace) {
            parts[parts.length - 1] += text;
        }
        else {
            parts.push(text);
        }
        lastIsWhitespace = isWhitespace;
    }
    return parts;
}
| 394 | + // The functions below take a `segmenter` argument so that, when called from |
| 395 | + // diffWords when it is using a segmenter, they can use a notion of what |
| 396 | + // constitutes "whitespace" that is consistent with the segmenter. |
| 397 | + // |
| 398 | + // USUALLY this will be identical to the result of the non-segmenter-based |
| 399 | + // logic, but it differs in at least one case: when whitespace characters are |
| 400 | + // modified by diacritics. A word segmenter considers these diacritics to be |
| 401 | + // part of the whitespace, whereas our non-segmenter-based logic does not. |
| 402 | + // |
| 403 | + // Because the segmenter-based approach necessarily requires segmenting the |
| 404 | + // entire string, we offer a leadingAndTrailingWs function to allow getting the |
| 405 | + // whitespace prefix AND whitespace suffix with a single call to the segmenter, |
| 406 | + // for efficiency's sake. |
| 407 | + function trailingWs(string, segmenter) { |
| 408 | + if (segmenter) { |
| 409 | + return leadingAndTrailingWs(string, segmenter)[1]; |
| 410 | + } |
372 | 411 | // Yes, this looks overcomplicated and dumb - why not replace the whole function with |
373 | 412 | // return string.match(/\s*$/)[0] |
374 | 413 | // you ask? Because: |
|
388 | 427 | } |
389 | 428 | return string.substring(i + 1); |
390 | 429 | } |
/**
 * Return the whitespace at the start of `string`.
 *
 * When a `segmenter` is supplied, the answer is delegated to
 * leadingAndTrailingWs so that the segmenter decides what counts as
 * whitespace; otherwise the leading run matched by /^\s*​/ is returned.
 *
 * @param {string} string - text to inspect
 * @param {object} [segmenter] - optional segmenter, passed through to
 *   leadingAndTrailingWs
 * @returns {string} the leading whitespace, or '' when there is none
 */
function leadingWs(string, segmenter) {
    if (segmenter) {
        return leadingAndTrailingWs(string, segmenter)[0];
    }
    // Thankfully the annoying considerations described in trailingWs don't apply here:
    const match = string.match(/^\s*/);
    return match ? match[0] : '';
}
/**
 * Return the whitespace prefix and whitespace suffix of `string` as a
 * two-element array `[head, tail]`.
 *
 * Without a `segmenter` this simply combines leadingWs and trailingWs. With a
 * segmenter, the string is split once via segment() and the first and last
 * parts are reported if they contain whitespace. Note that when the whole
 * string is a single whitespace part, that same part is returned as both head
 * and tail.
 *
 * @param {string} string - text to inspect
 * @param {object} [segmenter] - optional segmenter; must report a granularity
 *   of 'word' from resolvedOptions()
 * @returns {[string, string]} `[head, tail]`, each '' when absent
 * @throws {Error} if a segmenter is passed whose granularity is not 'word'
 */
function leadingAndTrailingWs(string, segmenter) {
    if (!segmenter) {
        return [leadingWs(string), trailingWs(string)];
    }
    if (segmenter.resolvedOptions().granularity !== 'word') {
        throw new Error('The segmenter passed must have a granularity of "word"');
    }
    const segments = segment(string, segmenter);
    // Nothing to inspect (e.g. empty input): say so explicitly rather than
    // relying on how the regex test treats `undefined`.
    if (segments.length === 0) {
        return ['', ''];
    }
    const whitespace = /\s/;
    const firstSeg = segments[0];
    const lastSeg = segments[segments.length - 1];
    const head = whitespace.test(firstSeg) ? firstSeg : '';
    const tail = whitespace.test(lastSeg) ? lastSeg : '';
    return [head, tail];
}
396 | 452 |
|
397 | 453 | // Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode |
398 | 454 | // |
|
458 | 514 | // We want `parts` to be an array whose elements alternate between being |
459 | 515 | // pure whitespace and being pure non-whitespace. This is ALMOST what the |
460 | 516 | // segments returned by a word-based Intl.Segmenter already look like, |
461 | | - // and therefore we can ALMOST get what we want by simply doing... |
462 | | - // parts = Array.from(segmenter.segment(value), segment => segment.segment); |
463 | | - // ... but not QUITE, because there's of one annoying special case: every |
464 | | - // newline character gets its own segment, instead of sharing a segment |
465 | | - // with other surrounding whitespace. We therefore need to manually merge |
466 | | - // consecutive segments of whitespace into a single part: |
467 | | - parts = []; |
468 | | - for (const segmentObj of Array.from(segmenter.segment(value))) { |
469 | | - const segment = segmentObj.segment; |
470 | | - if (parts.length && (/\s/).test(parts[parts.length - 1]) && (/\s/).test(segment)) { |
471 | | - parts[parts.length - 1] += segment; |
472 | | - } |
473 | | - else { |
474 | | - parts.push(segment); |
475 | | - } |
476 | | - } |
| 517 | + // but not quite - see explanation in the docs of our custom segment() |
| 518 | + // function. |
| 519 | + parts = segment(value, segmenter); |
477 | 520 | } |
478 | 521 | else { |
479 | 522 | parts = value.match(tokenizeIncludingWhitespace) || []; |
|
537 | 580 | } |
538 | 581 | else { |
539 | 582 | if (insertion || deletion) { // May be false at start of text |
540 | | - dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change); |
| 583 | + dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change, options.intlSegmenter); |
541 | 584 | } |
542 | 585 | lastKeep = change; |
543 | 586 | insertion = null; |
544 | 587 | deletion = null; |
545 | 588 | } |
546 | 589 | }); |
547 | 590 | if (insertion || deletion) { |
548 | | - dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null); |
| 591 | + dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null, options.intlSegmenter); |
549 | 592 | } |
550 | 593 | return changes; |
551 | 594 | } |
|
561 | 604 | } |
562 | 605 | return wordDiff.diff(oldStr, newStr, options); |
563 | 606 | } |
564 | | - function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep) { |
| 607 | + function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep, segmenter) { |
565 | 608 | // Before returning, we tidy up the leading and trailing whitespace of the |
566 | 609 | // change objects to eliminate cases where trailing whitespace in one object |
567 | 610 | // is repeated as leading whitespace in the next. |
|
604 | 647 | // * Just a "delete" |
605 | 648 | // We handle the three cases separately. |
606 | 649 | if (deletion && insertion) { |
607 | | - const oldWsPrefix = leadingWs(deletion.value); |
608 | | - const oldWsSuffix = trailingWs(deletion.value); |
609 | | - const newWsPrefix = leadingWs(insertion.value); |
610 | | - const newWsSuffix = trailingWs(insertion.value); |
| 650 | + const [oldWsPrefix, oldWsSuffix] = leadingAndTrailingWs(deletion.value, segmenter); |
| 651 | + const [newWsPrefix, newWsSuffix] = leadingAndTrailingWs(insertion.value, segmenter); |
611 | 652 | if (startKeep) { |
612 | 653 | const commonWsPrefix = longestCommonPrefix(oldWsPrefix, newWsPrefix); |
613 | 654 | startKeep.value = replaceSuffix(startKeep.value, newWsPrefix, commonWsPrefix); |
|
629 | 670 | // whitespace and deleting duplicate leading whitespace where |
630 | 671 | // present. |
631 | 672 | if (startKeep) { |
632 | | - const ws = leadingWs(insertion.value); |
| 673 | + const ws = leadingWs(insertion.value, segmenter); |
633 | 674 | insertion.value = insertion.value.substring(ws.length); |
634 | 675 | } |
635 | 676 | if (endKeep) { |
636 | | - const ws = leadingWs(endKeep.value); |
| 677 | + const ws = leadingWs(endKeep.value, segmenter); |
637 | 678 | endKeep.value = endKeep.value.substring(ws.length); |
638 | 679 | } |
639 | 680 | // otherwise we've got a deletion and no insertion |
640 | 681 | } |
641 | 682 | else if (startKeep && endKeep) { |
642 | | - const newWsFull = leadingWs(endKeep.value), delWsStart = leadingWs(deletion.value), delWsEnd = trailingWs(deletion.value); |
| 683 | + const newWsFull = leadingWs(endKeep.value, segmenter), [delWsStart, delWsEnd] = leadingAndTrailingWs(deletion.value, segmenter); |
643 | 684 | // Any whitespace that comes straight after startKeep in both the old and |
644 | 685 | // new texts, assign to startKeep and remove from the deletion. |
645 | 686 | const newWsStart = longestCommonPrefix(newWsFull, delWsStart); |
|
658 | 699 | // We are at the start of the text. Preserve all the whitespace on |
659 | 700 | // endKeep, and just remove whitespace from the end of deletion to the |
660 | 701 | // extent that it overlaps with the start of endKeep. |
661 | | - const endKeepWsPrefix = leadingWs(endKeep.value); |
662 | | - const deletionWsSuffix = trailingWs(deletion.value); |
| 702 | + const endKeepWsPrefix = leadingWs(endKeep.value, segmenter); |
| 703 | + const deletionWsSuffix = trailingWs(deletion.value, segmenter); |
663 | 704 | const overlap = maximumOverlap(deletionWsSuffix, endKeepWsPrefix); |
664 | 705 | deletion.value = removeSuffix(deletion.value, overlap); |
665 | 706 | } |
666 | 707 | else if (startKeep) { |
667 | 708 | // We are at the END of the text. Preserve all the whitespace on |
668 | 709 | // startKeep, and just remove whitespace from the start of deletion to |
669 | 710 | // the extent that it overlaps with the end of startKeep. |
670 | | - const startKeepWsSuffix = trailingWs(startKeep.value); |
671 | | - const deletionWsPrefix = leadingWs(deletion.value); |
| 711 | + const startKeepWsSuffix = trailingWs(startKeep.value, segmenter); |
| 712 | + const deletionWsPrefix = leadingWs(deletion.value, segmenter); |
672 | 713 | const overlap = maximumOverlap(startKeepWsSuffix, deletionWsPrefix); |
673 | 714 | deletion.value = removePrefix(deletion.value, overlap); |
674 | 715 | } |
|
0 commit comments