|
| 1 | +<?php |
| 2 | + |
| 3 | +/** |
| 4 | + * Unit tests covering WordPress’ UTF-8 handling. |
| 5 | + * |
| 6 | + * @package WordPress |
| 7 | + * @group unicode |
| 8 | + * |
| 9 | + * @covers ::wp_check_invalid_utf8 |
| 10 | + */ |
class Tests_Unicode_WpCheckInvalidUtf8 extends WP_UnitTestCase {

	/**
	 * Verifies that WordPress can properly detect valid and invalid UTF-8.
	 *
	 * When `$scrubbed` is `null` the input is treated as valid UTF-8 and must be
	 * returned unchanged in both modes. Otherwise the input is treated as invalid:
	 * the non-stripping call must return an empty string and the stripping call
	 * must return `$scrubbed`.
	 *
	 * @ticket 63837
	 *
	 * @dataProvider data_utf8_test_data
	 *
	 * @param string      $bytes    Bytes as a PHP string.
	 * @param string|null $scrubbed Expected checked value, if string isn’t valid UTF-8.
	 */
	public function test_properly_checks_utf8( string $bytes, ?string $scrubbed = null ) {
		if ( null === $scrubbed ) {
			$this->assertSame(
				$bytes,
				wp_check_invalid_utf8( $bytes ),
				'Should have returned the unchanged string for valid UTF-8 input when not stripping invalid bytes.'
			);

			$this->assertSame(
				$bytes,
				wp_check_invalid_utf8( $bytes, true ),
				'Should have returned the unchanged string for valid UTF-8 input when stripping invalid bytes.'
			);
		} else {
			$this->assertSame(
				'',
				wp_check_invalid_utf8( $bytes ),
				'Should have rejected invalid input, returning an empty string when not stripping invalid bytes.'
			);

			$this->assertSame(
				$scrubbed,
				wp_check_invalid_utf8( $bytes, true ),
				'Failed to properly scrub the invalid spans of UTF-8 from the input string.'
			);
		}
	}

	/**
	 * Data provider.
	 *
	 * Reads the colon-delimited fixture file line by line. Blank lines are skipped,
	 * lines starting with `#` set the description used in following dataset names,
	 * and every other line must contain at least `reference:classification:data`.
	 *
	 * @throws Exception If the fixture file cannot be opened, a line is malformed,
	 *                   a hex field is not valid hex, or the classification is unknown.
	 *
	 * @return Generator Yields `array( $bytes, $scrubbed )` keyed by "reference description".
	 */
	public static function data_utf8_test_data() {
		$test_path = __DIR__ . '/../../data/unicode/utf8tests/utf8tests.txt';
		$test_file = fopen( $test_path, 'r' );

		// Fail loudly rather than passing `false` on to fgets().
		if ( false === $test_file ) {
			throw new Exception( "Unable to open UTF-8 test data file: {$test_path}" );
		}

		$line_number      = 0;
		$last_description = '';

		// The `finally` clause releases the handle even if the generator is abandoned or throws.
		try {
			while ( false !== ( $line = fgets( $test_file ) ) ) {
				++$line_number;

				// Strict comparison: `empty()` would also skip a line containing only "0".
				if ( '' === trim( $line ) ) {
					continue;
				}

				if ( str_starts_with( $line, '#' ) ) {
					$last_description = trim( substr( $line, 1 ) );
					continue;
				}

				$test_parts = explode( ':', $line );
				if ( count( $test_parts ) < 3 ) {
					throw new Exception( 'Wrong test data: check utf8tests.txt' );
				}

				list( $reference, $classification, $test_data ) = $test_parts;

				$reference      = trim( $reference );
				$classification = trim( $classification );
				$test_data      = trim( $test_data );

				switch ( $classification ) {
					case 'valid':
						yield "{$reference} {$last_description}" => array( $test_data, null );
						break;

					case 'valid hex':
					case 'invalid hex':
						if ( 'invalid hex' === $classification && count( $test_parts ) < 5 ) {
							throw new Exception( "Test data missing expected “scrubbed” value: check utf8tests.txt:{$line_number}" );
						}

						$bytes    = self::hex_to_bytes( $test_data, $line_number );
						$scrubbed = 'invalid hex' === $classification
							? self::hex_to_bytes( trim( $test_parts[4] ), $line_number )
							: null;

						yield "{$reference} {$last_description}" => array( $bytes, $scrubbed );
						break;

					default:
						throw new Exception( "Test input file contains unrecognized input classification '{$classification}' (see utf8tests.txt): {$line}" );
				}
			}
		} finally {
			fclose( $test_file );
		}
	}

	/**
	 * Decodes a space-separated hex string from the test data file into raw bytes.
	 *
	 * @throws Exception If the value is not an even-length string of hex digits.
	 *
	 * @param string $hex         Hex digits, optionally separated by spaces.
	 * @param int    $line_number Line of the test data file, used in the error message.
	 * @return string Decoded bytes; an empty input decodes to an empty string.
	 */
	private static function hex_to_bytes( string $hex, int $line_number ): string {
		$hex = str_replace( ' ', '', $hex );

		// Validate up front: hex2bin() returns false on bad input, which would be coerced to ''.
		if ( '' !== $hex && ( 0 !== strlen( $hex ) % 2 || ! ctype_xdigit( $hex ) ) ) {
			throw new Exception( "Test data contains malformed hex value: check utf8tests.txt:{$line_number}" );
		}

		return (string) hex2bin( $hex );
	}
}
0 commit comments