1414use function func_num_args ;
1515use function implode ;
1616use function intdiv ;
17+ use function ord ;
1718use function str_split ;
1819use function strlen ;
1920use function strrpos ;
@@ -193,7 +194,7 @@ public static function decode(string $input, int $outLength = null, array &$case
193194 array_splice ($ caseFlags , $ i , 0 , [self ::flagged ($ bytes [$ n - 1 ])]);
194195 }
195196
196- array_splice ($ output , $ i ++, 0 , [CodePoint:: encode ($ n )]);
197+ array_splice ($ output , $ i ++, 0 , [self :: encodeCodePoint ($ n )]);
197198 }
198199
199200 return implode ('' , $ output );
@@ -213,9 +214,9 @@ public static function encode(string $input, int $outLength = null, array $caseF
213214 $ bias = self ::INITIAL_BIAS ;
214215 $ inputLength = 0 ;
215216 $ output = '' ;
216- $ iter = new CodePointString ($ input );
217+ $ codePoints = self :: utf8Decode ($ input );
217218
218- foreach ($ iter as $ j => $ codePoint ) {
219+ foreach ($ codePoints as $ j => $ codePoint ) {
219220 ++$ inputLength ;
220221
221222 if ($ codePoint < 0x80 ) {
@@ -241,7 +242,7 @@ public static function encode(string $input, int $outLength = null, array $caseF
241242 while ($ h < $ inputLength ) {
242243 $ m = self ::MAX_INT ;
243244
244- foreach ($ iter as $ codePoint ) {
245+ foreach ($ codePoints as $ codePoint ) {
245246 if ($ codePoint >= $ n && $ codePoint < $ m ) {
246247 $ m = $ codePoint ;
247248 }
@@ -254,7 +255,7 @@ public static function encode(string $input, int $outLength = null, array $caseF
254255 $ delta += ($ m - $ n ) * ($ h + 1 );
255256 $ n = $ m ;
256257
257- foreach ($ iter as $ j => $ codePoint ) {
258+ foreach ($ codePoints as $ j => $ codePoint ) {
258259 if ($ codePoint < $ n && ++$ delta === 0 ) {
259260 throw new OverflowException ();
260261 } elseif ($ codePoint === $ n ) {
@@ -306,6 +307,43 @@ private static function encodeBasic(int $codePoint, bool $flag): string
306307 return chr ($ codePoint + ((!$ flag && ($ codePoint - 65 < 26 ) ? 1 : 0 ) << 5 ));
307308 }
308309
/**
 * Serializes a single Unicode code point as its UTF-8 byte sequence.
 *
 * Code points in 0..7F yield one byte; 80..7FF two bytes; 800..FFFF three bytes; and
 * 10000..10FFFF four bytes. The result for a code point outside the range 0..10FFFF is
 * undefined.
 *
 * @see https://encoding.spec.whatwg.org/#utf-8-encoder
 */
private static function encodeCodePoint(int $codePoint): string
{
    // Pick the number of continuation bytes and the marker bits of the lead byte.
    if ($codePoint >= 0x0080 && $codePoint <= 0x07FF) {
        $trailing = 1;
        $leadMarker = 0xC0;
    } elseif ($codePoint >= 0x0800 && $codePoint <= 0xFFFF) {
        $trailing = 2;
        $leadMarker = 0xE0;
    } elseif ($codePoint >= 0x10000 && $codePoint <= 0x10FFFF) {
        $trailing = 3;
        $leadMarker = 0xF0;
    } else {
        // ASCII maps to itself. Values outside 0..10FFFF also land here and are passed
        // straight to chr(), which is the documented "undefined" case.
        return chr($codePoint);
    }

    $encoded = chr(($codePoint >> (6 * $trailing)) + $leadMarker);

    // Emit the continuation bytes, most significant 6-bit group first.
    for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
        $encoded .= chr(0x80 | (($codePoint >> $shift) & 0x3F));
    }

    return $encoded;
}
346+
309347 private static function encodeDigit (int $ d , bool $ flag ): string
310348 {
311349 return chr ($ d + 22 + 75 * ($ d < 26 ? 1 : 0 ) - (($ flag ? 1 : 0 ) << 5 ));
@@ -315,4 +353,94 @@ private static function flagged(int $codePoint): bool
315353 {
316354 return $ codePoint - 65 < 26 ;
317355 }
356+
/**
 * Decodes a UTF-8 byte string into a list of integer code points.
 *
 * Every ill-formed byte sequence (invalid lead byte, out-of-range continuation byte, or a
 * sequence truncated by the end of the input) contributes one U+FFFD replacement code point.
 * A byte that invalidates a multi-byte sequence is examined again as the potential start of
 * a new sequence.
 *
 * @see https://encoding.spec.whatwg.org/#utf-8-decoder
 *
 * @return array<int, int>
 */
private static function utf8Decode(string $input): array
{
    $result = [];
    $pending = 0;       // Continuation bytes still expected for the current sequence.
    $scalar = 0;        // Code point being assembled.
    $minNext = 0x80;    // Inclusive bounds the next continuation byte must satisfy.
    $maxNext = 0xBF;
    $size = strlen($input);
    $pos = 0;

    while ($pos < $size) {
        $octet = ord($input[$pos]);

        if ($pending > 0) {
            if ($octet >= $minNext && $octet <= $maxNext) {
                // Only the first continuation byte has narrowed bounds; restore the defaults.
                $minNext = 0x80;
                $maxNext = 0xBF;
                $scalar = ($scalar << 6) | ($octet & 0x3F);

                if (--$pending === 0) {
                    $result[] = $scalar;
                    $scalar = 0;
                }

                ++$pos;
            } else {
                // Abort the sequence and leave the cursor in place so this byte is
                // re-read as the start of a new sequence.
                $scalar = 0;
                $pending = 0;
                $minNext = 0x80;
                $maxNext = 0xBF;
                $result[] = 0xFFFD;
            }

            continue;
        }

        // From here on the byte is consumed as the first byte of a sequence.
        ++$pos;

        if ($octet <= 0x7F) {
            $result[] = $octet;

            continue;
        }

        if ($octet >= 0xC2 && $octet <= 0xDF) {
            $pending = 1;
            $scalar = $octet & 0x1F;
        } elseif ($octet >= 0xE0 && $octet <= 0xEF) {
            // E0 and ED restrict the second byte of the sequence.
            if ($octet === 0xE0) {
                $minNext = 0xA0;
            } elseif ($octet === 0xED) {
                $maxNext = 0x9F;
            }

            $pending = 2;
            $scalar = $octet & 0xF;
        } elseif ($octet >= 0xF0 && $octet <= 0xF4) {
            // F0 and F4 restrict the second byte of the sequence.
            if ($octet === 0xF0) {
                $minNext = 0x90;
            } elseif ($octet === 0xF4) {
                $maxNext = 0x8F;
            }

            $pending = 3;
            $scalar = $octet & 0x7;
        } else {
            // Not a valid lead byte.
            $result[] = 0xFFFD;
        }
    }

    // The input ended in the middle of a multi-byte sequence.
    if ($pending !== 0) {
        $result[] = 0xFFFD;
    }

    return $result;
}
318446}
0 commit comments