Skip to content

Commit 2327f31

Browse files
committed
Release 1.0.2
1 parent 04af89e commit 2327f31

4 files changed

Lines changed: 139 additions & 182 deletions

File tree

CHANGELOG.md

100644 → 100755
Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## [1.0.2] - 2020-07-15
4+
5+
### Added
6+
7+
* Minor performance improvements
8+
39
## [1.0.1] - 2020-06-16
410

511
### Fixed

src/CodePoint.php

Lines changed: 0 additions & 47 deletions
This file was deleted.

src/CodePointString.php

Lines changed: 0 additions & 130 deletions
This file was deleted.

src/Punycode.php

100644 → 100755
Lines changed: 133 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
use function func_num_args;
1515
use function implode;
1616
use function intdiv;
17+
use function ord;
1718
use function str_split;
1819
use function strlen;
1920
use function strrpos;
@@ -193,7 +194,7 @@ public static function decode(string $input, int $outLength = null, array &$case
193194
array_splice($caseFlags, $i, 0, [self::flagged($bytes[$n - 1])]);
194195
}
195196

196-
array_splice($output, $i++, 0, [CodePoint::encode($n)]);
197+
array_splice($output, $i++, 0, [self::encodeCodePoint($n)]);
197198
}
198199

199200
return implode('', $output);
@@ -213,9 +214,9 @@ public static function encode(string $input, int $outLength = null, array $caseF
213214
$bias = self::INITIAL_BIAS;
214215
$inputLength = 0;
215216
$output = '';
216-
$iter = new CodePointString($input);
217+
$codePoints = self::utf8Decode($input);
217218

218-
foreach ($iter as $j => $codePoint) {
219+
foreach ($codePoints as $j => $codePoint) {
219220
++$inputLength;
220221

221222
if ($codePoint < 0x80) {
@@ -241,7 +242,7 @@ public static function encode(string $input, int $outLength = null, array $caseF
241242
while ($h < $inputLength) {
242243
$m = self::MAX_INT;
243244

244-
foreach ($iter as $codePoint) {
245+
foreach ($codePoints as $codePoint) {
245246
if ($codePoint >= $n && $codePoint < $m) {
246247
$m = $codePoint;
247248
}
@@ -254,7 +255,7 @@ public static function encode(string $input, int $outLength = null, array $caseF
254255
$delta += ($m - $n) * ($h + 1);
255256
$n = $m;
256257

257-
foreach ($iter as $j => $codePoint) {
258+
foreach ($codePoints as $j => $codePoint) {
258259
if ($codePoint < $n && ++$delta === 0) {
259260
throw new OverflowException();
260261
} elseif ($codePoint === $n) {
@@ -306,6 +307,43 @@ private static function encodeBasic(int $codePoint, bool $flag): string
306307
return chr($codePoint + ((!$flag && ($codePoint - 65 < 26) ? 1 : 0) << 5));
307308
}
308309

310+
/**
 * Serializes a single Unicode code point as its UTF-8 byte sequence.
 *
 * Code points in 0..7F yield one byte; 80..7FF two bytes; 800..FFFF three bytes; and
 * 10000..10FFFF four bytes. No validation is performed: for a value outside 0..10FFFF the
 * result is unspecified (a single byte derived from the raw integer is returned).
 *
 * @see https://encoding.spec.whatwg.org/#utf-8-encoder
 */
private static function encodeCodePoint(int $codePoint): string
{
    // ASCII maps straight to a single byte.
    if ($codePoint >= 0x00 && $codePoint <= 0x7F) {
        return chr($codePoint);
    }

    // Pick the number of continuation bytes and the marker bits of the lead byte.
    if ($codePoint >= 0x0080 && $codePoint <= 0x07FF) {
        $trailing = 1;
        $leadMarker = 0xC0;
    } elseif ($codePoint >= 0x0800 && $codePoint <= 0xFFFF) {
        $trailing = 2;
        $leadMarker = 0xE0;
    } elseif ($codePoint >= 0x10000 && $codePoint <= 0x10FFFF) {
        $trailing = 3;
        $leadMarker = 0xF0;
    } else {
        // Out-of-range input: no continuation bytes and no marker bits.
        $trailing = 0;
        $leadMarker = 0;
    }

    // Lead byte carries the highest-order payload bits.
    $encoded = chr(($codePoint >> (6 * $trailing)) + $leadMarker);

    // Each continuation byte carries the next six payload bits, most significant first.
    for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
        $encoded .= chr(0x80 | (($codePoint >> $shift) & 0x3F));
    }

    return $encoded;
}
346+
309347
private static function encodeDigit(int $d, bool $flag): string
310348
{
311349
return chr($d + 22 + 75 * ($d < 26 ? 1 : 0) - (($flag ? 1 : 0) << 5));
@@ -315,4 +353,94 @@ private static function flagged(int $codePoint): bool
315353
{
316354
return $codePoint - 65 < 26;
317355
}
356+
357+
/**
 * Decodes a UTF-8 byte string into a list of integer code points.
 *
 * Every ill-formed sequence (stray continuation byte, invalid lead byte, overlong form,
 * encoded surrogate, value above U+10FFFF, or a sequence cut short by the end of the input)
 * contributes a U+FFFD replacement code point instead of aborting the decode.
 *
 * @see https://encoding.spec.whatwg.org/#utf-8-decoder
 *
 * @return array<int, int>
 */
private static function utf8Decode(string $input): array
{
    $result = [];
    $value = 0;        // Code point being assembled.
    $expected = 0;     // Continuation bytes required by the current lead byte.
    $consumed = 0;     // Continuation bytes accepted so far.
    $minNext = 0x80;   // Inclusive range allowed for the next continuation byte.
    $maxNext = 0xBF;
    $total = strlen($input);
    $pos = 0;

    while ($pos < $total) {
        $octet = ord($input[$pos]);

        if ($expected !== 0) {
            if ($octet >= $minNext && $octet <= $maxNext) {
                // Only the first continuation byte has a narrowed range.
                $minNext = 0x80;
                $maxNext = 0xBF;
                $value = ($value << 6) | ($octet & 0x3F);
                ++$consumed;
                ++$pos;

                if ($consumed === $expected) {
                    $result[] = $value;
                    $value = 0;
                    $expected = 0;
                    $consumed = 0;
                }

                continue;
            }

            // Broken sequence: emit a replacement and reset. The cursor is deliberately not
            // advanced so the offending byte is examined again as a potential lead byte.
            $result[] = 0xFFFD;
            $value = 0;
            $expected = 0;
            $consumed = 0;
            $minNext = 0x80;
            $maxNext = 0xBF;

            continue;
        }

        // From here on the byte starts a new sequence and is always consumed.
        ++$pos;

        if ($octet <= 0x7F) {
            $result[] = $octet;

            continue;
        }

        if ($octet >= 0xC2 && $octet <= 0xDF) {
            $expected = 1;
            $value = $octet & 0x1F;

            continue;
        }

        if ($octet >= 0xE0 && $octet <= 0xEF) {
            if ($octet === 0xE0) {
                // Rejects overlong three-byte forms.
                $minNext = 0xA0;
            } elseif ($octet === 0xED) {
                // Rejects encoded surrogates (U+D800..U+DFFF).
                $maxNext = 0x9F;
            }

            $expected = 2;
            $value = $octet & 0xF;

            continue;
        }

        if ($octet >= 0xF0 && $octet <= 0xF4) {
            if ($octet === 0xF0) {
                // Rejects overlong four-byte forms.
                $minNext = 0x90;
            } elseif ($octet === 0xF4) {
                // Rejects values above U+10FFFF.
                $maxNext = 0x8F;
            }

            $expected = 3;
            $value = $octet & 0x7;

            continue;
        }

        // Not a valid lead byte.
        $result[] = 0xFFFD;
    }

    // Input ended in the middle of a multi-byte sequence.
    if ($expected !== 0) {
        $result[] = 0xFFFD;
    }

    return $result;
}
318446
}

0 commit comments

Comments
 (0)