diff --git a/CHANGELOG.md b/CHANGELOG.md old mode 100644 new mode 100755 index f181eb1..bd8262f --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [1.0.2] - 2020-07-15 + +### Added + +* Minor performance improvements + ## [1.0.1] - 2020-06-16 ### Fixed diff --git a/src/CodePoint.php b/src/CodePoint.php deleted file mode 100644 index dcb349a..0000000 --- a/src/CodePoint.php +++ /dev/null @@ -1,47 +0,0 @@ -= 0x00 && $codePoint <= 0x7F) { - return chr($codePoint); - } - - $count = 0; - $offset = 0; - - if ($codePoint >= 0x0080 && $codePoint <= 0x07FF) { - $count = 1; - $offset = 0xC0; - } elseif ($codePoint >= 0x0800 && $codePoint <= 0xFFFF) { - $count = 2; - $offset = 0xE0; - } elseif ($codePoint >= 0x10000 && $codePoint <= 0x10FFFF) { - $count = 3; - $offset = 0xF0; - } - - $bytes = chr(($codePoint >> (6 * $count)) + $offset); - - while ($count > 0) { - $temp = $codePoint >> (6 * ($count - 1)); - $bytes .= chr(0x80 | ($temp & 0x3F)); - --$count; - } - - return $bytes; - } -} diff --git a/src/CodePointString.php b/src/CodePointString.php deleted file mode 100644 index b436104..0000000 --- a/src/CodePointString.php +++ /dev/null @@ -1,130 +0,0 @@ - - */ -class CodePointString implements IteratorAggregate -{ - /** - * @var array - */ - protected $codePoints; - - public function __construct(string $input) - { - $this->codePoints = $this->utf8Decode($input); - } - - /** - * @return \Generator - */ - public function getIterator(): Generator - { - $length = count($this->codePoints); - - for ($i = 0; $i < $length; ++$i) { - yield $i => $this->codePoints[$i]; - } - } - - /** - * Takes a UTF-8 encoded string and converts it into a series of integer code points. Any - * invalid byte sequences will be replaced by a U+FFFD replacement code point. - * - * @see https://encoding.spec.whatwg.org/#utf-8-decoder - * - * @return array - */ - private function utf8Decode(string $input): array - { - $bytesSeen = 0; - $bytesNeeded = 0; - $lowerBoundary = 0x80; - $upperBoundary = 0xBF; - $codePoint = 0; - $codePoints = []; - $length = strlen($input); - - for ($i = 0; $i < $length; ++$i) { - $byte = ord($input[$i]); - - if ($bytesNeeded === 0) { - if ($byte >= 0x00 && $byte <= 0x7F) { - $codePoints[] = $byte; - - continue; - } - - if ($byte >= 0xC2 && $byte <= 0xDF) { - $bytesNeeded = 1; - $codePoint = $byte & 0x1F; - } elseif ($byte >= 0xE0 && $byte <= 0xEF) { - if ($byte === 0xE0) { - $lowerBoundary = 0xA0; - } elseif ($byte === 0xED) { - $upperBoundary = 0x9F; - } - - $bytesNeeded = 2; - $codePoint = $byte & 0xF; - } elseif ($byte >= 0xF0 && $byte <= 0xF4) { - if ($byte === 0xF0) { - $lowerBoundary = 0x90; - } elseif ($byte === 0xF4) { - $upperBoundary = 0x8F; - } - - $bytesNeeded = 3; - $codePoint = $byte & 0x7; - } else { - $codePoints[] = 0xFFFD; - } - - continue; - } - - if ($byte < $lowerBoundary || $byte > $upperBoundary) { - $codePoint = 0; - $bytesNeeded = 0; - $bytesSeen = 0; - $lowerBoundary = 0x80; - $upperBoundary = 0xBF; - --$i; - $codePoints[] = 0xFFFD; - - continue; - } - - $lowerBoundary = 0x80; - $upperBoundary = 0xBF; - $codePoint = ($codePoint << 6) | ($byte & 0x3F); - - if (++$bytesSeen !== $bytesNeeded) { - continue; - } - - $codePoints[] = $codePoint; - $codePoint = 0; - $bytesNeeded = 0; - $bytesSeen = 0; - } - - // String unexpectedly ended, so append a U+FFFD code point. - if ($bytesNeeded !== 0) { - $codePoints[] = 0xFFFD; - } - - return $codePoints; - } -} diff --git a/src/Punycode.php b/src/Punycode.php old mode 100644 new mode 100755 index df6e620..a62ffab --- a/src/Punycode.php +++ b/src/Punycode.php @@ -14,6 +14,7 @@ use function func_num_args; use function implode; use function intdiv; +use function ord; use function str_split; use function strlen; use function strrpos; @@ -193,7 +194,7 @@ public static function decode(string $input, int $outLength = null, array &$case array_splice($caseFlags, $i, 0, [self::flagged($bytes[$n - 1])]); } - array_splice($output, $i++, 0, [CodePoint::encode($n)]); + array_splice($output, $i++, 0, [self::encodeCodePoint($n)]); } return implode('', $output); @@ -213,9 +214,9 @@ public static function encode(string $input, int $outLength = null, array $caseF $bias = self::INITIAL_BIAS; $inputLength = 0; $output = ''; - $iter = new CodePointString($input); + $codePoints = self::utf8Decode($input); - foreach ($iter as $j => $codePoint) { + foreach ($codePoints as $j => $codePoint) { ++$inputLength; if ($codePoint < 0x80) { @@ -241,7 +242,7 @@ public static function encode(string $input, int $outLength = null, array $caseF while ($h < $inputLength) { $m = self::MAX_INT; - foreach ($iter as $codePoint) { + foreach ($codePoints as $codePoint) { if ($codePoint >= $n && $codePoint < $m) { $m = $codePoint; } @@ -254,7 +255,7 @@ public static function encode(string $input, int $outLength = null, array $caseF $delta += ($m - $n) * ($h + 1); $n = $m; - foreach ($iter as $j => $codePoint) { + foreach ($codePoints as $j => $codePoint) { if ($codePoint < $n && ++$delta === 0) { throw new OverflowException(); } elseif ($codePoint === $n) { @@ -306,6 +307,43 @@ private static function encodeBasic(int $codePoint, bool $flag): string return chr($codePoint + ((!$flag && ($codePoint - 65 < 26) ? 1 : 0) << 5)); } + /** + * Takes a Unicode code point and encodes it. The return behavior is undefined if the given + * code point is outside the range 0..10FFFF. + * + * @see https://encoding.spec.whatwg.org/#utf-8-encoder + */ + private static function encodeCodePoint(int $codePoint): string + { + if ($codePoint >= 0x00 && $codePoint <= 0x7F) { + return chr($codePoint); + } + + $count = 0; + $offset = 0; + + if ($codePoint >= 0x0080 && $codePoint <= 0x07FF) { + $count = 1; + $offset = 0xC0; + } elseif ($codePoint >= 0x0800 && $codePoint <= 0xFFFF) { + $count = 2; + $offset = 0xE0; + } elseif ($codePoint >= 0x10000 && $codePoint <= 0x10FFFF) { + $count = 3; + $offset = 0xF0; + } + + $bytes = chr(($codePoint >> (6 * $count)) + $offset); + + while ($count > 0) { + $temp = $codePoint >> (6 * ($count - 1)); + $bytes .= chr(0x80 | ($temp & 0x3F)); + --$count; + } + + return $bytes; + } + private static function encodeDigit(int $d, bool $flag): string { return chr($d + 22 + 75 * ($d < 26 ? 1 : 0) - (($flag ? 1 : 0) << 5)); @@ -315,4 +353,94 @@ private static function flagged(int $codePoint): bool { return $codePoint - 65 < 26; } + + /** + * Takes a UTF-8 encoded string and converts it into a series of integer code points. Any + * invalid byte sequences will be replaced by a U+FFFD replacement code point. + * + * @see https://encoding.spec.whatwg.org/#utf-8-decoder + * + * @return array + */ + private static function utf8Decode(string $input): array + { + $bytesSeen = 0; + $bytesNeeded = 0; + $lowerBoundary = 0x80; + $upperBoundary = 0xBF; + $codePoint = 0; + $codePoints = []; + $length = strlen($input); + + for ($i = 0; $i < $length; ++$i) { + $byte = ord($input[$i]); + + if ($bytesNeeded === 0) { + if ($byte >= 0x00 && $byte <= 0x7F) { + $codePoints[] = $byte; + + continue; + } + + if ($byte >= 0xC2 && $byte <= 0xDF) { + $bytesNeeded = 1; + $codePoint = $byte & 0x1F; + } elseif ($byte >= 0xE0 && $byte <= 0xEF) { + if ($byte === 0xE0) { + $lowerBoundary = 0xA0; + } elseif ($byte === 0xED) { + $upperBoundary = 0x9F; + } + + $bytesNeeded = 2; + $codePoint = $byte & 0xF; + } elseif ($byte >= 0xF0 && $byte <= 0xF4) { + if ($byte === 0xF0) { + $lowerBoundary = 0x90; + } elseif ($byte === 0xF4) { + $upperBoundary = 0x8F; + } + + $bytesNeeded = 3; + $codePoint = $byte & 0x7; + } else { + $codePoints[] = 0xFFFD; + } + + continue; + } + + if ($byte < $lowerBoundary || $byte > $upperBoundary) { + $codePoint = 0; + $bytesNeeded = 0; + $bytesSeen = 0; + $lowerBoundary = 0x80; + $upperBoundary = 0xBF; + --$i; + $codePoints[] = 0xFFFD; + + continue; + } + + $lowerBoundary = 0x80; + $upperBoundary = 0xBF; + $codePoint = ($codePoint << 6) | ($byte & 0x3F); + + if (++$bytesSeen !== $bytesNeeded) { + continue; + } + + $codePoints[] = $codePoint; + $codePoint = 0; + $bytesNeeded = 0; + $bytesSeen = 0; + } + + // String unexpectedly ended, so append a U+FFFD code point. + if ($bytesNeeded !== 0) { + $codePoints[] = 0xFFFD; + } + + return $codePoints; + } }