Skip to content

Commit

Permalink
Release 1.0.2
Browse files Browse the repository at this point in the history
  • Loading branch information
TRowbotham committed Jul 15, 2020
1 parent 04af89e commit 2327f31
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 182 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## [1.0.2] - 2020-07-15

### Added

* Minor performance improvements

## [1.0.1] - 2020-06-16

### Fixed
Expand Down
47 changes: 0 additions & 47 deletions src/CodePoint.php

This file was deleted.

130 changes: 0 additions & 130 deletions src/CodePointString.php

This file was deleted.

138 changes: 133 additions & 5 deletions src/Punycode.php
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
use function func_num_args;
use function implode;
use function intdiv;
use function ord;
use function str_split;
use function strlen;
use function strrpos;
Expand Down Expand Up @@ -193,7 +194,7 @@ public static function decode(string $input, int $outLength = null, array &$case
array_splice($caseFlags, $i, 0, [self::flagged($bytes[$n - 1])]);
}

array_splice($output, $i++, 0, [CodePoint::encode($n)]);
array_splice($output, $i++, 0, [self::encodeCodePoint($n)]);
}

return implode('', $output);
Expand All @@ -213,9 +214,9 @@ public static function encode(string $input, int $outLength = null, array $caseF
$bias = self::INITIAL_BIAS;
$inputLength = 0;
$output = '';
$iter = new CodePointString($input);
$codePoints = self::utf8Decode($input);

foreach ($iter as $j => $codePoint) {
foreach ($codePoints as $j => $codePoint) {
++$inputLength;

if ($codePoint < 0x80) {
Expand All @@ -241,7 +242,7 @@ public static function encode(string $input, int $outLength = null, array $caseF
while ($h < $inputLength) {
$m = self::MAX_INT;

foreach ($iter as $codePoint) {
foreach ($codePoints as $codePoint) {
if ($codePoint >= $n && $codePoint < $m) {
$m = $codePoint;
}
Expand All @@ -254,7 +255,7 @@ public static function encode(string $input, int $outLength = null, array $caseF
$delta += ($m - $n) * ($h + 1);
$n = $m;

foreach ($iter as $j => $codePoint) {
foreach ($codePoints as $j => $codePoint) {
if ($codePoint < $n && ++$delta === 0) {
throw new OverflowException();
} elseif ($codePoint === $n) {
Expand Down Expand Up @@ -306,6 +307,43 @@ private static function encodeBasic(int $codePoint, bool $flag): string
return chr($codePoint + ((!$flag && ($codePoint - 65 < 26) ? 1 : 0) << 5));
}

/**
* Takes a Unicode code point and encodes it. The return behavior is undefined if the given
* code point is outside the range 0..10FFFF.
*
* @see https://encoding.spec.whatwg.org/#utf-8-encoder
*/
private static function encodeCodePoint(int $codePoint): string
{
if ($codePoint >= 0x00 && $codePoint <= 0x7F) {
return chr($codePoint);
}

$count = 0;
$offset = 0;

if ($codePoint >= 0x0080 && $codePoint <= 0x07FF) {
$count = 1;
$offset = 0xC0;
} elseif ($codePoint >= 0x0800 && $codePoint <= 0xFFFF) {
$count = 2;
$offset = 0xE0;
} elseif ($codePoint >= 0x10000 && $codePoint <= 0x10FFFF) {
$count = 3;
$offset = 0xF0;
}

$bytes = chr(($codePoint >> (6 * $count)) + $offset);

while ($count > 0) {
$temp = $codePoint >> (6 * ($count - 1));
$bytes .= chr(0x80 | ($temp & 0x3F));
--$count;
}

return $bytes;
}

private static function encodeDigit(int $d, bool $flag): string
{
return chr($d + 22 + 75 * ($d < 26 ? 1 : 0) - (($flag ? 1 : 0) << 5));
Expand All @@ -315,4 +353,94 @@ private static function flagged(int $codePoint): bool
{
return $codePoint - 65 < 26;
}

/**
* Takes a UTF-8 encoded string and converts it into a series of integer code points. Any
* invalid byte sequences will be replaced by a U+FFFD replacement code point.
*
* @see https://encoding.spec.whatwg.org/#utf-8-decoder
*
* @return array<int, int>
*/
private static function utf8Decode(string $input): array
{
$bytesSeen = 0;
$bytesNeeded = 0;
$lowerBoundary = 0x80;
$upperBoundary = 0xBF;
$codePoint = 0;
$codePoints = [];
$length = strlen($input);

for ($i = 0; $i < $length; ++$i) {
$byte = ord($input[$i]);

if ($bytesNeeded === 0) {
if ($byte >= 0x00 && $byte <= 0x7F) {
$codePoints[] = $byte;

continue;
}

if ($byte >= 0xC2 && $byte <= 0xDF) {
$bytesNeeded = 1;
$codePoint = $byte & 0x1F;
} elseif ($byte >= 0xE0 && $byte <= 0xEF) {
if ($byte === 0xE0) {
$lowerBoundary = 0xA0;
} elseif ($byte === 0xED) {
$upperBoundary = 0x9F;
}

$bytesNeeded = 2;
$codePoint = $byte & 0xF;
} elseif ($byte >= 0xF0 && $byte <= 0xF4) {
if ($byte === 0xF0) {
$lowerBoundary = 0x90;
} elseif ($byte === 0xF4) {
$upperBoundary = 0x8F;
}

$bytesNeeded = 3;
$codePoint = $byte & 0x7;
} else {
$codePoints[] = 0xFFFD;
}

continue;
}

if ($byte < $lowerBoundary || $byte > $upperBoundary) {
$codePoint = 0;
$bytesNeeded = 0;
$bytesSeen = 0;
$lowerBoundary = 0x80;
$upperBoundary = 0xBF;
--$i;
$codePoints[] = 0xFFFD;

continue;
}

$lowerBoundary = 0x80;
$upperBoundary = 0xBF;
$codePoint = ($codePoint << 6) | ($byte & 0x3F);

if (++$bytesSeen !== $bytesNeeded) {
continue;
}

$codePoints[] = $codePoint;
$codePoint = 0;
$bytesNeeded = 0;
$bytesSeen = 0;
}

// String unexpectedly ended, so append a U+FFFD code point.
if ($bytesNeeded !== 0) {
$codePoints[] = 0xFFFD;
}

return $codePoints;
}
}

0 comments on commit 2327f31

Please sign in to comment.