File size: 2.15Kb
<?php declare(strict_types=1);
/**
* @package s9e\RegexpBuilder
* @copyright Copyright (c) 2016-2022 The s9e authors
* @license http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\RegexpBuilder\Input;
use InvalidArgumentException;
use function array_map, ord, preg_match_all;
class Utf8 extends BaseImplementation
{
/**
* @var bool Whether to use surrogates to represent higher codepoints
*/
protected $useSurrogates;
/**
* {@inheritdoc}
*/
public function __construct(array $options = [])
{
$this->useSurrogates = !empty($options['useSurrogates']);
}
/**
* {@inheritdoc}
*/
public function split(string $string): array
{
if (preg_match_all('(.)us', $string, $matches) === false)
{
throw new InvalidArgumentException('Invalid UTF-8 string');
}
return ($this->useSurrogates) ? $this->charsToCodepointsWithSurrogates($matches[0]) : $this->charsToCodepoints($matches[0]);
}
/**
* Convert a list of UTF-8 characters into a list of Unicode codepoint
*
* @param string[] $chars
* @return integer[]
*/
protected function charsToCodepoints(array $chars): array
{
return array_map([$this, 'cp'], $chars);
}
/**
* Convert a list of UTF-8 characters into a list of Unicode codepoint with surrogates
*
* @param string[] $chars
* @return integer[]
*/
protected function charsToCodepointsWithSurrogates(array $chars): array
{
$codepoints = [];
foreach ($chars as $char)
{
$cp = $this->cp($char);
if ($cp < 0x10000)
{
$codepoints[] = $cp;
}
else
{
$codepoints[] = 0xD7C0 + ($cp >> 10);
$codepoints[] = 0xDC00 + ($cp & 0x3FF);
}
}
return $codepoints;
}
/**
* Compute and return the Unicode codepoint for given UTF-8 char
*
* @param string $char UTF-8 char
* @return integer
*/
protected function cp(string $char): int
{
$cp = ord($char[0]);
if ($cp >= 0xF0)
{
$cp = ($cp << 18) + (ord($char[1]) << 12) + (ord($char[2]) << 6) + ord($char[3]) - 0x3C82080;
}
elseif ($cp >= 0xE0)
{
$cp = ($cp << 12) + (ord($char[1]) << 6) + ord($char[2]) - 0xE2080;
}
elseif ($cp >= 0xC0)
{
$cp = ($cp << 6) + ord($char[1]) - 0x3080;
}
return $cp;
}
}