Tiny File Manager

File "TranscodeUnicode.php"
Full path: /home/argothem/www/organecyberpresse/vendor/algo26-matthias/idna-convert/src/TranscodeUnicode/TranscodeUnicode.php
File size: 13.74 KB
MIME-type: text/x-php
Charset: utf-8
Open Edit Advanced Editor Back
<?php
/**
 * Converts between various flavours of Unicode representations like UCS-4 or UTF-8
 * Supported schemes:
 * - UCS-4 Little Endian / Big Endian / Array (partially)
 * - UTF-16 Little Endian / Big Endian (not yet)
 * - UTF-8
 * - UTF-7
 * - UTF-7 IMAP (modified UTF-7)
 *
 * @package IdnaConvert
 * @author Matthias Sommerfeld  <matthias.sommerfeld@algo26.de>
 * @copyright 2003-2019 algo26 Beratungs GmbH, Berlin, https://www.algo26.de
 */

namespace Algo26\IdnaConvert\TranscodeUnicode;

use Algo26\IdnaConvert\Exception\InvalidCharacterException;
use InvalidArgumentException;

/**
 * Transcodes text between UCS-4 (byte string or code point array), UTF-8,
 * UTF-7 and IMAP modified UTF-7.
 *
 * Every conversion pivots through an array of integer code points: the source
 * format is decoded by a private "<format>_ucs4array" method, the target
 * format is produced by a private "ucs4array_<format>" method. convert()
 * derives those method names from the format identifiers, so the private
 * method names are part of the dispatch mechanism and must not be renamed.
 */
class TranscodeUnicode implements TranscodeUnicodeInterface
{
    public const FORMAT_UCS4       = 'ucs4';
    public const FORMAT_UCS4_ARRAY = 'ucs4array';
    public const FORMAT_UTF8       = 'utf8';
    public const FORMAT_UTF7       = 'utf7';
    public const FORMAT_UTF7_IMAP  = 'utf7imap';

    private const encodings = [
        self::FORMAT_UCS4,
        self::FORMAT_UCS4_ARRAY,
        self::FORMAT_UTF8,
        self::FORMAT_UTF7,
        self::FORMAT_UTF7_IMAP
    ];

    /** Standard base64 digits; IMAP's "," is mapped onto "/" before lookup. */
    private const BASE64_ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';

    /** @var bool|null When true, malformed input is replaced instead of raising an exception */
    private $safeMode;

    /** @var int Code point substituted for malformed input in safe mode (default U+FFFC) */
    private $safeCodepoint = 0xFFFC;

    /**
     * Convert $data from one supported representation into another.
     *
     * Format names are matched case-insensitively. When both names are equal
     * the data is returned untouched (before the names are validated).
     *
     * @param string|array $data          Input in the format named by $fromEncoding
     * @param string       $fromEncoding  One of the FORMAT_* identifiers
     * @param string       $toEncoding    One of the FORMAT_* identifiers
     * @param bool         $safeMode      Replace malformed input instead of throwing
     * @param int|null     $safeCodepoint Replacement code point for safe mode; a non-null
     *                                    value is remembered for subsequent calls as well
     *
     * @return string|array The converted data
     * @throws InvalidArgumentException  On an unknown format name (codes 300 / 301)
     * @throws InvalidCharacterException On malformed input when safe mode is off
     */
    public function convert(
        $data,
        string $fromEncoding,
        string $toEncoding,
        bool $safeMode = false,
        ?int $safeCodepoint = null
    ) {
        $this->safeMode = $safeMode;
        if ($safeCodepoint !== null) {
            $this->safeCodepoint = $safeCodepoint;
        }

        $fromEncoding = strtolower($fromEncoding);
        $toEncoding   = strtolower($toEncoding);

        if ($fromEncoding === $toEncoding) {
            return $data;
        }

        if (!in_array($fromEncoding, self::encodings, true)) {
            throw new InvalidArgumentException(sprintf('Invalid input format %s', $fromEncoding), 300);
        }
        if (!in_array($toEncoding, self::encodings, true)) {
            throw new InvalidArgumentException(sprintf('Invalid output format %s', $toEncoding), 301);
        }

        // Decode into the pivot format first, then encode into the target format.
        if ($fromEncoding !== self::FORMAT_UCS4_ARRAY) {
            $methodName = sprintf('%s_%s', $fromEncoding, self::FORMAT_UCS4_ARRAY);
            $data = $this->$methodName($data);
        }
        if ($toEncoding !== self::FORMAT_UCS4_ARRAY) {
            $methodName = sprintf('%s_%s', self::FORMAT_UCS4_ARRAY, $toEncoding);
            $data = $this->$methodName($data);
        }

        return $data;
    }

    /**
     * Decode a UTF-8 encoded string into an array of code points.
     *
     * The decoder is a small state machine:
     *  - 'next': waiting for an ASCII byte or the start byte of a sequence
     *  - 'add':  collecting the continuation bytes of the current sequence
     *  - 'no':   the input ended inside a sequence; remaining non-ASCII bytes are ignored
     *
     * @param string $input The UTF-8 string to convert
     *
     * @return array Array of integers, one per decoded code point
     * @throws InvalidCharacterException On malformed input when safe mode is off
     */
    private function utf8_ucs4array($input): array
    {
        $startByte = 0;
        $nextByte = 0; // Continuation bytes still expected after the current one

        $output = [];
        $outputLength = 0;
        $inputLength = $this->byteLength($input);
        $mode = 'next';
        $test = 'none';
        for ($k = 0; $k < $inputLength; ++$k) {
            $v = ord($input[$k]);

            if ($v < 128) { // ASCII is copied literally
                $output[$outputLength] = $v;
                ++$outputLength;
                if ('add' === $mode) {
                    // An ASCII byte interrupted an unfinished multi-byte sequence
                    if (!$this->safeMode) {
                        throw new InvalidCharacterException(
                            sprintf(
                                'Conversion from UTF-8 to UCS-4 failed: malformed input at byte %d',
                                $k
                            ),
                            302
                        );
                    }
                    // The unfinished code point sits right before the ASCII one just stored
                    $output[$outputLength - 2] = $this->safeCodepoint;
                    $mode = 'next';
                }

                continue;
            }

            if ('next' === $mode) { // Expect a start byte; it determines the sequence width
                $startByte = $v;
                $test = 'range';
                if ($v >> 5 === 6) { // 110xxxxx 10xxxxxx
                    $nextByte = 0;
                    $v = ($v - 192) << 6;
                } elseif ($v >> 4 === 14) { // 1110xxxx 10xxxxxx 10xxxxxx
                    $nextByte = 1;
                    $v = ($v - 224) << 12;
                } elseif ($v >> 3 === 30) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $nextByte = 2;
                    $v = ($v - 240) << 18;
                } elseif ($this->safeMode) {
                    // Not a valid start byte: replace it and keep looking for one
                    $output[$outputLength] = $this->safeCodepoint;
                    ++$outputLength;

                    continue;
                } else {
                    throw new InvalidCharacterException(
                        sprintf('This might be UTF-8, but I don\'t understand it at byte %d', $k),
                        303
                    );
                }

                if (($inputLength - $k - $nextByte) < 2) {
                    // Fewer bytes left than this sequence needs. Store the replacement
                    // and advance the write position so that a trailing ASCII byte
                    // cannot overwrite it.
                    $output[$outputLength] = $this->safeCodepoint;
                    ++$outputLength;
                    $mode = 'no';

                    continue;
                }

                // Store the bits contributed by the start byte; continuation bytes add theirs
                $output[$outputLength] = (int)$v;
                ++$outputLength;
                $mode = 'add';

                continue;
            }

            if ('add' === $mode) {
                if (!$this->safeMode && 'range' === $test) {
                    // Only the first continuation byte is range checked
                    $test = 'none';
                    if (($v < 0xA0 && $startByte === 0xE0)
                        || ($v < 0x90 && $startByte === 0xF0)
                        || ($v > 0x8F && $startByte === 0xF4)
                    ) {
                        throw new InvalidCharacterException(
                            sprintf('Bogus UTF-8 character (out of legal range) at byte %d', $k),
                            304
                        );
                    }
                }

                if ($v >> 6 !== 2) { // Continuation bytes must look like 10xxxxxx
                    if (!$this->safeMode) {
                        throw new InvalidCharacterException(
                            sprintf('Conversion from UTF-8 to UCS-4 failed: malformed input at byte %d', $k),
                            302
                        );
                    }
                    // Replace the unfinished code point (the replacement is already an
                    // integer code point) and look at this byte again as a start byte.
                    $output[$outputLength - 1] = $this->safeCodepoint;
                    --$k;
                    $mode = 'next';

                    continue;
                }

                $output[$outputLength - 1] += ($v - 128) << ($nextByte * 6);
                --$nextByte;
                if ($nextByte < 0) {
                    $mode = 'next';
                }
            }
        }

        return $output;
    }

    /**
     * Encode an array of code points as a UTF-8 string.
     *
     * Values of 2^21 and above cannot be encoded; in safe mode they are
     * replaced by the UTF-8 encoding of the safe code point, otherwise an
     * exception is thrown.
     *
     * @param array $input Array of code points
     *
     * @return string
     * @throws InvalidCharacterException
     */
    private function ucs4array_utf8($input): string
    {
        $output = '';
        foreach ($input as $k => $v) {
            $encoded = $this->encodeUtf8Codepoint($v);
            if (null === $encoded) {
                if (!$this->safeMode) {
                    throw new InvalidCharacterException(
                        sprintf('Conversion from UCS-4 to UTF-8 failed: malformed input at byte %d', $k),
                        305
                    );
                }
                // Emit the replacement as UTF-8 bytes, not as its decimal representation.
                // Fall back to U+FFFC if the configured replacement is itself not encodable.
                $encoded = $this->encodeUtf8Codepoint($this->safeCodepoint) ?? "\xEF\xBF\xBC";
            }
            $output .= $encoded;
        }

        return $output;
    }

    /**
     * Encode a single code point as UTF-8.
     *
     * @param int $codepoint
     *
     * @return string|null The UTF-8 bytes, or null when the value needs more than four bytes
     */
    private function encodeUtf8Codepoint($codepoint): ?string
    {
        if ($codepoint < 0x80) { // 7 bit values are transferred literally
            return chr($codepoint);
        }
        if ($codepoint < (1 << 11)) { // 2 bytes
            return chr(192 + ($codepoint >> 6))
                . chr(128 + ($codepoint & 63));
        }
        if ($codepoint < (1 << 16)) { // 3 bytes
            return chr(224 + ($codepoint >> 12))
                . chr(128 + (($codepoint >> 6) & 63))
                . chr(128 + ($codepoint & 63));
        }
        if ($codepoint < (1 << 21)) { // 4 bytes
            return chr(240 + ($codepoint >> 18))
                . chr(128 + (($codepoint >> 12) & 63))
                . chr(128 + (($codepoint >> 6) & 63))
                . chr(128 + ($codepoint & 63));
        }

        return null;
    }

    /**
     * Decode an IMAP modified UTF-7 string into an array of code points.
     *
     * @param string $input
     *
     * @return array
     */
    private function utf7imap_ucs4array($input): array
    {
        // The "," digit is translated inside shifted runs only, so commas in
        // directly encoded text stay commas.
        return $this->utf7_ucs4array($input, '&', true);
    }

    /**
     * Decode a UTF-7 string into an array of code points.
     *
     * Bytes outside a shifted run are copied as their byte value. A shifted
     * run starts at the shift character and holds base64 encoded UTF-16BE
     * data. It ends at "-" (which is consumed), at any other non-base64
     * character (which is kept as ordinary text), or at the end of the input.
     * The shift character directly followed by "-" stands for the shift
     * character itself. Zero bytes are ignored.
     *
     * @param string $input
     * @param string $sc   Shift character ("+" for UTF-7, "&" for the IMAP variant)
     * @param bool   $imap Accept "," as the 64th base64 digit inside shifted runs
     *
     * @return array
     */
    private function utf7_ucs4array($input, $sc = '+', $imap = false): array
    {
        $output = [];
        $inputLength = $this->byteLength($input);
        $inBase64 = false;
        $b64 = '';

        for ($k = 0; $k < $inputLength; ++$k) {
            $c = $input[$k];

            // Ignore zero bytes
            if ("\0" === $c) {
                continue;
            }

            if ($inBase64) {
                $digit = ($imap && ',' === $c) ? '/' : $c;
                if (false !== strpos(self::BASE64_ALPHABET, $digit)) {
                    $b64 .= $digit;

                    continue;
                }

                // Anything else terminates the shifted run
                $inBase64 = false;
                if ('' === $b64) {
                    if ('-' === $c) {
                        // Shift character immediately followed by "-": literal shift character
                        $output[] = ord($sc);

                        continue;
                    }
                } else {
                    foreach ($this->utf16ToCodepoints(base64_decode($b64)) as $codepoint) {
                        $output[] = $codepoint;
                    }
                    $b64 = '';
                    if ('-' === $c) {
                        continue; // The explicit terminator is not part of the text
                    }
                }
                // Any other terminating character is ordinary text: handle it below
            }

            if ($sc === $c) {
                $inBase64 = true;

                continue;
            }

            $output[] = ord($c);
        }

        // A shifted run may be ended by the end of the input
        if ($inBase64 && '' !== $b64) {
            foreach ($this->utf16ToCodepoints(base64_decode($b64)) as $codepoint) {
                $output[] = $codepoint;
            }
        }

        return $output;
    }

    /**
     * Turn UTF-16BE bytes into code points.
     *
     * A dangling last byte (left over base64 padding bits) is ignored.
     * A high surrogate followed by a low surrogate is combined into one
     * supplementary code point; unpaired surrogates are passed through as is.
     *
     * @param string $bytes
     *
     * @return array
     */
    private function utf16ToCodepoints($bytes): array
    {
        $codepoints = [];
        $usable = strlen($bytes) - (strlen($bytes) % 2);
        $highSurrogate = null;

        for ($i = 0; $i < $usable; $i += 2) {
            $unit = (ord($bytes[$i]) << 8) | ord($bytes[$i + 1]);

            if (null !== $highSurrogate) {
                if ($unit >= 0xDC00 && $unit <= 0xDFFF) {
                    $codepoints[] = 0x10000 + (($highSurrogate - 0xD800) << 10) + ($unit - 0xDC00);
                    $highSurrogate = null;

                    continue;
                }
                $codepoints[] = $highSurrogate;
                $highSurrogate = null;
            }

            if ($unit >= 0xD800 && $unit <= 0xDBFF) {
                $highSurrogate = $unit; // Wait for the matching low surrogate

                continue;
            }

            $codepoints[] = $unit;
        }

        if (null !== $highSurrogate) {
            $codepoints[] = $highSurrogate;
        }

        return $codepoints;
    }

    /**
     * Encode an array of code points as IMAP modified UTF-7.
     *
     * @param array $input
     *
     * @return string
     */
    private function ucs4array_utf7imap($input): string
    {
        // "/" is swapped for "," inside shifted runs only, so a literal "/"
        // in the text (e.g. a mailbox hierarchy delimiter) is preserved.
        return $this->ucs4array_utf7($input, '&', true);
    }

    /**
     * Encode an array of code points as UTF-7.
     *
     * Code points 0x20-0x7E other than the shift character are written
     * directly. Everything else is collected as UTF-16BE and written as one
     * shifted run: shift character, unpadded base64, "-". A shifted run that
     * consists of the shift character alone is abbreviated to shift
     * character + "-".
     *
     * @param array  $input
     * @param string $sc   Shift character ("+" for UTF-7, "&" for the IMAP variant)
     * @param bool   $imap Write "," instead of "/" inside shifted runs
     *
     * @return string
     */
    private function ucs4array_utf7($input, $sc = '+', $imap = false): string
    {
        $output = '';
        $pending = ''; // UTF-16BE bytes waiting to be written as a shifted run
        $shiftCode = ord($sc);

        foreach ($input as $v) {
            $isDirect = (0x20 <= $v && $v <= 0x7e && $v !== $shiftCode);
            if (!$isDirect) {
                $pending .= $this->utf16Bytes($v);

                continue;
            }

            if ('' !== $pending) {
                $output .= $this->utf7ShiftedRun($pending, $sc, $imap);
                $pending = '';
            }
            $output .= chr($v);
        }

        if ('' !== $pending) {
            $output .= $this->utf7ShiftedRun($pending, $sc, $imap);
        }

        return $output;
    }

    /**
     * Build one UTF-7 shifted run from UTF-16BE bytes.
     *
     * @param string $utf16 Non-empty UTF-16BE byte string
     * @param string $sc    Shift character
     * @param bool   $imap  Write "," instead of "/"
     *
     * @return string
     */
    private function utf7ShiftedRun($utf16, $sc, $imap): string
    {
        if ($utf16 === chr(0).$sc) {
            return $sc.'-';
        }

        $b64 = str_replace('=', '', base64_encode($utf16));
        if ($imap) {
            $b64 = str_replace('/', ',', $b64);
        }

        return $sc.$b64.'-';
    }

    /**
     * UTF-16BE bytes for one code point.
     *
     * Code points U+10000..U+10FFFF become a surrogate pair. For any other
     * value the low 16 bits are written.
     *
     * @param int $codepoint
     *
     * @return string
     */
    private function utf16Bytes($codepoint): string
    {
        if ($codepoint > 0xFFFF && $codepoint <= 0x10FFFF) {
            $offset = $codepoint - 0x10000;
            $high = 0xD800 | ($offset >> 10);
            $low = 0xDC00 | ($offset & 0x3FF);

            return chr($high >> 8).chr($high & 255).chr($low >> 8).chr($low & 255);
        }

        return chr(($codepoint >> 8) & 255).chr($codepoint & 255);
    }

    /**
     * Convert an array of code points into a UCS-4 byte string.
     *
     * Each code point is written as four bytes, most significant byte first
     * (Big Endian).
     *
     * @param array $input Code points
     *
     * @return string
     */
    private function ucs4array_ucs4($input): string
    {
        $output = '';
        foreach ($input as $v) {
            $output .= chr(($v >> 24) & 255)
                . chr(($v >> 16) & 255)
                . chr(($v >> 8) & 255)
                . chr($v & 255);
        }

        return $output;
    }

    /**
     * Convert a UCS-4 byte string into an array of code points.
     *
     * Every group of four bytes is read most significant byte first
     * (Big Endian). Empty input yields an empty array.
     *
     * @param string $input UCS-4 byte string
     *
     * @return array
     * @throws InvalidCharacterException When the byte length is not a multiple of four
     */
    private function ucs4_ucs4array($input): array
    {
        $inputLength = $this->byteLength($input);
        // Input length must be dividable by 4
        if ($inputLength % 4) {
            throw new InvalidCharacterException('Input UCS4 string is broken', 306);
        }

        $output = [];
        for ($i = 0; $i < $inputLength; $i += 4) {
            $output[] = (ord($input[$i]) << 24)
                + (ord($input[$i + 1]) << 16)
                + (ord($input[$i + 2]) << 8)
                + ord($input[$i + 3]);
        }

        return $output;
    }

    /**
     * Gets the length of a string in bytes even if mbstring function
     * overloading is turned on
     *
     * @param string $string the string for which to get the length.
     * @return integer the length of the string in bytes.
     */
    protected function byteLength($string)
    {
        // ini_get() yields false where the setting does not exist; the cast turns that into 0.
        if (extension_loaded('mbstring')
            && ((int) ini_get('mbstring.func_overload') & 0x02) === 0x02
        ) {
            return mb_strlen($string, '8bit');
        }

        return strlen((string) $string);
    }
}