<?php
namespace App\Http\Controllers;
/**
 * Created by PhpStorm.
 * User: Acer
 * Date: 09/11/22
 * Time: 06:41 PM
 */
/**
 * A utility class to clean up common problems with UTF-8 strings.
 */
class UnicodeUtils {

    /**
     * Maps double-encoded UTF-8 byte sequences back to single encoded UTF-8. The keys are byte sequences where a valid
     * UTF-8 character has been interpreted as multiple characters in CP1252, and then re-converted
     * to UTF-8 characters. The values are the UTF-8 character byte sequence that was double encoded.
     *
     * Every key begins with the byte 0xC3 followed by one of 0x82, 0x83, 0x85, 0x86, 0x8B or 0xA2;
     * dedupUTF8() relies on this to skip strings that cannot contain a match.
     *
     * @var array
     */
    private static $duped_utf8_mapping = [
        "\xC3\x82\xC2\xA0" => "\xC2\xA0",
        "\xC3\x82\xC2\xA1" => "\xC2\xA1",
        "\xC3\x82\xC2\xA2" => "\xC2\xA2",
        "\xC3\x82\xC2\xA3" => "\xC2\xA3",
        "\xC3\x82\xC2\xA4" => "\xC2\xA4",
        "\xC3\x82\xC2\xA5" => "\xC2\xA5",
        "\xC3\x82\xC2\xA6" => "\xC2\xA6",
        "\xC3\x82\xC2\xA7" => "\xC2\xA7",
        "\xC3\x82\xC2\xA8" => "\xC2\xA8",
        "\xC3\x82\xC2\xA9" => "\xC2\xA9",
        "\xC3\x82\xC2\xAA" => "\xC2\xAA",
        "\xC3\x82\xC2\xAB" => "\xC2\xAB",
        "\xC3\x82\xC2\xAC" => "\xC2\xAC",
        "\xC3\x82\xC2\xAD" => "\xC2\xAD",
        "\xC3\x82\xC2\xAE" => "\xC2\xAE",
        "\xC3\x82\xC2\xAF" => "\xC2\xAF",
        "\xC3\x82\xC2\xB0" => "\xC2\xB0",
        "\xC3\x82\xC2\xB1" => "\xC2\xB1",
        "\xC3\x82\xC2\xB2" => "\xC2\xB2",
        "\xC3\x82\xC2\xB3" => "\xC2\xB3",
        "\xC3\x82\xC2\xB4" => "\xC2\xB4",
        "\xC3\x82\xC2\xB5" => "\xC2\xB5",
        "\xC3\x82\xC2\xB6" => "\xC2\xB6",
        "\xC3\x82\xC2\xB7" => "\xC2\xB7",
        "\xC3\x82\xC2\xB8" => "\xC2\xB8",
        "\xC3\x82\xC2\xB9" => "\xC2\xB9",
        "\xC3\x82\xC2\xBA" => "\xC2\xBA",
        "\xC3\x82\xC2\xBB" => "\xC2\xBB",
        "\xC3\x82\xC2\xBC" => "\xC2\xBC",
        "\xC3\x82\xC2\xBD" => "\xC2\xBD",
        "\xC3\x82\xC2\xBE" => "\xC2\xBE",
        "\xC3\x82\xC2\xBF" => "\xC2\xBF",
        "\xC3\x83\xC2\x81" => "\xC3\x81",
        "\xC3\x83\xC2\x8D" => "\xC3\x8D",
        "\xC3\x83\xC2\x8F" => "\xC3\x8F",
        "\xC3\x83\xC2\x90" => "\xC3\x90",
        "\xC3\x83\xC2\x9D" => "\xC3\x9D",
        "\xC3\x83\xC2\xA0" => "\xC3\xA0",
        "\xC3\x83\xC2\xA1" => "\xC3\xA1",
        "\xC3\x83\xC2\xA2" => "\xC3\xA2",
        "\xC3\x83\xC2\xA3" => "\xC3\xA3",
        "\xC3\x83\xC2\xA4" => "\xC3\xA4",
        "\xC3\x83\xC2\xA5" => "\xC3\xA5",
        "\xC3\x83\xC2\xA6" => "\xC3\xA6",
        "\xC3\x83\xC2\xA7" => "\xC3\xA7",
        "\xC3\x83\xC2\xA8" => "\xC3\xA8",
        "\xC3\x83\xC2\xA9" => "\xC3\xA9",
        "\xC3\x83\xC2\xAA" => "\xC3\xAA",
        "\xC3\x83\xC2\xAB" => "\xC3\xAB",
        "\xC3\x83\xC2\xAC" => "\xC3\xAC",
        "\xC3\x83\xC2\xAD" => "\xC3\xAD",
        "\xC3\x83\xC2\xAE" => "\xC3\xAE",
        "\xC3\x83\xC2\xAF" => "\xC3\xAF",
        "\xC3\x83\xC2\xB0" => "\xC3\xB0",
        "\xC3\x83\xC2\xB1" => "\xC3\xB1",
        "\xC3\x83\xC2\xB2" => "\xC3\xB2",
        "\xC3\x83\xC2\xB3" => "\xC3\xB3",
        "\xC3\x83\xC2\xB4" => "\xC3\xB4",
        "\xC3\x83\xC2\xB5" => "\xC3\xB5",
        "\xC3\x83\xC2\xB6" => "\xC3\xB6",
        "\xC3\x83\xC2\xB7" => "\xC3\xB7",
        "\xC3\x83\xC2\xB8" => "\xC3\xB8",
        "\xC3\x83\xC2\xB9" => "\xC3\xB9",
        "\xC3\x83\xC2\xBA" => "\xC3\xBA",
        "\xC3\x83\xC2\xBB" => "\xC3\xBB",
        "\xC3\x83\xC2\xBC" => "\xC3\xBC",
        "\xC3\x83\xC2\xBD" => "\xC3\xBD",
        "\xC3\x83\xC2\xBE" => "\xC3\xBE",
        "\xC3\x83\xC2\xBF" => "\xC3\xBF",
        "\xC3\x83\xC5\x92" => "\xC3\x8C",
        "\xC3\x83\xC5\x93" => "\xC3\x9C",
        "\xC3\x83\xC5\xA0" => "\xC3\x8A",
        "\xC3\x83\xC5\xA1" => "\xC3\x9A",
        "\xC3\x83\xC5\xB8" => "\xC3\x9F",
        "\xC3\x83\xC5\xBD" => "\xC3\x8E",
        "\xC3\x83\xC5\xBE" => "\xC3\x9E",
        "\xC3\x83\xC6\x92" => "\xC3\x83",
        "\xC3\x83\xCB\x86" => "\xC3\x88",
        "\xC3\x83\xCB\x9C" => "\xC3\x98",
        "\xC3\x83\xE2\x80\x93" => "\xC3\x96",
        "\xC3\x83\xE2\x80\x94" => "\xC3\x97",
        "\xC3\x83\xE2\x80\x98" => "\xC3\x91",
        "\xC3\x83\xE2\x80\x99" => "\xC3\x92",
        "\xC3\x83\xE2\x80\x9A" => "\xC3\x82",
        "\xC3\x83\xE2\x80\x9C" => "\xC3\x93",
        "\xC3\x83\xE2\x80\x9D" => "\xC3\x94",
        "\xC3\x83\xE2\x80\x9E" => "\xC3\x84",
        "\xC3\x83\xE2\x80\xA0" => "\xC3\x86",
        "\xC3\x83\xE2\x80\xA1" => "\xC3\x87",
        "\xC3\x83\xE2\x80\xA2" => "\xC3\x95",
        "\xC3\x83\xE2\x80\xA6" => "\xC3\x85",
        "\xC3\x83\xE2\x80\xB0" => "\xC3\x89",
        "\xC3\x83\xE2\x80\xB9" => "\xC3\x8B",
        "\xC3\x83\xE2\x80\xBA" => "\xC3\x9B",
        "\xC3\x83\xE2\x82\xAC" => "\xC3\x80",
        "\xC3\x83\xE2\x84\xA2" => "\xC3\x99",
        "\xC3\x85\xC2\xA0" => "\xC5\xA0",
        "\xC3\x85\xC2\xA1" => "\xC5\xA1",
        "\xC3\x85\xC2\xB8" => "\xC5\xB8",
        "\xC3\x85\xC2\xBD" => "\xC5\xBD",
        "\xC3\x85\xC2\xBE" => "\xC5\xBE",
        "\xC3\x85\xE2\x80\x99" => "\xC5\x92",
        "\xC3\x85\xE2\x80\x9C" => "\xC5\x93",
        "\xC3\x86\xE2\x80\x99" => "\xC6\x92",
        "\xC3\x8B\xC5\x93" => "\xCB\x9C",
        "\xC3\x8B\xE2\x80\xA0" => "\xCB\x86",
        "\xC3\xA2\xE2\x80\x9A\xC2\xAC" => "\xE2\x82\xAC",
        "\xC3\xA2\xE2\x80\x9E\xC2\xA2" => "\xE2\x84\xA2",
        "\xC3\xA2\xE2\x82\xAC\xC2\x9D" => "\xE2\x80\x9D",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA0" => "\xE2\x80\xA0",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA1" => "\xE2\x80\xA1",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA2" => "\xE2\x80\xA2",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA6" => "\xE2\x80\xA6",
        "\xC3\xA2\xE2\x82\xAC\xC2\xB0" => "\xE2\x80\xB0",
        "\xC3\xA2\xE2\x82\xAC\xC2\xB9" => "\xE2\x80\xB9",
        "\xC3\xA2\xE2\x82\xAC\xC2\xBA" => "\xE2\x80\xBA",
        "\xC3\xA2\xE2\x82\xAC\xC5\x93" => "\xE2\x80\x9C",
        "\xC3\xA2\xE2\x82\xAC\xC5\xA1" => "\xE2\x80\x9A",
        "\xC3\xA2\xE2\x82\xAC\xC5\xBE" => "\xE2\x80\x9E",
        "\xC3\xA2\xE2\x82\xAC\xCB\x9C" => "\xE2\x80\x98",
        "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C" => "\xE2\x80\x93",
        "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D" => "\xE2\x80\x94",
        "\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2" => "\xE2\x80\x99",
    ];

    /**
     * Cleans up instances where a UTF-8 string has been incorrectly re-encoded to UTF-8 from ISO-8859-1/CP1252.
     *
     * Strings that contain none of the byte prefixes used by the mapping keys are returned untouched
     * without consulting the (large) replacement table.
     *
     * @see http://www.i18nqa.com/debug/utf8-debug.html#dbg
     * @param string $str
     * @return string
     */
    public static function dedupUTF8($str) {
        // Every mapping key starts with 0xC3, so a string lacking that byte cannot contain a match.
        if (strpos($str, "\xC3") === false) {
            return $str;
        }

        // The six two-byte prefixes shared by all mapping keys; only run the full replacement
        // when at least one of them is present.
        $key_prefixes = ["\xC3\x83", "\xC3\x82", "\xC3\xA2", "\xC3\x85", "\xC3\x8B", "\xC3\x86"];
        foreach ($key_prefixes as $prefix) {
            if (strpos($str, $prefix) !== false) {
                return strtr($str, self::$duped_utf8_mapping);
            }
        }

        return $str;
    }

    /**
     * Converts various "extended" Unicode characters to ASCII equivalents.
     *
     * Applies, in order: convertSpaces(), convertSmartChars(), convertCurlyQuotes().
     *
     * @see self::convertCurlyQuotes
     * @see self::convertSmartChars
     * @see self::convertSpaces
     * @param string $str
     * @return string
     */
    public static function convertToSimpleChars($str) {
        $spaced = self::convertSpaces($str);
        $simplified = self::convertSmartChars($spaced);

        return self::convertCurlyQuotes($simplified);
    }

    /**
     * Converts curly (aka "smart" or "typographic") quotes to straight quotes as found in ASCII.
     *
     * @param string $str
     * @return string
     */
    public static function convertCurlyQuotes($str) {
        return strtr(
            $str,
            [
                "\xC2\xB4"     => '\'', // acute accent, common on European keyboards
                "\xE2\x80\x98" => '\'', // left single quote
                "\xE2\x80\x99" => '\'', // right single quote
                "\xE2\x80\x9C" => '"',  // left double quote
                "\xE2\x80\x9D" => '"',  // right double quote
            ]
        );
    }

    /**
     * Converts various characters from "smart" versions in Unicode back to ASCII lookalikes.
     *
     * Note that some replacements are longer than the character they replace
     * (ellipsis becomes three dots, em dash and horizontal bar become two hyphens).
     *
     * @param string $str
     * @return string
     */
    public static function convertSmartChars($str) {
        return strtr(
            $str,
            [
                "\xE2\x80\xA6"   => '...',   // horizontal ellipsis
                "\xE2\x80\x90"   => '-',     // hyphen
                "\xE2\x80\x91"   => '-',     // non-breaking hyphen
                "\xE2\x80\x92"   => '-',     // figure dash
                "\xE2\x80\x93"   => '-',     // en dash
                "\xE2\x80\x94"   => '--',    // em dash
                "\xE2\x80\x95"   => '--',    // horizontal bar
                "\xC2\xB7"       => '*',     // mid dot
                "\xE2\x80\xA2"   => '*',     // bullet
                "\xC2\xB0"       => 'o',     // degree symbol
            ]
        );
    }

    /**
     * Converts all "space" characters in Unicode to a standard space.
     *
     * If the input is not valid UTF-8 the pattern cannot be applied; in that case the input is
     * returned unchanged rather than being discarded.
     *
     * @see http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Zs:]
     * @param string $str
     * @return string
     */
    public static function convertSpaces($str) {
        $converted = preg_replace('/\p{Zs}/u', ' ', $str);

        // preg_replace() returns null on failure, which with the /u modifier happens for any
        // subject that is not valid UTF-8. Passing that null on would wipe the caller's text.
        if ($converted === null) {
            return $str;
        }

        return $converted;
    }
}