<?php
 
/**
 
 * @desc
 
 * Sort arrays with multibyte values using a predefined or custom alphabet.
 
 *
 
 * @author Christian Reinecke <reinecke@bajoodoo.de>
 
 * @version 1.1
 
 * @since 2009-01-15
 
 *
 
 * @example
 
 * <code>
 
 *     // quick
 
 *     $array = array("Zebra", "Engländer", "England", "Chinese", "Äffchen", "Überseeboot",
 
 *                    "Unterseeboot", "ß", "Sz", "Bürger", "Burger", "burger", "bürger", "ÄÖÜ");
 
 *     AIS_Util_MultibyteSort::staticSort($array);
 
 *     print_r($array);
 
 *     // Array (
 
 *     // [0] => Äffchen [1] => ÄÖÜ [2] => Burger [3] => Bürger [4] => burger [5] => bürger
 
 *     // [6] => Chinese [7] => England [8] => Engländer [9] => Sz [10] => ß [11] => Unterseeboot
 
 *     // [12] => Überseeboot [13] => Zebra
 
 *     // )
 
 *
 
 *
 
 *     // with custom setup
 
 *     $mbsort = new AIS_Util_MultibyteSort("UTF-8", true);
 
 *     $mbsort->sort($array);
 
 *     print_r($array)
 
 *     // same result as above
 
 *     $mbsort->sort($otherArray);
 
 * </code>
 
 *
 
 * @todo
 
 * This class can not handle unknown special characters (fallback to default behaviour),
 
 * which makes it desirable to find a global, sorted alphabet. Feel free to add more special chars.
 
 *
 
 * Don't forget to send the correct charset in your header
 
 * <code>
 
 *     header("Content-type: text/html; charset=utf-8");
 
 * </code>
 
 */
 
class AIS_Util_MultibyteSort
 
{
 
    const EncodingUtf8    = "UTF-8";
 
    const EncodingDefault = self::EncodingUtf8;
 
 
    const CasePreferenceUpper   = true;
 
    const CasePreferenceLower   = false;
 
    const CasePreferenceDefault = self::CasePreferenceUpper;
 
 
    protected static $instances = array();
 
    protected static $alphabetLoose = array("A" => "Ä", "a" => "äà", "O" => "Ö", "o" => "ö", "U" => "Ü",
 
                                            "u" => "ü", "s" => "ß");
 
 
    /**
 
     * multibyte encoding
 
     * @see self::EncodingDefault
 
     * @var string
 
     */
 
    protected $encoding;
 
 
    /**
 
     * array map (single special char => ASCII-128)
 
     * @var array
 
     */
 
    protected $alphabet;
 
 
    /**
 
     * case flag, whether upper case chars are preferred to lower case chars
 
     * @see self::CasePreferenceDefault
 
     * @var bool
 
     */
 
    protected $case;
 
 
    /**
 
     * @param array|string [optional] loose alphabet or key for predefined loose alphabet
 
     * @param string       [optional] encoding for multibyte functions
 
     * @param bool         [optional] prefer uppercase char to lower-case (Aa or aA)
 
     */
 
    public function __construct($encoding = self::EncodingDefault, $preferUpperCase = self::CasePreferenceDefault)
 
    {
 
        $this->setEncoding($encoding);
 
        $this->setUpperCasePreference($preferUpperCase);
 
        $this->loadAlphabet();
 
    }
 
 
    /**
 
     * @desc
 
     * The instance for all static calls is stored in a static property, so you can use the
 
     * instance once again. But if you prefer to use this class only once, use the __construct
 
     * method. There's no overhead then.
 
     */
 
    public static function getInstance($encoding = self::EncodingDefault, $preferUpperCase = self::CasePreferenceDefault)
 
    {
 
        $hash = crc32($encoding . $preferUpperCase);
 
        if (!array_key_exists($hash, self::$instances)) {
 
            self::$instances[$hash] = new self($encoding, $preferUpperCase);
 
        }
 
        return self::$instances[$hash];
 
    }
 
 
    /**
 
     * @desc shortcut
 
     */
 
    public static function sortStatic(&$array, $encoding = self::EncodingDefault, $preferUpperCase = self::CasePreferenceDefault)
 
    {
 
        $self = self::getInstance($encoding, $preferUpperCase);
 
        $self->sort($array);
 
    }
 
 
    public static function asortStatic(&$array, $encoding = self::EncodingDefault, $preferUpperCase = self::CasePreferenceDefault)
 
    {
 
        $self = self::getInstance($encoding, $preferUpperCase);
 
        $self->asort($array);
 
    }
 
 
    /**
 
     * @desc sort array with the constructor-given settings
 
     */
 
    public function sort(&$array)
 
    {
 
        /**
 
         * Got this message: "Warning: usort() [function.usort]: Invalid comparison function"?
 
         * Then do not call this method from a static context and change your error_reporting level
 
         * to E_ALL | E_STRICT.
 
         */
 
        usort($array, array($this, "sortString"));
 
    }
 
 
    public function asort(&$array)
 
    {
 
        uasort($array, array($this, "sortString"));
 
    }
 
 
    /**
 
     * @desc multibyte string compare (something like we would expect behind mb_strcmp())
 
     * @return int sort order value (1, 0, -1)
 
     */
 
    protected function sortString($a, $b)
 
    {
 
        $ax = mb_strlen($a, $this->encoding);
 
        $bx = mb_strlen($b, $this->encoding);
 
        for ($i = 0, $x = min($ax, $bx); $i < $x; ++$i) {
 
            $result = $this->charCmp(mb_substr($a, $i, 1, $this->encoding),
 
                                     mb_substr($b, $i, 1, $this->encoding));
 
            if ($result != 0) {
 
                return $result;
 
            }
 
        }
 
        return $this->intCmp($ax, $bx);
 
    }
 
 
    /**
 
     * @desc integer compare
 
     * @return int sort order value (-1, 0, 1)
 
     */
 
    protected function intCmp($a, $b)
 
    {
 
        return ($a == $b) ? 0 : ($a < $b ? -1 : 1);
 
    }
 
 
    /**
 
     * @desc multibyte char compare
 
     * @return int sort order value (-1, 0, 1)
 
     */
 
    protected function charCmp($a, $b)
 
    {
 
        // check if characters are known as special chars
 
        $ai = isset($this->alphabet[$a]); // ai = a isset
 
        $bi = isset($this->alphabet[$b]);
 
 
        if ($ai && $bi) {
 
            // both are known special chars
 
            $ar = $this->alphabet[$a]; // ar = a representation (ASCII-128)
 
            $br = $this->alphabet[$b];
 
            $result = $this->charCaseCmp($ar, $br);
 
            if ($result == 0 && $a != $b) {
 
                // they aren't equal, but their representation is, so check position in original array
 
                $ap = mb_strpos(self::$alphabetLoose[$ar], $a, 0, $this->encoding);
 
                $bp = mb_strpos(self::$alphabetLoose[$br], $b, 0, $this->encoding);
 
                $result = $this->intCmp($ap, $bp);
 
            }
 
        } else if ($ai) {
 
            // $a is a known special char, $b not
 
            $result = $this->charCaseCmp($this->alphabet[$a], $b);
 
             // so they are not equal, $result = 0 means $b is "smaller"
 
            $result = ($result == 0) ? 1 : $result;
 
        } else if ($bi) {
 
            // $b is a known special char, $a not
 
            $result = $this->charCaseCmp($a, $this->alphabet[$b]);
 
            // so they are not equal; $result = 0 means $a is "smaller"
 
            $result = ($result == 0) ? -1 : $result;
 
        } else {
 
            // both are unknown characters
 
            $result = $this->charCaseCmp($a, $b);
 
        }
 
        return $result;
 
    }
 
 
    /**
 
     * @desc multibyte char case compare
 
     * @return sort order value (-1, 0, 1)
 
     */
 
    protected function charCaseCmp($a, $b)
 
    {
 
        if ($a == $b) {
 
            // they are equal, no check required
 
            return 0;
 
        }
 
        $A = mb_strtoupper($a, $this->encoding);
 
        $B = mb_strtoupper($b, $this->encoding);
 
        $result = strcmp($A, $B);
 
        if ($result == 0) {
 
            // their mb_strtoupper() value is equal, select compare value depending on case preference
 
            $result = ($A != $a) ? $this->case : ($this->case * -1);
 
        }
 
        return $result;
 
    }
 
 
    protected function loadAlphabet()
 
    {
 
        if (empty($this->alphabet)) {
 
            // load only once per instance
 
            // $alphabetLoose is required to differ between special chars with the same ASCII-128 representation
 
            foreach (self::$alphabetLoose as $order => $char) {
 
                for ($i = 0, $x = mb_strlen($char, $this->encoding); $i < $x; ++$i) {
 
                    // use each multibyte char as key with its ASCII-128 representation as key
 
                    $this->alphabet[mb_substr($char, $i, 1, $this->encoding)] = $order;
 
                }
 
            }
 
        }
 
    }
 
 
    protected function setEncoding($encoding)
 
    {
 
        $this->encoding = $encoding;
 
    }
 
 
    protected function setUpperCasePreference($preferUpperCase)
 
    {
 
        $this->case = $preferUpperCase ? 1 : -1;
 
    }
 
}
 
?>
 
 |