Source for file UnicodeString.php

Documentation is available at UnicodeString.php
<?php
/* vim: set expandtab tabstop=4 shiftwidth=4: */
/**
* Provides a method of storing and manipulating multibyte strings in PHP.
*
* PHP version 5
*
* LICENSE: Copyright 2004-2006 John Downey. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* o Redistributions of source code must retain the above copyright notice, this
*   list of conditions and the following disclaimer.
* o Redistributions in binary form must reproduce the above copyright notice,
*   this list of conditions and the following disclaimer in the documentation
*   and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE FREEBSD PROJECT "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE FREEBSD PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are
* those of the authors and should not be interpreted as representing official
* policies, either expressed or implied, of The PEAR Group.
*
* @category  Internationalization
* @package   I18N_UnicodeString
* @author    John Downey <jdowney@gmail.com>
* @copyright 2004-2006 John Downey
* @license   http://www.freebsd.org/copyright/freebsd-license.html 2 Clause BSD License
* @version   CVS: $Id$
* @link      http://pear.php.net/packages/I18N_UnicodeString
* @filesource
*/
 
require_once 'I18N/UnicodeString/Exception.php';
 
/**
* Class provides a way to use and manipulate multibyte strings in PHP
*
* @category Internationalization
* @package  I18N_UnicodeString
* @author   John Downey <jdowney@gmail.com>
* @license  http://www.freebsd.org/copyright/freebsd-license.html 2 Clause BSD License
* @version  Release: @package_version@
* @link     http://pear.php.net/packages/I18N_UnicodeString
* @access   public
*/
class I18N_UnicodeString
{
    /**
     * The internal representation of the string: a zero-indexed list of
     * integer code points, one entry per character.
     *
     * Treat this as private. It stays publicly visible only because it was
     * originally declared with `var`.
     *
     * @access private
     * @var    array $_unicode
     */
    public $_unicode = array();

    /**
     * Converts every string value of an array into an I18N_UnicodeString.
     *
     * Useful for converting a GET/POST array into a workable format. Values
     * that are not strings are left untouched. The optional second parameter
     * has the same meaning as the one of {@link setString()}.
     *
     * @param array  $array    An array of PHP variables.
     * @param string $encoding The encoding the string values are in.
     *
     * @return array The array with all of its string values converted to
     *                I18N_UnicodeString objects.
     * @static
     * @access public
     * @throws I18N_UnicodeString_Exception If a value cannot be decoded.
     * @see    setString()
     */
    public static function convertArray($array, $encoding = 'HTML')
    {
        foreach ($array as $key => $value) {
            if (is_string($value)) {
                $array[$key] = new I18N_UnicodeString($value, $encoding);
            }
        }

        return $array;
    }

    /**
     * Builds a new string from a value given in one of several encodings.
     *
     * @param mixed  $value    A variable containing the Unicode string in one of
     *                          various encodings.
     * @param string $encoding The encoding that the string is in.
     *
     * @access public
     * @throws I18N_UnicodeString_Exception If the value cannot be decoded.
     * @see    setString()
     */
    public function __construct($value = '', $encoding = 'UTF-8')
    {
        $this->setString($value, $encoding);
    }

    /**
     * Set the string to a value passed in one of many encodings.
     *
     * The encoding name is matched case-insensitively and defaults to UTF-8.
     * Possible encodings are:
     *
     * o <i>ASCII</i> - a normal 7 bit ASCII string (decoded as UTF-8)
     * o <i>UTF-8</i> (or <i>UTF8</i>) - a UTF-8 encoded string
     * o <i>HTML</i> - a string encoded with numeric HTML entities, such as the
     *   kind received from a GET/POST
     * o <i>Unicode</i>, <i>UTF-32</i> or <i>UCS-4</i> - an array of integer
     *   values representing each character
     *
     * The current content is only replaced once the new value has been decoded
     * successfully.
     *
     * @param mixed  $value    A variable containing the Unicode string in one of
     *                          various encodings.
     * @param string $encoding The encoding that the string is in.
     *
     * @access public
     * @return boolean Always true; failures are reported by exception.
     * @throws I18N_UnicodeString_Exception On an unrecognized encoding, a
     *                                      malformed UTF-8 string, or a
     *                                      non-array Unicode value.
     */
    public function setString($value, $encoding = 'UTF-8')
    {
        switch (strtoupper($encoding)) {
        case 'ASCII':
        case 'UTF8':
        case 'UTF-8':
            $unicode = self::utf8ToUnicode($value);
            break;

        case 'HTML':
            $unicode = $this->_stringFromHtml($value);
            break;

        case 'UTF32':
        case 'UTF-32':
        case 'UCS4':
        case 'UCS-4':
        case 'UNICODE':
            if (!is_array($value)) {
                return self::raiseError('Unicode input must be an array of code points');
            }

            // Re-index so that the positional loops used throughout the class
            // can rely on consecutive keys starting at 0.
            $unicode = array_values($value);
            break;

        default:
            return self::raiseError('Unrecognized encoding');
        }

        $this->_unicode = $unicode;

        return true;
    }

    /**
     * Converts a string encoded with numeric HTML entities into our internal
     * representation of an array of integers.
     *
     * Only well-formed numeric references are decoded: decimal (&#NNNN;) and,
     * as suggested by Jonathan Yavner, hexadecimal (&#xNNNN; or &#XNNNN;).
     * Everything else, including a stray "&#" or a ";" outside of a reference,
     * is kept literally, one array entry per byte.
     *
     * @param string $string A string containing Unicode values encoded as HTML
     *                         entities.
     *
     * @access private
     * @return array The array of Unicode values.
     */
    protected function _stringFromHtml($string = '')
    {
        $string = (string) $string;

        // With a single capture group and PREG_SPLIT_DELIM_CAPTURE the result
        // alternates: even offsets hold literal text, odd offsets hold the
        // digits of a numeric reference (including a leading x/X for hex).
        $pieces = preg_split(
            '/&#([xX][0-9a-fA-F]+|[0-9]+);/',
            $string,
            -1,
            PREG_SPLIT_DELIM_CAPTURE
        );

        if ($pieces === false) {
            // The pattern could not be applied; fall back to treating the
            // whole input as literal text.
            $pieces = array($string);
        }

        $unicode = array();
        foreach ($pieces as $index => $piece) {
            if ($index % 2 == 1) {
                if ($piece[0] == 'x' || $piece[0] == 'X') {
                    $unicode[] = intval(substr($piece, 1), 16);
                } else {
                    $unicode[] = intval($piece);
                }
                continue;
            }

            for ($i = 0, $max = strlen($piece); $i < $max; $i++) {
                $unicode[] = ord($piece[$i]);
            }
        }

        return $unicode;
    }

    /**
     * Converts a UTF-8 string into our representation of an array of integers.
     *
     * Sequences of up to six bytes are accepted so that every value produced
     * by {@link unicodeCharToUtf8()} can be read back. Overlong encodings are
     * not rejected.
     *
     * Method was made static by suggestion of Lukas Feiler (#7429)
     *
     * @param string $string A string containing Unicode values encoded in UTF-8
     *
     * @static
     * @access public
     * @return array The array of Unicode values.
     * @throws I18N_UnicodeString_Exception If an invalid lead byte is found, a
     *                                      continuation byte is missing, or
     *                                      the input ends inside a sequence.
     */
    public static function utf8ToUnicode($string = '')
    {
        $string  = (string) $string;
        $unicode = array();
        $pending = 0; // continuation bytes still expected for the current character
        $value   = 0; // code point accumulated so far

        for ($count = 0, $length = strlen($string); $count < $length; $count++) {
            $byte = ord($string[$count]);

            if ($pending > 0) {
                // Inside a multi-byte character only 10xxxxxx bytes are legal;
                // anything else (including plain ASCII) means the sequence
                // was cut short.
                if ($byte >> 6 != 2) {
                    return self::raiseError('Malformed UTF-8 string');
                }

                $value = ($value << 6) | ($byte & 63);
                if (--$pending == 0) {
                    $unicode[] = $value;
                }
                continue;
            }

            if ($byte < 128) {
                // an ASCII char maps directly to its code point
                $unicode[] = $byte;
            } elseif ($byte >> 5 == 6) {
                $value   = $byte & 31;
                $pending = 1;
            } elseif ($byte >> 4 == 14) {
                $value   = $byte & 15;
                $pending = 2;
            } elseif ($byte >> 3 == 30) {
                $value   = $byte & 7;
                $pending = 3;
            } elseif ($byte >> 2 == 62) {
                $value   = $byte & 3;
                $pending = 4;
            } elseif ($byte >> 1 == 126) {
                $value   = $byte & 1;
                $pending = 5;
            } else {
                // a continuation byte without a lead byte, or 0xFE/0xFF
                return self::raiseError('Malformed UTF-8 string');
            }
        }

        if ($pending > 0) {
            // the input ended in the middle of a multi-byte character
            return self::raiseError('Malformed UTF-8 string');
        }

        return $unicode;
    }

    /**
     * Transforms a single unicode character represented by an integer to a
     * UTF-8 string.
     *
     * Values beyond the 4 byte range are written using the historical 5 and 6
     * byte forms.
     *
     * Suggested by Lukas Feiler (#7429)
     *
     * @param integer $char A unicode character as an integer
     *
     * @static
     * @access public
     * @return string The unicode character converted to a UTF-8 string.
     */
    public static function unicodeCharToUtf8($char)
    {
        $string = '';
        if ($char < 128) {
            // it's an ASCII char, no encoding needed
            $string .= chr($char);
        } elseif ($char < 1 << 11) {
            // it's a 2 byte UTF-8 char
            $string .= chr(192 + ($char >> 6));
            $string .= chr(128 + ($char & 63));
        } elseif ($char < 1 << 16) {
            // it's a 3 byte UTF-8 char
            $string .= chr(224 + ($char >> 12));
            $string .= chr(128 + (($char >> 6) & 63));
            $string .= chr(128 + ($char & 63));
        } elseif ($char < 1 << 21) {
            // it's a 4 byte UTF-8 char
            $string .= chr(240 + ($char >> 18));
            $string .= chr(128 + (($char >> 12) & 63));
            $string .= chr(128 + (($char >> 6) & 63));
            $string .= chr(128 + ($char & 63));
        } elseif ($char < 1 << 26) {
            // it's a 5 byte UTF-8 char
            $string .= chr(248 + ($char >> 24));
            $string .= chr(128 + (($char >> 18) & 63));
            $string .= chr(128 + (($char >> 12) & 63));
            $string .= chr(128 + (($char >> 6) & 63));
            $string .= chr(128 + ($char & 63));
        } else {
            // it's a 6 byte UTF-8 char
            $string .= chr(252 + ($char >> 30));
            $string .= chr(128 + (($char >> 24) & 63));
            $string .= chr(128 + (($char >> 18) & 63));
            $string .= chr(128 + (($char >> 12) & 63));
            $string .= chr(128 + (($char >> 6) & 63));
            $string .= chr(128 + ($char & 63));
        }

        return $string;
    }

    /**
     * Retrieves the string and returns it as a UTF-8 encoded string.
     *
     * @access public
     * @return string A string with the Unicode values encoded in UTF-8.
     */
    public function toUtf8String()
    {
        $string = '';

        foreach ($this->_unicode as $char) {
            $string .= self::unicodeCharToUtf8($char);
        }

        return $string;
    }

    /**
     * Retrieves the string and returns it as a string encoded with HTML
     * entities.
     *
     * Only characters above 127 are written as decimal numeric entities;
     * ASCII characters (including &, < and >) are emitted unchanged.
     *
     * @access public
     * @return string A string with the Unicode values encoded as HTML entities.
     */
    public function toHtmlEntitiesString()
    {
        $string = '';

        foreach ($this->_unicode as $char) {
            if ($char > 127) {
                $string .= '&#' . $char . ';';
            } else {
                $string .= chr($char);
            }
        }

        return $string;
    }

    /**
     * Retrieve the length of the string in characters.
     *
     * @access public
     * @return integer The length of the string.
     */
    public function length()
    {
        return count($this->_unicode);
    }

    /**
     * Works like PHP's substr function, only on Unicode strings.
     *
     * A negative $begin counts from the end of the string, as with substr.
     * A negative $length yields an empty string.
     *
     * @param integer $begin  The beginning of the substring.
     * @param integer $length The length to read. Defaults to the rest of the
     *                          string.
     *
     * @access public
     * @return I18N_UnicodeString A new I18N_UnicodeString containing the
     *                             substring.
     * @throws I18N_UnicodeString_Exception If the requested range starts or
     *                                      ends past the end of the string.
     */
    public function subString($begin, $length = null)
    {
        $total = $this->length();

        if (is_null($length)) {
            $length = $total - $begin;
        }

        if (($begin + $length) > $total) {
            return self::raiseError('Cannot read past end of string.');
        }

        if ($begin > $total) {
            return self::raiseError('Beginning extends past end of string.');
        }

        // array_slice never reads undefined offsets, which the former index
        // loop did for a negative $begin.
        $unicode = array_slice($this->_unicode, $begin, max(0, $length));

        return new I18N_UnicodeString($unicode, 'Unicode');
    }

    /**
     * Replaces every occurrence of $find by $replace, but only inside the
     * section of the string described by $start and $length.
     *
     * @param I18N_UnicodeString $find    The string to be replaced
     * @param I18N_UnicodeString $replace The string to replace $find with
     * @param integer            $start   The position in the string to start
     *                                      replacing at
     * @param integer            $length  The number of characters, counted
     *                                      from $start, in which to replace.
     *                                      Defaults to the rest of the string.
     *
     * @access public
     * @return I18N_UnicodeString A new string with all $find inside the
     *                             section replaced by $replace
     * @throws I18N_UnicodeString_Exception If the section does not fit inside
     *                                      the string.
     * @see    stringReplace()
     */
    public function subStringReplace($find, $replace, $start, $length = null)
    {
        if (is_null($length)) {
            $length = $this->length() - $start;
        }

        $head   = $this->subString(0, $start);
        $middle = $this->subString($start, $length)->stringReplace($find, $replace);
        $tail   = $this->subString($start + $length);

        $data = array_merge($head->_unicode, $middle->_unicode, $tail->_unicode);

        return new I18N_UnicodeString($data, 'Unicode');
    }

    /**
     * Works like PHP's str_replace function.
     *
     * Occurrences are located from left to right and do not overlap; text
     * inserted by a replacement is never searched again. An empty $find leaves
     * the string unchanged.
     *
     * @param I18N_UnicodeString $find    The string to be replaced
     * @param I18N_UnicodeString $replace The string to replace $find with
     *
     * @access public
     * @return I18N_UnicodeString A new string with all $find replaced by
     *                             $replace
     * @see    subStringReplace()
     */
    public function stringReplace($find, $replace)
    {
        $needle       = $find->_unicode;
        $needleLength = count($needle);
        $result       = array();
        $cursor       = 0; // first position not yet copied to $result

        if ($needleLength > 0) {
            while (($position = $this->_findSequence($needle, $cursor)) != -1) {
                // Copy the untouched text before the match, then the
                // replacement; the matched text itself is skipped.
                $result = array_merge(
                    $result,
                    array_slice($this->_unicode, $cursor, $position - $cursor),
                    $replace->_unicode
                );
                $cursor = $position + $needleLength;
            }
        }

        $result = array_merge($result, array_slice($this->_unicode, $cursor));

        return new I18N_UnicodeString($result, 'Unicode');
    }

    /**
     * Works like PHP's strstr function by returning the string from the first
     * occurrence of $find on.
     *
     * @param I18N_UnicodeString $find The string to be found
     *
     * @access public
     * @return mixed A new I18N_UnicodeString holding the current string from
     *                $find to the end, or false if $find is empty or does not
     *                occur.
     */
    public function strStr($find)
    {
        $position = $this->_findSequence($find->_unicode, 0);

        if ($position == -1) {
            return false;
        }

        return new I18N_UnicodeString(array_slice($this->_unicode, $position), 'Unicode');
    }

    /**
     * Returns the position of a character much like PHP's strpos function.
     *
     * @param mixed $char A Unicode char represented as either an integer or a
     *                      UTF-8 char.
     *
     * @access public
     * @return integer The location of the character in the string, or -1 if
     *                  it does not occur.
     */
    public function indexOf($char)
    {
        $char = self::_toCodePoint($char);

        for ($i = 0, $length = $this->length(); $i < $length; $i++) {
            if ($this->_unicode[$i] == $char) {
                return $i;
            }
        }

        return -1;
    }

    /**
     * Returns the last position of a character much like PHP's strrpos function.
     *
     * @param mixed $char A Unicode char represented as either an integer or a
     *                      UTF-8 char.
     *
     * @access public
     * @return integer The last location of the character in the string, or -1
     *                  if it does not occur.
     */
    public function lastIndexOf($char)
    {
        $char = self::_toCodePoint($char);

        for ($i = $this->length() - 1; $i >= 0; $i--) {
            if ($this->_unicode[$i] == $char) {
                return $i;
            }
        }

        return -1;
    }

    /**
     * Determines if two Unicode strings are equal
     *
     * @param I18N_UnicodeString $unicode The string to compare to.
     *
     * @access public
     * @return boolean True if they are equal, false otherwise.
     */
    public function equals($unicode)
    {
        if ($this->length() != $unicode->length()) {
            // different lengths can never be equal, skip the full comparison
            return false;
        }

        return ($this->_unicode == $unicode->_unicode);
    }

    /**
     * Appends a given Unicode string to the end of the current one.
     *
     * Neither operand is modified.
     *
     * @param I18N_UnicodeString $unicode The string to append.
     *
     * @access public
     * @return I18N_UnicodeString The new string created by the concatenation.
     */
    public function append($unicode)
    {
        $data = array_merge($this->_unicode, $unicode->_unicode);

        return new I18N_UnicodeString($data, 'Unicode');
    }

    /**
     * Used to report an error.
     *
     * This method never returns normally: it always throws.
     *
     * @param string $message The error message to raise.
     *
     * @static
     * @access public
     * @return void
     * @throws I18N_UnicodeString_Exception Always.
     */
    public static function raiseError($message)
    {
        throw new I18N_UnicodeString_Exception($message);
    }

    /**
     * Finds the first position at or after $offset where the given sequence of
     * code points occurs in this string.
     *
     * Every candidate start position is tested independently, so a failed
     * partial match never hides a later, overlapping one.
     *
     * @param array   $needle The code points to look for (zero-indexed list).
     * @param integer $offset The position to start searching at.
     *
     * @access private
     * @return integer The position of the match, or -1 if $needle is empty or
     *                  does not occur.
     */
    private function _findSequence($needle, $offset = 0)
    {
        $needleLength = count($needle);
        if ($needleLength == 0) {
            return -1;
        }

        $lastStart = count($this->_unicode) - $needleLength;
        for ($start = max(0, $offset); $start <= $lastStart; $start++) {
            $matched = true;
            for ($k = 0; $k < $needleLength; $k++) {
                if ($this->_unicode[$start + $k] != $needle[$k]) {
                    $matched = false;
                    break;
                }
            }

            if ($matched) {
                return $start;
            }
        }

        return -1;
    }

    /**
     * Normalizes the character argument of indexOf()/lastIndexOf() to an
     * integer code point.
     *
     * Integers are returned unchanged, a string longer than one byte is
     * decoded as UTF-8 and its first character is used, and any other value
     * is passed to ord().
     *
     * @param mixed $char A Unicode char represented as either an integer or a
     *                      UTF-8 char.
     *
     * @static
     * @access private
     * @return integer The code point of the character.
     * @throws I18N_UnicodeString_Exception If a multi-byte string is not valid
     *                                      UTF-8.
     */
    private static function _toCodePoint($char)
    {
        if (is_int($char)) {
            return $char;
        }

        if (strlen($char) > 1) {
            // Held in a variable first: passing the call result straight to a
            // by-reference function such as array_shift() raises a notice.
            $points = self::utf8ToUnicode($char);
            return $points[0];
        }

        return ord($char);
    }
}
?>