Source for file UnicodeString.php
Documentation is available at UnicodeString.php
/* vim: set expandtab tabstop=4 shiftwidth=4: */
* Provides a method of storing and manipulating multibyte strings in PHP.
* LICENSE: Copyright 2004-2006 John Downey. All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* o Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* o Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* THIS SOFTWARE IS PROVIDED BY THE FREEBSD PROJECT "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE FREEBSD PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* The views and conclusions contained in the software and documentation are
* those of the authors and should not be interpreted as representing official
* policies, either expressed or implied, of The PEAR Group.
* @category Internationalization
* @package I18N_UnicodeString
* @author John Downey <jdowney@gmail.com>
* @copyright 2004-2006 John Downey
* @license http://www.freebsd.org/copyright/freebsd-license.html 2 Clause BSD License
* @link http://pear.php.net/packages/I18N_UnicodeString
require_once 'I18N/UnicodeString/Exception.php';
* Class provides a way to use and manipulate multibyte strings in PHP
* @category Internationalization
* @package I18N_UnicodeString
* @author John Downey <jdowney@gmail.com>
* @license http://www.freebsd.org/copyright/freebsd-license.html 2 Clause BSD License
* @version Release: @package_version@
* @link http://pear.php.net/packages/I18N_UnicodeString
* The internal representation of the string as an array of numbers.
* Converts an entire array of strings to Unicode capable strings.
* Useful for converting a GET/POST of all its Unicode values into a
* workable and easily changed format. Takes an optional second
* parameter similar to that of {@link setString()}.
* @param array $array An array of PHP variables.
* @param string $encoding The encoding the string values are in.
* @return array The array with all of its string values converted to
foreach ($array as $key => $value) {
* The constructor of the class string which can receive a new string in a
* @param mixed $value A variable containing the Unicode string in one of
* @param string $encoding The encoding that the string is in.
* Set the string to a value passed in one of many encodings.
* You may pass the encoding as an optional second parameter which defaults
* to UTF-8 encoding. Possible encodings are:
* o <i>ASCII</i> - when you pass a normal 7 bit ASCII string
* o <i>UTF-8</i> - when you pass a UTF-8 encoded string
* o <i>HTML</i> - when you pass a string encoded with HTML entities, such
* as the kind received from a GET/POST
* o <i>Unicode</i> or <i>UCS-4</i> - when passing an array of integer values representing
* @param mixed $value A variable containing the Unicode string in one of
* @param string $encoding The encoding that the string is in.
* @return mixed Returns true on success or PEAR_Error otherwise.
function setString($value, $encoding = 'UTF-8')
$this->_unicode = $this->_stringFromHtml ($value);
$this->_unicode = $value;
* Converts a string encoded with HTML entities into our internal
* representation of an array of integers.
* @param string $string A string containing Unicode values encoded as HTML
* @return array The array of Unicode values.
function _stringFromHtml ($string = '')
foreach ($parts as $part) {
/* Suggested by Jonathan Yavner to allow HTML to also be in
the form of &#xNNNN where NNNN is the hexadecimal of a
if (ord($value[0 ]) == 120 ) {
for ($i = 0 , $max = strlen($text); $i < $max; $i++ ) {
$unicode[] = ord($text[$i]);
for ($i = 0 , $max = strlen($part); $i < $max; $i++ ) {
$unicode[] = ord($part[$i]);
* Converts a UTF-8 string into our representation of an array of integers.
* Method was made static by suggestion of Lukas Feiler (#7429)
* @param string $string A string containing Unicode values encoded in UTF-8
* @return array The array of Unicode values.
for ($count = 0 , $length = strlen($string); $count < $length; $count++ ) {
$value = ord($string[$count]);
// if the value is an ASCII char then just go ahead and add it on
// if not then we need to know how many more bytes make up this character
if (count($values) == 0 ) {
$values[] = ($value - 192 ) << 6;
} elseif ($value >> 4 == 14 ) {
$values[] = ($value - 224 ) << 12;
} elseif ($value >> 3 == 30 ) {
$values[] = ($value - 240 ) << 18;
} elseif ($value >> 2 == 62 ) {
$values[] = ($value - 248 ) << 24;
} elseif ($value >> 1 == 126 ) {
$values[] = ($value - 252 ) << 30;
$values[] = $value - 128;
if (count($values) == $search) {
// if we have all of our bytes then go ahead and encode it in unicode
for ($i = 1; $i < $search; $i++ ) {
$value += ($values[$i] << ((($search - $i) - 1 ) * 6 ));
* Transforms a single unicode character represented by an integer to a UTF-8 string.
* Suggested by Lukas Feiler (#7429)
* @param integer $char A unicode character as an integer
* @return string The unicode character converted to a UTF-8 string.
// it's an ASCII char, no encoding needed
} elseif ($char < 1 << 11 ) {
// it's a 2 byte UTF-8 char
$string .= chr(192 + ($char >> 6 ));
$string .= chr(128 + ($char & 63 ));
} elseif ($char < 1 << 16 ) {
// it's a 3 byte UTF-8 char
$string .= chr(224 + ($char >> 12 ));
$string .= chr(128 + (($char >> 6 ) & 63 ));
$string .= chr(128 + ($char & 63 ));
} elseif ($char < 1 << 21 ) {
// it's a 4 byte UTF-8 char
$string .= chr(240 + ($char >> 18 ));
$string .= chr(128 + (($char >> 12 ) & 63 ));
$string .= chr(128 + (($char >> 6 ) & 63 ));
$string .= chr(128 + ($char & 63 ));
} elseif ($char < 1 << 26 ) {
// it's a 5 byte UTF-8 char
$string .= chr(248 + ($char >> 24 ));
$string .= chr(128 + (($char >> 18 ) & 63 ));
$string .= chr(128 + (($char >> 12 ) & 63 ));
$string .= chr(128 + (($char >> 6 ) & 63 ));
$string .= chr(128 + ($char & 63 ));
// it's a 6 byte UTF-8 char
$string .= chr(252 + ($char >> 30 ));
$string .= chr(128 + (($char >> 24 ) & 63 ));
$string .= chr(128 + (($char >> 18 ) & 63 ));
$string .= chr(128 + (($char >> 12 ) & 63 ));
$string .= chr(128 + (($char >> 6 ) & 63 ));
$string .= chr(128 + ($char & 63 ));
* Retrieves the string and returns it as a UTF-8 encoded string.
* @return string A string with the Unicode values encoded in UTF-8.
foreach ($this->_unicode as $char) {
* Retrieves the string and returns it as a string encoded with HTML
* @return string A string with the Unicode values encoded as HTML entities.
foreach ($this->_unicode as $char) {
$string .= '&#' . $char . ';';
* Retrieve the length of the string in characters.
* @return integer The length of the string.
return count($this->_unicode);
* Works exactly like PHP's substr function only it works on Unicode
* @param integer $begin The beginning of the substring.
* @param integer $length The length to read. Defaults to the rest of the
* @return I18N_UnicodeString A new I18N_UnicodeString class containing the
* substring or a PEAR_Error if an error is
$length = $this->length() - $begin;
if (($begin + $length) > $this->length()) {
if ($begin > $this->length()) {
for ($i = $begin, $max_length = ($begin + $length); $i < $max_length; $i++ ) {
* Works like PHP's substr_replace function.
* @param I18N_UnicodeString &$find The string to be replaced
* @param I18N_UnicodeString &$replace The string to replace $find with
* @param integer $start The position in the string to start replacing at
* @param integer $length The length from the starting position to stop
* @return I18N_UnicodeString The current string with all $find replaced by
$length = $this->length() - $start;
// $string is a PEAR_Error, return it
$string = $string->stringReplace ($find, $replace);
$data = array_merge($begin->_unicode , $string->_unicode , $after->_unicode );
* Works like PHP's str_replace function.
* @param I18N_UnicodeString &$find The string to be replaced
* @param I18N_UnicodeString &$replace The string to replace $find with
* @return I18N_UnicodeString The current string with all $find replaced by
* @see subStringReplace()
while ($haystack->strStr ($find) !== false ) {
$after = $haystack->strStr ($find);
$begin = $haystack->subString (0 , $haystack->length () - $after->length ());
$haystack = $after->subString ($find->length ());
$return = $return->append ($begin)
->append ($after->subString (0 , $find->length ()));
return $return->append ($haystack);
* Works like PHP's strstr function by returning the string from $find on.
* @param I18N_UnicodeString &$find The string to find
* @return I18N_UnicodeString The current string from $find on to the end
$after = $find->_unicode;
for ($i = 0 , $length = $this->length(); $i < $length; $i++ ) {
$after[] = $this->_unicode[$i];
if ($this->_unicode[$i] == $find->_unicode [0 ]) {
if ($i + $find->length () > $length) {
for ($c = 1 , $max = $find->length (); $c < $max; $c++ ) {
if ($this->_unicode[++ $i] != $find->_unicode [$c]) {
* Returns the position of a character much like PHP's strpos function.
* @param mixed $char A Unicode char represented as either an integer or a
* @return integer The location of the character in the string.
for ($i = 0 , $length = $this->length(); $i < $length; $i++ ) {
if ($this->_unicode[$i] == $char) {
* Returns the last position of a character much like PHP's strrpos function.
* @param mixed $char A Unicode char represented as either an integer or a
* @return integer The last location of the character in the string.
for ($i = $this->length() - 1; $i >= 0; $i-- ) {
if ($this->_unicode[$i] == $char) {
* Determines if two Unicode strings are equal
* @param I18N_UnicodeString &$unicode The string to compare to.
* @return boolean True if they are equal, false otherwise.
if ($this->length() != $unicode->length ()) {
// if they aren't even the same length no need to even check
return ($this->_unicode == $unicode->_unicode );
* Appends a given Unicode string to the end of the current one.
* @param I18N_UnicodeString &$unicode The string to append.
* @return I18N_UnicodeString The new string created by the append.
$data = array_merge($this->_unicode, $unicode->_unicode );
* Used to raise an error.
* Hopefully this method is never called, but when it is it throws an
* I18N_UnicodeString_Exception carrying the given message.
* @param string $message The error message to raise.
* @throws I18N_UnicodeString_Exception
throw new I18N_UnicodeString_Exception ($message);
Documentation generated on Mon, 11 Mar 2019 15:47:55 -0400 by phpDocumentor 1.4.4. PEAR Logo Copyright © PHP Group 2004.
|