Source for file IDNA2.php
Documentation is available at IDNA2.php
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
// +----------------------------------------------------------------------+
// | This library is free software; you can redistribute it and/or modify |
// | it under the terms of the GNU Lesser General Public License as |
// | published by the Free Software Foundation; either version 2.1 of the |
// | License, or (at your option) any later version. |
// | This library is distributed in the hope that it will be useful, but |
// | WITHOUT ANY WARRANTY; without even the implied warranty of |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
// | Lesser General Public License for more details. |
// | You should have received a copy of the GNU Lesser General Public |
// | License along with this library; if not, write to the Free Software |
// | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
// +----------------------------------------------------------------------+
require_once 'Net/IDNA2/Exception.php';
require_once 'Net/IDNA2/Exception/Nameprep.php';
* Encode/decode Internationalized Domain Names.
* This class converts internationalized domain names
* (see RFC 3490 for details) as they can be used with various registries worldwide
* to be translated between their original (localized) form and their encoded form
* as it will be used in the DNS (Domain Name System).
* The class provides two public methods, encode() and decode(), which do exactly
* what you would expect them to do. You are allowed to use complete domain names,
* simple strings and complete email addresses as well. That means, that you might
* use any of the following notations:
* - xn--brse-5qa.xn--knrz-1ra.info
* Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
* array. Unicode output is available in the same formats.
* You can select your preferred format via {@link setParams()}.
* ACE input and output is always expected to be ASCII.
* @author Markus Nix <mnix@docuverse.de>
* @author Matthias Sommerfeld <mso@phlylabs.de>
* @author Stefan Neufeind <pear.neufeind@speedpartner.de>
* @version $Id: IDNA2.php 305344 2010-11-14 23:52:42Z neufeind $
* These Unicode codepoints are
* mapped to nothing, See RFC3454 for details
private static $_np_map_nothing = array (
private static $_general_prohibited = array (
* Codepoints prohibited by Nameprep
private static $_np_prohibit = array (
* Codepoint ranges prohibited by nameprep
private static $_np_prohibit_ranges = array (
array (0x1D173 , 0x1D17A ),
array (0xF0000 , 0xFFFFD ),
array (0x100000 , 0x10FFFD ),
* Replacement mappings (casemapping, replacement sequences, ...)
private static $_np_replacemaps = array (
0xDF => array (0x73 , 0x73 ),
0x130 => array (0x69 , 0x307 ),
0x149 => array (0x2BC , 0x6E ),
0x1F0 => array (0x6A , 0x30C ),
0x37A => array (0x20 , 0x3B9 ),
0x390 => array (0x3B9 , 0x308 , 0x301 ),
0x3B0 => array (0x3C5 , 0x308 , 0x301 ),
0x587 => array (0x565 , 0x582 ),
0x1E96 => array (0x68 , 0x331 ),
0x1E97 => array (0x74 , 0x308 ),
0x1E98 => array (0x77 , 0x30A ),
0x1E99 => array (0x79 , 0x30A ),
0x1E9A => array (0x61 , 0x2BE ),
0x1F50 => array (0x3C5 , 0x313 ),
0x1F52 => array (0x3C5 , 0x313 , 0x300 ),
0x1F54 => array (0x3C5 , 0x313 , 0x301 ),
0x1F56 => array (0x3C5 , 0x313 , 0x342 ),
0x1F80 => array (0x1F00 , 0x3B9 ),
0x1F81 => array (0x1F01 , 0x3B9 ),
0x1F82 => array (0x1F02 , 0x3B9 ),
0x1F83 => array (0x1F03 , 0x3B9 ),
0x1F84 => array (0x1F04 , 0x3B9 ),
0x1F85 => array (0x1F05 , 0x3B9 ),
0x1F86 => array (0x1F06 , 0x3B9 ),
0x1F87 => array (0x1F07 , 0x3B9 ),
0x1F88 => array (0x1F00 , 0x3B9 ),
0x1F89 => array (0x1F01 , 0x3B9 ),
0x1F8A => array (0x1F02 , 0x3B9 ),
0x1F8B => array (0x1F03 , 0x3B9 ),
0x1F8C => array (0x1F04 , 0x3B9 ),
0x1F8D => array (0x1F05 , 0x3B9 ),
0x1F8E => array (0x1F06 , 0x3B9 ),
0x1F8F => array (0x1F07 , 0x3B9 ),
0x1F90 => array (0x1F20 , 0x3B9 ),
0x1F91 => array (0x1F21 , 0x3B9 ),
0x1F92 => array (0x1F22 , 0x3B9 ),
0x1F93 => array (0x1F23 , 0x3B9 ),
0x1F94 => array (0x1F24 , 0x3B9 ),
0x1F95 => array (0x1F25 , 0x3B9 ),
0x1F96 => array (0x1F26 , 0x3B9 ),
0x1F97 => array (0x1F27 , 0x3B9 ),
0x1F98 => array (0x1F20 , 0x3B9 ),
0x1F99 => array (0x1F21 , 0x3B9 ),
0x1F9A => array (0x1F22 , 0x3B9 ),
0x1F9B => array (0x1F23 , 0x3B9 ),
0x1F9C => array (0x1F24 , 0x3B9 ),
0x1F9D => array (0x1F25 , 0x3B9 ),
0x1F9E => array (0x1F26 , 0x3B9 ),
0x1F9F => array (0x1F27 , 0x3B9 ),
0x1FA0 => array (0x1F60 , 0x3B9 ),
0x1FA1 => array (0x1F61 , 0x3B9 ),
0x1FA2 => array (0x1F62 , 0x3B9 ),
0x1FA3 => array (0x1F63 , 0x3B9 ),
0x1FA4 => array (0x1F64 , 0x3B9 ),
0x1FA5 => array (0x1F65 , 0x3B9 ),
0x1FA6 => array (0x1F66 , 0x3B9 ),
0x1FA7 => array (0x1F67 , 0x3B9 ),
0x1FA8 => array (0x1F60 , 0x3B9 ),
0x1FA9 => array (0x1F61 , 0x3B9 ),
0x1FAA => array (0x1F62 , 0x3B9 ),
0x1FAB => array (0x1F63 , 0x3B9 ),
0x1FAC => array (0x1F64 , 0x3B9 ),
0x1FAD => array (0x1F65 , 0x3B9 ),
0x1FAE => array (0x1F66 , 0x3B9 ),
0x1FAF => array (0x1F67 , 0x3B9 ),
0x1FB2 => array (0x1F70 , 0x3B9 ),
0x1FB3 => array (0x3B1 , 0x3B9 ),
0x1FB4 => array (0x3AC , 0x3B9 ),
0x1FB6 => array (0x3B1 , 0x342 ),
0x1FB7 => array (0x3B1 , 0x342 , 0x3B9 ),
0x1FBC => array (0x3B1 , 0x3B9 ),
0x1FC2 => array (0x1F74 , 0x3B9 ),
0x1FC3 => array (0x3B7 , 0x3B9 ),
0x1FC4 => array (0x3AE , 0x3B9 ),
0x1FC6 => array (0x3B7 , 0x342 ),
0x1FC7 => array (0x3B7 , 0x342 , 0x3B9 ),
0x1FCC => array (0x3B7 , 0x3B9 ),
0x1FD2 => array (0x3B9 , 0x308 , 0x300 ),
0x1FD3 => array (0x3B9 , 0x308 , 0x301 ),
0x1FD6 => array (0x3B9 , 0x342 ),
0x1FD7 => array (0x3B9 , 0x308 , 0x342 ),
0x1FE2 => array (0x3C5 , 0x308 , 0x300 ),
0x1FE3 => array (0x3C5 , 0x308 , 0x301 ),
0x1FE4 => array (0x3C1 , 0x313 ),
0x1FE6 => array (0x3C5 , 0x342 ),
0x1FE7 => array (0x3C5 , 0x308 , 0x342 ),
0x1FF2 => array (0x1F7C , 0x3B9 ),
0x1FF3 => array (0x3C9 , 0x3B9 ),
0x1FF4 => array (0x3CE , 0x3B9 ),
0x1FF6 => array (0x3C9 , 0x342 ),
0x1FF7 => array (0x3C9 , 0x342 , 0x3B9 ),
0x1FFC => array (0x3C9 , 0x3B9 ),
0x20A8 => array (0x72 , 0x73 ),
0x2103 => array (0xB0 , 0x63 ),
0x2109 => array (0xB0 , 0x66 ),
0x2116 => array (0x6E , 0x6F ),
0x2120 => array (0x73 , 0x6D ),
0x2121 => array (0x74 , 0x65 , 0x6C ),
0x2122 => array (0x74 , 0x6D ),
0x3371 => array (0x68 , 0x70 , 0x61 ),
0x3373 => array (0x61 , 0x75 ),
0x3375 => array (0x6F , 0x76 ),
0x3380 => array (0x70 , 0x61 ),
0x3381 => array (0x6E , 0x61 ),
0x3382 => array (0x3BC , 0x61 ),
0x3383 => array (0x6D , 0x61 ),
0x3384 => array (0x6B , 0x61 ),
0x3385 => array (0x6B , 0x62 ),
0x3386 => array (0x6D , 0x62 ),
0x3387 => array (0x67 , 0x62 ),
0x338A => array (0x70 , 0x66 ),
0x338B => array (0x6E , 0x66 ),
0x338C => array (0x3BC , 0x66 ),
0x3390 => array (0x68 , 0x7A ),
0x3391 => array (0x6B , 0x68 , 0x7A ),
0x3392 => array (0x6D , 0x68 , 0x7A ),
0x3393 => array (0x67 , 0x68 , 0x7A ),
0x3394 => array (0x74 , 0x68 , 0x7A ),
0x33A9 => array (0x70 , 0x61 ),
0x33AA => array (0x6B , 0x70 , 0x61 ),
0x33AB => array (0x6D , 0x70 , 0x61 ),
0x33AC => array (0x67 , 0x70 , 0x61 ),
0x33B4 => array (0x70 , 0x76 ),
0x33B5 => array (0x6E , 0x76 ),
0x33B6 => array (0x3BC , 0x76 ),
0x33B7 => array (0x6D , 0x76 ),
0x33B8 => array (0x6B , 0x76 ),
0x33B9 => array (0x6D , 0x76 ),
0x33BA => array (0x70 , 0x77 ),
0x33BB => array (0x6E , 0x77 ),
0x33BC => array (0x3BC , 0x77 ),
0x33BD => array (0x6D , 0x77 ),
0x33BE => array (0x6B , 0x77 ),
0x33BF => array (0x6D , 0x77 ),
0x33C0 => array (0x6B , 0x3C9 ),
0x33C1 => array (0x6D , 0x3C9 ),
/* 0x33C2 => array(0x61, 0x2E, 0x6D, 0x2E), */
0x33C3 => array (0x62 , 0x71 ),
0x33C6 => array (0x63 , 0x2215 , 0x6B , 0x67 ),
0x33C7 => array (0x63 , 0x6F , 0x2E ),
0x33C8 => array (0x64 , 0x62 ),
0x33C9 => array (0x67 , 0x79 ),
0x33CB => array (0x68 , 0x70 ),
0x33CD => array (0x6B , 0x6B ),
0x33CE => array (0x6B , 0x6D ),
0x33D7 => array (0x70 , 0x68 ),
0x33D9 => array (0x70 , 0x70 , 0x6D ),
0x33DA => array (0x70 , 0x72 ),
0x33DC => array (0x73 , 0x76 ),
0x33DD => array (0x77 , 0x62 ),
0xFB00 => array (0x66 , 0x66 ),
0xFB01 => array (0x66 , 0x69 ),
0xFB02 => array (0x66 , 0x6C ),
0xFB03 => array (0x66 , 0x66 , 0x69 ),
0xFB04 => array (0x66 , 0x66 , 0x6C ),
0xFB05 => array (0x73 , 0x74 ),
0xFB06 => array (0x73 , 0x74 ),
0xFB13 => array (0x574 , 0x576 ),
0xFB14 => array (0x574 , 0x565 ),
0xFB15 => array (0x574 , 0x56B ),
0xFB16 => array (0x57E , 0x576 ),
0xFB17 => array (0x574 , 0x56D ),
0x10400 => array (0x10428 ),
0x10401 => array (0x10429 ),
0x10402 => array (0x1042A ),
0x10403 => array (0x1042B ),
0x10404 => array (0x1042C ),
0x10405 => array (0x1042D ),
0x10406 => array (0x1042E ),
0x10407 => array (0x1042F ),
0x10408 => array (0x10430 ),
0x10409 => array (0x10431 ),
0x1040A => array (0x10432 ),
0x1040B => array (0x10433 ),
0x1040C => array (0x10434 ),
0x1040D => array (0x10435 ),
0x1040E => array (0x10436 ),
0x1040F => array (0x10437 ),
0x10410 => array (0x10438 ),
0x10411 => array (0x10439 ),
0x10412 => array (0x1043A ),
0x10413 => array (0x1043B ),
0x10414 => array (0x1043C ),
0x10415 => array (0x1043D ),
0x10416 => array (0x1043E ),
0x10417 => array (0x1043F ),
0x10418 => array (0x10440 ),
0x10419 => array (0x10441 ),
0x1041A => array (0x10442 ),
0x1041B => array (0x10443 ),
0x1041C => array (0x10444 ),
0x1041D => array (0x10445 ),
0x1041E => array (0x10446 ),
0x1041F => array (0x10447 ),
0x10420 => array (0x10448 ),
0x10421 => array (0x10449 ),
0x10422 => array (0x1044A ),
0x10423 => array (0x1044B ),
0x10424 => array (0x1044C ),
0x10425 => array (0x1044D ),
0x213B => array (0x66 , 0x61 , 0x78 ),
0x3250 => array (0x70 , 0x74 , 0x65 ),
0x32CC => array (0x68 , 0x67 ),
0x32CE => array (0x65 , 0x76 ),
0x32CF => array (0x6C , 0x74 , 0x64 ),
0x337A => array (0x69 , 0x75 ),
0x33DE => array (0x76 , 0x2215 , 0x6D ),
0x33DF => array (0x61 , 0x2215 , 0x6D )
* Normalization Combining Classes; code points not listed have combining class 0
private static $_np_norm_combcls = array (
private $_punycode_prefix = 'xn--';
private $_invalid_ucs = 0x80000000;
private $_max_ucs = 0x10FFFF;
private $_initial_bias = 72;
private $_initial_n = 0x80;
private $_sbase = 0xAC00;
private $_lbase = 0x1100;
private $_vbase = 0x1161;
private $_tbase = 0x11a7;
* lcount * tcount * vcount
private $_scount = 11172;
* Default encoding for encode()'s input and decode()'s output is UTF-8;
* Other possible encodings are ucs4_string and ucs4_array
* See {@link setParams()} for how to select these
private $_api_encoding = 'utf8';
* Overlong UTF-8 encodings are forbidden
private $_allow_overlong = false;
private $_strict_mode = false;
* Values are "2003" and "2008".
* Defaults to "2003", since that was the original version and for
* compatibility with previous versions of this library.
* If you need to encode "new" characters like the German "Eszett",
* please switch to 2008 first before encoding.
private $_version = '2003';
* Cached value indicating whether or not mbstring function overloading is
* turned on. This is cached for optimal performance.
* @see Net_IDNA2::_byteLength()
private static $_mb_string_overload = null;
* @param array $options Options to initialise the object with
$this->_slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;
// populate mbstring overloading cache if not set
if (self ::$_mb_string_overload === null ) {
self ::$_mb_string_overload = (extension_loaded ('mbstring')
&& (ini_get('mbstring.func_overload') & 0x02 ) === 0x02 );
* Sets a new option value. Available options and values:
* [utf8 - Use either UTF-8 or ISO-8859-1 as input (true for UTF-8, false
* otherwise); The output is always UTF-8]
* [overlong - Unicode does not allow unnecessarily long encodings of chars,
* to allow this, set this parameter to true, else to false;
* [strict - true: strict mode, good for registration purposes - Causes errors
* on failures; false: loose mode, ideal for "wildlife" applications
* by silently ignoring errors and returning the original input instead]
* @param mixed $option Parameter to set (string: single parameter; array of Parameter => Value pairs)
* @param string $value Value to use (if parameter 1 is a string)
* @return boolean true on success, false otherwise
public function setParams($option, $value = false )
$option = array ($option => $value);
foreach ($option as $k => $v) {
$this->_api_encoding = $v;
throw new InvalidArgumentException ('Set Parameter: Unknown parameter '. $v. ' for option '. $k);
$this->_allow_overlong = ($v) ? true : false;
$this->_strict_mode = ($v) ? true : false;
if (in_array($v, array ('2003', '2008'))) {
throw new InvalidArgumentException ('Set Parameter: Invalid parameter '. $v. ' for option '. $k);
* Encode a given UTF-8 domain name.
* @param string $decoded Domain name (UTF-8 or UCS-4)
* @param string $one_time_encoding Desired input encoding, see {@link setParams()}
* If not given will use default-encoding
* @return string Encoded Domain name (ACE string)
* @return mixed processed string
public function encode($decoded, $one_time_encoding = false )
// Forcing conversion of input to UCS4 array
// If one time encoding is given, use this, else the objects property
switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
$decoded = $this->_utf8_to_ucs4 ($decoded);
$decoded = $this->_ucs4_string_to_ucs4 ($decoded);
case 'ucs4_array': // No break; before this line. Catch case, but do nothing
throw new InvalidArgumentException ('Unsupported input format');
// No input, no output, what else did you expect?
if (empty ($decoded)) return '';
foreach ($decoded as $k => $v) {
// Make sure to use just the plain dot
// It's right, no break here
// The codepoints above have to be converted to dots anyway
// Stumbling across an anchoring character
// Neither email addresses nor URLs allowed in strict mode
if ($this->_strict_mode) {
throw new InvalidArgumentException ('Neither email addresses nor URLs are allowed in strict mode.');
$encoded = $this->_encode (array_slice($decoded, $last_begin, (($k)- $last_begin)));
$output .= $this->_ucs4_to_utf8 (array_slice($decoded, $last_begin, (($k)- $last_begin)));
$output .= chr($decoded[$k]);
// Catch the rest of the string
$encoded = $this->_encode (array_slice($decoded, $last_begin, (($inp_len)- $last_begin)));
$output .= $this->_ucs4_to_utf8 (array_slice($decoded, $last_begin, (($inp_len)- $last_begin)));
if ($output = $this->_encode ($decoded)) {
return $this->_ucs4_to_utf8 ($decoded);
* Decode a given ACE domain name.
* @param string $input Domain name (ACE string)
* @param string $one_time_encoding Desired output encoding, see {@link setParams()}
* @return string Decoded Domain name (UTF-8 or UCS-4)
public function decode($input, $one_time_encoding = false )
if ($one_time_encoding) {
switch ($one_time_encoding) {
throw new InvalidArgumentException ('Unknown encoding '. $one_time_encoding);
// Make sure to drop any newline characters around
// Negotiate input and try to determine whether it is a plain string,
// an email address or something like a complete URL
if (strpos($input, '@')) { // Maybe it is an email address
if ($this->_strict_mode) {
throw new InvalidArgumentException ('Only simple domain name parts can be handled in strict mode');
list ($email_pref, $input) = explode('@', $input, 2 );
foreach ($arr as $k => $v) {
$conv = $this->_decode ($v);
if ($conv) $arr[$k] = $conv;
$return = $email_pref . '@' . join('.', $arr);
} elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
if ($this->_strict_mode) {
throw new InvalidArgumentException ('Only simple domain name parts can be handled in strict mode');
if (isset ($parsed['host'])) {
$arr = explode('.', $parsed['host']);
foreach ($arr as $k => $v) {
$conv = $this->_decode ($v);
if ($conv) $arr[$k] = $conv;
$parsed['host'] = join('.', $arr);
if (isset ($parsed['scheme'])) {
$parsed['scheme'] .= (strtolower($parsed['scheme']) == 'mailto') ? ':' : '://';
$return = $this->_unparse_url ($parsed);
} else { // parse_url seems to have failed, try without it
foreach ($arr as $k => $v) {
$conv = $this->_decode ($v);
if ($conv) $arr[$k] = $conv;
$return = join('.', $arr);
} else { // Otherwise we consider it being a pure domain name string
$return = $this->_decode ($input);
// The output is UTF-8 by default, other output formats need conversion here
// If one time encoding is given, use this, else the objects property
switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
return $this->_ucs4_to_ucs4_string ($this->_utf8_to_ucs4 ($return));
return $this->_utf8_to_ucs4 ($return);
throw new InvalidArgumentException ('Unsupported output format');
/**
 * Opposite function to parse_url().
 *
 * Re-assembles a URL string from the components returned by parse_url().
 * Inspired by code from comments of php.net-documentation for parse_url().
 *
 * No scheme separator is added here: decode() appends "://" (or ":" for
 * mailto) to $parts_arr['scheme'] before calling this method.
 *
 * @param array $parts_arr parts (strings) as returned by parse_url()
 *
 * @return string the re-assembled URL; an empty string if no part is set
 */
private function _unparse_url($parts_arr)
{
    // Start from an empty string so a URL without a scheme does not append
    // to an undefined variable.
    $ret_url = '';

    if (!empty($parts_arr['scheme'])) {
        $ret_url .= $parts_arr['scheme'];
    }

    if (!empty($parts_arr['user'])) {
        $ret_url .= $parts_arr['user'];
        if (!empty($parts_arr['pass'])) {
            $ret_url .= ':' . $parts_arr['pass'];
        }
        // Credentials are always separated from the host by "@"
        $ret_url .= '@';
    }

    if (isset($parts_arr['host'])) {
        $ret_url .= $parts_arr['host'];
    }

    if (!empty($parts_arr['port'])) {
        $ret_url .= ':' . $parts_arr['port'];
    }

    // parse_url() omits "path" for host-only URLs such as http://example.com
    if (isset($parts_arr['path'])) {
        $ret_url .= $parts_arr['path'];
    }

    if (!empty($parts_arr['query'])) {
        $ret_url .= '?' . $parts_arr['query'];
    }

    if (!empty($parts_arr['fragment'])) {
        $ret_url .= '#' . $parts_arr['fragment'];
    }

    return $ret_url;
}
* The actual encoding algorithm.
* @param string $decoded Decoded string which should be encoded
* @return string Encoded string
private function _encode ($decoded)
// We cannot encode a domain name containing the Punycode prefix
$extract = self ::_byteLength ($this->_punycode_prefix);
$check_pref = $this->_utf8_to_ucs4 ($this->_punycode_prefix);
if ($check_pref == $check_deco) {
throw new InvalidArgumentException ('This is already a punycode string');
// We will not try to encode strings consisting of basic code points only
foreach ($decoded as $k => $v) {
if ($this->_strict_mode) {
throw new InvalidArgumentException ('The given string does not contain encodable chars');
$decoded = $this->_nameprep ($decoded);
$deco_len = count($decoded);
// How many chars have been consumed
// Start with the prefix; copy it to output
$encoded = $this->_punycode_prefix;
// Copy all basic code points to output
for ($i = 0; $i < $deco_len; ++ $i) {
// Will match [0-9a-zA-Z-]
if ((0x2F < $test && $test < 0x40 )
|| (0x40 < $test && $test < 0x5B )
|| (0x60 < $test && $test <= 0x7B )
$encoded .= chr($decoded[$i]);
// All codepoints were basic ones
if ($codecount == $deco_len) {
// Start with the prefix; copy it to output
$encoded = $this->_punycode_prefix . $encoded;
// If we have basic code points in output, add an hyphen to the end
// Now find and encode all non-basic code points
$cur_code = $this->_initial_n;
$bias = $this->_initial_bias;
while ($codecount < $deco_len) {
// Find the smallest code point >= the current code point and
// remember the last occurrence of it in the input
for ($i = 0 , $next_code = $this->_max_ucs; $i < $deco_len; $i++ ) {
if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
$next_code = $decoded[$i];
$delta += ($next_code - $cur_code) * ($codecount + 1 );
// Scan input again and encode all characters whose code point is $cur_code
for ($i = 0; $i < $deco_len; $i++ ) {
if ($decoded[$i] < $cur_code) {
} else if ($decoded[$i] == $cur_code) {
for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
(($k >= $bias + $this->_tmax)? $this->_tmax : $k - $bias);
$encoded .= $this->_encodeDigit (ceil($t + (($q - $t) % ($this->_base - $t))));
$q = ($q - $t) / ($this->_base - $t);
$encoded .= $this->_encodeDigit ($q);
$bias = $this->_adapt ($delta, $codecount + 1 , $is_first);
* The actual decoding algorithm.
* @param string $encoded Encoded string which should be decoded
* @return string Decoded string
private function _decode ($encoded)
// We do need to find the Punycode prefix
// If nothing left after removing the prefix, it is hopeless
// Find last occurrence of the delimiter
$delim_pos = strrpos($encoded, '-');
if ($delim_pos > self ::_byteLength ($this->_punycode_prefix)) {
for ($k = self ::_byteLength ($this->_punycode_prefix); $k < $delim_pos; ++ $k) {
$decoded[] = ord($encoded{$k});
$deco_len = count($decoded);
$enco_len = self ::_byteLength ($encoded);
// Wandering through the strings; init
$bias = $this->_initial_bias;
$char = $this->_initial_n;
for ($enco_idx = ($delim_pos)? ($delim_pos + 1 ) : 0; $enco_idx < $enco_len; ++ $deco_len) {
for ($old_idx = $idx, $w = 1 , $k = $this->_base; 1 ; $k += $this->_base) {
$digit = $this->_decodeDigit ($encoded{$enco_idx++ });
(($k >= $bias + $this->_tmax)? $this->_tmax : ($k - $bias));
$w = (int) ($w * ($this->_base - $t));
$bias = $this->_adapt ($idx - $old_idx, $deco_len + 1 , $is_first);
$char += (int) ($idx / ($deco_len + 1 ));
// Make room for the decoded char
for ($i = $deco_len; $i > $idx; $i-- ) {
$decoded[$i] = $decoded[($i - 1 )];
$decoded[$idx++ ] = $char;
return $this->_ucs4_to_utf8 ($decoded);
/**
 * Adapt the bias according to the current code point and position.
 *
 * The delta is first damped (divided by $this->_damp on the first call,
 * halved otherwise), then increased by its share per code point. It is then
 * repeatedly divided by (base - tmin) while it exceeds
 * ((base - tmin) * tmax) / 2; every such division adds one base to the result.
 *
 * @param int     $delta    Delta to adapt the bias from
 * @param int     $npoints  Divisor applied to the damped delta
 * @param boolean $is_first Selects $this->_damp (true) or 2 (false) as the damping divisor
 *
 * @return int The new bias
 */
private function _adapt($delta, $npoints, $is_first)
{
    $span = $this->_base - $this->_tmin;

    // Damp the delta, then add its per-point share
    $divisor = $is_first ? $this->_damp : 2;
    $scaled = (int) ($delta / $divisor);
    $scaled += (int) ($scaled / $npoints);

    // Shrink the delta until it is within the threshold, counting bases
    $offset = 0;
    while ($scaled > ($span * $this->_tmax) / 2) {
        $scaled = (int) ($scaled / $span);
        $offset += $this->_base;
    }

    return (int) ($offset + ($span + 1) * $scaled / ($scaled + $this->_skew));
}
/**
 * Encoding a certain digit.
 *
 * Values below 26 are shifted by 97, so 0..25 become 'a'..'z'; all other
 * values are shifted by 22, so 26..35 become '0'..'9'.
 *
 * @param int $d One digit to encode
 *
 * @return char Encoded digit
 */
private function _encodeDigit($d)
{
    $shift = ($d < 26) ? 97 : 22;

    return chr($d + $shift);
}
/**
 * Decode a certain digit.
 *
 * Maps 'A'..'Z' and 'a'..'z' to 0..25 and '0'..'9' to 26..35. Any other
 * byte yields $this->_base.
 * NOTE(review): presumably callers treat a result >= base as invalid input;
 * confirm against _decode().
 *
 * The ranges are tested explicitly on both ends. Tests of the form
 * "$cp - 48 < 10" rely on unsigned wrap-around and, with PHP's signed
 * integers, also accept every byte below the range (e.g. '-' or ':').
 *
 * @param char $cp One digit (character) to decode
 *
 * @return int Decoded digit
 */
private function _decodeDigit($cp)
{
    $code = ord($cp);

    if ($code >= 0x30 && $code <= 0x39) { // '0'..'9' => 26..35
        return $code - 22;
    }
    if ($code >= 0x41 && $code <= 0x5A) { // 'A'..'Z' => 0..25
        return $code - 0x41;
    }
    if ($code >= 0x61 && $code <= 0x7A) { // 'a'..'z' => 0..25
        return $code - 0x61;
    }

    return $this->_base;
}
* Do Nameprep according to RFC3491 and RFC3454.
* @param array $input Unicode Characters
* @return string Unicode Characters, Nameprep'd
private function _nameprep ($input)
// Walking through the input array, performing the required steps on each of
// the input chars and putting the result into the output array
// While mapping required chars we apply the canonical ordering
// Map to nothing == skip that code point
if (in_array($v, self ::$_np_map_nothing)) {
// Try to find prohibited input
if (in_array ($v, self ::$_np_prohibit) || in_array ($v, self ::$_general_prohibited)) {
throw new Net_IDNA2_Exception_Nameprep ('Prohibited input U+' . sprintf('%08X', $v));
foreach (self ::$_np_prohibit_ranges as $range) {
if ($range[0 ] <= $v && $v <= $range[1 ]) {
throw new Net_IDNA2_Exception_Nameprep ('Prohibited input U+' . sprintf('%08X', $v));
// Hangul syllable decomposition
if (0xAC00 <= $v && $v <= 0xD7AF ) {
foreach ($this->_hangulDecompose ($v) as $out) {
} else if (($this->_version == '2003') && isset (self ::$_np_replacemaps[$v])) {
// There's a decomposition mapping for that code point
// Decompositions only in version 2003 (original) of IDNA
foreach ($this->_applyCannonicalOrdering (self ::$_np_replacemaps[$v]) as $out) {
$out_len = count ($output);
for ($i = 0; $i < $out_len; ++ $i) {
$class = $this->_getCombiningClass ($output[$i]);
if ((!$last_class || $last_class != $class) && $class) {
$seq_len = $i - $last_starter;
$out = $this->_combine (array_slice($output, $last_starter, $seq_len));
// On match: Replace the last starter with the composed character and remove
// the now redundant non-starter(s)
$output[$last_starter] = $out;
if (count($out) != $seq_len) {
for ($j = $i + 1; $j < $out_len; ++ $j) {
$output[$j - 1 ] = $output[$j];
unset ($output[$out_len]);
// Rewind the for loop by one, since there can be more possible compositions
$last_class = ($i == $last_starter)? 0 : $this->_getCombiningClass ($output[$i - 1 ]);
// The current class is 0
* Decomposes a Hangul syllable
* (see http://www.unicode.org/unicode/reports/tr15/#Hangul).
* @param integer $char 32bit UCS4 code point
* @return array Either Hangul Syllable decomposed or original 32bit
* value as one value array
private function _hangulDecompose ($char)
$sindex = $char - $this->_sbase;
if ($sindex < 0 || $sindex >= $this->_scount) {
$T = $this->_tbase + $sindex % $this->_tcount;
$result[] = (int) ($this->_lbase + $sindex / $this->_ncount);
$result[] = (int) ($this->_vbase + ($sindex % $this->_ncount) / $this->_tcount);
if ($T != $this->_tbase) {
* Composes a Hangul syllable
* (see http://www.unicode.org/unicode/reports/tr15/#Hangul).
* @param array $input Decomposed UCS4 sequence
* @return array UCS4 sequence with syllables composed
private function _hangulCompose ($input)
$inp_len = count($input);
$result[] = $last; // copy first char from input to output
for ($i = 1; $i < $inp_len; ++ $i) {
// Find out whether the two current characters are L and V
$lindex = $last - $this->_lbase;
if (0 <= $lindex && $lindex < $this->_lcount) {
$vindex = $char - $this->_vbase;
if (0 <= $vindex && $vindex < $this->_vcount) {
// create syllable of form LV
$last = ($this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount);
$out_off = count($result) - 1;
$result[$out_off] = $last; // reset last
// Find out whether the two current characters are LV and T
$sindex = $last - $this->_sbase;
if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount) == 0 ) {
$tindex = $char - $this->_tbase;
if (0 <= $tindex && $tindex <= $this->_tcount) {
// create syllable of form LVT
$out_off = count($result) - 1;
$result[$out_off] = $last; // reset last
// if neither case was true, just add the character
/**
 * Returns the combining class of a certain wide char.
 *
 * Looks the code point up in self::$_np_norm_combcls.
 *
 * @param integer $char Wide char to check (32bit integer)
 *
 * @return integer Combining class if found, else 0
 */
private function _getCombiningClass($char)
{
    if (isset(self::$_np_norm_combcls[$char])) {
        return self::$_np_norm_combcls[$char];
    }

    return 0;
}
* Applies the canonical ordering of a decomposed UCS4 sequence.
* @param array $input Decomposed UCS4 sequence
* @return array Ordered UCS4 sequence
private function _applyCannonicalOrdering ($input)
$last = $this->_getCombiningClass ($input[0 ]);
for ($i = 0; $i < $size - 1; ++ $i) {
$next = $this->_getCombiningClass ($input[$i + 1 ]);
if ($next != 0 && $last > $next) {
// Move item leftward until it fits
for ($j = $i + 1; $j > 0; -- $j) {
if ($this->_getCombiningClass ($input[$j - 1 ]) <= $next) {
$input[$j] = $input[$j - 1 ];
// Reentering the loop looking at the old character again
* Do composition of a sequence of starter and non-starter.
* @param array $input UCS4 Decomposed sequence
* @return array Ordered UCS4 sequence
private function _combine ($input)
$inp_len = count($input);
// Is it a Hangul syllable?
$hangul = $this->_hangulCompose ($input);
// This place is probably wrong
if (count($hangul) != $inp_len) {
foreach (self ::$_np_replacemaps as $np_src => $np_target) {
if ($np_target[0 ] != $input[0 ]) {
if (count ($np_target) != $inp_len) {
foreach ($input as $k2 => $v2) {
if ($v2 == $np_target[$k2]) {
* This converts an UTF-8 encoded string to its UCS-4 (array) representation
* By talking about UCS-4 we mean arrays of 32bit integers representing
* each of the "chars". This is due to PHP not being able to handle strings with
* bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
* The following UTF-8 encodings are supported:
* bytes bits representation
* 3 16 1110xxxx 10xxxxxx 10xxxxxx
* 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* Each x represents a bit that can be used to store character data.
* @param string $input utf8-encoded string
* @return array ucs4-encoded array
private function _utf8_to_ucs4 ($input)
$inp_len = self ::_byteLength ($input, '8bit');
for ($k = 0; $k < $inp_len; ++ $k) {
$v = ord($input{$k}); // Extract byte from input string
if ($v < 128 ) { // We found an ASCII char - put into stirng as is
throw new UnexpectedValueException ('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '. $k);
if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
if ($v >> 5 == 6 ) { // &110xxxxx 10xxxxx
$next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
} elseif ($v >> 4 == 14 ) { // &1110xxxx 10xxxxxx 10xxxxxx
} elseif ($v >> 3 == 30 ) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
} elseif ($v >> 2 == 62 ) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
} elseif ($v >> 1 == 126 ) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
throw new UnexpectedValueException ('This might be UTF-8, but I don\'t understand it at byte '. $k);
$output[$out_len] = (int) $v;
if (!$this->_allow_overlong && $test == 'range') {
if (($v < 0xA0 && $start_byte == 0xE0 ) || ($v < 0x90 && $start_byte == 0xF0 ) || ($v > 0x8F && $start_byte == 0xF4 )) {
throw new OutOfRangeException ('Bogus UTF-8 character detected (out of legal range) at byte '. $k);
if ($v >> 6 == 2 ) { // Bit mask must be 10xxxxxx
$v = ($v - 128 ) << ($next_byte * 6 );
$output[($out_len - 1 )] += $v;
throw new UnexpectedValueException ('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '. $k);
* Convert UCS-4 array into UTF-8 string
* @param array $input ucs4-encoded array
* @return string utf8-encoded string
private function _ucs4_to_utf8 ($input)
// 7bit are transferred literally
} else if ($v < 1 << 11 ) {
$output .= chr(192 + ($v >> 6 ))
} else if ($v < 1 << 16 ) {
$output .= chr(224 + ($v >> 12 ))
. chr(128 + (($v >> 6 ) & 63 ))
} else if ($v < 1 << 21 ) {
$output .= chr(240 + ($v >> 18 ))
. chr(128 + (($v >> 12 ) & 63 ))
. chr(128 + (($v >> 6 ) & 63 ))
} else if ($v < 1 << 26 ) {
$output .= chr(248 + ($v >> 24 ))
. chr(128 + (($v >> 18 ) & 63 ))
. chr(128 + (($v >> 12 ) & 63 ))
. chr(128 + (($v >> 6 ) & 63 ))
} else if ($v < 1 << 31 ) {
$output .= chr(252 + ($v >> 30 ))
. chr(128 + (($v >> 24 ) & 63 ))
. chr(128 + (($v >> 18 ) & 63 ))
. chr(128 + (($v >> 12 ) & 63 ))
. chr(128 + (($v >> 6 ) & 63 ))
throw new UnexpectedValueException ('Conversion from UCS-4 to UTF-8 failed: malformed input');
/**
 * Convert UCS-4 array into UCS-4 string
 *
 * Every code point is written as four bytes, most significant byte first.
 * This is the byte order _ucs4_string_to_ucs4() reads back.
 *
 * @param array $input ucs4-encoded array
 *
 * @return string ucs4-encoded string
 */
private function _ucs4_to_ucs4_string($input)
{
    $output = '';

    foreach ($input as $v) {
        // "N" packs an unsigned 32 bit integer in big endian byte order.
        // The previous expression masked every term with 255 because ">>"
        // binds tighter than "&", and it concatenated decimal numbers
        // rather than bytes.
        $output .= pack('N', $v);
    }

    return $output;
}
/**
 * Convert UCS-4 string into UCS-4 array
 *
 * Every group of four input bytes is read as one code point, most
 * significant byte first.
 *
 * @param string $input ucs4-encoded string
 *
 * @return array ucs4-encoded array
 * @throws InvalidArgumentException if the input length is not a multiple of 4
 */
private function _ucs4_string_to_ucs4($input)
{
    $inp_len = self::_byteLength($input);

    // Input length must be dividable by 4
    if ($inp_len % 4) {
        throw new InvalidArgumentException('Input UCS4 string is broken');
    }

    // Empty input - return empty output
    if (!$inp_len) {
        return array();
    }

    // "N*" reads consecutive unsigned 32 bit big endian integers. This
    // replaces per-byte access via $input{$i}, a syntax removed in PHP 8.0.
    // unpack() keys start at 1, so re-index to a plain list.
    return array_values(unpack('N*', $input));
}
* Echo hex representation of UCS4 sequence.
* @param array $input UCS4 sequence
* @param boolean $include_bit Include bitmask in output
private static function _showHex ($input, $include_bit = false )
foreach ($input as $k => $v) {
echo '[', $k, '] => ', sprintf('%X', $v);
* Gives you a bit representation of given Byte (8 bits), Word (16 bits) or DWord (32 bits)
* Output width is automagically determined
* @return string Bitmask-representation
private static function _showBitmask ($octet)
if ($octet >= (1 << 16 )) {
} else if ($octet >= (1 << 8 )) {
for ($i = $w; $i > -1; $i-- ) {
$return .= ($octet & (1 << $i))? '1' : '0';
/**
 * Gets the length of a string in bytes even if mbstring function
 * overloading is turned on
 *
 * When self::$_mb_string_overload is set, strlen() may count characters
 * instead of bytes, so mb_strlen() with the '8bit' encoding is used.
 *
 * @param string $string the string for which to get the length.
 *
 * @return integer the length of the string in bytes.
 *
 * @see Net_IDNA2::$_mb_string_overload
 */
private static function _byteLength($string)
{
    if (self::$_mb_string_overload) {
        return mb_strlen($string, '8bit');
    }

    // (string) is the canonical spelling of the former (binary) cast alias
    return strlen((string) $string);
}
* Attempts to return a concrete IDNA instance for either php4 or php5.
* @param array $params Set of parameters
* Attempts to return a concrete IDNA instance for either php4 or php5,
* only creating a new instance if no IDNA instance with the same
* parameters currently exists.
* @param array $params Set of parameters
* @return object Net_IDNA2
if (!isset ($instances)) {
if (!isset ($instances[$signature])) {
return $instances[$signature];
Documentation generated on Mon, 11 Mar 2019 15:41:59 -0400 by phpDocumentor 1.4.4. PEAR Logo Copyright © PHP Group 2004.
|