From: Michael Wallner Date: Wed, 29 Oct 2014 07:54:49 +0000 (+0100) Subject: update X-Git-Tag: RELEASE_2_2_0_RC1~9^2~18 X-Git-Url: https://git.m6w6.name/?a=commitdiff_plain;h=62e9d3e1f5ec8896131313c70574ce7f7cbcfc99;p=m6w6%2Fext-http update --- diff --git a/ctype.php b/ctype.php deleted file mode 100644 index c670096..0000000 --- a/ctype.php +++ /dev/null @@ -1,104 +0,0 @@ -= 2 ? $argv[1] : "/usr/share/i18n/locales/i18n"; - -$f = fopen($i18n, "r"); -$c = false; -$a = false; -$r = array(); - -print <<", $start); - sscanf($sstep, "(%d)", $step); - sscanf($send, "", $end); - - break; - case 2: - list($sstart, $send) = $range; - $step = 1; - sscanf($sstart, "", $start); - sscanf($send, "", $end); - break; - case 1: - list($sstart) = $range; - sscanf($sstart, "", $start); - break; - } - print "\t{"; - if ($start >= 0xffff) { - printf("0x%08X, ", $start); - if ($end) { - printf("0x%08X, ", $end); - } else { - print(" 0, "); - } - } else { - printf(" 0x%04X, ", $start); - if ($end) { - printf(" 0x%04X, ", $end); - } else { - print(" 0, "); - } - } - printf("%d},\n", $step); - } - } - break; - default: - if ($a) { - break 2; - } elseif ($line === "alpha /\n") { - $a = true; - } - break; - } -} - -print <<= 2 ? $argv[1] : "/usr/share/i18n/locales/i18n"; + +$f = fopen($i18n, "r"); +$c = false; +$a = false; + +ob_start(null, 0xffff); +while (!feof($f)) { + $line = fgets($f); + if (!$c && $line !== "LC_CTYPE\n") { + continue; + } + $c = true; + if ($line === "END LC_CTYPE\n") { + break; + } + switch($line{0}) { + case "%": + if ($a) { + printf("/* %s */\n", trim($line, "%\n/ ")); + } + break; + case "\n": + if ($a) { + break 2; + } + break; + case " ": + if ($a) { + foreach (explode(";", trim($line, "\n/ ;")) as $ranges) { + $range = explode("..", $ranges); + $step = 0; + $end = 0; + switch (count($range)) { + case 3: + list($sstart, $sstep, $send) = $range; + sscanf($sstart, "", $start); + sscanf($sstep, "(%d)", $step); + sscanf($send, "", $end); + + break; + case 2: + list($sstart, $send) = $range; + $step = 1; + sscanf($sstart, "", $start); + sscanf($send, "", $end); + break; + case 1: + list($sstart) = $range; + sscanf($sstart, "", $start); + break; + } + print "\t{"; + if ($start >= 0xffff) { + printf("0x%08X, ", $start); + if ($end) { + printf("0x%08X, ", $end); + } else { + print(" 0, "); + } + } else { + printf(" 0x%04X, ", $start); + if ($end) { + printf(" 0x%04X, ", $end); + } else { + print(" 0, "); + } + } + printf("%d},\n", $step); + } + } + break; + default: + if ($a) { + break 2; + } elseif ($line === "alpha /\n") { + $a = true; + } + break; + } +} + +file_put_contents("php_http_utf8.h", + preg_replace('/(\/\* BEGIN::UTF8TABLE \*\/\n).*(\n\s*\/\* END::UTF8TABLE \*\/)/s', '$1'. ob_get_contents() .'$2', + file_get_contents("php_http_utf8.h"))); diff --git a/package.xml b/package.xml index 6d0b0a5..154b273 100644 --- a/package.xml +++ b/package.xml @@ -122,6 +122,7 @@ v2: http://dev.iworks.at/ext-http/lcov/ext/http/ + @@ -272,11 +273,18 @@ v2: http://dev.iworks.at/ext-http/lcov/ext/http/ + - + + + + + + + diff --git a/php_http_url.c b/php_http_url.c index 2bf40b7..5ace03d 100644 --- a/php_http_url.c +++ b/php_http_url.c @@ -21,6 +21,8 @@ # include #endif +#include "php_http_utf8.h" + static inline char *localhostname(void) { char hostname[1024] = {0}; @@ -328,93 +330,6 @@ void php_http_url_free(php_http_url_t **url) } } -static const unsigned char utf8mblen[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6 -}; -static const unsigned char utf8mask[] = { - 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 -}; - -static inline size_t utf8towc(unsigned *wc, const unsigned char *uc, size_t len) -{ - unsigned char ub = utf8mblen[*uc]; - - if (!ub || ub > len || ub > 3) { - return 0; - } - - *wc = *uc & utf8mask[ub]; - - switch (ub) { - case 4: - if ((uc[1] & 0xc0) != 0x80) { - return 0; - } - *wc <<= 6; - *wc += *++uc & 0x3f; - /* no break */ - case 3: - if ((uc[1] & 0xc0) != 0x80) { - return 0; - } - *wc <<= 6; - *wc += *++uc & 0x3f; - /* no break */ - case 2: - if ((uc[1] & 0xc0) != 0x80) { - return 0; - } - *wc <<= 6; - *wc += *++uc & 0x3f; - break; - - default: - return 0; - } - - return ub; -} - -#include "ualpha.h" - -static inline zend_bool isualnum(unsigned ch) -{ - unsigned i; - - /* digits */ - if (ch >= 0x30 && ch <= 0x39) { - return 1; - } - - for (i = 0; i < sizeof(utf8_ranges)/sizeof(utf8_range_t); ++i) { - if (utf8_ranges[i].start == ch) { - return 1; - } else if (utf8_ranges[i].start <= ch && utf8_ranges[i].end >= ch) { - if (utf8_ranges[i].step == 1) { - return 1; - } - /* FIXME step */ - return 0; - } - } - return 0; -} - static size_t parse_mb_utf8(php_http_url_t *url, const char *ptr, const char *end, zend_bool idn) { unsigned wchar; @@ -646,35 +561,26 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e #ifdef PHP_HTTP_HAVE_IDN if (url->flags & PHP_HTTP_URL_PARSE_IDN) { - if (url->flags & PHP_HTTP_URL_PARSE_MBUTF8) { - char *idn = NULL; - int rv = idna_to_ascii_8z(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); + char *idn = NULL; + int rv = -1; - if (rv != IDNA_SUCCESS) { - php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse IDN; %s", idna_strerror(rv)); - return FAILURE; - } else { - STR_SET(url->authority.host.str, estrdup(idn)); - url->authority.host.len = strlen(idn); - free(idn); - } + if (url->flags & PHP_HTTP_URL_PARSE_MBUTF8) { + rv = idna_to_ascii_8z(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); } # ifdef PHP_HTTP_HAVE_WCHAR else if (url->flags & PHP_HTTP_URL_PARSE_MBLOC) { - char *idn = NULL; - int rv = idna_to_ascii_lz(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); - - if (rv != IDNA_SUCCESS) { - php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse IDN; %s", idna_strerror(rv)); - return FAILURE; - } else { - STR_SET(url->authority.host.str, estrdup(idn)); - url->authority.host.len = strlen(idn); - free(idn); - } + rv = idna_to_ascii_lz(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); } - } # endif + if (rv != IDNA_SUCCESS) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse IDN; %s", idna_strerror(rv)); + return FAILURE; + } else { + STR_SET(url->authority.host.str, estrdup(idn)); + url->authority.host.len = strlen(idn); + free(idn); + } + } #endif return SUCCESS; diff --git a/php_http_utf8.h b/php_http_utf8.h new file mode 100644 index 0000000..b9b7b28 --- /dev/null +++ b/php_http_utf8.h @@ -0,0 +1,774 @@ +/* + +--------------------------------------------------------------------+ + | PECL :: http | + +--------------------------------------------------------------------+ + | Redistribution and use in source and binary forms, with or without | + | modification, are permitted provided that the conditions mentioned | + | in the accompanying LICENSE file are met. | + +--------------------------------------------------------------------+ + | Copyright (c) 2004-2014, Michael Wallner | + +--------------------------------------------------------------------+ +*/ + +#ifndef PHP_HTTP_UTF8_H +#define PHP_HTTP_UTF8_H + +typedef struct utf8_range { + unsigned int start; + unsigned int end; + unsigned char step; +} utf8_range_t; + +static const unsigned char utf8_mblen[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6 +}; + +static const unsigned char utf8_mask[] = { + 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 +}; + +static const utf8_range_t utf8_ranges[] = { +/* BEGIN::UTF8TABLE */ +/* BASIC LATIN */ + { 0x0041, 0x005A, 1}, + { 0x0061, 0x007A, 1}, +/* LATIN-1 SUPPLEMENT */ + { 0x00AA, 0, 0}, + { 0x00B5, 0, 0}, + { 0x00BA, 0, 0}, + { 0x00C0, 0x00D6, 1}, + { 0x00D8, 0x00F6, 1}, + { 0x00F8, 0x00FF, 1}, +/* LATIN EXTENDED-A */ + { 0x0100, 0x017F, 1}, +/* LATIN EXTENDED-B */ + { 0x0180, 0x024F, 1}, +/* IPA EXTENSIONS */ + { 0x0250, 0x02AF, 1}, +/* SPACING MODIFIER LETTERS */ + { 0x02B0, 0x02C1, 1}, + { 0x02C6, 0x02D1, 1}, + { 0x02E0, 0x02E4, 1}, + { 0x02EE, 0, 0}, +/* COMBINING DIACRITICAL MARKS */ + { 0x0345, 0, 0}, +/* BASIC GREEK */ + { 0x0370, 0x0373, 1}, + { 0x0376, 0x0377, 1}, + { 0x037A, 0x037D, 1}, + { 0x0386, 0, 0}, + { 0x0388, 0x038A, 1}, + { 0x038C, 0, 0}, + { 0x038E, 0x03A1, 1}, + { 0x03A3, 0x03CE, 1}, +/* GREEK SYMBOLS AND COPTIC */ + { 0x03D0, 0x03F5, 1}, + { 0x03F7, 0x03FF, 1}, +/* CYRILLIC */ + { 0x0400, 0x0481, 1}, + { 0x048A, 0x04FF, 1}, +/* CYRILLIC SUPPLEMENT */ + { 0x0500, 0x0523, 1}, +/* ARMENIAN */ + { 0x0531, 0x0556, 1}, + { 0x0559, 0, 0}, + { 0x0561, 0x0587, 1}, +/* HEBREW */ + { 0x05D0, 0x05EA, 1}, + { 0x05F0, 0x05F2, 1}, +/* ARABIC */ + { 0x0621, 0x064A, 1}, + { 0x066E, 0x066F, 1}, + { 0x0671, 0x06D3, 1}, + { 0x06D5, 0, 0}, + { 0x06E5, 0x06E6, 1}, + { 0x06EE, 0x06EF, 1}, + { 0x06FA, 0x06FC, 1}, + { 0x06FF, 0, 0}, +/* SYRIAC */ + { 0x0710, 0, 0}, + { 0x0712, 0x072F, 1}, + { 0x074D, 0x074F, 1}, +/* ARABIC SUPPLEMENT */ + { 0x0750, 0x077F, 1}, +/* THAANA */ + { 0x0780, 0x07A5, 1}, + { 0x07B1, 0, 0}, +/* NKO */ + { 0x07C0, 0x07EA, 1}, + { 0x07F4, 0x07F5, 1}, + { 0x07FA, 0, 0}, +/* - All Matras of Indic and Sinhala are moved from punct to alpha class */ +/* - Added Unicode 5.1 charctares of Indic scripts */ +/* DEVANAGARI */ + { 0x0901, 0x0939, 1}, + { 0x093C, 0x094D, 1}, + { 0x0950, 0x0954, 1}, + { 0x0958, 0x0961, 1}, + { 0x0962, 0, 0}, + { 0x0963, 0, 0}, + { 0x0972, 0, 0}, + { 0x097B, 0x097F, 1}, +/* TABLE 18 BENGALI */ + { 0x0981, 0x0983, 1}, + { 0x0985, 0x098C, 1}, + { 0x098F, 0, 0}, + { 0x0990, 0, 0}, + { 0x0993, 0x09A8, 1}, + { 0x09AA, 0x09B0, 1}, + { 0x09B2, 0, 0}, + { 0x09B6, 0x09B9, 1}, + { 0x09BC, 0x09C4, 1}, + { 0x09C7, 0, 0}, + { 0x09C8, 0, 0}, + { 0x09CB, 0x09CE, 1}, + { 0x09D7, 0, 0}, + { 0x09DC, 0, 0}, + { 0x09DD, 0, 0}, + { 0x09DF, 0x09E3, 1}, + { 0x09F0, 0x09FA, 1}, +/* GURMUKHI */ + { 0x0A01, 0x0A03, 1}, + { 0x0A05, 0x0A0A, 1}, + { 0x0A0F, 0, 0}, + { 0x0A10, 0, 0}, + { 0x0A13, 0x0A28, 1}, + { 0x0A2A, 0x0A30, 1}, + { 0x0A32, 0, 0}, + { 0x0A33, 0, 0}, + { 0x0A35, 0, 0}, + { 0x0A36, 0, 0}, + { 0x0A38, 0, 0}, + { 0x0A39, 0, 0}, + { 0x0A3C, 0, 0}, + { 0x0A3E, 0x0A42, 1}, + { 0x0A47, 0, 0}, + { 0x0A48, 0, 0}, + { 0x0A4B, 0x0A4D, 1}, + { 0x0A51, 0, 0}, + { 0x0A59, 0x0A5C, 1}, + { 0x0A5E, 0, 0}, + { 0x0A70, 0x0A75, 1}, +/* GUJARATI */ + { 0x0A81, 0x0A83, 1}, + { 0x0A85, 0x0A8D, 1}, + { 0x0A8F, 0x0A91, 1}, + { 0x0A93, 0x0AA8, 1}, + { 0x0AAA, 0x0AB0, 1}, + { 0x0AB2, 0, 0}, + { 0x0AB3, 0, 0}, + { 0x0AB5, 0x0AB9, 1}, + { 0x0ABC, 0x0AC5, 1}, + { 0x0AC7, 0x0AC9, 1}, + { 0x0ACB, 0x0ACD, 1}, + { 0x0AD0, 0, 0}, + { 0x0AE0, 0x0AE3, 1}, + { 0x0AF1, 0, 0}, +/* ORIYA */ + { 0x0B01, 0x0B03, 1}, + { 0x0B05, 0x0B0C, 1}, + { 0x0B0F, 0, 0}, + { 0x0B10, 0, 0}, + { 0x0B13, 0x0B28, 1}, + { 0x0B2A, 0x0B30, 1}, + { 0x0B32, 0, 0}, + { 0x0B33, 0, 0}, + { 0x0B35, 0x0B39, 1}, + { 0x0B3C, 0x0B44, 1}, + { 0x0B47, 0x0B48, 1}, + { 0x0B4B, 0x0B4D, 1}, + { 0x0B56, 0x0B57, 1}, + { 0x0B5C, 0, 0}, + { 0x0B5D, 0, 0}, + { 0x0B5F, 0x0B63, 1}, + { 0x0B70, 0, 0}, + { 0x0B71, 0, 0}, +/* TAMIL */ + { 0x0B82, 0, 0}, + { 0x0B83, 0, 0}, + { 0x0B85, 0x0B8A, 1}, + { 0x0B8E, 0x0B90, 1}, + { 0x0B92, 0x0B95, 1}, + { 0x0B99, 0, 0}, + { 0x0B9A, 0, 0}, + { 0x0B9C, 0, 0}, + { 0x0B9E, 0, 0}, + { 0x0B9F, 0, 0}, + { 0x0BA3, 0, 0}, + { 0x0BA4, 0, 0}, + { 0x0BA8, 0x0BAA, 1}, + { 0x0BAE, 0x0BB9, 1}, + { 0x0BBE, 0x0BC2, 1}, + { 0x0BC6, 0x0BC8, 1}, + { 0x0BCA, 0x0BCD, 1}, + { 0x0BD0, 0, 0}, + { 0x0BD7, 0, 0}, + { 0x0BF0, 0x0BFA, 1}, +/* TELUGU */ + { 0x0C01, 0x0C03, 1}, + { 0x0C05, 0x0C0C, 1}, + { 0x0C0E, 0x0C10, 1}, + { 0x0C12, 0x0C28, 1}, + { 0x0C2A, 0x0C33, 1}, + { 0x0C35, 0x0C39, 1}, + { 0x0C3D, 0x0C44, 1}, + { 0x0C46, 0x0C48, 1}, + { 0x0C4A, 0x0C4D, 1}, + { 0x0C55, 0x0C56, 1}, + { 0x0C58, 0x0C59, 1}, + { 0x0C60, 0x0C63, 1}, +/* KANNADA */ + { 0x0C82, 0x0C83, 1}, + { 0x0C85, 0x0C8C, 1}, + { 0x0C8E, 0x0C90, 1}, + { 0x0C92, 0x0CA8, 1}, + { 0x0CAA, 0x0CB3, 1}, + { 0x0CB5, 0x0CB9, 1}, + { 0x0CBC, 0x0CC4, 1}, + { 0x0CC6, 0x0CC8, 1}, + { 0x0CCA, 0x0CCD, 1}, + { 0x0CD5, 0x0CD6, 1}, + { 0x0CDE, 0, 0}, + { 0x0CE0, 0x0CE3, 1}, + { 0x0CF1, 0, 0}, + { 0x0CF2, 0, 0}, +/* MALAYALAM */ + { 0x0D02, 0x0D03, 1}, + { 0x0D05, 0x0D0C, 1}, + { 0x0D0E, 0x0D10, 1}, + { 0x0D12, 0x0D28, 1}, + { 0x0D2A, 0x0D39, 1}, + { 0x0D3D, 0x0D44, 1}, + { 0x0D46, 0x0D48, 1}, + { 0x0D4A, 0x0D4D, 1}, + { 0x0D57, 0, 0}, + { 0x0D60, 0x0D63, 1}, + { 0x0D79, 0x0D7F, 1}, +/* SINHALA */ + { 0x0D82, 0x0D83, 1}, + { 0x0D85, 0x0D96, 1}, + { 0x0D9A, 0x0DB1, 1}, + { 0x0DB3, 0x0DBB, 1}, + { 0x0DBD, 0, 0}, + { 0x0DC0, 0x0DC6, 1}, + { 0x0DCA, 0, 0}, + { 0x0DCF, 0x0DD4, 1}, + { 0x0DD6, 0, 0}, + { 0x0DD8, 0x0DDF, 1}, + { 0x0DF2, 0x0DF4, 1}, +/* THAI */ + { 0x0E01, 0x0E2E, 1}, + { 0x0E30, 0x0E3A, 1}, + { 0x0E40, 0x0E45, 1}, + { 0x0E47, 0x0E4E, 1}, +/* LAO */ + { 0x0E81, 0x0E82, 1}, + { 0x0E84, 0, 0}, + { 0x0E87, 0x0E88, 1}, + { 0x0E8A, 0, 0}, + { 0x0E8D, 0, 0}, + { 0x0E94, 0x0E97, 1}, + { 0x0E99, 0x0E9F, 1}, + { 0x0EA1, 0x0EA3, 1}, + { 0x0EA5, 0, 0}, + { 0x0EA7, 0, 0}, + { 0x0EAA, 0x0EAB, 1}, + { 0x0EAD, 0x0EB0, 1}, + { 0x0EB2, 0x0EB3, 1}, + { 0x0EBD, 0, 0}, + { 0x0EC0, 0x0EC4, 1}, + { 0x0EC6, 0, 0}, + { 0x0EDC, 0x0EDD, 1}, +/* TIBETAN */ + { 0x0F00, 0, 0}, + { 0x0F40, 0x0F47, 1}, + { 0x0F49, 0x0F6C, 1}, + { 0x0F88, 0x0F8B, 1}, +/* MYANMAR */ + { 0x1000, 0x102A, 1}, + { 0x1050, 0x1055, 1}, + { 0x105A, 0x105D, 1}, + { 0x1061, 0, 0}, + { 0x0165, 0, 0}, + { 0x1066, 0, 0}, + { 0x106E, 0x1070, 1}, + { 0x1075, 0x1081, 1}, + { 0x108E, 0, 0}, +/* GEORGIAN */ + { 0x10A0, 0x10C5, 1}, + { 0x10D0, 0x10FA, 1}, + { 0x10FC, 0, 0}, +/* HANGUL JAMO */ + { 0x1100, 0x1159, 1}, + { 0x115F, 0x11A2, 1}, + { 0x11A8, 0x11F9, 1}, +/* ETHIOPIC */ + { 0x1200, 0x1248, 1}, + { 0x124A, 0x124D, 1}, + { 0x1250, 0x1256, 1}, + { 0x1258, 0, 0}, + { 0x125A, 0x125D, 1}, + { 0x1260, 0x1288, 1}, + { 0x128A, 0x128D, 1}, + { 0x1290, 0x12B0, 1}, + { 0x12B2, 0x12B5, 1}, + { 0x12B8, 0x12BE, 1}, + { 0x12C0, 0, 0}, + { 0x12C2, 0x12C5, 1}, + { 0x12C8, 0x12D6, 1}, + { 0x12D8, 0x1310, 1}, + { 0x1312, 0x1315, 1}, + { 0x1318, 0x135A, 1}, +/* ETHIOPIC EXTENDED */ + { 0x1380, 0x138F, 1}, +/* CHEROKEE */ + { 0x13A0, 0x13F4, 1}, +/* UNIFIED CANADIAN ABORIGINAL SYLLABICS */ + { 0x1401, 0x166C, 1}, + { 0x166F, 0x1676, 1}, +/* OGHAM */ + { 0x1681, 0x169A, 1}, +/* RUNIC */ + { 0x16A0, 0x16EA, 1}, + { 0x16EE, 0x16F0, 1}, +/* TAGALOG */ + { 0x1700, 0x170C, 1}, + { 0x170E, 0x1711, 1}, +/* HANUNOO */ + { 0x1720, 0x1731, 1}, +/* BUHID */ + { 0x1740, 0x1751, 1}, +/* TAGBANWA */ + { 0x1760, 0x176C, 1}, + { 0x176E, 0x1770, 1}, +/* KHMER */ + { 0x1780, 0x17B3, 1}, + { 0x17D7, 0, 0}, + { 0x17DC, 0, 0}, +/* MONGOLIAN */ + { 0x1820, 0x1877, 1}, + { 0x1880, 0x18A8, 1}, + { 0x18AA, 0, 0}, +/* LIMBU */ + { 0x1900, 0x191C, 1}, + { 0x1946, 0x194F, 1}, +/* TAI LE */ + { 0x1950, 0x196D, 1}, + { 0x1970, 0x1974, 1}, +/* NEW TAI LUE */ + { 0x1980, 0x19A9, 1}, + { 0x19C1, 0x19C7, 1}, + { 0x19D0, 0x19D9, 1}, +/* BUGINESE */ + { 0x1A00, 0x1A16, 1}, +/* BALINESE */ + { 0x1B05, 0x1B33, 1}, + { 0x1B45, 0x1B4B, 1}, + { 0x1B50, 0x1B59, 1}, +/* SUNDANESE */ + { 0x1B83, 0x1BA0, 1}, + { 0x1BAE, 0x1BAF, 1}, +/* LEPCHA */ + { 0x1C00, 0x1C23, 1}, + { 0x1C4D, 0x1C4F, 1}, +/* OL CHIKI */ + { 0x1C5A, 0x1C7D, 1}, +/* PHONETIC EXTENSIONS */ + { 0x1D00, 0x1DBF, 1}, +/* LATIN EXTENDED ADDITIONAL */ + { 0x1E00, 0x1E9F, 1}, + { 0x1EA0, 0x1EFF, 1}, +/* GREEK EXTENDED */ + { 0x1F00, 0x1F15, 1}, + { 0x1F18, 0x1F1D, 1}, + { 0x1F20, 0x1F45, 1}, + { 0x1F48, 0x1F4D, 1}, + { 0x1F50, 0x1F57, 1}, + { 0x1F59, 0, 0}, + { 0x1F5B, 0, 0}, + { 0x1F5D, 0, 0}, + { 0x1F5F, 0x1F7D, 1}, + { 0x1F80, 0x1FB4, 1}, + { 0x1FB6, 0x1FBC, 1}, + { 0x1FBE, 0, 0}, + { 0x1FC2, 0x1FC4, 1}, + { 0x1FC6, 0x1FCC, 1}, + { 0x1FD0, 0x1FD3, 1}, + { 0x1FD6, 0x1FDB, 1}, + { 0x1FE0, 0x1FEC, 1}, + { 0x1FF2, 0x1FF4, 1}, + { 0x1FF6, 0x1FFC, 1}, +/* SUPERSCRIPTS AND SUBSCRIPTS */ + { 0x2071, 0, 0}, + { 0x207F, 0, 0}, + { 0x2090, 0x2094, 1}, +/* LETTERLIKE SYMBOLS */ + { 0x2102, 0, 0}, + { 0x2107, 0, 0}, + { 0x210A, 0x2113, 1}, + { 0x2115, 0, 0}, + { 0x2119, 0x211D, 1}, + { 0x2124, 0, 0}, + { 0x2126, 0, 0}, + { 0x2128, 0x212D, 1}, + { 0x212F, 0x2139, 1}, + { 0x213C, 0x213F, 1}, + { 0x2145, 0x2149, 1}, + { 0x214E, 0, 0}, +/* NUMBER FORMS */ + { 0x2160, 0x2188, 1}, +/* ENCLOSED ALPHANUMERICS */ + { 0x249C, 0x24E9, 1}, +/* GLAGOLITIC */ + { 0x2C00, 0x2C2E, 1}, + { 0x2C30, 0x2C5E, 1}, +/* LATIN EXTENDED-C */ + { 0x2C60, 0x2C6F, 1}, + { 0x2C71, 0x2C7D, 1}, +/* COPTIC */ + { 0x2C80, 0x2CE4, 1}, +/* GEORGIAN SUPPLEMENT */ + { 0x2D00, 0x2D25, 1}, +/* TIFINAGH */ + { 0x2D30, 0x2D65, 1}, + { 0x2D6F, 0, 0}, +/* ETHIOPIC EXTENDED */ + { 0x2D80, 0x2D96, 1}, + { 0x2DA0, 0x2DA6, 1}, + { 0x2DA8, 0x2DAE, 1}, + { 0x2DB0, 0x2DB6, 1}, + { 0x2DB8, 0x2DBE, 1}, + { 0x2DC0, 0x2DC6, 1}, + { 0x2DC8, 0x2DCE, 1}, + { 0x2DD0, 0x2DD6, 1}, + { 0x2DD8, 0x2DDE, 1}, +/* CJK SYMBOLS AND PUNCTUATION */ + { 0x3005, 0x3007, 1}, + { 0x3021, 0x3029, 1}, + { 0x3031, 0x3035, 1}, + { 0x3038, 0x303C, 1}, +/* HIRAGANA */ + { 0x3041, 0x3096, 1}, + { 0x309D, 0x309F, 1}, +/* KATAKANA */ + { 0x30A1, 0x30FA, 1}, + { 0x30FC, 0x30FF, 1}, +/* BOPOMOFO */ + { 0x3105, 0x312D, 1}, +/* HANGUL COMPATIBILITY JAMO */ + { 0x3131, 0x318E, 1}, +/* BOPOMOFO EXTENDED */ + { 0x31A0, 0x31B7, 1}, +/* KATAKANA PHONETIC EXTENSIONS */ + { 0x31F0, 0x31FF, 1}, +/* CJK UNIFIED IDEOGRAPHS EXTENSION */ + { 0x3400, 0x4DB5, 1}, +/* CJK UNIFIED IDEOGRAPHS */ + { 0x4E00, 0x9FBB, 1}, +/* YI SYLLABLES */ + { 0xA000, 0xA48C, 1}, +/* VAI SYLLABLES */ + { 0xA500, 0xA60B, 1}, + { 0xA610, 0xA61F, 1}, + { 0xA62A, 0xA62B, 1}, +/* CYRILLIC SUPPLEMENT 2 */ + { 0xA640, 0xA65F, 1}, + { 0xA662, 0xA66E, 1}, + { 0xA680, 0xA697, 1}, +/* LATIN EXTENDED-D */ + { 0xA717, 0xA71F, 1}, + { 0xA722, 0xA78C, 1}, + { 0xA7FB, 0xA7FF, 1}, +/* SYLOTI NEGRI */ + { 0xA800, 0, 0}, + { 0xA801, 0, 0}, + { 0xA803, 0xA805, 1}, + { 0xA807, 0xA80A, 1}, + { 0xA80C, 0xA822, 1}, +/* PHAGS PA */ + { 0xA840, 0xA873, 1}, +/* SAURASHTRA */ + { 0xA882, 0xA8B3, 1}, +/* KAYAH LI */ + { 0xA90A, 0xA92D, 1}, +/* REJANG */ + { 0xA930, 0xA946, 1}, +/* CHAM */ + { 0xAA00, 0xAA28, 1}, + { 0xAA40, 0xAA42, 1}, + { 0xAA44, 0xAA4B, 1}, +/* HANGUL SYLLABLES */ + { 0xAC00, 0xD7A3, 1}, +/* CJK COMPATIBILITY IDEOGRAPHS */ + { 0xF900, 0xFA2D, 1}, + { 0xFA30, 0xFA6A, 1}, + { 0xFA70, 0xFAD9, 1}, +/* ALPHABETIC PRESENTATION FORMS */ + { 0xFB00, 0xFB06, 1}, + { 0xFB13, 0xFB17, 1}, + { 0xFB1D, 0, 0}, + { 0xFB1F, 0xFB28, 1}, + { 0xFB2A, 0xFB36, 1}, + { 0xFB38, 0xFB3C, 1}, + { 0xFB3E, 0, 0}, + { 0xFB40, 0, 0}, + { 0xFB41, 0, 0}, + { 0xFB43, 0, 0}, + { 0xFB44, 0, 0}, + { 0xFB46, 0xFB4F, 1}, +/* ARABIC PRESENTATION FORMS-A */ + { 0xFB50, 0xFBB1, 1}, + { 0xFBD3, 0xFD3D, 1}, + { 0xFD50, 0xFD8F, 1}, + { 0xFD92, 0xFDC7, 1}, + { 0xFDF0, 0xFDFB, 1}, +/* ARABIC PRESENTATION FORMS-B */ + { 0xFE70, 0xFE74, 1}, + { 0xFE76, 0xFEFC, 1}, +/* HALFWIDTH AND FULLWIDTH FORMS */ + { 0xFF21, 0xFF3A, 1}, + { 0xFF41, 0xFF5A, 1}, + { 0xFF66, 0xFFBE, 1}, + { 0xFFC2, 0xFFC7, 1}, + { 0xFFCA, 0xFFCF, 1}, + { 0xFFD2, 0xFFD7, 1}, + { 0xFFDA, 0xFFDC, 1}, +/* LINEAR B SYLLABARY */ + {0x00010000, 0x0001000B, 1}, + {0x0001000D, 0x00010026, 1}, + {0x00010028, 0x0001003A, 1}, + {0x0001003C, 0x0001003D, 1}, + {0x0001003F, 0x0001004D, 1}, + {0x00010050, 0x0001005D, 1}, +/* LINEAR B IDEOGRAMS */ + {0x00010080, 0x000100FA, 1}, +/* ANCIENT GREEK NUMBERS */ + {0x00010140, 0x00010174, 1}, +/* LYCIAN */ + {0x00010280, 0x0001029C, 1}, +/* CARIAN */ + {0x000102A0, 0x000102D0, 1}, +/* OLD ITALIC */ + {0x00010300, 0x0001031E, 1}, +/* GOTHIC */ + {0x00010330, 0x0001034A, 1}, +/* UGARITIC */ + {0x00010380, 0x0001039D, 1}, +/* OLD PERSIAN */ + {0x000103A0, 0x000103C3, 1}, + {0x000103C8, 0x000103CF, 1}, + {0x000103D1, 0x000103D5, 1}, +/* DESERET */ + {0x00010400, 0x0001044F, 1}, +/* SHAVIAN */ + {0x00010450, 0x0001047F, 1}, +/* OSMANYA */ + {0x00010480, 0x0001049D, 1}, + {0x000104A0, 0x000104A9, 1}, +/* CYPRIOT SYLLABARY */ + {0x00010800, 0x00010805, 1}, + {0x00010808, 0, 0}, + {0x0001080A, 0x00010835, 1}, + {0x00010837, 0x00010838, 1}, + {0x0001083C, 0, 0}, + {0x0001083F, 0, 0}, +/* PHOENICIAN */ + {0x00010900, 0x00010915, 1}, + {0x00010A00, 0, 0}, + {0x00010A10, 0x00010A13, 1}, +/* KHAROSHTI */ + {0x00010A15, 0x00010A17, 1}, + {0x00010A19, 0x00010A33, 1}, +/* CUNEIFORM */ + {0x00012000, 0x0001236E, 1}, +/* CUNEIFORM NUMBERS AND PONCTUATION */ + {0x00012400, 0x00012462, 1}, +/* BYZANTINE MUSICAL SYMBOLS */ +/* MATHEMATICAL ALPHANUMERIC SYMBOLS */ + {0x0001D400, 0x0001D454, 1}, + {0x0001D456, 0x0001D49C, 1}, + {0x0001D49E, 0x0001D49F, 1}, + {0x0001D4A2, 0, 0}, + {0x0001D4A5, 0x0001D4A6, 1}, + {0x0001D4A9, 0x0001D4AC, 1}, + {0x0001D4AE, 0x0001D4B9, 1}, + {0x0001D4BB, 0, 0}, + {0x0001D4BD, 0x0001D4C3, 1}, + {0x0001D4C5, 0x0001D505, 1}, + {0x0001D507, 0x0001D50A, 1}, + {0x0001D50D, 0x0001D514, 1}, + {0x0001D516, 0x0001D51C, 1}, + {0x0001D51E, 0x0001D539, 1}, + {0x0001D53B, 0x0001D53E, 1}, + {0x0001D540, 0x0001D544, 1}, + {0x0001D546, 0, 0}, + {0x0001D54A, 0x0001D550, 1}, + {0x0001D552, 0x0001D6A5, 1}, + {0x0001D6A8, 0x0001D6C0, 1}, + {0x0001D6C2, 0x0001D6DA, 1}, + {0x0001D6DC, 0x0001D6FA, 1}, + {0x0001D6FC, 0x0001D714, 1}, + {0x0001D716, 0x0001D734, 1}, + {0x0001D736, 0x0001D74E, 1}, + {0x0001D750, 0x0001D76E, 1}, + {0x0001D770, 0x0001D788, 1}, + {0x0001D78A, 0x0001D7A8, 1}, + {0x0001D7AA, 0x0001D7C2, 1}, + {0x0001D7C4, 0x0001D7CB, 1}, + {0x0001D7CE, 0x0001D7FF, 1}, +/* CJK UNIFIED IDEOGRAPHS EXTENSION */ + {0x00020000, 0x0002A6D6, 1}, +/* CJK COMPATIBILITY IDEOGRAPHS SUPPLEMENT */ + {0x0002F800, 0x0002FA1D, 1}, +/* The non-ASCII number characters are included here because ISO C 99 */ +/* forbids us to classify them as digits; however, they behave more like */ +/* alphanumeric than like punctuation. */ +/* ARABIC */ + { 0x0660, 0x0669, 1}, + { 0x06F0, 0x06F9, 1}, +/* DEVANAGARI */ + { 0x0966, 0x096F, 1}, +/* BENGALI */ + { 0x09E6, 0x09EF, 1}, +/* GURMUKHI */ + { 0x0A66, 0x0A6F, 1}, +/* GUJARATI */ + { 0x0AE6, 0x0AEF, 1}, +/* ORIYA */ + { 0x0B66, 0x0B6F, 1}, +/* TAMIL */ + { 0x0BE6, 0x0BEF, 1}, +/* TELUGU */ + { 0x0C66, 0x0C6F, 1}, + { 0x0C78, 0x0C7F, 1}, +/* KANNADA */ + { 0x0CE6, 0x0CEF, 1}, +/* MALAYALAM */ + { 0x0D66, 0x0D75, 1}, + { 0x0D70, 0x0D75, 1}, +/* THAI */ + { 0x0E50, 0x0E59, 1}, +/* LAO */ + { 0x0ED0, 0x0ED9, 1}, +/* TIBETAN */ + { 0x0F20, 0x0F29, 1}, +/* MYANMAR */ + { 0x1040, 0x1049, 1}, +/* KHMER */ + { 0x17E0, 0x17E9, 1}, +/* MONGOLIAN */ + { 0x1810, 0x1819, 1}, +/* SUNDANESE */ + { 0x1BB0, 0x1BB9, 1}, +/* LEPCHA */ + { 0x1C40, 0x1C49, 1}, +/* OL CHIKI */ + { 0x1C50, 0x1C59, 1}, +/* VAI */ + { 0xA620, 0xA629, 1}, +/* SAURASHTRA */ + { 0xA8D0, 0xA8D9, 1}, +/* KAYAH LI */ + { 0xA900, 0xA909, 1}, +/* CHAM */ + { 0xAA50, 0xAA59, 1}, +/* HALFWIDTH AND FULLWIDTH FORMS */ + { 0xFF10, 0xFF19, 1}, + +/* END::UTF8TABLE */ +}; + +static inline size_t utf8towc(unsigned *wc, const unsigned char *uc, size_t len) +{ + unsigned char ub = utf8_mblen[*uc]; + + if (!ub || ub > len || ub > 3) { + return 0; + } + + *wc = *uc & utf8_mask[ub]; + + switch (ub) { + case 4: + if ((uc[1] & 0xc0) != 0x80) { + return 0; + } + *wc <<= 6; + *wc += *++uc & 0x3f; + /* no break */ + case 3: + if ((uc[1] & 0xc0) != 0x80) { + return 0; + } + *wc <<= 6; + *wc += *++uc & 0x3f; + /* no break */ + case 2: + if ((uc[1] & 0xc0) != 0x80) { + return 0; + } + *wc <<= 6; + *wc += *++uc & 0x3f; + /* no break */ + case 1: + break; + + default: + return 0; + } + + return ub; +} + +static inline zend_bool isualpha(unsigned ch) +{ + unsigned i; + + for (i = 0; i < sizeof(utf8_ranges)/sizeof(utf8_range_t); ++i) { + if (utf8_ranges[i].start == ch) { + return 1; + } else if (utf8_ranges[i].start <= ch && utf8_ranges[i].end >= ch) { + if (utf8_ranges[i].step == 1) { + return 1; + } + /* FIXME step */ + return 0; + } + } + return 0; +} + +static inline zend_bool isualnum(unsigned ch) +{ + /* digits */ + if (ch >= 0x30 && ch <= 0x39) { + return 1; + } + return isualpha(ch); +} + +#endif /* PHP_HTTP_UTF8_H */ + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim600: noet sw=4 ts=4 fdm=marker + * vim<600: noet sw=4 ts=4 + */ diff --git a/tests/urlparser001.phpt b/tests/urlparser001.phpt new file mode 100644 index 0000000..73bd9d4 --- /dev/null +++ b/tests/urlparser001.phpt @@ -0,0 +1,191 @@ +--TEST-- +url parser +--SKIPIF-- + +--FILE-- + +DONE +--EXPECTF-- +Test + +s: +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +ss: +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +s:a +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + string(1) "a" + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +ss:aa +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + string(2) "aa" + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +s:// +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +ss:// +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +s://a +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(1) "a" + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +ss://aa +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(2) "aa" + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} +DONE diff --git a/tests/urlparser002.phpt b/tests/urlparser002.phpt new file mode 100644 index 0000000..be1cd66 --- /dev/null +++ b/tests/urlparser002.phpt @@ -0,0 +1,211 @@ +--TEST-- +url parser with paths +--SKIPIF-- + +--FILE-- + +DONE +--EXPECTF-- +Test + +s:a/ +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + string(2) "a/" + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +ss:aa/ +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + string(3) "aa/" + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +s:/a/ +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + string(3) "/a/" + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +ss:/aa/ +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + string(4) "/aa/" + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +s://a/ +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(1) "a" + ["port"]=> + NULL + ["path"]=> + string(1) "/" + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +s://h/a +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(1) "h" + ["port"]=> + NULL + ["path"]=> + string(2) "/a" + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +ss://hh/aa +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(2) "hh" + ["port"]=> + NULL + ["path"]=> + string(3) "/aa" + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +s:///a/b +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + string(4) "/a/b" + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +ss:///aa/bb +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + string(6) "/aa/bb" + ["query"]=> + NULL + ["fragment"]=> + NULL +} +DONE diff --git a/tests/urlparser003.phpt b/tests/urlparser003.phpt new file mode 100644 index 0000000..68b1e4a --- /dev/null +++ b/tests/urlparser003.phpt @@ -0,0 +1,274 @@ +--TEST-- +url parser with query +--SKIPIF-- + +--FILE-- + +DONE +--EXPECTF-- +Test + +s:?q +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + string(1) "q" + ["fragment"]=> + NULL +} + +ss:?qq +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + string(2) "qq" + ["fragment"]=> + NULL +} + +s:/?q +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + string(1) "/" + ["query"]=> + string(1) "q" + ["fragment"]=> + NULL +} + +ss:/?qq +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + string(1) "/" + ["query"]=> + string(2) "qq" + ["fragment"]=> + NULL +} + +s://?q +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + string(1) "q" + ["fragment"]=> + NULL +} + +ss://?qq +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + string(2) "qq" + ["fragment"]=> + NULL +} + +s://h?q +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(1) "h" + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + string(1) "q" + ["fragment"]=> + NULL +} + +ss://hh?qq +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(2) "hh" + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + string(2) "qq" + ["fragment"]=> + NULL +} + +s://h/p?q +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(1) "h" + ["port"]=> + NULL + ["path"]=> + string(2) "/p" + ["query"]=> + string(1) "q" + ["fragment"]=> + NULL +} + +ss://hh/pp?qq +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(2) "hh" + ["port"]=> + NULL + ["path"]=> + string(3) "/pp" + ["query"]=> + string(2) "qq" + ["fragment"]=> + NULL +} + +s://h:123/p/?q +object(http\Url)#%d (8) { + ["scheme"]=> + string(1) "s" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(1) "h" + ["port"]=> + int(123) + ["path"]=> + string(3) "/p/" + ["query"]=> + string(1) "q" + ["fragment"]=> + NULL +} + +ss://hh:123/pp/?qq +object(http\Url)#%d (8) { + ["scheme"]=> + string(2) "ss" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(2) "hh" + ["port"]=> + int(123) + ["path"]=> + string(4) "/pp/" + ["query"]=> + string(2) "qq" + ["fragment"]=> + NULL +} +DONE diff --git a/tests/urlparser004.phpt b/tests/urlparser004.phpt new file mode 100644 index 0000000..3aa57fd --- /dev/null +++ b/tests/urlparser004.phpt @@ -0,0 +1,89 @@ +--TEST-- +url parser multibyte/locale +--SKIPIF-- + +--FILE-- + +DONE +--EXPECTF-- +Test + +sçheme: +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +sçheme://hƟst +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(5) "hƟst" + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +sçheme://hƟst:23/päth/öf/fıle +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(5) "hƟst" + ["port"]=> + int(23) + ["path"]=> + string(16) "/päth/öf/fıle" + ["query"]=> + NULL + ["fragment"]=> + NULL +} +DONE diff --git a/tests/urlparser005.phpt b/tests/urlparser005.phpt new file mode 100644 index 0000000..ff18fe4 --- /dev/null +++ b/tests/urlparser005.phpt @@ -0,0 +1,85 @@ +--TEST-- +url parser multibyte/utf-8 +--SKIPIF-- + +--FILE-- + +DONE +--EXPECTF-- +Test + +sçheme: +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +sçheme://hƟst +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(5) "hƟst" + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +sçheme://hƟst:23/päth/öf/fıle +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(5) "hƟst" + ["port"]=> + int(23) + ["path"]=> + string(16) "/päth/öf/fıle" + ["query"]=> + NULL + ["fragment"]=> + NULL +} +DONE diff --git a/tests/urlparser006.phpt b/tests/urlparser006.phpt new file mode 100644 index 0000000..c35a20f --- /dev/null +++ b/tests/urlparser006.phpt @@ -0,0 +1,90 @@ +--TEST-- +url parser multibyte/locale/idna +--SKIPIF-- + +--FILE-- + +DONE +--EXPECTF-- +Test + +sçheme: +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +sçheme://hƟst +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(11) "xn--hst-kwb" + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +sçheme://hƟst:23/päth/öf/fıle +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(11) "xn--hst-kwb" + ["port"]=> + int(23) + ["path"]=> + string(16) "/päth/öf/fıle" + ["query"]=> + NULL + ["fragment"]=> + NULL +} +DONE diff --git a/tests/urlparser007.phpt b/tests/urlparser007.phpt new file mode 100644 index 0000000..702302b --- /dev/null +++ b/tests/urlparser007.phpt @@ -0,0 +1,88 @@ +--TEST-- +url parser multibyte/utf-8/idna +--SKIPIF-- + +--FILE-- + +DONE +--EXPECTF-- +Test + +sçheme: +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + NULL + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +sçheme://hƟst +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(11) "xn--hst-kwb" + ["port"]=> + NULL + ["path"]=> + NULL + ["query"]=> + NULL + ["fragment"]=> + NULL +} + +sçheme://hƟst:23/päth/öf/fıle +object(http\Url)#%d (8) { + ["scheme"]=> + string(7) "sçheme" + ["user"]=> + NULL + ["pass"]=> + NULL + ["host"]=> + string(11) "xn--hst-kwb" + ["port"]=> + int(23) + ["path"]=> + string(16) "/päth/öf/fıle" + ["query"]=> + NULL + ["fragment"]=> + NULL +} +DONE diff --git a/ualpha.h b/ualpha.h deleted file mode 100644 index 688777d..0000000 --- a/ualpha.h +++ /dev/null @@ -1,654 +0,0 @@ -typedef struct utf8_range { - unsigned int start; - unsigned int end; - unsigned char step; -} utf8_range_t; - -static const utf8_range_t utf8_ranges[] = { -/* BASIC LATIN */ - { 0x0041, 0x005A, 1}, - { 0x0061, 0x007A, 1}, -/* LATIN-1 SUPPLEMENT */ - { 0x00AA, 0, 0}, - { 0x00B5, 0, 0}, - { 0x00BA, 0, 0}, - { 0x00C0, 0x00D6, 1}, - { 0x00D8, 0x00F6, 1}, - { 0x00F8, 0x00FF, 1}, -/* LATIN EXTENDED-A */ - { 0x0100, 0x017F, 1}, -/* LATIN EXTENDED-B */ - { 0x0180, 0x024F, 1}, -/* IPA EXTENSIONS */ - { 0x0250, 0x02AF, 1}, -/* SPACING MODIFIER LETTERS */ - { 0x02B0, 0x02C1, 1}, - { 0x02C6, 0x02D1, 1}, - { 0x02E0, 0x02E4, 1}, - { 0x02EE, 0, 0}, -/* COMBINING DIACRITICAL MARKS */ - { 0x0345, 0, 0}, -/* BASIC GREEK */ - { 0x0370, 0x0373, 1}, - { 0x0376, 0x0377, 1}, - { 0x037A, 0x037D, 1}, - { 0x0386, 0, 0}, - { 0x0388, 0x038A, 1}, - { 0x038C, 0, 0}, - { 0x038E, 0x03A1, 1}, - { 0x03A3, 0x03CE, 1}, -/* GREEK SYMBOLS AND COPTIC */ - { 0x03D0, 0x03F5, 1}, - { 0x03F7, 0x03FF, 1}, -/* CYRILLIC */ - { 0x0400, 0x0481, 1}, - { 0x048A, 0x04FF, 1}, -/* CYRILLIC SUPPLEMENT */ - { 0x0500, 0x0523, 1}, -/* ARMENIAN */ - { 0x0531, 0x0556, 1}, - { 0x0559, 0, 0}, - { 0x0561, 0x0587, 1}, -/* HEBREW */ - { 0x05D0, 0x05EA, 1}, - { 0x05F0, 0x05F2, 1}, -/* ARABIC */ - { 0x0621, 0x064A, 1}, - { 0x066E, 0x066F, 1}, - { 0x0671, 0x06D3, 1}, - { 0x06D5, 0, 0}, - { 0x06E5, 0x06E6, 1}, - { 0x06EE, 0x06EF, 1}, - { 0x06FA, 0x06FC, 1}, - { 0x06FF, 0, 0}, -/* SYRIAC */ - { 0x0710, 0, 0}, - { 0x0712, 0x072F, 1}, - { 0x074D, 0x074F, 1}, -/* ARABIC SUPPLEMENT */ - { 0x0750, 0x077F, 1}, -/* THAANA */ - { 0x0780, 0x07A5, 1}, - { 0x07B1, 0, 0}, -/* NKO */ - { 0x07C0, 0x07EA, 1}, - { 0x07F4, 0x07F5, 1}, - { 0x07FA, 0, 0}, -/* - All Matras of Indic and Sinhala are moved from punct to alpha class */ -/* - Added Unicode 5.1 charctares of Indic scripts */ -/* DEVANAGARI */ - { 0x0901, 0x0939, 1}, - { 0x093C, 0x094D, 1}, - { 0x0950, 0x0954, 1}, - { 0x0958, 0x0961, 1}, - { 0x0962, 0, 0}, - { 0x0963, 0, 0}, - { 0x0972, 0, 0}, - { 0x097B, 0x097F, 1}, -/* TABLE 18 BENGALI */ - { 0x0981, 0x0983, 1}, - { 0x0985, 0x098C, 1}, - { 0x098F, 0, 0}, - { 0x0990, 0, 0}, - { 0x0993, 0x09A8, 1}, - { 0x09AA, 0x09B0, 1}, - { 0x09B2, 0, 0}, - { 0x09B6, 0x09B9, 1}, - { 0x09BC, 0x09C4, 1}, - { 0x09C7, 0, 0}, - { 0x09C8, 0, 0}, - { 0x09CB, 0x09CE, 1}, - { 0x09D7, 0, 0}, - { 0x09DC, 0, 0}, - { 0x09DD, 0, 0}, - { 0x09DF, 0x09E3, 1}, - { 0x09F0, 0x09FA, 1}, -/* GURMUKHI */ - { 0x0A01, 0x0A03, 1}, - { 0x0A05, 0x0A0A, 1}, - { 0x0A0F, 0, 0}, - { 0x0A10, 0, 0}, - { 0x0A13, 0x0A28, 1}, - { 0x0A2A, 0x0A30, 1}, - { 0x0A32, 0, 0}, - { 0x0A33, 0, 0}, - { 0x0A35, 0, 0}, - { 0x0A36, 0, 0}, - { 0x0A38, 0, 0}, - { 0x0A39, 0, 0}, - { 0x0A3C, 0, 0}, - { 0x0A3E, 0x0A42, 1}, - { 0x0A47, 0, 0}, - { 0x0A48, 0, 0}, - { 0x0A4B, 0x0A4D, 1}, - { 0x0A51, 0, 0}, - { 0x0A59, 0x0A5C, 1}, - { 0x0A5E, 0, 0}, - { 0x0A70, 0x0A75, 1}, -/* GUJARATI */ - { 0x0A81, 0x0A83, 1}, - { 0x0A85, 0x0A8D, 1}, - { 0x0A8F, 0x0A91, 1}, - { 0x0A93, 0x0AA8, 1}, - { 0x0AAA, 0x0AB0, 1}, - { 0x0AB2, 0, 0}, - { 0x0AB3, 0, 0}, - { 0x0AB5, 0x0AB9, 1}, - { 0x0ABC, 0x0AC5, 1}, - { 0x0AC7, 0x0AC9, 1}, - { 0x0ACB, 0x0ACD, 1}, - { 0x0AD0, 0, 0}, - { 0x0AE0, 0x0AE3, 1}, - { 0x0AF1, 0, 0}, -/* ORIYA */ - { 0x0B01, 0x0B03, 1}, - { 0x0B05, 0x0B0C, 1}, - { 0x0B0F, 0, 0}, - { 0x0B10, 0, 0}, - { 0x0B13, 0x0B28, 1}, - { 0x0B2A, 0x0B30, 1}, - { 0x0B32, 0, 0}, - { 0x0B33, 0, 0}, - { 0x0B35, 0x0B39, 1}, - { 0x0B3C, 0x0B44, 1}, - { 0x0B47, 0x0B48, 1}, - { 0x0B4B, 0x0B4D, 1}, - { 0x0B56, 0x0B57, 1}, - { 0x0B5C, 0, 0}, - { 0x0B5D, 0, 0}, - { 0x0B5F, 0x0B63, 1}, - { 0x0B70, 0, 0}, - { 0x0B71, 0, 0}, -/* TAMIL */ - { 0x0B82, 0, 0}, - { 0x0B83, 0, 0}, - { 0x0B85, 0x0B8A, 1}, - { 0x0B8E, 0x0B90, 1}, - { 0x0B92, 0x0B95, 1}, - { 0x0B99, 0, 0}, - { 0x0B9A, 0, 0}, - { 0x0B9C, 0, 0}, - { 0x0B9E, 0, 0}, - { 0x0B9F, 0, 0}, - { 0x0BA3, 0, 0}, - { 0x0BA4, 0, 0}, - { 0x0BA8, 0x0BAA, 1}, - { 0x0BAE, 0x0BB9, 1}, - { 0x0BBE, 0x0BC2, 1}, - { 0x0BC6, 0x0BC8, 1}, - { 0x0BCA, 0x0BCD, 1}, - { 0x0BD0, 0, 0}, - { 0x0BD7, 0, 0}, - { 0x0BF0, 0x0BFA, 1}, -/* TELUGU */ - { 0x0C01, 0x0C03, 1}, - { 0x0C05, 0x0C0C, 1}, - { 0x0C0E, 0x0C10, 1}, - { 0x0C12, 0x0C28, 1}, - { 0x0C2A, 0x0C33, 1}, - { 0x0C35, 0x0C39, 1}, - { 0x0C3D, 0x0C44, 1}, - { 0x0C46, 0x0C48, 1}, - { 0x0C4A, 0x0C4D, 1}, - { 0x0C55, 0x0C56, 1}, - { 0x0C58, 0x0C59, 1}, - { 0x0C60, 0x0C63, 1}, -/* KANNADA */ - { 0x0C82, 0x0C83, 1}, - { 0x0C85, 0x0C8C, 1}, - { 0x0C8E, 0x0C90, 1}, - { 0x0C92, 0x0CA8, 1}, - { 0x0CAA, 0x0CB3, 1}, - { 0x0CB5, 0x0CB9, 1}, - { 0x0CBC, 0x0CC4, 1}, - { 0x0CC6, 0x0CC8, 1}, - { 0x0CCA, 0x0CCD, 1}, - { 0x0CD5, 0x0CD6, 1}, - { 0x0CDE, 0, 0}, - { 0x0CE0, 0x0CE3, 1}, - { 0x0CF1, 0, 0}, - { 0x0CF2, 0, 0}, -/* MALAYALAM */ - { 0x0D02, 0x0D03, 1}, - { 0x0D05, 0x0D0C, 1}, - { 0x0D0E, 0x0D10, 1}, - { 0x0D12, 0x0D28, 1}, - { 0x0D2A, 0x0D39, 1}, - { 0x0D3D, 0x0D44, 1}, - { 0x0D46, 0x0D48, 1}, - { 0x0D4A, 0x0D4D, 1}, - { 0x0D57, 0, 0}, - { 0x0D60, 0x0D63, 1}, - { 0x0D79, 0x0D7F, 1}, -/* SINHALA */ - { 0x0D82, 0x0D83, 1}, - { 0x0D85, 0x0D96, 1}, - { 0x0D9A, 0x0DB1, 1}, - { 0x0DB3, 0x0DBB, 1}, - { 0x0DBD, 0, 0}, - { 0x0DC0, 0x0DC6, 1}, - { 0x0DCA, 0, 0}, - { 0x0DCF, 0x0DD4, 1}, - { 0x0DD6, 0, 0}, - { 0x0DD8, 0x0DDF, 1}, - { 0x0DF2, 0x0DF4, 1}, -/* THAI */ - { 0x0E01, 0x0E2E, 1}, - { 0x0E30, 0x0E3A, 1}, - { 0x0E40, 0x0E45, 1}, - { 0x0E47, 0x0E4E, 1}, -/* LAO */ - { 0x0E81, 0x0E82, 1}, - { 0x0E84, 0, 0}, - { 0x0E87, 0x0E88, 1}, - { 0x0E8A, 0, 0}, - { 0x0E8D, 0, 0}, - { 0x0E94, 0x0E97, 1}, - { 0x0E99, 0x0E9F, 1}, - { 0x0EA1, 0x0EA3, 1}, - { 0x0EA5, 0, 0}, - { 0x0EA7, 0, 0}, - { 0x0EAA, 0x0EAB, 1}, - { 0x0EAD, 0x0EB0, 1}, - { 0x0EB2, 0x0EB3, 1}, - { 0x0EBD, 0, 0}, - { 0x0EC0, 0x0EC4, 1}, - { 0x0EC6, 0, 0}, - { 0x0EDC, 0x0EDD, 1}, -/* TIBETAN */ - { 0x0F00, 0, 0}, - { 0x0F40, 0x0F47, 1}, - { 0x0F49, 0x0F6C, 1}, - { 0x0F88, 0x0F8B, 1}, -/* MYANMAR */ - { 0x1000, 0x102A, 1}, - { 0x1050, 0x1055, 1}, - { 0x105A, 0x105D, 1}, - { 0x1061, 0, 0}, - { 0x0165, 0, 0}, - { 0x1066, 0, 0}, - { 0x106E, 0x1070, 1}, - { 0x1075, 0x1081, 1}, - { 0x108E, 0, 0}, -/* GEORGIAN */ - { 0x10A0, 0x10C5, 1}, - { 0x10D0, 0x10FA, 1}, - { 0x10FC, 0, 0}, -/* HANGUL JAMO */ - { 0x1100, 0x1159, 1}, - { 0x115F, 0x11A2, 1}, - { 0x11A8, 0x11F9, 1}, -/* ETHIOPIC */ - { 0x1200, 0x1248, 1}, - { 0x124A, 0x124D, 1}, - { 0x1250, 0x1256, 1}, - { 0x1258, 0, 0}, - { 0x125A, 0x125D, 1}, - { 0x1260, 0x1288, 1}, - { 0x128A, 0x128D, 1}, - { 0x1290, 0x12B0, 1}, - { 0x12B2, 0x12B5, 1}, - { 0x12B8, 0x12BE, 1}, - { 0x12C0, 0, 0}, - { 0x12C2, 0x12C5, 1}, - { 0x12C8, 0x12D6, 1}, - { 0x12D8, 0x1310, 1}, - { 0x1312, 0x1315, 1}, - { 0x1318, 0x135A, 1}, -/* ETHIOPIC EXTENDED */ - { 0x1380, 0x138F, 1}, -/* CHEROKEE */ - { 0x13A0, 0x13F4, 1}, -/* UNIFIED CANADIAN ABORIGINAL SYLLABICS */ - { 0x1401, 0x166C, 1}, - { 0x166F, 0x1676, 1}, -/* OGHAM */ - { 0x1681, 0x169A, 1}, -/* RUNIC */ - { 0x16A0, 0x16EA, 1}, - { 0x16EE, 0x16F0, 1}, -/* TAGALOG */ - { 0x1700, 0x170C, 1}, - { 0x170E, 0x1711, 1}, -/* HANUNOO */ - { 0x1720, 0x1731, 1}, -/* BUHID */ - { 0x1740, 0x1751, 1}, -/* TAGBANWA */ - { 0x1760, 0x176C, 1}, - { 0x176E, 0x1770, 1}, -/* KHMER */ - { 0x1780, 0x17B3, 1}, - { 0x17D7, 0, 0}, - { 0x17DC, 0, 0}, -/* MONGOLIAN */ - { 0x1820, 0x1877, 1}, - { 0x1880, 0x18A8, 1}, - { 0x18AA, 0, 0}, -/* LIMBU */ - { 0x1900, 0x191C, 1}, - { 0x1946, 0x194F, 1}, -/* TAI LE */ - { 0x1950, 0x196D, 1}, - { 0x1970, 0x1974, 1}, -/* NEW TAI LUE */ - { 0x1980, 0x19A9, 1}, - { 0x19C1, 0x19C7, 1}, - { 0x19D0, 0x19D9, 1}, -/* BUGINESE */ - { 0x1A00, 0x1A16, 1}, -/* BALINESE */ - { 0x1B05, 0x1B33, 1}, - { 0x1B45, 0x1B4B, 1}, - { 0x1B50, 0x1B59, 1}, -/* SUNDANESE */ - { 0x1B83, 0x1BA0, 1}, - { 0x1BAE, 0x1BAF, 1}, -/* LEPCHA */ - { 0x1C00, 0x1C23, 1}, - { 0x1C4D, 0x1C4F, 1}, -/* OL CHIKI */ - { 0x1C5A, 0x1C7D, 1}, -/* PHONETIC EXTENSIONS */ - { 0x1D00, 0x1DBF, 1}, -/* LATIN EXTENDED ADDITIONAL */ - { 0x1E00, 0x1E9F, 1}, - { 0x1EA0, 0x1EFF, 1}, -/* GREEK EXTENDED */ - { 0x1F00, 0x1F15, 1}, - { 0x1F18, 0x1F1D, 1}, - { 0x1F20, 0x1F45, 1}, - { 0x1F48, 0x1F4D, 1}, - { 0x1F50, 0x1F57, 1}, - { 0x1F59, 0, 0}, - { 0x1F5B, 0, 0}, - { 0x1F5D, 0, 0}, - { 0x1F5F, 0x1F7D, 1}, - { 0x1F80, 0x1FB4, 1}, - { 0x1FB6, 0x1FBC, 1}, - { 0x1FBE, 0, 0}, - { 0x1FC2, 0x1FC4, 1}, - { 0x1FC6, 0x1FCC, 1}, - { 0x1FD0, 0x1FD3, 1}, - { 0x1FD6, 0x1FDB, 1}, - { 0x1FE0, 0x1FEC, 1}, - { 0x1FF2, 0x1FF4, 1}, - { 0x1FF6, 0x1FFC, 1}, -/* SUPERSCRIPTS AND SUBSCRIPTS */ - { 0x2071, 0, 0}, - { 0x207F, 0, 0}, - { 0x2090, 0x2094, 1}, -/* LETTERLIKE SYMBOLS */ - { 0x2102, 0, 0}, - { 0x2107, 0, 0}, - { 0x210A, 0x2113, 1}, - { 0x2115, 0, 0}, - { 0x2119, 0x211D, 1}, - { 0x2124, 0, 0}, - { 0x2126, 0, 0}, - { 0x2128, 0x212D, 1}, - { 0x212F, 0x2139, 1}, - { 0x213C, 0x213F, 1}, - { 0x2145, 0x2149, 1}, - { 0x214E, 0, 0}, -/* NUMBER FORMS */ - { 0x2160, 0x2188, 1}, -/* ENCLOSED ALPHANUMERICS */ - { 0x249C, 0x24E9, 1}, -/* GLAGOLITIC */ - { 0x2C00, 0x2C2E, 1}, - { 0x2C30, 0x2C5E, 1}, -/* LATIN EXTENDED-C */ - { 0x2C60, 0x2C6F, 1}, - { 0x2C71, 0x2C7D, 1}, -/* COPTIC */ - { 0x2C80, 0x2CE4, 1}, -/* GEORGIAN SUPPLEMENT */ - { 0x2D00, 0x2D25, 1}, -/* TIFINAGH */ - { 0x2D30, 0x2D65, 1}, - { 0x2D6F, 0, 0}, -/* ETHIOPIC EXTENDED */ - { 0x2D80, 0x2D96, 1}, - { 0x2DA0, 0x2DA6, 1}, - { 0x2DA8, 0x2DAE, 1}, - { 0x2DB0, 0x2DB6, 1}, - { 0x2DB8, 0x2DBE, 1}, - { 0x2DC0, 0x2DC6, 1}, - { 0x2DC8, 0x2DCE, 1}, - { 0x2DD0, 0x2DD6, 1}, - { 0x2DD8, 0x2DDE, 1}, -/* CJK SYMBOLS AND PUNCTUATION */ - { 0x3005, 0x3007, 1}, - { 0x3021, 0x3029, 1}, - { 0x3031, 0x3035, 1}, - { 0x3038, 0x303C, 1}, -/* HIRAGANA */ - { 0x3041, 0x3096, 1}, - { 0x309D, 0x309F, 1}, -/* KATAKANA */ - { 0x30A1, 0x30FA, 1}, - { 0x30FC, 0x30FF, 1}, -/* BOPOMOFO */ - { 0x3105, 0x312D, 1}, -/* HANGUL COMPATIBILITY JAMO */ - { 0x3131, 0x318E, 1}, -/* BOPOMOFO EXTENDED */ - { 0x31A0, 0x31B7, 1}, -/* KATAKANA PHONETIC EXTENSIONS */ - { 0x31F0, 0x31FF, 1}, -/* CJK UNIFIED IDEOGRAPHS EXTENSION */ - { 0x3400, 0x4DB5, 1}, -/* CJK UNIFIED IDEOGRAPHS */ - { 0x4E00, 0x9FBB, 1}, -/* YI SYLLABLES */ - { 0xA000, 0xA48C, 1}, -/* VAI SYLLABLES */ - { 0xA500, 0xA60B, 1}, - { 0xA610, 0xA61F, 1}, - { 0xA62A, 0xA62B, 1}, -/* CYRILLIC SUPPLEMENT 2 */ - { 0xA640, 0xA65F, 1}, - { 0xA662, 0xA66E, 1}, - { 0xA680, 0xA697, 1}, -/* LATIN EXTENDED-D */ - { 0xA717, 0xA71F, 1}, - { 0xA722, 0xA78C, 1}, - { 0xA7FB, 0xA7FF, 1}, -/* SYLOTI NEGRI */ - { 0xA800, 0, 0}, - { 0xA801, 0, 0}, - { 0xA803, 0xA805, 1}, - { 0xA807, 0xA80A, 1}, - { 0xA80C, 0xA822, 1}, -/* PHAGS PA */ - { 0xA840, 0xA873, 1}, -/* SAURASHTRA */ - { 0xA882, 0xA8B3, 1}, -/* KAYAH LI */ - { 0xA90A, 0xA92D, 1}, -/* REJANG */ - { 0xA930, 0xA946, 1}, -/* CHAM */ - { 0xAA00, 0xAA28, 1}, - { 0xAA40, 0xAA42, 1}, - { 0xAA44, 0xAA4B, 1}, -/* HANGUL SYLLABLES */ - { 0xAC00, 0xD7A3, 1}, -/* CJK COMPATIBILITY IDEOGRAPHS */ - { 0xF900, 0xFA2D, 1}, - { 0xFA30, 0xFA6A, 1}, - { 0xFA70, 0xFAD9, 1}, -/* ALPHABETIC PRESENTATION FORMS */ - { 0xFB00, 0xFB06, 1}, - { 0xFB13, 0xFB17, 1}, - { 0xFB1D, 0, 0}, - { 0xFB1F, 0xFB28, 1}, - { 0xFB2A, 0xFB36, 1}, - { 0xFB38, 0xFB3C, 1}, - { 0xFB3E, 0, 0}, - { 0xFB40, 0, 0}, - { 0xFB41, 0, 0}, - { 0xFB43, 0, 0}, - { 0xFB44, 0, 0}, - { 0xFB46, 0xFB4F, 1}, -/* ARABIC PRESENTATION FORMS-A */ - { 0xFB50, 0xFBB1, 1}, - { 0xFBD3, 0xFD3D, 1}, - { 0xFD50, 0xFD8F, 1}, - { 0xFD92, 0xFDC7, 1}, - { 0xFDF0, 0xFDFB, 1}, -/* ARABIC PRESENTATION FORMS-B */ - { 0xFE70, 0xFE74, 1}, - { 0xFE76, 0xFEFC, 1}, -/* HALFWIDTH AND FULLWIDTH FORMS */ - { 0xFF21, 0xFF3A, 1}, - { 0xFF41, 0xFF5A, 1}, - { 0xFF66, 0xFFBE, 1}, - { 0xFFC2, 0xFFC7, 1}, - { 0xFFCA, 0xFFCF, 1}, - { 0xFFD2, 0xFFD7, 1}, - { 0xFFDA, 0xFFDC, 1}, -/* LINEAR B SYLLABARY */ - {0x00010000, 0x0001000B, 1}, - {0x0001000D, 0x00010026, 1}, - {0x00010028, 0x0001003A, 1}, - {0x0001003C, 0x0001003D, 1}, - {0x0001003F, 0x0001004D, 1}, - {0x00010050, 0x0001005D, 1}, -/* LINEAR B IDEOGRAMS */ - {0x00010080, 0x000100FA, 1}, -/* ANCIENT GREEK NUMBERS */ - {0x00010140, 0x00010174, 1}, -/* LYCIAN */ - {0x00010280, 0x0001029C, 1}, -/* CARIAN */ - {0x000102A0, 0x000102D0, 1}, -/* OLD ITALIC */ - {0x00010300, 0x0001031E, 1}, -/* GOTHIC */ - {0x00010330, 0x0001034A, 1}, -/* UGARITIC */ - {0x00010380, 0x0001039D, 1}, -/* OLD PERSIAN */ - {0x000103A0, 0x000103C3, 1}, - {0x000103C8, 0x000103CF, 1}, - {0x000103D1, 0x000103D5, 1}, -/* DESERET */ - {0x00010400, 0x0001044F, 1}, -/* SHAVIAN */ - {0x00010450, 0x0001047F, 1}, -/* OSMANYA */ - {0x00010480, 0x0001049D, 1}, - {0x000104A0, 0x000104A9, 1}, -/* CYPRIOT SYLLABARY */ - {0x00010800, 0x00010805, 1}, - {0x00010808, 0, 0}, - {0x0001080A, 0x00010835, 1}, - {0x00010837, 0x00010838, 1}, - {0x0001083C, 0, 0}, - {0x0001083F, 0, 0}, -/* PHOENICIAN */ - {0x00010900, 0x00010915, 1}, - {0x00010A00, 0, 0}, - {0x00010A10, 0x00010A13, 1}, -/* KHAROSHTI */ - {0x00010A15, 0x00010A17, 1}, - {0x00010A19, 0x00010A33, 1}, -/* CUNEIFORM */ - {0x00012000, 0x0001236E, 1}, -/* CUNEIFORM NUMBERS AND PONCTUATION */ - {0x00012400, 0x00012462, 1}, -/* BYZANTINE MUSICAL SYMBOLS */ -/* MATHEMATICAL ALPHANUMERIC SYMBOLS */ - {0x0001D400, 0x0001D454, 1}, - {0x0001D456, 0x0001D49C, 1}, - {0x0001D49E, 0x0001D49F, 1}, - {0x0001D4A2, 0, 0}, - {0x0001D4A5, 0x0001D4A6, 1}, - {0x0001D4A9, 0x0001D4AC, 1}, - {0x0001D4AE, 0x0001D4B9, 1}, - {0x0001D4BB, 0, 0}, - {0x0001D4BD, 0x0001D4C3, 1}, - {0x0001D4C5, 0x0001D505, 1}, - {0x0001D507, 0x0001D50A, 1}, - {0x0001D50D, 0x0001D514, 1}, - {0x0001D516, 0x0001D51C, 1}, - {0x0001D51E, 0x0001D539, 1}, - {0x0001D53B, 0x0001D53E, 1}, - {0x0001D540, 0x0001D544, 1}, - {0x0001D546, 0, 0}, - {0x0001D54A, 0x0001D550, 1}, - {0x0001D552, 0x0001D6A5, 1}, - {0x0001D6A8, 0x0001D6C0, 1}, - {0x0001D6C2, 0x0001D6DA, 1}, - {0x0001D6DC, 0x0001D6FA, 1}, - {0x0001D6FC, 0x0001D714, 1}, - {0x0001D716, 0x0001D734, 1}, - {0x0001D736, 0x0001D74E, 1}, - {0x0001D750, 0x0001D76E, 1}, - {0x0001D770, 0x0001D788, 1}, - {0x0001D78A, 0x0001D7A8, 1}, - {0x0001D7AA, 0x0001D7C2, 1}, - {0x0001D7C4, 0x0001D7CB, 1}, - {0x0001D7CE, 0x0001D7FF, 1}, -/* CJK UNIFIED IDEOGRAPHS EXTENSION */ - {0x00020000, 0x0002A6D6, 1}, -/* CJK COMPATIBILITY IDEOGRAPHS SUPPLEMENT */ - {0x0002F800, 0x0002FA1D, 1}, -/* The non-ASCII number characters are included here because ISO C 99 */ -/* forbids us to classify them as digits; however, they behave more like */ -/* alphanumeric than like punctuation. */ -/* ARABIC */ - { 0x0660, 0x0669, 1}, - { 0x06F0, 0x06F9, 1}, -/* DEVANAGARI */ - { 0x0966, 0x096F, 1}, -/* BENGALI */ - { 0x09E6, 0x09EF, 1}, -/* GURMUKHI */ - { 0x0A66, 0x0A6F, 1}, -/* GUJARATI */ - { 0x0AE6, 0x0AEF, 1}, -/* ORIYA */ - { 0x0B66, 0x0B6F, 1}, -/* TAMIL */ - { 0x0BE6, 0x0BEF, 1}, -/* TELUGU */ - { 0x0C66, 0x0C6F, 1}, - { 0x0C78, 0x0C7F, 1}, -/* KANNADA */ - { 0x0CE6, 0x0CEF, 1}, -/* MALAYALAM */ - { 0x0D66, 0x0D75, 1}, - { 0x0D70, 0x0D75, 1}, -/* THAI */ - { 0x0E50, 0x0E59, 1}, -/* LAO */ - { 0x0ED0, 0x0ED9, 1}, -/* TIBETAN */ - { 0x0F20, 0x0F29, 1}, -/* MYANMAR */ - { 0x1040, 0x1049, 1}, -/* KHMER */ - { 0x17E0, 0x17E9, 1}, -/* MONGOLIAN */ - { 0x1810, 0x1819, 1}, -/* SUNDANESE */ - { 0x1BB0, 0x1BB9, 1}, -/* LEPCHA */ - { 0x1C40, 0x1C49, 1}, -/* OL CHIKI */ - { 0x1C50, 0x1C59, 1}, -/* VAI */ - { 0xA620, 0xA629, 1}, -/* SAURASHTRA */ - { 0xA8D0, 0xA8D9, 1}, -/* KAYAH LI */ - { 0xA900, 0xA909, 1}, -/* CHAM */ - { 0xAA50, 0xAA59, 1}, -/* HALFWIDTH AND FULLWIDTH FORMS */ - { 0xFF10, 0xFF19, 1}, - {0, 0, 0} -};