experimental urlparser
authorMichael Wallner <mike@php.net>
Mon, 27 Oct 2014 19:39:37 +0000 (20:39 +0100)
committerMichael Wallner <mike@php.net>
Thu, 6 Nov 2014 08:12:56 +0000 (09:12 +0100)
config9.m4
ctype.php [new file with mode: 0644]
php_http_api.h
php_http_url.c
php_http_url.h
ualpha.h [new file with mode: 0644]

index 686b5ea286aaaca9349956f228a55c45eeef4457..aa39772576806d389e067ebe335bd7cdde072808 100644 (file)
@@ -10,6 +10,8 @@ PHP_ARG_WITH([http-libcurl-dir], [],
 [  --with-http-libcurl-dir[=DIR]  HTTP: where to find libcurl], $PHP_HTTP, $PHP_HTTP)
 PHP_ARG_WITH([http-libevent-dir], [],
 [  --with-http-libevent-dir[=DIR] HTTP: where to find libevent], $PHP_HTTP_LIBCURL_DIR, "")
+PHP_ARG_WITH([http-libidn-dir], [],
+[  --with-http-libidn-dir=[=DIR]  HTTP: where to find libidn], $PHP_HTTP_LIBCURL_DIR, "")
 
 if test "$PHP_HTTP" != "no"; then
 
@@ -97,9 +99,34 @@ dnl STDC
 dnl ----
        AC_TYPE_OFF_T
        dnl getdomainname() is declared in netdb.h on some platforms: AIX, OSF
-       AC_CHECK_HEADERS([netdb.h unistd.h])
+       AC_CHECK_HEADERS([netdb.h unistd.h wchar.h wctype.h langinfo.h])
        PHP_CHECK_FUNC(gethostname, nsl)
        PHP_CHECK_FUNC(getdomainname, nsl)
+       PHP_CHECK_FUNC(mbrtowc)
+       PHP_CHECK_FUNC(mbtowc)
+       PHP_CHECK_FUNC(iswalnum)
+       PHP_CHECK_FUNC(nl_langinfo)
+
+dnl ----
+dnl IDN
+dnl ----
+
+       AC_MSG_CHECKING([for idna.h])
+       IDNA_DIR=
+       for i in "$PHP_HTTP_LIBIDN_DIR" "$IDN_DIR" /usr/local /usr /opt; do
+               if test -f "$i/include/idna.h"; then
+                       IDNA_DIR=$i
+                       break;
+               fi
+       done
+       if test "x$IDNA_DIR" = "x"; then
+               AC_MSG_RESULT([not found])
+       else
+               AC_MSG_RESULT([found in $IDNA_DIR])
+               AC_DEFINE([PHP_HTTP_HAVE_IDN], [1], [Have libidn support])
+               PHP_ADD_INCLUDE($IDNA_DIR/include)
+               PHP_ADD_LIBRARY_WITH_PATH(idn, $IDNA_DIR/$PHP_LIBDIR, HTTP_SHARED_LIBADD)
+       fi
 
 dnl ----
 dnl ZLIB
diff --git a/ctype.php b/ctype.php
new file mode 100644 (file)
index 0000000..c670096
--- /dev/null
+++ b/ctype.php
@@ -0,0 +1,104 @@
+<?php
+
+error_reporting(E_ALL);
+set_error_handler(function($c, $e, $f, $l) {
+       throw new Exception("$e in $f on line $l");
+});
+
+$i18n = $argc >= 2 ? $argv[1] : "/usr/share/i18n/locales/i18n";
+
+$f = fopen($i18n, "r");
+$c = false;
+$a = false;
+$r = array();
+
+print <<<C
+typedef struct utf8_range {
+       unsigned int start;
+       unsigned int end;
+       unsigned char step;
+} utf8_range_t;
+
+static const utf8_range_t utf8_ranges[] = {
+
+C;
+while (!feof($f)) {
+       $line = fgets($f);
+       if (!$c && $line !== "LC_CTYPE\n") {
+               continue;
+       }
+       $c = true;
+       if ($line === "END LC_CTYPE\n") {
+               break;
+       }
+       switch($line{0}) {
+       case "%":
+               if ($a) {
+                       printf("/* %s */\n", trim($line, "%\n/ "));
+               }
+               break;
+       case "\n":
+               if ($a) {
+                       break 2;
+               }
+               break;
+       case " ":
+               if ($a) {
+                       foreach (explode(";", trim($line, "\n/ ;")) as $ranges) {
+                               $range = explode("..", $ranges);
+                               $step = 0;
+                               $end = 0;
+                               switch (count($range)) {
+                               case 3:
+                                       list($sstart, $sstep, $send) = $range;
+                                       sscanf($sstart, "<U%X>", $start);
+                                       sscanf($sstep, "(%d)", $step);
+                                       sscanf($send, "<U%X>", $end);
+
+                                       break;
+                               case 2:
+                                       list($sstart, $send) = $range;
+                                       $step = 1;
+                                       sscanf($sstart, "<U%X>", $start);
+                                       sscanf($send, "<U%X>", $end);
+                                       break;
+                               case 1:
+                                       list($sstart) = $range;
+                                       sscanf($sstart, "<U%X>", $start);
+                                       break;
+                               }
+                               print "\t{";
+                               if ($start >= 0xffff) {
+                                       printf("0x%08X, ", $start);
+                                       if ($end) {
+                                               printf("0x%08X, ", $end);
+                                       } else {
+                                               print("         0, ");
+                                       }
+                               } else {
+                                       printf("    0x%04X, ", $start);
+                                       if ($end) {
+                                               printf("    0x%04X, ", $end);
+                                       } else {
+                                               print("         0, ");
+                                       }
+                               }
+                               printf("%d},\n", $step);
+                       }
+               }
+               break;
+       default:
+               if ($a) {
+                       break 2;
+               } elseif ($line === "alpha /\n") {
+                       $a = true;
+               }
+               break;
+       }
+}
+
+print <<<C
+       {0, 0, 0}
+};
+
+C;
index 0e65ccb6315f3c9483de784e1abd8cd23909449b..5bddb0c42e670a69fe0bdc6049614b97953354f5 100644 (file)
@@ -68,6 +68,10 @@ typedef int STATUS;
 #      endif
 #endif
 
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTYPE_H) && defined(HAVE_ISWALNUM) && (defined(HAVE_MBRTOWC) || defined(HAVE_MBTOWC))
+#      define PHP_HTTP_HAVE_WCHAR 1
+#endif
+
 #include <ctype.h>
 #define PHP_HTTP_IS_CTYPE(type, c) is##type((int) (unsigned char) (c))
 #define PHP_HTTP_TO_CTYPE(type, c) to##type((int) (unsigned char) (c))
index 7c8077b6a31995208a8a16da0ed6789d68bd2878..bed03d6ae5bf84a42c5db36cbb3e8ead43bbaaf0 100644 (file)
 
 #include "php_http_api.h"
 
+#ifdef PHP_HTTP_HAVE_IDN
+#      include <idna.h>
+#endif
+
+#ifdef PHP_HTTP_HAVE_WCHAR
+#      include <wchar.h>
+#      include <wctype.h>
+#endif
+
+#ifdef HAVE_LANGINFO_H
+#      include <langinfo.h>
+#endif
+#include <locale.h>
+
 static inline char *localhostname(void)
 {
        char hostname[1024] = {0};
@@ -299,6 +313,714 @@ STATUS php_http_url_encode_hash_ex(HashTable *hash, php_http_buffer_t *qstr, con
        return SUCCESS;
 }
 
+void php_http_url_dtor(php_http_url_t *url)
+{
+       STR_FREE(url->scheme.str);
+       STR_FREE(url->authority.userinfo.username.str);
+       STR_FREE(url->authority.userinfo.password.str);
+       STR_FREE(url->authority.host.str);
+       STR_FREE(url->path.str);
+       STR_FREE(url->query.str);
+       STR_FREE(url->fragment.str);
+}
+
+void php_http_url_free(php_http_url_t **url)
+{
+       if (*url) {
+               php_http_url_dtor(*url);
+               efree(*url);
+               *url = NULL;
+       }
+}
+
+#ifdef PHP_HTTP_HAVE_WCHAR
+static zend_bool cs_is_utf8(char **lc_ctype)
+{
+#if HAVE_NL_LANGINFO
+       if (strcmp("UTF-8", nl_langinfo(CODESET))) {
+               *lc_ctype = setlocale(LC_CTYPE, NULL);
+               return 0;
+       }
+       return 1;
+#else
+       *lc_ctype = setlocale(LC_CTYPE, NULL);
+
+       if (*lc_ctype) {
+               char *cs;
+
+               if ((cs = strstr(*lc_ctype, ".utf")) || (cs = strstr(*lc_ctype, ".UTF"))) {
+                       if (cs[4] == '-') {
+                               ++cs;
+                       }
+                       if (cs[4] == '8' && (cs[5] == '\0' || cs[5] == '@')) {
+                               return 1;
+                       }
+               }
+               return 0;
+       }
+#endif
+}
+
+static const unsigned char utf8mblen[256] = {
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
+};
+static const unsigned char utf8mask[] = {
+               0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01
+};
+
+static size_t utf8towc(wchar_t *wc, const unsigned char *uc, size_t len)
+{
+       unsigned char ub = utf8mblen[*uc];
+
+       if (!ub || ub > len || ub > 3) {
+               return 0;
+       }
+
+       *wc = *uc & utf8mask[ub];
+
+       switch (ub) {
+       case 4:
+               if ((uc[1] & 0xc0) != 0x80) {
+                       return 0;
+               }
+               *wc <<= 6;
+               *wc += *++uc & 0x3f;
+               /* no break */
+       case 3:
+               if ((uc[1] & 0xc0) != 0x80) {
+                       return 0;
+               }
+               *wc <<= 6;
+               *wc += *++uc & 0x3f;
+               /* no break */
+       case 2:
+               if ((uc[1] & 0xc0) != 0x80) {
+                       return 0;
+               }
+               *wc <<= 6;
+               *wc += *++uc & 0x3f;
+               break;
+
+       default:
+               return 0;
+       }
+
+       return ub;
+}
+
+static size_t parse_locmb(php_http_url_t *url, const char *ptr, const char *end)
+{
+       wchar_t wchar;
+       size_t consumed = 0;
+#if defined(HAVE_MBRTOWC)
+       mbstate_t ps = {0};
+
+       consumed = mbrtowc(&wchar, ptr, end - ptr, &ps);
+#elif defined(HAVE_MBTOWC)
+       consumed = mbtowc(&wchar, ptr, end - ptr);
+#endif
+
+       if (!consumed || consumed == (size_t) -1 || !iswalnum(wchar)) {
+               return 0;
+       }
+
+       return consumed - 1;
+}
+
+#include "ualpha.h"
+
+static zend_bool isualnum(wchar_t ch)
+{
+       unsigned i;
+
+       /* digits */
+       if (ch >= 0x30 && ch <= 0x39) {
+               return 1;
+       }
+       for (i = 0; i < sizeof(utf8_ranges)/sizeof(utf8_range_t); ++i) {
+               if (utf8_ranges[i].start == ch) {
+                       return 1;
+               } else if (utf8_ranges[i].start <= ch && utf8_ranges[i].end >= ch) {
+                       if (utf8_ranges[i].step == 1) {
+                               return 1;
+                       }
+                       /* FIXME step */
+                       return 0;
+               }
+       }
+       return 0;
+}
+
+static size_t parse_utf8mb(php_http_url_t *url, const char *ptr, const char *end)
+{
+       char *lc_ctype = NULL;
+
+       if (0 && cs_is_utf8(&lc_ctype)) {
+               return parse_locmb(url, ptr, end);
+       } else {
+               wchar_t wchar;
+               size_t consumed = utf8towc(&wchar, (const unsigned char *) ptr, end - ptr);
+
+               if (!consumed || consumed == (size_t) -1 || !isualnum(wchar)) {
+                       return 0;
+               }
+
+               return consumed -1 ;
+       }
+}
+#endif
+
+static STATUS parse_userinfo(php_http_url_t *url, const char *ptr, const char *end)
+{
+       const char *password = NULL, *tmp = ptr;
+       TSRMLS_FETCH_FROM_CTX(url->ts);
+
+       do {
+               switch (*ptr) {
+               case ':':
+                       if (password) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                               "Failed to parse password; duplicate ':' at pos %u in '%s'",
+                                               (unsigned) (ptr - tmp), tmp);
+                               return FAILURE;
+                       }
+                       password = ptr + 1;
+                       break;
+
+               case '%':
+                       if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                               "Failed to parse userinfo; invalid percent encoding at pos %u in '%s'",
+                                               (unsigned) (ptr - tmp), tmp);
+                               return FAILURE;
+                       }
+                       ptr += 2;
+                       break;
+
+               case '!': case '$': case '&': case '\'': case '(': case ')': case '*':
+               case '+': case ',': case ';': case '=': /* sub-delims */
+               case '-': case '.': case '_': case '~': /* unreserved */
+               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+               case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+               case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+               case 'V': case 'W': case 'X': case 'Y': case 'Z':
+               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+               case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+               case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+               case 'v': case 'w': case 'x': case 'y': case 'z':
+               case '0': case '1': case '2': case '3': case '4': case '5': case '6':
+               case '7': case '8': case '9':
+                       /* allowed */
+                       break;
+
+               default:
+                       if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) {
+                               size_t n = parse_utf8mb(url, ptr, end);
+
+                               if (n) {
+                                       ptr += n;
+                                       break;
+                               }
+                       }
+#ifdef PHP_HTTP_HAVE_WCHAR
+                       else if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) {
+                               size_t n = parse_locmb(url, ptr, end);
+
+                               if (n) {
+                                       ptr += n;
+                                       break;
+                               }
+                       }
+#endif
+                       php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                       "Failed to parse userinfo; unexpected byte 0x%02x at pos %u in '%s'",
+                                       *ptr, (unsigned) (ptr - tmp), tmp);
+               }
+       } while(++ptr != end);
+
+       if (password) {
+               url->authority.userinfo.username.len = password - tmp - 1;
+               url->authority.userinfo.username.str = estrndup(tmp,
+                               url->authority.userinfo.username.len);
+               url->authority.userinfo.password.len = end - password;
+               url->authority.userinfo.password.str = estrndup(password,
+                               url->authority.userinfo.password.len);
+       } else {
+               url->authority.userinfo.username.len = end - tmp;
+               url->authority.userinfo.username.str = estrndup(tmp,
+                               url->authority.userinfo.username.len);
+       }
+
+       return SUCCESS;
+}
+
+static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *end)
+{
+       const char *tmp = ptr, *port = NULL;
+       TSRMLS_FETCH_FROM_CTX(url->ts);
+
+       /* FIXME: IP(v6) addresses */
+       do {
+               switch (*ptr) {
+               case ':':
+                       if (port) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                               "Failed to parse port; duplicate ':' at pos %u in '%s'",
+                                               (unsigned) (ptr - tmp), tmp);
+                               return FAILURE;
+                       }
+                       port = ptr + 1;
+                       break;
+
+               case '%':
+                       if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                               "Failed to parse hostinfo; invalid percent encoding at pos %u in '%s'",
+                                               (unsigned) (ptr - tmp), tmp);
+                               return FAILURE;
+                       }
+                       ptr += 2;
+                       break;
+
+               case '!': case '$': case '&': case '\'': case '(': case ')': case '*':
+               case '+': case ',': case ';': case '=': /* sub-delims */
+               case '-': case '.': case '_': case '~': /* unreserved */
+               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+               case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+               case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+               case 'V': case 'W': case 'X': case 'Y': case 'Z':
+               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+               case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+               case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+               case 'v': case 'w': case 'x': case 'y': case 'z':
+                       if (port) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                               "Failed to parse port; unexpected char '%c' at pos %u in '%s'",
+                                               *ptr, (unsigned) (ptr - tmp), tmp);
+                               return FAILURE;
+                       }
+                       /* no break */
+               case '0': case '1': case '2': case '3': case '4': case '5': case '6':
+               case '7': case '8': case '9':
+                       /* allowed */
+                       if (port) {
+                               url->authority.port *= 10;
+                               url->authority.port += *ptr - '0';
+                       }
+                       break;
+
+               default:
+                       if (!port) {
+                               if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) {
+                                       size_t n = parse_utf8mb(url, ptr, end);
+
+                                       if (n) {
+                                               ptr += n;
+                                               break;
+                                       }
+                               }
+#ifdef PHP_HTTP_HAVE_WCHAR
+                               else if ((url->flags & PHP_HTTP_URL_PARSE_LOCMB) || (url->flags & PHP_HTTP_URL_PARSE_LOCIDN)) {
+                                       size_t n = parse_locmb(url, ptr, end);
+
+                                       if (n) {
+                                               ptr += n;
+                                               break;
+                                       }
+                               }
+#endif
+                       }
+                       php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                       "Failed to parse hostinfo; unexpected byte 0x%02x at pos %u in '%s'",
+                                       (unsigned char) *ptr, (unsigned) (ptr - tmp), tmp);
+                       return FAILURE;
+               }
+       } while (++ptr != end);
+
+       if (port) {
+               url->authority.host.len = port - tmp - 1;
+       } else {
+               url->authority.host.len = end - tmp;
+       }
+
+       url->authority.host.str = estrndup(tmp, url->authority.host.len);
+
+#ifdef PHP_HTTP_HAVE_IDN
+       if (url->flags & PHP_HTTP_URL_PARSE_UTF8IDN) {
+               char *idn = NULL;
+               int rv = idna_to_ascii_8z(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES);
+
+               if (rv != IDNA_SUCCESS) {
+                       php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Failed to parse IDN: '%s'", idna_strerror(rv));
+               } else {
+                       STR_SET(url->authority.host.str, estrdup(idn));
+                       free(idn);
+               }
+       } else if (url->flags & PHP_HTTP_URL_PARSE_LOCIDN) {
+               char *idn = NULL;
+               int rv = idna_to_ascii_lz(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES);
+
+               if (rv != IDNA_SUCCESS) {
+                       php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Failed to parse IDN: '%s'", idna_strerror(rv));
+               } else {
+                       STR_SET(url->authority.host.str, estrdup(idn));
+                       free(idn);
+               }
+       }
+#endif
+
+       return SUCCESS;
+}
+
+static const char *parse_authority(php_http_url_t *url, const char *ptr, const char *end)
+{
+       const char *tmp = ptr;
+
+       do {
+               switch (*ptr) {
+               case '@':
+                       /* userinfo delimiter */
+                       if (tmp != ptr && SUCCESS != parse_userinfo(url, tmp, ptr)) {
+                               return NULL;
+                       }
+                       tmp = ptr + 1;
+                       break;
+
+               case '/':
+               case '?':
+               case '#':
+               case '\0':
+                       /* host delimiter */
+                       if (tmp != ptr && SUCCESS != parse_hostinfo(url, tmp, ptr)) {
+                               return NULL;
+                       }
+                       return ptr;
+               }
+       } while (++ptr <= end);
+
+       return NULL;
+}
+
+static const char *parse_path(php_http_url_t *url, const char *ptr, const char *end)
+{
+       const char *tmp = ptr;
+       TSRMLS_FETCH_FROM_CTX(url->ts);
+
+       do {
+               switch (*ptr) {
+               case '?':
+               case '\0':
+                       url->path.len = ptr - tmp;
+                       url->path.str = estrndup(tmp, url->path.len);
+                       return ptr;
+
+               case '%':
+                       if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                               "Failed to parse path; invalid percent encoding at pos %u in '%s'",
+                                               (unsigned) (ptr - tmp), tmp);
+                               return NULL;
+                       }
+                       ptr += 2;
+                       break;
+
+               case '/': /* yeah, well */
+               case '!': case '$': case '&': case '\'': case '(': case ')': case '*':
+               case '+': case ',': case ';': case '=': /* sub-delims */
+               case '-': case '.': case '_': case '~': /* unreserved */
+               case ':': case '@': /* pchar */
+               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+               case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+               case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+               case 'V': case 'W': case 'X': case 'Y': case 'Z':
+               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+               case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+               case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+               case 'v': case 'w': case 'x': case 'y': case 'z':
+               case '0': case '1': case '2': case '3': case '4': case '5': case '6':
+               case '7': case '8': case '9':
+                       /* allowed */
+                       break;
+
+               default:
+                       if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) {
+                               size_t n = parse_utf8mb(url, ptr, end);
+
+                               if (n) {
+                                       ptr += n;
+                                       break;
+                               }
+                       }
+#if PHP_HTTP_HAVE_WCHAR
+                       else if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) {
+                               size_t n = parse_locmb(url, ptr, end);
+
+                               if (n) {
+                                       ptr += n;
+                                       break;
+                               }
+                       }
+#endif
+                       php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                       "Failed to parse path; unexpected byte 0x%02x pos %u in '%s'",
+                                       *ptr, (unsigned) (ptr - tmp), tmp);
+               }
+       } while (++ptr <= end);
+
+       return NULL;
+}
+
+static const char *parse_query(php_http_url_t *url, const char *ptr, const char *end)
+{
+       const char *tmp = ptr + !!*ptr;
+       TSRMLS_FETCH_FROM_CTX(url->ts);
+
+       do {
+               switch (*ptr) {
+               case '#':
+               case '\0':
+                       url->query.len = ptr - tmp;
+                       url->query.str = estrndup(tmp, url->query.len);
+                       return ptr;
+
+               case '%':
+                       if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                               "Failed to parse query; invalid percent encoding at pos %u in '%s'",
+                                               (unsigned) (ptr - tmp), tmp);
+                               return NULL;
+                       }
+                       ptr += 2;
+                       break;
+
+               case '?': case '/': /* yeah, well */
+               case '!': case '$': case '&': case '\'': case '(': case ')': case '*':
+               case '+': case ',': case ';': case '=': /* sub-delims */
+               case '-': case '.': case '_': case '~': /* unreserved */
+               case ':': case '@': /* pchar */
+               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+               case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+               case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+               case 'V': case 'W': case 'X': case 'Y': case 'Z':
+               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+               case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+               case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+               case 'v': case 'w': case 'x': case 'y': case 'z':
+               case '0': case '1': case '2': case '3': case '4': case '5': case '6':
+               case '7': case '8': case '9':
+                       /* allowed */
+                       break;
+
+               default:
+#ifdef PHP_HTTP_HAVE_WCHAR
+                       if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) {
+                               size_t n = parse_locmb(url, ptr, end);
+
+                               if (n) {
+                                       ptr += n;
+                                       break;
+                               }
+                       }
+#endif
+                       php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                       "Failed to parse query; unexpected byte 0x%02x at pos %u in '%s'",
+                                       *ptr, (unsigned) (ptr - tmp), tmp);
+               }
+       } while (++ptr <= end);
+
+       return NULL;
+}
+
+static const char *parse_fragment(php_http_url_t *url, const char *ptr, const char *end)
+{
+       const char *tmp = ptr + !!*ptr;
+       TSRMLS_FETCH_FROM_CTX(url->ts);
+
+       do {
+               switch (*ptr) {
+               case '\0':
+                       url->fragment.len = ptr - tmp;
+                       url->fragment.str = estrndup(tmp, url->fragment.len);
+                       return ptr;
+
+               case '%':
+                       if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                               "Failed to parse query; invalid percent encoding at pos %u in '%s'",
+                                               (unsigned) (ptr - tmp), tmp);
+                               return NULL;
+                       }
+                       ptr += 2;
+                       break;
+
+               case '?': case '/': /* yeah, well */
+               case '!': case '$': case '&': case '\'': case '(': case ')': case '*':
+               case '+': case ',': case ';': case '=': /* sub-delims */
+               case '-': case '.': case '_': case '~': /* unreserved */
+               case ':': case '@': /* pchar */
+               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+               case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+               case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+               case 'V': case 'W': case 'X': case 'Y': case 'Z':
+               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+               case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+               case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+               case 'v': case 'w': case 'x': case 'y': case 'z':
+               case '0': case '1': case '2': case '3': case '4': case '5': case '6':
+               case '7': case '8': case '9':
+                       /* allowed */
+                       break;
+
+               default:
+#if PHP_HTTP_HAVE_WCHAR
+                       if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) {
+                               size_t n = parse_locmb(url, ptr, end);
+
+                               if (n) {
+                                       ptr += n;
+                                       break;
+                               }
+                       }
+#endif
+                       php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                       "Failed to parse fragment; unexpected byte 0x%02x at pos %u in '%s'",
+                                       *ptr, (unsigned) (ptr - tmp), tmp);
+               }
+       } while (++ptr <= end);
+
+       return NULL;
+}
+
+static const char *parse_hier(php_http_url_t *url, const char *ptr, const char *end)
+{
+       if (*ptr == '/') {
+               if (end - ptr > 1) {
+                       if (*(ptr + 1) == '/') {
+                               if (!(ptr = parse_authority(url, ptr + 2, end))) {
+                                       return NULL;
+                               }
+                       }
+               }
+       }
+       return parse_path(url, ptr, end);
+}
+
+static const char *parse_scheme(php_http_url_t *url, const char *ptr, const char *end)
+{
+       const char *tmp = ptr;
+
+       do {
+               switch (*ptr) {
+               case ':':
+                       /* scheme delimiter */
+                       url->scheme.len = ptr - tmp;
+                       url->scheme.str = estrndup(tmp, url->scheme.len);
+                       return ++ptr;
+
+               case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+               case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+               case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+               case 'V': case 'W': case 'X': case 'Y': case 'Z':
+               case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+               case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+               case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+               case 'v': case 'w': case 'x': case 'y': case 'z':
+               case '0': case '1': case '2': case '3': case '4': case '5': case '6':
+               case '7': case '8': case '9':
+               case '+': case '-': case '.':
+                       /* scheme part */
+                       break;
+
+               default:
+#ifdef PHP_HTTP_HAVE_WCHAR
+                       if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) {
+                               size_t n = parse_locmb(url, ptr, end);
+
+                               if (n) {
+                                       ptr += n;
+                                       break;
+                               }
+                       }
+#endif
+                       /* no scheme */
+                       return tmp;
+               }
+       } while (++ptr != end);
+
+       return tmp;
+}
+
+php_http_url_t *php_http_url_init(php_http_url_t *url, const char *str, size_t len, unsigned flags TSRMLS_DC)
+{
+       const char *ptr, *end = str + len;
+       zend_bool free_url = !url;
+
+       if (url) {
+               memset(url, 0, sizeof(*url));
+       } else {
+               url = ecalloc(1, sizeof(*url));
+       }
+
+       url->flags = flags;
+       TSRMLS_SET_CTX(url->ts);
+
+       if ((ptr = str) && !(str = parse_scheme(url, ptr, end))) {
+               php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL scheme: '%s'", ptr);
+               if (free_url) {
+                       php_http_url_free(&url);
+               } else {
+                       php_http_url_dtor(url);
+               }
+               return NULL;
+       }
+
+       if ((ptr = str) && !(str = parse_hier(url, ptr, end))) {
+               if (free_url) {
+                       php_http_url_free(&url);
+               } else {
+                       php_http_url_dtor(url);
+               }
+               return NULL;
+       }
+
+       if ((ptr = str) && !(str = parse_query(url, ptr, end))) {
+               php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL query: '%s'", ptr);
+               if (free_url) {
+                       php_http_url_free(&url);
+               } else {
+                       php_http_url_dtor(url);
+               }
+               return NULL;
+       }
+
+       if ((ptr = str) && !(str = parse_fragment(url, ptr, end))) {
+               php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL fragment: '%s'", ptr);
+               if (free_url) {
+                       php_http_url_free(&url);
+               } else {
+                       php_http_url_dtor(url);
+               }
+               return NULL;
+       }
+
+       return url;
+}
+
 ZEND_BEGIN_ARG_INFO_EX(ai_HttpUrl___construct, 0, 0, 0)
        ZEND_ARG_INFO(0, old_url)
        ZEND_ARG_INFO(0, new_url)
@@ -461,12 +1183,41 @@ PHP_METHOD(HttpUrl, toArray)
        php_url_free(purl);
 }
 
+ZEND_BEGIN_ARG_INFO_EX(ai_HttpUrl_parse, 0, 0, 1)
+       ZEND_ARG_INFO(0, url)
+       ZEND_ARG_INFO(0, flags)
+ZEND_END_ARG_INFO();
+PHP_METHOD(HttpUrl, parse)
+{
+       char *str;
+       int len;
+       long flags = 0;
+       php_http_url_t url;
+
+       if (SUCCESS != zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &flags)) {
+               return;
+       }
+
+       if (php_http_url_init(&url, str, len, flags TSRMLS_CC)) {
+               printf("  scheme=(%zu)%s\n", url.scheme.len,url.scheme.str);
+               printf("username=(%zu)%s\n", url.authority.userinfo.username.len,url.authority.userinfo.username.str);
+               printf("password=(%zu)%s\n", url.authority.userinfo.password.len,url.authority.userinfo.password.str);
+               printf("    host=(%zu)%s\n", url.authority.host.len,url.authority.host.str);
+               printf("    port=%d\n", (int) url.authority.port);
+               printf("    path=(%zu)%s\n", url.path.len,url.path.str);
+               printf("   query=(%zu)%s\n", url.query.len,url.query.str);
+               printf("fragment=(%zu)%s\n", url.fragment.len,url.fragment.str);
+               php_http_url_dtor(&url);
+       }
+}
+
 static zend_function_entry php_http_url_methods[] = {
        PHP_ME(HttpUrl, __construct,  ai_HttpUrl___construct, ZEND_ACC_PUBLIC|ZEND_ACC_CTOR)
        PHP_ME(HttpUrl, mod,          ai_HttpUrl_mod, ZEND_ACC_PUBLIC)
        PHP_ME(HttpUrl, toString,     ai_HttpUrl_toString, ZEND_ACC_PUBLIC)
        ZEND_MALIAS(HttpUrl, __toString, toString, ai_HttpUrl_toString, ZEND_ACC_PUBLIC)
        PHP_ME(HttpUrl, toArray,      ai_HttpUrl_toArray, ZEND_ACC_PUBLIC)
+       PHP_ME(HttpUrl, parse,        ai_HttpUrl_parse, ZEND_ACC_PUBLIC|ZEND_ACC_STATIC)
        EMPTY_FUNCTION_ENTRY
 };
 
@@ -502,6 +1253,15 @@ PHP_MINIT_FUNCTION(http_url)
        zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("FROM_ENV"), PHP_HTTP_URL_FROM_ENV TSRMLS_CC);
        zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("SANITIZE_PATH"), PHP_HTTP_URL_SANITIZE_PATH TSRMLS_CC);
 
+#ifdef PHP_HTTP_HAVE_WCHAR
+       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_LOCMB"), PHP_HTTP_URL_PARSE_LOCMB TSRMLS_CC);
+#endif
+       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_UTF8MB"), PHP_HTTP_URL_PARSE_UTF8MB TSRMLS_CC);
+#ifdef PHP_HTTP_HAVE_IDN
+       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_LOCIDN"), PHP_HTTP_URL_PARSE_LOCIDN TSRMLS_CC);
+       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_UTF8IDN"), PHP_HTTP_URL_PARSE_UTF8IDN TSRMLS_CC);
+#endif
+
        return SUCCESS;
 }
 
index 5c61daa4f1dff75b4c8dd03a399c908800010f8f..a3ea1d87bc97d0e46aa77b1e8f68bcc826f945ff 100644 (file)
 #define PHP_HTTP_URL_FROM_ENV          0x1000
 #define PHP_HTTP_URL_SANITIZE_PATH     0x2000
 
+typedef struct php_http_url_part {
+       char *str;
+       size_t len;
+} php_http_url_part_t;
+
+#define PHP_HTTP_URL_PARSE_LOCMB   0x01
+#define PHP_HTTP_URL_PARSE_UTF8MB  0x02
+#define PHP_HTTP_URL_PARSE_LOCIDN  0x10
+#define PHP_HTTP_URL_PARSE_UTF8IDN 0x20
+
+typedef struct php_http_url {
+       php_http_url_part_t scheme;
+       struct {
+               struct {
+                       php_http_url_part_t username;
+                       php_http_url_part_t password;
+               } userinfo;
+               php_http_url_part_t host;
+               unsigned short port;
+       } authority;
+       php_http_url_part_t path;
+       php_http_url_part_t query;
+       php_http_url_part_t fragment;
+       unsigned flags;
+#ifdef ZTS
+       void ***ts;
+#endif
+} php_http_url_t;
+
+PHP_HTTP_API php_http_url_t *php_http_url_init(php_http_url_t *url, const char *str, size_t len, unsigned flags TSRMLS_DC);
+
 PHP_HTTP_API void php_http_url(int flags, const php_url *old_url, const php_url *new_url, php_url **url_ptr, char **url_str, size_t *url_len TSRMLS_DC);
 
 PHP_HTTP_API STATUS php_http_url_encode_hash(HashTable *hash, const char *pre_encoded_str, size_t pre_encoded_len, char **encoded_str, size_t *encoded_len TSRMLS_DC);
diff --git a/ualpha.h b/ualpha.h
new file mode 100644 (file)
index 0000000..688777d
--- /dev/null
+++ b/ualpha.h
@@ -0,0 +1,654 @@
+typedef struct utf8_range {
+       unsigned int start;
+       unsigned int end;
+       unsigned char step;
+} utf8_range_t;
+
+static const utf8_range_t utf8_ranges[] = {
+/* BASIC LATIN */
+       {    0x0041,     0x005A, 1},
+       {    0x0061,     0x007A, 1},
+/* LATIN-1 SUPPLEMENT */
+       {    0x00AA,          0, 0},
+       {    0x00B5,          0, 0},
+       {    0x00BA,          0, 0},
+       {    0x00C0,     0x00D6, 1},
+       {    0x00D8,     0x00F6, 1},
+       {    0x00F8,     0x00FF, 1},
+/* LATIN EXTENDED-A */
+       {    0x0100,     0x017F, 1},
+/* LATIN EXTENDED-B */
+       {    0x0180,     0x024F, 1},
+/* IPA EXTENSIONS */
+       {    0x0250,     0x02AF, 1},
+/* SPACING MODIFIER LETTERS */
+       {    0x02B0,     0x02C1, 1},
+       {    0x02C6,     0x02D1, 1},
+       {    0x02E0,     0x02E4, 1},
+       {    0x02EE,          0, 0},
+/* COMBINING DIACRITICAL MARKS */
+       {    0x0345,          0, 0},
+/* BASIC GREEK */
+       {    0x0370,     0x0373, 1},
+       {    0x0376,     0x0377, 1},
+       {    0x037A,     0x037D, 1},
+       {    0x0386,          0, 0},
+       {    0x0388,     0x038A, 1},
+       {    0x038C,          0, 0},
+       {    0x038E,     0x03A1, 1},
+       {    0x03A3,     0x03CE, 1},
+/* GREEK SYMBOLS AND COPTIC */
+       {    0x03D0,     0x03F5, 1},
+       {    0x03F7,     0x03FF, 1},
+/* CYRILLIC */
+       {    0x0400,     0x0481, 1},
+       {    0x048A,     0x04FF, 1},
+/* CYRILLIC SUPPLEMENT */
+       {    0x0500,     0x0523, 1},
+/* ARMENIAN */
+       {    0x0531,     0x0556, 1},
+       {    0x0559,          0, 0},
+       {    0x0561,     0x0587, 1},
+/* HEBREW */
+       {    0x05D0,     0x05EA, 1},
+       {    0x05F0,     0x05F2, 1},
+/* ARABIC */
+       {    0x0621,     0x064A, 1},
+       {    0x066E,     0x066F, 1},
+       {    0x0671,     0x06D3, 1},
+       {    0x06D5,          0, 0},
+       {    0x06E5,     0x06E6, 1},
+       {    0x06EE,     0x06EF, 1},
+       {    0x06FA,     0x06FC, 1},
+       {    0x06FF,          0, 0},
+/* SYRIAC */
+       {    0x0710,          0, 0},
+       {    0x0712,     0x072F, 1},
+       {    0x074D,     0x074F, 1},
+/* ARABIC SUPPLEMENT */
+       {    0x0750,     0x077F, 1},
+/* THAANA */
+       {    0x0780,     0x07A5, 1},
+       {    0x07B1,          0, 0},
+/* NKO */
+       {    0x07C0,     0x07EA, 1},
+       {    0x07F4,     0x07F5, 1},
+       {    0x07FA,          0, 0},
+/* - All Matras of Indic and Sinhala are moved from punct to alpha class */
+/* - Added Unicode 5.1 charctares of Indic scripts */
+/* DEVANAGARI */
+       {    0x0901,     0x0939, 1},
+       {    0x093C,     0x094D, 1},
+       {    0x0950,     0x0954, 1},
+       {    0x0958,     0x0961, 1},
+       {    0x0962,          0, 0},
+       {    0x0963,          0, 0},
+       {    0x0972,          0, 0},
+       {    0x097B,     0x097F, 1},
+/* TABLE 18 BENGALI */
+       {    0x0981,     0x0983, 1},
+       {    0x0985,     0x098C, 1},
+       {    0x098F,          0, 0},
+       {    0x0990,          0, 0},
+       {    0x0993,     0x09A8, 1},
+       {    0x09AA,     0x09B0, 1},
+       {    0x09B2,          0, 0},
+       {    0x09B6,     0x09B9, 1},
+       {    0x09BC,     0x09C4, 1},
+       {    0x09C7,          0, 0},
+       {    0x09C8,          0, 0},
+       {    0x09CB,     0x09CE, 1},
+       {    0x09D7,          0, 0},
+       {    0x09DC,          0, 0},
+       {    0x09DD,          0, 0},
+       {    0x09DF,     0x09E3, 1},
+       {    0x09F0,     0x09FA, 1},
+/* GURMUKHI */
+       {    0x0A01,     0x0A03, 1},
+       {    0x0A05,     0x0A0A, 1},
+       {    0x0A0F,          0, 0},
+       {    0x0A10,          0, 0},
+       {    0x0A13,     0x0A28, 1},
+       {    0x0A2A,     0x0A30, 1},
+       {    0x0A32,          0, 0},
+       {    0x0A33,          0, 0},
+       {    0x0A35,          0, 0},
+       {    0x0A36,          0, 0},
+       {    0x0A38,          0, 0},
+       {    0x0A39,          0, 0},
+       {    0x0A3C,          0, 0},
+       {    0x0A3E,     0x0A42, 1},
+       {    0x0A47,          0, 0},
+       {    0x0A48,          0, 0},
+       {    0x0A4B,     0x0A4D, 1},
+       {    0x0A51,          0, 0},
+       {    0x0A59,     0x0A5C, 1},
+       {    0x0A5E,          0, 0},
+       {    0x0A70,     0x0A75, 1},
+/* GUJARATI */
+       {    0x0A81,     0x0A83, 1},
+       {    0x0A85,     0x0A8D, 1},
+       {    0x0A8F,     0x0A91, 1},
+       {    0x0A93,     0x0AA8, 1},
+       {    0x0AAA,     0x0AB0, 1},
+       {    0x0AB2,          0, 0},
+       {    0x0AB3,          0, 0},
+       {    0x0AB5,     0x0AB9, 1},
+       {    0x0ABC,     0x0AC5, 1},
+       {    0x0AC7,     0x0AC9, 1},
+       {    0x0ACB,     0x0ACD, 1},
+       {    0x0AD0,          0, 0},
+       {    0x0AE0,     0x0AE3, 1},
+       {    0x0AF1,          0, 0},
+/* ORIYA */
+       {    0x0B01,     0x0B03, 1},
+       {    0x0B05,     0x0B0C, 1},
+       {    0x0B0F,          0, 0},
+       {    0x0B10,          0, 0},
+       {    0x0B13,     0x0B28, 1},
+       {    0x0B2A,     0x0B30, 1},
+       {    0x0B32,          0, 0},
+       {    0x0B33,          0, 0},
+       {    0x0B35,     0x0B39, 1},
+       {    0x0B3C,     0x0B44, 1},
+       {    0x0B47,     0x0B48, 1},
+       {    0x0B4B,     0x0B4D, 1},
+       {    0x0B56,     0x0B57, 1},
+       {    0x0B5C,          0, 0},
+       {    0x0B5D,          0, 0},
+       {    0x0B5F,     0x0B63, 1},
+       {    0x0B70,          0, 0},
+       {    0x0B71,          0, 0},
+/* TAMIL */
+       {    0x0B82,          0, 0},
+       {    0x0B83,          0, 0},
+       {    0x0B85,     0x0B8A, 1},
+       {    0x0B8E,     0x0B90, 1},
+       {    0x0B92,     0x0B95, 1},
+       {    0x0B99,          0, 0},
+       {    0x0B9A,          0, 0},
+       {    0x0B9C,          0, 0},
+       {    0x0B9E,          0, 0},
+       {    0x0B9F,          0, 0},
+       {    0x0BA3,          0, 0},
+       {    0x0BA4,          0, 0},
+       {    0x0BA8,     0x0BAA, 1},
+       {    0x0BAE,     0x0BB9, 1},
+       {    0x0BBE,     0x0BC2, 1},
+       {    0x0BC6,     0x0BC8, 1},
+       {    0x0BCA,     0x0BCD, 1},
+       {    0x0BD0,          0, 0},
+       {    0x0BD7,          0, 0},
+       {    0x0BF0,     0x0BFA, 1},
+/* TELUGU */
+       {    0x0C01,     0x0C03, 1},
+       {    0x0C05,     0x0C0C, 1},
+       {    0x0C0E,     0x0C10, 1},
+       {    0x0C12,     0x0C28, 1},
+       {    0x0C2A,     0x0C33, 1},
+       {    0x0C35,     0x0C39, 1},
+       {    0x0C3D,     0x0C44, 1},
+       {    0x0C46,     0x0C48, 1},
+       {    0x0C4A,     0x0C4D, 1},
+       {    0x0C55,     0x0C56, 1},
+       {    0x0C58,     0x0C59, 1},
+       {    0x0C60,     0x0C63, 1},
+/* KANNADA */
+       {    0x0C82,     0x0C83, 1},
+       {    0x0C85,     0x0C8C, 1},
+       {    0x0C8E,     0x0C90, 1},
+       {    0x0C92,     0x0CA8, 1},
+       {    0x0CAA,     0x0CB3, 1},
+       {    0x0CB5,     0x0CB9, 1},
+       {    0x0CBC,     0x0CC4, 1},
+       {    0x0CC6,     0x0CC8, 1},
+       {    0x0CCA,     0x0CCD, 1},
+       {    0x0CD5,     0x0CD6, 1},
+       {    0x0CDE,          0, 0},
+       {    0x0CE0,     0x0CE3, 1},
+       {    0x0CF1,          0, 0},
+       {    0x0CF2,          0, 0},
+/* MALAYALAM */
+       {    0x0D02,     0x0D03, 1},
+       {    0x0D05,     0x0D0C, 1},
+       {    0x0D0E,     0x0D10, 1},
+       {    0x0D12,     0x0D28, 1},
+       {    0x0D2A,     0x0D39, 1},
+       {    0x0D3D,     0x0D44, 1},
+       {    0x0D46,     0x0D48, 1},
+       {    0x0D4A,     0x0D4D, 1},
+       {    0x0D57,          0, 0},
+       {    0x0D60,     0x0D63, 1},
+       {    0x0D79,     0x0D7F, 1},
+/* SINHALA */
+       {    0x0D82,     0x0D83, 1},
+       {    0x0D85,     0x0D96, 1},
+       {    0x0D9A,     0x0DB1, 1},
+       {    0x0DB3,     0x0DBB, 1},
+       {    0x0DBD,          0, 0},
+       {    0x0DC0,     0x0DC6, 1},
+       {    0x0DCA,          0, 0},
+       {    0x0DCF,     0x0DD4, 1},
+       {    0x0DD6,          0, 0},
+       {    0x0DD8,     0x0DDF, 1},
+       {    0x0DF2,     0x0DF4, 1},
+/* THAI */
+       {    0x0E01,     0x0E2E, 1},
+       {    0x0E30,     0x0E3A, 1},
+       {    0x0E40,     0x0E45, 1},
+       {    0x0E47,     0x0E4E, 1},
+/* LAO */
+       {    0x0E81,     0x0E82, 1},
+       {    0x0E84,          0, 0},
+       {    0x0E87,     0x0E88, 1},
+       {    0x0E8A,          0, 0},
+       {    0x0E8D,          0, 0},
+       {    0x0E94,     0x0E97, 1},
+       {    0x0E99,     0x0E9F, 1},
+       {    0x0EA1,     0x0EA3, 1},
+       {    0x0EA5,          0, 0},
+       {    0x0EA7,          0, 0},
+       {    0x0EAA,     0x0EAB, 1},
+       {    0x0EAD,     0x0EB0, 1},
+       {    0x0EB2,     0x0EB3, 1},
+       {    0x0EBD,          0, 0},
+       {    0x0EC0,     0x0EC4, 1},
+       {    0x0EC6,          0, 0},
+       {    0x0EDC,     0x0EDD, 1},
+/* TIBETAN */
+       {    0x0F00,          0, 0},
+       {    0x0F40,     0x0F47, 1},
+       {    0x0F49,     0x0F6C, 1},
+       {    0x0F88,     0x0F8B, 1},
+/* MYANMAR */
+       {    0x1000,     0x102A, 1},
+       {    0x1050,     0x1055, 1},
+       {    0x105A,     0x105D, 1},
+       {    0x1061,          0, 0},
+       {    0x0165,          0, 0},
+       {    0x1066,          0, 0},
+       {    0x106E,     0x1070, 1},
+       {    0x1075,     0x1081, 1},
+       {    0x108E,          0, 0},
+/* GEORGIAN */
+       {    0x10A0,     0x10C5, 1},
+       {    0x10D0,     0x10FA, 1},
+       {    0x10FC,          0, 0},
+/* HANGUL JAMO */
+       {    0x1100,     0x1159, 1},
+       {    0x115F,     0x11A2, 1},
+       {    0x11A8,     0x11F9, 1},
+/* ETHIOPIC */
+       {    0x1200,     0x1248, 1},
+       {    0x124A,     0x124D, 1},
+       {    0x1250,     0x1256, 1},
+       {    0x1258,          0, 0},
+       {    0x125A,     0x125D, 1},
+       {    0x1260,     0x1288, 1},
+       {    0x128A,     0x128D, 1},
+       {    0x1290,     0x12B0, 1},
+       {    0x12B2,     0x12B5, 1},
+       {    0x12B8,     0x12BE, 1},
+       {    0x12C0,          0, 0},
+       {    0x12C2,     0x12C5, 1},
+       {    0x12C8,     0x12D6, 1},
+       {    0x12D8,     0x1310, 1},
+       {    0x1312,     0x1315, 1},
+       {    0x1318,     0x135A, 1},
+/* ETHIOPIC EXTENDED */
+       {    0x1380,     0x138F, 1},
+/* CHEROKEE */
+       {    0x13A0,     0x13F4, 1},
+/* UNIFIED CANADIAN ABORIGINAL SYLLABICS */
+       {    0x1401,     0x166C, 1},
+       {    0x166F,     0x1676, 1},
+/* OGHAM */
+       {    0x1681,     0x169A, 1},
+/* RUNIC */
+       {    0x16A0,     0x16EA, 1},
+       {    0x16EE,     0x16F0, 1},
+/* TAGALOG */
+       {    0x1700,     0x170C, 1},
+       {    0x170E,     0x1711, 1},
+/* HANUNOO */
+       {    0x1720,     0x1731, 1},
+/* BUHID */
+       {    0x1740,     0x1751, 1},
+/* TAGBANWA */
+       {    0x1760,     0x176C, 1},
+       {    0x176E,     0x1770, 1},
+/* KHMER */
+       {    0x1780,     0x17B3, 1},
+       {    0x17D7,          0, 0},
+       {    0x17DC,          0, 0},
+/* MONGOLIAN */
+       {    0x1820,     0x1877, 1},
+       {    0x1880,     0x18A8, 1},
+       {    0x18AA,          0, 0},
+/* LIMBU */
+       {    0x1900,     0x191C, 1},
+       {    0x1946,     0x194F, 1},
+/* TAI LE */
+       {    0x1950,     0x196D, 1},
+       {    0x1970,     0x1974, 1},
+/* NEW TAI LUE */
+       {    0x1980,     0x19A9, 1},
+       {    0x19C1,     0x19C7, 1},
+       {    0x19D0,     0x19D9, 1},
+/* BUGINESE */
+       {    0x1A00,     0x1A16, 1},
+/* BALINESE */
+       {    0x1B05,     0x1B33, 1},
+       {    0x1B45,     0x1B4B, 1},
+       {    0x1B50,     0x1B59, 1},
+/* SUNDANESE */
+       {    0x1B83,     0x1BA0, 1},
+       {    0x1BAE,     0x1BAF, 1},
+/* LEPCHA */
+       {    0x1C00,     0x1C23, 1},
+       {    0x1C4D,     0x1C4F, 1},
+/* OL CHIKI */
+       {    0x1C5A,     0x1C7D, 1},
+/* PHONETIC EXTENSIONS */
+       {    0x1D00,     0x1DBF, 1},
+/* LATIN EXTENDED ADDITIONAL */
+       {    0x1E00,     0x1E9F, 1},
+       {    0x1EA0,     0x1EFF, 1},
+/* GREEK EXTENDED */
+       {    0x1F00,     0x1F15, 1},
+       {    0x1F18,     0x1F1D, 1},
+       {    0x1F20,     0x1F45, 1},
+       {    0x1F48,     0x1F4D, 1},
+       {    0x1F50,     0x1F57, 1},
+       {    0x1F59,          0, 0},
+       {    0x1F5B,          0, 0},
+       {    0x1F5D,          0, 0},
+       {    0x1F5F,     0x1F7D, 1},
+       {    0x1F80,     0x1FB4, 1},
+       {    0x1FB6,     0x1FBC, 1},
+       {    0x1FBE,          0, 0},
+       {    0x1FC2,     0x1FC4, 1},
+       {    0x1FC6,     0x1FCC, 1},
+       {    0x1FD0,     0x1FD3, 1},
+       {    0x1FD6,     0x1FDB, 1},
+       {    0x1FE0,     0x1FEC, 1},
+       {    0x1FF2,     0x1FF4, 1},
+       {    0x1FF6,     0x1FFC, 1},
+/* SUPERSCRIPTS AND SUBSCRIPTS */
+       {    0x2071,          0, 0},
+       {    0x207F,          0, 0},
+       {    0x2090,     0x2094, 1},
+/* LETTERLIKE SYMBOLS */
+       {    0x2102,          0, 0},
+       {    0x2107,          0, 0},
+       {    0x210A,     0x2113, 1},
+       {    0x2115,          0, 0},
+       {    0x2119,     0x211D, 1},
+       {    0x2124,          0, 0},
+       {    0x2126,          0, 0},
+       {    0x2128,     0x212D, 1},
+       {    0x212F,     0x2139, 1},
+       {    0x213C,     0x213F, 1},
+       {    0x2145,     0x2149, 1},
+       {    0x214E,          0, 0},
+/* NUMBER FORMS */
+       {    0x2160,     0x2188, 1},
+/* ENCLOSED ALPHANUMERICS */
+       {    0x249C,     0x24E9, 1},
+/* GLAGOLITIC */
+       {    0x2C00,     0x2C2E, 1},
+       {    0x2C30,     0x2C5E, 1},
+/* LATIN EXTENDED-C */
+       {    0x2C60,     0x2C6F, 1},
+       {    0x2C71,     0x2C7D, 1},
+/* COPTIC */
+       {    0x2C80,     0x2CE4, 1},
+/* GEORGIAN SUPPLEMENT */
+       {    0x2D00,     0x2D25, 1},
+/* TIFINAGH */
+       {    0x2D30,     0x2D65, 1},
+       {    0x2D6F,          0, 0},
+/* ETHIOPIC EXTENDED */
+       {    0x2D80,     0x2D96, 1},
+       {    0x2DA0,     0x2DA6, 1},
+       {    0x2DA8,     0x2DAE, 1},
+       {    0x2DB0,     0x2DB6, 1},
+       {    0x2DB8,     0x2DBE, 1},
+       {    0x2DC0,     0x2DC6, 1},
+       {    0x2DC8,     0x2DCE, 1},
+       {    0x2DD0,     0x2DD6, 1},
+       {    0x2DD8,     0x2DDE, 1},
+/* CJK SYMBOLS AND PUNCTUATION */
+       {    0x3005,     0x3007, 1},
+       {    0x3021,     0x3029, 1},
+       {    0x3031,     0x3035, 1},
+       {    0x3038,     0x303C, 1},
+/* HIRAGANA */
+       {    0x3041,     0x3096, 1},
+       {    0x309D,     0x309F, 1},
+/* KATAKANA */
+       {    0x30A1,     0x30FA, 1},
+       {    0x30FC,     0x30FF, 1},
+/* BOPOMOFO */
+       {    0x3105,     0x312D, 1},
+/* HANGUL COMPATIBILITY JAMO */
+       {    0x3131,     0x318E, 1},
+/* BOPOMOFO EXTENDED */
+       {    0x31A0,     0x31B7, 1},
+/* KATAKANA PHONETIC EXTENSIONS */
+       {    0x31F0,     0x31FF, 1},
+/* CJK UNIFIED IDEOGRAPHS EXTENSION */
+       {    0x3400,     0x4DB5, 1},
+/* CJK UNIFIED IDEOGRAPHS */
+       {    0x4E00,     0x9FBB, 1},
+/* YI SYLLABLES */
+       {    0xA000,     0xA48C, 1},
+/* VAI SYLLABLES */
+       {    0xA500,     0xA60B, 1},
+       {    0xA610,     0xA61F, 1},
+       {    0xA62A,     0xA62B, 1},
+/* CYRILLIC SUPPLEMENT 2 */
+       {    0xA640,     0xA65F, 1},
+       {    0xA662,     0xA66E, 1},
+       {    0xA680,     0xA697, 1},
+/* LATIN EXTENDED-D */
+       {    0xA717,     0xA71F, 1},
+       {    0xA722,     0xA78C, 1},
+       {    0xA7FB,     0xA7FF, 1},
+/* SYLOTI NEGRI */
+       {    0xA800,          0, 0},
+       {    0xA801,          0, 0},
+       {    0xA803,     0xA805, 1},
+       {    0xA807,     0xA80A, 1},
+       {    0xA80C,     0xA822, 1},
+/* PHAGS PA */
+       {    0xA840,     0xA873, 1},
+/* SAURASHTRA */
+       {    0xA882,     0xA8B3, 1},
+/* KAYAH LI */
+       {    0xA90A,     0xA92D, 1},
+/* REJANG */
+       {    0xA930,     0xA946, 1},
+/* CHAM */
+       {    0xAA00,     0xAA28, 1},
+       {    0xAA40,     0xAA42, 1},
+       {    0xAA44,     0xAA4B, 1},
+/* HANGUL SYLLABLES */
+       {    0xAC00,     0xD7A3, 1},
+/* CJK COMPATIBILITY IDEOGRAPHS */
+       {    0xF900,     0xFA2D, 1},
+       {    0xFA30,     0xFA6A, 1},
+       {    0xFA70,     0xFAD9, 1},
+/* ALPHABETIC PRESENTATION FORMS */
+       {    0xFB00,     0xFB06, 1},
+       {    0xFB13,     0xFB17, 1},
+       {    0xFB1D,          0, 0},
+       {    0xFB1F,     0xFB28, 1},
+       {    0xFB2A,     0xFB36, 1},
+       {    0xFB38,     0xFB3C, 1},
+       {    0xFB3E,          0, 0},
+       {    0xFB40,          0, 0},
+       {    0xFB41,          0, 0},
+       {    0xFB43,          0, 0},
+       {    0xFB44,          0, 0},
+       {    0xFB46,     0xFB4F, 1},
+/* ARABIC PRESENTATION FORMS-A */
+       {    0xFB50,     0xFBB1, 1},
+       {    0xFBD3,     0xFD3D, 1},
+       {    0xFD50,     0xFD8F, 1},
+       {    0xFD92,     0xFDC7, 1},
+       {    0xFDF0,     0xFDFB, 1},
+/* ARABIC PRESENTATION FORMS-B */
+       {    0xFE70,     0xFE74, 1},
+       {    0xFE76,     0xFEFC, 1},
+/* HALFWIDTH AND FULLWIDTH FORMS */
+       {    0xFF21,     0xFF3A, 1},
+       {    0xFF41,     0xFF5A, 1},
+       {    0xFF66,     0xFFBE, 1},
+       {    0xFFC2,     0xFFC7, 1},
+       {    0xFFCA,     0xFFCF, 1},
+       {    0xFFD2,     0xFFD7, 1},
+       {    0xFFDA,     0xFFDC, 1},
+/* LINEAR B SYLLABARY */
+       {0x00010000, 0x0001000B, 1},
+       {0x0001000D, 0x00010026, 1},
+       {0x00010028, 0x0001003A, 1},
+       {0x0001003C, 0x0001003D, 1},
+       {0x0001003F, 0x0001004D, 1},
+       {0x00010050, 0x0001005D, 1},
+/* LINEAR B IDEOGRAMS */
+       {0x00010080, 0x000100FA, 1},
+/* ANCIENT GREEK NUMBERS */
+       {0x00010140, 0x00010174, 1},
+/* LYCIAN */
+       {0x00010280, 0x0001029C, 1},
+/* CARIAN */
+       {0x000102A0, 0x000102D0, 1},
+/* OLD ITALIC */
+       {0x00010300, 0x0001031E, 1},
+/* GOTHIC */
+       {0x00010330, 0x0001034A, 1},
+/* UGARITIC */
+       {0x00010380, 0x0001039D, 1},
+/* OLD PERSIAN */
+       {0x000103A0, 0x000103C3, 1},
+       {0x000103C8, 0x000103CF, 1},
+       {0x000103D1, 0x000103D5, 1},
+/* DESERET */
+       {0x00010400, 0x0001044F, 1},
+/* SHAVIAN */
+       {0x00010450, 0x0001047F, 1},
+/* OSMANYA */
+       {0x00010480, 0x0001049D, 1},
+       {0x000104A0, 0x000104A9, 1},
+/* CYPRIOT SYLLABARY */
+       {0x00010800, 0x00010805, 1},
+       {0x00010808,          0, 0},
+       {0x0001080A, 0x00010835, 1},
+       {0x00010837, 0x00010838, 1},
+       {0x0001083C,          0, 0},
+       {0x0001083F,          0, 0},
+/* PHOENICIAN */
+       {0x00010900, 0x00010915, 1},
+       {0x00010A00,          0, 0},
+       {0x00010A10, 0x00010A13, 1},
+/* KHAROSHTI */
+       {0x00010A15, 0x00010A17, 1},
+       {0x00010A19, 0x00010A33, 1},
+/* CUNEIFORM */
+       {0x00012000, 0x0001236E, 1},
+/* CUNEIFORM NUMBERS AND PONCTUATION */
+       {0x00012400, 0x00012462, 1},
+/* BYZANTINE MUSICAL SYMBOLS */
+/* MATHEMATICAL ALPHANUMERIC SYMBOLS */
+       {0x0001D400, 0x0001D454, 1},
+       {0x0001D456, 0x0001D49C, 1},
+       {0x0001D49E, 0x0001D49F, 1},
+       {0x0001D4A2,          0, 0},
+       {0x0001D4A5, 0x0001D4A6, 1},
+       {0x0001D4A9, 0x0001D4AC, 1},
+       {0x0001D4AE, 0x0001D4B9, 1},
+       {0x0001D4BB,          0, 0},
+       {0x0001D4BD, 0x0001D4C3, 1},
+       {0x0001D4C5, 0x0001D505, 1},
+       {0x0001D507, 0x0001D50A, 1},
+       {0x0001D50D, 0x0001D514, 1},
+       {0x0001D516, 0x0001D51C, 1},
+       {0x0001D51E, 0x0001D539, 1},
+       {0x0001D53B, 0x0001D53E, 1},
+       {0x0001D540, 0x0001D544, 1},
+       {0x0001D546,          0, 0},
+       {0x0001D54A, 0x0001D550, 1},
+       {0x0001D552, 0x0001D6A5, 1},
+       {0x0001D6A8, 0x0001D6C0, 1},
+       {0x0001D6C2, 0x0001D6DA, 1},
+       {0x0001D6DC, 0x0001D6FA, 1},
+       {0x0001D6FC, 0x0001D714, 1},
+       {0x0001D716, 0x0001D734, 1},
+       {0x0001D736, 0x0001D74E, 1},
+       {0x0001D750, 0x0001D76E, 1},
+       {0x0001D770, 0x0001D788, 1},
+       {0x0001D78A, 0x0001D7A8, 1},
+       {0x0001D7AA, 0x0001D7C2, 1},
+       {0x0001D7C4, 0x0001D7CB, 1},
+       {0x0001D7CE, 0x0001D7FF, 1},
+/* CJK UNIFIED IDEOGRAPHS EXTENSION */
+       {0x00020000, 0x0002A6D6, 1},
+/* CJK COMPATIBILITY IDEOGRAPHS SUPPLEMENT */
+       {0x0002F800, 0x0002FA1D, 1},
+/* The non-ASCII number characters are included here because ISO C 99 */
+/* forbids us to classify them as digits; however, they behave more like */
+/* alphanumeric than like punctuation. */
+/* ARABIC */
+       {    0x0660,     0x0669, 1},
+       {    0x06F0,     0x06F9, 1},
+/* DEVANAGARI */
+       {    0x0966,     0x096F, 1},
+/* BENGALI */
+       {    0x09E6,     0x09EF, 1},
+/* GURMUKHI */
+       {    0x0A66,     0x0A6F, 1},
+/* GUJARATI */
+       {    0x0AE6,     0x0AEF, 1},
+/* ORIYA */
+       {    0x0B66,     0x0B6F, 1},
+/* TAMIL */
+       {    0x0BE6,     0x0BEF, 1},
+/* TELUGU */
+       {    0x0C66,     0x0C6F, 1},
+       {    0x0C78,     0x0C7F, 1},
+/* KANNADA */
+       {    0x0CE6,     0x0CEF, 1},
+/* MALAYALAM */
+       {    0x0D66,     0x0D75, 1},
+       {    0x0D70,     0x0D75, 1},
+/* THAI */
+       {    0x0E50,     0x0E59, 1},
+/* LAO */
+       {    0x0ED0,     0x0ED9, 1},
+/* TIBETAN */
+       {    0x0F20,     0x0F29, 1},
+/* MYANMAR */
+       {    0x1040,     0x1049, 1},
+/* KHMER */
+       {    0x17E0,     0x17E9, 1},
+/* MONGOLIAN */
+       {    0x1810,     0x1819, 1},
+/* SUNDANESE */
+       {    0x1BB0,     0x1BB9, 1},
+/* LEPCHA */
+       {    0x1C40,     0x1C49, 1},
+/* OL CHIKI */
+       {    0x1C50,     0x1C59, 1},
+/* VAI */
+       {    0xA620,     0xA629, 1},
+/* SAURASHTRA */
+       {    0xA8D0,     0xA8D9, 1},
+/* KAYAH LI */
+       {    0xA900,     0xA909, 1},
+/* CHAM */
+       {    0xAA50,     0xAA59, 1},
+/* HALFWIDTH AND FULLWIDTH FORMS */
+       {    0xFF10,     0xFF19, 1},
+       {0, 0, 0}
+};