simplify
[m6w6/ext-http] / php_http_url.c
index bed03d6ae5bf84a42c5db36cbb3e8ead43bbaaf0..05850e8c42a74482c90a8725147ebf4d4d36f751 100644 (file)
 #      include <wctype.h>
 #endif
 
-#ifdef HAVE_LANGINFO_H
-#      include <langinfo.h>
+#ifdef HAVE_ARPA_INET_H
+#      include <arpa/inet.h>
 #endif
-#include <locale.h>
+
+#include "php_http_utf8.h"
 
 static inline char *localhostname(void)
 {
@@ -313,117 +314,44 @@ STATUS php_http_url_encode_hash_ex(HashTable *hash, php_http_buffer_t *qstr, con
        return SUCCESS;
 }
 
-void php_http_url_dtor(php_http_url_t *url)
-{
-       STR_FREE(url->scheme.str);
-       STR_FREE(url->authority.userinfo.username.str);
-       STR_FREE(url->authority.userinfo.password.str);
-       STR_FREE(url->authority.host.str);
-       STR_FREE(url->path.str);
-       STR_FREE(url->query.str);
-       STR_FREE(url->fragment.str);
-}
+struct parse_state { /* working state shared by the parse_* helpers: result URL, input cursor, output buffer */
+       php_http_url_t url; /* result; the parse_* helpers point its string members into buffer[] */
+#ifdef ZTS
+       void ***ts; /* thread context handed to TSRMLS_FETCH_FROM_CTX() */
+#endif
+       const char *ptr; /* current read position in the input */
+       const char *end; /* upper bound for ptr while scanning (loops run while ++ptr <= end) */
+       size_t maxlen; /* used to bound inet_ntop() output; presumably the capacity of buffer[] -- TODO confirm */
+       off_t offset; /* next write index into buffer[] */
+       unsigned flags; /* PHP_HTTP_URL_PARSE_* option bits */
+       char buffer[1]; /* last member; NOTE(review): presumably over-allocated by whoever creates the state -- confirm */
+};
 
 void php_http_url_free(php_http_url_t **url)
 {
        if (*url) {
-               php_http_url_dtor(*url);
                efree(*url);
                *url = NULL;
        }
 }
 
-#ifdef PHP_HTTP_HAVE_WCHAR
-static zend_bool cs_is_utf8(char **lc_ctype)
-{
-#if HAVE_NL_LANGINFO
-       if (strcmp("UTF-8", nl_langinfo(CODESET))) {
-               *lc_ctype = setlocale(LC_CTYPE, NULL);
-               return 0;
-       }
-       return 1;
-#else
-       *lc_ctype = setlocale(LC_CTYPE, NULL);
-
-       if (*lc_ctype) {
-               char *cs;
-
-               if ((cs = strstr(*lc_ctype, ".utf")) || (cs = strstr(*lc_ctype, ".UTF"))) {
-                       if (cs[4] == '-') {
-                               ++cs;
-                       }
-                       if (cs[4] == '8' && (cs[5] == '\0' || cs[5] == '@')) {
-                               return 1;
-                       }
-               }
-               return 0;
-       }
-#endif
-}
-
-static const unsigned char utf8mblen[256] = {
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-    4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
-};
-static const unsigned char utf8mask[] = {
-               0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01
-};
-
-static size_t utf8towc(wchar_t *wc, const unsigned char *uc, size_t len)
+static size_t parse_mb_utf8(unsigned *wc, const char *ptr, const char *end) /* decode one character from [ptr, end) via utf8towc(); returns bytes consumed, 0 on failure */
 {
-       unsigned char ub = utf8mblen[*uc];
+       unsigned wchar;
+       size_t consumed = utf8towc(&wchar, (const unsigned char *) ptr, end - ptr); /* utf8towc() is no longer defined in this file; presumably provided by php_http_utf8.h -- confirm */
 
-       if (!ub || ub > len || ub > 3) {
+       if (!consumed || consumed == (size_t) -1) { /* both "nothing consumed" and (size_t) -1 count as failure */
                return 0;
        }
 
-       *wc = *uc & utf8mask[ub];
-
-       switch (ub) {
-       case 4:
-               if ((uc[1] & 0xc0) != 0x80) {
-                       return 0;
-               }
-               *wc <<= 6;
-               *wc += *++uc & 0x3f;
-               /* no break */
-       case 3:
-               if ((uc[1] & 0xc0) != 0x80) {
-                       return 0;
-               }
-               *wc <<= 6;
-               *wc += *++uc & 0x3f;
-               /* no break */
-       case 2:
-               if ((uc[1] & 0xc0) != 0x80) {
-                       return 0;
-               }
-               *wc <<= 6;
-               *wc += *++uc & 0x3f;
-               break;
-
-       default:
-               return 0;
+       if (wc) { /* wc is optional; the store is skipped when it is NULL */
+               *wc = wchar;
        }
-
-       return ub;
+       return consumed; /* byte count as reported by utf8towc() */
 }
 
-static size_t parse_locmb(php_http_url_t *url, const char *ptr, const char *end)
+#ifdef PHP_HTTP_HAVE_WCHAR
+static size_t parse_mb_loc(unsigned *wc, const char *ptr, const char *end)
 {
        wchar_t wchar;
        size_t consumed = 0;
@@ -435,60 +363,98 @@ static size_t parse_locmb(php_http_url_t *url, const char *ptr, const char *end)
        consumed = mbtowc(&wchar, ptr, end - ptr);
 #endif
 
-       if (!consumed || consumed == (size_t) -1 || !iswalnum(wchar)) {
+       if (!consumed || consumed == (size_t) -1) {
                return 0;
        }
 
-       return consumed - 1;
+       if (wc) {
+               *wc = wchar;
+       }
+       return consumed;
 }
+#endif
 
-#include "ualpha.h"
+typedef enum parse_mb_what { /* URL component being parsed; the value indexes parse_what[] in parse_mb() warnings */
+       PARSE_SCHEME, /* never percent-encoded by parse_mb() */
+       PARSE_USERINFO,
+       PARSE_HOSTINFO, /* never percent-encoded by parse_mb(); alnum check skipped when PHP_HTTP_URL_PARSE_TOIDN is set */
+       PARSE_PATH,
+       PARSE_QUERY,
+       PARSE_FRAGMENT
+} parse_mb_what_t;
+
+static const char * const parse_what[] = { /* component names for warning messages; order must match parse_mb_what_t */
+       "scheme",
+       "userinfo",
+       "hostinfo",
+       "path",
+       "query",
+       "fragment"
+};
+
+static const char parse_xdigits[] = "0123456789ABCDEF"; /* nibble-to-hex lookup used by parse_mb() when percent-encoding bytes */
 
-static zend_bool isualnum(wchar_t ch)
+static size_t parse_mb(struct parse_state *state, parse_mb_what_t what, const char *ptr, const char *end, const char *begin, zend_bool silent) /* copy or percent-encode one multibyte character at ptr into state->buffer; returns `consumed` on success, 0 on failure (with a warning unless silent) */
 {
-       unsigned i;
+       unsigned wchar; /* decoded character; only inspected by the alnum checks below */
+       size_t consumed = 0; /* stays 0, i.e. failure, when no decoder below runs */
 
-       /* digits */
-       if (ch >= 0x30 && ch <= 0x39) {
-               return 1;
+       if (state->flags & PHP_HTTP_URL_PARSE_MBUTF8) {
+               consumed = parse_mb_utf8(&wchar, ptr, end);
        }
-       for (i = 0; i < sizeof(utf8_ranges)/sizeof(utf8_range_t); ++i) {
-               if (utf8_ranges[i].start == ch) {
-                       return 1;
-               } else if (utf8_ranges[i].start <= ch && utf8_ranges[i].end >= ch) {
-                       if (utf8_ranges[i].step == 1) {
-                               return 1;
-                       }
-                       /* FIXME step */
-                       return 0;
-               }
+#ifdef PHP_HTTP_HAVE_WCHAR
+       else if (state->flags & PHP_HTTP_URL_PARSE_MBLOC) {
+               consumed = parse_mb_loc(&wchar, ptr, end);
        }
-       return 0;
-}
-
-static size_t parse_utf8mb(php_http_url_t *url, const char *ptr, const char *end)
-{
-       char *lc_ctype = NULL;
-
-       if (0 && cs_is_utf8(&lc_ctype)) {
-               return parse_locmb(url, ptr, end);
-       } else {
-               wchar_t wchar;
-               size_t consumed = utf8towc(&wchar, (const unsigned char *) ptr, end - ptr);
+#endif
 
-               if (!consumed || consumed == (size_t) -1 || !isualnum(wchar)) {
-                       return 0;
+       while (consumed) { /* executes at most once: every path through the body breaks or returns */
+               if (!(state->flags & PHP_HTTP_URL_PARSE_TOPCT) || what == PARSE_HOSTINFO || what == PARSE_SCHEME) { /* keep raw bytes: percent-encoding not requested, or component is scheme/host */
+                       if (what == PARSE_HOSTINFO && (state->flags & PHP_HTTP_URL_PARSE_TOIDN)) {
+                               /* idna: no alnum check is applied in this branch */
+                       } else if (state->flags & PHP_HTTP_URL_PARSE_MBUTF8) {
+                               if (!isualnum(wchar)) { /* isualnum() is no longer defined in this file; presumably provided by php_http_utf8.h -- confirm */
+                                       break; /* leaves the loop and reaches the warning below */
+                               }
+#ifdef PHP_HTTP_HAVE_WCHAR
+                       } else if (state->flags & PHP_HTTP_URL_PARSE_MBLOC) {
+                               if (!iswalnum(wchar)) {
+                                       break; /* leaves the loop and reaches the warning below */
+                               }
+#endif
+                       }
+                       PHP_HTTP_DUFF(consumed, state->buffer[state->offset++] = *ptr++); /* NOTE(review): no bounds check against state->maxlen here; presumably the buffer is sized for the worst case by the caller -- confirm */
+               } else { /* PHP_HTTP_URL_PARSE_TOPCT on a non-scheme, non-host component: emit each byte as %XX */
+                       int i = 0; /* index of the input byte being encoded */
+
+                       PHP_HTTP_DUFF(consumed,
+                                       state->buffer[state->offset++] = '%';
+                                       state->buffer[state->offset++] = parse_xdigits[((unsigned char) ptr[i]) >> 4];
+                                       state->buffer[state->offset++] = parse_xdigits[((unsigned char) ptr[i]) & 0xf];
+                                       ++i;
+                       );
                }
 
-               return consumed -1 ;
+               return consumed; /* callers in this file advance their cursor by the returned value minus one */
        }
+
+       if (!silent) { /* report the offending byte and its offset relative to begin */
+               TSRMLS_FETCH_FROM_CTX(state->ts);
+               php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                               "Failed to parse %s; unexpected byte 0x%02x at pos %u in '%s'",
+                               parse_what[what], (unsigned char) *ptr, (unsigned) (ptr - begin), begin);
+       }
+
+       return 0;
 }
-#endif
 
-static STATUS parse_userinfo(php_http_url_t *url, const char *ptr, const char *end)
+static STATUS parse_userinfo(struct parse_state *state, const char *ptr)
 {
-       const char *password = NULL, *tmp = ptr;
-       TSRMLS_FETCH_FROM_CTX(url->ts);
+       size_t mb;
+       const char *password = NULL, *end = state->ptr, *tmp = ptr;
+       TSRMLS_FETCH_FROM_CTX(state->ts);
+
+       state->url.user = &state->buffer[state->offset];
 
        do {
                switch (*ptr) {
@@ -500,16 +466,20 @@ static STATUS parse_userinfo(php_http_url_t *url, const char *ptr, const char *e
                                return FAILURE;
                        }
                        password = ptr + 1;
+                       state->buffer[state->offset++] = 0;
+                       state->url.pass = &state->buffer[state->offset];
                        break;
 
                case '%':
-                       if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) {
+                       if (ptr[1] != '%' && (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2)))) {
                                php_error_docref(NULL TSRMLS_CC, E_WARNING,
                                                "Failed to parse userinfo; invalid percent encoding at pos %u in '%s'",
                                                (unsigned) (ptr - tmp), tmp);
                                return FAILURE;
                        }
-                       ptr += 2;
+                       state->buffer[state->offset++] = *ptr++;
+                       state->buffer[state->offset++] = *ptr++;
+                       state->buffer[state->offset++] = *ptr;
                        break;
 
                case '!': case '$': case '&': case '\'': case '(': case ')': case '*':
@@ -526,61 +496,69 @@ static STATUS parse_userinfo(php_http_url_t *url, const char *ptr, const char *e
                case '0': case '1': case '2': case '3': case '4': case '5': case '6':
                case '7': case '8': case '9':
                        /* allowed */
+                       state->buffer[state->offset++] = *ptr;
                        break;
 
                default:
-                       if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) {
-                               size_t n = parse_utf8mb(url, ptr, end);
-
-                               if (n) {
-                                       ptr += n;
-                                       break;
-                               }
-                       }
-#ifdef PHP_HTTP_HAVE_WCHAR
-                       else if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) {
-                               size_t n = parse_locmb(url, ptr, end);
-
-                               if (n) {
-                                       ptr += n;
-                                       break;
-                               }
+                       if (!(mb = parse_mb(state, PARSE_USERINFO, ptr, end, tmp, 0))) {
+                               return FAILURE;
                        }
-#endif
-                       php_error_docref(NULL TSRMLS_CC, E_WARNING,
-                                       "Failed to parse userinfo; unexpected byte 0x%02x at pos %u in '%s'",
-                                       *ptr, (unsigned) (ptr - tmp), tmp);
+                       ptr += mb - 1;
                }
        } while(++ptr != end);
 
-       if (password) {
-               url->authority.userinfo.username.len = password - tmp - 1;
-               url->authority.userinfo.username.str = estrndup(tmp,
-                               url->authority.userinfo.username.len);
-               url->authority.userinfo.password.len = end - password;
-               url->authority.userinfo.password.str = estrndup(password,
-                               url->authority.userinfo.password.len);
-       } else {
-               url->authority.userinfo.username.len = end - tmp;
-               url->authority.userinfo.username.str = estrndup(tmp,
-                               url->authority.userinfo.username.len);
-       }
+
+       state->buffer[state->offset++] = 0;
 
        return SUCCESS;
 }
 
-static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *end)
+static STATUS parse_hostinfo(struct parse_state *state, const char *ptr)
 {
-       const char *tmp = ptr, *port = NULL;
-       TSRMLS_FETCH_FROM_CTX(url->ts);
+       size_t mb, len;
+       const char *end = state->ptr, *tmp = ptr, *port = NULL;
+       TSRMLS_FETCH_FROM_CTX(state->ts);
+
+
+#ifdef HAVE_INET_PTON
+       if (*ptr == '[') {
+               char *error = NULL, *tmp = memchr(ptr, ']', end - ptr);
+
+               if (tmp) {
+                       size_t addrlen = tmp - ptr + 1;
+                       char buf[16], *addr = estrndup(ptr + 1, addrlen - 2);
+                       int rv = inet_pton(AF_INET6, addr, buf);
+
+                       efree(addr);
+                       if (rv == 1) {
+                               state->buffer[state->offset] = '[';
+                               state->url.host = &state->buffer[state->offset];
+                               inet_ntop(AF_INET6, buf, state->url.host + 1, state->maxlen - state->offset);
+                               state->offset += strlen(state->url.host);
+                               state->buffer[state->offset++] = ']';
+                               state->buffer[state->offset++] = 0;
+                               ptr = tmp + 1;
+                       } else if (rv == -1) {
+                               error = strerror(errno);
+                       } else {
+                               error = "unexpected '['";
+                       }
+               } else {
+                       error = "expected ']'";
+               }
 
-       /* FIXME: IP(v6) addresses */
-       do {
+               if (error) {
+                       php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse hostinfo; %s", error);
+                       return FAILURE;
+               }
+       }
+#endif
+       if (ptr != end) do {
                switch (*ptr) {
                case ':':
                        if (port) {
                                php_error_docref(NULL TSRMLS_CC, E_WARNING,
-                                               "Failed to parse port; duplicate ':' at pos %u in '%s'",
+                                               "Failed to parse port; unexpected ':' at pos %u in '%s'",
                                                (unsigned) (ptr - tmp), tmp);
                                return FAILURE;
                        }
@@ -588,13 +566,15 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e
                        break;
 
                case '%':
-                       if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) {
+                       if (ptr[1] != '%' && (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2)))) {
                                php_error_docref(NULL TSRMLS_CC, E_WARNING,
                                                "Failed to parse hostinfo; invalid percent encoding at pos %u in '%s'",
                                                (unsigned) (ptr - tmp), tmp);
                                return FAILURE;
                        }
-                       ptr += 2;
+                       state->buffer[state->offset++] = *ptr++;
+                       state->buffer[state->offset++] = *ptr++;
+                       state->buffer[state->offset++] = *ptr;
                        break;
 
                case '!': case '$': case '&': case '\'': case '(': case ')': case '*':
@@ -611,7 +591,7 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e
                        if (port) {
                                php_error_docref(NULL TSRMLS_CC, E_WARNING,
                                                "Failed to parse port; unexpected char '%c' at pos %u in '%s'",
-                                               *ptr, (unsigned) (ptr - tmp), tmp);
+                                               (unsigned char) *ptr, (unsigned) (ptr - tmp), tmp);
                                return FAILURE;
                        }
                        /* no break */
@@ -619,67 +599,53 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e
                case '7': case '8': case '9':
                        /* allowed */
                        if (port) {
-                               url->authority.port *= 10;
-                               url->authority.port += *ptr - '0';
+                               state->url.port *= 10;
+                               state->url.port += *ptr - '0';
+                       } else {
+                               state->buffer[state->offset++] = *ptr;
                        }
                        break;
 
                default:
-                       if (!port) {
-                               if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) {
-                                       size_t n = parse_utf8mb(url, ptr, end);
-
-                                       if (n) {
-                                               ptr += n;
-                                               break;
-                                       }
-                               }
-#ifdef PHP_HTTP_HAVE_WCHAR
-                               else if ((url->flags & PHP_HTTP_URL_PARSE_LOCMB) || (url->flags & PHP_HTTP_URL_PARSE_LOCIDN)) {
-                                       size_t n = parse_locmb(url, ptr, end);
-
-                                       if (n) {
-                                               ptr += n;
-                                               break;
-                                       }
-                               }
-#endif
+                       if (port) {
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                               "Failed to parse port; unexpected byte 0x%02x at pos %u in '%s'",
+                                               (unsigned char) *ptr, (unsigned) (ptr - tmp), tmp);
+                               return FAILURE;
+                       } else if (!(mb = parse_mb(state, PARSE_HOSTINFO, ptr, end, tmp, 0))) {
+                               return FAILURE;
                        }
-                       php_error_docref(NULL TSRMLS_CC, E_WARNING,
-                                       "Failed to parse hostinfo; unexpected byte 0x%02x at pos %u in '%s'",
-                                       (unsigned char) *ptr, (unsigned) (ptr - tmp), tmp);
-                       return FAILURE;
+                       ptr += mb - 1;
                }
        } while (++ptr != end);
 
-       if (port) {
-               url->authority.host.len = port - tmp - 1;
-       } else {
-               url->authority.host.len = end - tmp;
+       if (!state->url.host) {
+               len = (port ? port - tmp - 1 : end - tmp);
+               state->url.host = &state->buffer[state->offset - len];
+               state->buffer[state->offset++] = 0;
        }
 
-       url->authority.host.str = estrndup(tmp, url->authority.host.len);
-
 #ifdef PHP_HTTP_HAVE_IDN
-       if (url->flags & PHP_HTTP_URL_PARSE_UTF8IDN) {
+       if (state->flags & PHP_HTTP_URL_PARSE_TOIDN) {
                char *idn = NULL;
-               int rv = idna_to_ascii_8z(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES);
+               int rv = -1;
 
-               if (rv != IDNA_SUCCESS) {
-                       php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Failed to parse IDN: '%s'", idna_strerror(rv));
-               } else {
-                       STR_SET(url->authority.host.str, estrdup(idn));
-                       free(idn);
+               if (state->flags & PHP_HTTP_URL_PARSE_MBUTF8) {
+                       rv = idna_to_ascii_8z(state->url.host, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES);
                }
-       } else if (url->flags & PHP_HTTP_URL_PARSE_LOCIDN) {
-               char *idn = NULL;
-               int rv = idna_to_ascii_lz(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES);
-
+#      ifdef PHP_HTTP_HAVE_WCHAR
+               else if (state->flags & PHP_HTTP_URL_PARSE_MBLOC) {
+                       rv = idna_to_ascii_lz(state->url.host, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES);
+               }
+#      endif
                if (rv != IDNA_SUCCESS) {
-                       php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Failed to parse IDN: '%s'", idna_strerror(rv));
+                       php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse IDN; %s", idna_strerror(rv));
+                       return FAILURE;
                } else {
-                       STR_SET(url->authority.host.str, estrdup(idn));
+                       size_t idnlen = strlen(idn);
+                       memcpy(state->url.host, idn, idnlen + 1);
                        free(idn);
+                       state->offset += idnlen - len;
                }
        }
 #endif
@@ -687,18 +653,25 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e
        return SUCCESS;
 }
 
-static const char *parse_authority(php_http_url_t *url, const char *ptr, const char *end)
+static const char *parse_authority(struct parse_state *state)
 {
-       const char *tmp = ptr;
+       const char *tmp = state->ptr, *host = NULL;
 
        do {
-               switch (*ptr) {
+               switch (*state->ptr) {
                case '@':
                        /* userinfo delimiter */
-                       if (tmp != ptr && SUCCESS != parse_userinfo(url, tmp, ptr)) {
+                       if (host) {
+                               TSRMLS_FETCH_FROM_CTX(state->ts);
+                               php_error_docref(NULL TSRMLS_CC, E_WARNING,
+                                               "Failed to parse userinfo; unexpected '@'");
+                               return NULL;
+                       }
+                       host = state->ptr + 1;
+                       if (tmp != state->ptr && SUCCESS != parse_userinfo(state, tmp)) {
                                return NULL;
                        }
-                       tmp = ptr + 1;
+                       tmp = state->ptr + 1;
                        break;
 
                case '/':
@@ -706,37 +679,52 @@ static const char *parse_authority(php_http_url_t *url, const char *ptr, const c
                case '#':
                case '\0':
                        /* host delimiter */
-                       if (tmp != ptr && SUCCESS != parse_hostinfo(url, tmp, ptr)) {
+                       if (tmp != state->ptr && SUCCESS != parse_hostinfo(state, tmp)) {
                                return NULL;
                        }
-                       return ptr;
+                       return state->ptr;
                }
-       } while (++ptr <= end);
+       } while (++state->ptr <= state->end);
 
        return NULL;
 }
 
-static const char *parse_path(php_http_url_t *url, const char *ptr, const char *end)
+static const char *parse_path(struct parse_state *state)
 {
-       const char *tmp = ptr;
-       TSRMLS_FETCH_FROM_CTX(url->ts);
+       size_t mb;
+       const char *tmp;
+       TSRMLS_FETCH_FROM_CTX(state->ts);
+
+       /* is there actually a path to parse? */
+       if (!*state->ptr) {
+               return state->ptr;
+       }
+       tmp = state->ptr;
+       state->url.path = &state->buffer[state->offset];
 
        do {
-               switch (*ptr) {
+               switch (*state->ptr) {
+               case '#':
                case '?':
                case '\0':
-                       url->path.len = ptr - tmp;
-                       url->path.str = estrndup(tmp, url->path.len);
-                       return ptr;
+                       /* did we have any path component ? */
+                       if (tmp != state->ptr) {
+                               state->buffer[state->offset++] = 0;
+                       } else {
+                               state->url.path = NULL;
+                       }
+                       return state->ptr;
 
                case '%':
-                       if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) {
+                       if (state->ptr[1] != '%' && (state->end - state->ptr <= 2 || !isxdigit(*(state->ptr+1)) || !isxdigit(*(state->ptr+2)))) {
                                php_error_docref(NULL TSRMLS_CC, E_WARNING,
                                                "Failed to parse path; invalid percent encoding at pos %u in '%s'",
-                                               (unsigned) (ptr - tmp), tmp);
+                                               (unsigned) (state->ptr - tmp), tmp);
                                return NULL;
                        }
-                       ptr += 2;
+                       state->buffer[state->offset++] = *state->ptr++;
+                       state->buffer[state->offset++] = *state->ptr++;
+                       state->buffer[state->offset++] = *state->ptr;
                        break;
 
                case '/': /* yeah, well */
@@ -755,57 +743,52 @@ static const char *parse_path(php_http_url_t *url, const char *ptr, const char *
                case '0': case '1': case '2': case '3': case '4': case '5': case '6':
                case '7': case '8': case '9':
                        /* allowed */
+                       state->buffer[state->offset++] = *state->ptr;
                        break;
 
                default:
-                       if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) {
-                               size_t n = parse_utf8mb(url, ptr, end);
-
-                               if (n) {
-                                       ptr += n;
-                                       break;
-                               }
-                       }
-#if PHP_HTTP_HAVE_WCHAR
-                       else if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) {
-                               size_t n = parse_locmb(url, ptr, end);
-
-                               if (n) {
-                                       ptr += n;
-                                       break;
-                               }
+                       if (!(mb = parse_mb(state, PARSE_PATH, state->ptr, state->end, tmp, 0))) {
+                               return NULL;
                        }
-#endif
-                       php_error_docref(NULL TSRMLS_CC, E_WARNING,
-                                       "Failed to parse path; unexpected byte 0x%02x pos %u in '%s'",
-                                       *ptr, (unsigned) (ptr - tmp), tmp);
+                       state->ptr += mb - 1;
                }
-       } while (++ptr <= end);
+       } while (++state->ptr <= state->end);
 
        return NULL;
 }
 
-static const char *parse_query(php_http_url_t *url, const char *ptr, const char *end)
+static const char *parse_query(struct parse_state *state)
 {
-       const char *tmp = ptr + !!*ptr;
-       TSRMLS_FETCH_FROM_CTX(url->ts);
+       size_t mb;
+       const char *tmp = state->ptr + !!*state->ptr;
+       TSRMLS_FETCH_FROM_CTX(state->ts);
+
+       /* is there actually a query to parse? */
+       if (*state->ptr != '?') {
+               return state->ptr;
+       }
+
+       /* skip initial '?' */
+       tmp = ++state->ptr;
+       state->url.query = &state->buffer[state->offset];
 
        do {
-               switch (*ptr) {
+               switch (*state->ptr) {
                case '#':
                case '\0':
-                       url->query.len = ptr - tmp;
-                       url->query.str = estrndup(tmp, url->query.len);
-                       return ptr;
+                       state->buffer[state->offset++] = 0;
+                       return state->ptr;
 
                case '%':
-                       if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) {
+                       if (state->ptr[1] != '%' && (state->end - state->ptr <= 2 || !isxdigit(*(state->ptr+1)) || !isxdigit(*(state->ptr+2)))) {
                                php_error_docref(NULL TSRMLS_CC, E_WARNING,
                                                "Failed to parse query; invalid percent encoding at pos %u in '%s'",
-                                               (unsigned) (ptr - tmp), tmp);
+                                               (unsigned) (state->ptr - tmp), tmp);
                                return NULL;
                        }
-                       ptr += 2;
+                       state->buffer[state->offset++] = *state->ptr++;
+                       state->buffer[state->offset++] = *state->ptr++;
+                       state->buffer[state->offset++] = *state->ptr;
                        break;
 
                case '?': case '/': /* yeah, well */
@@ -824,51 +807,54 @@ static const char *parse_query(php_http_url_t *url, const char *ptr, const char
                case '0': case '1': case '2': case '3': case '4': case '5': case '6':
                case '7': case '8': case '9':
                        /* allowed */
+                       state->buffer[state->offset++] = *state->ptr;
                        break;
 
                default:
-#ifdef PHP_HTTP_HAVE_WCHAR
-                       if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) {
-                               size_t n = parse_locmb(url, ptr, end);
-
-                               if (n) {
-                                       ptr += n;
-                                       break;
-                               }
+                       if (!(mb = parse_mb(state, PARSE_QUERY, state->ptr, state->end, tmp, 0))) {
+                               return NULL;
                        }
-#endif
-                       php_error_docref(NULL TSRMLS_CC, E_WARNING,
-                                       "Failed to parse query; unexpected byte 0x%02x at pos %u in '%s'",
-                                       *ptr, (unsigned) (ptr - tmp), tmp);
+                       state->ptr += mb - 1;
                }
-       } while (++ptr <= end);
+       } while (++state->ptr <= state->end);
 
        return NULL;
 }
 
-static const char *parse_fragment(php_http_url_t *url, const char *ptr, const char *end)
+static const char *parse_fragment(struct parse_state *state)
 {
-       const char *tmp = ptr + !!*ptr;
-       TSRMLS_FETCH_FROM_CTX(url->ts);
+       size_t mb;
+       const char *tmp;
+       TSRMLS_FETCH_FROM_CTX(state->ts);
+
+       /* is there actually a fragment to parse? */
+       if (*state->ptr != '#') {
+               return state->ptr;
+       }
+
+       /* skip initial '#' */
+       tmp = ++state->ptr;
+       state->url.fragment = &state->buffer[state->offset];
 
        do {
-               switch (*ptr) {
+               switch (*state->ptr) {
                case '\0':
-                       url->fragment.len = ptr - tmp;
-                       url->fragment.str = estrndup(tmp, url->fragment.len);
-                       return ptr;
+                       state->buffer[state->offset++] = 0;
+                       return state->ptr;
 
                case '%':
-                       if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) {
+                       if (state->ptr[1] != '%' && (state->end - state->ptr <= 2 || !isxdigit(*(state->ptr+1)) || !isxdigit(*(state->ptr+2)))) {
                                php_error_docref(NULL TSRMLS_CC, E_WARNING,
-                                               "Failed to parse query; invalid percent encoding at pos %u in '%s'",
-                                               (unsigned) (ptr - tmp), tmp);
+                                               "Failed to parse fragment; invalid percent encoding at pos %u in '%s'",
+                                               (unsigned) (state->ptr - tmp), tmp);
                                return NULL;
                        }
-                       ptr += 2;
+                       state->buffer[state->offset++] = *state->ptr++;
+                       state->buffer[state->offset++] = *state->ptr++;
+                       state->buffer[state->offset++] = *state->ptr;
                        break;
 
-               case '?': case '/': /* yeah, well */
+               case '?': case '/':
                case '!': case '$': case '&': case '\'': case '(': case ')': case '*':
                case '+': case ',': case ';': case '=': /* sub-delims */
                case '-': case '.': case '_': case '~': /* unreserved */
@@ -884,54 +870,55 @@ static const char *parse_fragment(php_http_url_t *url, const char *ptr, const ch
                case '0': case '1': case '2': case '3': case '4': case '5': case '6':
                case '7': case '8': case '9':
                        /* allowed */
+                       state->buffer[state->offset++] = *state->ptr;
                        break;
 
                default:
-#if PHP_HTTP_HAVE_WCHAR
-                       if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) {
-                               size_t n = parse_locmb(url, ptr, end);
-
-                               if (n) {
-                                       ptr += n;
-                                       break;
-                               }
+                       if (!(mb = parse_mb(state, PARSE_FRAGMENT, state->ptr, state->end, tmp, 0))) {
+                               return NULL;
                        }
-#endif
-                       php_error_docref(NULL TSRMLS_CC, E_WARNING,
-                                       "Failed to parse fragment; unexpected byte 0x%02x at pos %u in '%s'",
-                                       *ptr, (unsigned) (ptr - tmp), tmp);
+                       state->ptr += mb - 1;
                }
-       } while (++ptr <= end);
+       } while (++state->ptr <= state->end);
 
        return NULL;
 }
 
-static const char *parse_hier(php_http_url_t *url, const char *ptr, const char *end)
+static const char *parse_hier(struct parse_state *state)
 {
-       if (*ptr == '/') {
-               if (end - ptr > 1) {
-                       if (*(ptr + 1) == '/') {
-                               if (!(ptr = parse_authority(url, ptr + 2, end))) {
+       if (*state->ptr == '/') {
+               if (state->end - state->ptr > 1) {
+                       if (*(state->ptr + 1) == '/') {
+                               state->ptr += 2;
+                               if (!(state->ptr = parse_authority(state))) {
                                        return NULL;
                                }
                        }
                }
        }
-       return parse_path(url, ptr, end);
+       return parse_path(state);
 }
 
-static const char *parse_scheme(php_http_url_t *url, const char *ptr, const char *end)
+static const char *parse_scheme(struct parse_state *state)
 {
-       const char *tmp = ptr;
+       size_t mb;
+       const char *tmp = state->ptr;
 
        do {
-               switch (*ptr) {
+               switch (*state->ptr) {
                case ':':
                        /* scheme delimiter */
-                       url->scheme.len = ptr - tmp;
-                       url->scheme.str = estrndup(tmp, url->scheme.len);
-                       return ++ptr;
+                       state->url.scheme = &state->buffer[0];
+                       state->buffer[state->offset++] = 0;
+                       return ++state->ptr;
 
+               case '0': case '1': case '2': case '3': case '4': case '5': case '6':
+               case '7': case '8': case '9':
+               case '+': case '-': case '.':
+                       if (state->ptr == tmp) {
+                               return tmp;
+                       }
+                       /* no break */
                case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
                case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
                case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
@@ -940,85 +927,57 @@ static const char *parse_scheme(php_http_url_t *url, const char *ptr, const char
                case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
                case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
                case 'v': case 'w': case 'x': case 'y': case 'z':
-               case '0': case '1': case '2': case '3': case '4': case '5': case '6':
-               case '7': case '8': case '9':
-               case '+': case '-': case '.':
                        /* scheme part */
+                       state->buffer[state->offset++] = *state->ptr;
                        break;
 
                default:
-#ifdef PHP_HTTP_HAVE_WCHAR
-                       if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) {
-                               size_t n = parse_locmb(url, ptr, end);
-
-                               if (n) {
-                                       ptr += n;
-                                       break;
-                               }
+                       if (!(mb = parse_mb(state, PARSE_SCHEME, state->ptr, state->end, tmp, 1))) {
+                               /* soft fail; parse path next */
+                               return tmp;
                        }
-#endif
-                       /* no scheme */
-                       return tmp;
+                       state->ptr += mb - 1;
                }
-       } while (++ptr != end);
+       } while (++state->ptr != state->end);
 
        return tmp;
 }
 
-php_http_url_t *php_http_url_init(php_http_url_t *url, const char *str, size_t len, unsigned flags TSRMLS_DC)
+php_http_url_t *php_http_url_parse(const char *str, size_t len, unsigned flags TSRMLS_DC)
 {
-       const char *ptr, *end = str + len;
-       zend_bool free_url = !url;
-
-       if (url) {
-               memset(url, 0, sizeof(*url));
-       } else {
-               url = ecalloc(1, sizeof(*url));
-       }
-
-       url->flags = flags;
-       TSRMLS_SET_CTX(url->ts);
-
-       if ((ptr = str) && !(str = parse_scheme(url, ptr, end))) {
-               php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL scheme: '%s'", ptr);
-               if (free_url) {
-                       php_http_url_free(&url);
-               } else {
-                       php_http_url_dtor(url);
-               }
+       size_t maxlen = 3 * len;
+       struct parse_state *state = ecalloc(1, sizeof(*state) + maxlen);
+
+       state->end = str + len;
+       state->ptr = str;
+       state->flags = flags;
+       state->maxlen = maxlen;
+       TSRMLS_SET_CTX(state->ts);
+
+       if (!parse_scheme(state)) {
+               php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL scheme: '%s'", state->ptr);
+               efree(state);
                return NULL;
        }
 
-       if ((ptr = str) && !(str = parse_hier(url, ptr, end))) {
-               if (free_url) {
-                       php_http_url_free(&url);
-               } else {
-                       php_http_url_dtor(url);
-               }
+       if (!parse_hier(state)) {
+               efree(state);
                return NULL;
        }
 
-       if ((ptr = str) && !(str = parse_query(url, ptr, end))) {
-               php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL query: '%s'", ptr);
-               if (free_url) {
-                       php_http_url_free(&url);
-               } else {
-                       php_http_url_dtor(url);
-               }
+       if (!parse_query(state)) {
+               php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL query: '%s'", state->ptr);
+               efree(state);
                return NULL;
        }
 
-       if ((ptr = str) && !(str = parse_fragment(url, ptr, end))) {
-               php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL fragment: '%s'", ptr);
-               if (free_url) {
-                       php_http_url_free(&url);
-               } else {
-                       php_http_url_dtor(url);
-               }
+       if (!parse_fragment(state)) {
+               php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL fragment: '%s'", state->ptr);
+               efree(state);
                return NULL;
        }
 
-       return url;
+       return (php_http_url_t *) state;
 }
 
 ZEND_BEGIN_ARG_INFO_EX(ai_HttpUrl___construct, 0, 0, 0)
@@ -1192,23 +1151,49 @@ PHP_METHOD(HttpUrl, parse)
        char *str;
        int len;
        long flags = 0;
-       php_http_url_t url;
+       php_http_url_t *url;
+       zend_error_handling zeh;
 
-       if (SUCCESS != zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &flags)) {
-               return;
-       }
+       php_http_expect(SUCCESS == zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &flags), invalid_arg, return);
 
-       if (php_http_url_init(&url, str, len, flags TSRMLS_CC)) {
-               printf("  scheme=(%zu)%s\n", url.scheme.len,url.scheme.str);
-               printf("username=(%zu)%s\n", url.authority.userinfo.username.len,url.authority.userinfo.username.str);
-               printf("password=(%zu)%s\n", url.authority.userinfo.password.len,url.authority.userinfo.password.str);
-               printf("    host=(%zu)%s\n", url.authority.host.len,url.authority.host.str);
-               printf("    port=%d\n", (int) url.authority.port);
-               printf("    path=(%zu)%s\n", url.path.len,url.path.str);
-               printf("   query=(%zu)%s\n", url.query.len,url.query.str);
-               printf("fragment=(%zu)%s\n", url.fragment.len,url.fragment.str);
-               php_http_url_dtor(&url);
+       zend_replace_error_handling(EH_THROW, php_http_exception_bad_url_class_entry, &zeh TSRMLS_CC);
+       if ((url = php_http_url_parse(str, len, flags TSRMLS_CC))) {
+               object_init_ex(return_value, php_http_url_class_entry);
+               if (url->scheme) {
+                       zend_update_property_string(php_http_url_class_entry, return_value,
+                                       ZEND_STRL("scheme"), url->scheme TSRMLS_CC);
+               }
+               if (url->user) {
+                       zend_update_property_string(php_http_url_class_entry, return_value,
+                                       ZEND_STRL("user"), url->user TSRMLS_CC);
+               }
+               if (url->pass) {
+                       zend_update_property_string(php_http_url_class_entry, return_value,
+                                       ZEND_STRL("pass"), url->pass TSRMLS_CC);
+               }
+               if (url->host) {
+                       zend_update_property_string(php_http_url_class_entry, return_value,
+                                       ZEND_STRL("host"), url->host TSRMLS_CC);
+               }
+               if (url->port) {
+                       zend_update_property_long(php_http_url_class_entry, return_value,
+                                       ZEND_STRL("port"), url->port TSRMLS_CC);
+               }
+               if (url->path) {
+                       zend_update_property_string(php_http_url_class_entry, return_value,
+                                       ZEND_STRL("path"), url->path TSRMLS_CC);
+               }
+               if (url->query) {
+                       zend_update_property_string(php_http_url_class_entry, return_value,
+                                       ZEND_STRL("query"), url->query TSRMLS_CC);
+               }
+               if (url->fragment) {
+                       zend_update_property_string(php_http_url_class_entry, return_value,
+                                       ZEND_STRL("fragment"), url->fragment TSRMLS_CC);
+               }
+               php_http_url_free(&url);
        }
+       zend_restore_error_handling(&zeh TSRMLS_CC);
 }
 
 static zend_function_entry php_http_url_methods[] = {
@@ -1254,13 +1239,13 @@ PHP_MINIT_FUNCTION(http_url)
        zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("SANITIZE_PATH"), PHP_HTTP_URL_SANITIZE_PATH TSRMLS_CC);
 
 #ifdef PHP_HTTP_HAVE_WCHAR
-       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_LOCMB"), PHP_HTTP_URL_PARSE_LOCMB TSRMLS_CC);
+       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_MBLOC"), PHP_HTTP_URL_PARSE_MBLOC TSRMLS_CC);
 #endif
-       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_UTF8MB"), PHP_HTTP_URL_PARSE_UTF8MB TSRMLS_CC);
+       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_MBUTF8"), PHP_HTTP_URL_PARSE_MBUTF8 TSRMLS_CC);
 #ifdef PHP_HTTP_HAVE_IDN
-       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_LOCIDN"), PHP_HTTP_URL_PARSE_LOCIDN TSRMLS_CC);
-       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_UTF8IDN"), PHP_HTTP_URL_PARSE_UTF8IDN TSRMLS_CC);
+       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_TOIDN"), PHP_HTTP_URL_PARSE_TOIDN TSRMLS_CC);
 #endif
+       zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_TOPCT"), PHP_HTTP_URL_PARSE_TOPCT TSRMLS_CC);
 
        return SUCCESS;
 }