X-Git-Url: https://git.m6w6.name/?p=m6w6%2Fext-http;a=blobdiff_plain;f=php_http_url.c;h=1f06271f9527183f12c9c5c2a46506b14a674b74;hp=bed03d6ae5bf84a42c5db36cbb3e8ead43bbaaf0;hb=7205cf3cfe9e5dd142513541414b0327aa3be0cf;hpb=b11342ec89ef18b55828c089d36924a8ecc3dfd5 diff --git a/php_http_url.c b/php_http_url.c index bed03d6..1f06271 100644 --- a/php_http_url.c +++ b/php_http_url.c @@ -21,10 +21,11 @@ # include #endif -#ifdef HAVE_LANGINFO_H -# include +#ifdef HAVE_ARPA_INET_H +# include #endif -#include + +#include "php_http_utf8.h" static inline char *localhostname(void) { @@ -313,117 +314,31 @@ STATUS php_http_url_encode_hash_ex(HashTable *hash, php_http_buffer_t *qstr, con return SUCCESS; } -void php_http_url_dtor(php_http_url_t *url) -{ - STR_FREE(url->scheme.str); - STR_FREE(url->authority.userinfo.username.str); - STR_FREE(url->authority.userinfo.password.str); - STR_FREE(url->authority.host.str); - STR_FREE(url->path.str); - STR_FREE(url->query.str); - STR_FREE(url->fragment.str); -} - void php_http_url_free(php_http_url_t **url) { if (*url) { - php_http_url_dtor(*url); efree(*url); *url = NULL; } } -#ifdef PHP_HTTP_HAVE_WCHAR -static zend_bool cs_is_utf8(char **lc_ctype) +static size_t parse_mb_utf8(php_http_url_t *url, const char *ptr, const char *end, zend_bool idn) { -#if HAVE_NL_LANGINFO - if (strcmp("UTF-8", nl_langinfo(CODESET))) { - *lc_ctype = setlocale(LC_CTYPE, NULL); - return 0; - } - return 1; -#else - *lc_ctype = setlocale(LC_CTYPE, NULL); - - if (*lc_ctype) { - char *cs; + unsigned wchar; + size_t consumed = utf8towc(&wchar, (const unsigned char *) ptr, end - ptr); - if ((cs = strstr(*lc_ctype, ".utf")) || (cs = strstr(*lc_ctype, ".UTF"))) { - if (cs[4] == '-') { - ++cs; - } - if (cs[4] == '8' && (cs[5] == '\0' || cs[5] == '@')) { - return 1; - } - } + if (!consumed || consumed == (size_t) -1) { return 0; } -#endif -} - -static const unsigned char utf8mblen[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6 -}; -static const unsigned char utf8mask[] = { - 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 -}; - -static size_t utf8towc(wchar_t *wc, const unsigned char *uc, size_t len) -{ - unsigned char ub = utf8mblen[*uc]; - - if (!ub || ub > len || ub > 3) { + if (!idn && !isualnum(wchar)) { return 0; } - *wc = *uc & utf8mask[ub]; - - switch (ub) { - case 4: - if ((uc[1] & 0xc0) != 0x80) { - return 0; - } - *wc <<= 6; - *wc += *++uc & 0x3f; - /* no break */ - case 3: - if ((uc[1] & 0xc0) != 0x80) { - return 0; - } - *wc <<= 6; - *wc += *++uc & 0x3f; - /* no break */ - case 2: - if ((uc[1] & 0xc0) != 0x80) { - return 0; - } - *wc <<= 6; - *wc += *++uc & 0x3f; - break; - - default: - return 0; - } - - return ub; + return consumed; } -static size_t parse_locmb(php_http_url_t *url, const char *ptr, const char *end) +#ifdef PHP_HTTP_HAVE_WCHAR +static size_t parse_mb_loc(php_http_url_t *url, const char *ptr, const char *end, zend_bool idn) { wchar_t wchar; size_t consumed = 0; @@ -435,61 +350,82 @@ static size_t parse_locmb(php_http_url_t *url, const char *ptr, const char *end) consumed = mbtowc(&wchar, ptr, end - ptr); #endif - if (!consumed || consumed == (size_t) -1 || !iswalnum(wchar)) { + if (!consumed || consumed == (size_t) -1) { + return 0; + } + if (!idn && !iswalnum(wchar)) { return 0; } - return consumed - 1; + return consumed; } +#endif -#include "ualpha.h" +typedef enum parse_mb_what { + PARSE_SCHEME, + PARSE_USERINFO, + PARSE_HOSTINFO, + PARSE_PATH, + PARSE_QUERY, + PARSE_FRAGMENT +} parse_mb_what_t; + +static const char * const parse_what[] = { + "scheme", + "userinfo", + "hostinfo", + "path", + "query", + "fragment" +}; -static zend_bool isualnum(wchar_t ch) +static const char parse_xdigits[] = "0123456789ABCDEF"; + +static size_t parse_mb(php_http_url_t *url, parse_mb_what_t what, const char *ptr, const char *end, const char *begin, zend_bool silent) { - unsigned i; + size_t consumed = 0; + zend_bool idn = (what == PARSE_HOSTINFO) && (url->flags & PHP_HTTP_URL_PARSE_TOIDN); - /* digits */ - if (ch >= 0x30 && ch <= 0x39) { - return 1; + if (url->flags & PHP_HTTP_URL_PARSE_MBUTF8) { + consumed = parse_mb_utf8(url, ptr, end, idn); } - for (i = 0; i < sizeof(utf8_ranges)/sizeof(utf8_range_t); ++i) { - if (utf8_ranges[i].start == ch) { - return 1; - } else if (utf8_ranges[i].start <= ch && utf8_ranges[i].end >= ch) { - if (utf8_ranges[i].step == 1) { - return 1; - } - /* FIXME step */ - return 0; - } +#ifdef PHP_HTTP_HAVE_WCHAR + else if (url->flags & PHP_HTTP_URL_PARSE_MBLOC) { + consumed = parse_mb_loc(url, ptr, end, idn); } - return 0; -} - -static size_t parse_utf8mb(php_http_url_t *url, const char *ptr, const char *end) -{ - char *lc_ctype = NULL; - - if (0 && cs_is_utf8(&lc_ctype)) { - return parse_locmb(url, ptr, end); - } else { - wchar_t wchar; - size_t consumed = utf8towc(&wchar, (const unsigned char *) ptr, end - ptr); +#endif - if (!consumed || consumed == (size_t) -1 || !isualnum(wchar)) { - return 0; + if (consumed) { + if (!(url->flags & PHP_HTTP_URL_PARSE_TOPCT) || what == PARSE_HOSTINFO || what == PARSE_SCHEME) { + PHP_HTTP_DUFF(consumed, url->buffer[url->offset++] = *ptr++); + } else { + int i = 0; + + PHP_HTTP_DUFF(consumed, + url->buffer[url->offset++] = '%'; + url->buffer[url->offset++] = parse_xdigits[((unsigned char) ptr[i]) >> 4]; + url->buffer[url->offset++] = parse_xdigits[((unsigned char) ptr[i]) & 0xf]; + ++i; + ); } - - return consumed -1 ; + } else if (!silent) { + TSRMLS_FETCH_FROM_CTX(url->ts); + php_error_docref(NULL TSRMLS_CC, E_WARNING, + "Failed to parse %s; unexpected byte 0x%02x at pos %u in '%s'", + parse_what[what], (unsigned char) *ptr, (unsigned) (ptr - begin), begin); } + + return consumed; } -#endif -static STATUS parse_userinfo(php_http_url_t *url, const char *ptr, const char *end) +static STATUS parse_userinfo(php_http_url_t *url, const char *ptr) { - const char *password = NULL, *tmp = ptr; + size_t mb; + const char *password = NULL, *end = url->ptr, *tmp = ptr; TSRMLS_FETCH_FROM_CTX(url->ts); + url->user = &url->buffer[url->offset]; + do { switch (*ptr) { case ':': @@ -500,16 +436,20 @@ static STATUS parse_userinfo(php_http_url_t *url, const char *ptr, const char *e return FAILURE; } password = ptr + 1; + url->buffer[url->offset++] = 0; + url->pass = &url->buffer[url->offset]; break; case '%': - if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) { + if (ptr[1] != '%' && (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2)))) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse userinfo; invalid percent encoding at pos %u in '%s'", (unsigned) (ptr - tmp), tmp); return FAILURE; } - ptr += 2; + url->buffer[url->offset++] = *ptr++; + url->buffer[url->offset++] = *ptr++; + url->buffer[url->offset++] = *ptr; break; case '!': case '$': case '&': case '\'': case '(': case ')': case '*': @@ -526,61 +466,69 @@ static STATUS parse_userinfo(php_http_url_t *url, const char *ptr, const char *e case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* allowed */ + url->buffer[url->offset++] = *ptr; break; default: - if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) { - size_t n = parse_utf8mb(url, ptr, end); - - if (n) { - ptr += n; - break; - } - } -#ifdef PHP_HTTP_HAVE_WCHAR - else if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } + if (!(mb = parse_mb(url, PARSE_USERINFO, ptr, end, tmp, 0))) { + return FAILURE; } -#endif - php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse userinfo; unexpected byte 0x%02x at pos %u in '%s'", - *ptr, (unsigned) (ptr - tmp), tmp); + ptr += mb - 1; } } while(++ptr != end); - if (password) { - url->authority.userinfo.username.len = password - tmp - 1; - url->authority.userinfo.username.str = estrndup(tmp, - url->authority.userinfo.username.len); - url->authority.userinfo.password.len = end - password; - url->authority.userinfo.password.str = estrndup(password, - url->authority.userinfo.password.len); - } else { - url->authority.userinfo.username.len = end - tmp; - url->authority.userinfo.username.str = estrndup(tmp, - url->authority.userinfo.username.len); - } + + url->buffer[url->offset++] = 0; return SUCCESS; } -static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *end) +static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr) { - const char *tmp = ptr, *port = NULL; + size_t mb, len; + const char *end = url->ptr, *tmp = ptr, *port = NULL; TSRMLS_FETCH_FROM_CTX(url->ts); - /* FIXME: IP(v6) addresses */ - do { + +#ifdef HAVE_INET_PTON + if (*ptr == '[') { + char *error = NULL, *tmp = memchr(ptr, ']', end - ptr); + + if (tmp) { + size_t addrlen = tmp - ptr + 1; + char buf[16], *addr = estrndup(ptr + 1, addrlen - 2); + int rv = inet_pton(AF_INET6, addr, buf); + + efree(addr); + if (rv == 1) { + url->buffer[url->offset] = '['; + url->host = &url->buffer[url->offset]; + inet_ntop(AF_INET6, buf, url->host + 1, url->maxlen - url->offset); + url->offset += strlen(url->host); + url->buffer[url->offset++] = ']'; + url->buffer[url->offset++] = 0; + ptr = tmp + 1; + } else if (rv == -1) { + error = strerror(errno); + } else { + error = "unexpected '['"; + } + } else { + error = "expected ']'"; + } + + if (error) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse hostinfo; %s", error); + return FAILURE; + } + } +#endif + if (ptr != end) do { switch (*ptr) { case ':': if (port) { php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse port; duplicate ':' at pos %u in '%s'", + "Failed to parse port; unexpected ':' at pos %u in '%s'", (unsigned) (ptr - tmp), tmp); return FAILURE; } @@ -588,13 +536,15 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e break; case '%': - if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) { + if (ptr[1] != '%' && (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2)))) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse hostinfo; invalid percent encoding at pos %u in '%s'", (unsigned) (ptr - tmp), tmp); return FAILURE; } - ptr += 2; + url->buffer[url->offset++] = *ptr++; + url->buffer[url->offset++] = *ptr++; + url->buffer[url->offset++] = *ptr; break; case '!': case '$': case '&': case '\'': case '(': case ')': case '*': @@ -611,7 +561,7 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e if (port) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse port; unexpected char '%c' at pos %u in '%s'", - *ptr, (unsigned) (ptr - tmp), tmp); + (unsigned char) *ptr, (unsigned) (ptr - tmp), tmp); return FAILURE; } /* no break */ @@ -619,67 +569,53 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e case '7': case '8': case '9': /* allowed */ if (port) { - url->authority.port *= 10; - url->authority.port += *ptr - '0'; + url->port *= 10; + url->port += *ptr - '0'; + } else { + url->buffer[url->offset++] = *ptr; } break; default: - if (!port) { - if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) { - size_t n = parse_utf8mb(url, ptr, end); - - if (n) { - ptr += n; - break; - } - } -#ifdef PHP_HTTP_HAVE_WCHAR - else if ((url->flags & PHP_HTTP_URL_PARSE_LOCMB) || (url->flags & PHP_HTTP_URL_PARSE_LOCIDN)) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } - } -#endif + if (port) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, + "Failed to parse port; unexpected byte 0x%02x at pos %u in '%s'", + (unsigned char) *ptr, (unsigned) (ptr - tmp), tmp); + return FAILURE; + } else if (!(mb = parse_mb(url, PARSE_HOSTINFO, ptr, end, tmp, 0))) { + return FAILURE; } - php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse hostinfo; unexpected byte 0x%02x at pos %u in '%s'", - (unsigned char) *ptr, (unsigned) (ptr - tmp), tmp); - return FAILURE; + ptr += mb - 1; } } while (++ptr != end); - if (port) { - url->authority.host.len = port - tmp - 1; - } else { - url->authority.host.len = end - tmp; + if (!url->host) { + len = (port ? port - tmp - 1 : end - tmp); + url->host = &url->buffer[url->offset - len]; + url->buffer[url->offset++] = 0; } - url->authority.host.str = estrndup(tmp, url->authority.host.len); - #ifdef PHP_HTTP_HAVE_IDN - if (url->flags & PHP_HTTP_URL_PARSE_UTF8IDN) { + if (url->flags & PHP_HTTP_URL_PARSE_TOIDN) { char *idn = NULL; - int rv = idna_to_ascii_8z(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); + int rv = -1; - if (rv != IDNA_SUCCESS) { - php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Failed to parse IDN: '%s'", idna_strerror(rv)); - } else { - STR_SET(url->authority.host.str, estrdup(idn)); - free(idn); + if (url->flags & PHP_HTTP_URL_PARSE_MBUTF8) { + rv = idna_to_ascii_8z(url->host, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); } - } else if (url->flags & PHP_HTTP_URL_PARSE_LOCIDN) { - char *idn = NULL; - int rv = idna_to_ascii_lz(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); - +# ifdef PHP_HTTP_HAVE_WCHAR + else if (url->flags & PHP_HTTP_URL_PARSE_MBLOC) { + rv = idna_to_ascii_lz(url->host, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); + } +# endif if (rv != IDNA_SUCCESS) { - php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Failed to parse IDN: '%s'", idna_strerror(rv)); + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse IDN; %s", idna_strerror(rv)); + return FAILURE; } else { - STR_SET(url->authority.host.str, estrdup(idn)); + size_t idnlen = strlen(idn); + memcpy(url->host, idn, idnlen + 1); free(idn); + url->offset += idnlen - len; } } #endif @@ -687,18 +623,25 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e return SUCCESS; } -static const char *parse_authority(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_authority(php_http_url_t *url) { - const char *tmp = ptr; + const char *tmp = url->ptr, *host = NULL; do { - switch (*ptr) { + switch (*url->ptr) { case '@': /* userinfo delimiter */ - if (tmp != ptr && SUCCESS != parse_userinfo(url, tmp, ptr)) { + if (host) { + TSRMLS_FETCH_FROM_CTX(url->ts); + php_error_docref(NULL TSRMLS_CC, E_WARNING, + "Failed to parse userinfo; unexpected '@'"); + return NULL; + } + host = url->ptr + 1; + if (tmp != url->ptr && SUCCESS != parse_userinfo(url, tmp)) { return NULL; } - tmp = ptr + 1; + tmp = url->ptr + 1; break; case '/': @@ -706,37 +649,52 @@ static const char *parse_authority(php_http_url_t *url, const char *ptr, const c case '#': case '\0': /* host delimiter */ - if (tmp != ptr && SUCCESS != parse_hostinfo(url, tmp, ptr)) { + if (tmp != url->ptr && SUCCESS != parse_hostinfo(url, tmp)) { return NULL; } - return ptr; + return url->ptr; } - } while (++ptr <= end); + } while (++url->ptr <= url->end); return NULL; } -static const char *parse_path(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_path(php_http_url_t *url) { - const char *tmp = ptr; + size_t mb; + const char *tmp; TSRMLS_FETCH_FROM_CTX(url->ts); + /* is there actually a path to parse? */ + if (!*url->ptr) { + return url->ptr; + } + tmp = url->ptr; + url->path = &url->buffer[url->offset]; + do { - switch (*ptr) { + switch (*url->ptr) { + case '#': case '?': case '\0': - url->path.len = ptr - tmp; - url->path.str = estrndup(tmp, url->path.len); - return ptr; + /* did we have any path component ? */ + if (tmp != url->ptr) { + url->buffer[url->offset++] = 0; + } else { + url->path = NULL; + } + return url->ptr; case '%': - if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) { + if (url->ptr[1] != '%' && (url->end - url->ptr <= 2 || !isxdigit(*(url->ptr+1)) || !isxdigit(*(url->ptr+2)))) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse path; invalid percent encoding at pos %u in '%s'", - (unsigned) (ptr - tmp), tmp); + (unsigned) (url->ptr - tmp), tmp); return NULL; } - ptr += 2; + url->buffer[url->offset++] = *url->ptr++; + url->buffer[url->offset++] = *url->ptr++; + url->buffer[url->offset++] = *url->ptr; break; case '/': /* yeah, well */ @@ -755,57 +713,52 @@ static const char *parse_path(php_http_url_t *url, const char *ptr, const char * case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* allowed */ + url->buffer[url->offset++] = *url->ptr; break; default: - if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) { - size_t n = parse_utf8mb(url, ptr, end); - - if (n) { - ptr += n; - break; - } - } -#if PHP_HTTP_HAVE_WCHAR - else if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } + if (!(mb = parse_mb(url, PARSE_PATH, url->ptr, url->end, tmp, 0))) { + return NULL; } -#endif - php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse path; unexpected byte 0x%02x pos %u in '%s'", - *ptr, (unsigned) (ptr - tmp), tmp); + url->ptr += mb - 1; } - } while (++ptr <= end); + } while (++url->ptr <= url->end); return NULL; } -static const char *parse_query(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_query(php_http_url_t *url) { - const char *tmp = ptr + !!*ptr; + size_t mb; + const char *tmp = url->ptr + !!*url->ptr; TSRMLS_FETCH_FROM_CTX(url->ts); + /* is there actually a query to parse? */ + if (*url->ptr != '?') { + return url->ptr; + } + + /* skip initial '?' */ + tmp = ++url->ptr; + url->query = &url->buffer[url->offset]; + do { - switch (*ptr) { + switch (*url->ptr) { case '#': case '\0': - url->query.len = ptr - tmp; - url->query.str = estrndup(tmp, url->query.len); - return ptr; + url->buffer[url->offset++] = 0; + return url->ptr; case '%': - if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) { + if (url->ptr[1] != '%' && (url->end - url->ptr <= 2 || !isxdigit(*(url->ptr+1)) || !isxdigit(*(url->ptr+2)))) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse query; invalid percent encoding at pos %u in '%s'", - (unsigned) (ptr - tmp), tmp); + (unsigned) (url->ptr - tmp), tmp); return NULL; } - ptr += 2; + url->buffer[url->offset++] = *url->ptr++; + url->buffer[url->offset++] = *url->ptr++; + url->buffer[url->offset++] = *url->ptr; break; case '?': case '/': /* yeah, well */ @@ -824,51 +777,54 @@ static const char *parse_query(php_http_url_t *url, const char *ptr, const char case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* allowed */ + url->buffer[url->offset++] = *url->ptr; break; default: -#ifdef PHP_HTTP_HAVE_WCHAR - if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } + if (!(mb = parse_mb(url, PARSE_QUERY, url->ptr, url->end, tmp, 0))) { + return NULL; } -#endif - php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse query; unexpected byte 0x%02x at pos %u in '%s'", - *ptr, (unsigned) (ptr - tmp), tmp); + url->ptr += mb - 1; } - } while (++ptr <= end); + } while (++url->ptr <= url->end); return NULL; } -static const char *parse_fragment(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_fragment(php_http_url_t *url) { - const char *tmp = ptr + !!*ptr; + size_t mb; + const char *tmp; TSRMLS_FETCH_FROM_CTX(url->ts); + /* is there actually a fragment to parse? */ + if (*url->ptr != '#') { + return url->ptr; + } + + /* skip initial '#' */ + tmp = ++url->ptr; + url->fragment = &url->buffer[url->offset]; + do { - switch (*ptr) { + switch (*url->ptr) { case '\0': - url->fragment.len = ptr - tmp; - url->fragment.str = estrndup(tmp, url->fragment.len); - return ptr; + url->buffer[url->offset++] = 0; + return url->ptr; case '%': - if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) { + if (url->ptr[1] != '%' && (url->end - url->ptr <= 2 || !isxdigit(*(url->ptr+1)) || !isxdigit(*(url->ptr+2)))) { php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse query; invalid percent encoding at pos %u in '%s'", - (unsigned) (ptr - tmp), tmp); + "Failed to parse fragment; invalid percent encoding at pos %u in '%s'", + (unsigned) (url->ptr - tmp), tmp); return NULL; } - ptr += 2; + url->buffer[url->offset++] = *url->ptr++; + url->buffer[url->offset++] = *url->ptr++; + url->buffer[url->offset++] = *url->ptr; break; - case '?': case '/': /* yeah, well */ + case '?': case '/': case '!': case '$': case '&': case '\'': case '(': case ')': case '*': case '+': case ',': case ';': case '=': /* sub-delims */ case '-': case '.': case '_': case '~': /* unreserved */ @@ -884,54 +840,55 @@ static const char *parse_fragment(php_http_url_t *url, const char *ptr, const ch case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* allowed */ + url->buffer[url->offset++] = *url->ptr; break; default: -#if PHP_HTTP_HAVE_WCHAR - if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } + if (!(mb = parse_mb(url, PARSE_FRAGMENT, url->ptr, url->end, tmp, 0))) { + return NULL; } -#endif - php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse fragment; unexpected byte 0x%02x at pos %u in '%s'", - *ptr, (unsigned) (ptr - tmp), tmp); + url->ptr += mb - 1; } - } while (++ptr <= end); + } while (++url->ptr <= url->end); return NULL; } -static const char *parse_hier(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_hier(php_http_url_t *url) { - if (*ptr == '/') { - if (end - ptr > 1) { - if (*(ptr + 1) == '/') { - if (!(ptr = parse_authority(url, ptr + 2, end))) { + if (*url->ptr == '/') { + if (url->end - url->ptr > 1) { + if (*(url->ptr + 1) == '/') { + url->ptr += 2; + if (!(url->ptr = parse_authority(url))) { return NULL; } } } } - return parse_path(url, ptr, end); + return parse_path(url); } -static const char *parse_scheme(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_scheme(php_http_url_t *url) { - const char *tmp = ptr; + size_t mb; + const char *tmp = url->ptr; do { - switch (*ptr) { + switch (*url->ptr) { case ':': /* scheme delimiter */ - url->scheme.len = ptr - tmp; - url->scheme.str = estrndup(tmp, url->scheme.len); - return ++ptr; + url->scheme = &url->buffer[0]; + url->buffer[url->offset++] = 0; + return ++url->ptr; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': + case '7': case '8': case '9': + case '+': case '-': case '.': + if (url->ptr == tmp) { + return tmp; + } + /* no break */ case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': @@ -940,81 +897,56 @@ static const char *parse_scheme(php_http_url_t *url, const char *ptr, const char case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': - case '0': case '1': case '2': case '3': case '4': case '5': case '6': - case '7': case '8': case '9': - case '+': case '-': case '.': /* scheme part */ + url->buffer[url->offset++] = *url->ptr; break; default: -#ifdef PHP_HTTP_HAVE_WCHAR - if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } + if (!(mb = parse_mb(url, PARSE_SCHEME, url->ptr, url->end, tmp, 1))) { + /* soft fail; parse path next */ + return tmp; } -#endif - /* no scheme */ - return tmp; + url->ptr += mb - 1; } - } while (++ptr != end); + } while (++url->ptr != url->end); return tmp; } -php_http_url_t *php_http_url_init(php_http_url_t *url, const char *str, size_t len, unsigned flags TSRMLS_DC) -{ - const char *ptr, *end = str + len; - zend_bool free_url = !url; +struct parser_state { +}; - if (url) { - memset(url, 0, sizeof(*url)); - } else { - url = ecalloc(1, sizeof(*url)); - } +php_http_url_t *php_http_url_parse(const char *str, size_t len, unsigned flags TSRMLS_DC) +{ + size_t maxlen = 3 * len; + php_http_url_t *url = ecalloc(1, sizeof(*url) + maxlen); + url->end = str + len; + url->ptr = str; url->flags = flags; + url->maxlen = maxlen; TSRMLS_SET_CTX(url->ts); - if ((ptr = str) && !(str = parse_scheme(url, ptr, end))) { - php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL scheme: '%s'", ptr); - if (free_url) { - php_http_url_free(&url); - } else { - php_http_url_dtor(url); - } + if (!parse_scheme(url)) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL scheme: '%s'", url->ptr); + php_http_url_free(&url); return NULL; } - if ((ptr = str) && !(str = parse_hier(url, ptr, end))) { - if (free_url) { - php_http_url_free(&url); - } else { - php_http_url_dtor(url); - } + if (!parse_hier(url)) { + php_http_url_free(&url); return NULL; } - if ((ptr = str) && !(str = parse_query(url, ptr, end))) { - php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL query: '%s'", ptr); - if (free_url) { - php_http_url_free(&url); - } else { - php_http_url_dtor(url); - } + if (!parse_query(url)) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL query: '%s'", url->ptr); + php_http_url_free(&url); return NULL; } - if ((ptr = str) && !(str = parse_fragment(url, ptr, end))) { - php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL fragment: '%s'", ptr); - if (free_url) { - php_http_url_free(&url); - } else { - php_http_url_dtor(url); - } + if (!parse_fragment(url)) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL fragment: '%s'", url->ptr); + php_http_url_free(&url); return NULL; } @@ -1192,23 +1124,49 @@ PHP_METHOD(HttpUrl, parse) char *str; int len; long flags = 0; - php_http_url_t url; + php_http_url_t *url; + zend_error_handling zeh; - if (SUCCESS != zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &flags)) { - return; - } + php_http_expect(SUCCESS == zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &flags), invalid_arg, return); - if (php_http_url_init(&url, str, len, flags TSRMLS_CC)) { - printf(" scheme=(%zu)%s\n", url.scheme.len,url.scheme.str); - printf("username=(%zu)%s\n", url.authority.userinfo.username.len,url.authority.userinfo.username.str); - printf("password=(%zu)%s\n", url.authority.userinfo.password.len,url.authority.userinfo.password.str); - printf(" host=(%zu)%s\n", url.authority.host.len,url.authority.host.str); - printf(" port=%d\n", (int) url.authority.port); - printf(" path=(%zu)%s\n", url.path.len,url.path.str); - printf(" query=(%zu)%s\n", url.query.len,url.query.str); - printf("fragment=(%zu)%s\n", url.fragment.len,url.fragment.str); - php_http_url_dtor(&url); + zend_replace_error_handling(EH_THROW, php_http_exception_bad_url_class_entry, &zeh TSRMLS_CC); + if ((url = php_http_url_parse(str, len, flags TSRMLS_CC))) { + object_init_ex(return_value, php_http_url_class_entry); + if (url->scheme) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("scheme"), url->scheme TSRMLS_CC); + } + if (url->user) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("user"), url->user TSRMLS_CC); + } + if (url->pass) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("pass"), url->pass TSRMLS_CC); + } + if (url->host) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("host"), url->host TSRMLS_CC); + } + if (url->port) { + zend_update_property_long(php_http_url_class_entry, return_value, + ZEND_STRL("port"), url->port TSRMLS_CC); + } + if (url->path) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("path"), url->path TSRMLS_CC); + } + if (url->query) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("query"), url->query TSRMLS_CC); + } + if (url->fragment) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("fragment"), url->fragment TSRMLS_CC); + } + php_http_url_free(&url); } + zend_restore_error_handling(&zeh TSRMLS_CC); } static zend_function_entry php_http_url_methods[] = { @@ -1254,13 +1212,13 @@ PHP_MINIT_FUNCTION(http_url) zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("SANITIZE_PATH"), PHP_HTTP_URL_SANITIZE_PATH TSRMLS_CC); #ifdef PHP_HTTP_HAVE_WCHAR - zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_LOCMB"), PHP_HTTP_URL_PARSE_LOCMB TSRMLS_CC); + zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_MBLOC"), PHP_HTTP_URL_PARSE_MBLOC TSRMLS_CC); #endif - zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_UTF8MB"), PHP_HTTP_URL_PARSE_UTF8MB TSRMLS_CC); + zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_MBUTF8"), PHP_HTTP_URL_PARSE_MBUTF8 TSRMLS_CC); #ifdef PHP_HTTP_HAVE_IDN - zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_LOCIDN"), PHP_HTTP_URL_PARSE_LOCIDN TSRMLS_CC); - zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_UTF8IDN"), PHP_HTTP_URL_PARSE_UTF8IDN TSRMLS_CC); + zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_TOIDN"), PHP_HTTP_URL_PARSE_TOIDN TSRMLS_CC); #endif + zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_TOPCT"), PHP_HTTP_URL_PARSE_TOPCT TSRMLS_CC); return SUCCESS; }