X-Git-Url: https://git.m6w6.name/?p=m6w6%2Fext-http;a=blobdiff_plain;f=php_http_url.c;h=05850e8c42a74482c90a8725147ebf4d4d36f751;hp=bed03d6ae5bf84a42c5db36cbb3e8ead43bbaaf0;hb=31636a8ec8578e7e4d45da180d55e1260bb527db;hpb=b11342ec89ef18b55828c089d36924a8ecc3dfd5 diff --git a/php_http_url.c b/php_http_url.c index bed03d6..05850e8 100644 --- a/php_http_url.c +++ b/php_http_url.c @@ -21,10 +21,11 @@ # include #endif -#ifdef HAVE_LANGINFO_H -# include +#ifdef HAVE_ARPA_INET_H +# include #endif -#include + +#include "php_http_utf8.h" static inline char *localhostname(void) { @@ -313,117 +314,44 @@ STATUS php_http_url_encode_hash_ex(HashTable *hash, php_http_buffer_t *qstr, con return SUCCESS; } -void php_http_url_dtor(php_http_url_t *url) -{ - STR_FREE(url->scheme.str); - STR_FREE(url->authority.userinfo.username.str); - STR_FREE(url->authority.userinfo.password.str); - STR_FREE(url->authority.host.str); - STR_FREE(url->path.str); - STR_FREE(url->query.str); - STR_FREE(url->fragment.str); -} +struct parse_state { + php_http_url_t url; +#ifdef ZTS + void ***ts; +#endif + const char *ptr; + const char *end; + size_t maxlen; + off_t offset; + unsigned flags; + char buffer[1]; /* last member */ +}; void php_http_url_free(php_http_url_t **url) { if (*url) { - php_http_url_dtor(*url); efree(*url); *url = NULL; } } -#ifdef PHP_HTTP_HAVE_WCHAR -static zend_bool cs_is_utf8(char **lc_ctype) -{ -#if HAVE_NL_LANGINFO - if (strcmp("UTF-8", nl_langinfo(CODESET))) { - *lc_ctype = setlocale(LC_CTYPE, NULL); - return 0; - } - return 1; -#else - *lc_ctype = setlocale(LC_CTYPE, NULL); - - if (*lc_ctype) { - char *cs; - - if ((cs = strstr(*lc_ctype, ".utf")) || (cs = strstr(*lc_ctype, ".UTF"))) { - if (cs[4] == '-') { - ++cs; - } - if (cs[4] == '8' && (cs[5] == '\0' || cs[5] == '@')) { - return 1; - } - } - return 0; - } -#endif -} - -static const unsigned char utf8mblen[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6 -}; -static const unsigned char utf8mask[] = { - 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 -}; - -static size_t utf8towc(wchar_t *wc, const unsigned char *uc, size_t len) +static size_t parse_mb_utf8(unsigned *wc, const char *ptr, const char *end) { - unsigned char ub = utf8mblen[*uc]; + unsigned wchar; + size_t consumed = utf8towc(&wchar, (const unsigned char *) ptr, end - ptr); - if (!ub || ub > len || ub > 3) { + if (!consumed || consumed == (size_t) -1) { return 0; } - *wc = *uc & utf8mask[ub]; - - switch (ub) { - case 4: - if ((uc[1] & 0xc0) != 0x80) { - return 0; - } - *wc <<= 6; - *wc += *++uc & 0x3f; - /* no break */ - case 3: - if ((uc[1] & 0xc0) != 0x80) { - return 0; - } - *wc <<= 6; - *wc += *++uc & 0x3f; - /* no break */ - case 2: - if ((uc[1] & 0xc0) != 0x80) { - return 0; - } - *wc <<= 6; - *wc += *++uc & 0x3f; - break; - - default: - return 0; + if (wc) { + *wc = wchar; } - - return ub; + return consumed; } -static size_t parse_locmb(php_http_url_t *url, const char *ptr, const char *end) +#ifdef PHP_HTTP_HAVE_WCHAR +static size_t parse_mb_loc(unsigned *wc, const char *ptr, const char *end) { wchar_t wchar; size_t consumed = 0; @@ -435,60 +363,98 @@ static size_t parse_locmb(php_http_url_t *url, const char *ptr, const char *end) consumed = mbtowc(&wchar, ptr, end - ptr); #endif - if (!consumed || consumed == (size_t) -1 || !iswalnum(wchar)) { + if (!consumed || consumed == (size_t) -1) { return 0; } - return consumed - 1; + if (wc) { + *wc = wchar; + } + return consumed; } +#endif -#include "ualpha.h" +typedef enum parse_mb_what { + PARSE_SCHEME, + PARSE_USERINFO, + PARSE_HOSTINFO, + PARSE_PATH, + PARSE_QUERY, + PARSE_FRAGMENT +} parse_mb_what_t; + +static const char * const parse_what[] = { + "scheme", + "userinfo", + "hostinfo", + "path", + "query", + "fragment" +}; + +static const char parse_xdigits[] = "0123456789ABCDEF"; -static zend_bool isualnum(wchar_t ch) +static size_t parse_mb(struct parse_state *state, parse_mb_what_t what, const char *ptr, const char *end, const char *begin, zend_bool silent) { - unsigned i; + unsigned wchar; + size_t consumed = 0; - /* digits */ - if (ch >= 0x30 && ch <= 0x39) { - return 1; + if (state->flags & PHP_HTTP_URL_PARSE_MBUTF8) { + consumed = parse_mb_utf8(&wchar, ptr, end); } - for (i = 0; i < sizeof(utf8_ranges)/sizeof(utf8_range_t); ++i) { - if (utf8_ranges[i].start == ch) { - return 1; - } else if (utf8_ranges[i].start <= ch && utf8_ranges[i].end >= ch) { - if (utf8_ranges[i].step == 1) { - return 1; - } - /* FIXME step */ - return 0; - } +#ifdef PHP_HTTP_HAVE_WCHAR + else if (state->flags & PHP_HTTP_URL_PARSE_MBLOC) { + consumed = parse_mb_loc(&wchar, ptr, end); } - return 0; -} - -static size_t parse_utf8mb(php_http_url_t *url, const char *ptr, const char *end) -{ - char *lc_ctype = NULL; - - if (0 && cs_is_utf8(&lc_ctype)) { - return parse_locmb(url, ptr, end); - } else { - wchar_t wchar; - size_t consumed = utf8towc(&wchar, (const unsigned char *) ptr, end - ptr); +#endif - if (!consumed || consumed == (size_t) -1 || !isualnum(wchar)) { - return 0; + while (consumed) { + if (!(state->flags & PHP_HTTP_URL_PARSE_TOPCT) || what == PARSE_HOSTINFO || what == PARSE_SCHEME) { + if (what == PARSE_HOSTINFO && (state->flags & PHP_HTTP_URL_PARSE_TOIDN)) { + /* idna */ + } else if (state->flags & PHP_HTTP_URL_PARSE_MBUTF8) { + if (!isualnum(wchar)) { + break; + } +#ifdef PHP_HTTP_HAVE_WCHAR + } else if (state->flags & PHP_HTTP_URL_PARSE_MBLOC) { + if (!iswalnum(wchar)) { + break; + } +#endif + } + PHP_HTTP_DUFF(consumed, state->buffer[state->offset++] = *ptr++); + } else { + int i = 0; + + PHP_HTTP_DUFF(consumed, + state->buffer[state->offset++] = '%'; + state->buffer[state->offset++] = parse_xdigits[((unsigned char) ptr[i]) >> 4]; + state->buffer[state->offset++] = parse_xdigits[((unsigned char) ptr[i]) & 0xf]; + ++i; + ); } - return consumed -1 ; + return consumed; } + + if (!silent) { + TSRMLS_FETCH_FROM_CTX(state->ts); + php_error_docref(NULL TSRMLS_CC, E_WARNING, + "Failed to parse %s; unexpected byte 0x%02x at pos %u in '%s'", + parse_what[what], (unsigned char) *ptr, (unsigned) (ptr - begin), begin); + } + + return 0; } -#endif -static STATUS parse_userinfo(php_http_url_t *url, const char *ptr, const char *end) +static STATUS parse_userinfo(struct parse_state *state, const char *ptr) { - const char *password = NULL, *tmp = ptr; - TSRMLS_FETCH_FROM_CTX(url->ts); + size_t mb; + const char *password = NULL, *end = state->ptr, *tmp = ptr; + TSRMLS_FETCH_FROM_CTX(state->ts); + + state->url.user = &state->buffer[state->offset]; do { switch (*ptr) { @@ -500,16 +466,20 @@ static STATUS parse_userinfo(php_http_url_t *url, const char *ptr, const char *e return FAILURE; } password = ptr + 1; + state->buffer[state->offset++] = 0; + state->url.pass = &state->buffer[state->offset]; break; case '%': - if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) { + if (ptr[1] != '%' && (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2)))) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse userinfo; invalid percent encoding at pos %u in '%s'", (unsigned) (ptr - tmp), tmp); return FAILURE; } - ptr += 2; + state->buffer[state->offset++] = *ptr++; + state->buffer[state->offset++] = *ptr++; + state->buffer[state->offset++] = *ptr; break; case '!': case '$': case '&': case '\'': case '(': case ')': case '*': @@ -526,61 +496,69 @@ static STATUS parse_userinfo(php_http_url_t *url, const char *ptr, const char *e case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* allowed */ + state->buffer[state->offset++] = *ptr; break; default: - if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) { - size_t n = parse_utf8mb(url, ptr, end); - - if (n) { - ptr += n; - break; - } - } -#ifdef PHP_HTTP_HAVE_WCHAR - else if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } + if (!(mb = parse_mb(state, PARSE_USERINFO, ptr, end, tmp, 0))) { + return FAILURE; } -#endif - php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse userinfo; unexpected byte 0x%02x at pos %u in '%s'", - *ptr, (unsigned) (ptr - tmp), tmp); + ptr += mb - 1; } } while(++ptr != end); - if (password) { - url->authority.userinfo.username.len = password - tmp - 1; - url->authority.userinfo.username.str = estrndup(tmp, - url->authority.userinfo.username.len); - url->authority.userinfo.password.len = end - password; - url->authority.userinfo.password.str = estrndup(password, - url->authority.userinfo.password.len); - } else { - url->authority.userinfo.username.len = end - tmp; - url->authority.userinfo.username.str = estrndup(tmp, - url->authority.userinfo.username.len); - } + + state->buffer[state->offset++] = 0; return SUCCESS; } -static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *end) +static STATUS parse_hostinfo(struct parse_state *state, const char *ptr) { - const char *tmp = ptr, *port = NULL; - TSRMLS_FETCH_FROM_CTX(url->ts); + size_t mb, len; + const char *end = state->ptr, *tmp = ptr, *port = NULL; + TSRMLS_FETCH_FROM_CTX(state->ts); + + +#ifdef HAVE_INET_PTON + if (*ptr == '[') { + char *error = NULL, *tmp = memchr(ptr, ']', end - ptr); + + if (tmp) { + size_t addrlen = tmp - ptr + 1; + char buf[16], *addr = estrndup(ptr + 1, addrlen - 2); + int rv = inet_pton(AF_INET6, addr, buf); + + efree(addr); + if (rv == 1) { + state->buffer[state->offset] = '['; + state->url.host = &state->buffer[state->offset]; + inet_ntop(AF_INET6, buf, state->url.host + 1, state->maxlen - state->offset); + state->offset += strlen(state->url.host); + state->buffer[state->offset++] = ']'; + state->buffer[state->offset++] = 0; + ptr = tmp + 1; + } else if (rv == -1) { + error = strerror(errno); + } else { + error = "unexpected '['"; + } + } else { + error = "expected ']'"; + } - /* FIXME: IP(v6) addresses */ - do { + if (error) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse hostinfo; %s", error); + return FAILURE; + } + } +#endif + if (ptr != end) do { switch (*ptr) { case ':': if (port) { php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse port; duplicate ':' at pos %u in '%s'", + "Failed to parse port; unexpected ':' at pos %u in '%s'", (unsigned) (ptr - tmp), tmp); return FAILURE; } @@ -588,13 +566,15 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e break; case '%': - if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) { + if (ptr[1] != '%' && (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2)))) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse hostinfo; invalid percent encoding at pos %u in '%s'", (unsigned) (ptr - tmp), tmp); return FAILURE; } - ptr += 2; + state->buffer[state->offset++] = *ptr++; + state->buffer[state->offset++] = *ptr++; + state->buffer[state->offset++] = *ptr; break; case '!': case '$': case '&': case '\'': case '(': case ')': case '*': @@ -611,7 +591,7 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e if (port) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse port; unexpected char '%c' at pos %u in '%s'", - *ptr, (unsigned) (ptr - tmp), tmp); + (unsigned char) *ptr, (unsigned) (ptr - tmp), tmp); return FAILURE; } /* no break */ @@ -619,67 +599,53 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e case '7': case '8': case '9': /* allowed */ if (port) { - url->authority.port *= 10; - url->authority.port += *ptr - '0'; + state->url.port *= 10; + state->url.port += *ptr - '0'; + } else { + state->buffer[state->offset++] = *ptr; } break; default: - if (!port) { - if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) { - size_t n = parse_utf8mb(url, ptr, end); - - if (n) { - ptr += n; - break; - } - } -#ifdef PHP_HTTP_HAVE_WCHAR - else if ((url->flags & PHP_HTTP_URL_PARSE_LOCMB) || (url->flags & PHP_HTTP_URL_PARSE_LOCIDN)) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } - } -#endif + if (port) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, + "Failed to parse port; unexpected byte 0x%02x at pos %u in '%s'", + (unsigned char) *ptr, (unsigned) (ptr - tmp), tmp); + return FAILURE; + } else if (!(mb = parse_mb(state, PARSE_HOSTINFO, ptr, end, tmp, 0))) { + return FAILURE; } - php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse hostinfo; unexpected byte 0x%02x at pos %u in '%s'", - (unsigned char) *ptr, (unsigned) (ptr - tmp), tmp); - return FAILURE; + ptr += mb - 1; } } while (++ptr != end); - if (port) { - url->authority.host.len = port - tmp - 1; - } else { - url->authority.host.len = end - tmp; + if (!state->url.host) { + len = (port ? port - tmp - 1 : end - tmp); + state->url.host = &state->buffer[state->offset - len]; + state->buffer[state->offset++] = 0; } - url->authority.host.str = estrndup(tmp, url->authority.host.len); - #ifdef PHP_HTTP_HAVE_IDN - if (url->flags & PHP_HTTP_URL_PARSE_UTF8IDN) { + if (state->flags & PHP_HTTP_URL_PARSE_TOIDN) { char *idn = NULL; - int rv = idna_to_ascii_8z(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); + int rv = -1; - if (rv != IDNA_SUCCESS) { - php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Failed to parse IDN: '%s'", idna_strerror(rv)); - } else { - STR_SET(url->authority.host.str, estrdup(idn)); - free(idn); + if (state->flags & PHP_HTTP_URL_PARSE_MBUTF8) { + rv = idna_to_ascii_8z(state->url.host, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); } - } else if (url->flags & PHP_HTTP_URL_PARSE_LOCIDN) { - char *idn = NULL; - int rv = idna_to_ascii_lz(url->authority.host.str, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); - +# ifdef PHP_HTTP_HAVE_WCHAR + else if (state->flags & PHP_HTTP_URL_PARSE_MBLOC) { + rv = idna_to_ascii_lz(state->url.host, &idn, IDNA_ALLOW_UNASSIGNED|IDNA_USE_STD3_ASCII_RULES); + } +# endif if (rv != IDNA_SUCCESS) { - php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Failed to parse IDN: '%s'", idna_strerror(rv)); + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse IDN; %s", idna_strerror(rv)); + return FAILURE; } else { - STR_SET(url->authority.host.str, estrdup(idn)); + size_t idnlen = strlen(idn); + memcpy(state->url.host, idn, idnlen + 1); free(idn); + state->offset += idnlen - len; } } #endif @@ -687,18 +653,25 @@ static STATUS parse_hostinfo(php_http_url_t *url, const char *ptr, const char *e return SUCCESS; } -static const char *parse_authority(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_authority(struct parse_state *state) { - const char *tmp = ptr; + const char *tmp = state->ptr, *host = NULL; do { - switch (*ptr) { + switch (*state->ptr) { case '@': /* userinfo delimiter */ - if (tmp != ptr && SUCCESS != parse_userinfo(url, tmp, ptr)) { + if (host) { + TSRMLS_FETCH_FROM_CTX(state->ts); + php_error_docref(NULL TSRMLS_CC, E_WARNING, + "Failed to parse userinfo; unexpected '@'"); + return NULL; + } + host = state->ptr + 1; + if (tmp != state->ptr && SUCCESS != parse_userinfo(state, tmp)) { return NULL; } - tmp = ptr + 1; + tmp = state->ptr + 1; break; case '/': @@ -706,37 +679,52 @@ static const char *parse_authority(php_http_url_t *url, const char *ptr, const c case '#': case '\0': /* host delimiter */ - if (tmp != ptr && SUCCESS != parse_hostinfo(url, tmp, ptr)) { + if (tmp != state->ptr && SUCCESS != parse_hostinfo(state, tmp)) { return NULL; } - return ptr; + return state->ptr; } - } while (++ptr <= end); + } while (++state->ptr <= state->end); return NULL; } -static const char *parse_path(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_path(struct parse_state *state) { - const char *tmp = ptr; - TSRMLS_FETCH_FROM_CTX(url->ts); + size_t mb; + const char *tmp; + TSRMLS_FETCH_FROM_CTX(state->ts); + + /* is there actually a path to parse? */ + if (!*state->ptr) { + return state->ptr; + } + tmp = state->ptr; + state->url.path = &state->buffer[state->offset]; do { - switch (*ptr) { + switch (*state->ptr) { + case '#': case '?': case '\0': - url->path.len = ptr - tmp; - url->path.str = estrndup(tmp, url->path.len); - return ptr; + /* did we have any path component ? */ + if (tmp != state->ptr) { + state->buffer[state->offset++] = 0; + } else { + state->url.path = NULL; + } + return state->ptr; case '%': - if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) { + if (state->ptr[1] != '%' && (state->end - state->ptr <= 2 || !isxdigit(*(state->ptr+1)) || !isxdigit(*(state->ptr+2)))) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse path; invalid percent encoding at pos %u in '%s'", - (unsigned) (ptr - tmp), tmp); + (unsigned) (state->ptr - tmp), tmp); return NULL; } - ptr += 2; + state->buffer[state->offset++] = *state->ptr++; + state->buffer[state->offset++] = *state->ptr++; + state->buffer[state->offset++] = *state->ptr; break; case '/': /* yeah, well */ @@ -755,57 +743,52 @@ static const char *parse_path(php_http_url_t *url, const char *ptr, const char * case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* allowed */ + state->buffer[state->offset++] = *state->ptr; break; default: - if (url->flags & PHP_HTTP_URL_PARSE_UTF8MB) { - size_t n = parse_utf8mb(url, ptr, end); - - if (n) { - ptr += n; - break; - } - } -#if PHP_HTTP_HAVE_WCHAR - else if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } + if (!(mb = parse_mb(state, PARSE_PATH, state->ptr, state->end, tmp, 0))) { + return NULL; } -#endif - php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse path; unexpected byte 0x%02x pos %u in '%s'", - *ptr, (unsigned) (ptr - tmp), tmp); + state->ptr += mb - 1; } - } while (++ptr <= end); + } while (++state->ptr <= state->end); return NULL; } -static const char *parse_query(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_query(struct parse_state *state) { - const char *tmp = ptr + !!*ptr; - TSRMLS_FETCH_FROM_CTX(url->ts); + size_t mb; + const char *tmp = state->ptr + !!*state->ptr; + TSRMLS_FETCH_FROM_CTX(state->ts); + + /* is there actually a query to parse? */ + if (*state->ptr != '?') { + return state->ptr; + } + + /* skip initial '?' */ + tmp = ++state->ptr; + state->url.query = &state->buffer[state->offset]; do { - switch (*ptr) { + switch (*state->ptr) { case '#': case '\0': - url->query.len = ptr - tmp; - url->query.str = estrndup(tmp, url->query.len); - return ptr; + state->buffer[state->offset++] = 0; + return state->ptr; case '%': - if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) { + if (state->ptr[1] != '%' && (state->end - state->ptr <= 2 || !isxdigit(*(state->ptr+1)) || !isxdigit(*(state->ptr+2)))) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse query; invalid percent encoding at pos %u in '%s'", - (unsigned) (ptr - tmp), tmp); + (unsigned) (state->ptr - tmp), tmp); return NULL; } - ptr += 2; + state->buffer[state->offset++] = *state->ptr++; + state->buffer[state->offset++] = *state->ptr++; + state->buffer[state->offset++] = *state->ptr; break; case '?': case '/': /* yeah, well */ @@ -824,51 +807,54 @@ static const char *parse_query(php_http_url_t *url, const char *ptr, const char case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* allowed */ + state->buffer[state->offset++] = *state->ptr; break; default: -#ifdef PHP_HTTP_HAVE_WCHAR - if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } + if (!(mb = parse_mb(state, PARSE_QUERY, state->ptr, state->end, tmp, 0))) { + return NULL; } -#endif - php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse query; unexpected byte 0x%02x at pos %u in '%s'", - *ptr, (unsigned) (ptr - tmp), tmp); + state->ptr += mb - 1; } - } while (++ptr <= end); + } while (++state->ptr <= state->end); return NULL; } -static const char *parse_fragment(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_fragment(struct parse_state *state) { - const char *tmp = ptr + !!*ptr; - TSRMLS_FETCH_FROM_CTX(url->ts); + size_t mb; + const char *tmp; + TSRMLS_FETCH_FROM_CTX(state->ts); + + /* is there actually a fragment to parse? */ + if (*state->ptr != '#') { + return state->ptr; + } + + /* skip initial '#' */ + tmp = ++state->ptr; + state->url.fragment = &state->buffer[state->offset]; do { - switch (*ptr) { + switch (*state->ptr) { case '\0': - url->fragment.len = ptr - tmp; - url->fragment.str = estrndup(tmp, url->fragment.len); - return ptr; + state->buffer[state->offset++] = 0; + return state->ptr; case '%': - if (end - ptr <= 2 || !isxdigit(*(ptr+1)) || !isxdigit(*(ptr+2))) { + if (state->ptr[1] != '%' && (state->end - state->ptr <= 2 || !isxdigit(*(state->ptr+1)) || !isxdigit(*(state->ptr+2)))) { php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse query; invalid percent encoding at pos %u in '%s'", - (unsigned) (ptr - tmp), tmp); + "Failed to parse fragment; invalid percent encoding at pos %u in '%s'", + (unsigned) (state->ptr - tmp), tmp); return NULL; } - ptr += 2; + state->buffer[state->offset++] = *state->ptr++; + state->buffer[state->offset++] = *state->ptr++; + state->buffer[state->offset++] = *state->ptr; break; - case '?': case '/': /* yeah, well */ + case '?': case '/': case '!': case '$': case '&': case '\'': case '(': case ')': case '*': case '+': case ',': case ';': case '=': /* sub-delims */ case '-': case '.': case '_': case '~': /* unreserved */ @@ -884,54 +870,55 @@ static const char *parse_fragment(php_http_url_t *url, const char *ptr, const ch case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* allowed */ + state->buffer[state->offset++] = *state->ptr; break; default: -#if PHP_HTTP_HAVE_WCHAR - if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } + if (!(mb = parse_mb(state, PARSE_FRAGMENT, state->ptr, state->end, tmp, 0))) { + return NULL; } -#endif - php_error_docref(NULL TSRMLS_CC, E_WARNING, - "Failed to parse fragment; unexpected byte 0x%02x at pos %u in '%s'", - *ptr, (unsigned) (ptr - tmp), tmp); + state->ptr += mb - 1; } - } while (++ptr <= end); + } while (++state->ptr <= state->end); return NULL; } -static const char *parse_hier(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_hier(struct parse_state *state) { - if (*ptr == '/') { - if (end - ptr > 1) { - if (*(ptr + 1) == '/') { - if (!(ptr = parse_authority(url, ptr + 2, end))) { + if (*state->ptr == '/') { + if (state->end - state->ptr > 1) { + if (*(state->ptr + 1) == '/') { + state->ptr += 2; + if (!(state->ptr = parse_authority(state))) { return NULL; } } } } - return parse_path(url, ptr, end); + return parse_path(state); } -static const char *parse_scheme(php_http_url_t *url, const char *ptr, const char *end) +static const char *parse_scheme(struct parse_state *state) { - const char *tmp = ptr; + size_t mb; + const char *tmp = state->ptr; do { - switch (*ptr) { + switch (*state->ptr) { case ':': /* scheme delimiter */ - url->scheme.len = ptr - tmp; - url->scheme.str = estrndup(tmp, url->scheme.len); - return ++ptr; + state->url.scheme = &state->buffer[0]; + state->buffer[state->offset++] = 0; + return ++state->ptr; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': + case '7': case '8': case '9': + case '+': case '-': case '.': + if (state->ptr == tmp) { + return tmp; + } + /* no break */ case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': @@ -940,85 +927,57 @@ static const char *parse_scheme(php_http_url_t *url, const char *ptr, const char case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': - case '0': case '1': case '2': case '3': case '4': case '5': case '6': - case '7': case '8': case '9': - case '+': case '-': case '.': /* scheme part */ + state->buffer[state->offset++] = *state->ptr; break; default: -#ifdef PHP_HTTP_HAVE_WCHAR - if (url->flags & PHP_HTTP_URL_PARSE_LOCMB) { - size_t n = parse_locmb(url, ptr, end); - - if (n) { - ptr += n; - break; - } + if (!(mb = parse_mb(state, PARSE_SCHEME, state->ptr, state->end, tmp, 1))) { + /* soft fail; parse path next */ + return tmp; } -#endif - /* no scheme */ - return tmp; + state->ptr += mb - 1; } - } while (++ptr != end); + } while (++state->ptr != state->end); return tmp; } -php_http_url_t *php_http_url_init(php_http_url_t *url, const char *str, size_t len, unsigned flags TSRMLS_DC) +php_http_url_t *php_http_url_parse(const char *str, size_t len, unsigned flags TSRMLS_DC) { - const char *ptr, *end = str + len; - zend_bool free_url = !url; - - if (url) { - memset(url, 0, sizeof(*url)); - } else { - url = ecalloc(1, sizeof(*url)); - } - - url->flags = flags; - TSRMLS_SET_CTX(url->ts); - - if ((ptr = str) && !(str = parse_scheme(url, ptr, end))) { - php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL scheme: '%s'", ptr); - if (free_url) { - php_http_url_free(&url); - } else { - php_http_url_dtor(url); - } + size_t maxlen = 3 * len; + struct parse_state *state = ecalloc(1, sizeof(*state) + maxlen); + + state->end = str + len; + state->ptr = str; + state->flags = flags; + state->maxlen = maxlen; + TSRMLS_SET_CTX(state->ts); + + if (!parse_scheme(state)) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL scheme: '%s'", state->ptr); + efree(state); return NULL; } - if ((ptr = str) && !(str = parse_hier(url, ptr, end))) { - if (free_url) { - php_http_url_free(&url); - } else { - php_http_url_dtor(url); - } + if (!parse_hier(state)) { + efree(state); return NULL; } - if ((ptr = str) && !(str = parse_query(url, ptr, end))) { - php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL query: '%s'", ptr); - if (free_url) { - php_http_url_free(&url); - } else { - php_http_url_dtor(url); - } + if (!parse_query(state)) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL query: '%s'", state->ptr); + efree(state); return NULL; } - if ((ptr = str) && !(str = parse_fragment(url, ptr, end))) { - php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL fragment: '%s'", ptr); - if (free_url) { - php_http_url_free(&url); - } else { - php_http_url_dtor(url); - } + if (!parse_fragment(state)) { + php_error_docref(NULL TSRMLS_CC, E_WARNING, "Failed to parse URL fragment: '%s'", state->ptr); + efree(state); return NULL; } - return url; + return (php_http_url_t *) state; } ZEND_BEGIN_ARG_INFO_EX(ai_HttpUrl___construct, 0, 0, 0) @@ -1192,23 +1151,49 @@ PHP_METHOD(HttpUrl, parse) char *str; int len; long flags = 0; - php_http_url_t url; + php_http_url_t *url; + zend_error_handling zeh; - if (SUCCESS != zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &flags)) { - return; - } + php_http_expect(SUCCESS == zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, &flags), invalid_arg, return); - if (php_http_url_init(&url, str, len, flags TSRMLS_CC)) { - printf(" scheme=(%zu)%s\n", url.scheme.len,url.scheme.str); - printf("username=(%zu)%s\n", url.authority.userinfo.username.len,url.authority.userinfo.username.str); - printf("password=(%zu)%s\n", url.authority.userinfo.password.len,url.authority.userinfo.password.str); - printf(" host=(%zu)%s\n", url.authority.host.len,url.authority.host.str); - printf(" port=%d\n", (int) url.authority.port); - printf(" path=(%zu)%s\n", url.path.len,url.path.str); - printf(" query=(%zu)%s\n", url.query.len,url.query.str); - printf("fragment=(%zu)%s\n", url.fragment.len,url.fragment.str); - php_http_url_dtor(&url); + zend_replace_error_handling(EH_THROW, php_http_exception_bad_url_class_entry, &zeh TSRMLS_CC); + if ((url = php_http_url_parse(str, len, flags TSRMLS_CC))) { + object_init_ex(return_value, php_http_url_class_entry); + if (url->scheme) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("scheme"), url->scheme TSRMLS_CC); + } + if (url->user) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("user"), url->user TSRMLS_CC); + } + if (url->pass) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("pass"), url->pass TSRMLS_CC); + } + if (url->host) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("host"), url->host TSRMLS_CC); + } + if (url->port) { + zend_update_property_long(php_http_url_class_entry, return_value, + ZEND_STRL("port"), url->port TSRMLS_CC); + } + if (url->path) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("path"), url->path TSRMLS_CC); + } + if (url->query) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("query"), url->query TSRMLS_CC); + } + if (url->fragment) { + zend_update_property_string(php_http_url_class_entry, return_value, + ZEND_STRL("fragment"), url->fragment TSRMLS_CC); + } + php_http_url_free(&url); } + zend_restore_error_handling(&zeh TSRMLS_CC); } static zend_function_entry php_http_url_methods[] = { @@ -1254,13 +1239,13 @@ PHP_MINIT_FUNCTION(http_url) zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("SANITIZE_PATH"), PHP_HTTP_URL_SANITIZE_PATH TSRMLS_CC); #ifdef PHP_HTTP_HAVE_WCHAR - zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_LOCMB"), PHP_HTTP_URL_PARSE_LOCMB TSRMLS_CC); + zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_MBLOC"), PHP_HTTP_URL_PARSE_MBLOC TSRMLS_CC); #endif - zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_UTF8MB"), PHP_HTTP_URL_PARSE_UTF8MB TSRMLS_CC); + zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_MBUTF8"), PHP_HTTP_URL_PARSE_MBUTF8 TSRMLS_CC); #ifdef PHP_HTTP_HAVE_IDN - zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_LOCIDN"), PHP_HTTP_URL_PARSE_LOCIDN TSRMLS_CC); - zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_UTF8IDN"), PHP_HTTP_URL_PARSE_UTF8IDN TSRMLS_CC); + zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_TOIDN"), PHP_HTTP_URL_PARSE_TOIDN TSRMLS_CC); #endif + zend_declare_class_constant_long(php_http_url_class_entry, ZEND_STRL("PARSE_TOPCT"), PHP_HTTP_URL_PARSE_TOPCT TSRMLS_CC); return SUCCESS; }