php中parse_url函式的原始碼及分析(scheme部分)
前言
看師傅們的文章時發現,parse_url出現的次數較多,單純parse_url解析漏洞的考題也有很多,在此研究一下原始碼(太菜了看不懂,待日後再補充Orz)
原始碼
在ext/standard/url.c檔案中
/*
 * php_url_parse_ex - split a URL string into its components.
 *
 * str    - start of the URL text.
 * length - number of bytes of `str` to parse; `ue = str + length` is used as
 *          the end bound by the scans below.
 *
 * Returns a php_url allocated with ecalloc() whose string members are
 * estrndup() copies of the matching pieces of `str`.  Members for components
 * that were not found are left exactly as ecalloc() initialised them.
 * Returns NULL, after efree()ing what was allocated so far, when:
 *   - a port was found but is not in the range 1..65535,
 *   - the port text following the host is longer than 5 bytes,
 *   - parse_port finds nothing at all after the ':',
 *   - the host part is empty.
 *
 * The function is a small goto-driven state machine with three labels:
 * parse_port, parse_host and just_path.  The scheme section decides which
 * one is entered first.
 *
 * NOTE(review): php_replace_controlchars_ex() is defined elsewhere; presumably
 * it rewrites control characters in the copied component in place - confirm.
 */
PHPAPI php_url *php_url_parse_ex(char const *str, size_t length)
{
    /* Scratch buffer for the port text: at most 5 digits plus the NUL. */
    char port_buf[6];
    php_url *ret = ecalloc(1, sizeof(php_url));
    /*
     * s     - start of the part of the input that is still unparsed
     * e     - end of the component being examined (initially the first ':')
     * p, pp - scratch cursors
     * ue    - one past the last input byte
     */
    char const *s, *e, *p, *pp, *ue;

    s = str;
    ue = s + length;

    /* parse scheme: only attempted when a ':' exists and is not the first byte */
    if ((e = memchr(s, ':', length)) && e != s) {
        /* validate scheme */
        p = s;
        while (p < e) {
            /* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */
            /*
             * NOTE(review): *p is a plain char passed to isalpha()/isdigit();
             * where char is signed, bytes >= 0x80 are negative - confirm this
             * is acceptable for the inputs expected here.
             */
            if (!isalpha(*p) && !isdigit(*p) && *p != '+' && *p != '.' && *p != '-') {
                /*
                 * An invalid character was found, so the text before ':' is
                 * not a scheme.  If the ':' is not the last byte and lies
                 * before the first '?' or '#', try to read what follows it as
                 * a port.
                 *
                 * NOTE(review): strcspn() stops at the first NUL byte rather
                 * than at `ue`, unlike the length-bounded memchr() scans used
                 * in the rest of this function.
                 */
                if (e + 1 < ue && e < s + strcspn(s, "?#")) {
                    goto parse_port;
                } else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */
                    /* Skip the leading "//" and forget the ':' position. */
                    s += 2;
                    e = 0;
                    goto parse_host;
                } else {
                    goto just_path;
                }
            }
            p++;
        }

        /* The ':' is the last byte: everything before it is the scheme. */
        if (e + 1 == ue) { /* only scheme is available */
            ret->scheme = estrndup(s, (e - s));
            php_replace_controlchars_ex(ret->scheme, (e - s));
            return ret;
        }

        /*
         * certain schemas like mailto: and zlib: may not have any / after them
         * this check ensures we support those.
         */
        if (*(e+1) != '/') {
            /*
             * check if the data we get is a port this allows us to
             * correctly parse things like a.com:80
             */
            p = e + 1;
            while (p < ue && isdigit(*p)) {
                p++;
            }

            /*
             * (p - e) counts the ':' plus the digits, so "< 7" allows at most
             * 5 digits, and they must run to the end of input or to a '/'.
             */
            if ((p == ue || *p == '/') && (p - e) < 7) {
                goto parse_port;
            }

            /* Not a port: keep the scheme and treat the rest as a path. */
            ret->scheme = estrndup(s, (e-s));
            php_replace_controlchars_ex(ret->scheme, (e - s));

            s = e + 1;
            goto just_path;
        } else {
            /* The ':' is followed by '/': the text before it is the scheme. */
            ret->scheme = estrndup(s, (e-s));
            php_replace_controlchars_ex(ret->scheme, (e - s));

            if (e + 2 < ue && *(e + 2) == '/') {
                /* "scheme://": continue after the two slashes. */
                s = e + 3;
                /*
                 * sizeof("file") is 5, so the comparison includes the
                 * terminator: the scheme must be exactly "file", ignoring case.
                 */
                if (!strncasecmp("file", ret->scheme, sizeof("file"))) {
                    if (e + 3 < ue && *(e + 3) == '/') {
                        /*
                         * support windows drive letters as in:
                         * file:///c:/somedir/file.txt
                         * Without a drive letter s stays on the third '/'.
                         */
                        if (e + 5 < ue && *(e + 5) == ':') {
                            s = e + 4;
                        }
                        goto just_path;
                    }
                }
                /* Any other "scheme://..." falls through to parse_host. */
            } else {
                /* "scheme:/..." with a single slash: the rest is a path. */
                s = e + 1;
                goto just_path;
            }
        }
    } else if (e) { /* no scheme; starts with colon: look for port */
        parse_port:
        /* Scan up to 6 digits after the ':'; p marks the start, pp the end. */
        p = e + 1;
        pp = p;

        while (pp < ue && pp - p < 6 && isdigit(*pp)) {
            pp++;
        }

        /* 1 to 5 digits that run to the end of input or to a '/'. */
        if (pp - p > 0 && pp - p < 6 && (pp == ue || *pp == '/')) {
            zend_long port;
            memcpy(port_buf, p, (pp - p));
            port_buf[pp - p] = '\0';
            port = ZEND_STRTOL(port_buf, NULL, 10);
            if (port > 0 && port <= 65535) {
                ret->port = (unsigned short) port;
                if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */
                    s += 2;
                }
            } else {
                /* Port out of range: reject the whole URL. */
                if (ret->scheme) efree(ret->scheme);
                efree(ret);
                return NULL;
            }
        } else if (p == pp && pp == ue) {
            /* Nothing at all follows the ':': reject the whole URL. */
            if (ret->scheme) efree(ret->scheme);
            efree(ret);
            return NULL;
        } else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */
            s += 2;
        } else {
            goto just_path;
        }
    } else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */
        /* No ':' anywhere, but the input starts with "//": parse a host. */
        s += 2;
    } else {
        /* No ':' and no leading "//": the whole input is path/query/fragment. */
        goto just_path;
    }

    parse_host:
    /* Binary-safe strcspn(s, "/?#"): e ends up on the earliest of the three, or ue */
    e = ue;
    if ((p = memchr(s, '/', e - s))) {
        e = p;
    }
    if ((p = memchr(s, '?', e - s))) {
        e = p;
    }
    if ((p = memchr(s, '#', e - s))) {
        e = p;
    }

    /* check for login and password: "user[:pass]@" in front of the host */
    if ((p = zend_memrchr(s, '@', (e-s)))) {
        if ((pp = memchr(s, ':', (p-s)))) {
            ret->user = estrndup(s, (pp-s));
            php_replace_controlchars_ex(ret->user, (pp - s));

            pp++;
            ret->pass = estrndup(pp, (p-pp));
            php_replace_controlchars_ex(ret->pass, (p-pp));
        } else {
            ret->user = estrndup(s, (p-s));
            php_replace_controlchars_ex(ret->user, (p-s));
        }

        /* The host starts right after the '@'. */
        s = p + 1;
    }

    /* check for port */
    if (s < ue && *s == '[' && *(e-1) == ']') {
        /* Short circuit portscan,
           we're dealing with an
           IPv6 embedded address */
        p = NULL;
    } else {
        p = zend_memrchr(s, ':', (e-s));
    }

    if (p) {
        /* Only parse the port here if parse_port has not already set one. */
        if (!ret->port) {
            p++;
            if (e-p > 5) { /* port cannot be longer than 5 characters */
                if (ret->scheme) efree(ret->scheme);
                if (ret->user) efree(ret->user);
                if (ret->pass) efree(ret->pass);
                efree(ret);
                return NULL;
            } else if (e - p > 0) {
                zend_long port;
                memcpy(port_buf, p, (e - p));
                port_buf[e - p] = '\0';
                port = ZEND_STRTOL(port_buf, NULL, 10);
                if (port > 0 && port <= 65535) {
                    ret->port = (unsigned short)port;
                } else {
                    if (ret->scheme) efree(ret->scheme);
                    if (ret->user) efree(ret->user);
                    if (ret->pass) efree(ret->pass);
                    efree(ret);
                    return NULL;
                }
            }
            /* Step back onto the ':' so that p marks the end of the host. */
            p--;
        }
    } else {
        /* No port separator: the host runs up to e. */
        p = e;
    }

    /* check if we have a valid host, if we don't reject the string as url */
    if ((p-s) < 1) {
        if (ret->scheme) efree(ret->scheme);
        if (ret->user) efree(ret->user);
        if (ret->pass) efree(ret->pass);
        efree(ret);
        return NULL;
    }

    ret->host = estrndup(s, (p-s));
    php_replace_controlchars_ex(ret->host, (p - s));

    /* Nothing after the authority part: done. */
    if (e == ue) {
        return ret;
    }

    s = e;

    just_path:

    /* Peel the fragment off the end first, then the query, then the path. */
    e = ue;
    p = memchr(s, '#', (e - s));
    if (p) {
        p++;
        /* An empty fragment ("...#") is not stored. */
        if (p < e) {
            ret->fragment = estrndup(p, (e - p));
            php_replace_controlchars_ex(ret->fragment, (e - p));
        }
        /* e now points at the '#', which ends the query/path. */
        e = p-1;
    }

    p = memchr(s, '?', (e - s));
    if (p) {
        p++;
        /* An empty query ("...?") is not stored. */
        if (p < e) {
            ret->query = estrndup(p, (e - p));
            php_replace_controlchars_ex(ret->query, (e - p));
        }
        /* e now points at the '?', which ends the path. */
        e = p-1;
    }

    /* Store the path if it is non-empty, or if nothing was left to parse at all. */
    if (s < e || s == ue) {
        ret->path = estrndup(s, (e - s));
        php_replace_controlchars_ex(ret->path, (e - s));
    }

    return ret;
}
/* {{{ proto mixed parse_url(string url, [int url_component])
   Parse a URL and return its components.

   With only `url`, the result is an array holding an entry for every
   component that php_url_parse_ex() found.  With `url_component`, only that
   component is returned.  FALSE is returned when php_url_parse_ex() rejects
   the URL, and FALSE plus an E_WARNING when `url_component` is not one of
   the PHP_URL_* identifiers handled below. */
PHP_FUNCTION(parse_url)
{
    char *str;
    size_t str_len;
    php_url *resource;
    /* -1 means "no component requested"; it is only overwritten by the optional argument. */
    zend_long key = -1;

    /*
     * "s|l": one string argument, then (after '|') one optional integer.
     * The string specifier receives two outputs, the pointer (str) and the
     * length (str_len); the integer goes to key.
     */
    if (zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &str, &str_len, &key) == FAILURE) {
        return;
    }

    resource = php_url_parse_ex(str, str_len);
    if (resource == NULL) {
        /* @todo Find a method to determine why php_url_parse_ex() failed */
        RETURN_FALSE;
    }

    if (key > -1) {
        /*
         * A single component was requested.  When that component is absent no
         * RETVAL_* macro runs, so return_value keeps whatever the engine
         * initialised it to (NOTE(review): presumably NULL - confirm).
         */
        switch (key) {
            case PHP_URL_SCHEME:
                if (resource->scheme != NULL) RETVAL_STRING(resource->scheme);
                break;
            case PHP_URL_HOST:
                if (resource->host != NULL) RETVAL_STRING(resource->host);
                break;
            case PHP_URL_PORT:
                /* Port 0 is treated as "no port". */
                if (resource->port != 0) RETVAL_LONG(resource->port);
                break;
            case PHP_URL_USER:
                if (resource->user != NULL) RETVAL_STRING(resource->user);
                break;
            case PHP_URL_PASS:
                if (resource->pass != NULL) RETVAL_STRING(resource->pass);
                break;
            case PHP_URL_PATH:
                if (resource->path != NULL) RETVAL_STRING(resource->path);
                break;
            case PHP_URL_QUERY:
                if (resource->query != NULL) RETVAL_STRING(resource->query);
                break;
            case PHP_URL_FRAGMENT:
                if (resource->fragment != NULL) RETVAL_STRING(resource->fragment);
                break;
            default:
                php_error_docref(NULL, E_WARNING, "Invalid URL component identifier " ZEND_LONG_FMT, key);
                RETVAL_FALSE;
        }
        /* Skip the array construction but still release the parsed URL. */
        goto done;
    }

    /* allocate an array for return */
    array_init(return_value);

    /* add the various elements to the array; absent components get no key */
    if (resource->scheme != NULL)
        add_assoc_string(return_value, "scheme", resource->scheme);
    if (resource->host != NULL)
        add_assoc_string(return_value, "host", resource->host);
    if (resource->port != 0)
        add_assoc_long(return_value, "port", resource->port);
    if (resource->user != NULL)
        add_assoc_string(return_value, "user", resource->user);
    if (resource->pass != NULL)
        add_assoc_string(return_value, "pass", resource->pass);
    if (resource->path != NULL)
        add_assoc_string(return_value, "path", resource->path);
    if (resource->query != NULL)
        add_assoc_string(return_value, "query", resource->query);
    if (resource->fragment != NULL)
        add_assoc_string(return_value, "fragment", resource->fragment);
done:
    /* Both the single-component and the array path release the parsed URL here. */
    php_url_free(resource);
}
程式碼中遇到的問題解決
函式定義部分
PHP_FUNCTION(parse_url) { char *str; size_t str_len; php_url *resource; zend_long key = -1; if (zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &str, &str_len, &key) == FAILURE) { return; } resource = php_url_parse_ex(str, str_len); if (resource == NULL) { /* @todo Find a method to determine why php_url_parse_ex() failed */ RETURN_FALSE; }
引用這篇文章的內容http://www.nowamagic.net/librarys/veda/detail/1467
b Boolean
l Integer 整型
d Floating point 浮點型
s String 字串
r Resource 資源
a Array 陣列
o Object instance 物件
O Object instance of a specified type 特定型別的物件
z Non-specific zval 任意型別
Z zval**型別
f 表示函式、方法名稱
那麼其中的"s|l"表示parse_url的第一個引數是必填的字串型;"|"之後的引數都是可選的,所以第二個引數是可省略的整型(省略時key保持初始值-1,函式就會返回包含所有部分的陣列)
php_url型別的宣告在ext/standard/url.h中
typedef struct php_url {
char *scheme;
char *user;
char *pass;
char *host;
unsigned short port;
char *path;
char *query;
char *fragment;
} php_url;
問題
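- parse_url只有兩個引數,不知道str_len這個引數哪裡去了……?還有他的值到底是怎麼獲得的……(補充:從zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &str, &str_len, &key)這個呼叫可以看出,"s|l"只有兩個型別符號,卻傳入了三個輸出指標——"s"會同時填入字串指標str和它的長度str_len,所以str_len不是PHP層的引數,而是由引數解析時自動得到的字串長度)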
函式內部實現部分
使用php_url_parse_ex函式來處理我們傳過去的url,先暫定str_len為str的長度……
/*
 * Scheme section of php_url_parse_ex().  On entry s points at the start of
 * the input and ue one past its end.  e is set to the first ':' (or NULL),
 * and this branch is taken only when that ':' is not the first byte.
 */
if ((e = memchr(s, ':', length)) && e != s) {
/* validate scheme */
p = s;
while (p < e) {
/* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */
if (!isalpha(*p) && !isdigit(*p) && *p != '+' && *p != '.' && *p != '-') {
/*
 * Invalid scheme character.  If the ':' is not the last byte and comes
 * before the first '?' or '#', try what follows it as a port.
 * NOTE(review): strcspn() stops at the first NUL byte, not at ue.
 */
if (e + 1 < ue && e < s + strcspn(s, "?#")) {
goto parse_port;
} else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */
/* skip the leading "//" and forget the ':' position */
s += 2;
e = 0;
goto parse_host;
} else {
goto just_path;
}
}
p++;
}
/* every byte before the ':' is a legal scheme character */
if (e + 1 == ue) { /* only scheme is available */
ret->scheme = estrndup(s, (e - s));
php_replace_controlchars_ex(ret->scheme, (e - s));
return ret;
}
/*
* certain schemas like mailto: and zlib: may not have any / after them
* this check ensures we support those.
*/
if (*(e+1) != '/') {
/* check if the data we get is a port this allows us to
* correctly parse things like a.com:80
*/
p = e + 1;
while (p < ue && isdigit(*p)) {
p++;
}
/* (p - e) counts the ':' plus the digits, so "< 7" allows at most 5 digits */
if ((p == ue || *p == '/') && (p - e) < 7) {
goto parse_port;
}
/* not a port: keep the scheme and treat the rest as a path */
ret->scheme = estrndup(s, (e-s));
php_replace_controlchars_ex(ret->scheme, (e - s));
s = e + 1;
goto just_path;
} else {
/* ':' is followed by '/': the text before it is the scheme */
ret->scheme = estrndup(s, (e-s));
php_replace_controlchars_ex(ret->scheme, (e - s));
if (e + 2 < ue && *(e + 2) == '/') {
/* "scheme://": continue after the two slashes */
s = e + 3;
/* sizeof("file") is 5, so the terminator is compared too: exact match, ignoring case */
if (!strncasecmp("file", ret->scheme, sizeof("file"))) {
if (e + 3 < ue && *(e + 3) == '/') {
/* support windows drive letters as in:
file:///c:/somedir/file.txt
*/
if (e + 5 < ue && *(e + 5) == ':') {
s = e + 4;
}
goto just_path;
}
}
/* any other "scheme://..." falls through to host parsing after this if/else chain */
} else {
/* "scheme:/..." with a single slash: the rest is a path */
s = e + 1;
goto just_path;
}
}
} else if (e) { /* no scheme; starts with colon: look for port */
如果s中含有冒號則e指向冒號 且同時如果冒號不在s的開頭,p指向s
當p還沒有走到冒號(p < e)時持續迴圈,每次迴圈p指向下一位
如果p指向的值是字母或者數字或者是+,-,.則指標指向下一位,這就代表冒號前面的值其實是任意的字母、數字、+、-、.
如果遇到了不屬於上述範圍的字元(即冒號前面不是合法的scheme):若冒號不是str的最後一個字元(e + 1 < ue),且?#在冒號後面(如果有的話),就跳轉到port解析部分
如果str的長度大於1且str的前兩個字元是//,s指向//後面的一個字元,e變為0,跳轉到host解析
如果冒號是最後一位字元,則冒號前面的東西會當作scheme返回
如果冒號後面不是/,則p指向冒號後面一位 當p還沒有到達str末尾(p < ue)且p指向的為數字字元,p一直指向後一位;迴圈結束後,如果p指向str末尾或者p指向的字元為/,同時冒號後面的數字位數小於6位,跳轉到port解析
如果冒號後面既不是「純數字直到結尾」,也不是「數字後面接一個/」(或者數字超過5位),那麼冒號前面的內容就當作scheme,放在ret的scheme引數中,s指向冒號後一位,跳轉到path解析
如果冒號後面是/,那麼冒號前面的內容就當作scheme,放在ret的scheme引數中。如果下面一位也是/(即scheme://),那麼s指向//後面一位;此時如果scheme為file(不分大小寫),且接下來一位還是/(即file:///),再判斷冒號後的第五個字元是不是冒號(為了處理file:///c:):如果是,s指向///後的一位字元,否則s仍指向第三個/,然後跳轉到path解析。其他情況(scheme不是file,或者沒有第三個/)則不跳轉,繼續往下進行host解析
如果冒號後面只有一個/(冒號後第二個字元不是/),s指向冒號後面一位,之後跳轉到path解析
如果冒號在str開頭,那麼進行port解析
姿勢
- 只要請求的url裡不含有冒號(:),且不是以//開頭,就會被當成path解析(以//開頭時會被當作relative-scheme URL,跳過//後進行host解析)