• php中parse_url函数的源码及分析(scheme部分)


    前言

    看师傅们的文章时发现,parse_url出现的次数较多,单纯parse_url解析漏洞的考题也有很多,在此研究一下源码(太菜了看不懂,待日后再补充Orz)

    源码

    ext/standard/url.c文件中

    /*
     * Parse the first `length` bytes of `str` as a URL and return a newly
     * allocated php_url holding the components that were found.
     *
     * str     Start of the URL text; only bytes in [str, str + length) are
     *         meant to be examined (see the NOTE(review) on strcspn below).
     * length  Number of bytes in `str`.
     *
     * Returns the php_url allocated with ecalloc() on success; members for
     * components that are absent are never assigned. Returns NULL, after
     * freeing whatever was allocated so far, when the input is rejected:
     *   - a port that does not fall in 1..65535,
     *   - a port field longer than five characters,
     *   - a ':' with nothing after it on the parse_port path,
     *   - an empty host.
     * A non-NULL result is released by the caller (parse_url uses
     * php_url_free()).
     *
     * The function is a goto-driven state machine with three stages:
     * scheme/port detection, parse_host (user, pass, host, port) and
     * just_path (fragment, query, path).
     */
    PHPAPI php_url *php_url_parse_ex(char const *str, size_t length)
    {
        char port_buf[6]; /* up to five digits plus the terminating NUL */
        php_url *ret = ecalloc(1, sizeof(php_url));
        char const *s, *e, *p, *pp, *ue;
    
        s = str;          /* start of the part still to be parsed */
        ue = s + length;  /* one past the last byte of the input */
    
        /* parse scheme */
        if ((e = memchr(s, ':', length)) && e != s) {
            /* validate scheme */
            p = s;
            while (p < e) {
                /* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */
                if (!isalpha(*p) && !isdigit(*p) && *p != '+' && *p != '.' && *p != '-') {
                    /* The text before ':' is not a valid scheme, so the ':' is
                     * either a port separator, part of a //host URL, or just
                     * part of a path.
                     * NOTE(review): strcspn() stops at a NUL byte, not at ue,
                     * unlike the memchr-based scan in parse_host - confirm that
                     * callers always pass NUL-terminated input. */
                    if (e + 1 < ue && e < s + strcspn(s, "?#")) {
                        goto parse_port;
                    } else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */
                        s += 2;
                        e = 0;
                        goto parse_host;
                    } else {
                        goto just_path;
                    }
                }
                p++;
            }
    
            if (e + 1 == ue) { /* only scheme is available */
                ret->scheme = estrndup(s, (e - s));
                php_replace_controlchars_ex(ret->scheme, (e - s));
                return ret;
            }
    
            /*
             * certain schemas like mailto: and zlib: may not have any / after them
             * this check ensures we support those.
             */
            if (*(e+1) != '/') {
                /* check if the data we get is a port this allows us to
                 * correctly parse things like a.com:80
                 */
                p = e + 1;
                while (p < ue && isdigit(*p)) {
                    p++;
                }
    
                /* Only digits up to the end or up to a '/', and few enough of
                 * them: treat "xxx:123" as host:port rather than scheme:path. */
                if ((p == ue || *p == '/') && (p - e) < 7) {
                    goto parse_port;
                }
    
                ret->scheme = estrndup(s, (e-s));
                php_replace_controlchars_ex(ret->scheme, (e - s));
    
                s = e + 1;
                goto just_path;
            } else {
                ret->scheme = estrndup(s, (e-s));
                php_replace_controlchars_ex(ret->scheme, (e - s));
    
                if (e + 2 < ue && *(e + 2) == '/') {
                    /* "scheme://": the authority starts right after the "//" */
                    s = e + 3;
                    if (!strncasecmp("file", ret->scheme, sizeof("file"))) {
                        if (e + 3 < ue && *(e + 3) == '/') {
                            /* support windows drive letters as in:
                               file:///c:/somedir/file.txt
                            */
                            if (e + 5 < ue && *(e + 5) == ':') {
                                s = e + 4;
                            }
                            goto just_path;
                        }
                    }
                } else {
                    /* "scheme:/..." with a single slash: everything after ':' is the path */
                    s = e + 1;
                    goto just_path;
                }
            }
        } else if (e) { /* no scheme; starts with colon: look for port */
            parse_port:
            p = e + 1;
            pp = p;
    
            /* Scan at most six digits so an over-long number is detected below
             * without ever overrunning port_buf. */
            while (pp < ue && pp - p < 6 && isdigit(*pp)) {
                pp++;
            }
    
            if (pp - p > 0 && pp - p < 6 && (pp == ue || *pp == '/')) {
                zend_long port;
                memcpy(port_buf, p, (pp - p));
                port_buf[pp - p] = '\0'; /* terminate the copied digits for ZEND_STRTOL */
                port = ZEND_STRTOL(port_buf, NULL, 10);
                if (port > 0 && port <= 65535) {
                    ret->port = (unsigned short) port;
                    if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */
                        s += 2;
                    }
                } else {
                    if (ret->scheme) efree(ret->scheme);
                    efree(ret);
                    return NULL;
                }
            } else if (p == pp && pp == ue) {
                /* the ':' is the last byte of the input: reject */
                if (ret->scheme) efree(ret->scheme);
                efree(ret);
                return NULL;
            } else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */
                s += 2;
            } else {
                goto just_path;
            }
        } else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */
            s += 2;
        } else {
            /* no ':' at all and no leading "//": the whole input is a path */
            goto just_path;
        }
    
        parse_host:
        /* Binary-safe strcspn(s, "/?#") */
        e = ue;
        if ((p = memchr(s, '/', e - s))) {
            e = p;
        }
        if ((p = memchr(s, '?', e - s))) {
            e = p;
        }
        if ((p = memchr(s, '#', e - s))) {
            e = p;
        }
    
        /* check for login and password */
        /* The last '@' in the authority ends the userinfo; the first ':' before
         * it splits user from password. */
        if ((p = zend_memrchr(s, '@', (e-s)))) {
            if ((pp = memchr(s, ':', (p-s)))) {
                ret->user = estrndup(s, (pp-s));
                php_replace_controlchars_ex(ret->user, (pp - s));
    
                pp++;
                ret->pass = estrndup(pp, (p-pp));
                php_replace_controlchars_ex(ret->pass, (p-pp));
            } else {
                ret->user = estrndup(s, (p-s));
                php_replace_controlchars_ex(ret->user, (p-s));
            }
    
            s = p + 1;
        }
    
        /* check for port */
        if (s < ue && *s == '[' && *(e-1) == ']') {
            /* Short circuit portscan,
               we're dealing with an
               IPv6 embedded address */
            p = NULL;
        } else {
            p = zend_memrchr(s, ':', (e-s));
        }
    
        if (p) {
            /* A port already stored by the parse_port stage is kept; the text
             * after ':' is then only cut off from the host. */
            if (!ret->port) {
                p++;
                if (e-p > 5) { /* port cannot be longer then 5 characters */
                    if (ret->scheme) efree(ret->scheme);
                    if (ret->user) efree(ret->user);
                    if (ret->pass) efree(ret->pass);
                    efree(ret);
                    return NULL;
                } else if (e - p > 0) {
                    zend_long port;
                    memcpy(port_buf, p, (e - p));
                    port_buf[e - p] = '\0'; /* terminate the copied digits for ZEND_STRTOL */
                    port = ZEND_STRTOL(port_buf, NULL, 10);
                    if (port > 0 && port <= 65535) {
                        ret->port = (unsigned short)port;
                    } else {
                        if (ret->scheme) efree(ret->scheme);
                        if (ret->user) efree(ret->user);
                        if (ret->pass) efree(ret->pass);
                        efree(ret);
                        return NULL;
                    }
                }
                p--; /* back onto the ':' so that [s, p) is exactly the host */
            }
        } else {
            p = e;
        }
    
        /* check if we have a valid host, if we don't reject the string as url */
        if ((p-s) < 1) {
            if (ret->scheme) efree(ret->scheme);
            if (ret->user) efree(ret->user);
            if (ret->pass) efree(ret->pass);
            efree(ret);
            return NULL;
        }
    
        ret->host = estrndup(s, (p-s));
        php_replace_controlchars_ex(ret->host, (p - s));
    
        if (e == ue) {
            /* nothing follows the authority */
            return ret;
        }
    
        s = e;
    
        just_path:
    
        /* Peel the fragment off the end first, then the query, so that what is
         * left in [s, e) is the path. */
        e = ue;
        p = memchr(s, '#', (e - s));
        if (p) {
            p++;
            if (p < e) {
                ret->fragment = estrndup(p, (e - p));
                php_replace_controlchars_ex(ret->fragment, (e - p));
            }
            e = p-1;
        }
    
        p = memchr(s, '?', (e - s));
        if (p) {
            p++;
            if (p < e) {
                ret->query = estrndup(p, (e - p));
                php_replace_controlchars_ex(ret->query, (e - p));
            }
            e = p-1;
        }
    
        /* s == ue: nothing is left to parse, which stores an empty path */
        if (s < e || s == ue) {
            ret->path = estrndup(s, (e - s));
            php_replace_controlchars_ex(ret->path, (e - s));
        }
    
        return ret;
    }
    
    /* {{{ proto mixed parse_url(string url, [int url_component])
       Parse a URL and return its components */
    PHP_FUNCTION(parse_url)
    {
    	char *str;
    	size_t str_len;
    	php_url *resource;
    	zend_long key = -1;
    
    	if (zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &str, &str_len, &key) == FAILURE) {
    		return;
    	}
    
    	resource = php_url_parse_ex(str, str_len);
    	if (resource == NULL) {
    		/* @todo Find a method to determine why php_url_parse_ex() failed */
    		RETURN_FALSE;
    	}
    
    	if (key > -1) {
    		switch (key) {
    			case PHP_URL_SCHEME:
    				if (resource->scheme != NULL) RETVAL_STRING(resource->scheme);
    				break;
    			case PHP_URL_HOST:
    				if (resource->host != NULL) RETVAL_STRING(resource->host);
    				break;
    			case PHP_URL_PORT:
    				if (resource->port != 0) RETVAL_LONG(resource->port);
    				break;
    			case PHP_URL_USER:
    				if (resource->user != NULL) RETVAL_STRING(resource->user);
    				break;
    			case PHP_URL_PASS:
    				if (resource->pass != NULL) RETVAL_STRING(resource->pass);
    				break;
    			case PHP_URL_PATH:
    				if (resource->path != NULL) RETVAL_STRING(resource->path);
    				break;
    			case PHP_URL_QUERY:
    				if (resource->query != NULL) RETVAL_STRING(resource->query);
    				break;
    			case PHP_URL_FRAGMENT:
    				if (resource->fragment != NULL) RETVAL_STRING(resource->fragment);
    				break;
    			default:
    				php_error_docref(NULL, E_WARNING, "Invalid URL component identifier " ZEND_LONG_FMT, key);
    				RETVAL_FALSE;
    		}
    		goto done;
    	}
    
    	/* allocate an array for return */
    	array_init(return_value);
    
        /* add the various elements to the array */
    	if (resource->scheme != NULL)
    		add_assoc_string(return_value, "scheme", resource->scheme);
    	if (resource->host != NULL)
    		add_assoc_string(return_value, "host", resource->host);
    	if (resource->port != 0)
    		add_assoc_long(return_value, "port", resource->port);
    	if (resource->user != NULL)
    		add_assoc_string(return_value, "user", resource->user);
    	if (resource->pass != NULL)
    		add_assoc_string(return_value, "pass", resource->pass);
    	if (resource->path != NULL)
    		add_assoc_string(return_value, "path", resource->path);
    	if (resource->query != NULL)
    		add_assoc_string(return_value, "query", resource->query);
    	if (resource->fragment != NULL)
    		add_assoc_string(return_value, "fragment", resource->fragment);
    done:
    	php_url_free(resource);
    }
    

    代码中遇到的问题解决

    函数定义部分

    PHP_FUNCTION(parse_url)
    {
        char *str;
        size_t str_len;
        php_url *resource;
        zend_long key = -1;
    
        if (zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &str, &str_len, &key) == FAILURE) {
            return;
        }
    
        resource = php_url_parse_ex(str, str_len);
        if (resource == NULL) {
            /* @todo Find a method to determine why php_url_parse_ex() failed */
            RETURN_FALSE;
    }
    

    引用这篇文章的内容http://www.nowamagic.net/librarys/veda/detail/1467

    b   Boolean
    l   Integer 整型
    d   Floating point 浮点型
    s   String 字符串
    r   Resource 资源
    a   Array 数组
    o   Object instance 对象
    O   Object instance of a specified type 特定类型的对象
    z   Non-specific zval 任意类型
    Z   zval**类型
    f   表示函数、方法名称
    

    那么其中的"s|l"表示parse_url接受两个参数:一个必选的字符串,和一个可选的整型(|之后的参数是可选的,所以代码里key先被初始化为默认值-1)

    php_url类型的声明在ext/standard/url.h

    typedef struct php_url {
    	char *scheme;
    	char *user;
    	char *pass;
    	char *host;
    	unsigned short port;
    	char *path;
    	char *query;
    	char *fragment;
    } php_url;
    

    问题

    1. parse_url只有两个参数,不知道str_len这个参数哪里去了……?还有他的值到底是怎么获得的……(补充:str_len并不是PHP层面的参数。格式符s对应两个输出变量——字符串指针和字符串长度,所以zend_parse_parameters在解析第一个参数时会同时填充str和str_len)

    函数内部实现部分

    使用php_url_parse_ex函数来处理我们传过去的url,先暂定str_len为str的长度……

    if ((e = memchr(s, ':', length)) && e != s) {
            /* validate scheme */
            p = s;
            while (p < e) {
                /* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */
                if (!isalpha(*p) && !isdigit(*p) && *p != '+' && *p != '.' && *p != '-') {
                    if (e + 1 < ue && e < s + strcspn(s, "?#")) {
                        goto parse_port;
                    } else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */
                        s += 2;
                        e = 0;
                        goto parse_host;
                    } else {
                        goto just_path;
                    }
                }
                p++;
            }
    
            if (e + 1 == ue) { /* only scheme is available */
                ret->scheme = estrndup(s, (e - s));
                php_replace_controlchars_ex(ret->scheme, (e - s));
                return ret;
            }
    
            /*
             * certain schemas like mailto: and zlib: may not have any / after them
             * this check ensures we support those.
             */
            if (*(e+1) != '/') {
                /* check if the data we get is a port this allows us to
                 * correctly parse things like a.com:80
                 */
                p = e + 1;
                while (p < ue && isdigit(*p)) {
                    p++;
                }
    
                if ((p == ue || *p == '/') && (p - e) < 7) {
                    goto parse_port;
                }
    
                ret->scheme = estrndup(s, (e-s));
                php_replace_controlchars_ex(ret->scheme, (e - s));
    
                s = e + 1;
                goto just_path;
            } else {
                ret->scheme = estrndup(s, (e-s));
                php_replace_controlchars_ex(ret->scheme, (e - s));
    
                if (e + 2 < ue && *(e + 2) == '/') {
                    s = e + 3;
                    if (!strncasecmp("file", ret->scheme, sizeof("file"))) {
                        if (e + 3 < ue && *(e + 3) == '/') {
                            /* support windows drive letters as in:
                               file:///c:/somedir/file.txt
                            */
                            if (e + 5 < ue && *(e + 5) == ':') {
                                s = e + 4;
                            }
                            goto just_path;
                        }
                    }
                } else {
                    s = e + 1;
                    goto just_path;
                }
            }
        } else if (e) { /* no scheme; starts with colon: look for port */
    

    如果s中含有冒号则e指向冒号
    且同时如果冒号不在s的开头,p指向s

    只要p还没有走到冒号的位置就继续循环,每次循环p指向下一位

    如果p指向的值是字母或者数字或者是+,-,.则指针指向下一位,这就代表冒号前面的值其实是任意的字母、数字、+、-、.

    如果冒号前面出现了不合法的字符:当冒号不是str的最后一个字符,且?和#(如果有的话)都在冒号后面时,就跳转到port解析部分

    如果str的长度大于1且str的前两个字符是//,s指向//后面的一个字符,e变为0,跳转到host解析

    如果冒号是最后一位字符,则冒号前面的东西会当作scheme返回

    如果冒号后面不是/,则p指向冒号后面一位

    当p还没到str末尾且p指向的为数字字符时,p一直指向后一位;循环结束后,如果p已到str末尾或者p指向的字符为/,同时冒号后面的数字位数小于6位,就跳转到port解析

    否则(冒号后面既不是纯数字,也不是"数字后面紧跟一个/",或者数字太长),冒号前面的内容就当作scheme,放在ret的scheme参数中,s指向冒号后一位,跳转到path解析

    如果冒号后面是/,那么冒号前面的内容就当作scheme,放在ret的scheme参数中。如果下面一位也是/,那么s指向//后面一位,如果scheme为file,那么判断接下来一位是不是/,如果是,判断冒号后是否有五个字符,如果有那么第五个字符是不是冒号(为了处理file:///c:),s指向///后的一位字符,跳转到path解析

    如果冒号后面不是//(即只有一个/),s指向冒号后面一位,之后跳转到path解析

    如果冒号在str开头,那么进行port解析

    姿势

    1. 只要请求的url里不含有冒号(:),并且不是以//开头,就会被当成path解析(以//开头时会被当作relative-scheme URL进入host解析)
  • 相关阅读:
    leetcode Majority Element
    Missing Number 三种解法
    Effective C++学习笔记 chapter 1
    C++ 笔记
    三色排序
    归并排序-就地排序
    506,display有哪些值?说明他们的作用
    505,display,float,position之间的关系(有疑问)
    504,什么是FOUC?怎么避免
    503,display:none;与visibility:hidden;的区别
  • 原文地址:https://www.cnblogs.com/hf99/p/9637354.html
Copyright © 2020-2023  润新知