• php爬虫选择器-来自phpspider


    2021年8月7日09:44:05

    之前一直使用phpspider

    官网:https://doc.phpspider.org/

    但是官方似乎没有针对 PSR-4 以及 PHP 7 / PHP 8 进行升级的打算,而我用得比较多的其实只是其中的 selector 选择器

    现在使用的laravel8 php8的框架,所以一直没有做更改,我其实比较不接受

    composer require owner888/phpspider

    会提示一堆 PSR-4 不兼容的问题。我确实有点强迫症,于是就把 selector 单独提取了出来,试了一下竟然没问题,可以单独使用

    经过测试,在 PHP 7.1、7.2、7.3、8.0 下都能正常运行,挺好的。你可以自己去把它提取出来,这里也贴出一份。如果需要 HTTP(curl)客户端,可以使用 GuzzleHttp\Client;如果你用的是 Laravel,它本身就已经集成了 Guzzle

    注意看官方文档

    <?php

    // The namespace separator was lost when the post was published; App\Utils is the
    // namespace the usage example imports the class from.
    namespace App\Utils;

    use DOMDocument;
    use DOMXpath;
    use Exception;
    
    /**
     * Standalone HTML selector extracted from phpspider.
     *
     * Extracts (or strips) fragments of an HTML string addressed by an XPath
     * expression, a CSS selector (translated to XPath) or a PCRE pattern.
     * Everything is static: the parsed document is kept in static properties and
     * reused for as long as consecutive calls pass the same HTML string.
     */
    class Selector
    {
        /**
         * Library version.
         * @var string
         */
        const VERSION = '1.0.2';

        /** @var DOMDocument|null Document parsed from the most recently queried HTML. */
        public static $dom = null;

        /** @var string md5 of the HTML currently loaded into self::$dom. */
        public static $dom_auth = '';

        /** @var DOMXpath|null XPath evaluator bound to self::$dom. */
        public static $xpath = null;

        /** @var string|null Description of the last selector syntax error, if any. */
        public static $error = null;

        /**
         * Extract content from HTML.
         *
         * @param mixed  $html          HTML source
         * @param mixed  $selector      XPath expression, CSS selector or PCRE pattern
         * @param string $selector_type 'xpath' (default), 'css' or 'regex'; case-insensitive
         * @return string|array|null|false false when $html or $selector is empty; null when
         *         nothing matched, the selector is malformed or the type is unknown; a string
         *         for a single match; an array for several matches
         */
        public static function select($html, $selector, $selector_type = 'xpath')
        {
            if (empty($html) || empty($selector)) {
                return false;
            }

            switch (strtolower($selector_type)) {
                case 'xpath':
                    return self::_xpath_select($html, $selector);
                case 'regex':
                    return self::_regex_select($html, $selector);
                case 'css':
                    return self::_css_select($html, $selector);
                default:
                    // Unknown selector types have always produced null.
                    return null;
            }
        }

        /**
         * Remove every fragment addressed by the selector from the HTML.
         *
         * Removal is textual: the matched fragments are searched for in $html with
         * str_replace(). For 'xpath'/'css' the fragments are re-serialised by the DOM, so
         * markup that the parser normalises may not be found verbatim in the input.
         *
         * @param mixed  $html          HTML source
         * @param mixed  $selector      XPath expression, CSS selector or PCRE pattern
         * @param string $selector_type 'xpath' (default), 'css' or 'regex'; case-insensitive
         * @return string|false false when $html or $selector is empty, otherwise the HTML
         *         with the matched fragments removed (unchanged when nothing matched)
         */
        public static function remove($html, $selector, $selector_type = 'xpath')
        {
            if (empty($html) || empty($selector)) {
                return false;
            }

            switch (strtolower($selector_type)) {
                case 'xpath':
                    $fragments = self::_xpath_select($html, $selector, true);
                    break;
                case 'regex':
                    $fragments = self::_regex_select($html, $selector, true);
                    break;
                case 'css':
                    $fragments = self::_css_select($html, $selector, true);
                    break;
                default:
                    $fragments = null;
            }

            // Nothing matched: never hand null to str_replace() (deprecated since PHP 8.1).
            if ($fragments === null || $fragments === '' || $fragments === []) {
                return $html;
            }

            return str_replace($fragments, '', $html);
        }

        /**
         * XPath selector.
         *
         * Result per matched node when extracting: the src attribute for <img> elements,
         * the node value for attribute/text/CDATA nodes, otherwise the inner markup of the
         * element (outer tag stripped). When removing, the full serialised node is returned.
         *
         * @param mixed $html
         * @param mixed $selector XPath expression
         * @param bool  $remove   true to return whole nodes (used by remove())
         * @return string|array|null null on syntax error (see self::$error) or no match
         */
        private static function _xpath_select($html, $selector, $remove = false)
        {
            if (!is_object(self::$dom)) {
                self::$dom = new DOMDocument();
            }

            // Re-parse only when the HTML differs from what is already loaded.
            $fingerprint = md5($html);
            if (self::$dom_auth != $fingerprint || !is_object(self::$xpath)) {
                self::$dom_auth = $fingerprint;

                // Real-world HTML is rarely valid; collect parser complaints internally
                // instead of letting them surface as PHP warnings.
                $previous = libxml_use_internal_errors(true);
                // The XML declaration prefix makes libxml treat the input as UTF-8.
                self::$dom->loadHTML('<?xml encoding="UTF-8">' . $html);
                if (!$previous) {
                    libxml_clear_errors();
                }
                libxml_use_internal_errors($previous);

                self::$xpath = new DOMXpath(self::$dom);
            }

            // A malformed expression raises an E_WARNING and yields false; the failure is
            // reported through self::$error instead.
            $elements = @self::$xpath->query($selector);
            if ($elements === false) {
                self::$error = "the selector in the xpath(\"{$selector}\") syntax errors";
                // Return null rather than false so callers can distinguish "no value"
                // with a plain null check.
                return null;
            }

            $result = [];
            foreach ($elements as $element) {
                // Removal needs the complete serialised node.
                if ($remove) {
                    $result[] = self::$dom->saveXml($element);
                    continue;
                }

                $nodeName = $element->nodeName;
                $nodeType = $element->nodeType;

                if ($nodeType == XML_ELEMENT_NODE && $nodeName === 'img') {
                    // For images the interesting value is the source URL.
                    $result[] = $element->getAttribute('src');
                } elseif (in_array($nodeType, [XML_ATTRIBUTE_NODE, XML_TEXT_NODE, XML_CDATA_SECTION_NODE], true)) {
                    $result[] = $element->nodeValue;
                } else {
                    // Keep the inner markup so the result can be queried again.
                    $content = self::$dom->saveXml($element);
                    // Quote the node name: names such as "#document" contain the delimiter.
                    $tag = preg_quote($nodeName, '#');
                    $result[] = preg_replace(["#^<{$tag}.*>#isU", "#</{$tag}>$#isU"], ['', ''], $content);
                }
            }

            if (empty($result)) {
                return null;
            }

            // A single match is returned as a string, several matches as an array.
            return count($result) > 1 ? $result : $result[0];
        }

        /**
         * CSS selector: translated to XPath by css_to_xpath() and evaluated as such.
         *
         * @param mixed $html
         * @param mixed $selector CSS selector
         * @param bool  $remove   true to return whole nodes (used by remove())
         * @return string|array|null
         */
        private static function _css_select($html, $selector, $remove = false)
        {
            return self::_xpath_select($html, self::css_to_xpath($selector), $remove);
        }

        /**
         * Regex selector.
         *
         * Extraction returns the captured groups: with one group, the list of captures;
         * with several groups, one entry per group (a string when the group captured
         * once, an array otherwise). A pattern without capture groups yields null.
         * Removal always returns the full matches so they can be cut from the input.
         *
         * @param mixed $html
         * @param mixed $selector PCRE pattern including delimiters
         * @param bool  $remove   true to return full matches (used by remove())
         * @return string|array|null null on pattern error (see self::$error) or no match
         */
        private static function _regex_select($html, $selector, $remove = false)
        {
            // An invalid pattern raises an E_WARNING and yields false; the failure is
            // reported through self::$error instead.
            if (@preg_match_all($selector, $html, $out) === false) {
                self::$error = "the selector in the regex(\"{$selector}\") syntax errors";
                return null;
            }

            // $out[0] holds the full matches; empty means nothing matched at all.
            if (empty($out[0])) {
                return null;
            }

            if ($remove) {
                $result = $out[0];
            } else {
                // Numbered capture groups only (named groups are also present by number).
                $captures = [];
                foreach ($out as $key => $matches) {
                    if (is_int($key) && $key > 0) {
                        $captures[] = $matches;
                    }
                }

                if (count($captures) === 0) {
                    return null;
                }

                if (count($captures) === 1) {
                    $result = $captures[0];
                } else {
                    $result = [];
                    foreach ($captures as $matches) {
                        // Collapse single captures to a plain string.
                        $result[] = count($matches) > 1 ? $matches : $matches[0];
                    }
                }
            }

            return count($result) > 1 ? $result : $result[0];
        }

        /**
         * Placeholder kept for API compatibility; not implemented, always returns null.
         */
        public static function find_all($html, $selector)
        {
        }

        /**
         * Translate a CSS selector into an XPath expression.
         *
         * Supported: tag names, "*", #id (exact match), .class (substring match on the
         * class attribute, one predicate per class in a compound like ".a.b"),
         * [attr], [attr=value], [attr^=value], [attr*=value], [attr$=value], and the
         * descendant (" ") and child (">") combinators. Sibling combinators (~, +) and
         * pseudo classes are tokenised but not translated. Comma-separated selector
         * groups are not turned into an XPath union.
         *
         * @param string $selectors CSS selector
         * @return string XPath expression
         * @throws Exception when a token cannot be classified
         */
        public static function css_to_xpath($selectors)
        {
            $xquery = '';
            // True while the previous token was a combinator, i.e. the next predicate
            // has no element name yet and needs the "*" wildcard.
            $after_delimiter = false;

            foreach (self::parse_selector($selectors) as $s) {
                $is_delimiter = false;
                $wildcard = $after_delimiter ? '*' : '';

                if ($s == '*' || preg_match('@^[\w|\||-]+$@', $s)) {
                    // TAG
                    $xquery .= $s;
                } elseif ($s[0] == '#') {
                    // ID: exact match
                    $xquery .= $wildcard . "[@id='" . substr($s, 1) . "']";
                } elseif ($s[0] == '.') {
                    // CLASSES: substring match, one predicate per class name
                    $xquery .= $wildcard;
                    foreach (explode('.', substr($s, 1)) as $class) {
                        if ($class !== '') {
                            $xquery .= "[contains(@class,'{$class}')]";
                        }
                    }
                } elseif ($s[0] == '[') {
                    // ATTRIBUTES
                    $xquery .= $wildcard . self::_attr_to_xpath(trim($s, ']['));
                } elseif ($s[0] == '~' || $s[0] == '+' || $s[0] == ':') {
                    // Sibling combinators and pseudo classes are not translated.
                } elseif ($s == '>') {
                    // DIRECT DESCENDANTS
                    $xquery .= '/';
                    $is_delimiter = true;
                } elseif ($s == ' ') {
                    // ALL DESCENDANTS
                    $xquery .= '//';
                    $is_delimiter = true;
                } else {
                    throw new Exception("Unrecognized token '$s'");
                }

                $after_delimiter = $is_delimiter;
            }

            return $xquery;
        }

        /**
         * Build the XPath predicate for the inside of a CSS attribute selector.
         *
         * @param string $expression attribute selector without the surrounding brackets
         * @return string XPath predicate including brackets
         */
        private static function _attr_to_xpath($expression)
        {
            // Attribute presence only.
            if (strpos($expression, '=') === false) {
                return "[@{$expression}]";
            }

            // Limit the split so values containing "=" stay intact.
            list($attr, $value) = explode('=', $expression, 2);
            $attr = trim($attr);
            $value = trim(trim($value), "'\"");

            if (!self::is_regexp($attr)) {
                return "[@{$attr}='{$value}']";
            }

            // The last character of the name is the match operator.
            $operator = substr($attr, -1);
            $attr = substr($attr, 0, -1);

            switch ($operator) {
                case '^':
                    return "[starts-with(@{$attr},'{$value}')]";
                case '*':
                    return "[contains(@{$attr},'{$value}')]";
                default:
                    // '$': XPath 1.0 has no ends-with(), compare the tail explicitly.
                    return "[substring(@{$attr}, string-length(@{$attr}) - string-length('{$value}') + 1)='{$value}']";
            }
        }

        /**
         * Split a CSS selector into tokens.
         *
         * Tokens are: tag names, "#id", ".class" (compound classes stay in one token),
         * "[...]" attribute selectors, ":pseudo" (with optional arguments), "~..." and
         * "+..." sibling selectors, and the " " / ">" combinators. Commas are dropped.
         * The list is prefixed with " " unless it already starts with ">", and with "*"
         * when it starts with a pseudo class.
         *
         * @access private
         * @param string $query CSS selector
         * @return array list of token strings; empty for an empty selector
         */
        public static function parse_selector($query)
        {
            // Drop whitespace around the >, + and ~ combinators, then collapse the rest.
            $query = trim(preg_replace('@\s+@', ' ', preg_replace('@\s*(>|\\+|~)\s*@', '\\1', (string)$query)));
            $queries = [];
            if (!$query) {
                return $queries;
            }

            $special_chars = ['>', ' '];
            $class_chars = ['.', '-'];
            $pseudo_chars = ['-'];
            $tag_chars = ['*', '|', '-'];

            // Split into characters so multibyte input is indexed per character.
            $strlen = mb_strlen($query);
            $chars = [];
            for ($i = 0; $i < $strlen; $i++) {
                $chars[] = mb_substr($query, $i, 1);
            }
            $query = $chars;

            $i = 0;
            while ($i < $strlen) {
                $c = $query[$i];
                $tmp = '';

                if (self::is_char($c) || in_array($c, $tag_chars, true)) {
                    // TAG
                    while (isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $tag_chars, true))) {
                        $tmp .= $query[$i];
                        $i++;
                    }
                    $queries[] = $tmp;
                } elseif ($c == '#') {
                    // IDs
                    $i++;
                    while (isset($query[$i]) && (self::is_char($query[$i]) || $query[$i] == '-')) {
                        $tmp .= $query[$i];
                        $i++;
                    }
                    $queries[] = '#' . $tmp;
                } elseif (in_array($c, $special_chars, true)) {
                    // COMBINATORS
                    $queries[] = $c;
                    $i++;
                } elseif ($c == ',') {
                    // COMMA: skipped together with the spaces that follow it
                    $i++;
                    while (isset($query[$i]) && $query[$i] == ' ') {
                        $i++;
                    }
                } elseif ($c == '.') {
                    // CLASSES
                    while (isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $class_chars, true))) {
                        $tmp .= $query[$i];
                        $i++;
                    }
                    $queries[] = $tmp;
                } elseif ($c == '~' || $c == '+') {
                    // SIBLING SELECTORS: the combinator plus the simple selector after it;
                    // spaces are accepted only directly after the combinator.
                    $space_allowed = true;
                    $tmp .= $query[$i++];
                    while (isset($query[$i])
                        && (self::is_char($query[$i])
                            || in_array($query[$i], $class_chars, true)
                            || $query[$i] == '*'
                            || ($space_allowed && $query[$i] == ' ')
                        )) {
                        if ($query[$i] != ' ') {
                            $space_allowed = false;
                        }
                        $tmp .= $query[$i];
                        $i++;
                    }
                    $queries[] = $tmp;
                } elseif ($c == '[') {
                    // ATTRS: everything up to the matching closing bracket
                    $stack = 1;
                    $tmp .= $c;
                    while (isset($query[++$i])) {
                        $tmp .= $query[$i];
                        if ($query[$i] == '[') {
                            $stack++;
                        } elseif ($query[$i] == ']') {
                            $stack--;
                            if (!$stack) {
                                break;
                            }
                        }
                    }
                    $queries[] = $tmp;
                    $i++;
                } elseif ($c == ':') {
                    // PSEUDO CLASSES
                    $tmp .= $query[$i++];
                    while (isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $pseudo_chars, true))) {
                        $tmp .= $query[$i];
                        $i++;
                    }
                    // Optional argument list, up to the matching closing parenthesis.
                    if (isset($query[$i]) && $query[$i] == '(') {
                        $tmp .= $query[$i];
                        $stack = 1;
                        while (isset($query[++$i])) {
                            $tmp .= $query[$i];
                            if ($query[$i] == '(') {
                                $stack++;
                            } elseif ($query[$i] == ')') {
                                $stack--;
                                if (!$stack) {
                                    break;
                                }
                            }
                        }
                        $i++;
                    }
                    $queries[] = $tmp;
                } else {
                    // Anything else is ignored.
                    $i++;
                }
            }

            if (isset($queries[0])) {
                // A leading pseudo class applies to any element.
                if (isset($queries[0][0]) && $queries[0][0] == ':') {
                    array_unshift($queries, '*');
                }
                // Search the whole document unless the selector starts with ">".
                if ($queries[0] != '>') {
                    array_unshift($queries, ' ');
                }
            }

            return $queries;
        }

        /**
         * Tell whether the string contains a word character (letter, digit or underscore).
         *
         * @param string $char
         * @return int|false result of preg_match(): 1 on match, 0 otherwise
         */
        public static function is_char($char)
        {
            return preg_match('@\w@', $char);
        }

        /**
         * Tell whether an attribute name ends with a fuzzy-match operator:
         * ^ prefix match
         * * substring match
         * $ suffix match
         *
         * @access private
         * @param string $pattern attribute name as written before the "=" sign
         * @return bool
         */
        protected static function is_regexp($pattern)
        {
            $pattern = (string)$pattern;

            // Guard the empty string: there is no last character to inspect.
            return $pattern !== '' && in_array(substr($pattern, -1), ['^', '*', '$'], true);
        }
    }

    使用:

    // HTTP client (Guzzle) and the Selector class shown above.
    use GuzzleHttp\Client;
    use App\Utils\Selector;
    
    class SpiderService extends BaseService
    {
        /**
         * Download one page of the "ssq" result list from kaijiang.zhcw.com and dump the
         * markup of its tables.
         *
         * Page URLs follow the pattern
         * http://kaijiang.zhcw.com/zhcw/html/ssq/list_{page}.html
         *
         * @param int $page page number substituted into the list URL
         */
        public static function getSsq(int $page)
        {
            $client = new Client();
            $response = $client->get("http://kaijiang.zhcw.com/zhcw/html/ssq/list_{$page}.html");
            $html = (string)$response->getBody();

            // The default selector type is XPath. A bare "table" only addresses children of
            // the document node, so search the whole document instead.
            $data = Selector::select($html, "//table");
            // NOTE(review): pp() is a project helper not shown here, presumably a debug dump.
            pp($data);
        }
    QQ群 247823727 博客文件如果不能下载请进群下载
    如果公司项目有技术瓶颈问题,如有需要,请联系我,提供技术服务 QQ: 903464207
  • 相关阅读:
    Error -26631: HTTP Status-Code=400 (Bad Request) for
    mysql中的制表符替换
    mysql中json数据的拼接方式
    使用Nightwatch.js做基于浏览器的web应用自动测试
    Selenium + Nightwatch 自动化测试环境搭建
    Python web 框架:web.py
    转 Python Selenium设计模式-POM
    自动化测试
    日志打印longging模块(控制台和文件同时输出)
    读取配置文件(configparser,.ini文件)
  • 原文地址:https://www.cnblogs.com/zx-admin/p/15111144.html
Copyright © 2020-2023  润新知