• phpspider PHP 爬虫


    * 通过composer下载

    composer require owner888/phpspider
    

    // composer.json

    {
        "require": {
            "owner888/phpspider": "^2.1"
        }
    }
    

      

    * 去掉讨厌的注释

       https://doc.phpspider.org/demo-start.html

     ./vendor/owner888/phpspider/core/phpspider.php

    /* Do NOT delete this comment */
            // 彩蛋
            $included_files = get_included_files();
            $content = file_get_contents($included_files[0]);
            if (!preg_match("#/\* Do NOT delete this comment \*/#", $content) || !preg_match("#/\* 不要删除这段注释 \*/#", $content))
            {
                $msg = "Unknown error...";
                log::error($msg);
                exit;
            }
    

     删掉这段恶心的代码

      * 导入数据库文件

        

    cd ./vendor/owner888/phpspider/demo
    

      

    mysql -uroot -hlocalhost -p
    

      

    create database demo charset utf8 collate utf8_general_ci;
    \. qiushibaike.sql

      

    # ************************************************************
    # Sequel Pro SQL dump
    # Version 4541
    #
    # http://www.sequelpro.com/
    # https://github.com/sequelpro/sequelpro
    #
    # Host: 127.0.0.1 (MySQL 5.7.14)
    # Database: demo
    # Generation Time: 2016-10-20 16:55:11 +0000
    # ************************************************************
    
    
    # Save the current session settings into @OLD_* variables so they can be
    # restored at the end of this dump, and switch the session to the values
    # the import needs: utf8 names, foreign key checks off,
    # SQL_MODE='NO_AUTO_VALUE_ON_ZERO' and SQL_NOTES=0.
    # The /*!NNNNN ... */ form is a MySQL version-conditional comment: the
    # statement inside is executed only by servers at or above that version.
    /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
    /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
    /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
    /*!40101 SET NAMES utf8 */;
    /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
    /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
    /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
    
    
    # Dump of table content
    # ------------------------------------------------------------
    
    # Any existing `content` table is dropped first, so re-importing this file
    # discards the rows already stored in it.
    DROP TABLE IF EXISTS `content`;
    
    # NOTE(review): the article_* columns presumably mirror the field names
    # configured for the spider that exports into this table -- confirm against
    # the spider configuration before renaming or resizing any column.
    CREATE TABLE `content` (
      `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
      `depth` int(11) DEFAULT NULL,
      `url` varchar(200) DEFAULT NULL,
      `article_title` varchar(20) DEFAULT NULL,
      `article_headimg` varchar(150) DEFAULT NULL,
      `article_author` varchar(20) DEFAULT NULL,
      `article_content` text,
      `article_publish_time` int(10) DEFAULT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    
    
    
    
    # Restore the session settings that were saved in the @OLD_* variables at
    # the top of this dump.
    /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
    /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
    /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
    /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
    /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
    /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
    View Code

    * 创建./index.php

    <?php
    // Demo entry point: configures a phpspider crawl of qiushibaike.com and
    // exports the extracted fields into the `content` table of the `demo`
    // database.
    require './vendor/autoload.php';
    
    // The namespace separators are required; without them PHP looks for a
    // global class literally named "phpspidercorephpspider".
    use phpspider\core\phpspider;
    
    $configs = [
        'name' => '糗事百科',
        'domains' => [
            'qiushibaike.com',
            'www.qiushibaike.com',
        ],
        'scan_urls' => [
            'http://www.qiushibaike.com/',
        ],
        // Single-quoted so the regex escapes reach the spider unchanged:
        // \d+ matches the numeric article id.
        'content_url_regexes' => [
            'http://www.qiushibaike.com/article/\d+',
        ],
        // \? matches the literal "?" that starts the query string; an
        // unescaped "?" would instead make the preceding token optional.
        'list_url_regexes' => [
            'http://www.qiushibaike.com/8hr/page/\d+\?s=\d+',
        ],
        'fields' => [
            [
                // Extract the article body from the content page.
                'name' => 'article_content',
                'selector' => "//*[@id='single-next-link']",
                'required' => true,
            ],
            [
                // Extract the article author from the content page.
                'name' => 'article_author',
                'selector' => "//div[contains(@class,'author')]//h2",
                'required' => true,
            ],
        ],
        'log_show' => true,
        'input_encoding' => 'utf-8',
        'output_encoding' => 'utf-8',
        'db_config' => [
            'host' => '127.0.0.1',
            'user' => 'root',
            'pass' => '',
            'name' => 'demo',
            'port' => 3306,
        ],
        // Alternative export target: write SQL statements to a file instead
        // of inserting into the database.
        /*
        'export' => [
            'type' => 'sql',
            'file' => './data/sql/qiushibaike.sql'
        ]
        */
        'export' => [
            'type' => 'db',
            'table' => 'content',
        ],
    ];
    
    $spider = new phpspider($configs);
    $spider->start();
    

      

    * Run

    php ./index.php 
    

      

  • 相关阅读:
    学习php中的正则表达式,PHP正则表达式基础
    在新浪云SAE中使用smarty引擎模版
    随笔
    HTML 基础(二)
    HTML笔记(一)
    数据挖掘之异常检测
    DHCP协议
    基础的Linux命令(二)
    基础的linux命令(一)
    VMware虚拟机安装
  • 原文地址:https://www.cnblogs.com/mingzhanghui/p/9311283.html
Copyright © 2020-2023  润新知