• Scrapy Framework Basics


    Create a Scrapy project
    scrapy startproject <project name>
    scrapy genspider <spider name> <start url>
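    For the project used in the rest of this post, the concrete commands would look roughly like this (project name myscrapy and spider chouti are taken from the examples below):

    scrapy startproject myscrapy
    cd myscrapy
    scrapy genspider chouti dig.chouti.com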


    Run the Scrapy crawler

    scrapy crawl <spider name>

    Modify the parse method in chouti.py to print the response (a minimal sketch follows below).
    To run without log output:

    scrapy crawl chouti --nolog
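    A minimal sketch of such a parse method, just printing the page (the complete spider appears further down):

    import scrapy

    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        start_urls = ['http://dig.chouti.com/']

        def parse(self, response):
            # simply print the raw HTML of the downloaded page
            print(response.text)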


    Flow diagram

    Chouti: crawl the titles and their corresponding URLs
    import scrapy
    from scrapy.selector import Selector
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        allowed_domains = ['dig.chouti.com']
        start_urls = ['http://dig.chouti.com/']
    
        def parse(self, response):
            # print(response.text)  # print the page contents
            a_list = Selector(response = response).xpath('//div[@class="link-detail"]')
            for a in a_list:
                # //div[@class="link-detail"]/a
                title = a.xpath('.//a/text()').extract()[1]
                href = a.xpath('./a/@href').extract()[0]
                print(title)
                print(href)


    md5-hash the URL so every record is stored under a fixed-length name
    def md5(self,href):
        import hashlib
        obj = hashlib.md5()
        obj.update(bytes(href,encoding = 'utf-8'))
        return obj.hexdigest()
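    A small standalone sketch of how this helper can be used to dedupe URLs by their fixed-length digest (the example URL is made up):

    import hashlib

    def md5(href):
        obj = hashlib.md5()
        obj.update(bytes(href, encoding='utf-8'))
        return obj.hexdigest()

    visited = set()
    href = 'http://dig.chouti.com/link/123'
    key = md5(href)             # always a 32-character hex string
    if key not in visited:      # dedupe on the digest instead of the raw URL
        visited.add(key)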

    To request further pages, yield a Request; parse is then called recursively on each new page
    yield Request(url = href,callback = self.parse)
    import scrapy
    from scrapy.selector import Selector
    from scrapy.http import Request
    # import the Request class
    
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        allowed_domains = ['dig.chouti.com']
        start_urls = ['http://dig.chouti.com/']
        all_urls = set()
    
        def parse(self, response):
            a_list = Selector(response = response).xpath('//div[@class="link-detail"]')
            # select every div whose class is link-detail
            for a in a_list:
                href = a.xpath('./a/@href').extract_first()
                # the href attribute of the a tag inside the div
                text = a.xpath('./a/text()').extract_first()
                # the text of the a tag inside the div
                # print(href,text)
                if href not in self.all_urls:
                    # only follow URLs that have not been visited yet
                    self.all_urls.add(href)
                    print(href)
                    # Request(url = href) -> handed to the scheduler
                    yield Request(url = href,callback = self.parse)

            print(self.all_urls)
    
        def md5(self,href):
            import hashlib
            obj = hashlib.md5()
            obj.update(bytes(href,encoding = 'utf-8'))
            return obj.hexdigest()


    Limit the crawl depth
    Add to settings.py:
    
    DEPTH_LIMIT = 2
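    The depth of each response is recorded by Scrapy's DepthMiddleware in response.meta; a hedged sketch of how to inspect it inside the spider's parse method:

    def parse(self, response):
        # start_urls have depth 0; every followed link adds 1
        depth = response.meta.get('depth', 0)
        print(depth, response.url)
        # requests that would exceed DEPTH_LIMIT are dropped by the middleware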

    Persisting data with pipelines
    Enable the pipeline in settings.py:

    ITEM_PIPELINES = {
       'myscrapy.pipelines.MyscrapyPipeline': 300,
       # enable the pipeline that saves items to a local file
    }
    Define the fields in items.py:
    
    class MyscrapyItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        href = scrapy.Field()
        title = scrapy.Field()
    Modify pipelines.py:
    
    class MyscrapyPipeline:
        def process_item(self, item, spider):
            contents = f"{item['href']}\n{item['title']}\n"
            f = open('news.json','a')
            f.write(contents)
            f.close()
    Note: use item['href'], not item.href (Item fields only support dict-style access)
    chouti.py code
    import scrapy
    from scrapy.selector import Selector
    from scrapy.http import Request
    # import the Request class
    from ..items import MyscrapyItem
    
    
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        allowed_domains = ['dig.chouti.com']
        start_urls = ['http://dig.chouti.com/']
        all_urls = set()
    
        def parse(self, response):
            a_list = Selector(response = response).xpath('//div[@class="link-detail"]')
            # select every div whose class is link-detail
            for a in a_list:
                href = a.xpath('./a/@href').extract_first()
                # the href attribute of the a tag inside the div
                title = a.xpath('./a/text()').extract_first()
                # the text of the a tag inside the div
                # print(href,text)
                if href not in self.all_urls:
                    # only handle URLs that have not been seen yet
                    self.all_urls.add(href)
                    print(href)
                    print(title)
                    item_obj = MyscrapyItem(href = href,title = title)
                    # create the item object; href and title are the fields defined in items.py
                    '''
                        href = scrapy.Field()
                        title = scrapy.Field()
                    '''
    
                    yield item_obj
    
                    # Request(url = href) -> handed to the scheduler
                    # yield Request(url = href,callback = self.parse)
    
        def md5(self,href):
            import hashlib
            obj = hashlib.md5()
            obj.update(bytes(href,encoding = 'utf-8'))
            return obj.hexdigest()
    Notes:
        extract_first() returns a single string (or None if nothing matches)
        extract() returns a list of strings
        be sure to yield item_obj, otherwise the pipeline never receives the item (see the example below)
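    A quick illustration of the difference (the HTML snippet is made up):

    from scrapy.selector import Selector

    sel = Selector(text='<div class="link-detail"><a href="/link/1">title-1</a></div>')
    sel.xpath('//a/text()').extract()        # ['title-1']  -> always a list
    sel.xpath('//a/text()').extract_first()  # 'title-1'    -> a string, or None if nothing matches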
        
    Running result (screenshot omitted)

    Contents of news.json (screenshot omitted)


    Crawling more than one site
    allowed_domains = [<site 1>, <site 2>]
    
    allowed_domains = ['dig.chouti.com']
    On recursive crawling
    Import the Request class:
    from scrapy.http import Request

    Then yield new requests from parse:
    yield Request(url = href,callback = self.parse)

    Note: href is the URL to request; callback is the method that will handle the response. It is easiest to keep the default name parse.

    The response argument of parse
    response.meta

    response.url

    response.text

    response.body
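    A short sketch of what these hold inside parse (the values in the comments are only indicative):

    def parse(self, response):
        print(response.url)    # the URL that was downloaded
        print(response.meta)   # request metadata, e.g. {'depth': 0, ...}
        print(response.text)   # the body decoded to a str
        print(response.body)   # the raw body as bytes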

    Match a tag whose attribute starts with a given value

    //tag[starts-with(@attribute,"")]

    starts-with is covered in Wu Peiqi's blog notes (Q2Day81)
    Link:
    https://www.cnblogs.com/wupeiqi/articles/6229292.html
    Filter with a regular expression
    //tag[re:test(@attribute,"pattern")]

    Match a tag whose attribute contains a given value
    //tag[contains(@attribute,"")]
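    A short sketch of the three patterns using Scrapy's Selector (the HTML snippet is made up):

    from scrapy.selector import Selector

    html = '<div><a href="http://dig.chouti.com/link/1" class="link-title">news</a></div>'
    sel = Selector(text=html)

    sel.xpath('//a[starts-with(@href, "http")]')       # attribute starts with a value
    sel.xpath(r'//a[re:test(@class, "link-\w+")]')     # attribute matches a regular expression
    sel.xpath('//a[contains(@href, "chouti")]')        # attribute contains a value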

    Scrapy removes duplicate URLs automatically, so you do not need to check for them yourself.
    The check is performed by the RFPDupeFilter class:

    from scrapy.dupefilters import RFPDupeFilter

    You can also create your own .py file with a custom duplicate check,
    but it has to be registered in settings.py:
    DUPEFILTER_CLASS = "project_name.module_name.ClassName"
    The custom filter could, for example, check whether a record already exists in a database.
    from_settings is a classmethod that uses cls to create the filter object.
    Workflow of a custom duplicate-filter file (see the sketch below):
        1. from_settings creates the object
        2. __init__ initializes the object
        3. open is called when crawling starts
        4. request_seen checks whether a request has already been seen
        5. close is called when crawling finishes
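    A minimal sketch of such a custom filter, assuming it lives in a made-up file myscrapy/dupefilters.py and is registered with DUPEFILTER_CLASS = "myscrapy.dupefilters.MyDupeFilter":

    from scrapy.dupefilters import BaseDupeFilter

    class MyDupeFilter(BaseDupeFilter):

        def __init__(self):
            # 2. initialize the object (this is where a DB connection could be stored)
            self.visited = set()

        @classmethod
        def from_settings(cls, settings):
            # 1. create the filter object; custom settings are available here
            return cls()

        def open(self):
            # 3. called when crawling starts
            pass

        def request_seen(self, request):
            # 4. return True to drop the request as a duplicate
            if request.url in self.visited:
                return True
            self.visited.add(request.url)
            return False

        def close(self, reason):
            # 5. called when crawling finishes
            pass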

    More on the pipelines file
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    # useful for handling different item types with a single interface
    from itemadapter import ItemAdapter
    from scrapy.exceptions import DropItem
    
    class MyscrapyPipeline:

        def open_spider(self,spider):
            print("crawl started")
            self.f = open('news.json', 'w')

        def close_spider(self,spider):
            print("crawl finished")
            self.f.close()

        def process_item(self, item, spider):
            # process_item runs each time the spider yields an item
            # item_obj = MyscrapyItem(href = href,title = title)
            contents = f"{item['href']}\n{item['title']}\n"
            self.f.write(contents)

            # return item # pass the item on to the next pipeline
            # raise DropItem() # discard the item

    Reading the DB setting from settings.py
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    # useful for handling different item types with a single interface
    from itemadapter import ItemAdapter
    from scrapy.exceptions import DropItem
    # return item passes the item to the next pipeline; raise DropItem() to discard it
    
    class MyscrapyPipeline:

        def __init__(self,conn_str):
            # initialize the object created by from_crawler
            self.conn_str = conn_str
            self.conn = None

        @classmethod
        def from_crawler(cls,crawler):
            # create the pipeline object; this is where a database connection string can be read from settings
            conn_str = crawler.settings.get('DB')
            return cls(conn_str)

        def open_spider(self,spider):
            print("crawl started")
            self.conn = open(self.conn_str,'w')

        def close_spider(self,spider):
            print("crawl finished")
            self.conn.close()

        def process_item(self, item, spider):
            # process_item runs each time the spider yields an item
            # item_obj = MyscrapyItem(href = href,title = title)
            contents = f"{item['href']}\n{item['title']}\n"
            self.conn.write(contents)

            # return item # pass the item on to the next pipeline
            # raise DropItem() # discard the item
    Note: the setting has to be added to settings.py
    DB = 'choutidb' # "database" name (here it is just a file name)


    Basic use of cookies

    from scrapy.http.cookies import CookieJar
    # import CookieJar
    cookie_obj = CookieJar()
    # create the cookie jar
    cookie_obj.extract_cookies(response,response.request)
    # pull the cookies from the response into cookie_obj
    print(cookie_obj._cookies)

    response attributes and methods (output of dir(response)):
    ['_DEFAULT_ENCODING', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__',
    '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__',
    '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__',
    '__weakref__', '_auto_detect_fun', '_body', '_body_declared_encoding', '_body_inferred_encoding', '_cached_benc',
    '_cached_decoded_json', '_cached_selector', '_cached_ubody', '_declared_encoding', '_encoding', '_get_body', '_get_url',
    '_headers_encoding', '_set_body', '_set_url', '_url', 'body', 'body_as_unicode', 'cb_kwargs', 'certificate',
    'copy', 'css', 'encoding', 'flags', 'follow', 'follow_all', 'headers', 'ip_address', 'json', 'meta', 'replace', 'request',
    'selector', 'status', 'text', 'url', 'urljoin', 'xpath']

    Sending a request that carries cookies
    yield Request(
        url = '<site url>',
        method = "POST",
        # body must be a str or bytes payload, e.g. a form-encoded string
        body = "<username field>=<username>&<password field>=<password>",
        headers = {
            "<header name>":"<header value>"
        },
        cookies = cookie_obj._cookies,
        # the cookies extracted above
        callback = self.some_method
        # callback method
    )
    def some_method(self,response):
        print(response.text)
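    As an alternative, Scrapy's FormRequest form-encodes the body for you; a hedged sketch (the field names depend on the actual login form):

    from scrapy.http import FormRequest

    yield FormRequest(
        url = '<login url>',
        formdata = {"<username field>": "<username>", "<password field>": "<password>"},
        cookies = cookie_obj._cookies,
        callback = self.some_method,
    )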

    Scrapy extensions (EXTENSIONS)
    The EXTENSIONS setting in settings.py:

    EXTENSIONS = {
       'scrapy.extensions.telnet.TelnetConsole': None,
       # custom extensions can be added here
       "myscrapy.extensions.MyExtend":300
    }

    Create an extensions.py file containing a class named MyExtend
    Contents of extensions.py:
    from scrapy import signals
    
    class MyExtend:
        def __init__(self,crawler):
            self.crawler = crawler
            crawler.signals.connect(self.start, signals.engine_started)
            crawler.signals.connect(self.close, signals.spider_closed)
    
        @classmethod
        def from_crawler(cls,crawler):
            return cls(crawler)
    
        def start(self):
            print("extension: engine started")

        def close(self):
            print("extension: spider closed")
    Running result (screenshot omitted)

    A quick walk through the settings file
    BOT_NAME = 'myscrapy'
    # name of the crawler project

    SPIDER_MODULES = ['myscrapy.spiders']
    NEWSPIDER_MODULE = 'myscrapy.spiders'
    # where the spiders live

    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # USER_AGENT = 'myscrapy (+http://www.yourdomain.com)'
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    # the User-Agent header sent with every request

    # Obey robots.txt rules
    ROBOTSTXT_OBEY = True
    # whether to obey the site's robots.txt

    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    CONCURRENT_REQUESTS = 16
    # maximum number of concurrent requests (default 16)

    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    DOWNLOAD_DELAY = 3
    # delay of 3 seconds between requests to the same site

    # The download delay setting will honor only one of:
    CONCURRENT_REQUESTS_PER_DOMAIN = 16
    # concurrency limit per domain
    CONCURRENT_REQUESTS_PER_IP = 16
    # concurrency limit per IP
    
    # Disable cookies (enabled by default)
    COOKIES_ENABLED = True
    # whether cookies are sent and received
    COOKIES_DEBUG = True
    # whether to log cookies while running

    # Disable Telnet Console (enabled by default)
    TELNETCONSOLE_ENABLED = False
    # whether to enable the Telnet console, which lets you inspect and pause a running crawler

    EXTENSIONS = {
       'scrapy.extensions.telnet.TelnetConsole': None,
       # custom extensions can be added here
       "myscrapy.extensions.MyExtend":300
    }

    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'myscrapy.pipelines.MyscrapyPipeline': 300,
       # enable the pipeline that saves items to a local file
    }

    DEPTH_LIMIT = 2
    # crawl depth limit
    DEPTH_PRIORITY = 0
    # 0: depth-first, 1: breadth-first
    DB = 'choutidb'
    # custom setting used by the pipeline above

    For pipelines and extensions, the smaller the number, the higher the priority.


    2021-01-25
