Getting Started with Web Crawlers (4)


    1. Capturing mobile-side data

    Configure Fiddler
        Tools -> Options -> Connections -> check "Allow remote computers to connect"
        note the Fiddler listening port: xxxx
    Install Fiddler's certificate on the mobile device:
        make sure the phone and the PC running Fiddler are on the same network segment
        in the phone's browser visit: <ip of the Fiddler machine>:<Fiddler port>
        once the certificate has downloaded, install it and trust it
    Configure the phone's network:
        set the phone's proxy to that ip:port (a quick way to verify the proxy from the PC is sketched below)
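
    Before pointing the phone at the proxy, it can help to confirm from another machine that Fiddler is reachable and actually intercepting traffic. A minimal sketch, assuming the requests library is installed; the ip and port below are placeholders for whatever Fiddler reports on your PC:

    # verify_fiddler_proxy.py -- sanity-check that the Fiddler proxy answers
    import requests

    FIDDLER_PROXY = 'http://192.168.1.100:8888'   # hypothetical ip:port of the Fiddler machine
    proxies = {'http': FIDDLER_PROXY, 'https': FIDDLER_PROXY}

    try:
        # if Fiddler allows remote connections, this request shows up in its session list
        resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
        print('proxy OK, status:', resp.status_code)
    except requests.exceptions.RequestException as exc:
        print('proxy not reachable:', exc)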

    2. First steps with scrapy

    settings

    ROBOTSTXT_OBEY = False
    
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    
    ITEM_PIPELINES = {
       'firstblood.pipelines.FirstbloodPipeline': 300,
    }
    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class FirstSpider(scrapy.Spider):
        # name of the spider
        name = 'first'
        # allowed domains
        # allowed_domains = ['www.xxx.com']
        # list of start urls
        start_urls = ['https://www.qiushibaike.com/text/']
    
        # def parse(self, response):
        #     div_list = response.xpath('//div[@id="content-left"]/div')
        #     for div in div_list:
        #         # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
        #         # if the xpath is guaranteed to return a single-element list, .extract_first() can be used
        #         author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
        #         content = div.xpath('./a[1]/div/span//text()').extract()
        #         content = ''.join(content)
        #
        #         print(author, content)
        #
        # # implement parsing + persistent storage
        # # 1. persistence driven by a terminal command
        #     # can only persist the return value of the parse method to a local file
        # # 2. persistence via item pipelines

        # 1. persistence driven by a terminal command: scrapy crawl first -o xxx.csv
        def parse(self, response):
            div_list = response.xpath('//div[@id="content-left"]/div')
            all_data = []
            for div in div_list:
                # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
                # if the xpath is guaranteed to return a single-element list, .extract_first() can be used
                author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
                content = div.xpath('./a[1]/div/span//text()').extract()
                content = ''.join(content)
    
                dic = {
                    'author': author,
                    'content': content
                }
    
                all_data.append(dic)
    
            return all_data
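
    The .extract() / .extract_first() distinction noted in the comments above can be tried on a bare Selector, outside of a running spider. A small sketch; the HTML string is made up purely for illustration:

    # extract_demo.py -- extract() vs extract_first() on a scrapy Selector
    from scrapy.selector import Selector

    html = '<div><h2>author-1</h2><span>line one</span><span>line two</span></div>'
    sel = Selector(text=html)

    # extract() always returns a list of strings (possibly empty)
    print(sel.xpath('//span/text()').extract())        # ['line one', 'line two']

    # extract_first() returns the first match, or None if nothing matched
    print(sel.xpath('//h2/text()').extract_first())    # 'author-1'
    print(sel.xpath('//h1/text()').extract_first())    # None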

    Parsing data + pipeline-based persistent storage

    settings

    ITEM_PIPELINES = {
       'boosPro.pipelines.BoosproPipeline': 300,
       'boosPro.pipelines.MysqlPipeline': 301,
       'boosPro.pipelines.RedisPipeline': 302,
    }
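
    The integer after each pipeline class is its priority: lower numbers run first, so an item passes through BoosproPipeline, then MysqlPipeline, then RedisPipeline, and each process_item should return the item so the next pipeline receives it. As an aside, if these pipelines should only apply to this one spider rather than the whole project, scrapy also reads a custom_settings class attribute; a minimal sketch of that variant:

    # same pipelines, but scoped to BossSpider only via custom_settings
    import scrapy


    class BossSpider(scrapy.Spider):
        name = 'boss'
        custom_settings = {
            'ITEM_PIPELINES': {
                'boosPro.pipelines.BoosproPipeline': 300,
                'boosPro.pipelines.MysqlPipeline': 301,
                'boosPro.pipelines.RedisPipeline': 302,
            }
        }
        # ... start_urls and parse unchanged from the spider shown below
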
    # -*- coding: utf-8 -*-
    import scrapy
    from boosPro.items import BoosproItem
    
    
    class BossSpider(scrapy.Spider):
        name = 'boss'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&scity=101010100&industry=&position=']
    
        url = 'https://www.zhipin.com/c101010100/?query=python爬虫&page=%d&ka=page-2'
        page = 1
        # parsing + pipeline-based persistent storage
        def parse(self, response):
            li_list = response.xpath('//div[@class="job-list"]/ul/li')
            for li in li_list:
                job_name = li.xpath('.//div[@class="info-primary"]/h3/a/div/text()').extract_first()
                salary = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first()
                company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()
    
                # instantiate an item object
                item = BoosproItem()
                # pack all of the parsed fields into the item object
                item["job_name"] = job_name
                item["salary"] = salary
                item["company"] = company

                # hand the item over to the pipelines
                yield item
    
            if self.page <= 3:
                print("执行!!!")
                self.page += 1
                new_url = format(self.url % self.page)
                print(new_url)
                # 手动发起请求
                yield scrapy.Request(url=new_url, callback=self.parse)
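
    Yielding scrapy.Request from inside parse, as above, is the manual way to follow pagination. When the page range is known in advance, the same crawl can also be written by overriding start_requests; a minimal sketch of that alternative (BossPagesSpider is a made-up name, the URL template is the one from the spider above):

    # alternative pagination: generate all page requests up front
    import scrapy


    class BossPagesSpider(scrapy.Spider):
        name = 'boss_pages'   # hypothetical spider name
        url = 'https://www.zhipin.com/c101010100/?query=python爬虫&page=%d&ka=page-2'

        def start_requests(self):
            # pages 1..3, mirroring the `if self.page <= 3` loop above
            for page in range(1, 4):
                yield scrapy.Request(url=self.url % page, callback=self.parse)

        def parse(self, response):
            # the same extraction logic as BossSpider.parse would go here
            pass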

    items

    import scrapy


    class BoosproItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        job_name = scrapy.Field()
        salary = scrapy.Field()
        company = scrapy.Field()
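
    A scrapy.Item behaves like a dict restricted to the declared fields, which is why both the spider and the pipelines read and write item["..."] directly. A quick standalone illustration (the sample values are made up):

    import scrapy


    class BoosproItem(scrapy.Item):
        job_name = scrapy.Field()
        salary = scrapy.Field()
        company = scrapy.Field()


    item = BoosproItem(job_name='python crawler engineer', salary='15-25K', company='some company')
    print(item['job_name'])      # dict-style access
    print(dict(item))            # convert to a plain dict, e.g. before serializing
    try:
        item['unknown'] = 'x'    # fields not declared on the class are rejected
    except KeyError as exc:
        print('rejected:', exc)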

    pipelines

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import json

    import pymysql
    from redis import Redis
    
    
    class BoosproPipeline(object):
        fp = None
    
        def open_spider(self, spider):
            print("开始爬虫......")
            self.fp = open('./boss.txt', 'w', encoding='utf-8')
    
        def close_spider(self, spider):
            print("结束爬虫.......")
            self.fp.close()
        # 爬虫文件每向管道提交一次item,则该方法就被调用一次
        # 参数:item 就是管道接收到的item类型对象
    
        def process_item(self, item, spider):
            self.fp.write(item["job_name"] + ":" + item["salary"] + ":" + item["company"] + "
    ")
            return item
    
    
    class MysqlPipeline(object):
        conn = None
        cursor = None
    
        def open_spider(self, spider):
            self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='scrapy', charset='utf8')
            print(self.conn)
    
        def process_item(self, item, spider):
            self.cursor = self.conn.cursor()
            try:
                # let pymysql quote the values instead of formatting them into the SQL string
                self.cursor.execute('insert into boss values (%s, %s, %s)', (item["job_name"], item["salary"], item["company"]))
                self.conn.commit()
            except Exception as e:
                print(e)
                self.conn.rollback()
            return item
    
        def close_spider(self, spider):
            # close the cursor (if any) before the connection
            if self.cursor:
                self.cursor.close()
            self.conn.close()
    
    
    class RedisPipeline(object):
        conn = None
    
        def open_spider(self, spider):
            self.conn = Redis(host='127.0.0.1', port=6379)
            print(self.conn)
    
        def process_item(self, item, spider):
            dic = {
                'name': item["job_name"],
                'salary': item["salary"],
                'company': item["company"]
            }

            # redis-py only accepts str/bytes/numbers as values, so serialize the dict first
            self.conn.lpush('boss', json.dumps(dic))
            return item
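
    The MysqlPipeline above assumes a boss table already exists in the scrapy database, and the RedisPipeline leaves its results in the boss list. A minimal setup-and-check sketch, assuming the same local MySQL (root, empty password) and Redis instances as in the pipelines:

    # setup_and_check.py -- create the table MysqlPipeline expects and,
    # after `scrapy crawl boss`, peek at what RedisPipeline stored
    import json

    import pymysql
    from redis import Redis

    # 1. create the boss table (three text columns, matching the insert above)
    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='scrapy', charset='utf8')
    with conn.cursor() as cursor:
        cursor.execute(
            'create table if not exists boss('
            'job_name varchar(255), salary varchar(255), company varchar(255))'
        )
    conn.commit()
    conn.close()

    # 2. read back what RedisPipeline pushed during the crawl
    r = Redis(host='127.0.0.1', port=6379)
    for raw in r.lrange('boss', 0, 4):   # first five entries
        print(json.loads(raw))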