• scrapy name-scoring (起名评分) case study


    proxy/ProxyHelper.py (proxy helper)

    import requests
    
    
    class ProxyHelperObject(object):
        def __init__(self):
            self.proxy = self.__requests_get_proxy()
    
        def get_proxy(self):
            return self.proxy
    
        def update_proxy(self, old_proxy):
            # Only fetch a new proxy if the current one is the proxy that just
            # failed; this keeps concurrent callers from each burning an API call.
            if self.proxy == old_proxy:
                self.proxy = self.__requests_get_proxy()
                print('Fetched a new proxy:', self.proxy)
            return self.proxy
    
        def __requests_get_proxy(self):
            # Fetch one proxy from the Mogu proxy API; the appKey is account-specific.
            # url = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=a3bf7f336f464299a043d6c6988f3665&count=1&expiryDate=0&format=2&newLine=2'
            url = 'http://piping.mogumiao.com/proxy/api/get_ip_al?appKey=3ef0dbd1e6ac43fba9a3a4a39ea96846&count=1&expiryDate=0&format=2&newLine=2'
            response = requests.get(url)
            # The API returns a bare 'ip:port' line; prepend the scheme.
            return 'http://' + response.text.strip()
    
    
    if __name__ == '__main__':
        helper = ProxyHelperObject()
        helper.update_proxy('http://60.167.133.179:23516')
        print(helper.proxy)
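
    A quick way to sanity-check a freshly fetched proxy (a sketch; it assumes httpbin.org is reachable and that get_proxy() returns an 'http://ip:port' string as above):

    import requests

    def proxy_works(proxy_url, timeout=5):
        """Return True if the proxy can fetch a test page within the timeout."""
        try:
            response = requests.get('http://httpbin.org/ip',
                                    proxies={'http': proxy_url},
                                    timeout=timeout)
            return response.status_code == 200
        except requests.RequestException:
            return False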

    shujuku/qiming_mysql.py (database storage)

    """
    CREATE TABLE qiming_data(
        id int primary key auto_increment,
        name varchar(10),
        lishu_score int,
        bazi_score int) default charset=utf8mb4;
    """
    import pymysql
    
    
    class QimingMysql(object):
        # Initialization simply connects to the database
        def __init__(self):
            self.conn = pymysql.connect(host='127.0.0.1', user='root', passwd='510520', db='pachong', charset='utf8mb4')
            self.cursor = self.conn.cursor()
    
        def execute_insert_sql(self, sql, qiming_data):
            self.cursor.execute(sql, qiming_data)
            self.conn.commit()
    
        def __del__(self):
            # Close the cursor and connection when the object is garbage-collected.
            self.cursor.close()
            self.conn.close()
    
    
    if __name__ == '__main__':
        qiming = QimingMysql()
        insert_sql = "INSERT INTO qiming_data(name, lishu_score, bazi_score) VALUES(%s, %s, %s)"
        data = ('花好月圆夜', 88, 89)
        qiming.execute_insert_sql(insert_sql, data)
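
    When many names are scored in one run, batching inserts saves round-trips. A sketch (QimingMysqlBatch and execute_insert_many are illustrative additions, not part of the original class):

    class QimingMysqlBatch(QimingMysql):
        def execute_insert_many(self, sql, rows):
            # pymysql's executemany() sends every parameter tuple in one call.
            self.cursor.executemany(sql, rows)
            self.conn.commit()

    # Usage:
    # batch = QimingMysqlBatch()
    # batch.execute_insert_many(insert_sql, [('名甲', 80, 85), ('名乙', 90, 70)])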

    qiming/qiming/settings.py (added settings)

    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    # Disable cookies (enabled by default)
    COOKIES_ENABLED = False
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
       'qiming.middlewares.QimingDownloaderMiddleware': 560,
    }
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'qiming.pipelines.QimingPipeline': 300,
    }
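
    Settings that commonly accompany a rotating proxy (these are assumptions, not part of the original project, though all are standard Scrapy options):

    # Retry failed requests a few times before giving up on a name.
    RETRY_ENABLED = True
    RETRY_TIMES = 3
    # Throttle slightly so a single proxy is not hammered.
    DOWNLOAD_DELAY = 0.5
    CONCURRENT_REQUESTS = 8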

    qiming/qiming/spiders/threetong.py

    import scrapy
    from qiming.items import QimingItem
    
    class ThreetongSpider(scrapy.Spider):
        name = 'qiming'
        allowed_domains = ['threetong.com', '5156edu.com']  # the scoring site and the character-list site
        start_urls = ['http://xh.5156edu.com/xm/nu.html']
    
        def parse(self, response, **kwargs):
            # Single characters scraped from the name-character list page.
            name_word_list = response.xpath('//a[@class="fontbox"]/text()').extract()
            # Collect two-character combinations in a separate list first:
            # appending to name_word_list while iterating over it would loop forever.
            double_word_names = []
            for name_word1 in name_word_list:
                for name_word2 in name_word_list:
                    double_word_names.append(name_word1 + name_word2)
            name_word_list.extend(double_word_names)
    
            # POST each candidate name to the scoring endpoint.
            url = 'https://www.threetong.com/ceming/baziceming/xingmingceshi.php'
            # The birth-date/location fields stay fixed; only 'name' changes per request.
            form = {
                'isbz': '1',
                'txtName': '',
                'name': '魅蓝',
                'rdoSex': '1',
                'data_type': '0',
                'cboYear': '2020',
                'cboMonth': '12',
                'cboDay': '26',
                'cboHour': '20 - 戌时',
                'cboMinute': '9分',
                'pid': '广东',
                'cid': '韶关',
                'zty': '0',
            }
            for name in name_word_list:
                form['name'] = name
                name_request = scrapy.FormRequest(url=url, formdata=form, callback=self.parse_score)
                yield name_request
    
        def parse_score(self, response):
            # Debug aid: dump the raw page if the XPaths stop matching.
            # with open('qiming.html', 'wb') as f:
            #     f.write(response.body)
            name = response.xpath('//ul[@class="bazi_box"]/li[1]/text()').extract_first()
            lishu_score = response.xpath('//span[@class="df_1 left"]/text()').extract_first()
            bazi_score = response.xpath('//span[@class="df_1 right"]/text()').extract_first()
            # Skip the response if the page layout changed or the lookup failed.
            if not (name and lishu_score and bazi_score):
                return
            item = QimingItem()
            item['name'] = name
            # The score text ends with the number after a colon; keep that part.
            item['lishu_score'] = int(lishu_score.split(':')[-1])
            item['bazi_score'] = int(bazi_score.split(':')[-1])
            yield item
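
    The nested loop in parse() is a Cartesian product of the character list with itself; itertools.product expresses the same thing more compactly (a sketch with placeholder characters):

    from itertools import product

    single = ['梅', '兰', '竹']  # placeholder characters
    doubles = [a + b for a, b in product(single, repeat=2)]
    # ['梅梅', '梅兰', '梅竹', '兰梅', ...]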

    qiming/qiming/items.py

    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class QimingItem(scrapy.Item):
        # define the fields for your item here like:
        name = scrapy.Field()
        lishu_score = scrapy.Field()
        bazi_score = scrapy.Field()

    qiming/qiming/pipelines.py

    # useful for handling different item types with a single interface
    from itemadapter import ItemAdapter
    from project_01.shujuku.qiming_mysql import QimingMysql
    
    
    class QimingPipeline:
        def __init__(self):
            self.qiming = QimingMysql()
    
        def process_item(self, item, spider):
            name = item['name']
            lishu_score = item['lishu_score']
            bazi_score = item['bazi_score']
            insert_sql = "INSERT INTO qiming_data(name, lishu_score, bazi_score) VALUES(%s, %s, %s)"
            data = (name, lishu_score, bazi_score)
            self.qiming.execute_insert_sql(insert_sql, data)
            return item
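
    An idiomatic alternative (a sketch, not the original code): Scrapy pipelines usually manage resources in open_spider/close_spider so the connection's lifetime matches the crawl rather than the Python object's:

    class QimingPipelineManaged:
        def open_spider(self, spider):
            self.qiming = QimingMysql()

        def close_spider(self, spider):
            # Dropping the reference triggers QimingMysql.__del__,
            # which closes the cursor and connection.
            del self.qiming

        def process_item(self, item, spider):
            insert_sql = "INSERT INTO qiming_data(name, lishu_score, bazi_score) VALUES(%s, %s, %s)"
            self.qiming.execute_insert_sql(insert_sql, (item['name'], item['lishu_score'], item['bazi_score']))
            return item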

    qiming/qiming/middlewares.py

    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    from scrapy import signals
    
    # useful for handling different item types with a single interface
    
    from itemadapter import is_item, ItemAdapter
    from project_01.proxy.ProxyHelper import ProxyHelperObject
    from twisted.internet.defer import DeferredLock
    
    class QimingSpiderMiddleware:
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.
    
            # Should return None or raise an exception.
            return None
    
        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.
    
            # Must return an iterable of Request, or item objects.
            for i in result:
                yield i
    
        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.
    
            # Should return either None or an iterable of Request or item objects.
            pass
    
        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn’t have a response associated.
    
            # Must return only requests (not items).
            for r in start_requests:
                yield r
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    
    
    class QimingDownloaderMiddleware:
        def __init__(self):
            self.helper = ProxyHelperObject()
            self.lock = DeferredLock()
    
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_request(self, request, spider):
            # Called for each request that goes through the downloader
            # middleware.
    
            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            self.lock.acquire()
            request.meta['proxy'] = self.helper.get_proxy()
            print('Using proxy:', self.helper.proxy)
            self.lock.release()
            return None
    
        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
            # Must either:
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            self.lock.acquire()
            if response.status != 200:
                # A non-200 response usually means the proxy was blocked: rotate and retry.
                self.helper.update_proxy(request.meta['proxy'])
                self.lock.release()
                # dont_filter lets the rescheduled request past the dupefilter.
                request.dont_filter = True
                return request
            self.lock.release()
            return response
    
        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
    
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            self.lock.acquire()
            # A download error usually means the proxy died: rotate and retry.
            self.helper.update_proxy(request.meta['proxy'])
            self.lock.release()
            # dont_filter lets the rescheduled request past the dupefilter.
            request.dont_filter = True
            return request
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
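
    One caveat on the locking above: DeferredLock.acquire() returns a Deferred, and the bare acquire()/release() calls only behave as intended because Scrapy's reactor is single-threaded. A more defensive variant (a sketch of the same method) pairs them with try/finally so the lock is released even if get_proxy() raises:

        def process_request(self, request, spider):
            self.lock.acquire()
            try:
                request.meta['proxy'] = self.helper.get_proxy()
            finally:
                self.lock.release()
            return None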

    qiming/qiming/run_qiming.py (launcher script)

    from scrapy.cmdline import execute
    
    execute('scrapy crawl qiming'.split())
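
    An equivalent programmatic launch (a sketch using Scrapy's public API; get_project_settings() picks up settings.py automatically when run from the project directory):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('qiming')  # the spider name defined in ThreetongSpider
    process.start()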