proxy/ProxyHelper.py (proxy helper)
import requests


class ProxyHelperObject(object):
    def __init__(self):
        self.proxy = self.__requests_get_proxy()

    def get_proxy(self):
        return self.proxy

    def update_proxy(self, old_proxy):
        # Only fetch a replacement if the failed proxy is still the current one.
        if self.proxy == old_proxy:
            self.proxy = self.__requests_get_proxy()
            print('Fetched a new proxy:', self.proxy)
        return self.proxy

    def __requests_get_proxy(self):
        # url = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=a3bf7f336f464299a043d6c6988f3665&count=1&expiryDate=0&format=2&newLine=2'
        url = 'http://piping.mogumiao.com/proxy/api/get_ip_al?appKey=3ef0dbd1e6ac43fba9a3a4a39ea96846&count=1&expiryDate=0&format=2&newLine=2'
        response = requests.get(url)
        return 'http://' + response.text.strip()


if __name__ == '__main__':
    helper = ProxyHelperObject()
    helper.update_proxy('http://60.167.133.179:23516')
    print(helper.proxy)
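If the API quota runs out or the key is rejected, services like this usually return an error body instead of an ip:port pair, and the helper above would happily prepend 'http://' to it. A small validation guard, as a sketch only; fetch_proxy and PROXY_PATTERN are illustrative names, not part of the original helper:

import re
import requests

PROXY_PATTERN = re.compile(r'^\d{1,3}(?:\.\d{1,3}){3}:\d+$')

def fetch_proxy(api_url):
    """Fetch one proxy and make sure the body really looks like ip:port."""
    text = requests.get(api_url, timeout=10).text.strip()
    if not PROXY_PATTERN.match(text):
        raise ValueError('unexpected proxy API response: %r' % text)
    return 'http://' + text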
shujuku/qiming_mysql.py (database storage)
""" CREATE TABLE qiming_data( id int primary key auto_increment, name varchar(10), lishu_score int, bazi_score int) default charset=utf8mb4; """ import pymysql class QimingMysql(object): # 初始化就是连接数据库 def __init__(self): self.conn = pymysql.connect(host='127.0.0.1', user='root', passwd='510520', db='pachong', charset='utf8mb4') self.cursor = self.conn.cursor() def execute_insert_sql(self, sql, qiming_data): self.cursor.execute(sql, qiming_data) self.conn.commit() def __del__(self): self.cursor.close() self.conn.close() if __name__ == '__main__': qiming = QimingMysql() insert_sql = "INSERT INTO qiming_data(name, lishu_score, bazi_score) VALUES(%s, %s, %s)" data = ('花好月圆夜', 88, 89) qiming.execute_insert_sql(insert_sql, data)
qiming/qiming/settings.py (added configuration items)
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'qiming.middlewares.QimingDownloaderMiddleware': 560,
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'qiming.pipelines.QimingPipeline': 300,
}
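For reference, the priority 560 chosen here places the custom middleware's process_request ahead of Scrapy's built-in HttpProxyMiddleware (750 by default), which is what lets the proxy written into request.meta take effect. A quick way to confirm that default on an installed Scrapy (the printed value is assumed to be 750 on a standard install):

from scrapy.settings.default_settings import DOWNLOADER_MIDDLEWARES_BASE

print(DOWNLOADER_MIDDLEWARES_BASE['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'])
# expected: 750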
qiming/qiming/spiders/threetong.py
import scrapy

from qiming.items import QimingItem


class ThreetongSpider(scrapy.Spider):
    name = 'qiming'
    allowed_domains = ['threetong.com', '5156edu.com']
    start_urls = ['http://xh.5156edu.com/xm/nu.html']

    def parse(self, response, **kwargs):
        name_word_list = response.xpath('//a[@class="fontbox"]/text()').extract()
        # Build the two-character names in a separate list; appending to
        # name_word_list while iterating it would make the loop grow forever.
        double_word_names = []
        for name_word1 in name_word_list:
            for name_word2 in name_word_list:
                double_word_names.append(name_word1 + name_word2)
        name_word_list.extend(double_word_names)

        # POST request to get the score for each name
        url = 'https://www.threetong.com/ceming/baziceming/xingmingceshi.php'
        form = {
            'isbz': '1',
            'txtName': '龚',
            'name': '魅蓝',
            'rdoSex': '1',
            'data_type': '0',
            'cboYear': '2020',
            'cboMonth': '12',
            'cboDay': '26',
            'cboHour': '20 - 戌时',
            'cboMinute': '9分',
            'pid': '广东',
            'cid': '韶关',
            'zty': '0',
        }
        for name in name_word_list:
            form['name'] = name
            name_request = scrapy.FormRequest(url=url, formdata=form, callback=self.parse_score)
            yield name_request

    def parse_score(self, response):
        # with open('qiming.html', 'wb') as f:
        #     f.write(response.body)
        name = response.xpath('//ul[@class="bazi_box"]/li[1]/text()').extract_first()
        lishu_score = response.xpath('//span[@class="df_1 left"]/text()').extract_first()
        bazi_score = response.xpath('//span[@class="df_1 right"]/text()').extract_first()
        # print(name, lishu_score, bazi_score)
        item = QimingItem()
        item['name'] = name
        item['lishu_score'] = int(lishu_score.split(':')[-1])
        item['bazi_score'] = int(bazi_score.split(':')[-1])
        yield item
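Note that scoring every single character plus every ordered two-character combination makes the number of POST requests grow roughly quadratically with the size of the character list. A back-of-the-envelope estimate, assuming the page lists about 400 characters (the real count depends on the site):

n = 400              # assumed number of single-character candidates on the page
total = n + n * n    # singles plus ordered two-character combinations
print(total)         # 160400 form submissions against the scoring endpoint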
qiming/qiming/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class QimingItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    lishu_score = scrapy.Field()
    bazi_score = scrapy.Field()
qiming/qiming/pipelines.py
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

from project_01.shujuku.qiming_mysql import QimingMysql


class QimingPipeline:
    def __init__(self):
        self.qiming = QimingMysql()

    def process_item(self, item, spider):
        name = item['name']
        lishu_score = item['lishu_score']
        bazi_score = item['bazi_score']
        insert_sql = "INSERT INTO qiming_data(name, lishu_score, bazi_score) VALUES(%s, %s, %s)"
        data = (name, lishu_score, bazi_score)
        self.qiming.execute_insert_sql(insert_sql, data)
        return item
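Scrapy also supports opening the database connection in open_spider and releasing it in close_spider rather than in __init__/__del__. A sketch of that variant, reusing the same QimingMysql helper; the class name QimingPipelineAlt is illustrative, not from the original project:

from project_01.shujuku.qiming_mysql import QimingMysql


class QimingPipelineAlt:
    def open_spider(self, spider):
        # connect once when the spider starts
        self.qiming = QimingMysql()

    def process_item(self, item, spider):
        insert_sql = ("INSERT INTO qiming_data(name, lishu_score, bazi_score) "
                      "VALUES(%s, %s, %s)")
        self.qiming.execute_insert_sql(
            insert_sql, (item['name'], item['lishu_score'], item['bazi_score']))
        return item

    def close_spider(self, spider):
        # dropping the reference lets QimingMysql.__del__ close the cursor and connection
        del self.qiming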
qiming/qiming/middlewares.py
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter

from project_01.proxy.ProxyHelper import ProxyHelperObject
from twisted.internet.defer import DeferredLock


class QimingSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class QimingDownloaderMiddleware:
    def __init__(self):
        self.helper = ProxyHelperObject()
        self.lock = DeferredLock()

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        self.lock.acquire()
        request.meta['proxy'] = self.helper.get_proxy()
        print('Using proxy:', self.helper.proxy)
        self.lock.release()
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        self.lock.acquire()
        if response.status != 200:
            self.helper.update_proxy(request.meta['proxy'])
            self.lock.release()
            return request
        self.lock.release()

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        self.lock.acquire()
        self.helper.update_proxy(request.meta['proxy'])
        self.lock.release()
        return request

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
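One caveat with returning the original request from process_response/process_exception: it goes back through the scheduler, whose duplicate filter has already seen its fingerprint, so the retry can be dropped silently, and nothing caps how often a request is retried. A sketch of a bounded retry helper that could be added to QimingDownloaderMiddleware; the meta key proxy_retry_times and the MAX_PROXY_RETRIES limit are assumptions, not part of the original code:

MAX_PROXY_RETRIES = 3  # assumed limit, tune as needed

    def _retry_with_new_proxy(self, request):
        """Build a retry copy that bypasses the dupefilter, or None once the limit is hit."""
        retries = request.meta.get('proxy_retry_times', 0)
        if retries >= MAX_PROXY_RETRIES:
            return None
        retry_request = request.replace(dont_filter=True)  # otherwise the scheduler drops the duplicate
        retry_request.meta['proxy_retry_times'] = retries + 1
        retry_request.meta['proxy'] = self.helper.update_proxy(request.meta['proxy'])
        return retry_request

    # The non-200 branch in process_response would then become:
    #     return self._retry_with_new_proxy(request) or response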
qiming/qiming/run_qiming.py (run script)
from scrapy.cmdline import execute

execute('scrapy crawl qiming'.split())
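An equivalent way to launch the crawl from a script is Scrapy's CrawlerProcess API, which blocks until the spider finishes; a sketch using the project's settings:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('qiming')   # spider name defined on ThreetongSpider
process.start()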