1. Create the crawler project
scrapy startproject demo1
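After this command the project skeleton should look roughly like the tree below (details vary slightly between Scrapy versions; in older versions middlewares.py is not generated automatically and has to be created by hand inside the same package):

demo1/
    scrapy.cfg
    demo1/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py

The files edited in the following steps (items.py, middlewares.py, pipelines.py, settings.py and the spider under spiders/) all live in the inner demo1/ package.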
2. Decide which information to collect, create the database (chapter18.sql), and define the item fields (items.py)
chapter18.sql
create database chapter18;
use chapter18;
create table info(
    id int(10) not null auto_increment primary key,
    name varchar(30),
    url varchar(100),
    hits int(15),
    comment int(15)
);
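An optional quick check in the MySQL client confirms that the table has the expected columns before any scraping starts:

use chapter18;
show tables;
describe info;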
items.py
# -*- coding: utf-8 -*-
import scrapy
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html


class Demo1Item(scrapy.Item):
    # The fields we care about for each blog post
    name = scrapy.Field()
    url = scrapy.Field()
    hits = scrapy.Field()
    comment = scrapy.Field()
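Demo1Item behaves like a dictionary, so the item definition can be sanity-checked in a Python shell started from the project root. The values below are made up purely to show the shape of the data the spider will later produce (each field holds a list):

from demo1.items import Demo1Item

item = Demo1Item()
item["name"] = ["a post title"]            # hypothetical value
item["url"] = ["http://example.com/post"]  # hypothetical value
item["hits"] = ["100"]
item["comment"] = ["3"]
print(dict(item))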
3. Write the downloader-middleware classes (middlewares.py) and register them under DOWNLOADER_MIDDLEWARES in settings.py
# -*- coding: utf-8 -*-
# Random selection of proxy / user agent
import random
# Base middleware class for HTTP proxies
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
# Base middleware class for user agents
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


# Proxy (IP) pool middleware
class HTTPPROXY(HttpProxyMiddleware):
    # Initialisation; note the argument must be ip=''
    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        item = random.choice(IPPOOL)
        try:
            print("Current proxy IP: " + item["ipaddr"])
            request.meta["proxy"] = "http://" + item["ipaddr"]
        except Exception as e:
            print(e)
            pass


# Proxy pool (sample addresses; replace with working proxies)
IPPOOL = [
    {"ipaddr": "182.117.102.10:8118"},
    {"ipaddr": "121.31.102.215:8123"},
    {"ipaddr": "122.94.128.49:8118"}
]


# User-agent middleware
class USERAGENT(UserAgentMiddleware):
    # Initialisation; note the argument must be user_agent=''
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        item = random.choice(UPPOOL)
        try:
            print("Current User-Agent: " + item)
            request.headers.setdefault('User-Agent', item)
        except Exception as e:
            print(e)
            pass


# User-agent pool
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
]

# Add the following to settings.py (this also disables cookies and registers the item pipeline)
'''
#==============================================

# Disable cookies
COOKIES_ENABLED = False

# Downloader middleware mapping
DOWNLOADER_MIDDLEWARES = {
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    # '<project name>.middlewares.HTTPPROXY': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    '<project name>.middlewares.USERAGENT': 1
}

# Item pipeline mapping
ITEM_PIPELINES = {
    '<project name>.pipelines.<pipeline class name>': 300,
}

# Remember to turn off robots.txt compliance (settings.py enables it by default, so find the line and set it to False)
ROBOTSTXT_OBEY = False

#==============================================
'''
For example, add the following to settings.py:
#==============================================

# Disable cookies
COOKIES_ENABLED = False

# Downloader middleware mapping
DOWNLOADER_MIDDLEWARES = {
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    # 'demo1.middlewares.HTTPPROXY': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'demo1.middlewares.USERAGENT': 1
}

#==============================================
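To also rotate proxies through the HTTPPROXY class, enable the two commented-out entries as well; the priorities below simply follow the commented example in middlewares.py, and the addresses in IPPOOL must of course be replaced with working proxies:

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    'demo1.middlewares.HTTPPROXY': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'demo1.middlewares.USERAGENT': 1
}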
4. Write the item pipeline (pipelines.py) and register it under ITEM_PIPELINES in settings.py
# -*- coding: utf-8 -*-
import pymysql as pm
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class Demo1Pipeline(object):
    # Set up the database connection
    def __init__(self):
        # Create the connection
        self.db = pm.connect(host='localhost', user='root', password='123456',
                             database='chapter18', charset='utf8')
        # Create the cursor
        self.cur = self.db.cursor()

    def process_item(self, item, spider):
        # Each field holds a list of values (possibly empty); insert them row by row
        for j in range(0, len(item["name"])):
            name = item["name"][j]
            url = item["url"][j]
            hits = item["hits"][j]
            comment = item["comment"][j]
            sql = "insert into info(name,url,hits,comment) values(%s,%s,%s,%s)"
            try:
                self.cur.execute(sql, (name, url, hits, comment))
                self.db.commit()
                print("OK")
            except:
                self.db.rollback()
                print("Error inserting row")
        return item

    # Close the cursor and the database connection
    def close_spider(self, spider):
        self.cur.close()
        self.db.close()
For example, add the following to settings.py:
# Item pipeline mapping
ITEM_PIPELINES = {
    'demo1.pipelines.Demo1Pipeline': 300,
}
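Before running the whole crawl it can be worth testing the database side on its own. The sketch below reuses the connection parameters from pipelines.py and inserts a single throw-away row (the values are invented):

import pymysql as pm

db = pm.connect(host='localhost', user='root', password='123456',
                database='chapter18', charset='utf8')
cur = db.cursor()
cur.execute("insert into info(name,url,hits,comment) values(%s,%s,%s,%s)",
            ("test title", "http://example.com", 1, 0))
db.commit()
cur.execute("select count(*) from info")
print(cur.fetchone())   # e.g. (1,)
cur.close()
db.close()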
5. Generate and write the spider (test.py), and add the browser-mimicking helper class (HeadersHelper.py)
# scrapy genspider -t basic test <top-level domain of the site to crawl>
scrapy genspider -t basic test hexun.com
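This creates demo1/spiders/test.py from the basic template; depending on the Scrapy version the generated skeleton looks roughly like the following, and the listing after it replaces the stub parse() method with the real crawl logic:

# -*- coding: utf-8 -*-
import scrapy


class TestSpider(scrapy.Spider):
    name = "test"
    allowed_domains = ["hexun.com"]
    start_urls = (
        'http://www.hexun.com/',
    )

    def parse(self, response):
        pass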
test.py source:
# -*- coding: utf-8 -*-
import scrapy
from ..items import Demo1Item
from .HeadersHelper import HeadersHelper
from scrapy.http import Request


class TestSpider(scrapy.Spider):
    name = "test"
    allowed_domains = ["hexun.com"]
    start_urls = (
        # 'http://www.hexun.com/',
        'http://yinglunjinye.blog.hexun.com/',
    )

    def parse(self, response):
        item = Demo1Item()
        # patter_comment = r"comment\d*?','(\d*?)'"
        # response.xpath('//div[@class="news"]/h1/a/@href').extract()
        item["name"] = response.xpath('//span[@class="ArticleTitleText"]/a/text()').extract()
        item["url"] = response.xpath('//span[@class="ArticleTitleText"]/a/@href').extract()
        # The hit and comment counts are filled in by a separate counter script; grab its URL first
        url = HeadersHelper('http://yinglunjinye.blog.hexun.com',
                            pattern='<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)"></script>').handle_info()[0]
        item["hits"] = HeadersHelper(url, pattern=r"click\d*?','(\d*?)'").handle_info()
        item["comment"] = HeadersHelper(url, pattern=r"comment\d*?','(\d*?)'").handle_info()
        # Extract the pagination numbers from the blog's own page links to find the page count
        data = HeadersHelper("http://yinglunjinye.blog.hexun.com/",
                             pattern=r"http://yinglunjinye.blog.hexun.com/p(\d*?)/").handle_info()
        if len(data) >= 2:
            # data[-2] is a str
            page_count = int(data[-2])
        else:
            page_count = 1
        yield item
        for i in range(2, page_count + 1):
            url = "http://yinglunjinye.blog.hexun.com/p" + str(i) + "/default.html"
            yield Request(url, callback=self.parse)
The browser-mimicking helper class HeadersHelper.py:
import urllib.request
import http.cookiejar
import re


class HeadersHelper:
    def __init__(self, url, path=None, pattern=None):
        self.url = url  # urllib.request.quote(url, safe='/:?=', encoding='utf-8')
        self.path = path
        self.pattern = pattern

    # Set request headers so the request closely mimics a real browser
    def set_Headers(self):
        # Note "Accept-Encoding": "gb2312, utf-8" -- avoids compressed responses that would come back garbled
        headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                   "Accept-Encoding": "gb2312, utf-8",
                   "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                   "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                   "Connection": "keep-alive", "Host": "baidu.com"
                   }
        cjar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
        headall = []
        for key, value in headers.items():
            item = (key, value)
            headall.append(item)
        opener.addheaders = headall
        urllib.request.install_opener(opener)

    # Fetch the page and return its content as a string
    def feedbak_info(self):
        self.set_Headers()
        # Some pages decode with utf-8, others with gbk
        # (http://yinglunjinye.blog.hexun.com needs gbk), hence the commented-out fallback below
        '''
        try:
            info = urllib.request.urlopen(self.url).read().decode('utf-8')
        except:
            info = urllib.request.urlopen(self.url).read().decode('gbk')
        '''
        info = urllib.request.urlopen(self.url).read()
        return str(info)

    # Save the page to a file
    def save_InFile(self):
        self.set_Headers()
        info = urllib.request.urlopen(self.url).read()
        file = open(self.path, 'wb')
        file.write(info)
        file.close()

    # Apply the regular expression to the page and return all matches
    def handle_info(self):
        info = self.feedbak_info()
        return re.compile(pattern=self.pattern, flags=re.S).findall(info)
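Because HeadersHelper is plain urllib code, it can be tried outside Scrapy. The check below reuses the same URL and pattern as test.py and, assuming the blog page still embeds the hexun counter script, should print that script's URL; run it from the spiders/ directory so the import resolves:

from HeadersHelper import HeadersHelper

helper = HeadersHelper(
    'http://yinglunjinye.blog.hexun.com',
    pattern='<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)"></script>')
print(helper.handle_info())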
6. Test the spider
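Run the spider from the directory that contains scrapy.cfg:

scrapy crawl test

If the middleware and pipeline are wired up correctly, the console prints the randomly chosen User-Agent for each request and an "OK" for every row inserted into the database; the result can then be inspected in MySQL, for example with select * from info;. Keep in mind that the sample proxies, the target blog and the page structure of hexun.com are outside our control, so the XPath expressions and regular expressions may need adjusting if the site has changed.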