Adding a random User-Agent in Scrapy:
1. pip install fake-useragent (or scrapy-fake-useragent, which builds on it); the custom middleware below uses the UserAgent class from fake_useragent
2. In settings.py add:
DOWNLOADER_MIDDLEWARES = {
    'lagoujob.middlewares.RandomUserAgent': 1,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
3. In middlewares.py add:
from fake_useragent import UserAgent

class RandomUserAgent(object):
    def process_request(self, request, spider):
        # Attach a random User-Agent to every outgoing request
        ua = UserAgent()
        request.headers.setdefault("User-Agent", ua.random)
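Creating UserAgent() inside process_request rebuilds the user-agent data for every request; here is a minimal sketch of the same middleware with the object built once in the constructor (an optional restructuring, not required by the steps above):

from fake_useragent import UserAgent

class RandomUserAgent(object):
    def __init__(self):
        # Build the user-agent database once, not on every request
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # Overwrite any existing header so rotation always applies
        request.headers["User-Agent"] = self.ua.random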
Adding an IP proxy pool in Scrapy:
Some free (China-based) proxy list sites for reference (a quick liveness check is sketched after the list):
http://www.xicidaili.com/wt
https://www.kuaidaili.com/free/
http://www.youdaili.net/Daili/guonei/
http://ip.zdaye.com/FreeIPlist.html
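Free proxies from these lists go stale quickly, so it helps to test each address before putting it into the pool. A minimal sketch using the requests library; the httpbin URL and the is_alive helper are only illustrative choices:

import requests

def is_alive(proxy, timeout=5):
    # True if the proxy can fetch a simple page within the timeout
    try:
        resp = requests.get("http://httpbin.org/ip",
                            proxies={"http": "http://" + proxy},
                            timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False

candidates = ["123.55.1.75:30325", "220.184.213.12:6666"]
IPPOOL = [{"ipaddr": p} for p in candidates if is_alive(p)]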
Configuration:
Add a proxy class in middlewares.py:
import random
from proxy1.settings import IPPOOL

class ProxychiMiddleware(object):
    # Called for every request before it is sent
    def process_request(self, request, spider):
        # For a private (authenticated) proxy:
        # request.meta['proxy'] = 'https://user:password@114.212.12.4:3128'
        # Pick a random proxy from the pool
        this_ip = random.choice(IPPOOL)['ipaddr']
        request.meta['proxy'] = 'http://' + this_ip
        return None
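Importing IPPOOL straight from proxy1.settings hard-codes the project name; a Scrapy downloader middleware can instead read it through from_crawler. A minimal sketch of that variant (same pool format, otherwise just a restructuring):

import random

class ProxychiMiddleware(object):
    def __init__(self, ippool):
        self.ippool = ippool

    @classmethod
    def from_crawler(cls, crawler):
        # Read the IPPOOL list defined in settings.py from the crawler's settings
        return cls(crawler.settings.get('IPPOOL', []))

    def process_request(self, request, spider):
        # Same random choice as above, just without the hard-coded import
        this_ip = random.choice(self.ippool)['ipaddr']
        request.meta['proxy'] = 'http://' + this_ip

It is enabled through the same DOWNLOADER_MIDDLEWARES entry shown below.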
Enable the proxy class from middlewares.py in settings.py:
DOWNLOADER_MIDDLEWARES = {
    # The dotted path must match your project name and the class defined in middlewares.py
    'movie.middlewares.ProxychiMiddleware': 543,
}
# Define the proxy pool
IPPOOL = [
    {"ipaddr": "123.55.1.75:30325"},
    {"ipaddr": "220.184.213.12:6666"},
    {"ipaddr": "171.38.85.82:8123"},
    {"ipaddr": "111.121.193.214:3128"},
    {"ipaddr": "58.48.193.180:3128"},
    {"ipaddr": "171.37.29.26:9797"},
    {"ipaddr": "119.188.162.165:8081"},
]
Overriding start_requests:
import scrapy
import random

# Set up a proxy pool; each entry must be a full "http://ip:port" string
proxy_pool = ['http://111.155.116.215:8123']

class ProxydemoSpider(scrapy.Spider):
    name = 'proxydemo'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/']

    def start_requests(self):
        for url in self.start_urls:
            proxy_addr = random.choice(proxy_pool)  # pick a random proxy
            # Attach the proxy via the request's meta parameter
            yield scrapy.Request(url=url, callback=self.parse, meta={'proxy': proxy_addr})

    def parse(self, response):
        print('fetched through proxy:', response.meta.get('proxy'))
Encryption (authenticated proxies):
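For the private (username/password) proxy case noted in the middleware comment above, the credentials can be sent base64-encoded in a Proxy-Authorization header. A minimal sketch using w3lib's basic_auth_header; the address and credentials are placeholders:

from w3lib.http import basic_auth_header

class PrivateProxyMiddleware(object):
    def process_request(self, request, spider):
        # Point the request at the paid proxy endpoint (placeholder address)
        request.meta['proxy'] = 'http://114.212.12.4:3128'
        # Credentials are base64-encoded into the Proxy-Authorization header
        request.headers['Proxy-Authorization'] = basic_auth_header('user', 'password')

Register it in DOWNLOADER_MIDDLEWARES the same way as ProxychiMiddleware above.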
Scrapy multi-level (nested page) crawling:
# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem

class TencentSpider(scrapy.Spider):
    # Spider name
    name = 'tencent'
    # Domains the spider is allowed to crawl
    allowed_domains = ['www.xxx.com']
    # Base URL, used to build absolute links
    base_url = 'https://www.xxx.com/'
    # Entry URL for the crawl
    start_urls = ['https://www.xxx.com/position.php']
    # Page counter, starts at 1
    count = 1
    # Number of pages to crawl; 1 means only the first page
    page_end = 1

    def parse(self, response):
        nodeList = response.xpath("//table[@class='tablelist']/tr[@class='odd'] | //table[@class='tablelist']/tr[@class='even']")
        for node in nodeList:
            item = TencentItem()
            item['title'] = node.xpath("./td[1]/a/text()").extract()[0]
            if len(node.xpath("./td[2]/text()")):
                item['position'] = node.xpath("./td[2]/text()").extract()[0]
            else:
                item['position'] = ''
            item['num'] = node.xpath("./td[3]/text()").extract()[0]
            item['address'] = node.xpath("./td[4]/text()").extract()[0]
            item['time'] = node.xpath("./td[5]/text()").extract()[0]
            item['url'] = self.base_url + node.xpath("./td[1]/a/@href").extract()[0]
            # Follow the detail (inner) page for this row
            yield scrapy.Request(item['url'], meta={'item': item}, callback=self.detail_parse)
            # When a deeper page is followed, do not yield the item here
            # yield item
        # Pagination: follow the "next page" link
        nextPage = response.xpath("//a[@id='next']/@href").extract()[0]
        # Stop at the configured page count or on the last page
        if self.count < self.page_end and nextPage != 'javascript:;':
            if nextPage is not None:
                # Increment the page counter
                self.count = self.count + 1
                # Request the next listing page
                yield scrapy.Request(self.base_url + nextPage, callback=self.parse)
        else:
            # End of crawl
            return

    def detail_parse(self, response):
        # Item carried over from the listing page
        item = response.meta['item']
        # First-level detail page fields
        item['zhize'] = response.xpath("//*[@id='position_detail']/div/table/tr[3]/td/ul[1]").xpath('string(.)').extract()[0]
        item['yaoqiu'] = response.xpath("//*[@id='position_detail']/div/table/tr[4]/td/ul[1]").xpath('string(.)').extract()[0]
        # Follow a second-level detail page
        yield scrapy.Request(item['url'] + "&123", meta={'item': item}, callback=self.detail_parse2)
        # When a deeper page is followed, do not return the item here
        # return item

    def detail_parse2(self, response):
        # Item carried over from the previous level
        item = response.meta['item']
        # Second-level detail page fields
        item['test'] = "111111111111111111"
        # Finally hand the completed item back to the engine
        return item
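The spider imports TencentItem from Tencent.items; here is a minimal items.py sketch with one Field per key the spider assigns (the field names are inferred from the spider code):

# Tencent/items.py
import scrapy

class TencentItem(scrapy.Item):
    title = scrapy.Field()      # job title
    position = scrapy.Field()   # job category
    num = scrapy.Field()        # number of openings
    address = scrapy.Field()    # work location
    time = scrapy.Field()       # publish date
    url = scrapy.Field()        # detail page URL
    zhize = scrapy.Field()      # duties (first-level detail page)
    yaoqiu = scrapy.Field()     # requirements (first-level detail page)
    test = scrapy.Field()       # placeholder filled on the second-level page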