Setting a random User-Agent request header
# Install with: pip install fake-useragent
# middleware.py
from fake_useragent import UserAgent
class RandomUserAgentMiddlware(object):
    """Downloader middleware that attaches a random User-Agent to each request.

    NOTE: the class name keeps its original (misspelled) form because
    settings.py references it by this exact dotted path.
    """

    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        # fake_useragent provides a pool of real-world UA strings;
        # .random picks a fresh one per request.
        self.ua = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        # Factory hook Scrapy calls with the running crawler.
        return cls(crawler)

    def process_request(self, request, spider):
        # setdefault(): only fill in the UA when the spider did not set one
        # explicitly, so hand-crafted headers are never overwritten.
        request.headers.setdefault("User-Agent", self.ua.random)
# ...
# settings.py — register RandomUserAgentMiddlware in the downloader middleware chain.
DOWNLOADER_MIDDLEWARES = {
    "yourProjectName.middlewares.RandomUserAgentMiddlware": 312,
}
设置代理
# Abuyun tunnel proxy configuration (middleware.py).
import base64

# Proxy server endpoint.
proxyServer = "http://http-dyn.abuyun.com:9020"
# Tunnel authentication credentials.
proxyUser = "your proxyUser"
proxyPass = "your proxyPass"
# HTTP Basic authentication header value: "Basic " + base64("user:pass").
_credentials = (proxyUser + ":" + proxyPass).encode("ascii")
proxyAuth = "Basic " + base64.urlsafe_b64encode(_credentials).decode("utf8")
class ProxyMiddleware(object):
    """Downloader middleware that routes every request through the tunnel proxy."""

    def process_request(self, request, spider):
        # Point the request at the proxy endpoint ...
        request.meta["proxy"] = proxyServer
        # ... and authenticate against the tunnel via HTTP Basic auth.
        request.headers["Proxy-Authorization"] = proxyAuth
# ...
# settings.py — enable ProxyMiddleware (low number 100 = runs early in the chain).
DOWNLOADER_MIDDLEWARES = {
    "yourProjectName.middlewares.ProxyMiddleware": 100,
}
Image downloading
# pipelines.py
import re

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
class ImagePipeline(ImagesPipeline):
    """Image pipeline that downloads every URL in item['imgurl'] and stores
    the files as <foldername>/<imgname>.<ext> instead of the default hash name.
    """

    # Extra headers sent with each image request; fill in if the site checks
    # them (anti-hotlinking etc.). Must be a dict — the original used a set
    # literal ({"""..."""}), which made headers['Referer'] = ... raise TypeError.
    headers = {}

    def get_media_requests(self, item, info):
        # One download Request per image URL. If 'imgurl' were a single URL
        # rather than a collection, yield once without the loop.
        for image_url in item['imgurl']:
            # Copy so the shared class-level dict is never mutated per request.
            headers = dict(self.headers)
            headers['Referer'] = item['from_url']
            # meta carries spider-provided values through to file_path() below.
            yield Request(image_url, headers=headers,
                          meta={'foldername': item['foldername'], 'imgname': item['imgname']})

    def file_path(self, request, response=None, info=None, item=None):
        """Build the stored file path; without this override the name is a hash.

        `item=None` is accepted for compatibility with newer Scrapy versions,
        which pass the item to file_path().
        """
        # File extension taken from the URL's last dot-separated segment.
        pic_format = request.url.split(".")[-1]
        imgname = request.meta['imgname']
        # Album/folder name forwarded through meta by get_media_requests().
        foldername = request.meta['foldername']
        # Strip characters Windows forbids in file/folder names.
        foldername = re.sub(r'[?\*|“<>:/]', '', foldername)
        return fr'{foldername}/{imgname}.{pic_format}'
# ...
# settings.py — enable ImagePipeline. Item pipelines are registered in
# ITEM_PIPELINES, not DOWNLOADER_MIDDLEWARES (registered as a middleware the
# pipeline never runs). IMAGES_STORE must also point at the download directory.
ITEM_PIPELINES = {
    'yourProjectName.pipelines.ImagePipeline': 200,
}
Asynchronous writes to MySQL
# pipelines.py
from yourProjectName.settings import MYSQL_DBNAME, MYSQL_HOST,MYSQL_PASSWORD,MYSQL_PORT,MYSQL_USER
class MysqlTwistedPipeline(object):
    """Item pipeline that inserts items into MySQL asynchronously through
    Twisted's adbapi connection pool, so inserts never block the reactor.
    """

    def __init__(self, dbpool):
        # adbapi.ConnectionPool shared by all inserts.
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, setting):
        """Factory Scrapy calls with the project settings object."""
        # Local imports keep this snippet self-contained; in a real project
        # these belong at the top of pipelines.py.
        import pymysql
        from twisted.enterprise import adbapi

        dbparms = dict(
            host=MYSQL_HOST,
            db=MYSQL_DBNAME,
            user=MYSQL_USER,
            passwd=MYSQL_PASSWORD,
            port=MYSQL_PORT,
            charset='utf8',
            # DictCursor yields rows as dicts instead of tuples.
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=False,
        )
        dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Run the insert on a pool thread instead of the reactor thread.
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Bug fix: failures must be attached with addErrback — the original
        # used addCallback, which fires on *success*, so DB errors were lost.
        query.addErrback(self.handle_error)
        # Pipelines must return the item so downstream pipelines receive it.
        return item

    def handle_error(self, failure):
        # Log asynchronous insert failures.
        print("MysqlTwistedPipeline error is :", failure)

    def do_insert(self, cursor, item):
        """Runs in a pool thread with a live cursor; fill in the real SQL."""
        insert_sql = """insert_sql"""
        # In some cases fields need escaping with pymysql.escape_string().
        cursor.execute(insert_sql, ())
# ...
# settings.py — enable MysqlTwistedPipeline (the original comment named
# ImagePipeline by mistake). Item pipelines are registered in ITEM_PIPELINES,
# not DOWNLOADER_MIDDLEWARES.
ITEM_PIPELINES = {
    'yourProjectName.pipelines.MysqlTwistedPipeline': 200,
}