一. Scrapy requests
# GET request, passing data to the callback through meta
yield scrapy.Request(url=url, dont_filter=True, callback=self.page,
                     meta={'item': copy.deepcopy(item)})

# POST request with form data, custom headers and an error callback
yield scrapy.FormRequest(url=self.url, headers=self.unicornHeader, method='POST',
                         formdata=self.FormData, meta=self.customerData,
                         callback=self.after_post, errback=self.error_handle,
                         dont_filter=True)

# In the callback, read the item back out of response.meta
item = response.meta['item']
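For context, a minimal sketch of how these pieces fit together in one spider; the start URL, the link XPath and the item fields are placeholders, not from the original notes:

import copy
import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://example.com/list']   # placeholder listing page

    def parse(self, response):
        for href in response.xpath('//a/@href').extract():
            item = {'list_url': response.url}
            # deepcopy so concurrent requests do not share the same dict
            yield scrapy.Request(url=response.urljoin(href),
                                 callback=self.page,
                                 meta={'item': copy.deepcopy(item)},
                                 dont_filter=True)

    def page(self, response):
        item = response.meta['item']
        item['detail_url'] = response.url
        yield item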
二. Extracting data with XPath
response.xpath('//*[@id="__EVENTVALIDATION"]/@value').extract_first().strip()
response.xpath('//div[@class="mt20 articl-know"][1]/p[4]/span[2]/text()').extract_first().strip()
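One caveat: extract_first() returns None when the XPath matches nothing, so chaining .strip() raises AttributeError. A hedged variant using the default argument:

# Supplying a default keeps .strip() safe when the element is missing
value = response.xpath('//*[@id="__EVENTVALIDATION"]/@value').extract_first(default='').strip()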
三. Scrapy checks (item class / field presence)
# Distinguish item classes in a shared pipeline
if isinstance(item, ArticleViewsCountItem):
    ...

# Check whether a field is present on the item
if 'food_id' in item.keys():
    ...
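A short sketch of using these checks in one shared pipeline; ArticleViewsCountItem is taken from the snippet above, FoodItem and the pipeline name are hypothetical:

import scrapy

class ArticleViewsCountItem(scrapy.Item):
    views = scrapy.Field()

class FoodItem(scrapy.Item):
    food_id = scrapy.Field()

class RoutingPipeline(object):
    """Route items from one spider to different handling based on their class or fields."""

    def process_item(self, item, spider):
        if isinstance(item, ArticleViewsCountItem):
            pass  # handle view-count items here
        elif 'food_id' in item.keys():
            pass  # handle food items here
        return item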
四. Logging in with Scrapy
start_urls = ['http://renren.com/']

def parse(self, response):
    data = {
        'email': '111',
        'password': 'sssws'
    }
    print('login.....')
    # Fill in and submit the login form found on the page
    yield scrapy.FormRequest.from_response(response, formdata=data, callback=self.next)
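A possible follow-up callback; the success marker and the profile URL are placeholders, since they depend on the target site. Requests yielded afterwards reuse the session cookies Scrapy keeps automatically:

def next(self, response):
    # Placeholder check: look for some string that only appears after a successful login
    if 'Log out' in response.text:
        self.logger.info('login succeeded')
    yield scrapy.Request('http://renren.com/profile', callback=self.parse_profile)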
五. Running a Scrapy spider
scrapy crawl spiderName
scrapy crawl spiderName -s LOG_FILE=spider.log  # write the log to a file
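The same spider can also be started from a Python script; a minimal sketch using Scrapy's CrawlerProcess, where 'spiderName' stands for the spider's name attribute:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('spiderName')
process.start()  # blocks until the crawl finishes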
六. Setting up proxies
1. middlewares.py
# Proxy for a single request, set in spider.py via meta:
meta={'item': copy.deepcopy(item), 'proxy': "http://10.133.3.26:1080"}

# Middleware that fetches a proxy from a local proxy-pool API:
import requests

class MyproxiesSpiderMiddleware(object):

    def process_request(self, request, spider):
        proxies = requests.get('http://127.0.0.1:5000/get').content.decode('utf-8')
        print(proxies)
        request.meta["proxy"] = "http://{}".format(proxies)
        # request.meta["proxy"] = "http://36.249.49.43:9999"

# Middleware that picks a random proxy from a Redis list:
import logging
import random

import redis

from steam_users.settings import REDIS_HOST, REDIS_POST, REDIS_DATABASE, REDIS_PASSWORD

logger = logging.getLogger(__name__)

class ProxyDownloadMiddleware(object):

    def __init__(self):
        self.conn = redis.Redis(host=REDIS_HOST, port=REDIS_POST, password=REDIS_PASSWORD, db=REDIS_DATABASE)

    def queue_len(self):
        # Length of the proxy list
        return self.conn.llen("proxies")

    def get_redis(self):
        # Pick a random proxy IP from the Redis list
        num = random.randint(1, self.queue_len()) - 1
        return self.conn.lindex('proxies', num).decode('utf-8')

    def process_request(self, request, spider):
        if request.url.startswith("http://"):
            request.meta['proxy'] = "http://{proxy_ip}".format(proxy_ip=self.get_redis())
        elif request.url.startswith("https://") and not request.url.startswith('https://steamcommunity'):
            request.meta['proxy'] = "https://{proxy_ip}".format(proxy_ip=self.get_redis())
        print("using proxy: {}".format(request.meta.get('proxy')))
        # # For a private or dedicated proxy, base64-encode "username:password" and set it on
        # # request.headers["Proxy-Authorization"]; an open proxy only needs the proxy IP above.
        # user_password = "{username}:{password}".format(username='username', password='password')
        # b64_user_password = base64.b64encode(user_password.encode("utf-8"))
        # request.headers["Proxy-Authorization"] = "Basic " + b64_user_password.decode("utf-8")
        return None

# Middleware that uses a single fixed proxy from settings.py:
from xiaoheihe.settings import SCRAPY_PROXIES

class ProxyDownloadMiddleware(object):

    def process_request(self, request, spider):
        if request.url.startswith("http://"):
            request.meta['proxy'] = SCRAPY_PROXIES
        elif request.url.startswith("https://") and not request.url.startswith('https://steamcommunity'):
            request.meta['proxy'] = SCRAPY_PROXIES
        print("using proxy: {}".format(request.meta['proxy']))
        return None
2. Enable the proxy middleware in settings.py
DOWNLOADER_MIDDLEWARES = {
    'jxy.middlewares.MyproxiesSpiderMiddleware': 543,
}
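If the proxy requires authentication, the commented Proxy-Authorization idea above can be turned into its own middleware. A sketch with placeholder credentials and proxy address; it still has to be registered in DOWNLOADER_MIDDLEWARES like the one above:

import base64

class AuthProxyDownloadMiddleware(object):

    PROXY = "http://10.133.3.26:1080"      # placeholder proxy address
    USER, PASSWORD = "username", "password"  # placeholder credentials

    def process_request(self, request, spider):
        request.meta['proxy'] = self.PROXY
        # Base64-encode "username:password" for the Proxy-Authorization header
        user_password = "{}:{}".format(self.USER, self.PASSWORD)
        b64 = base64.b64encode(user_password.encode("utf-8"))
        request.headers["Proxy-Authorization"] = "Basic " + b64.decode("utf-8")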
七. Writing data to the database
import pymysql

class MogujiePipeline(object):

    def __init__(self):
        # Open the database connection
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda',
                                  user='root', password='root', charset='utf8')
        # self.db = pymysql.connect(host='115.238.111.198', port=3306, database='spider_yu',
        #                           user='spider', password='Kangce@0608', charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Skip rows whose url is already in the table
        num = self.cursor.execute('select id from jiankangshuju_food where url="{}"'.format(item["url"]))
        if not num:
            list_key = []
            list_values = []
            for key, value in item.items():
                list_key.append(key)
                list_values.append("'" + str(value).replace("'", "‘") + "'")
            # Build the insert statement
            insert_sql = "insert into jiankangshuju_food({}) values({})".format(', '.join(list_key),
                                                                                ', '.join(list_values))
            try:
                self.cursor.execute(insert_sql)
                self.db.commit()
            except:
                print('insert_sql:', insert_sql)

        # Query data
        self.cursor.execute("select * from catalogue")
        data = self.cursor.fetchone()   # first row
        data = self.cursor.fetchall()   # remaining rows

        # Update data (column names, values and id are placeholders)
        # self.cursor.execute("update catalogue set col1='{}', col2='{}' where id={}".format(...))
        # self.db.commit()

        # Delete data (id is a placeholder)
        # self.cursor.execute("delete from catalogue where id={}".format(...))
        # self.db.commit()
        return item

    def close_spider(self, spider):
        # Close the database connection
        self.cursor.close()
        self.db.close()
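The string-built insert above has to escape quotes by hand; a hedged sketch of the same insert using pymysql parameter binding, which avoids that (table name and item fields mirror the pipeline above):

def insert_item(cursor, db, item):
    columns = ', '.join(item.keys())
    placeholders = ', '.join(['%s'] * len(item))
    sql = "insert into jiankangshuju_food({}) values({})".format(columns, placeholders)
    # pymysql escapes the bound values itself, so quotes in the data are safe
    cursor.execute(sql, list(item.values()))
    db.commit()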
八. def start_requests(self)
# GET request
def start_requests(self):
    db = pymysql.connect(host='localhost', port=3306, database='game', user='root',
                         password='root', charset='utf8', autocommit=True)
    cursor = db.cursor()
    cursor.execute('select id, appid, last_modified from steam_appid where id =1085660')
    for appid in cursor.fetchall():
        item = {}
        item['appid'] = appid[1]
        item['last_modified'] = appid[2]
        yield scrapy.Request(url='https://store.steampowered.com/app/{}/'.format(appid[1]),
                             meta={'item': copy.deepcopy(item)})

        # POST request with a JSON payload
        yield scrapy.Request(url='https://www.wegame.com.cn/api/rail/web/data_filter/game_info/by_game_id',
                             meta={'item': copy.deepcopy(item)},
                             headers={'Content-Type': 'application/json;charset=UTF-8',
                                      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'},
                             body=json.dumps({"game_ids": ["{}".format(str(appid[1]))],
                                              "filters": [],
                                              "stamp": {"agent_client_language": "zh_CN"},
                                              "response_format": 0}, ensure_ascii=False),
                             dont_filter=True,
                             method='POST')
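Neither request above sets callback=, so responses go to parse() by default. A hedged sketch of a callback for the JSON POST response; the method name and how the payload is stored are illustrative, since the real response schema may differ:

import json

def parse_game_info(self, response):
    item = response.meta['item']
    data = json.loads(response.text)
    item['raw_game_info'] = data   # keep the whole payload until the schema is known
    yield item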
九. Database configuration
# ---------- Server MySQL configuration ----------
# MYSQL_HOST = '192.168.107.229'
# MYSQL_POST = 3306
# MYSQL_DATABASE = 'spider_app'
# MYSQL_PASSWORD = '123456'
# MYSQL_USER = 'root'

# ---------- Local MySQL configuration ----------
MYSQL_HOST = '10.133.3.26'
MYSQL_POST = 3306
MYSQL_DATABASE = 'spider_app'
MYSQL_PASSWORD = 'root'
MYSQL_USER = 'root'

# Use the settings when opening the connection:
from steam_users.settings import MYSQL_HOST, MYSQL_POST, MYSQL_DATABASE, MYSQL_PASSWORD, MYSQL_USER

pymysql.connect(host=MYSQL_HOST, port=MYSQL_POST, database=MYSQL_DATABASE, user=MYSQL_USER,
                password=MYSQL_PASSWORD, charset='utf8', autocommit=True)
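An alternative sketch: read the same keys through crawler.settings in a pipeline's from_crawler() instead of importing the settings module directly, so the values can also be overridden per run with -s on the command line:

import pymysql

class MysqlPipeline(object):

    def __init__(self, host, port, database, user, password):
        self.db = pymysql.connect(host=host, port=port, database=database,
                                  user=user, password=password,
                                  charset='utf8', autocommit=True)

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the MySQL settings defined above out of the project settings
        s = crawler.settings
        return cls(host=s.get('MYSQL_HOST'),
                   port=s.getint('MYSQL_POST'),
                   database=s.get('MYSQL_DATABASE'),
                   user=s.get('MYSQL_USER'),
                   password=s.get('MYSQL_PASSWORD'))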