• Scrapy: commonly used code snippets


    一. Scrapy requests

    # GET request; the item travels to the callback through meta (deep-copied so parallel requests don't share it)
    yield scrapy.Request(url=url, dont_filter=True, callback=self.page, meta={'item': copy.deepcopy(item)})

    # POST request with form data, custom headers, and an error callback
    yield scrapy.FormRequest(url=self.url, headers=self.unicornHeader, method='POST', formdata=self.FormData, meta=self.customerData, callback=self.after_post, errback=self.error_handle, dont_filter=True)

    # in the callback, read the item back out of response.meta
    item = response.meta['item']
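
    A minimal sketch of how these pieces fit together — handing an item from one callback to the next through meta (the spider name and URLs are placeholders):

    import copy
    import scrapy


    class DemoSpider(scrapy.Spider):
        name = 'demo'
        start_urls = ['http://example.com/list']

        def parse(self, response):
            for href in response.xpath('//a/@href').extract():
                item = {'list_url': href}
                # deepcopy so each request carries its own copy of the item
                yield scrapy.Request(url=response.urljoin(href), callback=self.page,
                                     meta={'item': copy.deepcopy(item)}, dont_filter=True)

        def page(self, response):
            item = response.meta['item']
            item['title'] = response.xpath('//title/text()').extract_first()
            yield item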

    二. Extracting data with XPath

    response.xpath('//*[@id="__EVENTVALIDATION"]/@value').extract_first().strip()
    response.xpath('//div[@class="mt20 articl-know"][1]/p[4]/span[2]/text()').extract_first().strip()
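
    Note that extract_first() returns None when the XPath matches nothing, so chaining .strip() onto it can raise AttributeError; supplying a default avoids that:

    # safer: never calls .strip() on None
    value = response.xpath('//*[@id="__EVENTVALIDATION"]/@value').extract_first(default='').strip()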

    三. Scrapy item checks

    if isinstance(item, ArticleViewsCountItem):   # branch on the item's class
    if 'food_id' in item.keys():                  # check whether a field was scraped ('food_id' in item also works)
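
    These checks typically live in a pipeline that receives several item types — a sketch, assuming ArticleViewsCountItem comes from a hypothetical items module:

    from myproject.items import ArticleViewsCountItem   # hypothetical module path


    class RoutingPipeline(object):
        def process_item(self, item, spider):
            if isinstance(item, ArticleViewsCountItem):
                # handle view-count items here
                pass
            elif 'food_id' in item:
                # handle food items here
                pass
            return item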

    四. Scrapy login

    start_urls = ['http://renren.com/']
    
    def parse(self, response):
        # dummy credentials; from_response reuses the page's login form, so hidden fields are filled in automatically
        data = {
            'email': '111',
            'password': 'sssws'
        }
        print('login.....')
        yield scrapy.FormRequest.from_response(response,
                                               formdata=data,
                                               callback=self.next,
                                               )
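
    A sketch of the post-login callback; the success check is a placeholder — adapt it to whatever marker the logged-in page actually shows:

    def next(self, response):
        # placeholder check: look for a marker that only appears after login
        if 'logout' in response.text.lower():
            self.logger.info('login succeeded')
        else:
            self.logger.warning('login may have failed')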
    

    五. Running Scrapy

    scrapy crawl spiderName
    scrapy crawl spiderName -s LOG_FILE=spider.log   # write the log to the given file
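
    Spiders can also be launched from a plain Python script instead of the command line — a minimal sketch using CrawlerProcess ('spiderName' is a placeholder):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('spiderName')   # same name as used with `scrapy crawl`
    process.start()               # blocks until the crawl finishes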
    

    六. Setting a proxy

    1. middlewares.py

    meta={'item': copy.deepcopy(item), 'proxy': "http://10.133.3.26:1080"}   # set a proxy on a single request (done in spider.py)
    
    
    import requests


    class MyproxiesSpiderMiddleware(object):

        def process_request(self, request, spider):
            # fetch one proxy (an "ip:port" string) from a local proxy-pool service
            proxies = requests.get('http://127.0.0.1:5000/get').content.decode('utf-8')
            print(proxies)
            request.meta["proxy"] = "http://{}".format(proxies)
            # request.meta["proxy"] = "http://36.249.49.43:9999"   # or hard-code a proxy
    
    
    # Alternative: pick a random proxy from a list kept in redis
    import logging
    import random
    import redis
    from steam_users.settings import REDIS_HOST, REDIS_PORT, REDIS_DATABASE, REDIS_PASSWORD
    logger = logging.getLogger(__name__)


    class ProxyDownloadMiddleware(object):
        def __init__(self):
            self.conn = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=REDIS_DATABASE)

        def queue_len(self):
            # length of the proxy list stored in redis
            return self.conn.llen("proxies")

        def get_redis(self):
            # pick one proxy ip at random from the redis list
            num = random.randint(1, self.queue_len()) - 1
            return self.conn.lindex('proxies', num).decode('utf-8')

        def process_request(self, request, spider):
            if request.url.startswith("http://"):
                request.meta['proxy'] = "http://{proxy_ip}".format(proxy_ip=self.get_redis())
            elif request.url.startswith("https://") and not request.url.startswith('https://steamcommunity'):
                request.meta['proxy'] = "https://{proxy_ip}".format(proxy_ip=self.get_redis())
            logger.info("using proxy: {}".format(request.meta.get('proxy')))
            # For a private/dedicated proxy, base64-encode "username:password" and put it in
            # request.headers["Proxy-Authorization"] (needs import base64); open proxies skip this step.
            # user_password = "{username}:{password}".format(username='username', password='password')
            # b64_user_password = base64.b64encode(user_password.encode("utf-8"))
            # request.headers["Proxy-Authorization"] = "Basic " + b64_user_password.decode("utf-8")
            return None
    
    
    
    # Alternative: a single fixed proxy taken from settings
    from xiaoheihe.settings import SCRAPY_PROXIES


    class ProxyDownloadMiddleware(object):

        def process_request(self, request, spider):
            if request.url.startswith("http://"):
                request.meta['proxy'] = SCRAPY_PROXIES
            elif request.url.startswith("https://") and not request.url.startswith('https://steamcommunity'):
                request.meta['proxy'] = SCRAPY_PROXIES
            spider.logger.info("using proxy: {}".format(request.meta.get('proxy')))
            return None
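
    SCRAPY_PROXIES here is assumed to be a single fixed proxy URL defined in settings.py, for example:

    SCRAPY_PROXIES = 'http://10.133.3.26:1080'   # example value; substitute your own proxy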

    2. Enable the proxy middleware in settings.py

    DOWNLOADER_MIDDLEWARES = {
       'jxy.middlewares.MyproxiesSpiderMiddleware': 543,   # adjust the module path to your own project
    }

    七. Saving data to MySQL

    import pymysql


    class MogujiePipeline(object):
        def __init__(self):
            # open the database connection
            self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                      charset='utf8')
            # self.db = pymysql.connect(host='115.238.111.198', port=3306, database='spider_yu', user='spider',
            #                           password='Kangce@0608',
            #                           charset='utf8')
            self.cursor = self.db.cursor()

        def process_item(self, item, spider):
            # skip the insert when this url is already in the table
            num = self.cursor.execute('select id from jiankangshuju_food where url="{}"'.format(item["url"]))
            if not num:
                list_key = []
                list_values = []
                for key, value in item.items():
                    list_key.append(key)
                    # crude quoting: swap single quotes for a full-width one; parameterized queries are safer
                    list_values.append("'" + str(value).replace("'", "‘") + "'")
                # build the insert statement
                insert_sql = "insert into jiankangshuju_food({}) values({})".format(', '.join(list_key),
                                                                                    ', '.join(list_values))
                try:
                    self.cursor.execute(insert_sql)
                    self.db.commit()
                except Exception as e:
                    print('insert failed:', e, 'insert_sql:', insert_sql)

                # query examples
                self.cursor.execute("select * from catalogue")
                first_row = self.cursor.fetchone()   # one row
                other_rows = self.cursor.fetchall()  # the remaining rows

                # update / delete templates (column names and values are placeholders):
                # self.cursor.execute("update catalogue set col1=%s, col2=%s where id=%s", (val1, val2, row_id))
                # self.db.commit()
                # self.cursor.execute("delete from catalogue where id=%s", (row_id,))
                # self.db.commit()

            return item

        def close_spider(self, spider):
            # close the database connection
            self.cursor.close()
            self.db.close()
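
    The pipeline only runs once it is enabled in settings.py; the module path below is an assumption based on the class name:

    ITEM_PIPELINES = {
       'mogujie.pipelines.MogujiePipeline': 300,   # hypothetical path; lower numbers run earlier
    }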

    八. def start_requests(self)

    # GET requests: seed the spider from rows in MySQL (needs import copy, pymysql at the top of the spider)
    def start_requests(self):
        db = pymysql.connect(host='localhost', port=3306, database='game', user='root', password='root',
                             charset='utf8', autocommit=True)
        cursor = db.cursor()
        cursor.execute('select id, appid, last_modified from steam_appid where id =1085660')   # the where-clause pins this to a single appid
        for appid in cursor.fetchall():
            item = {}
            item['appid'] = appid[1]
            item['last_modified'] = appid[2]
            yield scrapy.Request(url='https://store.steampowered.com/app/{}/'.format(appid[1]),
                                 meta={'item': copy.deepcopy(item)})
    
    
    # POST with a JSON request payload (needs import json)
    yield scrapy.Request(url='https://www.wegame.com.cn/api/rail/web/data_filter/game_info/by_game_id',
                         meta={'item': copy.deepcopy(item)},
                         headers={'Content-Type': 'application/json;charset=UTF-8',
                                  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
                                  },
                         body=json.dumps({"game_ids": ["{}".format(str(appid[1]))],
                                          "filters": [],
                                          "stamp": {"agent_client_language": "zh_CN"},
                                          "response_format": 0}, ensure_ascii=False),
                         dont_filter=True,
                         method='POST')
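
    The wegame endpoint above answers with JSON, so the callback (parse, by default) would typically decode the body — a minimal sketch:

    def parse(self, response):
        item = response.meta['item']
        data = json.loads(response.text)   # decode the JSON payload
        # copy the wanted fields from `data` onto item here
        yield item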
    

    九. Database configuration

    # --------- server MySQL config ----------
    # MYSQL_HOST = '192.168.107.229'
    # MYSQL_PORT = 3306
    # MYSQL_DATABASE = 'spider_app'
    # MYSQL_PASSWORD = '123456'
    # MYSQL_USER = 'root'

    # --------- local MySQL config ----------
    MYSQL_HOST = '10.133.3.26'
    MYSQL_PORT = 3306
    MYSQL_DATABASE = 'spider_app'
    MYSQL_PASSWORD = 'root'
    MYSQL_USER = 'root'
    
    
    import pymysql
    from steam_users.settings import MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE, MYSQL_PASSWORD, MYSQL_USER

    db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_PORT, database=MYSQL_DATABASE, user=MYSQL_USER,
                         password=MYSQL_PASSWORD,
                         charset='utf8', autocommit=True)
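
    Inside Scrapy components it is also idiomatic to read these values through the crawler's settings object rather than importing the settings module directly — a sketch with an illustrative pipeline class:

    class MysqlPipeline(object):

        def __init__(self, host, port):
            self.host = host
            self.port = port

        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy calls this and hands in the crawler, which exposes settings.py
            return cls(host=crawler.settings.get('MYSQL_HOST'),
                       port=crawler.settings.getint('MYSQL_PORT'))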
    

• Original article: https://www.cnblogs.com/yoyo1216/p/10132096.html