Python Web Crawling Notes


    一 I. Crawlers

    1. Overview

    A web crawler automatically fetches pages from the web; search engines are the most prominent users of crawlers.

    2. Types of crawlers

    (1) General-purpose crawlers: the typical example is a search engine. It collects data indiscriminately, stores it, extracts keywords, builds an index, and provides a search interface for users.

    The usual crawl workflow:

    Initialize a batch of seed URLs and put them into the queue of URLs waiting to be crawled.

    Take URLs from that queue, resolve each domain to an IP via DNS, download the HTML page from that host, save it to the local server, and move the crawled URL into the already-crawled queue.

    Parse the downloaded pages, extract the URL links of interest, and go back to the second step until the crawl is finished.

    How does a search engine obtain the URL of a new website?

    The new site is submitted to the search engine by its owner.

    It is discovered through external links placed on pages of other websites.

    The search engine cooperates with DNS providers to obtain newly registered domains.

    (2) Focused crawlers

    A focused crawler is written for a specific domain of data: it collects only certain categories of data, i.e. it is topic-oriented.

    3. The robots protocol

    A site publishes a robots.txt file that tells crawler engines what may be crawled.

    To help search engines index its content more efficiently, the protocol also lets a site provide a sitemap file.

    Ironically, the paths that robots.txt forbids are often exactly the content we are interested in, so the file itself leaks those addresses.
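
    As a quick illustration, the standard library can fetch and evaluate a robots.txt file. A minimal sketch (the Douban URLs are only examples):

    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.set_url('https://www.douban.com/robots.txt')   # example site
    rp.read()                                         # download and parse robots.txt

    # may the generic user agent '*' fetch this path?
    print(rp.can_fetch('*', 'https://www.douban.com/subject_search'))
    print(rp.site_maps())   # sitemap URLs declared in robots.txt (Python 3.8+)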

    4. HTTP request and response handling

    A crawler fetches pages over the HTTP protocol, just as a browser does; but browsing is normally a human activity, so the real problem is making the program behave like a human visitor.

    The urllib package

    from urllib.request import urlopen


    response = urlopen('http://www.bing.com')   # send a GET request
    print(response.closed)

    with response:
        print(response.status)       # HTTP status code
        print(response._method)      # request method (a private attribute)
        print(response.read())       # response body as bytes
        print(response.closed)
        print(response.info())       # response headers
    print(response.closed)           # the with-block has closed the response

    That covers basic use of the urllib package; query strings are handled with urllib.parse, shown below.

    Setting the User-Agent:

    from urllib.request import urlopen,Request

    url = 'http://www.bing.com'
    ua = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    req = Request(url,headers={'User-agent':ua})
    response = urlopen(req,timeout=10)
    # print(req)
    print(response.closed)

    with response:
        print(response.status)
        print(response._method)
        # print(response.read())
        # print(response.closed)
        # # print(response.info)
        print(response.geturl())
    print(req.get_header('User-agent'))
    print(response.closed)

    You can copy a real User-Agent string from Chrome's developer tools (Network panel).

    5. urllib.parse

    from urllib import parse

    d = {
        'id':1,
        'name':'tom',
        'url':'http://www.magedu.com'
    }

    url = 'http://www.magedu.com'
    u = parse.urlencode(d)   # URL-encode the dict into a query string
    print(u)

    print(parse.unquote(u))  # decode it back

    6. Request methods (GET and POST)

    from urllib import parse
    import simplejson

    base_url = 'http://cn.bing.com/search'

    d = {
        'q':'马哥教育'
    }
    # d = {
    #     'id':1,
    #     'name':'tom',
    #     'url':'http://www.magedu.com'
    # }

    # url = 'http://www.magedu.com'
    u = parse.urlencode(d)   # URL-encode into a query string

    # url = '{}?{}'.format(base_url,u)
    # print(url)
    #
    # print(parse.unquote(url))  # decode

    from urllib.request import urlopen,Request

    url = 'http://httpbin.org/post'

    ua = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    data = parse.urlencode({'name':'张三,@=/&*','age':'6'})

    req = Request(url,headers={
        'User-agent':ua
    })

    # res = urlopen(req)

    with urlopen(req,data= data.encode()) as res:
        text = res.read()
        d = simplejson.loads(text)
        print(d)
        # with open('c:/assets/bing.html','wb+') as f:
            # f.write(res.read())
            # f.flush()


    7. Crawling Douban

    from urllib.request import Request,urlopen
    import simplejson
    from urllib import parse

    ua = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    jurl = 'https://movie.douban.com/j/search_subjects'

    d = {
        'type':'movie',
        'tag':'热门',
        'page_limit':10,
        'page_start':10
    }

    req = Request('{}?{}'.format(jurl,parse.urlencode(d)),headers={
        'User-agent':ua
    })

    with urlopen(req) as res:
        sub = simplejson.loads(res.read())
        print(len(sub))
        print(sub)

    8. Handling HTTPS and CA certificate problems

    Ignore certificate verification with the ssl module.

    from urllib.request import Request,urlopen

    from urllib import parse
    import ssl

    #request = Request('http://www.12306.cn/mormhweb')
    request = Request('http://www.baidu.com')
    request.add_header('User-agent','Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    )

    context = ssl._create_unverified_context()  # ignore invalid/untrusted certificates

    with urlopen(request,context=context) as res:
        print(res._method)
        print(res.read())

    9. urllib3

    pip install urllib3

    import urllib3


    url = 'http://movie.douban.com'

    ua =  'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    with urllib3.PoolManager() as http:   # connection pool manager
        response = http.request('GET',url,headers={'User-agent':ua})
        print(1,response)
        print(2,type(response))
        print(3,response.status,response.reason)
        print(4,response.headers)
        print(5,response.data)

    import urllib3
    from urllib.parse import urlencode
    from urllib3 import HTTPResponse

    url = 'http://movie.douban.com'

    ua =  'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    jurl = 'https://movie.douban.com/j/search_subjects'

    d = {
        'type':'movie',
        'tag':'热门',
        'page_limit':10,
        'page_start':10
    }

    # with urllib3.PoolManager() as http:   # connection pool manager
    #     response = http.request('GET',url,headers={'User-agent':ua})   # the HTTP method can be specified
    #     print(1,response)
    #     print(2,type(response))
    #     print(3,response.status,response.reason)
    #     print(4,response.headers)
    #     print(5,response.data)

    with urllib3.PoolManager() as http:
        response = http.request('GET','{}?{}'.format(jurl,urlencode(d)),headers={'User-agent':ua})
        print(response)
        print(response.status)
        print(response.data)

    10. The requests library

    requests is built on top of urllib3.

    pip install requests

    import urllib3
    from urllib.parse import urlencode
    from urllib3 import HTTPResponse
    import requests


    # url = 'http://movie.douban.com'

    ua =  'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    jurl = 'https://movie.douban.com/j/search_subjects'

    d = {
        'type':'movie',
        'tag':'热门',
        'page_limit':10,
        'page_start':10
    }
    url = '{}?{}'.format(jurl,urlencode(d))


    response = requests.request('GET',url,headers = {'User-agent':ua})

    with response:
        print(response.text)
        print(response.status_code)
        print(response.url)
        print(response.headers)
        print(response.request)

    Session-based usage: requests.Session.

    A Session automatically manages request headers, cookies and connection reuse across requests.

    import urllib3
    from urllib.parse import urlencode
    from urllib3 import HTTPResponse
    import requests


    # url = 'http://movie.douban.com'

    ua =  'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    jurl = 'https://movie.douban.com/j/search_subjects'

    d = {
        'type':'movie',
        'tag':'热门',
        'page_limit':10,
        'page_start':10
    }
    # url = '{}?{}'.format(jurl,urlencode(d))

    urls = ['https://www.baidu.com/s?wd=magedu','https://www.baidu.com/s?wd=magedu']

    session = requests.Session()
    with session:
        for url in urls:
            response = session.get(url,headers={'User-agent':ua})
            with response:
                print(1,response.text)
                print(2,response.status_code)
                print(3,response.url)
                print(4,response.headers)
                print(5,response.request.headers)
                print('--------')
                print(response.cookies)
                print('--------------')
                print(session.cookies)   # cookies accumulated by the session across requests

    11. Practical notes

    Some sites require the login-page cookie: when you log in you must send back the cookie you were originally given, and only after a successful login does the server return a new one; otherwise later operations fail. Sometimes only a few specific cookie values need to be carried.

    A common anti-crawling measure: the server checks whether your previous request was a visit to its own site.

    In the browser's Network panel, the Referer header shows which page the request came from.

    files: the content of uploaded files (multipart upload).

    Routers (HTTP basic auth) place the encoded username and password in the request headers.

    cert: client certificates.
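
    A hedged sketch of how these points map onto the requests API; the httpbin URLs and file name are placeholders, not part of the original notes:

    import requests

    ua = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    # send a Referer header so the server believes we arrived from one of its own pages
    r = requests.get('https://httpbin.org/get',
                     headers={'User-Agent': ua, 'Referer': 'https://httpbin.org/'})

    # upload a file with the files parameter (multipart/form-data)
    with open('report.txt', 'rb') as f:                     # placeholder file name
        r = requests.post('https://httpbin.org/post', files={'file': f})

    # HTTP basic auth (e.g. a router login): requests builds the Authorization header
    r = requests.get('https://httpbin.org/basic-auth/admin/admin', auth=('admin', 'admin'))

    # client certificate / custom CA bundle (paths are placeholders)
    # r = requests.get('https://example.com', cert=('client.crt', 'client.key'), verify='ca.pem')
    print(r.status_code)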

    Basic requests functionality:

    import requests


    ua = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5514.400 QQBrowser/10.1.1660.400'
    url = 'https://dig.chouti.com/login'

    data = {
    'phone':'8618804928235',
    'password':'tana248654',
    'oneMonth':'1'
    }

    r1_urls = 'https://dig.chouti.com'
    r1 = requests.get(url=r1_urls,headers={'User-Agent':ua})   # first request: obtain the initial cookie
    # print(r1.text)
    r1_cookie = r1.cookies.get_dict()
    print('r1',r1.cookies)

    response = requests.post(url,data,headers={'User-Agent':ua},cookies=r1_cookie)   # log in, sending the initial cookie back

    print(response.text)
    print(response.cookies.get_dict())


    # vote for a post: the authorized 'gpsd' cookie from the first request is enough
    r3 = requests.post(url='https://dig.chouti.com/link/vote?linksId=21718341',
                       cookies={'gpsd':r1_cookie.get('gpsd')},headers={'User-Agent':ua})

    print(r3.text)

    II. HTML parsing

    With the libraries above we can download the HTML; the next step is extracting data from it.

    1. XPath

    http://www.qutoric.com/xmlquire/

    (an XML/XPath tool site.)

    XPath walks paths through the document tree to locate the content you need; a small sketch follows.
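
    A minimal XPath sketch with lxml (the toy HTML is made up purely to show the path syntax):

    from lxml import etree

    html = etree.HTML('<div class="item"><a href="/movie/1">Movie One</a></div>')
    # // searches anywhere in the tree, [@attr="..."] filters by attribute,
    # /text() and /@href pull out text nodes and attribute values
    print(html.xpath('//div[@class="item"]/a/text()'))   # ['Movie One']
    print(html.xpath('//div[@class="item"]/a/@href'))    # ['/movie/1']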

    2. The lxml library

    A library for parsing HTML.

    https://lxml.de/

    Installation:

    pip install lxml

    Crawling the Douban top 10:


    import urllib3
    from urllib.parse import urlencode
    from urllib3 import HTTPResponse
    import requests
    from lxml import etree

    # url = 'http://movie.douban.com'

    ua =  'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    jurl = 'https://movie.douban.com/j/search_subjects'

    d = {
        'type':'movie',
        'tag':'热门',
        'page_limit':10,
        'page_start':10
    }
    # urls = ['https://www.baidu.com/s?wd=magedu','https://www.baidu.com/s?wd=magedu']
    urls = ['https://movie.douban.com/']

    session = requests.Session()
    with session:
        for url in urls:
            response = session.get(url,headers={'User-agent':ua})
            with response:
                content = response.text

            html = etree.HTML(content)
            title = html.xpath("//div[@class='billboard-bd']//tr")
            for t in title:
                txt = t.xpath('.//text()')
                print(''.join(map(lambda x:x.strip(),txt)))
                # print(t)

    3. beautifulsoup4

    4. NavigableString (navigable strings)

    The parsed tree is traversed depth-first.

    soup.find_all()

    soup.find_all(id='header')
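
    A minimal hedged sketch of these calls (the toy HTML and tag names are only illustrative):

    from bs4 import BeautifulSoup   # pip install beautifulsoup4

    html = '<div id="header"><h2 class="news_entry"><a href="/n/1">Title</a></h2></div>'
    soup = BeautifulSoup(html, 'lxml')

    print(soup.find_all('a'))              # every <a> tag, found by a depth-first walk
    print(soup.find_all(id='header'))      # filter by attribute
    print(soup.select('h2.news_entry a'))  # CSS selector, as used in the crawler below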

    5. CSS selectors

    soup.select() takes CSS selectors; regular expressions can also be passed to find_all() for matching.

    pip install jsonpath
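
    jsonpath queries JSON the way XPath queries XML. A hedged sketch, assuming the `jsonpath` package's jsonpath() function:

    from jsonpath import jsonpath   # pip install jsonpath

    data = {'store': {'book': [{'title': 'A'}, {'title': 'B'}]}}
    # $ is the root, .. searches recursively; the result is a list of matches (or False if none)
    print(jsonpath(data, '$..title'))                 # ['A', 'B']
    print(jsonpath(data, '$.store.book[0].title'))    # ['A']

    The crawler below combines a thread pool, queues, requests and BeautifulSoup to collect news titles from the cnblogs news site: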

    from concurrent.futures import ThreadPoolExecutor
    import threading
    import time
    from queue import Queue
    import logging
    import requests
    from bs4 import BeautifulSoup

    event = threading.Event()
    url = 'https://news.cnblogs.com'
    path = '/n/page/'
    ua = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    urls = Queue()
    htmls = Queue()
    outps = Queue()


    def create_urls(start,stop,step=1):
        for i in range(start,stop+1,step):
            url1 = '{}{}{}/'.format(url,path,i)
            urls.put(url1)

    def crawler():
        while not event.is_set():
            try:
                url1 = urls.get(True,1)

                response = requests.get(url1,headers={'User-agent':ua})   # fetch the page for this queue URL
                with response:
                    html = response.text
                    htmls.put(html)
            except Exception as e:
                print(1,e)

    def parse():
        while not event.is_set():
            try:
                html = htmls.get(True,1)
                soup = BeautifulSoup(html,'lxml')
                news = soup.select('h2.news_entry a')


                for n in news:
                    txt = n.text
                    url1 = url + n.attrs.get('href')
                    outps.put((txt,url1))

            except Exception as e:
                print(e)

    def save(path):
        with open(path,'a+',encoding='utf-8') as f:
            while not event.is_set():
                try:
                    title,url1 = outps.get(True,1)
                    f.write('{} {}\n'.format(title,url1))   # one "title url" pair per line
                    f.flush()
                except Exception as e:
                    print(e)

    executor = ThreadPoolExecutor(max_workers=10)
    executor.submit(create_urls,1,10)
    executor.submit(parse)
    executor.submit(save,'c:/new.txt')

    for i in range(7):
        executor.submit(crawler)

    while True:
        cmd = input('>>>')
        if cmd.strip() == 'q':
            event.set()
            executor.shutdown()
            print('close')
            time.sleep(1)
            break

    III. Handling dynamic pages

    Many sites use AJAX and single-page-application (SPA) techniques: part of the content is loaded asynchronously to improve the user experience, so it is not present in the initial HTML.

    1. The PhantomJS headless browser

    http://phantomjs.org/

    The page's JavaScript talks to the backend server through XMLHttpRequest; a headless browser executes that JavaScript for us.

    2. selenium

    (1) selenium is an automated-testing tool that drives a browser: it can imitate browser behaviour and take screenshots directly.

    from selenium import webdriver
    import datetime
    import time
    import random


    driver = webdriver.PhantomJS('c:/assets/phantomjs-2.1.1-windows/bin/phantomjs.exe')

    driver.set_window_size(1024,1024)
    url = 'https://cn.bing.com/search?q=%E9%A9%AC%E5%93%A5%E6%95%99%E8%82%B2'
    driver.get(url)



    def savedic():
        try:
            base_dir = 'C:/assets/'
            filename = '{}{:%Y%m%d%H%M%S}{}.png'.format(base_dir,datetime.datetime.now(),random.randint(1,100))
            driver.save_screenshot(filename)
        except Exception as e:
            print(1,e)
    # time.sleep(6)
    # print('-------')
    # savedic()
    MAXRETRIES = 5
    while MAXRETRIES:
        try:
            ele = driver.find_element_by_id('b_results')
            print(ele)
            print('===========')
            savedic()
            break
        except Exception as e:
            print(e)
            print(type(e))
        time.sleep(1)
        MAXRETRIES -= 1

    Because the results are loaded asynchronously, the element lookup is retried in a loop until the data appears.

    (2) For drop-down boxes, use the Select helper; see the sketch below.
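
    A minimal hedged sketch of Select (the URL and the element id 'city' are hypothetical):

    from selenium import webdriver
    from selenium.webdriver.support.ui import Select

    driver = webdriver.PhantomJS('c:/assets/phantomjs-2.1.1-windows/bin/phantomjs.exe')
    driver.get('https://example.com/form')                    # placeholder URL

    # wrap the <select> element and pick an option
    dropdown = Select(driver.find_element_by_id('city'))      # hypothetical element id
    dropdown.select_by_visible_text('Beijing')                # or select_by_value / select_by_index
    driver.quit()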

    3. Simulating keyboard input

    To imitate a browser login: first find the ids of the login input boxes, then type into them with send_keys.

    Afterwards the driver holds the page returned after logging in.

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    import random
    import datetime


    driver = webdriver.PhantomJS('c:/assets/phantomjs-2.1.1-windows/bin/phantomjs.exe')

    driver.set_window_size(1024,1024)

    url = 'https://www.oschina.net/home/login?goto_page=https%3A%2F%2Fwww.oschina.net%2F'

    def savedic():
        try:
            base_dir = 'C:/assets/'
            filename = '{}{:%Y%m%d%H%M%S}{}.png'.format(base_dir,datetime.datetime.now(),random.randint(1,100))
            driver.save_screenshot(filename)
        except Exception as e:
            print(1,e)

    driver.get(url)
    print(driver.current_url,111111111111)
    savedic()

    email = driver.find_element_by_id('userMail')
    passwed = driver.find_element_by_id('userPassword')

    email.send_keys('604603701@qq.com')
    passwed.send_keys('tana248654')
    savedic()
    passwed.send_keys(Keys.ENTER)



    time.sleep(2)
    print(driver.current_url,2222222222)
    userinfo = driver.find_element_by_class_name('user-info')
    print(userinfo.text)
    time.sleep(2)
    cookie = driver.get_cookies()
    print(cookie)
    savedic()

    4. Waiting for the page

    (1) time.sleep

    The JavaScript needs some time to load the data.

    Put the thread to sleep.

    And cap the number of retries, as in the screenshot loop above.

    (2) selenium's own waits

    Explicit wait:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    try:
        email = WebDriverWait(driver,10).until(
            EC.presence_of_all_elements_located((By.ID,'userMail'))
        )
        savedic()
    finally:
        driver.quit()

    Implicit wait:

    driver.implicitly_wait(10)

    In short: prefer selenium's explicit or implicit waits over fixed time.sleep calls.

    IV. The scrapy framework

    1. Installation

    pip install scrapy    This can fail on Windows; the usual cause is the Twisted dependency, so download the matching Twisted .whl file and install it with pip first.

    2. Usage

    scrapy startproject scrapyapp   creates a new project.

    scrapy genspider donz_spider dnoz.org   creates a new spider module under the spiders directory; add the site to crawl to its URL list.

    scrapy genspider -t basic dbbook douban.com   generates a spider from the basic template (less boilerplate).

    scrapy genspider -t crawl book douban.com   generates a spider from the crawl template (more boilerplate).

    -t selects the template, followed by the spider name and the domain.

    scrapy crawl donz_spider   runs the spider; if it errors on Windows, pip install pypiwin32.

    from scrapy.http.response.html import HtmlResponse

    The response object passed to spider callbacks inherits from HtmlResponse.

    In items.py define the fields to scrape, e.g. the title.

    In the files under spiders/ write the XPaths, the crawl rules/queue, and the matching of the scraped content; see the sketch below.

    middlewares.py holds the middlewares.

    pipelines.py holds the item-processing functions.
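
    A minimal hedged sketch of how the pieces fit together (the names come from the commands above; the start URL and XPath are illustrative):

    import scrapy


    class DbbookItem(scrapy.Item):
        # defined in items.py: one Field per piece of information to scrape
        title = scrapy.Field()


    class DbbookSpider(scrapy.Spider):
        # generated with: scrapy genspider -t basic dbbook douban.com
        name = 'dbbook'
        allowed_domains = ['douban.com']
        start_urls = ['https://book.douban.com/top250']   # seed URLs for the crawl queue

        def parse(self, response):
            # response is an HtmlResponse; extract data with XPath and yield items
            for title in response.xpath('//div[@class="pl2"]/a/@title').extract():
                item = DbbookItem()
                item['title'] = title
                yield item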

    V. The scrapy-redis component

    1. Using scrapy-redis

    pip install scrapy_redis

    Configuration needed to use redis as the crawl queue:

    settings.py

    BOT_NAME = 'scrapyapp'

    SPIDER_MODULES = ['scrapyapp.spiders']
    NEWSPIDER_MODULE = 'scrapyapp.spiders'

    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 1
    COOKIES_ENABLED = False

    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

    ITEM_PIPELINES = {      # enable the scrapy_redis pipeline alongside the project pipeline
        'scrapyapp.pipelines.ScrapyappPipeline': 300,
        'scrapy_redis.pipelines.RedisPipeline': 543,
    }

    REDIS_HOST = '192.168.118.130'   
    REDIS_PORT = 6379

    # LOG_LEVEL = 'DEBUG'

    The spider file under spiders/:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy_redis.spiders import RedisCrawlSpider
    from ..items import MovieItem


    class MoviecommentSpider(RedisCrawlSpider):
        name = 'moviecomment'
        allowed_domains = ['douban.com']
        # start_urls = ['http://douban.com/']
        redis_key = 'moviecomment1:start_urls'

        rules = (
            Rule(LinkExtractor(allow=r'start=\d+'), callback='parse_item', follow=False),
        )

        def parse_item(self, response):
            # i = {}
            #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
            #i['name'] = response.xpath('//div[@id="name"]').extract()
            #i['description'] = response.xpath('//div[@id="description"]').extract()
            # return i
            comment = '//div[@class="comment-item"]//span[@class="short"]/text()'
            reviews = response.xpath(comment).extract()
            for review in reviews:
                item = MovieItem()
                item['comment'] = review.strip()
                yield item

    items.py

    import scrapy


    class MovieItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        comment = scrapy.Field()

    In redis you must create the key matching redis_key = 'moviecomment1:start_urls' in the spider and push the initial start URL as its value, as sketched below.

    Once the crawl runs, the scraped items are written back into redis.

    You can start the client as redis-cli --raw so that non-ASCII values display correctly.
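
    A hedged sketch of seeding the queue with redis-py (the comment-page URL is only an example; redis-cli's lpush works the same way):

    from redis import Redis

    r = Redis(host='192.168.118.130', port=6379)   # same host/port as in settings.py
    # the key must match redis_key in the spider; the value is the first URL to crawl
    r.lpush('moviecomment1:start_urls',
            'https://movie.douban.com/subject/26752088/comments?start=0')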

    2. Analysis

    (1) jieba word segmentation

    pip install jieba

    (2) Stopwords

    Data cleaning: wash out the dirty data, i.e. detect and remove invalid or irrelevant records such as empty values, illegal values and duplicates.

    (3) Word clouds

    pip install wordcloud

    from redis import Redis
    import json
    import jieba


    redis = Redis()
    stopwords = set()
    with open('', encoding='gbk') as f:      # path of the stopwords file
        for line in f:
            print(line.rstrip('\n').encode())
            stopwords.add(line.rstrip('\n'))
    print(len(stopwords))
    print(stopwords)
    items = redis.lrange('dbreview:items', 0, -1)
    print(type(items))


    words = {}
    for item in items:
        val = json.loads(item)['review']
        for word in jieba.cut(val):
            words[word] = words.get(word, 0) + 1
    print(len(words))
    print(sorted(words.items(), key=lambda x: x[1], reverse=True))

    Testing the word-segmentation code.
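
    The notes install wordcloud but never call it; a hedged sketch of turning the `words` frequency dict above into an image (the font and output paths are placeholders; a font with Chinese glyphs is required):

    from wordcloud import WordCloud

    # drop stopwords and empty tokens before rendering
    freqs = {w: c for w, c in words.items() if w not in stopwords and w.strip()}

    wc = WordCloud(font_path='c:/assets/simhei.ttf',   # placeholder font path
                   width=800, height=600, background_color='white')
    wc.generate_from_frequencies(freqs)
    wc.to_file('c:/assets/wordcloud.png')              # placeholder output path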

    VI. A scrapy project

    1. Review of the basics

    2. Crawling a tech blog site

    praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/text()").extract()
    fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()
    # match_re = re.match(".*(\d+).*", fav_nums)

    When an element's class attribute has several values, select it with the contains() function, as in the XPaths above.

    from scrapy.http import Request  # pass discovered URLs on to the next request
    from urllib import parse

    # extract the next-page link and hand it to scrapy to download
    next_url = response.xpath('//div[@class="navigation margin-20"]/a[4]/@href').extract_first()
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    (1) Downloading and storing images:

    pip install pillow

    IMAGES_URLS_FIELD = "front_image_url"
    project_dir = os.path.abspath(os.path.dirname(__file__))
    IMAGES_STORE = os.path.join(project_dir, 'images')
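
    For these settings to take effect, scrapy's built-in ImagesPipeline must also be enabled; a hedged settings.py sketch (the pipeline priority is arbitrary):

    import os

    ITEM_PIPELINES = {
        'scrapy.pipelines.images.ImagesPipeline': 1,   # built-in image downloader
    }
    IMAGES_URLS_FIELD = "front_image_url"              # item field holding a list of image URLs
    project_dir = os.path.abspath(os.path.dirname(__file__))
    IMAGES_STORE = os.path.join(project_dir, 'images') # where the images are saved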

    (2) Writing items to a local file:

    class JsonWithEncodingPipeline(object):
        def __init__(self):
            self.file = codecs.open('article.json', 'w', encoding='utf-8')

        def process_item(self, item, spider):
            lines = json.dumps(dict(item), ensure_ascii=False) + "\n"   # one JSON object per line
            self.file.write(lines)
            return item

        def spider_closed(self, spider):
            self.file.close()

    scrapy also ships with a JsonItemExporter.

    (3) Exporters: JSON, CSV and other formats

    class JsonItemExporterPipeline(object):
        '''
        Export items using scrapy's built-in JsonItemExporter.
        '''
        def __init__(self):
            self.file = open('articleexport.json', 'wb')
            self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
            self.exporter.start_exporting()

        def close_spider(self, spider):
            self.exporter.finish_exporting()
            self.file.close()

        def process_item(self, item, spider):
            self.exporter.export_item(item)
            return item

    (4) Inserting into the database (synchronous)

    class MysqlPipeline(object):
        def __init__(self):
            self.conn = MySQLdb.connect('192.168.118.131', 'wang', 'wang', 'scrapy_jobbole', charset='utf8', use_unicode=True)
            self.cursor = self.conn.cursor()

        def process_item(self, item, spider):
            insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s)
            """
            self.cursor.execute(insert_sql, (item['title'], item['url'], item['create_date'], item['fav_nums']))
            self.conn.commit()

    (5) scrapy's asynchronous approach with Twisted's adbapi

    import MySQLdb
    import MySQLdb.cursors
    from twisted.enterprise import adbapi

    class MysqlTwistedPipeline(object):
        def __init__(self, dbpool):
            self.dbpool = dbpool

        @classmethod
        def from_settings(cls, settings):
            dbparms = dict(
                host=settings['MYSQL_HOST'],
                db=settings['MYSQL_DBNAME'],
                user=settings['MYSQL_USER'],
                password = settings['MYSQL_PASSWORD'],
                charset='utf8',
                cursorclass = MySQLdb.cursors.DictCursor,
                use_unicode = True
            )

            dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
            return cls(dbpool)

        def process_item(self, item, spider):
            '''
            Run the insert asynchronously through the connection pool.
            :param item:
            :param spider:
            :return:
            '''
            query = self.dbpool.runInteraction(self.do_insert, item)
            query.addErrback(self.handle_error)

        def handle_error(self, failure):
            '''
            Handle errors raised by the insert.
            :param failure:
            :return:
            '''
            print(failure)

        def do_insert(self, cursor, item):
            '''
            Execute the actual insert.
            :param cursor:
            :param item:
            :return:
            '''
            insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s)
            """
            cursor.execute(insert_sql, (item['title'], item['url'], item['create_date'], item['fav_nums']))

    (6) Integrating django models into scrapy

    scrapy-djangoitem
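
    A hedged sketch of scrapy-djangoitem; `Article` and `myapp` are hypothetical Django names, not from the original notes:

    from scrapy_djangoitem import DjangoItem
    from myapp.models import Article        # hypothetical Django model

    class ArticleItem(DjangoItem):
        django_model = Article              # the item fields come from the model

    # in a pipeline, item.save() then writes the row through the Django ORM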

    (7) When the xpath/css calls pile up, use an ItemLoader

    # load the item through an ItemLoader
    item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
    # item_loader.add_css()
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')

    Processors can be attached to each Field in the item definition:

    class ArticleItem(scrapy.Item):
        title = scrapy.Field(
            input_processor=MapCompose(add_jobbole)
        )
        create_date = scrapy.Field(
            input_processor=MapCompose(add_time)
        )

    Custom default output processor:

    class ArticleItemLoader(ItemLoader):
        # custom item loader
        default_output_processor = TakeFirst()

    The number after each pipeline in ITEM_PIPELINES is its priority: lower numbers run earlier.

    VII. Coping with anti-crawling measures

    1. Modify the settings and middlewares files

    In settings.py define a user_agent_list.

    In middlewares.py add:

    import random

    class RandomUserAgentMiddlware(object):
        '''
        Randomly switch the User-Agent.
        '''
        def __init__(self, crawler):
            super(RandomUserAgentMiddlware, self).__init__()
            self.user_agent_list = crawler.settings.get("user_agent_list", [])

        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler)

        def process_request(self, request, spider):
            request.headers.setdefault('User-Agent', random.choice(self.user_agent_list))

    2. A library that switches the User-Agent randomly

    pip install fake-useragent

    from fake_useragent import UserAgent

    class RandomUserAgentMiddlware(object):
        '''
        Randomly switch the User-Agent using fake-useragent.
        '''
        def __init__(self, crawler):
            super(RandomUserAgentMiddlware, self).__init__()
            # self.user_agent_list = crawler.settings.get("user_agent_list", [])
            self.ua = UserAgent()

        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler)

        def process_request(self, request, spider):
            request.headers.setdefault('User-Agent', self.ua.random)

    class RandomUserAgentMiddlware(object):
        '''
        Randomly switch the User-Agent; the UA family comes from the RANDOM_UA_TYPE setting.
        '''
        def __init__(self, crawler):
            super(RandomUserAgentMiddlware, self).__init__()
            # self.user_agent_list = crawler.settings.get("user_agent_list", [])
            self.ua = UserAgent()
            self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")   # configuration option

        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler)

        def process_request(self, request, spider):
            def get_ua():
                return getattr(self.ua, self.ua_type)
            request.headers.setdefault('User-Agent', get_ua())

    This picks a random User-Agent for every request.

    3. Proxy IPs

    A plain proxy IP:

    request.meta['proxy'] = "http://61.135.217.7:80"  # proxy for this request

    (1) Set a plain proxy IP directly, as above.

    (2) Or first crawl a proxy-listing site and store the proxy IPs in a database, then fetch one from the database inside the middleware to use as the proxy.

    import requests
    from scrapy.selector import Selector
    import MySQLdb
    import threading
    from fake_useragent import UserAgent


    conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='centos', db='test', charset='utf8')
    cour = conn.cursor()

    ua = UserAgent()


    def crawl_ips():
        headers = {
            'User-Agent':  'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        }
        for i in range(3):
            re = requests.get('http://www.xicidaili.com/wt/{0}'.format(i), headers=headers)

            seletor = Selector(text=re.text)
            all_trs = seletor.css('#ip_list tr')
            ip_list = []
            for tr in all_trs:
                speed_strs = tr.css(".bar::attr(title)").extract()
                if speed_strs:
                    speed_str = speed_strs[0]

                all_texts = tr.css('td::text').extract()
                if all_texts:
                    ip = all_texts[0]
                    port = all_texts[1]
                    proxy_type = all_texts[5]
                    ip_list.append((ip, port, proxy_type, speed_str.split('秒')[0]))

            for ip_info in ip_list:
                cour.execute(
                    "insert xici_ip_list(ip, port, speed, proxy_type) VALUES('{0}', '{1}', '{2}', '{3}')".format(
                        ip_info[0], ip_info[1], ip_info[3], ip_info[2])

                )
                conn.commit()
                print('written to the database')


    # crawl_ips()


    class GetIP(object):
        def delete_ip(self, ip):
            delete_sql = """
            delete from xici_ip_list where ip='{0}'
            """.format(ip)
            cour.execute(delete_sql)
            conn.commit()
            return True

        def judge_ip(self, ip, port):
            http_url = 'http://www.baidu.com'
            proxy_url = 'http://{}:{}'.format(ip, port)
            try:
                proxy_dict = {
                    'http': proxy_url
                }
                response = requests.get(http_url, proxies=proxy_dict)
            except Exception as e:
                print('invalid ip and port')
                self.delete_ip(ip)
                return False
            else:
                code = response.status_code
                if code >= 200 and code < 300:
                    print('effective ip')
                    return True
                else:
                    print('invalid ip and port')
                    self.delete_ip(ip)
                    return False

        def get_random_ip(self):
            # fetch a random ip from the database
            sql = """
            SELECT ip, port FROM xici_ip_list
            ORDER BY RAND()
            LIMIT 1
            """
            result = cour.execute(sql)
            for ip_info in cour.fetchall():
                ip = ip_info[0]
                port = ip_info[1]
                judge_ip = self.judge_ip(ip, port)
                if judge_ip:
                    return "http://{0}:{1}".format(ip, port)
                else:
                    return self.get_random_ip()


    # t = threading.Thread(target=crawl_ips)
    # t.start()

    get_ip = GetIP()

    get_ip.get_random_ip()

    class RandomProxyMiddleware(object):
        # set a proxy dynamically for each request
        def process_request(self, request, spider):
            get_ip = GetIP()
            request.meta['proxy'] = get_ip.get_random_ip()  # proxy for this request

    (3) The scrapy-proxies plugin

    https://github.com/aivarsk/scrapy-proxies/blob/master/scrapy_proxies
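
    A hedged sketch of wiring scrapy-proxies into settings.py, following the plugin's documented settings (the proxy-list path is a placeholder):

    # settings.py (sketch)
    RETRY_TIMES = 10
    RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]

    DOWNLOADER_MIDDLEWARES = {
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
        'scrapy_proxies.RandomProxy': 100,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    }

    PROXY_LIST = '/path/to/proxy/list.txt'   # placeholder: one proxy per line
    PROXY_MODE = 0                           # 0 = pick a random proxy for every request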

    (4) scrapy-crawlera

    A paid service.

    (5) The Tor onion network

    https://github.com/aivarsk/scrapy-proxies/blob/master/scrapy_proxies

    A stable option.

    VIII. CAPTCHA recognition

    1. Ways to recognise CAPTCHAs

    Code it yourself with tesseract-ocr.

    Online captcha-solving services.

    http://www.yundama.com/

    Manual captcha entry.
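
    A hedged sketch of the do-it-yourself route with pytesseract (the tesseract-ocr binary must be installed separately; the image path is a placeholder):

    import pytesseract                  # pip install pytesseract pillow
    from PIL import Image

    img = Image.open('c:/assets/captcha.png')   # placeholder path to a captcha image
    img = img.convert('L')                      # greyscale often helps recognition
    code = pytesseract.image_to_string(img)
    print(code.strip())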

  • Original article: https://www.cnblogs.com/wangchunli-blogs/p/9951318.html