• Weibo-style crawler


    1. Create the crawler project

    scrapy startproject demo1
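    Depending on the Scrapy version, the command generates roughly the following layout (older releases may not create middlewares.py, in which case it is added by hand in step 3):

    demo1/
        scrapy.cfg
        demo1/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py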

    2. Analyze and define the fields of interest, create the database (chapter18.sql), and declare the scraped fields (items.py)

    chapter18.sql

    create database chapter18;
    use chapter18;
    
    create table info(
        id int(10) not null auto_increment primary key,
        name varchar(30),
        url varchar(100),
        hits int(15),
        comment int(15)
    );
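
    As a quick sanity check (assuming a local MySQL server with these credentials), confirm the table was created:

    use chapter18;
    show tables;
    desc info;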

    items.py

    # -*- coding: utf-8 -*-
    import scrapy
    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html

    class Demo1Item(scrapy.Item):
        # Fields we want to collect
        name = scrapy.Field()
        url = scrapy.Field()
        hits = scrapy.Field()
        comment = scrapy.Field()

    3. Add the downloader-middleware wrapper classes (middlewares.py) and point DOWNLOADER_MIDDLEWARES at them in settings.py

    # -*- coding: utf-8 -*-
    # Random selection from the pools
    import random
    # Base class for the proxy-IP middleware
    from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
    # Base class for the user-agent middleware
    from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


    # Proxy-IP middleware
    class HTTPPROXY(HttpProxyMiddleware):
        # Note: the default must be ip=''
        def __init__(self, ip=''):
            self.ip = ip

        def process_request(self, request, spider):
            item = random.choice(IPPOOL)
            try:
                print("Current proxy IP: " + item["ipaddr"])
                request.meta["proxy"] = "http://" + item["ipaddr"]
            except Exception as e:
                print(e)
                pass


    # Proxy IP pool (replace these sample entries with proxies that actually work)
    IPPOOL = [
        {"ipaddr": "182.117.102.10:8118"},
        {"ipaddr": "121.31.102.215:8123"},
        {"ipaddr": "1222.94.128.49:8118"}
    ]


    # User-agent middleware
    class USERAGENT(UserAgentMiddleware):
        # Note: the default must be user_agent=''
        def __init__(self, user_agent=''):
            self.user_agent = user_agent

        def process_request(self, request, spider):
            item = random.choice(UPPOOL)
            try:
                print("Current User-Agent: " + item)
                request.headers.setdefault('User-Agent', item)
            except Exception as e:
                print(e)
                pass


    # User-agent pool
    UPPOOL = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
    ]

    # Add the following to settings.py (this also covers disabling cookies and the pipeline setting)
    '''
    #==============================================

    # Disable cookies
    COOKIES_ENABLED = False

    # Downloader middlewares
    DOWNLOADER_MIDDLEWARES = {
        # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
        # '<project_name>.middlewares.HTTPPROXY': 125,
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
        '<project_name>.middlewares.USERAGENT': 1
    }

    # Item pipelines
    ITEM_PIPELINES = {
        '<project_name>.pipelines.<PipelineClassName>': 300,
    }

    # Remember to turn off robots.txt obedience (settings.py enables it by default, so find it and set it to False)
    ROBOTSTXT_OBEY = False

    #==============================================
    '''

    For example, add the following to settings.py:

    #==============================================
    
    # Disable cookies
    COOKIES_ENABLED = False
    
    # Downloader middlewares
    DOWNLOADER_MIDDLEWARES = {
        # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
        # 'demo1.middlewares.HTTPPROXY': 125,
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
        'demo1.middlewares.USERAGENT': 1
    }
    #==============================================
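
    If the proxy pool should be rotated as well, a possible variant (assuming the demo1 project and proxies that actually respond) is to enable the HTTPPROXY middleware alongside USERAGENT:

    DOWNLOADER_MIDDLEWARES = {
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
        'demo1.middlewares.HTTPPROXY': 125,
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
        'demo1.middlewares.USERAGENT': 1
    }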

    4. Write the pipeline (pipelines.py) and register it in ITEM_PIPELINES

    # -*- coding: utf-8 -*-
    import pymysql as pm
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

    class Demo1Pipeline(object):
        # Set up the database connection
        def __init__(self):
            # Open the connection
            self.db = pm.connect(host='localhost', user='root', password='123456', database='chapter18', charset='utf8')
            # Create a cursor
            self.cur = self.db.cursor()

        def process_item(self, item, spider):
            # The extracted fields are parallel lists; insert them row by row
            for j in range(0, len(item["name"])):
                name = item["name"][j]
                url = item["url"][j]
                hits = item["hits"][j]
                comment = item["comment"][j]
                sql = "insert into info(name,url,hits,comment) values(%s,%s,%s,%s)"
                try:
                    self.cur.execute(sql, (name, url, hits, comment))
                    self.db.commit()
                    print("OK")
                except Exception:
                    self.db.rollback()
                    print("Failed to insert the row")
            return item

        # Close the cursor and the database connection
        def close_spider(self, spider):
            self.cur.close()
            self.db.close()

    For example, add the following to settings.py:

    # Item pipelines
    ITEM_PIPELINES = {
        'demo1.pipelines.Demo1Pipeline': 300,
    }
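
    The pipeline depends on the pymysql package; if it is not installed yet:

    pip install pymysql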

    5. Generate the spider, write the spider file (test.py), and add the browser-mimicking helper class (HeadersHelper.py)

    # scrapy genspider -t basic test <domain of the site to crawl>
    scrapy genspider -t basic test hexun.com

    test.py source:

    # -*- coding: utf-8 -*-
    import scrapy
    from ..items import Demo1Item
    from .HeadersHelper import HeadersHelper
    from scrapy.http import Request

    class TestSpider(scrapy.Spider):
        name = "test"
        allowed_domains = ["hexun.com"]
        start_urls = (
            # 'http://www.hexun.com/',
            'http://yinglunjinye.blog.hexun.com/',
        )

        def parse(self, response):
            item = Demo1Item()
            # Article titles and links on the blog list page
            item["name"] = response.xpath('//span[@class="ArticleTitleText"]/a/text()').extract()
            item["url"] = response.xpath('//span[@class="ArticleTitleText"]/a/@href').extract()
            # The hit/comment counters are served by a click.tool.hexun.com script; find its URL first
            url = HeadersHelper('http://yinglunjinye.blog.hexun.com', pattern='<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)"></script>').handle_info()[0]
            item["hits"] = HeadersHelper(url, pattern=r"click\d*?','(\d*?)'").handle_info()
            item["comment"] = HeadersHelper(url, pattern=r"comment\d*?','(\d*?)'").handle_info()
            # Pagination links on this blog look like http://yinglunjinye.blog.hexun.com/p<N>/
            data = HeadersHelper("http://yinglunjinye.blog.hexun.com/", pattern=r"http://yinglunjinye.blog.hexun.com/p(\d*?)/").handle_info()
            if len(data) >= 2:
                # data[-2] is a str
                page_count = int(data[-2])
            else:
                page_count = 1
            yield item
            for i in range(2, page_count + 1):
                url = "http://yinglunjinye.blog.hexun.com/p" + str(i) + "/default.html"
                yield Request(url, callback=self.parse)

    Browser-mimicking helper class HeadersHelper.py:

    import urllib.request
    import http.cookiejar
    import re

    class HeadersHelper:
        def __init__(self, url, path=None, pattern=None):
            self.url = url  # urllib.request.quote(url, safe='/:?=', encoding='utf-8')
            self.path = path
            self.pattern = pattern

        # Set request headers that closely mimic a real browser
        def set_Headers(self):
            # "Accept-Encoding": "gb2312, utf-8" keeps the server from sending
            # compressed content that would otherwise come back garbled.
            # The Host header is left out on purpose: urllib fills it in from the URL.
            headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                       "Accept-Encoding": "gb2312, utf-8",
                       "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                       "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                       "Connection": "keep-alive"
                       }
            cjar = http.cookiejar.CookieJar()
            opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
            headall = []
            for key, value in headers.items():
                item = (key, value)
                headall.append(item)
            opener.addheaders = headall
            urllib.request.install_opener(opener)

        # Fetch the page and return it as a string
        def feedback_info(self):
            self.set_Headers()
            # Some pages need utf-8 and others gbk;
            # http://yinglunjinye.blog.hexun.com, for example, needs gbk:
            '''
            try:
                info = urllib.request.urlopen(self.url).read().decode('utf-8')
            except:
                info = urllib.request.urlopen(self.url).read().decode('gbk')
            '''
            # str() of the raw bytes is enough here, since the patterns only match ASCII
            info = urllib.request.urlopen(self.url).read()
            return str(info)

        # Save the page to a file
        def save_InFile(self):
            self.set_Headers()
            info = urllib.request.urlopen(self.url).read()
            file = open(self.path, 'wb')
            file.write(info)
            file.close()

        # Apply the regular expression to the page
        def handle_info(self):
            info = self.feedback_info()
            return re.compile(pattern=self.pattern, flags=re.S).findall(info)
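
    As a quick standalone check (assuming the blog is still online and its markup unchanged), the helper can be exercised outside Scrapy, for example to pull the click-counter script URL from the front page:

    from HeadersHelper import HeadersHelper

    # Find the click.tool.hexun.com counter script referenced by the blog's front page
    script_urls = HeadersHelper(
        'http://yinglunjinye.blog.hexun.com',
        pattern='<script type="text/javascript" src="(http://click.tool.hexun.com/.*?)"></script>'
    ).handle_info()
    print(script_urls)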

    6. Test
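
    From the project root, run the spider and then check the results in the chapter18.info table:

    scrapy crawl test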
