• 利用Scrapy框架对4567电影爬取


    1.创建一个爬虫文件Movie:--scrapy genspider Movie www.4567kan.com(genspider需要同时指定爬虫名和起始域名)

    2.在爬虫文件中编写:

      

    # -*- coding: utf-8 -*-
    import scrapy
    from dianying.items import DianyingItem


    class MovieSpider(scrapy.Spider):
        """Crawl movie names and descriptions from 4567kan.com (category: romance).

        Listing pages are parsed by ``parse``; each movie's detail page is
        fetched and finished by ``parse_detail``.
        """
        name = 'Movie'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.4567kan.com/index.php/vod/show/class/爱情/id/7.html']
        # %d-template for paginated listing URLs (pages 2 and up).
        url = 'https://www.4567kan.com/index.php/vod/show/class/爱情/id/7/page/%d.html'
        pageNumber = 2  # next page number to request

        def parse(self, response):
            """Parse one listing page: yield a detail request per movie, then paginate."""
            li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
            for li in li_list:
                name = li.xpath('./div/a/@title')[0].extract()
                detail_url = 'https://www.4567kan.com' + li.xpath('./div/a/@href').extract_first()
                item = DianyingItem()
                item['name'] = name
                # meta hands the partially-filled item over to the detail callback.
                yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
            # Crawl the first five pages: page 1 comes from start_urls, 2-5 from the template.
            if self.pageNumber <= 5:
                new_url = self.url % self.pageNumber
                # BUG FIX: the original never incremented pageNumber, so it kept
                # re-requesting page 2; Scrapy's duplicate filter then dropped the
                # request and pagination silently stopped after page 2.
                self.pageNumber += 1
                yield scrapy.Request(new_url, callback=self.parse)

        def parse_detail(self, response):
            """Parse a movie detail page and complete the item with its description."""
            item = response.meta['item']
            desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
            item['desc'] = desc
            yield item  # hand the finished item to the pipeline

    3.在items.py中编写
    import scrapy


    class DianyingItem(scrapy.Item):
        """Container for one scraped movie record."""
        # Movie title, taken from the listing page's <a title="..."> attribute.
        name = scrapy.Field()
        # Plot description, taken from the movie's detail page.
        desc = scrapy.Field()


    4.在管道中编写
    # -*- coding: utf-8 -*-

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import pymysql

    class DianyingPipeline:
        """Persist scraped movie items into the MySQL table ``DY``."""

        conn = None  # pymysql connection, opened in open_spider
        c = None     # cursor bound to ``conn``

        def open_spider(self, spider):
            """Open the database connection once, when the spider starts."""
            # NOTE(review): credentials are hard-coded; consider moving them to
            # the Scrapy settings file instead.
            self.conn = pymysql.Connect(user='root', password='123456', host='localhost',
                                        port=3306, database='xuezhijun', charset='utf8')
            self.c = self.conn.cursor()

        def process_item(self, item, spider):
            """Insert one item; roll back on failure, commit only on success."""
            try:
                # Parameterized query keeps scraped text from breaking the SQL.
                self.c.execute('insert into DY values (%s,%s)',
                               (item['name'], item['desc']))
            except Exception as e:
                # Best-effort: log the failure and undo the partial statement,
                # but keep the crawl running.
                print(e)
                self.conn.rollback()
            else:
                # BUG FIX: the original committed unconditionally, i.e. it also
                # ran commit() immediately after a rollback(). Commit only when
                # the insert succeeded.
                self.conn.commit()
            return item

        def close_spider(self, spider):
            """Release the cursor and connection when the spider finishes."""
            self.c.close()
            self.conn.close()

    5.设置一下配置文件
    # Register the pipeline; 300 is its priority order (lower runs earlier).
    ITEM_PIPELINES = {
    'dianying.pipelines.DianyingPipeline': 300,
    }
    # Do not honor robots.txt, otherwise the crawl is refused.
    ROBOTSTXT_OBEY = False
    # Only print errors, keeping the console output readable.
    LOG_LEVEL = 'ERROR'
    # Desktop Chrome user agent so requests are not rejected as a bot.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'


  • 相关阅读:
    关于oracle当中数据类型转换的问题
    CASE WHEN的两种格式
    C#设置默认打印机
    运用Merge Into实现增加或更新数据
    增加或修改的存储过程
    深拷贝与浅拷贝
    sql server两种分页方法
    获取sql执行时间
    inserted触发器,一张表插入数据时,同时向另外一张表插入数据
    List<string[]> 如何去重
  • 原文地址:https://www.cnblogs.com/KingOfCattle/p/13038892.html
Copyright © 2020-2023  润新知