• python-scrapy深度爬取


    爬取电影网站

    movie.py

    import scrapy
    from MyProjectDianying.items import MyprojectdianyingItem

    class MovieSpider(scrapy.Spider):
        """Two-level crawl of the 1905.com VOD list.

        ``parse`` walks a list page, creates one item per movie link and
        follows that link; ``parse_href`` completes the item with the text
        found on the detail page and emits it to the pipelines.
        """

        name = 'movie'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.1905.com/vod/list/n_1_t_1/o3.html?fr=vodhome_js_lx']

        # Template for the list pages after the first one; %d is the page number.
        url = 'https://www.1905.com/vod/list/n_1_t_1/o3p%d.html'
        # Number of the next list page to request (the start URL is page 1).
        page = 2

        def parse(self, response):
            """Yield a detail-page request per movie, then the next list page.

            Each detail request carries its partially filled item in
            ``meta['item']`` so that ``parse_href`` can finish it.
            """
            for div in response.xpath('//*[@id="content"]/section[4]/div'):
                href = div.xpath('./a/@href').get()
                title = div.xpath('./a/@title').get()
                if href is None or title is None:
                    # Entries without a movie link used to raise IndexError
                    # and abort the whole page; skip just that entry instead.
                    continue
                # Resolve relative / protocol-relative links against the page URL.
                href = response.urljoin(href)
                item = MyprojectdianyingItem()
                item["href"] = href
                item["title"] = title
                print(title)
                yield scrapy.Request(href, callback=self.parse_href, meta={'item': item})

            # Follow list pages 2 and 3 only. The counter is advanced before
            # yielding so it is already updated when the request is scheduled.
            if self.page < 4:
                next_url = self.url % self.page
                self.page += 1
                yield scrapy.Request(next_url, callback=self.parse)

        def parse_href(self, response):
            """Attach the detail-page intro text to the item and emit it."""
            item = response.meta['item']
            # Fall back to an empty string when the intro node is missing so
            # the item is still emitted instead of being lost to an IndexError.
            item["detail"] = response.xpath(
                '//*[@id="playerBoxIntroCon"]/text()'
            ).get(default='')
            yield item

    items.py

    import scrapy

    class MyprojectdianyingItem(scrapy.Item):
        """Item carrying the three values collected for one movie."""

        # Link to the movie's detail page.
        href = scrapy.Field()
        # Movie title taken from the list page.
        title = scrapy.Field()
        # Text extracted from the detail page.
        detail = scrapy.Field()

    settings.py

    # Only messages at ERROR level or above are logged.
    LOG_LEVEL = 'ERROR'
    # robots.txt rules are not honoured by this crawl.
    ROBOTSTXT_OBEY = False
    # Browser-like User-Agent string sent with the requests.
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    # Enabled item pipelines mapped to their order value.
    ITEM_PIPELINES = {'MyProjectDianying.pipelines.MyprojectdianyingPipeline': 300}

    pipelines.py

    class MyprojectdianyingPipeline:
        """Item pipeline that appends every scraped movie to ``dianying.txt``."""

        # Output file handle; None whenever the file is not open.
        fp = None

        def open_spider(self, spider):
            """Open (and truncate) the output file."""
            self.fp = open('dianying.txt', mode='w', encoding='utf-8')

        def process_item(self, item, spider):
            """Write the item's title, href and detail, then pass the item on.

            Returns the item unchanged so that later pipelines still receive it.
            """
            href = item["href"]
            title = item["title"]
            detail = item["detail"]
            # NOTE(review): the three fields are written back to back with
            # only a trailing space after the record; confirm this is the
            # intended file format before adding delimiters.
            self.fp.write(title + href + detail + ' ')
            return item

        def close_spider(self, spider):
            """Close the output file if it was opened."""
            # Guard against open_spider never having run (or having failed),
            # in which case fp is still None and .close() would raise.
            if self.fp is not None:
                self.fp.close()
                self.fp = None
  • 相关阅读:
    在IT行业工作如何获得高薪?选择前沿的技术,把准方向,有技术有人缘
    如何去做不想做的事情的 - 10个建议
    如何去做不想做的事情的 - 10个建议
    项目管理
    项目管理
    Spring Quartz 定时任务
    Spring Quartz 定时任务
    Spring @Transactional (一)
    Spring @Transactional (一)
    Search Insert Position
  • 原文地址:https://www.cnblogs.com/shiyi525/p/14274049.html
Copyright © 2020-2023  润新知