• scrapy crawl itcast -o teachers.json 爬虫案例


    1. spider.py文件配置
        # -*- coding: utf-8 -*-
        import scrapy
        from itTeachers.items import ItteachersItem


        class ItcastSpider(scrapy.Spider):
            """Spider that collects the entries listed on the itcast.cn teacher page.

            Run it with ``scrapy crawl itcast -o teachers.json`` to write the
            collected items to a JSON file.
            """

            name = 'itcast'
            allowed_domains = ['itcast.cn']
            start_urls = ['http://www.itcast.cn/channel/teacher.shtml#']

            def parse(self, response):
                """Build one ``ItteachersItem`` per ``<div class="li_txt">`` node.

                Args:
                    response: Response object whose ``xpath`` method is used to
                        select the entry nodes.

                Returns:
                    list: One ``ItteachersItem`` per matched node, with the
                    ``name``, ``title`` and ``info`` fields taken from the first
                    text node of the entry's ``<h3>``, ``<h4>`` and ``<p>``.
                """
                items = []

                for each in response.xpath('//div[@class="li_txt"]'):
                    # Wrap the scraped values in an ItteachersItem object.
                    item = ItteachersItem()

                    # extract_first() yields the first matching text node and
                    # falls back to the default when nothing matches, so an
                    # entry with a missing tag no longer raises IndexError and
                    # aborts the whole page.
                    item['name'] = each.xpath('h3/text()').extract_first(default='')
                    item['title'] = each.xpath('h4/text()').extract_first(default='')
                    item['info'] = each.xpath('p/text()').extract_first(default='')

                    items.append(item)

                # Hand the complete list back to the caller in one go.
                return items
    2. items.py文件配置
        # -*- coding: utf-8 -*-

        # Models for the items scraped by this project.
        #
        # Scrapy item documentation:
        # https://doc.scrapy.org/en/latest/topics/items.html

        import scrapy


        class ItteachersItem(scrapy.Item):
            """Item holding the three text fields scraped for one entry."""

            # Filled by ItcastSpider.parse from the entry's <h3> text.
            name = scrapy.Field()
            # Filled by ItcastSpider.parse from the entry's <h4> text.
            title = scrapy.Field()
            # Filled by ItcastSpider.parse from the entry's <p> text.
            info = scrapy.Field()

  • 相关阅读:
    2010上交:计算表达式
    添加子评论
    上传图片
    settings配置 文件操作
    django 操作前端数据
    静态文件配置
    render httprequest
    上传文件配置
    Django为什么要跳转到不同的页面来实现不同的功能
    定义日志器
  • 原文地址:https://www.cnblogs.com/hizf/p/8270008.html
Copyright © 2020-2023  润新知