scrapy爬虫系列之一--scrapy的基本用法

功能点：scrapy基本使用

爬取网站：传智播客老师

完整代码：https://files.cnblogs.com/files/bookwed/first.zip

主要代码：

ff.py

# -*- coding: utf-8 -*-
import scrapy
from first.items import FirstItem


class FfSpider(scrapy.Spider):
    """Spider that scrapes the teacher listing page on itcast.cn.

    ``scrapy.Spider`` is the most basic spider class; every spider must
    inherit from it.
    """

    # Spider name
    name = 'ff'
    # Allowed domains (optional)
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        """Default Request callback: turn the downloaded page into items.

        Selects every ``<div class="li_txt">`` node in ``response`` and
        yields one ``FirstItem`` per node with the keys ``name``,
        ``level`` and ``desc`` taken from the first text node of its
        ``<h3>``, ``<h4>`` and ``<p>`` children respectively.

        A field whose element is missing is stored as an empty string
        instead of raising ``IndexError``, so a single incomplete entry
        no longer aborts the whole callback and loses the remaining
        teachers.
        """
        for teacher in response.xpath("//div[@class='li_txt']"):
            # Create the item object
            item = FirstItem()
            # The relative XPaths must be evaluated against `teacher`
            # (the selector for the current node), not against `item`.
            # xpath() returns a selector list; extract_first() returns the
            # first matched string, or the default when nothing matched.
            item["name"] = teacher.xpath("./h3/text()").extract_first(default="")
            item["level"] = teacher.xpath("./h4/text()").extract_first(default="")
            item["desc"] = teacher.xpath("./p/text()").extract_first(default="")
            yield item

pipelines.py

import json
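# Note: this pipeline only runs if the ITEM_PIPELINES entry in settings.py
# is uncommented (enabled) for it.
# A pipeline is also a suitable place to de-duplicate items.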
class FirstPipeline(object):
    """Item pipeline that writes every scraped item to ``teachers.json``.

    Each item is serialized as one JSON object followed by a comma and a
    newline, so the output file holds one record per line.
    """

    def __init__(self):
        # Opened in the current working directory; UTF-8 so that non-ASCII
        # text is stored readably (see ensure_ascii=False below).
        self.f = open('teachers.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Print ``item``, append it to the output file, and return it.

        Args:
            item: The scraped item; must be convertible with ``dict()``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object, unchanged.
        """
        print(dict(item))
        # ensure_ascii=False keeps non-ASCII characters as-is instead of
        # escaping them to \uXXXX sequences.
        content = json.dumps(dict(item), ensure_ascii=False)
        self.f.write(content + ",")
        self.f.write("\n")
        return item

    def close_spider(self, spider=None):
        """Close the output file.

        Scrapy invokes this hook with the spider as an argument, so the
        parameter is required for the call to succeed; it defaults to
        ``None`` so that a bare ``close_spider()`` call keeps working.
        """
        self.f.close()

原文地址：https://www.cnblogs.com/bookwed/p/10617977.html