scrapy.py

1. Run in cmd: scrapy shell http://www.baidu.com
   Then try selectors such as response.xpath('//div[@aa="bb"]') until you find the content you want to match.
   ## syntax shown for reference only; the selector itself is not a real one

2. Run in cmd: scrapy startproject sunbeam (any name will do), then open the sunbeam project in PyCharm.

3. In items.py, declare the fields you want to scrape:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MyspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    create_time = scrapy.Field()
    content = scrapy.Field()
    digg_count = scrapy.Field()
    favorite_count = scrapy.Field()
    comment_count = scrapy.Field()
    author = scrapy.Field()

4. Run in cmd: scrapy genspider nhsq neihanshequ.com (genspider takes a spider name and a domain). This generates nhsq.py under the spiders folder in PyCharm (you can also create the file by hand). Then edit it:

# -*- coding: utf-8 -*-
import scrapy
import time
import json
from sunbeam.items import MyspiderItem  # the package name matches the project created in step 2


class NhsqSpider(scrapy.Spider):
    name = 'nhsq'  # the spider name must be unique
    allowed_domains = ['neihanshequ.com']

    # Option 1: start_urls must be a list or tuple, not a string
    start_urls = ['http://neihanshequ.com/']

    # Option 2: if you omit start_urls, you must define a start_requests() method
    def start_requests(self):
        url = 'http://neihanshequ.com/joke/?is_json=1&app_name=neihanshequ_web&max_time={}'.format(int(time.time()))
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        result = json.loads(response.text)
        data = result.get('data').get('data')
        for entry in data:
            # build a fresh item for every entry in the JSON response
            items = MyspiderItem()
            items['content'] = entry.get('group').get('content')
            items['create_time'] = entry.get('group').get('create_time')
            yield items  # after yield, the item is handed to pipelines.py
        '''
        yield scrapy.Request(link, callback=self.parse_item)
        '''

## Processing the returned data

1. In settings.py, uncomment the ITEM_PIPELINES = ... line (a sketch of the uncommented setting is given at the end of these notes). Then edit pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class SunbeamPipeline(object):
    def __init__(self):
        self.file = open('duanzi.json', 'w')

    def process_item(self, item, spider):
        # print(item['content'])
        # content = json.dumps(dict(item), ensure_ascii=False) + "\n"  # keeps Chinese text readable
        content = json.dumps(dict(item)) + "\n"
        print(content)
        self.file.write(content)
        return item  # return the item so any later pipelines can still process it

Then run in cmd: scrapy crawl nhsq (crawl takes the spider's name attribute, not the file name).
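
For reference, the uncommented ITEM_PIPELINES setting mentioned above typically ends up looking like the sketch below. The exact module path depends on your project name; this sketch assumes the project from step 2 is called sunbeam, and the number is only the pipeline's priority.

# settings.py (sketch, assuming the project is named sunbeam)
ITEM_PIPELINES = {
    'sunbeam.pipelines.SunbeamPipeline': 300,  # lower numbers run earlier
}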
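
One thing the pipeline above never does is close duanzi.json. A common variant (a sketch, not part of the original notes) uses Scrapy's open_spider/close_spider hooks so the file is opened when the crawl starts and closed when it ends; ensure_ascii=False also keeps the Chinese text readable in the output file.

# pipelines.py variant (sketch): let Scrapy manage the file handle
import json


class SunbeamPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.file = open('duanzi.json', 'w', encoding='utf-8')

    def close_spider(self, spider):
        # called once when the spider finishes
        self.file.close()

    def process_item(self, item, spider):
        # write one JSON object per line
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item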
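
If you would rather not switch to cmd every time, the same crawl can also be started from a plain Python script. This is only a sketch: the run.py file name is made up, and it has to be run from the project root so Scrapy can find the project settings.

# run.py (sketch): equivalent to `scrapy crawl nhsq`, run from the project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('nhsq')   # the spider's name attribute, not the file name
process.start()         # blocks until the crawl finishes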