- MongoDB
  - operations
- Scrapy
  - Installation
    - pip3 install scrapy
    - install Twisted first
    - install pywin32 (on Windows)
  - Create a new project
    - scrapy startproject project_name
  - Create a new spider (switch into the project you created first)
    - scrapy genspider cnblogs www.cnblogs.com/
  - Project layout
    - spiders: all the spider programs
    - items.py: similar to Django's model classes (sketched below)
    - middlewares.py: middleware
    - pipelines.py: persistence-related (sketched below)
    - settings.py: configuration file
    - scrapy.cfg: deployment-related
  - Run the spider
    - scrapy crawl cnblogs --nolog
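Since items.py and pipelines.py only get one-line descriptions above, here is a minimal sketch of how the two fit together. ArticleItem, ArticlePipeline, and the project_name package are placeholder names for illustration, not part of the generated project.

```python
# items.py -- declare the fields a spider scrapes, much like a Django model
import scrapy

class ArticleItem(scrapy.Item):      # hypothetical item for cnblogs posts
    title = scrapy.Field()
    url = scrapy.Field()


# pipelines.py -- every item the spiders yield passes through process_item
class ArticlePipeline:
    def process_item(self, item, spider):
        # a real pipeline would persist the item (MongoDB, a file, ...)
        print(item.get('title'), item.get('url'))
        return item
```

A pipeline only runs after it is registered in settings.py, e.g. ITEM_PIPELINES = {'project_name.pipelines.ArticlePipeline': 300}.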
The spider generated under the project's spiders directory:
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request


class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'                            # spider name, must be unique
    allowed_domains = ['cnblogs.com']           # allowed domains
    start_urls = ['https://www.cnblogs.com/']   # starting urls
    # depth-first vs breadth-first crawling
    # request deduplication: many strategies available
    # the crawl entry point is start_requests

    def parse_detail(self, response):
        print(len(response.text))

    def parse(self, response):
        # print('--------4444-------', response)
        # print(response.text)
        # you can mix in BeautifulSoup if you prefer:
        # from bs4 import BeautifulSoup
        # soup = BeautifulSoup(response.text, "lxml")
        # soup.find(name='div')

        # parse: select every element with class post_item
        div_list = response.css('.post_item')
        # print(len(div_list))
        # equivalent xpath selectors:
        # //a[contains(@class,"li")]
        # div_list = response.xpath('//div[contains(@class,"post_item")]')
        # div_list = response.xpath('//*[@id="post_list"]/div')
        # print(len(div_list))
        for div in div_list:
            # extract_first() returns the first element of the result list
            url = div.css('.post_item_body a::attr(href)').extract_first()
            print(url)
            yield Request(url, callback=self.parse_detail)

        # follow the "next page" link
        next_url = response.css('.pager a:last-child::attr(href)').extract_first()
        print('https://www.cnblogs.com' + next_url)
        yield Request('https://www.cnblogs.com' + next_url)
```
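As written, parse_detail only prints the page length. A sketch of how it could yield the hypothetical ArticleItem from above instead, so the scraped data actually reaches the pipelines; this assumes the placeholder project_name package and is not part of the original spider.

```python
# an alternative spider module where parse_detail yields items instead of printing
import scrapy
from project_name.items import ArticleItem   # project_name is the placeholder project name

class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    # ... other attributes and parse() as above ...

    def parse_detail(self, response):
        item = ArticleItem()
        item['url'] = response.url
        item['title'] = response.css('title::text').extract_first()
        yield item   # handed to the pipelines registered in ITEM_PIPELINES
```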
Add a main script (typically at the project root, next to scrapy.cfg), since launching from the terminal every time is tedious:
```python
from scrapy.cmdline import execute

# execute(['scrapy', 'crawl', 'cnblogs', '--nolog'])
execute(['scrapy', 'crawl', 'cnblogs'])
```
Don't disable the log here, because it is the only place errors are shown.
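An alternative to --nolog that still keeps errors visible is to lower the log level in settings.py; LOG_LEVEL is a standard Scrapy setting. The ROBOTSTXT_OBEY line is only a reminder of a setting that often needs adjusting during development, not something the notes above require.

```python
# settings.py -- keep errors visible while silencing the noisy INFO/DEBUG output
LOG_LEVEL = 'ERROR'
# ROBOTSTXT_OBEY = False   # uncomment only if robots.txt would otherwise block the crawl
```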