# -*- coding: utf-8 -*-
# Scrapy spider that crawls all courses listed on jikexueyuan.com.
import scrapy
from pyquery import PyQuery as pq
from jike.items import JikeItem


class JikespiderSpider(scrapy.Spider):
    """Crawl the course listing pages and scrape every course detail page.

    The crawl has two stages: ``parse_index`` reads a paginated listing page
    and schedules one request per course, and ``parse_detail`` turns a course
    page into a ``JikeItem`` with ``title``, ``time`` and ``content`` fields.
    """

    name = "jikespider"
    allowed_domains = ["www.jikexueyuan.com"]
    base_url = 'http://www.jikexueyuan.com/course/?pageNum='

    # Inclusive range of listing pages to request. The defaults reproduce the
    # previously hard-coded range(1, 96); override them on a subclass or via
    # spider arguments to crawl a different window.
    first_page = 1
    last_page = 95

    def start_requests(self):
        """Yield one request per listing page, handled by ``parse_index``."""
        # int() because spider arguments passed with ``-a`` arrive as strings.
        first = int(self.first_page)
        last = int(self.last_page)
        for page_num in range(first, last + 1):
            url = self.base_url + str(page_num)
            yield scrapy.Request(url, callback=self.parse_index)

    def parse_index(self, response):
        """Yield a request for every course linked from a listing page.

        Each ``li`` under ``.lesson-list .cf`` is expected to hold an anchor
        inside ``.lessonimg-box`` whose ``href`` points at the course page.
        """
        doc = pq(response.text)
        for entry in doc('.lesson-list .cf li').items():
            href = entry('.lessonimg-box a').attr('href')
            if not href:
                # No anchor (or an empty href) in this entry: nothing to
                # follow, and concatenating None would raise TypeError.
                continue
            # The href is not a complete URL on its own (the original code
            # had to prefix 'http:' to get a valid request URL). urljoin
            # resolves protocol-relative, path-relative and absolute hrefs
            # against the URL of the listing page.
            detail_url = response.urljoin(href)
            yield scrapy.Request(url=detail_url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract title, duration text and description from a course page."""
        doc = pq(response.text)
        item = JikeItem()
        item['title'] = doc('.lesson-teacher .bc-box h2').text()
        item['time'] = doc('.lesson-teacher .bc-box .timebox').text()
        item['content'] = doc('.lesson-teacher .infor-content').text()

        yield item