• scrapy爬取极客学院全部课程


     1 # -*- coding: utf-8 -*-
     2 # scrapy爬取极客学院全部课程
     3 import scrapy
     4 from pyquery import PyQuery as pq
     5 from jike.items import JikeItem
     6 
     7 class JikespiderSpider(scrapy.Spider):
     8     name = "jikespider"
     9     allowed_domains = ["www.jikexueyuan.com"]
    10     base_url = 'http://www.jikexueyuan.com/course/?pageNum='
    11 
    12     def start_requests(self):
    13         for page_num in range(1,96):
    14             url = self.base_url + str(page_num)
    15             yield scrapy.Request(url, callback=self.parse_index)
    16 
    17     def parse_index(self, response):
    18         doc = pq(response.text)
    19         lis = doc('.lesson-list .cf li').items()
    20         # pyquery心得, 以为pyquery有点问题而导致无法遍历数据结构,
    21         # 研究发现是'http:' + item('.lessonimg-box a').attr('href')
    22         # 的问题, href是相对路径没有得到一个有效的请求链接
    23         for item in lis:
    24             detail_url = 'http:' + item('.lessonimg-box a').attr('href')
    25             yield scrapy.Request(url=detail_url,callback=self.parse_detail)
    26 
    27     def parse_detail(self, response):
    28         item = JikeItem()
    29         doc = pq(response.text)
    30         item['title'] = doc('.lesson-teacher .bc-box h2').text()
    31         item['time'] = doc('.lesson-teacher .bc-box .timebox').text()
    32         item['content'] = doc('.lesson-teacher .infor-content').text()
    33 
    34         yield item
  • 相关阅读:
    canvas-color的几种设置
    canvas-2lineCap.html
    canvas-2lineJoin.html
    canvas-0trasform.html
    总体、个体和简单随机样本
    大数定律
    切比雪夫不等式
    B1032. 挖掘机技术哪家强
    Array(数组)对象-->join() 方法
    Array(数组)对象-->shift() 方法
  • 原文地址:https://www.cnblogs.com/themost/p/7077422.html
Copyright © 2020-2023  润新知