• [Crawler] Batch-downloading GeekTime course files


    import os

    import bs4
    import requests
    
    class Queue(object):
        """A simple FIFO queue."""
        def __init__(self):
            self.items = []
            self.sum = 0

        def is_empty(self):
            """Return True if the queue is empty."""
            return len(self.items) == 0

        def push(self, element):
            """Enqueue an element."""
            self.items.append(element)
            self.sum += 1

        def pop(self):
            """Dequeue and return the oldest element."""
            return self.items.pop(0)

        def size(self):
            """Return the current queue length."""
            return len(self.items)

        def total_size(self):
            """Return the total number of elements ever enqueued."""
            return self.sum
    
    class RequestsCrawler(object):
        """Page fetcher built on requests."""
        def get_page(self, url):
            """Fetch a page and return its body as text."""
            header = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; "
                              "rv:48.0) Gecko/20100101 Firefox/48.0",
                "Connection": "close"
            }

            with requests.Session() as s:
                res = s.get(url=url, headers=header)
                text = res.text
            return text
    
    def get_link(url, sub_url):
        """Collect the hrefs of all file/directory entries on a listing page."""
        crawler = RequestsCrawler()
        text = crawler.get_page(url + sub_url)
        soup = bs4.BeautifulSoup(text, "html.parser")
        tds = soup.find_all("td", attrs={'class': 'file'})
        rst = []
        for td in tds:
            href = td.find('a', href=True).attrs['href']
            rst.append(href)
        return rst
    
    def download(url, link):
        """Download url+link, mirroring the remote directory layout locally."""
        r = requests.get(url + link)
        path = os.getcwd()
        # Recreate the remote directory structure under the working directory.
        target_dir = path + '/'.join(link.split('/')[:-1])
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        print("Downloading", path + link)
        with open(path + link, "wb") as code:
            code.write(r.content)
        print("Done")
    
    if __name__ == "__main__":
        url = "https://d.shikey.com"
        # Breadth-first traversal of the directory tree, seeded at the root listing.
        queue = Queue()
        queue.push("/jike/")
        while not queue.is_empty():
            link = queue.pop()
            print("Traversing directory:", link)
            for href in get_link(url, link):
                if not href.endswith("?preview"):
                    # Directory entry: enqueue it for later traversal.
                    queue.push(href)
                else:
                    # File entry: strip the "?preview" suffix, then download.
                    download(url, href[:-len("?preview")])
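
    For reference, the standard library's collections.deque does the same job as the hand-rolled Queue above, with O(1) pops from the front where list.pop(0) is O(n). A minimal drop-in sketch (the name DequeQueue is illustrative, not part of the original script):

        from collections import deque

        class DequeQueue(object):
            """Same interface as Queue above, backed by collections.deque."""
            def __init__(self):
                self.items = deque()
                self.sum = 0

            def is_empty(self):
                return len(self.items) == 0

            def push(self, element):
                self.items.append(element)
                self.sum += 1  # running total of everything ever enqueued

            def pop(self):
                return self.items.popleft()  # O(1), unlike list.pop(0)

            def size(self):
                return len(self.items)

            def total_size(self):
                return self.sum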
    
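    Course files can be large, and requests.get() as used in download() buffers the whole response in memory before writing it out. requests can instead stream the body in chunks; here is a hedged sketch under the same path-building scheme (download_streamed is an illustrative name, not from the original post):

        def download_streamed(url, link, chunk_size=8192):
            """Stream url+link to disk chunk by chunk instead of buffering it all."""
            path = os.getcwd()
            target_dir = path + '/'.join(link.split('/')[:-1])
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            with requests.get(url + link, stream=True) as r:
                r.raise_for_status()
                with open(path + link, "wb") as f:
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        f.write(chunk)

    Swapping this in for download() in the main loop leaves the rest of the script unchanged.
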
  • Original post: https://www.cnblogs.com/yanqiang/p/14396196.html