• Using Python to grab all course download links and passwords from a personal website and save them to MongoDB


    1. Get the category URLs for the site's courses:

    '''
    Crawl the diaosiweb homepage and collect each category's name and link.
    '''
    
    import requests
    from lxml import etree
    
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    }
    def get_class_data():
        list_data = []
        url = 'http://www.diaosiweb.net/index.html'
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)        # parse once and reuse for both XPath queries
        class_names = html.xpath('//div[@id="menu"]/div/ul/li/a/text()')
        class_links = html.xpath('//div[@id="menu"]/div/ul/li/a/@href')
        for class_name, class_link in zip(class_names, class_links):
            # keep only links ending in '/', i.e. category directories rather than single pages
            if len(class_link.split('/')[-1]) == 0:
                class_data = {
                    '类别名称': class_name,     # category name
                    '类别链接': class_link,     # category link
                }
                list_data.append(class_data)
        return list_data
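
    To sanity-check what the function returns, a quick test run might look like this (a minimal sketch, not part of the original script):

    if __name__ == '__main__':
        for item in get_class_data():
            print(item)      # e.g. {'类别名称': '...', '类别链接': 'http://www.diaosiweb.net/.../'}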

    2. Use the category URLs collected above to fetch every course's name, link, and publish date in each category, and save them to MongoDB:

    '''
    Fetch the course names and links under each category URL; each course link is then
    followed to get that course's download URL and password.
    '''
    
    from spiders_diaosi import get_class_data
    import requests
    from lxml import etree
    import pymongo
    from multiprocessing import Pool
    
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    }
    client = pymongo.MongoClient('localhost',27017)
    diaosi = client['kecheng_message']
    kecheng_message = diaosi['message']
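
    # (Optional sketch, not in the original script.) A unique index on the course
    # link would keep repeated runs from inserting duplicate documents:
    # kecheng_message.create_index('课程链接', unique=True)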
    
    def get_kecheng_data(url):      # fetch the course names, links, and publish dates on one listing page
        try:
            response = requests.get(url, headers=headers)
            response.encoding = response.apparent_encoding
            html = etree.HTML(response.text)        # parse once and reuse for all three XPath queries
            kecheng_names = html.xpath('//ul[@class="g-list1"]/li/a/text()')
            kecheng_links = html.xpath('//ul[@class="g-list1"]/li/a/@href')
            times = html.xpath('//ul[@class="g-list1"]/li/span/text()')
            for kecheng_name, kecheng_link, time in zip(kecheng_names, kecheng_links, times):
                data = {
                    '课程名称': kecheng_name,       # course name
                    '课程链接': kecheng_link,       # course link
                    '发布时间': time,               # publish date
                }
                kecheng_message.insert_one(data)    # save to MongoDB; step 3 reads these records back out
        except Exception as e:
            print(e)
    
    def get_max_page(url):      # get the number of listing pages in a category
        page_response = requests.get(url, headers=headers)
        page_num = int(etree.HTML(page_response.text).xpath('//span[@class="pageinfo"]/strong[1]/text()')[0])
        return page_num
    
    def get_class_id(url):      # walk every listing page of a category
        class_response = requests.get(url, headers=headers)
        class_response.encoding = class_response.apparent_encoding
        max_page = get_max_page(url)        # call once instead of refetching the page on every use
        if max_page != 1:
            # paginated listing pages look like {category}/list_{class_id}_{page}.html,
            # so pull the class id out of the last pagination link
            class_id = int(etree.HTML(class_response.text).xpath('//ul[@class="pagelist"]/li/a/@href')[-1].split('_')[1])
            for num in range(1, max_page + 1):
                new_url = '{}list_{}_{}.html'.format(url, class_id, num)
                get_kecheng_data(new_url)
        else:
            get_kecheng_data(url)
    
    for link in get_class_data():       # read the category links scraped earlier, then crawl the courses under each one
        url = link['类别链接']
        print('Start crawling: ' + link['类别名称'])
        get_class_id(url)
        print('Finished crawling: ' + link['类别名称'])
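
    One loose end: `Pool` is imported in this script but never used, so the category loop above runs sequentially. A parallel version might look like the sketch below (hypothetical, not the author's code; the sequential loop would move under the `__main__` guard, and the module-level MongoClient is not fork-safe, so ideally each worker would open its own client):

    def crawl_category(link):
        print('Start crawling: ' + link['类别名称'])
        get_class_id(link['类别链接'])

    if __name__ == '__main__':
        with Pool(4) as pool:       # 4 worker processes; adjust to taste
            pool.map(crawl_category, get_class_data())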

    3. Read each course link back from the database. The download address is only visible after logging in, so simulate a login first, then fetch each course page and save the results to MongoDB:

    from get_captcha import get_capthca
    import pymongo
    import re
    import requests
    from lxml import etree
    import random
    
    client = pymongo.MongoClient('localhost',27017)
    diaosi = client['kecheng_message']
    kecheng_message = diaosi['message']
    dow_message = diaosi['dow_message']
    
    login_url = 'http://www.diaosiweb.net/member/index.php'
    headers_data = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
    ]
    headers = {'User-Agent':random.choice(headers_data)}
    data = {
       'fmdo':'login',
       'dopost':'login',
       'gourl':'',
       'userid':'***',      # put your username here when running (or read it with input())
       'pwd':'****',        # put your password here (or read it with input())
       'vdcode':'',
       'keeptime':'604800',
    }
    
    get_capthca(login_url)
    captcha = input('Enter the captcha you see: ')
    data['vdcode'] = captcha
    
    session = requests.Session()
    session.headers.update(headers)
    
    login_response = session.post(login_url, data=data)     # the login form must be POSTed; a GET would not submit the credentials
    for link in kecheng_message.find():
        html = session.get(link['课程链接'])
        html.encoding = html.apparent_encoding
        try:
            # triple-quoted pattern, since it contains both single and double quotes
            dow_url = re.compile("""<div id='pan' style="display:none;">(.*?)</div>""").findall(html.text)[0]
            mima = etree.HTML(html.text).xpath('//span[@style]/text()')
            data = {
                'name': link['课程名称'],
                'link': link['课程链接'],
                'dow_url': dow_url,
            }
            if len(mima) == 0 or (len(mima) > 5 and '网盘提取密码' not in mima[-1].split(':')):
                data['mima'] = '没有密码'       # i.e. "no password"
            else:
                data['mima'] = mima
            dow_message.insert_one(data)
            print(data)
        except Exception as e:
            print(e)
            print(link['课程名称'])
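
    One caveat: the script never verifies that the login actually succeeded, so a mistyped captcha would silently yield course pages with no download block. A minimal check (a sketch, assuming the member page echoes the username after a successful login) could replace the bare POST above:

    login_response = session.post(login_url, data=data)
    login_response.encoding = login_response.apparent_encoding
    if data['userid'] not in login_response.text:       # hypothetical check; adjust to what the page really shows
        raise SystemExit('Login appears to have failed; check the captcha and credentials')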

    Below is the helper that fetches the login page's captcha:

    '''
    Fetch the captcha from the login page and save it locally. For now it is only
    saved to disk; automatic recognition may be added later.
    '''
    
    
    import requests
    from lxml import etree
    import os
    
    login_url = 'http://www.diaosiweb.net/member/index.php'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    }
    
    def get_capthca(url):
        login_response = requests.get(url, headers=headers)
        # the src attribute is a relative path like '../include/vdimgck.php', so strip the '..'
        image_url = 'http://www.diaosiweb.net' + etree.HTML(login_response.text).xpath('//img[@id="vdimgck"]/@src')[0].replace('..', '')
        image_response = requests.get(image_url).content
        with open('captcha.jpg', 'wb') as f:        # the with block closes the file automatically
            f.write(image_response)
        print('The captcha has been saved to: {}'.format(os.getcwd()))
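
    To avoid a trip to the file manager, the saved image can also be popped up directly (a sketch assuming Pillow is installed; not part of the original script):

    from PIL import Image
    Image.open('captcha.jpg').show()        # opens captcha.jpg in the default image viewer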

    Well, that more or less rounds out a complete scraping project. Since this was my first full crawl, the code is fairly messy, there is no mind map, and I know plenty of places could be improved, but I am feeling lazy and do not want to write any more, so this will do for now!
