• 爬取校花网视频


    import requests
    import re
    import time
    import hashlib
    
    def get_page(url, timeout=10):
        """Download ``url`` and return the raw response body.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up; handed
                to ``requests.get`` so a stalled server cannot hang the crawl.

        Returns:
            The response body as ``bytes`` when the server answers with status
            200; ``None`` for any other status or when the request fails.
        """
        print('GET %s' %url)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # The crawl is best-effort, so a failed request still yields None,
            # but the reason is reported instead of being silently dropped.
            print('GET %s failed: %s' % (url, exc))
            return None
        if response.status_code == 200:
            return response.content
        return None
    
    def parse_index(res):
        """Yield the detail-page URLs linked from a listing page.

        Args:
            res: The listing page as GBK-encoded ``bytes``, or ``None`` when
                the page could not be downloaded.

        Yields:
            One URL per matched item. Links that do not start with ``http``
            are prefixed with ``http://www.xiaohuar.com``.
        """
        if not res:
            # get_page() returns None on failure; a missing page simply
            # contributes no links instead of raising AttributeError.
            return
        pattern = re.compile(r'class="items.*?<a href="(.*?)"', re.S)
        for detail_url in pattern.findall(res.decode('gbk')):
            if not detail_url.startswith('http'):
                detail_url = 'http://www.xiaohuar.com' + detail_url
            yield detail_url
    
    def parse_detail(res):
        """Extract the video URL from a detail page.

        Args:
            res: The detail page as GBK-encoded ``bytes``, or ``None`` when the
                page could not be downloaded.

        Returns:
            The ``src`` value following the first ``id="media"`` marker, or
            ``None`` when the page is missing or contains no such element.
        """
        if not res:
            # A failed download (None) has no video; avoid AttributeError.
            return None
        pattern = re.compile(r'id="media".*?src="(.*?)"', re.S)
        match = pattern.search(res.decode('gbk'))
        if match is None:
            return None
        return match.group(1)
    
    
    def save(movie_url):
        """Download the video at ``movie_url`` into the ``./movies`` directory.

        The file name is the MD5 hex digest of the URL combined with the
        current time, so repeated downloads never overwrite each other.
        Nothing is written when the server does not answer with status 200.
        Network and file-system errors are reported and swallowed so that one
        bad video does not abort the whole crawl.

        Args:
            movie_url: Direct URL of the video file.
        """
        import os  # local import: only this function needs it

        try:
            # stream=True keeps the video out of memory; it is written to disk
            # chunk by chunk instead of being buffered as one bytes object.
            with requests.get(movie_url, stream=True) as response:
                if response.status_code != 200:
                    return
                m = hashlib.md5()
                m.update(('%s%s.mp4' %(movie_url,time.time())).encode('utf-8'))
                target = r'./movies/%s.mp4' % m.hexdigest()
                # The original failed with FileNotFoundError on a fresh checkout.
                os.makedirs('./movies', exist_ok=True)
                # Download under a temporary name so an interrupted transfer
                # never leaves a truncated file that looks like a finished video.
                partial = target + '.part'
                with open(partial, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
                os.replace(partial, target)
        except (requests.RequestException, OSError) as exc:
            print('SAVE %s failed: %s' % (movie_url, exc))
    
    
    def main():
        """Crawl the first five listing pages one by one and save every video.

        For each listing page: download it, extract the detail-page links,
        download each detail page, extract the video URL and save the video.
        Pages that fail to download are skipped.
        """
        index_url='http://www.xiaohuar.com/list-3-{0}.html'
        for i in range(5):
            print('*'*50,i)
            # Fetch the listing page.
            index_page = get_page(index_url.format(i))
            if index_page is None:
                # Download failed; parsing None would raise AttributeError.
                continue
            # Parse the listing page into the detail-page URLs.
            for detail_url in parse_index(index_page):
                # Fetch the page that embeds the video.
                detail_page = get_page(detail_url)
                if detail_page is None:
                    continue
                # Extract the video URL and download it when present.
                movie_url = parse_detail(detail_page)
                if movie_url:
                    save(movie_url)
    
    
    # Start the sequential crawl only when the file is run as a script,
    # not when it is imported.
    if __name__ == '__main__':
        main()
    
    
    #并发爬取
    from concurrent.futures import ThreadPoolExecutor
    import queue
    import requests
    import re
    import time
    import hashlib
    from threading import current_thread
    
    # Shared worker pool (at most 50 threads) used for every download and
    # parse task of the concurrent crawler.
    p = ThreadPoolExecutor(max_workers=50)
    
    def get_page(url, timeout=10):
        """Download ``url`` from a worker thread and return the raw body.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up; without
                it a stalled server would occupy a pool thread indefinitely.

        Returns:
            The response body as ``bytes`` when the server answers with status
            200; ``None`` for any other status or when the request fails.
        """
        # Thread.getName() is deprecated since Python 3.10; .name is equivalent.
        print('%s GET %s' %(current_thread().name,url))
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(e)
            return None
        if response.status_code == 200:
            return response.content
        return None
    
    # Counts tasks that have been scheduled but are not finished yet. The queue
    # is used purely as a thread-safe counter: put() adds one unfinished task,
    # task_done() removes one, join() blocks until none are left. Items are
    # never read back.
    _pending = queue.Queue()


    def _submit(fn, arg, callback=None):
        """Schedule ``fn(arg)`` on the pool and track it until it is finished.

        A task counts as finished only after ``callback`` (if given) has
        returned, so any follow-up tasks the callback schedules are registered
        before the parent is released and the counter never drops to zero
        while work is still being generated.
        """
        _pending.put(arg)
        try:
            future = p.submit(fn, arg)
        except Exception:
            # Nothing was scheduled; undo the registration so join() cannot hang.
            _pending.task_done()
            raise

        def _finish(done_future):
            try:
                if callback is not None:
                    callback(done_future)
            finally:
                _pending.task_done()

        future.add_done_callback(_finish)
        return future


    def parse_index(res):
        """Done-callback for a listing page: schedule every detail page.

        Args:
            res: Finished future whose result is the listing page as
                GBK-encoded ``bytes``, or ``None`` when the download failed.
        """
        print('%s parse index ' %current_thread().name)
        page = res.result()
        if not page:
            # Failed download: nothing to parse.
            return
        pattern = re.compile(r'class="items.*?<a href="(.*?)"', re.S)
        for detail_url in pattern.findall(page.decode('gbk')):
            if not detail_url.startswith('http'):
                detail_url = 'http://www.xiaohuar.com' + detail_url
            _submit(get_page, detail_url, parse_detail)


    def parse_detail(res):
        """Done-callback for a detail page: record and download its video.

        The video URL is appended to ``db.txt`` (one per line) and a ``save``
        task is scheduled on the pool.

        Args:
            res: Finished future whose result is the detail page as
                GBK-encoded ``bytes``, or ``None`` when the download failed.
        """
        print('%s parse detail ' %current_thread().name)
        page = res.result()
        if not page:
            return
        match = re.search(r'id="media".*?src="(.*?)"', page.decode('gbk'), re.S)
        if match is None:
            return
        movie_url = match.group(1)
        print('MOVIE_URL: ',movie_url)
        with open('db.txt','a') as f:
            f.write('%s\n' %movie_url)
        _submit(save, movie_url)
        print('%s下载任务已经提交' %movie_url)


    def save(movie_url):
        """Download the video at ``movie_url`` into the ``./movies`` directory.

        The file name is the MD5 hex digest of the URL combined with the
        current time. Nothing is written when the server does not answer with
        status 200. Any error is printed and swallowed so that one failed
        download does not disturb the other workers.

        Args:
            movie_url: Direct URL of the video file.
        """
        import os  # local import: only this function needs it

        print('%s SAVE: %s' %(current_thread().name,movie_url))
        try:
            # stream=True avoids holding a whole video in memory in each of
            # the pool's worker threads.
            with requests.get(movie_url, stream=True) as response:
                if response.status_code != 200:
                    return
                m = hashlib.md5()
                m.update(('%s%s.mp4' %(movie_url,time.time())).encode('utf-8'))
                target = r'./movies/%s.mp4' % m.hexdigest()
                os.makedirs('./movies', exist_ok=True)
                # Write under a temporary name so an interrupted transfer never
                # leaves a truncated file that looks like a finished video.
                partial = target + '.part'
                with open(partial, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
                os.replace(partial, target)
        except Exception as e:
            print(e)


    def main():
        """Crawl the first five listing pages concurrently.

        Blocks until every scheduled download has finished. Returning early
        would let the interpreter begin shutting down, after which the pool
        rejects the follow-up tasks that the callbacks try to submit.
        """
        index_url='http://www.xiaohuar.com/list-3-{0}.html'
        for i in range(5):
            _submit(get_page, index_url.format(i), parse_index)
        _pending.join()
    
    
    # Start the concurrent crawl only when the file is run as a script,
    # not when it is imported.
    if __name__ == '__main__':
        main()
    
    爬取校花网视频
  • 相关阅读:
    KeilMdk .gitignore文件
    C#与C/C++之间数据类型的转换
    「邮件规则」写出我心(一百七十六)
    「当机立断」写出我心(一百七十五)
    「职责范围」写出我心(一百七十四)
    「建立个人知识库」写出我心(一百七十三)
    「学会利用资源」写出我心(一百七十二)
    「跨越舒适区」写出我心(一百七十一)
    「凡事预则立不预则废」写出我心(一百七十)
    「论述」写出我心(一百六十九)
  • 原文地址:https://www.cnblogs.com/HomeG/p/10519295.html
Copyright © 2020-2023  润新知