• 爬虫----爬取校花网视频


    import requests
    import re
    import time
    import hashlib
    
    def get_page(url):
        """Download ``url`` and return the raw response body.

        Args:
            url: Address of the page to fetch.

        Returns:
            The body as ``bytes`` when the server answers with HTTP 200,
            otherwise ``None`` (non-200 status or a failed request).
        """
        print('GET %s' %url)
        try:
            # A timeout keeps one unresponsive server from stalling the crawl.
            response=requests.get(url,timeout=10)
        except Exception as e:
            # Still best-effort, but report the failure instead of hiding it.
            print('GET %s failed: %s' %(url,e))
            return None
        if response.status_code == 200:
            return response.content
        return None
    
    def parse_index(res):
        """Yield the detail-page URLs linked from a list page.

        Args:
            res: Raw list-page body as GBK-encoded ``bytes``, or ``None``
                when the download failed.

        Yields:
            The URL of each detail page; links that do not start with
            ``http`` are prefixed with ``http://www.xiaohuar.com``.
        """
        if not res:
            # Nothing was downloaded, so there is nothing to parse.
            return
        obj=re.compile(r'class="items.*?<a href="(.*?)"',re.S)
        for detail_url in obj.findall(res.decode('gbk')):
            if not detail_url.startswith('http'):
                detail_url='http://www.xiaohuar.com'+detail_url
            yield detail_url
    
    def parse_detail(res):
        """Extract the video URL from a detail page.

        Args:
            res: Raw detail-page body as GBK-encoded ``bytes``, or ``None``
                when the download failed.

        Returns:
            The ``src`` value of the first element matched after
            ``id="media"``, or ``None`` when there is no page or no match.
        """
        if not res:
            # A failed download has no player to look for.
            return None
        obj=re.compile(r'id="media".*?src="(.*?)"',re.S)
        found=obj.search(res.decode('gbk'))
        if found is None:
            return None
        return found.group(1)
    
    
    def save(movie_url):
        """Download the video at ``movie_url`` into the ``./movies`` directory.

        The file name is the MD5 hex digest of the URL combined with the
        current timestamp, followed by ``.mp4``. Nothing is written unless
        the server answers with HTTP 200.

        Args:
            movie_url: Direct URL of the video file.
        """
        import os

        # stream=True lets the body be written chunk by chunk instead of
        # holding the whole video in memory.
        with requests.get(movie_url,stream=True) as response:
            if response.status_code != 200:
                return
            m=hashlib.md5()
            m.update(('%s%s.mp4' %(movie_url,time.time())).encode('utf-8'))
            filename=m.hexdigest()
            # open() fails when the target directory is missing, so create it.
            os.makedirs('./movies',exist_ok=True)
            with open(r'./movies/%s.mp4' %filename,'wb') as f:
                for chunk in response.iter_content(chunk_size=64*1024):
                    f.write(chunk)
    
    
    def main():
        """Crawl list pages 0 through 4 and download every video found.

        Pages that fail to download are skipped so that one bad request
        does not abort the rest of the crawl.
        """
        index_url='http://www.xiaohuar.com/list-3-{0}.html'
        for i in range(5):
            print('*'*50,i)
            # Fetch the list page.
            index_page=get_page(index_url.format(i,))
            if index_page is None:
                # Download failed; move on to the next list page.
                continue
            # Walk the detail pages linked from the list page.
            for detail_url in parse_index(index_page):
                detail_page=get_page(detail_url)
                if detail_page is None:
                    continue
                # Pull the video URL out of the detail page and store it.
                movie_url=parse_detail(detail_page)
                if movie_url:
                    save(movie_url)
    
    
    if __name__ == '__main__':
        main()
    
    
    #并发爬取
    from concurrent.futures import ThreadPoolExecutor
    import queue
    import requests
    import re
    import time
    import hashlib
    from threading import current_thread
    
    # Shared thread pool (up to 50 workers) that runs every page fetch and
    # video download submitted below.
    p=ThreadPoolExecutor(50)
    
    def get_page(url):
        """Download ``url`` and return the raw response body.

        Args:
            url: Address of the page to fetch.

        Returns:
            The body as ``bytes`` when the server answers with HTTP 200,
            otherwise ``None`` (non-200 status, or a request error, which
            is printed).
        """
        # Thread.getName() is deprecated; the ``name`` attribute is equivalent.
        print('%s GET %s' %(current_thread().name,url))
        try:
            # Without a timeout a stalled server would hold this worker forever.
            response=requests.get(url,timeout=10)
            if response.status_code == 200:
                return response.content
        except Exception as e:
            print(e)
        return None
    
    def parse_index(res):
        """Done-callback: parse a fetched list page and queue its detail pages.

        Each detail-page URL found is submitted to the pool for download,
        with ``parse_detail`` attached as the follow-up callback.

        Args:
            res: Completed future whose result is the list-page body as
                GBK-encoded ``bytes``, or ``None`` when the fetch failed.
        """
        print('%s parse index ' %current_thread().name)
        page=res.result()
        if not page:
            # The fetch failed, so there is nothing to parse.
            return
        obj=re.compile(r'class="items.*?<a href="(.*?)"',re.S)
        for detail_url in obj.findall(page.decode('gbk')):
            if not detail_url.startswith('http'):
                detail_url='http://www.xiaohuar.com'+detail_url
            p.submit(get_page,detail_url).add_done_callback(parse_detail)
    
    def parse_detail(res):
        """Done-callback: extract the video URL from a detail page and queue it.

        When a URL is found it is appended to ``db.txt`` (one per line) and
        a ``save`` task for it is submitted to the pool.

        Args:
            res: Completed future whose result is the detail-page body as
                GBK-encoded ``bytes``, or ``None`` when the fetch failed.
        """
        print('%s parse detail ' %current_thread().name)
        page=res.result()
        if not page:
            # The fetch failed, so there is no player to look for.
            return
        obj=re.compile(r'id="media".*?src="(.*?)"',re.S)
        matches=obj.findall(page.decode('gbk'))
        if matches:
            movie_url=matches[0]
            print('MOVIE_URL: ',movie_url)
            with open('db.txt','a') as f:
                # The newline must be an escape; a literal line break inside
                # the quotes is a syntax error.
                f.write('%s\n' %movie_url)
            p.submit(save,movie_url)
            print('%s下载任务已经提交' %movie_url)
    def save(movie_url):
        """Download the video at ``movie_url`` into the ``./movies`` directory.

        The file name is the MD5 hex digest of the URL combined with the
        current timestamp, followed by ``.mp4``. Nothing is written unless
        the server answers with HTTP 200. Any error is printed rather than
        raised.

        Args:
            movie_url: Direct URL of the video file.
        """
        import os

        print('%s SAVE: %s' %(current_thread().name,movie_url))
        try:
            # stream=True lets the body be written chunk by chunk instead of
            # holding the whole video in memory.
            with requests.get(movie_url,stream=True) as response:
                if response.status_code == 200:
                    m=hashlib.md5()
                    m.update(('%s%s.mp4' %(movie_url,time.time())).encode('utf-8'))
                    filename=m.hexdigest()
                    # open() fails when the target directory is missing.
                    os.makedirs('./movies',exist_ok=True)
                    with open(r'./movies/%s.mp4' %filename,'wb') as f:
                        for chunk in response.iter_content(chunk_size=64*1024):
                            f.write(chunk)
        except Exception as e:
            print(e)
    
    def main():
        """Submit list pages 0 through 4 to the pool.

        Each fetch gets ``parse_index`` attached as its done-callback, so
        the rest of the crawl is driven from the callbacks.
        """
        # NOTE(review): follow-up tasks are submitted from callbacks after
        # main() has returned; on recent Python versions submitting during
        # interpreter shutdown may raise RuntimeError - confirm.
        url_template='http://www.xiaohuar.com/list-3-{0}.html'
        for page_no in range(5):
            pending=p.submit(get_page,url_template.format(page_no))
            pending.add_done_callback(parse_index)
    
    
    if __name__ == '__main__':
        main()
    
    爬取校花网视频
    

      

  • 相关阅读:
    linux中服务器定时程序设定
    Linux中java项目环境部署,简单记录一下
    四则运算使用栈和后缀表达式
    PAT乙1003
    L7,too late
    PAT乙1002
    L6,Percy Buttons
    如何计算递归算法的时间复杂度
    c#打印(转)
    C中数组与指针【转】
  • 原文地址:https://www.cnblogs.com/w-s-l123/p/9645934.html
Copyright © 2020-2023  润新知