• Crawling xiaohuar.com (校花网) with Python


    import hashlib
    import re
    import time

    import requests

    def get_index(url):
        """Fetch one list page and return its HTML, or None on a non-200 response."""
        response = requests.get(url)
        if response.status_code == 200:
            return response.text

    def parse_index(res):
        """Extract the detail-page links from a list page's HTML."""
        urls = re.findall(r'class="items".*?href="(.*?)"', res, re.S)
        return urls

    def get_detail(urls):
        """Visit each detail page and download the video embedded in it."""
        for url in urls:
            # Relative links need the site prefix.
            if not url.startswith('http'):
                url = 'http://www.xiaohuar.com%s' % url
            r1 = requests.get(url)
            if r1.status_code == 200:
                url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
                if url_list:
                    mp4_url = url_list[0]
                    save(mp4_url)

    def save(url):
        """Download one video and write it to disk under an MD5-based file name."""
        print('Download: %s' % url)
        r2 = requests.get(url)
        if r2.status_code == 200:
            # Hash the URL plus the current time so file names do not collide.
            m = hashlib.md5()
            m.update(url.encode('utf-8'))
            m.update(str(time.time()).encode('utf-8'))
            filename = '%s.mp4' % m.hexdigest()
            file_path = r'D:\爬虫视频\%s' % filename  # the target directory must already exist
            with open(file_path, 'wb') as f:
                f.write(r2.content)

    def main():
        # Crawl the first five list pages one after another.
        for i in range(5):
            res1 = get_index('http://www.xiaohuar.com/list-3-%s.html' % i)
            if not res1:  # skip pages that failed to load
                continue
            urls = parse_index(res1)
            get_detail(urls)

    if __name__ == '__main__':
        main()
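
    Since some of the videos are fairly large, r2.content buffers the entire file in memory before it is written to disk. requests can stream the response body instead. The sketch below is one possible variant of save(), reusing the same URL-hash naming scheme; the save_stream name and the 64 KB chunk size are illustrative and not part of the original script.

    import hashlib
    import time

    import requests

    def save_stream(url, chunk_size=1024 * 64):
        # Stream the video to disk in fixed-size chunks instead of
        # holding the whole response body in memory at once.
        r = requests.get(url, stream=True)
        if r.status_code != 200:
            return
        m = hashlib.md5()
        m.update(url.encode('utf-8'))
        m.update(str(time.time()).encode('utf-8'))
        file_path = r'D:\爬虫视频\%s.mp4' % m.hexdigest()
        with open(file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)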

    Building on the code above, use a thread pool to crawl the videos concurrently and speed up the downloads.
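
    The threaded version below relies on ThreadPoolExecutor's submit/add_done_callback pattern: submit() returns a Future, and the callback receives that finished Future rather than the raw return value, which is why parse_index has to call .result(). Here is a minimal, self-contained illustration of the pattern; the fetch and handle names are placeholders, not part of the crawler.

    from concurrent.futures import ThreadPoolExecutor

    pool = ThreadPoolExecutor(4)

    def fetch(n):        # stands in for get_index
        return n * n

    def handle(future):  # stands in for parse_index
        # The callback gets the Future object, so unwrap it with .result().
        print('got', future.result())

    for i in range(3):
        pool.submit(fetch, i).add_done_callback(handle)

    pool.shutdown(wait=True)  # block until every submitted task has run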

    # Asynchronous version: a thread pool parallelises page fetching and video downloads.

    import hashlib
    import re
    import time

    import requests
    from concurrent.futures import ThreadPoolExecutor

    # One shared pool: 30 worker threads handle both page fetches and downloads.
    p = ThreadPoolExecutor(30)

    def get_index(url):
        """Fetch one list page and return its HTML, or None on a non-200 response."""
        response = requests.get(url)
        if response.status_code == 200:
            return response.text

    def parse_index(res):
        """Callback for a finished get_index task: submit one get_detail task per link."""
        res = res.result()  # add_done_callback delivers the Future, not the value
        if not res:         # the list page failed to load
            return
        urls = re.findall(r'class="items".*?href="(.*?)"', res, re.S)
        for url in urls:
            p.submit(get_detail, url)

    def get_detail(url):
        """Visit one detail page and download the video embedded in it."""
        # Relative links need the site prefix.
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url
        r1 = requests.get(url)
        if r1.status_code == 200:
            url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
            if url_list:
                save(url_list[0])

    def save(url):
        """Download one video and write it to disk under an MD5-based file name."""
        print('Download: %s' % url)
        r2 = requests.get(url)
        if r2.status_code == 200:
            m = hashlib.md5()
            m.update(url.encode('utf-8'))
            m.update(str(time.time()).encode('utf-8'))
            filename = '%s.mp4' % m.hexdigest()
            file_path = r'D:\爬虫视频\%s' % filename  # the target directory must already exist
            with open(file_path, 'wb') as f:
                f.write(r2.content)

    def main():
        # Queue the five list-page fetches; parse_index runs as each one finishes.
        for i in range(5):
            p.submit(get_index, 'http://www.xiaohuar.com/list-3-%s.html' % i).add_done_callback(parse_index)

    if __name__ == '__main__':
        main()
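
    One consequence of submitting new work from inside the callback is that main() returns as soon as the five index fetches are queued, and the pool cannot simply be shut down there without racing the callbacks that still need to submit get_detail tasks. A possible alternative, sketched below on the assumption that the helpers above are reused and get_detail takes a single URL, drives everything from main() and blocks until every download has finished; main_alt and the use of as_completed are not part of the original post.

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def main_alt():
        pool = ThreadPoolExecutor(30)
        # Fetch the five index pages concurrently.
        index_futures = [
            pool.submit(get_index, 'http://www.xiaohuar.com/list-3-%s.html' % i)
            for i in range(5)
        ]
        detail_futures = []
        for fut in as_completed(index_futures):
            res = fut.result()
            if not res:
                continue
            urls = re.findall(r'class="items".*?href="(.*?)"', res, re.S)
            # One task per detail page; each task downloads the video it finds.
            detail_futures.extend(pool.submit(get_detail, url) for url in urls)
        # Wait for every download, then release the worker threads.
        for fut in as_completed(detail_futures):
            fut.result()
        pool.shutdown(wait=True)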
  • Original article: https://www.cnblogs.com/shenbuer/p/7824422.html