• Part 6


    Environment: Python 3, PyCharm

    Modules: requests, bs4, urlretrieve (from urllib.request), os, time
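
    requests, bs4, and the lxml parser used below are third-party packages; urlretrieve, os, and time ship with the standard library. A typical install, assuming pip is available:

    pip install requests beautifulsoup4 lxml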

    Step 1: Fetch the page's source HTML

    import requests
    from bs4 import BeautifulSoup
    from urllib.request import urlretrieve
    import os
    import time

    def get_html(url):
        try:
            response = requests.get(url)
            response.encoding = 'gbk'  # the site serves GBK-encoded pages
            return response.text
        except Exception as e:
            print(e)

    if __name__ == '__main__':
        url = 'http://www.521609.com/meinvxiaohua/'
        get_html(url)
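
    The encoding is set by hand because the site declares a GBK charset; without it, requests falls back to whatever the HTTP headers suggest and the Chinese text comes out garbled. If you are unsure of a page's encoding, a quick check (a sketch, not part of the original script):

    response = requests.get('http://www.521609.com/meinvxiaohua/')
    print(response.encoding)            # encoding inferred from the HTTP headers
    print(response.apparent_encoding)   # encoding detected from the response body
    response.encoding = response.apparent_encoding  # adopt the detected encoding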

    Step 2: Download the images

    def down_show(html, page):
        try:
            soup = BeautifulSoup(html, 'lxml')
            # every thumbnail on a list page sits inside this div
            all_img = soup.find('div', class_='index_img list_center').find_all('img')

            num = 1
            for img in all_img:
                src = img.get('src')  # site-relative path of the image
                url_pic = 'http://www.521609.com' + src
                if not os.path.exists('show'):
                    os.mkdir('show')
                urlretrieve(url_pic, './show/' + 'page%s-%s.jpg' % (page, num))
                num += 1
        except Exception as e:
            print(e)
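
    Building the image URL by string concatenation assumes every src is site-relative. A slightly more robust sketch using the standard library's urljoin (the helper name and the sample path are mine, not from the original) copes with both relative and absolute src values:

    from urllib.parse import urljoin

    def build_image_url(src):
        # urljoin handles '/uploads/...' paths and full 'http://...' URLs alike
        return urljoin('http://www.521609.com/', src)

    print(build_image_url('/uploads/allimg/sample.jpg'))
    # -> http://www.521609.com/uploads/allimg/sample.jpg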

    Step 3: Choose how many pages to download; as written, the code fetches 5 pages

    def get_pages(page):
        # list121.html is page 1 of this section, so page i maps to list(120+i).html
        for i in range(121, page + 121):
            url = 'http://www.521609.com/meinvxiaohua/list%d.html' % i
            html = get_html(url)
            down_show(html, i - 120)
            time.sleep(1)  # pause between pages to go easy on the server
        print('All images downloaded')

    if __name__ == '__main__':
        get_pages(5)
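
    Fetching many pages over a flaky connection can fail mid-run, and get_html just prints the exception and returns None. A retry wrapper is one way to harden it; get_html_with_retry below is a hypothetical helper, not part of the original script:

    import requests
    import time

    def get_html_with_retry(url, retries=3, delay=2):
        # retry transient network failures a few times before giving up
        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=10)
                response.encoding = 'gbk'
                return response.text
            except requests.RequestException as e:
                print('attempt %d failed: %s' % (attempt + 1, e))
                time.sleep(delay)
        return None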

    A multithreaded approach also works. The full script below keeps the sequential loop active and carries the threaded variant commented out:

    import requests
    from bs4 import BeautifulSoup
    import threading
    import time
    import os
    
    headers = {
        'Referer': 'http://www.521609.com/meinvxiaohua/',
        'User-Agent': '',  # fill in your own browser's User-Agent string
    }
    
    def get_html(url):
        try:
            response = requests.get(url=url, headers=headers)
            response.encoding = 'gb2312'
            return response.text  # page source as a str
        except Exception as e:
            print(e)
    
    def mk_dir():
        # exist_ok=True: no error if ./show/ already exists
        os.makedirs('./show/', exist_ok=True)
    
    def down_image(html, page):
        try:
            soup = BeautifulSoup(html, 'lxml')  # lxml parses both HTML and XML
            all_img = soup.find('div', class_='index_img list_center').find_all('img')
            num = 1
            for img in all_img:
                src = img.get('src')  # site-relative path of the image
                url = 'http://www.521609.com' + src
                content = requests.get(url=url, headers=headers).content  # raw bytes
                with open('./show/page%s-%s.jpg' % (page, num), 'wb') as file:
                    file.write(content)
                num += 1
                time.sleep(1)
        except Exception as e:
            print(e)
    
    def get_pages(page):
        for i in range(121, 121 + page):
            url = 'http://www.521609.com/meinvxiaohua/list%s.html' % i
            html = get_html(url)
            if not os.path.exists('show'):
                mk_dir()
            down_image(html, i - 120)  # pass the 1-based page number, not the total count
            time.sleep(1)
            print('Pages 1-%s downloaded' % str(i - 120))
        # threaded variant: one thread per page
        # if not os.path.exists('show'):
        #     mk_dir()
        # thread = []
        # for i in range(121, 121 + page):
        #     url = "http://www.521609.com/meinvxiaohua/list%s.html" % i
        #     html = get_html(url)
        #     t = threading.Thread(target=down_image, args=(html, str(i - 120)))
        #     thread.append(t)
        # for i in thread:
        #     i.start()
        # for j in thread:
        #     j.join()
    
    
    def main():
        start_time = time.time()
        get_pages(3)
        stop_time = time.time()
        load_time = stop_time - start_time
        print(load_time)  # ~48.1 s for 3 pages in the sequential version

    if __name__ == '__main__':
        main()
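
    The commented-out block above launches one threading.Thread per page with no cap on concurrency. A minimal alternative sketch using concurrent.futures.ThreadPoolExecutor (my variant, reusing the get_html, mk_dir, and down_image helpers above) bounds the number of simultaneous downloads:

    from concurrent.futures import ThreadPoolExecutor

    def get_pages_threaded(page):
        mk_dir()  # create ./show/ once, up front
        with ThreadPoolExecutor(max_workers=4) as pool:
            for i in range(121, 121 + page):
                url = 'http://www.521609.com/meinvxiaohua/list%s.html' % i
                html = get_html(url)
                pool.submit(down_image, html, i - 120)  # each page downloads in a worker
        # leaving the with-block waits for every worker to finish
        print('All pages downloaded')

    With time.sleep(1) still inside down_image, four workers should finish roughly four times faster than the sequential loop.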
  • Original article: https://www.cnblogs.com/smart-zihan/p/9498984.html