• Multithreaded image-scraping crawler, version 3



    # -*- coding: utf-8 -*-
    """
    Created on Sat Mar 26 08:49:42 2016
    
    @author: daxiong
    """
    # start time
    
    import requests, os, bs4, time, threading
    timeBegin = time.time()
    url = 'http://xkcd.com/'              # starting url
    #urls_list=[]
    bad_urls = []
    filename = "urls.txt"
    os.makedirs('xkcd', exist_ok=True)   # store comics in ./xkcd; skip if it already exists
     
     
    #Crawl the URL of every comic page by following the "prev" links back from the front page
    def crawl_urls(url):
        urls_list = []
        while not url.endswith('#'):    # on the oldest comic, the "prev" link is just '#'
            res = requests.get(url)
            soup = bs4.BeautifulSoup(res.text, "lxml")
            prevLink = soup.select('a[rel="prev"]')[0]
            url = 'http://xkcd.com' + prevLink.get('href')
            #print("get url:", url)
            urls_list.append(url)
        return urls_list
    
    #Read the URL list back from the file, stripping trailing newlines
    def get_url_from_file(filename):
        file = open(filename, "r")
        urls_list = file.readlines()
        file.close()
    
        new_urls_list = []
        for url in urls_list:
            new_url = url.strip("\n")
            new_urls_list.append(new_url)
        
        return new_urls_list
        
         
    #Write the URL list to a file, one URL per line
    def file_urls(urls_list):
        file = open(filename, "w")
        for url in urls_list:
            file.write(url)
            file.write("\n")
        file.close()
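    
    # Usage sketch (an assumption; the original script never calls these two
    # helpers in its main flow): crawl once, persist the list to urls.txt, and
    # let later runs read the file instead of re-crawling.
    # urls_list = crawl_urls('http://xkcd.com/')
    # file_urls(urls_list)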
         
    #Download the comic image from a single page URL
    def download_image(url):
        res = requests.get(url)
        soup = bs4.BeautifulSoup(res.text, "lxml")
        comicElem = soup.select('#comic img')   # the comic image sits in <div id="comic">
        comicUrl = 'http:' + comicElem[0].get('src')
        print('Downloading image %s...' % (comicUrl))
        res = requests.get(comicUrl)
        imageFile = open(os.path.join('xkcd', os.path.basename(comicUrl)), 'wb')
        for chunk in res.iter_content(100000):
            imageFile.write(chunk)
        imageFile.close()
     
     
    #Download the images for every URL in the list
    def download_all(urls_list):
        for url in urls_list:
            try:
                download_image(url)
            except Exception:       # record failed pages instead of aborting the run
                bad_urls.append(url)
                continue
        print("well done")
    
    #Download the images for a slice [start:end) of the global URL list
    def download_range(start, end):
        for url in urls_list[start:end]:
            try:
                download_image(url)
            except Exception:
                bad_urls.append(url)
                continue
        #print("well done")
        
    #Chunk size: split the URL list into roughly 20 slices, one per thread
    def Step(urls_list):
        step = len(urls_list) / 20.0
        step = int(round(step, 0))
        return max(step, 1)         # never return 0, or range() below would fail
    
    def TimeCount():
        timeConsuming = timeEnd - timeBegin
        print("time consuming: %f seconds" % timeConsuming)
        return timeConsuming
        
        
    urls_list=get_url_from_file(filename)
    step=Step(urls_list)
    #urls_list_range1=urls_list[:step]
    
    downloadThreads = [] # a list of all the Thread objects
    for i in range(0, len(urls_list), step): # roughly 20 slices, one thread per slice
        downloadThread = threading.Thread(target=download_range, args=(i, i + step))
        downloadThreads.append(downloadThread)
        downloadThread.start()
    
    # Wait for all threads to end.
    for downloadThread in downloadThreads:
        downloadThread.join()
    print('Done.')
    
    # end time
    timeEnd = time.time()
    # elapsed time; the timing approach could be refined
    timeConsuming = TimeCount()
    
    '''
    Earlier version, kept here for reference:
    # Create and start the Thread objects.
    downloadThreads = [] # a list of all the Thread objects
    for i in range(0, 1400, 100): # loops 14 times, creates 14 threads
        downloadThread = threading.Thread(target=downloadXkcd, args=(i, i + 99))
        downloadThreads.append(downloadThread)
        downloadThread.start()
     
    # Wait for all threads to end.
    for downloadThread in downloadThreads:
        downloadThread.join()
    print('Done.')
    download_all(urls_list)
    '''
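    
    # Alternative sketch (not part of the original): the same fan-out written with
    # the standard-library ThreadPoolExecutor, which slices the work and joins the
    # workers for us. It assumes the download_image and urls_list defined above and
    # is left commented out so the script does not download everything twice.
    #
    # from concurrent.futures import ThreadPoolExecutor
    # with ThreadPoolExecutor(max_workers=20) as pool:
    #     list(pool.map(download_image, urls_list))  # list() surfaces worker errors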
    

  • Original post: https://www.cnblogs.com/webRobot/p/5322396.html