• 使用python网络库下载


    下载1000次网页资源

    1,普通循环方式下载1000次,非常慢

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import sys
    import os
    import time
    import urllib
    import urllib2
    
    total_times = 1000
    
    def worker(url):
        """Download *url* once.

        Returns 1 on success, 0 on any failure (the exception info is
        printed so the benchmark keeps running past transient errors).
        """
        try:
            # 10800s (3h) timeout kept from the original benchmark setup.
            f = urllib2.urlopen(url, timeout=10800)
            try:
                f.read()
            finally:
                # Close explicitly so 1000 iterations don't exhaust sockets.
                f.close()
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate and the loop can be aborted with Ctrl-C.
            print(sys.exc_info())
            return 0
        return 1
    
    if __name__ == "__main__":
        # Serial baseline: fetch the same resource total_times times, one
        # after another, so each request waits for the previous download.
        target = "http://web.kuaipan.cn/static/images/pc.png"
        for _ in range(total_times):
            worker(target)
    
    #root:~/test # time ./c.py
    #real    4m6.700s
    #user    0m1.192s
    #sys     0m1.736s

    2,使用进程池下载,有点慢

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import sys
    import os
    import time
    import urllib
    import urllib2
    import multiprocessing
    
    total_times = 1000
    
    def worker(url):
        """Download *url* once.

        Returns 1 on success, 0 on any failure (the exception info is
        printed so the pool keeps processing past transient errors).
        """
        try:
            # 10800s (3h) timeout kept from the original benchmark setup.
            f = urllib2.urlopen(url, timeout=10800)
            try:
                f.read()
            finally:
                # Close explicitly so 1000 iterations don't exhaust sockets.
                f.close()
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate instead of being silently swallowed.
            print(sys.exc_info())
            return 0
        return 1
    
    if __name__ == "__main__":
        # 2x CPU count is the usual heuristic for I/O-bound process pools.
        workers = multiprocessing.cpu_count() * 2
        pool = multiprocessing.Pool(processes=workers)

        target = "http://web.kuaipan.cn/static/images/pc.png"
        for _ in range(total_times):
            pool.apply_async(worker, (target,))

        # No more submissions; block until every download has finished.
        pool.close()
        pool.join()
    
    #root:~/test # time ./pc.py
    #real    1m43.668s
    #user    0m1.480s
    #sys     0m1.628s

    3,使用twisted网络库,同样发起1000次请求,耗时减少为15s左右,性能提升很多,很快

    #!/usr/bin/python
    
    from sys import argv
    from pprint import pformat
    
    #from twisted.internet.task import react
    from twisted.internet import reactor
    from twisted.web.client import Agent, readBody
    from twisted.web.http_headers import Headers
    
    total_times = 1000
    times = 0
    
    def cbRequest(response):
        """Consume the response body.

        Returns a Deferred that fires (via cbBody) once the body has been
        fully read, keeping the connection lifecycle correct.
        """
        body_d = readBody(response)
        body_d.addCallback(cbBody)
        return body_d
    
    def cbBody(body):
        """Sink for a downloaded body; the content itself is discarded."""
        _ = body
    
    def cbShutdown(ignored):
        """Count one completed request; stop the reactor after the last.

        Runs on both success and failure (added via addBoth), so the
        counter always reaches total_times.
        """
        global times
        times += 1
        if times >= total_times:
            reactor.stop()
    
    def curl(url):
        """Issue one async GET for *url*.

        Success feeds cbRequest (which drains the body); completion or
        failure always reaches cbShutdown so the request is counted.
        """
        client = Agent(reactor)
        request_d = client.request(
            'GET',
            url,
            Headers({'User-Agent': ['Twisted Web Client Example']}),
            None)
        request_d.addCallback(cbRequest)
        request_d.addBoth(cbShutdown)
        return request_d
    
    if __name__ == '__main__':
        # Queue every request up front; the reactor then drives all of
        # them concurrently and cbShutdown stops it after the last one.
        for _ in range(total_times):
            curl("http://web.kuaipan.cn/static/images/pc.png")

        reactor.run()
    
    #root:~/test # time ./tc.py
    #real    0m15.480s
    #user    0m3.596s
    #sys     0m0.720s

    4,使用twisted网络库长连接,耗时也是很少,很快

    #!/usr/bin/python
    
    from sys import argv
    from pprint import pformat
    
    #from twisted.internet.task import react
    from twisted.internet import reactor
    from twisted.web.http_headers import Headers
    
    from twisted.internet import reactor
    from twisted.internet.defer import Deferred, DeferredList
    from twisted.internet.protocol import Protocol
    from twisted.web.client import Agent, HTTPConnectionPool
    
    total_times = 1000
    times = 0
    
    class IgnoreBody(Protocol):
        """Body-delivery protocol that discards all data.

        Fires the supplied deferred with None once the server closes the
        connection, signalling that the body has been fully consumed.
        """

        def __init__(self, deferred):
            self.deferred = deferred

        def dataReceived(self, data):
            """Drop every chunk -- the content is irrelevant here."""

        def connectionLost(self, reason):
            """Signal completion regardless of why the connection ended."""
            self.deferred.callback(None)
    
    
    def cbRequest(response):
        """Drain the response body without keeping it.

        Returns a Deferred that fires once the body is fully consumed,
        which lets the connection go back to the pool for reuse.
        """
        done = Deferred()
        response.deliverBody(IgnoreBody(done))
        return done
    
    # Shared connection pool + agent: requests issued through `agent` can
    # reuse TCP connections (HTTP keep-alive) instead of reconnecting.
    pool = HTTPConnectionPool(reactor)
    agent = Agent(reactor, pool=pool)
    
    def requestGet(url):
        """GET *url* through the shared pooled agent and drain the body."""
        req = agent.request('GET', url)
        req.addCallback(cbRequest)
        return req
    
    def cbShutdown(ignored):
        """Count one completed request; stop the reactor after the last.

        Attached via addBoth, so both successes and failures are counted.
        """
        global times
        times += 1
        if times >= total_times:
            reactor.stop()
    
    def curl(url):
        """Issue one async GET for *url* through the shared pooled agent.

        Bug fix: the original built a fresh `Agent(reactor)` per call,
        bypassing the module-level HTTPConnectionPool and defeating the
        keep-alive behaviour this script is meant to demonstrate. Using
        the pooled `agent` lets connections be reused across requests.
        """
        d = agent.request(
            'GET', url,
            Headers({'User-Agent': ['Twisted Web Client Example']}),
            None)
        d.addCallback(cbRequest)
        d.addBoth(cbShutdown)
        return d
    
    # Fire off all requests up front, then let the reactor run them to
    # completion; cbShutdown stops it once every request has finished.
    for _ in range(total_times):
        curl("http://web.kuaipan.cn/static/images/pc.png")

    reactor.run()
    
    #root:~/test # time ./tpc.py
    #real    0m12.817s
    #user    0m3.508s
    #sys     0m0.528s

    更多twisted参考:https://twistedmatrix.com/documents/current/web/howto/client.html#auto4

    golang使用循环下载方式,和python使用循环下载方式耗时差不多,4分钟时间,瓶颈应该在网络

    package main
    
    import (
        "fmt"
        "net/http"
        "io/ioutil"
    )
    
    var totaltimes = 1000
    
    // worker downloads url once and prints the byte count of the body.
    // A failed GET is silently skipped (best-effort benchmark behaviour).
    func worker(url string) {
        resp, err := http.Get(url)
        if err != nil {
            return
        }
        defer resp.Body.Close()
        payload, _ := ioutil.ReadAll(resp.Body)
        fmt.Println(len(payload))
    }
    
    // main is the serial baseline: totaltimes sequential downloads.
    func main() {
        const target = "http://web.kuaipan.cn/static/images/pc.png"
        for n := 0; n < totaltimes; n++ {
            worker(target)
        }
    }
    
    //root:~/test # time ./got > goresult
    //
    //real    4m45.257s
    //user    0m0.628s
    //sys     0m0.632s

    golang使用协程池方式模拟下载1000次,性能也要差很多(而且容易出现网络错误,最近出的go version go1.2rc4 linux/amd64要好一点,go1.1问题很多)

    package main
    
    import (
        "fmt"
        "net/http"
        "io/ioutil"
        "sync"
    )
    
    var totaltimes = 1000
    var poolsize = 250
    
    // worker drains URLs from linkChan, downloading each one and printing
    // the body length. It decrements wg when the channel is closed and
    // fully drained.
    //
    // Fixes vs. original:
    //   - `defer response.Body.Close()` sat inside the range loop, so every
    //     body stayed open until the goroutine exited (fd leak under load);
    //     the body is now closed at the end of each iteration.
    //   - a single failed GET used `return`, permanently killing this
    //     worker; with enough errors all workers die and main deadlocks on
    //     the unbuffered channel send. We `continue` instead.
    func worker(linkChan chan string, wg *sync.WaitGroup) {
        // Decreasing internal counter for wait-group as soon as goroutine finishes
        defer wg.Done()

        for url := range linkChan {
            response, err := http.Get(url)
            if err != nil {
                continue
            }
            body, _ := ioutil.ReadAll(response.Body)
            response.Body.Close()
            fmt.Println(len(body))
        }
    }
    
    // main fans totaltimes download jobs out over a pool of poolsize
    // goroutines, then waits for all of them to finish.
    func main() {
        jobs := make(chan string)
        wg := new(sync.WaitGroup)

        // Spin up the worker pool.
        for w := 0; w < poolsize; w++ {
            wg.Add(1)
            go worker(jobs, wg)
        }

        // Feed every job, then close so workers drain and exit their loops.
        for n := 0; n < totaltimes; n++ {
            jobs <- "http://web.kuaipan.cn/static/images/pc.png"
        }
        close(jobs)

        // Waiting for all goroutines to finish (otherwise they die as main routine dies)
        wg.Wait()
    }
    
    //root:~/test # time ./gotest > goresult
    //
    //real    0m25.250s
    //user    0m0.772s
    //sys     0m0.380s

    twisted支持定时器,我们可以用来动态添加任务

    from twisted.web.client import getPage
    from twisted.internet import reactor
    
    class Getter(object):
    
        def __init__(self):
            self._sequence = 0
            self._results = []
            self._errors = []
    
        def add(self, url):
            d = getPage(url)
            d.addCallbacks(self._on_success, self._on_error)
            d.addCallback(self._on_finish)
            self._sequence += 1
    
        def _on_finish(self, *narg):
            self._sequence -= 1
            print len(self._results), len(self._errors)
         #   if not self._sequence:
         #       reactor.stop()
    
        _on_success = lambda self, *res: self._results.append(res)
        _on_error = lambda self, *err: self._errors.append(err)
    
        def run(self):
            reactor.run()
            return self._results, self._errors
    
    def jobtimer():
        """Enqueue three page fetches, then reschedule itself in 1 second."""
        urls = ('http://www.google.com', 'http://www.yahoo.com', 'http://www.baidu.com')
        for target in urls:
            g.add(target)
        reactor.callLater(1, jobtimer)
    
    # Schedule the first batch 2s after startup; jobtimer then reschedules
    # itself, so tasks keep being added while the reactor runs. `g` is bound
    # below, before the reactor ever fires the timer.
    reactor.callLater(2,jobtimer)  # timer-driven task injection
    g = Getter()
    results, errors = g.run()
    
    #print len(results)
    #print len(errors)
  • 相关阅读:
    angular.js 头部默认值,不使用json提交数据
    D1-FFmpeg拼接视频
    B23-Carthage的使用
    B22-SVN在iOS开发的使用中遇到的问题
    C4-Cordova在iOS平台的使用
    C2-PhoneGap的环境搭建及iOS项目创建
    C0-PhoneGap之移动开发策略的选择(翻译)
    C1-PhoneGap和Cordova的关系和认识
    B21-iOS 开发的一些tips(下)
    B17-禅与 Objective-C 编程艺术的阅读笔记
  • 原文地址:https://www.cnblogs.com/ciaos/p/3423476.html
Copyright © 2020-2023  润新知