• A flash of Joy
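
A small asynchronous crawler built on Tornado coroutines. A reusable AsySpider base class drives a pool of concurrent workers off a tornado.queues.Queue; the MySpider subclass overrides two hooks, fetch() to attach the portal's login cookie and handle_html() to write each downloaded photo to disk. The URL list enumerates student-photo owner IDs on the SXU portal ('2014' + department code + zero-padded serial number).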


    import re
    from datetime import timedelta
    from tornado import httpclient, gen, ioloop, queues
    
    # department code -> number of students; main() enumerates photo owner IDs from it
    peoples = {'011': 71, '012': 66, '013': 54, '014': 50, '015': 66, '041': 61,
               '042': 103, '044': 31, '061': 32, '062': 41, '063': 33, '073': 93, '074': 50, '077': 108, '081': 55,
               '083': 55, '084': 92, '102': 56, '105': 29, '106': 27,
               '107': 25, '108': 25, '141': 50, '143': 66, '144': 68, '161': 52, '162': 50, '163': 50, '164': 52, '167': 50,
               '181': 133, '201': 166, '202': 10, '203': 8, '204': 99, '211': 18,
               '212': 50, '213': 24, '214': 19, '215': 25, '216': 24, '217': 24, '221': 67, '222': 52, '224': 67,
               '261': 67, '271': 8, '274': 31, '291': 82, '292': 62, '296': 8, '312': 104, '341': 52, '316': 52, '331': 47,
               '332': 56, '333': 72, '335': 57, '351': 36, '352': 50, '371': 120, '372': 50,
               '373': 56}
    
    
    class AsySpider(object):
        def __init__(self, urls, concurrency=10, results=None, **kwargs):
            urls.reverse()
            self.urls = urls
            self.concurrency = concurrency
            self._q = queues.Queue()
            self._fetching = set()
            self._fetched = set()
            self.results = results if results is not None else []
    
        def fetch(self, url, **kwargs):
            # raise_error=False: failures come back as responses (e.g. code 599) instead of raising
            return httpclient.AsyncHTTPClient().fetch(url, raise_error=False, **kwargs)
    
        def handle_html(self, url, html):
            """handle html page"""
            print(url)
    
        def handle_response(self, url, response):
            """Inherit and override this method if necessary."""
            if response is None or response.code == 599:  # fetch failed or timed out: retry
                self._fetching.remove(url)
                self._q.put(url)
            elif response.code == 200:
                self.handle_html(url, response.body)
    
        @gen.coroutine
        def get_page(self, url):
            try:
                response = yield self.fetch(url)
                # print('######fetched %s' % url)
            except Exception as e:
                print('Exception: %s %s' % (e, url))
                raise gen.Return(None)  # signal failure so handle_response can retry
            raise gen.Return(response)
    
        @gen.coroutine
        def _run(self):
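            """Seed the queue, start `concurrency` workers, and wait for the queue to drain."""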
            @gen.coroutine
            def fetch_url():
                current_url = yield self._q.get()
                try:
                    if current_url in self._fetching:
                        return
    
                    # print('fetching****** %s' % current_url)
                    self._fetching.add(current_url)
    
                    response = yield self.get_page(current_url)
                    self.handle_response(current_url, response)  # handle response

                    self._fetched.add(current_url)

                    # top up the queue with more pending URLs
                    for i in range(self.concurrency):
                        if self.urls:
                            yield self._q.put(self.urls.pop())
    
                finally:
                    self._q.task_done()
    
            @gen.coroutine
            def worker():
                while True:
                    yield fetch_url()
    
            self._q.put(self.urls.pop())  # add first url
    
            # Start workers, then wait for the work queue to be empty.
            for _ in range(self.concurrency):
                worker()
    
            yield self._q.join(timeout=timedelta(seconds=300000))
            try:
                assert self._fetching == self._fetched
            except AssertionError:
                print(self._fetching - self._fetched)
                print(self._fetched - self._fetching)
    
        def run(self):
            io_loop = ioloop.IOLoop.current()
            io_loop.run_sync(self._run)
    
    
    class MySpider(AsySpider):
        def fetch(self, url, **kwargs):
            """Override the parent class's fetch method to add auth headers."""
            # Session cookie copied from a logged-in browser session; replace with your own.
            cookies_str = ('JSESSIONID=0000n4jBi_dKg91XbtHHQHDeeDL:1b4e17j2v; iPlanetDire'
                           'ctoryPro=AQIC5wM2LY4Sfcxu%'
                           '2FWPIJWGHttZPiXafd%2B1gowyEoxTmyiY%3D%40AAJTSQACMDE%3D%23')
            headers = {
                'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
                'cookie': cookies_str
            }
            return super(MySpider, self).fetch(url, headers=headers, **kwargs)
    
        def handle_html(self, url, html):
            # The owner ID is everything after 'ownerId=' in the URL; use it as the filename.
            filename = re.findall('userPhoto&ownerId=(.*)$', url)[0]
            # Note: change save_dir to wherever you want the photos stored, e.g. C:/picture/
            save_dir = '/home/innovation/文档/pic/'
            with open(save_dir + filename + '.jpg', 'wb') as f:
                f.write(html)
    
    
    def main():
        urls = []
        url_pic = 'http://myportal.sxu.edu.cn/attachmentDownload.portal?notUseCache=true&type=userPhoto&ownerId='
        for academy in peoples:
            # Owner IDs are '2014' + department code + zero-padded serial number (001, 002, ...)
            for i in range(1, peoples[academy] + 1):
                urls.append(url_pic + '2014' + academy + str(i).zfill(3))
        s = MySpider(urls)
        s.run()
    
    
    if __name__ == '__main__':
        main()
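
For reference, the same queue-plus-workers pattern reads more simply in the native async/await style of Tornado 5+. This is a minimal sketch modeled on Tornado's own webspider demo, not part of the original post; crawl is a hypothetical name and the bare print is a placeholder for real response handling.

    import asyncio
    from tornado import gen, httpclient, queues

    async def crawl(urls, concurrency=10):
        q = queues.Queue()
        for url in urls:
            q.put_nowait(url)

        async def worker():
            async for url in q:
                if url is None:
                    return  # sentinel received: shut this worker down
                try:
                    resp = await httpclient.AsyncHTTPClient().fetch(url, raise_error=False)
                    if resp.code == 200:
                        print(url)  # placeholder: process resp.body here
                finally:
                    q.task_done()

        workers = gen.multi([worker() for _ in range(concurrency)])
        await q.join()  # wait until every queued URL has been processed
        for _ in range(concurrency):
            await q.put(None)  # one sentinel per worker
        await workers

    # usage: asyncio.run(crawl(['http://example.com/']))

The decorated gen.coroutine style used in the post targets Tornado 4.x; the native style above requires Python 3.5+ and Tornado 5 or later.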
  • Original post: https://www.cnblogs.com/INnoVationv2/p/6215399.html