• 05_多协程爬取斗鱼高颜值美女图片-2.0版本


     1 import requests
     2 import re
     3 # from bs4 import BeautifulSoup
     4 from urllib import request
     5 # import threading
     6 import gevent
     7 from gevent import monkey
     8 
     9 monkey.patch_all()
    10 
    11 def get_html_text(url):
    12     try:
    13         hd = {'User-Agent':'Mozilla/5.0'} #添加伪装浏览器头部信息
    14         r = requests.get(url, timeout=10, headers = hd)
    15         r.raise_for_status()
    16         r.encoding = r.apparent_encoding
    17         print(len(r.text))
    18         return r.text
    19     except Exception as result:
    20         print('错误类型:', result)
    21 
    22 
    23 def html_text_parser(img_list, html):
    24 
    25 # 下面是修改的重点部分,采用了正则表达式,没有采用bs4
    26 # 技术大神可以研究一下斗鱼这个网页的源代码,真正的图片信息都存储在后面,不是传统的# # html,我还没怎么接触过前端的知识,不知怎么使用bs4,所以使用了正则表达式。
    27     
    28     img_pat = r'"rsw+":"(.*?g)"'  
    29     links = re.compile(img_pat, re.S).findall(html)
    30     print(len(links))
    31     print(links)
    32     for link in links:
    33         if link:
    34             img_list.append(link)
    35     return img_list
    36 
    37 
    38 
    39 
    40 def get_douyu_img(Img_list):
    41     for i,j in enumerate(Img_list):
    42         # name = j.split('.')[-1]
    43         try: #异常捕获,如果链接不能访问,退出当前一次循环,进入下一次循环
    44             r = request.urlopen(j)
    45             ima_content = r.read()
    46             path = str(i)
    47             with open(path, 'wb') as f:
    48                 f.write(ima_content)
    49         except:
    50             continue
    51 def main():
    52     url = 'https://www.douyu.com/g_yz'
    53     html = get_html_text(url)
    54     img_list = list()
    55     Img_list = html_text_parser(img_list, html)
    56     # print(Img_list)
    57     #t1 = threading.Thread(target=get_html_text, args=(url,))
    58     #t2 = threading.Thread(target=html_text_parser, args=(img_list,html))
    59     #t3 = threading.Thread(target=get_douyu_img, args=(Img_list,))
    60     #t1.start()
    61     #t2.start()
    62     #t3.start()
    63     gevent.joinall([
    64         gevent.spawn(get_html_text, url),
    65         gevent.spawn(html_text_parser, img_list, html),
    66         gevent.spawn(get_douyu_img, Img_list)
    67     ])
    68 
    69 
    70 if __name__ == '__main__':
    71     main()
  • 相关阅读:
    [BZOJ1015] [JSOI2008]星球大战starwar
    [BZOJ2321,LuoguP1861]星(之)器
    Google Search Operators
    Python blockchain
    CCAE词频表(转)
    python小技巧(转)
    Python著名的lib和开发框架(均为转载)
    Yarn取代job/task tracker
    hadoop 2.73‘s four xml
    HDFS NN,SNN,BN和HA
  • 原文地址:https://www.cnblogs.com/summer1019/p/10388348.html
Copyright © 2020-2023  润新知