• requests和正则表达式爬取猫眼电影Top100练习


     1 import requests
     2 import re
     3 from multiprocessing import Pool
     4 from requests.exceptions import RequestException
     5 import json
     6 import time
     7 
     8 
     9 # 抓取单页内容
    def get_one_page(url, timeout=10):
        """Download one page and return its HTML text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests.get`` may block indefinitely on a
                stalled connection.

        Returns:
            The response body as text when the server answers with HTTP 200,
            otherwise ``None`` (non-200 status or any ``RequestException``,
            including a timeout).
        """
        # A browser-like User-Agent is sent with every request.
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                                 "Chrome/85.0.4183.121 Safari/537.36"}
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except RequestException:
            return None
        if response.status_code != 200:
            return None
        return response.text
    21 
    22 
    23 # 解析单页内容
    # Compiled once at import time. Raw strings keep ``\d`` from being treated
    # as an (invalid) string escape. re.S lets ``.`` span line breaks, since a
    # single <dd> entry covers several lines of markup.
    _MOVIE_PATTERN = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>'
        r'.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>'
        r'.*?</dd>',
        re.S,
    )


    def parser_one_page(html):
        """Yield one dict per movie entry found in a board page.

        Args:
            html: Page markup as a string. ``None`` or an empty string (for
                example after a failed download) produces no items instead
                of raising.

        Yields:
            Dicts with the string keys ``index``, ``image``, ``name``,
            ``actor``, ``time`` and ``score``; all values are strings.
        """
        if not html:
            return
        for match in _MOVIE_PATTERN.finditer(html):
            index, image, name, star, release, integer, fraction = match.groups()
            yield {
                'index': index,
                'image': image,
                'name': name.strip(),
                # Drops the first three characters of the trimmed text --
                # presumably a label such as "主演：" (confirm against live markup).
                'actor': star.strip()[3:],
                # Drops the first five characters -- presumably a label such
                # as "上映时间：" (confirm against live markup).
                'time': release[5:],
                # The score is split into integer and fraction parts in the markup.
                'score': integer + fraction
            }
    38 
    39 
    40 # 将单页内容写入文件
    def write_to_file(content, path='猫眼电影.txt'):
        """Append one record to a text file as a single JSON line.

        Args:
            content: JSON-serialisable object to store.
            path: Destination file; defaults to the output file this script
                has always used. The file is opened in append mode with UTF-8
                encoding.
        """
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX
        # escapes. The with-statement closes the file, so no explicit close().
        with open(path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    45 
    46 
    def main(offset):
        """Fetch one board page, parse it and append every movie to the output file.

        Args:
            offset: Value placed in the ``offset`` query parameter of the
                board URL.
        """
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        # get_one_page returns None on a non-200 response or a request error.
        # Skip the page rather than crash inside the parser.
        if html is None:
            print('skipping offset %s: page could not be fetched' % offset)
            return
        for item in parser_one_page(html):
            write_to_file(item)
    52 
    if __name__ == "__main__":
        # Timing comparison: the same ten pages are scraped twice, first
        # sequentially and then through a process pool. Both passes append to
        # the same output file, so every record ends up in it twice.
        time1 = time.time()
        for i in range(0, 100, 10):
            main(i)
        time2 = time.time()
        # The context manager shuts the worker processes down once the mapping
        # is finished, instead of leaving the pool open.
        with Pool() as pool:
            pool.map(main, [i * 10 for i in range(0, 10)])
            time3 = time.time()
        print(time2 - time1)    # elapsed time of the sequential for-loop
        print(time3 - time2)    # elapsed time of the multiprocessing pool

    运行时间如下:

     补充对yield用法的理解:

    相关博客文章:https://blog.csdn.net/qq_33472765/article/details/80839417

  • 相关阅读:
    flex产生水平滚动条
    js中的类
    typescript
    vue练习
    vue-cli2脚手架搭建
    Luogu P1970 花匠
    Luogu P1311 选择客栈
    Luogu P1016 旅行家的预算
    Luogu P1144 最短路计数
    Luogu P1091 合唱队形
  • 原文地址:https://www.cnblogs.com/chang2021/p/13757447.html
Copyright © 2020-2023  润新知