• 爬虫_豆瓣电影top250 (正则表达式)


    套路和之前一样,只是多线程还没有实现。

     1 import requests
     2 import re
     3 import json
     4 
     # Desktop Chrome User-Agent string, sent with every request so the
     # request identifies itself as a regular browser.
     headers = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'


     def get_one_page(url, timeout=10):
         """Download *url* and return the response body as text.

         Args:
             url: Address of the page to fetch.
             timeout: Seconds to wait for the server before giving up.

         Returns:
             The page text when the server answers with HTTP 200, otherwise
             None (non-200 status, timeout, or any other requests error).
         """
         try:
             # Pass the module-level User-Agent value, not the literal
             # string 'headers'.
             response = requests.get(
                 url,
                 headers={'User-Agent': headers},
                 timeout=timeout,
             )
         except requests.RequestException:
             # Base class of every exception raised by requests
             # (connection errors, timeouts, invalid URLs, ...).
             return None

         if response.status_code == 200:
             return response.text
         return None
    16 
    17 
    18 
    def write_txt(content, path='result.txt'):
        """Append *content* to a text file as one JSON document per line.

        Args:
            content: Any JSON-serialisable object (here: one movie dict).
            path: File to append to; created if it does not exist.
        """
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        # instead of escaping them; the file itself is written as UTF-8.
        line = json.dumps(content, ensure_ascii=False) + '\n'
        # The with-statement closes the file, so no explicit close is needed.
        with open(path, 'a', encoding='utf-8') as f:
            f.write(line)
    23 
    def parse_one_page(html):
        """Yield one dict per movie entry found in a list page.

        Args:
            html: Page source as a string, or None/empty when the download
                failed; in that case nothing is yielded.

        Yields:
            dict with the keys 'range' (text inside ``<em class="">``),
            'movie_main_page' (the href up to, but not including, its
            trailing slash), 'movie_title' and 'other_name'.
        """
        if not html:
            return

        # re.S lets '.' match newlines, so one entry may span several lines.
        pattern = re.compile('.*?<em class="">(.*?)</em>.*?href="(.*?)/">.*?"title">(.*?)</span.*?other">(.*?)</span', re.S)
        for position, link, title, other in pattern.findall(html):
            yield {
                'range': position,
                'movie_main_page': link,
                'movie_title': title,
                # Drops the first 13 characters of the stripped text --
                # presumably the leading '&nbsp;/&nbsp;' separator (13 chars);
                # verify against the live page markup.
                'other_name': other.strip()[13:],
            }
    38 
    def main():
        """Fetch the ten list pages (start=0, 25, ..., 225) and store each entry.

        Every parsed entry is printed and appended to the output file through
        write_txt. Pages that could not be downloaded are reported and skipped.
        """
        for start in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start=' + str(start)
            html = get_one_page(url)
            if html is None:
                # get_one_page returns None on any failure; skip this page
                # rather than handing None to the parser.
                print('skipped (download failed): ' + url)
                continue
            for item in parse_one_page(html):
                print(item)
                write_txt(item)


    # Run the crawler only when executed as a script, not when imported.
    if __name__ == '__main__':
        main()

     运行结果

  • 相关阅读:
    TensorFlow学习笔记--CIFAR-10 图像识别
    第二章--第二节:注释
    webbrowser
    RichViewEdit
    RichEdit
    TreeView
    RichView
    ListView
    DesktopLoader服务程序
    Delphi实现程序只运行一次并激活已打开的程序
  • 原文地址:https://www.cnblogs.com/MC-Curry/p/9327550.html
Copyright © 2020-2023  润新知