• (python3爬虫实战-第一篇)利用requests+正则抓取猫眼电影热映口碑榜

    # author: "xian"
    # date: 2018/5/2
    import json  # JSON serialisation for the result file
    import re  # regular expressions used to parse the page

    import requests  # HTTP client
    from requests.exceptions import RequestException  # base class of requests errors (see the requests docs)
    def get_one_page(url):
        """Download a single page and return its HTML text.

        Args:
            url: Address of the page to fetch.

        Returns:
            The response body as text when the server answers with HTTP 200;
            ``None`` for any other status code or when the request fails with
            a ``RequestException`` (connection error, timeout, ...).
        """
        # The original author notes that maoyan requires request headers, so
        # a desktop-browser User-Agent is sent with the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        }
        try:
            # Without a timeout a stalled connection would block forever; a
            # timeout raises a RequestException subclass, handled just below.
            response = requests.get(url, headers=headers, timeout=10)
        except RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None


    # One match per <dd> entry. Capture groups, in order: rank, poster URL,
    # title, cast line, release line, integer part and fractional part of the
    # score. re.S lets '.' span the newlines inside an entry. Raw strings keep
    # the backslash of \d intact.
    _BOARD_ITEM_PATTERN = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)"'
        r'.*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(\d+)</i>.*?</dd>',
        re.S,
    )


    def parse_one_page(html):
        """Yield one dict per movie entry found in a board page.

        Args:
            html: Page source as a string. ``None`` or an empty string (for
                example after a failed download) yields nothing.

        Yields:
            Dicts with the string-valued keys ``index``, ``image``, ``title``,
            ``authors``, ``time`` and ``rating``.
        """
        if not html:
            return
        for index, image, title, star, release, integer, fraction in \
                _BOARD_ITEM_PATTERN.findall(html):
            yield {
                'index': index,
                'image': image,
                'title': title,
                # Drops a 3-character prefix -- presumably the cast label in
                # front of the names; verify against the live markup.
                'authors': star.strip()[3:],
                # Drops a 5-character prefix -- presumably the release-time
                # label; verify against the live markup.
                'time': release.strip()[5:],
                # The score is split over two tags, e.g. "9." and "5".
                'rating': integer + fraction,
            }
     def write_to_file(content, path='result.txt'):
         """Append ``content`` to a text file as one JSON document per line.

         Args:
             content: JSON-serialisable object (the scraper passes one dict
                 per movie).
             path: File to append to; defaults to ``result.txt`` in the
                 current working directory.
         """
         # encoding='utf8' together with ensure_ascii=False stores non-ASCII
         # text readably instead of as \uXXXX escapes.
         with open(path, 'a', encoding='utf8') as f:
             # The trailing newline keeps every record on its own line; the
             # with-block closes the file, so no explicit close() is needed.
             f.write(json.dumps(content, ensure_ascii=False) + '\n')
     def main():
         """Fetch the maoyan board page, then print and store every parsed entry."""
         url = 'http://maoyan.com/board'
         html = get_one_page(url)
         if html is None:
             # get_one_page returns None on a non-200 status or a request
             # error; stop here rather than hand None to the parser.
             print('Failed to download', url)
             return
         for item in parse_one_page(html):
             print(item)
             write_to_file(item)


     if __name__ == '__main__':
         main()


  • 相关阅读:
    [LeetCode] Power of Three 判断3的次方数
    [LeetCode] 322. Coin Change 硬币找零
    [LeetCode] 321. Create Maximum Number 创建最大数
    ITK 3.20.1 VS2010 Configuration 配置
    VTK 5.10.1 VS2010 Configuration 配置
    FLTK 1.3.3 MinGW 4.9.1 Configuration 配置
    FLTK 1.1.10 VS2010 Configuration 配置
    Inheritance, Association, Aggregation, and Composition 类的继承,关联,聚合和组合的区别
    [LeetCode] Bulb Switcher 灯泡开关
    [LeetCode] Maximum Product of Word Lengths 单词长度的最大积
  • 原文地址:https://www.cnblogs.com/518894-lu/p/8982424.html
Copyright © 2020-2023  润新知