• 爬取猫眼电影排行榜


    # 导入我们需要的模块
    import re
    import requests
    
    # 1. Fetch the page content
    # (1) Target URL: the Maoyan board (ranking) page to scrape
    base_url = "http://maoyan.com/board"

    # (2) Send a browser-like User-Agent header so the request looks like it
    #     comes from a regular browser
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}

    # (3) Issue the request. requests has no default timeout, so without one
    #     a stalled server would block the script forever.
    response = requests.get(base_url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of saving and parsing an error page.
    response.raise_for_status()

    # (4) Decoded body of the response
    html = response.text

    # (5) Keep a local copy of the downloaded page
    with open("maoyan.html", 'w', encoding='utf-8') as f:
        f.write(html)
    
    # 2. Extract the data
    # (1) Narrow the search: every movie entry is one <dd>...</dd> element.
    #     re.S lets '.' span the newlines inside an entry.
    movie_pattern = re.compile(r'<dd>.*?</dd>', re.S)
    movie_list = movie_pattern.findall(html)

    # Per-field patterns, compiled once instead of on every loop iteration.
    # The \d, \w/\W and \. escapes are required: without the backslashes the
    # patterns look for the literal letters "d", "w", "W" and never match.
    index_pattern = re.compile(r'<i class="board-index board-index-\d*">(\d+)</i>')
    title_pattern = re.compile(r'title="(.*?)"')
    # The capture stops at '@', dropping whatever follows it in the attribute.
    img_pattern = re.compile(r'<img data-src="(.*?)@')
    # [\w\W] matches any character including newlines without needing re.S.
    star_pattern = re.compile(r'<p class="star">([\w\W]*?)</p>')
    release_pattern = re.compile(r'<p class="releasetime">(.*?)</p>')
    # Two groups: integer part including the dot, then the fraction digit.
    # No trailing space after </p>, so the match does not depend on page
    # whitespace.
    score_pattern = re.compile(
        r'<p class="score"><i class="integer">(\d+\.)</i><i class="fraction">(\d)</i></p>'
    )

    # (2) Pull the fields out of each movie entry, print them, and append one
    #     record per movie to maoyan.txt. The file is opened once for the
    #     whole loop rather than reopened for every movie.
    with open("maoyan.txt", 'a', encoding='utf-8') as f:
        for movie in movie_list:
            # Rank
            index = '排名:' + index_pattern.findall(movie)[0]
            print(index)

            # Movie title (first title="..." attribute in the entry)
            title = '电影名称:' + title_pattern.findall(movie)[0]
            print(title)

            # Poster image URL
            img = '图片:' + img_pattern.findall(movie)[0]
            print(img)

            # Starring text; strip the surrounding whitespace/newlines
            star = star_pattern.findall(movie)[0].strip()
            print(star)

            # Release time text
            releaseTime = release_pattern.findall(movie)[0]
            print(releaseTime)

            # Score: join the integer and fraction groups of the first match
            score_parts = score_pattern.findall(movie)
            score = '评分:' + score_parts[0][0] + score_parts[0][1]
            print(score)

            # One field per line, with a blank line between records
            result = '\n'.join([index, title, img, star, releaseTime, score]) + '\n\n'

            # Append the record to the output file
            f.write(result)
  • 相关阅读:
    C++ 编写strcpy函数
    JavaScript抽象类及Class.create备忘
    读:<测试一下你解决问题的逻辑思维及算法能力>后
    JavaScript AJAX类
    MOSS ad组的获取及Hashtable作缓存总结
    Js获取元素位置及动态生成元素的练习备忘
    NET许可证及License
    Javascript获取元素位置及其它
    hdu 1498 50 years, 50 colors 最大匹配
    poj 2513 Colored Sticks 字典树
  • 原文地址:https://www.cnblogs.com/zhangboblogs/p/10108455.html
Copyright © 2020-2023  润新知