• Requests+正则表达式 爬取猫眼电影


    代码:

    import re
    import json
    from multiprocessing import Pool
    import requests
    from requests.exceptions import RequestException
    
    
    # URL template for the board pages; %d receives the numeric offset.
    basic_url = 'http://maoyan.com/board/4?offset=%d'

    # Request headers sent with every download (a desktop browser User-Agent).
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/59.0.3071.109 Safari/537.36'
        ),
    }

    # Output file, opened in append mode so existing records are kept.
    file = open("maoyan_movies.txt", mode='a', encoding="utf-8")
    
    
    def get_page(url):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.

        Returns:
            The response text when the server answers with HTTP 200,
            otherwise None (non-200 status, connection error, or timeout).
        """
        try:
            # Without a timeout a stalled connection would block the calling
            # worker forever; a timeout raises a RequestException subclass.
            response = requests.get(url, headers=headers, timeout=10)
        except RequestException:
            return None
        if response.status_code == requests.codes.ok:
            return response.text
        return None
    
    
    def parse_page(content):
        """Extract movie records from the HTML of one board page.

        Args:
            content: HTML text of the page. None or an empty string (for
                example after a failed download) yields no records.

        Yields:
            dict: One record per matched ``<dd>`` entry with the string
            fields ``id``, ``image``, ``name``, ``actor``, ``releasetime``
            and ``score``.
        """
        # Nothing to parse; avoid handing None to the regex engine, which
        # would raise TypeError.
        if not content:
            return
        pattern = re.compile(
            r'<dd>.*?board-index.*?>(\d+)</i>'  # rank: digits, not a literal "d"
            r'.*?<img data-src="(.*?)"'
            r'.*?class="name"><a.*?>(.*?)</a>'
            r'.*?class="star">(.*?)</p>'
            r'.*?class="releasetime">(.*?)</p>'
            r'.*?class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i>'
            r'.*?</dd>',
            re.S)  # re.S lets "." cross the newlines inside each <dd> entry
        for item in pattern.findall(content):
            yield {
                'id': item[0],
                'image': item[1],
                'name': item[2].strip(),
                # Drops the first 3 characters after stripping; presumably the
                # leading label of the cast line -- verify against the page.
                'actor': item[3].strip()[3:],
                # Drops the first 5 characters; presumably the leading label
                # of the release-time line -- verify against the page.
                'releasetime': item[4][5:],
                # Integer part and fraction part are concatenated as text.
                'score': item[5] + item[6],
            }
    
    
    def save_to_file(content):
        """Append ``content`` to the output file as one JSON document per line.

        Args:
            content: JSON-serializable object to write.
        """
        # ensure_ascii=False writes non-ASCII characters as-is instead of
        # \uXXXX escapes.
        json.dump(content, file, ensure_ascii=False)
        # Terminate the record so each JSON document sits on its own line.
        file.write('\n')
    
    
    def get_page_movies(offset):
        """Fetch one board page and save every movie found on it.

        Args:
            offset: Page index; it is multiplied by the page size (10) to
                build the ``offset`` value substituted into the page URL.
        """
        step = 10
        url = basic_url % (step * offset)
        html = get_page(url)
        if html is None:
            # The download failed (get_page returns None on any error);
            # skip this page instead of crashing inside the parser.
            return
        for movie_info in parse_page(html):
            save_to_file(movie_info)
    
    
    def get_top_100_movies():
        """Crawl the Maoyan top-100 board with a pool of worker processes.

        The saved fields are: rank, image URL, movie name, leading actors,
        release date and score. Page indexes 0 through 9 are distributed
        over 4 worker processes, each handled by ``get_page_movies``.
        """
        page_indexes = list(range(10))
        workers = Pool(processes=4)
        # map() blocks until every page index has been processed.
        workers.map(get_page_movies, page_indexes)
        workers.close()
        workers.join()
    
    
    # Script entry point: start the crawl only when this file is executed
    # directly, not when it is imported as a module.
    if __name__ == "__main__":
        get_top_100_movies()
  • 相关阅读:
    [转]maven for eclipse在线安装 eclipsesr2
    js循环绑定事件解决方案
    设置 Eclipse/ 快速提示快捷键
    [转]POI 读取 Excel 转 HTML 支持 03xls 和 07xlsx 版本 包含样式
    解决子元素和父元素同时触发onclick
    【Tomcat】本地域名访问配置
    [ELK]快速搭建简单的日志分析平台
    Git 使用心得
    无光驱U盘启动WinPE安装操作系统的方法
    WMI调用发生 InitializationFailure 错误的解决过程
  • 原文地址:https://www.cnblogs.com/hupeng1234/p/7112497.html
Copyright © 2020-2023  润新知