• Team - Scraping the Douban Top 250 Movies - Development Document


    Target URL: https://movie.douban.com/top250

    Development notes:

    First of all, we chose to develop this project in the Python scripting language.
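    The script relies on two third-party packages, requests and beautifulsoup4, which we assume are installed beforehand (e.g. with pip install requests beautifulsoup4).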

    Code:

    import os
    import requests
    from bs4 import BeautifulSoup
    from requests.exceptions import RequestException


    """

    Author:

    Damon

    功能:

    爬取豆瓣网Top250电影信息保存到本地

    """


    # Target URL template; the start parameter paginates the list
    URL = "https://movie.douban.com/top250?start={}"
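    # For reference, the template expands as follows:
    #   URL.format(0)  -> https://movie.douban.com/top250?start=0   (ranks 1-25)
    #   URL.format(25) -> https://movie.douban.com/top250?start=25  (ranks 26-50)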
    # Detail-page URL of every movie, stored in crawl order
    entity_url = []


    def save_data(result):
        """
        Append one movie's scraped information to a local text file.

        :param result: dict of fields extracted by analysis_page
        :return: None
        """
        f = open('movice.txt', "a", encoding="utf8")
        f.write("========================================================================================================\n")
        f.write("Rank: " + result['top'] + "\n")
        f.write("Rating: " + result['grade'] + "\n")
        f.write("Title: " + result['name'] + "\n")
        f.write("Director: " + result['director'] + "\n")
        f.write("Screenwriter: " + result['scriptwriter'] + "\n")
        f.write("Cast: " + result['protagonist'] + "\n")
        f.write("Synopsis: " + result['synopsis'] + "\n")
        f.write("Reviews:\n")
        f.write("    " + result['film_review']['first_user'] + ": " + result['film_review']['first_discuss'] + "\n")
        f.write("    " + result['film_review']['second_user'] + ": " + result['film_review']['second_discuss'] + "\n")
        f.write("    " + result['film_review']['thirdly_user'] + ": " + result['film_review']['thirdly_discuss'] + "\n")
        f.write("    " + result['film_review']['fourthly_user'] + ": " + result['film_review']['fourthly_discuss'] + "\n")
        f.write("    " + result['film_review']['fifth_user'] + ": " + result['film_review']['fifth_discuss'] + "\n")
        f.write("URL: " + result['url'] + "\n")
        f.close()

        print("Processed: " + result['name'] + " " + result['top'])


    def analysis_page(num, url):
        """
        Parse one movie detail page and extract the fields we want.

        :param num: Top 250 rank
        :param url: movie detail-page URL
        :return: -1 if the page no longer exists, otherwise None
        """
        # Holds the overall movie information
        result = {}
        # Holds the review information
        film_review = {}
        try:
            headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                              "(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
            }
            res = requests.get(url, headers=headers)
            res.encoding = "utf-8"
        except RequestException as e:
            print("Exception while requesting the detail page:", repr(e))
            print("URL:", url)
            # [Unhandled case] the target site is unreachable; workaround for now: exit the program
            os._exit(0)

        soup = BeautifulSoup(res.text, "html.parser")

        # If this page no longer exists, record the fact and move on to the next one
        title = soup.select("title")[0].text
        if title == "页面不存在":  # Douban's "page not found" title; the Chinese literal must stay
            f = open('movice.txt', "a", encoding="utf8")
            f.write("========================================================================================================\n")
            f.write("Rank: Top" + str(num) + "\n")
            f.write("ERROR: page not found\n")
            f.write("URL: " + url + "\n")
            f.close()
            return -1

        try:
            # Rank
            result['top'] = "Top" + str(num)
            # Rating
            result['grade'] = soup.select("#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > strong")[0].text
            # Title
            result['name'] = soup.select("#content > h1")[0].text.replace(" ", "")
            # Director
            result['director'] = soup.select("#info > span > span.attrs")[0].text
            try:
                # Screenwriter
                result['scriptwriter'] = soup.select("#info > span > span.attrs")[1].text
                # Cast
                result['protagonist'] = soup.select("#info > span.actor > span.attrs")[0].text
            except IndexError:
                # Some pages list no screenwriter or cast
                result['scriptwriter'] = ""
                result['protagonist'] = ""
            try:
                # Synopsis (collapsed short version)
                result['synopsis'] = soup.select("#link-report > span.short > span")[0].text.replace(" ", "").replace("\n", "")
            except IndexError:
                # Synopsis (pages without a collapsed version)
                result['synopsis'] = soup.select("#link-report > span")[0].text.replace(" ", "").replace("\n", "")
            # The five hottest reviews: reviewer name and comment text, in order;
            # the loop builds the same first_user ... fifth_discuss keys save_data expects
            users = soup.select("#hot-comments > div > div > h3 > span.comment-info > a")
            comments = soup.select("#hot-comments > div > div > p")
            for i, ordinal in enumerate(["first", "second", "thirdly", "fourthly", "fifth"]):
                film_review[ordinal + '_user'] = users[i].text
                film_review[ordinal + '_discuss'] = comments[i].text
            # Reviews
            result['film_review'] = film_review
            # URL
            result['url'] = url
        except Exception:
            print("Problem link:", url, "------------------------------------")
            # [Unhandled case] the target site is unreachable; workaround for now: exit the program
            os._exit(0)

        # Save the data to a local txt file
        save_data(result)
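    # Caveat: the CSS selectors above are tied to Douban's page markup as it
    # looked when this was written; if Douban changes its HTML, the selectors
    # (and the "页面不存在" title check) will need to be revisited.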


    def get_entity_url(url):
        """
        Collect the detail-page URL of every movie on one listing page.

        :param url: listing-page URL
        :return: None
        """
        try:
            headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                              "(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
            }
            res = requests.get(url, headers=headers)
            res.encoding = "utf-8"
        except RequestException as e:
            print("Exception while requesting the listing page:", repr(e))
            print("URL:", url)
            # [Unhandled case] the target site is unreachable; workaround for now: exit the program
            os._exit(0)

        soup = BeautifulSoup(res.text, "html.parser")

        entity = soup.select("#content > div > div.article > ol > li > div > div.info > div.hd > a")
        for item in entity:
            entity_url.append(item['href'])
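    # Each listing page yields 25 links of the form
    # https://movie.douban.com/subject/<id>/, so ten pages cover all 250 movies.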


    def make_url(num):
        """
        Build the URL of one listing page and harvest its movie links.

        :param num: zero-based page index
        :return: None
        """
        url = URL.format(num * 25)
        get_entity_url(url)

    if __name__ == '__main__':
        # Collect the URLs of all the movies (10 pages of 25)
        for i in range(10):
            make_url(i)
        print("All movie URLs fetched successfully!")

        # Parse each URL and save the extracted data locally
        for i in range(len(entity_url)):
            state = analysis_page((i + 1), entity_url[i])

            # Skip movies whose pages no longer exist
            if state == -1:
                continue
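    One caveat: as written, the script fires roughly 260 requests at Douban back to back, which is an easy way to get rate-limited or blocked. Below is a minimal sketch of a politer main loop; the fixed one-second pause is our assumption, not something the original code does, so tune it as needed.

    import time

    for i in range(len(entity_url)):
        analysis_page(i + 1, entity_url[i])
        time.sleep(1)  # assumed courtesy delay between requests; adjust as needed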
