• 大二下学期团队项目(豆瓣爬取)


    此前已经能够爬取豆瓣电影所需的全部信息,今天主要对爬取豆瓣的代码做了修改和优化,主要包括以下两部分代码:

    def insert_data(data_beans, headers, cursor, conn):
        """Fetch each movie's detail page and insert one row per movie into ``moviebean``.

        Args:
            data_beans: Iterable of listing pages; each page is an iterable of
                movie dicts providing the keys "rate", "directors", "title",
                "cover", "casts" and "url".
            headers: HTTP headers passed to ``requests.get`` for every detail page.
            cursor: Cursor on which the INSERT statements are executed.
            conn: Connection whose ``commit()`` is called once, after every
                movie in ``data_beans`` has been inserted.

        Returns:
            None.

        Any ``Exception`` raised while processing (a missing key, a failed
        request, an element that is absent from the detail page, a database
        error) has its traceback printed and ends the call before ``commit()``
        is reached. ``KeyboardInterrupt`` and ``SystemExit`` are deliberately
        not caught, so the crawl can still be stopped by the user.
        """
        sql = "insert into moviebean values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # Country/region and language are extracted from the raw HTML with
        # these patterns rather than through BeautifulSoup lookups.
        area_pattern = ' <span class="pl">制片国家/地区:</span> (.*?)<br/>'
        language_pattern = '<span class="pl">语言:</span> (.*?)<br/>'
        try:
            for data_bean in data_beans:
                # One listing page (20 movie records, per the original author's note).
                for movie in data_bean:
                    # Fields available directly in the listing JSON.
                    score = movie["rate"].replace(" ", "")
                    # Every name is prefixed with a space, so the stored value
                    # keeps its leading space exactly as before.
                    director_str = "".join(" " + person for person in movie["directors"])
                    name = movie["title"].replace(" ", "")
                    img = movie["cover"].replace(" ", "")
                    star_str = "".join(" " + person for person in movie["casts"])

                    # Fetch and parse the movie's detail page.
                    url_details = movie["url"].replace(" ", "")
                    r = requests.get(url_details, headers=headers)
                    soup_bean = BeautifulSoup(r.text, "lxml")

                    # Fields that are only present on the detail page.
                    genre_tags = soup_bean.find_all("span", {"property": "v:genre"})
                    genres = "".join(" " + tag.text for tag in genre_tags)
                    runtime_tags = soup_bean.find_all("span", {"property": "v:runtime"})
                    timelen = runtime_tags[0].text.replace(" ", "")
                    release_tags = soup_bean.find_all("span", {"property": "v:initialReleaseDate"})
                    date = release_tags[0].text.replace(" ", "")
                    # Attribute filter must be a dict; the original passed a set literal.
                    rating_link = soup_bean.find("a", {"class": "rating_people"})
                    scorenum = rating_link.text.replace(" ", "")
                    summary_tag = soup_bean.find("span", {"property": "v:summary"})
                    summary = summary_tag.text.replace(" ", "")  # strip spaces
                    area = re.findall(area_pattern, r.text, re.S)[0].replace(" ", "")
                    language = re.findall(language_pattern, r.text, re.S)[0].replace(" / ", " ")
                    print(url_details)

                    cursor.execute(
                        sql,
                        [name, star_str, director_str, genres, area, date, summary,
                         score, language, img, scorenum, timelen],
                    )
            # Commit once for the whole call (required for insert/update/delete).
            conn.commit()
            print(f"{time.asctime()}插入数据完毕")
        except Exception:
            # Report the failure but let the caller carry on with the next batch.
            traceback.print_exc()
    def get_tencent_data(start=3240, batches=501):
        """Crawl Douban movie listing pages and store each page via ``insert_data``.

        Despite the name, the URL requested here is Douban's
        ``new_search_subjects`` endpoint. Each listing page is fetched, parsed
        as JSON, and its ``"data"`` list is handed to ``insert_data`` as a
        single-page batch.

        Args:
            start: Offset appended to the URL's ``start=`` parameter for the
                first request; it is advanced by 20 after every request.
                Defaults to 3240, the value previously hard-coded.
            batches: Number of listing pages to fetch. Defaults to 501, which
                matches the original loop (counter running from 0 through 500).

        Returns:
            None.

        The connection and cursor obtained from ``get_conn()`` are always
        passed to ``close_conn()``, even when a request or JSON parse raises.
        """
        # Douban listing endpoint; the numeric offset is appended after "start=".
        url_bean = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start='

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        }
        conn, cursor = get_conn()
        try:
            offset = start
            for _ in range(batches):
                num_str = '%d' % offset
                offset = offset + 20
                # Fetch one listing page of movies from Douban.
                r = requests.get(url_bean + num_str, headers=headers)
                print(num_str)
                res_bean = json.loads(r.text)
                print(url_bean + num_str)
                print(f"{time.asctime()}开始插入数据")
                # insert_data expects a list of pages, so wrap the single page.
                insert_data([res_bean["data"]], headers, cursor, conn)
            print(f"{time.asctime()}所有数据插入完毕")
        finally:
            # Release the database resources on both success and failure.
            close_conn(conn, cursor)
  • 相关阅读:
    Finalize,Dispose,SuppressFinalize
    防火防盗防微软,Firefox发布插件自动检测服务
    Nginx的Rewrite设置及示例
    Linux游戏开发包 ClanLib 2.1.0 发布
    HTTP协议详解(真的很经典)
    Linux on POWER:发行版迁移和二进制兼容性考虑事项
    映射网络驱动器VBS脚本
    [笔记] 使用 opcache 优化生产环境PHP
    2020最新版MySQL数据库面试题(三)
    请注意,面试中有这7个行为肯定会被拒绝!
  • 原文地址:https://www.cnblogs.com/fengchuiguobanxia/p/14725084.html
Copyright © 2020-2023  润新知