• Web Scraping in Practice 01: Scraping the Maoyan Movies Top 100 Chart


    # Goal: scrape the movie title, release date, score, poster image, etc. for the Maoyan TOP100 chart and save the extracted results to a file
    import requests
    import time
    import csv
    from lxml import etree
    
    class MaoYanTop100Spider:
        # URLs of the movie detail pages
        film_page_url_list = []
        # One info dict per movie (class-level lists are shared across
        # instances; fine here, since only one spider is ever created)
        film_info_list = []
    
        # 1. Fetch the ten list pages
        def Top100_list(self, session, headers):
            # 1.1 Request each list page, e.g. https://maoyan.com/board/4?offset=20
            # (1) Fixed part of the URL
            base_url = "https://maoyan.com/board/4"
            # (2) The offset parameter pages through the chart, 10 films per page
            for i in range(0, 91, 10):
                # (3) Build the final URL
                final_url = base_url + "?offset=" + str(i)
                # (4) Send the request (sleep first to stay under Maoyan's rate limit)
                time.sleep(5)
                response = session.get(url=final_url, headers=headers)
                # 1.2 Parse the list page
                film_list_page_data = response.content.decode("utf-8")
                # 1.2.1 Extract the fields with XPath
                # (1) Convert the raw HTML into an element tree
                xpath_data = etree.HTML(film_list_page_data)
                # (2) Each film sits in dl.board-wrapper > dd[1..10]
                for xpath_num in range(1, 11):
                    # Title
                    film_name = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/a/@title')[0]
    
                    # Release date: drop the 5-character "上映时间:" label
                    film_time = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/div/div/div[1]/p[3]/text()')[0][5:].strip()
    
                    # Lead actors: drop the 3-character "主演:" label
                    film_actors = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/div/div/div[1]/p[2]/text()')[0].strip()[3:]
    
                    # Score: the integer and fractional parts live in two separate <i> tags
                    score_int = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/div/div/div[2]/p/i[1]/text()')[0]
                    score_fraction = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/div/div/div[2]/p/i[2]/text()')[0]
                    film_score = score_int + score_fraction
                    # Poster image: the real URL is lazy-loaded into @data-src of the second <img>
                    film_img = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/a/img[2]/@data-src')[0]
    
                    # Relative URL of the detail page
                    film_url = xpath_data.xpath('//dl[@class="board-wrapper"]/dd[' + str(xpath_num) + ']/div/div/div[1]/p[1]/a/@href')[0]
    
                    # Collect this film's info
                    film_info = {
                        "name": film_name,
                        "time": film_time,
                        "actors": film_actors,
                        "score": film_score,
                        "img": film_img,
                        "url": film_url,
                    }
                    self.film_info_list.append(film_info)
                    # Remember the detail-page URL for step 2
                    self.film_page_url_list.append(film_url)
    
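        # Hypothetical variant (not in the original post): iterate over the <dd>
        # elements directly with relative XPath instead of indexing dd[1..10],
        # so a short page cannot raise an IndexError. A sketch that collects only
        # the name and the detail-page URL.
        def parse_list_page(self, xpath_data):
            for dd in xpath_data.xpath('//dl[@class="board-wrapper"]/dd'):
                name = dd.xpath('./a/@title')
                href = dd.xpath('./div/div/div[1]/p[1]/a/@href')
                if name and href:
                    self.film_info_list.append({"name": name[0], "url": href[0]})
                    self.film_page_url_list.append(href[0])
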
        # 2. Fetch one movie detail page and attach its synopsis
        def film_page(self, url, session, headers, num):
            # 2.1 Request the detail page (the list page yields relative URLs)
            base_url = "https://maoyan.com"
            final_url = base_url + str(url)
            print(final_url)
            time.sleep(3)
            response = session.get(url=final_url, headers=headers)
            data = response.content.decode("utf-8")
            # 2.2 Parse the synopsis out of span.dra
            xpath_data = etree.HTML(data)
            summary_nodes = xpath_data.xpath('//span[@class="dra"]/text()')
            # Guard against a missing node (e.g. an anti-crawler verification page)
            film_summary = summary_nodes[0].strip() if summary_nodes else ""
            self.film_info_list[num]["summary"] = film_summary
    
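        # Hypothetical helper (not in the original post): a minimal retry sketch.
        # Maoyan sometimes answers with a verification page instead of content;
        # retrying on a non-200 status is a crude but safe heuristic.
        def get_with_retry(self, session, url, headers, retries=3, wait=10):
            response = session.get(url=url, headers=headers)
            for _ in range(retries):
                if response.status_code == 200:
                    break
                time.sleep(wait)
                response = session.get(url=url, headers=headers)
            return response
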
        # 3. Save the collected data to a CSV file
        def save_data(self):
            # (1) Create the CSV file; newline="" stops the csv module from
            # doubling line endings on Windows
            csv_fp = open("maoyan.csv", "w", encoding="utf-8", newline="")
            # (2) Header row: the keys of the first info dict
            title_list = self.film_info_list[0].keys()
            # (3) Data rows: the values of every info dict
            excel_data = []
            for data in self.film_info_list:
                excel_data.append(data.values())
            # (4) Write the header and the rows with a csv writer
            csv_writer = csv.writer(csv_fp)
            csv_writer.writerow(title_list)
            csv_writer.writerows(excel_data)
            # (5) Close the CSV file
            csv_fp.close()
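
        # Alternative sketch (not in the original post): csv.DictWriter writes the
        # info dicts directly and, via restval, tolerates films whose detail page
        # yielded no "summary". The file name maoyan_dict.csv is an assumption.
        def save_data_dictwriter(self):
            fieldnames = ["name", "time", "actors", "score", "img", "url", "summary"]
            with open("maoyan_dict.csv", "w", encoding="utf-8", newline="") as fp:
                writer = csv.DictWriter(fp, fieldnames=fieldnames, restval="")
                writer.writeheader()
                writer.writerows(self.film_info_list)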
    
        # Entry point
        def run(self):
            # 0. Create a session so cookies persist across requests
            session = requests.Session()
            # 0.1 Request headers: a browser User-Agent so Maoyan serves the normal page
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            }
            # 1. Fetch the list pages
            self.Top100_list(session=session, headers=headers)
            # 2. Fetch every detail page
            for i, film_page_url in enumerate(self.film_page_url_list):
                self.film_page(url=film_page_url, session=session, headers=headers, num=i)
                print(self.film_info_list[i])
            # 3. Save the data
            self.save_data()

    if __name__ == "__main__":
        MaoYanTop100Spider().run()
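
The commented-out json code in save_data suggests the author also experimented with a JSON dump. A minimal sketch of that step (the file name new.json comes from the commented-out code; everything else is an assumption), run after run() has filled the class-level film_info_list:

    import json

    # Dump the collected list next to the CSV; ensure_ascii=False keeps the
    # Chinese titles and actor names human-readable in the file.
    with open("new.json", "w", encoding="utf-8") as fp:
        json.dump(MaoYanTop100Spider.film_info_list, fp, ensure_ascii=False, indent=2)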
• Original post: https://www.cnblogs.com/tommyngx/p/11182661.html