• 爬取豆瓣电影信息保存到Excel


     1 from bs4 import BeautifulSoup
     2 import requests
     3 import html.parser
     4 from openpyxl import Workbook,load_workbook
     5 import os
     class DouBan(object):
         """Scrape the movies shown on the Douban movie front page into an Excel file.

         For every poster link found on the front page, the movie's detail page is
         fetched and its title, info block and description are written to one row
         of the workbook named by ``OUTPUT_FILE`` (columns 1, 2 and 3).
         """

         # Workbook that receives the scraped rows; created on first write.
         OUTPUT_FILE = 'move_msg.xlsx'

         def __init__(self):
             self.url = 'https://movie.douban.com/'
             # A browser-like User-Agent is sent with every request.
             self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}

         def openUrl(self, url, timeout=10):
             """Fetch *url* with the configured headers and return the response.

             Args:
                 url: Address to request.
                 timeout: Seconds to wait before giving up; without a timeout
                     ``requests`` may block indefinitely on a stalled server.

             Returns:
                 The ``requests`` response object (status is not checked here).
             """
             return requests.get(url, headers=self.header, timeout=timeout)

         def getUrl(self):
             """Return the ``<a>`` tags of the movie posters on the front page."""
             response = self.openUrl(self.url)
             soup = BeautifulSoup(response.text, 'html.parser')
             return soup.select("li.poster > a")

         @staticmethod
         def _text(tag):
             """Return the text of *tag*, or an empty string when it is missing."""
             return tag.text if tag is not None else ''

         def getMsg(self):
             """Scrape every front-page movie and save one spreadsheet row per movie.

             Pages without a link or without the ``div#content`` container are
             skipped instead of aborting the whole crawl; individual fields that
             are missing on a page are stored as empty strings. Rows are numbered
             consecutively from 1 in the order movies are saved.
             """
             row = 0
             for href in self.getUrl():
                 link = href.get('href')
                 if not link:
                     continue
                 print(link)
                 response = self.openUrl(link)
                 soup = BeautifulSoup(response.text, 'html.parser')
                 content = soup.select_one('div#content')
                 if content is None:
                     # Not a regular movie page (layout differs or request refused).
                     continue

                 title = self._text(content.select_one('h1')).replace('\n', '')
                 info = self._text(content.select_one('#info'))
                 describe = self._text(
                     content.select_one('div#link-report span')).replace(' ', '')

                 row += 1
                 for col, value in enumerate((title, info, describe), start=1):
                     self.saveMsg(row, col, value)

         def saveMsg(self, row_, column_, msg):
             """Write *msg* into cell (*row_*, *column_*) of the output workbook.

             The workbook is loaded from ``OUTPUT_FILE`` when it already exists and
             created from scratch otherwise, then saved back to the same path.

             Args:
                 row_: 1-based row index.
                 column_: 1-based column index.
                 msg: Value stored in the cell.
             """
             if os.path.exists(self.OUTPUT_FILE):
                 wb = load_workbook(self.OUTPUT_FILE)
             else:
                 wb = Workbook()
             sheet = wb.active
             sheet.cell(row=row_, column=column_).value = msg
             wb.save(self.OUTPUT_FILE)
    58 
    59 
    60 
    61 
    # Script entry point: run the full scrape only when executed directly.
    if __name__ == "__main__":
        DouBan().getMsg()
  • 相关阅读:
    [湖北省队互测2014] 没有人的算术 (非题解)
    普及常见图论算法整理
    普及常见数据结构板子整理
    Pisano Period
    退役了
    LOJ3246 「USACO 2020.1 Platinum」Cave Paintings
    LOJ3193 「ROI 2019 Day2」机器人高尔夫球赛
    LOJ3192 「ROI 2019 Day2」课桌
    LOJ6496 「雅礼集训 2018 Day1」仙人掌
    Luogu P4518 [JSOI2018]绝地反击
  • 原文地址:https://www.cnblogs.com/royfans/p/7474662.html
Copyright © 2020-2023  润新知