• 豆瓣爬虫


    豆瓣爬虫:抓取豆瓣电影 Top250 首页前 20 部电影的排名、电影名和评分,并保存为 CSV 文件。

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from sklearn.linear_model import LinearRegression
    import seaborn as sns
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib
    from scipy.optimize import leastsq
    
    def get_html(url, timeout=10):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` can block indefinitely on a stalled
                connection.

        Returns:
            The decoded response body (``resp.text``). The HTTP status code is
            not checked, so the body of an error response is returned as-is.
        """
        # Send a browser-like User-Agent to disguise the crawler.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'}
        resp = requests.get(url, headers=headers, timeout=timeout)
        return resp.text
    
    # Download and parse the first page of the Douban Top 250 list.
    url = 'https://movie.douban.com/top250'
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')

    # Film titles: for every <div class="hd">, take the text of the first
    # <span> inside its link.
    a = soup.find_all('div', class_='hd')
    film_name = [entry.a.span.text for entry in a]

    # Ratings: one <span class="rating_num"> per film.
    rating_score = soup.find_all('span', class_='rating_num')

    # Keep at most `num` films. zip() stops at the shorter sequence, so a page
    # with fewer entries than `num` yields fewer rows instead of an IndexError.
    num = 20
    lt = [
        [rank, name, score.string]
        for rank, (name, score) in enumerate(zip(film_name[:num], rating_score), start=1)
    ]

    # Build the DataFrame once, after all rows have been collected, rather
    # than re-creating it on every loop iteration.
    df = pd.DataFrame(lt, columns=['排名', '电影名', '评分'])

    # Persist the data to disk.
    # NOTE(review): the path contains no separators after "C:" - they look
    # like they were stripped somewhere; confirm the intended location.
    df.to_csv(r'C:UsersadmirDesktop参考豆瓣电影数据.csv')

    根据网页的 HTML 结构调整正则表达式,逐页抓取(每页 25 条,共 10 页)并批量输出到文件:

    import json  
    import requests  
    from requests.exceptions import RequestException  
    import re  
    import time 
    
    def get_one_page(url, timeout=10):
        """Download one page and return its HTML, or ``None`` on failure.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` can block indefinitely on a stalled
                connection.

        Returns:
            The response text when the server answers with status 200;
            ``None`` for any other status code or when ``requests`` raises a
            ``RequestException`` (which includes timeouts).
        """
        # Request header with a browser User-Agent (the original note points
        # at the browser's network panel / request headers as its source).
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'
        }
        # Only the network call can raise RequestException, so keep the try
        # body limited to it.
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None
    def parse_one_page(html):
        """Yield one dict per film entry matched in a list page's HTML.

        Args:
            html: Page source as a string. ``None`` or an empty string (for
                example after a failed download) produces no items instead of
                raising ``TypeError`` inside the regex engine.

        Yields:
            Dicts with the string-valued keys ``'index'``, ``'title'``,
            ``'score'`` and ``'comment'``, in document order.
        """
        if not html:
            return
        # re.S lets '.' match newlines so a single match can span the many
        # lines of one <li> entry.
        # NOTE(review): because every gap is '.*?' under re.S, an entry that
        # has no <span class="inq"> will keep matching into the following
        # entry and pick up that entry's quote.
        pattern = re.compile(
            '<li>.*?<em class="">(.*?)</em>.*?title.*?>(.*?)</span>.*? <span class="rating_num" property="v:average">(.*?)</span>.*?<span class="inq">(.*?)</span>',
            re.S,
        )
        for index, title, score, comment in pattern.findall(html):
            yield {
                'index': index,
                'title': title,
                'score': score,
                'comment': comment,
            }
    def write_to_file(content):
        """Append ``content`` to the output file as one JSON document per line.

        Args:
            content: A JSON-serialisable object (here: one film dict).
        """
        # Writes to a txt file. The original note says a CSV file can be
        # produced by changing the extension; the content would still be JSON
        # text, only the file name changes.
        # NOTE(review): the path contains no separators after "C:" - they look
        # like they were stripped somewhere; confirm the intended location.
        with open(r'C:UsersadmirDesktop参考douban250.txt', 'a', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            # Each record is newline-terminated so it lands on its own line.
            f.write(json.dumps(content, ensure_ascii=False) + '\n')


    def main(offset):
        """Scrape the list page starting at ``offset``; print and save each film.

        Args:
            offset: Value for the ``start`` query parameter of the list URL.
        """
        url = 'https://movie.douban.com/top250?start='+str(offset)+'&filter='
        html = get_one_page(url)
        if html is None:
            # Download failed; there is nothing to parse for this page.
            return
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)


    if __name__ == '__main__':
        # Ten pages with offsets 0, 25, ..., 225.
        for i in range(10):
            main(offset=i * 25)
            # Pause between requests.
            time.sleep(1)
  • 相关阅读:
    Devexpress Gridview 自定义汇总CustomSummaryCalculate(加权平均)
    JavaScript(1)
    Opencv基本数据类型
    1.2OpenCV如何扫描图像,利用查找表和计时
    读取,修改,保存图像
    Opencv(3):基本数据类型
    OpenCV(2):视频
    OpenCV(1):显示图像
    IPython:一种交互式计算和开发环境(魔术命令,快捷键)
    IPython:一种交互式计算和开发环境
  • 原文地址:https://www.cnblogs.com/celine227/p/14473221.html
Copyright © 2020-2023  润新知