• 使用requests抓取网页内容


    from random import choice
    import json
    
    import requests
    from bs4 import BeautifulSoup
    
    # Fallback User-Agent pool, used when the scraper is not given its own list.
    # The single entry is written as adjacent string fragments only to keep the
    # lines short; Python joins them into one string at compile time.
    _user_agents = [
        (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/65.0.3325.181 Safari/537.36'
        ),
    ]
    
    
    class InstagramScraper:
        """Read profile data from the JSON embedded in an Instagram profile page.

        The page is downloaded with ``requests`` and the ``window._sharedData``
        assignment found in one of its ``<script>`` tags is decoded as JSON.

        Args:
            user_agents: Optional list of User-Agent strings; one is picked at
                random per request. Anything that is not a non-empty list makes
                the scraper fall back to the module-level ``_user_agents``.
            proxy: Optional proxy URL used for both http and https traffic.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``.
                ``requests`` never times out unless one is given, so a default
                is supplied to keep a stalled connection from hanging forever.
        """

        # Name of the JavaScript variable the page assigns its JSON payload to.
        _SHARED_DATA_MARKER = 'window._sharedData'

        def __init__(self, user_agents=None, proxy=None, timeout=30):
            self.user_agents = user_agents
            self.proxy = proxy
            self.timeout = timeout

        def __random_agent(self):
            """Return a random User-Agent from the custom list or the fallback pool."""
            if self.user_agents and isinstance(self.user_agents, list):
                return choice(self.user_agents)
            return choice(_user_agents)

        def __request_url(self, url):
            """Download ``url`` and return the response body as text.

            Raises:
                requests.HTTPError: The server answered with an error status.
                requests.RequestException: Any other transport failure; the
                    original exception (timeout, connection error, ...) is
                    propagated unchanged so its type and message are kept.
            """
            proxies = {'http': self.proxy, 'https': self.proxy} if self.proxy else None
            try:
                response = requests.get(
                    url,
                    headers={'User-Agent': self.__random_agent()},
                    proxies=proxies,
                    timeout=self.timeout,
                )
                response.raise_for_status()
            except requests.HTTPError as exc:
                # Keep the historical message but attach the response and the
                # original error so callers can still see the status code.
                raise requests.HTTPError(
                    'Received non 200 status code from Instagram',
                    response=exc.response,
                ) from exc
            return response.text

        @staticmethod
        def _parse_shared_data(script_text):
            """Decode the JSON object assigned to ``window._sharedData``.

            Only the assignment prefix and the trailing statement terminator are
            removed; semicolons inside the JSON itself are left untouched.

            Raises:
                ValueError: The assignment has no ``=`` or the payload is not
                    valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
            """
            marker = InstagramScraper._SHARED_DATA_MARKER
            payload = script_text.strip()
            marker_at = payload.find(marker)
            if marker_at != -1:
                equals_at = payload.find('=', marker_at + len(marker))
                if equals_at == -1:
                    raise ValueError('window._sharedData is not assigned a value')
                payload = payload[equals_at + 1:]
            return json.loads(payload.strip().rstrip(';'))

        @staticmethod
        def extract_json_data(html):
            """Return the ``window._sharedData`` payload found in ``html``.

            Every ``<script>`` tag is inspected, so the payload is found even
            when it is not the first script of the document.

            Raises:
                ValueError: No script contains ``window._sharedData`` or its
                    content cannot be decoded as JSON.
            """
            soup = BeautifulSoup(html, 'html.parser')
            for script_tag in soup.find_all('script'):
                script_text = script_tag.text
                if InstagramScraper._SHARED_DATA_MARKER in script_text:
                    return InstagramScraper._parse_shared_data(script_text)
            raise ValueError('No window._sharedData script found in the page')

        def __profile_user(self, profile_url):
            """Fetch ``profile_url`` and return the ``user`` object of its payload."""
            html = self.__request_url(profile_url)
            shared_data = self.extract_json_data(html)
            return shared_data['entry_data']['ProfilePage'][0]['graphql']['user']

        def profile_page_metrics(self, profile_url):
            """Return a flat dict of the profile's truthy fields.

            ``edge_owner_to_timeline_media`` is skipped, as are fields whose
            value is falsy. A nested dict that has a ``count`` key is reduced to
            that count; a nested dict without one is returned as-is instead of
            raising ``KeyError``.
            """
            user = self.__profile_user(profile_url)
            results = {}
            for key, value in user.items():
                if key == 'edge_owner_to_timeline_media' or not value:
                    continue
                if isinstance(value, dict) and 'count' in value:
                    value = value['count']
                results[key] = value
            return results

        def profile_page_recent_posts(self, profile_url):
            """Return the non-empty ``node`` dicts of the profile's timeline edges."""
            user = self.__profile_user(profile_url)
            edges = user['edge_owner_to_timeline_media']["edges"]
            nodes = (edge.get('node') for edge in edges)
            return [node for node in nodes if node and isinstance(node, dict)]
  • 相关阅读:
    RabbitMQ学习之:(八)Topic Exchange (转贴+我的评论)
    RabbitMQ学习之:(六)Direct Exchange (转贴+我的评论)
    抽象类
    开闭原则
    反射
    解决默认方法冲突
    单一职责原则
    java四种访问修饰符
    Java中基本类型和包装类
    方法在继承过程中可能出现的问题
  • 原文地址:https://www.cnblogs.com/darknoll/p/10547244.html
Copyright © 2020-2023  润新知