• Python 3 crawler: scraping XiCi (西刺) proxies with requests


    import requests
    from fake_useragent import UserAgent
    from lxml import etree
    from urllib.parse import urljoin
    import pymysql
    import time
    
    ua = UserAgent()
    
    
    class MyException(Exception):
        '''Custom exception carrying a status code and a message.'''

        def __init__(self, status, msg):
            self.status = status
            self.msg = msg
            super().__init__(msg)
    
    
    class XiCi:
    
        def __init__(self):
            self.session = requests.Session()
            self.session.headers = {
                "User-Agent": ua.random,
                "Host": "www.xicidaili.com"
            }
            # Assumes a local MySQL server with a `proxies` database;
            # add password/charset arguments as your setup requires.
            self.conn = pymysql.connect(host="127.0.0.1",
                                        port=3306,
                                        user="root",
                                        db="proxies")
            self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)
    
        def get_page_html(self, api):
            '''Fetch a page with a GET request.'''
            # The session already carries the headers set in __init__.
            response = self.session.get(url=api)
            if response.status_code == 200:
                return response
            raise MyException(response.status_code, "request failed: {}".format(api))
    
        def __html_to_etree(self, html):
            '''Parse the HTML source into an lxml element tree.'''
            return etree.HTML(html)
    
        def get_next_page_url(self, response):
            '''Extract the URL of the next page.'''
            selector = self.__html_to_etree(response.text)
            try:
                next_page_url = selector.xpath("//a[@class='next_page']/@href")[0]
                return urljoin(response.url, next_page_url)
            except IndexError:
                # No "next page" link means we are on the last page.
                raise MyException(1000, "crawl finished")
    
        def __get_proxies_info(self, response):
            '''Yield the proxy entries scraped from the page.'''
            selector = self.__html_to_etree(response.text)
            tr_ele_list = selector.xpath("//*[@id='ip_list']//tr")
            for tr in tr_ele_list:
                ip = tr.xpath("td[2]/text()")
                if not ip:
                    # Skip the header row, which has no IP cell.
                    continue
                ip = ip[0]
                port = tr.xpath("td[3]/text()")[0]
                proxy_type = tr.xpath("td[6]/text()")[0]
                yield [ip, port, proxy_type]
    
        def __detect_availability(self, data):
            '''Check whether a scraped proxy actually works.'''
            https_api = "https://icanhazip.com/"
            http_api = "http://icanhazip.com/"
            ip, port, proxy_type = data
            scheme = proxy_type.lower()
            proxies = {scheme: "{}://{}:{}".format(scheme, ip, port)}
            try:
                if proxy_type.upper() == "HTTPS":
                    requests.get(https_api, headers={"User-Agent": ua.random}, proxies=proxies, timeout=3)
                else:
                    requests.get(http_api, headers={"User-Agent": ua.random}, proxies=proxies, timeout=3)
                return True
            except Exception:
                return False
    
        def get_usable_proxies_ip(self, response):
            '''Save every proxy on the page that passes the availability check.'''
            res = self.__get_proxies_info(response)
            for data in res:
                if self.__detect_availability(data):
                    self.save_to_db(data)
    
        def save_to_db(self, data):
            '''Insert a working proxy into the database.'''
            sql = 'insert into proxies_table(ip,port,type) values(%s,%s,%s);'
            print(data)
            self.cursor.execute(sql, data)
            self.conn.commit()
    
        def run(self, api):
            '''Entry point: crawl page by page until the last one.'''
            page = 1
            while True:
                print("Crawling page {}...".format(page))
                response = self.get_page_html(api)
                self.get_usable_proxies_ip(response)
                try:
                    api = self.get_next_page_url(response)
                except MyException as e:
                    if e.status == 1000:
                        print(e.msg)
                        break
                page += 1
                time.sleep(3)
    
        def __del__(self):
            self.cursor.close()
            self.conn.close()
    
    
    if __name__ == '__main__':
        api = "https://www.xicidaili.com/nn"
        xici = XiCi()
        xici.run(api)
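
• The script assumes the `proxies` database and the `proxies_table` table already exist. Below is a minimal setup sketch: the column names come from the INSERT in save_to_db, but the column types and sizes are assumptions; adjust them and the connection credentials to your environment.

    import pymysql
    
    # Assumed schema, inferred from the INSERT in save_to_db();
    # column sizes and credentials are illustrative assumptions.
    conn = pymysql.connect(host="127.0.0.1", port=3306, user="root")
    cursor = conn.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS proxies")
    cursor.execute("USE proxies")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS proxies_table (
            id INT AUTO_INCREMENT PRIMARY KEY,
            ip VARCHAR(64) NOT NULL,
            port VARCHAR(16) NOT NULL,
            type VARCHAR(16) NOT NULL
        )
    """)
    conn.commit()
    conn.close()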
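• Once the table is populated, the stored proxies can be read back and plugged into requests. A sketch under the same schema and credential assumptions (httpbin.org/ip is just an echo service used here for demonstration):

    import random
    import pymysql
    import requests
    
    # Pull all stored proxies and try one at random (illustrative only).
    conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", db="proxies")
    cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
    cursor.execute("SELECT ip, port, type FROM proxies_table")
    rows = cursor.fetchall()
    conn.close()
    
    if rows:
        row = random.choice(rows)
        scheme = row["type"].lower()
        proxies = {scheme: "{}://{}:{}".format(scheme, row["ip"], row["port"])}
        resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
        print(resp.text)  # Should show the proxy's IP, not yours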
• Original article: https://www.cnblogs.com/zhuchunyu/p/10808073.html