• 手动爬虫之报头及代理封装类(python3)


      本人刚刚学习爬虫,发现每次请求都要手动添加报头并设置代理,比较繁琐,故将该过程封装为 Url_ProxyHelper 类,代码如下:

     1 import urllib.request as ur
     2 
     class Url_ProxyHelper:
         """Fetch a URL through a proxy while sending a browser User-Agent header.

         Args:
             url: Address to request.
             proxy_add: Proxy address as "host:port". A falsy value (None or "")
                 configures no proxy at all.
             savepath: File that save_InFile() writes the raw response to.

         The attributes req, proxy, opener and info start as None and are filled
         in by set_UrlAndProxy() and by the two fetch methods.
         """

         def __init__(self, url, proxy_add, savepath=None):
             self.url = url
             self.proxy_add = proxy_add
             self.req = None
             self.proxy = None
             self.opener = None
             self.info = None
             self.save_path = savepath

         # Header and proxy setup
         def set_UrlAndProxy(self):
             """Build the request and an opener that routes it through the proxy.

             Returns:
                 The prepared urllib.request.Request (also stored in self.req).
             """
             # Add the request header
             self.req = ur.Request(self.url)
             self.req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0')

             # Configure the proxy server. Both schemes are mapped: with only
             # 'http' registered, an https:// URL would bypass the proxy.
             proxies = {}
             if self.proxy_add:
                 proxies = {'http': self.proxy_add, 'https': self.proxy_add}
             self.proxy = ur.ProxyHandler(proxies)
             self.opener = ur.build_opener(self.proxy, ur.HTTPHandler)
             # Kept so that callers using plain ur.urlopen() afterwards still go
             # through this opener, as they did before.
             ur.install_opener(self.opener)
             return self.req

         def _fetch(self):
             """Send the request through this instance's opener; return the body bytes."""
             self.req = self.set_UrlAndProxy()
             # Use our own opener rather than the process-wide one, and close the
             # response as soon as the body has been read.
             with self.opener.open(self.req) as response:
                 return response.read()

         # Save the data to a file
         def save_InFile(self):
             """Download the URL and write the raw bytes to self.save_path.

             Raises:
                 ValueError: If no savepath was given to the constructor.
             """
             # Fail before touching the network instead of after the download.
             if self.save_path is None:
                 raise ValueError("savepath must be provided to use save_InFile()")
             self.info = self._fetch()
             with open(self.save_path, 'wb') as out_file:
                 out_file.write(self.info)

         # Return the data
         def feedbak_info(self):
             """Download the URL and return the body decoded as UTF-8 text.

             Raises:
                 UnicodeDecodeError: If the body is not valid UTF-8.
             """
             # decode() turns the bytes into text; this matters for Chinese pages
             self.info = self._fetch().decode('utf-8')
             return self.info

    测试:

     from Url_ProxyHelper import Url_ProxyHelper

     url = "https://www.baidu.com/"
     save_path = "E:/workspace/PyCharm/codeSpace/books/python_web_crawler_book/chapter6/demo4/test_url_proxy.html"
     proxy = "175.155.24.20:808"


     def main():
         """Fetch `url` through `proxy` and print the decoded page."""
         # To write the raw response to disk instead, build the helper with
         # Url_ProxyHelper(url, proxy, savepath=save_path) and call save_InFile().
         uph = Url_ProxyHelper(url, proxy)
         info = uph.feedbak_info()
         print(info)


     # Guarded so that importing this module does not trigger a network request.
     if __name__ == "__main__":
         main()
  • 相关阅读:
    Jsp补充
    Jsp和Servlet关系
    ServletContext简介
    利用Session实现三天免登陆
    Cookie简介
    JSP简介
    线程基础--同步机制
    深思——工作面试
    response.setcontenttype的參数
    Codeforces 96D Volleyball spfa
  • 原文地址:https://www.cnblogs.com/xiaomingzaixian/p/7110687.html
Copyright © 2020-2023  润新知