• Python3爬虫(十五) 代理


     Infi-chu:

    http://www.cnblogs.com/Infi-chu/

    一、设置代理

    1.urllib

    # HTTP proxy with urllib: build an opener whose ProxyHandler maps each
    # URL scheme to the proxy address, then fetch a test page through it.
    from urllib.error import URLError
    from urllib.request import ProxyHandler, build_opener
    proxy = '127.0.0.1:9743'
    # For a proxy that needs authentication, put the credentials first:
    # proxy = 'username:password@127.0.0.1:9743'
    proxy_handler = ProxyHandler({
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    })
    opener = build_opener(proxy_handler)
    try:
        res = opener.open('http://httpbin.org/get')
        print(res.read().decode('utf-8'))
    except URLError as e:
        # Print the failure reason instead of a traceback.
        print(e.reason)
    # SOCKS5 proxy with urllib: PySocks supplies a drop-in socket class, and
    # rebinding socket.socket makes sockets created afterwards use the proxy.
    import socks  # pip3 install PySocks
    import socket
    from urllib import request
    from urllib.error import URLError
    socks.set_default_proxy(socks.SOCKS5, '127.0.0.1', 9742)
    socket.socket = socks.socksocket
    try:
        res = request.urlopen('http://httpbin.org/get')
        print(res.read().decode('utf-8'))
    except URLError as e:
        # Print the failure reason instead of a traceback.
        print(e.reason)
    

    2.requests
    比urllib简单

    # HTTP proxy with requests: pass a scheme -> proxy URL mapping via `proxies`.
    import requests
    proxy = '127.0.0.1:9743'
    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }
    try:
        res = requests.get('http://httpbin.org/get', proxies=proxies)
        print(res.text)
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
    
    # SOCKS5 proxy with requests (method 1): use socks5:// proxy URLs.
    import requests    # pip3 install 'requests[socks]'
    proxy = '127.0.0.1:9742'
    proxies = {
        'http': 'socks5://' + proxy,
        'https': 'socks5://' + proxy,
    }
    try:
        res = requests.get('http://httpbin.org/get', proxies=proxies)
        print(res.text)
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
    # SOCKS5 proxy with requests (method 2): set a default PySocks proxy and
    # rebind socket.socket, so no `proxies` argument is passed to requests.
    import requests, socks, socket
    socks.set_default_proxy(socks.SOCKS5, '127.0.0.1', 9742)
    socket.socket = socks.socksocket
    try:
        res = requests.get('http://httpbin.org/get')
        print(res.text)
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
    

    3.Selenium
    设置浏览器代理

    # Unauthenticated proxy for Chrome: pass --proxy-server on the command line.
    from selenium import webdriver
    proxy = '127.0.0.1:9743'
    chrome_options = webdriver.ChromeOptions()  # carries the browser arguments
    chrome_options.add_argument('--proxy-server=http://' + proxy)
    # `options=` replaces the deprecated `chrome_options=` keyword.
    browser = webdriver.Chrome(options=chrome_options)
    browser.get('http://httpbin.org/get')
    

    设置认证代理

    # Authenticated proxy for Chrome: generate a small extension (manifest plus
    # background script) that sets a fixed proxy and answers the proxy's auth
    # challenge, zip it, and load it into the browser.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import zipfile
    
    ip = '127.0.0.1'
    port = 9743
    username = 'test'
    password = 'test'
    # Extension manifest; it names background.js as the background script.
    manifest_json = """
    {
    	"version":"1.0.0",
    	"manifest_version":2,
    	"name":"Chrome Proxy",
    	"permissions":[
    	"proxy",
    	"tabs",
    	"unlimitedStorage",
    	"storage",
    	"<all_urls>",
    	"webRequest",
    	"webRequestBlocking"
    	],
    	"background":{"scripts":["background.js"]}
    }
    """
    # Background script template, filled in with %-formatting below.
    # The port is rendered unquoted so the proxy config receives a number.
    background_js = """
    var config={
    	mode:"fixed_servers",
    	rules:{
    		singleProxy:{
    			scheme:"http",
    			host:"%(ip)s",
    			port:%(port)s
    		}
    	}
    }
    
    chrome.proxy.settings.set({value:config,scope:"regular"},function(){});
    function callbackFn(details){
    	return{
    		authCredentials:{
    			username:"%(username)s",
    			password:"%(password)s"
    		}
    	}
    }	
    chrome.webRequest.onAuthRequired.addListener(
    	callbackFn,
    	{urls:["<all_urls>"]},
    	['blocking']
    )
    """ % {'ip': ip, 'port': port, 'username': username, 'password': password}
    plugin_file = 'proxy_auth_plugin.zip'
    with zipfile.ZipFile(plugin_file, 'w') as zp:
        # The manifest must be stored under the name manifest.json.
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)
    chrome_options = Options()
    chrome_options.add_argument('--start-maximized')
    chrome_options.add_extension(plugin_file)
    # `options=` replaces the deprecated `chrome_options=` keyword.
    browser = webdriver.Chrome(options=chrome_options)
    browser.get('http://httpbin.org/get')
    

    二、代理池维护
    单一代理并不能完成我们的代理任务,所以需要更多数量的代理为我们服务。
    我们将对代理进行筛选,使其高效地为我们提供服务。
    1.准备
    需要使用redis数据库,aiohttp、requests、redis-py、pyquery、flask库
    2.代理池的目标:存储模块、获取模块、检测模块、接口模块
    3.各模块的实现:

    https://github.com/Infi-chu/proxypool

    三、利用代理爬取微信文章

    https://github.com/Infi-chu/weixinspider

  • 相关阅读:
    http://www.sqlservercentral.com/Forums/Topic6111071461.aspx
    SQL 2012 New Location for Query Templates
    How to Share Data between Stored Procedures
    DB Development Standard summary
    fn_SplitStringToTable
    PowerShell Database Server Disk Space Checking
    IIS支持htaccess的Rewrite3配置过程
    html select按纽代码
    jquery插件集 HA
    HTML基础特殊字符(易记版) HA
  • 原文地址:https://www.cnblogs.com/Infi-chu/p/8995278.html
Copyright © 2020-2023  润新知