# coding:utf-8
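'''
Scraping rule definitions. Each rule is a dict with the following keys:
urls: list of URLs to fetch
type: how the page is parsed; one of regular (regular expression), xpath (XPath parsing), module (custom third-party parser module)
pattern: a regular expression or an XPath expression; it must correspond to the type chosen above
'''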
import os
import random
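'''
Proxy record fields: ip, port, type (0 = high anonymity, 1 = transparent), protocol (0 = http, 1 = https), country, area (province/city), updatetime (time of last update),
speed (connection speed)
'''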
# Scraping rules, one dict per proxy-list source.
#
# Keys used by the entries below:
#   urls       -- list of page URLs to fetch for this source
#   type       -- 'xpath' or 'module' (how the page is parsed)
#   pattern    -- XPath selecting the table rows (type 'xpath') or a regular
#                 expression (type 'module')
#   position   -- where each field lives: relative XPaths for 'xpath' entries
#                 ('' when no location is given), integers for 'module' entries
#                 (presumably indexes into the regex match; verify against the
#                 parser module named by 'moduleName')
#   moduleName -- name of the custom parser, only present on 'module' entries
parserList = [
    # 66ip.cn: index page plus pages 2..11.
    {
        'urls': ['http://www.66ip.cn/%s.html' % n for n in ['index'] + list(range(2, 12))],
        'type': 'xpath',
        'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]",
        'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
    },
    # 66ip.cn: area indexes 1..34, pages 1..9 of each.
    {
        'urls': ['http://www.66ip.cn/areaindex_%s/%s.html' % (m, n) for m in range(1, 35) for n in range(1, 10)],
        'type': 'xpath',
        'pattern': ".//*[@id='footer']/div/table/tr[position()>1]",
        'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
    },
    # cn-proxy.com: two fixed pages.
    {
        'urls': ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'],
        'type': 'xpath',
        'pattern': ".//table[@class='sortable']/tbody/tr",
        'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
    },
    # mimiip.com: pages 1..9.
    {
        'urls': ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)],
        'type': 'xpath',
        'pattern': ".//table[@class='list']/tr",
        'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
    },
    # proxy-list.org: pages 1..9, handled by a custom parser module.
    {
        'urls': ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)],
        'type': 'module',
        'moduleName': 'proxy_listPraser',
        'pattern': 'Proxy(.+)',
        'position': {'ip': 0, 'port': -1, 'type': -1, 'protocol': 2}
    },
    # incloak.com: first page plus offsets 64, 128, ..., 576.
    {
        'urls': ['http://incloak.com/proxy-list/%s#list' % n for n in
                 ([''] + ['?start=%s' % (64 * m) for m in range(1, 10)])],
        'type': 'xpath',
        'pattern': ".//table[@class='proxy__t']/tbody/tr",
        'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
    },
    # kuaidaili.com proxylist: pages 1..10.
    {
        'urls': ['http://www.kuaidaili.com/proxylist/%s/' % n for n in range(1, 11)],
        'type': 'xpath',
        'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]",
        'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
    },
    # kuaidaili.com free lists: four categories, pages 1..10 of each.
    {
        'urls': ['http://www.kuaidaili.com/free/%s/%s/' % (m, n) for m in ['inha', 'intr', 'outha', 'outtr'] for n in
                 range(1, 11)],
        'type': 'xpath',
        'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]",
        'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
    },
    # cz88.net: index page plus http_2..http_10.
    {
        'urls': ['http://www.cz88.net/proxy/%s' % m for m in
                 ['index.shtml'] + ['http_%s.shtml' % n for n in range(2, 11)]],
        'type': 'xpath',
        'pattern': ".//*[@id='boxright']/div/ul/li[position()>1]",
        'position': {'ip': './div[1]', 'port': './div[2]', 'type': './div[3]', 'protocol': ''}
    },
    # ip181.com: pages 1..10.
    {
        'urls': ['http://www.ip181.com/daili/%s.html' % n for n in range(1, 11)],
        'type': 'xpath',
        'pattern': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]",
        'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
    },
    # xicidaili.com: four categories, pages 1..7 of each.
    {
        'urls': ['http://www.xicidaili.com/%s/%s' % (m, n) for m in ['nn', 'nt', 'wn', 'wt'] for n in range(1, 8)],
        'type': 'xpath',
        'pattern': ".//*[@id='ip_list']/tr[position()>1]",
        'position': {'ip': './td[2]', 'port': './td[3]', 'type': './td[5]', 'protocol': './td[6]'}
    },
    # cnproxy.com: pages 1..10, handled by a custom parser module.
    {
        'urls': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)],
        'type': 'module',
        'moduleName': 'CnproxyPraser',
        # Regex escapes restored: \d for digits, \. for the dots of the IPv4
        # address, \( \) for the literal parentheses of document.write(...),
        # and \s* for trailing whitespace. Groups: 1 = IP, 2 = obfuscated port
        # expression, 3 = protocol.
        'pattern': r'<tr><td>(\d+\.\d+\.\d+\.\d+)<SCRIPT type=text/javascript>document\.write\(":"(.+)\)</SCRIPT></td><td>(HTTP|SOCKS4)\s*',
        'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2}
    }
]
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time, random
from selenium.webdriver.common.keys import Keys
import logging
import os
def gen_a_broswer(type='pc'):
    """Start and return a Chrome WebDriver session.

    Args:
        type: ``'pc'`` for a default Chrome session, or ``'mb'`` for Chrome
            with mobile emulation (360x640 viewport, pixel ratio 3.0 and an
            Android Nexus 5 user agent). The parameter name shadows the
            builtin but is kept so keyword callers keep working.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.

    Raises:
        ValueError: if *type* is neither ``'pc'`` nor ``'mb'``.
    """
    if type not in ('pc', 'mb'):
        # Fail early with a clear message; previously an unknown value fell
        # through both branches and hit an unbound local at the return.
        raise ValueError("unsupported browser type: %r (expected 'pc' or 'mb')" % (type,))

    if type == 'pc':
        return webdriver.Chrome()

    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
        "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    # NOTE(review): the `chrome_options=` keyword is kept as the file had it;
    # newer Selenium releases expect `options=` -- confirm the installed version.
    return webdriver.Chrome(chrome_options=chrome_options)
from selenium import webdriver
def _parse_proxy(row_text):
    """Return ``(ip, port)`` parsed from the text of one proxy-table row.

    The first two whitespace-separated fields are taken as the address and
    the port; the port is converted to ``int`` here so a malformed row is
    reported at parse time instead of while configuring Firefox.

    Raises:
        ValueError: if the row has fewer than two fields or the port is not
            an integer.
    """
    fields = row_text.split()
    if len(fields) < 2:
        raise ValueError('cannot parse proxy from row: %r' % (row_text,))
    return fields[0], int(fields[1])


def _quit_quietly(drv):
    """Quit a WebDriver session, printing (not raising) any error."""
    if drv is None:
        # The session was never created, so there is nothing to close.
        return
    try:
        drv.quit()
    except Exception as e:
        print('break--->', e)


def _scrape_first_proxy(url, row_xpath):
    """Open *url* in Chrome and return ``(ip, port)`` from the first matching row.

    The Chrome session is always closed, whether or not scraping succeeds.
    """
    browser = None
    try:
        browser = gen_a_broswer()
        browser.get(url)
        # find_element_* returns only the first element matching the XPath.
        # NOTE(review): find_element_by_xpath is the older Selenium API --
        # confirm the installed Selenium version still provides it.
        res = browser.find_element_by_xpath(row_xpath)
        print(res.text)
        return _parse_proxy(res.text)
    finally:
        _quit_quietly(browser)


def _visit_through_proxy(ip, port, target_url='http://www.mys.cn', dwell_seconds=120):
    """Load *target_url* in Firefox through the HTTP proxy ``ip:port``.

    The page is loaded, refreshed, and kept open for *dwell_seconds*. The
    Firefox session is always closed afterwards so sessions do not pile up
    across iterations.
    """
    profile = webdriver.FirefoxProfile()
    profile.set_preference('network.proxy.type', 1)
    profile.set_preference('network.proxy.http', ip)
    profile.set_preference('network.proxy.http_port', int(port))  # must be an int
    profile.update_preferences()
    driver = None
    try:
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get(target_url)
        driver.refresh()
        time.sleep(dwell_seconds)
    finally:
        _quit_quietly(driver)


# For every XPath-based source, visit each kuaidaili URL: scrape the first
# listed proxy with Chrome, then open the target site in Firefox through it.
for i in parserList:
    if i['type'] != 'xpath':
        continue
    for url in i['urls']:
        if 'kuaidaili' not in url:
            continue
        try:
            ip, ipp = _scrape_first_proxy(url, i['pattern'])
            _visit_through_proxy(ip, ipp)
        except Exception as e:
            # NOTE(review): assumes a failure should stop processing the
            # remaining URLs of the current source, as the original `break`
            # statements suggest -- confirm.
            print(e)
            break