• 代理 IP


    # coding:utf-8
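    '''
    Rule definitions. urls: list of URLs to crawl.
    type: parsing method; one of regular (regular expression), xpath (XPath parsing), module (custom third-party parser module).
    pattern: either a regular expression or an XPath expression; it must correspond to the type given above.
    '''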
    import os
    import random

    '''
    ip, port, type (0 = high anonymity, 1 = transparent), protocol (0 = http, 1 = https), country, area (province/city), updatetime (time of last update),
    speed (connection speed)
    '''
    # Scraping rules, one dict per proxy-listing site.
    #   'urls'       - pages to fetch.
    #   'type'       - 'xpath' (rows are selected with an XPath expression) or
    #                  'module' (parsing is delegated to the parser named in
    #                  'moduleName').
    #   'pattern'    - the XPath expression or regular expression for 'type'.
    #   'position'   - where each field lives inside a matched row: a relative
    #                  XPath for 'xpath' rules, a capture-group index for regex
    #                  rules.  An empty string or -1 marks a field the site does
    #                  not expose (presumably skipped by the consumer - verify
    #                  against the parser code).
    parserList = [
        {
            'urls': ['http://www.66ip.cn/%s.html' % n for n in ['index'] + list(range(2, 12))],
            'type': 'xpath',
            'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]",
            'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
        },
        {
            'urls': ['http://www.66ip.cn/areaindex_%s/%s.html' % (m, n) for m in range(1, 35) for n in range(1, 10)],
            'type': 'xpath',
            'pattern': ".//*[@id='footer']/div/table/tr[position()>1]",
            'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
        },
        {
            'urls': ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'],
            'type': 'xpath',
            'pattern': ".//table[@class='sortable']/tbody/tr",
            'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
        },
        {
            'urls': ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)],
            'type': 'xpath',
            'pattern': ".//table[@class='list']/tr",
            'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
        },
        {
            'urls': ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)],
            'type': 'module',
            'moduleName': 'proxy_listPraser',
            'pattern': 'Proxy(.+)',
            'position': {'ip': 0, 'port': -1, 'type': -1, 'protocol': 2}
        },
        {
            'urls': ['http://incloak.com/proxy-list/%s#list' % n for n in
                     ([''] + ['?start=%s' % (64 * m) for m in range(1, 10)])],
            'type': 'xpath',
            'pattern': ".//table[@class='proxy__t']/tbody/tr",
            'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
        },
        {
            'urls': ['http://www.kuaidaili.com/proxylist/%s/' % n for n in range(1, 11)],
            'type': 'xpath',
            'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]",
            'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
        },
        {
            'urls': ['http://www.kuaidaili.com/free/%s/%s/' % (m, n) for m in ['inha', 'intr', 'outha', 'outtr'] for n in
                     range(1, 11)],
            'type': 'xpath',
            'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]",
            'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
        },
        {
            'urls': ['http://www.cz88.net/proxy/%s' % m for m in
                     ['index.shtml'] + ['http_%s.shtml' % n for n in range(2, 11)]],
            'type': 'xpath',
            'pattern': ".//*[@id='boxright']/div/ul/li[position()>1]",
            'position': {'ip': './div[1]', 'port': './div[2]', 'type': './div[3]', 'protocol': ''}
        },
        {
            'urls': ['http://www.ip181.com/daili/%s.html' % n for n in range(1, 11)],
            'type': 'xpath',
            'pattern': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]",
            'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
        },
        {
            'urls': ['http://www.xicidaili.com/%s/%s' % (m, n) for m in ['nn', 'nt', 'wn', 'wt'] for n in range(1, 8)],
            'type': 'xpath',
            'pattern': ".//*[@id='ip_list']/tr[position()>1]",
            'position': {'ip': './td[2]', 'port': './td[3]', 'type': './td[5]', 'protocol': './td[6]'}
        },
        {
            'urls': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)],
            'type': 'module',
            'moduleName': 'CnproxyPraser',
            # The regex metacharacters must be escaped: \d and \s are character
            # classes, \. is a literal dot, and the parentheses of
            # document.write(...) are literal.  That leaves exactly three capture
            # groups (ip, obfuscated port expression, protocol), matching the
            # group indices 0, 1 and 2 used in 'position' below.
            'pattern': r'<tr><td>(\d+\.\d+\.\d+\.\d+)<SCRIPT type=text/javascript>document.write\(":"(.+)\)</SCRIPT></td><td>(HTTP|SOCKS4)\s*',
            'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2}
        }
    ]

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time, random
    from selenium.webdriver.common.keys import Keys
    import logging
    import os


    def gen_a_broswer(type='pc'):
        """Start and return a Selenium Chrome WebDriver.

        Args:
            type: ``'pc'`` for a plain Chrome session, or ``'mb'`` for Chrome
                with mobile emulation (360x640 viewport, pixel ratio 3.0 and an
                Android Nexus 5 user agent).  The name shadows the builtin
                ``type`` but is kept so keyword callers continue to work.

        Returns:
            The newly created ``webdriver.Chrome`` instance.  The caller owns
            it and must call ``quit()`` when done.

        Raises:
            ValueError: if *type* is neither ``'pc'`` nor ``'mb'``.
        """
        # Validate first: previously an unknown value fell through both
        # branches and failed with UnboundLocalError on the return statement.
        if type not in ('pc', 'mb'):
            raise ValueError("unsupported browser type %r; expected 'pc' or 'mb'" % (type,))

        if type == 'pc':
            return webdriver.Chrome()

        mobile_emulation = {
            "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
            "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
        chrome_options = Options()
        chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
        # NOTE(review): `chrome_options=` is the Selenium 3 spelling, kept to
        # match the Selenium 3 calls used elsewhere in this script; newer
        # Selenium releases expect `options=` instead.
        return webdriver.Chrome(chrome_options=chrome_options)


    from selenium import webdriver

    def _quit_quietly(driver):
        """Quit *driver*; if shutting it down fails, print the error instead of raising."""
        try:
            driver.quit()
        except Exception as e:
            print('break--->', e)


    # For every kuaidaili listing page: read the first proxy row with Chrome,
    # then open a fixed page in Firefox routed through that proxy.
    # Any failure abandons the remaining URLs of the current rule (break).
    for i in parserList:
        if i['type'] != 'xpath':
            continue
        pa = i['pattern']
        for url in i['urls']:
            # Only the kuaidaili listings are processed by this script.
            if 'kuaidaili' not in url:
                continue

            # Step 1: scrape "<ip> <port> ..." from the first matching row.
            browser = None
            try:
                browser = gen_a_broswer()
                browser.get(url)
                res = browser.find_element_by_xpath(pa)
                print(res.text)
                fields = res.text.split(' ')
                ip, ipp = fields[0], fields[1]
            except Exception as e:
                print(e)
                break
            finally:
                # Runs on success and on failure (before the break takes
                # effect), and only if a browser was actually started, so a
                # stale or unbound driver is never touched.
                if browser is not None:
                    _quit_quietly(browser)

            # Step 2: browse the target page through the scraped HTTP proxy.
            driver = None
            try:
                profile = webdriver.FirefoxProfile()
                profile.set_preference('network.proxy.type', 1)
                profile.set_preference('network.proxy.http', ip)
                # The port preference must be an int; a non-numeric port
                # now ends this rule instead of crashing the script.
                profile.set_preference('network.proxy.http_port', int(ipp))
                profile.update_preferences()
                driver = webdriver.Firefox(firefox_profile=profile)
                myurl = 'http://www.mys.cn'
                driver.get(myurl)
                driver.refresh()
                time.sleep(120)
            except Exception as e:
                print(e)
                break
            finally:
                # Close Firefox on the success path too; previously every
                # successful visit left a browser process running.
                if driver is not None:
                    _quit_quietly(driver)
  • 相关阅读:
    【学习总结】Git学习-GIT工作流-千峰教育(来自B站)
    【学习总结】Git学习-上传本地已有代码到GitHub
    文件读写及字符串与整数的相互转换
    “《编程珠玑》(第2版)第2章”:A题(二分搜索)
    Visual studio2010和Modelsim配置SystemC开发(转)
    C/C++中如何产生伪随机数
    “《编程珠玑》(第2版)第1章”:课后习题
    “《编程珠玑》(第2版)第1章”:查找一个数列中缺失的一个整数
    “《编程珠玑》(第2版)第1章”:整数排序
    判断质数的几种方法
  • 原文地址:https://www.cnblogs.com/rsapaper/p/8332683.html
Copyright © 2020-2023  润新知