• 实战1:建立代理IP池


    一、爬取免费代理IP

    1、爬取代理IP:

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # Author:Meng Zhaoce
    import requests
    from bs4 import BeautifulSoup
    from multiprocessing.dummy import Pool as ThreadPool #多线程模块
    from pymongo import MongoClient
    # Shared accumulator: every getIp() call appends its scraped
    # {'ip': ..., 'verify': False} records here; the list is bulk-inserted
    # into MongoDB once all pages have been processed.
    data = []
    
    def getIp(page):
        """Scrape one proxy listing page and collect its entries into ``data``.

        Downloads ``https://www.xicidaili.com/nt/<page>``, walks every ``<tr>``
        of the returned HTML and appends one record per usable row to the
        module-level ``data`` list, in the form
        ``{'ip': '<td[1] text>:<td[2] text>', 'verify': False}``
        (presumably the second and third cells hold the IP address and port).

        A page that cannot be downloaded is reported and skipped, so a single
        bad page does not abort the whole ``pool.map`` run.

        :param page: integer page number substituted into the listing URL.
        :return: None; results are appended to ``data``.
        """
        url = 'https://www.xicidaili.com/nt/%d'%(page)
        # Browser-like User-Agent so the request is not sent with the default
        # python-requests identifier.
        headers = {
            'User-Agent' :'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
        }
        try:
            # Without a timeout a stalled connection would block this worker
            # thread forever.
            res = requests.get(url, headers=headers, timeout=10).text
        except requests.RequestException as e:
            print('page %d failed: %s' % (page, e))
            return
        soup = BeautifulSoup(res, 'lxml')
        for row in soup.find_all('tr'):
            cells = row.find_all('td')
            # Rows with fewer than three <td> cells (e.g. a header row) carry
            # no proxy entry; skip them explicitly instead of relying on a
            # bare except around the indexing.
            if len(cells) < 3:
                continue
            data.append({'ip':'%s:%s'%(cells[1].get_text(),cells[2].get_text()),'verify':False})
    
    # Fetch the listing pages concurrently with 10 worker threads; the work is
    # network-bound, so threads overlap the waiting time.
    # NOTE(review): range(100) starts at page 0 - confirm the site serves
    # /nt/0, otherwise start the range at 1.
    pool = ThreadPool(10)
    pool.map(getIp, range(100))
    pool.close()
    pool.join()
    print(data)
    print(len(data))

    # Store everything that was scraped in the "ippool" collection of the
    # local "test" database.
    db = MongoClient('127.0.0.1',27017).test
    if data:
        db.ippool.insert_many(data)
    else:
        # insert_many() rejects an empty document list, so skip the write
        # when no proxy could be scraped.
        print('no proxies scraped, nothing inserted')

    此处涉及知识点:请求库、解析库、多线程模块、非关系型数据库

     二、建立代理IP池

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # Author:Meng Zhaoce
    import multiprocessing  # multi-process module (process pool, not threads)
    import time
    import requests
    from pymongo import MongoClient
    import redis
    # Use the "test" database: that is where the scraping script stores the
    # "ippool" collection. The previous name ("text") pointed at a different,
    # empty database, so no proxy was ever loaded.
    db = MongoClient('127.0.0.1',27017).test
    # Page requested through each proxy to find out whether the proxy works.
    url = 'http://www.baidu.com'
    # Addresses of all proxies that have not been verified yet.
    ippool = [doc['ip'] for doc in db.ippool.find({'verify':False})]
    # Start of the run; the elapsed time is printed once verification ends.
    start = time.time()
    def verify(ip):
        """Test one proxy and record it in MongoDB when it works.

        Requests the module-level ``url`` through the proxy with a 2-second
        timeout. If the response status is 200, a document
        ``{'ip': ip, 'verify': True}`` is inserted into the ``ippool``
        collection. Any exception (timeout, refused connection, database
        error, ...) is printed rather than raised, so one dead proxy does not
        abort the whole pool run.

        :param ip: proxy address, formatted into ``'http://<ip>'``
            (presumably ``'host:port'`` as stored by the scraper).
        :return: None.
        """
        proxies = {
            'http':'http://%s'%(ip)
        }
        try:
            res = requests.get(url,proxies=proxies,timeout=2)
            print(res.status_code)
            if res.status_code == 200:
                # insert_one() replaces the deprecated Collection.insert(),
                # which no longer exists in PyMongo 4.
                db.ippool.insert_one({'ip':ip,'verify':True})
                print('insert finished'.center(50,'*'))
        except Exception as e:
            print(e)
    
    # The guard is required for multiprocessing: with the "spawn" start method
    # (Windows, macOS) every worker re-imports this module, and an unguarded
    # Pool() would then be created again inside each worker.
    if __name__ == '__main__':
        pool = multiprocessing.Pool(processes=10)
        # Only the first 100 unverified proxies are checked per run.
        pool.map(verify,ippool[:100])
        # Release the worker processes once all checks have returned.
        pool.close()
        pool.join()
        print(time.time()-start)
        print('finshed')
  • 相关阅读:
    管理心理学[9095]
    汽车文化[1196]
    小四轴——空心杯电机引起的电源干扰
    38 时序电路扩展2
    37 时序电路扩展1
    36 时序电路的动态特性分析2
    35 时序电路的动态特性分析1
    34 同步时序电路的设计方法2
    33 同步时序电路的设计方法1
    60. 第k个排列
  • 原文地址:https://www.cnblogs.com/1218-mzc/p/11780484.html
Copyright © 2020-2023  润新知