I. Crawling free proxy IPs
1. Crawling the proxy IPs:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Meng Zhaoce
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool  # thread-pool (multithreading) module
from pymongo import MongoClient

data = []

def getIp(page):
    url = 'https://www.xicidaili.com/nt/%d' % (page)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }  # spoofed request headers
    res = requests.get(url, headers=headers).text  # send the request
    soup = BeautifulSoup(res, 'lxml')
    for i in soup.find_all('tr'):
        try:
            # column 1 is the IP, column 2 is the port
            data.append({'ip': '%s:%s' % (i.find_all('td')[1].get_text(), i.find_all('td')[2].get_text()), 'verify': False})
        except IndexError:  # header rows have no <td> cells
            continue

pool = ThreadPool(10)
pool.map(getIp, [i for i in range(1, 101)])  # pages are numbered from 1
pool.close()
pool.join()
print(data)
print(len(data))

db = MongoClient('127.0.0.1', 27017).test
db.ippool.insert_many(data)
Knowledge points involved here: the request library (requests), a parsing library (BeautifulSoup), the multithreading module, and a non-relational database (MongoDB).
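To see what the non-relational storage step actually produced, the minimal sketch below inspects the records just written; it assumes the same local MongoDB instance and the test.ippool collection created above.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Minimal sketch: inspect the IPs just written to MongoDB.
# Assumes the local MongoDB instance and the test.ippool collection used above.
from pymongo import MongoClient

db = MongoClient('127.0.0.1', 27017).test
print(db.ippool.count_documents({'verify': False}))  # how many unverified IPs were stored
for doc in db.ippool.find({'verify': False}).limit(5):
    print(doc['ip'])  # printed as 'ip:port', the format written by the crawler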
II. Building a proxy IP pool
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Meng Zhaoce
import multiprocessing  # process-pool module for parallel verification
import time
import requests
from pymongo import MongoClient
import redis

db = MongoClient('127.0.0.1', 27017).test
url = 'http://www.baidu.com'
ippool = []
for i in db.ippool.find({'verify': False}):
    ippool.append(i['ip'])

start = time.time()

def verify(ip):
    proxies = {
        'http': 'http://%s' % (ip)
    }
    try:
        # request a known page through the proxy; a 200 response means the proxy works
        res = requests.get(url, proxies=proxies, timeout=2)
        print(res.status_code)
        if res.status_code == 200:
            db.ippool.insert_one({'ip': ip, 'verify': True})
            print('insert finished'.center(50, '*'))
    except Exception as e:
        print(e)

if __name__ == '__main__':  # required for multiprocessing on platforms that spawn new processes
    pool = multiprocessing.Pool(processes=10)
    pool.map(verify, ippool[:100])
    pool.close()
    pool.join()
    print(time.time() - start)
    print('finished')
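Once the pool contains verified records, a crawler can draw from it instead of hitting the target site directly. The sketch below is only a usage example under those assumptions: it reads the verify:True documents written above, picks one at random, and passes it to requests through the proxies parameter; the target URL is a placeholder.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Minimal usage sketch: pull a verified proxy from the pool and use it for a request.
# Assumes the test.ippool collection populated by the verifier above; the target URL is a placeholder.
import random
import requests
from pymongo import MongoClient

db = MongoClient('127.0.0.1', 27017).test
verified = [doc['ip'] for doc in db.ippool.find({'verify': True})]

if verified:
    ip = random.choice(verified)             # pick one working proxy at random
    proxies = {'http': 'http://%s' % ip}     # same proxy format as in the verifier
    res = requests.get('http://www.baidu.com', proxies=proxies, timeout=2)
    print(ip, res.status_code)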