Python四线程爬取西刺代理


      1 import requests
      2 from bs4 import BeautifulSoup 
      3 import lxml
      4 import telnetlib #验证代理的可用性
      5 import pymysql.cursors
      6 import random
      7 import threading
      8 
      9 
     10 
     11 
     12 BASEURL = 'http://www.xicidaili.com/' #西刺首页
     13 urls = [BASEURL+ 'nn/',BASEURL+'nt/',BASEURL+'wn/',BASEURL+'wt/']#西刺分组(more)的ip信息链接列表
     14 
     15 #请求头信息,必须有User-Agent
     16 headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
     17 
     18 #proxies = {'https': 'http://123.57.85.224:80', 'http': 'http://123.57.85.224:80'}
     19 
     20 #获得与数据库的连接和游标
     21 def get_cc():
     22     # 连接MySQL数据库
     23     connection = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root', db='iptables', 
     24                                  charset='utf8', cursorclass=pymysql.cursors.DictCursor)
     25     # 通过cursor创建游标
     26     cursor = connection.cursor()
     27     return connection,cursor
     28 
     29 
     30 
     31 #保存ip_port到数据库
     32 def save_ip_port(ip_port):
     33     connection,cursor = get_cc()
     34     try:
     35         sql = 'insert into iptable(ip_port) values("'+ip_port+'")'
     36         cursor.execute(sql)
     37     except:
     38         print('保存'+ip_port+'失败!!!!!')
     39     else:
     40         connection.commit()
     41         connection.close()
     42 
     43 
     44 
     45 #从数据库获得ip_port
     46 def get_ip_port():
     47     connection,cursor = get_cc()
     48     sql_get_id = 'select id,ip_port from iptable'
     49     cursor.execute(sql_get_id)
     50     #fetchone()是查询一条数据
     51     id_list = cursor.fetchall()#得到所有的id的字典列表
     52     i = random.randint(0,len(id_list)-1)
     53     id_num = id_list[i]['id']
     54     ip_port = id_list[i]['ip_port'] #获得所有可用的代理
     55     
     56     return id_num,ip_port#返回id和ip_port:192.168.1.2:8080
     57 
     58 #删除被封的ip_port
     59 def del_ip_port(id_num):
     60     connection,cursor = get_cc()
     61     try:
     62         sql = 'delete from iptable where id = ' + str(id_num)
     63         cursor.execute(sql)
     64     except:
     65         print('删除'+ip_port+'失败!!!!!')
     66     else:
     67         connection.commit()
     68         connection.close()
     69 
     70 
     71 
     72 #获得代理
     73 def get_proxies(ip_port):#ip_port = '192.168.2.45:8088'
     74     proxy_ip = 'http://' + ip_port
     75     proxy_ips = 'https://' + ip_port
     76     proxies = {'https': proxy_ips, 'http': proxy_ip}
     77     return proxies
     78 
     79 
     80 #获得对应url分类的最大页码
     81 def get_max_pagenum(url): #url是more(分类)的链接,/nn,/nt....
     82     
     83     response = requests.get(url,headers = headers)
     84     status_code = response.status_code
     85     soup = BeautifulSoup(response.content,'lxml')
     86     max_pagenum = soup.find('div',attrs = {'class':'pagination'}).find_all('a')[-2].string
     87     max_pagenum = int(max_pagenum)
     88     return max_pagenum
     89 
     90 #验证代理是否有用,ip_port = '192.168.2.45:8088'
     91 #每得到一个ip_port都要进行验证,如果可用则保存,否则抛弃
     92 def verifyProxyList(ip_port):
     93     url = 'http://www.baidu.com'
     94     # proxies = { "http": "http://"+ ip_port }
     95     host ,port = ip_port.split(':')
     96     try:
     97         # res = requests.get(url,headers = headers,proxies = proxies,timeout = 5.0)
     98         telnetlib.Telnet(host, port=port, timeout=5)
     99     except:
    100         print('---Failur:' + ip_port)
    101     else:
    102         #ips.append(ip_port)#这里应该存储到Redis等数据库中
    103         save_ip_port(ip_port)
    104         
    105         
    106 
    107 def main(url,proxies):#这里是more的链接,/nn/1,/nn/2....
    108     
    109     try:
    110         response = requests.get(url,headers = headers,proxies = proxies,timeout = 5.0)
    111         status_code = response.status_code #503说明ip被封
    112 
    113         if(status_code != requests.codes.ok):#响应的不是正常状态
    114             #删除旧的代理ip_port,这里还需要验证是否有bug
    115             old_ip_port = proxies['http'][7:]
    116             del_ip_port(old_ip_port)
    117             #修改代理,重新请求
    118             id_num,ip_port = get_ip_port()
    119             proxies = get_proxies(ip_port)
    120             print(str(proxies))
    121             return
    122                       
    123         soup = BeautifulSoup(response.content,'lxml')
    124             
    125         results = soup.find_all('tr')#遍历所有的tr
    126          
    127         for result in results[1:]:#这里第一个tr子标签是th,所以会报错
    128             tdlist = result.find_all('td')
    129             ip_port = tdlist[1].string+':'+tdlist[2].string
    130             verifyProxyList(ip_port)
    131     except:
    132        print('请求异常......')
    133 
    134 class myThread(threading.Thread):
    135     def __init__(self, threadID, name, url):
    136         threading.Thread.__init__(self)
    137         self.threadID = threadID
    138         self.name = name
    139         self.url = url
    140     
    141     
    142     
    143     def run(self):
    144         print('正在执行线程:'+self.name)#没有验证这一行的可行性
    145         id_num,ip_port = get_ip_port()
    146         proxies = get_proxies(ip_port)
    147         max_pagenum = get_max_pagenum(self.url)
    148         #print(max_pagenum)
    149         for i in range(1,max_pagenum):
    150             url = self.url + '/' + str(i)
    151             main(url,proxies)
    152         
    153 #4线程爬取西刺的ip代理池
    154 if __name__ == '__main__':
    155    
    156     t1 = myThread(1,"Thread-1",urls[0])
    157     t2 = myThread(2,"Thread-2",urls[1])
    158     t3 = myThread(3,"Thread-3",urls[2])
    159     t4 = myThread(4,"Thread-4",urls[3])
    160     t1.start()
    161     t2.start()
    162     t3.start()
    163     t4.start()
    164     t1.join()
    165     t2.join()
    166     t3.join()
    167     t4.join()
    168     
    169    
  • 相关阅读:
    MVC 自定义异常过滤特性
    写一个左中右布局占满屏幕,其中左右两块是固定宽度200,中间自适应宽度, 要求先加载中间块,请写出结构及样式。
    请写出jQuery绑定事件的方法,不少于两种
    用js写个原生的ajax过程
    link和@import的区别
    attribute和property的区别是什么?
    请简要描述margin重复问题,及解决方式
    display:none;与visibility:hidden;的区别
    web标准以及w3c标准
    css优先级
  • 原文地址:https://www.cnblogs.com/suhfj-825/p/9341342.html
Copyright © 2020-2023  润新知