• IP+IDC-chinaz抓取


    #-*-coding:gbk-*-
    #code by anyun.org
    import urllib
    import re
    import time
    
    
    def getHtml(url):
        """Download *url* and return the body with layout whitespace stripped.

        The page is fetched with ``urllib.urlopen`` (Python 2) and the raw
        response is returned as a string after removing line breaks and runs
        of padding spaces, so that the single-line regexes used elsewhere in
        this file can match elements that were spread over several lines.

        Raises whatever ``urllib.urlopen`` / ``read`` raise on network failure
        (``IOError`` in Python 2).
        """
        page = urllib.urlopen(url)
        try:
            html = page.read()
        finally:
            # Always release the connection, even if read() fails midway.
            page.close()

        # NOTE(review): in the published listing this literal was broken
        # across two lines, i.e. an escaped newline had been rendered
        # literally; '\n' is the reconstruction -- confirm it was not '\r\n'.
        html = html.replace('\n', '')
        # NOTE(review): the space runs below are reproduced exactly as
        # published; they may originally have been tab escapes -- confirm.
        html = html.replace('       ', ' ')
        # Applied twice on purpose: removing one run of three spaces can join
        # its neighbours into a new run of three.
        html = html.replace('   ', '')
        html = html.replace('   ', '')
        # Single spaces are deliberately kept: the class attributes matched by
        # the extraction regexes (e.g. "Whwtdhalf w15-0") contain one.
        return html
    
    
    def getcontext(html):
        """Return the captured content of each ``Whwtdhalf w15-0`` span in *html*.

        Matching is non-greedy and does not cross line breaks; the result is a
        list of strings in document order (empty when nothing matches).
        """
        span_pattern = re.compile(r'<span class="Whwtdhalf w15-0">(.*?)</span>')
        return span_pattern.findall(html)
    
    def getadd(html):
        """Return the captured content of each ``Whwtdhalf w50-0`` span in *html*.

        Matching is non-greedy and does not cross line breaks; the result is a
        list of strings in document order (empty when nothing matches).
        """
        span_pattern = re.compile(r'<span class="Whwtdhalf w50-0">(.*?)</span>')
        return span_pattern.findall(html)
    
    def geterr(html):
        """Return the captured content of each ``col-red lh30 fz14 jspu`` div in *html*.

        Matching is non-greedy and does not cross line breaks. The main script
        treats an empty result as "no error block on the page".
        """
        error_pattern = re.compile(r'<div class="col-red lh30 fz14 jspu">(.*?)</div>')
        return error_pattern.findall(html)
    
    if __name__ == '__main__':
        # Batch lookup: for every host listed in list.txt (one per line), query
        # ip.chinaz.com, print the extracted fields and append them to ok.txt;
        # hosts whose page shows an error block (or lacks the expected fields)
        # are appended to err.txt instead.

        # Read all targets first so list.txt is closed before the slow loop.
        with open('list.txt', 'r') as f:
            targets = [line.strip() for line in f]

        for i in targets:
            if not i:
                # Blank lines would otherwise trigger a pointless lookup.
                continue

            Url = 'http://ip.chinaz.com/?ip=http://' + i
            try:
                # The network fetch is the step that can actually fail;
                # urllib.urlopen reports failures as IOError in Python 2.
                Html = getHtml(Url)
            except IOError:
                print('error')
                continue

            # Parse the page once instead of re-running the regex per field.
            context = getcontext(Html)
            add = getadd(Html)

            # Require every field indexed below so a changed page layout is
            # reported as a parse failure instead of raising IndexError.
            if not geterr(Html) and len(context) >= 6 and len(add) >= 2:
                # Field order pairs context[n] with context[n + 3].
                line = ' '.join([context[0], context[3],
                                 context[1], context[4],
                                 context[2], context[5],
                                 add[0], add[1]])
                print(line)
                with open('ok.txt', 'a') as f1:
                    f1.write(line + '\n')
            else:
                line = i + ' ' + '解析失败'
                print(line)
                with open('err.txt', 'a') as f2:
                    f2.write(line + '\n')

            # Throttle requests to the lookup service.
            time.sleep(0.5)
        print('over')
    

      

  • 相关阅读:
    input 正则
    .net ashx Session 未将对象引用到实例
    js 时间和时间对比
    c# Repeater 和 AspNetPager
    c#后台 极光推送到Android 和IOS客户端
    select scope_identity()
    redhat7.4安装git(按照官网从源码安装)
    redhat7.4安装gitlab
    ES6模板字符串
    初次接触webpack
  • 原文地址:https://www.cnblogs.com/crac/p/5778741.html
Copyright © 2020-2023  润新知