import html
import re
import time
import urllib.error
import urllib.parse
import urllib.request
def wei(url, duan, timeout=30):
    """Fetch *url* through the HTTP proxy *duan* and return the raw body.

    Args:
        url: Address to request.
        duan: Proxy given as ``"host:port"``; it is registered for the
            ``http`` scheme only.
        timeout: Seconds to wait on each blocking network operation before
            giving up, so an unresponsive proxy cannot stall the caller
            indefinitely.

    Returns:
        The response body as ``bytes``, or ``None`` when the request failed.
        On failure the error is printed and the function sleeps (10 s after
        a ``URLError``, 1 s after any other exception) before returning.
    """
    try:
        request = urllib.request.Request(url)
        request.add_header("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Mobile Safari/537.36")
        proxy_handler = urllib.request.ProxyHandler({'http': duan})
        opener = urllib.request.build_opener(proxy_handler, urllib.request.HTTPHandler)
        # Install the opener process-wide so that later plain
        # urllib.request.urlopen() calls also go through this proxy.
        urllib.request.install_opener(opener)
        # The context manager closes the connection even if read() fails.
        with opener.open(request, timeout=timeout) as response:
            body = response.read()
        print(len(body))
        return body
    except urllib.error.URLError as e:
        # HTTPError (a URLError subclass) carries a status code as well.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)
    except Exception as t:
        print(str(t))
        time.sleep(1)
    # Reached only after a handled failure.
    return None
# Proxy ("host:port") handed to wei() for every request.
duan = "121.231.226.12:6666"
# Free proxies are generally unreliable, so a crawl through one is often
# incomplete.
key = "Python"

# The encoded search term and the link pattern are the same for every page,
# so build them once instead of on each iteration.
key1 = urllib.request.quote(key)
# Captures the value of every double-quoted href attribute on an <a> tag.
link_pattern = re.compile(r'<a href="(.*?)"')

for i in range(0, 10):
    try:
        url = "http://weixin.sogou.com/weixin?query=" + key1 + "&_sug_type_=&sut=10977&lkt=7%2C1527054607490%2C1527054613464&s_from=input&_sug_=y&type=2&sst0=1527054613567&page=" + str(i + 1) + "&ie=utf8&w=01019900&dr=1"
        shi = wei(url, duan)
        print(shi)
        if shi is None:
            # wei() already reported the error; move on to the next page.
            print('此' + str(i) + '爬取未成功')
            continue
        # Decode the bytes rather than calling str() on them, which would
        # yield the escaped "b'...'" repr instead of the page text.
        page_text = shi.decode("utf-8", errors="replace")
        links = link_pattern.findall(page_text)
        if not links:
            print('此' + str(i) + '爬取未成功')
            continue
        for j, raw_link in enumerate(links):
            # Undo HTML entity escaping (e.g. "&amp;" -> "&") and resolve
            # relative links against the search page address.
            rom = urllib.parse.urljoin(url, html.unescape(raw_link))
            if not rom.startswith(("http://", "https://")):
                # Skip anchors such as "javascript:" that cannot be fetched;
                # each failed fetch in wei() costs a long sleep.
                continue
            # Include the page index so results from different search pages
            # do not overwrite each other.
            ong = "D:/html/" + str(i) + "_" + str(j) + ".txt"
            article = wei(rom, duan)
            if article is None:
                continue
            try:
                # wei() returns bytes, so the file must be opened in binary mode.
                with open(ong, "wb") as ce:
                    ce.write(article)
            except OSError as e:
                print(str(e))
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    except Exception as t:
        # Top-level boundary: report the problem and continue with the next page.
        print(str(t))