1. This crawler reuses the Url_ProxyHelper class wrapped up earlier; its source code is as follows:
import urllib.request as ur

class Url_ProxyHelper:
    def __init__(self, url, proxy_add, savepath=None):
        self.url = url
        self.proxy_add = proxy_add
        self.req = None
        self.proxy = None
        self.opener = None
        self.info = None
        self.save_path = savepath

    # Configure the request headers and the proxy
    def set_UrlAndProxy(self):
        # Add a User-Agent header
        self.req = ur.Request(self.url)
        self.req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0')

        # Set up the proxy server
        self.proxy = ur.ProxyHandler({'http': self.proxy_add})
        self.opener = ur.build_opener(self.proxy, ur.HTTPHandler)
        ur.install_opener(self.opener)
        return self.req

    # Save the raw response to a file
    def save_InFile(self):
        self.req = self.set_UrlAndProxy()
        self.info = ur.urlopen(self.req).read()
        with open(self.save_path, 'wb') as f:
            f.write(self.info)

    # Return the decoded response text
    def feedbak_info(self):
        self.req = self.set_UrlAndProxy()
        self.info = ur.urlopen(self.req).read().decode('utf-8')  # decode() turns bytes into text, which matters for Chinese content
        return str(self.info)
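As a quick sanity check of the wrapper, a minimal sketch like the one below fetches a page through a proxy, prints the start of the HTML, and saves the raw bytes. The URL, proxy address, and file name here are placeholders I chose for illustration, not values from the crawler itself.

from Url_ProxyHelper import Url_ProxyHelper

# Hypothetical values for illustration only; substitute a working proxy of your own
test_url = "http://www.example.com/"
test_proxy = "127.0.0.1:8080"

uph = Url_ProxyHelper(test_url, test_proxy, savepath="test_page.html")
html = uph.feedbak_info()   # fetch and decode the page through the proxy
print(html[:200])           # print the first 200 characters as a sanity check
uph.save_InFile()           # or write the raw bytes to the save path instead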
2. The crawler source code:
import urllib.request as ur
import urllib.error as ue
from Url_ProxyHelper import Url_ProxyHelper
import re

# Target URL; quote() escapes the Chinese characters so they do not break URL parsing
# (the URL delimiters listed in safe are left unescaped so the query string stays intact)
url = ur.quote("https://s.taobao.com/list?q=平板电脑&q=平板电脑&s=", safe='/:?=&', encoding='utf-8')
# Directory where the downloaded images are stored
save_path = "E:/workspace/PyCharm/codeSpace/books/python_web_crawler_book/chapter6/demo2/images/"
# Proxy server IP
proxy_add = "218.73.139.196:808"

def craw(url, save_path, proxy_add, page):
    url = url + str((page - 1) * 48)
    # Fetch the page through the Url_ProxyHelper wrapper
    uph = Url_ProxyHelper(url, proxy_add)
    infos = uph.feedbak_info()
    # Build the regular expression: in practice, grab a concrete snippet from the page
    # first and then generalize it into a pattern
    pattern = r'"(pic_url)":"(.+?\.jpg)'
    infos = re.compile(pattern=pattern).findall(infos)
    x = 1
    for info in infos:
        image_name = save_path + str(page) + "_" + str(x) + ".jpg"
        image_url = "http:" + info[1]
        try:
            ur.urlretrieve(image_url, filename=image_name)
        except ue.HTTPError as e:
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, 'reason'):
                print(e.reason)
        except ue.URLError as e:
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, 'reason'):
                print(e.reason)
        x += 1

# Only the first page is crawled here
craw(url, save_path, proxy_add, 1)
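The call above only fetches the first page. Since the s parameter advances by 48 items per page, crawling more pages is just a matter of calling craw() in a loop. The sketch below does that for the first few pages; the page count and the delay between requests are arbitrary example values I picked, not part of the original code.

import time

# Minimal sketch: crawl the first few result pages in turn
for page in range(1, 4):
    try:
        craw(url, save_path, proxy_add, page)
    except Exception as e:
        # A free proxy is often unstable, so keep going if one page fails
        print("page", page, "failed:", e)
    time.sleep(2)   # pause between pages to avoid hammering the server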