1. 前期准备
1.1 开发工具
Python 3.6
PyCharm Pro 2017.3.2
文本编辑器(如记事本)
1.2 Python库
requests
re
urllib
re 和 urllib 属于 Python 标准库,无需安装;如果没有安装 requests,使用以下方法安装
pip install 需要安装的包名(例如: pip install requests)
2. 配置系统主题文件
我对照系统自带的主题文件写了一个主题文件,大家可以把下面的内容拷贝到文本编辑器中,另存为 *.theme 文件,我这里命名为 lamborghini.theme
; Copyright © Microsoft Corp.

[Theme]
; Windows 7 - IDS_THEME_DISPLAYNAME_AERO
; 个性化主题名称
DisplayName=兰博基尼
SetLogonBackground=0

; Computer - SHIDI_SERVER
[CLSID\{20D04FE0-3AEA-1069-A2D8-08002B30309D}\DefaultIcon]
DefaultValue=%SystemRoot%\System32\imageres.dll,-109

; UsersFiles - SHIDI_USERFILES
[CLSID\{59031A47-3F72-44A7-89C5-5595FE6B30EE}\DefaultIcon]
DefaultValue=%SystemRoot%\System32\imageres.dll,-123

; Network - SHIDI_MYNETWORK
[CLSID\{F02C1A0D-BE21-4350-88B0-7367FC96EF3C}\DefaultIcon]
DefaultValue=%SystemRoot%\System32\imageres.dll,-25

; Recycle Bin - SHIDI_RECYCLERFULL SHIDI_RECYCLER
[CLSID\{645FF040-5081-101B-9F08-00AA002F954E}\DefaultIcon]
Full=%SystemRoot%\System32\imageres.dll,-54
Empty=%SystemRoot%\System32\imageres.dll,-55

[Control Panel\Cursors]
AppStarting=%SystemRoot%\cursors\aero_working.ani
Arrow=%SystemRoot%\cursors\aero_arrow.cur
Crosshair=
Hand=%SystemRoot%\cursors\aero_link.cur
Help=%SystemRoot%\cursors\aero_helpsel.cur
IBeam=
No=%SystemRoot%\cursors\aero_unavail.cur
NWPen=%SystemRoot%\cursors\aero_pen.cur
SizeAll=%SystemRoot%\cursors\aero_move.cur
SizeNESW=%SystemRoot%\cursors\aero_nesw.cur
SizeNS=%SystemRoot%\cursors\aero_ns.cur
SizeNWSE=%SystemRoot%\cursors\aero_nwse.cur
SizeWE=%SystemRoot%\cursors\aero_ew.cur
UpArrow=%SystemRoot%\cursors\aero_up.cur
Wait=%SystemRoot%\cursors\aero_busy.ani
DefaultValue=Windows Aero
DefaultValue.MUI=@main.cpl,-1020

[Control Panel\Desktop]
; 初始化图片
Wallpaper=D:\Wallpaper\lamborghini\139_151202104128_86504.jpg
TileWallpaper=0
WallpaperStyle=10
Pattern=

[VisualStyles]
Path=%ResourceDir%\Themes\Aero\Aero.msstyles
ColorStyle=NormalColor
Size=NormalSize
ColorizationColor=0XA84F1B1B
Transparency=1

[boot]
SCRNSAVE.EXE=

[MasterThemeSelector]
MTSM=DABJDKT

[Sounds]
; IDS_SCHEME_DEFAULT
SchemeName=@%SystemRoot%\System32\mmres.dll,-800

[Slideshow]
; 动画时间
Interval=60000
Shuffle=0
; 图片路径
ImagesRootPath=D:\Wallpaper
; ----- 以下不要拷贝,用Python批量添加 -----
Item0Path=D:\Wallpaper\lamborghini\aventador_s-007.jpg
Item1Path=D:\Wallpaper\lamborghini\aventador_s-006.jpg
Item2Path=D:\Wallpaper\lamborghini\aventador_s-005.jpg
Item3Path=D:\Wallpaper\lamborghini\aventador_s-004.jpg
Item4Path=D:\Wallpaper\lamborghini\aventador_s-003.jpg
Item5Path=D:\Wallpaper\lamborghini\aventador_s-002.jpg
Item6Path=D:\Wallpaper\lamborghini\aventador_s-001.jpg
3.获取页面地址
3.1 获取需要爬取的网页地址
url:http://www.ivsky.com/search.php?q=%E5%85%B0%E5%8D%9A%E5%9F%BA%E5%B0%BC&PageNo=2
q 查询的数据
PageNo 页码
3.2 获取爬取页面分页图片地址
img_url:http://img.ivsky.com/img/bizhi/pic/201804/17/aventador_s-007.jpg
img_url:http://img.ivsky.com/img/bizhi/pre/201804/17/aventador_s-007.jpg
pic 原图
pre 缩略图
4.编写爬虫
import os
import re
import urllib.parse

import requests


class Ivsky_Spider:
    """Download wallpapers from ivsky.com search results.

    For every image found, the full-size file is saved to ``save_dir`` and an
    ``ItemNPath=<file>`` line is appended to the Windows theme file at
    ``theme_path`` so the theme's slideshow picks the image up.
    """

    # Raw strings: in a normal literal ``'C:\Users'`` is a ``\U`` unicode
    # escape and fails to compile.
    DEFAULT_SAVE_DIR = r'D:\Wallpaper\lamborghini'
    DEFAULT_THEME_PATH = (
        r'C:\Users\Administrator\AppData\Local\Microsoft\Windows\Themes'
        r'\lamborghini.theme'
    )
    # Seconds; without a timeout ``requests`` may block forever.
    REQUEST_TIMEOUT = 30

    # Compiled once instead of on every page / image.
    _PAGELIST_RE = re.compile(r'<div class="pagelist">.*?</div>', re.S)
    _IMG_LIST_RE = re.compile(
        r'<div class="left">.*?<ul class="pli">.*?</ul>', re.S)
    _IMG_LINK_RE = re.compile(r'<li>.*?<div.*?><a href="(.*?)".*?>', re.S)
    _IMG_SRC_RE = re.compile(r"</script><img.*?src='(.*?)'")

    def __init__(self, new_search_name, save_dir=None, theme_path=None):
        """Prepare the search URL, request headers and output locations.

        Args:
            new_search_name: Search keyword; it is percent-encoded into the
                query string.
            save_dir: Directory for downloaded images. Defaults to
                ``DEFAULT_SAVE_DIR``.
            theme_path: Theme file that receives the ``ItemNPath`` lines.
                Defaults to ``DEFAULT_THEME_PATH``.
        """
        # Search URL for the keyword (page number is appended per request).
        self.url_search = ('http://www.ivsky.com/search.php?q=%s'
                           % urllib.parse.quote(new_search_name))
        # Site root (scheme + host), used as Referer and to resolve the
        # relative detail-page links.
        self.url = re.findall(r'(http://.*?)/', self.url_search)[0]
        self.headers = {
            'User-Agent': 'Mozilla/5.0',  # present as a browser
            'Referer': self.url,
        }
        self.save_dir = self.DEFAULT_SAVE_DIR if save_dir is None else save_dir
        self.theme_path = (self.DEFAULT_THEME_PATH if theme_path is None
                           else theme_path)
        # Running slideshow index shared by all pages, so ItemNPath keys stay
        # unique instead of restarting at 0 on every results page.
        self._item_index = 0

    def Spider(self):
        """Walk the result pages in order and download every image.

        Stops when the current page number no longer appears in the page's
        pagination block, or when any error occurs (the error is printed).
        """
        page = 1
        while True:
            print('=' * 30 + '第%d页' % page + '=' * 30)
            try:
                html = self.Get_Html_Respone(
                    self.url_search + '&PageNo=' + str(page)).text
                page_temp = self._PAGELIST_RE.findall(html)[0]
                # NOTE(review): substring test on the raw pagination HTML;
                # a digit inside another number or a link also matches.
                if str(page) not in page_temp:
                    print('=' * 30 + '程序爬取完成' + '=' * 30)
                    return
                self.Get_Img_Download(page, html)
            except Exception as e:  # top-level boundary: report and stop
                print('报错信息:%s 程序退出' % e)
                return
            page += 1

    def Get_Html_Respone(self, new_url):
        """Send a GET request and return the ``requests`` response.

        The response encoding is forced to UTF-8, which affects ``.text``
        only; ``.content`` stays raw bytes.
        """
        respone = requests.get(url=new_url, headers=self.headers,
                               timeout=self.REQUEST_TIMEOUT)
        respone.encoding = 'utf-8'
        return respone

    def Get_Img_Download(self, page, new_respone):
        """Download all images listed on one results page.

        Args:
            page: 1-based page number, used for progress output only.
            new_respone: HTML text of the results page.

        Raises:
            IndexError: If the expected markup is not found in the page.
        """
        print('-' * 20 + '正在获取第%d页图片内容' % page + '-' * 20)
        img_url_temp = self._IMG_LIST_RE.findall(new_respone)[0]
        img_url_list = self._IMG_LINK_RE.findall(img_url_temp)
        os.makedirs(self.save_dir, exist_ok=True)
        for i, detail_path in enumerate(img_url_list):
            print('-' * 20 + '正在下载第%d页第%d张图片' % (page, i + 1)
                  + '-' * 20)
            detail_html = self.Get_Html_Respone(self.url + detail_path).text
            preview_url = self._IMG_SRC_RE.findall(detail_html)[0]
            # Swap only the ``/pre/`` (thumbnail) directory for ``/pic/``
            # (original); a bare 'pre' -> 'pic' replace would also corrupt
            # file names that happen to contain "pre".
            img_respone_url = preview_url.replace('/pre/', '/pic/', 1)
            img_f_name = img_respone_url[img_respone_url.rfind('/') + 1:]
            img_path = os.path.join(self.save_dir, img_f_name)
            # Download before opening the file so a failed request does not
            # leave an empty image behind.
            img_result = self.Get_Html_Respone(img_respone_url).content
            with open(img_path, 'wb') as f:
                f.write(img_result)
            with open(self.theme_path, 'a') as f:
                # Each slideshow entry must sit on its own line.
                f.write('\nItem%dPath=%s' % (self._item_index, img_path))
            self._item_index += 1


if __name__ == '__main__':
    search_name = '兰博基尼'
    a = Ivsky_Spider(search_name)
    a.Spider()