Goal of this example: crawl 300 journal article pages from CNKI Space (知网空间).
The reference links at the end cover the underlying principles and a detailed tutorial.
1. Scheduler: drives the whole crawl flow
spider_main.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'spider_main - scheduler'
from gevent import monkey; monkey.patch_all()  # patch first, so the sockets used by urllib2 become cooperative
import gevent

import url_manager
import html_downloader
import html_parser
import html_outputer


class SpiderMain(object):
    """Scheduler: wires the four components together and drives the crawl."""
    def __init__(self):
        super(SpiderMain, self).__init__()
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
        self.count = 1  # number of pages crawled so far

    def gevent_01(self):
        sum_ge = 1
        temp = []
        while sum_ge <= 50:  # spawn 50 coroutines to crawl concurrently (very handy)
            temp.append(gevent.spawn(self.craw, sum_ge))
            sum_ge = sum_ge + 1
        gevent.joinall(temp)

    def craw(self, n):
        while True:
            if self.count > 300:
                return  # stop after 300 pages
            if self.urls.has_new_url():
                try:
                    new_url = self.urls.get_new_url()
                    html_cont = self.downloader.downloader(new_url)
                    new_urls, new_data = self.parser.parser(new_url, html_cont)
                    self.urls.add_new_urls(new_urls)
                    if new_data is None:
                        continue  # nothing worth collecting on this page, move on to the next one
                    self.outputer.collect_data(new_data)
                    print 'coroutine %d' % n
                    print 'craw %d : %s' % (self.count, new_url)
                    self.count = self.count + 1  # one more page crawled
                except Exception:
                    print 'craw failed'
            else:
                gevent.sleep(0)  # no new URL to crawl yet, yield control to another coroutine


if __name__ == '__main__':
    root_url = 'http://www.cnki.com.cn/index.htm'
    obj_spider = SpiderMain()
    obj_spider.urls.add_new_url(root_url)
    obj_spider.gevent_01()
    obj_spider.outputer.output_html()
2. URL manager (a database, a Redis cache, or in-memory sets): maintains the set of URLs still to be crawled and the set of URLs already crawled, to prevent duplicate and circular crawling
url_manager.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'url_manager - URL manager'


class UrlManager(object):
    """Keeps a set of URLs waiting to be crawled and a set of URLs already crawled."""
    def __init__(self):
        super(UrlManager, self).__init__()
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def has_new_url(self):
        return len(self.new_urls) != 0
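The heading above notes that the two URL sets could also live in a database or a Redis cache instead of in memory. Below is a minimal sketch of what a Redis-backed variant with the same interface might look like; it assumes a local Redis server and the third-party redis-py package, and the class name RedisUrlManager and the key names are made up for illustration (this project only uses the in-memory version).

# Hypothetical Redis-backed URL manager, same interface as UrlManager; not part of this project.
# Assumes a Redis server on localhost:6379 and the redis-py package.
import redis


class RedisUrlManager(object):
    def __init__(self):
        self.r = redis.StrictRedis(host='localhost', port=6379, db=0)

    def add_new_url(self, url):
        if url is None:
            return
        # only queue a URL that is neither waiting nor already crawled
        if not self.r.sismember('new_urls', url) and not self.r.sismember('old_urls', url):
            self.r.sadd('new_urls', url)

    def add_new_urls(self, urls):
        if urls is None:
            return
        for url in urls:
            self.add_new_url(url)

    def get_new_url(self):
        url = self.r.spop('new_urls')  # pop an arbitrary waiting URL
        self.r.sadd('old_urls', url)
        return url

    def has_new_url(self):
        return self.r.scard('new_urls') != 0

One practical benefit of this variant is that the crawl queue survives a restart and can be shared by several crawler processes, which the in-memory sets cannot do.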
3. Page downloader (the standard-library urllib2, or the third-party requests): a tool that fetches the web page behind a URL and brings it to the local machine
html_downloader.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'html_downloader - page downloader'
import urllib2


class HtmlDownloader(object):
    """Downloads the page behind a URL and returns its raw HTML."""
    def __init__(self):
        super(HtmlDownloader, self).__init__()

    def downloader(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)  # the simplest way to fetch a page (see the reference links)
        if response.getcode() != 200:
            return None
        return response.read()
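The heading also mentions the third-party requests library as an alternative to urllib2. A rough, hypothetical equivalent of HtmlDownloader built on requests might look like the sketch below (requests is not used in this project):

# Hypothetical downloader using the third-party requests library instead of urllib2;
# shown only as the alternative mentioned above.
import requests


class RequestsDownloader(object):
    def downloader(self, url):
        if url is None:
            return None
        resp = requests.get(url, timeout=10)  # a timeout keeps a stuck request from blocking a coroutine forever
        if resp.status_code != 200:
            return None
        return resp.content  # raw bytes, same as urllib2's response.read()

Because the scheduler calls monkey.patch_all(), a requests-based downloader would also yield during network I/O just like the urllib2 one.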
4. Page parser (regular expressions, lxml, the built-in html.parser, or the third-party BeautifulSoup): a tool that extracts the valuable data from a page
html_parser.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'html_parser - page parser'
from bs4 import BeautifulSoup
import re
import urlparse


class HtmlParser(object):
    """Extracts follow-up URLs and article data from a downloaded page."""
    def __init__(self):
        super(HtmlParser, self).__init__()

    def __get_new_urls__(self, page_url, soup):
        new_urls = set()
        links_01 = soup.find_all('a', href=re.compile(r'/Journal/'))  # links to journal pages
        for link in links_01:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)  # turn a relative href into an absolute URL
            new_urls.add(new_full_url)
        links = soup.find_all('a', class_='zt_name', href=re.compile(r'/Article/'))  # links to article pages
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def __get_new_data__(self, page_url, soup):
        res_data = {}
        title_node = soup.find('h1', class_='xx_title')
        if title_node is None:
            return  # not an article page, nothing to collect
        res_data['title'] = title_node.get_text()
        summary_node = soup.find('div', class_='xx_font')
        if summary_node is None:
            return
        res_data['summary'] = summary_node.get_text()
        res_data['url'] = page_url
        return res_data

    def parser(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self.__get_new_urls__(page_url, soup)
        new_data = self.__get_new_data__(page_url, soup)
        return new_urls, new_data
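The heading lists regular expressions, lxml and the built-in html.parser as alternatives to BeautifulSoup. For comparison only, extracting the same journal and article links with lxml's XPath support could look roughly like the sketch below; the function name is made up and lxml is not used in this project.

# Hypothetical lxml-based version of the link extraction, shown only for comparison;
# this project uses BeautifulSoup.
from lxml import html as lxml_html
import urlparse


def get_new_urls_lxml(page_url, html_cont):
    tree = lxml_html.fromstring(html_cont)
    hrefs = tree.xpath('//a[contains(@href, "/Journal/")]/@href')  # journal links
    hrefs += tree.xpath('//a[@class="zt_name"][contains(@href, "/Article/")]/@href')  # article links
    return set(urlparse.urljoin(page_url, href) for href in hrefs)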
5. Outputer: writes out the collected data
html_outputer.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'html_outputer - outputer'


class HtmlOutputer(object):
    """Collects the parsed records and writes them into an HTML table."""
    def __init__(self):
        super(HtmlOutputer, self).__init__()
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write('<html>')
        fout.write('<head><meta charset="utf-8"></head>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
            fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
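The outputer can be exercised on its own with a made-up record that has the same keys the parser produces (url / title / summary); the values below are sample data, not real crawl output.

# Quick standalone check of the outputer with one fabricated record.
from html_outputer import HtmlOutputer

outputer = HtmlOutputer()
outputer.collect_data({'url': u'http://www.cnki.com.cn/example',
                       'title': u'sample title',
                       'summary': u'sample summary'})
outputer.output_html()  # writes output.html containing a one-row table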
6. Issues
Running multiple coroutines concurrently hides the time spent waiting on network I/O while the downloader fetches a page. The trade-off is that there is no way to know which coroutine will finish first, so the pages are crawled in no particular order; for this crawler that hardly matters.
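This scheduling behaviour is easy to reproduce with a tiny standalone script, unrelated to the crawler itself: the coroutines are spawned in order 1 to 5 but finish in whatever order their simulated I/O completes.

# Standalone demonstration that spawn order does not determine completion order.
import random
import gevent


def worker(n):
    gevent.sleep(random.random())  # simulate waiting on network I/O
    print 'coroutine %d finished' % n


gevent.joinall([gevent.spawn(worker, n) for n in range(1, 6)])
# The five lines come out in a different order on different runs.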
7. Demo
8. References
https://www.liaoxuefeng.com/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000/001407503089986d175822da68d4d6685fbe849a0e0ca35000
https://www.imooc.com/learn/563