什么是爬虫?
我们可以把互联网比作一张大网,而爬虫(即网络爬虫)便是在网上爬行的蜘蛛。把网的节点比作一个个网页,爬虫爬到某个节点就相当于访问了该页面,获取了其信息。可以把节点间的连线比作网页与网页之间的链接关系,这样蜘蛛通过一个节点后,可以顺着节点连线继续爬行到达下一个节点,即通过一个网页继续获取后续的网页,这样整个网的节点便可以被蜘蛛全部爬行到,网站的数据就可以被抓取下来了。
代码:
# Breadth-first link crawler: starting from a seed URL, collect the <a href>
# targets of each page level by level, de-duplicating with a Bloom filter
# whose contents (MD5 digests of URLs) are persisted between runs.
from collections import deque
import hashlib
from urllib import request
from urllib.parse import urljoin, urlsplit

from lxml import etree
from pybloom_live import BloomFilter


class crawel_bfs:
    """Level-limited breadth-first crawler that records every new link it sees.

    Newly discovered URLs are appended to ``file_urlname`` and their MD5
    digests to ``file_md5name``; digests already present in ``file_md5name``
    at start-up are treated as seen.
    """

    # Headers sent with each request (see ``_headers_for`` for the per-host
    # adjustments).
    request_header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # urllib does not decompress response bodies, so ask for an unencoded
        # body; requesting 'br' yields undecodable bytes when the server
        # honours it.
        'Accept-Encoding': 'identity',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # NOTE(review): hard-coded session cookie checked into source; it
        # should be loaded from configuration instead.
        'Cookie': 'bid=Kn9AT5duD7k; gr_user_id=32e433a7-19f0-4e17-80c4-56b58d7c0056; _vwo_uuid_v2=5985FEE132C29EC9C840D6C5EDD95323|67c2ccc8467fc02a9cce6928e86ea013; ll="118281"; __yadk_uid=I4Ki5RUaEWOUdpVIjJfRYo1MEuaW36hA; __utmv=30149280.16369; viewed="10483489_1115600_2230208_26857712_1569487_1623164_26708119_26677686"; __utma=30149280.965685249.1516632348.1528892392.1530880979.81; __utmc=30149280; __utmz=30149280.1530880979.81.57.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; __utmb=30149280.1.10.1530880979; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1530880982%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=223695111.2038558801.1520348154.1528892435.1530880982.55; __utmb=223695111.0.10.1530880982; __utmc=223695111; __utmz=223695111.1530880982.55.51.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_id.100001.4cf6=da4243a2a9e242f1.1520348154.54.1530881042.1528892472.',
        'Host': 'movie.douban.com',
        'Referer': 'https://www.douban.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }

    # Depth of the level currently being crawled and the depth at which
    # crawling stops.
    cur_level = 0
    max_level = 2

    # Bloom filter sizing and the per-request network timeout in seconds.
    bloom_capacity = 1024 * 1024
    bloom_error_rate = 0.01
    timeout = 10

    def __init__(self, url, file_md5name, file_urlname):
        """Load previously seen digests and queue the seed URL.

        Args:
            url: Seed URL; it becomes the only entry of the first level.
            file_md5name: Path of the digest file (one MD5 hex digest per
                line). Read at start-up if it exists, then opened for append.
            file_urlname: Path of the URL log, opened for append.
        """
        self.file_urlNameMd5_name = file_md5name
        self.file_urlName_name = file_urlname

        # Per-instance state: as class attributes these would be shared by
        # every crawler instance.
        self.download_bf = BloomFilter(self.bloom_capacity, self.bloom_error_rate)
        self.childque = deque()
        self.nowque = deque()

        # Seed the filter with digests recorded by earlier runs. A missing
        # file simply means this is the first run.
        try:
            with open(self.file_urlNameMd5_name, 'r', encoding='utf-8') as seen_file:
                for line in seen_file:
                    # strip() rather than [:-1]: the last line may lack a
                    # trailing newline.
                    digest = line.strip()
                    if digest:
                        self.download_bf.add(digest)
        except FileNotFoundError:
            pass

        self.file_md5 = open(self.file_urlNameMd5_name, 'a', encoding='utf-8')
        try:
            self.file_url = open(self.file_urlName_name, 'a', encoding='utf-8')
        except OSError:
            # Do not leak the first handle if the second open fails.
            self.file_md5.close()
            raise
        self.nowque.append(url)

    def indeque(self, url):
        """Append *url* to the queue of the level currently being crawled."""
        self.nowque.append(url)

    def outdeque(self):
        """Return the next URL to crawl, or ``None`` when crawling is finished.

        When the current level is exhausted the links collected from it become
        the next level, unless ``max_level`` has been reached or no links were
        collected.
        """
        if self.nowque:
            return self.nowque.popleft()

        # Current level exhausted: move one level deeper.
        self.cur_level += 1
        # >= (not ==) so that a further call after exhaustion cannot resume
        # crawling beyond max_level.
        if self.cur_level >= self.max_level or not self.childque:
            return None
        self.nowque = self.childque
        self.childque = deque()
        return self.nowque.popleft()

    def _headers_for(self, url):
        """Return the headers to send to *url*.

        The configured ``Host`` and ``Cookie`` are only valid for the host
        named in ``request_header['Host']``; for any other host they are
        dropped so urllib derives ``Host`` from the URL and the cookie is not
        sent to unrelated sites.
        """
        headers = dict(self.request_header)
        pinned_host = headers.get('Host')
        if pinned_host and urlsplit(url).hostname != pinned_host:
            headers.pop('Host', None)
            headers.pop('Cookie', None)
        return headers

    @staticmethod
    def _normalize_link(page_url, href):
        """Turn an ``href`` found on *page_url* into an absolute URL.

        Returns ``None`` for links that are not followed: empty values,
        ``javascript:`` links, and anything that is neither an absolute
        http(s) URL nor a path starting with ``/``.
        """
        href = href.strip()
        lowered = href.lower()
        if not href or 'javascript:' in lowered:
            return None
        if not lowered.startswith(('http://', 'https://', '/')):
            return None
        try:
            # Resolve root-relative links against the page they came from.
            link = urljoin(page_url, href)
        except ValueError:
            return None
        # Drop one trailing slash so ".../x" and ".../x/" share a digest.
        if link.endswith('/'):
            link = link[:-1]
        return link

    def crawler(self, url):
        """Fetch *url* and record every not-yet-seen link found on the page.

        New links are queued for the next level and appended to the digest and
        URL files. Any failure is reported on stdout and the page is skipped;
        this method does not raise.
        """
        try:
            # Build the request with the crawler's headers and send it.
            req = request.Request(url, headers=self._headers_for(url))
            with request.urlopen(req, timeout=self.timeout) as response:
                html_page = response.read()
            # Decode with a fixed encoding. The page is not lower-cased: URL
            # paths and queries are case-sensitive.
            html = etree.HTML(html_page.decode('utf-8', errors='replace'))
            for href in html.xpath('//a/@href'):
                link = self._normalize_link(url, href)
                if link is None:
                    continue
                digest = hashlib.md5(link.encode('utf-8')).hexdigest()
                if digest in self.download_bf:
                    continue
                # Store the digest, the same key that is tested above and
                # loaded from the digest file.
                self.download_bf.add(digest)
                self.childque.append(link)
                self.file_md5.write(digest + '\n')
                self.file_url.write(link + '\n')
        except Exception as exc:
            print("出现异常", url, exc)

    def startcrawler(self):
        """Crawl until ``outdeque`` reports completion, then close the files."""
        try:
            for url in iter(self.outdeque, None):
                print("现在爬取" + url + "的超链接")
                self.crawler(url)
        finally:
            self.file_md5.close()
            self.file_url.close()


if __name__ == "__main__":
    crawel = crawel_bfs("https://movie.douban.com/", 'urlmd5.txt',
                        'urlname.txt')
    crawel.startcrawler()