Crawler Architecture
Execution Flow
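A crawl typically cycles through three cooperating components: a URL manager that tracks which URLs have and have not been visited, a downloader that fetches pages, and a parser that extracts data and new URLs from each page. Below is a minimal sketch of that loop, assuming hypothetical fetch_page and parse_page callables standing in for a real downloader and parser:

from collections import deque

def crawl(root_url, fetch_page, parse_page, max_pages=100):
    """Breadth-first crawl: take a URL, download it, parse it, queue new URLs."""
    seen = {root_url}          # URL manager: remembers every URL seen so far
    queue = deque([root_url])  # URL manager: URLs still waiting to be crawled
    while queue and len(seen) <= max_pages:
        url = queue.popleft()                   # 1. take an uncrawled URL
        html = fetch_page(url)                  # 2. download the page
        new_urls, data = parse_page(url, html)  # 3. extract links and data
        print(data)                             # 4. use the collected data
        for u in new_urls:                      # 5. queue newly found URLs
            if u not in seen:
                seen.add(u)
                queue.append(u)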
Web Page Parser
Web Page Parser: BeautifulSoup Syntax
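Whatever the query, BeautifulSoup usage follows the same three steps: build a BeautifulSoup object from the HTML, search it with find()/find_all(), then read the matched node's name, attributes, and text. A minimal sketch, using a made-up one-tag fragment:

from bs4 import BeautifulSoup

# Step 1: create the soup from an HTML string (fragment is illustrative)
soup = BeautifulSoup('<a href="/demo" class="sister">Demo</a>', 'html.parser')

# Step 2: search by tag name and attribute
node = soup.find('a', class_='sister')

# Step 3: access the node's information
print(node.name)        # tag name: 'a'
print(node['href'])     # attribute value: '/demo'
print(node.get_text())  # node text: 'Demo'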
Simple Parsing Example 1
from bs4 import BeautifulSoup
import re

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# html_doc is already a str, so no from_encoding argument is needed
soup = BeautifulSoup(html_doc, "html.parser")

print('Get all links')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())

print("Get Lacie's link")
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())

print('Regex match on the href')
link_node = soup.find('a', href=re.compile(r"ill"))  # matches .../tillie
print(link_node.name, link_node['href'], link_node.get_text())

print('Get the p paragraph text')
p_node = soup.find('p', class_="title")
print(p_node.name, p_node.get_text())
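The lookups above use find()/find_all(); BeautifulSoup also accepts CSS selectors through its select()/select_one() methods. Equivalent queries against the same soup object, for comparison:

# CSS-selector equivalents of the searches above
print(soup.select('a'))                       # all <a> tags
print(soup.select_one('a#link2')['href'])     # Lacie's link, matched by id
print(soup.select_one('p.title').get_text())  # text of the title paragraph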
Simple Parsing Example 2
from bs4 import BeautifulSoup as bs
import re

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.net/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse with the built-in html.parser
soup = bs(html_doc, "html.parser")
print(soup.prettify())

# Get the title tag and its content
print(soup.title)
# Get only the text inside the title tag
print(soup.title.string)
# Get the name of the title tag's parent
print(soup.title.parent.name)

# Get the first p tag and its content
print(soup.p)
# Get the value of that p tag's class attribute
print(soup.p['class'])
# Get the first a tag and its content
print(soup.a)
# Note: soup.tag only returns the first tag of that name in the document

# Get all a tags and their content
print(soup.find_all('a'))
# Get the tag whose id is link1
print(soup.find(id='link1'))
# Get only the text of the tag whose id is link1
print(soup.find(id='link1').string)

# Get the link and text of every a tag
for link in soup.find_all('a'):
    print('URL: ' + link.get('href') + '  text: ' + link.string)

# Get the p tag whose class is story, with all of its content
print(soup.find("p", {"class": "story"}))
# Get only the text under the p tag whose class is story
print(soup.find("p", {"class": "story"}).get_text())

# Get every tag whose name starts with b
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

# Get every a tag whose href contains http://example.com/
print(soup.find_all("a", href=re.compile(r"http://example.com/")))
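find()/find_all() search the tree; a parsed document can also be walked directly through node relationships such as .children, .parent, and the sibling accessors. A short sketch against the same html_doc:

# Navigate the tree instead of searching it
for child in soup.body.children:      # direct children of <body>
    if child.name:                    # skip whitespace-only text nodes
        print(child.name)
print(soup.a.parent.name)             # parent of the first <a>: 'p'
print(soup.a.find_next_sibling('a'))  # next <a> at the same level: Lacie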
Comprehensive Example: Crawling Wikipedia Entries
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode('utf-8')

# Parse the page with BeautifulSoup
soup = BeautifulSoup(resp, "html.parser")

# Find all links whose href starts with /wiki/
listUrls = soup.find_all("a", href=re.compile("^/wiki/"))

# Print the name and URL of every entry
for url in listUrls:
    # Skip links ending in .jpg or .JPG
    if not re.search(r'\.(jpg|JPG)$', url["href"]):
        # Print the link text and the corresponding URL
        print(url.get_text() + '<--->' + url['href'])
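The hrefs printed by this script are relative paths like /wiki/...; if absolute URLs are needed, each one can be joined against the page URL with urllib.parse.urljoin. A small usage sketch (the article path is just an example):

from urllib.parse import urljoin

base = "https://en.wikipedia.org/wiki/Main_Page"
print(urljoin(base, "/wiki/Python_(programming_language)"))
# -> https://en.wikipedia.org/wiki/Python_(programming_language)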