• Simple crawler architecture


    Crawler architecture

    Run flow
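    In this simple architecture a scheduler drives three parts in a loop: a URL manager that tracks which URLs are new and which have already been crawled, a downloader that fetches each page, and a web page parser that extracts data plus further URLs, until no new URLs remain or a limit is reached. Below is a minimal, self-contained sketch of that run flow, assuming urllib for downloading and BeautifulSoup for parsing; the function name and the page limit are illustrative, not from the original post.

    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    def crawl(root_url, max_pages=10):
        to_visit, visited, results = [root_url], set(), []
        while to_visit and len(visited) < max_pages:
            url = to_visit.pop()                       # URL manager: take one unvisited URL
            if url in visited:
                continue
            visited.add(url)
            html = urlopen(url).read()                 # downloader: fetch the page
            soup = BeautifulSoup(html, "html.parser")  # parser: build the parse tree
            results.append((url, soup.title.string if soup.title else None))  # output: collect data
            for a in soup.find_all('a', href=True):    # queue newly discovered absolute links
                if a['href'].startswith('http'):
                    to_visit.append(a['href'])
        return results

    Calling crawl('https://en.wikipedia.org/wiki/Main_Page'), for example, would return (url, title) pairs for up to ten pages reachable from the seed URL.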

     Web page parser

     Web page parser - BeautifulSoup syntax
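    BeautifulSoup usage follows three steps: build a soup object from the HTML document (choosing a parser such as html.parser), search the tree with find/find_all by tag name, attributes, or text, and then read information off the returned node. A tiny sketch of the syntax; the HTML snippet is made up for illustration.

    from bs4 import BeautifulSoup

    # 1. Create the BeautifulSoup object from an HTML document
    soup = BeautifulSoup('<a href="/wiki/Python" class="ref">Python</a>', "html.parser")

    # 2. Search for nodes by tag name, attributes, or text
    node = soup.find('a', class_='ref')

    # 3. Access node information: tag name, attribute values, enclosed text
    print(node.name, node['href'], node.get_text())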

    Simple parsing example 1

    from bs4 import BeautifulSoup
    import re

    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/title" class="sister" id="link3">Title</a>;
    and they lived at the bottom of a well.</p>

    <p class="story">...</p>
    """
    # html_doc is already a str, so no from_encoding argument is needed
    soup = BeautifulSoup(html_doc, "html.parser")

    print('Get all the links')
    links = soup.find_all('a')
    for link in links:
        print(link.name, link['href'], link.get_text())

    print("Get Lacie's link")
    link_node = soup.find('a', href='http://example.com/lacie')
    print(link_node.name, link_node['href'], link_node.get_text())

    print('Regex match')
    link_node = soup.find('a', href=re.compile(r"tl"))
    print(link_node.name, link_node['href'], link_node.get_text())

    print('Get the text of the p paragraph')
    p_node = soup.find('p', class_="title")
    print(p_node.name, p_node.get_text())
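    Besides find and find_all, BeautifulSoup also accepts CSS selectors via select (backed by the soupsieve engine in recent versions), which can express the same lookups more compactly. A short sketch run against the soup object built above; the selector strings are illustrative.

    # CSS selectors as an alternative to find/find_all
    print(soup.select('p.title'))         # <p> tags with class "title"
    print(soup.select('a#link1'))         # the <a> tag whose id is "link1"
    print(soup.select('a[href*="tl"]'))   # <a> tags whose href contains "tl"
    for node in soup.select('p.story a.sister'):
        print(node.get_text())            # text of every sister link in the story paragraph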

     Simple parsing example 2

    from bs4 import BeautifulSoup as bs
    import re
    
    
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title"><b>The Dormouse's story</b></p>
    
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.net/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    
    <p class="story">...</p>
    """
    # Parse with the html.parser parser
    soup = bs(html_doc,"html.parser")
    
    print(soup.prettify())
    
    # Get the title tag and its contents
    print(soup.title) 
    
    # Get the text content of the title tag
    print(soup.title.string) 
    
    # Get the name of the parent tag
    print(soup.title.parent.name) 
    
    # Get the first p tag and its contents
    print(soup.p)
    
    # Get the value of the p tag's class attribute
    print(soup.p['class'])
    
    # Get the first a tag and its contents
    print(soup.a)
    
    '''
    soup.tag only returns the first tag of that name in the whole document
    '''
    # Get all a tags and their contents
    print(soup.find_all('a'))
    
    # Get the tag whose id is link1, and its contents
    print(soup.find(id='link1'))
    
    # Get the text of the tag whose id is link1
    print(soup.find(id='link1').string)
    
    # Get the href and text of every a tag
    for link in soup.find_all('a'):
        print('URL: ' + link.get('href') + '  text: ' + link.string)
    
    # Get the p tag whose class is story, including all its child tags and contents
    print(soup.find("p",{"class":"story"}))
    
    
    # Get all the text inside the p tag whose class is story
    print(soup.find("p",{"class":"story"}).get_text())
    
    
    # Get tags whose names start with b
    for tag in soup.find_all(re.compile("^b")):
        print(tag.name)
    
    
    # Get all a tags whose href contains http://example.com, and their contents
    print(soup.find_all("a", href=re.compile(r"http://example\.com/")))
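    find and find_all search the tree; a node can also be walked through its relations (children, parent, siblings), which helps when structure rather than attributes identifies what you want. A small sketch against the same soup object; these navigation properties are standard BeautifulSoup API.

    # Navigating the parse tree instead of searching it
    story = soup.find("p", {"class": "story"})
    for child in story.children:                    # direct children: text nodes and <a> tags
        print(repr(child))

    link2 = soup.find(id="link2")
    print(link2.parent.name)                        # the <p> tag containing link2
    print(link2.find_previous_sibling("a")['id'])   # the <a> just before it -> link1
    print(link2.find_next_sibling("a")['id'])       # the <a> just after it  -> link3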

    Comprehensive example - crawling Wikipedia entries

    #!/usr/bin/env python
    #-*- coding:utf-8 -*-
    # Import the required packages
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import re

    resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode('utf-8')

    # Parse with BeautifulSoup
    soup = BeautifulSoup(resp, "html.parser")

    # Find links whose href starts with /wiki/
    listUrls = soup.find_all("a", href=re.compile("^/wiki/"))

    # Print the name and URL of every entry
    for url in listUrls:
        # Skip links ending in .jpg or .JPG (the dot is escaped so it matches literally)
        if not re.search(r'\.(jpg|JPG)$', url["href"]):
            # Print the link text and the corresponding URL
            print(url.get_text() + '<--->' + url['href'])
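    Two practical tweaks to the Wikipedia example: some sites reject requests that carry no browser-like User-Agent header, so it is good practice to set one, and the same /wiki/ href usually appears many times on the main page. A sketch of both adjustments, using urllib's Request object for the header and a dict for de-duplication; the User-Agent string is only an example value.

    from urllib.request import Request, urlopen
    from bs4 import BeautifulSoup
    import re

    # Send an explicit User-Agent header (example value only)
    req = Request("https://en.wikipedia.org/wiki/Main_Page",
                  headers={"User-Agent": "Mozilla/5.0 (compatible; demo-crawler)"})
    resp = urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(resp, "html.parser")

    # Keep each /wiki/ href only once, remembering the text of its first occurrence
    seen = {}
    for a in soup.find_all("a", href=re.compile("^/wiki/")):
        if not re.search(r'\.(jpg|JPG)$', a["href"]) and a["href"] not in seen:
            seen[a["href"]] = a.get_text()

    for href, text in seen.items():
        print(text + '<--->' + href)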
    

      

  • Original article: https://www.cnblogs.com/luoye00/p/5785495.html