• 爬虫 基础 BS详解


    Beautifulsoup 库详解

    # -*- coding:utf8 -*-

    # 工程路径:3.3 beautifulsoup库.py

    # 工程日期:9/6/2019

    # 工程目标:beautifulsoup使用详解

    """

    bs 支持多种解析器:lxml、Python 内置的 html.parser 以及 html5lib

     

    """

    #%%

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

     

    from bs4 import BeautifulSoup

    # Parse the sample document with the lxml parser.
    soup = BeautifulSoup(html, 'lxml')
    pretty_html = soup.prettify()  # re-indented rendering of the parsed tree
    print(pretty_html)
    title_text = soup.title.string  # text held by the <title> tag
    print(title_text)

     

    #%% 标签选择器

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # Attribute-style access (soup.<tagname>) selects a tag by its name.
    for tag in (soup.title, soup.head):
        print(tag)
        print(type(tag))  # shows the bs4 element type of the selected tag
    print(soup.p)  # only the first matching <p> is returned

     

    #%% 获取标签的名称

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # .name reports the name of the selected tag.
    first_title, first_p = soup.title, soup.p
    print(first_title.name)
    print(first_p.name)

     

    #%% 获取标签的属性

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    first_p = soup.p
    # Two equivalent ways to read the "name" attribute of the first <p>.
    print(first_p['name'])        # subscript on the tag itself
    print(first_p.attrs['name'])  # lookup through the attrs mapping

     

    #%% 获取标签内的文本内容 .string

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    first_p = soup.p
    print(first_p.string)  # text content of the first <p>

     

    #%% 标签的嵌套选择

    html = """

    <html><head><title>The Dormouse's story</title></head>

    <body>

    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    <p class="story">Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

    and they lived at the bottom of a well.</p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # Tag selectors can be chained to walk down the tree.
    print(soup.head.title.string)
    body = soup.body
    print(body.p)
    first_link = body.a
    # Print the href, class and id attributes of the first <a> in <body>.
    for attr in ('href', 'class', 'id'):
        print(first_link[attr])

     

    #%% 子节点以及子孙节点的选择

    html = """

    <html>

    <head>

    <title>The Dormouse's story</title>

    </head>

    <body>

    <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1">

    <span>Elsie</span>

    </a>

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

    and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

    and they lived at the bottom of a well.

    </p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    print(soup.body.p.a['href'])
    # .contents exposes the child nodes of the first <p>; print its type,
    # then each child in turn.
    print(type(soup.p.contents))
    for child in soup.p.contents:
        print(child)

     

    #%% .children 获取子节点 迭代器类型,

    # 使用循环的方式才能取出内容

    html = """

    <html>

    <head>

    <title>The Dormouse's story</title>

    </head>

    <body>

    <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1">

    <span>Elsie</span>

    </a>

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

    and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

    and they lived at the bottom of a well.

    </p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # .children is an iterator, so it has to be looped over to see the nodes.
    print(soup.p.children)
    for i, child in enumerate(soup.p.children):
        print(i, child)

     

    #%% .descendants 获取所有的子孙节点

    html = """

    <html>

    <head>

    <title>The Dormouse's story</title>

    </head>

    <body>

    <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1">

    <span>Elsie</span>

    </a>

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

    and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

    and they lived at the bottom of a well.

    </p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    print(soup.p.descendants)
    # Enumerate every descendant node of the first <p>.
    for i, descendant in enumerate(soup.p.descendants):
        print(i, descendant)

     

    #%% .parent父节点 .parents祖先节点

    html = """

    <html>

    <head>

    <title>The Dormouse's story</title>

    </head>

    <body>

    <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1">

    <span>Elsie</span>

    </a>

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

    and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

    and they lived at the bottom of a well.

    </p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    print(soup.a.parent)
    print(type(soup.a.parent))   # .parent is a single tag
    print(type(soup.a.parents))  # .parents is iterated to reach each ancestor
    for i, ancestor in enumerate(soup.a.parents):
        print(i, ancestor)

    # The same ancestors, materialised as a list of (index, node) pairs.
    print(list(enumerate(soup.a.parents)))

     

    #%% 获取兄弟并列的节点

    # .next_siblings 后面的所有兄弟节点

    # .previous_siblings 前面的所有兄弟节点

    html = """

    <html>

    <head>

    <title>The Dormouse's story</title>

    </head>

    <body>

    <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a href="http://example.com/elsie" class="sister" id="link1">

    <span>Elsie</span>

    </a>

    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>

    and

    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>

    and they lived at the bottom of a well.

    </p>

    <p class="story">...</p>

    """

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    first_link = soup.a
    print(first_link.next_siblings)  # returned as an iterator
    # List the siblings after, then before, the first <a>.
    for siblings in (first_link.next_siblings, first_link.previous_siblings):
        print(list(enumerate(siblings)))

     

    """

    以上为标签选择器的用法,速度快,但往往不能满足复杂的查找需求

    """

    #%% 标准选择器 find_all 根据标签名, 属性,选择标签 列表返回

    # find_all(name, attrs, recursive, string, limit, **kwargs)

     

    #%% 标签名选择 name

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # Look up every <ul> once and reuse the result for the three prints.
    ul_tags = soup.find_all('ul')
    print(ul_tags)
    print(type(ul_tags))
    print(ul_tags[0])

     

    #%% 循环嵌套 find_all

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.find_all('ul'):
        # Search inside the current <ul> (not the whole document) so each
        # iteration prints only that list's own <li> tags.
        print(ul.find_all('li'))

     

    #%% attrs 属性查找对应的内容

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1" name="elements">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # Filter through the attrs dict; in this document both filters hit the
    # same <ul id="list-1" name="elements">.
    for attr_filter in ({'id': 'list-1'}, {'name': 'elements'}):
        print(soup.find_all(attrs=attr_filter))

    # Shorthand: pass the attribute as a keyword argument. "class" is a
    # Python keyword, hence the trailing underscore in class_.
    print(soup.find_all(id='list-1'))
    print(soup.find_all(class_='list'))

     

     

    #%% 使用文本的内容进行匹配 text

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

     

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # Match on text content. `string=` is the current name of this filter
    # (`text=` is the deprecated spelling). The result holds the matching
    # text itself, not the enclosing tags.
    print(soup.find_all(string='Foo'))

     

    #%% find 方法 只返回单个匹配的元素, 不返回所有的结果

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    first_ul = soup.find('ul')  # find() gives back only the first match
    print(first_ul)

     

    #%% 其他选择, 类似于标签的选择

    """

    ### find_parents() find_parent()

     

    find_parents()返回所有祖先节点,find_parent()返回直接父节点。

     

    ### find_next_siblings() find_next_sibling()

    find_next_siblings()返回后面所有兄弟节点,find_next_sibling()返回后面第一个兄弟节点。

     

     

    ### find_previous_siblings() find_previous_sibling()

     

    find_previous_siblings()返回前面所有兄弟节点,find_previous_sibling()返回前面第一个兄弟节点。

     

    ### find_all_next() find_next()

     

    find_all_next()返回节点后所有符合条件的节点, find_next()返回节点后第一个符合条件的节点

     

    ### find_all_previous() 和 find_previous()

     

    find_all_previous()返回节点前所有符合条件的节点, find_previous()返回节点前第一个符合条件的节点

     

    """

     

     

    #%% CSS 选择器 通过 select 直接传入 CSS 选择器, 就可以完成标签或者元素的选择

    # . 代表 class

    # # 代表 id

    # 空格 代表嵌套

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # "." selects by class; a space means "nested inside".
    for selector in ('.panel .panel-heading', '.panel .panel-body'):
        print(soup.select(selector))

    # Every <li> nested in a <ul>: the result, its type, and list views of it.
    items = soup.select('ul li')
    print(items)
    print(type(items))
    print(list(enumerate(items)))
    print(list(items))

    # "#" selects by id; it can be combined with a nested class selector.
    for selector in ('#list-2', '#list-1 .element'):
        print(soup.select(selector))

     

    #%% for嵌套选择

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        # Select within the current <ul> so each pass prints only its own
        # <li> tags rather than every <li> in the document.
        print(ul.select('li'))
    # A single descendant selector reaches the same <li> tags in one call.
    print(soup.select('ul li'))

     

    #%% select 获取标签的属性 [ ]

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.select('ul'):
        # Attributes of a selected tag can be read either by subscripting
        # the tag or through its attrs mapping.
        print(ul['class'])
        print(ul.attrs['id'])

     

    #%% get_text 获取标签中的文本内容

    html='''

    <div class="panel">

    <div class="panel-heading">

    <h4>Hello</h4>

    </div>

    <div class="panel-body">

    <ul class="list" ok hah id="list-1">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    <li class="element">Jay</li>

    </ul>

    <ul class="list list-small" id="list-2">

    <li class="element">Foo</li>

    <li class="element">Bar</li>

    </ul>

    </div>

    </div>

    '''

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    for li in soup.select('li'):
        # get_text() takes an optional separator, not a selector; the stray
        # '.title' argument is dropped. Each <li> here holds one string, so
        # the printed text is unchanged.
        print(li.get_text())

     

     

    #%% 示例

    import requests

    from bs4 import BeautifulSoup

     

    # Fetch the Douban Books front page. The timeout stops a stalled
    # connection from hanging the script.
    response = requests.get('https://book.douban.com', timeout=10)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    for title in soup.select('.title'):
        # Look only inside the current .title element. 'a[href]' skips
        # anchors with no href, so a['href'] cannot raise KeyError.
        for a in title.select('a[href]'):
            print(a['href'])
            print(a.get_text())

  • 相关阅读:
    vue-cli + webpack 多页面实例配置优化方法
    Python Web(1):建立第一个Web项目
    C# winform用sharpGL(OpenGl)解析读取3D模型obj
    CSS outline 属性
    sqlserver查询两个值是否相等
    vue v-for(数组遍历)
    内存查看工具RAMMAP说明
    linux 入门
    linux 内核根文件系统
    linux 命令
  • 原文地址:https://www.cnblogs.com/binyang/p/10995671.html
Copyright © 2020-2023  润新知