• BS基础


    Beautifulsoup 库详解

    # -*- coding:utf8 -*-
    # 工程路径:3.3 beautifulsoup库.py
    # 工程日期:9/6/2019
    # 工程目标:beautifulsoup使用详解
    """
    bs支持lxml, HTML 解析, html5解析

    """
    #%%
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.prettify()) # 格式化html
    print(soup.title.string) # 输出 title中内容

    #%% 标签选择器
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title)
    print(type(soup.title)) # 为bs4的元素tag类型
    print(soup.head)
    print(type(soup.head))
    print(soup.p) # 只返回第一个匹配的p标签

    #%% 获取标签的名称
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.title.name) # 获取标签的名称
    print(soup.p.name) # 获取p标签的名称

    #%% 获取标签的属性
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.p['name'])
    print(soup.p.attrs['name']) # 获取属性

    #%% 获取标签内的文本内容 .string
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.p.string) # 获取标签内的文本内容

    #%% 标签的嵌套选择
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.head.title.string)
    print(soup.body.p)
    print(soup.body.a['href'])
    print(soup.body.a['class'])
    print(soup.body.a['id'])

    #%% 子节点以及子孙节点的选择
    html = """
    <html>
    <head>
    <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="story">
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
    <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
    and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    and they lived at the bottom of a well.
    </p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.body.p.a['href'])
    #print(soup.p.contents)
    print(type(soup.p.contents))
    for i in soup.p.contents:
    print(i)

    #%% .children 获取子节点 迭代器类型,
    # 使用循环的方式才能取出内容
    html = """
    <html>
    <head>
    <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="story">
    Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">
    <span>Elsie</span>
    </a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
    and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    and they lived at the bottom of a well.
    </p>
    <p class="story">...</p>
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    print(soup.p.children)
    for i, child in enumerate(soup.p.children):
    print(i, child)



  • 相关阅读:
    render 动态增减表单项校验 小结
    面试题总结2
    禁止蒙层底部页面跟随滚动
    前端面试题总结1
    ||与??的区别,??非空运算符,??=非空赋值运算符 ??.链判断运算符 Object.defineProperty 与Proxy的区别
    chrome 代码调试实用小技巧
    Ubuntu安装ibmmq
    Python语言规范之Pylint的使用
    Python发送SMTP邮件指南
    快看那个运维妹子在学算法【二分查找】
  • 原文地址:https://www.cnblogs.com/binyang/p/10995677.html
Copyright © 2020-2023  润新知