• BeautifulSoup模块


    from bs4 import BeautifulSoup
    
    # Sample HTML fixture shared by every snippet in this script: a <head> with a
    # <title>, and a <body> that mixes bare text, nested <div>/<b>/<h1> tags, three
    # <a> links, a self-closing <br/> and a <p>.
    # The first <a> (id="link1") has class "sister0", no href, and a nested <span>;
    # link2 and link3 carry an href and class "sister".
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    asdf
        <div class="title">
            <b>The Dormouse's story总共</b>
            <h1>f</h1>
        </div>
    <div class="story">Once upon a time there were three little sisters; and their names were
        <a  class="sister0" id="link1">Els<span>f</span>ie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</div>
    ad<br/>sf
    <p class="story">...</p>
    </body>
    </html>
    """
    
    # Parse the fixture into a tree, explicitly selecting the 'lxml' parser backend
    # (NOTE(review): this presumably requires the lxml package to be installed).
    soup = BeautifulSoup(html_doc, features='lxml')
    # 热身:基本用法
    # tag1 = soup.find('a')
    # print(tag1)
    # tag2 = soup.find_all('a')
    # for tag in tag2:
    #     print(tag.text)
    # 找到id=link2的标签
    # tag3 = soup.select('#link2')
    # print(tag3)
    #
    # tag4 = soup.find('', id='link2')
    # print(tag4)
    #
    # tag5 = soup.select('.title')
    # print(tag5, type(tag5[0]))
    
    # 1 name
    # tag = soup.find('a')
    # print(tag)
    # print(tag.name)
    #
    # tag.name = 'span'
    # print(soup)
    
    # 2 attr
    # tag = soup.find('a')
    # attrs = tag.attrs
    # print(attrs)
    # print('xxxxxx', tag.get('class'))
    #
    # tag.attrs = {'ik': 123}
    # tag.attrs['id'] = 'iiiii'
    # print(tag)
    
    # 3 children    所有子标签
    # body = soup.find('body')
    # v = body.children
    # child_list = []
    # for i in v:
    #     print('分割线'.center(120, '#'))
    #     print(i)
    
    # 4 descendants 所有子子孙孙
    # body = soup.find('body')
    # v = body.descendants
    # for i in v:
    #     print('分割线'.center(120, '#'))
    #     print(i)
    
    # 5 decompose,递归删除当前标签及其所有子标签
    # body = soup.find('body')
    # body.decompose()
    # print(soup)
    
    # 6 clear 将标签的所有子标签全部清空(保留标签名)
    # body = soup.find('body')
    # body.clear()
    # print(soup)
    
    # 7 extract,从文档树中移除当前标签(连同其所有子标签),并返回被移除的标签
    # body = soup.find('body')
    # v = body.extract()
    # print(soup)
    # print('xxxxxxx', v)
    
    # 8 decode,转换为字符串(含当前标签);decode_contents(不含当前标签)
    # body = soup.find('body')
    # print('没转化之前', type(body), body)
    # print("$$$$$$$$$$$$$$$$$")
    # v = body.decode()
    # v1 = body.decode_contents()
    # print(v, type(v))
    # print("$$$$$$$$$$$$$$$$$")
    # print(v1, type(v1))
    
    # 9. encode,转换为字节(含当前标签);encode_contents(不含当前标签)
    # body = soup.find('body')
    # v = body.encode()
    # v1 = body.encode_contents()
    # print(v)
    # print('#'.center(120, '#'))
    # print(v1)
    
    # 10. find,获取匹配的第一个标签
    # tag = soup.find('a')
    # tag = soup.find('a', attrs={'class': 'sister'}, recursive=True, text='Lacie')   # recursive 递归
    # tag = soup.find('a', id='link2')
    # print(tag)
    
    # 11. find_all,获取匹配的所有标签
    # tags = soup.find_all('a')
    # tags = soup.find_all('a', limit=1)
    # tags = soup.find_all('a', attrs={'class': 'sister'})
    # tags = soup.find_all('a', attrs={'class': 'sister'}, text='Lacie')
    # print(tags)
    
    # 列表
    # v = soup.find_all(name=['a', 'div'])
    #
    # v1 = soup.find_all(name='a')
    # v2 = soup.find_all(name='div')
    # # v = soup.find_all(href=rep)
    # print(v)
    #
    # print("&".center(120, '#'))
    # print(v1)
    # print("&".center(120, '#'))
    # print(v2)
    
    # v = soup.find_all(name=['a', 'div'])  # v1 = soup.find_all(name='a') + v2 = soup.find_all(name='div')
    # v = soup.find_all(class_=['sister0', 'sister'])
    # v = soup.find_all(text='Tillie')
    # v = soup.find_all(id=['link1', 'link2'])
    # v = soup.find_all(href=["http://example.com/lacie", "http://example.com/tillie"])
    # print(v)
    
    # 正则
    import re
    
    # rep = re.compile('p')
    # rep = re.compile('^p')
    # v = soup.find_all(name=rep)
    # print(v)
    
    # rep = re.compile('sister.*')
    # v = soup.find_all(class_=rep)
    # print(v)
    
    # rep = re.compile('http://example.com.*')
    #
    # v = soup.find_all(href=rep)
    # print(v)
    
    # 方法筛选
    
    
    # def func(tag):
    #     return tag.has_attr('class') and tag.has_attr('id')
    #
    #
    # v = soup.find_all(name=func)
    # print(v)
    
    # get 获取属性
    # tag = soup.find('a')
    # v = tag.get('id')
    # print(v)
    
    # 12. has_attr,检查标签是否具有该属性
    # tag = soup.find('a')
    # v = tag.has_attr('id')
    # print(v)
    
    # 13. get_text,获取标签内部文本内容
    # tag = soup.find('a')
    # v = tag.get_text()  # 注意:get_text 的第一个参数是分隔符(separator),不是属性名
    # print(tag)
    # print(v)
    
    # 14. index,检查标签在某标签中的索引位置
    # tag = soup.find('body')
    # v = tag.index(tag.find('p'))
    # print(tag)
    # print(v)
    
    # tag = soup.find("body")
    # for i, v in enumerate(tag):
    #     print(i, v)
    
    # 15. is_empty_element,是否是空标签(是否可以是空)或者自闭合标签,
    # 判断是否是如下标签:'br' , 'hr', 'input', 'img', 'meta','spacer', 'link', 'frame', 'base'
    # tag = soup.find('br')
    # v = tag.is_empty_element
    # print(tag)
    # print(v)
    
    # 16. 当前的关联标签
    # div = soup.find('div')
    # print(div)
    # print(div.next)
    # print(div.next_element)
    # print(div.next_elements)
    # print(div.next_sibling)
    # print(div.next_siblings)
    
    # tag = soup.find('a')
    # print(tag)
    # print(tag.previous)
    # print(tag.previous_element)
    # print(tag.previous_elements)
    # print(tag.previous_sibling)
    # print(tag.previous_siblings)
    
    # print(tag.parent)
    # print(tag.parents)
    
    # 17. 查找某标签的关联标签        #  参数同find_all
    # tag = soup.find('a')
    # print(tag.parent)
    # print(tag.find_next())    # 下一个, 内嵌
    # print(tag.find_all_next())
    # print(tag.find_next_sibling())    # 兄弟
    # print(tag.find_next_siblings())   # 所有兄弟
    # print(tag.find_previous())  # 按文档顺序向前查找第一个匹配的元素;此例中恰好是父标签,并不总是等同于 parent
    # print(tag.find_all_previous())
    
    # tag1 = soup.find_all('a')[1]
    # # print(tag1)
    # # print(tag1.find_previous_sibling())  # 前一个兄弟
    # # print(tag1.find_previous_siblings())  # 前面的兄弟们
    
    # print(tag.find_parent())    # tag.parent
    # print(tag.find_parents())    # tag.parents
    
    # 18. select,select_one, CSS选择器
    # print(soup.select('title'))
    # print(soup.select('p:nth-of-type(3)'))
    # print(soup.select('body a'))  # soup.find_all('a')
    # soup.select("html head title")
    # tag = soup.select("div,a")
    # tag = soup.select("head > title")     # 注意空格
    # tag = soup.select("div > a")    # 注意空格
    # tag = soup.select("p > a:nth-of-type(2)")
    # tag = soup.select("p > #link1")
    # tag = soup.select("body > a")
    # tag = soup.select("#link1 ~ .sister") # 同级往下所有
    # tag = soup.select("#link1 + .sister") # 同级往下一个
    # tag = soup.select(".sister")  # class
    # tag = soup.select("[class~=sister]")  # 属性
    # tag = soup.select("#link1")   # id
    # tag = soup.select("a#link2")  # a标签里的id=link2
    # tag = soup.select('a[href]')  # 属性
    # tag = soup.select('a[href="http://example.com/lacie"]')     # 完全匹配
    # tag = soup.select('a[href^="http://example.com/"]')  # 开头匹配
    # tag = soup.select('a[href$="tillie"]')  # 结尾匹配
    # tag = soup.select('a[href*=".com/"]')  # 随意包含
    # print(tag)
    
    from bs4.element import Tag
    
    
    # def default_condition_generator(tag):
    #     """找出含有href的标签"""
    #     for child in tag.descendants:
    #         if not isinstance(child, Tag):
    #             continue
    #         if not child.has_attr('href'):
    #             continue
    #         yield child
    
    
    # tags = soup.find('body').select('a', _candidate_generator=default_condition_generator)
    # tags = soup.find('body').select('a', _candidate_generator=default_condition_generator, limit=1)
    # print(type(tags), tags)
    
    
    # 19. 标签的内容
    # tag = soup.find('span')
    # print(tag.string)   # 获取
    # tag.string = 'hello world'  # 设置
    # print(soup)
    
    # tag = soup.find('body')
    # print(tag.string)
    # tag.string = 'xxx'
    # print(soup)
    
    # tag = soup.find('body')
    # v = tag.stripped_strings  # 递归内部获取所有标签的文本
    # for i in v:
    #     print(i)
    
    # tag = soup.find('body')
    # print(tag.text)
    
    # 20 append在当前标签【内部追加】一个标签
    # tag = soup.find('body')
    # tag.append(soup.find('a'))  # <a class="sister0" id="link1">Els<span>f</span>ie</a></body>
    # print(soup)
    
    # from bs4.element import Tag
    # obj = Tag(name='i',attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('body')
    # tag.append(obj)
    # print(soup)
    
    # 21.insert在当前标签内部指定位置插入一个标签
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('body')
    # tag.insert(2, obj)  # 在索引为2的位置插入
    # print(soup)
    
    # 22. insert_after,insert_before 在当前标签后面或前面插入
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('body')
    # tag.insert_before(obj)
    # # tag.insert_after(obj)
    # print(soup)
    
    # 23. replace_with 在当前标签替换为指定标签
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('div')
    # tag.replace_with(obj)
    # print(soup)
    
    # 24. 创建标签之间的关系
    # tag = soup.find('div')
    # a = soup.find('a')
    # tag.setup(previous_sibling=a)
    # print(tag.previous_sibling)
    
    # 25. wrap,用指定标签把当前标签包裹起来
    # from bs4.element import Tag
    # obj1 = Tag(name='div', attrs={'id': 'it'})
    # obj1.string = '我是一个新来的'
    #
    # tag = soup.find('a')
    # v = tag.wrap(obj1)
    # print(soup)
    
    # tag = soup.find('a')
    # v = tag.wrap(soup.find('p'))
    # print(soup)
    
    # 26. unwrap: strip the current tag itself while keeping whatever it wrapped.
    tag = soup.find('a')  # first <a> in the document
    # Per the bs4 documentation, unwrap() returns the tag that was stripped out.
    v = tag.unwrap()
    # Show the stripped tag first, then the whole document after the change.
    print(v, soup, sep='\n')
  • 相关阅读:
    PHP学习(字符串和变量)
    一个操作配置文件(Reg,ini,XML)的类
    Indy中判断邮件来源
    PHP学习(MSSQL数据库连接)
    辛辛苦苦,写了个INNO的安装脚本
    php连接sqlserver
    Sql Server 使用CTE实现递归查询
    使用序列化和反序列化机制深度复制对象
    ASP.NET中不常用的另类绑定方法<%$ %>
    google map事件监听
  • 原文地址:https://www.cnblogs.com/sunch/p/10764051.html
Copyright © 2020-2023  润新知