• BeautifulSoup 库


    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    # Author:Alex  Li
    # Demo: parse a small document and reach tags directly as attributes of
    # the soup (soup.<tagname> returns the first matching tag, or None).
    import re
    import urllib.request

    import requests
    from bs4 import BeautifulSoup

    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    # Name the parser explicitly: without it bs4 guesses from whatever is
    # installed (and warns), so the parsed tree can differ between machines.
    soup = BeautifulSoup(html, "html.parser")
    print(soup.prettify())     # pretty-printed, indented markup
    print(soup.title.string)   # text inside <title>
    print(soup.a.string)       # first <a> holds only a comment, so this is its text
    print(type(soup.title))    # <class 'bs4.element.Tag'>; Tag objects can be chained
    print(soup.title)          # the whole tag: <title>The Dormouse's story</title>
    print(soup.head)
    print(soup.li)             # no <li> in the document, so this prints None
    # Demo: read the name of a tag.
    from bs4 import BeautifulSoup

    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    print(soup.title.name)  # the tag's own name: title
    
    # Demo: read tag attributes, either through .attrs or by indexing the tag.
    from bs4 import BeautifulSoup

    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    # Two equivalent spellings; both print the first <p>'s name: dromouse
    print(soup.p.attrs['name'])
    print(soup.p['name'])
    print(soup.a['href'])  # first <a>'s href: http://example.com/elsie
    # Demo: read the text content of a tag with .string.
    from bs4 import BeautifulSoup

    # Fixture attribute was misspelled "clss"; corrected to "class" so this
    # copy matches the document used by the other snippets.
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    # The first <p> has a single child (<b>) with a single string, so
    # .string resolves through it: The Dormouse's story
    print(soup.p.string)
    
    # Demo: attribute-style access returns Tag objects, so lookups can be chained.
    from bs4 import BeautifulSoup

    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    print(type(soup.head))         # <class 'bs4.element.Tag'>, hence chainable
    # Was ".strint": bs4 treats an unknown attribute as a child-tag lookup,
    # so the typo silently printed None instead of the title text.
    print(soup.head.title.string)  # chained lookup: head -> title -> text
    
    
    # Demo: .contents gives a tag's direct children as a list.
    from bs4 import BeautifulSoup

    html = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>Elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
                and
                <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
    """
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    # List of everything directly inside the first <p>: text nodes
    # (including whitespace) and the <a> tags.
    print(soup.p.contents)
    
    # Demo: .children iterates over a tag's direct children lazily.
    from bs4 import BeautifulSoup

    html = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>Elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
                and
                <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
    """
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    print(soup.p.children)  # an iterator object, not the children themselves
    # index is the position, child is each direct child node of the first <p>
    for index, child in enumerate(soup.p.children):
        print(index, child)
    
    # Parent and ancestor nodes
    # Demo: .parent returns the tag that directly encloses a node.
    from bs4 import BeautifulSoup

    html = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>Elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
                and
                <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
    """
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    # Prints the enclosing <p> of the first <a>, which contains that <a> too.
    print(soup.a.parent)
    
    
    # Sibling nodes
    # Demo: walk the nodes that share a parent with the first <a>.
    from bs4 import BeautifulSoup

    html = """
    <html>
        <head>
            <title>The Dormouse's story</title>
        </head>
        <body>
            <p class="story">
                Once upon a time there were three little sisters; and their names were
                <a href="http://example.com/elsie" class="sister" id="link1">
                    <span>Elsie</span>
                </a>
                <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
                and
                <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
                and they lived at the bottom of a well.
            </p>
            <p class="story">...</p>
    """
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    # next_siblings: every node after the first <a> inside the same <p>
    # (text nodes and the two later <a> tags), in document order.
    print(list(enumerate(soup.a.next_siblings)))
    # previous_siblings: the nodes before the first <a> in that <p>; here
    # that is just the leading text node.
    print(list(enumerate(soup.a.previous_siblings)))
    
    
    
    # Standard selectors
    # Demo: find_all() by tag name returns every matching Tag.
    from bs4 import BeautifulSoup

    html = '''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    ul_tags = soup.find_all('ul')
    print(ul_tags)           # every <ul> in the document
    print(type(ul_tags[0]))  # each hit is a bs4.element.Tag
    
    # Demo: find_all() filtered by attribute values instead of tag name.
    from bs4 import BeautifulSoup

    html = '''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1" name="elements">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    # Every tag, of any name, whose id attribute equals "list-1"
    print(soup.find_all(attrs={'id': 'list-1'}))
    # Every tag whose name attribute equals "elements"
    print(soup.find_all(attrs={'name': 'elements'}))
    
    
    # Getting attributes
    # Demo: read an attribute from each tag returned by a CSS selection.
    from bs4 import BeautifulSoup

    html = '''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    for ul in soup.select('ul'):
        # Both spellings print the same id, so each id appears twice.
        print(ul['id'])
        print(ul.attrs['id'])
    
    # CSS selectors
    # Demo: select() takes a CSS selector and returns a list of matching Tags.
    from bs4 import BeautifulSoup

    html = '''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    # Elements with class="panel-heading" nested under class="panel"
    print(soup.select('.panel .panel-heading'))
    # Every <li> nested under a <ul>, returned as a list
    print(soup.select('ul li'))
    # "#list-2" means id="list-2"; ".element" means class="element":
    # the class="element" tags inside the element with that id
    print(soup.select('#list-2 .element'))
    print(type(soup.select('ul')[0]))  # results are bs4.element.Tag objects
    
    # Getting text content
    # Demo: get_text() returns the text inside each selected tag.
    from bs4 import BeautifulSoup

    html = '''
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
    # Explicit parser avoids bs4's "guessed at parser" warning and keeps the
    # result identical across environments.
    soup = BeautifulSoup(html, "html.parser")
    for li in soup.select('li'):
        print(li.get_text())
    # Prints the text of every <li>, in document order:
    # Foo
    # Bar
    # Jay
    # Foo
    # Bar
  • 相关阅读:
    第一章计算机基础
    补充:bytes类型以及字符编码转换
    python内存相关以及深浅拷贝讲解
    python之路day15--内置函数
    python之路day14--嵌套函数、匿名函数、高阶函数。函数的递归
    python之路day13--迭代器
    python之路day14--列表生成式、生成器generator、生成器并行
    spark MLlib DataType ML中的数据类型
    spark actions 算子
    spark Transformations算子
  • 原文地址:https://www.cnblogs.com/yuanke98/p/9219324.html
Copyright © 2020-2023  润新知