• Python BeautifulSoup 对 HTML 进行解析和元素提取(部分)


    from bs4 import BeautifulSoup

    # Sample document used throughout. The markup stops after the last <p>
    # and never closes <body> or <html>; the prettified output shows the
    # parser supplying those closing tags.
    # NOTE: only the first <a> has an `href`; the other two spell the
    # attribute `hred` (sic), and that spelling is kept in the parsed output.
    html = '''
    <html><head><title>The Domouse's story</title></head>
    <body>
    <p class="title"name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were little sisters;and their names were
    <a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a>
    <a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
    <a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
    and they lived at bottom of a well.</p>
    <p class="story">...</p>
    '''

    # Parse the markup with the lxml parser.
    soup = BeautifulSoup(html, features='lxml')

    # Re-indented rendering of the parsed tree, one node per line.
    formatted = soup.prettify()
    print(formatted)

    # Text content of the <title> element.
    page_title = soup.title.string
    print(page_title)
     
    结果:
    <html>
     <head>
      <title>
       The Domouse's story
      </title>
     </head>
     <body>
      <p class="title" name="dromouse">
       <b>
        The Dormouse's story
       </b>
      </p>
      <p class="story">
       Once upon a time there were little sisters;and their names were
       <a class="sister" href="http://example.com/elsie" id="link1">
        <!--Elsie-->
       </a>
       <a class="sister" hred="http://example.com/lacle" id="link2">
        Lacle
       </a>
       and
       <a class="sister" hred="http://example.com/tilie" id="link3">
        Tillie
       </a>
       and they lived at bottom of a well.
      </p>
      <p class="story">
       ...
      </p>
     </body>
    </html>
    The Domouse's story
     
     
     
     
     
     

    选择元素
    from bs4 import BeautifulSoup

    # NOTE: the 2nd and 3rd <a> tags spell their attribute `hred` (sic).
    html = '''
    <html><head><title>The Domouse's story</title></head>
    <body>
    <p class="title"name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were little sisters;and their names were
    <a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a>
    <a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
    <a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
    and they lived at bottom of a well.</p>
    <p class="story">...</p>
    '''
    soup = BeautifulSoup(html, features='lxml')

    # Accessing a tag name as an attribute of the soup selects that element.
    title_tag = soup.title
    print(title_tag)
    # Output: <title>The Domouse's story</title>

    # The selected element is a bs4 Tag object.
    print(type(title_tag))
    # Output: <class 'bs4.element.Tag'>

    print(soup.head)
    # Output: <head><title>The Domouse's story</title></head>

    # When several elements share the tag name, only the first is returned.
    print(soup.p)
    # Output: <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
     
     
     
    获取标签名称:
    from bs4 import BeautifulSoup

    # NOTE: the 2nd and 3rd <a> tags spell their attribute `hred` (sic).
    html = '''
    <html><head><title>The Domouse's story</title></head>
    <body>
    <p class="title"name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were little sisters;and their names were
    <a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a>
    <a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
    <a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
    and they lived at bottom of a well.</p>
    <p class="story">...</p>
    '''
    soup = BeautifulSoup(html, features='lxml')

    # `.name` gives the tag name of the selected element.
    title_tag = soup.title
    print(title_tag.name)
    # Output: title
     
     
     
    获取属性:
    from bs4 import BeautifulSoup

    # NOTE: the 2nd and 3rd <a> tags spell their attribute `hred` (sic).
    html = '''
    <html><head><title>The Domouse's story</title></head>
    <body>
    <p class="title"name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were little sisters;and their names were
    <a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a>
    <a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
    <a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
    and they lived at bottom of a well.</p>
    <p class="story">...</p>
    '''
    soup = BeautifulSoup(html, features='lxml')

    # First <p> in the document: <p class="title" name="dromouse">.
    first_paragraph = soup.p

    # Two equivalent ways to read an attribute: through the `.attrs`
    # mapping, or by indexing the tag directly.
    print(first_paragraph.attrs['name'])
    # Output: dromouse
    print(first_paragraph['name'])
    # Output: dromouse
     
     

    获取标签内容:
    from bs4 import BeautifulSoup

    # NOTE: the 2nd and 3rd <a> tags spell their attribute `hred` (sic).
    html = '''
    <html><head><title>The Domouse's story</title></head>
    <body>
    <p class="title"name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were little sisters;and their names were
    <a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a>
    <a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
    <a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
    and they lived at bottom of a well.</p>
    <p class="story">...</p>
    '''
    soup = BeautifulSoup(html, features='lxml')

    # The first <p> wraps a single <b> whose only child is text; `.string`
    # returns that text.
    first_paragraph = soup.p
    print(first_paragraph.string)
    # Output: The Dormouse's story
     
     
    根据标签名(name)查找:
    from bs4 import BeautifulSoup

    # Sample panel markup containing two <ul> lists.
    # The final line of the markup closes the outer "panel" div; the
    # original sample opened a new <div> there instead, leaving two
    # unclosed divs.
    html = '''
    <div class="panel">
        <div class="panel-heading"name="elements">
            <h4>Hello</h4>
        </div>
        <div class="panel-body">
            <ul class="list"Id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-small"Id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
            </ul>
        </div>
    </div>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # find_all() collects every element with the given tag name into a
    # list-like result; run the search once and reuse it.
    ul_tags = soup.find_all('ul')
    print(ul_tags)

    # Each entry in the result is a bs4 Tag object.
    print(type(ul_tags[0]))
     
    结果:
    [<ul class="list" id="list-1">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    <li class="element">Jay</li>
    </ul>, <ul class="list list-small" id="list-2">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    </ul>]
    <class 'bs4.element.Tag'>
  • 相关阅读:
    joda-time的使用
    logger 的使用一 小结
    svn 技巧
    mysql 获取一段时间的数据
    Drools5
    Java各种Utils小结
    Java 8 新特新 工具类 ZonedDateTime
    集合工具类CollectionUtils、ListUtils、SetUtils、MapUtils的使用
    sonar 的使用
    MySQL入门教学(看完必懂,图文详解!)
  • 原文地址:https://www.cnblogs.com/1208xu/p/11824375.html
Copyright © 2020-2023  润新知