• Spider_基础总结3_BeautifulSoup对象+find()+find_all()


    # 本节内容:
    # 解析复杂的 HTML网页:
    # 1--bs.find()  bs.find_all()  tag.get_text()
    # find_all(tag/tag_list,attributes_dict,recursive,text,limit,keywords)  
    #     find(tag/tag_list,attributes_dict,recursive,text,keywords)
    
    # 2--导航树(通过标签在文档中的位置来查找,注意这不是 CSS选择器): 一般与 bs.find() bs.find_all()搭配使用 
    # tag.children   tag.descendants  tag.next_siblings   tag.previous_siblings  tag.parent
    
    # 3--BeautifulSoup对象: 
    # beautifulsoup对象  bs
    # Tag对象(包含单个Tag或者 Tag列表)
    # NavigableString 对象    表示标签里的文字,而不是标签本身
    # Comment对象 用来查找 HTML 文档的注释标签,<!--像这样-->
    
    # 解析复杂的 html网页时,我们使用 beautifulsoup利用 css的样式属性可以轻松地区分出不同的标签来:
    #  bs.find()   bs.find_all()   tag.get_text()
     
    # 一,引子:
    import requests
    from requests import exceptions
    from bs4 import BeautifulSoup
    
    # Fetch the sample page. The timeout keeps a stalled server from hanging the
    # script forever, and raise_for_status() stops on an HTTP error response
    # instead of silently parsing an error page.
    html = requests.get('http://www.pythonscraping.com/pages/warandpeace.html', timeout=10)
    html.raise_for_status()
    bs = BeautifulSoup(html.text, 'html.parser')
    # print(bs)  # uncomment to dump the whole parsed document
    # find_all(tag, attrs) returns a list of every tag that matches the tag name
    # and the attribute dict; find_all is the current spelling of the legacy
    # findAll alias.
    nameList = bs.find_all('span', {'class': 'green'})
    for name in nameList:
        # Call get_text() only at the very end: it drops the markup, so keep the
        # tag structure for as long as further searching may be needed.
        print(name.get_text())
    
    Anna
    Pavlovna Scherer
    Empress Marya
    Fedorovna
    Prince Vasili Kuragin
    Anna Pavlovna
    St. Petersburg
    the prince
    Anna Pavlovna
    Anna Pavlovna
    the prince
    the prince
    the prince
    Prince Vasili
    Anna Pavlovna
    Anna Pavlovna
    the prince
    Wintzingerode
    King of Prussia
    le Vicomte de Mortemart
    Montmorencys
    Rohans
    Abbe Morio
    the Emperor
    the prince
    Prince Vasili
    Dowager Empress Marya Fedorovna
    the baron
    Anna Pavlovna
    the Empress
    the Empress
    Anna Pavlovna's
    Her Majesty
    Baron
    Funke
    The prince
    Anna
    Pavlovna
    the Empress
    The prince
    Anatole
    the prince
    The prince
    Anna
    Pavlovna
    Anna Pavlovna
    
    # 二,通过标签的名称和属性来查找标签:
    
    # bs.find_all()与 bs.find()  (后者相当于前者 limit=1的情况)
    
    # find_all(tag/tag_list,attributes_dict,recursive,text,limit,keywords)  
    #     find(tag/tag_list,attributes_dict,recursive,text,keywords)
    
    # tag/tag_list (标签或标签列表)-- 如:'span' 或 ['h1','h2','p']
    # attributes_dict (属性字典)-- 如: {'class':'green'}  再如:{'class':{'green', 'red'}}  
    # recursive (递归 )    -- 默认为 True,表示在调用对象的所有后代标签中查找;设为 False 时只查找直接子标签。
    # text (文本参数 )     -- text='指定要查找的文本内容',按标签的文本内容而不是属性来匹配(新版 bs4 中该参数名为 string,text 是旧名称);返回的是 NavigableString,而不是标签对象。
    # limit (限制匹配次数 )-- 只返回按网页上出现顺序的前 limit 个匹配结果,未必是你想要的那几项。
    # keywords-- 可以设置一个或多个 keyword 来进一步限制匹配的标签,如 id='title' class_='green'等。(class 是 python 的关键字,所以 bs 规定写成 class_)
    
    
    # 示例 1:
    
    # Passing a list of tag names matches any of them: collect every heading level.
    titles = bs.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    print(list(titles))   # [<h1>War and Peace</h1>, <h2>Chapter 1</h2>]
    
    # string= (the current name of the older text= argument) matches on the text
    # content rather than on tag attributes, so the results are NavigableString
    # objects, not tags.
    prince = bs.find(string='the prince')
    print(type(prince))  # <class 'bs4.element.NavigableString'>
    prince_list = bs.find_all(string='the prince')
    # The result set prints the same way whether shown directly or copied into
    # a plain list.
    print(prince_list)
    print(list(prince_list))
    
    [<h1>War and Peace</h1>, <h2>Chapter 1</h2>]
    <class 'bs4.element.NavigableString'>
    ['the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince']
    ['the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince']
    
    # 示例 2:
    # Keyword filters are combined: a tag must have id="title" and the CSS class
    # "text" to match. class_ carries a trailing underscore because class is a
    # reserved word in Python.
    allText = bs.find_all(id='title', class_='text')
    # Nothing on this page satisfies both filters, so both lines print [].
    print(allText)
    print(list(allText))
    
    []
    []
    
    # 三,BeautifulSoup对象:
    # 1-beautifulsoup对象  bs
    # 2-Tag对象(包含单个Tag或者 Tag列表)
    # 3-NavigableString 对象    表示标签里的文字,而不是标签本身
    # 4-Comment对象 用来查找 HTML 文档的注释标签,<!--像这样-->
    
    # 四,导航树:子标签,后代标签,兄弟标签,父标签
    # find_all()与find()是通过标签的名称和属性来查找标签,我们还可以通过标签的位置来查找:
    # 1)单一方向: bs.tag.subtag.anothersubtag 
    # 2) 导航树:纵向和横向导航
    
    # 1-- 子标签: .children
    import requests
    from bs4 import BeautifulSoup
    
    # Fetch the gift-list page; fail fast on a stalled connection or an HTTP
    # error response instead of hanging or parsing an error page.
    html = requests.get('http://www.pythonscraping.com/pages/page3.html', timeout=10)
    html.raise_for_status()
    bs = BeautifulSoup(html.text, 'html.parser')
    
    # .children yields only the direct children of the table. The whitespace
    # between rows is a child too, which is why blank entries show up between
    # the <tr> rows in the output.
    gift_table = bs.find('table', {'id': 'giftList'})
    for child in gift_table.children:
        print(child)
        print('--------------------------------------------')
    
    
    --------------------------------------------
    <tr><th>
    Item Title
    </th><th>
    Description
    </th><th>
    Cost
    </th><th>
    Image
    </th></tr>
    --------------------------------------------
    
    
    --------------------------------------------
    <tr class="gift" id="gift1"><td>
    Vegetable Basket
    </td><td>
    This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
    <span class="excitingNote">Now with super-colorful bell peppers!</span>
    </td><td>
    $15.00
    </td><td>
    <img src="../img/gifts/img1.jpg"/>
    </td></tr>
    --------------------------------------------
    
    
    --------------------------------------------
    <tr class="gift" id="gift2"><td>
    Russian Nesting Dolls
    </td><td>
    Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
    </td><td>
    $10,000.52
    </td><td>
    <img src="../img/gifts/img2.jpg"/>
    </td></tr>
    --------------------------------------------
    
    
    --------------------------------------------
    <tr class="gift" id="gift3"><td>
    Fish Painting
    </td><td>
    If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
    </td><td>
    $10,005.00
    </td><td>
    <img src="../img/gifts/img3.jpg"/>
    </td></tr>
    --------------------------------------------
    
    
    --------------------------------------------
    <tr class="gift" id="gift4"><td>
    Dead Parrot
    </td><td>
    This is an ex-parrot! <span class="excitingNote">Or maybe he's only resting?</span>
    </td><td>
    $0.50
    </td><td>
    <img src="../img/gifts/img4.jpg"/>
    </td></tr>
    --------------------------------------------
    
    
    --------------------------------------------
    <tr class="gift" id="gift5"><td>
    Mystery Box
    </td><td>
    If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. <span class="excitingNote">Keep your friends guessing!</span>
    </td><td>
    $1.50
    </td><td>
    <img src="../img/gifts/img6.jpg"/>
    </td></tr>
    --------------------------------------------
    
    
    --------------------------------------------
    
    # 2-- 后代标签: .descendants 
    
    import requests
    from bs4 import BeautifulSoup
    
    # Fetch the gift-list page; fail fast on a stalled connection or an HTTP
    # error response instead of hanging or parsing an error page.
    html = requests.get('http://www.pythonscraping.com/pages/page3.html', timeout=10)
    html.raise_for_status()
    bs = BeautifulSoup(html.text, 'html.parser')
    
    # Look the table up by its id. For the first row, bs.table.tr or bs.tr would
    # also work, but they are less specific and break easily when the page
    # layout changes.
    gift_table = bs.find('table', {'id': 'giftList'})
    # .descendants walks every nested level: each row, then its cells, then the
    # text and tags inside each cell.
    for descendant in gift_table.descendants:
        print(descendant)
        print('--------------------------------------------')
    
    --------------------------------------------
    <tr><th>
    Item Title
    </th><th>
    Description
    </th><th>
    Cost
    </th><th>
    Image
    </th></tr>
    --------------------------------------------
    <th>
    Item Title
    </th>
    --------------------------------------------
    
    Item Title
    
    --------------------------------------------
    <th>
    Description
    </th>
    --------------------------------------------
    
    Description
    
    --------------------------------------------
    <th>
    Cost
    </th>
    --------------------------------------------
    
    Cost
    
    --------------------------------------------
    <th>
    Image
    </th>
    --------------------------------------------
    
    Image
    
    --------------------------------------------
    
    
    --------------------------------------------
    <tr class="gift" id="gift1"><td>
    Vegetable Basket
    </td><td>
    This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
    <span class="excitingNote">Now with super-colorful bell peppers!</span>
    </td><td>
    $15.00
    </td><td>
    <img src="../img/gifts/img1.jpg"/>
    </td></tr>
    --------------------------------------------
    <td>
    Vegetable Basket
    </td>
    --------------------------------------------
    
    Vegetable Basket
    
    --------------------------------------------
    <td>
    This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
    <span class="excitingNote">Now with super-colorful bell peppers!</span>
    </td>
    --------------------------------------------
    
    This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
    
    --------------------------------------------
    <span class="excitingNote">Now with super-colorful bell peppers!</span>
    --------------------------------------------
    Now with super-colorful bell peppers!
    --------------------------------------------
    
    
    --------------------------------------------
    <td>
    $15.00
    </td>
    --------------------------------------------
    
    $15.00
    
    --------------------------------------------
    <td>
    <img src="../img/gifts/img1.jpg"/>
    </td>
    --------------------------------------------
    
    
    --------------------------------------------
    <img src="../img/gifts/img1.jpg"/>
    --------------------------------------------
    
    
    --------------------------------------------
    
    
    --------------------------------------------
    <tr class="gift" id="gift2"><td>
    Russian Nesting Dolls
    </td><td>
    Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
    </td><td>
    $10,000.52
    </td><td>
    <img src="../img/gifts/img2.jpg"/>
    </td></tr>
    --------------------------------------------
    <td>
    Russian Nesting Dolls
    </td>
    --------------------------------------------
    
    Russian Nesting Dolls
    
    --------------------------------------------
    <td>
    Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
    </td>
    --------------------------------------------
    
    Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 
    --------------------------------------------
    <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
    --------------------------------------------
    8 entire dolls per set! Octuple the presents!
    --------------------------------------------
    
    
    --------------------------------------------
    <td>
    $10,000.52
    </td>
    --------------------------------------------
    
    $10,000.52
    
    --------------------------------------------
    <td>
    <img src="../img/gifts/img2.jpg"/>
    </td>
    --------------------------------------------
    
    
    --------------------------------------------
    <img src="../img/gifts/img2.jpg"/>
    --------------------------------------------
    
    
    --------------------------------------------
    
    
    --------------------------------------------
    <tr class="gift" id="gift3"><td>
    Fish Painting
    </td><td>
    If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
    </td><td>
    $10,005.00
    </td><td>
    <img src="../img/gifts/img3.jpg"/>
    </td></tr>
    --------------------------------------------
    <td>
    Fish Painting
    </td>
    --------------------------------------------
    
    Fish Painting
    
    --------------------------------------------
    <td>
    If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
    </td>
    --------------------------------------------
    
    If something seems fishy about this painting, it's because it's a fish! 
    --------------------------------------------
    <span class="excitingNote">Also hand-painted by trained monkeys!</span>
    --------------------------------------------
    Also hand-painted by trained monkeys!
    --------------------------------------------
    
    
    --------------------------------------------
    <td>
    $10,005.00
    </td>
    --------------------------------------------
    
    $10,005.00
    
    --------------------------------------------
    <td>
    <img src="../img/gifts/img3.jpg"/>
    </td>
    --------------------------------------------
    
    
    --------------------------------------------
    <img src="../img/gifts/img3.jpg"/>
    --------------------------------------------
    
    
    --------------------------------------------
    
    
    --------------------------------------------
    <tr class="gift" id="gift4"><td>
    Dead Parrot
    </td><td>
    This is an ex-parrot! <span class="excitingNote">Or maybe he's only resting?</span>
    </td><td>
    $0.50
    </td><td>
    <img src="../img/gifts/img4.jpg"/>
    </td></tr>
    --------------------------------------------
    <td>
    Dead Parrot
    </td>
    --------------------------------------------
    
    Dead Parrot
    
    --------------------------------------------
    <td>
    This is an ex-parrot! <span class="excitingNote">Or maybe he's only resting?</span>
    </td>
    --------------------------------------------
    
    This is an ex-parrot! 
    --------------------------------------------
    <span class="excitingNote">Or maybe he's only resting?</span>
    --------------------------------------------
    Or maybe he's only resting?
    --------------------------------------------
    
    
    --------------------------------------------
    <td>
    $0.50
    </td>
    --------------------------------------------
    
    $0.50
    
    --------------------------------------------
    <td>
    <img src="../img/gifts/img4.jpg"/>
    </td>
    --------------------------------------------
    
    
    --------------------------------------------
    <img src="../img/gifts/img4.jpg"/>
    --------------------------------------------
    
    
    --------------------------------------------
    
    
    --------------------------------------------
    <tr class="gift" id="gift5"><td>
    Mystery Box
    </td><td>
    If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. <span class="excitingNote">Keep your friends guessing!</span>
    </td><td>
    $1.50
    </td><td>
    <img src="../img/gifts/img6.jpg"/>
    </td></tr>
    --------------------------------------------
    <td>
    Mystery Box
    </td>
    --------------------------------------------
    
    Mystery Box
    
    --------------------------------------------
    <td>
    If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. <span class="excitingNote">Keep your friends guessing!</span>
    </td>
    --------------------------------------------
    
    If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. 
    --------------------------------------------
    <span class="excitingNote">Keep your friends guessing!</span>
    --------------------------------------------
    Keep your friends guessing!
    --------------------------------------------
    
    
    --------------------------------------------
    <td>
    $1.50
    </td>
    --------------------------------------------
    
    $1.50
    
    --------------------------------------------
    <td>
    <img src="../img/gifts/img6.jpg"/>
    </td>
    --------------------------------------------
    
    
    --------------------------------------------
    <img src="../img/gifts/img6.jpg"/>
    --------------------------------------------
    
    
    --------------------------------------------
    
    
    --------------------------------------------
    
    # 3-- 兄弟标签:next_siblings 和 previous_siblings
    
    import requests
    from bs4 import BeautifulSoup
    
    # Fetch the gift-list page; fail fast on a stalled connection or an HTTP
    # error response instead of hanging or parsing an error page.
    html = requests.get('http://www.pythonscraping.com/pages/page3.html', timeout=10)
    html.raise_for_status()
    bs = BeautifulSoup(html.text, 'html.parser')
    
    # .tr is the first row of the table (the header row). next_siblings yields
    # everything after it at the same level, so the header row itself is not
    # printed.
    header_row = bs.find('table', {'id': 'giftList'}).tr
    for sibling in header_row.next_siblings:
        print(sibling)
    
    <tr class="gift" id="gift1"><td>
    Vegetable Basket
    </td><td>
    This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
    <span class="excitingNote">Now with super-colorful bell peppers!</span>
    </td><td>
    $15.00
    </td><td>
    <img src="../img/gifts/img1.jpg"/>
    </td></tr>
    
    
    <tr class="gift" id="gift2"><td>
    Russian Nesting Dolls
    </td><td>
    Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
    </td><td>
    $10,000.52
    </td><td>
    <img src="../img/gifts/img2.jpg"/>
    </td></tr>
    
    
    <tr class="gift" id="gift3"><td>
    Fish Painting
    </td><td>
    If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
    </td><td>
    $10,005.00
    </td><td>
    <img src="../img/gifts/img3.jpg"/>
    </td></tr>
    
    
    <tr class="gift" id="gift4"><td>
    Dead Parrot
    </td><td>
    This is an ex-parrot! <span class="excitingNote">Or maybe he's only resting?</span>
    </td><td>
    $0.50
    </td><td>
    <img src="../img/gifts/img4.jpg"/>
    </td></tr>
    
    
    <tr class="gift" id="gift5"><td>
    Mystery Box
    </td><td>
    If you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. <span class="excitingNote">Keep your friends guessing!</span>
    </td><td>
    $1.50
    </td><td>
    <img src="../img/gifts/img6.jpg"/>
    </td></tr>
    
    # 4-- 父标签:.parent  用的比较少
    
    # 查找图片 '../img/gifts/img1.jpg'对应的商品的价格:
    import requests
    from bs4 import BeautifulSoup
    # Fetch the gift-list page; fail fast on a stalled connection or an HTTP
    # error response instead of hanging or parsing an error page.
    html = requests.get('http://www.pythonscraping.com/pages/page3.html', timeout=10)
    html.raise_for_status()
    bs = BeautifulSoup(html.text, 'html.parser')
    
    # Walk from the image to its price: the <img> sits inside a <td> (its
    # parent), and the sibling just before that <td> is the cost cell.
    image = bs.find('img', {'src': '../img/gifts/img1.jpg'})
    price_cell = image.parent.previous_sibling
    print(price_cell.get_text())
    
    $15.00
    
                                                                                                                
    
  • 相关阅读:
    1.5 RPM红帽软件包1.6 Yum软件仓库
    Linux如何查找大文件或目录总结
    1.4 重置root用户密码
    Chrome 红色和 Chromium蓝色 区别:logoChrome 红色和 Chromium蓝色;Chrome闭源和 Chromium开源;
    【Android架构GPS篇】之定位数据怎样从GPS芯片到应用层
    ArcGIS教程:生成特征文件、类和聚类分析
    MiniGUI + Hi3531 笔记 .
    swift手记-2
    java桌面程序中使用联动菜单遇到与解决的问题
    <div+css页面布局课堂笔记>11---页面布局站点首页设计实例__终极版(仿csdn首页)
  • 原文地址:https://www.cnblogs.com/Collin-pxy/p/13178990.html
Copyright © 2020-2023  润新知