• 爬虫——选择器


    BeautifulSoup

    • 导入类库
    from bs4 import BeautifulSoup  # 适用于简单页面
    
    
    • 创建soup对象
    soup = BeautifulSoup(test_data,'lxml')
    
    • 标签特性
    1 # print(soup.a)                          #获取指定标签
    2 # print(soup.a['href'])                    #获取标签指定属性
    3 # print(soup.a.contents,type(soup.a.contents))      #获取标签的子节点列表(包含子标签和字符串),并非纯文本
    4 # print(soup.a.text,type(soup.a.text))                #获取标签的文本,结果是字符串
    • 查找标签
      • 查找所有a标签
      • 查找特定id的a标签及其href
      • 查找所有特定id的标签
     1 # 查找所有a标签
     2 # a_list = soup.find_all('a')
     3 # for a in a_list:
     4 #     print(a['href'])
     5 
     6 # 查找id为places_neighbours__row的a标签及其href
     7 # a_place = soup.find_all('a',id='places_neighbours__row')
     8 # for a in a_place:
     9 #     print(a['href'])
    10 # 作用比上面更广,不指定标签,把具有id为places_neighbours__row的标签都找出来
    11 # attrs_place = soup.find_all(attrs={'id':'places_neighbours__row'})
    12 # for attrs in attrs_place:
    13 #     print(attrs.name)           #name为标签名字

    lxml

    / 从根节点开始选取 // 从文档任意位置选取(匹配所有符合条件的后代节点) * 通配符,选择所有 //div/book[2]/title[@lang="zh"] 选择div标签下第二个book(下标从1开始)中含有lang="zh"属性的title元素

    • 导入类库
    import lxml.html
    
    • 创建lxml对象
    html = lxml.html.fromstring(test_data) 
    
    • 相对路径//
    # 以下三种具有相同结果,使用相对路径最终都指向title,写得越简单需要搜索越久
    # html_data = html.xpath('//div/book/title')
    # html_data = html.xpath('//book/title')
    # html_data = html.xpath('//title')
    
    • *作用
    # 把含有属性的title都选择出来
    # html_data = html.xpath('//book/title[@*]')
    
    # 将title所有的属性值选择出来,选择出来的是属性值没有text
    # html_data = html.xpath('//book/title/@*')
    
    • 内置text()函数

    用text()取出title的文本内容,返回的是一个列表,不需要再使用text属性

    # html_data = html.xpath('//book/title/text()')
    # html_data = html.xpath('//div/ul/li[1]/a/text()')
    
    • 逻辑关系and
    # html_data = html.xpath('//a[@href="link1.html" and @id="places_neighbours__row"]/text()')
    
    • 逻辑关系or
    # html_data = html.xpath('//li[@class="item-1" or @class="item-0"]/a/text()')
    
    • 不等于(!=)
    # html_data = html.xpath('//li[@class!="item-1" and @class!="item-0"]/a/text()')
    
    • last()

    last()取指定标签的最后一个,last()-1则取倒数第二个(如下例)

    # html_data = html.xpath('//div/book[last()-1]/title/text()')
    
    • 比较关系
    # html_data = html.xpath('//div/book[price > 39]/title/text()')
    # html_data = html.xpath('//div/book[price >= 39.95]/title/text()')
    
    • starts-with
    # html_data = html.xpath('//li[starts-with(@class,"item")]/a/text()')
    # html_data = html.xpath('//li[starts-with(@class,"g")]/a/text()')
    
    • contains
    # html_data = html.xpath('//li[contains(@class,"te")]/a/text()')
    # html_data = html.xpath('//title[contains(@lang,"n")]/text()')
    
    • 父子节点
    # html_data = html.xpath('//book/descendant::*/text()')
    # html_data = html.xpath('//book/ancestor::*')  # 选出祖先节点


    爬取糗事百科文章

    爬取前对网页源码进行分析,对要爬取的文章定位

    1 # url_base = 'https://www.qiushibaike.com/8hr/page/2/'
    2 # result = requests.get(url_base,headers=headers)
    3 # html = lxml.html.fromstring(result.text)
    4 # html_data = html.xpath('//div[@class="content"]/span[1]/text()')
    5 # # print(html_data)
    6 # for i in html_data:
    7 #     with open('./qiushi.txt','ab') as f:
    8 #         f.write(i.encode('utf-8'))
    9 # print(result.text)

    爬取糗事百科图片

    爬取前对网页源码进行分析,对要爬取的图片定位

    
    
     1 # url_base = 'https://www.qiushibaike.com/imgrank/page/2/'
     2 # result = requests.get(url_base,headers=headers)
     3 # html = lxml.html.fromstring(result.text)
     4 # html_data = html.xpath('//div[@class="thumb"]/a/img/@src')     #提取图片地址
     5 # # print(html_data[0][2:],type(str(html_data[0][2:])))
     6 # for i in html_data:
     7 #     photo_url = 'https:'+str(i)
     8 #     photo = requests.get(photo_url)
     9 #     p_num = html_data.index(i) + 1
    10 #     with open('./picture/'+str(p_num)+'.jpg','wb') as f:
    11 #         f.write(photo.content)






    请使用手机"扫一扫"

  • 相关阅读:
    第6周编程题:零基础学Java
    帆软报表软件学习计划
    北大软件工程——第八周:面向对象设计2
    hdu1264 Counting Squares
    hdu1264 Counting Squares
    poj1151 Atlantis(线段树+扫描线)
    poj1151 Atlantis(线段树+扫描线)
    bzoj4653 [Noi2016]区间
    bzoj4653 [Noi2016]区间
    Tyvj1043
  • 原文地址:https://www.cnblogs.com/siplips/p/9689473.html
Copyright © 2020-2023  润新知