• 解析网页的多种方式


    1.使用正则表达式获取网页中导航链接的文本内容

    In [ ]:

    # Fetch the TipDM homepage over HTTP and force UTF-8 decoding so the
    # Chinese text in the page renders correctly; `rqq.text` then shows
    # the decoded HTML source (last expression echoed by the notebook).
    import requests
    import re
    
    url = 'http://www.tipdm.com/tipdm/index.html'
    
    rqq = requests.get(url)
    rqq.encoding = 'utf-8'
    rqq.text
    

    In [ ]:

    # Pull the visible link text out of <li><a ...> entries with a regex.
    # This strict pattern requires a bare <li> and a target="..." attribute
    # on the anchor, so anchors written differently are not matched.
    re.findall('<li><a href="[a-z0-9.:/]+" target=".*">(.+)</a></li>', rqq.text)
    

    In [ ]:

    # Looser variant of the pattern above: tolerates extra attributes on
    # both <li> and <a>, and no longer requires a target attribute.
    re.findall('<li[ a-z="]*><a[a-z0-9 = "]* href="[a-z0-9.:/]+".*>(.+)</a></li>', rqq.text)
    

    2.使用XPath解析网页

    In [ ]:

    # Re-fetch the page and parse it with lxml: etree.HTML builds an
    # element tree from the raw response bytes using the HTML parser
    # (tolerant of malformed markup, unlike the strict XML parser).
    import requests
    from lxml import etree
    
    url = 'http://www.tipdm.com/tipdm/index.html'
    
    rqq = requests.get(url)
    html = etree.HTML(rqq.content, etree.HTMLParser())
    html
    

    In [ ]:

    # Serialize the parsed tree back to a UTF-8 string for inspection,
    # then set the response encoding so rqq.text also decodes correctly.
    # (The first expression's value is only displayed, not stored.)
    etree.tostring(html, encoding='utf-8').decode('utf-8')
    rqq.encoding = 'utf-8'
    rqq.text
    

    In [ ]:

    # Absolute XPath from the document root down to the <title> element.
    html.xpath('/html/head/title')
    

    In [ ]:

    # '//' searches the whole document: find every <title> anywhere.
    html.xpath('//title')
    

    In [ ]:

    # Three ways to reach the nav-menu link texts: a full absolute path,
    # a relative '//' search, and a lookup by the element's id attribute.
    # text() returns the link captions rather than the element nodes.
    html.xpath('/html/body/header/div/nav/ul/li/a/text()')
    html.xpath('//header/div/nav/ul/li/a/text()')
    html.xpath('//*[@id="menu"]/li/a/text()')
    

    In [ ]:

    # The [last()] predicate selects only the final <li> of the menu.
    html.xpath('/html/body/header/div/nav/ul/li[last()]')
    

    In [ ]:

    # Attribute predicate: keep only anchors whose target is "_blank".
    html.xpath('/html/body/header/div/nav/ul/li/a[@target="_blank"]/text()')
    

    In [ ]:

    # Second example site: fetch the Sogou WeChat search homepage.
    import requests
    from lxml import etree
    
    url = 'https://weixin.sogou.com/'
    
    rqq = requests.get(url)
    

    In [ ]:

    # Parse with an explicit parser encoding this time, then select the
    # hot-search entries: anchors (with a title attribute) inside the
    # element whose id is "topwords".
    html = etree.HTML(rqq.content, etree.HTMLParser(encoding='utf-8'))
    
    html.xpath('//*[@id="topwords"]/li/a[@title]/text()')
    

    In [ ]:

    # Same data two ways: fetch the hot-search texts one <li> at a time
    # by building an indexed XPath per item (li[1] .. li[10]), versus a
    # single XPath that returns all of them at once.
    [html.xpath('//*[@id="topwords"]/li['+str(i)+']/a[@title]/text()') for i in range(1, 11)]
    html.xpath('//*[@id="topwords"]/li/a[@title]/text()')
    

    In [ ]:

    # Contrast the decoded text (str) with the raw body (bytes) of the
    # same response: first 20 characters vs first 20 bytes.
    print(rqq.text[:20])
    print(rqq.content[:20])
    

    3.使用Beautiful Soup解析网页

    In [ ]:

    # Third approach: parse the TipDM homepage with Beautiful Soup,
    # using lxml as the underlying parser backend.
    import requests
    from bs4 import BeautifulSoup
    
    rqq = requests.get('http://www.tipdm.com/tipdm/index.html')
    
    soup = BeautifulSoup(rqq.content, 'lxml')
    

    In [ ]:

    # Tag-name attributes (soup.head, soup.li, ...) return only the FIRST
    # matching element; find_all('li') returns every <li> in the document.
    soup.head
    soup.body
    soup.li
    soup.find_all('li')
    

    In [ ]:

    # Inspect the first <link> tag: its tag name, its full attribute
    # dictionary, and a single attribute accessed by subscript.
    a = soup.link
    a.name
    a.attrs
    a['href']
    

    In [ ]:

    # Print the text content of every <li> inside the first <nav> element.
    nav_tags = soup.find_all('nav')
    for item in nav_tags[0].find_all('li'):
        print(item.string)
    

    In [ ]:

    # CSS selector navigation: take the first match for the title element
    # and read its text content.
    a=soup.select('html > head > title')[0]
    a.text
    

    In [ ]:

    # CSS selectors: '.' matches by class, '#' matches by id; the final
    # comprehension extracts just the text of each menu item.
    soup.select('.menu > li')  # class
    soup.select('#menu > li')  # id
    [i.text for i in soup.select('.menu > li')]
    

    In [ ]:

    # Direct-child chain: the <a> elements inside each #menu <li>.
    soup.select('#menu > li > a')
    

    In [ ]:

    # Beautiful Soup on the Sogou WeChat homepage: select the hot-search
    # container by id, then pull each entry's text and title attribute
    # via the '.hot-news' class selector.
    from bs4 import BeautifulSoup
    import requests
    
    rqq = requests.get('https://weixin.sogou.com/')
    soup = BeautifulSoup(rqq.content, 'lxml')
    
    soup.select('#topwords')
    [i.text for i in soup.select('.hot-news > li > a')]
    [i['title'] for i in soup.select('.hot-news > li > a')]
    

    In [ ]:

    # Equivalent lookup via find_all(id=...): grab the container element,
    # then compare .text and .string for each anchor inside it.
    a = soup.find_all(id='topwords')[0]
    [i.text for i in a.find_all('a')]
    [i.string for i in a.find_all('a')]
    

    4.参考文章

    【创作不易,望点赞收藏,若有疑问,请留言,谢谢】

  • 相关阅读:
    运算符优先级问题
    文件操作工具,需者自取
    Text文档编码识别方法
    删除重复文件的程序
    修道士和野人问题
    猜数字游戏
    存储器层级图
    IL指令汇总
    输入1~8,每个数字不重复
    厦门大学线下编程比赛第一题:求和
  • 原文地址:https://www.cnblogs.com/dongxuelove/p/16434804.html
Copyright © 2020-2023  润新知