• Spider practice: using BeautifulSoup to crawl all links from a static web page


    # Get the href attribute value of every <a> tag on the Baidu homepage:
    
    # import requests
    # from bs4 import BeautifulSoup 
    
    # # html = requests.get('http://en.wikipedia.org/wiki/Kevin_Bacon')
    # html = requests.get('http://www.baidu.com')
    # bs = BeautifulSoup(html.text, 'html.parser')
    # for link in bs.find_all(lambda tag: 'href' in tag.attrs):
    #     print(link.attrs['href'])
        
    
    # import requests
    # import re
    # from bs4 import BeautifulSoup 
    
    # # html = requests.get('http://en.wikipedia.org/wiki/Kevin_Bacon')
    # html = requests.get('http://www.baidu.com')
    # bs = BeautifulSoup(html.text, 'html.parser')
    # for link in bs.find_all('', {'href':re.compile('.com')}):
    #     print(link.attrs['href'])
    
    
    # import requests
    # from bs4 import BeautifulSoup 
    
    # html = requests.get('http://www.baidu.com')
    # bs = BeautifulSoup(html.text, 'html.parser')
    # for link in bs.find_all('a'):
    #     if 'href' in link.attrs:
    #         print(link.attrs['href'])
            
    
    import requests
    from bs4 import BeautifulSoup 
    
    def geturl(url):
        html = requests.get(url)
        bs = BeautifulSoup(html.text, 'html.parser')
        return bs.find_all('a')
    
    links=geturl('http://www.baidu.com')
    for link in links:
        if 'href' in link.attrs:
            print(link.attrs['href'])
    
    http://news.baidu.com
    http://www.hao123.com
    http://map.baidu.com
    http://v.baidu.com
    http://tieba.baidu.com
    http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1
    //www.baidu.com/more/
    http://home.baidu.com
    http://ir.baidu.com
    http://www.baidu.com/duty/
    http://jianyi.baidu.com/
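    
    # Note: the output above contains a protocol-relative link (//www.baidu.com/more/) alongside
    # absolute URLs. Before following such links you usually want to normalize them against the
    # page that was fetched. A minimal sketch using urllib.parse.urljoin (the example URLs are
    # taken from the output above):
    
    from urllib.parse import urljoin
    
    base = 'http://www.baidu.com'
    print(urljoin(base, '//www.baidu.com/more/'))  # -> http://www.baidu.com/more/
    print(urljoin(base, 'http://news.baidu.com'))  # absolute URLs come back unchanged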
    
    
    import requests
    import re
    from bs4 import BeautifulSoup
    
    def geturl(url):
        html = requests.get(url)
        bs = BeautifulSoup(html.text, 'html.parser')
        return bs.find_all('', {'href':re.compile('http://')})
    
    # links=geturl('http://www.baidu.com')
    # print(list)
    # links_in_news=set(geturl(links[1].attrs['href']))  # set(list) removes duplicates
    # for link in links_in_news:
    #    print(link.attrs['href'])
    
    print('-------------------------------------------------------------------------')
        
    links=geturl('http://www.baidu.com')
    for link in links:
        if '//news.' in link.attrs['href']:
            links_in_news=set(geturl(link.attrs['href']))  # set(list) removes duplicates
            for link in links_in_news:
                print(link.attrs['href'])
            break
    
    -------------------------------------------------------------------------
    http://baijiahao.baidu.com/s?id=1670182176542294758
    http://baijiahao.baidu.com/s?id=1670237336710694101
    http://baijiahao.baidu.com/s?id=1670287125142703268
    http://baijiahao.baidu.com/s?id=1670255408896313915
    http://baijiahao.baidu.com/s?id=1670191066909619203
    http://v.baidu.com/
    http://baijiahao.baidu.com/s?id=1670253988609166598
     http://news.cctv.com/2020/06/23/ARTIHsG0yhCaD2YJUSFy7Qwt200623.shtml
    http://baijiahao.baidu.com/s?id=1670286769270600802
    http://news.cctv.com/2020/06/23/ARTIpnapIHyb413WeY46ShDy200623.shtml
    http://m.top.cnr.cn/bdxw/20200623/t20200623_525141426.html
    http://world.people.com.cn/n1/2020/0623/c1002-31756267.html
    http://m.news.cctv.com/2020/06/23/ARTIDAQdwzQFMOkbW2Z0ehEk200623.shtml
    http://baijiahao.baidu.com/s?id=1670245143050480742
    http://m.news.cctv.com/2020/06/18/ARTIYNwiYAjjHBmGeAXpERs3200618.shtml
    http://m.xinhuanet.com/yn/2020-06/23/c_139161263.htm
    http://baijiahao.baidu.com/s?id=1670194818426496533
    http://baijiahao.baidu.com/s?id=1670232858345398185
    http://www.xinhuanet.com/2020-06/23/c_1126147531.htm
    http://baijiahao.baidu.com/s?id=1670251112933488182
    http://baijiahao.baidu.com/s?id=1670254276238905964
    http://baijiahao.baidu.com/s?id=1670255017218969710
    http://music.baidu.com/
    http://m.top.cnr.cn/bdxw/20200623/t20200623_525141422.html
    http://app.cctv.com/special/cportal/detail/arti/index.html?id=Arti8bFV6wkTJPYEkaZYVvoC200622&fromapp=cctvnews&version=805&allow_comment=1&allow_comment=1
    http://map.baidu.com/
    http://baijiahao.baidu.com/s?id=1670243226621040644
    http://baijiahao.baidu.com/s?id=1670254944449236682
    http://net.china.cn/chinese/index.htm
    http://baijiahao.baidu.com/s?id=1670250874637091231
    http://baijiahao.baidu.com/s?id=1670232858345398185
    http://baijiahao.baidu.com/s?id=1670289098569528699
    http://baijiahao.baidu.com/s?id=1670247580845339645
    http://baijiahao.baidu.com/s?id=1670254849012760202
    http://m.top.cnr.cn/bdxw/20200623/t20200623_525141424.html
    http://baijiahao.baidu.com/s?id=1670246144336669257
    http://baijiahao.baidu.com/s?id=1670254276238905964
    http://app.cctv.com/special/cportal/detail/arti/index.html?id=ArtiLXGGutc9OLD23xo3Y3dN200622&fromapp=cctvnews&version=805&allow_comment=1&allow_comment=1
    http://www.qstheory.cn/zt2019/llxjj/index.htm
    http://www.cyberpolice.cn/wfjb/
    http://baijiahao.baidu.com/s?id=1670250874637091231
    http://baijiahao.baidu.com/s?id=1670239896280719334
    http://baijiahao.baidu.com/s?id=1670248053773599893
    http://image.baidu.com/
    http://baijiahao.baidu.com/s?id=1670243226621040644
    http://news.baidu.com/
    http://tieba.baidu.com/
    http://wenku.baidu.com/
    http://report.12377.cn:13225/toreportinputNormal_anis.do
    http://www.xinhuanet.com/politics/2020-06/23/c_1126149333.htm
    http://app.cctv.com/special/cportal/detail/arti/index.html?id=ArtiA1FM8grjZNDdJ15XVvv8200623&fromapp=cctvnews&version=727
    http://downpack.baidu.com/baidunews_AndroidPhone_1014720b.apk
    http://www.bjjubao.org/
    http://www.qstheory.cn/zt2017/xcgcdd19djs/index.htm
    
    li=[1,2,2,3,4,3,6,4,3]
    s=set(li)  # set(list) removes duplicates, turning the list into a set
    print(s)
    
    {1, 2, 3, 4, 6}
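    
    # Note: set() only removes duplicate *objects*. Calling set() on the Tag objects returned by
    # find_all(), as above, does not deduplicate by URL -- two different tags can carry the same
    # href, which is why the news output above still shows a few repeated baijiahao links.
    # A minimal sketch of deduplicating by the href string instead (get_unique_hrefs is an
    # illustrative helper, not part of the original code):
    
    import requests
    import re
    from bs4 import BeautifulSoup
    
    def get_unique_hrefs(url):
        html = requests.get(url)
        bs = BeautifulSoup(html.text, 'html.parser')
        hrefs = set()  # a set of strings deduplicates by URL
        for tag in bs.find_all('', {'href': re.compile('http://')}):
            hrefs.add(tag.attrs['href'])
        return hrefs
    
    # for href in get_unique_hrefs('http://news.baidu.com'):
    #     print(href)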
    
    # Recursion: crawl all links, then the links on each linked page, and so on:
    import requests
    import re
    from requests import exceptions
    from bs4 import BeautifulSoup 
    
    pages=set()
    def geturl(url):
        global pages
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
        # catch connection errors
        try:
            html = requests.get(url,headers=headers)
        except exceptions.ConnectionError:
            # print(url)
            print("Connection error")
        else:
            bs = BeautifulSoup(html.text, 'html.parser')
            links=set(bs.find_all('', {'href':re.compile('^(http://)|^(https://)')}))  # set() removes duplicate tags
            if links:
                for link in links:
                    if link.attrs['href'] not in pages:  # skip URLs we have already seen
                        pages.add(link.attrs['href'])
                        print(link.attrs['href'])
                        geturl(link.attrs['href'])  # recurse
            else:
                print("Done crawling this page!")
    home_link='http://www.baidu.com'
    geturl(home_link)
    print('end....')
    
    
    https://wenku.baidu.com
    https://www.baidu.com/cache/icon/favicon.ico
    Done crawling this page!
    https://www.baidu.com/cache/icon/favicon.svg
    Done crawling this page!
    https://jingyan.baidu.com
    https://passport.baidu.com/v2/?reg&tpl=exp&u=http%3A%2F%2Fjingyan.baidu.com%2F
    https://www.baidu.com/favicon.ico
    Done crawling this page!
    https://www.baidu.com/img/baidu.svg
    Done crawling this page!
    https://passport.baidu.com/v2/?ucenterfeedback#reg
    http://www.baidu.com/
    https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F&sms=5
    https://passport.baidu.com/export/app/index.html
    https://downpack.baidu.com/ppSecurityCenter_AndroidPhone_passport.apk
    Done crawling this page!
    https://itunes.apple.com/cn/app/bai-du-quan-zhong-xin-shou/id695439229
    https://www.apple.com.cn/iphone/
    https://www.apple.com/kw/iphone/
    https://www.apple.com/lae/iphone/
    https://www.apple.com/gn/iphone/
    https://support.apple.com/fr-gn
    https://support.apple.com/ko-kr
    https://support.apple.com/en-al
    https://support.apple.com/fr-sn
    https://support.apple.com/ru-ru
    https://www.apple.com/ru/
    https://www.apple.com/kr/
    https://www.apple.com/la/
    --- several hundred lines removed here for brevity ---
    
    KeyboardInterrupt: 
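    
    # Note: the recursive crawler above has no stopping condition other than running out of new
    # links, so it keeps following external sites until it is interrupted (the KeyboardInterrupt
    # above) or Python's recursion limit is reached. A minimal sketch of the same idea with an
    # explicit depth limit and an iterative queue instead of recursion (max_depth and the
    # commented start URL are illustrative choices, not from the original post):
    
    import requests
    import re
    from collections import deque
    from bs4 import BeautifulSoup
    
    def crawl(start_url, max_depth=2):
        seen = {start_url}
        queue = deque([(start_url, 0)])  # breadth-first queue of (url, depth) pairs
        while queue:
            url, depth = queue.popleft()
            print(url)
            if depth >= max_depth:
                continue  # do not expand pages beyond the depth limit
            try:
                html = requests.get(url, timeout=10)
            except requests.exceptions.RequestException:
                continue  # skip pages that cannot be fetched
            bs = BeautifulSoup(html.text, 'html.parser')
            for tag in bs.find_all('', {'href': re.compile('^(http://)|^(https://)')}):
                href = tag.attrs['href']
                if href not in seen:
                    seen.add(href)
                    queue.append((href, depth + 1))
    
    # crawl('http://www.baidu.com', max_depth=1)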
    
    # Enhance the code above: crawling only the links is usually not very useful, so add a few features:
    # 1) the title of every linked page
          # h1 --> span
    # 2) the first paragraph of text
          # div#mw-content-text --> p
    # 3) the edit link
          # li#ca-edit --> span --> a
    
    import requests
    from bs4 import BeautifulSoup
    from requests import exceptions
    import re
    
    
    pages = set()
    def geturl(url):
        global pages
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
        # catch request and parsing exceptions (the request itself has to sit inside the try
        # block, otherwise the requests exceptions below can never be caught)
        try:
            html = requests.get(url,headers=headers)
            bs = BeautifulSoup(html.text, 'html.parser')
            print(bs.h1)
    #         print(bs.find(id ='mw-content-text').find_all('p')[0])
    #         print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
        except AttributeError:
            print('This page is missing something! Continuing.')
        except exceptions.ConnectTimeout:
            print("Timed out connecting to the remote server")
            return
        except exceptions.ConnectionError:
            print("Connection error")
            return
        except exceptions.HTTPError:
            print('HTTP error')
            return
        except exceptions.RetryError:
            print("Request failed after multiple retries")
            return
        except exceptions.TooManyRedirects:
            print("Too many redirects")
            return
        except exceptions.Timeout:
            print("Request timed out")
            return
        
        links=set(bs.find_all('', {'href':re.compile('^(http://)|^(https://)')}))  # set() removes duplicate tags
        if links:
            for link in links:
                if 'href' in link.attrs:
                    if link.attrs['href'] not in pages:
                        newPage = link.attrs['href']
                        print('-'*20)
                        print(newPage)
                        pages.add(newPage)
                        geturl(newPage)
        else:
            print("Done crawling this page!")
    
    # home_link='https://baike.baidu.com/'
    home_link='https://baike.hk.xileso.top/wiki/Wikipedia:首页'
    geturl(home_link)
    print('end....')
    
    # Example from the book (en.wikipedia.org is not reachable here without a proxy, so this example cannot be run)
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import re
    
    pages = set()
    def getLinks(pageUrl):
        global pages
        html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
        bs = BeautifulSoup(html, 'html.parser')
        try:
            print(bs.h1.get_text())
            print(bs.find(id ='mw-content-text').find_all('p')[0])
            print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
        except AttributeError:
            print('This page is missing something! Continuing.')
        
        for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
            if 'href' in link.attrs:
                if link.attrs['href'] not in pages:
                    #We have encountered a new page
                    newPage = link.attrs['href']
                    print('-'*20)
                    print(newPage)
                    pages.add(newPage)
                    getLinks(newPage)
    getLinks('') 
    