• python 爬虫(四)


    爬遍整个网络

    1 当我们访问整个网络的时候,我们不可避免地会访问不同的网站,而不同的网站会有完全不同的结构和内容...

    现在一步一步的构建访问整个网络的脚本

    I 从一个网站开始,每一次都爬向不同的网站。如果在一个页面上找不到指向其他网站的链接,就获取本网站其他页面的信息,直到找到指向其他网站的链接为止。

    # -*- coding:utf-8 -*-  
    
    from urllib.request import urlopen
    from urllib.error import HTTPError
    from bs4 import BeautifulSoup
    from random import choice
    import re
    
    # NOTE(review): not referenced by any function in this script -- presumably
    # a leftover from an earlier example; confirm before removing.
    basename = "http://en.wikipedia.org"
    # Links already reported by followExternalLink; the walk stops when one repeats.
    visitedpages = set()
    
    def getInternalLinks(bsObj,includeUrl):
        """Return the href of every <a> tag that points inside the current site.

        A link counts as internal when its href starts with "/" or contains
        ``includeUrl`` anywhere in it.

        Args:
            bsObj: Parsed page; only its ``find_all`` method is used.
            includeUrl: Host name of the current site, e.g. "www.oreilly.com".

        Returns:
            List of href strings in the order ``find_all`` yields them
            (duplicates are kept).
        """
        # Escape the host so "." (and any other regex metacharacter) matches
        # literally; unescaped, "www.oreilly.com" also matched "wwwXoreillyYcom".
        internalPattern = re.compile("^(/|.*" + re.escape(includeUrl) + ")")
        return [
            tag.attrs['href']
            for tag in bsObj.find_all("a", href=internalPattern)
            if 'href' in tag.attrs
        ]
    
    def getExternalLinks(bsObj,excludeUrl):
        """Return the href of every <a> tag that points to another site.

        A link counts as external when its href starts with "http" or "www"
        and does not contain ``excludeUrl`` anywhere in it.

        Args:
            bsObj: Parsed page; only its ``find_all`` method is used.
            excludeUrl: Host name of the current site, e.g. "www.oreilly.com".

        Returns:
            List of href strings in the order ``find_all`` yields them
            (duplicates are kept).
        """
        # Two fixes over the naive pattern:
        #  * re.escape makes "." in the host match literally.
        #  * the http/www prefix is checked with a lookahead instead of being
        #    consumed, so the host test also covers the first characters
        #    (otherwise "www.site.com/x" slipped through for host "www.site.com").
        externalPattern = re.compile(
            "^(?=http|www)((?!" + re.escape(excludeUrl) + ").)*$"
        )
        return [
            tag.attrs['href']
            for tag in bsObj.find_all("a", href=externalPattern)
            if 'href' in tag.attrs
        ]
    
    def splitAddress(address):
        """Split a URL into host and path segments, dropping the scheme.

        Args:
            address: URL such as "http://www.oreilly.com/a/b".

        Returns:
            List of the "/"-separated parts; element 0 is the host, e.g.
            ["www.oreilly.com", "a", "b"]. A trailing "/" produces a final
            empty string.
        """
        # Strip https:// as well as http://; with only http:// handled, an
        # https URL produced "https:" as element 0 instead of the host.
        for scheme in ("http://", "https://"):
            address = address.replace(scheme, "")
        return address.split("/")
    
    def getRandomExternalLink(startingPage):
        """Fetch ``startingPage`` and return one randomly chosen link from it.

        An external link is preferred. When the page has none, a random
        internal link is returned instead so the caller can keep walking the
        same site until an external link turns up.

        Args:
            startingPage: Absolute URL of the page to download.

        Returns:
            A URL string. Internal links are resolved against ``startingPage``
            so the result can be passed straight back to ``urlopen``.

        Raises:
            IndexError: The page has neither external nor internal links.
            Any exception raised by ``urlopen`` propagates unchanged.
        """
        from urllib.parse import urljoin  # local import keeps this block self-contained

        with urlopen(startingPage) as html:
            bsObj = BeautifulSoup(html,"html.parser")

        domain = splitAddress(startingPage)[0]
        externalLinks = getExternalLinks(bsObj, domain)
        if externalLinks:
            return choice(externalLinks)

        internalLinks = getInternalLinks(bsObj, domain)
        # Internal hrefs are usually site-relative ("/about"); urlopen rejects
        # those with "unknown url type", so hand back an absolute URL.
        return urljoin(startingPage, choice(internalLinks))
    
    def followExternalLink(startingPage):
        """Random-walk from page to page, printing each link that is followed.

        Starting at ``startingPage``, pick a random link from the current page
        (see getRandomExternalLink), record it in ``visitedpages`` and continue
        from there. The walk ends, after printing "visited", the first time a
        link is picked that was already recorded.

        Args:
            startingPage: Absolute URL where the walk begins.
        """
        from urllib.parse import urljoin  # local import keeps this block self-contained

        currentPage = startingPage
        # A loop instead of self-recursion: a long walk cannot hit the
        # interpreter's recursion limit.
        while True:
            # Fetch the page we are actually on. The previous code always
            # re-fetched a hard-coded URL, so the walk never left the first page.
            pickedLink = getRandomExternalLink(currentPage)
            # Absolute links pass through unchanged; a site-relative fallback
            # link becomes fetchable.
            externalLink = urljoin(currentPage, pickedLink)
            if externalLink in visitedpages:
                print("visited")
                return
            print("the random external link is   " + externalLink)
            visitedpages.add(externalLink)
            currentPage = externalLink
    
    
    if __name__ == "__main__":
        # Entry point: begin the random walk at the O'Reilly home page.
        startUrl = "http://www.oreilly.com/"
        followExternalLink(startUrl)
                 
    View Code

    II 从一个网站开始,遍历这个网站的所有页面,收集整个网站中指向其他网站的链接

    # -*- coding:utf-8 -*-  
    
    from urllib.request import urlopen
    from urllib.error import HTTPError
    from bs4 import BeautifulSoup
    from random import choice
    import re
    
    def getInternalLinks(bsObj,includeUrl):
        """Return the href of every <a> tag that points inside the current site.

        A link counts as internal when its href starts with "/" or contains
        ``includeUrl`` anywhere in it.

        Args:
            bsObj: Parsed page; only its ``find_all`` method is used.
            includeUrl: Host name of the current site, e.g. "www.oreilly.com".

        Returns:
            List of href strings in the order ``find_all`` yields them
            (duplicates are kept).
        """
        # Escape the host so "." (and any other regex metacharacter) matches
        # literally; unescaped, "www.oreilly.com" also matched "wwwXoreillyYcom".
        internalPattern = re.compile("^(/|.*" + re.escape(includeUrl) + ")")
        return [
            tag.attrs['href']
            for tag in bsObj.find_all("a", href=internalPattern)
            if 'href' in tag.attrs
        ]
    
    def getExternalLinks(bsObj,excludeUrl):
        """Return the href of every <a> tag that points to another site.

        A link counts as external when its href starts with "http" or "www"
        and does not contain ``excludeUrl`` anywhere in it.

        Args:
            bsObj: Parsed page; only its ``find_all`` method is used.
            excludeUrl: Host name of the current site, e.g. "www.oreilly.com".

        Returns:
            List of href strings in the order ``find_all`` yields them
            (duplicates are kept).
        """
        # Two fixes over the naive pattern:
        #  * re.escape makes "." in the host match literally.
        #  * the http/www prefix is checked with a lookahead instead of being
        #    consumed, so the host test also covers the first characters
        #    (otherwise "www.site.com/x" slipped through for host "www.site.com").
        externalPattern = re.compile(
            "^(?=http|www)((?!" + re.escape(excludeUrl) + ").)*$"
        )
        return [
            tag.attrs['href']
            for tag in bsObj.find_all("a", href=externalPattern)
            if 'href' in tag.attrs
        ]
    
    def splitAddress(address):
        """Split a URL into host and path segments, dropping the scheme.

        Args:
            address: URL such as "http://www.oreilly.com/a/b".

        Returns:
            List of the "/"-separated parts; element 0 is the host, e.g.
            ["www.oreilly.com", "a", "b"]. A trailing "/" produces a final
            empty string.
        """
        # Strip https:// as well as http://; with only http:// handled, an
        # https URL produced "https:" as element 0 instead of the host.
        for scheme in ("http://", "https://"):
            address = address.replace(scheme, "")
        return address.split("/")
    
    # Internal pages already queued for crawling (stored as absolute URLs).
    allINlinks = set()
    # External links already reported.
    allEXlinks = set()
    def getAllexternalLinks(startPage):
        """Crawl a site depth-first and print every external link it contains.

        Downloads ``startPage``, prints each external link not seen before,
        then prints each internal link not seen before and recurses into it.
        Seen links are tracked in the module-level sets ``allEXlinks`` and
        ``allINlinks``.

        Args:
            startPage: Absolute URL of the page to crawl from.

        Note:
            Recursion depth grows with the length of the chain of new internal
            pages, so a very large site can still exceed Python's recursion
            limit.
        """
        from urllib.error import URLError  # local imports keep this block self-contained
        from urllib.parse import urljoin

        try:
            with urlopen(startPage) as html:
                bsObj = BeautifulSoup(html,"html.parser")
        except (HTTPError, URLError) as e:
            # One unreachable page (HTTP error, DNS failure, refused
            # connection) must not abort the whole crawl.
            print(e)
            return

        domain = splitAddress(startPage)[0]
        allinternallinks = getInternalLinks(bsObj, domain)
        allexternallinks = getExternalLinks(bsObj, domain)

        print("************external*******************************")
        for eachexternallink in allexternallinks:
            if eachexternallink not in allEXlinks:
                allEXlinks.add(eachexternallink)
                print(eachexternallink)

        print("************internal*******************************")
        for eachinternallink in allinternallinks:
            # Internal hrefs are usually site-relative ("/about"); urlopen
            # raises ValueError on those, so resolve them before recursing.
            # Storing the absolute form also de-duplicates "/about" against
            # "http://host/about".
            absolutelink = urljoin(startPage, eachinternallink)
            if absolutelink not in allINlinks:
                allINlinks.add(absolutelink)
                print(absolutelink)
                getAllexternalLinks(absolutelink)
    
    if __name__ == "__main__":
        # Entry point: crawl the O'Reilly site starting from its home page.
        startUrl = "http://www.oreilly.com/"
        getAllexternalLinks(startUrl)
    View Code

       ***************还存在问题的代码***************************

  • 相关阅读:
    c程序设计语言_习题1-16_自己编写getline()函数,接收整行字符串,并完整输出
    c程序设计语言_习题1-13_统计输入中单词的长度,并且根据不同长度出现的次数绘制相应的直方图
    c程序设计语言_习题1-11_学习单元测试,自己生成测试输入文件
    c程序设计语言_习题1-9_将输入流复制到输出流,并将多个空格过滤成一个空格
    c语言时间库函数#include<time.h>
    c语言输入与输出库函数#include<stdio.h>
    c语言诊断_断言库函数#include<assert.h>
    c语言实用功能库函数#include<stdlib.h>
    Remove Duplicates from Sorted List
    Merge Sorted Array
  • 原文地址:https://www.cnblogs.com/someoneHan/p/6234508.html
Copyright © 2020-2023  润新知