• A Python exercise in extracting URLs

A small Python 2 crawler: it opens a seed page, collects the target of every <a href> tag in the HTML, and keeps following the harvested links until a hard budget of 200 requests is used up.


    # Python 2 only: urllib2, cookielib and HTMLParser were all renamed in Python 3
    import urllib2, cookielib, urlparse
    from HTMLParser import HTMLParser
    import sys
    
    # Crude but common Python 2 workaround: force the default codec to UTF-8
    # so printing fetched pages does not raise UnicodeEncodeError
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    class WebParser(HTMLParser):
        # Collects the target of every <a href="..."> into a shared set
        def __init__(self, links, path):
            HTMLParser.__init__(self)
            self.links = links   # result set, filled in while feed() runs
            self.path = path     # URL of the page being parsed
    
        def handle_starttag(self, tag, attrs):
            if tag != 'a':
                return
            for key, val in attrs:
                if key == 'href':
                    if val.startswith('http'):
                        self.links.add(val)
                    elif val.startswith('/'):
                        # Resolve relative links against the page URL; plain
                        # concatenation (self.path + val) breaks as soon as
                        # self.path is a deep link rather than the site root
                        self.links.add(urlparse.urljoin(self.path, val))
    
    class Crawl:
        def __init__(self):
            self.path = 'http://www.baidu.com'   # seed URL
            # Route all requests through a cookie-aware opener so the
            # session's cookies survive across pages
            self.cookie = cookielib.CookieJar()
            handler = urllib2.HTTPCookieProcessor(self.cookie)
            self.opener = urllib2.build_opener(handler)
    
        def open(self, path):
            self.response = self.opener.open(path)
    
        def showCookie(self):
            # Dump the cookies collected so far
            for item in self.cookie:
                print 'Name = ' + item.name
                print 'Value = ' + item.value
    
        def showResponse(self):
            print self.response.read()
    
        def getAllUrl(self, links, path):
            # Fetch one page and add every link found on it to links
            try:
                self.open(path)
                res = self.response.read()
                parser = WebParser(links, path)
                parser.feed(res)
                parser.close()
            except Exception as e:
                # Network errors and malformed HTML are reported and skipped
                print e
    
        def crawl(self):
            src_links = set()      # frontier: found but not yet fetched
            result_links = set()   # pages already fetched
            self.getAllUrl(src_links, self.path)
            n = 200                # hard budget: stop after 200 requests
            while len(src_links) != 0 and n > 0:
                link = src_links.pop()
                if link in result_links:
                    continue       # 'pass' here would refetch visited pages
                result_links.add(link)
                self.getAllUrl(src_links, link)
                n -= 1
                print n
    
            return result_links | src_links
    
    c = Crawl()
    rlt = c.crawl()
    for link in rlt:
        print link
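
The listing above runs only under Python 2: urllib2, cookielib and HTMLParser no longer exist in Python 3, and the print statements would be syntax errors. The sketch below is a Python 3 port of the same exercise, not the original author's code; class and method names stay close to the original, and the 200-request budget and Baidu seed URL are carried over as-is.

    # Python 3 port (a sketch; assumes the same behaviour as the listing above)
    from html.parser import HTMLParser
    from urllib import request
    from urllib.parse import urljoin
    import http.cookiejar
    
    class WebParser(HTMLParser):
        def __init__(self, links, path):
            super().__init__()
            self.links = links
            self.path = path
    
        def handle_starttag(self, tag, attrs):
            if tag != 'a':
                return
            for key, val in attrs:
                if key == 'href' and val:
                    if val.startswith('http'):
                        self.links.add(val)
                    elif val.startswith('/'):
                        self.links.add(urljoin(self.path, val))
    
    class Crawl:
        def __init__(self):
            self.path = 'http://www.baidu.com'
            cookie = http.cookiejar.CookieJar()
            self.opener = request.build_opener(request.HTTPCookieProcessor(cookie))
    
        def getAllUrl(self, links, path):
            try:
                # In Python 3 the response body is bytes and must be
                # decoded before it can be fed to the parser
                res = self.opener.open(path, timeout=10).read().decode('utf-8', 'replace')
                parser = WebParser(links, path)
                parser.feed(res)
                parser.close()
            except Exception as e:
                print(e)
    
        def crawl(self, budget=200):
            src_links = set()
            result_links = set()
            self.getAllUrl(src_links, self.path)
            while src_links and budget > 0:
                link = src_links.pop()
                if link in result_links:
                    continue
                result_links.add(link)
                self.getAllUrl(src_links, link)
                budget -= 1
            return result_links | src_links
    
    for link in Crawl().crawl():
        print(link)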
  • Original post: https://www.cnblogs.com/hushpa/p/4671144.html