#小练习 解析HTML文件并使用字典保存链接 分类： HTMLParser python 小练习 2013-11-11 12:06 267人阅读 评论(0) 收藏

#coding:utf-8
from HTMLParser import HTMLParser
import pprint

class myhtml(HTMLParser):
    """Collect ``anchor text -> href`` pairs from an HTML document.

    Feed markup through ``feed()``; afterwards ``self.d`` maps the stripped
    text found inside each ``<a>`` element to that element's ``href`` value
    (``None`` when the anchor carries no ``href``).  Declarations, start/end
    tags, comments and recorded link text are echoed to stdout while parsing.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Result mapping: stripped anchor text -> href value (or None).
        self.d = {}
        # 'a' while the parser is inside an <a> element, otherwise None.
        self.flag = None
        # href of the <a> element currently being parsed, if any.
        self.value = None

    def handle_decl(self, declaration):
        """Echo a markup declaration such as the DOCTYPE."""
        print('Declaration: %s' % declaration)

    def handle_starttag(self, tag, attrs):
        """Echo the start tag; on ``<a>`` remember that we are inside a link.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser.
        """
        print('Start tag: %s' % tag)
        if tag == 'a':
            # Mark that following data belongs to a link.
            self.flag = 'a'
            # Reset first so an anchor without href cannot inherit the href
            # of a previously parsed anchor.
            self.value = None
            for key, value in attrs:
                if key == 'href':
                    self.value = value

    def handle_endtag(self, tag):
        """Echo the end tag; on ``</a>`` leave link mode."""
        print('End tag: %s' % tag)
        if tag == 'a':
            # Without this reset every piece of text after the first anchor
            # would be stored in self.d as if it were link text.
            self.flag = None
            self.value = None

    def handle_comment(self, comm):
        """Echo an HTML comment."""
        print('Comment: %s' % comm)

    def handle_data(self, data):
        """Record non-blank text that appears inside an ``<a>`` element.

        Text outside anchors is ignored so that entries such as
        ``'test': None`` (e.g. from ``<title>``) do not end up in ``self.d``.
        """
        text = data.strip()
        if self.flag != 'a' or not text:
            return
        self.d[text] = self.value
        # Byte strings are decoded for display; already-decoded text is
        # printed as is (calling .decode on it would fail or mis-encode).
        print(data.decode('utf-8') if isinstance(data, bytes) else data)


if __name__ == '__main__':
    # Demo: parse a small inline document and show the collected links.
    sample_html = '''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
    <html><head><!--insert javaScript here!--><title>test</title>
    <body>
    <a href="http: //www.163.com">163.com</a>
    <a href="www.google.com"> goolge.com </a>
    <A Href="www.pythonclub.org">PythonClub </a>
    <A HREF='www.sina.com.cn'> sina </a>
    </body></html>'''

    link_parser = myhtml()
    link_parser.feed(sample_html)
    # close() forces processing of any data still buffered by the parser.
    link_parser.close()
    # Print the resulting {anchor text: href} dictionary.
    print(link_parser.d)

结果：

Declaration: DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"
Start tag: html
Start tag: head
Comment: insert javaScript here!
Start tag: title
End tag: title
Start tag: body
Start tag: a
163.com
End tag: a
Start tag: a
goolge.com
End tag: a
Start tag: a
PythonClub
End tag: a
Start tag: a
sina
End tag: a
End tag: body
End tag: html
{'goolge.com': 'www.google.com', 'PythonClub': 'www.pythonclub.org', '163.com': 'http: //www.163.com', 'sina': 'www.sina.com.cn'}

相关阅读:
B
A
I
IIS发布和部署
编程中什么叫上下文
浅谈Session，Cookie和http协议中的无状态
cmd界面输入sqlplus提示不是内外部命令解决方法
C#已设置安全调试选项，但此选项要求的VS承载进程在此调试中不可用。解决方法
IIS和IIS Express的区别
vue.js:634 [Vue warn]: Failed to generate render function: SyntaxError: Unexpected token ')'

原文地址：https://www.cnblogs.com/think1988/p/4628021.html

#小练习 解析HTML文件并使用字典保存链接 分类： HTMLParser python 小练习 2013-11-11 12:06 267人阅读 评论(0) 收藏

#小练习 解析HTML文件并使用字典保存链接 分类： HTMLParser python 小练习 2013-11-11 12:06 267人阅读 评论(0) 收藏