try:
    from html.parser import HTMLParser  # Python 3
except ImportError:
    from HTMLParser import HTMLParser  # Python 2


class myHTMLParser(HTMLParser):
    """Collect the ``href`` of every ``<a>`` tag and echo non-blank text.

    ``HTMLParser`` lower-cases tag and attribute names before calling the
    handlers, so ``<A HREF=...>`` is matched by the plain ``'a'`` / ``'href'``
    comparisons below.

    Attributes:
        links: ``href`` values in document order.
    """

    def __init__(self):
        # HTMLParser is an old-style class on Python 2, so call the base
        # initializer explicitly instead of using super().
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` value(s) of an ``<a>`` start tag.

        Args:
            tag: Lower-cased tag name.
            attrs: List of ``(name, value)`` pairs; ``value`` is ``None`` for
                an attribute written without a value.
        """
        if tag != 'a':
            return
        self.links.extend(
            value for name, value in attrs
            if name == 'href' and value is not None
        )

    def handle_data(self, data):
        """Print a text node, ignoring nodes that are only whitespace.

        The whitespace between two tags (for example the newline after
        ``</a>``) is delivered as a text node of its own; stripping first
        keeps those from being printed as empty data.
        """
        text = data.strip()
        if not text:
            return
        print('-' * 10)
        print('data: %s' % text)
        print('-' * 10)


if __name__ == '__main__':
    html_code = '''
    <a href="www.google.com"> goolge.com </a>
    <A Href="www.pythonclub.org">PythonClub </a>
    <A HREF='www.sina.com.cn'> sina </a>
    '''
    hp = myHTMLParser()
    hp.feed(html_code)
    hp.close()
    print(hp.links)
由于标签之间的文本(data)可能含有空白字符,例如
<a href="www.google.com"> goolge.com </a> 中的 goolge.com 两端都有空格,标签之间的换行也会被当作一段单独的 data,所以在 handle_data 中要先对 data 做 strip 处理(即代码里 data.strip() 的做法),否则空白会被当作一个 data 输出。
# coding: utf-8
# sgmllib was removed in Python 3.0; HTMLParser exists on both Python 2 and 3.
try:
    from html.parser import HTMLParser  # Python 3
except ImportError:
    from HTMLParser import HTMLParser  # Python 2


class urlparser(HTMLParser):
    """Collect the URL and the visible text of every ``<a href=...>`` link.

    Attributes:
        flag: ``'a'`` while the parser is inside a link that has an ``href``,
            otherwise ``None``.
        l: Link URLs, in document order.
        v: Link texts; ``v[i]`` is the text of the link whose URL is ``l[i]``.
        d: ``{text: url}`` mapping, filled in by :meth:`merge`.
    """

    def reset(self):
        """Clear all collected state; also called by the base initializer."""
        self.flag = None
        self.l = []
        self.v = []
        self.d = {}
        # URL and text chunks of the link currently being read.
        self._href = None
        self._chunks = []
        HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        """Dispatch ``<a>`` start tags to :meth:`start_a`."""
        if tag == 'a':
            self.start_a(attrs)

    def handle_endtag(self, tag):
        """Dispatch ``</a>`` end tags to :meth:`end_a`."""
        if tag == 'a':
            self.end_a()

    def start_a(self, attrs):
        """Begin collecting a link if the tag carries an ``href``.

        Args:
            attrs: List of ``(name, value)`` attribute pairs of the tag.
        """
        # A new <a> implicitly ends a link that was never closed.
        self.end_a()
        href = [v for k, v in attrs if k == 'href' and v is not None]
        if href:
            # Text is recorded only for anchors with an href, so that every
            # entry in self.v has a matching entry in self.l.
            self.flag = 'a'
            self._href = href[0]

    def handle_data(self, data):
        """Keep text that appears inside a link; ignore everything else."""
        if self.flag == 'a':
            self._chunks.append(data)

    def end_a(self):
        """Store the finished link and stop collecting text.

        Resetting ``flag`` here keeps text that follows ``</a>`` (such as the
        ``NEW`` marker in the sample) out of the link texts.
        """
        if self.flag == 'a':
            self.l.append(self._href)
            # The text of one link may arrive in several handle_data calls.
            self.v.append(''.join(self._chunks))
        self.flag = None
        self._href = None
        self._chunks = []

    def merge(self):
        """Fill ``self.d`` with ``{link text: url}`` from ``self.v``/``self.l``."""
        for text, href in zip(self.v, self.l):
            self.d[text] = href


if __name__ == '__main__':
    urls = '''
    <tr>
    <td height="207" colspan="2" align="left" valign="top" class="normal">
    <p>Damien Rice - 《0》 </p>
    <a href="http://galeki.xy568.net/music/Delicate.mp3">1. Delicate</a><br />NEW
    <a href="http://galeki.xy568.net/music/Volcano.mp3">2. Volcano</a><br />
    <a href="http://galeki.xy568.net/music/The Blower's Daughter.mp3">3. The Blower's Daughter</a><br />
    <a href="http://galeki.xy568.net/music/Cannonball.mp3">4. Cannonball </a><br />
    <a href="http://galeki.xy568.net/music/Older Chests.mp3">5. Order Chests</a><br />
    <a href="http://galeki.xy568.net/music/Amie.mp3">6. Amie</a><br />
    <a href="http://galeki.xy568.net/music/Cheers Darlin'.mp3">7. Cheers Darling</a><br />
    <a href="http://galeki.xy568.net/music/Cold Water.mp3">8. Cold water</a><br />
    <a href="http://galeki.xy568.net/music/I Remember.mp3">9. I remember</a><br />
    <a href="http://galeki.xy568.net/music/Eskimo.mp3">10. Eskimo</a></p>
    </td>
    </tr>
    '''
    upr = urlparser()
    upr.feed(urls)
    upr.merge()
    for item in upr.d.items():
        print(item)
    print(len(upr.d))
    upr.close()
结果:
('4. Cannonball ', 'http://galeki.xy568.net/music/Cannonball.mp3')
('1. Delicate', 'http://galeki.xy568.net/music/Delicate.mp3')
('6. Amie', 'http://galeki.xy568.net/music/Amie.mp3')
('8. Cold water', 'http://galeki.xy568.net/music/Cold Water.mp3')
('10. Eskimo', 'http://galeki.xy568.net/music/Eskimo.mp3')
('9. I remember', 'http://galeki.xy568.net/music/I Remember.mp3')
('5. Order Chests', 'http://galeki.xy568.net/music/Older Chests.mp3')
("3. The Blower's Daughter", "http://galeki.xy568.net/music/The Blower's Daughter.mp3")
('2. Volcano', 'http://galeki.xy568.net/music/Volcano.mp3')
('7. Cheers Darling', "http://galeki.xy568.net/music/Cheers Darlin'.mp3")
10
版权声明:本文为博主原创文章,未经博主允许不得转载。