import urllib2
import cookielib
from HTMLParser import HTMLParser
import sys

reload(sys)
sys.setdefaultencoding('utf8')


class WebParser(HTMLParser):
    """Collects the href targets of <a> tags into the given set."""

    def __init__(self, links, path):
        HTMLParser.__init__(self)
        self.links = links
        self.path = path

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        for key, val in attrs:
            if key == 'href':
                if val.startswith('http'):
                    # Absolute URL: keep it as-is.
                    self.links.add(val)
                elif val.startswith('/'):
                    # Root-relative URL: join it to the page's base path.
                    self.links.add(self.path + val)


class Crawl:
    def __init__(self):
        self.path = 'http://www.baidu.com'
        # Reuse one opener with a cookie jar so cookies persist across requests.
        self.cookie = cookielib.CookieJar()
        handler = urllib2.HTTPCookieProcessor(self.cookie)
        self.opener = urllib2.build_opener(handler)

    def open(self, path):
        self.response = self.opener.open(path)

    def showCookie(self):
        for item in self.cookie:
            print 'Name = ' + item.name
            print 'value = ' + item.value

    def showResponse(self):
        print self.response.read()

    def getAllUrl(self, links, path):
        """Fetch `path` and add every link found on that page to `links`."""
        try:
            self.open(path)
            res = self.response.read()
            parser = WebParser(links, path)
            parser.feed(res)
            parser.close()
        except Exception, e:
            print e

    def crawl(self):
        src_links = set()      # frontier: links discovered but not yet fetched
        result_links = set()   # links already fetched
        self.getAllUrl(src_links, self.path)
        n = 200                # stop after at most 200 pages
        while len(src_links) != 0 and n > 0:
            link = src_links.pop()
            if link in result_links:
                continue       # skip pages that were already crawled
            result_links.add(link)
            self.getAllUrl(src_links, link)
            n -= 1
            print n
        return result_links | src_links


c = Crawl()
rlt = c.crawl()
for link in rlt:
    print link
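
# --- Usage sketch (not part of the original script) -----------------------
# A minimal check, assuming the WebParser class above: feed it a hand-written
# HTML snippet and confirm that absolute links are kept as-is while
# root-relative ones are joined to the base path. The snippet and the base
# path 'http://example.com' are made up for illustration only.
demo_links = set()
demo_parser = WebParser(demo_links, 'http://example.com')
demo_parser.feed('<a href="http://example.com/a">A</a>'
                 '<a href="/b">B</a>'
                 '<a name="no-href">ignored</a>')
demo_parser.close()
print demo_links  # expected: 'http://example.com/a' and 'http://example.com/b'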