A crawler demo I wrote recently, using just a few simple functions. It implements basic web-page crawling (with Tuniu as the example site).
import urllib2
import re
import urlparse
import robotparser
import datetime
import time


class Throttle:
    """
    Add a delay between two downloads to the same domain.
    """
    def __init__(self, delay):
        # amount of delay between downloads of the same domain
        self.delay = delay
        # timestamp of when each domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)

        if self.delay > 0 and last_accessed is not None:
            sleep_sec = self.delay - (datetime.datetime.now() - last_accessed).seconds

            if sleep_sec >= 0:
                time.sleep(sleep_sec)
                print 'sleep: ', sleep_sec, 's'
        # record when this domain was accessed, so the next call can throttle
        self.domains[domain] = datetime.datetime.now()


def download(url, proxy=None, user_agent='wawp', num_retries=2):
    print 'Downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)

    opener = urllib2.build_opener()
    if proxy:
        proxy_param = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_param))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'Downloading error:', e.reason
        html = ''
        if num_retries > 0:
            # retry only on 5xx server errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, proxy, user_agent, num_retries - 1)
    return html


def get_links(html, regstr=r'http://[^w].*?\.tuniu\.com'):
    # extract all links matching the regex from the downloaded page
    rexp = re.compile(regstr)
    return rexp.findall(html)


def deduplicate_list(input_list):
    # remove duplicates while preserving the original order
    new_list = []
    for x in input_list:
        if x not in new_list:
            new_list.append(x)
    return new_list


def crawl_sitemap(url):
    sitemap = download(url)
    links = get_links(sitemap)
    print 'before links are : ', links
    newlinks = deduplicate_list(links)
    print 'after links are : ', newlinks

    for link in newlinks:
        print link
        download(link)


def get_robot(url):
    # fetch and parse robots.txt from the site root
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def link_crawler(seed_url, max_depth=3, link_regex=r'http://[^w][^"]*\.tuniu\.com', delay=1, proxy=None):
    # for the robots.txt check
    rp = get_robot(seed_url)
    # init vars
    throttle = Throttle(delay)
    crawl_queue = [seed_url]
    seen = {seed_url: 0}

    while crawl_queue:
        url = crawl_queue.pop()

        depth = seen[url]
        if depth != max_depth:

            if rp.can_fetch('heimaojingzhang', url):  # here just for joking
                throttle.wait(url)
                html = download(url, proxy)
                # print 'down func ', url
                for link in get_links(html, link_regex):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
            else:
                print 'Blocked by robots.txt ', url


# TODO:
# fix bugs: (in regex) done on : 2017/09/23 23:16
# delay: done on : 2017/09/24 21:36
# proxy
# depth: done on : 2017/09/23 23:10


if __name__ == '__main__':
    link_crawler('http://www.tuniu.com/corp/sitemap.shtml', link_regex=r'http://www.tuniu.com/guide/[^"]*')
    # html = download('http://www.tuniu.com/corp/sitemap.shtml')
    # print html
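The demo above is Python 2 only (urllib2, urlparse, robotparser). For reference, below is a minimal sketch of how the download() helper might be ported to Python 3; this is my own assumption, not part of the original demo, and the name download_py3 is just illustrative. Only the standard-library imports change (urllib.request / urllib.parse / urllib.error); the retry logic stays the same.

# Minimal Python 3 sketch of download() above (assumption: same behavior,
# only the standard-library modules are swapped for their Python 3 names).
import urllib.request
import urllib.error
from urllib.parse import urlparse


def download_py3(url, proxy=None, user_agent='wawp', num_retries=2):
    print('Downloading:', url)
    request = urllib.request.Request(url, headers={'User-agent': user_agent})

    opener = urllib.request.build_opener()
    if proxy:
        # route requests for this URL's scheme through the given proxy
        opener.add_handler(urllib.request.ProxyHandler({urlparse(url).scheme: proxy}))
    try:
        html = opener.open(request).read().decode('utf-8', errors='ignore')
    except urllib.error.URLError as e:
        print('Downloading error:', e.reason)
        html = ''
        # retry only on 5xx server errors
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download_py3(url, proxy, user_agent, num_retries - 1)
    return html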