• Getting started with Python web crawlers


    A crawler demo I wrote recently; it only uses a few simple functions and implements basic web-page crawling (using Tuniu as the example site).

    import urllib2
    import re
    import urlparse
    import robotparser
    import datetime
    import time


    class Throttle:
        """
        Add a delay between two downloads to the same domain.
        """
        def __init__(self, delay):
            #  minimum delay between downloads to the same domain
            self.delay = delay
            #  timestamp of when each domain was last accessed
            self.domains = {}

        def wait(self, url):
            domain = urlparse.urlparse(url).netloc
            last_accessed = self.domains.get(domain)

            if self.delay > 0 and last_accessed is not None:
                sleep_sec = self.delay - (datetime.datetime.now() - last_accessed).seconds

                if sleep_sec >= 0:
                    time.sleep(sleep_sec)
                    print 'sleep: ', sleep_sec, 's'
            self.domains[domain] = datetime.datetime.now()


    def download(url, proxy=None, user_agent='wawp', num_retries=2):
        print 'Downloading:', url
        headers = {'User-agent': user_agent}
        request = urllib2.Request(url, headers=headers)

        opener = urllib2.build_opener()
        if proxy:
            #  route the request through the proxy for this URL's scheme
            proxy_param = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_param))
        try:
            html = opener.open(request).read()
        except urllib2.URLError as e:
            print 'Downloading error:', e.reason
            html = ''
            if num_retries > 0:
                #  retry only on 5xx server errors
                if hasattr(e, 'code') and 500 <= e.code < 600:
                    return download(url, proxy, user_agent, num_retries - 1)
        return html


    def get_links(html, regstr=r'http://[^w].*.tuniu.com'):
        #  extract every link in the page that matches the given regex
        rexp = re.compile(regstr)
        return re.findall(rexp, html)


    def deduplicate_list(input_list):
        #  drop duplicate links while preserving their original order
        new_list = []
        for x in input_list:
            if x not in new_list:
                new_list.append(x)
        return new_list


    def crawl_sitemap(url):
        sitemap = download(url)
        links = get_links(sitemap)
        print 'before links are : ', links
        newlinks = deduplicate_list(links)
        print 'after links are : ', newlinks

        for link in newlinks:
            print link
            download(link)


    def get_robot(url):
        #  fetch and parse the site's robots.txt
        rp = robotparser.RobotFileParser()
        rp.set_url(urlparse.urljoin(url, '/robots.txt'))
        rp.read()
        return rp


    def link_crawler(seed_url, max_depth=3, link_regex=r'http://[^w][^"]*.tuniu.com', delay=1, proxy=None):
        #  for the robots.txt check
        rp = get_robot(seed_url)
        #  init vars
        throttle = Throttle(delay)
        crawl_queue = [seed_url]
        seen = {seed_url: 0}

        while crawl_queue:
            url = crawl_queue.pop()

            depth = seen[url]
            if depth != max_depth:

                if rp.can_fetch('heimaojingzhang', url):  # the user-agent string here is just a joke
                    throttle.wait(url)
                    html = download(url, proxy)
                    #  print 'down func ', url
                    for link in get_links(html, link_regex):
                        link = urlparse.urljoin(seed_url, link)
                        if link not in seen:
                            seen[link] = depth + 1
                            crawl_queue.append(link)
                else:
                    print 'Blocked by robots.txt:', url


    # TODO:
    # fix bugs: (in regex) done on: 2017/09/23 23:16
    # delay: done on: 2017/09/24 21:36
    # proxy
    # depth: done on: 2017/09/23 23:10


    if __name__ == '__main__':
        link_crawler('http://www.tuniu.com/corp/sitemap.shtml', link_regex=r'http://www.tuniu.com/guide/[^"]*')
        #  html = download('http://www.tuniu.com/corp/sitemap.shtml')
        #  print html
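
    For reference, below is a minimal usage sketch of the functions above. The module name crawler.py and the proxy address are assumptions made only for illustration (proxy support is still listed as a TODO), while the seed URL and link_regex mirror the __main__ block.

    # Minimal usage sketch. The module name (crawler.py) and the proxy address
    # are hypothetical; proxy handling is still a TODO in the listing above,
    # so the proxy argument is shown only to illustrate the signature.
    from crawler import link_crawler, download

    # Crawl only the travel-guide pages, waiting at least 2 seconds between
    # requests to the same domain and (optionally) going through a local proxy.
    link_crawler('http://www.tuniu.com/corp/sitemap.shtml',
                 link_regex=r'http://www.tuniu.com/guide/[^"]*',
                 delay=2,
                 proxy='http://127.0.0.1:8080')

    # download() can also be called on its own to fetch a single page.
    html = download('http://www.tuniu.com/corp/sitemap.shtml')
    print len(html)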
  • Original post: https://www.cnblogs.com/flying-tiger/p/8038017.html