#!/usr/bin/env python
# coding:utf-8
# Crawl jewellery-shop listings from dianping.com and dump name/address/average price to CSV.
import urllib2
import sys
import re
import os

reload(sys)
sys.setdefaultencoding('utf8')

#url = "http://www.dianping.com/search/category/1/20/g122"

def httpCrawler(url):
    # first page
    content = httpRequest(url)
    # other pages (the paging URL scheme is not handled yet)
    #for pageNo in range(2, 50):
    #    content = httpRequest(url)
    shops = parseHtml(content)
    getAllPages(shops)
    unpackOneShop()
    #saveData(shops)

def httpRequest(url):
    resp = None
    html = None
    try:
        req_header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0'
            #'Accept':'text/html;q=0.9,*/*;q=0.8',
            #'Accept-Language':'en-US,en;q=0.5',
            #'Accept-Encoding':'gzip',
            #'Host':'j3.s2.dpfile.com',
            #'Connection':'keep-alive',
            #'Referer':'http://www.baidu.com'
        }
        req_timeout = 5
        req = urllib2.Request(url, None, req_header)
        resp = urllib2.urlopen(req, None, req_timeout)
        html = resp.read().decode('utf-8')
        #print html
    finally:
        if resp:
            resp.close()
    return html

def parseHtml(html):
    # Pull out the <div id="shop-all-list"> block that holds the result list.
    # The pattern is tied to dianping's markup at the time this was written.
    #shops = re.findall(r'<li class="" >(.+?)<li>', html, re.S)
    shops = re.findall(r'<div class="shop-list J_shop-list shop-all-list" id="shop-all-list">\s*<ul>.*?</ul>\s*</div>', html, re.S)
    return shops

def getAllPages(shops):
    # total 50 pages (only the first page is processed for now;
    # t_OneShop is initialised inside getEachShop)
    getEachShop(shops)
    print "################one page done."

def getEachShop(shops):
    # Split the listing block into the individual <li ...>...</li> shop entries.
    global t_OneShop
    t_OneShop = [''] * 15 * 50
    shops_string = ''.join(shops)
    i = 0
    t_start = shops_string.find(r'<li class="" >')
    all_end = shops_string.rfind('</li>')
    while i < 15 and t_start != -1 and all_end != -1:
        t_start = shops_string.find(r'<li class="" >', t_start)
        t_end = shops_string.find('</li>', t_start, all_end)
        if t_start == -1 or t_end == -1:
            break
        #print "t_start:", t_start
        #print "t_end:", t_end
        t_OneShop[i] = shops_string[t_start:t_end]
        #print t_OneShop[i]
        t_start = t_end
        i = i + 1

def unpackOneShop():
    # Write one CSV row per shop: name, address, average consumption per head.
    global t_OneShop
    if not os.path.exists('./zhubao'):
        os.mkdir('./zhubao')
    f = open('./zhubao/shops.csv', 'w')
    f.write('\xEF\xBB\xBF')            # UTF-8 BOM so spreadsheet tools detect the encoding
    f.write('名称,地址,人均消费,,,')    # header columns: name, address, average consumption
    f.write('\n')
    f.close()

    for i in range(0, 15):
        #print t_OneShop[i]
        f = open('./zhubao/shops.csv', 'ab+')
        ShopName = re.findall(r'<h4>(.*?)</h4>', t_OneShop[i])
        #ShopDistrict =
        address = re.findall(r'<span class="addr">(.*?)</span>', t_OneShop[i])
        mean_price = re.findall(r'mean-price" target="_blank" >(.*?)</span>', t_OneShop[i], re.S)
        averageConsumption = re.findall(r'<b>(.*?)</b>', ''.join(mean_price), re.S)
        print 'mean_price:', mean_price
        print 'average:', averageConsumption

        ShopName.extend(address)
        ShopName.extend(averageConsumption)
        print (','.join(ShopName)).replace(' ', '')
        f.write((','.join(ShopName)).replace(' ', ''))
        f.write('\n')
        f.close()
    #province =
    #city =
    #adminDistrict =

def saveData(data):
    if not os.path.exists('./zhubao'):
        os.mkdir(r'./zhubao')
    f = open('./zhubao/zhubao_shops.csv', 'wb')
    f.write(data)
    f.close()

if __name__ == '__main__':
    url = "http://www.dianping.com/search/category/1/20/g122"
    httpCrawler(url)

'''
Python 2.6 has no urllib.request
multithreading
gevent

Basic structure of a crawler system:
1. Network requests:
   The simplest tools are urllib and urllib2, which cover basic downloading.
   For a step up you can go asynchronous with multiple threads; for higher
   efficiency, non-blocking solutions such as tornado or curl can do
   non-blocking downloads.
2. Extracting structured data:
   To find new links in a page you need to parse the page and de-duplicate
   URLs. Both regular expressions and the DOM can do this; use whichever you
   know better. Regexes feel somewhat faster; the DOM is slower and a bit more
   complex, but if you also want other structure or content from the page
   besides URLs, the DOM is more convenient. For URL de-duplication, memcache
   or redis works at small scale; at large scale you need a Bloom filter.
3. Data storage:
   If you crawl little, store it however you like. If you crawl a lot and need
   convenient reads, design the storage properly: hash-partitioned storage on
   an RDBMS, or directly in HBase, depending on data volume and concrete
   requirements.

(A minimal sketch of the gevent / URL de-duplication idea follows below.)
'''
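# ---------------------------------------------------------------------------
# Appendix (not part of the original script): a minimal sketch of two ideas
# from the notes above -- cooperative (non-blocking) downloads with gevent and
# small-scale URL de-duplication with an in-memory set.  The names
# crawl_pages and _seen_urls are illustrative assumptions, not something the
# script above defines; for large crawls the notes suggest redis/memcache or
# a Bloom filter instead of a plain set.
# ---------------------------------------------------------------------------
_seen_urls = set()   # URLs fetched so far (small-scale de-duplication only)

def crawl_pages(urls):
    # Sketch only: fetch several listing pages concurrently by reusing the
    # httpRequest() defined above, skipping URLs that were already seen.
    # Requires gevent to be installed; imports are kept local so the rest of
    # the script still runs without it.
    import gevent
    from gevent import monkey
    monkey.patch_all()   # in a real program this patching belongs at the very top of the script

    new_urls = [u for u in urls if u not in _seen_urls]
    _seen_urls.update(new_urls)
    jobs = [gevent.spawn(httpRequest, u) for u in new_urls]
    gevent.joinall(jobs, timeout=30)
    return [job.value for job in jobs]   # each value is the HTML string (or None on failure)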