# -*- coding: utf-8 -*-
# Estimate the total number of shops per city on dianping.com for one category,
# then convert that count into an expected crawl time (at ~5 seconds per shop).
__author__ = 'dell'

from lxml import etree
import urllib2
import time


def loadCategory():
    """Load category mappings from catetory.txt (GBK encoded).

    Each line is '<categoryId> <categoryName>'; returns {categoryName: categoryId}.
    """
    res = {}
    f_txt = open('catetory.txt')
    while True:
        line = f_txt.readline()
        if not line:
            break
        line = line.strip().decode('gbk')
        tokens = line.split(' ')
        if len(tokens) < 2:
            continue
        key = tokens[1].strip()
        print key
        val = tokens[0].strip()
        res[key] = val
    f_txt.close()
    return res


def loadCity():
    """Load city mappings from city.txt (GBK encoded).

    Each line is '<cityName>:<cityId>'; returns {cityName: cityId}.
    """
    res = {}
    f_txt = open('city.txt')
    while True:
        line = f_txt.readline()
        if not line:
            break
        line = line.strip().decode('gbk')
        tokens = line.split(':')
        if len(tokens) < 2:
            continue
        key = tokens[0].strip()
        val = tokens[1].strip()
        if key in res:
            print 'repeated city:', key
        else:
            res[key] = val
    f_txt.close()
    return res


cats = loadCategory()
# for key in cats.keys():
#     print key, cats[key]
citys = loadCity()
# for key in citys.keys():
#     print key, citys[key]

print 'length of category:', len(cats)
print 'length of cities:', len(citys)

print 'generating urls ... ...'
standard = 'http://www.dianping.com/search/category/%s/%s'


def gen(cateName):
    """Build (url, cityName) pairs for the given category across all cities."""
    res = []
    if cateName in cats:
        catId = cats[cateName]
        for cityName in citys:
            cityId = citys[cityName]
            url = standard % (cityId, catId)
            res.append((url, cityName))
    return res


def getHtml(url):
    """Fetch a page with a browser-like User-Agent and return it as unicode."""
    request = urllib2.Request(url)
    request.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0')
    doc = urllib2.urlopen(request, timeout=45).read().decode('utf8')
    return doc


def getFetchHour(count):
    """Estimated crawl time in hours, assuming 5 seconds per shop."""
    return count * 5.0 / 3600


def getFetchDay(count):
    """Estimated crawl time in days, assuming 5 seconds per shop."""
    return (count * 5.0 / 3600) / 24


urllist = gen(u'购物')  # u'购物' = the "Shopping" category
print len(urllist)
total = 0
for u in urllist:
    html = getHtml(u[0])
    tree = etree.HTML(html)
    # The span with class 'Color7' holds the shop count for the city, e.g. '(1234)'.
    hnc = tree.xpath("//span[@class='Color7']")
    for hn in hnc:
        strnum = hn.text.replace('(', '').replace(')', '')
        print u[1], strnum
        total += int(strnum)
    # time.sleep(5)
print total
print 'fetch time (hour) :' + str(getFetchHour(total))
print 'fetch time (day) :' + str(getFetchDay(total))
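

# --- Optional sketch, not wired into the flow above ---
# getHtmlWithRetry is a hypothetical wrapper (not part of the original script)
# showing how the 5-second pacing assumed by getFetchHour/getFetchDay could be
# combined with simple retries around getHtml; retries and delay are illustrative.
def getHtmlWithRetry(url, retries=3, delay=5):
    for attempt in range(retries):
        try:
            return getHtml(url)
        except Exception, e:
            print 'fetch failed (%d/%d) for %s:' % (attempt + 1, retries, url), e
            time.sleep(delay)
    return None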