shops


    #!/usr/bin/env python
    #coding:utf-8
    # Python 2 crawler that scrapes shop listings from dianping.com into a CSV file
    import urllib2,sys,re,os
    
    # force UTF-8 as the default string encoding (Python 2 idiom)
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    #url="http://www.dianping.com/search/category/1/20/g122"
    
    def httpCrawler(url):
        # fetch the listing page, split it into per-shop fragments and dump them to CSV
        #first page
        content = httpRequest(url)
        #other page
        #for pageNo in range(2,50):
        #    content = httpRequest(url)
        shops=parseHtml(content)
        getAllPages(shops)
        unpackOneShop()
        #saveData(shops)
       
    
    def httpRequest(url):
        # download one page and return the decoded HTML
        html = None
        resp = None
        req_header = {
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0'
            #'Accept':'text/html;q=0.9,*/*;q=0.8',
            #'Accept-Language':'en-US,en;q=0.5',
            #'Accept-Encoding':'gzip',
            #'Host':'j3.s2.dpfile.com',
            #'Connection':'keep-alive',
            #'Referer':'http://www.baidu.com'
        }
        req_timeout = 5
        try:
            req = urllib2.Request(url,None,req_header)
            resp = urllib2.urlopen(req,None,req_timeout)
            html = resp.read().decode('utf-8')
            #print html
        finally:
            # resp starts as None, so this is safe even when urlopen() raises
            if resp:
                resp.close()
        return html
    
    def parseHtml(html):
        # grab the <div> that wraps the whole shop list; re.S lets .*? match across newlines
        #shops = re.findall(r'<li class="" >(.+?)<li>',html,re.S)
        shops = re.findall(r'<div class="shop-list J_shop-list shop-all-list" id="shop-all-list">\s*<ul>.*?</ul>\s*</div>',html,re.S)
        return shops
    
    def getAllPages(shops):
        #total 50 pages
        getEachShop(shops)
        print "################one page done."
    
    # flat buffer holding the raw HTML fragment of each shop (15 shops per page, up to 50 pages)
    t_OneShop=['']*15*50
    
    def getEachShop(shops):
        # slice the concatenated list HTML into one <li>...</li> fragment per shop
        global t_OneShop
        t_OneShop=['']*15*50
        shops_string=''.join(shops)
    
        i=0
        t_start = shops_string.find(r'<li class="" >')
        all_end = shops_string.rfind('</li>')
        while i<15 and t_start!=-1 and all_end!=-1:
            t_start = shops_string.find(r'<li class="" >',t_start)
            if t_start==-1:
                break
            t_end = shops_string.find('</li>',t_start,all_end)
            #print "t_start:",t_start
            #print "t_end:",t_end
            t_OneShop[i] = shops_string[t_start:t_end]
            #print t_OneShop[i]
    
            t_start=t_end
            i=i+1
    
    
    def unpackOneShop():
        # pull name, address and average price out of each <li> fragment and append them to a CSV
        global t_OneShop
    
        if not os.path.exists('./zhubao'):
            os.mkdir(r'./zhubao')
        f = open('./zhubao/shops.csv', 'w')
        f.write('\xEF\xBB\xBF')            # UTF-8 BOM so spreadsheet tools detect the encoding
        f.write('名称,地址,人均消费,,,')      # CSV header: name, address, average consumption
        f.write('\n')
        f.close()
        for i in range(0,15):
            #print t_OneShop[i]
    
            f = open('./zhubao/shops.csv', 'ab+')
    
            ShopName = re.findall(r'<h4>(.*?)</h4>',t_OneShop[i])
            #ShopDistrict = 
            address = re.findall(r'<span class="addr">(.*?)</span>',t_OneShop[i])
            mean_price = re.findall(r'mean-price" target="_blank" >(.*?)</span>',t_OneShop[i],re.S)
            averageConsumption = re.findall(r'<b>(.*?)</b>',''.join(mean_price),re.S)
    
            print 'mean_price:',mean_price
            print 'averageConsumption:',averageConsumption
            ShopName.extend(address)
            ShopName.extend(averageConsumption)
    
            print (','.join(ShopName)).replace('\n','')
            f.write((','.join(ShopName)).replace('\n',''))
            f.write('\n')
            f.close()
    
        #province = 
        #city =
        #adminDistrict =
        
        #iprovince = 
        #city =
        #adminDistrict =
    
    
    def saveData(data):
        if not os.path.exists('./zhubao'):
            os.mkdir(r'./zhubao')
        f = open('./zhubao/zhubao_shops.csv', 'wb')
        f.write(data)
        f.close()
    
    
    if __name__ == '__main__':
        url="http://www.dianping.com/search/category/1/20/g122"
        httpCrawler(url)
    
    
    '''
    python2.6 has no urllib.request
    multithreading
    gevent
    Basic structure of a crawler system:
    1. Network requests.
    The simplest tools are urllib and urllib2, which cover basic downloading. To go further, multiple threads give you asynchronous fetching; for higher efficiency, a non-blocking approach with tornado or curl can do non-blocking downloads.
    2. Extracting structured data.
    To find new links in a page you need to parse the page and de-duplicate URLs; either regular expressions or the DOM can do this, so use whichever you know better.
    Regex feels somewhat faster; the DOM is slower and a bit more complex. If all you need are URLs, regex is enough; if you also want other structure or content from the page, the DOM is more convenient.
    For URL de-duplication, memcache or redis is fine at small scale; at large scale you need a Bloom filter. (A minimal sketch of this fetch/parse/de-duplicate loop follows these notes.)
    3. Data storage.
    If you crawl little, any storage works. If you crawl a lot and need convenient reads, design it carefully: hash-partitioned storage on an RDBMS, or storing directly in HBase, depending on your data volume and concrete requirements.
    '''
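
To make the notes above concrete, here is a minimal sketch (not part of the original script) of the fetch / parse / de-duplicate loop from points 1 and 2: links are pulled out with a regex and de-duplicated with an in-memory set, which you would swap for redis or a Bloom filter at larger scale. The seed URL, the link pattern and the page limit are illustrative assumptions.

    #!/usr/bin/env python
    #coding:utf-8
    # Minimal sketch of a fetch -> parse -> de-duplicate crawl loop (Python 2).
    # The seed URL, link regex and page limit are illustrative assumptions.
    import urllib2,re
    from collections import deque
    
    def fetch(url, timeout=5):
        # plain blocking download with urllib2; threads or tornado would raise throughput
        req = urllib2.Request(url, None, {'User-Agent':'Mozilla/5.0'})
        resp = urllib2.urlopen(req, None, timeout)
        try:
            return resp.read()
        finally:
            resp.close()
    
    def crawl(seed, max_pages=10):
        seen = set([seed])        # small-scale de-dup; use redis or a Bloom filter when this grows
        queue = deque([seed])
        pages = 0
        while queue and pages < max_pages:
            url = queue.popleft()
            try:
                html = fetch(url)
            except Exception as e:
                print 'failed:', url, e
                continue
            pages = pages + 1
            # regex is enough when you only need URLs; a DOM parser is handier for richer structure
            for link in re.findall(r'href="(http[^"]+)"', html):
                if link not in seen:
                    seen.add(link)
                    queue.append(link)
        return seen
    
    if __name__ == '__main__':
        for u in crawl("http://www.dianping.com/search/category/1/20/g122"):
            print u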
Original post: https://www.cnblogs.com/timssd/p/4650009.html