• Getting Started with Web Crawlers, Case 3: Crawling Merchant Information from Dianping (大众点评)


    pyspider:http://demo.pyspider.org/

    CSS selectors: http://www.w3school.com.cn/cssref/css_selectors.asp

    Beautiful Soup:http://beautifulsoup.readthedocs.io/zh_CN/latest/

    Regular expressions: http://www.cnblogs.com/deerchao/archive/2006/08/24/zhengzhe30fengzhongjiaocheng.html

    Goals of this post:

    http://www.dianping.com/search/keyword/3/0_%E4%B8%80%E9%B8%A3%E7%9C%9F%E9%B2%9C%E5%A5%B6%E5%90%A7

    1. Crawl the store information for every 一鸣真鲜奶吧 (Yiming Zhenxian Naiba) shop

    2. Crawl all of the reviews for each shop

    3. Save the crawled content to a database (not implemented in the script; a minimal sketch is given after the script below)
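
    One recurring trick in the script below is worth calling out: Dianping encodes star ratings in a CSS class name (the script matches the prefixes mid-str, sml-str and irr-star), and the numeric score is recovered by pulling the digits out of that class with a regular expression and dividing by 10. A small standalone illustration of the idea, with made-up class values:

    # -*- coding: utf-8 -*-
    import re

    def star_class_to_score(css_class, prefix):
        # e.g. prefix 'irr-star' and class 'irr-star45' -> 4.5; None if the class does not match
        digits = re.findall(prefix + r'(\d+)', css_class)
        if not digits:
            return None
        return int(digits[0]) * 1.0 / 10

    print star_class_to_score('irr-star45', 'irr-star')   # 4.5
    print star_class_to_score('sml-str40', 'sml-str')     # 4.0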

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2016-06-07 07:40:58
    # Project: dazhongdianping
    
    from pyspider.libs.base_handler import *
    from bs4 import BeautifulSoup
    from pymongo import MongoClient
    import base64
    import re
    
    
    # module-level counters shared across callbacks (sequential ids for shops and reviews)
    id = 0
    count = 0
    number = 0
    
    
    class Handler(BaseHandler):
        crawl_config = {
        }
    
        @every(minutes=24 * 60)
        def on_start(self):
            self.crawl('http://www.dianping.com/search/keyword/3/0_%E4%B8%80%E9%B8%A3%E7%9C%9F%E9%B2%9C%E5%A5%B6%E5%90%A7', callback=self.local_page)
            
        @config(age=2 * 24 * 60 * 60)  # age is in seconds: treat results as valid for 2 days
        def local_page(self, response):
            
            # save_local: helper that stores the raw page locally (its definition is not shown in this post)
            self.save_local('remark', response.url, response.doc)
            for each in response.doc('DIV.pic>A').items():
                
                self.crawl(each.attr.href, callback=self.index_page)
                
            # next page of search results
            for each in response.doc('A.next').items():
                
                self.crawl(each.attr.href, callback=self.local_page)
    
        @config(age=3 * 24 * 60 * 60)  # age is in seconds: treat results as valid for 3 days
        def index_page(self,response):
            
            global number
            
            
            # basic shop information block
            for each in response.doc('DIV#basic-info').items():
                
                number +=1
                
                info={}
                tmp = BeautifulSoup(str(each), 'html.parser')
                name = tmp.find('h1',class_='shop-name')
                
                # shop number (sequential id assigned by the crawler)
                info['itemid']=number
                
                # shop name
                if re.findall(r'<h1 class="shop-name">\s+(.*)',str(name)):
                    info['name']=re.findall(r'<h1 class="shop-name">\s+(.*)',str(name))[0]
                else:
                    info['name']='-'
                    
                # branch name, if the shop is a branch
                if re.findall(r'<a class="branch J-branch">(.*)<i class="icon i-arrow"></i></a>',str(name)):
    
                    info['branch']=re.findall(r'<a class="branch J-branch">(.*)<i class="icon i-arrow"></i></a>',str(name))[0]
                else:
                    info['branch']='-'
                    
                # items from the shop's brief-info block
                info['basic_info']=[]
    
                basic_info = tmp.find("div",class_="brief-info")
                
                if basic_info:
                    # star rating, encoded in the span's class name (e.g. mid-str40 -> 4.0)
                    star=basic_info.span.get('class')[1]
                    
                    info['level']=int(re.findall(r'mid-str(.*)',str(star))[0])*1.0/10
                    print info['level']
                    for td in basic_info.find_all('span',class_="item"):
                        
                        info['basic_info'].append(td.string.encode('utf-8'))
                else:
                    info['level']='-'
                # district name
                region=tmp.find('span',itemprop='locality region')
                
                
                # street address
                address=tmp.find('span',class_='item',itemprop="street-address")
                
                
                if region:
                    info['region']=region.string.encode('utf-8')
                else:
                    info['region']='-'
                 
                if address:
                        
                    info['address']=address.string.encode('utf-8').strip()
                    
                else:
                    info['address']='-'
                
                # telephone
                tel=tmp.find('p',class_="expand-info tel")
                if tel:
                        
                    info['telephone']=tel.find('span',class_='item').string.encode('utf-8')
                    
                else:
                    info['telephone']='-'
                    
             
            # follow the "all comments" link if there is one
            if response.doc('P.comment-all>A'):
                
                for each in response.doc('P.comment-all>A').items():
                    
                    self.crawl(each.attr.href, callback=self.detail_page_all)
            # otherwise all comments are already shown on the current page
            else:
                
                self.crawl(response.url,callback=self.detail_page)
    
        @config(age=4 * 24 * 60 * 60)  # age is in seconds: treat results as valid for 4 days
        def detail_page(self, response):
            
            
            global id
            
            each = BeautifulSoup(str(response.doc), 'html.parser')
            
            # collect the comment items
            tmp=each.find_all('li',class_="comment-item")
    
            for tr in tmp:
                    
                res={}
                    
                id +=1
                    
                # review id
                res['itemid']=id
                
                # username
                if tr.find('p',class_='user-info'):
                    res['user']=tr.find('p',class_='user-info').a.string.encode('utf-8')
                else:
                    res['user']='-'
                    
                res['comment']={}
                    
                # review date
                date=tr.find('div',class_='misc-info')
                res['time']=date.find('span',class_='time').string.encode('utf-8')
                
                # shop info attached to the review
                info = tr.find('p',class_='shop-info')
                    
                # shop rating, encoded in the span's class name
                star=info.span.get('class')[1]
                res['level']=int(re.findall(r'sml-str(.*)',str(star))[0])*1.0/10
                # taste, environment and service scores
                if info.find_all('span',class_='item'):
                        
                    for thing in info.find_all('span',class_='item'):
                            
                        thing = thing.string.encode('utf-8').split('：')
                            
                        res['comment'][thing[0]]=thing[1]
                
                if info.find('span',class_='average'):
                    res['price']=info.find('span',class_='average').string.encode('utf-8').split('：')[1]
                else:
                    res['price']='-'
                   
                # expanded (full) comment text
                content=tr.find('div',class_='info J-info-all Hide')
                    
                if content:
                        
                    res['content']=content.p.string.encode('utf-8')
                    
                else:
                    if tr.find('div',class_='info J-info-short'):
                            
                        res['content']=tr.find('div',class_='info J-info-short').p.string.encode('utf-8').strip()
                            
                    else:
                        res['content']='-'
                        
                
        @config(age=4 * 24 * 60 * 60)  # age is in seconds: treat results as valid for 4 days
        def detail_page_all(self, response):
            
            global count
            
            
            # collect every comment on this page
            for each in response.doc('DIV.comment-list').items():
                
                each = BeautifulSoup(str(each), 'html.parser')
                
                tmp=each.find_all('li')
                
                for tr in tmp:
                   
                    res={}
                    count += 1
                   
                    # review id
                    res['itemid']=count
                    
                    # star rating
                    star=tr.find('div',class_='content')
                    if star:
                        
                        rank=star.span.get('class')[1]
                    
                        res['level']=int(re.findall(r'irr-star(.*)',str(rank))[0])*1.0/10
                        
                    else:
                        continue
                        
                    # review date
                    date=tr.find('div',class_='misc-info')
                    res['time']=date.find('span',class_='time').string.encode('utf-8')
                    
                    # username
                    name = tr.find('div',class_='pic')
                    if name:
                        
                        res['user']=name.find('p',class_='name').string.encode('utf-8')
                    else:
                        
                        res['user']='-'
                    
                    # taste / environment / service scores
                    res['comment']={}
                    page=tr.find('div',class_='comment-rst')
                    if page:
                        
                        info= re.findall('class="rst">(.*)<em class="col-exp">(.*)</em></span>',str(page))
                        
                        
                        if info:
    
                            for td in info:
    
                                res['comment'][td[0]]=td[1].strip('(').strip(')')
                    # whether this is a group-buy review
                    group=tr.find('div',class_='comment-txt')
                    if group.find('a',target='blank'):
                        
                        res['shopping_group']=group.find('a',target='blank').string.encode('utf-8')
                        
                    else:
                        res['shopping_group']='-'
                        
                    # average price per person
                    price=tr.find('span',class_='comm-per')
                    if price:
                        res['price']=price.string.encode('utf-8')
    
                    else:
                        res['price']='-'
                    # brief comment text
                    if tr.find('div',class_='J_brief-cont'):
                        
                        tmp = str(tr.find('div',class_='J_brief-cont'))
                        res['content']=re.findall(r'<div class="J_brief-cont">([\w\W]*)</div>',tmp)[0].strip()
                        
                    else:
                        res['content']='-'
                    
            
            # next page of comments
            for each in response.doc('A.NextPage').items():
               
                self.crawl(each.attr.href, callback=self.detail_page_all)  
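
    Goal 3, saving the crawled data to a database, is not implemented in the script above even though pymongo is imported and the callbacks never return the info/res dicts they build. The snippet below is a minimal, hedged sketch of one way it could be wired up using pyspider's on_result hook and a local MongoDB. The connection URI, the database name dianping and the collection name records are assumptions made for this sketch, and each callback above would also need to end with return info or return res so that on_result has something to store.

    # Hedged sketch only: persist whatever each callback returns via pyspider's on_result hook.
    from pymongo import MongoClient

    # assumed local MongoDB instance; database and collection names are invented for this sketch
    client = MongoClient('mongodb://localhost:27017/')
    db = client['dianping']

    class Handler(BaseHandler):
        # ... the callbacks shown above, each ending with "return info" or "return res" ...

        def on_result(self, result):
            # pyspider passes the return value of every callback to on_result
            if not result:
                return
            db['records'].insert_one(result)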
               