• Getting Started with Web Crawlers, Case 3: Crawling Merchant Information from Dianping (大众点评)


    pyspider:http://demo.pyspider.org/

    CSS selectors: http://www.w3school.com.cn/cssref/css_selectors.asp

    Beautiful Soup:http://beautifulsoup.readthedocs.io/zh_CN/latest/

    Regular expressions: http://www.cnblogs.com/deerchao/archive/2006/08/24/zhengzhe30fengzhongjiaocheng.html

    Goals of this post:

    http://www.dianping.com/search/keyword/3/0_%E4%B8%80%E9%B8%A3%E7%9C%9F%E9%B2%9C%E5%A5%B6%E5%90%A7

    1. Crawl the store information for every 一鸣真鲜奶吧 (Yiming Zhenxian Naiba) shop

    2. Crawl all of the reviews for each shop

    3. Save the crawled content to a database (not implemented in the script; a minimal sketch is given after the script below)
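
    One recurring trick in the script below is worth calling out: Dianping encodes star ratings in a CSS class name (the script matches the prefixes mid-str, sml-str and irr-star), and the numeric score is recovered by pulling the digits out of that class with a regular expression and dividing by 10. A small standalone illustration of the idea, with made-up class values:

    # -*- coding: utf-8 -*-
    import re

    def star_class_to_score(css_class, prefix):
        # e.g. prefix 'irr-star' and class 'irr-star45' -> 4.5; None if the class does not match
        digits = re.findall(prefix + r'(\d+)', css_class)
        if not digits:
            return None
        return int(digits[0]) * 1.0 / 10

    print star_class_to_score('irr-star45', 'irr-star')   # 4.5
    print star_class_to_score('sml-str40', 'sml-str')     # 4.0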

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2016-06-07 07:40:58
    # Project: dazhongdianping
    
    from pyspider.libs.base_handler import *
    from bs4 import BeautifulSoup
    from pymongo import MongoClient
    import base64
    import re
    
    
    # module-level counters shared across callbacks (sequential ids for shops and reviews)
    id = 0
    count = 0
    number = 0
    
    
    class Handler(BaseHandler):
        crawl_config = {
        }
    
        @every(minutes=24 * 60)
        def on_start(self):
            self.crawl('http://www.dianping.com/search/keyword/3/0_%E4%B8%80%E9%B8%A3%E7%9C%9F%E9%B2%9C%E5%A5%B6%E5%90%A7', callback=self.local_page)
            
        @config(age=2 * 24 * 60 * 60)  # age is in seconds: treat results as valid for 2 days
        def local_page(self, response):
            
            # save_local: helper that stores the raw page locally (its definition is not shown in this post)
            self.save_local('remark', response.url, response.doc)
            for each in response.doc('DIV.pic>A').items():
                
                self.crawl(each.attr.href, callback=self.index_page)
                
            # next page of search results
            for each in response.doc('A.next').items():
                
                self.crawl(each.attr.href, callback=self.local_page)
    
        @config(age=3 * 24 * 60 * 60)  # age is in seconds: treat results as valid for 3 days
        def index_page(self,response):
            
            global number
            
            
            # basic shop information block
            for each in response.doc('DIV#basic-info').items():
                
                number +=1
                
                info={}
                tmp = BeautifulSoup(str(each), 'html.parser')
                name = tmp.find('h1',class_='shop-name')
                
                # shop number (sequential id assigned by the crawler)
                info['itemid']=number
                
                # shop name
                if re.findall(r'<h1 class="shop-name">\s+(.*)',str(name)):
                    info['name']=re.findall(r'<h1 class="shop-name">\s+(.*)',str(name))[0]
                else:
                    info['name']='-'
                    
                # branch name, if the shop is a branch
                if re.findall(r'<a class="branch J-branch">(.*)<i class="icon i-arrow"></i></a>',str(name)):
    
                    info['branch']=re.findall(r'<a class="branch J-branch">(.*)<i class="icon i-arrow"></i></a>',str(name))[0]
                else:
                    info['branch']='-'
                    
                # items from the shop's brief-info block
                info['basic_info']=[]
    
                basic_info = tmp.find("div",class_="brief-info")
                
                if basic_info:
                    # star rating, encoded in the span's class name (e.g. mid-str40 -> 4.0)
                    star=basic_info.span.get('class')[1]
                    
                    info['level']=int(re.findall(r'mid-str(.*)',str(star))[0])*1.0/10
                    print info['level']
                    for td in basic_info.find_all('span',class_="item"):
                        
                        info['basic_info'].append(td.string.encode('utf-8'))
                else:
                    info['level']='-'
                # district name
                region=tmp.find('span',itemprop='locality region')
                
                
                # street address
                address=tmp.find('span',class_='item',itemprop="street-address")
                
                
                if region:
                    info['region']=region.string.encode('utf-8')
                else:
                    info['region']='-'
                 
                if address:
                        
                    info['address']=address.string.encode('utf-8').strip()
                    
                else:
                    info['address']='-'
                
                # telephone
                tel=tmp.find('p',class_="expand-info tel")
                if tel:
                        
                    info['telephone']=tel.find('span',class_='item').string.encode('utf-8')
                    
                else:
                    info['telephone']='-'
                    
             
            # follow the "all comments" link if there is one
            if response.doc('P.comment-all>A'):
                
                for each in response.doc('P.comment-all>A').items():
                    
                    self.crawl(each.attr.href, callback=self.detail_page_all)
            # otherwise all comments are already shown on the current page
            else:
                
                self.crawl(response.url,callback=self.detail_page)
    
        @config(age=4 * 24 * 60 * 60)  # age is in seconds: treat results as valid for 4 days
        def detail_page(self, response):
            
            
            global id
            
            each = BeautifulSoup(str(response.doc), 'html.parser')
            
            # collect the comment items
            tmp=each.find_all('li',class_="comment-item")
    
            for tr in tmp:
                    
                res={}
                    
                id +=1
                    
                # review id
                res['itemid']=id
                
                # username
                if tr.find('p',class_='user-info'):
                    res['user']=tr.find('p',class_='user-info').a.string.encode('utf-8')
                else:
                    res['user']='-'
                    
                res['comment']={}
                    
                # review date
                date=tr.find('div',class_='misc-info')
                res['time']=date.find('span',class_='time').string.encode('utf-8')
                
                # shop info attached to the review
                info = tr.find('p',class_='shop-info')
                    
                # shop rating, encoded in the span's class name
                star=info.span.get('class')[1]
                res['level']=int(re.findall(r'sml-str(.*)',str(star))[0])*1.0/10
                # taste, environment and service scores
                if info.find_all('span',class_='item'):
                        
                    for thing in info.find_all('span',class_='item'):
                            
                        thing = thing.string.encode('utf-8').split('：')
                            
                        res['comment'][thing[0]]=thing[1]
                
                if info.find('span',class_='average'):
                    res['price']=info.find('span',class_='average').string.encode('utf-8').split('：')[1]
                else:
                    res['price']='-'
                   
                # expanded (full) comment text
                content=tr.find('div',class_='info J-info-all Hide')
                    
                if content:
                        
                    res['content']=content.p.string.encode('utf-8')
                    
                else:
                    if tr.find('div',class_='info J-info-short'):
                            
                        res['content']=tr.find('div',class_='info J-info-short').p.string.encode('utf-8').strip()
                            
                    else:
                        res['content']='-'
                        
                
        @config(age=4 * 24 * 60 * 60)  # age is in seconds: treat results as valid for 4 days
        def detail_page_all(self, response):
            
            global count
            
            
            # collect every comment on this page
            for each in response.doc('DIV.comment-list').items():
                
                each = BeautifulSoup(str(each), 'html.parser')
                
                tmp=each.find_all('li')
                
                for tr in tmp:
                   
                    res={}
                    count += 1
                   
                    # review id
                    res['itemid']=count
                    
                    # star rating
                    star=tr.find('div',class_='content')
                    if star:
                        
                        rank=star.span.get('class')[1]
                    
                        res['level']=int(re.findall(r'irr-star(.*)',str(rank))[0])*1.0/10
                        
                    else:
                        continue
                        
                    # review date
                    date=tr.find('div',class_='misc-info')
                    res['time']=date.find('span',class_='time').string.encode('utf-8')
                    
                    # username
                    name = tr.find('div',class_='pic')
                    if name:
                        
                        res['user']=name.find('p',class_='name').string.encode('utf-8')
                    else:
                        
                        res['user']='-'
                    
                    # taste / environment / service scores
                    res['comment']={}
                    page=tr.find('div',class_='comment-rst')
                    if page:
                        
                        info= re.findall('class="rst">(.*)<em class="col-exp">(.*)</em></span>',str(page))
                        
                        
                        if info:
    
                            for td in info:
    
                                res['comment'][td[0]]=td[1].strip('(').strip(')')
                    # whether this is a group-buy review
                    group=tr.find('div',class_='comment-txt')
                    if group.find('a',target='blank'):
                        
                        res['shopping_group']=group.find('a',target='blank').string.encode('utf-8')
                        
                    else:
                        res['shopping_group']='-'
                        
                    # average price per person
                    price=tr.find('span',class_='comm-per')
                    if price:
                        res['price']=price.string.encode('utf-8')
    
                    else:
                        res['price']='-'
                    # brief comment text
                    if tr.find('div',class_='J_brief-cont'):
                        
                        tmp = str(tr.find('div',class_='J_brief-cont'))
                        res['content']=re.findall(r'<div class="J_brief-cont">([\w\W]*)</div>',tmp)[0].strip()
                        
                    else:
                        res['content']='-'
                    
            
            # next page of comments
            for each in response.doc('A.NextPage').items():
               
                self.crawl(each.attr.href, callback=self.detail_page_all)  
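
    Goal 3, saving the crawled data to a database, is not implemented in the script above even though pymongo is imported and the callbacks never return the info/res dicts they build. The snippet below is a minimal, hedged sketch of one way it could be wired up using pyspider's on_result hook and a local MongoDB. The connection URI, the database name dianping and the collection name records are assumptions made for this sketch, and each callback above would also need to end with return info or return res so that on_result has something to store.

    # Hedged sketch only: persist whatever each callback returns via pyspider's on_result hook.
    from pymongo import MongoClient

    # assumed local MongoDB instance; database and collection names are invented for this sketch
    client = MongoClient('mongodb://localhost:27017/')
    db = client['dianping']

    class Handler(BaseHandler):
        # ... the callbacks shown above, each ending with "return info" or "return res" ...

        def on_result(self, result):
            # pyspider passes the return value of every callback to on_result
            if not result:
                return
            db['records'].insert_one(result)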
               