1.安装环境
http://pan.baidu.com/s/1bnAKBSz
修改环境变量 path,添加 C:\Python27;C:\Python27\Scripts;
2.如需使用mysql等数据库请自行安装
3.安装好环境后,进入命令行,进入到工作目录,使用以下命令创建工程(工程名以doubanmoive为例)
scrapy startproject doubanmoive
4.做一系列修改使用以下命令运行项目
scrapy crawl doubanmoive
5.scrapy的一些注意地方
(1)步骤3完成以后,项目的目录应该是这样的(根目录各人不同)
D:\WEB\Python\doubanmoive>tree /f
Folder PATH listing for volume Data
Volume serial number is 00000200 34EC:9CB9
D:.
│ scrapy.cfg
│
└─doubanmoive
│ items.py
│ pipelines.py
│ settings.py
│ __init__.py
│
└─spiders
__init__.py
moive_spider.py
moive_spider.pyc
(2)这些文件主要功能为:
- doubanmoive/items.py:定义需要获取的内容字段,类似于实体类。
- doubanmoive/pipelines.py:项目管道文件,用来处理Spider抓取的数据。
- doubanmoive/settings.py:项目配置文件
- doubanmoive/spiders/moive_spider.py:爬虫(spider)文件,定义页面抓取与解析逻辑。
(3)demo
doubanmoive/items.py
from scrapy.item import Item, Field


class DoubanmoiveItem(Item):
    """Container for the fields scraped from one Douban movie page."""
    name = Field()            # movie title
    year = Field()            # release year
    score = Field()           # Douban rating
    director = Field()        # director
    classification = Field()  # genre(s)
    actor = Field()           # leading actor(s)
    img = Field()             # poster / still image URL
doubanmoive/pipelines.py
# -*- coding: utf-8 -*- from scrapy import log from twisted.enterprise import adbapi from scrapy.http import Request from scrapy.selector import HtmlXPathSelector import urllib import MySQLdb import MySQLdb.cursors class DoubanmoivePipeline(object): def __init__(self):
#定义mysql数据,db:mysql database name,user: mysql username,passwd:mysql password self.dbpool = adbapi.ConnectionPool('MySQLdb', db = 'python', user = 'root', passwd = 'root', cursorclass = MySQLdb.cursors.DictCursor, charset = 'utf8', use_unicode = False ) def process_item(self, item, spider): query = self.dbpool.runInteraction(self._conditional_insert, item) query.addErrback(self.handle_error) return item def _conditional_insert(self,tx,item):
#检查数据库中是否存在该名字的电影 tx.execute("select * from doubanmoive where m_name= %s",(item['name'][0],)) result=tx.fetchone() # log.msg(result,level=log.DEBUG) print result if result: log.msg("Item already stored in db:%s" % item,level=log.DEBUG) else:
#处理过多的主演和不符合的时间格式 classification=actor='' lenClassification=len(item['classification']) lenActor=len(item['actor']) for n in xrange(lenClassification): classification+=item['classification'][n] if n<lenClassification-1: classification+='/' for n in xrange(lenActor): actor+=item['actor'][n] if n<lenActor-1: actor+='/' #获取海报下载地址 site= item['img'][0]
#截取海报地址的最后一个/,生成本地的文件名 str = site.split('/'); print str path = str[-1] print 'local img path %s'%(path) #开始下载海报 print '--------------------download img %s'%(site) data = urllib.urlopen(site).read() newfile = open(path,"wb") newfile.write(data) newfile.close() #将这些数据插入到数据库里面 tx.execute( "insert into doubanmoive (m_name,m_year,m_score,m_director,m_classification,m_actor,m_img,m_local_img) values (%s,%s,%s,%s,%s,%s,%s,%s)", (item['name'][0],item['year'][0],item['score'][0],item['director'][0],classification,actor,site,path)) # log.msg("Item stored in db: %s" % item, level=log.DEBUG) def handle_error(self, e): log.err(e)
doubanmoive/spiders/moive_spider.py
# -*- coding: utf-8 -*-
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from doubanmoive.items import DoubanmoiveItem


class MoiveSpider(CrawlSpider):
    """Crawl the Douban Top-250 list and parse each movie's detail page."""

    name = "doubanmoive"
    # Domains the crawler is allowed to visit.
    allowed_domains = ["movie.douban.com"]
    # Seed URL: first page of the Top-250 list.
    start_urls = ["http://movie.douban.com/top250"]
    # Crawl rules. NOTE: the `\` characters had been stripped from the regexes
    # (`d+` for `\d+`, unescaped `?`), which would make them match wrongly.
    rules = [
        # Follow the paginated list pages (no callback: just keep crawling).
        Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/top250\?start=\d+.*'))),
        # Parse each movie detail page with parse_item.
        Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/subject/\d+')),
             callback="parse_item"),
    ]

    def parse_item(self, response):
        """Extract one DoubanmoiveItem from a movie detail page."""
        sel = HtmlXPathSelector(response)
        item = DoubanmoiveItem()
        item['name'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        # Year appears as "(1994)"; capture just the digits.
        item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
        item['score'] = sel.xpath('//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
        item['director'] = sel.xpath('//*[@id="info"]/span[1]/a/text()').extract()
        item['classification'] = sel.xpath('//span[@property="v:genre"]/text()').extract()
        item['actor'] = sel.xpath('//*[@id="info"]/span[3]/a[1]/text()').extract()
        # `select` is a deprecated alias of `xpath`; use xpath consistently.
        item['img'] = sel.xpath('//a/img/@src').extract()
        return item
doubanmoive/settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for the doubanmoive project.
#
# Only the most commonly tuned settings appear here; the full reference is at
# http://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'doubanmoive'

SPIDER_MODULES = ['doubanmoive.spiders']
NEWSPIDER_MODULE = 'doubanmoive.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'doubanmoive (+http://www.yourdomain.com)'

# Route every scraped item through the MySQL pipeline (priority 400).
ITEM_PIPELINES = {
    'doubanmoive.pipelines.DoubanmoivePipeline': 400,
}

LOG_LEVEL = 'DEBUG'

# Throttle requests (2s base delay, randomized) to crawl politely.
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True

# Present a desktop-browser user agent instead of Scrapy's default.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'

COOKIES_ENABLED = True
运行完成以后,数据库结果