#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-03-20 09:46:20
# Project: fly_spider
"""pyspider script that walks zhanqi.tv game listings down to single pages."""

import re
import time
#from pyspider.database.mysql.mysqldb import SQL
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq

# Links followed from the games index: anything below /games/.
_GAME_URL_RE = re.compile(r"http://www\.zhanqi\.tv/games/\w+", re.U)

# Links followed from a game's list page: any path directly below the site
# root that starts with a word character.
_PAGE_URL_RE = re.compile(r"http://www\.zhanqi\.tv/\w+", re.U)

# Script handed to the JS fetcher for every list/detail request.
# NOTE(review): ``window.scrollTo(...)`` is written as a call expression, so
# it runs immediately and its return value (not a function) is what gets
# passed to ``setTimeout``; the 5000 ms delay therefore has no effect on the
# scroll. Kept as-is to preserve the current fetch behaviour.
_SCROLL_JS = """
            function() {
                setTimeout(window.scrollTo(0,document.body.scrollHeight), 5000);
            }
            """


class Handler(BaseHandler):
    """Crawl http://www.zhanqi.tv/games, then each game list, then each page.

    The crawl has three stages, one callback per stage:
    ``on_start`` -> ``index_page`` -> ``list_page`` -> ``detail_page``.
    """

    # Request headers sent with every fetch (referenced by ``crawl_config``).
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36"
    }

    crawl_config = {
        "headers": headers,
        "timeout": 100
    }

    @every(minutes=1)
    def on_start(self):
        """Queue the games index page; its response goes to ``index_page``."""
        self.crawl('http://www.zhanqi.tv/games', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue a JS-rendered fetch for every game link on the index page.

        Only anchors whose ``href`` matches ``_GAME_URL_RE`` are followed;
        their responses go to ``list_page``.
        """
        print(response)
        for each in response.doc('a[href^="http://www.zhanqi.tv/games/"]').items():
            href = each.attr.href
            if href and _GAME_URL_RE.match(href):
                self.crawl(href, fetch_type='js', js_script=_SCROLL_JS,
                           callback=self.list_page)

    @config(age=1*60*60, priority=2)
    def list_page(self, response):
        """Queue a JS-rendered fetch for every entry in the page's hot list.

        Only anchors whose ``href`` matches ``_PAGE_URL_RE`` are followed;
        their responses go to ``detail_page``.
        """
        for each in response.doc('.active > div.live-list-tabc > ul#hotList.clearfix > li > a').items():
            href = each.attr.href
            # The selector does not require an href, so skip anchors without
            # one instead of handing None to the regex.
            if href and _PAGE_URL_RE.match(href):
                self.crawl(href, fetch_type='js', js_script=_SCROLL_JS,
                           callback=self.detail_page)

    @config(age=1*60*60, priority=2)
    def detail_page(self, response):
        """Extract the result record for one page.

        Returns a dict with the keys ``url``, ``author``, ``title``,
        ``game-name``, ``users2``, ``flash-cont`` and ``picture``.
        ``flash-cont`` holds the inner HTML of the last ``.video-flash-cont``
        element, or ``None`` when the page has no such element.
        """
        # Default to None so a page without the element still yields a
        # record rather than failing on an unassigned name.
        flash_html = None
        for each in response.doc('.video-flash-cont').items():
            flash_html = each.html()
            print(flash_html)

        return {
            "url": response.url,
            "author": response.doc('.meat > span').text(),
            "title": response.doc('.title-name').text(),
            "game-name": response.doc('span > .game-name').text(),
            "users2": response.doc(
                'div.live-anchor-info.clearfix > div.sub-anchor-info > div.clearfix > '
                'div.meat-info > span.num.dv.js-onlines-panel > span.dv.js-onlines-txt > span'
            ).text(),
            "flash-cont": flash_html,
            # NOTE(review): this takes the text of an <img> selection, which
            # is presumably always empty; the src attribute may have been
            # intended - confirm before changing.
            "picture": response.doc('.active > img').text(),
        }