• pyspider—爬取视频链接


    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2015-03-20 09:46:20
    # Project: fly_spider
    
    import re
    import time
    #from pyspider.database.mysql.mysqldb import SQL
    from pyspider.libs.base_handler import *
    from pyquery import PyQuery as pq
    
    class Handler(BaseHandler):
        """pyspider handler that crawls zhanqi.tv game listings down to room pages.

        Crawl chain: ``on_start`` -> ``index_page`` (game index) ->
        ``list_page`` (rooms of one game) -> ``detail_page`` (one room, whose
        extracted fields are returned as the task result).
        """

        # Browser-like request headers sent with every fetch.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36",
        }

        crawl_config = {
            "headers": headers,
            "timeout": 100,
        }

        # Compiled once; `\w+` requires at least one word character after the
        # path prefix, and the dots of the host name are escaped so they only
        # match a literal ".".
        GAME_URL_RE = re.compile(r"http://www\.zhanqi\.tv/games/\w+", re.U)
        ROOM_URL_RE = re.compile(r"http://www\.zhanqi\.tv/\w+", re.U)

        # Script handed to the JS fetcher: scroll to the bottom of the page.
        # The scroll is performed immediately; the earlier
        # `setTimeout(window.scrollTo(...), 5000)` form also scrolled right
        # away, because it passed the *result* of the call to setTimeout.
        SCROLL_TO_BOTTOM_JS = """
        function() {
            window.scrollTo(0, document.body.scrollHeight);
        }
        """

        @every(minutes=1)
        def on_start(self):
            """Seed the crawl with the game index page."""
            self.crawl('http://www.zhanqi.tv/games', callback=self.index_page)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            """Queue every per-game listing page linked from the game index."""
            for each in response.doc('a[href^="http://www.zhanqi.tv/games/"]').items():
                href = each.attr.href
                if href and self.GAME_URL_RE.match(href):
                    self.crawl(
                        href,
                        fetch_type='js',
                        js_script=self.SCROLL_TO_BOTTOM_JS,
                        callback=self.list_page,
                    )

        @config(age=1 * 60 * 60, priority=2)
        def list_page(self, response):
            """Queue every room page found in the hot list of a game page."""
            for each in response.doc('.active > div.live-list-tabc > ul#hotList.clearfix > li > a').items():
                href = each.attr.href
                # The selector does not require an href, so guard against None
                # before handing the value to the regex.
                if href and self.ROOM_URL_RE.match(href):
                    self.crawl(
                        href,
                        fetch_type='js',
                        js_script=self.SCROLL_TO_BOTTOM_JS,
                        callback=self.detail_page,
                    )

        @config(age=1 * 60 * 60, priority=2)
        def detail_page(self, response):
            """Extract the fields of one room page.

            Returns a dict with the page URL, author, title, game name, viewer
            count text, the inner HTML of the last ``.video-flash-cont``
            element (``None`` when the page has none) and the ``src`` of the
            ``.active > img`` element (empty string when absent).
            """
            # Start from None so a page without a player container yields a
            # result instead of failing on an unassigned variable.
            flash_html = None
            for each in response.doc('.video-flash-cont').items():
                # Keep the last match, as the original loop did.
                flash_html = each.html()

            return {
                "url": response.url,
                "author": response.doc('.meat > span').text(),
                "title": response.doc('.title-name').text(),
                "game-name": response.doc('span > .game-name').text(),
                "users2": response.doc('div.live-anchor-info.clearfix > div.sub-anchor-info > div.clearfix > div.meat-info > span.num.dv.js-onlines-panel > span.dv.js-onlines-txt > span').text(),
                "flash-cont": flash_html,
                # An <img> has no text content; the image location is in src.
                "picture": response.doc('.active > img').attr('src') or '',
            }
  • 相关阅读:
    嵌套网址如何防止根目录的web.config 覆盖子目录的web.config
    win7 C# 利用windows自带语音类库读书 spvoice,电脑端 读书摘自网络
    webapi 中 如何用一个实体model 接受前端的传参
    mysql 数据库 trigger 触发器的使用
    PS简单朦胧特效步骤
    windows WebStorm 破解教程(详细视频教程)
    windows PyCharm 破解教程(详细视频教程)
    WebFlux中使用WebClient的时候抓取流量包
    List比较工具 Ether.ListCompare
    Blazor技术开发了一个访客管理系统
  • 原文地址:https://www.cnblogs.com/panliu/p/4849217.html
Copyright © 2020-2023  润新知