• 油猴脚本爬虫


    脚本内容

    // ==UserScript==
    // @name         大众点评评论爬虫
    // @namespace    http://tampermonkey.net/
    // @version      0.1
    // @description  crawl is great
    // @author       陈祥安
    // @include      http://www.dianping.com/shop*
    // @match        http://www.dianping.com/ajax/json/shopDynamic/allReview*
    // @require      http://cdn.bootcss.com/jquery/1.11.2/jquery.js
    // @grant        GM_xmlhttpRequest
    
    
    // ==/UserScript==
    
    (function() {
        /**
         * Evaluate an XPath expression and return the matching nodes as an array.
         *
         * An ordered snapshot is requested, so the nodes come back in document
         * order and the result does not depend on a live iterator.
         *
         * @param {string} xpath - XPath expression that selects a node set.
         * @param {Node} [context] - Node to evaluate against; defaults to the page document.
         * @returns {Node[]} Matching nodes in document order (empty when nothing matches).
         * @throws Whatever `document.evaluate` throws (e.g. for an invalid expression);
         *         errors are not intercepted.
         */
        const $x = (xpath, context) => {
            // A Document has no ownerDocument, so fall back to the page document.
            const doc = (context && context.ownerDocument) || window.document;
            const snapshot = doc.evaluate(
                xpath,
                context || doc,
                null,
                XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
                null
            );
            return Array.from(
                { length: snapshot.snapshotLength },
                (_, i) => snapshot.snapshotItem(i)
            );
        };
    
        var server_url = 'http://127.0.0.1:9090/comment/'
    
        /**
         * Close the bonus pop-up if it is showing, collect the text of every
         * comment description on the page and POST the list to the local
         * collector service as JSON.
         */
        const collectAndUpload = () => {
            // Close the pop-up. A jQuery object is always truthy, so test
            // .length to find out whether the button actually exists.
            const closeBtn = $(".J-bonus-close");
            console.log("准备关闭", closeBtn);
            if (closeBtn.length > 0) {
                closeBtn.click();
            }

            const descNodes = $x("//ul[@class='comment-list J-list']/li[@class='comment-item']/div[@class='content']//p[@class='desc']");
            descNodes.forEach((node) => console.log(node));
            const dataList = descNodes.map((node) => ({ "data": node.innerText }));

            GM_xmlhttpRequest({
                method: "POST",
                url: server_url,
                // Declare the payload type explicitly instead of relying on the default.
                headers: { "Content-Type": "application/json" },
                data: JSON.stringify({ 'name': "爬虫", "dataList": dataList }),
                onload: function (response) {
                    // Response handling goes here.
                    console.log(response);
                    console.log("dataList", dataList);
                },
                onerror: function (response) {
                    // Without this handler a refused or failed request is silent.
                    console.error("comment upload failed", response);
                }
            });
        };

        // The script may be injected after the page has finished loading, in
        // which case 'load' never fires again; run immediately in that case.
        if (document.readyState === 'complete') {
            collectAndUpload();
        } else {
            window.addEventListener('load', collectAndUpload);
        }
    })();
    

    python代码

    # @Author : cxa
    # @File : server.py
    # @Software: PyCharm
    import json
    
    from flask import Flask, jsonify, request, render_template
    
    app = Flask(__name__)
    
    
    @app.route('/')
    def index():
        """Serve the landing page: a single heading naming the API."""
        banner = "<h1>大众点评API</h1>"
        return banner
    
    
    @app.route('/comment/', methods=['GET', 'POST'])
    def login():
        """Receive a JSON payload on /comment/, print it and echo it back.

        POST: the raw request body is decoded as UTF-8, parsed as JSON, printed
        to the console and returned as the JSON response.
        GET: returns a short usage hint. Previously this path raised
        UnboundLocalError because ``result`` was only assigned for POST.

        Returns:
            A JSON response; status 400 when the POST body is not UTF-8 JSON.
        """
        if request.method != 'POST':
            return jsonify({"message": "POST a JSON body to this endpoint"})
        try:
            result = json.loads(request.get_data().decode("utf-8"))
        except ValueError:
            # UnicodeDecodeError and json.JSONDecodeError both subclass ValueError.
            return jsonify({"error": "request body must be UTF-8 encoded JSON"}), 400
        print(result)
        return jsonify(result)
    
    
    @app.errorhandler(404)
    def miss(e):
        """Handle 404 errors by rendering the '404.html' template.

        NOTE(review): the template itself is not shown here; confirm it exists.
        """
        page = render_template('404.html')
        return page, 404
    
    
    @app.errorhandler(500)
    def error(e):
        """Handle 500 errors by rendering the '500.html' template.

        NOTE(review): the template itself is not shown here; confirm it exists.
        """
        page = render_template('500.html')
        return page, 500
    
    
    if __name__ == '__main__':
        # Bind to loopback only: debug=True enables the Werkzeug interactive
        # debugger, which allows arbitrary code execution, so it must not be
        # reachable from other hosts (host='0.0.0.0' exposed it on every
        # interface). The userscript posts to 127.0.0.1:9090, so local access
        # is sufficient.
        app.run(host='127.0.0.1', port=9090, debug=True)
    
    
  • 相关阅读:
    我回来了
    wget 官方jdk
    linux rpm命令安装卸载 初步使用
    关于一些对location认识的误区(转)
    直接插入排序
    冒泡排序
    Wireshark下TCP三次握手四次挥手
    linux内存使用率详解
    Linux下硬盘使用率详解及shell脚本实现
    Linux下CPU使用率详解
  • 原文地址:https://www.cnblogs.com/c-x-a/p/12016854.html
Copyright © 2020-2023  润新知