• C#使用phantomjs,爬取AJAX加载完成之后的页面


     1、开发思路:入参根据apiSetting配置文件,分配静态文件存储地址,可实现不同站点的静态页生成功能。静态页生成功能使用无头浏览器生成,生成之后的字符串进行正则替换为固定地址,实现本地正常访问。

    2、已发现问题:如果js在载入页面时进行某些重写dom操作,已用正则替换掉的动态路径代码,会被覆盖,导致本地访问无效。 这一点只能是站点开发那边重新对页面进行优化,从而避免这种情况。 但是这仅影响本地情况,如果静态页面部署到服务器,使用相对路径其实也不会影响。

    using Newtonsoft.Json;
    using Newtonsoft.Json.Linq;
    using System;
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Web;
    using System.Web.Mvc;
    
    namespace QuartZNetService.Controllers
    {
        public class BuildStaticController : Controller
        {
            /// <summary>
            /// Full path of the apiSetting.json configuration file, located in the application base directory.
            /// </summary>
            // Path.Combine is used so the result is valid whether or not BaseDirectory ends with a separator.
            public static string jsonUrl = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "apiSetting.json");
            
            /// <summary>
            /// HTTP request settings for a target site.
            /// </summary>
            public class HttpConfig
            {
                /// <summary>
                /// Creates a configuration populated with the default values.
                /// </summary>
                public HttpConfig()
                {
                    // Transport behaviour.
                    Timeout = 100000;
                    KeepAlive = true;
                    GZipCompress = false;

                    // Content negotiation.
                    CharacterSet = "UTF-8";
                    ContentType = "text/html; charset=" + Encoding.UTF8.WebName;
                    Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
                    AcceptEncoding = "gzip,deflate";

                    // Browser identity presented to the site.
                    UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36";
                }

                /// <summary>
                /// Cookie information of the site.
                /// </summary>
                public string Cookie { get; set; }

                /// <summary>
                /// Referer information of the page.
                /// </summary>
                public string Referer { get; set; }

                /// <summary>
                /// Content type; the default is text/html with the UTF-8 charset.
                /// </summary>
                public string ContentType { get; set; }

                /// <summary>
                /// Value of the Accept header.
                /// </summary>
                public string Accept { get; set; }

                /// <summary>
                /// Value of the Accept-Encoding header.
                /// </summary>
                public string AcceptEncoding { get; set; }

                /// <summary>
                /// Timeout in milliseconds; defaults to 100000.
                /// </summary>
                public int Timeout { get; set; }

                /// <summary>
                /// Value of the User-Agent header.
                /// </summary>
                public string UserAgent { get; set; }

                /// <summary>
                /// Whether the data of a POST request is gzip-compressed.
                /// </summary>
                public bool GZipCompress { get; set; }

                /// <summary>
                /// Keep-alive flag; defaults to true.
                /// </summary>
                public bool KeepAlive { get; set; }

                /// <summary>
                /// Character set name; defaults to "UTF-8".
                /// </summary>
                public string CharacterSet { get; set; }
            }
    
            /// <summary>
            /// Origin prepended to relative script sources so the saved page can load them when opened locally.
            /// Placeholder value; adjust together with <see cref="SiteDomainMarker"/> for the real site.
            /// </summary>
            private const string SiteOrigin = "https://www.xxx.cn";

            /// <summary>
            /// Script tags containing this text are treated as already pointing at the site and are left untouched.
            /// </summary>
            private const string SiteDomainMarker = "xxx.cn";

            /// <summary>
            /// Text codes.js prints when the page could not be opened.
            /// </summary>
            private const string PhantomFailureOutput = "Unable to post!";

            /// <summary>
            /// Matches a script tag with a double-quoted src attribute; group 1 is the src value.
            /// </summary>
            private static readonly Regex ScriptSrcRegex = new Regex("<script[^>]*?src=\"([^>]*?)\"[^>]*?>", RegexOptions.IgnoreCase);

            /// <summary>
            /// Renders a page with PhantomJS after its AJAX content has loaded, rewrites resource URLs
            /// to fixed absolute addresses and saves the resulting HTML as a static file.
            /// The script waits <paramref name="interval"/> milliseconds after the page opens so that
            /// slow AJAX requests have a chance to finish before the DOM is captured.
            /// </summary>
            /// <param name="url">Address of the page to render.</param>
            /// <param name="sitId">Site id; selects the output root from apiSetting.json.</param>
            /// <param name="typeId">Type id; selects the output folder from apiSetting.json. The folder is empty when nothing matches.</param>
            /// <param name="fileName">Name of the static file to write; directory components are ignored.</param>
            /// <param name="config">HTTP settings; UserAgent, Accept and Referer are passed to codes.js. Defaults are used when null.</param>
            /// <param name="interval">Delay in milliseconds passed to codes.js (default 3000).</param>
            /// <returns>JSON <c>true</c> on success; otherwise JSON containing the exception message.</returns>
            public JsonResult Do(string url, string sitId, string typeId, string fileName, HttpConfig config, int interval = 3000)
            {
                try
                {
                    if (string.IsNullOrWhiteSpace(url))
                    {
                        throw new ArgumentException("url is required.", "url");
                    }

                    // fileName comes from the request: keep only the file-name part so it cannot
                    // escape the configured output folder (e.g. "..\..\web.config").
                    string safeFileName = Path.GetFileName(fileName ?? string.Empty);
                    if (safeFileName.Length == 0)
                    {
                        throw new ArgumentException("fileName is required.", "fileName");
                    }

                    if (config == null)
                    {
                        config = new HttpConfig();
                    }

                    JObject location = (JObject)JsonConvert.DeserializeObject(Readjson(sitId, typeId));
                    string sitUrl = location["url"].ToString();
                    string folder = location["folder"].ToString();
                    if (sitUrl.Length == 0)
                    {
                        // Without a configured root the file would land relative to the process working directory.
                        throw new InvalidOperationException("No output location is configured for sitId '" + sitId + "'.");
                    }

                    string toolsDirectory = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "webTools");
                    ProcessStartInfo start = new ProcessStartInfo(Path.Combine(toolsDirectory, "phantomjs.exe"));
                    start.WorkingDirectory = toolsDirectory;

                    // Every argument is quoted: the user agent and accept values contain spaces, and the
                    // url is request input that must not be able to inject extra arguments.
                    start.Arguments = string.Join(" ", new[]
                    {
                        QuoteArgument(Path.Combine(toolsDirectory, "codes.js")),
                        QuoteArgument(url),
                        QuoteArgument(interval.ToString(System.Globalization.CultureInfo.InvariantCulture)),
                        QuoteArgument(config.UserAgent),
                        QuoteArgument(config.Accept),
                        QuoteArgument(config.Referer)
                    });
                    start.CreateNoWindow = true;            // do not show a console window
                    start.RedirectStandardOutput = true;    // the rendered HTML is read from stdout
                    start.RedirectStandardInput = true;
                    start.StandardOutputEncoding = Encoding.UTF8;
                    start.UseShellExecute = false;          // required for stream redirection

                    string html;
                    int exitCode;
                    using (Process phantom = Process.Start(start))
                    {
                        // Read to the end before waiting so a full output pipe cannot block the child process.
                        html = phantom.StandardOutput.ReadToEnd();
                        phantom.WaitForExit();
                        exitCode = phantom.ExitCode;
                    }

                    if (exitCode != 0 || string.Equals(html.Trim(), PhantomFailureOutput, StringComparison.Ordinal))
                    {
                        // Do not overwrite an existing static page with an error message.
                        throw new InvalidOperationException("phantomjs could not render '" + url + "' (exit code " + exitCode + ").");
                    }

                    // Custom replacement area - begin
                    StringBuilder page = new StringBuilder(RewriteScriptSources(html));
                    page.Replace("src=\"//", "src=\"https://");        // protocol-relative -> https
                    page.Replace("href=\"//", "href=\"https://");      // protocol-relative -> https
                    page.Replace("\"//images", "\"https://images");    // protocol-relative -> https
                    // Custom replacement area - end

                    // Encoding.UTF8 writes a BOM, the same as the StreamWriter overload used previously.
                    File.WriteAllText(Path.Combine(sitUrl, folder, safeFileName), page.ToString(), Encoding.UTF8);
                    return Json(true, JsonRequestBehavior.AllowGet);
                }
                catch (Exception ex)
                {
                    return Json(ex.Message, JsonRequestBehavior.AllowGet);
                }
            }

            /// <summary>
            /// Prepends <see cref="SiteOrigin"/> to the src of every script tag whose source is relative.
            /// Tags that already mention the site domain, and sources that are absolute
            /// (http://, https://) or protocol-relative (//), are returned unchanged.
            /// </summary>
            /// <param name="html">HTML produced by PhantomJS.</param>
            /// <returns>The HTML with relative script sources made absolute.</returns>
            private static string RewriteScriptSources(string html)
            {
                return ScriptSrcRegex.Replace(html, delegate(Match tag)
                {
                    Group src = tag.Groups[1];
                    bool alreadyOnSite = tag.Value.IndexOf(SiteDomainMarker, StringComparison.Ordinal) >= 0;
                    if (alreadyOnSite || IsAbsoluteUrl(src.Value))
                    {
                        return tag.Value;
                    }

                    // Insert at the captured position instead of searching for "src=\"" again:
                    // the regex is case-insensitive, a second case-sensitive search could miss.
                    return tag.Value.Insert(src.Index - tag.Index, SiteOrigin);
                });
            }

            /// <summary>
            /// Tells whether a resource address already carries a host (absolute or protocol-relative).
            /// </summary>
            private static bool IsAbsoluteUrl(string value)
            {
                return value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                    || value.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
                    || value.StartsWith("//", StringComparison.Ordinal);
            }

            /// <summary>
            /// Quotes one process argument using the Windows C runtime command-line rules:
            /// the value is wrapped in double quotes, embedded quotes are backslash-escaped and
            /// backslashes that precede a quote are doubled.
            /// </summary>
            /// <param name="value">Raw argument; null is passed as an empty argument.</param>
            /// <returns>The argument ready to be joined into <see cref="ProcessStartInfo.Arguments"/>.</returns>
            private static string QuoteArgument(string value)
            {
                if (string.IsNullOrEmpty(value))
                {
                    return "\"\"";
                }

                StringBuilder quoted = new StringBuilder("\"");
                int pendingBackslashes = 0;
                foreach (char c in value)
                {
                    if (c == '\\')
                    {
                        pendingBackslashes++;
                        continue;
                    }

                    if (c == '"')
                    {
                        // Backslashes before a quote must be doubled, then the quote itself escaped.
                        quoted.Append('\\', pendingBackslashes * 2 + 1);
                    }
                    else
                    {
                        quoted.Append('\\', pendingBackslashes);
                    }

                    quoted.Append(c);
                    pendingBackslashes = 0;
                }

                // Trailing backslashes precede the closing quote, so they are doubled as well.
                quoted.Append('\\', pendingBackslashes * 2);
                quoted.Append('"');
                return quoted.ToString();
            }
    
            /// <summary>
            /// Reads the configuration file and resolves the output location for a site/type pair.
            /// </summary>
            /// <param name="sitId">Site id compared with "sitId" of each entry under "sit".</param>
            /// <param name="typeId">Type id compared with "typeId" of each entry under "type".</param>
            /// <returns>
            /// A serialized JSON object with the properties "url" and "folder";
            /// a property is an empty string when no entry matches.
            /// </returns>
            public static string Readjson(string sitId, string typeId)
            {
                JObject settings;
                using (StreamReader configFile = File.OpenText(jsonUrl))
                using (JsonTextReader jsonReader = new JsonTextReader(configFile))
                {
                    // The whole document is materialized here, so the lookups below no longer need the file.
                    settings = (JObject)JToken.ReadFrom(jsonReader);
                }

                // Site path.
                string siteRoot = LastMatchingValue(settings["sit"], "sitId", sitId, "sitUrl");
                // Folder name; may stay empty.
                string outputFolder = LastMatchingValue(settings["type"], "typeId", typeId, "folder");

                return JsonConvert.SerializeObject(new
                {
                    url = siteRoot,
                    folder = outputFolder
                });
            }

            /// <summary>
            /// Scans every entry and returns the value stored under <paramref name="valueKey"/> of the last
            /// entry whose <paramref name="idKey"/> equals <paramref name="id"/>; empty string when none matches.
            /// </summary>
            private static string LastMatchingValue(JToken entries, string idKey, string id, string valueKey)
            {
                string found = "";
                foreach (JObject entry in entries)
                {
                    if (entry[idKey].ToString() != id)
                    {
                        continue;
                    }

                    found = entry[valueKey].ToString();
                }

                return found;
            }
        }
    }

    codes.js 配置 

    // PhantomJS script: opens a page, waits for its AJAX content and prints the rendered HTML to stdout.
    // Usage: phantomjs codes.js <url> <interval-ms> <user-agent> <accept> <referer>
    var page = require('webpage').create(), system = require('system');
    var url = system.args[1];

    // system.args entries are strings; convert so the delay is numeric and fall back to 3000 ms.
    var interval = parseInt(system.args[2], 10);
    if (isNaN(interval) || interval < 0) {
        interval = 3000;
    }

    var headers = {
        "Accept-Language": "zh-CN,en;q=0.7,en-US;q=0.3",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache"
    };
    // Optional arguments: only send the headers that were actually supplied.
    if (system.args[3]) {
        // The user agent is a page setting in PhantomJS, applied to every request of the page.
        page.settings.userAgent = system.args[3];
    }
    if (system.args[4]) {
        headers["Accept"] = system.args[4];
    }
    if (system.args[5]) {
        headers["Referer"] = system.args[5];
    }

    // Request options for page.open. They must be passed to open() itself: assigning them to
    // page.settings (which holds options such as userAgent) does not send the headers.
    var settings = {
        encoding: "UTF-8",
        operation: "GET",
        headers: headers
    };

    phantom.outputEncoding = "UTF-8";

    if (!url) {
        console.log('Unable to post!');
        phantom.exit(1);
    } else {
        page.open(url, settings, function (status) {
            if (status !== 'success') {
                console.log('Unable to post!');
                // Non-zero exit code lets the caller tell a failed render from a real page.
                phantom.exit(1);
            } else {
                // Give AJAX requests time to finish before capturing the DOM.
                setTimeout(function () {
                    console.log(page.content);
                    phantom.exit();
                }, interval);
            }
        });
    }

    apiSetting.json 配置

    {
        "sit": [
            {
                "sitId": "1",
                "sitUrl": "D://"
            },
            {
                "sitId": "60",
                "sitUrl": "D://"
            }
        ],
        "type": [
        {
            "typeId": "1",
            "folder": "zmPC"
        },
        {
            "typeId": "60",
            "folder": "zmCP"    
        }
    ]
    }
  • 相关阅读:
    【python3.8】斐波拉契数列实现
    【Java开发基础】计算两个毫秒之间相差多少天
    FileZilla关闭更新检测
    【Java开发基础】生成两个正数之间的随机数
    Thinkphp6笔记十九:加载自定义配置
    linux系统安装坚果云
    vim NERDTree 目录插件常见用法
    vim 插件管理
    vim 窗口分割 以及 tab常用操作
    vim Ntree 树形目录常见用法
  • 原文地址:https://www.cnblogs.com/souphm/p/11245262.html
Copyright © 2020-2023  润新知