最近一直在做数据采集的事情,目的是使用java开发一套采集引擎:解析指定的采集规则,模拟用户动作做数据提取。
因此定义了一套动作脚本,open,click,get,list,opentab,closetab。。。
java解析脚本,调用phantomjs做数据提取,生成数据json文件,对外提供数据接口。
采集引擎终于写的差不多了,虽然还有很多问题需要修改,但是终于不用加班了,嘿嘿嘿。-------jstarseven
码字挺累的,转载请注明出处:http://www.cnblogs.com/jstarseven/p/6278197.html
言归正传,由于一直搞这些东西,突然想着拿js去写个采集玩一玩,就用tampermonkey,毕竟好久没玩了。
简介:针对一些网站的数据列表,定义采集脚本,模拟用户操作,做列表数据提取,生成json数据格式化展示。
json采集脚本定义:
{
    "type": "list",
    "selector": "",//列表选择器
    "max_page": 1,//采集页数
    "page_selector": "",//翻页选择器
    "iframe_selector": "",//iframe 选择器
    "datas": [//采集字段定义
        {
            "selector": " ",//字段选择器<此处为针对列表的子选择器>
            "column": "title",//字段名称
            "from": "text",//采集类型
            "iframe_selector": "",//iframe选择器 防止一些网站怪异 一般不需要
            "open_tab": [//当前字段开新标签做采集
                {
                    "selector": " ",//新标签字段选择器
                    "column": " ",
                    "from": "text",
                    "iframe_selector": ""
                },
                {
                    "selector": " ",
                    "column": " ",
                    "from": "text",
                    "iframe_selector": ""
                },
                {
                    "selector": " ",
                    "column": " ",
                    "from": "text",
                    "iframe_selector": ""
                }
            ]
        },
        {
            "selector": " ",//字段选择器
            "column": " ",
            "from": "text",
            "iframe_selector": ""
        },
        {
            "selector": " ",//字段选择器
            "column": " ",
            "from": "text",
            "iframe_selector": ""
        }
    ]
}
脚本定义好了,剩下的就是写js代码解析脚本,做数据采集,数据合并了。
那么怎么去解析实现呢?针对新开标签页的数据采集,怎样才能和之前的列表项数据做合并,保证数据的完整性呢?
1.因为数据需要做存储,首先想到这么多数据该怎么存储呢,首先想到sessionStorage,但是sessionStorage在我新开标签页的时候数据不能共享,
那么就用localStorage,localStorage一般上限5MB左右,足以存储一般列表的十几页数据。
2.详情页面的数据和列表项数据合并,既然上面说到localStorage,那么就在localStorage里面放入一个指定的map,存放列表数据
针对列表的每一项生成一个key,然后在新开标签的时候传递key,提取详情页面的数据,并将详情页面数据放入map中指定key对应的数据中。
js实现map方便数据存储:
/*
 * Map: a small insertion-ordered key/value container backed by an array of
 * {key, value} entries. Note that this declaration shadows the built-in
 * ES2015 Map within its scope.
 *
 * Interface:
 * size()                 number of entries
 * isEmpty()              true when there are no entries
 * clear()                remove every entry
 * put(key, value)        add an entry, or overwrite the value of an existing key
 * remove(key)            delete the entry for key; returns true on success, false otherwise
 * get(key)               value stored for key, or null when the key is absent
 * element(index)         the {key, value} entry at index, or null when out of range
 * containsKey(key)       true when an entry with this key exists
 * containsValue(value)   true when an entry with this value exists
 * values()               array of all values, in insertion order
 * keys()                 array of all keys, in insertion order
 *
 * Keys and values are compared with loose equality (==), so 1 and "1" are
 * treated as the same key.
 */
function Map() {
    // Backing store. Kept as a public own property: it is the only thing
    // JSON.stringify emits for a Map instance (the methods are functions and
    // are skipped), and callers restore a map by assigning to it.
    this.elements = [];

    // Index of the first entry whose key loosely equals `key`, or -1.
    // Tolerates a missing/invalid backing array and null entries so that the
    // lookup methods report "not found" instead of throwing.
    function indexOfKey(elements, key) {
        if (!elements || typeof elements.length !== "number") {
            return -1;
        }
        for (var i = 0; i < elements.length; i++) {
            if (elements[i] && elements[i].key == key) {
                return i;
            }
        }
        return -1;
    }

    // Number of entries.
    this.size = function () {
        return this.elements.length;
    };

    // True when the map holds no entries.
    this.isEmpty = function () {
        return (this.elements.length < 1);
    };

    // Remove every entry.
    this.clear = function () {
        this.elements = [];
    };

    // Add (key, value); if the key already exists its value is overwritten.
    this.put = function (_key, _value) {
        var index = indexOfKey(this.elements, _key);
        if (index !== -1) {
            this.elements[index].value = _value;
            return;
        }
        this.elements.push({
            key: _key,
            value: _value
        });
    };

    // Delete the entry for _key. Returns true when an entry was removed.
    this.remove = function (_key) {
        var index = indexOfKey(this.elements, _key);
        if (index === -1) {
            return false;
        }
        this.elements.splice(index, 1);
        return true;
    };

    // Value stored for _key, or null when the key is absent
    // (previously a missing key fell through and returned undefined).
    this.get = function (_key) {
        var index = indexOfKey(this.elements, _key);
        return index === -1 ? null : this.elements[index].value;
    };

    // The {key, value} entry at _index, or null when _index is out of range.
    this.element = function (_index) {
        if (_index < 0 || _index >= this.elements.length) {
            return null;
        }
        return this.elements[_index];
    };

    // True when an entry with this key exists.
    this.containsKey = function (_key) {
        return indexOfKey(this.elements, _key) !== -1;
    };

    // True when an entry with this value exists (loose equality).
    this.containsValue = function (_value) {
        var elements = this.elements;
        if (!elements || typeof elements.length !== "number") {
            return false;
        }
        for (var i = 0; i < elements.length; i++) {
            if (elements[i] && elements[i].value == _value) {
                return true;
            }
        }
        return false;
    };

    // Array of all values, in insertion order.
    this.values = function () {
        var arr = [];
        for (var i = 0; i < this.elements.length; i++) {
            arr.push(this.elements[i].value);
        }
        return arr;
    };

    // Array of all keys, in insertion order.
    this.keys = function () {
        var arr = [];
        for (var i = 0; i < this.elements.length; i++) {
            arr.push(this.elements[i].key);
        }
        return arr;
    };
}
js实现操作localStorage:
/**
 * Load the task data map persisted in localStorage under "data_maps".
 *
 * isNullParam is defined outside this snippet; presumably it returns true
 * for null/undefined/empty strings (TODO confirm against its definition).
 *
 * @returns {Map} the stored map, or an empty map when nothing is stored or
 *                the stored payload cannot be used
 */
function getTaskDataMap() {
    var datas = new Map();
    var data_maps = localStorage.getItem("data_maps");
    if (isNullParam(data_maps)) {
        return datas;
    }
    try {
        var parsed = JSON.parse(data_maps);
        // Only adopt the payload when it has the expected shape; otherwise
        // the map would be left with `elements === undefined` and every
        // later put()/size() call would throw.
        if (parsed && Array.isArray(parsed.elements)) {
            datas.elements = parsed.elements;
        }
    } catch (e) {
        // Corrupted JSON: report it and fall back to an empty map instead of
        // aborting the whole extraction run.
        console.warn("data_maps in localStorage is not valid JSON, ignoring it", e);
    }
    return datas;
}

/**
 * Reset the persisted task data map.
 *
 * The key is overwritten with an empty string rather than removed;
 * getTaskDataMap() then yields an empty map, assuming isNullParam("") is
 * true (TODO confirm).
 */
function clearTaskDataMap() {
    localStorage.setItem("data_maps", "");
}

/**
 * Store one entry in the persisted task data map (read-modify-write of the
 * whole map). Does nothing when either argument is rejected by isNullParam.
 *
 * localStorage.setItem may throw when the storage quota is exceeded; that
 * error is deliberately left to propagate to the caller.
 *
 * @param key    key identifying the list item
 * @param values data collected for that list item
 */
function addTaskDataMap(key, values) {
    if (isNullParam(key) || isNullParam(values)) {
        return;
    }
    var data_maps = getTaskDataMap();
    data_maps.put(key, values);
    // Only the map's `elements` array survives serialization; its methods
    // are functions and are skipped by JSON.stringify.
    localStorage.setItem("data_maps", JSON.stringify(data_maps));
}
采用jquery.simulate.js实现点击
/*!
 * jQuery Simulate v@VERSION - simulate browser mouse and keyboard events
 * https://github.com/jquery/jquery-simulate
 *
 * Copyright jQuery Foundation and other contributors
 * Released under the MIT license.
 * http://jquery.org/license
 *
 * Date: @DATE
 */

;(function ($, undefined) {

    // Event-type classifiers used by createEvent().
    var rkeyEvent = /^key/,
        rmouseEvent = /^(?:mouse|contextmenu)|click/;

    /**
     * jQuery plugin entry point: simulate an event of the given type on
     * every element in the set.
     * @param type    event name, e.g. "click", "keydown", "drag", "focus"
     * @param options event properties that override the defaults
     * @returns the jQuery set, for chaining
     */
    $.fn.simulate = function (type, options) {
        return this.each(function () {
            new $.simulate(this, type, options);
        });
    };

    /**
     * Simulate one event on one element. Types with a dedicated
     * "simulate<Type>" method (focus, blur, drag) use it; every other type
     * is created and dispatched as a plain DOM event.
     */
    $.simulate = function (elem, type, options) {
        var method = $.camelCase("simulate-" + type);

        this.target = elem;
        this.options = options;

        if (this[method]) {
            this[method]();
        } else {
            this.simulateEvent(elem, type, options);
        }
    };

    $.extend($.simulate, {

        keyCode: {
            BACKSPACE: 8,
            COMMA: 188,
            DELETE: 46,
            DOWN: 40,
            END: 35,
            ENTER: 13,
            ESCAPE: 27,
            HOME: 36,
            LEFT: 37,
            NUMPAD_ADD: 107,
            NUMPAD_DECIMAL: 110,
            NUMPAD_DIVIDE: 111,
            NUMPAD_ENTER: 108,
            NUMPAD_MULTIPLY: 106,
            NUMPAD_SUBTRACT: 109,
            PAGE_DOWN: 34,
            PAGE_UP: 33,
            PERIOD: 190,
            RIGHT: 39,
            SPACE: 32,
            TAB: 9,
            UP: 38
        },

        buttonCode: {
            LEFT: 0,
            MIDDLE: 1,
            RIGHT: 2
        }
    });

    $.extend($.simulate.prototype, {

        // Create an event of `type` and dispatch it on `elem`.
        simulateEvent: function (elem, type, options) {
            var event = this.createEvent(type, options);
            this.dispatchEvent(elem, type, event, options);
        },

        // Build a key or mouse event; returns undefined for any other type.
        createEvent: function (type, options) {
            if (rkeyEvent.test(type)) {
                return this.keyEvent(type, options);
            }

            if (rmouseEvent.test(type)) {
                return this.mouseEvent(type, options);
            }
        },

        // Build a mouse event, filling unspecified properties with defaults.
        mouseEvent: function (type, options) {
            var event, eventDoc, doc, body;
            options = $.extend({
                bubbles: true,
                cancelable: (type !== "mousemove"),
                view: window,
                detail: 0,
                screenX: 0,
                screenY: 0,
                clientX: 1,
                clientY: 1,
                ctrlKey: false,
                altKey: false,
                shiftKey: false,
                metaKey: false,
                button: 0,
                relatedTarget: undefined
            }, options);

            if (document.createEvent) {
                event = document.createEvent("MouseEvents");
                event.initMouseEvent(type, options.bubbles, options.cancelable,
                    options.view, options.detail,
                    options.screenX, options.screenY, options.clientX, options.clientY,
                    options.ctrlKey, options.altKey, options.shiftKey, options.metaKey,
                    options.button, options.relatedTarget || document.body.parentNode);

                // IE 9+ creates events with pageX and pageY set to 0.
                // Trying to modify the properties throws an error,
                // so we define getters to return the correct values.
                if (event.pageX === 0 && event.pageY === 0 && Object.defineProperty) {
                    eventDoc = event.relatedTarget.ownerDocument || document;
                    doc = eventDoc.documentElement;
                    body = eventDoc.body;

                    Object.defineProperty(event, "pageX", {
                        get: function () {
                            return options.clientX +
                                ( doc && doc.scrollLeft || body && body.scrollLeft || 0 ) -
                                ( doc && doc.clientLeft || body && body.clientLeft || 0 );
                        }
                    });
                    Object.defineProperty(event, "pageY", {
                        get: function () {
                            return options.clientY +
                                ( doc && doc.scrollTop || body && body.scrollTop || 0 ) -
                                ( doc && doc.clientTop || body && body.clientTop || 0 );
                        }
                    });
                }
            } else if (document.createEventObject) {
                event = document.createEventObject();
                $.extend(event, options);
                // standards event.button uses constants defined here: http://msdn.microsoft.com/en-us/library/ie/ff974877(v=vs.85).aspx
                // old IE event.button uses constants defined here: http://msdn.microsoft.com/en-us/library/ie/ms533544(v=vs.85).aspx
                // so we actually need to map the standard back to oldIE
                event.button = {
                        0: 1,
                        1: 4,
                        2: 2
                    }[event.button] || ( event.button === -1 ? 0 : event.button );
            }

            return event;
        },

        // Build a keyboard event, filling unspecified properties with defaults.
        keyEvent: function (type, options) {
            var event;
            options = $.extend({
                bubbles: true,
                cancelable: true,
                view: window,
                ctrlKey: false,
                altKey: false,
                shiftKey: false,
                metaKey: false,
                keyCode: 0,
                charCode: undefined
            }, options);

            if (document.createEvent) {
                try {
                    event = document.createEvent("KeyEvents");
                    event.initKeyEvent(type, options.bubbles, options.cancelable, options.view,
                        options.ctrlKey, options.altKey, options.shiftKey, options.metaKey,
                        options.keyCode, options.charCode);
                    // initKeyEvent throws an exception in WebKit
                    // see: http://stackoverflow.com/questions/6406784/initkeyevent-keypress-only-works-in-firefox-need-a-cross-browser-solution
                    // and also https://bugs.webkit.org/show_bug.cgi?id=13368
                    // fall back to a generic event until we decide to implement initKeyboardEvent
                } catch (err) {
                    event = document.createEvent("Events");
                    event.initEvent(type, options.bubbles, options.cancelable);
                    $.extend(event, {
                        view: options.view,
                        ctrlKey: options.ctrlKey,
                        altKey: options.altKey,
                        shiftKey: options.shiftKey,
                        metaKey: options.metaKey,
                        keyCode: options.keyCode,
                        charCode: options.charCode
                    });
                }
            } else if (document.createEventObject) {
                event = document.createEventObject();
                $.extend(event, options);
            }

            // The character class must be [\w.] (word characters and dots,
            // i.e. the version string after "msie "). Without the backslash
            // it would only match literal "w" and "." and never detect IE.
            if (!!/msie [\w.]+/.exec(navigator.userAgent.toLowerCase()) || (({}).toString.call(window.opera) === "[object Opera]")) {
                event.keyCode = (options.charCode > 0) ? options.charCode : options.keyCode;
                event.charCode = undefined;
            }

            return event;
        },

        // Deliver the event through whichever dispatch API the element has.
        dispatchEvent: function (elem, type, event) {
            if (elem.dispatchEvent) {
                elem.dispatchEvent(event);
            } else if (type === "click" && elem.click && elem.nodeName.toLowerCase() === "input") {
                elem.click();
            } else if (elem.fireEvent) {
                elem.fireEvent("on" + type, event);
            }
        },

        // Focus the target; if no native focus event fired, trigger the
        // jQuery focusin/focus handlers manually.
        simulateFocus: function () {
            var focusinEvent,
                triggered = false,
                element = $(this.target);

            function trigger() {
                triggered = true;
            }

            element.bind("focus", trigger);
            element[0].focus();

            if (!triggered) {
                focusinEvent = $.Event("focusin");
                focusinEvent.preventDefault();
                element.trigger(focusinEvent);
                element.triggerHandler("focus");
            }
            element.unbind("focus", trigger);
        },

        // Blur the target; if no native blur event fired, trigger the
        // jQuery focusout/blur handlers manually.
        simulateBlur: function () {
            var focusoutEvent,
                triggered = false,
                element = $(this.target);

            function trigger() {
                triggered = true;
            }

            element.bind("blur", trigger);
            element[0].blur();

            // blur events are async in IE
            setTimeout(function () {
                // IE won't let the blur occur if the window is inactive
                if (element[0].ownerDocument.activeElement === element[0]) {
                    element[0].ownerDocument.body.focus();
                }

                // Firefox won't trigger events if the window is inactive
                // IE doesn't trigger events if we had to manually focus the body
                if (!triggered) {
                    focusoutEvent = $.Event("focusout");
                    focusoutEvent.preventDefault();
                    element.trigger(focusoutEvent);
                    element.triggerHandler("blur");
                }
                element.unbind("blur", trigger);
            }, 1);
        }
    });


    /** complex events **/

    // Viewport coordinates of the element's center.
    function findCenter(elem) {
        var offset,
            document = $(elem.ownerDocument);
        elem = $(elem);
        offset = elem.offset();

        return {
            x: offset.left + elem.outerWidth() / 2 - document.scrollLeft(),
            y: offset.top + elem.outerHeight() / 2 - document.scrollTop()
        };
    }

    // Viewport coordinates of the element's top-left corner.
    function findCorner(elem) {
        var offset,
            document = $(elem.ownerDocument);
        elem = $(elem);
        offset = elem.offset();

        return {
            x: offset.left - document.scrollLeft(),
            y: offset.top - document.scrollTop()
        };
    }

    $.extend($.simulate.prototype, {
        // Simulate a drag: mousedown on the target, `moves` mousemove events
        // on the document, then mouseup (and click, if the target is still
        // attached to the document).
        simulateDrag: function () {
            var i = 0,
                target = this.target,
                eventDoc = target.ownerDocument,
                options = this.options,
                center = options.handle === "corner" ? findCorner(target) : findCenter(target),
                x = Math.floor(center.x),
                y = Math.floor(center.y),
                coord = {clientX: x, clientY: y},
                dx = options.dx || ( options.x !== undefined ? options.x - x : 0 ),
                dy = options.dy || ( options.y !== undefined ? options.y - y : 0 ),
                moves = options.moves || 3;

            this.simulateEvent(target, "mousedown", coord);

            for (; i < moves; i++) {
                x += dx / moves;
                y += dy / moves;

                coord = {
                    clientX: Math.round(x),
                    clientY: Math.round(y)
                };

                this.simulateEvent(eventDoc, "mousemove", coord);
            }

            if ($.contains(eventDoc, target)) {
                this.simulateEvent(target, "mouseup", coord);
                this.simulateEvent(target, "click", coord);
            } else {
                this.simulateEvent(eventDoc, "mouseup", coord);
            }
        }
    });

})(jQuery);
格式化json数据,高亮显示
/**
 * Format JSON as HTML with syntax highlighting.
 *
 * Non-string input is pretty-printed with JSON.stringify (2-space indent);
 * string input is used as given. The text is HTML-escaped, then every JSON
 * token is wrapped in <span class="...">, where the class is one of
 * "key", "string", "number", "boolean" or "null".
 *
 * @param json a JSON string, or any value JSON.stringify can serialize
 * @returns {string} HTML markup; an empty string when the input cannot be
 *                   serialized (e.g. undefined or a function)
 */
function jsonSyntaxHighLight(json) {
    if (typeof json != 'string') {
        json = JSON.stringify(json, undefined, 2);
    }
    // JSON.stringify returns undefined for undefined/functions; without this
    // guard the .replace() below would throw.
    if (typeof json != 'string') {
        return '';
    }
    // Escape HTML metacharacters so values are displayed, not rendered.
    // '&' must be escaped first, otherwise the '&' of the entities produced
    // for '<' and '>' would be escaped a second time.
    json = json.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
    // Alternatives, in order: a string literal (optionally followed by a
    // colon, which makes it an object key) | true/false/null | a number.
    return json.replace(/("(\\u[a-zA-Z0-9]{4}|\\[^u]|[^\\"])*"(\s*:)?|(true|false|null)|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?)/g, function (match) {
        var cls = 'number';
        if (/^"/.test(match)) {
            if (/:$/.test(match)) {
                cls = 'key';
            } else {
                cls = 'string';
            }
        } else if (/true|false/.test(match)) {
            cls = 'boolean';
        } else if (/null/.test(match)) {
            cls = 'null';
        }
        return '<span class="' + cls + '">' + match + '</span>';
    });
}
操作:
(以懒财网公告为例,测试)目前已经测试懒财,cnblog。。。
1.首先安装tampermonkey插件,下载地址: http://tampermonkey.net/
2.新建脚本,复制web-extract-list.js 内容粘贴 ctrl+s
3.新建脚本,复制web-extract-detail.js 内容粘贴 ctrl+s
4.打开https://www.lancai.cn/about/notice.html 看执行效果
采集结束之后,json页面:
注意:根据采集的网站不同需要变更js文件里面的// @match 处匹配的url, 以及task_json的脚本配置信息
项目代码github地址:https://github.com/jstarseven/web-list-extract
码字挺累的,转载请注明出处:http://www.cnblogs.com/jstarseven/p/6278197.html
-END-