试想一下,如果每天要收集100页甚至更多的网页数据,人工收集会让人吐血,用程序去收集也就成了不二之选。首先肯定会想到用java、php、C#等高级语言,但偏偏又有登录和验证码,让人无所适从。还在为收集web端的数据感到苦恼吗?很高兴,你找对地方了。
应用场景:
1、需要每天大量重复收集web端的数据
2、web页面数据需要登录后才能采集
3、web页面存在翻页
解决方案:
手工登录,然后采用chrome插件的方式进行收集。当然你会说用selenium等自动化测试的方法进行收集更cool,而且可以每天自动收集,完全自动化,不用人工参与。但是作为chrome的忠实脑残粉,再者只需要前端的js、服务器端的接收文件、数据库就可以完美解决这一问题,再加上部署和操作简单。脑残粉总有很多理由的嘛。好吧,就算是一种憋屈的曲线救国的实现方式吧。
思路:
帮助手册:http://open.chrome.360.cn/extension_dev/overview.html
实例:
抓取某电商后台订单数据
1、创建一个项目文件夹并引入所需文件:如D:\tool\chrome_server_plugin
jquery-2.1.1.min.js、icon.png
2、创建background.html(可选:下面的manifest.json通过background.scripts加载后台脚本,并未引用该文件)
<html><head> </head></html>
3、创建配置文件manifest.json文件
{
  "name": "获取某电商后台订单信息",
  "version": "1.0",
  "manifest_version": 2,
  "description": "*********获取某电商后台订单信息*********",
  "browser_action": {
    "default_icon": "icon.png"
  },
  "permissions": [
    "webNavigation",
    "tabs",
    "contextMenus",
    "http://服务器接受数据url/"
  ],
  "background": {
    "scripts": ["jquery-2.1.1.min.js", "eventPage.js"]
  },
  "content_scripts": [
    {
      "matches": ["http://抓取页面url/*"],
      "js": ["jquery-2.1.1.min.js", "contentscript.js"]
    }
  ]
}
4、创建前端js文件contentscript.js
'use strict';

/**
 * contentscript.js
 *
 * Content script injected into the order-list page. When the background page
 * asks it to start, it scrapes every order shown on the current page, sends
 * the batch to the background page, and clicks "next page" while more pages
 * remain.
 */

// Separator placed between the fields of one serialised order.
var FIELD_SEPARATOR = "@_@";
// Number of `td.t-c` cells each order contributes; cells 1..3 hold
// status / buyer / remark (layout assumed by the selectors below).
var CELLS_PER_ORDER = 5;
// Hard stop so a broken pager can never make the scraper run forever.
var MAX_PAGES = 100;

var totalPage;
var page = 0;

// Listen for requests coming from the background page.
chrome.extension.onMessage.addListener(function (request, sender, sendResponse) {
  // Messages carrying a `cmd` are acknowledgements/commands, not start
  // requests. Scraping on them would upload the same page a second time.
  if (request && request.cmd) {
    return;
  }
  totalPage = $("input[name=totalPage]").val();
  console.log("totalPage----------" + totalPage);
  getOrderInfo(sendResponse);
});

/**
 * Removes every whitespace character (spaces, tabs, CR, LF) from `text`.
 *
 * @param {*} text Value to clean; null/undefined are treated as "".
 * @returns {string} The text without any whitespace.
 */
function stripWhitespace(text) {
  return String(text == null ? "" : text).replace(/\s+/g, "");
}

/**
 * Returns the part of `text` that follows `label`.
 *
 * @param {string} text Text to search.
 * @param {string} label Label such as '运费:'.
 * @returns {?string} Text after the label, or null when the label is absent.
 */
function textAfterLabel(text, label) {
  var at = text.indexOf(label);
  return at > -1 ? text.substr(at + label.length) : null;
}

/**
 * Maps the inline style of an order's "comment" link to a review state.
 *
 * @param {string|undefined} style Raw `style` attribute value.
 * @returns {string} '已评价' / '未评价', or the normalised style when it is
 *     neither of the two known values.
 */
function describeCommentState(style) {
  // Whitespace is stripped first so "display: block;" and "display:block;"
  // are recognised alike.
  var normalized = stripWhitespace(style);
  if (normalized === "display:block;") {
    return '已评价';
  }
  if (normalized === "display:none;") {
    return '未评价';
  }
  return normalized;
}

/**
 * Extracts the shop id from the remark icon's inline `onclick` handler,
 * i.e. whatever follows the first comma with `)` and `;` removed.
 *
 * @param {string|undefined} onclick Raw `onclick` attribute value.
 * @returns {string} The shop id, or "" when the attribute is missing.
 */
function extractShopId(onclick) {
  var handler = String(onclick == null ? "" : onclick);
  return handler.substr(handler.indexOf(",") + 1).replace(/[);]/g, "").trim();
}

/**
 * Scrapes all orders on the current page, forwards them to the background
 * page and moves on to the next page when there is one.
 *
 * @param {function} sendResponse Response callback of the triggering message
 *     (kept for interface compatibility; not used).
 */
function getOrderInfo(sendResponse) {
  var payMoney = [];   // payment amount per order
  var orderTime = [];  // order time per order
  $("tr[class=head] span").each(function () {
    var text = $(this).text();
    var money = textAfterLabel(text, '货款金额:');
    if (money !== null) {
      payMoney.push(money);
      return;
    }
    var time = textAfterLabel(text, '下单时间:');
    if (time !== null) {
      orderTime.push(time);
    }
  });

  var paytype = [];  // payment method per order
  var yunfei = [];   // shipping fee per order
  $("td[class=p-values]").each(function () {
    var text = $(this).text();
    paytype.push(text.indexOf('货到付款') > -1 ? '货到付款' : '在线支付');
    var fee = textAfterLabel(text, '运费:');
    yunfei.push(fee === null ? 0 : fee);
  });

  var orderStatus = [];  // order status
  var users = [];        // buyer account
  var remark = [];       // remark
  $("tr[class=content] td[class=t-c]").each(function (index) {
    var text = stripWhitespace($(this).text());
    switch (index % CELLS_PER_ORDER) {
      case 1:
        orderStatus.push(text);
        break;
      case 2:
        users.push(text);
        break;
      case 3:
        remark.push(text);
        break;
      default:
        break;
    }
  });

  var express = [];  // courier / tracking number
  $("tr[class=content] td div[style='text-align: center;']").each(function () {
    express.push(stripWhitespace($(this).text()));
  });

  var orderInfo = [];
  $("tr[class=head] a[track=orderinfopagebeta]").each(function (index) {
    // Trimmed because the id is interpolated into the selectors below.
    var orderid = $(this).text().trim();
    var mycomment = describeCommentState($("a[id=comment_" + orderid + "]").attr('style'));
    var shopid = extractShopId($("img[id=remarkFlag_" + orderid + "]").attr('onclick'));

    var orderdesc = [
      shopid,
      orderid,
      mycomment,
      payMoney[index],
      orderTime[index],
      paytype[index],
      yunfei[index],
      orderStatus[index],
      users[index],
      remark[index],
      express[index]
    ].join(FIELD_SEPARATOR);
    console.log("---------orderdesc-------" + orderdesc);
    orderInfo.push(orderdesc);
  });

  page = parseInt($("a[class=current]").text(), 10);
  totalPage = parseInt($("input[name=totalPage]").val(), 10);
  console.log(page + "--page-----------totalPage---" + totalPage);

  // The page is expected to render two pagers; the second link is used as
  // before, falling back to the first when only one exists.
  var nextLinks = $('a.next');
  var nextLink = nextLinks[1] || nextLinks[0];

  if (page < totalPage && page < MAX_PAGES && nextLink) {
    console.log("---------next-------");
    sendMsg(orderInfo, "next");
    nextLink.click();
  } else {
    // Also reached when the pager cannot be read (NaN) or has no next link,
    // so the background page always learns that the run is over.
    console.log("---------end-------");
    sendMsg(orderInfo, "end");
  }
}

/**
 * Hands the scraped orders to the background page.
 *
 * @param {string[]} msg Serialised orders of the current page.
 * @param {string} cmd "next" when more pages follow, "end" on the last page.
 */
function sendMsg(msg, cmd) {
  chrome.extension.sendMessage({"msg": msg, "cmd": cmd}, function (response) {});
}
5、创建后台处理js文件eventPage.js
'use strict';

/**
 * eventPage.js
 *
 * Background script. A click on the toolbar button starts a scraping run in
 * the clicked tab; every time that tab finishes loading a page the content
 * script is asked to scrape it; every batch received from the content script
 * is POSTed to the server.
 */

// Endpoint receiving the scraped orders (replace the placeholder host).
var UPLOAD_URL = "服务器接受数据URL/getOrderInfo.php";
// Separator placed between serialised orders in the POST body.
var RECORD_SEPARATOR = "#$#";

// True while a scraping run is in progress.
var flag = false;
// Tab in which the current run was started.
var currentTabId;

// Toolbar button: start a run in the tab that was clicked.
chrome.browserAction.onClicked.addListener(function (tab) {
  console.log('Turning ' + tab.url);
  flag = true;
  currentTabId = tab.id;
  sendMsg(tab.id);
});

// Page finished loading: scrape it if a run is active.
chrome.webNavigation.onCompleted.addListener(function (details) {
  console.log('加载完成***' + details.tabId);
  // The event fires for every frame of every tab. Only the top-level frame
  // (frameId 0) of the tab being scraped may trigger the content script,
  // otherwise pages with iframes get scraped several times.
  if (!flag || details.frameId !== 0 || details.tabId !== currentTabId) {
    return;
  }
  sendMsg(details.tabId);
});

// Batches coming from the content script: upload them to the server.
chrome.extension.onMessage.addListener(function (request, sender, sendResponse) {
  console.log("*******evenPage.js***chrome.extension.onMessage.addListener");

  if (request && request.cmd === 'end') {
    flag = false; // last page reached: stop reacting to page loads
  }

  var orders = request && Array.isArray(request.msg) ? request.msg : [];
  if (orders.length === 0) {
    return; // nothing scraped, nothing to upload
  }

  // Prefer the tab the batch came from; fall back to the run's tab.
  var replyTabId = sender && sender.tab ? sender.tab.id : currentTabId;

  $.ajax({
    url: UPLOAD_URL,
    cache: false,
    type: "POST",
    data: {'orderinfo': orders.join(RECORD_SEPARATOR)},
    dataType: "json"
  }).done(function (msg) {
    console.log('*******************json*************' + (msg ? msg.sql : msg));
    // Acknowledge the upload. The receiving content script must not treat
    // `cmd` messages as start requests.
    chrome.tabs.sendMessage(replyTabId, {"cmd": "end"}, function (response) {
      console.log(response);
    });
  }).fail(function (jqXHR, textStatus) {
    console.error('orderinfo upload failed: ' + textStatus);
  });
});

/**
 * Sends a SKU payload to the active tab of the current window.
 *
 * @param {*} colores Payload forwarded as `sku`.
 */
function sendSku2Info(colores) {
  chrome.tabs.query({active: true, currentWindow: true}, function (tabs) {
    if (!tabs || tabs.length === 0) {
      return; // no active tab to talk to
    }
    chrome.tabs.sendMessage(tabs[0].id, {"cmd": "ok", "sku": colores}, function (response) {
      console.log(response);
    });
  });
}

/**
 * Asks the content script in the given tab to scrape the current page.
 *
 * @param {number} tabid Id of the tab to message.
 */
function sendMsg(tabid) {
  console.log(tabid + "--sendMsg()----eventPage.js");
  chrome.tabs.sendMessage(tabid, {greeting: "start working"}, function (response) {});
}
6、创建服务器接收文件getOrderInfo.php(放在服务器哦,亲!)
<?php
/**
 * getOrderInfo.php
 *
 * Receives the scraped orders from the extension and upserts them into the
 * `test` table.
 *
 * Input:  POST field `orderinfo` - orders joined with '#$#', the 11 fields of
 *         each order joined with '@_@' in the column order used below.
 * Output: plain debug text (unchanged from the previous version).
 */
header("Content-type:text/html; charset=utf-8");

// Number of '@_@'-separated fields that make up one order.
$fieldCount = 11;

echo "***********************";
// mysqli replaces the mysql_* functions, which no longer exist in PHP 7+.
$con = mysqli_connect("localhost", "root", "root", "test");
echo "==============";
if (!$con) {
    die('Could not connect: ' . mysqli_connect_error());
}

$orderinfo = isset($_POST['orderinfo']) ? (string) $_POST['orderinfo'] : '';
// An empty post must yield no records; explode('') would yield one blank record.
$orderArr = ($orderinfo === '') ? array() : explode('#$#', $orderinfo);
// The response is served as HTML, so client-supplied text is escaped.
echo htmlspecialchars(print_r($orderArr, true), ENT_QUOTES, 'UTF-8');

// Values travel as bound parameters: the scraped text is untrusted and must
// never be concatenated into the SQL string.
$sql = "INSERT INTO test(venderId, orderid, pingjia, money, ordertime, paytype, yunfei, orderstatu, user, remark, express) "
     . "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) "
     . "ON DUPLICATE KEY UPDATE remark = VALUES(remark), pingjia = VALUES(pingjia), "
     . "orderstatu = VALUES(orderstatu), express = VALUES(express)";
$stmt = mysqli_prepare($con, $sql);
if (!$stmt) {
    die('Could not prepare statement: ' . mysqli_error($con));
}

// Parameters are bound by reference once; each iteration only reassigns them.
$venderId = $orderid = $pingjia = $money = $ordertime = $paytype = '';
$yunfei = $orderstatu = $user = $remark = $express = '';
mysqli_stmt_bind_param(
    $stmt,
    str_repeat('s', $fieldCount),
    $venderId, $orderid, $pingjia, $money, $ordertime, $paytype,
    $yunfei, $orderstatu, $user, $remark, $express
);

foreach ($orderArr as $myorder) {
    $value = explode('@_@', $myorder);
    if (count($value) < $fieldCount) {
        continue; // malformed record: skip it instead of inserting blanks
    }
    echo "===========" . htmlspecialchars($value[10], ENT_QUOTES, 'UTF-8') . "<br>";

    list($venderId, $orderid, $pingjia, $money, $ordertime, $paytype,
         $yunfei, $orderstatu, $user, $remark, $express) = $value;

    if (!mysqli_stmt_execute($stmt)) {
        // Report the failure and carry on with the remaining orders.
        echo "insert failed: " . htmlspecialchars(mysqli_stmt_error($stmt), ENT_QUOTES, 'UTF-8') . "<br>";
    }
}

mysqli_stmt_close($stmt);
mysqli_close($con);
?>
7、创建数据库表
-- Orders collected by the extension: one row per order.
-- The unique key on `orderid` is what the receiving script's
-- INSERT ... ON DUPLICATE KEY UPDATE relies on.
CREATE TABLE `test` (
  `id`         INT(10) UNSIGNED NOT NULL AUTO_INCREMENT COMMENT 'id',
  `venderId`   INT(10)          NOT NULL DEFAULT '0'    COMMENT '商家ID',
  `orderid`    BIGINT(20)       NOT NULL DEFAULT '0'    COMMENT '订单ID',
  `pingjia`    VARCHAR(100)     NOT NULL DEFAULT ''     COMMENT '订单发出后的状态(是否评价)',
  `money`      DECIMAL(10,2)    NOT NULL DEFAULT '0.00' COMMENT '订单金额',
  `ordertime`  VARCHAR(100)     NOT NULL DEFAULT ''     COMMENT '下单时间',
  `paytype`    VARCHAR(100)     NOT NULL DEFAULT ''     COMMENT '付款方式',
  `yunfei`     DECIMAL(10,2)    NOT NULL DEFAULT '0.00' COMMENT '运费',
  `orderstatu` VARCHAR(100)     NOT NULL DEFAULT ''     COMMENT '订单状态',
  `user`       VARCHAR(255)     NOT NULL DEFAULT ''     COMMENT '订单用户',
  `remark`     VARCHAR(255)     NOT NULL DEFAULT ''     COMMENT '备注',
  `express`    VARCHAR(255)     NOT NULL DEFAULT ''     COMMENT '物流方式和运单号',
  `shop_id`    INT(10) UNSIGNED NOT NULL DEFAULT '0'    COMMENT '店铺表ID',
  `shop_name`  VARCHAR(255)     NOT NULL DEFAULT ''     COMMENT '店铺名称',
  `stattime`   INT(11)          NOT NULL DEFAULT '0'    COMMENT '下单年月日',
  PRIMARY KEY (`id`),
  UNIQUE KEY `orderid` (`orderid`)
) ENGINE=MyISAM AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT=''
注意:各文件中的占位内容(原文以红色标出,如“服务器接受数据url”、“抓取页面url”)需要手工修改。此外,如果修改了js文件名,请同步修改manifest.json中相应的配置信息。
插件加载步骤:
示例代码:https://github.com/jackgitgz/chrome_server_plugin
此示例仅供参考,有理解不到位或错误的还请指出。