最近为了写论文,需要大批量收集慕课网的用户数据(用户个人主页是公开的),故而写了一段脚本进行收集。使用时需先在浏览器中打开慕课网页面,再将脚本粘贴到该页面的浏览器控制台中运行。最后共收集了3000多份数据。
/*
 * Collected items and the criterion used for each of them:
 *
 * User ID
 *   imooc user IDs are increasing seven-digit numbers below 5000000, so the
 *   collection samples random IDs, segment by segment, between 1000000 and
 *   5000000.
 * Learning time
 *   imooc records the total learning time of a user (hours and minutes).
 *   To simplify later statistics it is converted to minutes.
 * Credit
 *   Reflects how much the user takes part in the site's social features.
 * Experience
 *   Reflects course completion more directly than the learning time does.
 * Number of courses studied
 *   To skip courses that were joined but never studied, only courses whose
 *   progress is above 5% are counted.
 * Average completion
 *   Average completion of the counted courses.
 * Number of completed courses
 *   Courses whose completion is 80% or above.
 * Number of learning paths joined
 *   How many imooc learning paths the user takes part in.
 * Path completion
 *   How far the learning paths have been completed.
 */

// Collects the statistics of one user.
//   user - window object (as returned by window.open) showing the user's course page
//   link - URL of that course page; the learning-path page URL is derived from it
//   i    - user ID, stored as `name` and printed first in the result line
// The values are gathered asynchronously (extra pages are opened and read a few
// seconds later); after 10 seconds the result line is logged and `user` is closed.
var userInfo = function (user, link, i) {
    var self = this;
    this.name = i;

    this.getBasicInfo(user);
    this.getCourseNum(user);
    this.getRoute(user, link);

    setTimeout(function () {
        console.log(self.name + " " + self.learnTime + " " + self.credit + " " + self.mp + " " +
            self.CourseNum + " " + self.learnAve + " " + self.finish + " " +
            self.routeNum + " " + self.routeAve);
        user.close();
    }, 10000);
};

userInfo.prototype = {
    // Convenience getter: innerHTML of the first <tag> inside the first element
    // carrying `className` in the document of window `user`.
    // Throws if either element is missing.
    getElem: function (user, className, tag) {
        return user.document.getElementsByClassName(className)[0]
            .getElementsByTagName(tag)[0].innerHTML;
    },

    // Converts a learning-time text to minutes.
    // A number followed by two non-word characters is read as hours
    // (presumably the unit "小时" - confirm against the live page); a number
    // not followed by two non-word characters is read as minutes.
    // A missing part counts as 0; returns NaN when no number is found at all.
    parseLearnTime: function (text) {
        var hourMatch = /\d+(?=\W{2})/.exec(text);
        // `(?!\d|...)` keeps the engine from backtracking into the middle of
        // the hour digits (otherwise "110小时34分" would yield "11" as minutes).
        var minuteMatch = /\d+(?!\d|\W{2})/.exec(text);
        if (!hourMatch && !minuteMatch) {
            return NaN;
        }
        var hours = hourMatch ? parseInt(hourMatch[0], 10) : 0;
        var minutes = minuteMatch ? parseInt(minuteMatch[0], 10) : 0;
        return hours * 60 + minutes;
    },

    // Reads the basic figures from the user window `d` and stores them on this
    // object: learnTime (minutes), credit and mp (experience).
    getBasicInfo: function (d) {
        this.learnTime = this.parseLearnTime(this.getElem(d, "u-info-learn", "em"));
        this.credit = parseFloat(this.getElem(d, "u-info-credit", "em"));
        this.mp = parseFloat(this.getElem(d, "u-info-mp", "em"));
    },

    // Counts the user's courses and their completion, across all result pages.
    // Sets CourseNum (courses above 5% progress), learnAve (their average
    // progress, string with two decimals; left unset while no course qualifies)
    // and finish (courses at 80% or above). Further pages are opened in new
    // windows and read after a fixed delay, so the values keep growing for a
    // few seconds after this method returns.
    getCourseNum: function (d) {
        var self = this;
        var user = d;
        var CourseNum = 0;
        var learnSum = 0;
        var finish = 0;

        // Adds the courses listed in window `obj` to the running totals and
        // publishes the totals on `self`. Does nothing if no course is listed.
        function getOnepage(obj) {
            var course = obj.document.getElementsByClassName("course-one");
            if (course.length === 0) {
                return;
            }
            for (var k = 0; k < course.length; k++) {
                var text = course[k].getElementsByClassName("i-left")[0].innerHTML;
                // Strip pairs of non-word characters so the number comes first
                // and parseFloat can read it.
                var hasLearn = parseFloat(text.replace(/\W\W/g, ""));
                if (hasLearn > 5) {
                    CourseNum = CourseNum + 1;
                    learnSum = learnSum + hasLearn;
                    // "Completed" means 80% and above, per the criteria above.
                    if (hasLearn >= 80) {
                        finish = finish + 1;
                    }
                }
            }
            self.CourseNum = CourseNum;
            if (CourseNum !== 0) {
                self.learnAve = (learnSum / CourseNum).toFixed(2);
            }
            self.finish = finish;
        }

        // Opens `url` in a new window and reads it after `delay` ms. The window
        // is closed in every case so that pages which failed to load do not
        // pile up; a page that is still empty after the delay is only logged.
        function collectLater(url, label, delay) {
            var page = window.open(url);
            setTimeout(function () {
                if (page && page.document.getElementsByClassName("course-one").length > 0) {
                    getOnepage(page);
                } else {
                    console.log("page" + label + "didn't load!");
                }
                if (page) {
                    page.close();
                }
            }, delay);
        }

        // Current page.
        getOnepage(user);

        // Pages that have a direct link in the pager; index 0 is skipped.
        var pages = user.document.getElementsByClassName("text-page-tag");
        for (var p = 1; p < pages.length; p++) {
            collectLater(pages[p].getAttribute("href"), p, 5000);
        }

        // Remaining pages (8 .. last): their URLs are built from the link of the
        // pager's last child, which is expected to end in "=<page number>".
        var pagers = user.document.getElementsByClassName("page");
        if (pagers.length > 0) {
            var lastLink = pagers[0].lastElementChild || pagers[0].lastChild;
            var lastPage = lastLink && typeof lastLink.getAttribute === "function"
                ? lastLink.getAttribute("href")
                : null;
            // `\d+$` reads the whole page number, not only its last digit.
            var pageNumber = lastPage ? /\d+$/.exec(lastPage) : null;
            var prefix = lastPage ? /^.*=(?=\d)/.exec(lastPage) : null;
            if (pageNumber && prefix) {
                var allPage = parseInt(pageNumber[0], 10);
                for (var q = 8; q <= allPage; q++) {
                    // Make sure the network is good enough for pages to load
                    // within 4 s, otherwise their values are lost.
                    collectLater(prefix[0] + q, q, 4000);
                }
            }
        }
    },

    // Reads the learning paths the user joined.
    //   d    - user window (unused, kept for interface compatibility)
    //   link - URL of the course page; "courses" is replaced by "plans"
    // Sets routeNum (number of paths) and routeAve (average progress of the
    // paths whose progress is above 2; left unset if there is none). The page is
    // opened in a new window, read after 5 seconds and then closed.
    getRoute: function (d, link) {
        var self = this;
        var openPage = window.open(link.replace("courses", "plans"));

        setTimeout(function () {
            if (!openPage) {
                return;
            }
            var route = openPage.document.getElementsByClassName("plans-item");
            if (route.length > 0) {
                var routeLearn = openPage.document.getElementsByClassName("plans-list-progress");
                var routeSum = 0;
                var learnSum = 0;
                for (var k = 0; k < route.length; k++) {
                    // A path without a progress element is skipped instead of
                    // throwing, which would leave the window open.
                    if (!routeLearn[k]) {
                        continue;
                    }
                    var hasLearn = parseFloat(routeLearn[k].innerHTML.replace(/\W+/, ""));
                    if (hasLearn > 2) {
                        routeSum = routeSum + hasLearn;
                        learnSum += 1;
                    }
                }
                if (learnSum > 0) {
                    self.routeAve = routeSum / learnSum;
                }
                self.routeNum = route.length;
            }
            openPage.close();
        }, 5000);
    }
};
// Open the user pages one after another.
// Random starting ID between 4000000 and 5000000.
var n = Math.round(Math.random() * 1000000 + 4000000);
var min = n;

// Opens the course page of user `n`, collects its data after 4 seconds if the
// user exists, then moves on to the next ID. Stops after 1000 consecutive IDs
// starting at `min`.
function openLink() {
    if (n >= min + 1000) {
        return;
    }
    var id = n;
    var link = "http://www.imooc.com/u/" + id + "/courses";
    var user = window.open(link);

    setTimeout(function () {
        if (user && user.document.getElementsByClassName("user-name").length > 0) {
            // userInfo closes the window itself once it has logged the result.
            new userInfo(user, link, id);
        } else {
            console.log(id + " page didn't exit!");
            // Nothing will read this window later, so close it here; otherwise
            // one window per missing user stays open.
            if (user) {
                user.close();
            }
        }
    }, 4000);

    setTimeout(function () {
        n = n + 1;
        openLink();
    }, 4000);
}

// The crawl needs window.open, so it only starts in a browser context.
if (typeof window !== "undefined") {
    openLink();
}