• 通过 PhantomJS 抓取仁医在线的练习题


    先模拟登录,再按指定课程逐个抓取题目。目前还不够完善,抓取结果中会出现重复题目。

    // The single PhantomJS page object used for every request in this script.
    var page = require('webpage').create();

    // Identify the client with a minimal user-agent string.
    page.settings.userAgent = 'chrome';

    // Relay anything the loaded page writes to its console onto this script's console.
    page.onConsoleMessage = function(message) {
        console.log(message);
    };

    // Emit console output in GBK encoding.
    phantom.outputEncoding = 'gbk';
    
    /**
     * Extracts the part of targetString that lies between two delimiters.
     *
     * - Falsy targetString, or both delimiters falsy: targetString is returned as-is.
     * - Only endString given: text before its first occurrence ('' if absent).
     * - Only beginString given: text after its first occurrence ('' if absent).
     * - Both given: text between the first beginString and the first endString
     *   that follows it (null if either one is not found).
     *
     * @param {string} targetString - Text to search in.
     * @param {string} beginString - Left delimiter; falsy means "from the start".
     * @param {string} endString - Right delimiter; falsy means "to the end".
     * @returns {?string} The extracted text, '' or null as described above.
     */
    function getBetween(targetString, beginString, endString) {
        if (!targetString || (!beginString && !endString)) {
            return targetString;
        }

        // Offset of the first character after the left delimiter (0 when there is none).
        var contentStart = 0;

        if (beginString) {
            var beginAt = targetString.indexOf(beginString);
            if (beginAt < 0) {
                // A miss yields null in two-delimiter mode, '' in one-delimiter mode.
                return endString ? null : '';
            }
            contentStart = beginAt + beginString.length;
            if (!endString) {
                return targetString.substring(contentStart);
            }
        }

        var endAt = targetString.indexOf(endString, contentStart);
        if (endAt < 0) {
            return beginString ? null : '';
        }
        return targetString.substring(contentStart, endAt);
    }
    
    var fs = require('fs');

    // Flat list of pairs: the even slot is sent as the q_id request parameter,
    // the odd slot is how many questions to capture for that q_id.
    var entryList = [
        2684, 230,
        2685, 145,
        2686, 235,
        2687, 237,
        2688, 224,
        2689, 117,
        2690, 120,
        2691, 79,
        2692, 80,
        2693, 40,
        2694, 70,
        2695, 80,
        2696, 40,
        2697, 38,
        2698, 90
    ];

    // Sent as the SelQuesetions request parameter on every practice request.
    var sId = 2683;
    
    /*
     * Main flow: open the login page, submit the login form, then walk through
     * every (q_id, count) pair in entryList, loading one practice question per
     * request and appending it to a text file named after the question's category.
     * Any unrecoverable failure logs what is known and terminates PhantomJS.
     */
    page.open('http://www.renyiwang.net/Mobile/Login.aspx', function(status) {
        if (status !== 'success') {
            console.log('Unable to access Login Page, status is ' + status + '!');
            phantom.exit();
            return;
        }
        console.log('success to open Login Page, status is ' + status + '!');

        // Fill in and submit the login form inside the page context.
        page.evaluate(function() {
            document.getElementById('TstNumber').value = "用户名";
            document.getElementById('TstPassword').value = "密码";
            document.getElementById('But_Login').click();
        });

        /**
         * Loads one practice question for the current entry, records it, and
         * schedules the next request.
         *
         * @param {number} entryIndex - Index of the (q_id, count) pair in entryList.
         * @param {number} qIndex - Number of questions already captured for this entry.
         */
        var captureQuestion = function(entryIndex, qIndex) {
            qIndex++;
            page.open('http://www.renyiwang.net/Mobile/Practice.aspx?o_id=6&SelQuesetions='+sId+'&q_id=' + entryList[entryIndex * 2] + '&class=0', function(status) {
                if (status !== 'success') {
                    console.log('Unable to access Practice Page, status is ' + status + '!');
                    phantom.exit();
                    return;
                }

                var pageHtml = page.evaluate(function() {
                    return document.body.innerHTML;
                });
                // Compare against -1 so that a match at index 0 still counts as found.
                if (!pageHtml || pageHtml.indexOf('Rad_T_A_Id') === -1) {
                    console.log(pageHtml);
                    console.log('fail to open pratice page!');
                    phantom.exit();
                    return;
                }

                var answerId = getBetween(pageHtml, 'id="Hid_Answer" value="', '"');
                var questionTitle = getBetween(pageHtml, 'id="Hid_Choose" value="0">', '</div>');
                if (questionTitle) {
                    questionTitle = questionTitle.trim();
                } else {
                    // Dump the page for diagnosis but keep going, so one odd page
                    // does not abort the whole run.
                    console.log("ERROR: " + pageHtml);
                }

                var anserCode = '';
                var optionList = [];
                // String.prototype.match returns null when nothing matches; fall
                // back to an empty list instead of throwing on `.length`.
                var optionInfoList = pageHtml.match(new RegExp('Rad_T_A_Id_[\\d]+', 'g')) || [];
                // Step by 2: presumably every option id occurs twice in the markup
                // (e.g. on the input and on its label) — verify against the page.
                for (var i = 0; i < optionInfoList.length; i += 2) {
                    var optionId = optionInfoList[i].replace('Rad_T_A_Id_', '');
                    var optionTitle = getBetween(pageHtml, optionInfoList[i] + '">', '</label>');
                    if (optionTitle) {
                        optionList.push(optionTitle);
                        if (answerId === optionId) {
                            // The answer code is the first character of the option text.
                            anserCode = optionTitle.charAt(0);
                        }
                    }
                }

                var info = '第' + qIndex + '题:' + questionTitle + '\n' + optionList.join('\n') + '\n' + '答案:' + anserCode + '\n\n';
                console.log(info);

                var categoryName = getBetween(pageHtml, '<span style="font-weight:bold;color:#808080;">', '</span>');

                // Append to the per-category file using the file-level fs module.
                fs.write('d:\\' + categoryName + '.txt', info, 'a');

                var maxCount = entryList[entryIndex * 2 + 1];
                if (qIndex < maxCount) {
                    // More questions remain for this entry; fetch the next one shortly.
                    setTimeout(function() {
                        captureQuestion(entryIndex, qIndex);
                    }, 1000);
                    return;
                }

                // This entry is complete; move on to the next pair.
                entryIndex++;
                if (entryIndex * 2 >= entryList.length) {
                    console.log('finished!');
                    phantom.exit();
                    return;
                }

                qIndex = 0;
                // Issue the PracticeClear request for the next entry before capturing it.
                page.open('http://www.renyiwang.net/Mobile/PracticeClear.aspx?o_id=6&SelQuesetions='+sId+'&q_id=' + entryList[entryIndex * 2], 'post', {}, function(clearStatus) {
                    if (clearStatus !== 'success') {
                        // Best effort: report the failure but still continue.
                        console.log('PracticeClear request failed, status is ' + clearStatus + '!');
                    }
                    console.log('PracticeClear ' + categoryName);
                    setTimeout(function() {
                        captureQuestion(entryIndex, qIndex);
                    }, 5000);
                });
            });
        };

        // Wait 5 seconds after submitting the form before checking the result.
        setTimeout(function() {
            var pageHtml = page.evaluate(function() {
                return document.body.innerHTML;
            });
            // The marker text is taken as the sign of a successful login.
            if (!pageHtml || pageHtml.indexOf("三基培训") === -1) {
                console.log(pageHtml);
                phantom.exit();
                return;
            }
            console.log('success to login');
            captureQuestion(0, 0);
        }, 5000);
    });
  • 相关阅读:
    苹果CMS
    rel=nofollow 是什么意思
    如何获得select被选中option的value和text和......
    使用phpexcel导出到xls文件的时候出现乱码解决
    Infinispan's GridFileSystem基于内存的网格文件系统,互联网营销 狼人:
    云计算的可伸缩性迫使App服务无状态化,互联网营销 狼人:
    那些你知道的和不知道的搜索引擎,互联网营销 狼人:
    IPv6的未来,互联网营销 狼人:
    互联网上五个最高级的搜索引擎,互联网营销 狼人:
    剖析IE浏览器子系统的性能权重,互联网营销 狼人:
  • 原文地址:https://www.cnblogs.com/lavezhang/p/16336504.html
Copyright © 2020-2023  润新知