• 【Heritrix基础教程之4】开始一个爬虫抓取的全流程代码分析 分类: H3_NUTCH 2014-06-04 20:10 1039人阅读 评论(0) 收藏



    在创建一个job后,就要开始job的运行,运行的全流程如下:

    1、在界面上启动job


    2、index.jsp

    查看上述页面对应的源代码

    <a href='"+request.getContextPath()+"/console/action.jsp?action=start'>Start</a>

    3、action.jsp


        // Read the requested console action. A missing parameter yields null,
        // which matches none of the branches below.
        String sAction = request.getParameter("action");
        if ("start".equalsIgnoreCase(sAction)) {
            // Ask the handler to start the crawl job.
            handler.startCrawler();
        } else if ("stop".equalsIgnoreCase(sAction)) {
            // Ask the handler to stop the crawl job.
            handler.stopCrawler();
        } else if ("terminate".equalsIgnoreCase(sAction)) {
            // Delete the current job, if there is one.
            if (handler.getCurrentJob() != null) {
                handler.deleteJob(handler.getCurrentJob().getUID());
            }
        } else if ("pause".equalsIgnoreCase(sAction)) {
            // Ask the handler to pause the crawl job.
            handler.pauseJob();
        } else if ("resume".equalsIgnoreCase(sAction)) {
            // Ask the handler to resume the crawl job.
            handler.resumeJob();
        } else if ("checkpoint".equalsIgnoreCase(sAction)) {
            // Checkpointing only applies when a job is current.
            if (handler.getCurrentJob() != null) {
                handler.checkpointJob();
            }
        }
        // Whatever the action was (or if none was given), go back to the index page.
        response.sendRedirect(request.getContextPath() + "/index.jsp");

    4、CrawlJobHandler.java

    (1)

        /**
         * Marks the handler as running and, when at least one job is pending
         * and no crawl is in progress, starts the next job.
         */
        public void startCrawler() {
            running = true;
            if (pendingCrawlJobs.size() == 0 || isCrawling()) {
                // Nothing queued, or a crawl is already under way.
                return;
            }
            startNextJob();
        }

    (2)

        /**
         * Runs {@code startNextJobInternal()} on a freshly created thread
         * named "StartNextJob".
         *
         * <p>If a previous launcher thread exists, this method first waits for
         * it to finish. The wait and the creation of the new thread both happen
         * while holding this object's monitor.
         *
         * <p>If the calling thread is interrupted while waiting, no new thread
         * is started; the interrupt status is restored so that callers can
         * observe the interruption.
         */
        protected final void startNextJob() {
            synchronized (this) {
                if (startingNextJob != null) {
                    try {
                        // Wait for the previous launcher thread to complete.
                        startingNextJob.join();
                    } catch (InterruptedException e) {
                        // join() cleared the interrupt flag when it threw;
                        // set it again so the interruption is not lost.
                        Thread.currentThread().interrupt();
                        e.printStackTrace();
                        return;
                    }
                }
                startingNextJob = new Thread(new Runnable() {
                    public void run() {
                        startNextJobInternal();
                    }
                }, "StartNextJob");
                startingNextJob.start();
            }
        }

    (3)

       /**
        * Takes the first pending job, makes it the current job and asks its
        * controller to start crawling.
        *
        * <p>Does nothing when there is no pending job or a crawl is already
        * in progress. If preparing or starting the job throws
        * {@code InitializationException}, the job is handed to
        * {@code loadJob} via its state file, the current-job reference is
        * cleared, and the method calls itself to try the next pending job.
        */
       protected void startNextJobInternal() {
            if (pendingCrawlJobs.size() == 0 || isCrawling()) {
                // No job ready or already crawling.
                return;
            }
            // Promote the first pending job to current and take it off the
            // pending collection.
            this.currentJob = (CrawlJob)pendingCrawlJobs.first();
            assert pendingCrawlJobs.contains(currentJob) :
                "pendingCrawlJobs is in an illegal state";
            pendingCrawlJobs.remove(currentJob);
            try {
                this.currentJob.setupForCrawlStart();
                // This is ugly but needed so I can clear the currentJob
                // reference in the crawlEnding and update the list of completed
                // jobs.  Also, crawlEnded can startup next job.
                this.currentJob.getController().addCrawlStatusListener(this);
                // now, actually start
                this.currentJob.getController().requestCrawlStart();
            } catch (InitializationException e) {
                // NOTE(review): the exception itself is not logged or rethrown
                // here; presumably loadJob re-reads the failed job's state from
                // disk -- confirm against loadJob/getStateJobFile.
                loadJob(getStateJobFile(this.currentJob.getDirectory()));
                this.currentJob = null;
                startNextJobInternal(); // Load the next job if there is one.
            }
        }

    (4)CrawlController.java(上一步中 getController().requestCrawlStart() 调用的方法)

        /**
         * Starts the crawl: runs the processors' initial tasks, announces the
         * STARTED and RUNNING state changes, starts {@code statistics} on a
         * thread named "StatLogger", and finally starts the frontier.
         */
        public void requestCrawlStart() {
            runProcessorInitialTasks();

            // Announce the start first, then switch to RUNNING and announce that too.
            sendCrawlStateChangeEvent(STARTED, CrawlJob.STATUS_PENDING);
            state = RUNNING;
            sendCrawlStateChangeEvent(this.state, CrawlJob.STATUS_RUNNING);

            // Default exit status; a proper exit will change this value.
            this.sExit = CrawlJob.STATUS_FINISHED_ABNORMAL;

            // Run the statistics object on its own named thread.
            Thread statsThread = new Thread(statistics);
            statsThread.setName("StatLogger");
            statsThread.start();

            frontier.start();
        }



    版权声明:本文为博主原创文章,未经博主允许不得转载。

  • 相关阅读:
    java 或者 js 获取项目访问路径(域名)
    jquery validate 使用示例
    项目中调试SQLServer 方便的查看SQL语句的执行时间的方法
    Oracle Replace函数的简单使用
    LINE_NO in format of XXXX example -> Line 10 is 0010
    Oracle序列使用:建立、删除、使用
    Oracle 存储过程创建及调用
    char、varchar、varchar2区别
    missing equal sign
    [转]Windows中的句柄(handle)
  • 原文地址:https://www.cnblogs.com/lujinhong2/p/4637317.html
Copyright © 2020-2023  润新知