• 【Heritrix基础教程之4】开始一个爬虫抓取的全流程代码分析



    在创建一个job后,就要开始job的运行,运行的全流程如下:

    1、在界面上启动job


    2、index.jsp

    查看上述页面对应的源代码

    <a href='"+request.getContextPath()+"/console/action.jsp?action=start'>Start</a>

    3、action.jsp


        // Read the requested console action from the "action" query parameter.
        String sAction = request.getParameter("action");
        // Constant-first comparisons evaluate to false when sAction is null, so
        // a missing parameter falls through every branch and nothing is done.
        if ("start".equalsIgnoreCase(sAction)) {
            // Tell the handler to start the crawl job.
            handler.startCrawler();
        } else if ("stop".equalsIgnoreCase(sAction)) {
            // Tell the handler to stop the crawl job.
            handler.stopCrawler();
        } else if ("terminate".equalsIgnoreCase(sAction)) {
            // Delete the current job, when there is one.
            if (handler.getCurrentJob() != null) {
                handler.deleteJob(handler.getCurrentJob().getUID());
            }
        } else if ("pause".equalsIgnoreCase(sAction)) {
            // Tell the handler to pause the crawl job.
            handler.pauseJob();
        } else if ("resume".equalsIgnoreCase(sAction)) {
            // Tell the handler to resume the crawl job.
            handler.resumeJob();
        } else if ("checkpoint".equalsIgnoreCase(sAction)) {
            // Checkpointing is only requested while a current job exists.
            if (handler.getCurrentJob() != null) {
                handler.checkpointJob();
            }
        }
        // Whatever was requested, send the browser back to the index page.
        response.sendRedirect(request.getContextPath() + "/index.jsp");

    4、CrawlJobHandler.java

    (1)

        /**
         * Sets the {@code running} flag and then starts the next job, unless no
         * job is pending or {@code isCrawling()} already reports true.
         */
        public void startCrawler() {
            running = true;
            if (pendingCrawlJobs.size() == 0 || isCrawling()) {
                // No job ready or already crawling: nothing to start.
                return;
            }
            startNextJob();
        }

    (2)

        /**
         * Runs {@link #startNextJobInternal()} on a new thread named
         * "StartNextJob".
         *
         * <p>The whole method holds this object's monitor. If a starter thread
         * from an earlier call is still referenced, the method first waits for it
         * to finish before creating the next one. If the calling thread is
         * interrupted while waiting, its interrupt status is restored and the
         * method returns without starting a new thread.
         */
        protected final void startNextJob() {
            synchronized (this) {
                if (startingNextJob != null) {
                    try {
                        startingNextJob.join();
                    } catch (InterruptedException e) {
                        // join() clears the interrupt status when it throws;
                        // set it again so the caller can still observe that
                        // this thread was interrupted.
                        Thread.currentThread().interrupt();
                        e.printStackTrace();
                        return;
                    }
                }
                startingNextJob = new Thread(new Runnable() {
                    public void run() {
                        startNextJobInternal();
                    }
                }, "StartNextJob");
                startingNextJob.start();
            }
        }

    (3)

       /**
        * Takes the first pending job and requests its crawl start. When a job
        * fails with an {@code InitializationException}, the next pending job is
        * tried instead. Does nothing when no job is pending or
        * {@code isCrawling()} reports true.
        */
       protected void startNextJobInternal() {
            // Each pass handles one pending job; the loop ends as soon as a
            // job has been started, none is left, or a crawl is running.
            while (pendingCrawlJobs.size() != 0 && !isCrawling()) {
                currentJob = (CrawlJob) pendingCrawlJobs.first();
                assert pendingCrawlJobs.contains(currentJob) :
                    "pendingCrawlJobs is in an illegal state";
                pendingCrawlJobs.remove(currentJob);
                try {
                    currentJob.setupForCrawlStart();
                    // Register as a status listener before starting, so the
                    // crawl-ending callback can clear the currentJob reference
                    // and update the list of completed jobs (it may also start
                    // the next job).
                    currentJob.getController().addCrawlStatusListener(this);
                    // Now actually start the crawl.
                    currentJob.getController().requestCrawlStart();
                    return;
                } catch (InitializationException e) {
                    // Presumably reloads the failed job from its state file --
                    // TODO confirm against loadJob(). Then fall through to the
                    // next pending job, if there is one.
                    loadJob(getStateJobFile(currentJob.getDirectory()));
                    currentJob = null;
                }
            }
        }

    (4)下面的 requestCrawlStart() 即上一步中通过 getController() 得到的控制器对象上被调用的方法:

        /**
         * Runs the processors' initial tasks, sends the STARTED and then the
         * RUNNING state-change events, starts the "StatLogger" thread and
         * finally starts the frontier.
         */
        public void requestCrawlStart() {
            runProcessorInitialTasks();

            // Announce STARTED first, then switch to RUNNING and announce that.
            sendCrawlStateChangeEvent(STARTED, CrawlJob.STATUS_PENDING);
            state = RUNNING;
            sendCrawlStateChangeEvent(this.state, CrawlJob.STATUS_RUNNING);

            // Default exit status; a proper exit will change this value.
            this.sExit = CrawlJob.STATUS_FINISHED_ABNORMAL;

            // Run the statistics object on its own named thread.
            Thread statLogger = new Thread(statistics);
            statLogger.setName("StatLogger");
            statLogger.start();

            frontier.start();
        }



  • 相关阅读:
    Mac-安装Git以及Git的配置
    Mac 安装Maven,并设置环境变量
    Mac Tab自动补全键
    Eclipse 代码快捷键模板(一)
    网易博客迁移(2011-05-27)
    前端JS插件整理
    Ajax请求二进制流并在页面展示
    IDE中使用System.getProperty()获取一些属性
    Spring Boot:快速入门(二)
    c 语言 指针 与地址
  • 原文地址:https://www.cnblogs.com/jediael/p/4304128.html
Copyright © 2020-2023  润新知