• MinerHtmlThread.java 爬取页面线程


    MinerHtmlThread.java 爬取页面线程

    package com.iteye.injavawetrust.miner;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    /**
     * Worker thread that downloads pages for the crawler.
     *
     * <p>Each iteration takes one {@link MinerUrl} from the unvisited queue of
     * {@code MinerQueue}, fetches it with jsoup, and hands the resulting
     * {@code Html} to both the store queue and the queue of pages waiting for
     * URL extraction. The loop runs until {@code MinerMonitorThread.done} is set.
     *
     * @author InJavaWetrust
     */
    public class MinerHtmlThread extends Thread {
    	
    	private static final Log LOG = LogFactory.getLog(MinerHtmlThread.class);
    	
    	/** Crawler settings; read here for the maximum crawl depth. */
    	private final MinerConfig config;
    	
    	/**
    	 * Creates a page-fetching thread.
    	 *
    	 * @param config crawler configuration used to obtain the maximum depth
    	 */
    	public MinerHtmlThread(MinerConfig config) {
    		this.config = config;
    	}
    	
    	/**
    	 * Repeatedly calls {@link #minerHtml()} until the monitor thread's
    	 * {@code done} flag becomes true.
    	 */
    	@Override
    	public void run() {
    		while (!MinerMonitorThread.done) {
    			minerHtml();
    		}
    	}
    	
    	/**
    	 * Fetches a single page from the unvisited queue.
    	 *
    	 * <p>Returns without fetching when the queue yields nothing, the URL is
    	 * blank, the entry's depth exceeds the configured maximum, or the URL does
    	 * not contain {@code "http"}. Any exception raised while fetching is
    	 * logged and not rethrown, so one bad URL does not end the thread.
    	 *
    	 * <p>NOTE(review): {@code synchronized} locks on this thread instance only;
    	 * confirm whether any other caller relies on it before removing.
    	 */
    	public synchronized void minerHtml() {
    		MinerUrl minerUrl = MinerQueue.unVisitedPoll(); // dequeue the next URL waiting to be visited
    		try {
    			// Skip empty queue results, blank URLs, and entries beyond the allowed crawl depth.
    			if (null == minerUrl || MinerUtil.isBlank(minerUrl.getUrl()) || minerUrl.getDepth() > config.getMaxDepth()) {
    				return;
    			}
    			String url = minerUrl.getUrl();
    			// Reject URLs that do not contain "http". The receiver must be the URL:
    			// "http".contains(url) would only match fragments of the literal "http".
    			if (!url.contains("http")) {
    				LOG.info("MinerHtmlThread当前爬取URL[" + url + "]没有http");
    				return;
    			}
    			LOG.info("MinerHtmlThread当前爬取页面[" + url + "]爬取深度[" + minerUrl.getDepth() + "] 当前线程 [" + Thread.currentThread().getName() + "]");
    			Connection conn = Jsoup.connect(url);
    			// Send a browser-like User-Agent header with the request.
    			conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13");
    			Document doc = conn.get();
    			String page = doc.html();
    			Html html = new Html();
    			html.setUrl(url);
    			html.setHtml(page);
    			html.setDepth(minerUrl.getDepth());
    			
    			// Hand the page to the store queue.
    			MinerQueue.addStore(html);
    			
    			// Hand the fetched page to the queue of pages waiting for URL extraction.
    			MinerQueue.addWaitingMisering(html);
    			
    		} catch (Exception e) {
    			// Thread boundary: log with the full exception so the cause and stack trace are kept.
    			String failedUrl = (minerUrl == null) ? "null" : minerUrl.getUrl();
    			LOG.error("MinerHtmlThread爬取页面失败 URL [" + failedUrl + "]", e);
    		}
    		
    	}
    
    }
    

    返回列表

  • 相关阅读:
    mybatis plus foreach 的用法
    mongodb聚合查询
    mongodb and 和 or 查询
    mongodb全文搜索
    时间参数的传递
    rabbitmq
    AOP各种的实现
    OWASP Top 10十大风险 – 10个最重大的Web应用风险与攻防
    OAuth2.0认证和授权机制讲解
    MySQL主从复制
  • 原文地址:https://www.cnblogs.com/muyuge/p/6152084.html
Copyright © 2020-2023  润新知