• 提取新闻下一页


    package com.unbank.robotspider.util;
    
    import java.util.HashMap;
    import java.util.Map;
    
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    public class SmartNextPageFecther {
    
    	public static void main(String[] args) {
    		String url = "http://focus.stockstar.com/SS2014061700001351.shtml";
    		Document document = JsoupUtil.readUrl(url);
    		Map<Integer, String> pageList = new SmartNextPageFecther()
    				.getNextPageUrl(document, url);
    
    		for (int i = 0; i < pageList.size() + 3; i++) {
    			String nextUrl = pageList.get(i);
    			if (nextUrl != null) {
    				System.out.println(nextUrl);
    			}
    
    		}
    
    	}
    
    	public Map<Integer, String> getNextPageUrl(Document doc, String baseurl) {
    		Document document = doc.clone();
    		Map<Integer, String> map = new HashMap<Integer, String>();
    		Elements a_elements = document.getElementsByTag("a");
    		int prePageNum = 5;
    		int pageNum = 0;
    		for (Element e : a_elements) {
    			String uu = e.attr("href");
    			uu = UrlTools.getFullUrl(baseurl, uu);
    			if (uu == null || uu.trim().isEmpty()) {
    				continue;
    			}
    			String a_text = e.text();
    			// 是否是下一页的
    			boolean bl = checkText(a_text);
    
    			if (bl) {
    				int cu = checkUrl(baseurl, uu);
    				if (cu != -1) {
    					pageNum = pageNum > cu ? pageNum : cu;
    					prePageNum = prePageNum < cu ? prePageNum : cu;
    					map.put(cu, uu);
    				}
    			}
    		}
    		if (map.size() >= 2) {
    			// 说明是3页了
    			String second = null;
    			String third = null;
    			if (prePageNum == 0) {
    				second = map.get(0);
    				third = map.get(1);
    			} else if (prePageNum == 1) {
    				second = map.get(1);
    				third = map.get(2);
    			} else if (prePageNum == 2) {
    				second = map.get(2);
    				third = map.get(3);
    			}
    			String urlRule = UrlRuleUtil.getURlRule(second, third);
    			for (int i = prePageNum; i <= pageNum; i++) {
    				if (map.get(i) == null) {
    					String page = UrlRuleUtil.getcheckURL(urlRule, i);
    					map.put(i, page);
    				}
    			}
    
    		}
    
    		return map;
    	}
    
    	public boolean checkText(String text) {
    		String[] texts = { "首页", "第一页", "下一页", "末页", "最后一页", "尾页" };
    		for (int i = 0; i < texts.length; i++) {
    			if (texts[i].equals(text)) {
    				return true;
    			}
    		}
    		if (text.matches("\d{1,2}")) {
    			return true;
    		}
    		return false;
    	}
    
    	public int checkUrl(String url1, String url2) {
    		int l1 = url1.length();
    		int l2 = url2.length();
    		if (l1 == 0 || l2 == 0) {
    			return -1;
    		}
    
    		String longStr = l1 > l2 ? url1 : url2;
    		String shortStr = l1 < l2 ? url1 : url2;
    		int j = 0;
    		StringBuffer sb = new StringBuffer();
    		for (int i = 0; i < longStr.length() - 1; i++) {
    			if (longStr.charAt(i) != shortStr.charAt(j)) {
    				sb.append(longStr.charAt(i));
    			} else {
    				j++;
    				if (j == shortStr.length()) {
    					break;
    				}
    			}
    		}
    		if (sb.length() == 0) {
    			return -1;
    		}
    		String variances = sb.toString();
    		String numStr = variances.replaceAll("_", "").replaceAll("=", "")
    				.replaceAll("index", "").replaceAll("page", "")
    				.replaceAll("p", "").replaceAll("-", "");
    		if (numStr.matches("\d{1,2}")) {
    			return Integer.valueOf(numStr);
    		} else {
    			return -1;
    		}
    	}
    }
    
  • 相关阅读:
    第二章第1节: 2020.04.22 智能互联网之核心技术实践篇【一】
    分布式和集群理解
    CMDB了解
    Git常用命令
    brpc支持多协议
    数据库性能瓶颈了解
    接口理解
    mysql explain与索引
    InnoDB的redo log学习
    数据库抖动原因了解
  • 原文地址:https://www.cnblogs.com/tomcattd/p/3808550.html
Copyright © 2020-2023  润新知