• java基础:9.4 web爬虫


    跟随超链接来自动遍历Web.

    package day11;
    import java.util.ArrayList;
    import java.util.Scanner;
    /**
     * Minimal breadth-first web crawler.
     *
     * <p>Starting from a URL read from standard input, it repeatedly takes the oldest pending
     * URL, prints it, downloads the page and queues every {@code http:} link found in it.
     */
    public class WebCrawler {
        /**
         * Bound compared against the number of traversed URLs before each iteration.
         * Because the loop condition is {@code size() <= TRAVERSED_LIMIT}, one more page is
         * crawled once the size reaches the limit, i.e. at most 101 pages are visited.
         */
        private static final int TRAVERSED_LIMIT = 100;

        /** Prefix that marks the start of a link inside a line of page text. */
        private static final String LINK_PREFIX = "http:";

        /**
         * Prompts for a starting URL on standard input and crawls from it.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            Scanner input = new Scanner(System.in);
            System.out.println("enter a URL(such as:http://wwww.xxxx.com):");
            String url = input.nextLine();
            crawler(url);
        }

        /**
         * Crawls pages in first-in-first-out order starting at {@code startingURL}, printing
         * one {@code Crawl <n>  <url>} line per distinct URL visited.
         *
         * <p>The loop stops when no URLs are pending or the traversed-URL bound is exceeded.
         *
         * @param startingURL the first URL to visit
         */
        public static void crawler(String startingURL) {
            ArrayList<String> listOfPendingURLs = new ArrayList<>();
            ArrayList<String> listOfTraversedURLs = new ArrayList<>();
            listOfPendingURLs.add(startingURL);
            int crawlCount = 0;

            while (!listOfPendingURLs.isEmpty()
                    && listOfTraversedURLs.size() <= TRAVERSED_LIMIT) {
                // Take the oldest pending URL so pages are visited breadth-first.
                String urlString = listOfPendingURLs.remove(0);
                if (listOfTraversedURLs.contains(urlString)) {
                    // The pending list may hold duplicates; skip ones already visited.
                    continue;
                }

                listOfTraversedURLs.add(urlString);
                System.out.println("Crawl " + ++crawlCount + "  " + urlString);

                for (String s : getSubURLs(urlString)) {
                    if (!listOfTraversedURLs.contains(s)) {
                        listOfPendingURLs.add(s);
                    }
                }
            }
        }

        /**
         * Downloads the resource at {@code urlString} and extracts the links it contains.
         *
         * <p>A link is any text that starts with {@code http:} and runs up to (but not
         * including) the next double-quote character on the same line. An {@code http:}
         * occurrence with no following double quote on its line is ignored, as is the rest
         * of that line.
         *
         * <p>Any failure (malformed URL, I/O error, ...) is reported on standard output as
         * {@code error:<message>} and the links collected so far are returned, so a single
         * unreachable page does not abort the crawl.
         *
         * @param urlString the address of the page to scan
         * @return the links found, in document order; empty if none were found or the page
         *     could not be read
         */
        public static ArrayList<String> getSubURLs(String urlString) {
            ArrayList<String> list = new ArrayList<>();

            try {
                java.net.URL url = new java.net.URL(urlString);
                // try-with-resources closes the Scanner and the underlying network stream.
                try (Scanner input = new Scanner(url.openStream())) {
                    while (input.hasNextLine()) {
                        String line = input.nextLine();
                        // Search each line from its beginning; index 0 is a valid match.
                        int current = line.indexOf(LINK_PREFIX);
                        while (current >= 0) {
                            int endIndex = line.indexOf("\"", current);
                            if (endIndex < 0) {
                                // No closing quote: nothing more to extract from this line.
                                break;
                            }
                            list.add(line.substring(current, endIndex));
                            current = line.indexOf(LINK_PREFIX, endIndex);
                        }
                    }
                }
            } catch (Exception ex) {
                // Deliberately best-effort: report and carry on with whatever was collected.
                System.out.println("error:" + ex.getMessage());
            }

            return list;
        }
    }
    
  • 相关阅读:
    容器编排之rancher
    ActiveMQ安装配置
    Ansible Playbook
    AnsibleTower
    Ansible Configuration file
    jenkins报错jdk1.8/jre/lib/amd64/libawt_xawt.so
    Nexus安装配置
    maven 国内可用的中央仓库 阿里云
    jenkins Master stays offline if low disk space
    win版tesseract安装
  • 原文地址:https://www.cnblogs.com/l20902/p/10610933.html
Copyright © 2020-2023  润新知