• Java robots protocol (robots.txt) detection tool
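
  A small command-line checker: it takes a single URL (or a file of URLs), downloads each host's robots.txt, and writes "url:true" or "url:false" to result.txt depending on whether the robots exclusion protocol allows that URL to be crawled.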


    /**
     * @(#)SearchCrawler.java, April 12, 2016. Copyright 2016 Youdao, Inc. All rights
     *                         reserved. YOUDAO PROPRIETARY/CONFIDENTIAL. Use is
     *                         subject to license terms.
     */
    package testZK;
    
    import java.util.*;
    import java.net.*;
    import java.io.*;
    
    /**
     * Checks a list of URLs against each target host's robots.txt and records
     * whether the robots exclusion protocol allows the URL to be crawled.
     * 
     * @author zhoukang
     */
    public class SearchCrawler extends Thread {
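        // Per-host cache of Disallow rules, so each host's robots.txt is fetched only once.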
        private HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
        
        private List<String> urlList;
        
        private static File resultFile = new File("result.txt");
        private static BufferedWriter writer;
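        // result.txt is opened once for the whole run; the shutdown hook
        // registered in main() closes it when the JVM exits.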
        static {
            try {
                if(!resultFile.exists()) {
                    resultFile.createNewFile();
                }
                writer = new BufferedWriter(new FileWriter(resultFile));
            } catch (IOException e) {
                // Without a usable writer the crawler cannot record results; fail fast.
                throw new ExceptionInInitializerError(e);
            }
        }
    
        public SearchCrawler(String str, boolean file) throws IOException{
            urlList = new ArrayList<String>();
            if (file) {
                File f = null;
                BufferedReader reader = null;
                try { 
                    f = new File(str);
                    reader = new BufferedReader(new FileReader(f));
                    String line;
                    while ((line = reader.readLine()) != null) {
                        urlList.add(line);
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                } finally {
                    if(reader != null) {
                        reader.close();
                    }
                }
            } else {
                urlList.add(str);
            }
        }
        
        public BufferedWriter getBufferedWriter() {
            return writer;
        }
    
        public void run() {
            checkUrl(urlList);
        }
        
        private void checkUrl(List<String> urls) {
            Iterator<String> urlIter = urls.iterator();
            while(urlIter.hasNext()) {
                String url = urlIter.next();
                if(url == null || url.equals("")) {
                    continue;
                }
                url = removeWwwFromUrl(url);
                URL verifiedUrl = verifyUrl(url);
                System.out.println(url);
                // verifyUrl returns null for malformed or non-HTTP URLs; skip those.
                if (verifiedUrl == null) {
                    continue;
                }
                try {
                    if (isRobotAllowed(verifiedUrl)) {
                        writer.write(url+":true");
                    } else {
                        writer.write(url+":false");
                    }
                    writer.newLine();
                    writer.flush();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }        
        }
    
        private URL verifyUrl(String url) {
            // Only plain HTTP URLs are handled; anything else is rejected.
            if (!url.toLowerCase().startsWith("http://"))
                return null;
    
            URL verifiedUrl = null;
            try {
                verifiedUrl = new URL(url);
            } catch (MalformedURLException e) {
                return null;
            }
    
            return verifiedUrl;
        }
    
        /**
         * Downloads and caches the host's robots.txt, then checks whether the
         * URL's path matches any Disallow rule. User-agent sections are ignored:
         * every Disallow line in the file is applied.
         */
        private boolean isRobotAllowed(URL urlToCheck) {
            String host = urlToCheck.getHost().toLowerCase();
            ArrayList<String> disallowList = disallowListCache.get(host);
    
            if (disallowList == null) {
                disallowList = new ArrayList<String>();
                try {
                    URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                    BufferedReader reader = new BufferedReader(
                            new InputStreamReader(robotsFileUrl.openStream()));
    
                    String line;
                    while ((line = reader.readLine()) != null) {
                        if (line.indexOf("Disallow:") == 0) {
                            String disallowPath = line.substring("Disallow:"
                                    .length());
    
                            int commentIndex = disallowPath.indexOf("#");
                            if (commentIndex != -1) {
                                disallowPath = disallowPath.substring(0,
                                        commentIndex);
                            }
    
                            disallowPath = disallowPath.trim();
                            // An empty "Disallow:" means everything is allowed; adding ""
                            // to the list would make startsWith() block every URL.
                            if (!disallowPath.isEmpty()) {
                                disallowList.add(disallowPath);
                            }
                        }
                    }
    
                    disallowListCache.put(host, disallowList);
                } catch (Exception e) {
                    // robots.txt missing or unreachable: assume crawling is allowed.
                    return true;
                }
            }
    
            String file = urlToCheck.getFile();
            // Example: with disallowList = ["/private/", "/tmp"], any URL whose
            // path starts with one of those prefixes is reported as disallowed.
            for (String disallow : disallowList) {
                if (file.startsWith(disallow)) {
                    return false;
                }
            }
    
            return true;
        }
    
        // Strips "www." from the host so that "http://www.foo.com/x" and
        // "http://foo.com/x" map to the same robots.txt cache entry.
        private String removeWwwFromUrl(String url) {
            int index = url.indexOf("://www.");
            if (index != -1) {
                return url.substring(0, index + 3) + url.substring(index + 7);
            }

            return url;
        }
        
        private static void addShutDownHook(final SearchCrawler searchCrawler) {
            Runtime.getRuntime().addShutdownHook(new Thread() {
               public void run() {
                   BufferedWriter writer = searchCrawler.getBufferedWriter();
                   try {
                       writer.close();
                   } catch (IOException e) {
                       System.out.println("Failed to close result writer: " + e);
                   }
               } 
            });
        }
    
        public static void main(String[] args) throws InterruptedException, IOException {
            if (args.length != 1 && args.length != 2) {
                System.out.println("Usage-1: java SearchCrawler url");
                System.out.println("Usage-2: java SearchCrawler -f filename");
                return;
            }
    
            SearchCrawler crawler = null;
            if(args.length == 1) {
                crawler = new SearchCrawler(args[0], false);
            } else {
                // Two-argument form "-f filename": the flag itself is not checked,
                // args[1] is read as a file containing one URL per line.
                crawler = new SearchCrawler(args[1], true);
            }
            addShutDownHook(crawler);
            crawler.setDaemon(true);
            crawler.start();
            crawler.join();  
        }
    }
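
  To try it out (a hypothetical invocation; the class is declared in package testZK, so compile and run from the package root):

      javac testZK/SearchCrawler.java
      java testZK.SearchCrawler http://example.com/page.html
      java testZK.SearchCrawler -f urls.txt

  Each checked URL is appended to result.txt as url:true (crawling allowed) or url:false (blocked). For example, if http://example.com/robots.txt contained a line "Disallow: /private/", every URL under /private/ on that host would be reported as false, because isRobotAllowed does a prefix match between the URL's path and each Disallow entry.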
  • Original article: https://www.cnblogs.com/cane/p/5383735.html