• Heritrix 3.1.0 源码解析(二十九)


    本文接下来分析CrawlServer类和CrawlHost类,两者都实现了IdentityCacheable接口(可缓存对象接口)

    CrawlServer对象代表服务器,里面存储了服务器的相关信息,包括服务名 端口 robots信息 Credential集合及相关操作等

    private static final long serialVersionUID = 3L;

        /** Sentinel for robotsFetched: robots.txt has never been fetched. */
        public static final long ROBOTS_NOT_FETCHED = -1;
        /** only check if robots-fetch is perhaps superfluous 
         * after this many tries */
        public static final long MIN_ROBOTS_RETRIES = 3;
    
        private String server; // actually, host+port in the https case
        private int port; // -1 when no (parseable) ":port" suffix was present
        // parsed robots.txt rules; null until a successful fetch/deeming
        protected Robotstxt robotstxt;
        // wall-clock millis of the last robots.txt fetch attempt
        long robotsFetched = ROBOTS_NOT_FETCHED;
        // true only while robotstxt holds usable (possibly deemed) info
        boolean validRobots = false;
        // aggregate fetch statistics for URIs on this server
        FetchStats substats = new FetchStats();
        
        // how many consecutive connection errors have been encountered;
        // used to drive exponentially increasing retry timeout or decision
        // to 'freeze' entire class (queue) of URIs
        protected int consecutiveConnectionErrors = 0;
    
        /**
         * Set of credentials; lazily created by addCredential().
         */
        private transient Set<Credential> credentials =  null;

    String server表示站点服务器的标识,其构造方法如下(初始化站点服务器的标识和端口)

    /**
     * Creates a new CrawlServer for the given host string, parsing out a
     * trailing ":port" suffix when one is present.
     *
     * @param h the host (or host:port) string for the server.
     */
    public CrawlServer(String h) {
        // TODO: possibly check for illegal host string
        server = h;
        int sep = server.lastIndexOf(':');
        port = -1;
        if (sep >= 0) {
            try {
                port = Integer.parseInt(server.substring(sep + 1));
            } catch (NumberFormatException nfe) {
                // no numeric port after the colon; leave port at -1
            }
        }
    }

    下面的方法是有关Robotstxt robotstxt对象操作的

    /**
     * @return the robots.txt rules most recently obtained for this server;
     * null if none have been fetched or deemed yet.
     */
    public Robotstxt getRobotstxt() {
        return this.robotstxt;
    }
        
    /** Update the robotstxt
    *
    * Records the attempt time, then decides whether the fetch yielded usable
    * robots information. Failed attempts are retried up to MIN_ROBOTS_RETRIES
    * before any "deeming" shortcut applies; any non-2xx outcome is ultimately
    * treated as "no robots restrictions" (Robotstxt.NO_ROBOTS).
    *
    * @param curi the crawl URI containing the fetched robots.txt
    * @throws IOException
    */
   public synchronized void updateRobots(CrawlURI curi) {

       robotsFetched = System.currentTimeMillis();

       // a GET that returned any positive status, or one already deemed
       // not-found, counts as having gotten something from the server
       boolean gotSomething = curi.getFetchType() == HTTP_GET 
           && (curi.getFetchStatus() > 0 || curi.getFetchStatus() == S_DEEMED_NOT_FOUND);
       
       
       if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
           // robots.txt lookup failed, still trying, no reason to consider IGNORE yet
           validRobots = false;
           return;
       }
              
       // special deeming for a particular kind of connection-lost (empty server response)
        if (curi.getFetchStatus() == S_CONNECT_LOST
                && CollectionUtils.exists(curi.getNonFatalFailures(),
                        PredicateUtils.instanceofPredicate(NoHttpResponseException.class))) {
            curi.setFetchStatus(S_DEEMED_NOT_FOUND);
            gotSomething = true;
        }
       
       if (!gotSomething) {
           // robots.txt fetch failed and exceptions (ignore/deeming) don't apply; no valid robots info yet
           validRobots = false;
           return;
       }
       
       int fetchStatus = curi.getFetchStatus();
       if (fetchStatus < 200 || fetchStatus >= 300) {
           // Not found or anything but a status code in the 2xx range is
           // treated as giving access to all of a sites' content.
           // This is the prevailing practice of Google, since 4xx
           // responses on robots.txt are usually indicative of a 
           // misconfiguration or blanket-block, not an intentional
           // indicator of partial blocking. 
           // TODO: consider handling server errors, redirects differently
           robotstxt = Robotstxt.NO_ROBOTS;
           validRobots = true;
           return;
       }

       // 2xx response: parse the fetched body as robots.txt rules
       InputStream contentBodyStream = null;
       try {
           BufferedReader reader;
           contentBodyStream = curi.getRecorder().getContentReplayInputStream();

           reader = new BufferedReader(new InputStreamReader(contentBodyStream));
           robotstxt = new Robotstxt(reader); 
           validRobots = true;
       } catch (IOException e) {
           // unreadable body: fail open (no restrictions) but record the problem
           robotstxt = Robotstxt.NO_ROBOTS;
           logger.log(Level.WARNING,"problem reading robots.txt for "+curi,e);
           validRobots = true;
           curi.getNonFatalFailures().add(e);
       } finally {
           IOUtils.closeQuietly(contentBodyStream);
       }
   }    
    /**
     * Whether usable robots.txt information is currently held. False means
     * either no fetch has been attempted yet, or the last attempt failed.
     *
     * @return the validRobots flag.
     */
    public synchronized boolean isValidRobots() {
        return this.validRobots;
    }
    /**
         * Is the robots policy expired.
         *
         * This method will also return true if we haven't tried to get the
         * robots.txt for this server.
         *
         * @param validityDuration validity period in seconds; 0 means the
         *        fetched robots information never expires.
         * @return true if the robots policy is expired.
         */
        public synchronized boolean isRobotsExpired(int validityDuration) {
            if (robotsFetched == ROBOTS_NOT_FETCHED) {
                // Have not attempted to fetch robots
                return true;
            }
            // seconds -> millis; the long literal avoids int overflow
            long duration = validityDuration*1000L;
            if (duration == 0) {
                // When zero, robots should be valid forever
                return false;
            }
            if (robotsFetched + duration < System.currentTimeMillis()) {
                // Validity window has passed: robots info is expired
                return true;
            }
            return false;
        }

    Set<Credential> credentials证书集合方法

    /**
     * @return the set of credentials attached to this server, or null if
     * none have been added.
     */
    public Set<Credential> getCredentials() {
        return credentials;
    }
    
        /**
         * @return True if there are avatars attached to this instance.
         */
        public boolean hasCredentials() {
            return this.credentials != null && this.credentials.size() > 0;
        }
    
    /**
         * Add a credential, lazily creating the backing set on first use.
         *
         * @param cred Credential to add to this server's set.
         */
        public void addCredential(Credential cred) {
            if (this.credentials == null) {
                this.credentials = new HashSet<Credential>();
            }
            this.credentials.add(cred);
        }

    根据UURI uuri对象生成key的静态方法(用于站点服务器标识)

    /**
         * Get key to use doing lookup on server instances.
         *
         * Note this key is not the frontier classKey: it must come out the
         * same for every URI belonging to the same server (authority), so
         * all such URIs share one CrawlServer entry.
         *
         * @param uuri UURI we're to get server key for.
         * @return String to use as server key, or null if none derivable.
         * @throws URIException
         */
        public static String getServerKey(UURI uuri) throws URIException {
            // TODO: evaluate if this is really necessary -- why not
            // make the server of a dns CandidateURI the looked-up domain,
            // also simplifying FetchDNS?
            String key = uuri.getAuthorityMinusUserinfo();
            if (key == null) {
                // Fallback for cases where getAuthority() fails (eg 'dns:'.
                // DNS UURIs have the 'domain' in the 'path' parameter, not
                // in the authority).
                key = uuri.getCurrentHierPath();
                if (key != null && !key.matches("[-_\\w\\.:]+")) {
                    // Not just word chars and dots and colons and dashes and
                    // underscores; throw away
                    key = null;
                }
            }
            if (key != null && uuri.getScheme().equals(UURIFactory.HTTPS)) {
                // If https and no port specified, add default https port to
                // distinguish https from http server without a port.
                if (!key.matches(".+:[0-9]+")) {
                    key += UURIFactory.HTTPS_PORT;
                }
            }
            return key;
        }

    CrawlHost对象代表主机,里面存储了主机标识(域名) IP地址 抓取时间  国家代码信息等

    /** Flag value indicating always-valid IP */
        public static final long IP_NEVER_EXPIRES = -1;
        /** Flag value indicating an IP has not yet been looked up */
        public static final long IP_NEVER_LOOKED_UP = -2;
        // host name (domain) identifying this host
        private String hostname;
        // country code for this host, if known; may be null
        private String countryCode;
        // resolved address; null until a lookup succeeds
        private InetAddress ip;
        // wall-clock millis of the last IP lookup, or IP_NEVER_LOOKED_UP
        private long ipFetched = IP_NEVER_LOOKED_UP;
        // aggregate fetch statistics for URIs on this host
        protected FetchStats substats = new FetchStats(); 
        /**
         * TTL gotten from dns record.
         *
         * From rfc2035:
         * <pre>
         * TTL       a 32 bit unsigned integer that specifies the time
         *           interval (in seconds) that the resource record may be
         *           cached before it should be discarded.  Zero values are
         *           interpreted to mean that the RR can only be used for the
         *           transaction in progress, and should not be cached.
         * </pre>
         */
        private long ipTTL = IP_NEVER_LOOKED_UP;
    
        // Used when bandwidth constraints are in effect: earliest time
        // (millis) at which another URI for this host may be emitted
        private long earliestNextURIEmitTime = 0;

    构造方法初始化主机标识

    /** 
     * Convenience constructor: creates a CrawlHost with no country code.
     *
     * @param hostname the host name for this host.
     */
    public CrawlHost(String hostname) {
        this(hostname, null);
    }
    
        /** 
         * Create a new CrawlHost object.
         *
         * @param hostname the host name for this host.
         * @param countryCode the country code for this host.
         */
        public CrawlHost(String hostname, String countryCode) {
            this.hostname = hostname;
            this.countryCode = countryCode;
            InetAddress tmp = InetAddressUtil.getIPHostAddress(hostname);
            if (tmp != null) {
                setIP(tmp, IP_NEVER_EXPIRES);
            }
        }

    下面的方法用于设置IP地址

    /**
     * Whether an IP lookup has been attempted for this host.
     *
     * Returns true once any lookup has occurred, even one that failed.
     *
     * @return true if the IP for this host has been looked up.
     */
    public boolean hasBeenLookedUp() {
        return this.ipFetched != IP_NEVER_LOOKED_UP;
    }
    
    /**
         * Set the IP address for this host (called by the FetchDNS processor
         * once resolution completes).
         *
         * @param address resolved address, or null if the lookup failed.
         * @param ttl the TTL from the dns record in seconds or -1 if it should live
         * forever (is a numeric IP).
         */
        public void setIP(InetAddress address, long ttl) {
            this.ip = address;
            // Assume that a lookup has occurred by the time
            // a caller decides to set this (even to null)
            this.ipFetched = System.currentTimeMillis();
            this.ipTTL = ttl;
            if (logger.isLoggable(Level.FINE)) {
                logger.fine(hostname + ": " +
                    ((address != null)? address.toString(): "null"));
            }
        }

    ---------------------------------------------------------------------------

    本系列Heritrix 3.1.0 源码解析系本人原创

    转载请注明出处 博客园 刺猬的温驯

    本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/29/3050940.html

  • 相关阅读:
    [ lucene高级 ] lucene中的算法PriorityQueue
    [ lucene扩展 ] MoreLikeThis 相似检索
    排序08归并排序
    lucene中的数值型字段(NumericField)
    两三年前的搜索管理系统
    java中的集合包简要分析
    倒排索引基础
    散列02java中的hashMap
    Mysql数据库中InnoDB和MyISAM的差别
    ajax 乱码
  • 原文地址:https://www.cnblogs.com/chenying99/p/3050940.html
Copyright © 2020-2023  润新知