• ZZ (repost): Web crawler


/**
 * @author Jack.Wang
 *
 */
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// A simple searching web crawler
public class SearchCrawler implements Runnable {

    /*
     * disallowListCache caches the URLs that robots are not allowed to crawl.
     * The robots exclusion protocol places a robots.txt file in the root
     * directory of a web site, listing which pages crawlers must skip.
     * A crawler should skip those areas during the crawl. An example robots.txt:
     *
     * # robots.txt for http://somehost.com/
     * User-agent: *
     * Disallow: /cgi-bin/
     * Disallow: /registration # Disallow robots on registration page
     * Disallow: /login
     */

    private HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
    ArrayList<String> errorList = new ArrayList<String>(); // error messages
    ArrayList<String> result = new ArrayList<String>();    // URLs that matched the search
    String startUrl;               // URL the crawl starts from
    int maxUrl;                    // maximum number of URLs to process
    String searchString;           // string to search for (English)
    boolean caseSensitive = false; // whether the search is case sensitive
    boolean limitHost = false;     // whether to restrict the crawl to the start host

    public SearchCrawler(String startUrl, int maxUrl, String searchString) {
        this.startUrl = startUrl;
        this.maxUrl = maxUrl;
        this.searchString = searchString;
    }

    public ArrayList<String> getResult() {
        return result;
    }

    // Start the search thread
    public void run() {
        crawl(startUrl, maxUrl, searchString, limitHost, caseSensitive);
    }

    // Validate the URL format
    private URL verifyUrl(String url) {
        // Only handle HTTP URLs.
        if (!url.toLowerCase().startsWith("http://"))
            return null;
        URL verifiedUrl = null;
        try {
            verifiedUrl = new URL(url);
        } catch (Exception e) {
            return null;
        }
        return verifiedUrl;
    }

    // Check whether robots are allowed to access the given URL.
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase(); // host of the given URL
        // System.out.println("host=" + host);

        // Look up the cached disallow list for this host.
        ArrayList<String> disallowList = disallowListCache.get(host);

        // If it is not cached yet, download and cache it.
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));

                // Read the robots file and build the list of disallowed paths.
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) { // line starts with "Disallow:"
                        String disallowPath = line.substring("Disallow:".length()); // disallowed path

                        // Strip any trailing comment.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0, commentIndex);
                        }

                        disallowPath = disallowPath.trim();
                        disallowList.add(disallowPath);
                    }
                }

                // Cache the disallowed paths for this host.
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                return true; // no robots.txt at the site root, so allow the URL
            }
        }

        String file = urlToCheck.getFile();
        // System.out.println("getFile()=" + file);
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = disallowList.get(i);
            if (file.startsWith(disallow)) {
                return false;
            }
        }

        return true;
    }

    // Download the page at the given URL and return its contents.
    private String downloadPage(URL pageUrl) {
        try {
            // Open connection to URL for reading.
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    pageUrl.openStream()));

            // Read page into buffer.
            String line;
            StringBuffer pageBuffer = new StringBuffer();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line);
            }

            return pageBuffer.toString();
        } catch (Exception e) {
            // Download failed; fall through and return null.
        }

        return null;
    }

    // Remove "www" from a URL
    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            return url.substring(0, index + 3) + url.substring(index + 7);
        }

        return (url);
    }

    // Parse the page and extract its links
    private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
            HashSet<String> crawledList, boolean limitHost) {
        // Compile the link-matching pattern as a regular expression.
        Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
                Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);

        // Only append ":" + port when the page URL actually carries a port.
        String portPart = (pageUrl.getPort() == -1) ? "" : ":" + pageUrl.getPort();

        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();

            if (link.length() < 1) {
                continue;
            }

            // Skip anchors that point back into the same page.
            if (link.charAt(0) == '#') {
                continue;
            }

            if (link.indexOf("mailto:") != -1) {
                continue;
            }

            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;
            }

            if (link.indexOf("://") == -1) {
                if (link.charAt(0) == '/') { // site-absolute path
                    link = "http://" + pageUrl.getHost() + portPart + link;
                } else {
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) { // relative path, page has no directory
                        link = "http://" + pageUrl.getHost() + portPart + "/" + link;
                    } else { // relative path, resolve against the page's directory
                        String path = file.substring(0, file.lastIndexOf('/') + 1);
                        link = "http://" + pageUrl.getHost() + portPart + path + link;
                    }
                }
            }

            // Drop any fragment.
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }

            link = removeWwwFromUrl(link);

            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }

            /* If the crawl is limited to one host, skip URLs on other hosts. */
            if (limitHost
                    && !pageUrl.getHost().toLowerCase().equals(
                            verifiedLink.getHost().toLowerCase())) {
                continue;
            }

            // Skip links that have already been crawled.
            if (crawledList.contains(link)) {
                continue;
            }

            linkList.add(link);
        }

        return (linkList);
    }

    // Search the downloaded page contents for the given search string.
    private boolean searchStringMatches(String pageContents,
            String searchString, boolean caseSensitive) {
        String searchContents = pageContents;
        if (!caseSensitive) { // fold case if the search is case-insensitive
            searchContents = pageContents.toLowerCase();
        }

        // Split the search string into terms on whitespace; all terms must occur.
        Pattern p = Pattern.compile("[\\s]+");
        String[] terms = p.split(searchString);
        for (int i = 0; i < terms.length; i++) {
            if (caseSensitive) {
                if (searchContents.indexOf(terms[i]) == -1) {
                    return false;
                }
            } else {
                if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
                    return false;
                }
            }
        }

        return true;
    }

    // Perform the actual crawl.
    public ArrayList<String> crawl(String startUrl, int maxUrls,
            String searchString, boolean limithost, boolean caseSensitive) {

        HashSet<String> crawledList = new HashSet<String>();
        LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();

        if (maxUrls < 1) {
            errorList.add("Invalid Max URLs value.");
            System.out.println("Invalid Max URLs value.");
        }

        if (searchString.length() < 1) {
            errorList.add("Missing Search String.");
            System.out.println("Missing Search String.");
        }

        if (errorList.size() > 0) {
            System.out.println("err!!!");
            return errorList;
        }

        // Remove "www" from the start URL.
        startUrl = removeWwwFromUrl(startUrl);

        toCrawlList.add(startUrl);
        while (toCrawlList.size() > 0) {

            if (maxUrls != -1) {
                if (crawledList.size() == maxUrls) {
                    break;
                }
            }

            // Get the next URL from the front of the to-crawl list.
            String url = toCrawlList.iterator().next();

            // Remove URL from the to-crawl list.
            toCrawlList.remove(url);

            // Convert string url to URL object.
            URL verifiedUrl = verifyUrl(url);

            // Skip the URL if it is malformed or robots are not allowed to access it.
            if (verifiedUrl == null || !isRobotAllowed(verifiedUrl)) {
                continue;
            }

            // Add the processed URL to crawledList.
            crawledList.add(url);
            String pageContents = downloadPage(verifiedUrl);

            if (pageContents != null && pageContents.length() > 0) {
                // Extract the valid links from the page.
                ArrayList<String> links = retrieveLinks(verifiedUrl,
                        pageContents, crawledList, limitHost);

                toCrawlList.addAll(links);

                if (searchStringMatches(pageContents, searchString,
                        caseSensitive)) {
                    result.add(url);
                    System.out.println(url);
                }
            }

        }
        return result;
    }

    // Main entry point
    public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler("http://www.blogjava.net/Jack2007/", 20, "jack");
        Thread search = new Thread(crawler);
        System.out.println("Start searching...");
        System.out.println("result:");
        search.start();
        try {
            search.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
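
The regular expression in retrieveLinks() does the heavy lifting of link extraction, so it can help to try the same pattern on its own. The sketch below is a minimal standalone check; the class name LinkPatternDemo and the HTML fragment are made up purely for illustration. It prints every captured href value, including the "#..." and "mailto:" entries that retrieveLinks() itself filters out afterwards.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Minimal demo of the link-extraction pattern used in SearchCrawler.retrieveLinks().
// The HTML fragment is invented for illustration only.
public class LinkPatternDemo {
    public static void main(String[] args) {
        String html = "<p><a href=\"http://somehost.com/docs/page.html\">docs</a>"
                + " <A HREF=/about>about</A>"
                + " <a href=\"#top\">top</a>"
                + " <a href=\"mailto:someone@somehost.com\">mail</a></p>";

        // Same pattern as in retrieveLinks(): group 1 captures the href value.
        Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
                Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(html);
        while (m.find()) {
            // Prints the four href values in document order; the crawler would
            // later drop the "#top" and "mailto:" entries and resolve "/about"
            // against the page URL.
            System.out.println(m.group(1).trim());
        }
    }
}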
  • Original article: https://www.cnblogs.com/DuSizhong/p/3365824.html