网页信息的抓取以及 URL、URLConnection 的使用

package com.page.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

/**
 * Scrapes novel titles and author names out of web pages.
 *
 * <p>Every {@code write*} method downloads the page behind a URL (decoded as UTF-8), scans it
 * line by line for a marker string, cuts the text of interest out of each matching line and
 * writes the collected values to a local file, one value per line terminated by CRLF.
 * Lines that contain the marker but not the expected closing delimiter are skipped.
 * I/O failures are reported with {@code printStackTrace()} and otherwise ignored, so one
 * unreachable page does not abort a multi-page crawl.
 *
 * <p>Created 2012-07-14 17:15:00.
 *
 * @author BrinPage
 */
public class WebCrawlers {

    /** Character set used to decode the downloaded pages. */
    private static final String PAGE_CHARSET = "UTF-8";

    /** Terminator written after every extracted value. */
    private static final String LINE_END = "\r\n";

    /** Strategy that pulls the value of interest out of a single line of HTML. */
    private interface LineExtractor {
        /**
         * @param line one line of the downloaded page, never {@code null}
         * @return the extracted text, or {@code null} when the line holds no usable value
         */
        String extract(String line);
    }

    /**
     * Extractor for lines containing {@code class="h2"><a href=}: the title is the text
     * between {@code _blank} and the following {@code </a>}.
     */
    private static final LineExtractor H2_TITLE_EXTRACTOR = new LineExtractor() {
        @Override
        public String extract(String line) {
            final String target = "_blank";
            int markerAt = line.indexOf("class=\"h2\"><a href=");
            if (markerAt < 0) {
                return null;
            }
            int targetAt = line.indexOf(target, markerAt);
            if (targetAt < 0) {
                return null;
            }
            // Fixed offsets kept from the original: skip 3 characters after "_blank" and drop
            // 1 character before "</a>". NOTE(review): presumably padding specific to the
            // scraped site's markup - confirm against the live page.
            int start = targetAt + target.length() + 3;
            int close = line.indexOf("</a>", start);
            return close < 0 ? null : slice(line, start, close - 1);
        }
    };

    /**
     * Extractor for lines containing {@code <li class="h4">} but not the text
     * {@code 作者}: the value is everything between the marker and the following
     * {@code </li>}.
     */
    private static final LineExtractor H4_AUTHOR_EXTRACTOR = new LineExtractor() {
        @Override
        public String extract(String line) {
            final String marker = "<li class=\"h4\">";
            int markerAt = line.indexOf(marker);
            if (markerAt < 0 || line.indexOf("作者") >= 0) {
                return null;
            }
            int start = markerAt + marker.length();
            return slice(line, start, line.indexOf("</li>", start));
        }
    };

    /**
     * Extractor driven by caller-supplied markers: locates the start marker, moves past the
     * next {@code '>'} and returns the text up to the end marker.
     */
    private static final class TagTextExtractor implements LineExtractor {
        private final String startMarker;
        private final String endMarker;
        /** When true the {@code '>'} search begins after the marker instead of at its start. */
        private final boolean searchAfterMarker;

        TagTextExtractor(String startMarker, String endMarker, boolean searchAfterMarker) {
            this.startMarker = startMarker;
            this.endMarker = endMarker;
            this.searchAfterMarker = searchAfterMarker;
        }

        @Override
        public String extract(String line) {
            int markerAt = line.indexOf(startMarker);
            if (markerAt < 0) {
                return null;
            }
            int searchFrom = searchAfterMarker ? markerAt + startMarker.length() : markerAt;
            int tagClose = line.indexOf('>', searchFrom);
            if (tagClose < 0) {
                return null;
            }
            int start = tagClose + 1;
            return slice(line, start, line.indexOf(endMarker, start));
        }
    }

    /**
     * Scrapes novel titles from a page and appends them to a file.
     *
     * <p>A line matches when it contains {@code class="h2"><a href=}; the title is taken from
     * behind the {@code _blank} attribute up to the closing {@code </a>}.
     *
     * @param urlPath  URL of the page to scrape
     * @param filePath path of the file the titles are appended to
     */
    public static void writeNovelName(String urlPath, String filePath) {
        scrape(urlPath, filePath, true, false, H2_TITLE_EXTRACTOR);
    }

    /**
     * Scrapes novel authors from a page and appends them to a file.
     *
     * <p>A line matches when it contains {@code <li class="h4">} and does not contain the
     * text {@code 作者}; the author is the text between the marker and {@code </li>}.
     *
     * @param urlPath  URL of the page to scrape
     * @param filePath path of the file the authors are appended to
     */
    public static void writeNovelAuthor(String urlPath, String filePath) {
        scrape(urlPath, filePath, true, false, H4_AUTHOR_EXTRACTOR);
    }

    /**
     * Scrapes novel titles from a page using caller-supplied markers and echoes the result to
     * standard output before writing it to the file.
     *
     * <p>For every line containing {@code start_Name}, the title is the text between the first
     * {@code '>'} found at or after the marker and the next occurrence of {@code end_Name}.
     *
     * @param urlPath    URL of the page to scrape
     * @param filePath   path of the output file
     * @param flag       {@code true} to append to the file, {@code false} to overwrite it
     * @param start_Name marker identifying a line that holds a title
     * @param end_Name   delimiter that ends the title
     */
    public static void writeNovelName(String urlPath, String filePath, boolean flag, String start_Name, String end_Name) {
        scrape(urlPath, filePath, flag, true, new TagTextExtractor(start_Name, end_Name, false));
    }

    /**
     * Scrapes novel authors from a page using caller-supplied markers and echoes the result
     * to standard output before writing it to the file.
     *
     * <p>For every line containing {@code start_Author}, the author is the text between the
     * first {@code '>'} found after the end of the marker and the next occurrence of
     * {@code end_Author}.
     *
     * @param urlPath      URL of the page to scrape
     * @param filePath     path of the output file
     * @param flag         {@code true} to append to the file, {@code false} to overwrite it
     * @param start_Author marker identifying a line that holds an author
     * @param end_Author   delimiter that ends the author name
     */
    public static void writeNovelAuthor(String urlPath, String filePath, boolean flag, String start_Author, String end_Author) {
        scrape(urlPath, filePath, flag, true, new TagTextExtractor(start_Author, end_Author, true));
    }

    /**
     * Downloads a page, runs the extractor over every line and writes the collected values.
     *
     * <p>The output file is opened before the page is requested. Both the writer and the
     * reader are closed when the method returns, whether or not an error occurred.
     *
     * @param urlPath   URL of the page to scrape
     * @param filePath  path of the output file
     * @param append    {@code true} to append, {@code false} to overwrite
     * @param echo      whether the collected text is also printed to standard output
     * @param extractor strategy that cuts the value out of a line
     */
    private static void scrape(String urlPath, String filePath, boolean append, boolean echo,
            LineExtractor extractor) {
        try (FileWriter writer = new FileWriter(new File(filePath), append);
                BufferedReader reader = openPage(urlPath)) {
            StringBuilder collected = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String value = extractor.extract(line);
                if (value != null) {
                    collected.append(value).append(LINE_END);
                }
            }
            if (echo) {
                System.out.println(collected);
            }
            writer.write(collected.toString());
            writer.flush();
        } catch (IOException e) {
            // Best effort: report the failed page and let the caller continue with the next one.
            e.printStackTrace();
        }
    }

    /**
     * Opens a buffered UTF-8 reader on the content behind the given URL.
     *
     * @param urlPath URL of the page to read
     * @return reader over the page content; the caller must close it
     * @throws IOException if the URL is malformed or the content cannot be opened
     */
    private static BufferedReader openPage(String urlPath) throws IOException {
        URLConnection connection = new URL(urlPath).openConnection();
        return new BufferedReader(new InputStreamReader(connection.getInputStream(), PAGE_CHARSET));
    }

    /**
     * Returns {@code line.substring(start, end)}, or {@code null} when the bounds do not
     * describe a valid range (for example because a delimiter was not found).
     */
    private static String slice(String line, int start, int end) {
        if (start < 0 || end < start || end > line.length()) {
            return null;
        }
        return line.substring(start, end);
    }

    /**
     * Crawls the paged book listing on www.yuncheng.com and appends authors and titles to two
     * text files on drive E:, printing the start and end time of the run.
     *
     * <p>The first page is requested without a page parameter; every later page is requested
     * as the base URL plus {@code &pn=<page>}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // An earlier, disabled crawl of s.zhulang.com used the two-argument overloads
        // writeNovelName(url, file) / writeNovelAuthor(url, file); that commented-out code
        // has been removed.
        String baseUrl = "http://www.yuncheng.com/category/search?cate=%E5%9B%BE%E4%B9%A6";
        String pageParam = "&pn=";
        CurrentTime time = new CurrentTime();
        System.out.println("写入文件数据（数据量54899条）开始时间：" + time.getCurrentTime());
        for (int i = 1; i < 50000; i++) {
            // Build each page URL from the base; the parameter must not accumulate across pages.
            String pageUrl = (i > 1) ? baseUrl + pageParam + i : baseUrl;
            // Authors go to the author list and titles to the name list.
            WebCrawlers.writeNovelAuthor(pageUrl, "E:\\novel author list.txt", true, "p class=\"info\">作者：<a href", "</a>");
            WebCrawlers.writeNovelName(pageUrl, "E:\\novel name list.txt", true, "a action=\"title\"", "</a>");
        }
        System.out.println("写入文件数据（数据量54899条）结束时间：" + time.getCurrentTime());
    }

}

原文地址：https://www.cnblogs.com/Jiphen/p/2593804.html