package com.page.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
/**
 * Scrapes novel titles and author names out of web pages and writes them to text files.
 *
 * <p>Every public method downloads one page (decoded as UTF-8), scans it line by line,
 * and writes each captured value to the target file followed by CRLF. I/O failures are
 * reported with {@code printStackTrace()} and are not propagated to the caller.
 *
 * <p>Created 2012.7.14 17:15:00.
 *
 * @author BrinPage
 */
public class WebCrawlers {

    /** Terminator written after every captured value. */
    private static final String LINE_BREAK = "\r\n";

    /** Charset used to decode the downloaded pages. */
    private static final String PAGE_CHARSET = "UTF-8";

    /** Strategy that pulls the wanted text out of a single line of HTML. */
    private interface LineExtractor {
        /**
         * @param line one line of the downloaded page, never {@code null}
         * @return the captured text, or {@code null} when the line holds nothing to capture
         */
        String extract(String line);
    }

    /**
     * Appends the novel titles found on a page to a file.
     *
     * <p>A line is considered when it contains {@code class="h2"><a href=}. The title is
     * taken from 3 characters after the following {@code _blank} up to 1 character before
     * the closing {@code </a>}; these offsets are tuned to the book-list markup of
     * s.zhulang.com (presumably they drop the tag end and a wrapper character on each
     * side of the title; confirm against the live page). Lines whose delimiters are
     * missing or out of order are skipped.
     *
     * @param urlPath  URL of the page to download
     * @param filePath path of the file to append to
     */
    public static void writeNovelName(String urlPath, String filePath){
        scrape(urlPath, filePath, true, false, new LineExtractor() {
            @Override
            public String extract(String line) {
                int marker = line.indexOf("class=\"h2\"><a href=");
                if (marker < 0) {
                    return null;
                }
                int target = line.indexOf("_blank", marker);
                if (target < 0) {
                    return null;
                }
                // Look for the closing tag from "_blank" (not from the computed start),
                // because the +3 offset may already point inside "</a>" for empty titles.
                int close = line.indexOf("</a>", target);
                if (close < 0) {
                    return null;
                }
                return slice(line, target + "_blank".length() + 3, close - 1);
            }
        });
    }

    /**
     * Appends the novel authors found on a page to a file.
     *
     * <p>A line is considered when it contains {@code <li class="h4">} but not the
     * header text {@code 作者}. The author is the text between that tag and the next
     * {@code </li>}. Lines without a closing {@code </li>} are skipped.
     *
     * @param urlPath  URL of the page to download
     * @param filePath path of the file to append to
     */
    public static void writeNovelAuthor(String urlPath, String filePath){
        scrape(urlPath, filePath, true, false, new LineExtractor() {
            @Override
            public String extract(String line) {
                String openTag = "<li class=\"h4\">";
                int marker = line.indexOf(openTag);
                if (marker < 0 || line.indexOf("作者") >= 0) {
                    return null;
                }
                int start = marker + openTag.length();
                return slice(line, start, line.indexOf("</li>", start));
            }
        });
    }

    /**
     * Writes the novel titles found on a page to a file, using caller-supplied markers.
     *
     * <p>A line is considered when it contains {@code start_Name}. The captured value
     * starts after the first {@code >} located at or after that marker and ends before
     * the next occurrence of {@code end_Name}. Lines lacking either delimiter are
     * skipped. The collected text is also echoed to standard output.
     *
     * @param urlPath    URL of the page to download
     * @param filePath   path of the file to write
     * @param flag       {@code true} to append to the file, {@code false} to overwrite it
     * @param start_Name marker text identifying a line that carries a title
     * @param end_Name   text that terminates the title
     */
    public static void writeNovelName(String urlPath, String filePath, boolean flag, String start_Name, String end_Name){
        final String startMarker = start_Name;
        final String endMarker = end_Name;
        scrape(urlPath, filePath, flag, true, new LineExtractor() {
            @Override
            public String extract(String line) {
                int marker = line.indexOf(startMarker);
                if (marker < 0) {
                    return null;
                }
                // Search from the marker so an earlier tag on the same line is ignored.
                int tagEnd = line.indexOf('>', marker);
                if (tagEnd < 0) {
                    return null;
                }
                int start = tagEnd + 1;
                return slice(line, start, line.indexOf(endMarker, start));
            }
        });
    }

    /**
     * Writes the novel authors found on a page to a file, using caller-supplied markers.
     *
     * <p>A line is considered when it contains {@code start_Author}. The captured value
     * starts after the first {@code >} that follows the end of that marker (or directly
     * after the marker when no {@code >} follows) and ends before the next occurrence of
     * {@code end_Author}. Lines without the end marker are skipped. The collected text is
     * also echoed to standard output.
     *
     * @param urlPath      URL of the page to download
     * @param filePath     path of the file to write
     * @param flag         {@code true} to append to the file, {@code false} to overwrite it
     * @param start_Author marker text identifying a line that carries an author
     * @param end_Author   text that terminates the author name
     */
    public static void writeNovelAuthor(String urlPath, String filePath, boolean flag, String start_Author, String end_Author){
        final String startMarker = start_Author;
        final String endMarker = end_Author;
        scrape(urlPath, filePath, flag, true, new LineExtractor() {
            @Override
            public String extract(String line) {
                int marker = line.indexOf(startMarker);
                if (marker < 0) {
                    return null;
                }
                // Continue from the real end of the marker instead of assuming that the
                // marker always begins at column 3.
                int afterMarker = marker + startMarker.length();
                int tagEnd = line.indexOf('>', afterMarker);
                int start = (tagEnd < 0) ? afterMarker : tagEnd + 1;
                return slice(line, start, line.indexOf(endMarker, start));
            }
        });
    }

    /**
     * Downloads a page, collects what {@code extractor} captures from each line, and
     * writes the result to a file. Both the page reader and the file writer are closed
     * before returning, whether or not the transfer succeeded.
     *
     * @param urlPath   URL of the page to download
     * @param filePath  path of the file to write
     * @param append    {@code true} to append to the file, {@code false} to overwrite it
     * @param echo      {@code true} to also print the collected text to standard output
     * @param extractor strategy that captures the wanted text from one line
     */
    private static void scrape(String urlPath, String filePath, boolean append, boolean echo,
            LineExtractor extractor) {
        File target = new File(filePath);
        FileWriter writer = null;
        BufferedReader reader = null;
        try {
            URLConnection connection = new URL(urlPath).openConnection();
            // FileWriter encodes with the JVM's default charset, as the original code did.
            writer = new FileWriter(target, append);
            reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), PAGE_CHARSET));
            StringBuilder collected = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String value = extractor.extract(line);
                if (value != null) {
                    collected.append(value).append(LINE_BREAK);
                }
            }
            if (echo) {
                System.out.println(collected);
            }
            writer.write(collected.toString());
            writer.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeResource(reader);
            closeResource(writer);
        }
    }

    /**
     * Returns {@code line.substring(start, end)}, or {@code null} when the range is not
     * a valid range inside {@code line} (for example because a delimiter was not found).
     */
    private static String slice(String line, int start, int end) {
        if (start < 0 || end < start || end > line.length()) {
            return null;
        }
        return line.substring(start, end);
    }

    /** Closes {@code resource} if it was opened, reporting (not propagating) failures. */
    private static void closeResource(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Crawls result pages 1 to 49999 of the yuncheng.com book category and appends the
     * authors and titles of each page to two text files on drive E:, printing the start
     * and end time of the run.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String firstPage = "http://www.yuncheng.com/category/search?cate=%E5%9B%BE%E4%B9%A6";
        String pageParam = "&pn=";
        CurrentTime time = new CurrentTime();
        System.out.println("写入文件数据(数据量54899条)开始时间:" + time.getCurrentTime());
        for (int i = 1; i < 50000; i++) {
            // Build every URL from the fixed base; page 1 carries no page parameter.
            String pageUrl = (i > 1) ? firstPage + pageParam + i : firstPage;
            WebCrawlers.writeNovelAuthor(pageUrl, "E:\\novel author list.txt", true, "p class=\"info\">作者:<a href", "</a>");
            WebCrawlers.writeNovelName(pageUrl, "E:\\novel name list.txt", true, "a action=\"title\"", "</a>");
        }
        System.out.println("写入文件数据(数据量54899条)结束时间:" + time.getCurrentTime());
    }
}