import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Reads the content of an HTML page and extracts one table block from it.
 *
 * <p>A "block" is every {@code <tr>...</tr>} row whose first cell starts with a
 * given keyword (default {@code "ced"}). The cells of those rows are flattened
 * into a string array.
 */
public class ReadHtml {

    /** Page fetched when the caller passes no URL. */
    private static final String DEFAULT_URL =
            "http://whois.asia/cgi-bin/whois.cgi?whois_query_field=today";

    /** Keyword used when the caller passes none. */
    private static final String DEFAULT_KEYWORD = "ced";

    /** Matches one complete {@code <td ...>...</td>} cell. */
    private static final Pattern CELL_PATTERN =
            Pattern.compile("<td[^>]*>[\\s\\S]*?<\\/td>", Pattern.CASE_INSENSITIVE);

    /** Matches one complete {@code <a ...>...</a>} element, including its text. */
    private static final Pattern ANCHOR_PATTERN =
            Pattern.compile("<a[^>]*>[\\s\\S]*?<\\/a>", Pattern.CASE_INSENSITIVE);

    /** Matches an opening cell tag, with or without attributes. */
    private static final Pattern CELL_OPEN_TAG =
            Pattern.compile("<td[^>]*>", Pattern.CASE_INSENSITIVE);

    /** Matches a closing cell tag. */
    private static final Pattern CELL_CLOSE_TAG =
            Pattern.compile("</td>", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the HTML source of a page.
     *
     * <p>All lines of the response are joined into a single string, each line
     * followed by one space. Failures are reported with
     * {@code printStackTrace()} and are not propagated: whatever was read
     * before the failure (possibly nothing) is returned.
     *
     * @param urlPath page address, e.g.
     *        {@code http://whois.asia/cgi-bin/whois.cgi?whois_query_field=today};
     *        {@code null} or empty selects that default address
     * @return the page source on one line; never {@code null}
     */
    public static String GetCEDSource(String urlPath)
    {
        StringBuilder document = new StringBuilder();
        BufferedReader reader = null;
        try
        {
            if (isNullOrEmpty(urlPath))
            {
                urlPath = DEFAULT_URL;
            }
            URL url = new URL(urlPath);
            URLConnection conn = url.openConnection();
            // NOTE(review): no charset is given, so the platform default is used
            // to decode the response -- confirm this matches the target page.
            reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            String line;
            while ((line = reader.readLine()) != null)
            {
                document.append(line).append(' ');
            }
        }
        catch (Exception e)
        {
            // Deliberate best effort: callers get the partial/empty document.
            e.printStackTrace();
        }
        finally
        {
            // Close the stream even when reading failed half-way.
            if (reader != null)
            {
                try
                {
                    reader.close();
                }
                catch (IOException ignored)
                {
                    // Nothing useful can be done about a failed close.
                }
            }
        }
        return document.toString();
    }

    /**
     * Extracts the default ("ced") block of the given page.
     *
     * @param url page address, e.g.
     *        {@code http://whois.asia/cgi-bin/whois.cgi?whois_query_field=today}
     * @return the cell contents of the block, or {@code null} if nothing was found
     */
    public static String[] GetCEDContent(String url){
        return GetCEDContent(url, "");
    }

    /**
     * Extracts the default ("ced") block of the default page.
     *
     * @return the cell contents of the block, or {@code null} if nothing was found
     */
    public static String[] GetCEDContent(){
        return GetCEDContent("", "");
    }

    /**
     * Extracts the block of a page whose first cell starts with {@code keyword}.
     *
     * @param url page address; {@code null} or empty selects the default page
     * @param keyword regular-expression fragment the first cell of a row must
     *        start with (inserted into the row pattern as-is, matched
     *        case-insensitively); {@code null} or empty selects {@code "ced"}
     * @return the cell contents of the block, or {@code null} if nothing was found
     */
    public static String[] GetCEDContent(String url, String keyword)
    {
        return extractBlock(GetCEDSource(url), keyword);
    }

    /**
     * Extracts one block of an HTML page as a string array. Behaves exactly like
     * {@link #GetCEDContent(String, String)}.
     *
     * @param url page address; {@code null} or empty selects the default page
     * @param keyword regular-expression fragment the first cell of a row must
     *        start with; {@code null} or empty selects {@code "ced"}
     * @return the cell contents of the block, or {@code null} if nothing was found
     */
    public static String[] GetHtmlContentByGroup(String url, String keyword)
    {
        return extractBlock(GetCEDSource(url), keyword);
    }

    /**
     * Splits a comma-separated string.
     *
     * @param str string to split
     * @return the comma-separated parts, or {@code null} when {@code str} is
     *         {@code null} or empty
     */
    public static String[] SplitStr(String str)
    {
        if (isNullOrEmpty(str))
        {
            return null;
        }
        return str.split(",");
    }

    /**
     * Replaces every opening and closing {@code td} tag with a comma.
     *
     * <p>Tags are recognised case-insensitively and with attributes, the same
     * way the cell pattern used for extraction recognises them.
     *
     * @param str string containing {@code <td>...</td>} cells; must not be {@code null}
     * @return {@code str} with each {@code td} tag replaced by {@code ","}
     */
    public static String ReplaceStr(String str)
    {
        str = CELL_OPEN_TAG.matcher(str).replaceAll(",");
        str = CELL_CLOSE_TAG.matcher(str).replaceAll(",");
        return str;
    }

    /**
     * Manual test: prints the default block of the default page, one cell per line.
     *
     * @param s unused
     * @throws IOException declared for compatibility; not thrown by this method itself
     */
    public static void main(String[] s) throws IOException {
        String[] test = GetCEDContent();
        if (test == null)
        {
            // Nothing matched (or the page could not be read); avoid iterating null.
            System.err.println("No matching block found.");
            return;
        }
        for (String html : test)
        {
            System.out.println(html);
        }
    }

    /** Pulls the keyword block out of already-downloaded HTML and splits it into cells. */
    private static String[] extractBlock(String html, String keyword)
    {
        if (isNullOrEmpty(keyword))
        {
            keyword = DEFAULT_KEYWORD;
        }
        // Rows of the form <tr><td>keyword ... </td></tr>.
        String rowRegex = "<tr[^>]*>[\\s]*<td[^>]*>[\\s]*(" + keyword + "){1}[\\s\\S]*?<\\/td>[\\s]*<\\/tr>";
        Pattern rowPattern = Pattern.compile(rowRegex, Pattern.CASE_INSENSITIVE);

        String rows = joinMatches(rowPattern, html);
        String cells = joinMatches(CELL_PATTERN, rows);

        String content = ReplaceStr(cells);
        if (content.length() < 2)
        {
            // No cell was found, so there is no leading/trailing comma to strip.
            return null;
        }
        // Drop the comma produced by the first opening and the last closing tag.
        content = content.substring(1, content.length() - 1);

        // Remove anchors (tag and text); content without anchors is kept as-is.
        String withoutAnchors = ANCHOR_PATTERN.matcher(content).replaceAll("");
        return SplitStr(withoutAnchors);
    }

    /** Concatenates every match of {@code pattern} in {@code input}, in order. */
    private static String joinMatches(Pattern pattern, String input)
    {
        StringBuilder joined = new StringBuilder();
        Matcher matcher = pattern.matcher(input);
        while (matcher.find())
        {
            joined.append(matcher.group());
        }
        return joined.toString();
    }

    /** Returns true for {@code null} and for any zero-length string. */
    private static boolean isNullOrEmpty(String value)
    {
        return value == null || value.length() == 0;
    }
}