这个类并不是一个通用的工具类,需要按自己的需求来实现;这里只是记录了 htmlparser.jar 包的一些用法,仅此而已!
详细说明请看这里:http://gundumw100.iteye.com/blog/704311
- import java.util.*;
- import org.htmlparser.Node;
- import org.htmlparser.NodeFilter;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.AndFilter;
- import org.htmlparser.filters.HasAttributeFilter;
- import org.htmlparser.filters.NodeClassFilter;
- import org.htmlparser.filters.TagNameFilter;
- import org.htmlparser.tags.BodyTag;
- import org.htmlparser.tags.LinkTag;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- /**
- * httpclient与htmlparse对网页的解析
- *
- * @author Administrator
- *
- */
- public class HtmlparseUtil {
- WebHttpClient util=new WebHttpClient();
- /**
- * 获得网页中的超链接,将href和text保存在Map中:map(href,text)
- * @param url
- * @param charset
- * @return
- */
- public Map<String, String> linkGet(String url, String charset) {
- String content=util.getWebContentByGet(url,charset);
- Map<String, String> linkMap = new HashMap<String, String>();
- try {
- //开始解析
- Parser parser = Parser.createParser(content, charset);
- // 过滤出<a></a>标签
- NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
- NodeList list = parser.extractAllNodesThatMatch(linkFilter);
- Node node = null;
- for (int i = 0; i < list.size(); i++) {
- node = list.elementAt(i);
- // 获得网页中的链接map(href,text)
- linkMap.put(((LinkTag) node).getLink(), this.processText(((LinkTag) node).getLinkText()));
- }
- } catch (ParserException e) {
- e.printStackTrace();
- }
- return linkMap;
- }
- /**
- * 获得网页<body></body>标签中的内容, 保存在body中
- * @param url
- * @param charset
- * @return
- */
- public String bodyGet(String url, String charset) {
- String content=util.getWebContentByGet(url,charset);
- String body = "";
- try {
- Parser parser = Parser.createParser(content, charset);
- // 过滤<body></body>标签
- NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class);
- NodeList list = parser.extractAllNodesThatMatch(bodyFilter);
- Node node = null;
- for (int i = 0; i < list.size(); i++) {
- node = list.elementAt(i);
- // 获得网页内容 保存在content中
- body = ((BodyTag) node).getBody();
- }
- } catch (ParserException e) {
- e.printStackTrace();
- }
- return body;
- }
- /**
- * 过滤出class为term的<span>元素,并获得他们的文本
- * @param url
- * @param charset
- * @return
- */
- public Map<String,String> termGet(String url, String charset) {
- String content=util.getWebContentByGet(url,charset);
- Map<String, String> map = new HashMap<String, String>();
- try {
- //开始解析
- // 过滤出class为term的<span>元素
- Parser parser = Parser.createParser(content, charset);
- AndFilter filter =
- new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","term"));
- Node node = null;
- NodeList nodeList = parser.parse(filter);
- for (int i = 0; i < nodeList.size(); i++) {
- node = nodeList.elementAt(i);
- map.put("term", node.toPlainTextString());
- }
- // 过滤出class为start-time的<span>元素
- Parser parser2 = Parser.createParser(content, charset);
- AndFilter filter2 =
- new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","start-time"));
- NodeList nodeList2 = parser2.parse(filter2);
- for (int i = 0; i < nodeList2.size(); i++) {
- node = nodeList2.elementAt(i);
- map.put("start-time", node.toPlainTextString());
- }
- // 过滤出id为J_SingleEndTimeLabel的<span>元素
- Parser parser3 = Parser.createParser(content, charset);
- AndFilter filter3 =
- new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("id","J_SingleEndTimeLabel"));
- NodeList nodeList3 = parser3.parse(filter3);
- for (int i = 0; i < nodeList3.size(); i++) {
- node = nodeList3.elementAt(i);
- map.put("end-time", node.toPlainTextString());
- }
- // 过滤出class为box post的<div>元素
- Parser parser4 = Parser.createParser(content, charset);
- AndFilter filter4 =
- new AndFilter(new TagNameFilter("div"),new HasAttributeFilter("class","box post"));
- NodeList nodeList4 = parser4.parse(filter4);
- for (int i = 0; i < nodeList4.size(); i++) {
- node = nodeList4.elementAt(i);
- String temp=node.toPlainTextString().trim();
- temp=temp.substring(10,20).trim();
- map.put("pre-term", temp);
- }
- // 过滤出class为J_AwardNumber的<span>元素
- Parser parser5 = Parser.createParser(content, charset);
- // AndFilter filter5 =
- // new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","J_AwardNumber"));
- NodeList nodeList5 = parser5.parse(new HasAttributeFilter("class","J_AwardNumber"));
- StringBuffer buffer=new StringBuffer();
- for (int i = 0; i < nodeList5.size(); i++) {
- node = nodeList5.elementAt(i);
- buffer.append(","+node.toPlainTextString());
- }
- buffer.append("|");
- // 过滤出class为blue J_AwardNumber的<span>元素
- Parser parser6 = Parser.createParser(content, charset);
- // AndFilter filter6 =
- // new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","blue J_AwardNumber"));
- NodeList nodeList6 = parser6.parse(new HasAttributeFilter("class","blue J_AwardNumber"));
- for (int i = 0; i < nodeList6.size(); i++) {
- node = nodeList6.elementAt(i);
- buffer.append(node.toPlainTextString()+",");
- }
- map.put("numbers", buffer.toString());
- } catch (ParserException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return map;
- }
- private String processText(String content){
- content=content.trim().replaceAll(" ", "");
- // content=content.replaceAll("<p>", " ");
- // content=content.replaceAll("</TD>", "");
- // content=content.replaceAll("</div>", "");
- // content=content.replaceAll("</a>", "");
- // content=content.replaceAll("<a href=.*>", "");
- return content;
- }
- public static void main(String[] str) {
- String url="http://caipiao.taobao.com/lottery/order/lottery_dlt.htm?type=1";
- HtmlparseUtil util=new HtmlparseUtil();
- Map<String,String> map=util.termGet(url, "gb2312");
- System.out.println("term="+map.get("term"));//<span class="term">第<em>10074</em>期</span>
- System.out.println("start-time="+map.get("start-time"));//
- System.out.println("end-time="+map.get("end-time"));//
- System.out.println("pre-term="+map.get("pre-term"));//
- System.out.println("numbers="+map.get("numbers"));//
- /*
- Map<String, String> linkMap = util.linkGet(url, "gb2312");
- for (String s : linkMap.keySet()) {
- System.out.println(s + " = " + linkMap.get(s));
- //如果是个链接,则再获取它的<body>中的内容
- // if (s.startsWith("http")) {
- // util.bodyGet(s, "gb2312");
- // }
- }
- */
- }
- }