Jsoup解析网页html
解析网页demo:
利用Jsoup获取截图中的数据信息:
html代码片段:
1 <!-- 当前基金档案计算定投开户 start --> 2 <div class="wrapper"> 3 <div class="wrapper_min"> 4 <div class="merchandiseDetail"> 5 <div class="fundDetail-header"> 6 <div class="fundDetail-tit"> 7 <div style="float: left">兴全社会责任混合 8 <span>(</span> 9 <span class="ui-num">340007</span></div>)</div> 10 <div class="fundDetail-tools"> 11 <a class="jijinba" href="http://guba.eastmoney.com/list,of340007.html">基金吧</a> 12 <!-- 未自选 start --> 13 <a class="addSel" id="addSel" href="javascript:;" target="_self">加自选</a> 14 <!-- 未自选 end --> 15 <a class="addCom" id="addCom" href="javascript:;" target="_self" onclick="common.addCompare()">加对比</a> 16 <a class="addDownApp" href="http://fundact.eastmoney.com/app/">手机版天天基金下载</a></div> 17 </div> 18 <div class="fundDetail-main"> 19 <!-- 档案 start --> 20 <div class="fundInfoItem"> 21 <!--开放式基金收益率模块--> 22 <div class="dataOfFund"> 23 <dl class="dataItem01"> 24 <dt> 25 <p> 26 <span> 27 <span class="sp01">净值估算</span></span> 28 <span id="gz_gztime">(17-12-20 15:00)</span> 29 <span class="infoTips"> 30 <span class="tipsBubble" style="display: none;">净值估算每个交易日9:30-15:00盘中实时更新(QDII基金为海外交易时段),是按照基金持仓、指数走势和基金过往业绩估算,估算数据并不代表真实净值,仅供参考,请以基金管理人披露净值为准。</span></span> 31 </p> 32 </dt> 33 <dd class="dataNums"> 34 <dl class="floatleft"> 35 <span class="ui-font-large ui-color-green ui-num" id="gz_gsz">3.7576</span></dl> 36 <dl id="gz_icon" class="gzdown"></dl> 37 <dl class="floatleft fundZdf"> 38 <span class="ui-font-middle ui-color-green ui-num" id="gz_gszze">0.0594</span> 39 <span class="ui-font-middle ui-color-green ui-num" id="gz_gszzl">-1.56%</span></dl> 40 </dd> 41 <dd> 42 <span>近1月:</span> 43 <span class="ui-font-middle ui-color-green ui-num">-4.62%</span></dd> 44 <dd> 45 <span>近1年:</span> 46 <span class="ui-font-middle ui-color-red ui-num">44.20%</span></dd> 47 </dl> 48 <span class="dataOfFund-line"></span> 49 <dl class="dataItem02"> 50 <dt> 51 <p> 52 <span class="ui-color-blue"> 53 <span class="sp01"> 54 <a href="http://fund.eastmoney.com/f10/jjjz_340007.html">单位净值</a></span>(</span>2017-12-19)</p> 55 </dt> 56 <dd class="dataNums"> 57 <span class="ui-font-large ui-color-red ui-num">3.8170</span> 58 <span class="ui-font-middle ui-color-red ui-num">1.41%</span></dd> 59 <dd> 60 <span>近3月:</span> 61 <span class="ui-font-middle ui-color-red ui-num">13.47%</span></dd> 62 <dd> 63 <span>近3年:</span> 64 <span class="ui-font-middle ui-color-red ui-num">113.48%</span></dd> 65 </dl> 66 <span class="dataOfFund-line"></span> 67 <dl class="dataItem03"> 68 <dt> 69 <p> 70 <span class="ui-color-blue"> 71 <span class="sp01"> 72 <a href="http://fund.eastmoney.com/f10/jjjz_340007.html">累计净值</a></span> 73 </span> 74 </p> 75 </dt> 76 <dd class="dataNums"> 77 <span class="ui-font-large ui-color-red ui-num">4.0070</span></dd> 78 <dd> 79 <span>近6月:</span> 80 <span class="ui-font-middle ui-color-red ui-num">25.35%</span></dd> 81 <dd> 82 <span>成立来:</span> 83 <span class="ui-font-middle ui-color-red ui-num">332.92%</span></dd> 84 </dl> 85 </div> 86 <div class="infoOfFund"> 87 <div class="infoOfFund-line"></div> 88 <table> 89 <tr> 90 <td>基金类型: 91 <a href="http://fund.eastmoney.com/HH_jzzzl.html#os_0;isall_0;ft_;pt_3">混合型</a> | 中高风险</td> 92 <td> 93 <a href="http://fund.eastmoney.com/f10/gmbd_340007.html">基金规模</a>:76.83亿元(2017-09-30)</td> 94 <td>基金经理: 95 <a href="http://fund.eastmoney.com/f10/jjjl_340007.html">傅鹏博</a></td> 96 </tr> 97 <tr> 98 <td> 99 <span class="letterSpace01">成 立 日</span>:2008-04-30</td> 100 <td> 101 <span class="letterSpace01">管 理 人</span>: 102 <a href="http://fund.eastmoney.com/company/80036742.html">兴全基金</a></td> 103 <td> 104 <a class="floatleft" href="http://fund.eastmoney.com/f10/jjpj_340007.html">基金评级</a> 105 <span class="floatleft">:</span> 106 <div class="jjpj4"></div> 107 </td> 108 </tr> 109 </table> 110 </div> 111 </div> 112 <!-- 档案 end -->
java实现代码:
/** * Project Name:wlpc * File Name:XyzqTask.java * Package Name:com.xyzq.wlpc.task * Date:2017年12月20日下午1:48:16 * Copyright (c) 2017 All Rights Reserved. * */ import java.io.IOException;;import net.sf.json.JSONObject; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements;/** * ClassName:XyzqTask * Function: TODO * Reason: TODO * Date: 2017年12月20日 下午1:48:16 * @author lizm * @since JDK 1.6 * */ public class XyzqTask extends BaseTask { private void getHtml(){ String url = ""; url = Pub.getPropertiesValue("wlpc", "wlpc.web.url"); try { Document doc = Jsoup.connect(url).get(); //class等于fundDetail-tit的div标签 Elements fundDetail_tit = doc.select("div.fundDetail-tit"); for (Element element : fundDetail_tit){ //特殊字符'(',使用 \( 或 [(] System.out.println("fundDetail_tit>>>>:"+element.text().split("\(")[0]); //获取div下的第一个span的class为ui-num的值 Document elementDoc = Jsoup.parse(element.toString()); Element elm = elementDoc.select("span.ui-num").first(); System.out.println("elm>>>>:"+elm.text()); } //id等于gz_gztime的span标签 Elements gz_gztime = doc.select("span#gz_gztime"); for (Element element : gz_gztime){ System.out.println("gz_gztime>>>>:"+element.text().replace("(", "").replace(")", "")); } //id等于gz_gsz的span标签 Elements gz_gsz = doc.select("span#gz_gsz"); for (Element element : gz_gsz){ System.out.println("gz_gsz>>>>:"+element.text()); } //id等于gz_gszze的span标签 Elements gz_gszze = doc.select("span#gz_gszze"); for (Element element : gz_gszze){ System.out.println("gz_gszze>>>>:"+element.text()); } //id等于gz_gszzl的span标签 Elements gz_gszzl = doc.select("span#gz_gszzl"); for (Element element : gz_gszzl){ System.out.println("gz_gszzl>>>>:"+element.text()); } //class等于dataItem02的dl标签 Elements dataItem02 = doc.select("dl.dataItem02"); for (Element element : dataItem02){ Document elementDoc = Jsoup.parse(element.toString()); Element elm1 = elementDoc.getElementsByTag("p").first(); System.out.println("elm>>>>:"+elm1.text().replace("单位净值 (", "").replace(")", "")); Element elm_dd = elementDoc.select("dd.dataNums").first(); Document doc_dd = Jsoup.parse(elm_dd.toString()); Element elm_dd_span1 = doc_dd.getElementsByTag("span").first(); System.out.println("elm_dd_span1>>>:"+elm_dd_span1.text()); Element elm_dd_span2 = doc_dd.getElementsByTag("span").last(); System.out.println("elm_dd_span2>>>:"+elm_dd_span2.text()); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void main(String[] args) { XyzqTask client = new XyzqTask(); client.getHtml(); } }
输出结果:
fundDetail_tit>>>>:兴全社会责任混合 elm>>>>:340007 gz_gztime>>>>:17-12-21 15:00 gz_gsz>>>>:3.8583 gz_gszze>>>>:+0.0933 gz_gszzl>>>>:+2.48% elm>>>>:2017-12-20 elm_dd_span1>>>:3.7650 elm_dd_span2>>>:-1.36%