• 使用jsoup解析html页面内容案例


    public String getFaGuiKuTitles(String type, int page) {
            String href = "http://info.qd-n-tax.gov.cn/NewFaGuiKu/"+type+"/";
            String baseUrl = href + "index";
            
            int no = 0;
            String msg = "";
            
            if(page>0){
                baseUrl = baseUrl + "_"+page;
            }
            
            baseUrl += ".htm";
            
            int totalPage = 0;
            
            List<FaGui> list = new ArrayList<FaGui>();
            
            try {
                URL url = new URL(baseUrl);
                org.jsoup.nodes.Document doc = Jsoup.parse(url, 10000);
                
                org.jsoup.nodes.Element table = doc.select("table").get(0);
                org.jsoup.nodes.Element tbody = table.select("tbody").get(0);
                org.jsoup.select.Elements rows = tbody.select("tr");
                
                int len = rows.size();
                
                for (int i = 0; i < len; i++) {
                    org.jsoup.select.Elements cols = rows.get(i).select("td");
                    
                    FaGui fg = new FaGui();
                    fg.setTitle(cols.get(0).text());
                    fg.setDate(cols.get(1).text());
                    
                    if(cols.size()>2){
                        fg.setFwzh(cols.get(2).text());
                    }
                    
                    
                    org.jsoup.nodes.Element a = cols.get(0).select("a").get(0);
                    fg.setHref(a.attr("href").replaceFirst("./", href));
                    
                    list.add(fg);
                }
                
                //翻页信息
                String pager = doc.getElementsByClass("pager").get(0).html();
                int start = pager.indexOf("(")+1;
                int end = pager.indexOf(",");
                pager = pager.substring(start, end);//截取页面中的总页数
                
                if(pager.matches("\d+")){
                    totalPage = Integer.parseInt(pager);
                }
                
                no = 1;
                msg = "SUCCESS";
                
                log.info("获取税收法规库标题内容", "getFaGuiKuTitles");
            } catch (MalformedURLException ex) {
                Logger.getLogger(LocalServiceImpl.class.getName()).log(Level.SEVERE, null, ex);
                msg = "获取税收法规库标题内容:baseUrl"+baseUrl+"不可用,ex:"+ex;
                log.error(msg, "getFaGuiKuTitles");
            } catch (IOException ex) {
                Logger.getLogger(LocalServiceImpl.class.getName()).log(Level.SEVERE, null, ex);
                msg = "获取税收法规库标题内容:IO异常,ex:"+ex;
                log.error(msg, "getFaGuiKuTitles");
            }        
            
            return ResultUtil.getResult(no, msg, list,totalPage,page);
        }
  • 相关阅读:
    SignalR学习笔记(一) 简单聊天室
    纽约工作日志流水账 Day 2
    纽约工作日志流水账 Day 1
    开放计算平台——数据仓库(Hive)权限控制
    SQL Standard Based Hive Authorization(基于SQL标准的Hive授权)
    jmap(Memory Map For Java)
    Hadoop CombineFileInputFormat实现原理及源码分析
    HiveServer连接优化
    Hive SQL运行状态监控(HiveSQLMonitor)
    jstack(Stack Trace for Java)
  • 原文地址:https://www.cnblogs.com/yshyee/p/4481592.html
Copyright © 2020-2023  润新知