• Java 读取 HTML 文件,截取 <body> 标签中的内容


     1     public String readfile(String filePath){
     2         File file = new File(filePath);  
     3         InputStream input = null;
     4         try {
     5             input = new FileInputStream(file);
     6         } catch (FileNotFoundException e) {
     7             e.printStackTrace();
     8         }  
     9         StringBuffer buffer = new StringBuffer();  
    10         byte[] bytes = new byte[1024];
    11         try {
    12             for(int n ; (n = input.read(bytes))!=-1 ; ){  
    13                 buffer.append(new String(bytes,0,n,"GBK"));  
    14             }
    15         } catch (IOException e) {
    16             e.printStackTrace();
    17         }
    18 //        System.out.println(buffer);
    19         return buffer.toString();  
    20     }
    21     
    22      public String getBody(String val) {
    23           String start = "<body>";
    24           String end = "</body>";
    25           int s = val.indexOf(start) + start.length();
    26           int e = val.indexOf(end);
    27         return val.substring(s, e);
    28     }
    29     
     1     public static void main(String [] args){
     2         OaDao m = new OaDao();
     3 //        String sql = "SELECT sth,xdh FROM TK_ST_0331 where sth='022012050101131000100' and rownum <=10";
     4         String sql = "select t.sth , t.stgjz ,t.stly, x.mc from TK_ST_0331 t ,TK_STK_ST_0331 k,TK_TX X  where t.sth = k.sth AND X.BH = t.tx and rownum <10 ";
     5         List<OaVo> datalist= m.findAll(sql);
     6         for(OaVo vo : datalist){
     7             System.out.println(vo.getVal1()+"///"+vo.getVal2());
     8             
     9 //            String sth = "022012010100000100100";
    10             String sth = vo.getVal1();
    11             String kmh = sth.substring(0, 2);    //科目号
    12             String nf = sth.substring(2, 6);    //年份
    13             String yf = sth.substring(6,10);    //月份
    14             String serialno = sth.substring(10, 16);    //序列号
    15             String stxl = sth.substring(16, 19);    //题型
    16             String path ="/"+kmh+"/"+nf+"/"+yf+"/"+serialno+"/"+stxl+"/";
    17             
    18             String tm_path ="H:/tk_source/"+kmh+"/"+yf+"/"+serialno+"/"+stxl+"/"+sth+"_tm.htm";
    19             String da_path ="H:/tk_source/"+kmh+"/"+yf+"/"+serialno+"/"+stxl+"/"+sth+"_da.htm";
    20             String jx_path ="H:/tk_source/"+kmh+"/"+yf+"/"+serialno+"/"+stxl+"/"+sth+"_jx.htm";
    21             
    22     //        String path = "H:/tk_source/02/0101/000001/001/022012010100000100100_da.htm";
    23             
    24             String tm = m.getBody(m.readfile(tm_path));
    25             System.out.println("----------------------题目------------------------------");
    26             System.out.println(tm);
    27             
    28             String da = m.getBody(m.readfile(da_path));
    29             System.out.println("----------------------答案------------------------------");
    30             System.out.println(da);
    31             
    32             
    33             String jx = m.getBody(m.readfile(da_path));
    34             System.out.println("----------------------解析------------------------------");
    35             System.out.println(jx);
    36         }
    37     }
     1 /**
     2      * 从HTML源码中提取图片路径,最后以一个 String 类型的 List 返回,如果不包含任何图片,则返回一个 size=0 的List
     3      * 需要注意的是,此方法只会提取以下格式的图片:.jpg|.bmp|.eps|.gif|.mif|.miff|.png|.tif|.tiff|.svg|.wmf|.jpe|.jpeg|.dib|.ico|.tga|.cut|.pic
     4      * @param htmlCode HTML源码
     5      * @return <img>标签 src 属性指向的图片地址的List集合
     6      * @author Carl He
     7      */
     8     public static List<String> getImageSrc(String htmlCode) {
     9         List<String> imageSrcList = new ArrayList<String>();
    10         Pattern p = Pattern.compile("<img//b[^>]*//bsrc//b//s*=//s*('|/")?([^'/"/n/r/f>]+(//.jpg|//.bmp|//.eps|//.gif|//.mif|//.miff|//.png|//.tif|//.tiff|//.svg|//.wmf|//.jpe|//.jpeg|//.dib|//.ico|//.tga|//.cut|//.pic)//b)[^>]*>", Pattern.CASE_INSENSITIVE);
    11         Matcher m = p.matcher(htmlCode);
    12         String quote = null;
    13         String src = null;
    14         while (m.find()) {
    15             quote = m.group(1);
    16             src = (quote == null || quote.trim().length() == 0) ? m.group(2).split("//s+")[0] : m.group(2);
    17             imageSrcList.add(src);
    18         }
    19         return imageSrcList;
    20     }
  • 相关阅读:
    hadoop 2.5 hdfs namenode –format 出错Usage: java NameNode [-backup] |
    自己动手编译hadoop-2.5.2源码
    CentOS Linux解决Device eth0 does not seem to be present
    Liz Murray成功故事的偶然与必然(转)
    【BZOJ4242】水壶(克鲁斯卡尔重构树,BFS)
    【BZOJ3551】Peaks加强版(Kruskal重构树,主席树)
    【agc023E】Inversions(线段树,动态规划)
    【CF183D】T-shirt(动态规划,贪心)
    【BZOJ2423】最长公共子序列(动态规划)
    【BZOJ2118】墨墨的等式(最短路)
  • 原文地址:https://www.cnblogs.com/huanglibin/p/6671202.html
Copyright © 2020-2023  润新知