• 通过网络得到html,并解析出其中网址(JAVA程序)


    网络版程序:

    import java.io.BufferedReader;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.List;
    
    /**
     * Downloads an HTML page over the network and collects the https URLs found
     * in its {@code src="..."} attributes.
     *
     * <p>{@link #main(String[])} prints the number of URLs found followed by one
     * URL per line.
     */
    public class TestIndex {

        private String rootUrl = "http://localhost/apk/";
        private String listUrl = rootUrl + "test-index.htm";
        private static List<String> imageUrlList = new ArrayList<String>();

        /**
         * Fetches the page, then prints the URL count and every URL.
         *
         * @param args unused
         */
        public static void main(String args[]) {
            TestIndex ti = new TestIndex();
            ti.getData();
            System.out.println(imageUrlList.size());
            for (String imageUrl : imageUrlList) {
                System.out.println(imageUrl);
            }
        }

        /**
         * Opens a connection to {@code urlStr} and returns its response stream.
         *
         * @param urlStr address of the page to download
         * @return the open stream, or {@code null} when the connection fails
         */
        private InputStream getNetInputStream(String urlStr) {
            try {
                URL url = new URL(urlStr);
                URLConnection conn = url.openConnection();
                conn.connect();
                return conn.getInputStream();
            } catch (Exception e) {
                // Report the failure instead of silently hiding it; the caller
                // still receives null, as before.
                System.err.println("Cannot open " + urlStr + ": " + e);
                return null;
            }
        }

        /**
         * Reads the page at {@code listUrl} as utf-8 and replaces the content of
         * {@code imageUrlList} with the URLs it contains. When the page cannot be
         * read the list is left untouched.
         */
        private void getData() {
            InputStream is = getNetInputStream(listUrl);
            if (is == null) {
                return;
            }
            try {
                // Explicit charset: the platform default may garble non-ASCII text.
                BufferedReader br = new BufferedReader(new InputStreamReader(is, "utf-8"));
                StringBuilder html = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    html.append(line);
                }
                List<String> urls = extractImageUrls(html.toString());
                imageUrlList.clear();
                imageUrlList.addAll(urls);
            } catch (java.io.IOException e) {
                System.err.println("Cannot read " + listUrl + ": " + e);
            } finally {
                try {
                    is.close();
                } catch (java.io.IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }

        /**
         * Returns the value of every {@code src="https://..."} attribute in
         * {@code html}, in document order.
         *
         * <p>Each URL ends at the attribute's closing double quote, so the result
         * does not depend on which attribute follows {@code src}.
         *
         * @param html the page text; {@code null} is treated as empty
         * @return the extracted URLs, never {@code null}
         */
        static List<String> extractImageUrls(String html) {
            List<String> urls = new ArrayList<String>();
            if (html == null) {
                return urls;
            }
            String startStr = "src=\"https://";
            int valueOffset = "src=\"".length();
            int index = 0;
            while (true) {
                int start = html.indexOf(startStr, index);
                if (start < 0) {
                    break;
                }
                int valueStart = start + valueOffset;
                int end = html.indexOf('"', valueStart);
                if (end < 0) {
                    // Unterminated attribute: there is no complete URL left.
                    break;
                }
                urls.add(html.substring(valueStart, end));
                index = end + 1;
            }
            return urls;
        }
    }

    本地版程序:

    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.util.ArrayList;
    import java.util.List;
    
    /**
     * Reads an HTML page from a local file and collects the https URLs found in
     * its {@code src="..."} attributes.
     *
     * <p>{@link #main(String[])} prints the number of URLs found followed by one
     * URL per line.
     */
    public class TestIndex_IO {

        private String rootUrl = "D:/Hixin/webandroid/";
        private String listUrl = rootUrl + "test-index.htm";
        private static List<String> imageUrlList = new ArrayList<String>();

        /**
         * Reads the file, then prints the URL count and every URL.
         *
         * @param args unused
         */
        public static void main(String args[]) {
            TestIndex_IO ti = new TestIndex_IO();
            ti.getData();
            System.out.println(imageUrlList.size());
            for (String imageUrl : imageUrlList) {
                System.out.println(imageUrl);
            }
        }

        /**
         * Opens the local file at {@code urlStr}. Despite the name, no network
         * access takes place here.
         *
         * @param urlStr path of the file to open
         * @return the open stream, or {@code null} when the file does not exist
         */
        private InputStream getNetInputStream(String urlStr) {
            try {
                return new FileInputStream(new File(urlStr));
            } catch (FileNotFoundException e) {
                e.printStackTrace();
                return null;
            }
        }

        /**
         * Reads the file at {@code listUrl} as utf-8 and replaces the content of
         * {@code imageUrlList} with the URLs it contains. When the file cannot be
         * read the list is left untouched.
         */
        private void getData() {
            InputStream is = getNetInputStream(listUrl);
            if (is == null) {
                return;
            }
            try {
                // Explicit charset: the platform default may garble non-ASCII text.
                BufferedReader br = new BufferedReader(new InputStreamReader(is, "utf-8"));
                StringBuilder html = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    html.append(line);
                }
                List<String> urls = extractImageUrls(html.toString());
                imageUrlList.clear();
                imageUrlList.addAll(urls);
            } catch (java.io.IOException e) {
                System.err.println("Cannot read " + listUrl + ": " + e);
            } finally {
                try {
                    is.close();
                } catch (java.io.IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }

        /**
         * Returns the value of every {@code src="https://..."} attribute in
         * {@code html}, in document order.
         *
         * <p>Each URL ends at the attribute's closing double quote, so the result
         * does not depend on which attribute follows {@code src}.
         *
         * @param html the page text; {@code null} is treated as empty
         * @return the extracted URLs, never {@code null}
         */
        static List<String> extractImageUrls(String html) {
            List<String> urls = new ArrayList<String>();
            if (html == null) {
                return urls;
            }
            String startStr = "src=\"https://";
            int valueOffset = "src=\"".length();
            int index = 0;
            while (true) {
                int start = html.indexOf(startStr, index);
                if (start < 0) {
                    break;
                }
                int valueStart = start + valueOffset;
                int end = html.indexOf('"', valueStart);
                if (end < 0) {
                    // Unterminated attribute: there is no complete URL left.
                    break;
                }
                urls.add(html.substring(valueStart, end));
                index = end + 1;
            }
            return urls;
        }
    }


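    两个程序的差别仅在于 private InputStream getNetInputStream(String urlStr) 函数的实现:网络版通过 URLConnection 打开输入流,本地版通过 FileInputStream 打开输入流。为避免中文乱码,建议在创建 InputStreamReader 时显式指定编码:InputStreamReader isr = new InputStreamReader(is,"utf-8"); 下面对比不指定编码与指定 utf-8 两种写法读到的字符数: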

                // Read the whole page into one string (line terminators are dropped)
                // and print its length in characters. No charset is passed to the
                // reader, so the platform default charset decodes the bytes.
                InputStream is = getNetInputStream(listUrl);
                String html;
                try {
                    InputStreamReader isr = new InputStreamReader(is);
                    BufferedReader br = new BufferedReader(isr);
                    // StringBuilder avoids re-copying the text on every line.
                    StringBuilder sb = new StringBuilder();
                    String s = null;
                    while ((s = br.readLine()) != null)
                    {
                        sb.append(s);
                    }
                    html = sb.toString();
                } finally {
                    // Close the stream even when reading fails.
                    is.close();
                }
                System.out.println(html.length());

    输出结果为:77300

                // Read the whole page into one string (line terminators are dropped)
                // and print its length in characters. The bytes are decoded
                // explicitly as utf-8 rather than with the platform default charset.
                InputStream is = getNetInputStream(listUrl);
                String html;
                try {
                    InputStreamReader isr = new InputStreamReader(is,"utf-8");
                    BufferedReader br = new BufferedReader(isr);
                    // StringBuilder avoids re-copying the text on every line.
                    StringBuilder sb = new StringBuilder();
                    String s = null;
                    while ((s = br.readLine()) != null)
                    {
                        sb.append(s);
                    }
                    html = sb.toString();
                } finally {
                    // Close the stream even when reading fails.
                    is.close();
                }
                System.out.println(html.length());

    输出结果为:77135

     // Target directory and file name for the saved copy of the page.
     private String writeUrl = "D:/newfile/new/new";
     private String fileName = "test-index.htm";

                // Create the target directory tree if it does not exist yet.
                File f = new File(writeUrl);
                if (!f.exists()) {
                    f.mkdirs();
                }
                File f1 = new File(f, fileName);
                FileOutputStream fos = new FileOutputStream(f1);
                try {
                    // Write as utf-8 so the saved file does not depend on the
                    // platform default charset.
                    OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8");
                    osw.write(html, 0, html.length());
                    osw.flush();
                } finally {
                    // Closing the underlying stream releases the file handle even
                    // when the write fails.
                    fos.close();
                }


    解析出htm文件中包含的网址。

    结果:

    20
    https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRvQgUjsVDBncM3mVIgIyIuE87BnlyJUy2BNsAp8kUoTanrC_css5mVAw
    https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcThd8cYjOTmCgYJZxX5ls-xpxaAlH1_yocOSCqI5_7OkL29SNtbCZ7q2Yoj
    https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTl-FzKmsppxuwzmTITGCv9uDxmrWr1pG0lw8mUD9wkWIloASxQeBEMnVjz
    https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcQWbmiZJIXKHV2IoTBp7zSY6kD5g5VPzVtBTLJYYR5nwTtKi2-0_u93qL4e
    https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSlrLi_GtVgUehU7coFe1eMdrJxPdvS42iTqXkla0g75s31NBfAq2u1LE4
    https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcSkrlyGxSs8Dr_7k3MUvoGq1vE45LgHZ0zEhIEdD9LLZiaoMcE7IAqn8ho
    https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTu__OUSJ4R4EKBu4jOi2ZAdHohpVQIBy3-SfnI8FYpN8wVC9kJG_aWuk_w
    https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcR3Bf7YtsHJ813A5_wWzpxIy4MbEmqz5NLw3qv1nPxOZqVjH7QlY-qYSCg
    https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcToB4nJPqVwnzn0xeasnXyhxGgOqHXdypE6KZIMTfV9k52eIrE3iYsA6Ixm
    https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTkKw0LpqdB2eQMUpwdQdvM9DTeNtq1mrvMNivoQtN37p3m0OPsx4ME9i4O
    https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSZGzMf_3hmdDktz91yp5ZQi-eGWLCenZ0U446sXT2nqYuwlWRI_V_BVIWi
    https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTQF-55T5GM3dLdaoafPdlIYK0ESNvM6-Bsb4-B2rQTeyD5gGoCKxokExM-
    https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcRoRjo4TFeXmx47zE6VH0ylcO0IQ2HBsOHYIMJCI9MsRyg_PF1WhHbqG76Q
    https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcRrdegt1koEy51dLWrJAbVMJBlCEZ7fPl2mztYYM6onvxocRCq030Ft1gE
    https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTtnQpte0uq9Ue9nsg25GeO1kw_-Hcn69ozTQkiMBHrXKwlANutyhwKD9XM
    https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRNRdxzmuFKABoGgyv2SC0gMticosL2LB3V1fBMOwNtVBZxHkyMw4IcWBFj
    https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQr40CEf75nWCj5dg-oeKtb9zK6mhktu7vnfoYAh5ioy34goC3c9ptDkQwP
    https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcQUnyHrVEbppqhZnWnQrijhBFP0X34gRf7pKw6PdT4ggepB2k9g-p71sgGh
    https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcR9Us9qblbTJaw47gULXCI8sHKN4I61gYsT2ijebtZzgsMDI8GmYqQpIIw
    https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSIrW-IbBZjM9Ztn60r9QE1_FIMjt494qGX12tqsLsibYPLuFVwyVSgz1I

     用正则表达式更简单:

     1 InputStream is = getNetInputStream(listUrl);
     2             InputStreamReader isr = new InputStreamReader(is);
     3             BufferedReader br = new BufferedReader(isr);
     4             String s = "";
     5             
     6             while ((s = br.readLine()) != null)
     7             {
     8                 Pattern p = Pattern.compile("src="https[^"]+");
     9                 Matcher m = p.matcher(s);
    10                 while(m.find()) {
    11                     System.out.println(m.group());
    12                 }
    13             }
     1 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRvQgUjsVDBncM3mVIgIyIuE87BnlyJUy2BNsAp8kUoTanrC_css5mVAw
     2 src="https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcThd8cYjOTmCgYJZxX5ls-xpxaAlH1_yocOSCqI5_7OkL29SNtbCZ7q2Yoj
     3 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTl-FzKmsppxuwzmTITGCv9uDxmrWr1pG0lw8mUD9wkWIloASxQeBEMnVjz
     4 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcQWbmiZJIXKHV2IoTBp7zSY6kD5g5VPzVtBTLJYYR5nwTtKi2-0_u93qL4e
     5 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcSlrLi_GtVgUehU7coFe1eMdrJxPdvS42iTqXkla0g75s31NBfAq2u1LE4
     6 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcSkrlyGxSs8Dr_7k3MUvoGq1vE45LgHZ0zEhIEdD9LLZiaoMcE7IAqn8ho
     7 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTu__OUSJ4R4EKBu4jOi2ZAdHohpVQIBy3-SfnI8FYpN8wVC9kJG_aWuk_w
     8 src="https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcR3Bf7YtsHJ813A5_wWzpxIy4MbEmqz5NLw3qv1nPxOZqVjH7QlY-qYSCg
     9 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcToB4nJPqVwnzn0xeasnXyhxGgOqHXdypE6KZIMTfV9k52eIrE3iYsA6Ixm
    10 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTkKw0LpqdB2eQMUpwdQdvM9DTeNtq1mrvMNivoQtN37p3m0OPsx4ME9i4O
    11 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSZGzMf_3hmdDktz91yp5ZQi-eGWLCenZ0U446sXT2nqYuwlWRI_V_BVIWi
    12 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTQF-55T5GM3dLdaoafPdlIYK0ESNvM6-Bsb4-B2rQTeyD5gGoCKxokExM-
    13 src="https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcRoRjo4TFeXmx47zE6VH0ylcO0IQ2HBsOHYIMJCI9MsRyg_PF1WhHbqG76Q
    14 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcRrdegt1koEy51dLWrJAbVMJBlCEZ7fPl2mztYYM6onvxocRCq030Ft1gE
    15 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcTtnQpte0uq9Ue9nsg25GeO1kw_-Hcn69ozTQkiMBHrXKwlANutyhwKD9XM
    16 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRNRdxzmuFKABoGgyv2SC0gMticosL2LB3V1fBMOwNtVBZxHkyMw4IcWBFj
    17 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQr40CEf75nWCj5dg-oeKtb9zK6mhktu7vnfoYAh5ioy34goC3c9ptDkQwP
    18 src="https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcQUnyHrVEbppqhZnWnQrijhBFP0X34gRf7pKw6PdT4ggepB2k9g-p71sgGh
    19 src="https://encrypted-tbn1.gstatic.com/images?q=tbn:ANd9GcR9Us9qblbTJaw47gULXCI8sHKN4I61gYsT2ijebtZzgsMDI8GmYqQpIIw
    20 src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSIrW-IbBZjM9Ztn60r9QE1_FIMjt494qGX12tqsLsibYPLuFVwyVSgz1I
  • 相关阅读:
    查询已存入数据库中的图片,并显示出来
    图像插值的缺点
    windows网络服务之配置网络负载均衡(NLB)群集
    QR 码的位置检测符
    二维条码识别系统设计原理
    教你看懂Code128条形码
    DataMatrix二维条码源码分析检测识别图像位置
    C#条形码生成(五)----Web下的测试
    C# 生产成条形码3种方法
    屏蔽弹出对话框
  • 原文地址:https://www.cnblogs.com/hixin/p/4158930.html
Copyright © 2020-2023  润新知