• 网页抓取


    //根据书名,获取相关图书的ISBN号。抓取的是豆瓣读书的搜索页面。
     package cn.edu.xmu.zgy;

     import java.io.BufferedReader;
     import java.io.IOException;
     import java.io.InputStreamReader;
     import java.net.HttpURLConnection;
     import java.net.URL;
     import java.net.URLConnection;
     import java.net.URLEncoder;
     import java.nio.charset.StandardCharsets;
     import java.util.ArrayList;
     8 
     /**
      * Small screen-scraping demo: downloads a page and cuts out every excerpt
      * that lies between a begin marker and an end marker.
      *
      * <p>{@link #main(String[])} uses it to search Douban Books for a title and
      * print the ISBN found on each result page.
      */
     public class WebpageCapture {

         /** Upper bound for establishing a connection, in milliseconds. */
         private static final int CONNECT_TIMEOUT_MS = 10000;

         /** Upper bound for waiting on data from the server, in milliseconds. */
         private static final int READ_TIMEOUT_MS = 15000;

         /**
          * Downloads {@code urls} and returns every excerpt that starts with
          * {@code begins} and stops right before the next {@code ends}.
          *
          * <p>Each returned excerpt still contains the begin marker itself but not
          * the end marker. Line breaks of the page are removed before matching, so
          * a marker may span what was a line break in the original document.
          *
          * @param urls   address of the page to download
          * @param begins text that opens an excerpt; must not be empty
          * @param ends   text that closes an excerpt; must not be empty
          * @return the excerpts in document order; empty when nothing matches
          * @throws IllegalArgumentException if a marker is {@code null} or empty
          * @throws Exception                if the page cannot be downloaded
          */
         public ArrayList<String> captureHtml(String urls, String begins, String ends) throws Exception {
             // An empty marker matches everywhere and would never let the scan advance.
             if (begins == null || begins.isEmpty() || ends == null || ends.isEmpty()) {
                 throw new IllegalArgumentException("begins and ends must be non-empty markers");
             }
             return extractSegments(fetchPage(urls), begins, ends);
         }

         /**
          * Queries the kiees.cn tracking endpoint for {@code postid} and prints the
          * raw response to standard output.
          *
          * @param postid identifier sent as the {@code wen} query parameter
          * @throws Exception if the page cannot be downloaded
          */
         public void captureJavascript(String postid) throws Exception {
             String strURL = "http://www.kiees.cn/sf.php?wen="
                     + URLEncoder.encode(postid, StandardCharsets.UTF_8.name())
                     + "&channel=";
             System.out.println("captureJavascript()的结果:\n" + fetchPage(strURL));
         }

         /**
          * Reads the whole document behind {@code address} as UTF-8 text, with the
          * line terminators dropped (lines are concatenated directly).
          */
         private static String fetchPage(String address) throws IOException {
             // URLConnection (not a cast to HttpURLConnection) so that non-HTTP
             // schemes such as file: work as well.
             URLConnection connection = new URL(address).openConnection();
             connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
             connection.setReadTimeout(READ_TIMEOUT_MS);

             StringBuilder content = new StringBuilder();
             // Closing the reader also closes the underlying connection stream.
             try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                 String line;
                 while ((line = reader.readLine()) != null) {
                     content.append(line);
                 }
             }
             return content.toString();
         }

         /**
          * Collects every span of {@code content} that starts at {@code begins} and
          * runs up to (excluding) the following {@code ends}.
          */
         private static ArrayList<String> extractSegments(String content, String begins, String ends) {
             ArrayList<String> segments = new ArrayList<String>();
             int searchFrom = 0;
             while (true) {
                 int beginIx = content.indexOf(begins, searchFrom);
                 if (beginIx == -1) {
                     break;
                 }
                 // Look for the end only after the begin marker so that the excerpt
                 // always carries the complete begin marker.
                 int endIx = content.indexOf(ends, beginIx + begins.length());
                 if (endIx == -1) {
                     break;
                 }
                 segments.add(content.substring(beginIx, endIx));
                 // Resume right after the end marker (not a fixed number of characters).
                 searchFrom = endIx + ends.length();
             }
             return segments;
         }

         /**
          * Searches Douban Books for a fixed title, prints the URL of every hit and
          * then the ISBN text scraped from each of those pages.
          */
         public static void main(String[] args) {
             WebpageCapture demo = new WebpageCapture();
             try {
                 String ip = "算法导论";
                 // The keyword is non-ASCII and must be percent-encoded for the query string.
                 String strURL = "http://book.douban.com/subject_search?search_text="
                         + URLEncoder.encode(ip, StandardCharsets.UTF_8.name()) + "&cat=1001";
                 String begin = "a class=\"nbg\" href=\"http://book.";
                 // The end marker spans a line break of the page; it matches because
                 // line terminators are dropped when the page is read.
                 String end = "/\" " + "  onclick=&#34";

                 ArrayList<String> bookUrls = demo.captureHtml(strURL, begin, end);
                 for (int i = 0; i < bookUrls.size(); i++) {
                     String excerpt = bookUrls.get(i);
                     // Strip the leading anchor markup, keeping only the URL.
                     bookUrls.set(i, excerpt.substring(excerpt.indexOf("http:")));
                     System.out.println(bookUrls.get(i));
                 }

                 begin = "ISBN:</span> ";
                 end = "<br/>";
                 for (String bookUrl : bookUrls) {
                     ArrayList<String> isbns = demo.captureHtml(bookUrl, begin, end);
                     if (isbns.isEmpty()) {
                         // Not every page lists an ISBN; keep going with the next book.
                         System.err.println("No ISBN found on " + bookUrl);
                         continue;
                     }
                     // Excerpts still start with the begin marker; cut it off.
                     System.out.println(isbns.get(0).substring(begin.length()));
                 }
                 //demo.captureJavascript("107818590577");
             } catch (Exception e) {
                 e.printStackTrace();
             }
         }
     }
    View Code

    参考链接:http://blog.csdn.net/zgyulongfei/article/details/7909006

    网页图片下载:http://blog.csdn.net/oyzl68/article/details/9706373

    代码很好

    附带一个看不懂的抓取:http://www.oschina.net/code/snippet_1021353_35133?p=1

  • 相关阅读:
    全面了解Nginx主要应用场景
    手把手教你构建 C 语言编译器
    Docker镜像原理和最佳实践
    Docker网络深度解读
    PostgreSQL 10.0 preview 功能增强
    阿里沈询:分布式事务原理与实践
    CPU、内存、IO虚拟化关键技术及其优化探索
    原理、方法双管齐下,大神带你细解Redis内存管理和优化---场景研读
    ASP.NET 5已终结,迎来ASP.NET Core 1.0和.NET Core 1.0 转
    RabbitMQ学习系列
  • 原文地址:https://www.cnblogs.com/zzsf/p/4498117.html
Copyright © 2020-2023  润新知