• 网页抓取


    //根据书名,获取相关图书的ISBN号。抓取的是豆瓣读书的搜索页面。
     1 package cn.edu.xmu.zgy;
     2 
     3 import java.io.BufferedReader;
     4 import java.io.InputStreamReader;
     5 import java.net.HttpURLConnection;
     6 import java.net.URL;
     7 import java.util.ArrayList;
     8 
     /**
      * Minimal screen-scraping helper: downloads a page as text and cuts out the
      * fragments that sit between a begin marker and an end marker.
      *
      * <p>The {@link #main(String[])} demo searches Douban Books for a title, collects
      * the detail-page URL of every hit and prints the ISBN found on each page.
      */
     public class WebpageCapture {

         /** Charset used to decode every downloaded page. */
         private static final String PAGE_CHARSET = "utf-8";

         /**
          * Downloads {@code urls} and returns every fragment delimited by the markers.
          *
          * <p>Each returned fragment starts with the begin marker itself and stops
          * right before the end marker; callers strip the begin marker if they do not
          * want it. Line terminators of the page are dropped while reading, so a marker
          * may span what were two adjacent lines in the original document.
          *
          * @param urls   address of the page to download
          * @param begins text that opens a fragment
          * @param ends   text that closes a fragment; must not be empty
          * @return the fragments in document order, empty when nothing matches
          * @throws IllegalArgumentException if {@code ends} is empty
          * @throws Exception if the page cannot be downloaded or decoded
          */
         public ArrayList<String> captureHtml(String urls,String begins,String ends) throws Exception {
             // Fail before any network traffic: an empty end marker never advances the cursor.
             if (ends.length() == 0) {
                 throw new IllegalArgumentException("ends must not be empty");
             }
             return extractBetween(fetchContent(urls), begins, ends);
         }

         /**
          * Downloads the kiees.cn lookup page for {@code postid} and prints its content
          * to standard output.
          *
          * @param postid identifier appended to the {@code wen} query parameter
          * @throws Exception if the page cannot be downloaded or decoded
          */
         public void captureJavascript(String postid) throws Exception {
             String strURL = "http://www.kiees.cn/sf.php?wen=" + postid
                     + "&channel=";
             // Message text is kept as Unicode escapes so the source compiles under any encoding.
             System.out.println("captureJavascript()\u7684\u7ed3\u679c:\n" + fetchContent(strURL));
         }

         /** Reads the whole resource at {@code strURL} into one string, dropping line terminators. */
         private static String fetchContent(String strURL) throws Exception {
             BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new URL(strURL).openStream(), PAGE_CHARSET));
             try {
                 StringBuilder content = new StringBuilder();
                 String line;
                 while ((line = reader.readLine()) != null) {
                     content.append(line);
                 }
                 return content.toString();
             } finally {
                 // Closing the reader also closes the underlying connection stream.
                 reader.close();
             }
         }

         /** Collects every span of {@code content} from a begin marker up to the next end marker. */
         private static ArrayList<String> extractBetween(String content, String begins, String ends) {
             ArrayList<String> fragments = new ArrayList<String>();
             int cursor = 0;
             while (true) {
                 int beginIx = content.indexOf(begins, cursor);
                 if (beginIx == -1) {
                     break;
                 }
                 // The end marker is searched from the start of the begin marker.
                 int endIx = content.indexOf(ends, beginIx);
                 if (endIx == -1) {
                     break;
                 }
                 fragments.add(content.substring(beginIx, endIx));
                 // Resume right after the end marker so an adjacent match is not skipped.
                 cursor = endIx + ends.length();
             }
             return fragments;
         }

         /**
          * Demo: looks up a book title on Douban Books, prints the detail-page URL of
          * every search hit and then the ISBN scraped from each of those pages.
          *
          * @param args ignored
          */
         public static void main(String[] args) {
             WebpageCapture demo = new WebpageCapture();
             try {
                 // Chinese title of "Introduction to Algorithms", written as Unicode escapes.
                 String ip = "\u7b97\u6cd5\u5bfc\u8bba";
                 // Non-ASCII text is not valid in a query string, so percent-encode it.
                 String strURL = "http://book.douban.com/subject_search?search_text="
                         + java.net.URLEncoder.encode(ip, PAGE_CHARSET) + "&cat=1001";
                 String begin = "a class=\"nbg\" href=\"http://book.";
                 // The attribute and "onclick" sit on two page lines; reading joins them.
                 String end = "/\" "
                         + "  onclick=&#34";
                 ArrayList<String> bookUrls = demo.captureHtml(strURL, begin, end);
                 for (int i = 0; i < bookUrls.size(); i++) {
                     String fragment = bookUrls.get(i);
                     // Drop the leading markup so that only the URL remains.
                     bookUrls.set(i, fragment.substring(fragment.indexOf("http:")));
                     System.out.println(bookUrls.get(i));
                 }

                 begin = "ISBN:</span> ";
                 end = "<br/>";
                 for (int i = 0; i < bookUrls.size(); i++) {
                     String bookUrl = bookUrls.get(i);
                     ArrayList<String> isbnMatches = demo.captureHtml(bookUrl, begin, end);
                     if (isbnMatches.isEmpty()) {
                         // A detail page without an ISBN line must not abort the whole run.
                         System.err.println("ISBN not found: " + bookUrl);
                         continue;
                     }
                     // Fragments start with the begin marker; strip it to leave the number.
                     System.out.println(isbnMatches.get(0).substring(begin.length()));
                 }
                 // Alternative demo call: demo.captureJavascript("107818590577");
             } catch (Exception e) {
                 e.printStackTrace();
             }
         }
     }
    View Code

    参考链接:http://blog.csdn.net/zgyulongfei/article/details/7909006

    网页图片下载:http://blog.csdn.net/oyzl68/article/details/9706373

    代码很好

    附带一个看不懂的抓取:http://www.oschina.net/code/snippet_1021353_35133?p=1

  • 相关阅读:
    Spark记录-SparkSQL远程操作MySQL和ORACLE
    Spark记录-Spark on Yarn框架
    Linux记录-重启后磁盘丢失问题解决方案
    Spark记录-阿里巴巴开源工具DataX数据同步工具使用
    Spark记录-SparkSql官方文档中文翻译(部分转载)
    Spark记录-SparkSQL一些操作
    Spark记录-Spark-Shell客户端操作读取Hive数据
    Spark记录-SparkSQL相关学习
    Spark记录-Scala程序例子(函数/List/match/option/泛型/隐式转换)
    CM记录-升级Spark版本到2.x(转载)
  • 原文地址:https://www.cnblogs.com/zzsf/p/4498117.html
Copyright © 2020-2023  润新知