• Java爬虫学习(2)之用对象保存文件demo(1)


     1 package com.mieba.spider;
     2 
     3 import java.util.ArrayList;
     4 import java.util.List;
     5 import java.util.Vector;
     6 
     7 import us.codecraft.webmagic.Page;
     8 import us.codecraft.webmagic.Site;
     9 import us.codecraft.webmagic.processor.PageProcessor;
    10 import us.codecraft.webmagic.selector.Html;
    11 
    12 public class WanhoPageProcessor implements PageProcessor
    13 {
    14 
    15     private Site site = Site
    16             .me()
    17             .setTimeOut(10000)
    18             .setRetryTimes(3)
    19             .setSleepTime(1000)
    20             .setCharset("UTF-8");
    21 
    22     @Override
    23     public Site getSite()
    24     {
    25         // TODO Auto-generated method stub
    26         return site;
    27     }
    28 
    29     @Override
    30     public void process(Page page)
    31     {
    32         // TODO Auto-generated method stub
    33         //获取当前页的所有喜报
    34          List<String> list = page.getHtml().xpath("//div[@class='main_l']/ul/li").all();
    35         //要保存喜报的集合
    36         Vector<ArticleVo> voLst = new Vector<>();
    37       //遍历喜报
    38         String title;
    39         String content;
    40         String img;
    41         for (String item : list) 
    42         {
    43             Html tmp = Html.create(item);
    44             //标题
    45             title = tmp.xpath("//div[@class='content']/h4/a/text()").toString();
    46             //内容
    47             content = tmp.xpath("//div[@class='content']/p/text()").toString();
    48             //图片路径
    49             img = tmp.xpath("//a/img/@src").toString();
    50             //加入集合
    51             ArticleVo vo = new ArticleVo(title, content, img);
    52             voLst.add(vo);
    53         }
    54       //保存数据至page中,后续进行持久化
    55         page.putField("e_list", voLst);
    56       //加载其它页
    57         page.addTargetRequests( getOtherUrls());
    58     }
    59     
    60     
    61     //其它页
    62     public List<String> getOtherUrls()
    63     {
    64          List<String> urlLsts = new ArrayList<>();
    65          for(int i=2;i<7;i++){
    66              urlLsts.add("http://www.wanho.net/a/jyxb/list_15_"+i+".html");
    67          }
    68         return urlLsts;
    69     }
    70 
    71 }
      1 package com.mieba.spider;
      2 
      3 import java.io.BufferedInputStream;
      4 import java.io.BufferedOutputStream;
      5 import java.io.File;
      6 import java.io.FileNotFoundException;
      7 import java.io.FileOutputStream;
      8 import java.io.FileWriter;
      9 import java.io.IOException;
     10 import java.io.InputStream;
     11 import java.io.PrintWriter;
     12 import java.net.MalformedURLException;
     13 import java.net.URL;
     14 import java.net.URLConnection;
     15 import java.util.Vector;
     16 
     17 import us.codecraft.webmagic.ResultItems;
     18 import us.codecraft.webmagic.Task;
     19 import us.codecraft.webmagic.pipeline.Pipeline;
     20 
     21 public class WanhoPipeline implements Pipeline
     22 {
     23 
     24     @Override
     25     public void process(ResultItems resultItems, Task arg1)
     26     {
     27         // TODO Auto-generated method stub
     28         // 获取抓取过程中保存的数据
     29         Vector<ArticleVo> voLst = resultItems.get("e_list");
     30         // 持久到文件中
     31         PrintWriter pw = null;
     32         try
     33         {
     34             pw = new PrintWriter(new FileWriter("wanho.txt", true));
     35             for (ArticleVo vo : voLst)
     36             {
     37                 pw.println(vo);
     38                 pw.flush();
     39                 saveImg(vo.getImg());
     40             }
     41         } catch (FileNotFoundException e)
     42         {
     43             e.printStackTrace();
     44         } catch (IOException e)
     45         {
     46             e.printStackTrace();
     47         } finally
     48         {
     49             pw.close();
     50         }
     51     }
     52 
     53     private void saveImg(String img)
     54     {
     55         // TODO Auto-generated method stub
     56         String imgUrl = "http://www.wanho.net" + img;
     57         InputStream is = null;
     58         BufferedInputStream bis = null;
     59         BufferedOutputStream bos = null;
     60         try
     61         {
     62             URL url = new URL(imgUrl);
     63             URLConnection uc = url.openConnection();
     64             is = uc.getInputStream();
     65             bis = new BufferedInputStream(is);
     66             File photoFile = new File("photo");
     67             if (!photoFile.exists())
     68             {
     69                 photoFile.mkdirs();
     70             }
     71             String imgName = img.substring(img.lastIndexOf("/") + 1);
     72             File saveFile = new File(photoFile, imgName);
     73             bos = new BufferedOutputStream(new FileOutputStream(saveFile));
     74             byte[] bs = new byte[1024];
     75             int len;
     76             while ((len = bis.read(bs)) != -1)
     77             {
     78                 bos.write(bs, 0, len);
     79             }
     80 
     81         } catch (MalformedURLException e)
     82         {
     83             // TODO: handle exception
     84             e.printStackTrace();
     85         } catch (IOException e)
     86         {
     87             e.printStackTrace();
     88         } finally
     89         {
     90             try
     91             {
     92                 bos.close();
     93             } catch (IOException e)
     94             {
     95                 e.printStackTrace();
     96             }
     97             try
     98             {
     99                 bis.close();
    100             } catch (IOException e)
    101             {
    102                 e.printStackTrace();
    103             }
    104             try
    105             {
    106                 is.close();
    107             } catch (IOException e)
    108             {
    109                 e.printStackTrace();
    110             }
    111 
    112         }
    113     }
    114 
    115 }
     1 package com.mieba.spider;
     2 
     /**
      * Mutable value object for one crawled entry: a title, a text snippet and
      * the path of its image. Any of the three values may be {@code null}.
      */
     public class ArticleVo
     {
         private String title;
         private String content;
         private String img;

         /**
          * Creates an entry with all three values set.
          *
          * @param title   the entry title
          * @param content the entry text
          * @param img     the image path
          */
         public ArticleVo(String title, String content, String img)
         {
             this.title = title;
             this.content = content;
             this.img = img;
         }

         /** @return the entry title */
         public String getTitle()
         {
             return this.title;
         }

         /** @param title the new entry title */
         public void setTitle(String title)
         {
             this.title = title;
         }

         /** @return the entry text */
         public String getContent()
         {
             return this.content;
         }

         /** @param content the new entry text */
         public void setContent(String content)
         {
             this.content = content;
         }

         /** @return the image path */
         public String getImg()
         {
             return this.img;
         }

         /** @param img the new image path */
         public void setImg(String img)
         {
             this.img = img;
         }

         /**
          * Renders the entry as
          * {@code ArticleVo [title=..., content=..., img=...]}.
          */
         @Override
         public String toString()
         {
             StringBuilder text = new StringBuilder("ArticleVo [title=");
             text.append(title);
             text.append(", content=").append(content);
             text.append(", img=").append(img);
             text.append("]");
             return text.toString();
         }
     }
    package com.mieba.spider;
    
    import us.codecraft.webmagic.Spider;
    
    /**
     * Entry point: crawls the wanho.net listing with {@link WanhoPageProcessor}
     * and persists the results through {@link WanhoPipeline}.
     */
    public class Demo
    {
        /**
         * Configures the spider and runs it.
         *
         * @param args command-line arguments (unused)
         */
        public static void main(String[] args)
        {
            // Extraction logic.
            Spider spider = Spider.create(new WanhoPageProcessor());
            // Where extracted results are saved.
            spider.addPipeline(new WanhoPipeline());
            // First page to crawl.
            spider.addUrl("http://www.wanho.net/a/jyxb/");
            // Number of worker threads.
            spider.thread(5);
            // Start crawling.
            spider.run();
        }
    }
    
    
            

    爬取到的照片(保存在运行目录下的 photo 文件夹中)

     爬取到的喜报(以追加方式写入 wanho.txt 文件)

     如需运行以上代码,只需在项目中引入 WebMagic 的依赖包即可。

  • 相关阅读:
    JavaScript Eval 函数使用
    WPFToolkit Calendar & DatePicker 使用介绍
    Windows Mobile 6.5 配置环境,数据库访问,部署简单实例
    ThreadPool.QueueUserWorkItem 方法 (WaitCallback)
    Windws Mobile 6.5 Professional ADO.NET数据访问
    WPF调用Web Services
    c#中Interface的理解
    PagesSection.MaintainScrollPositionOnPostBack 属性
    EclipseRCP中文语言包版本不一致,导致导出错误
    SWT美化开源控件网站
  • 原文地址:https://www.cnblogs.com/quxiangjia/p/12326249.html
Copyright © 2020-2023  润新知