1 package com.mieba.spider; 2 3 import java.util.ArrayList; 4 import java.util.List; 5 import java.util.Vector; 6 7 import us.codecraft.webmagic.Page; 8 import us.codecraft.webmagic.Site; 9 import us.codecraft.webmagic.processor.PageProcessor; 10 import us.codecraft.webmagic.selector.Html; 11 12 public class WanhoPageProcessor implements PageProcessor 13 { 14 15 private Site site = Site 16 .me() 17 .setTimeOut(10000) 18 .setRetryTimes(3) 19 .setSleepTime(1000) 20 .setCharset("UTF-8"); 21 22 @Override 23 public Site getSite() 24 { 25 // TODO Auto-generated method stub 26 return site; 27 } 28 29 @Override 30 public void process(Page page) 31 { 32 // TODO Auto-generated method stub 33 //获取当前页的所有喜报 34 List<String> list = page.getHtml().xpath("//div[@class='main_l']/ul/li").all(); 35 //要保存喜报的集合 36 Vector<ArticleVo> voLst = new Vector<>(); 37 //遍历喜报 38 String title; 39 String content; 40 String img; 41 for (String item : list) 42 { 43 Html tmp = Html.create(item); 44 //标题 45 title = tmp.xpath("//div[@class='content']/h4/a/text()").toString(); 46 //内容 47 content = tmp.xpath("//div[@class='content']/p/text()").toString(); 48 //图片路径 49 img = tmp.xpath("//a/img/@src").toString(); 50 //加入集合 51 ArticleVo vo = new ArticleVo(title, content, img); 52 voLst.add(vo); 53 } 54 //保存数据至page中,后续进行持久化 55 page.putField("e_list", voLst); 56 //加载其它页 57 page.addTargetRequests( getOtherUrls()); 58 } 59 60 61 //其它页 62 public List<String> getOtherUrls() 63 { 64 List<String> urlLsts = new ArrayList<>(); 65 for(int i=2;i<7;i++){ 66 urlLsts.add("http://www.wanho.net/a/jyxb/list_15_"+i+".html"); 67 } 68 return urlLsts; 69 } 70 71 }
1 package com.mieba.spider; 2 3 import java.io.BufferedInputStream; 4 import java.io.BufferedOutputStream; 5 import java.io.File; 6 import java.io.FileNotFoundException; 7 import java.io.FileOutputStream; 8 import java.io.FileWriter; 9 import java.io.IOException; 10 import java.io.InputStream; 11 import java.io.PrintWriter; 12 import java.net.MalformedURLException; 13 import java.net.URL; 14 import java.net.URLConnection; 15 import java.util.Vector; 16 17 import us.codecraft.webmagic.ResultItems; 18 import us.codecraft.webmagic.Task; 19 import us.codecraft.webmagic.pipeline.Pipeline; 20 21 public class WanhoPipeline implements Pipeline 22 { 23 24 @Override 25 public void process(ResultItems resultItems, Task arg1) 26 { 27 // TODO Auto-generated method stub 28 // 获取抓取过程中保存的数据 29 Vector<ArticleVo> voLst = resultItems.get("e_list"); 30 // 持久到文件中 31 PrintWriter pw = null; 32 try 33 { 34 pw = new PrintWriter(new FileWriter("wanho.txt", true)); 35 for (ArticleVo vo : voLst) 36 { 37 pw.println(vo); 38 pw.flush(); 39 saveImg(vo.getImg()); 40 } 41 } catch (FileNotFoundException e) 42 { 43 e.printStackTrace(); 44 } catch (IOException e) 45 { 46 e.printStackTrace(); 47 } finally 48 { 49 pw.close(); 50 } 51 } 52 53 private void saveImg(String img) 54 { 55 // TODO Auto-generated method stub 56 String imgUrl = "http://www.wanho.net" + img; 57 InputStream is = null; 58 BufferedInputStream bis = null; 59 BufferedOutputStream bos = null; 60 try 61 { 62 URL url = new URL(imgUrl); 63 URLConnection uc = url.openConnection(); 64 is = uc.getInputStream(); 65 bis = new BufferedInputStream(is); 66 File photoFile = new File("photo"); 67 if (!photoFile.exists()) 68 { 69 photoFile.mkdirs(); 70 } 71 String imgName = img.substring(img.lastIndexOf("/") + 1); 72 File saveFile = new File(photoFile, imgName); 73 bos = new BufferedOutputStream(new FileOutputStream(saveFile)); 74 byte[] bs = new byte[1024]; 75 int len; 76 while ((len = bis.read(bs)) != -1) 77 { 78 bos.write(bs, 0, len); 79 } 80 81 } 
catch (MalformedURLException e) 82 { 83 // TODO: handle exception 84 e.printStackTrace(); 85 } catch (IOException e) 86 { 87 e.printStackTrace(); 88 } finally 89 { 90 try 91 { 92 bos.close(); 93 } catch (IOException e) 94 { 95 e.printStackTrace(); 96 } 97 try 98 { 99 bis.close(); 100 } catch (IOException e) 101 { 102 e.printStackTrace(); 103 } 104 try 105 { 106 is.close(); 107 } catch (IOException e) 108 { 109 e.printStackTrace(); 110 } 111 112 } 113 } 114 115 }
1 package com.mieba.spider; 2 3 public class ArticleVo 4 { 5 private String title; 6 private String content; 7 private String img; 8 public String getTitle() 9 { 10 return title; 11 } 12 public void setTitle(String title) 13 { 14 this.title = title; 15 } 16 public String getContent() 17 { 18 return content; 19 } 20 public void setContent(String content) 21 { 22 this.content = content; 23 } 24 public String getImg() 25 { 26 return img; 27 } 28 public void setImg(String img) 29 { 30 this.img = img; 31 } 32 public ArticleVo(String title, String content, String img) 33 { 34 super(); 35 this.title = title; 36 this.content = content; 37 this.img = img; 38 } 39 @Override 40 public String toString() 41 { 42 return "ArticleVo [title=" + title + ", content=" + content + ", img=" + img + "]"; 43 } 44 45 46 }
package com.mieba.spider; import us.codecraft.webmagic.Spider; public class Demo { public static void main(String[] args) { // 爬取开始 Spider // 爬取过程 .create(new WanhoPageProcessor()) // 爬取结果保存 .addPipeline(new WanhoPipeline()) // 爬取的第一个页面 .addUrl("http://www.wanho.net/a/jyxb/") // 启用的线程数 .thread(5).run(); } }
爬取到的照片
爬取到的简报
大家如果要使用以上代码，只需在项目中配置好 WebMagic 的依赖包即可。