实现爬取数据并存入数据库
项目结构:
核心代码位于test包下的BlogPageProcessor
1 package com.zyk.test; 2 3 import com.zyk.dao.CnblogsDao; 4 import com.zyk.dao.CnblogsDaoImpl; 5 import com.zyk.entity.Cnblogs; 6 import us.codecraft.webmagic.Page; 7 import us.codecraft.webmagic.Site; 8 import us.codecraft.webmagic.Spider; 9 import us.codecraft.webmagic.processor.PageProcessor; 10 11 import java.text.SimpleDateFormat; 12 import java.util.Calendar; 13 import java.util.Date; 14 import java.util.regex.Matcher; 15 import java.util.regex.Pattern; 16 17 public class BlogPageProcessor implements PageProcessor { 18 19 private Site site = Site.me().setRetryTimes(10).setSleepTime(1000); 20 21 private static int num = 0; 22 CnblogsDao cnblogsDao=new CnblogsDaoImpl(); 23 24 public static void main(String[] args) throws Exception { 25 long startTime ,endTime; 26 System.out.println("========zyk的爬虫【启动】!========="); 27 startTime = new Date().getTime(); 28 Spider.create(new BlogPageProcessor()).addUrl("https://www.cnblogs.com/cate/108698/").thread(200).run(); 29 endTime = new Date().getTime(); 30 System.out.println("========zyk小爬虫【结束】========="); 31 System.out.println("一共爬到"+num+"篇博客!用时为:"+(endTime-startTime)/1000+"s"); 32 } 33 @Override 34 public void process(Page page) { 35 if(page.getUrl().regex("https://www.cnblogs.com/cate/108698/.*").match()){ 36 page.addTargetRequests(page.getHtml().xpath("//div[@id='post_list']").links().regex("^(.*\.html)$").all()); 37 page.addTargetRequests(page.getHtml().xpath("////div[@class='pager']").links().all()); 38 } 39 else{ 40 try { 41 42 Cnblogs cnblogs=new Cnblogs(); 43 //获取url 44 String url=page.getHtml().xpath("//a[@id='cb_post_title_url']/@href").get(); 45 //获取标题 46 String title = page.getHtml().xpath("//a[@id='cb_post_title_url']/text()").toString(); 47 //获取作者 48 String author = page.getHtml().xpath("//a[@id='Header1_HeaderTitle']/text()").toString(); 49 //获取时间 50 String time=page.getHtml().xpath("//span[@id='post-date']/text()").toString(); 51 //获取评论量 52 String comment = 
page.getHtml().xpath("////span[@id='stats_comment_count']/text()").get(); 53 54 //获取阅读量 55 String view = page.getHtml().xpath("//div[@class='postDesc']/span[@id='post_view_count']/text()").toString(); 56 57 58 59 60 cnblogs.setUrl(url); 61 cnblogs.setTitle(title); 62 cnblogs.setAuthor(author); 63 cnblogs.setTime(time); 64 if(comment==null){ 65 cnblogs.setComment("评论 - 0"); 66 }else{ 67 cnblogs.setComment(comment); 68 } 69 cnblogs.setComment(comment); 70 cnblogs.setView(view); 71 72 num++; 73 74 cnblogsDao.saveBlog(cnblogs); 75 76 } catch (Exception e) { 77 e.printStackTrace(); 78 } 79 } 80 } 81 82 @Override 83 public Site getSite() { 84 return this.site; 85 } 86 }
通过调用 WebMagic 的接口爬取页面并解析出各字段,再调用 CnblogsDao 的 saveBlog 方法,就可以把数据存入数据库中