• 定时爬虫系统(以爬取[百度7日关注]为例)


    1、web.xml加载servlet

     <?xml version="1.0" encoding="UTF-8"?>
     <!-- Deployment descriptor (Servlet 2.5) for the TaskSchedule web application. -->
     <web-app
         xmlns="http://java.sun.com/xml/ns/javaee"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd"
         id="WebApp_ID"
         version="2.5">

       <display-name>TaskSchedule</display-name>

       <welcome-file-list>
         <welcome-file>index.html</welcome-file>
         <welcome-file>index.htm</welcome-file>
         <welcome-file>index.jsp</welcome-file>
         <welcome-file>default.html</welcome-file>
         <welcome-file>default.htm</welcome-file>
         <welcome-file>default.jsp</welcome-file>
       </welcome-file-list>

       <!-- Registered without a servlet-mapping: the servlet is declared only so that
            load-on-startup makes the container initialise it at application start. -->
       <servlet>
         <servlet-name>hotword</servlet-name>
         <servlet-class>com.richinfo.asynctask.servlet.TaskScheduleServlet</servlet-class>
         <load-on-startup>1</load-on-startup>
       </servlet>

     </web-app>

    2、TaskScheduleServlet初始化init

     1 public class TaskScheduleServlet extends HttpServlet{
     2     private static final Log logger = LogFactory.getLog(TaskScheduleServlet.class);
     3     private static final long serialVersionUID = 9089148097823231232L;
     4     
     5     @Override
     6     public void init() throws ServletException {
     7         super.init();
     8         logger.info("TaskScheduleServlet init!");
     9         System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
    10         System.setProperty("sun.net.client.defaultReadTimeout", "30000");
    11         TaskRegister.getInstance().start();
    12         printLocalHost();
    13         logger.info("TaskScheduleServlet start!");
    14     }
    15     private void printLocalHost() {
    16         try {
    17             InetAddress local = InetAddress.getLocalHost();
    18             String ip = local.getHostAddress();
    19             String host = local.getHostName();
    20             logger.info("服务器信息:ip=" + ip + ", host=" + host);
    21         } catch (UnknownHostException e) {
    22             logger.error(e.getMessage());
    23         }
    24     }
    25 
    26     @Override
    27     public void destroy() {
    28         super.destroy();
    29         TaskRegister.getInstance().shutdown();
    30         logger.info("TaskScheduleServlet destroy!");
    31     }
    32 }

    3、TaskRegister.getInstance().start()

    4、定时任务的设置,这里不做赘述(有兴趣可以看我的另一篇有关定时任务配置的文章:http://www.cnblogs.com/zhuziyu/p/7704661.html )

    5、爬虫具体实现

    (1)调用任务

     1 public class BaiduDayHotFocusGrabJob implements Job{
     2     private final static Log logger = LogFactory.getLog(BaiduDayHotFocusGrabJob.class);
     3     private UnifiedPosMgrPlatService service = new UnifiedPosMgrPlatSeviceImpl();
     4     @Override
     5     public void execute(JobExecutionContext context)
     6             throws JobExecutionException {
     7         JobKey key = context.getJobDetail().getKey();
     8         logger.info("开始任务["+key.getGroup()+"."+key.getName()+"]");
     9         
    10         try {
    11             resolve();
    12         } catch (Exception e) {
    13             logger.error(Tools.getStackInfo(e));
    14         }
    15     }
    16     
    17     public void resolve() throws IOException {
    18         Document doc = Jsoup.connect("http://top.baidu.com/").timeout(10000).get();
    19         Element attentionDiv = doc.select("#main div.tab-bd div.tab-box:eq(1)").first();
    20         Elements attentionEles = attentionDiv.select("ul li a.list-title");
    21         // 七日关注
    22         
    23         List<HotSpot> attentionList = new ArrayList<HotSpot>();
    24         int len = attentionEles.size();
    25         for (int i = 0; i < len; i++) {
    26             Element ele = attentionEles.get(i);
    27             String name = ele.text().trim();
    28             String uri = ele.attr("href").trim();
    29             attentionList.add(new HotSpot(Source.BAIDU_TOP, uri, name, i));
    30         }
    31         HotFocus focus = new HotFocus(Type.DAILY, Source.BAIDU_TOP, attentionList);
    32         int ret = service.addHotFocus(focus);
    33         if (ret != 1) {
    34             logger.warn("更新七日关注失败!");
    35         }
    36         else {
    37             logger.info("更新七日关注成功!");
    38         }
    39     }
    40 }

    (2)具体实现

     1 public class UnifiedPosMgrPlatSeviceImpl implements UnifiedPosMgrPlatService {
     2     private static final Logger logger = Logger.getLogger(UnifiedPosMgrPlatSeviceImpl.class);
     3     
     4     @Override
     5     public int addHotFocus(HotFocus focus) throws IOException {
     6         if (focus == null) return -1;
     7         List<HotSpot> spots = focus.getHots();
     8         if (spots == null || spots.isEmpty()) return -1;
     9         String uri = getAddHotWordInvokeUrl();
    10         String requestXml = requestAddHotWord(focus);
    11         String responseXml = invokeAddHotWords(uri, requestXml);
    12         return responseAddHotWords(responseXml);
    13     }
    14 }

    addHotFocus 最终调用的是对应的接口系统,本人的接口地址是:http://127.0.0.1:8080/接口系统/方法

    6、对应系统的接口实现这里不赘述,最终执行数据库操作,将爬取内容写入数据库表

  • 相关阅读:
    CentOS7 运维
    【推荐】开源项目ElasticAmbari助力 ElasticSearch、Kibana、ambari服务高效运维管理
    逆向工程,调试Hello World !程序(更新中)
    校园网内网穿透
    搭建PXE服务及实现安装银河麒麟桌面操作系统
    Linux 的基础知识关于基本操作命令 --- No.3
    Unix/Linux fork前传
    60行C代码实现一个shell
    Linux 的基础知识回顾(安装vmware) ---- No.1 后面都以Centos8 为例
    vue v-bind绑定属性和样式
  • 原文地址:https://www.cnblogs.com/zhuziyu/p/8920349.html
Copyright © 2020-2023  润新知