• 新型冠状病毒疫情数据爬取(Java,含源码)


    代码:

      1 package 疫情;
      2 
      3 
      4 import java.text.SimpleDateFormat;
      5 import java.util.ArrayList;
      6 import java.util.Date;
      7 import java.util.List;
      8 
      9 import com.dao.InfoDao;
     10 import com.dao.YiDao;
     11 import org.jsoup.Jsoup;
     12 import org.jsoup.nodes.Document;
     13 import util.StringHandle;
     14 import us.codecraft.webmagic.Page;
     15 import us.codecraft.webmagic.Site;
     16 import us.codecraft.webmagic.Spider;
     17 import us.codecraft.webmagic.processor.PageProcessor;
     18 
     19 public class Info implements PageProcessor {
     20     static String regEx="[
    `'' " , ,]";
     21     static String aa="";
     22     // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
     23     private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
     24     private static int count =0;
     25 
     26     @Override
     27     public Site getSite() {
     28         return site;
     29     }
     30     @Override
     31     public void process(Page page) {
     32         Date format = new Date();
     33         SimpleDateFormat ft = new SimpleDateFormat ("yyyy-MM-dd hh:mm:ss");
     34         String date=ft.format(format);
     35         System.out.println("当前时间为: " + ft.format(format));
     36         //System.out.println(page.getHtml());
     37         StringHandle sh=new StringHandle();
     38         String test=page.getHtml().xpath("//script [@id='getAreaStat']").get();
     39         System.out.println(test);
     40 
     41 
     42 
     43         List<String> Provinces=sh.getExpString(""provinceName":"(.*?)","provinceShortName":"(.*?)","confirmedCount":(.*?),"suspectedCount":(.*?),"curedCount":(.*?),"deadCount":(.*?),"comment":(.*?)"","locationId":(.*?),"", test);
     44 
     45         for(String Province:Provinces)
     46         {
     47             String Province_names=sh.getExpString(""provinceName":"(.*?)"", Province).get(0).replaceAll(""provinceName":", "").replaceAll(regEx, aa);
     48 
     49             String Province_num_confirmed=sh.getExpString(""confirmedCount":(.*?),", Province).get(0).replaceAll(""confirmedCount":", "").replaceAll(regEx, aa);
     50 
     51 
     52             String Province_num_cured=sh.getExpString(""curedCount":(.*?),", Province).get(0).replaceAll(""curedCount":", "").replaceAll(regEx, aa);
     53 
     54             String Province_num_dead=sh.getExpString(""deadCount":(.*?),", Province).get(0).replaceAll(""deadCount":", "").replaceAll(regEx, aa);
     55 
     56             String Province_num_locationId=sh.getExpString(""locationId":(.*?),", Province).get(0).replaceAll(""locationId":", "").replaceAll(regEx, aa);
     57 
     58 
     59             YiDao.add("info",date,Province_names,"",Province_num_confirmed,"",Province_num_cured,Province_num_dead,Province_num_locationId);
     60         }
     61 
     62 
     63 
     64 
     65         List<String> citys=sh.getExpString(""cityName":"(.*?)","confirmedCount":(.*?),"suspectedCount":(.*?),"curedCount":(.*?),"deadCount":(.*?),"locationId":(.*?)}", test);
     66         System.out.println(citys.get(5));
     67         for(String city:citys)
     68         {
     69 
     70           //  String Province_names=sh.getExpString(""provinceName":"(.*?)"", city).get(0).replaceAll(""provinceName":", "").replaceAll(regEx, aa);
     71 
     72             String City_names=sh.getExpString(""cityName":"(.*?)"", city).get(0).replaceAll(""cityName":", "").replaceAll(regEx, aa);
     73 
     74             String City_num_confirmed=sh.getExpString(""confirmedCount":(.*?),", city).get(0).replaceAll(""confirmedCount":", "").replaceAll(regEx, aa);
     75 
     76             String City_num_cured=sh.getExpString(""curedCount":(.*?),", city).get(0).replaceAll(""curedCount":", "").replaceAll(regEx, aa);
     77 
     78             String City_num_dead=sh.getExpString(""deadCount":(.*?),", city).get(0).replaceAll(""deadCount":", "").replaceAll(regEx, aa);
     79 
     80             String City_num_locationId=sh.getExpString(""locationId":(.*?)}", city).get(0).replaceAll(""locationId":", "").replaceAll(regEx, aa);
     81 
     82             System.out.println(City_names+City_num_confirmed+""+City_num_cured+City_num_dead+City_num_locationId);
     83             YiDao.add("info",date,"",City_names,City_num_confirmed,"",City_num_cured,City_num_dead,City_num_locationId);
     84         }
     85 
     86 
     87         System.out.println("AAAA");
     88         System.out.println(citys.get(0));
     89 
     90 
     91         count ++;
     92     }
     93 
     94     public static void main(String[] args) {
     95         long startTime, endTime;
     96         System.out.println("开始爬取...");
     97         InfoDao.delete("info");
     98         startTime = System.currentTimeMillis();
     99         Spider.create(new Info()).addUrl("https://ncov.dxy.cn/ncovh5/view/pneumonia_peopleapp?from=timeline&isappinstalled=0").thread(5).run();
    100         endTime = System.currentTimeMillis();
    101         System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了"+count+"条记录");
    102     }
    103 
    104 
    105 }

    效果图:

  • 相关阅读:
    poj 最长公共子序列 1458 记忆式搜索
    选择排序
    直接 插入排序
    直接插入排序
    洛谷-P3389-高斯消元
    经济中的哪些概念
    uva-622-dp
    UVA-607-DP
    转转---面试题
    Linux事件驱动IO中select vs epoll
  • 原文地址:https://www.cnblogs.com/smartisn/p/12283472.html
Copyright © 2020-2023  润新知