代码:
package 疫情;


import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import com.dao.InfoDao;
import com.dao.YiDao;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import util.StringHandle;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic page processor that reads the {@code getAreaStat} script element of the
 * crawled page, pulls province-level and city-level records out of its text with
 * regular expressions, and hands every record to {@code YiDao.add}.
 *
 * <p>Each record is stored with the timestamp taken at the start of {@link #process(Page)}.
 * The first argument passed to {@code YiDao.add} / {@code InfoDao.delete} is {@code "info"}
 * (presumably the target table name; confirm against the DAO classes).
 */
public class Info implements PageProcessor {
    // Characters stripped from every extracted value (spaces, backtick, quotes, commas).
    // NOTE(review): this literal was recovered from a copy that had lost its backslash
    // escapes; confirm the exact character set against the original source.
    static String regEx = "[ `''\" , ,]";
    // Replacement used together with regEx: matched characters are simply removed.
    static String aa = "";
    // Crawl configuration for the site: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
    // Incremented once per successfully processed page (not once per stored record).
    private static int count = 0;

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts province and city statistics from the page and stores them.
     *
     * <p>If the page has no {@code getAreaStat} script the page is skipped and nothing
     * is stored.
     *
     * @param page the downloaded page supplied by WebMagic
     */
    @Override
    public void process(Page page) {
        Date now = new Date();
        // "HH" (0-23) instead of "hh" (1-12): without an AM/PM marker the 12-hour form
        // produces the same text for morning and afternoon timestamps.
        SimpleDateFormat ft = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        String date = ft.format(now);
        System.out.println("当前时间为: " + date);
        //System.out.println(page.getHtml());
        StringHandle sh = new StringHandle();
        String test = page.getHtml().xpath("//script [@id='getAreaStat']").get();
        System.out.println(test);

        // get() yields null when the XPath matches nothing; there is nothing to parse then.
        if (test == null) {
            System.out.println("未找到 getAreaStat 脚本,跳过该页面");
            return;
        }

        // One match per province record.
        // NOTE(review): the pattern requires two consecutive quotes right before
        // ,"locationId" (i.e. an empty comment value); verify against the real payload.
        List<String> provinces = sh.getExpString(
                "\"provinceName\":\"(.*?)\",\"provinceShortName\":\"(.*?)\","
                        + "\"confirmedCount\":(.*?),\"suspectedCount\":(.*?),"
                        + "\"curedCount\":(.*?),\"deadCount\":(.*?),"
                        + "\"comment\":(.*?)\"\",\"locationId\":(.*?),\"",
                test);

        for (String province : provinces) {
            String provinceName = fieldValue(sh, "\"provinceName\":\"(.*?)\"", "\"provinceName\":", province);
            String provinceConfirmed = fieldValue(sh, "\"confirmedCount\":(.*?),", "\"confirmedCount\":", province);
            String provinceCured = fieldValue(sh, "\"curedCount\":(.*?),", "\"curedCount\":", province);
            String provinceDead = fieldValue(sh, "\"deadCount\":(.*?),", "\"deadCount\":", province);
            String provinceLocationId = fieldValue(sh, "\"locationId\":(.*?),", "\"locationId\":", province);

            // Province rows leave the city-name argument empty.
            YiDao.add("info", date, provinceName, "", provinceConfirmed, "", provinceCured, provinceDead, provinceLocationId);
        }

        // One match per city record; a city record ends at the closing brace of its object.
        List<String> citys = sh.getExpString(
                "\"cityName\":\"(.*?)\",\"confirmedCount\":(.*?),\"suspectedCount\":(.*?),"
                        + "\"curedCount\":(.*?),\"deadCount\":(.*?),\"locationId\":(.*?)}",
                test);
        // Debug output; guarded so a short result list does not abort the crawl.
        if (citys.size() > 5) {
            System.out.println(citys.get(5));
        }
        for (String city : citys) {
            String cityName = fieldValue(sh, "\"cityName\":\"(.*?)\"", "\"cityName\":", city);
            String cityConfirmed = fieldValue(sh, "\"confirmedCount\":(.*?),", "\"confirmedCount\":", city);
            String cityCured = fieldValue(sh, "\"curedCount\":(.*?),", "\"curedCount\":", city);
            String cityDead = fieldValue(sh, "\"deadCount\":(.*?),", "\"deadCount\":", city);
            // The locationId pattern is terminated by '}', which is part of the match and
            // is not covered by regEx, so it is removed explicitly.
            String cityLocationId = fieldValue(sh, "\"locationId\":(.*?)}", "\"locationId\":", city)
                    .replace("}", "");

            System.out.println(cityName + cityConfirmed + "" + cityCured + cityDead + cityLocationId);
            // City rows leave the province-name argument empty.
            YiDao.add("info", date, "", cityName, cityConfirmed, "", cityCured, cityDead, cityLocationId);
        }

        System.out.println("AAAA");
        if (!citys.isEmpty()) {
            System.out.println(citys.get(0));
        }

        count++;
    }

    /**
     * Returns the value of one field inside a record.
     *
     * <p>Takes the first result of {@code sh.getExpString(fieldRegex, record)}, removes the
     * key prefix from it and then removes every character matched by {@link #regEx}.
     * This presumes {@code getExpString} returns whole matches (key included); confirm
     * against {@code StringHandle}.
     *
     * @param sh         helper used to run the regular expression
     * @param fieldRegex regular expression matching the field's key and value
     * @param keyPrefix  regular expression for the key part to strip from the match
     * @param record     text of a single province or city record
     * @return the cleaned field value
     */
    private static String fieldValue(StringHandle sh, String fieldRegex, String keyPrefix, String record) {
        return sh.getExpString(fieldRegex, record).get(0)
                .replaceAll(keyPrefix, "")
                .replaceAll(regEx, aa);
    }

    /**
     * Clears previously stored data via {@code InfoDao.delete("info")}, runs the spider
     * against the DXY page with five threads, and prints the elapsed time and page count.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        long startTime, endTime;
        System.out.println("开始爬取...");
        InfoDao.delete("info");
        startTime = System.currentTimeMillis();
        Spider.create(new Info()).addUrl("https://ncov.dxy.cn/ncovh5/view/pneumonia_peopleapp?from=timeline&isappinstalled=0").thread(5).run();
        endTime = System.currentTimeMillis();
        System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了" + count + "条记录");
    }


}
效果图: