package com.test.pic.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang3.concurrent.BasicThreadFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * @Title: PicCrawler.java
 * @Package com.test.pic.crawler
 * @Description: Crawls the images under the configured tags of the target site,
 *               or under every tag when no tag is configured.
 * @author CoderZZ
 * @date 2018-01-12 23:22:41
 * @version V1.0
 */
public class PicCrawler implements Runnable {
    // Local directory the images are saved to
    private static final String pathString = "G:/test/pic/";
    // Queue of the detail-page URLs that actually get crawled
    static BlockingQueue<String> urlBlockingQueue = new LinkedBlockingDeque<>(1000);
    static int threadNum = 10;

    /**
     * @Title: main
     * @Description: Resolves the tag pages, fills the URL queue page by page,
     *               then starts the download threads.
     * @param args unused
     */
    public static void main(String[] args) {
        String homeUrlString = "https://www.xxxx.com"; // base address of the site
        String tagPageUrl = "https://www.xxxx.com/tag/"; // tag index page
        // Full URLs of the tag pages to crawl
        Set<String> tagFullHrefSet = new HashSet<>(16);
        // Tags to crawl; when the array is empty, every tag is crawled
        String[] crawlerTagArray = {"风景"};
        List<String> crawlerTagList = Arrays.asList(crawlerTagArray);
        try {
            // 1. Resolve the full URL of every wanted tag
            Document tagListDocument = Jsoup.connect(tagPageUrl).get();
            Elements tagsListDivElements = tagListDocument.getElementsByClass("tags_list");
            for (Element element : tagsListDivElements) {
                Elements aElements = element.getElementsByTag("a");
                for (Element a : aElements) {
                    if (crawlerTagList.isEmpty() || crawlerTagList.contains(a.text())) {
                        String tagUrlString = homeUrlString + a.attr("href");
                        // e.g. https://www.xxxx.com/tag/fengjing.html -> .../tag/fengjing/1.html
                        tagUrlString = tagUrlString.substring(0, tagUrlString.lastIndexOf(".")) + "/1.html";
                        tagFullHrefSet.add(tagUrlString);
                    }
                }
            }
            // 2. Collect the image detail-page URLs, paging through each tag
            for (String tagUrl : tagFullHrefSet) {
                String tempTagUrlString = tagUrl;
                int currentPageNum = 1;
                while (true) {
                    try {
                        Document imagePageDocument = Jsoup.connect(tempTagUrlString).get();
                        Elements imageListElements = imagePageDocument.getElementsByClass("Pli-litpic");
                        if (imageListElements.isEmpty()) {
                            break; // no thumbnails on this page: we are past the last page
                        }
                        for (Element image : imageListElements) {
                            urlBlockingQueue.offer(homeUrlString + image.attr("href"));
                        }
                        // e.g. https://www.xxxx.com/tag/fengjing/1.html -> .../2.html
                        tempTagUrlString = tempTagUrlString.substring(0, tempTagUrlString.lastIndexOf("/") + 1)
                                + (++currentPageNum) + ".html";
                    } catch (Exception e) {
                        break;
                    }
                }
            }
            ScheduledExecutorService executor = new ScheduledThreadPoolExecutor(threadNum,
                    new BasicThreadFactory.Builder().namingPattern("my-crawler-thread-%d").daemon(false).build());
            for (int i = 0; i < threadNum; i++) {
                executor.submit(new PicCrawler());
            }
            // Let the pool terminate once the queue has been drained
            executor.shutdown();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public void run() {
        while (true) {
            try {
                long begin = System.currentTimeMillis();
                // Wait briefly for work; a null return means the queue stayed empty
                String url = urlBlockingQueue.poll(5, TimeUnit.SECONDS);
                if (null == url) {
                    System.out.println("======================== BlockingQueue is empty, all crawling finished ========================");
                    break;
                }
                Document doc = Jsoup.connect(url).get();
                Elements titleElements = doc.select("#photos > h1");
                if (!titleElements.isEmpty()) {
                    Set<String> imgSrcSet = new HashSet<>(16);
                    Element titleElement = titleElements.get(0);
                    // Title looks like "name(current/total)"; split out the name and the page count
                    String foldNameString = titleElement.text();
                    String[] nameArray = foldNameString.split("\\(");
                    foldNameString = nameArray[0];
                    nameArray = nameArray[1].split("/");
                    int totalPages = Integer.parseInt(nameArray[1].replace(")", ""));
                    for (int i = 1; i <= totalPages; i++) {
                        String urlTemp = url.replace(".html", "_" + i + ".html");
                        Document docTemp = Jsoup.connect(urlTemp).get();
                        Element element = docTemp.getElementById("big-pic");
                        if (element == null) {
                            continue; // page layout changed or page missing
                        }
                        Elements imgElements = element.getElementsByTag("img");
                        for (Element imgElement : imgElements) {
                            imgSrcSet.add(imgElement.attr("src"));
                        }
                    }
                    File sf = new File(pathString + foldNameString);
                    if (!sf.exists()) {
                        sf.mkdirs();
                    }
                    for (String imgSrc : imgSrcSet) {
                        URL imgurl = new URL(imgSrc);
                        URLConnection con = imgurl.openConnection();
                        // 10 s connect timeout
                        con.setConnectTimeout(10 * 1000);
                        String filename = imgSrc.substring(imgSrc.lastIndexOf('/') + 1);
                        // try-with-resources closes both streams even on failure
                        try (InputStream is = con.getInputStream();
                             OutputStream os = new FileOutputStream(new File(sf, filename))) {
                            byte[] bs = new byte[1024 * 500]; // 500 KB buffer
                            int len;
                            while ((len = is.read(bs)) != -1) {
                                os.write(bs, 0, len);
                            }
                        }
                        System.out.println(imgSrc + " downloaded!");
                    }
                    long end = System.currentTimeMillis();
                    System.out.println("================================================================");
                    System.out.println(Thread.currentThread().getName()
                            + " ****************** all downloads finished, took " + ((end - begin) / 1000) + "s");
                }
            } catch (Exception e) {
                System.out.println("======================== crawl error ========================");
            }
        }
    }
}
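
/*
 * A minimal sketch, not part of the original class: the same download step
 * written with java.nio.file.Files.copy instead of the manual buffer loop
 * above. The class and method names here are illustrative only; like the code
 * above, it assumes imgSrc is a direct link to the image file. Fully-qualified
 * names are used so no extra imports are needed.
 */
class DownloadSketch {
    static void download(String imgSrc, File dir) throws IOException {
        URLConnection con = new URL(imgSrc).openConnection();
        con.setConnectTimeout(10 * 1000);
        con.setReadTimeout(10 * 1000); // bound the read as well as the connect
        String filename = imgSrc.substring(imgSrc.lastIndexOf('/') + 1);
        java.nio.file.Files.createDirectories(dir.toPath());
        try (InputStream is = con.getInputStream()) {
            // Stream the response body straight to disk, overwriting any partial file
            java.nio.file.Files.copy(is, dir.toPath().resolve(filename),
                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
        }
    }
}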