Had some time to kill, so I wrote a little web crawler that scrapes images.
If you like it, take it; just don't hammer someone's hard-built site into downtime. Comments and pointers are welcome.
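On the "don't take the site down" point: the crawler below only pauses for two seconds when it hops to a new gallery. If you want to be gentler, a small throttle could sit in front of every request. This is just a sketch of my own; the Throttle class name and the interval are illustrative choices, not part of the crawler below. Call throttle.await() right before each httpClient.execute(...).

// Hypothetical helper: enforce a minimum gap between consecutive requests.
// The class name, field names, and the interval value are illustrative only.
public class Throttle {
    private final long minIntervalMillis;
    private long lastRequestAt = 0L;

    public Throttle(long minIntervalMillis) {
        this.minIntervalMillis = minIntervalMillis;
    }

    // Call before every HTTP request; sleeps just long enough to keep
    // requests at least minIntervalMillis apart.
    public synchronized void await() throws InterruptedException {
        long now = System.currentTimeMillis();
        long wait = lastRequestAt + minIntervalMillis - now;
        if (wait > 0) {
            Thread.sleep(wait);
        }
        lastRequestAt = System.currentTimeMillis();
    }
}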
import com.google.common.collect.Lists;

import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Stack;

public class SexlCrawler {

    // Start page to crawl, and the local directory to save images into
    public static String sexpage_url = "https://www.mm131.net/xinggan/5354.html";
    public static String local_path = "d://temp//sexImg//";

    public Stack<String> imgStack = new Stack<>();
    public String pageNum;

    public SexlCrawler() {
        imgStack.push(sexpage_url);
    }

    public static void main(String[] args) {
        SexlCrawler sc = new SexlCrawler();
        while (!sc.imgStack.empty()) {
            String url = sc.imgStack.pop();
            sc.startCrawler(url);
        }
    }

    public void startCrawler(String startPage_url) {
        // Build the request URI and the default headers
        try {
            URI uri = new URIBuilder(startPage_url).build();
            List<Header> headerList = Lists.newArrayList();
            headerList.add(new BasicHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate"));
            headerList.add(new BasicHeader(HTTP.CONN_KEEP_ALIVE, "keep-alive"));
            headerList.add(new BasicHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-cn,zh;q=0.5"));

            // Build the HttpClient
            HttpClient httpClient = HttpClients.custom().setDefaultHeaders(headerList).build();

            // Build the GET request
            HttpUriRequest httpUriRequest = RequestBuilder.get().setUri(uri).build();

            // Set headers so the site's anti-crawler check lets us through
            httpUriRequest.setHeader("Referer", "sogou.com");
            //httpUriRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
            httpUriRequest.setHeader("User-Agent", "sogou.com");

            // Execute the request
            HttpResponse httpResponse = httpClient.execute(httpUriRequest);
            // Grab the response entity
            HttpEntity entity = httpResponse.getEntity();
            // The site serves GBK-encoded pages, so decode with that charset
            String rawHTMLContent = EntityUtils.toString(entity, "GBK");

            // Release the entity stream
            EntityUtils.consume(entity);

            this.setPageNum(startPage_url);

            // Extract the image URL (and push follow-up page links onto the stack)
            String imgUrl = this.dealWith(rawHTMLContent, startPage_url);
            // Save the image
            this.savepic(imgUrl, httpClient);

        } catch (URISyntaxException | IOException e) {
            e.printStackTrace();
        }
    }

    public String dealWith(String inputHtml, String startPage_url) {
        Document doc = Jsoup.parse(inputHtml);
        String imgurl = "";

        Elements elements = doc.select("div[class=content-pic]").select("img");
        Elements next_elements = doc.select("div[class=content-pic]").select("a");

        for (Element element : elements) {
            imgurl = element.attr("src");
        }

        for (Element element : next_elements) {
            String next_url = element.attr("href");

            // Within a gallery the "next" links are relative, e.g. "666_6.html",
            // but on the last page the link to the next gallery (e.g. "667.html")
            // comes back as a full https URL, so the two cases need different handling.
            if (next_url.indexOf("http") < 0) {
                int p = startPage_url.lastIndexOf("/");
                next_url = startPage_url.substring(0, p + 1) + next_url;
            } else {
                // Jumping to a new gallery: pause briefly so we don't hammer the site
                try {
                    Thread.sleep(2000);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }

            this.imgStack.push(next_url);
        }

        return imgurl;
    }

    public String setPageNum(String url) {
        // Derive a page identifier from the URL:
        // ".../666_6.html" -> "666", ".../667.html" -> "667"
        String out;
        int m = url.indexOf(".html");
        if (url.lastIndexOf("_") > 0) {
            int j = url.lastIndexOf("_");
            int k = url.lastIndexOf("/");
            out = url.substring(k + 1, j); // k + 1 skips the leading "/"
        } else {
            int p = url.lastIndexOf("/");
            out = url.substring(p + 1, m);
        }
        this.pageNum = out;
        return out;
    }

    public void savepic(String ImgURL, HttpClient httpClient) {
        // dealWith() returns "" when no image was found, so check for that too
        if (ImgURL == null || ImgURL.isEmpty()) {
            return;
        }

        String[] strs = ImgURL.split("/");
        String fileName = strs[strs.length - 1];
        String savePath = local_path + File.separator + this.pageNum + "_" + fileName;
        try {
            HttpGet get = new HttpGet(ImgURL);

            // RequestConfig requestConfig = RequestConfig.custom()
            //         .setConnectTimeout(50000).setConnectionRequestTimeout(1000)
            //         .setSocketTimeout(50000).build();
            // get.setConfig(requestConfig);

            // This step is the crucial one: the site's anti-hotlinking check only
            // looks at the Referer header. It must be this site's own domain,
            // otherwise every image you fetch is the same placeholder.
            get.setHeader("Referer", "https://www.mm131.net/");
            //get.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
            get.setHeader("User-Agent", "sogou.com");

            HttpResponse response = httpClient.execute(get);
            HttpEntity entity = response.getEntity();
            System.out.println("Saving image >>>>>>>>>> " + fileName);
            // try-with-resources closes both streams even if the copy fails
            try (InputStream is = entity.getContent();
                 OutputStream os = new FileOutputStream(savePath)) {
                IOUtils.copy(is, os);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Failed to save image");
        }
    }
}
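If you want to convince yourself that the Referer header really is the whole anti-hotlinking check (as the comment in savepic claims), a quick side-by-side fetch makes it obvious. This is a hypothetical sketch: imageUrl is a placeholder, so substitute a real src value the crawler scraped.

// Sketch: fetch the same image with and without the Referer header and
// compare sizes. If the site swaps in a placeholder for foreign referers,
// the two byte counts will differ.
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class RefererCheck {
    public static void main(String[] args) throws Exception {
        // Placeholder URL; replace with a real src scraped by dealWith()
        String imageUrl = "https://example.invalid/some-image.jpg";
        HttpClient client = HttpClients.createDefault();

        HttpGet bare = new HttpGet(imageUrl);
        byte[] withoutReferer = EntityUtils.toByteArray(client.execute(bare).getEntity());

        HttpGet withHeader = new HttpGet(imageUrl);
        withHeader.setHeader("Referer", "https://www.mm131.net/");
        byte[] withReferer = EntityUtils.toByteArray(client.execute(withHeader).getEntity());

        System.out.println("without Referer: " + withoutReferer.length + " bytes");
        System.out.println("with Referer:    " + withReferer.length + " bytes");
    }
}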
This took about two hours of casual hacking; feedback is welcome.