• Bored, so I wrote a little web crawler that grabs images



    If you like it, take it; just don't crawl someone's hard-built site into the ground (a polite-throttling sketch follows the code). Comments and pointers are welcome.

    import com.google.common.collect.Lists;

    import org.apache.commons.io.IOUtils;

    import org.apache.http.Header;
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpHeaders;
    import org.apache.http.HttpResponse;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpUriRequest;
    import org.apache.http.client.methods.RequestBuilder;
    import org.apache.http.client.utils.URIBuilder;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.message.BasicHeader;
    import org.apache.http.protocol.HTTP;
    import org.apache.http.util.EntityUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;

    import java.io.*;
    import java.net.URI;
    import java.net.URISyntaxException;
    import java.util.List;
    import java.util.Stack;

    public class SexlCrawler {

        // Start page to crawl, and the local directory images are saved to
        public static String sexpage_url = "https://www.mm131.net/xinggan/5354.html";
        public static String local_path = "d://temp//sexImg//";

        public Stack<String> imgStack = new Stack<>();
        public String pageNum;

        public SexlCrawler() {
            // Make sure the save directory exists, otherwise FileOutputStream fails later
            new File(local_path).mkdirs();
            imgStack.push(sexpage_url);
        }

        public static void main(String[] args) {
            SexlCrawler sc = new SexlCrawler();

            // Depth-first crawl: pop a page, process it, push whatever links it yields
            while (!sc.imgStack.empty()) {
                String url = sc.imgStack.pop();
                sc.startCrawler(url);
            }
        }

        public void startCrawler(String startPage_url) {

            // Build the request URI and the default headers
            try {
                URI uri = new URIBuilder(startPage_url).build();
                List<Header> headerList = Lists.newArrayList();

                headerList.add(new BasicHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate"));
                headerList.add(new BasicHeader(HTTP.CONN_KEEP_ALIVE, "keep-alive"));
                headerList.add(new BasicHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-cn,zh;q=0.5"));

                // Build the HttpClient
                HttpClient httpClient = HttpClients.custom().setDefaultHeaders(headerList).build();

                // Build the GET request
                HttpUriRequest httpUriRequest = RequestBuilder.get().setUri(uri).build();

                // Extra headers to get past the site's anti-crawler checks
                httpUriRequest.setHeader("Referer", "sogou.com");
                //httpUriRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
                httpUriRequest.setHeader("User-Agent", "sogou.com");

                // Execute the request
                HttpResponse httpResponse = httpClient.execute(httpUriRequest);
                // Grab the response entity
                HttpEntity entity = httpResponse.getEntity();
                // Decode the page body (the site serves GBK)
                String rawHTMLContent = EntityUtils.toString(entity, "GBK");

                // Release the entity's underlying stream
                EntityUtils.consume(entity);

                this.setPageNum(startPage_url);

                // Extract the image URL and queue up the next pages
                String imgUrl = this.dealWith(rawHTMLContent, startPage_url);
                // Save the image to disk
                this.savepic(imgUrl, httpClient);

            } catch (URISyntaxException e) {
                e.printStackTrace();
            } catch (ClientProtocolException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        public String dealWith(String inputHtml, String startPage_url) {

            Document doc = Jsoup.parse(inputHtml);
            String imgurl = "";

            Elements elements = doc.select("div[class=content-pic]").select("img");
            Elements next_elements = doc.select("div[class=content-pic]").select("a");

            // Each page carries a single content image; take its src
            for (Element element : elements) {
                imgurl = element.attr("src");
            }

            for (Element element : next_elements) {
                String next_url = element.attr("href");

                // Within a gallery every "next" link is relative, e.g. "666_6.html";
                // but on the last page the jump to the next gallery (e.g. "667.html")
                // comes back as a full https:// URL, so the two cases need different handling.
                if (next_url.indexOf("http") < 0) {
                    int p = startPage_url.lastIndexOf("/");
                    next_url = startPage_url.substring(0, p + 1) + next_url;
                } else {
                    // Entering a new gallery: pause a little so we don't hammer the site
                    try {
                        Thread.sleep(2000);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }

                this.imgStack.push(next_url);
            }

            return imgurl;
        }

        public String setPageNum(String url) {
            String out = "";

            // Derive a page id from URLs like ".../5354_2.html" or ".../5354.html"
            int m = url.indexOf(".html");
            if (url.lastIndexOf("_") > 0) {
                int j = url.lastIndexOf("_");
                int k = url.lastIndexOf("/");
                out = url.substring(k + 1, j);   // k + 1 skips the "/" so it can't leak into the file name
            } else {
                int p = url.lastIndexOf("/");
                out = url.substring(p + 1, m);
            }

            this.pageNum = out;
            return out;
        }

        public void savepic(String ImgURL, HttpClient httpClient) {
            if (ImgURL == null || ImgURL.isEmpty()) {
                return;
            }

            String[] strs = ImgURL.split("/");
            String fileName = strs[strs.length - 1];
            String savePath = local_path + File.separator + this.pageNum + "_" + fileName;

            try {
                HttpGet get = new HttpGet(ImgURL);

                // Optional request timeouts:
                //RequestConfig requestConfig = RequestConfig.custom()
                //        .setConnectTimeout(50000).setConnectionRequestTimeout(1000)
                //        .setSocketTimeout(50000).build();
                //get.setConfig(requestConfig);

                // This step is the crux. The site's anti-crawler measure is hotlink
                // protection: just add a Referer header, and it must be the site's own
                // domain, otherwise every URL returns the same placeholder image.
                get.setHeader("Referer", "https://www.mm131.net/");
                //get.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
                get.setHeader("User-Agent", "sogou.com");

                HttpResponse response = httpClient.execute(get);
                HttpEntity entity = response.getEntity();
                System.out.println("Saving image >>>>>>>>> " + fileName);

                // try-with-resources closes both streams even if the copy fails
                try (InputStream is = entity.getContent();
                     OutputStream os = new FileOutputStream(savePath)) {
                    IOUtils.copy(is, os);
                }

            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("Failed to save image");
            }
        }

    }
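
    One note on the "don't take the site down" point above: inside a gallery the loop fires requests back-to-back, since the 2-second sleep in dealWith only happens when jumping to a new gallery. Below is a minimal sketch of a politeness throttle; it is my own addition, not part of the crawler above, and the class name PoliteThrottle and the interval are arbitrary choices:

    // Minimal politeness throttle (my own addition, not in the original crawler):
    // guarantees at least minIntervalMillis between successive requests.
    public class PoliteThrottle {

        private final long minIntervalMillis;
        private long lastRequestAt = 0L;

        public PoliteThrottle(long minIntervalMillis) {
            this.minIntervalMillis = minIntervalMillis;
        }

        // Blocks until at least minIntervalMillis has passed since the previous call
        public synchronized void acquire() throws InterruptedException {
            long waitFor = lastRequestAt + minIntervalMillis - System.currentTimeMillis();
            if (waitFor > 0) {
                Thread.sleep(waitFor);
            }
            lastRequestAt = System.currentTimeMillis();
        }
    }

    Calling throttle.acquire() right before each httpClient.execute(...) in startCrawler and savepic would cap the crawl at one request per interval.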

    This took about two hours and was dashed off casually; one possible refinement is sketched below.
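
    For instance, setPageNum leans on lastIndexOf arithmetic, which breaks silently if the URL shape ever changes. A regex makes the expected shape explicit. The sketch below is a suggested alternative, assuming the same ".../5354.html" and ".../5354_2.html" URL styles as above (the class and method names are mine):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    // Sketch of a stricter page-id parser (my own suggestion, not in the original).
    // Matches ".../5354.html" and ".../5354_2.html", capturing "5354".
    public class PageId {

        private static final Pattern PAGE = Pattern.compile("/(\\d+)(?:_\\d+)?\\.html$");

        public static String pageIdOf(String url) {
            Matcher m = PAGE.matcher(url);
            return m.find() ? m.group(1) : "";
        }

        public static void main(String[] args) {
            System.out.println(pageIdOf("https://www.mm131.net/xinggan/5354.html"));   // 5354
            System.out.println(pageIdOf("https://www.mm131.net/xinggan/5354_2.html")); // 5354
        }
    }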

    Comments welcome.

  • Original post: https://www.cnblogs.com/alexgl2008/p/12346875.html