• HttpClient 4.3 的例子


    package com.unbank.robotspider.util;
    
    import java.io.IOException;
    import java.net.MalformedURLException;
    import java.net.URI;
    import java.net.URISyntaxException;
    import java.net.URL;
    import java.util.List;
    
    import org.apache.http.Header;
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpStatus;
    import org.apache.http.NameValuePair;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.CookieStore;
    import org.apache.http.client.config.CookieSpecs;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.entity.UrlEncodedFormEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.client.methods.HttpUriRequest;
    import org.apache.http.client.protocol.HttpClientContext;
    import org.apache.http.entity.ContentType;
    import org.apache.http.impl.client.BasicCookieStore;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.ContentEncodingHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.params.CoreProtocolPNames;
    import org.apache.http.util.EntityUtils;
    import org.apache.log4j.Logger;
    
    /**
     * Small helper around Apache HttpClient 4.3 that fetches a page body as a
     * string, issuing a GET or (when form parameters are supplied) a POST.
     *
     * <p>Each call creates and closes its own {@link CloseableHttpClient} and
     * cookie store, so no state is shared between calls.
     */
    public class CrawlerRequest {

        private static final Logger logger = Logger.getLogger(CrawlerRequest.class);

        /** User-Agent sent with every request unless {@code headerDict} overrides it. */
        private static final String USER_AGENT_CHROME = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4";

        /** Charset name used when the caller supplies none. */
        private static final String DEFAULT_CHARSET = "utf-8";

        /** Socket/connect timeout (ms) used when the caller passes a non-positive value. */
        private static final int DEFAULT_TIMEOUT_MS = 5000;

        /** Number of additional attempts made after the first failed execution. */
        private static final int MAX_RETRIES = 5;

        /**
         * Fetches {@code url} with a GET request, a 2000 ms timeout and the
         * "utf-8" charset.
         *
         * @param url absolute URL of the page to fetch
         * @return the response body, or an empty string on any failure
         */
        public String getUrlRespHtml(String url) {
            return getUrlRespHtml(url, null, null, 2000, "utf-8");
        }

        /**
         * Fetches a page and returns its body as text.
         *
         * <p>A POST with a UTF-8 URL-encoded form body is sent when
         * {@code postDict} is non-null; otherwise a GET is sent. A failed
         * execution is retried up to {@value #MAX_RETRIES} more times with a
         * random 1-6 second pause before each retry.
         *
         * @param pageUrl     absolute URL of the page; it is re-assembled through
         *                    the multi-argument {@link URI} constructor so that
         *                    illegal characters such as spaces get escaped
         * @param headerDict  extra request headers, may be {@code null}; an entry
         *                    named "User-Agent" replaces the built-in one
         * @param postDict    form parameters; non-null selects POST
         * @param timeout     socket and connect timeout in milliseconds; values
         *                    &lt;= 0 fall back to {@value #DEFAULT_TIMEOUT_MS}
         * @param htmlCharset charset name handed to {@code EntityUtils.toString};
         *                    {@code null} or empty means "utf-8"
         * @return the response body when the status is 200 and an entity is
         *         present; an empty string when the URL is invalid, every attempt
         *         fails, the status is not 200, or reading the body fails
         */
        public String getUrlRespHtml(String pageUrl,
                List<NameValuePair> headerDict, List<NameValuePair> postDict,
                int timeout, String htmlCharset) {
            URI uri = toUri(pageUrl);
            if (uri == null) {
                // Nothing sensible can be requested; previously this path ended in
                // a NullPointerException.
                return "";
            }

            HttpUriRequest request = buildRequest(uri, headerDict, postDict, timeout);

            CookieStore cookieStore = new BasicCookieStore();
            HttpClientContext localContext = HttpClientContext.create();
            localContext.setCookieStore(cookieStore);

            String charset = htmlCharset;
            if (charset == null || charset.isEmpty()) {
                charset = DEFAULT_CHARSET;
            }

            String respHtml = "";
            CloseableHttpClient httpClient = HttpClients.createDefault();
            CloseableHttpResponse response = null;
            try {
                response = executeWithRetry(httpClient, request, localContext);
                if (response != null
                        && response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    HttpEntity respEnt = response.getEntity();
                    if (respEnt != null) {
                        respHtml = EntityUtils.toString(respEnt, charset);
                    }
                }
            } catch (IOException ioe) {
                logger.info(pageUrl + "=====读取出错===" + ioe);
            } finally {
                request.abort();
                // Close each resource on its own so a failure in one close cannot
                // leak the other.
                closeQuietly(response);
                closeQuietly(httpClient);
            }

            return respHtml;
        }

        /**
         * Converts the raw page URL into an escaped {@link URI}.
         *
         * <p>The authority (user-info, host and port) is kept intact. Note that
         * the multi-argument {@code URI} constructors always quote '%', so an
         * already percent-encoded URL gets encoded a second time.
         *
         * @return the URI, or {@code null} when {@code pageUrl} is null or cannot
         *         be parsed
         */
        private static URI toUri(String pageUrl) {
            if (pageUrl == null) {
                logger.warn("Cannot request a null URL");
                return null;
            }
            try {
                URL url = new URL(pageUrl);
                return new URI(url.getProtocol(), url.getAuthority(), url.getPath(),
                        url.getQuery(), null);
            } catch (MalformedURLException e) {
                logger.warn("Malformed URL: " + pageUrl, e);
            } catch (URISyntaxException e) {
                logger.warn("URL cannot be converted to a URI: " + pageUrl, e);
            }
            return null;
        }

        /**
         * Builds the GET or POST request with its timeouts, cookie policy and
         * headers applied.
         */
        private static HttpUriRequest buildRequest(URI uri,
                List<NameValuePair> headerDict, List<NameValuePair> postDict,
                int timeout) {
            int effectiveTimeout = timeout > 0 ? timeout : DEFAULT_TIMEOUT_MS;
            // Socket (read) and connect timeouts share the same value.
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(effectiveTimeout)
                    .setConnectTimeout(effectiveTimeout)
                    .setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY).build();

            HttpUriRequest request;
            if (postDict != null) {
                HttpPost postReq = new HttpPost(uri);
                postReq.setConfig(requestConfig);
                try {
                    postReq.setEntity(new UrlEncodedFormEntity(postDict, "UTF-8"));
                } catch (IOException e) {
                    // Only UnsupportedEncodingException is declared here; the
                    // request is still sent, just without a body, as before.
                    logger.warn("Could not encode form body for " + uri, e);
                }
                request = postReq;
            } else {
                HttpGet getReq = new HttpGet(uri);
                getReq.setConfig(requestConfig);
                request = getReq;
            }

            request.setHeader("User-Agent", USER_AGENT_CHROME);
            if (headerDict != null) {
                for (NameValuePair header : headerDict) {
                    if (header != null && header.getName() != null) {
                        // setHeader lets the caller override the default User-Agent.
                        request.setHeader(header.getName(), header.getValue());
                    }
                }
            }
            return request;
        }

        /**
         * Executes the request, retrying after I/O failures.
         *
         * @return the response, or {@code null} when every attempt failed, the
         *         thread was interrupted while waiting, or an unexpected runtime
         *         error occurred
         */
        private static CloseableHttpResponse executeWithRetry(
                CloseableHttpClient httpClient, HttpUriRequest request,
                HttpClientContext context) {
            for (int attempt = 0; attempt <= MAX_RETRIES; attempt++) {
                if (attempt > 0) {
                    try {
                        // Random pause of 1-6 seconds before each retry.
                        Thread.sleep(((int) (Math.random() * 6) + 1) * 1000L);
                    } catch (InterruptedException ie) {
                        // Preserve the interrupt for the caller and stop retrying.
                        Thread.currentThread().interrupt();
                        return null;
                    }
                }
                try {
                    return httpClient.execute(request, context);
                } catch (IOException e) {
                    logger.info("Request to " + request.getURI() + " failed (attempt "
                            + (attempt + 1) + " of " + (MAX_RETRIES + 1) + "): " + e);
                } catch (RuntimeException e) {
                    // Not retried: a runtime failure will not be fixed by waiting.
                    // It is swallowed so the method keeps returning "" on failure.
                    logger.warn("Unexpected error requesting " + request.getURI(), e);
                    return null;
                }
            }
            return null;
        }

        /** Closes {@code resource} if non-null, logging instead of propagating failures. */
        private static void closeQuietly(java.io.Closeable resource) {
            if (resource == null) {
                return;
            }
            try {
                resource.close();
            } catch (IOException e) {
                logger.info(e);
            }
        }

    }
  • 相关阅读:
    常用分页插件
    sessionStorage二种存值取值的方法
    $(this).index()与$(obj).index(this)的区别
    每次移1px的无缝轮播图
    为什么全局变量在赋值之前调用会报错
    Number()、parseInt()和parseFloat()的区别
    JSON.parse()与JSON.stringify()
    HDU
    出现负数的01背包问题
    HDU
  • 原文地址:https://www.cnblogs.com/tomcattd/p/3793216.html
Copyright © 2020-2023  润新知