• 使用JAVA抓取网页数据


    一、使用 HttpClient 抓取网页数据

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    public String getHtml(String htmlurl) throws IOException {
            StringBuffer sb = new StringBuffer();
            String acceptEncoding = "";
            /* 1.生成 HttpClinet 对象并设置参数 */
            HttpClient httpClient = new HttpClient();
            GetMethod method = new GetMethod(htmlurl);
            int statusCode;
            try {
                statusCode = httpClient.executeMethod(method);
                // 判断访问的状态码
                if (statusCode != HttpStatus.SC_OK) {
                    return null;
                else {
                    if (method.getResponseHeader("Content-Encoding") != null)
                        acceptEncoding = method.getResponseHeader(
                                "Content-Encoding").getValue();
                    if (acceptEncoding.toLowerCase().indexOf("gzip") > -1) {
                        // 建立gzip解压工作流
                        InputStream is;
                        is = method.getResponseBodyAsStream();
                        GZIPInputStream gzin = new GZIPInputStream(is);
                        InputStreamReader isr = new InputStreamReader(gzin, Charset.forName(CHARSET)); // 设置读取流的编码格式,自定义编码
                        java.io.BufferedReader br = new java.io.BufferedReader(isr);
                        String tempbf;
                        while ((tempbf = br.readLine()) != null) {
                            if(StringUtils.isNotBlank(tempbf)){
                                sb.append(tempbf);
                            }
                        }
                        isr.close();
                        gzin.close();
                        System.out.println(sb);
                    else {
                        InputStreamReader isr;
                        isr = new InputStreamReader(
                                method.getResponseBodyAsStream(), CHARSET);
                        java.io.BufferedReader br = new java.io.BufferedReader(isr);
                        String tempbf;
                        while ((tempbf = br.readLine()) != null) {
                            if(StringUtils.isNotBlank(tempbf)){
                                sb.append(tempbf);
                            }
                        }
                        isr.close();
                    }
                }
            catch (HttpException e) {
                e.printStackTrace();
            catch (IOException e) {
                e.printStackTrace();
            }
            method.abort();
            method.releaseConnection();
            return sb.toString();
        }

    二、使用 HttpPost 抓取网页数据

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    // Shared HTTP state for the HttpPost-based fetcher.
    // NOTE(review): httpClient, httpContext and cookieStore are static but are
    // reassigned on every getHtml(String) call, so concurrent callers overwrite
    // each other's values - confirm whether any other code reads these fields.
    private static CloseableHttpClient httpClient;
        private static BasicHttpContext httpContext;
        private static BasicCookieStore cookieStore;
        // Single pooling connection manager handed to every client that getHtml builds.
        private static PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // Client-wide default request config: BEST_MATCH cookie spec.
        private static RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.BEST_MATCH).build();
        // Per-request config: a copy of globalConfig with the cookie spec switched to
        // BROWSER_COMPATIBILITY. Must stay declared after globalConfig, which it copies.
        private static RequestConfig localConfig = RequestConfig.copy(globalConfig).setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY).build();
     
    public String getHtml(String url){
            HttpClientBuilder builder = HttpClients.custom();
            cookieStore = new BasicCookieStore();
            builder.setConnectionManager(cm);
            builder.setDefaultCookieStore(cookieStore);
            builder.setDefaultRequestConfig(globalConfig);
            httpClient = builder.build();
            httpContext = new BasicHttpContext();
            httpContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);
            HttpPost httpPost = new HttpPost(url);
            httpPost.setConfig(localConfig);
            httpPost.setHeader("Accept""text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            httpPost.setHeader("Accept-Encoding","gzip, deflate");
            httpPost.setHeader("Accept-Language","zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
            httpPost.setHeader("Connection","keep-alive");
            httpPost.setHeader("Cookie","ASP.NET_SessionId=11vrr4ucwsgeqtmpyfx4hmvx; _5t_trace_sid=89c4ffb8633d267e4ae322a157b52471; _5t_trace_tms=1; CheckCode=X0P64");
            httpPost.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0");
             List <NameValuePair> nvps = new ArrayList <NameValuePair>();
                nvps.add(new BasicNameValuePair("pid""99-C3-57-35-6D-70-3D-F2"));
                nvps.add(new BasicNameValuePair("CurrentlyPageIndex""2"));
                httpPost.setEntity(new UrlEncodedFormEntity(nvps, Consts.UTF_8));
            try {
                CloseableHttpResponse response = httpClient.execute(httpPost,httpContext);
                int status = response.getStatusLine().getStatusCode();
                if (status >= 200 && status < 300) {
                    HttpEntity httpEntity = response.getEntity();
                    if(httpEntity!=null){
                        String cont = trimLineToString(httpEntity, "UTF-8");
                        EntityUtils.consume(httpEntity);
                        return cont;
                    }
                }
            catch (ClientProtocolException e) {
                e.printStackTrace();
            catch (IOException e) {
                e.printStackTrace();
            }
            return null;
        }
             
        public synchronized static String trimLineToString(HttpEntity entiry,String charset) {
     
            StringBuffer sb = new StringBuffer();
            BufferedReader reader = null;
            try {
                InputStream instream = entiry.getContent();
                reader = new BufferedReader(new InputStreamReader(instream, charset));
                String str = null;
                while ((str = reader.readLine()) != null) {
                    if(StringUtils.isNotBlank(str)) {
                        sb.append(str.trim());
                    }
                }
                instream.close();
            catch (IllegalStateException e) {
                e.printStackTrace();
            catch (IOException e) {
                e.printStackTrace();
            finally {
                if (reader != null) {
                    try {
                        reader.close();
                    catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
            return sb.toString();
        }
  • 相关阅读:
    屏幕截图 从安卓模拟器中识别出屏幕文字
    srcset
    Bitwise and Bit Shift Operators 位运算 取反 补码
    text recognizer (OCR) Engine 光学字符识别
    删除目录下 某类名字的文件
    appmaptile
    登录框
    将代码设置的剪切板内容通过输入法软件粘贴入app搜索框
    面向问题的高级语言
    使用心理视觉来进行图像处理
  • 原文地址:https://www.cnblogs.com/likeju/p/5121544.html
Copyright © 2020-2023  润新知