• java抓取https网页爬虫,解决Server returned HTTP response code: 403 for URL报错


    java抓取https网页爬虫,解决Server returned HTTP response code: 403 for URL报错

    关键是在忽略https证书校验的那段代码里,打开连接之后加上一行:connection.setRequestProperty("User-Agent", "Mozilla/4.76");

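    注意:必须在调用 connection.getInputStream()(也就是 new BufferedReader 那一行)之前设置 User-Agent。一旦调用 getInputStream() 请求就已经发出,之后再设置不会生效(此时调用 setRequestProperty 会抛出 IllegalStateException)。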

    HttpsURLConnection.setDefaultHostnameVerifier(hv);
    
                connection = (HttpURLConnection) validationUrl.openConnection();
                //first set User-Agent to solve Server returned HTTP response code: 403 for URL
                connection.setRequestProperty("User-Agent", "Mozilla/4.76");
                
                final BufferedReader in = new BufferedReader(new InputStreamReader(
                        connection.getInputStream()));

    在抓取网页的地方,先调用一次上面忽略https证书校验的代码,再用 Jsoup 发起请求:

    //先调用下忽略https证书的再请求才可以
                HttpsUrlValidator.retrieveResponseFromServer(url);
                
                doc = Jsoup
                        .connect(url)
                        .header("User-Agent",rand_agents)

    完整的HttpsUrlValidator.java代码如下:

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.URL;
    
    import javax.net.ssl.HostnameVerifier;
    import javax.net.ssl.HttpsURLConnection;
    import javax.net.ssl.SSLSession;
    
    
    /**
     * Fetches the body of a URL while accepting every HTTPS certificate and
     * every host name.
     *
     * <p><strong>Security warning:</strong> calling
     * {@link #retrieveResponseFromServer(String)} or
     * {@link #trustAllHttpsCertificates()} replaces the JVM-wide default SSL
     * socket factory and host-name verifier of {@link HttpsURLConnection}, so
     * certificate validation is disabled for every later
     * {@code HttpsURLConnection} in the process, not only for this request.
     */
    public class HttpsUrlValidator {

        /**
         * Host-name verifier that prints the requested host and the peer host
         * to standard output and then accepts the connection unconditionally.
         */
        static HostnameVerifier hv = new HostnameVerifier() {
            @Override
            public boolean verify(String urlHostName, SSLSession session) {
                System.out.println("Warning: URL Host: " + urlHostName + " vs. "
                                   + session.getPeerHost());
                return true;
            }
        };

        /**
         * Downloads the resource at {@code url} and returns its text.
         *
         * <p>Every line of the response is followed by a single {@code '\n'}.
         * The bytes are decoded with the platform default charset because no
         * charset is passed to {@link InputStreamReader}.
         *
         * @param url the address to fetch
         * @return the response text, or {@code null} if anything goes wrong
         *         (malformed URL, non-HTTP URL, I/O error, error status, ...);
         *         the exception message is printed to standard output
         */
        public final static String retrieveResponseFromServer(final String url) {
            HttpURLConnection connection = null;

            try {
                URL validationUrl = new URL(url);
                trustAllHttpsCertificates();
                HttpsURLConnection.setDefaultHostnameVerifier(hv);

                connection = (HttpURLConnection) validationUrl.openConnection();
                // The User-Agent must be set before getInputStream() sends the
                // request; without it some servers answer with HTTP 403.
                connection.setRequestProperty("User-Agent", "Mozilla/4.76");

                // try-with-resources closes the reader (and the underlying
                // stream) on every path.
                try (BufferedReader in = new BufferedReader(new InputStreamReader(
                        connection.getInputStream()))) {
                    // The buffer never leaves this method, so no locking or
                    // synchronized StringBuffer is needed.
                    final StringBuilder response = new StringBuilder(255);
                    String line;
                    while ((line = in.readLine()) != null) {
                        response.append(line).append('\n');
                    }
                    return response.toString();
                }
            } catch (final Exception e) {
                // Best-effort contract: report the problem and signal failure
                // with null instead of propagating.
                System.out.println(e.getMessage());
                return null;
            } finally {
                if (connection != null) {
                    connection.disconnect();
                }
            }
        }

        /**
         * Installs, as the JVM-wide default for {@link HttpsURLConnection}, an
         * SSL socket factory whose only trust manager is {@link miTM}, i.e. one
         * that accepts any certificate chain.
         *
         * @throws Exception if the SSL context cannot be obtained or initialised
         */
        public static void trustAllHttpsCertificates() throws Exception {
            javax.net.ssl.TrustManager[] trustAllCerts = { new miTM() };
            // "TLS" is the standard protocol name for a general-purpose context.
            javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext
                    .getInstance("TLS");
            sc.init(null, trustAllCerts, null);
            javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc
                    .getSocketFactory());
        }

        /**
         * Trust manager that performs no validation: both check methods return
         * without throwing, so every client and server chain is accepted.
         */
        static class miTM implements javax.net.ssl.TrustManager,
                javax.net.ssl.X509TrustManager {

            /**
             * @return an empty array; the interface contract requires a
             *         non-null result
             */
            @Override
            public java.security.cert.X509Certificate[] getAcceptedIssuers() {
                return new java.security.cert.X509Certificate[0];
            }

            /** Not part of {@code X509TrustManager}; always returns {@code true}. */
            public boolean isServerTrusted(
                    java.security.cert.X509Certificate[] certs) {
                return true;
            }

            /** Not part of {@code X509TrustManager}; always returns {@code true}. */
            public boolean isClientTrusted(
                    java.security.cert.X509Certificate[] certs) {
                return true;
            }

            /** Accepts every server certificate chain (never throws). */
            @Override
            public void checkServerTrusted(
                    java.security.cert.X509Certificate[] certs, String authType)
                    throws java.security.cert.CertificateException {
                // Intentionally empty: no validation is performed.
            }

            /** Accepts every client certificate chain (never throws). */
            @Override
            public void checkClientTrusted(
                    java.security.cert.X509Certificate[] certs, String authType)
                    throws java.security.cert.CertificateException {
                // Intentionally empty: no validation is performed.
            }
        }

    }
  • 相关阅读:
    Python之文件操作
    document.hasFocus() & $(window).blur()
    innerHtml 会忽略里面元素的属性
    ng  命令集合
    阿里云ECS CentOs7.3下搭建LAMP环境(Apache2.4 + Mysql5.7 + PHP5.6 + Laravel5.2)
    在忘记root密码的时候,可以这样 亲测可用
    下一次装mysql 试一下这个方法
    CentOS-6.8安装Mysql-5.5.29
    阿里云服务器下安装LAMP环境(CentOS Linux 6.3)
    CentOS 7.2 配置Apache服务(httpd)--上篇
  • 原文地址:https://www.cnblogs.com/zdz8207/p/16813796.html
Copyright © 2020-2023  润新知