• 爬虫笔记1


    目录:

    一.get请求,没有参数

    二.get请求,有参数

    三.post请求,没有参数

    四.post请求,有参数

    五.连接池创建连接

    六.jsoup解析

    七.使用dom方式遍历文档

    1.get请求,没有参数

    package Demo1.CrawlerDemo1;
    
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    
    import java.io.IOException;
    import java.net.URI;
    
    import org.apache.http.HttpEntity;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.protocol.ResponseAuthCache;
    import org.apache.http.impl.client.CloseableHttpClient;
    
    /**
     * Minimal HttpClient example: issues a parameterless GET request and prints
     * the response body when the server answers with status 200.
     */
    public class Crawler {

        /**
         * Fetches {@code http://www.itcast.cn} and prints its body to standard output.
         *
         * <p>Both the client and the response are managed by try-with-resources.
         * If {@code execute} fails, no response exists and only the client is
         * closed, so a failed request can no longer trigger a
         * {@code NullPointerException} during cleanup.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            // The request to send: a plain GET with no query parameters.
            HttpGet httpGet = new HttpGet("http://www.itcast.cn");

            // Resources are closed in reverse order: the response first, then the client.
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                    CloseableHttpResponse response = httpClient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() == 200) {
                    HttpEntity httpEntity = response.getEntity();
                    String content = EntityUtils.toString(httpEntity, "utf-8");
                    System.out.println(content);
                }
            } catch (ClientProtocolException e) {
                // HTTP protocol violation reported by the client.
                e.printStackTrace();
            } catch (IOException e) {
                // Connection problem, aborted transfer, or a failure while closing.
                e.printStackTrace();
            }
        }
    }

    2.get请求,有参数

    package Demo1.CrawlerDemo1;
    
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    
    import java.io.IOException;
    import java.net.URI;
    import java.net.URISyntaxException;
    
    import org.apache.http.HttpEntity;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.protocol.ResponseAuthCache;
    import org.apache.http.client.utils.URIBuilder;
    import org.apache.http.impl.client.CloseableHttpClient;
    /**
     * GET request with query parameters.
     *
     * <p>Builds the request URI with {@code URIBuilder}, sends it, and prints the
     * response body when the server answers with status 200.
     *
     * @author 18430
     */
    public class CrawlerParameter {

        /**
         * Requests {@code http://yun.itheima.com/search} with the query parameter
         * {@code keys=java} and prints the body to standard output.
         *
         * <p>The client and the response are managed by try-with-resources, so
         * the client is closed even when building the URI fails, and a failed
         * {@code execute} no longer leads to a {@code NullPointerException}
         * during cleanup.
         *
         * @param args unused
         * @throws Exception if the URI cannot be built
         */
        public static void main(String[] args) throws Exception {
            try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
                URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
                // setParameter returns the builder, so several parameters can be
                // set by chaining further setParameter(name, value) calls.
                uriBuilder.setParameter("keys", "java");

                HttpGet httpGet = new HttpGet(uriBuilder.build());

                try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                    if (response.getStatusLine().getStatusCode() == 200) {
                        HttpEntity httpEntity = response.getEntity();
                        String content = EntityUtils.toString(httpEntity, "utf-8");
                        System.out.println(content);
                    }
                } catch (ClientProtocolException e) {
                    // HTTP protocol violation reported by the client.
                    e.printStackTrace();
                } catch (IOException e) {
                    // Connection problem, aborted transfer, or a failure while closing the response.
                    e.printStackTrace();
                }
            } catch (IOException e) {
                // Failure while closing the client itself.
                e.printStackTrace();
            }
        }
    }

    3.post请求不带参数

    package Demo1.CrawlerDemo1;
    
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    
    import java.io.IOException;
    import java.net.URI;
    import java.net.URISyntaxException;
    
    import org.apache.http.HttpEntity;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.client.protocol.ResponseAuthCache;
    import org.apache.http.client.utils.URIBuilder;
    import org.apache.http.impl.client.CloseableHttpClient;
    /**
     * POST request without parameters.
     *
     * <p>Sends an empty POST and prints the response body when the server
     * answers with status 200.
     *
     * @author 18430
     */
    public class CrawlerPost {

        /**
         * Posts to {@code http://www.itcast.cn} and prints the body to standard output.
         *
         * <p>The client and the response are managed by try-with-resources, so
         * the client is closed even when building the URI fails, and a failed
         * {@code execute} no longer leads to a {@code NullPointerException}
         * during cleanup.
         *
         * @param args unused
         * @throws Exception if the URI cannot be built
         */
        public static void main(String[] args) throws Exception {
            try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
                URIBuilder uriBuilder = new URIBuilder("http://www.itcast.cn");
                HttpPost httpPost = new HttpPost(uriBuilder.build());

                try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
                    if (response.getStatusLine().getStatusCode() == 200) {
                        HttpEntity httpEntity = response.getEntity();
                        String content = EntityUtils.toString(httpEntity, "utf-8");
                        System.out.println(content);
                    }
                } catch (ClientProtocolException e) {
                    // HTTP protocol violation reported by the client.
                    e.printStackTrace();
                } catch (IOException e) {
                    // Connection problem, aborted transfer, or a failure while closing the response.
                    e.printStackTrace();
                }
            } catch (IOException e) {
                // Failure while closing the client itself.
                e.printStackTrace();
            }
        }
    }

    4.post请求,带参数

    import java.io.IOException;
    import java.io.UnsupportedEncodingException;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.http.NameValuePair;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.entity.UrlEncodedFormEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.message.BasicNameValuePair;
    import org.apache.http.util.EntityUtils;
    
    /**
     * POST request with form parameters.
     *
     * <p>Submits a URL-encoded form and prints the response body when the server
     * answers with status 200.
     */
    public class PostWithParam {

        /**
         * Posts the form field {@code keys=java} to
         * {@code http://yun.itheima.com/search} and prints the body to standard output.
         *
         * <p>The client and the response are managed by try-with-resources; the
         * previous version never closed either of them.
         *
         * @param args unused
         * @throws Exception if the form entity cannot be encoded
         */
        public static void main(String[] args) throws Exception {
            try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
                HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");

                // List holding the form fields to submit.
                List<NameValuePair> params = new ArrayList<NameValuePair>();
                params.add(new BasicNameValuePair("keys", "java"));

                // Wrap the fields in a URL-encoded form entity and attach it to the request.
                UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf-8");
                httpPost.setEntity(formEntity);

                try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
                    if (response.getStatusLine().getStatusCode() == 200) {
                        String content = EntityUtils.toString(response.getEntity(), "utf-8");
                        System.out.println(content);
                    }
                } catch (ClientProtocolException e) {
                    // HTTP protocol violation reported by the client.
                    e.printStackTrace();
                } catch (IOException e) {
                    // Connection problem, aborted transfer, or a failure while closing the response.
                    e.printStackTrace();
                }
            } catch (IOException e) {
                // Failure while closing the client itself.
                e.printStackTrace();
            }
        }
    }

     5.使用连接池

    import java.io.IOException;
    
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    import org.apache.http.util.EntityUtils;
    
    /**
     * Connection-pool example: configures a pooling connection manager and
     * performs one GET request through a client built on top of it.
     */
    public class ConnectPool {

        /**
         * Sets up the pool limits and issues a single request.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager();
            // Upper bound on connections across the whole pool.
            manager.setMaxTotal(100);
            // Default upper bound on connections per route (per host).
            manager.setDefaultMaxPerRoute(10);
            // Send a request using the pool.
            doGet(manager);
        }

        /**
         * Builds a client on the given connection manager, fetches
         * {@code http://www.itcast.cn}, and prints the length of the body when
         * the status is 200.
         *
         * @param cm connection manager backing the client
         */
        private static void doGet(PoolingHttpClientConnectionManager cm) {
            CloseableHttpClient pooledClient =
                    HttpClients.custom().setConnectionManager(cm).build();
            HttpGet request = new HttpGet("http://www.itcast.cn");
            CloseableHttpResponse reply = null;
            try {
                reply = pooledClient.execute(request);
                int status = reply.getStatusLine().getStatusCode();
                if (status == 200) {
                    String body = EntityUtils.toString(reply.getEntity(), "utf-8");
                    int length = body.length();
                    System.out.println(length);
                }
            } catch (ClientProtocolException protocolError) {
                protocolError.printStackTrace();
            } catch (IOException ioError) {
                ioError.printStackTrace();
            } finally {
                // The client is managed by the connection pool and must not be
                // closed here; only the response is released.
                release(reply);
            }
        }

        /**
         * Closes the response if one was obtained, printing any close failure.
         *
         * @param reply response to close; may be {@code null}
         */
        private static void release(CloseableHttpResponse reply) {
            if (reply != null) {
                try {
                    reply.close();
                } catch (IOException closeError) {
                    closeError.printStackTrace();
                }
            }
        }
    }

     6.Jsoup解析

    package Demo1.CrawlerDemo1;
    
    import java.io.File;
    import java.net.URL;
    
    import org.apache.commons.io.FileUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.junit.Test;
    
    public class Jsoup1Test2 {
        @Test
        public void testUrl() throws Exception {
            // 解析url
            Document doc = Jsoup.parse(new URL("http://www.itcast.cn"), 1000);
            // 使用标签选择器获取title标签内容
            String title = doc.getElementsByTag("title").first().text();
            System.out.println(title);
    
        }
    
        @Test
        public void testString() throws Exception {
            // 解析字符串
            String content = FileUtils.readFileToString(
                    new File(System.getProperty("user.dir") + "\src\main\java\resources\test.html"), "utf-8");
            Document doc = Jsoup.parse(content);
            String title = doc.getElementsByTag("title").first().text();
            System.out.println(title);
        }
    
        @Test
        public void testFile() throws Exception {
            // 解析文件
            Document doc = Jsoup.parse(new File(System.getProperty("user.dir") + "\src\main\java\resources\test.html"),
                    "utf-8");
            String title = doc.getElementsByTag("title").first().text();
            System.out.println(title);
        }
    
    }
  • 相关阅读:
    CAP 与数据一致性
    C++的构造函数为何不能为虚函数
    构造函数和析构函数中可以调用调用虚函数吗
    HTTP状态码
    C++ 单例模式实现
    【转】十大经典排序算法
    C++ short/int/long/long long 等数据类型大小
    块/文件/对象三种存储的优缺点
    罗振宇《时间的朋友》2019-2020
    Google Hacking
  • 原文地址:https://www.cnblogs.com/yanweichen/p/13339706.html
Copyright © 2020-2023  润新知