• 2020年寒假假期总结0114


      WebMagic的学习基础:Jsoup的学习(Http基础API和Jsoup基础API)

      在学习WebMagic之前,我们需要先简单了解Jsoup的相关知识,因为WebMagic是使用Jsoup来解析页面的爬虫框架。

      下面我会列出Jsoup的API的使用方法。在此之前,先列出HttpClient发送Http请求的一些基本操作。

      所需要的依赖:

    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.10.2</version>
            </dependency>
            <!-- https://mvnrepository.com/artifact/junit/junit -->
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>4.12</version>
                <scope>test</scope>
            </dependency>
            <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
            <dependency>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
                <version>2.4</version>
            </dependency>
            <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
            <dependency>
                <groupId>org.apache.commons</groupId>
                <artifactId>commons-lang3</artifactId>
                <version>3.7</version>
            </dependency>
            <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
            <!-- Provides the org.apache.http.* classes (HttpClients, HttpGet, HttpPost, -->
            <!-- PoolingHttpClientConnectionManager, ...) used by the HTTP examples. -->
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpclient</artifactId>
                <version>4.5.13</version>
            </dependency>

      HTTPGet的使用:

    /**
     * Minimal HttpClient GET example: requests a page and prints the length of its body.
     */
    public class HttpGetTest {

        /**
         * Sends a GET request to http://www.itcast.cn and, when the status code is 200,
         * prints the number of characters in the body decoded as UTF-8.
         * An {@link IOException} from the request or from closing is printed, not rethrown.
         *
         * @param args command-line arguments (unused)
         */
        public static void main(String[] args) {
            // The request object carries the target URL.
            HttpGet httpGet = new HttpGet("http://www.itcast.cn");

            // try-with-resources closes the response first and the client second, and it
            // only closes resources that were actually created. If execute() throws there
            // is no response to close, so no null dereference can happen during cleanup
            // and the client is still released.
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                 CloseableHttpResponse response = httpClient.execute(httpGet)) {
                // Only a 200 response is parsed.
                if (response.getStatusLine().getStatusCode() == 200) {
                    String content = EntityUtils.toString(response.getEntity(), "utf8");
                    System.out.println(content.length());
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

      带参数的Get请求:

    /**
     * HttpClient GET example with a query parameter appended through {@code URIBuilder}.
     */
    public class HttpGetParmTest {

        /**
         * Builds http://yun.itheima.com/search with the query parameter keys=Java, sends a
         * GET request and, when the status code is 200, prints the number of characters in
         * the body decoded as UTF-8.
         * An {@link IOException} from the request or from closing is printed, not rethrown.
         *
         * @param args command-line arguments (unused)
         * @throws URISyntaxException if the base address or the built URI is malformed;
         *         propagated instead of being swallowed, because continuing without a
         *         valid URI would only fail later with a NullPointerException
         */
        public static void main(String[] args) throws URISyntaxException {
            // URIBuilder adds (and encodes) the query parameter for us.
            URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
            uriBuilder.setParameter("keys", "Java");

            // The request object carries the fully built URI.
            HttpGet httpGet = new HttpGet(uriBuilder.build());
            System.out.println("发送的请求地址:" + httpGet);

            // try-with-resources closes the response first and the client second, and it
            // only closes resources that were actually created, so a failing execute()
            // cannot cause a null dereference during cleanup.
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                 CloseableHttpResponse response = httpClient.execute(httpGet)) {
                // Only a 200 response is parsed.
                if (response.getStatusLine().getStatusCode() == 200) {
                    String content = EntityUtils.toString(response.getEntity(), "utf8");
                    System.out.println(content.length());
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

      Post的请求:

      与Get请求相似,只需要将Get请求中使用的HttpGet类换成HttpPost类即可

      Post中带参数请求:(添加的参数为表单信息)

    /**
     * HttpClient POST example that submits form parameters as a URL-encoded entity.
     */
    public class HttpPostParmTest {

        /**
         * Posts the form field keys=Java to http://itcast.cn and, when the status code is
         * 200, prints the number of characters in the body decoded as UTF-8.
         * An {@link IOException} from the request or from closing is printed, not rethrown.
         *
         * @param args command-line arguments (unused)
         * @throws UnsupportedEncodingException if the form entity's charset name is not supported
         */
        public static void main(String[] args) throws UnsupportedEncodingException {
            // The request object carries the target URL.
            HttpPost httpPost = new HttpPost("http://itcast.cn");

            // Form fields are collected as name/value pairs ...
            List<NameValuePair> pairs = new ArrayList<NameValuePair>();
            pairs.add(new BasicNameValuePair("keys", "Java"));
            // ... wrapped in a URL-encoded form entity and attached to the request.
            UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(pairs, "utf8");
            httpPost.setEntity(formEntity);
            System.out.println("发送的请求为:" + httpPost);

            // try-with-resources closes the response first and the client second, and it
            // only closes resources that were actually created, so a failing execute()
            // cannot cause a null dereference during cleanup.
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                 CloseableHttpResponse response = httpClient.execute(httpPost)) {
                // Only a 200 response is parsed.
                if (response.getStatusLine().getStatusCode() == 200) {
                    String content = EntityUtils.toString(response.getEntity(), "utf8");
                    System.out.println(content.length());
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

      连接池的使用:

    /**
     * Example of sharing one pooling connection manager between several requests.
     */
    public class HttpClientPool {

        /**
         * Creates a connection pool (100 connections in total, 10 per route), issues two
         * GET requests through it and finally shuts the pool down.
         *
         * @param args command-line arguments (unused)
         */
        public static void main(String[] args) {
            PoolingHttpClientConnectionManager clientConnectionManager = new PoolingHttpClientConnectionManager();
            try {
                // Upper bound for all connections held by the pool.
                clientConnectionManager.setMaxTotal(100);
                // Upper bound for connections to a single route (host).
                clientConnectionManager.setDefaultMaxPerRoute(10);
                // Both requests go through the same pool.
                doGet(clientConnectionManager);
                doGet(clientConnectionManager);
            } finally {
                // The clients built in doGet are intentionally never closed, so the pool
                // itself has to be shut down here to release its connections.
                clientConnectionManager.shutdown();
            }
        }

        /**
         * Sends a GET request to http://www.itcast.cn using a client backed by the given
         * connection manager and, when the status code is 200, prints the number of
         * characters in the body decoded as UTF-8.
         * An {@link IOException} from the request or from closing is printed, not rethrown.
         *
         * @param clientConnectionManager the shared pool the client takes its connections from
         */
        private static void doGet(PoolingHttpClientConnectionManager clientConnectionManager) {
            // Build a client on top of the shared pool. It is deliberately not closed in
            // this method: the pool's lifecycle is managed by the caller.
            CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(clientConnectionManager).build();
            HttpGet httpGet = new HttpGet("http://www.itcast.cn");

            // Only the response is closed here.
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() == 200) {
                    String content = EntityUtils.toString(response.getEntity(), "utf8");
                    System.out.println(content.length());
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

       Get请求携带配置信息:

    // Build the per-request timeout settings (HttpClient reads these values as milliseconds)
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(10 * 1000)       // longest wait for response data on the socket
                    .setConnectionRequestTimeout(500)  // longest wait to obtain a connection
                    .setConnectTimeout(1000)           // longest time allowed to establish the connection
                    .build();
            // Attach the settings to the request before it is executed
            httpGet.setConfig(requestConfig);

      Jsoup基础API使用:(注释很详细,就不做解释了)

    public class JsoupFirsttest {
    
    
        @Test
        public void testUrl() throws Exception {
            //解析Url地址,第一个参数是访问额url,第二个参数是访问时候的超时时间
            Document doc = Jsoup.parse(new URL("http://www.itcast.cn"), 1000);
            //使用标签选择器
            String title = doc.getElementsByTag("title").first().text();
            //打印
            System.out.println(title);
        }
    
        @Test
        public void testString() throws Exception {
            //读取文件,获取字符串
            String content = FileUtils.readFileToString(new File("C:\Users\SuperMan\Desktop\test.html"), "utf8");
            //解析字符串
            Document doc = Jsoup.parse(content);
            String title = doc.getElementsByTag("title").first().text();
            System.out.println(title);
        }
    
        @Test
        public void testFile() throws Exception {
            //解析文件
            Document doc = Jsoup.parse(new File("C:\Users\SuperMan\Desktop\test.html"), "utf8");
            String title = doc.getElementsByTag("title").first().text();
            System.out.println(title);
        }
    
        @Test
        public void TestDom() throws Exception {
            Document doc = Jsoup.parse(new File("C:\Users\SuperMan\Desktop\test.html"), "utf8");
    
            //获取元素
            //通过ID获取元素内容
            Element element = doc.getElementById("city_bj");
            //通过标签获取元素
            Element element1 = doc.getElementsByTag("span").get(12);
            //通过class获取元素
            Element element2 = doc.getElementsByClass("fdnav").first();
            //通过属性获取
            Element element3 = doc.getElementsByAttribute("abc").first();
            Element element4 = doc.getElementsByAttributeValue("href", "http://yun.itheima.com/").first();
            //打印
            System.out.println(element.text());
            System.out.println(element1.text());
            System.out.println(element2.text());
            System.out.println(element3.text());
            System.out.println(element4.text());
        }
    
        @Test
        public void testData()throws Exception{
            Document doc = Jsoup.parse(new File("C:\Users\SuperMan\Desktop\test.html"), "utf8");
    
            //获取元素
            Element element=doc.getElementById("cy");
            String str="";
            //从元素中获取ID
            str=element.id();
            System.out.println(str);
            //获取ClassName
            Set<String> s =element.classNames();
            for (String string:s
                 ) {
                System.out.println(string);
            }
            //获取attr
            str=element.attr("id");
            //获取所有属性
            Attributes attribute=element.attributes();
            System.out.println(attribute.toString());
            //获取文本内容
            str=element.text();
            System.out.println(str);
        }
    }

      

      

      

  • 相关阅读:
    HTML页面下echarts图形绘制
    nth-child的运用
    黑客零做起
    回溯法-背包问题
    回溯法-迷宫问题
    ECMA概述
    微信小程序-蓝牙
    JavaScript实现千位分隔符
    Vue 就地复用策略
    内联函数inline
  • 原文地址:https://www.cnblogs.com/heiyang/p/12199107.html
Copyright © 2020-2023  润新知