• 【apache】使用HttpClient,进行简单网页抓取


     1 package com.lw.httpclient.test;
     2 import org.apache.http.client.methods.CloseableHttpResponse;
     3 import org.apache.http.client.methods.HttpGet;
     4 import org.apache.http.impl.client.CloseableHttpClient;
     5 import org.apache.http.impl.client.HttpClients;
     6 import org.apache.http.util.EntityUtils;
     7 
     8 public class HttpClientTest {
     9     public static void main(String[] args) throws Exception {
    10         // TODO Auto-generated method stub
    11         get1();
    12         get2();
    13     }
    14     /**
    15      * 获取指定链接的网页的内容【初级版】
    16      * @throws Exception
    17      */
    18     public static void get1()throws Exception{
    19         //HttpClient hc=new DefaultHttpClient();
    20         String url="http://www.budejie.com";
    21         url="http://www.btba.com.cn";//网站限制爬,这种方式不再实用。
    22         CloseableHttpClient chc=HttpClients.createDefault();
    23         HttpGet hg=new HttpGet(url);
    24         CloseableHttpResponse chp=chc.execute(hg);
    25         System.out.println(EntityUtils.toString(chp.getEntity(),"UTF-8"));
    26     }
    27     /**
    28      * 通过模拟浏览器获取指定链接的页面
    29      * @throws Exception
    30      */
    31     public static void get2()throws Exception{
    32         CloseableHttpClient closeableHttpClient=HttpClients.createDefault();
    33         String url="http://www.btba.com.cn";
    34         HttpGet httpGet=new HttpGet(url);
    35         //设置请求头,模拟浏览器访问
    36         httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");
    37         CloseableHttpResponse chr=closeableHttpClient.execute(httpGet);
    38         System.out.println(EntityUtils.toString(chr.getEntity(),"UTF-8"));
    39     }
    40 }

    未完待续

    后续将补充如何解析获取到的内容,提取出自己想要的部分。

  • 相关阅读:
    BZOJ2759 一个动态树好题
    BZOJ3527 力
    HDU6069 String
    HDU5069 Harry And Biological Teacher
    AC自动机初步
    HDU6155 Subsequence Count
    while与until
    RADI
    linux压缩及归档
    挂载与卸载
  • 原文地址:https://www.cnblogs.com/oldwei/p/8620387.html
Copyright © 2020-2023  润新知