• JAVA爬虫


    基于 HttpClient 获取网页内容、基于 jsoup 解析页面,开发工具为 IntelliJ IDEA,项目使用 Spring Boot 框架。

    以下代码参考了网上找到的 GECCO 爬虫框架的源码,如有问题可以到 GitHub 上查看 gecco 框架。

    1.

    Requestor:获取网页对象(已封装)
    package com.example.demo.httpclient;

    import org.apache.http.*;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.CookieStore;
    import org.apache.http.client.config.CookieSpecs;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.entity.UrlEncodedFormEntity;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.client.methods.HttpRequestBase;
    import org.apache.http.client.protocol.HttpClientContext;
    import org.apache.http.config.Registry;
    import org.apache.http.config.RegistryBuilder;
    import org.apache.http.cookie.CookieSpecProvider;
    import org.apache.http.impl.client.BasicCookieStore;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.cookie.BasicClientCookie;
    import org.apache.http.impl.cookie.DefaultCookieSpecProvider;
    import org.apache.http.message.BasicNameValuePair;
    import org.apache.http.util.EntityUtils;

    import java.io.IOException;
    import java.io.UnsupportedEncodingException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;

    /**
     * Small wrapper around Apache HttpClient that issues GET/POST requests with a
     * fixed set of browser-like headers and, optionally, a cookie-carrying context.
     *
     * <p>Typical cookie flow: call {@link #setCookieStore(HttpResponse)} with the
     * login response, then {@link #setContext()}; subsequent {@link #doGet(String)}
     * and {@link #doPost(String, Map)} calls are executed with that context.
     *
     * <p>Responses returned by {@code doGet}/{@code doPost} are not closed here;
     * the caller is expected to consume the entity.
     */
    public class Requestor {

        protected final static String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36";
        protected final static String Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        protected final static String AcceptLanguage = "zh-CN,zh;q=0.8,en;q=0.6";

        protected CookieStore cookieStore;
        protected HttpClientContext context;
        protected CloseableHttpClient client;

        /** Creates a requestor backed by a default-configured HTTP client. */
        public Requestor() {
            client = HttpClients.createDefault();
        }

        /**
         * Posts the given form parameters to the login URL and prints the response.
         *
         * @param loginUrl URL the form is posted to
         * @param params   form fields to send
         */
        public void doLogin(String loginUrl, Map<String, String> params) {
            HttpResponse httpResponse = doPost(loginUrl, params);
            printResponse(httpResponse);
        }

        /**
         * Prints the status line, all headers and the body of a response to stdout.
         *
         * @param httpResponse response to dump; {@code null} (which doGet/doPost
         *                     return when the request failed) is ignored
         */
        public void printResponse(HttpResponse httpResponse) {
            if (httpResponse == null) {
                // The request already failed and its stack trace was printed; nothing to dump.
                return;
            }
            // Response status
            System.out.println("status:" + httpResponse.getStatusLine());
            System.out.println("headers:");
            HeaderIterator iterator = httpResponse.headerIterator();
            while (iterator.hasNext()) {
                System.out.println(" " + iterator.next());
            }
            // The entity may be absent (e.g. for responses without a body)
            HttpEntity entity = httpResponse.getEntity();
            if (entity != null) {
                try {
                    // UTF-8 is only the fallback when the response declares no charset.
                    String responseString = EntityUtils.toString(entity, "UTF-8");
                    System.out.println("response length:" + responseString.length());
                    System.out.println("response content:" + responseString.replace(" ", ""));
                } catch (org.apache.http.ParseException | IOException e) {
                    e.printStackTrace();
                }
            }
        }

        /**
         * Builds a fresh request context that uses the current {@link #cookieStore}.
         * Call this after {@link #setCookieStore(HttpResponse)} so the context sees
         * the populated store.
         */
        public void setContext() {
            context = HttpClientContext.create();
            Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider> create()
                    .register(CookieSpecs.DEFAULT, new DefaultCookieSpecProvider())
                    .build();
            context.setCookieSpecRegistry(registry);
            context.setCookieStore(cookieStore);
        }

        /**
         * Replaces the cookie store with a new one holding the {@code oscid} cookie
         * taken from the response's {@code Set-Cookie} headers.
         *
         * <p>Only the value part of the {@code oscid=value} pair is stored; cookie
         * attributes such as {@code Path} or {@code HttpOnly} are not part of the
         * value. If the response carries no {@code oscid} cookie the store is left
         * empty.
         *
         * @param httpResponse response whose {@code Set-Cookie} headers are read
         */
        public void setCookieStore(HttpResponse httpResponse) {
            cookieStore = new BasicCookieStore();
            if (httpResponse == null) {
                return;
            }
            for (Header header : httpResponse.getHeaders("Set-Cookie")) {
                System.out.println(header.getName() + ":" + header.getValue());
                String raw = header.getValue();
                if (raw == null) {
                    continue;
                }
                // The name=value pair is everything before the first ';'.
                int attributesStart = raw.indexOf(';');
                String pair = attributesStart < 0 ? raw : raw.substring(0, attributesStart);
                int equalsAt = pair.indexOf('=');
                if (equalsAt < 0 || !"oscid".equals(pair.substring(0, equalsAt).trim())) {
                    continue;
                }
                BasicClientCookie cookie = new BasicClientCookie("oscid", pair.substring(equalsAt + 1).trim());
                cookie.setDomain(".oschina.net");
                cookie.setPath("/");
                cookieStore.addCookie(cookie);
            }
        }

        /**
         * Converts a parameter map into the name/value list expected by form entities.
         *
         * @param parameterMap form fields; {@code null} is treated as empty
         * @return one pair per map entry, in the map's iteration order
         */
        public List<NameValuePair> getParam(Map<String, String> parameterMap) {
            List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
            if (parameterMap == null) {
                return nameValuePairs;
            }
            for (Map.Entry<String, String> entry : parameterMap.entrySet()) {
                nameValuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
            return nameValuePairs;
        }

        /**
         * Executes a GET request.
         *
         * @param url target URL
         * @return the response, or {@code null} if the request failed with an I/O error
         */
        public HttpResponse doGet(String url) {
            HttpResponse result = null;
            HttpGet httpGet = new HttpGet(url);
            config(httpGet);
            try {
                // Passing the context (possibly null) makes the configured cookie store take effect.
                result = client.execute(httpGet, context);
            } catch (IOException e) {
                e.printStackTrace();
            }
            return result;
        }

        /**
         * Executes a form-encoded (UTF-8) POST request.
         *
         * @param url    target URL
         * @param params form fields to send
         * @return the response, or {@code null} if the request failed with an I/O error
         */
        public HttpResponse doPost(String url, Map<String, String> params) {
            HttpResponse result = null;
            HttpPost httpPost = new HttpPost(url);
            try {
                UrlEncodedFormEntity postEntity = new UrlEncodedFormEntity(getParam(params), "UTF-8");
                httpPost.setEntity(postEntity);
                config(httpPost);
                result = client.execute(httpPost, context);
            } catch (IOException e) {
                // Also covers UnsupportedEncodingException and ClientProtocolException.
                e.printStackTrace();
            }
            return result;
        }

        /**
         * Applies the common headers and timeouts to a request.
         *
         * @param httpRequestBase request to configure
         */
        protected void config(HttpRequestBase httpRequestBase) {
            httpRequestBase.setHeader("User-Agent", USER_AGENT);
            httpRequestBase.setHeader("Accept", Accept);
            httpRequestBase.setHeader("Accept-Language", AcceptLanguage);
            httpRequestBase.setHeader("Referer", "https://www.oschina.net/home/login?goto_page=http%3A%2F%2Fwww.oschina.net%2F");
            // Timeouts (milliseconds) for leasing a connection, connecting and socket reads
            RequestConfig requestConfig = RequestConfig.custom()
                    .setConnectionRequestTimeout(300000)
                    .setConnectTimeout(300000)
                    .setSocketTimeout(300000)
                    .build();
            httpRequestBase.setConfig(requestConfig);
        }

    }

    2.
    AbstractClient类
    package com.example.demo.httpclient;

    import org.apache.http.Header;
    import org.apache.http.HttpResponse;
    import org.apache.http.NameValuePair;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.CookieStore;
    import org.apache.http.client.config.CookieSpecs;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.entity.UrlEncodedFormEntity;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.client.methods.HttpRequestBase;
    import org.apache.http.client.protocol.HttpClientContext;
    import org.apache.http.config.Registry;
    import org.apache.http.config.RegistryBuilder;
    import org.apache.http.cookie.CookieSpecProvider;
    import org.apache.http.impl.client.BasicCookieStore;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.cookie.BasicClientCookie;
    import org.apache.http.impl.cookie.DefaultCookieSpecProvider;
    import org.apache.http.message.BasicNameValuePair;

    import java.io.IOException;
    import java.io.UnsupportedEncodingException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;

    /**
     * Base HTTP client that issues GET/POST requests with a fixed set of
     * browser-like headers and, optionally, a cookie-carrying context.
     *
     * <p>Typical cookie flow: call {@link #setCookieStore(HttpResponse)} with the
     * login response, then {@link #setContext()}; subsequent {@link #doGet(String)}
     * and {@link #doPost(String, Map)} calls are executed with that context.
     *
     * <p>Responses returned by {@code doGet}/{@code doPost} are not closed here;
     * the caller is expected to consume the entity.
     */
    public class AbstractClient {

        protected final static String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36";
        protected final static String Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        protected final static String AcceptLanguage = "zh-CN,zh;q=0.8,en;q=0.6";

        protected CookieStore cookieStore;
        protected HttpClientContext context;
        protected CloseableHttpClient client;

        /** Creates a client backed by a default-configured HTTP client. */
        public AbstractClient() {
            client = HttpClients.createDefault();
        }

        /**
         * Builds a fresh request context that uses the current {@link #cookieStore}.
         * Call this after {@link #setCookieStore(HttpResponse)} so the context sees
         * the populated store.
         */
        public void setContext() {
            context = HttpClientContext.create();
            Registry<CookieSpecProvider> registry = RegistryBuilder.<CookieSpecProvider> create()
                    .register(CookieSpecs.DEFAULT, new DefaultCookieSpecProvider())
                    .build();
            context.setCookieSpecRegistry(registry);
            context.setCookieStore(cookieStore);
        }

        /**
         * Replaces the cookie store with a new one holding the {@code oscid} cookie
         * taken from the response's {@code Set-Cookie} headers.
         *
         * <p>Only the value part of the {@code oscid=value} pair is stored; cookie
         * attributes such as {@code Path} or {@code HttpOnly} are not part of the
         * value. If the response carries no {@code oscid} cookie the store is left
         * empty.
         *
         * @param httpResponse response whose {@code Set-Cookie} headers are read
         */
        public void setCookieStore(HttpResponse httpResponse) {
            cookieStore = new BasicCookieStore();
            if (httpResponse == null) {
                return;
            }
            for (Header header : httpResponse.getHeaders("Set-Cookie")) {
                System.out.println(header.getName() + ":" + header.getValue());
                String raw = header.getValue();
                if (raw == null) {
                    continue;
                }
                // The name=value pair is everything before the first ';'.
                int attributesStart = raw.indexOf(';');
                String pair = attributesStart < 0 ? raw : raw.substring(0, attributesStart);
                int equalsAt = pair.indexOf('=');
                if (equalsAt < 0 || !"oscid".equals(pair.substring(0, equalsAt).trim())) {
                    continue;
                }
                BasicClientCookie cookie = new BasicClientCookie("oscid", pair.substring(equalsAt + 1).trim());
                cookie.setDomain(".oschina.net");
                cookie.setPath("/");
                cookieStore.addCookie(cookie);
            }
        }

        /**
         * Converts a parameter map into the name/value list expected by form entities.
         *
         * @param parameterMap form fields; {@code null} is treated as empty
         * @return one pair per map entry, in the map's iteration order
         */
        public List<NameValuePair> getParam(Map<String, String> parameterMap) {
            List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>();
            if (parameterMap == null) {
                return nameValuePairs;
            }
            for (Map.Entry<String, String> entry : parameterMap.entrySet()) {
                nameValuePairs.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
            return nameValuePairs;
        }

        /**
         * Executes a GET request.
         *
         * @param url target URL
         * @return the response, or {@code null} if the request failed with an I/O error
         */
        public HttpResponse doGet(String url) {
            HttpResponse result = null;
            HttpGet httpGet = new HttpGet(url);
            config(httpGet);
            try {
                // Passing the context (possibly null) makes the configured cookie store take effect.
                result = client.execute(httpGet, context);
            } catch (IOException e) {
                e.printStackTrace();
            }
            return result;
        }

        /**
         * Executes a form-encoded (UTF-8) POST request.
         *
         * @param url    target URL
         * @param params form fields to send
         * @return the response, or {@code null} if the request failed with an I/O error
         */
        public HttpResponse doPost(String url, Map<String, String> params) {
            HttpResponse result = null;
            HttpPost httpPost = new HttpPost(url);
            try {
                UrlEncodedFormEntity postEntity = new UrlEncodedFormEntity(getParam(params), "UTF-8");
                httpPost.setEntity(postEntity);
                config(httpPost);
                result = client.execute(httpPost, context);
            } catch (IOException e) {
                // Also covers UnsupportedEncodingException and ClientProtocolException.
                e.printStackTrace();
            }
            return result;
        }

        /**
         * Applies the common headers and timeouts to a request.
         *
         * @param httpRequestBase request to configure
         */
        protected void config(HttpRequestBase httpRequestBase) {
            httpRequestBase.setHeader("User-Agent", USER_AGENT);
            httpRequestBase.setHeader("Accept", Accept);
            httpRequestBase.setHeader("Accept-Language", AcceptLanguage);
            // Timeouts (milliseconds) for leasing a connection, connecting and socket reads
            RequestConfig requestConfig = RequestConfig.custom()
                    .setConnectionRequestTimeout(3000)
                    .setConnectTimeout(3000)
                    .setSocketTimeout(3000)
                    .build();
            httpRequestBase.setConfig(requestConfig);
        }

    }

    3.使用--参照用法
    package com.example.demo.getpage;

    import com.example.demo.entity.CarBrand;
    import com.example.demo.entity.CarDemio;
    import com.example.demo.entity.CarVehicle;
    import com.alibaba.fastjson.JSONArray;
    import com.example.demo.httpclient.Requestor;
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpResponse;
    import org.apache.http.util.EntityUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    /**
     * Scrapes car brands from the che300 home page and loads the series and
     * vehicle models of those brands from che300's JSON metadata endpoints.
     *
     * <p>All methods are best-effort: failures are reported via
     * {@code printStackTrace()} and whatever was collected so far is returned.
     */
    public class CarGet {

        private final Requestor requestor = new Requestor();

        /**
         * Reads the brand list from the che300 home page.
         *
         * <p>Every {@code <p>} element inside the brand container becomes a brand,
         * except those whose id is a single letter A-Z (skipped as non-brand rows).
         *
         * @return the brands found; empty if the page could not be fetched or the
         *         brand container is missing
         */
        public List<CarBrand> getCarBrands() {
            List<CarBrand> carBrands = new ArrayList<CarBrand>();
            try {
                Document allDocument = Jsoup.connect("https://www.che300.com/?from=bd_seo&city=11").get();
                Element brandContainer = allDocument
                        .getElementsByAttributeValue("class", "ucarselecttype_pinpaibottom_ul brand")
                        .first();
                if (brandContainer == null) {
                    // Page layout changed or the container is absent: nothing to parse.
                    return carBrands;
                }
                for (Element e : brandContainer.getElementsByTag("p")) {
                    // Ids that are a single letter A-Z are excluded
                    if (!isSingleUpperCaseLetter(e.id())) {
                        CarBrand cb = new CarBrand();
                        cb.setSeries_brand(e.id());
                        cb.setBrand_name(e.html());
                        cb.setRel(e.attr("rel"));
                        carBrands.add(cb);
                    }
                }
            } catch (IOException | RuntimeException e) {
                // Best-effort: report the failure instead of silently discarding it,
                // and fall through to return what was collected.
                e.printStackTrace();
            }
            return carBrands;
        }

        /** Returns true when {@code id} is exactly one character in the range A-Z. */
        private static boolean isSingleUpperCaseLetter(String id) {
            return id != null && id.length() == 1 && id.charAt(0) >= 'A' && id.charAt(0) <= 'Z';
        }

        /**
         * Fetches a JSON array from {@code url} and maps its elements to {@code type}.
         *
         * @return the mapped list; empty if the request failed, the response had no
         *         body, or the body could not be parsed
         */
        private <T> List<T> fetchJsonList(String url, Class<T> type) {
            List<T> result = new ArrayList<T>();
            try {
                HttpResponse response = requestor.doGet(url);
                if (response == null) {
                    // doGet returns null when the request itself failed.
                    return result;
                }
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    return result;
                }
                // UTF-8 is only the fallback when the response declares no charset.
                String body = EntityUtils.toString(entity, "UTF-8");
                JSONArray array = JSONArray.parseArray(body);
                if (array != null) {
                    result = array.toJavaList(type);
                }
            } catch (IOException | RuntimeException e) {
                // Best-effort: malformed JSON or I/O problems yield an empty result.
                e.printStackTrace();
            }
            return result;
        }

        /** Loads all series of a single brand from the given endpoint URL. */
        private List<CarDemio> getOneCarDemio(String url) {
            return fetchJsonList(url, CarDemio.class);
        }

        /**
         * Loads the series of every given brand.
         *
         * @param carBrands brands to look up; {@code null} is treated as empty
         * @return the series of all brands, concatenated in brand order
         */
        public List<CarDemio> getCarDemio(List<CarBrand> carBrands) {
            List<CarDemio> carDemios = new ArrayList<CarDemio>();
            if (carBrands == null) {
                return carDemios;
            }
            for (CarBrand cb : carBrands) {
                String url = "https://ssl-meta.che300.com/meta/series/series_brand" + cb.getSeries_brand() + ".json?v=159";
                carDemios.addAll(getOneCarDemio(url));
            }
            return carDemios;
        }

        /** Loads all vehicle models of a single series from the given endpoint URL. */
        private List<CarVehicle> getOneCarVhicle(String url) {
            return fetchJsonList(url, CarVehicle.class);
        }

        /**
         * Loads the vehicle models of every given series.
         *
         * @param carDemios series to look up; {@code null} is treated as empty
         * @return the models of all series, concatenated in series order
         */
        public List<CarVehicle> getCarVehicles(List<CarDemio> carDemios) {
            List<CarVehicle> carVehicles = new ArrayList<CarVehicle>();
            if (carDemios == null) {
                return carVehicles;
            }
            for (CarDemio cd : carDemios) {
                String url = "https://ssl-meta.che300.com/meta/model/model_series" + cd.getSeries_id() + ".json?v=159";
                carVehicles.addAll(getOneCarVhicle(url));
            }
            return carVehicles;
        }
    }

    4.主要用法
    private Requestor requestor = new Requestor();

    /**
     * Fetches the JSON array served at {@code testUrl}, maps it to
     * {@link CarDemio} objects and prints each one to stdout.
     *
     * @throws Exception if reading or parsing the response fails
     */
    @Test
    public void testVisitBlog() throws Exception {
        String body = EntityUtils.toString(requestor.doGet(testUrl).getEntity());
        List<CarDemio> seriesList = JSONArray.parseArray(body).toJavaList(CarDemio.class);
        for (int i = 0; i < seriesList.size(); i++) {
            System.out.println(seriesList.get(i));
        }
    }
    控制台打印:

    CarDemio{series_id='2476', series_group_name='知豆电动车', series_name='知豆', is_green='1'}
    CarDemio{series_id='2477', series_group_name='知豆电动车', series_name='知豆D1', is_green='1'}
    CarDemio{series_id='2478', series_group_name='知豆电动车', series_name='知豆D2', is_green='1'}
    CarDemio{series_id='33135', series_group_name='知豆电动车', series_name='知豆D3', is_green='1'}

    5.jar包

    <!-- https://mvnrepository.com/artifact/com.google.code.gson/gson -->
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.2</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.8.3</version>
    </dependency>

    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.2</version>
    </dependency>

    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
    <!-- 1.2.47 and earlier are affected by the known autoType deserialization
         vulnerability; this code parses JSON fetched from the network, so use
         the patched 1.x release. -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.83</version>
    </dependency>







  • 相关阅读:
    全世界最好听的钢琴曲
    清华“70后”院长刘云浩——生命在于运动,梦想从未止步 | 新“清”年特辑
    Android---60---Notification 通知栏的简单使用
    面向对象的三大特性
    一道淘汰85%面试者的百度开发人员面试题?
    Linux下基于Erlang的高并发TCP连接压力实验
    2014南瑞暑期实习面试笔试经历
    Oracle cloud control 12c 怎样改动sysmanpassword
    JEECG社区 一个微信教育站点案例源代码分享
    stm32智能小车之路之小车启动
  • 原文地址:https://www.cnblogs.com/Sora-L/p/9002930.html
Copyright © 2020-2023  润新知