• Writing a website data scraper in Java


    I've been at this company for two months now, averaging 11-hour workdays, and I keep thinking back to the cozy, comfortable life at my old employer. It's been a long time since I had time to read or write blog posts; I feel like I'm falling behind, and honestly I'm not happy about it.

    Today I'll jot down a few notes about data scraping.

    Data scraping is commonplace these days. Plenty of people use Python for it; I'm not there yet, so I do it in Java. Now for the serious part.

    Things to pay attention to:

    1. First, pick a target: the page you want to scrape.

    2. Understand the structure of the data on the target page.

    3. Check whether the target site has anti-crawler measures (i.e. it will ban your IP).

    4. Parse the data and store it in the database.

    Obtaining an HttpClient

    package com.zyt.creenshot.util;
    
    import org.apache.http.config.Registry;
    import org.apache.http.config.RegistryBuilder;
    import org.apache.http.conn.socket.ConnectionSocketFactory;
    import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
    import org.apache.http.conn.socket.PlainConnectionSocketFactory;
    import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    import org.springframework.stereotype.Component;
    
    import javax.annotation.PostConstruct;
    import javax.net.ssl.SSLContext;
    import java.security.NoSuchAlgorithmException;
    
    @Component
    public class HttpConnectionManager {
        PoolingHttpClientConnectionManager cm = null;
    
        @PostConstruct
        public void init() {
            LayeredConnectionSocketFactory sslsf = null;
            try {
                sslsf = new SSLConnectionSocketFactory(SSLContext.getDefault());
            } catch (NoSuchAlgorithmException e) {
                e.printStackTrace();
            }
    
    
            Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory> create()
                    .register("https", sslsf)
                    .register("http", new PlainConnectionSocketFactory())
                    .build();
            // Shared pooling connection manager for every client built from this component
            cm = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
            // Max connections across all routes
            cm.setMaxTotal(200);
            // Max connections per route (per target host)
            cm.setDefaultMaxPerRoute(20);
        }
    
        public CloseableHttpClient getHttpClient() {
            CloseableHttpClient httpClient = HttpClients.custom()
                    .setConnectionManager(cm)
                    .build();
            return httpClient;
        }
    }
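
    The manager above is a Spring @Component, so normally you just autowire it and call getHttpClient(). For a quick standalone test you can wire it by hand; here's a minimal sketch (the demo class and the example.com URL are mine, not part of the original post):

    package com.zyt.creenshot.util;
    
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.util.EntityUtils;
    
    public class HttpConnectionManagerDemo {
        public static void main(String[] args) throws Exception {
            // Outside Spring, call init() yourself (normally @PostConstruct triggers it)
            HttpConnectionManager manager = new HttpConnectionManager();
            manager.init();
    
            CloseableHttpClient client = manager.getHttpClient();
            HttpGet get = new HttpGet("https://example.com"); // placeholder URL
            try (CloseableHttpResponse resp = client.execute(get)) {
                System.out.println(resp.getStatusLine().getStatusCode());
                // Consume the entity fully so the connection goes back to the pool
                EntityUtils.consume(resp.getEntity());
            }
        }
    }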

    Page-fetching utility class. Many sites have anti-crawler measures that limit how many requests a single IP can make per unit time; if the data you're scraping changes constantly and needs to stay fresh, you'll have to rotate proxy IPs (a small proxy-pool sketch follows the class below).

    package com.zyt.creenshot.util;
    
    import org.apache.commons.collections.MapUtils;
    import org.apache.commons.lang3.StringUtils;
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpHost;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.util.EntityUtils;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.stereotype.Component;
    
    import java.nio.charset.StandardCharsets;
    import java.util.HashMap;
    import java.util.Map;
    
    /**
     * @ClassName: DocumentHelper
     * @Description: Page-fetching utility class
     * @Author: zhaiyutao
     * @Date: 2019/7/1 11:15
     * @Version: v1.0
     */
    @Component
    public class DocumentHelper {
    
        @Autowired
        HttpConnectionManager connManager;
    
        public String getProxyHttp(String url, String address, int port, String charset) {
            CloseableHttpResponse response = null;
            CloseableHttpClient httpClient = connManager.getHttpClient();
            try {
                // Build the GET request
                HttpGet httpGet = new HttpGet(url);
                // Attach the proxy (if any) and timeouts to the request
                httpGet = buildProxy(httpGet, address, port);
                Map<String, String> headerMap = new HashMap<String, String>();
                headerMap.put("Referer", "http://*********.com/");
                headerMap.put("Content-Type", "text/html; charset=utf-8");
                headerMap.put("User-Agent", UserAgentUtil.getRandomUserAgent());
                headerMap.put("Accept-Language", "zh-CN,zh;q=0.9");
                headerMap.put("Accept-Encoding", "gzip, deflate");
                // Apply the request headers
                httpGet = buildRequestHeader(headerMap, httpGet);
                response = httpClient.execute(httpGet);
                // Read the response entity
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    String content = EntityUtils.toString(entity);
                    if (null != charset && !"".equals(charset)) {
                        // If the page doesn't declare its charset, the default decode is ISO-8859-1;
                        // re-decode the bytes with the requested charset (e.g. GBK)
                        content = new String(content.getBytes(StandardCharsets.ISO_8859_1), charset);
                    }
                    return content;
                }
            } catch (Exception e){
                // Swallow the exception here; the caller retries and failures are handled in one place
                //log.error("Proxy fetch failed: url {} address {} port {}", url, address, port);
                return "";
            } finally {
                try {
                    if (response != null) {
                        response.close();
                    }
                } catch (Exception e) {
                    // ignore close failures
                }
            }
            return "";
        }
    
        private static HttpGet buildProxy(HttpGet httpGet,String address,int port) {
            RequestConfig requestConfig = null;
            if(StringUtils.isNotEmpty(address)){
                HttpHost proxy = new HttpHost(address, port);
                requestConfig = RequestConfig.custom()
                        .setProxy(proxy)
                        .setConnectTimeout(4000)
                        .setSocketTimeout(8000)
                        .setConnectionRequestTimeout(4000)
                        .build();
            }else{
                requestConfig = RequestConfig.custom()
                        .setConnectTimeout(4000)
                        .setSocketTimeout(8000)
                        .setConnectionRequestTimeout(4000)
                        .build();
            }
            httpGet.setConfig(requestConfig);
            return httpGet;
        }
    
        /**
         * Apply the given headers to the request
         * @param headerMap headers to set
         * @param httpGet the request to decorate
         * @return the same request with headers applied
         */
        private static HttpGet buildRequestHeader(Map<String,String> headerMap,HttpGet httpGet) {
            if (MapUtils.isNotEmpty(headerMap)) {
                for (Map.Entry<String,String> kv :headerMap.entrySet()) {
                    httpGet.setHeader(kv.getKey(),kv.getValue());
                }
            }
            return httpGet;
        }
    
    }
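
    The address and port arguments of getProxyHttp are the hook for proxy rotation. The post doesn't show where the proxies come from, so the helper below is purely hypothetical: a tiny pool that hands out a random proxy per request (the ProxyPool class and the placeholder addresses are assumptions; in practice the list would come from a proxy provider's API or a config file).

    package com.zyt.creenshot.util;
    
    import java.util.Arrays;
    import java.util.List;
    import java.util.concurrent.ThreadLocalRandom;
    
    // Hypothetical helper: rotates through a fixed list of proxies.
    public class ProxyPool {
    
        public static class ProxyAddr {
            public final String host;
            public final int port;
            public ProxyAddr(String host, int port) { this.host = host; this.port = port; }
        }
    
        private static final List<ProxyAddr> PROXIES = Arrays.asList(
                new ProxyAddr("127.0.0.1", 8001),   // placeholder entries
                new ProxyAddr("127.0.0.1", 8002)
        );
    
        // Pick one proxy at random for the next request.
        // Usage in a caller:
        //   ProxyPool.ProxyAddr p = ProxyPool.next();
        //   String html = documentHelper.getProxyHttp(url, p.host, p.port, "GBK");
        public static ProxyAddr next() {
            return PROXIES.get(ThreadLocalRandom.current().nextInt(PROXIES.size()));
        }
    }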

    Request-header spoofing utility class (random User-Agent)

    package com.zyt.creenshot.util;
    
    import java.util.Random;
    
    public class UserAgentUtil {
        
        private static final String[] USER_AGENT = {
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36", //google
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36 OPR/60.0.3255.109", //opera
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE", //360
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36 Maxthon/5.2.7.3000", //遨游
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0", //firefox
            "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50" //safari
            //"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763" //IE
        };
        
        /**
         * Pick a random User-Agent string
         * @return a randomly chosen User-Agent
         */
        public static String getRandomUserAgent() {
            Random random = new Random();
            int i = random.nextInt(USER_AGENT.length);
            String userAgent = USER_AGENT[i];
            return userAgent;
        }
        
    }

    The example below crawls data from a car website. You'll need to design the tables yourself according to your needs, so I won't paste the exact schema.
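
    The CarBaseData entity and CarBaseDataMapper used below aren't shown in the post, so here is a rough sketch of what they might look like, inferred from the setters the crawler calls; the field names and the insertBatch signature are assumptions:

    package com.zyt.creenshot.entity;
    
    import lombok.Data;
    
    // Hypothetical entity: fields are inferred from the setters CrawlerDataImpl calls.
    @Data
    public class CarBaseData {
        private String carId;        // id taken from the detail-page href
        private String carName;      // model name (link title)
        private String brandName;    // top-level brand name
        private String brandPic;     // brand logo image URL
        private String subBrandName; // sub-brand / series name
        private String carType;      // section title before the parenthesis
        private String carPrice;     // raw price text from the page
    }

    And the matching mapper interface might look like:

    package com.zyt.creenshot.mapper;
    
    import com.zyt.creenshot.entity.CarBaseData;
    import org.apache.ibatis.annotations.Mapper;
    
    import java.util.List;
    
    // Hypothetical MyBatis mapper; only the batch insert the crawler relies on.
    @Mapper
    public interface CarBaseDataMapper {
        int insertBatch(List<CarBaseData> carList);
    }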

    package com.zyt.creenshot.service.crawlerData.impl;
    
    import com.zyt.creenshot.entity.CarBaseData;
    import com.zyt.creenshot.mapper.CarBaseDataMapper;
    import com.zyt.creenshot.service.crawlerData.ICrawlerData;
    import com.zyt.creenshot.util.DocumentHelper;
    import lombok.extern.slf4j.Slf4j;
    import org.apache.commons.collections4.CollectionUtils;
    import org.apache.commons.lang3.StringUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.stereotype.Component;
    
    import java.util.ArrayList;
    import java.util.List;
    
    /**
     * @ClassName: CrawlerDataImpl
     * @Description: Crawls basic car data from the target site and stores it
     * @Author: zhaiyutao
     * @Date: 2019/7/8 17:48
     * @Version: v1.0
     */
    @Component
    @Slf4j
    public class CrawlerDataImpl implements ICrawlerData {
    
        @Autowired
        private DocumentHelper documentHelper;
    
        @Autowired(required = false)
        private CarBaseDataMapper carBaseDataMapper;
    
    
        @Override
        public void crawlerCarBaseData() {
    
            String url = "***********URL to crawl*************";
            String resultHtml = documentHelper.getProxyHttp(url, null, 0, "GBK");
            if (StringUtils.isEmpty(resultHtml)) {
                log.error("Failed to fetch any data from the target site");
                return;
            }
            Document html = Jsoup.parse(resultHtml);
            // Parse the brand rows
            Elements brandList = html.select("div[class=braRow]");
            if (brandList != null && brandList.size() > 0) {
                List<CarBaseData> listCar = new ArrayList<>();
                // Top-level brands
                for (Element brand : brandList) {
                    Elements brandBig = brand.select("div[class=braRow-icon]");
                    // Brand name and logo
                    String brandName = brandBig.select("p").text().replace("?", "·");
                    String brandPic = brandBig.select("img[src]").attr("src");
    
                    Elements smallBrandList = brand.select("div[class=modA noBorder]");
                    for( Element sb : smallBrandList){
                        Elements brandItem = sb.select("div[class=thA]");
                        // Sub-brand name
                        String brandSmallName = brandItem.select("a[href]").text();
    
                        Elements sbInner = sb.select("div[class=tbA ]");
                        for(Element in : sbInner){
                            dealCarData(listCar, brandName, brandPic, brandSmallName, in);
                        }
                        Elements sbInnerNother = sb.select("div[class=tbA mt10 noBorder]");
                        for(Element inner : sbInnerNother){
                            dealCarData(listCar, brandName, brandPic, brandSmallName, inner);
                        }
                    }
                }
                if(CollectionUtils.isNotEmpty(listCar)){
                    carBaseDataMapper.insertBatch(listCar);
                }
            }
        }
    
        private void dealCarData(List<CarBaseData> listCar, String brandName, String brandPic, String brandSmallName, Element in) {
            String carTypeName = in.select("p[class=stit]").text().split("\\(")[0]; // split() takes a regex, so the parenthesis must be escaped
            Elements li = in.select("li");
            for(Element element : li){
                Element tit = element.select("p[class=tit]").get(0);
                Element price = element.select("p[class=price]").get(0);
                Elements carHref = tit.select("a[href]");
                String priceStr = price.text();
                if(null != carHref){
                    String href = carHref.attr("href");
                    if(StringUtils.isEmpty(href)){
                        continue;
                    }
                    String carName = carHref.attr("title");
                    String carId = StringUtils.substring(href, 1, href.length() - 1);
                    CarBaseData carBaseData = new CarBaseData();
                    carBaseData.setCarId(carId);
                    carBaseData.setCarName(carName);
                    carBaseData.setBrandName(brandName);
                    carBaseData.setBrandPic(brandPic);
                    carBaseData.setSubBrandName(brandSmallName);
                    carBaseData.setCarType(carTypeName);
                    carBaseData.setCarPrice(priceStr);
                    listCar.add(carBaseData);
                }
                if (listCar.size() >= 500) { // flush every 500 rows to keep batch inserts small
                    carBaseDataMapper.insertBatch(listCar);
                    listCar.clear();
                }
            }
        }
    }
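
    To keep the data current you would normally run crawlerCarBaseData() on a schedule. Here's a minimal sketch using Spring's @Scheduled (the task class, its package, and the cron expression are assumptions, and @EnableScheduling has to be turned on in a configuration class):

    package com.zyt.creenshot.task;
    
    import com.zyt.creenshot.service.crawlerData.ICrawlerData;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.scheduling.annotation.Scheduled;
    import org.springframework.stereotype.Component;
    
    // Hypothetical scheduler: re-crawls the car base data once a day at 03:00.
    @Component
    public class CrawlerTask {
    
        @Autowired
        private ICrawlerData crawlerData;
    
        @Scheduled(cron = "0 0 3 * * ?")
        public void refreshCarBaseData() {
            crawlerData.crawlerCarBaseData();
        }
    }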

    The crawled data:
