• Getting started with a Java web crawler


    A general-purpose web crawler, also known as a scalable web crawler, expands its crawl from a set of seed URLs to the entire Web; it mainly collects data for portal-site search engines and large Web service providers.

    What I am writing today is only an entry-level introduction that scratches the surface.

    Let's start by looking at our pom dependencies:

    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>com.javaxl</groupId>
        <artifactId>T226_jsoup</artifactId>
        <version>0.0.1-SNAPSHOT</version>
        <packaging>jar</packaging>
    
        <name>T226_jsoup</name>
        <url>http://maven.apache.org</url>
    
        <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        </properties>
    
        <dependencies>
            <!-- MySQL JDBC driver -->
            <dependency>
                <groupId>mysql</groupId>
                <artifactId>mysql-connector-java</artifactId>
                <version>5.1.44</version>
            </dependency>
    
            <!-- Apache HttpClient support -->
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpclient</artifactId>
                <version>4.5.2</version>
            </dependency>
    
            <!-- jsoup support -->
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.10.1</version>
            </dependency>
    
    
            <!-- log4j logging support -->
            <dependency>
                <groupId>log4j</groupId>
                <artifactId>log4j</artifactId>
                <version>1.2.16</version>
            </dependency>
    
            <!-- Ehcache support -->
            <dependency>
                <groupId>net.sf.ehcache</groupId>
                <artifactId>ehcache</artifactId>
                <version>2.10.3</version>
            </dependency>
    
            <!-- Commons IO support -->
            <dependency>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
                <version>2.5</version>
            </dependency>
    
            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>fastjson</artifactId>
                <version>1.2.47</version>
            </dependency>
        </dependencies>
    </project>

    Let's first crawl a single image:

    package com.javaxl.crawler;
    
    import java.io.File;
    import java.io.IOException;
    import java.util.UUID;
    
    import org.apache.commons.io.FileUtils;
    import org.apache.http.HttpEntity;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.log4j.Logger;
    
    import com.javaxl.util.DateUtil;
    import com.javaxl.util.PropertiesUtil;
    
    public class DownloadImg {
        private static Logger logger = Logger.getLogger(DownloadImg.class);
        private static String URL = "https://ss0.bdstatic.com/70cFvHSh_Q1YnxGkpoWK1HF6hhy/it/u=2786741331,312930537&fm=26&gp=0.jpg";
        public static void main(String[] args) {
            logger.info("开始爬取首页:" + URL);
            CloseableHttpClient httpClient = HttpClients.createDefault();
            HttpGet httpGet = new HttpGet(URL);
            RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
            httpGet.setConfig(config);
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                if (response == null) {
                    logger.info("连接超时!!!");
                } else {
                    HttpEntity entity = response.getEntity();
                    String imgPath = PropertiesUtil.getValue("blogImages");
                    String dateDir = DateUtil.getCurrentDatePath();
                    String uuid = UUID.randomUUID().toString();
                    String subfix = entity.getContentType().getValue().split("/")[1];
                    String localFile = imgPath+dateDir+"/"+uuid+"."+subfix;
    //                System.out.println(localFile);
                    FileUtils.copyInputStreamToFile(entity.getContent(), new File(localFile));
                }
            } catch (ClientProtocolException e) {
                logger.error(URL+"-ClientProtocolException", e);
            } catch (IOException e) {
                logger.error(URL+"-IOException", e);
            } catch (Exception e) {
                logger.error(URL+"-Exception", e);
            } finally {
                try {
                    if (response != null) {
                        response.close();
                    }
                    if(httpClient != null) {
                        httpClient.close();
                    }
                } catch (IOException e) {
                    logger.error(URL+"-IOException", e);
                }
            }
            
    
            logger.info("结束首页爬取:" + URL);
        
        }
    }

    Next, let's look at the configuration file.
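    The configuration listing itself does not appear here. DownloadImg relies on a small properties file plus two helper classes, PropertiesUtil and DateUtil, that the post does not show. Below is a minimal sketch of what they might look like; the keys blogImages (and ehcacheXmlPath, used later) come from the code, while the file name crawler.properties and the implementations are my assumptions.

    package com.javaxl.util;

    import java.io.IOException;
    import java.io.InputStream;
    import java.util.Properties;

    /**
     * Hypothetical helper that reads keys from a crawler.properties file on the classpath.
     * Assumed content (not shown in the original post):
     *   blogImages=D:/temp/blogImages/
     *   ehcacheXmlPath=D:/temp/ehcache.xml
     */
    public class PropertiesUtil {
        public static String getValue(String key) {
            Properties prop = new Properties();
            try (InputStream in = PropertiesUtil.class.getClassLoader()
                    .getResourceAsStream("crawler.properties")) {
                if (in == null) {
                    throw new IOException("crawler.properties not found on classpath");
                }
                prop.load(in);
            } catch (IOException e) {
                throw new RuntimeException("Failed to load crawler.properties", e);
            }
            return prop.getProperty(key);
        }
    }

    And a date-path helper along these lines, which groups downloaded files into per-day directories:

    package com.javaxl.util;

    import java.text.SimpleDateFormat;
    import java.util.Date;

    /**
     * Hypothetical helper that returns a date-based sub-directory name such as "2019/10/08".
     */
    public class DateUtil {
        public static String getCurrentDatePath() {
            return new SimpleDateFormat("yyyy/MM/dd").format(new Date());
        }
    }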

    And then the run results.

    That wraps up the simple single-image crawl. Next, let's crawl data into a database (we will use cnblogs as the target).
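    Before diving into the crawler, note that it stores each post in a MySQL table named t_jsoup_article through an 11-column statement, insert into t_jsoup_article values(null,?,?,null,now(),0,0,null,?,0,null). The post never shows the table definition, so here is a hypothetical DDL whose column positions line up with that statement; only the positions of the title, the content, the now() timestamp and the source link are taken from the code, and the column names and types are my guesses.

    CREATE TABLE t_jsoup_article (
      id           BIGINT AUTO_INCREMENT,        -- position 1: inserted as null
      title        VARCHAR(200)  DEFAULT NULL,   -- position 2: post title (?)
      content      LONGTEXT,                     -- position 3: post body HTML (?)
      summary      VARCHAR(500)  DEFAULT NULL,   -- position 4: inserted as null
      crawler_date DATETIME      DEFAULT NULL,   -- position 5: filled with now()
      state        INT           DEFAULT 0,      -- position 6: inserted as 0
      good_count   INT           DEFAULT 0,      -- position 7: inserted as 0
      type_id      INT           DEFAULT NULL,   -- position 8: inserted as null
      url          VARCHAR(500)  DEFAULT NULL,   -- position 9: source link (?)
      read_count   INT           DEFAULT 0,      -- position 10: inserted as 0
      remark       VARCHAR(200)  DEFAULT NULL,   -- position 11: inserted as null
      PRIMARY KEY (id)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;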

    BlogCrawlerStarter.java

    package com.javaxl.crawler;
    
    import java.io.File;
    import java.io.IOException;
    import java.sql.Connection;
    import java.sql.PreparedStatement;
    import java.sql.SQLException;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.UUID;
    
    import org.apache.commons.io.FileUtils;
    import org.apache.http.HttpEntity;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    import org.apache.log4j.Logger;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import com.javaxl.util.DateUtil;
    import com.javaxl.util.DbUtil;
    import com.javaxl.util.PropertiesUtil;
    
    import net.sf.ehcache.Cache;
    import net.sf.ehcache.CacheManager;
    import net.sf.ehcache.Status;
    
    /**
     * @author Administrator
     *
     */
    public class BlogCrawlerStarter {
    
        private static Logger logger = Logger.getLogger(BlogCrawlerStarter.class);
    //    https://www.csdn.net/nav/newarticles
        private static String HOMEURL = "https://www.cnblogs.com/";
        private static CloseableHttpClient httpClient;
        private static Connection con;
        private static CacheManager cacheManager;
        private static Cache cache;
    
        /**
         * Fetch the home page with HttpClient and obtain its content.
         */
        public static void parseHomePage() {
            logger.info("开始爬取首页:" + HOMEURL);
            
            cacheManager = CacheManager.create(PropertiesUtil.getValue("ehcacheXmlPath"));
            cache = cacheManager.getCache("cnblog");
            
            httpClient = HttpClients.createDefault();
            HttpGet httpGet = new HttpGet(HOMEURL);
            RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
            httpGet.setConfig(config);
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                if (response == null) {
                    logger.info(HOMEURL + ": no response from crawl");
                    return;
                }
    
                if (response.getStatusLine().getStatusCode() == 200) {
                    HttpEntity entity = response.getEntity();
                    String homePageContent = EntityUtils.toString(entity, "utf-8");
                    // System.out.println(homePageContent);
                    parseHomePageContent(homePageContent);
                }
    
            } catch (ClientProtocolException e) {
                logger.error(HOMEURL + "-ClientProtocolException", e);
            } catch (IOException e) {
                logger.error(HOMEURL + "-IOException", e);
            } finally {
                try {
                    if (response != null) {
                        response.close();
                    }
    
                    if (httpClient != null) {
                        httpClient.close();
                    }
                } catch (IOException e) {
                    logger.error(HOMEURL + "-IOException", e);
                }
            }
    
            if(cache.getStatus() ==  Status.STATUS_ALIVE) {
                cache.flush();
            }
            cacheManager.shutdown();
            logger.info("结束爬取首页:" + HOMEURL);
    
        }
    
        /**
         * Parse the page content with the jsoup HTML parser and extract the data we want (the links to individual blog posts).
         * 
         * @param homePageContent
         */
        private static void parseHomePageContent(String homePageContent) {
            Document doc = Jsoup.parse(homePageContent);
            //#feedlist_id .list_con .title h2 a
            Elements aEles = doc.select("#post_list .post_item .post_item_body h3 a");
            for (Element aEle : aEles) {
    //            URL of a single blog post link from the home-page list
                String blogUrl = aEle.attr("href");
                if (null == blogUrl || "".equals(blogUrl)) {
                    logger.info("该博客未内容,不再爬取插入数据库!");
                    continue;
                }
                if(cache.get(blogUrl) != null) {
                    logger.info("该数据已经被爬取到数据库中,数据库不再收录!");
                    continue;
                }
    //            System.out.println("************************"+blogUrl+"****************************");
                
                parseBlogUrl(blogUrl);
            }
        }
    
        /**
         * Fetch the blog post's title and content from its URL.
         * 
         * @param blogUrl
         */
        private static void parseBlogUrl(String blogUrl) {
    
            logger.info("开始爬取博客网页:" + blogUrl);
            httpClient = HttpClients.createDefault();
            HttpGet httpGet = new HttpGet(blogUrl);
            RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
            httpGet.setConfig(config);
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                if (response == null) {
                    logger.info(blogUrl + ": no response from crawl");
                    return;
                }
    
                if (response.getStatusLine().getStatusCode() == 200) {
                    HttpEntity entity = response.getEntity();
                    String blogContent = EntityUtils.toString(entity, "utf-8");
                    parseBlogContent(blogContent, blogUrl);
                }
    
            } catch (ClientProtocolException e) {
                logger.error(blogUrl + "-ClientProtocolException", e);
            } catch (IOException e) {
                logger.error(blogUrl + "-IOException", e);
            } finally {
                try {
                    if (response != null) {
                        response.close();
                    }
                } catch (IOException e) {
                    logger.error(blogUrl + "-IOException", e);
                }
            }
    
            logger.info("结束爬取博客网页:" + HOMEURL);
    
        }
    
        /**
         * Parse the blog post HTML and extract its title and full content.
         * 
         * @param blogContent
         * @param link
         */
        private static void parseBlogContent(String blogContent, String link) {
            Document doc = Jsoup.parse(blogContent);
            Elements titleEles = doc
                    //#mainBox main .blog-content-box .article-header-box .article-header .article-title-box h1
                    .select("#topics .post h1 a");
            System.out.println("123");
            System.out.println(titleEles.toString());
            System.out.println("123");
            if (titleEles.size() == 0) {
                logger.info("博客标题为空,不插入数据库!");
                return;
            }
            String title = titleEles.get(0).html();
    
            Elements blogContentEles = doc.select("#cnblogs_post_body ");
            if (blogContentEles.size() == 0) {
                logger.info("博客内容为空,不插入数据库!");
                return;
            }
            String blogContentBody = blogContentEles.get(0).html();
            
    //        Elements imgEles = doc.select("img");
    //        List<String> imgUrlList = new LinkedList<String>();
    //        if(imgEles.size() > 0) {
    //            for (Element imgEle : imgEles) {
    //                imgUrlList.add(imgEle.attr("src"));
    //            }
    //        }
    //        
    //        if(imgUrlList.size() > 0) {
    //            Map<String, String> replaceUrlMap = downloadImgList(imgUrlList);
    //            blogContent = replaceContent(blogContent,replaceUrlMap);
    //        }
    
            String sql = "insert into `t_jsoup_article` values(null,?,?,null,now(),0,0,null,?,0,null)";
            try {
                PreparedStatement pst = con.prepareStatement(sql);
                pst.setObject(1, title);
                pst.setObject(2, blogContentBody);
                pst.setObject(3, link);
                if(pst.executeUpdate() == 0) {
                    logger.info("爬取博客信息插入数据库失败");
                }else {
                    cache.put(new net.sf.ehcache.Element(link, link));
                    logger.info("爬取博客信息插入数据库成功");
                }
            } catch (SQLException e) {
                logger.error("数据异常-SQLException:",e);
            }
        }
    
        /**
         * Post-process the crawled blog content, replacing the original image URLs with local image paths.
         * @param blogContent
         * @param replaceUrlMap
         * @return
         */
        private static String replaceContent(String blogContent, Map<String, String> replaceUrlMap) {
            for(Map.Entry<String, String> entry: replaceUrlMap.entrySet()) {
                blogContent = blogContent.replace(entry.getKey(), entry.getValue());
            }
            return blogContent;
        }
    
        /**
         * Download images from the remote server and save them locally.
         * @param imgUrlList
         * @return
         */
        private static Map<String, String> downloadImgList(List<String> imgUrlList) {
            Map<String, String> replaceMap = new HashMap<String, String>();
            for (String imgUrl : imgUrlList) {
                CloseableHttpClient httpClient = HttpClients.createDefault();
                HttpGet httpGet = new HttpGet(imgUrl);
                RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
                httpGet.setConfig(config);
                CloseableHttpResponse response = null;
                try {
                    response = httpClient.execute(httpGet);
                    if (response == null) {
                        logger.info(imgUrl + ": no response from crawl");
                    }else {
                        if (response.getStatusLine().getStatusCode() == 200) {
                            HttpEntity entity = response.getEntity();
                            String blogImagesPath = PropertiesUtil.getValue("blogImages");
                            String dateDir = DateUtil.getCurrentDatePath();
                            String uuid = UUID.randomUUID().toString();
                            String subfix = entity.getContentType().getValue().split("/")[1];
                            String fileName = blogImagesPath + dateDir + "/" + uuid + "." + subfix;
                            
                            FileUtils.copyInputStreamToFile(entity.getContent(), new File(fileName));
                            replaceMap.put(imgUrl, fileName);
                        }
                    }
                } catch (ClientProtocolException e) {
                    logger.error(imgUrl + "-ClientProtocolException", e);
                } catch (IOException e) {
                    logger.error(imgUrl + "-IOException", e);
                } catch (Exception e) {
                    logger.error(imgUrl + "-Exception", e);
                } finally {
                    try {
                        if (response != null) {
                            response.close();
                        }
                    } catch (IOException e) {
                        logger.error(imgUrl + "-IOException", e);
                    }
                }
            
            }
            return replaceMap;
        }
    
        public static void start() {
            while(true) {
                DbUtil dbUtil = new DbUtil();
                try {
                    con = dbUtil.getCon();
                    parseHomePage();
                } catch (Exception e) {
                    logger.error("数据库连接势失败!");
                } finally {
                    try {
                        if (con != null) {
                            con.close();
                        }
                    } catch (SQLException e) {
                        logger.error("数据关闭异常-SQLException:",e);
                    }
                }
                try {
                    Thread.sleep(1000*60);
                } catch (InterruptedException e) {
                    logger.error("主线程休眠异常-InterruptedException:",e);
                }
            }
        }
    
        public static void main(String[] args) {
            start();
        }
    }
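    Two pieces are still missing from the listing above: the DbUtil class that hands out the JDBC connection, and the ehcache.xml referenced by the ehcacheXmlPath property. A minimal DbUtil sketch, assuming a local MySQL database (the JDBC URL, user and password below are placeholders, not from the original post), could look like this:

    package com.javaxl.util;

    import java.sql.Connection;
    import java.sql.DriverManager;

    /**
     * Hypothetical JDBC helper; the real class is not shown in the post.
     */
    public class DbUtil {
        // Placeholder connection settings; adjust them to your own environment.
        private static final String URL =
                "jdbc:mysql://localhost:3306/crawler?useUnicode=true&characterEncoding=utf8";
        private static final String USER = "root";
        private static final String PASSWORD = "123456";

        public Connection getCon() throws Exception {
            // Driver class name for the mysql-connector-java 5.x artifact declared in the pom
            Class.forName("com.mysql.jdbc.Driver");
            return DriverManager.getConnection(URL, USER, PASSWORD);
        }
    }

    The ehcache.xml that ehcacheXmlPath points to must define a cache named cnblog (for example a <cache name="cnblog" maxElementsInMemory="10000" eternal="true" overflowToDisk="true" diskPersistent="true"/> element together with a diskStore element giving the on-disk location), otherwise cacheManager.getCache("cnblog") returns null and the duplicate check in parseHomePageContent fails with a NullPointerException. The cache is what keeps already-inserted links from being crawled again on later passes of the start() loop.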

    Here is what a cnblogs post looks like.

    The run results.

    The data in the database.
