Java抓取古诗文的单线程爬虫

准备知识

1.HTML, CSS, HTML DOM树

参考http://www.w3school.com.cn/htmldom/

2.Jsoup的使用，使用DOM方法遍历一个document对象，使用选择器语法来选择一个元素，从元素中抽取数据。

参考http://www.open-open.com/jsoup/example-list-links.htm

3.java正则表达式及其语法

参考http://www.cnblogs.com/chuiyuan/p/5187359.html

我们先来做一个单线程的爬虫。

整体步骤：

1.定义爬取内容的对象Poem结构。

2.完成从网上爬取Document对象的模块HttpService功能。

3.从Document对象中解析出所有唐诗的href，并保存到List<Poem>中。

4.从3中得到的href再爬取出每首古诗的内容。

下面贴一下代码实现。

1.Poem对象只列出其属性

2.抓取Document对象模块HttpService

　　首先定义一个Rule，封装所有的请求，不管是get还是post。

在HttpService中，使用Rule来抓取document对象。

/**
 * Downloads the page described by a {@link Rule} and hands it back as a Jsoup
 * {@link Document}.
 *
 * Created by chuiyuan on 2/11/16.
 */
public class HttpService {
    /** Timeout passed to {@code Connection.timeout}, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0 Iceweasel/38.5.0";

    /**
     * Validates the rule, sends the GET or POST request it describes (including any
     * param/value pairs) and returns the parsed response.
     *
     * @param rule request description: url, optional params/values and request method
     * @return the fetched document, or {@code null} when the request fails with an
     *         {@link IOException} or the request method is neither {@code Rule.GET}
     *         nor {@code Rule.POST}
     * @throws RuleException if the rule does not pass {@link #validateRule(Rule)}
     */
    public  Document extrace(Rule rule){
        validateRule(rule);

        String url = rule.getUrl();
        String [] params = rule.getParams() ;
        String [] values = rule.getValues() ;
        int requestMethod = rule.getRequestMethod() ;
        Document doc =null ;
        try {
            Connection conn = Jsoup.connect(url);
            conn.userAgent(USER_AGENT);
            conn.timeout(TIMEOUT_MILLIS);
            if (params!= null){
                // validateRule guarantees values is non-null and of equal length here
                for (int i =0; i<params.length ;i++){
                    conn.data(params[i], values[i]);
                }
            }
            switch (requestMethod){
                case  Rule.GET:
                    doc =conn.get();
                    break;
                case Rule.POST:
                    doc =conn.post();
                    break;
                default:
                    // Previously fell through silently; say why null is returned.
                    System.out.println("Unsupported request method: " + requestMethod);
                    break;
            }
        }catch (IOException e ){
            // Best effort: callers treat null as "page unavailable". Report the real
            // cause, since an IOException here is not necessarily a missing network.
            System.out.println("Request failed for " + url + ": " + e.getMessage());
        }
        return doc ;
    }

    /**
     * Checks that the rule can be turned into a request.
     *
     * @param rule rule to check
     * @throws RuleException if the rule is null, the url is null/empty or does not start
     *         with {@code http://} or {@code https://}, params are given without values,
     *         or params and values differ in length
     */
    private static void validateRule(Rule rule){
        if (rule == null){
            throw new RuleException("rule can't be null");
        }
        String url = rule.getUrl() ;
        if (url == null || url.length()==0){
            throw new RuleException("url can't be null");
        }
        if (!url.startsWith("http://") && !url.startsWith("https://")){
            throw  new RuleException("url not in correct format");
        }

        String [] params = rule.getParams() ;
        String [] values = rule.getValues() ;
        if (params == null){
            // No params means values are never read, so nothing more to check.
            return;
        }
        if (values == null){
            // extrace() indexes values for every param; fail here, not with an NPE there.
            throw new RuleException("params given without values");
        }
        if (params.length!= values.length){
            throw new RuleException("params length!= values length");
        }
    }
}

可以看到，我们先对rule进行了验证，rule中的resultTagName并没有用到，可以去掉。

3.从Document中解析出所有唐诗的href

4.从每首诗文的href中提取详细的内容

/**
 * Turns pages fetched through {@link HttpService} into {@link Poem} data: first the
 * index page that links to the poems, then each poem's own detail page.
 */
public class ProcessDoc {

    /**
     * Fetches the index page and creates one {@link Poem} (category, title, href) for
     * every linked entry found in the {@code div.son2} groups under {@code div.leftlei}.
     * In each group the first {@code span} is read as the category label and the
     * remaining spans as poem links.
     *
     * @param url address of the index page
     * @param httpService service used to download the page
     * @return the poems in page order; an empty list (never {@code null}) when the page
     *         cannot be fetched
     */
    public List<Poem> processGuShiWen(String url, HttpService httpService ){
        List<Poem> poemList = new ArrayList<Poem>();

        Rule rule = new Rule(url,
                null,null,
                null,
                -1,
                Rule.GET);
        Document doc = httpService.extrace(rule);
        if (doc == null){
            System.out.println("doc null");
            // Empty list instead of null so callers can iterate without a null check.
            return poemList;
        }

        // left panel, one div.son2 per category
        Elements categoryEles = doc.select("div.leftlei").select("div.son2");

        for (Element categoryEle : categoryEles){
            Elements spans = categoryEle.select("span");
            if (spans.isEmpty()){
                // No label and no entries in this group; nothing to read.
                continue;
            }
            String category = stripTrailingColon(spans.get(0).text());
            for (int i =1;i<spans.size();i++){
                Elements link = spans.get(i).select("a");
                String href = link.attr("abs:href");
                if (href.length() == 0){
                    // A span without a link would yield the href "/", which
                    // HttpService rejects later; skip it here instead.
                    continue;
                }
                Poem poem = new Poem() ;
                poem.setCategory(category);
                poem.setHref(href+"/");
                poem.setTitle(link.text());
                poemList.add(poem);
            }
        }
        return poemList ;
    }

    /**
     * Fetches the poem's detail page (from {@code poem.getHref()}) and fills in dynasty,
     * author and content. The poem is left untouched when the page cannot be fetched or
     * does not have the expected layout.
     *
     * @param poem poem whose href points at the detail page; updated in place
     * @param httpService service used to download the page
     */
    public void processDetails(Poem poem, HttpService httpService ){
        String url = poem.getHref() ;
        Rule rule = new Rule(url,
                null,null,
                null,
                -1,
                Rule.GET);
        Document doc = httpService.extrace(rule);
        if (doc == null) {
            System.out.println("doc=null");
            return;
        }

        Elements mainEles = doc.select("div.son2");
        //title already ok
        Elements paragraphs = mainEles.select("p");
        if (paragraphs.size() < 2){
            // First <p> carries the dynasty, second the author; both are required.
            System.out.println("unexpected page layout: " + url);
            return;
        }

        // note: the separators below use the full-width (Chinese) colon
        String dynasty = segmentAfter(paragraphs.get(0).text(), "：");
        // empty string when the second paragraph has no link (poem without author)
        String author = paragraphs.get(1).getElementsByTag("a").text();
        String content = segmentAfter(mainEles.text(), "原文：");
        //do not consider translation
        if (dynasty == null || content == null){
            System.out.println("unexpected page layout: " + url);
            return;
        }

        poem.setDynasty(dynasty);
        poem.setAuthor(author);
        poem.setContent(content);
    }

    /**
     * Removes one trailing ASCII or full-width colon from a category label, if present.
     */
    private static String stripTrailingColon(String label){
        if (label.endsWith(":") || label.endsWith("：")){
            return label.substring(0, label.length()-1);
        }
        return label;
    }

    /**
     * Returns the piece of {@code text} that follows the first occurrence of
     * {@code marker} (up to the next occurrence, if any), or {@code null} when there is
     * no such piece. {@code marker} is passed to {@link String#split(String)}, so it must
     * not contain regex metacharacters.
     */
    private static String segmentAfter(String text, String marker){
        String[] parts = text.split(marker);
        return parts.length > 1 ? parts[1] : null;
    }
}

整体调用如下

/**
     * Single-threaded crawl: reads the poem list from the index page, fetches the details
     * of each poem one after another (printing each poem), then stores every poem through
     * {@link PoemDao}. Does nothing further when the index page yields no poems.
     */
    public void getGuShiWenSingleThread(){
        String url ="http://so.gushiwen.org/gushi/tangshi.aspx/";

        HttpService httpService = new HttpService();

        ProcessDoc processDoc= new ProcessDoc();

        List<Poem> poemList = processDoc.processGuShiWen(url, httpService);
        if (poemList == null || poemList.isEmpty()){
            // The index page could not be fetched or listed nothing; avoid iterating null.
            System.out.println("no poems found at " + url);
            return;
        }

        //get poem content details
        for (Poem poem : poemList){
            processDoc.processDetails(poem,httpService);
            System.out.println(poem.toString());
        }

        //store to mysql
        // declared as the PoemDao interface rather than the PoemDaoImpl implementation
        PoemDao poemDao = new PoemDaoImpl() ;
        for (Poem poem: poemList){
            try {
                poemDao.add(poem);
            }catch (SQLException e){
                // Keep going: one failed insert should not stop the remaining poems.
                e.printStackTrace();
            }
        }
    }

最后结果保存到了MySQL中，数据库部分将在下一篇文章中讲解。

相关阅读:
LeetCode 189. Rotate Array
LeetCode 965. Univalued Binary Tree
LeetCode 111. Minimum Depth of Binary Tree
LeetCode 104. Maximum Depth of Binary Tree
Windows下MySQL的安装与配置
 LeetCode 58. Length of Last Word
LeetCode 41. First Missing Positive
LeetCode 283. Move Zeroes
《蚂蚁金服11.11：支付宝和蚂蚁花呗的技术架构及实践》读后感
 删除docker下的镜像
原文地址：https://www.cnblogs.com/chuiyuan/p/5200076.html