• jsoup抓取百度百科


    公司有一个虫库需要完善虫子的信息,于是用 jsoup 抓取百度百科的词条内容。先引入 jsoup 依赖,然后编写抓取代码:

            <dependency>
                <!-- jsoup HTML parser library @ https://jsoup.org/ -->
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.9.1</version>
            </dependency>
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import java.io.IOException;
    import java.util.ListIterator;
    
    /**
     * Scrapes a Baidu Baike entry with jsoup and renders its summary and chapters as a small
     * HTML fragment (headings use the {@code ql-align-center} / {@code ql-size-*} CSS classes).
     *
     * <p>Progress is echoed to {@code System.err} while the fragment is being built.
     *
     * @version V1.0
     * @author zhang
     */
    public class BaiKeiCap {
        /** Entry pages live under this prefix; the entry name is appended verbatim. */
        private static final String BASE_URL = "https://baike.baidu.com/item/";

        /** Rule printed on both sides of a section title in the stderr trace. */
        private static final String BANNER =
                "==========================================================";

        /**
         * CSS classes whose elements are stripped from the page before any text is extracted.
         * The labels below are the original author's notes on what each class marks.
         */
        private static final String[] NOISE_CLASSES = {
            "sup--normal", // images
            "description", // image captions
            "title-prefix", // title prefix
            "J-part-audio-text", // "read aloud" widget
            "wiki-lemma-icons_edit-lemma", // "edit" widget
            "anchor-list", // paragraph sub-heading anchors
            "edit-icon",
            "lemma-anchor"
        };

        public static void main(String[] args) throws IOException {
            start("肾毒蛾");
        }

        /**
         * Downloads the entry for {@code insect} and converts it to an HTML fragment.
         *
         * @param insect entry name, appended as-is to the Baike item URL
         * @return the generated HTML; empty when the page cannot be fetched or has no
         *     {@code lemma-summary} element
         */
        private static StringBuilder start(String insect) {
            StringBuilder stringBuilder = new StringBuilder();

            Document document;
            try {
                document = Jsoup.connect(BASE_URL + insect).get();
            } catch (IOException e) {
                // Keep the best-effort contract (never throw), but report the real cause.
                System.err.println("未搜索到内容: " + e);
                return stringBuilder;
            }

            for (String noiseClass : NOISE_CLASSES) {
                document.getElementsByClass(noiseClass).remove();
            }

            // A page without a summary is treated as "entry not found", as before,
            // but via an explicit check instead of a swallowed IndexOutOfBoundsException.
            Element summary = document.getElementsByClass("lemma-summary").first();
            if (summary == null) {
                System.err.println("未搜索到内容");
                return stringBuilder;
            }

            System.err.println(BANNER + "简介" + BANNER);
            appendHeading(stringBuilder, "ql-size-huge", "简介");
            for (Element paragraph : summary.children()) {
                String text = paragraph.text();
                // Skip the attribution line for the overview picture.
                if (text.contains("概述图参考来源")) {
                    continue;
                }
                System.err.println(text + "<p>");
                // The summary text used to be printed only; it now also goes into the result.
                stringBuilder.append("<p>").append(text).append("</p>");
            }

            // Each J-chapter element is a chapter heading; its content follows as siblings.
            for (Element chapter : document.getElementsByClass("J-chapter")) {
                String chapterTitle = chapter.select(".title-text").text();
                System.err.println(BANNER + chapterTitle + BANNER);
                appendHeading(stringBuilder, "ql-size-large", chapterTitle);
                getSub(chapter.nextElementSibling(), stringBuilder);
            }
            return stringBuilder;
        }

        /**
         * Walks the siblings starting at {@code element} and appends sub-headings and paragraphs
         * to {@code stringBuilder}. Stops at the next {@code J-chapter} element, at the element
         * with id {@code J-main-content-end-dom}, or when there are no more siblings.
         *
         * @param element first sibling to inspect; may be {@code null}, in which case nothing
         *     is appended
         * @param stringBuilder receives the generated HTML
         */
        private static void getSub(Element element, StringBuilder stringBuilder) {
            // Iterative walk: no stack growth on long chapters and no NPE when the sibling
            // chain ends without an explicit end marker.
            for (Element current = element; current != null; current = current.nextElementSibling()) {
                if (current.hasClass("J-chapter")
                        || "J-main-content-end-dom".equals(current.attr("id"))) {
                    return;
                }
                if (current.hasClass("para-title")
                        && current.hasAttr("label-module")
                        && current.hasAttr("data-index")) {
                    String title = current.text();
                    System.err.println(BANNER + title + BANNER);
                    appendHeading(stringBuilder, "ql-size-large", title);
                }
                if (current.hasClass("para") && current.hasAttr("label-module")) {
                    String text = current.text();
                    System.err.println(text);
                    // Was "</p>  text</p>": the opening tag was written as a closing tag.
                    stringBuilder.append("<p>").append(text).append("</p>");
                }
            }
        }

        /** Appends a centered heading paragraph whose span carries the given size class. */
        private static void appendHeading(StringBuilder stringBuilder, String sizeClass, String title) {
            stringBuilder
                    .append("<p class=\"ql-align-center\"><span class=\"")
                    .append(sizeClass)
                    .append("\">")
                    .append(title)
                    .append("</span></p>");
        }
    }
  • 相关阅读:
    消息队列中间件的技术选型分析
    数据库和缓存一致性的问题
    《RocketMQ 安装和使用》
    RocketMQ原理讲解系列文章
    阿里巴巴开源项目
    RocketMQ与Kafka对比(18项差异)
    对象初始化
    pytest_05_fixture之conftest.py
    pytest_04_测试用例setup和teardown
    Python与MogoDB交互
  • 原文地址:https://www.cnblogs.com/xyzxy/p/16335044.html
Copyright © 2020-2023  润新知