• Java 实现爬取静态页面的新闻数据



    可能需要的pom依赖包:

    <!-- https://mvnrepository.com/artifact/commons-codec/commons-codec -->
    <dependency>
    <groupId>commons-codec</groupId>
    <artifactId>commons-codec</artifactId>
    <version>1.4</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/commons-logging/commons-logging -->
    <dependency>
    <groupId>commons-logging</groupId>
    <artifactId>commons-logging</artifactId>
    <version>1.1.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/commons-httpclient/commons-httpclient -->
    <dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
    </dependency>




    主要贴出实现类相关代码:


    /**
     * Crawls a xinmin.cn news list page and stores every article it links to.
     *
     * <p>Each anchor inside a {@code type_content_list type-item} block that has a non-empty
     * {@code title} and whose link contains {@code .html} is treated as one article. The article
     * page is downloaded, parsed into a fresh {@code NewsInformation} and passed to
     * {@code newsMapperExt.inserturlNews}. An article whose page cannot be downloaded is skipped
     * (nothing is inserted for it) and the crawl continues with the next link.
     *
     * @param urls URL of the news list page; it also selects which per-category page layout is
     *             assumed when reading author and publish time
     * @return {@code true} when the list page was fetched and processed; {@code false} when
     *         {@code urls} is null/empty or the list page itself could not be fetched
     */
    @Override
    public boolean inserturlNews(String urls) {
        // TODO: 2021/5/17 Only xinmin.cn pages are supported; parsing depends on the page's tags.
        if (urls == null || urls.isEmpty()) {
            return false;
        }

        Document listPage;
        try {
            listPage = Jsoup.connect(urls).get();
        } catch (IOException e) {
            System.err.println("Failed to fetch news list page: " + urls);
            e.printStackTrace();
            return false;
        }

        Elements listBlocks =
                listPage.getElementsByAttributeValue("class", "type_content_list type-item");
        for (Element listBlock : listBlocks) {
            for (Element link : listBlock.getElementsByTag("a")) {
                String title = link.attr("title");
                // absUrl resolves relative hrefs against the list page URL, so Jsoup.connect
                // below always receives an absolute URL (or "" which fails the .html check).
                String newsUrl = link.absUrl("href");
                if (title.isEmpty() || !newsUrl.contains(".html")) {
                    continue;
                }

                // One fresh object per article: reusing a single instance would leak the
                // previous article's author/content/image into this record.
                NewsInformation newsInformation = new NewsInformation();
                newsInformation.setTitle(title);
                newsInformation.setNewsUrl(newsUrl);

                Document newsDoc;
                try {
                    newsDoc = Jsoup.connect(newsUrl).get();
                } catch (IOException e) {
                    // Skip this article only; do not store a record without its details.
                    System.err.println("Failed to fetch news article, skipping: " + newsUrl);
                    e.printStackTrace();
                    continue;
                }

                populateArticleDetails(newsInformation, urls, newsDoc);
                newsMapperExt.inserturlNews(newsInformation);
            }
        }
        return true;
    }

    /** Matches a four-digit year followed by a dash, i.e. the start of a {@code yyyy-MM-dd} date. */
    private static final java.util.regex.Pattern YEAR_PREFIX =
            java.util.regex.Pattern.compile("\\d{4}-");

    /**
     * Copies source, author, publish time, image, content and status from an article page into
     * {@code newsInformation}. Author and publish time are only read for the known categories.
     */
    private void populateArticleDetails(NewsInformation newsInformation, String listUrl,
            Document newsDoc) {
        // Select the metadata spans once instead of re-querying the document for every field.
        Elements infoSpans = newsDoc.select(".info").select("span");
        newsInformation.setForm(spanTextAt(infoSpans, 0));

        if (listUrl.contains("http://newsxmwb.xinmin.cn/world/")
                || listUrl.contains("http://newsxmwb.xinmin.cn/shizheng/")) {
            // World news, politics
            newsInformation.setAuthor(spanTextAt(infoSpans, 1));
            newsInformation.setDataTime(publishTimeOrNow(spanTextAt(infoSpans, 3)));
        } else if (listUrl.contains("http://newsxmwb.xinmin.cn/wentihui/")) {
            // Culture & sports: the time is normally the 5th span, otherwise the 4th.
            newsInformation.setAuthor(spanTextAt(infoSpans, 1));
            String publishTime = spanTextAt(infoSpans, 4);
            if (!looksLikeDate(publishTime)) {
                publishTime = spanTextAt(infoSpans, 3);
            }
            newsInformation.setDataTime(publishTime);
        } else if (listUrl.contains("http://shanghai.xinmin.cn/t/gdbd/")) {
            // Headlines
            newsInformation.setAuthor(spanTextAt(infoSpans, 1));
            newsInformation.setDataTime(publishTimeOrNow(spanTextAt(infoSpans, 3)));
        }

        Elements articleBodies = newsDoc.getElementsByAttributeValue("class", "a_content");
        for (Element articleBody : articleBodies) {
            newsInformation.setImage(articleBody.getElementsByTag("img").attr("src"));
            StringBuilder content = new StringBuilder();
            for (Element paragraph : articleBody.getElementsByTag("p")) {
                content.append(paragraph.text().trim());
            }
            newsInformation.setContent(content.toString().trim());
            newsInformation.setStatus(1);
        }
    }

    /** Returns the text of the span at {@code index}, or {@code ""} when the page has fewer spans. */
    private static String spanTextAt(Elements spans, int index) {
        return index < spans.size() ? spans.get(index).text() : "";
    }

    /** Returns whether {@code text} contains something shaped like the start of a date ("2021-"). */
    private static boolean looksLikeDate(String text) {
        return YEAR_PREFIX.matcher(text).find();
    }

    /**
     * Returns {@code candidate} when it looks like a date, otherwise the formatted fallback time.
     * NOTE(review): {@code now} and {@code fmTime} are fields declared outside this block;
     * presumably a date-time value and its formatter — confirm that {@code now} is not stale.
     */
    private String publishTimeOrNow(String candidate) {
        return looksLikeDate(candidate) ? candidate : now.format(fmTime);
    }
    }


    插入本地数据库 展示



  • 相关阅读:
    K3s+Jetson Nano,在边缘端实现实时视频分析!
    15分钟连接Jetson Nano与K8S,轻松搭建机器学习集群
    配置高可用K3s集群完全攻略
    K3s+Sysdig,8分钟部署并保护集群安全!
    1款工具助力Rancher HA快速部署,极速提升研发测试效率
    连刷40道题,告别动态规划,谈谈我的经验
    直通BAT算法精讲视频教程分享
    关于三次握手和四次挥手,面试官想听到怎样的回答?
    Redisson 分布式锁实战与 watch dog 机制解读
    Spring 注解动态数据源设计实践
  • 原文地址:https://www.cnblogs.com/yangsanluo/p/14845374.html
Copyright © 2020-2023  润新知