• java提取(获取)博客信息(内容)


    package com.wbg.my.service;
    import java.io.*;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.util.*;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    /**
     * Scrapes the post list of one cnblogs blog and exports it as a single static HTML index page.
     *
     * <p>{@link #main(String[])} scans the first {@link #PAGE_COUNT} list pages, collects every
     * post URL that matches {@link #URL_PAGE_DETAIL}, downloads each post to read its title and
     * publish date, and writes an HTML page whose inline script renders one table row per post.
     *
     * <p>Working state is kept in mutable static fields without synchronization, so the class
     * must not be driven from several threads at once.
     *
     * @author Jack Chen
     */
    public class BlogUtil {

        /** Prefix of the blog's paged post list; the 1-based page number is appended to it. */
        public final static String URL_PAGE = "https://www.cnblogs.com/weibanggang/default.html?page=";
        /** Regex of a post detail URL; group 1 captures the {@code <digits>.html} file name. */
        public final static String URL_PAGE_DETAIL = "https://www\\.cnblogs\\.com/weibanggang/p/([0-9]+\\.html)";
        /** Number of list pages scanned by {@link #main(String[])}. */
        public final static int PAGE_COUNT = 20;
        /** Every detail page URL found so far; a set, so a link seen twice is stored once. */
        public static Set<String> urlLists = new TreeSet<String>();
        /** Compiled form of {@link #URL_PAGE_DETAIL}. */
        public final static Pattern p = Pattern.compile(URL_PAGE_DETAIL);
        /** Path of the generated HTML file. */
        public static String file = "d:index.html";
        /** JavaScript array-literal fragments, one per post, filled by {@code createFile(String)}. */
        static String[] arr = null;
        /** Index of the slot in {@link #arr} that the next fragment is written to. */
        static int sun = 0;
        /** Separator put in front of the next fragment: empty for the first one, "," afterwards. */
        static String fh = "";

        /** Common prefix of every post detail URL. */
        private static final String POST_URL_PREFIX = "https://www.cnblogs.com/weibanggang/p/";
        /** Opening tag that precedes the publish date inside a post page. */
        private static final String DATE_OPEN_TAG = "<span id=\"post-date\">";
        /** Site suffix that is cut off the end of a page title. */
        private static final String TITLE_SUFFIX = "- 韦邦杠 - 博客园";
        /** Connect and read timeout for every HTTP request, so a stalled server cannot hang the run. */
        private static final int TIMEOUT_MILLIS = 15000;

        /**
         * Runs the whole export: collect post URLs, fetch each post, build the page, write it.
         *
         * @param args ignored
         * @throws Exception if a page cannot be downloaded or the output file cannot be written
         */
        public static void main(String[] args) throws Exception {
            for (int page = 1; page <= PAGE_COUNT; page++) {
                getUrls(page);
            }
            System.out.println("开始获取内容!");
            arr = new String[urlLists.size()];
            // Reset the cursor and separator so that a second call starts a fresh array literal.
            sun = 0;
            fh = "";
            for (String url : urlLists) {
                createFile(url);
                sun++;
            }
            System.out.println("获取内容完毕!");
            System.out.println("开始写入文件!");
            StringBuilder html = new StringBuilder(kais());
            for (String row : arr) {
                html.append(row);
            }
            html.append(jiehun());
            System.out.println("写入文件完毕!");
            System.out.println("开始导出文件!");
            createFile(file, html);
            System.out.println("导出文件完毕!");
            System.out.println("输出文件地址为:" + file);
        }

        /**
         * Writes {@code content} to {@code file} as UTF-8, replacing any existing file.
         *
         * <p>UTF-8 is used because the generated page declares {@code charset="utf-8"}.
         *
         * @param file    path of the file to create or overwrite
         * @param content full text of the file
         * @throws IOException if the file cannot be opened or written
         */
        private static void createFile(String file, CharSequence content) throws IOException {
            File target = new File(file);
            try (OutputStream bytes = new FileOutputStream(target);
                 Writer out = new OutputStreamWriter(bytes, "UTF-8")) {
                out.write(content.toString());
            }
        }

        /**
         * Returns the fixed head of the generated page: document head, styles, the empty table,
         * and the start of the inline script up to the opening bracket of the {@code sum} array.
         *
         * @return HTML text ending in {@code var sum=[}
         */
        public static String kais() {
            return "<!DOCTYPE html>\n" +
                    "<html>\n" +
                    "<head>\n" +
                    "    <meta charset=\"utf-8\">\n" +
                    "    <title>weibanggang.github.io</title>\n" +
                    "    <meta name=\"renderer\" content=\"webkit\">\n" +
                    "    <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\">\n" +
                    "    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1, maximum-scale=1\">\n" +
                    "    <style>\n" +
                    "        html,body{width:100%;height: 100%}\n" +
                    "        table{width: 1150px;height:500px;margin: auto}\n" +
                    "        table,td,th{border: 1px solid #e6e6e6;border-collapse:collapse; }\n" +
                    "        body{-moz-background-size:100% 100%; background-size:100% 100%;background-image:url(\"link.jpg\");background-repeat: no-repeat}\n" +
                    "        * { margin: 0; padding: 0; }\n" +
                    "        table { border-collapse: collapse; text-align: center;  }\n" +
                    "        /*关键设置 tbody出现滚动条*/\n" +
                    "        table tbody {\n" +
                    "            display: block;\n" +
                    "            height: 500px;\n" +
                    "            overflow-y: scroll;overflow-x:hidden;\n" +
                    "        }\n" +
                    "  table thead,  tbody tr { display: table; width:100%; table-layout: fixed;  }\n" +
                    "        table thead th {  height: 40px  }\n" +
                    "        table tbody td {height: 30px }\n" +
                    "    </style>\n" +
                    "</head>\n" +
                    "\n" +
                    "<body>\n" +
                    "<marquee><h1 style=\"color:white;\">本网页仅作为参考博客、github等地址</h1></marquee>\n" +
                    "<table width=\"80%\" border=\"1\">\n" +
                    "    <thead>\n" +
                    "    <tr>\n" +
                    "        <th style=\"width:230px\">序号</th>\n" +
                    "        <th style=\"width:231px\">标题链接</th>\n" +
                    "        <th style=\"width:231px\">时间</th>\n" +
                    "        <th style=\"width:231px\">来源</th>\n" +
                    "        <th style=\"width:249px\">备注</th>\n" +
                    "    </tr>\n" +
                    "    </thead>\n" +
                    "    <tbody>\n" +
                    "\n" +
                    "    </tbody>\n" +
                    "</table>\n" +
                    "</body>\n" +
                    "<script src=\"js/jquery.js\"></script>\n" +
                    "<script>\n" +
                    "    var sum=[";
        }

        /**
         * Returns the fixed tail of the generated page: the closing bracket of the {@code sum}
         * array, the script loop that appends one table row per array entry, and the closing tags.
         *
         * @return HTML text starting with {@code ];}
         */
        public static String jiehun() {
            return " ];\n" +
                    "    \n" +
                    "    for(var i=0;i<sum.length;i++){\n" +
                    "        var tr=$(\"<tr/>\");\n" +
                    "            //序号\n" +
                    "            $(\"<td/>\").html(i+1).appendTo(tr);\n" +
                    "            //标题链接\n" +
                    "            var a=\"<a href='\"+sum[i][0]+\"' target='_blank'>\"+sum[i][1]+\"</a>\"\n" +
                    "            $(\"<td/>\").html(a).appendTo(tr);\n" +
                    "            //时间\n" +
                    "            $(\"<td/>\").html(sum[i][2]).appendTo(tr);\n" +
                    "            //来源\n" +
                    "            $(\"<td/>\").html(sum[i][3]).appendTo(tr);\n" +
                    "            //备注\n" +
                    "            $(\"<td/>\").html(sum[i][4]).appendTo(tr);\n" +
                    "            $(\"table tbody\").append(tr);\n" +
                    "    }\n" +
                    "</script>\n" +
                    "</html>";
        }

        /**
         * Downloads one post page and stores its table row in {@code arr[sun]}.
         *
         * <p>The row is a JavaScript array literal {@code ["href","title","date","博客","正常"]},
         * prefixed with the current separator {@link #fh}. Despite its name this overload does
         * not create a file; {@link #main(String[])} advances {@link #sun} after each call.
         *
         * @param url post detail URL; must match {@link #URL_PAGE_DETAIL}
         * @throws IllegalArgumentException if {@code url} is not a post detail URL
         * @throws Exception if the page cannot be downloaded
         */
        private static void createFile(String url) throws Exception {
            Matcher m = p.matcher(url);
            if (!m.find()) {
                throw new IllegalArgumentException("Not a post detail URL: " + url);
            }
            String fileName = m.group(1);

            StringBuffer page = new StringBuffer();
            HttpURLConnection conn = openConnection(url);
            try (BufferedReader reader = openReader(conn)) {
                String line;
                // Lines are joined without separators; the markers searched for later sit on one line.
                while ((line = reader.readLine()) != null) {
                    page.append(line);
                }
            } finally {
                conn.disconnect();
            }

            String href = POST_URL_PREFIX + fileName;
            String title = getTitle(page);
            String date = getDate(page);
            // Escape every value: a quote or backslash in a title would otherwise break the script.
            arr[sun] = fh + "[\"" + escapeJs(href) + "\",\"" + escapeJs(title) + "\",\""
                    + escapeJs(date) + "\",\"博客\",\"正常\"]";
            fh = ",";
        }

        /**
         * Extracts the publish date, i.e. the text between {@code <span id="post-date">} and the
         * next {@code </span>}.
         *
         * @param sb full HTML of a post page
         * @return the date text, or an empty string when either tag is missing
         */
        public static String getDate(StringBuffer sb) {
            int open = sb.indexOf(DATE_OPEN_TAG);
            if (open < 0) {
                return "";
            }
            int start = open + DATE_OPEN_TAG.length();
            int end = sb.indexOf("</span>", start);
            if (end < 0) {
                return "";
            }
            return sb.substring(start, end);
        }

        /**
         * Extracts the page title and cuts off the trailing site suffix {@code - 韦邦杠 - 博客园}.
         *
         * @param sb full HTML of a post page
         * @return the title up to (not including) the last occurrence of the suffix; the whole
         *         title when the suffix is absent; an empty string when there is no
         *         {@code <title>} element
         */
        public static String getTitle(StringBuffer sb) {
            String openTag = "<title>";
            int open = sb.indexOf(openTag);
            if (open < 0) {
                return "";
            }
            int start = open + openTag.length();
            int end = sb.indexOf("</title>", start);
            if (end < 0) {
                return "";
            }
            String title = sb.substring(start, end);
            int suffixAt = title.lastIndexOf(TITLE_SUFFIX);
            return suffixAt < 0 ? title : title.substring(0, suffixAt);
        }

        /**
         * Downloads one list page and adds every post detail URL found on it to {@link #urlLists}.
         *
         * @param idx 1-based number of the list page
         * @throws Exception if the page cannot be downloaded
         */
        private static void getUrls(int idx) throws Exception {
            HttpURLConnection conn = openConnection(URL_PAGE + idx);
            try (BufferedReader reader = openReader(conn)) {
                String line;
                while ((line = reader.readLine()) != null) {
                    Matcher m = p.matcher(line);
                    // A single line may carry several links, so keep searching after the first hit.
                    while (m.find()) {
                        urlLists.add(m.group());
                    }
                }
            } finally {
                conn.disconnect();
            }
        }

        /** Opens an HTTP connection to {@code address} with connect and read timeouts set. */
        private static HttpURLConnection openConnection(String address) throws IOException {
            HttpURLConnection conn = (HttpURLConnection) new URL(address).openConnection();
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            return conn;
        }

        /** Wraps the response body of {@code conn} in a UTF-8 line reader. */
        private static BufferedReader openReader(HttpURLConnection conn) throws IOException {
            return new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"));
        }

        /**
         * Escapes {@code text} for use inside a double-quoted JavaScript string literal that is
         * embedded in an HTML {@code <script>} element.
         */
        private static String escapeJs(String text) {
            StringBuilder out = new StringBuilder(text.length() + 8);
            for (int i = 0; i < text.length(); i++) {
                char c = text.charAt(i);
                switch (c) {
                    case '\\':
                        out.append("\\\\");
                        break;
                    case '"':
                        out.append("\\\"");
                        break;
                    case '\n':
                        out.append("\\n");
                        break;
                    case '\r':
                        out.append("\\r");
                        break;
                    case '/':
                        // "</script>" inside the literal would end the script element early.
                        if (i > 0 && text.charAt(i - 1) == '<') {
                            out.append("\\/");
                        } else {
                            out.append(c);
                        }
                        break;
                    default:
                        out.append(c);
                }
            }
            return out.toString();
        }

    }

  • 相关阅读:
    实验二 Nmap的实践
    《网络攻击与防范》第八周学习总结
    《网络攻击与防范》第七周学习总结
    《网络攻击与防范》第六周学习总结
    《网络攻击与防范》第五周学习总结
    《网络攻击与防范》第四周学习总结
    《网络攻击与防范》第三周学习总结
    《网络攻击与防范》第二周学习总结
    Linux 基础入门学习总结
    20169312 2016-2017-2《网络攻防实践》课程总结
  • 原文地址:https://www.cnblogs.com/weibanggang/p/10019453.html
Copyright © 2020-2023  润新知