• 信息化领域热词分类分析及解释


    一 项目名称:

    信息化领域热词分类分析及解释

    二 功能设计:

    1) 数据采集:要求定期自动从网络中爬取信息领域的相关热
    词;
    2) 数据清洗:对热词信息进行数据清洗,并采用自动分类技术
    生成信息领域热词目录;
    3) 热词解释:针对每个热词名词自动添加中文解释(参照百度
    百科或维基百科);
    4) 热词引用:对近期引用热词的文章或新闻进行标记,生成
    超链接目录,用户可以点击访问;
    5) 数据可视化展示:
    ① 用字符云或热词图进行可视化展示;
    ② 用关系图标识热词之间的紧密程度。
    6) 数据报告:可将所有热词目录和名词解释生成 WORD 版报告
    形式导出。

    三 项目源码:

    python:

    热点新闻爬取

    import requests
    import re
    import xlwt
    # Base address of the cnblogs news "recommend" listing. The crawl loop
    # further down rebinds `url` to each paginated address.
    url = 'https://news.cnblogs.com/n/recommend'
    # Browser-like User-Agent header that get_page() sends with every request.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    }
    def get_page(url, timeout=10):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address to request; the module-level ``headers`` are sent.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The page text when the server answers HTTP 200, otherwise ``None``
            (non-200 status or a failed request). Callers must handle ``None``.
        """
        try:
            # Without a timeout a stalled connection would block the crawl forever.
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            print(e)
            return None
        if response.status_code != 200:
            print('获取网页失败')
            return None
        print('获取网页成功')
        print(response.encoding)
        return response.text
    # Crawl the paginated listing and write every headline into column 0 of an
    # .xls sheet, one headline per row below the header cell.
    f = xlwt.Workbook(encoding='utf-8')
    sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet01.write(0, 0, '博客最热新闻')  # header cell: first row, first column
    urls = ['https://news.cnblogs.com/n/recommend?page={}'.format(i) for i in range(100)]
    # Compiled once instead of per page; the group captures the headline text.
    title_pattern = re.compile(
        r'<h2 class="news_entry">.*?<a href=".*?" target="_blank">(.*?)</a>', re.S)
    temp = 0  # number of headline rows written so far
    num = 0   # number of pages processed so far
    for url in urls:
        print(url)
        page = get_page(url)
        # get_page() returns None when a request fails; treat that page as
        # empty so one bad page does not abort the crawl with a TypeError.
        items = title_pattern.findall(page or '')
        print(len(items))
        print(items)
        for offset, title in enumerate(items, start=1):
            sheet01.write(temp + offset, 0, title)
        temp += len(items)
        num += 1
        print("已打印完第" + str(num) + "")
    print("打印完!!!")
    f.save('Hotword.xls')

    热词拆分:

    import jieba
    import pandas as pd
    import re
    import mysql.connector
    from collections import Counter
    if __name__ == '__main__':
        # Segment the collected headlines with jieba, count word frequencies and
        # write the 100 most frequent words and their counts to two parallel
        # files (line N of finalnumber.txt is the count of line N of
        # finalwords.txt).
        # NOTE(review): the crawler step saves Hotword.xls while this step reads
        # a GBK-encoded Hotword.txt - presumably exported by hand; confirm.
        with open("Hotword.txt", "r", encoding='GBK') as filehandle:
            mystr = filehandle.read()
        seg_list = jieba.cut(mystr)  # accurate mode is the default
        print(seg_list)
        # Words too generic to be useful as hot words.
        stopwords = ['', '', '', '', '-', '.', '', '', '', '(', ')', '', '', '使用', '实现', '', '', '', '', '', '',
               '', '', '中国', '发布', '公司', '首次', '全球', '正式', '2019', '2020', '亿美元', '20', '10', '&#', '首个', '正在', '最大',
               '成为', '第一', '', '', '如何', '人类', '什么', '一个', '宣布', '可能', '推出', '没有', '地球', '到底', '回应', '50', '100', '可以',
               '开始', '这个', '问题', '为什么', '我们', '背后', '终于', '重磅', '160', '国内', '需要', '为何', '亿元', '发现', '成功', '最强', '不是', '人生']
        stopword_set = frozenset(stopwords)  # O(1) membership test per token
        c = Counter()
        for x in seg_list:
            if x in stopword_set:
                continue
            # Keep multi-character tokens only, and drop whitespace-only tokens
            # such as line breaks (the original newline literal was garbled).
            if len(x) > 1 and x.strip():
                c[x] += 1
    
        print('\n词频统计结果:')
        # Context managers guarantee both output files are flushed and closed.
        with open("finalwords.txt", 'w', encoding='utf-8') as f, \
                open("finalnumber.txt", 'w', encoding='utf-8') as f2:
            for (k, v) in c.most_common(100):  # the 100 most frequent words
                print("%s:%d" % (k, v))
                f.write(k + '\n')
                f2.write(str(v) + '\n')

    解释关联及导出

    import requests
    import re
    import xlwt
    import linecache
    import mysql.connector
    # Baidu Baike home page; the lookup loop below rebinds `url` per word.
    url = 'https://baike.baidu.com/'
    # Browser-like User-Agent header that get_page() sends with every request.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    }
    # NOTE(review): database credentials are hard-coded (root / 123456);
    # consider loading them from configuration or environment variables.
    mydb = mysql.connector.connect(host='localhost', user='root', password='123456', database='python',charset='utf8')
    mycursor = mydb.cursor()
    # Collects the (words, num, message, url) rows that are inserted at the end.
    lst=[]
    def get_page(url, timeout=10):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        Args:
            url: Address to request; the module-level ``headers`` are sent.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The page text when the server answers HTTP 200, otherwise ``None``
            (non-200 status or a failed request). Callers must handle ``None``.
        """
        try:
            # Without a timeout a stalled connection would block the run forever.
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            print(e)
            return None
        # Force UTF-8 instead of the encoding guessed from the response headers.
        response.encoding = 'utf-8'
        if response.status_code != 200:
            print('获取网页失败')
            return None
        print('获取网页成功')
        return response.text
    # Look up every hot word on Baidu Baike, keep the page's meta description as
    # its explanation, and insert the rows into the `hotwords` table.
    with open('finalwords.txt', 'r', encoding='utf-8') as fopen:
        lines = fopen.readlines()
    with open('finalnumber.txt', 'r', encoding='utf-8') as fopen2:
        numbers = fopen2.readlines()
    # Strip the trailing newline once here rather than on every use of the URL.
    urls = ['https://baike.baidu.com/item/{}'.format(line.strip()) for line in lines]
    i = 0  # number of words for which an explanation was found
    # Iterate word, count and URL in lockstep. The previous code looked the word
    # and count up by the success counter `i`, so after the first word without
    # a description every later row was paired with the wrong word and count.
    for line, count_line, url in zip(lines, numbers, urls):
        print(url)
        page = get_page(url)
        # get_page() returns None on failure; treat that as "no description".
        items = re.findall('<meta name="description" content="(.*?)">', page or '', re.S)
        if items:
            words = line.strip()
            num = count_line.strip()
            message = items[0]
            print(message)
            wurl = url
            lst.append((words, num, message, wurl))
            i += 1
        print("总爬取完毕数量:" + str(i))
    print("打印完!!!")
    print(lst)
    
    tuple_lst = tuple(lst)
    # Parameterized insert: the driver escapes the scraped text.
    sql = "insert into hotwords (words,num,message,url) values (%s,%s,%s,%s)"
    mycursor.executemany(sql, tuple_lst)
    mydb.commit()

    java:

    RcServlet

    package com.servlet;
    
    import java.io.IOException;
    import java.util.Map;
    import javax.servlet.ServletException;
    import javax.servlet.annotation.WebServlet;
    import javax.servlet.http.HttpServlet;
    import javax.servlet.http.HttpServletRequest;
    import javax.servlet.http.HttpServletResponse;
    import com.dao.Dao;
    import net.sf.json.JSONArray;
    import net.sf.json.JSONObject;
    
    @WebServlet("/RcServlet")
    public class RcServlet extends HttpServlet {
        private static final long serialVersionUID = 1L;
           
        /**
         * @see HttpServlet#HttpServlet()
         */
        public RcServlet() {
            super();
            // TODO Auto-generated constructor stub
        }
    
        /**
         * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
         */
        protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    
            this.doPost(request, response);
        }
    
        /**
         * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
         */
        protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
            request.setCharacterEncoding("utf-8");
            response.setContentType("text/html;charset=utf-8");
            Map<String, Integer>sortMap=Dao.getrc();
            JSONArray json =new JSONArray();
            int k=0;
            for (Map.Entry<String, Integer> entry : sortMap.entrySet()) 
            {
                JSONObject ob=new JSONObject();
                ob.put("name", entry.getKey());
                ob.put("value", entry.getValue());
               
                    json.add(ob);
                    k++;
                if(k==100)
                    break;
            }
            System.out.println(json.toString());
            
            response.getWriter().write(json.toString());
        }
    }

    ClickServlet

    package com.servlet;
    
    import java.io.IOException;
    import java.util.List;
    
    import javax.servlet.ServletException;
    import javax.servlet.annotation.WebServlet;
    import javax.servlet.http.HttpServlet;
    import javax.servlet.http.HttpServletRequest;
    import javax.servlet.http.HttpServletResponse;
    
    import com.bean.Data;
    import com.dao.Dao;
    
    /**
     * Servlet implementation class ClickServlet
     */
    @WebServlet("/ClickServlet")
    public class ClickServlet extends HttpServlet {
        private static final long serialVersionUID = 1L;
           
        /**
         * @see HttpServlet#HttpServlet()
         */
        public ClickServlet() {
            super();
            // TODO Auto-generated constructor stub
        }
    
        /**
         * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
         */
        protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    
            doPost(request, response);
        }
    
        /**
         * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
         */
        protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
            request.setCharacterEncoding("UTF-8");
            response.setContentType("text/html;charset=utf-8");
            String words=request.getParameter("words");
            Dao dao = new Dao();
            List<Data> list=null;
            list=dao.list(words);
            System.out.println(list);
            request.setAttribute("list",list); 
            request.getRequestDispatcher("RC.jsp").forward(request, response);
        }
    
    }

    RC.jsp

    <%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
    <%@taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
    <%-- Hot-word cloud page: the left pane lists explanations from the "list"
         request attribute, the right pane (#main) hosts the word cloud. --%>
    <%request.setCharacterEncoding("utf-8"); 
    response.setCharacterEncoding("utf-8");%>
    <!DOCTYPE html>
    <html>
    <head>
    <%-- One charset declaration, matching pageEncoding; the previous
         ISO-8859-1 meta tag and the premature </head> were removed. --%>
    <meta charset="UTF-8">
    <title>热词云</title>
         <link type="text/css" rel="stylesheet" href="css/style.css">
    <script src="js/jquery-3.4.1.min.js"></script>
    <script src="js/echarts.min.js"></script>
    <script src="js/echarts-cloud.js"></script>
    <style>
               #main{
                  width: 30%;
                  height: 500px;
                  border:1px solid #ddd;
                  float:right;
              }
              #table{
                    overflow-x: auto;
                     overflow-y: auto;
                    width: 70%;
                    height: 500px;
                    float:left;
                    /* NOTE: "dp" is not a CSS unit, so browsers ignore the two
                       declarations below; use px if the spacing is wanted. */
                    margin-top:100dp;
                    padding-top:100dp;
                }
            </style>
    
    </head>
    
    <body >
    <br>
    <h1>热词云</h1>
    <br>
    <br>
    <br>
    
    <div id="table">
      <table id='gradient-style' >
        <tr>
          <th align="center">热词简介</th>
        </tr>
        <c:forEach var="item" items="${list}">
          <tr>
            <%-- message and url are scraped from the web: escape them with
                 c:out so markup inside them cannot be injected into the page. --%>
            <td><a href="<c:out value='${item.url}'/>"><c:out value="${item.message}"/></a></td>
          </tr>
        </c:forEach>
      </table>
    </div>
    
    
      <div id="main">
      
      </div>
      <script type="text/javascript">
    
        var dt;
       
                $.ajax({
                    url : "RcServlet",
                    async : true,
                    type : "POST",
                    data : {        
                    },
                    dataType : "json",
                    contentType: 'application/x-www-form-urlencoded; charset=UTF-8',
                    success : function(data) {
                        dt = data;
                        
                         var mydata = new Array(0);
                         for (var i = 0; i < dt.length; i++) {
                              var d = {};
                              
                              d["name"] = dt[i].name;
                             
                              d["value"] = dt[i].value;
                              mydata.push(d);
                          }
                         var myChart = echarts.init(document.getElementById('main'));
                         //设置点击效果
                        
                         
                         
                         myChart.setOption({
                             title: {
                                 text: ''
                             },
                             tooltip: {},
                             series: [{
                                 type : 'wordCloud',  //类型为字符云
                                     shape:'smooth',  //平滑
                                     gridSize : 8, //网格尺寸
                                     size : ['50%','50%'],
                                     //sizeRange : [ 50, 100 ],
                                     rotationRange : [-45, 0, 45, 90], //旋转范围
                                     textStyle : {
                                         normal : {
                                             fontFamily:'微软雅黑',
                                             color: function() {
                                                 return 'rgb(' + 
                                                     Math.round(Math.random() * 255) +
                                              ', ' + Math.round(Math.random() * 255) +
                                              ', ' + Math.round(Math.random() * 255) + ')'
                                                    }
                                             },
                                         emphasis : {
                                             shadowBlur : 5,  //阴影距离
                                             shadowColor : '#333'  //阴影颜色
                                         }
                                     },
                                     left: 'center',
                                     top: 'center',
                                     right: null,
                                     bottom: null,
                                     '100%',
                                     height:'100%',
                                     data:mydata
                             }]
                         });
                         
                         myChart.on('click', function (params) {
                             var url = "ClickServlet?words=" + params.name;
                             window.location.href = url;
                           });
                         
                        alert("成功!");
                       
       
                    },
                    error : function() {
                        alert("请求失败");
                    },
               });
        
             
           
    
    
    </script>
        
    
    </body>
    </html>

    四 运行截图:

    点击热词:

    点击热词解释:

  • 相关阅读:
    安装VMware16兼容Hyper-v+WSL2+Docker+解决0x80370102报错
    家用联通光纤开启IPv6
    配置微软Azure大数据HDInsight云集群
    Hadoop集群搭建-05安装配置YARN
    Hadoop集群搭建-04安装配置HDFS
    Hadoop集群搭建-03编译安装hadoop
    Hadoop集群搭建-02安装配置Zookeeper
    Hadoop集群搭建-01前期准备
    springMVC+request.session实现用户登录和访问权限控制
    idea+spring4+springmvc+mybatis+maven实现简单增删改查CRUD
  • 原文地址:https://www.cnblogs.com/wendi/p/13535274.html
Copyright © 2020-2023  润新知