一 项目名称:
信息化领域热词分类分析及解释
二 功能设计:
1) 数据采集:要求定期自动从网络中爬取信息领域的相关热词;
2) 数据清洗:对热词信息进行数据清洗,并采用自动分类技术生成信息领域热词目录;
3) 热词解释:针对每个热词名词自动添加中文解释(参照百度百科或维基百科);
4) 热词引用:对近期引用热词的文章或新闻进行标记,生成超链接目录,用户可以点击访问;
5) 数据可视化展示:
① 用字符云或热词图进行可视化展示;
② 用关系图标识热词之间的紧密程度。
6) 数据报告:可将所有热词目录和名词解释以 Word 版报告形式导出。
三 项目源码:
python:
热点新闻爬取
"""Crawl recommended-news headlines from cnblogs and save them to Hotword.xls."""
import re

import requests
import xlwt

url = 'https://news.cnblogs.com/n/recommend'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}

# Seconds to wait for a response; without a timeout one stalled request
# would hang the whole crawl.
REQUEST_TIMEOUT = 10
# Number of listing pages to request.
PAGE_COUNT = 100
# Captures the link text of every headline entry on a listing page.
TITLE_PATTERN = re.compile(
    r'<h2 class="news_entry">.*?<a href=".*?" target="_blank">(.*?)</a>', re.S
)


def get_page(url):
    """Fetch *url* and return the response body as text.

    Returns None when the request raises or the status code is not 200;
    the reason is printed so the caller only has to skip the page.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(e)
        return None
    if response.status_code != 200:
        print('获取网页失败')
        return None
    print('获取网页成功')
    print(response.encoding)
    return response.text


def main():
    """Crawl every listing page and write the headlines into one sheet column."""
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet.write(0, 0, '博客最热新闻')  # header cell: first row, first column

    # NOTE(review): page numbering starts at 0 as in the original script;
    # confirm the site does not serve page 0 and page 1 identically.
    page_urls = ['{}?page={}'.format(url, i) for i in range(PAGE_COUNT)]

    next_row = 1  # row 0 holds the header
    for page_no, page_url in enumerate(page_urls, start=1):
        print(page_url)
        page = get_page(page_url)
        if page is None:
            # A failed fetch used to crash re.findall with a TypeError;
            # skip the page and keep the headlines collected so far.
            continue
        items = TITLE_PATTERN.findall(page)
        print(len(items))
        print(items)
        for offset, title in enumerate(items):
            sheet.write(next_row + offset, 0, title)
        next_row += len(items)
        print("已打印完第" + str(page_no) + "页")

    print("打印完!!!")
    workbook.save('Hotword.xls')


if __name__ == '__main__':
    main()
热词拆分:
"""Segment the crawled headlines with jieba and export the 100 most frequent words."""
import re
from collections import Counter

import jieba
import mysql.connector
import pandas as pd

# Tokens that carry no topical meaning and are excluded from the ranking.
STOPWORDS = frozenset([
    '的', ',', ')', '(', '-', '.', '—', ':', '之', '?', '和', '使用', '实现',
    '、', '与', '!', '你', '了', '中', '】', '【', '中国', '发布', '公司', '首次',
    '全球', '正式', '2019', '2020', '亿美元', '20', '10', '&#', '首个', '正在',
    '最大', '成为', '第一', '这', '个', '如何', '人类', '什么', '一个', '宣布',
    '可能', '推出', '没有', '地球', '到底', '回应', '50', '100', '可以', '开始',
    '这个', '问题', '为什么', '我们', '背后', '终于', '重磅', '160', '国内',
    '需要', '为何', '亿元', '发现', '成功', '最强', '不是', '人生',
])

# How many of the most frequent words are exported.
TOP_N = 100


def count_words(tokens, stopwords=STOPWORDS):
    """Return a Counter of the tokens longer than one character that are not stopwords."""
    return Counter(
        token for token in tokens
        if len(token) > 1 and token not in stopwords
    )


def main():
    """Read Hotword.txt, rank its words and write finalwords.txt / finalnumber.txt."""
    # NOTE(review): the crawler step saves Hotword.xls, while this script
    # expects a GBK text file named Hotword.txt - presumably exported by
    # hand; confirm.
    with open("Hotword.txt", "r", encoding='GBK') as source:
        text = source.read()

    counts = count_words(jieba.cut(text))  # jieba defaults to accurate mode

    print(' 词频统计结果:')
    with open("finalwords.txt", 'w', encoding='utf-8') as words_out, \
            open("finalnumber.txt", 'w', encoding='utf-8') as numbers_out:
        for word, freq in counts.most_common(TOP_N):
            print("%s:%d" % (word, freq))
            # One entry per line: the lookup script reads both files line by
            # line, so a space separator would merge every word into one record.
            words_out.write(word + '\n')
            numbers_out.write(str(freq) + '\n')


if __name__ == '__main__':
    main()
解释关联及导出
"""Fetch a Baidu Baike summary for every hot word and store the results in MySQL."""
import linecache
import re

import mysql.connector
import requests
import xlwt

url = 'https://baike.baidu.com/'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}

# Seconds to wait for a response before giving up on one word.
REQUEST_TIMEOUT = 10
# The page summary is exposed in the description meta tag.
DESCRIPTION_PATTERN = re.compile(r'<meta name="description" content="(.*?)">', re.S)
INSERT_SQL = "insert into hotwords (words,num,message,url) values (%s,%s,%s,%s)"


def get_page(url):
    """Fetch *url* and return the body decoded as UTF-8, or None on failure."""
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(e)
        return None
    response.encoding = 'utf-8'
    if response.status_code != 200:
        print('获取网页失败')
        return None
    print('获取网页成功')
    return response.text


def read_tokens(path):
    """Return the whitespace-separated entries stored in the UTF-8 file *path*.

    Splitting on any whitespace accepts both one-entry-per-line and
    space-separated files, and never leaves a newline inside an entry.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read().split()


def collect_rows(words, numbers):
    """Return (word, count, summary, url) tuples for every word that has a summary.

    Words whose page cannot be fetched or has no description are skipped.
    Each word is paired with its count by position, so a skipped word can
    never shift the counts of the words after it.
    """
    rows = []
    for word, num in zip(words, numbers):
        wurl = 'https://baike.baidu.com/item/{}'.format(word)
        print(wurl)
        page = get_page(wurl)
        if page is None:
            continue
        items = DESCRIPTION_PATTERN.findall(page)
        if not items:
            continue
        message = items[0]
        print(message)
        rows.append((word, num, message, wurl))
    return rows


def save_rows(rows):
    """Insert *rows* into the hotwords table and release the connection."""
    # NOTE(review): credentials are hard-coded as in the original script;
    # move them to configuration before using this outside a local demo.
    mydb = mysql.connector.connect(host='localhost', user='root', password='123456',
                                   database='python', charset='utf8')
    try:
        mycursor = mydb.cursor()
        try:
            mycursor.executemany(INSERT_SQL, rows)
            mydb.commit()
        finally:
            mycursor.close()
    finally:
        mydb.close()


def main():
    """Look up every ranked word and persist the ones that have a summary."""
    words = read_tokens('finalwords.txt')
    numbers = read_tokens('finalnumber.txt')
    lst = collect_rows(words, numbers)
    print("总爬取完毕数量:" + str(len(lst)))
    print("打印完!!!")
    print(lst)
    if lst:
        save_rows(lst)


if __name__ == '__main__':
    main()
java:
RcServlet
package com.servlet; import java.io.IOException; import java.util.Map; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import com.dao.Dao; import net.sf.json.JSONArray; import net.sf.json.JSONObject; @WebServlet("/RcServlet") public class RcServlet extends HttpServlet { private static final long serialVersionUID = 1L; /** * @see HttpServlet#HttpServlet() */ public RcServlet() { super(); // TODO Auto-generated constructor stub } /** * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) */ protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { this.doPost(request, response); } /** * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response) */ protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { request.setCharacterEncoding("utf-8"); response.setContentType("text/html;charset=utf-8"); Map<String, Integer>sortMap=Dao.getrc(); JSONArray json =new JSONArray(); int k=0; for (Map.Entry<String, Integer> entry : sortMap.entrySet()) { JSONObject ob=new JSONObject(); ob.put("name", entry.getKey()); ob.put("value", entry.getValue()); json.add(ob); k++; if(k==100) break; } System.out.println(json.toString()); response.getWriter().write(json.toString()); } }
ClickServlet
package com.servlet; import java.io.IOException; import java.util.List; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import com.bean.Data; import com.dao.Dao; /** * Servlet implementation class ClickServlet */ @WebServlet("/ClickServlet") public class ClickServlet extends HttpServlet { private static final long serialVersionUID = 1L; /** * @see HttpServlet#HttpServlet() */ public ClickServlet() { super(); // TODO Auto-generated constructor stub } /** * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) */ protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { doPost(request, response); } /** * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response) */ protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { request.setCharacterEncoding("UTF-8"); response.setContentType("text/html;charset=utf-8"); String words=request.getParameter("words"); Dao dao = new Dao(); List<Data> list=null; list=dao.list(words); System.out.println(list); request.setAttribute("list",list); request.getRequestDispatcher("RC.jsp").forward(request, response); } }
RC.jsp
<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
<%@taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<%--
  Hot-word cloud page.
  Left pane: one link per entry of the request attribute "list" (set by
  ClickServlet), showing the entry's message and pointing at its url.
  Right pane: a word cloud built from the JSON returned by RcServlet;
  clicking a word navigates to ClickServlet for that word.
--%>
<%
    request.setCharacterEncoding("utf-8");
    response.setCharacterEncoding("utf-8");
%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>热词云</title>
<link type="text/css" rel="stylesheet" href="css/style.css">
<script src="js/jquery-3.4.1.min.js"></script>
<script src="js/echarts.min.js"></script>
<script src="js/echarts-cloud.js"></script>
<style>
#main {
    width: 30%;
    height: 500px;
    border: 1px solid #ddd;
    float: right;
}

/* No top margin or padding: the earlier "100dp" values are not valid CSS
   lengths, so browsers ignored them. */
#table {
    overflow-x: auto;
    overflow-y: auto;
    width: 70%;
    height: 500px;
    float: left;
}
</style>
</head>
<body>
    <br>
    <h1>热词云</h1>
    <br>
    <br>
    <br>
    <div id="table">
        <table id='gradient-style'>
            <tr>
                <th align="center">热词简介</th>
            </tr>
            <%-- The message and url are crawled from external pages, so both are
                 written through c:out to keep markup in them from being rendered. --%>
            <c:forEach var="item" items="${list}">
                <tr>
                    <td><a href="<c:out value="${item.url}"/>"><c:out value="${item.message}"/></a></td>
                </tr>
            </c:forEach>
        </table>
    </div>
    <div id="main"></div>
    <script type="text/javascript">
        $.ajax({
            url : "RcServlet",
            async : true,
            type : "POST",
            data : {},
            dataType : "json",
            contentType : 'application/x-www-form-urlencoded; charset=UTF-8',
            success : function(data) {
                // Keep only the two fields the word-cloud series reads.
                var mydata = data.map(function(item) {
                    return { name : item.name, value : item.value };
                });

                var myChart = echarts.init(document.getElementById('main'));
                myChart.setOption({
                    title : { text : '' },
                    tooltip : {},
                    series : [ {
                        type : 'wordCloud', // word-cloud series
                        shape : 'smooth',
                        gridSize : 8, // grid size
                        size : [ '50%', '50%' ],
                        rotationRange : [ -45, 0, 45, 90 ], // rotation range
                        textStyle : {
                            normal : {
                                fontFamily : '微软雅黑',
                                // A random RGB colour for every word.
                                color : function() {
                                    return 'rgb('
                                        + Math.round(Math.random() * 255) + ', '
                                        + Math.round(Math.random() * 255) + ', '
                                        + Math.round(Math.random() * 255) + ')'
                                }
                            },
                            emphasis : {
                                shadowBlur : 5, // shadow blur radius
                                shadowColor : '#333' // shadow colour
                            }
                        },
                        left : 'center',
                        top : 'center',
                        right : null,
                        bottom : null,
                        // The "width" key was missing, which made the whole
                        // script a syntax error.
                        width : '100%',
                        height : '100%',
                        data : mydata
                    } ]
                });

                // Clicking a word opens its entries; the word is encoded so
                // characters such as & or # cannot break the query string.
                myChart.on('click', function(params) {
                    window.location.href = "ClickServlet?words=" + encodeURIComponent(params.name);
                });
                alert("成功!");
            },
            error : function() {
                alert("请求失败");
            }
        });
    </script>
</body>
</html>
四 运行截图:
点击热词:
点击热词解释: