• 热词云


    爬取代码:

    1 import requests
      2 from bs4 import BeautifulSoup
      3 import bs4
      4 # -*- coding: UTF-8 -*
      5 from urllib.request import urlopen
      6 from pdfminer.pdfinterp import PDFResourceManager, process_pdf
      7 from pdfminer.converter import TextConverter
      8 from pdfminer.layout import LAParams
      9 from io import StringIO
     10 from pyhanlp import *
     11 import time
     12 
     13 import requests
     14 import json
     15 from pymysql import *
     16 
     # Open the database connection used by the crawler.
     def connectDB():
         """Connect to the local MySQL database and return the connection.

         Returns:
             The connection object returned by ``connect`` on success, or
             ``None`` when the connection attempt raises (the error is printed).
         """
         try:
             # NOTE(review): credentials are hard-coded here; consider reading
             # them from configuration or the environment instead.
             db = connect(host='localhost', port=3306, user='root', password='123456', db='python')
             print("数据库连接成功")
             return db
         except Exception as e:
             print(e)
         # Python has no NULL; the original `return NULL` raised NameError on
         # every failed connection instead of reporting the failure.
         return None
     26 
     # Shared module-level connection; insertInformation() reads it.
     db = connectDB()
     28 
     # Insert one crawled paper into the database.
     def insertInformation(title,abstract,keywords,href):
         """Insert one row into ``new_table`` and commit it.

         The values are handed to the driver as query parameters rather than
         being formatted into the SQL text, so quotes inside a title or an
         abstract can neither break the statement nor inject SQL.

         Args:
             title: Value for the ``title`` column.
             abstract: Value for the ``abstract`` column.
             keywords: Value for the ``keywords`` column.
             href: Value for the ``href`` column.

         Returns:
             True when the row was committed; False when the insert failed
             (the error is printed and the transaction is rolled back).
         """
         cursor = db.cursor()
         try:
             cursor.execute(
                 "insert into new_table(title,abstract,keywords,href) values(%s,%s,%s,%s)",
                 (title, abstract, keywords, href),
             )
             db.commit()
             # Report success only after the commit actually went through.
             print("插入成功")
             return True
         except Exception as e:
             print(e)
             db.rollback()
             return False
         finally:
             # Release the cursor on both the success and the failure path.
             cursor.close()
     42 
     # Parallel accumulators filled by getDataFromHtml(): link targets and
     # the matching link texts, index for index.
     list_href, list_title = [], []
     45 
     def getHtmlText(url, timeout=30):
         """Download *url* and return the response body as text.

         Args:
             url: Address of the page to fetch.
             timeout: Seconds to wait for the server before giving up.
                 Without a timeout ``requests.get`` can block indefinitely
                 on an unresponsive host.

         Returns:
             The decoded page text.

         Raises:
             requests.HTTPError: If the server answers with an error status
                 (via ``raise_for_status``).
         """
         r = requests.get(url, timeout=timeout)
         r.raise_for_status()
         # Decode with the encoding detected from the content rather than
         # the one guessed from the response headers.
         r.encoding = r.apparent_encoding
         return r.text
     52 
     53 
     54 
     def getDataFromHtml(list,html):
         """Collect the links found in the table body of *html*.

         For every ``<a>`` inside a ``<td>`` of the first ``<tbody>``, the link
         target is appended to the module-level ``list_href`` and the link
         text to ``list_title``, so the two lists stay index-aligned.

         Args:
             list: Unused; kept so existing callers keep working.
             html: HTML source of the listing page.
         """
         bs = BeautifulSoup(html, "lxml")
         table_body = bs.tbody
         if table_body is None:
             # No table body on the page: nothing to collect.
             return
         for td in table_body.find_all("td"):
             # href=True skips anchors without a target, which previously
             # raised KeyError on a['href'].
             for a in td.find_all("a", href=True):
                 list_href.append(a['href'])
                 list_title.append(a.text)
     62 
     def showAll(list):
         """Print every element of *list* on its own line, in order."""
         entries = iter(list)
         for entry in entries:
             print(entry)
     66 
     67 
     def readPDF(pdfFile):
         """Extract the text of a PDF.

         Args:
             pdfFile: Open binary file-like object positioned at the start of
                 the PDF data.

         Returns:
             The text that the converter wrote for the whole document.
         """
         rsrcmgr = PDFResourceManager()
         laparams = LAParams()
         # The buffer and the converter are released even when parsing fails,
         # so a broken PDF does not leak them.
         with StringIO() as retstr:
             device = TextConverter(rsrcmgr, retstr, laparams=laparams)
             try:
                 process_pdf(rsrcmgr, device, pdfFile)
             finally:
                 device.close()
             return retstr.getvalue()
     78 
     def extractAbstract(text, fallback_length=800):
         """Return the abstract section of a paper's extracted text.

         The section runs from the first "Abstract" up to the first
         introduction heading that follows it. When no heading follows, the
         next *fallback_length* characters are returned instead.

         Args:
             text: Full text extracted from the paper.
             fallback_length: Number of characters to keep when no
                 introduction heading is found after the abstract.

         Returns:
             The abstract text, or ``None`` when *text* has no "Abstract".
         """
         start = text.find("Abstract")
         if start == -1:
             return None
         for heading in ("1. Introduction", "1.Introduction"):
             # Search only after the abstract: a heading that appears earlier
             # in the text would otherwise produce an empty slice.
             end = text.find(heading, start)
             if end != -1:
                 return text[start:end]
         return text[start:start + fallback_length]


     if __name__ == '__main__':
         url = "https://blog.csdn.net/u014636245/article/details/91426736"
         try:
             html = getHtmlText(url)
             # The first argument is ignored by getDataFromHtml; the results
             # are appended to list_href / list_title.
             getDataFromHtml(list_href, html)
         except Exception as e:
             print(e)
             print("爬取失败")
         else:
             for i, (title, href) in enumerate(zip(list_title, list_href)):
                 print(i)
                 # Handle failures per paper so that one unreachable or broken
                 # PDF does not abort the rest of the crawl.
                 try:
                     # Fetch the remote PDF; the response is closed on every path.
                     with urlopen(href) as pdfFile:
                         outputString = readPDF(pdfFile)
                     document = extractAbstract(outputString)
                     if document is not None:
                         keywords = HanLP.extractKeyword(document, 10)
                         print(keywords)
                         # Space-terminated keywords, same format as before.
                         keyword_text = "".join(str(k) + " " for k in keywords)
                         insertInformation(title, document, keyword_text, href)
                 except Exception as e:
                     print(e)
                     print("爬取失败")
                 # Small pause between downloads to go easy on the server.
                 time.sleep(0.1)
    
    py

    结果:

    爬取到的记录有很多条,每条记录的 keywords 字段里保存了 10 个关键词;

    然后将它们从数据库中取出来放到数组中,统计每个关键词出现的次数,再进行排序,找出出现次数最多的词;

    不要忘记将介词等无用词去掉;

    排序最简单的做法是使用 Map(关键词 → 出现次数):

    // Sort the word counts.
    List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
    // Collections.sort needs a Comparator to order the entries by their count.
    Comparator<Map.Entry<String, Integer>> comparator = new Comparator<Map.Entry<String, Integer>>() {
        public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
            return left.getValue().compareTo(right.getValue());
        }
    };
    // The sort is ascending, so the most frequent words end up at the tail.
    Collections.sort(list, comparator);
    String ten[] = new String[50];
    int shu[] = new int[50];
    // Never read more entries than exist: with fewer than 50 distinct words the
    // original loop called list.get() with a negative index and threw.
    int limit = Math.min(50, list.size());
    for (int i = 0; i < limit; i++) { // walk from the tail: highest count first
        Map.Entry<String, Integer> entry = list.get(list.size() - i - 1);
        ten[i] = entry.getKey();
        shu[i] = entry.getValue();

        Tu tu = new Tu();
        tu.name = ten[i];
        tu.value = shu[i];
        list_tu.add(tu);
        System.out.println(entry.getKey() + ":" + entry.getValue());
    }

    然后设置一个点击事件,转换成json的代码形式

    // Declare the body as UTF-8 JSON before the writer is obtained; once
    // getWriter() has been called the response encoding can no longer change.
    response.setContentType("application/json;charset=UTF-8");
    Gson gson = new Gson();
    String json = gson.toJson(list_tu);
    response.getWriter().write(json);

    然后使用echarts设计热词云

    <%@ page language="java" contentType="text/html; charset=UTF-8"
        pageEncoding="UTF-8"%>
    <%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
    <%-- Word-cloud page: draws the hot words returned by PaperServlet_ and lists
         the papers placed in the "list" request attribute. --%>
    <!DOCTYPE html>
    <html>
    <head>
    <meta charset="UTF-8">
    <title>Insert title here</title>
    <link rel="stylesheet" href="css/bootstrap.min.css" type="text/css" />
    <script src="js/jquery-1.11.3.min.js" type="text/javascript"></script>
    <script type="text/javascript" src="js/echarts.min.js"></script>
    <script type="text/javascript" src="js/china.js"></script>
    <script src="js/bootstrap.min.js" type="text/javascript"></script>
    <script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
    <script src='js/echarts-wordcloud.js'></script>
    </head>
    <body>
    <div id="main" style="width: 100%;height: 400px"></div>
    <div>
      <table class="table" style="width: 100%;align-content: center;" >
        <tr>
          <th align="center">论文连接</th>
        </tr>
        <c:forEach var="item" items="${list}">
          <tr>
            <%-- c:out escapes the stored values so they cannot inject markup. --%>
            <td><a href="<c:out value='${item.lianjie}'/>"><c:out value="${item.title}"/></a></td>
          </tr>
        </c:forEach>
      </table>
    </div>
    <script>
      var chart = echarts.init(document.getElementById('main'));

      // Build the word-cloud series from the servlet response and draw it.
      function renderWordCloud(dt) {
        var mydata = [];
        for (var i = 0; i < dt.length; i++) {
          mydata.push({ name: dt[i].name, value: dt[i].value });
        }
        var option = {
          tooltip: {},
          series: [ {
            type: 'wordCloud',
            gridSize: 2,
            sizeRange: [20, 50],
            rotationRange: [-90, 90],
            shape: 'pentagon',
            width: 600,
            height: 300,
            drawOutOfBound: true,
            textStyle: {
              normal: {
                // Random dark colour for every word.
                color: function () {
                  return 'rgb(' + [
                    Math.round(Math.random() * 160),
                    Math.round(Math.random() * 160),
                    Math.round(Math.random() * 160)
                  ].join(',') + ')';
                }
              },
              emphasis: {
                shadowBlur: 10,
                shadowColor: '#333'
              }
            },
            data: mydata
          } ]
        };

        chart.setOption(option);
      }

      // Draw from the success callback: the page no longer blocks on a
      // synchronous request, and a failed request no longer falls through to
      // code that reads an undefined response.
      $.ajax({
        url : "PaperServlet_",
        type : "POST",
        dataType : "json",
        success : renderWordCloud,
        error : function() {
          alert("请求失败");
        }
      });

      chart.on('click', function (params) {
          // Encode the word so spaces and non-ASCII characters survive the URL.
          var url = "ClickServlet?geunjian=" + encodeURIComponent(params.name);
          window.location.href = url;
        });
      // Call resize through the chart so that `this` is the chart instance.
      window.onresize = function () {
        chart.resize();
      };
    </script>
    </body>
    </html>

    点击热词后,页面携带该热词跳转到 Servlet,再从数据库中查出关键字包含该热词的论文列表:

    import java.io.IOException;
    import java.sql.SQLException;
    import java.util.ArrayList;
    import java.util.List;
    
    import javax.servlet.ServletException;
    import javax.servlet.annotation.WebServlet;
    import javax.servlet.http.HttpServlet;
    import javax.servlet.http.HttpServletRequest;
    import javax.servlet.http.HttpServletResponse;
    
    import com.me.dao.LWDao;
    import com.me.domain.LunWen;
    
    /**
     * Servlet implementation class ClickServlet.
     *
     * Receives the clicked hot word in the "geunjian" request parameter, asks
     * the DAO for the matching papers, turns their stored links into absolute
     * URLs and forwards the list to lw.jsp as the "list" request attribute.
     */
    @WebServlet("/ClickServlet")
    public class ClickServlet extends HttpServlet {
        private static final long serialVersionUID = 1L;
        /** Site prefix put in front of every stored relative link. */
        private static final String PAPER_BASE_URL = "http://openaccess.thecvf.com/";
        /**
         * Number of leading characters dropped from a stored link.
         * NOTE(review): presumably the stored links start with a 6-character
         * relative prefix such as "../../" -- verify against the crawled data.
         */
        private static final int RELATIVE_PREFIX_LENGTH = 6;

        LWDao dao = new LWDao();

        public ClickServlet() {
            super();
        }

        /**
         * Looks up the papers for the clicked word and forwards them to lw.jsp.
         * A failed lookup is logged and rendered as an empty list.
         */
        protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
            String geunjian = request.getParameter("geunjian");
            System.out.println(geunjian);
            List<LunWen> guan = new ArrayList<LunWen>();
            try {
                guan = dao.login(geunjian);
            } catch (SQLException e) {
                e.printStackTrace();
            }
            for (LunWen paper : guan) {
                String link = paper.getLianjie();
                if (link != null) {
                    paper.setLianjie(toAbsoluteLink(link));
                }
            }
            request.setAttribute("list", guan);
            System.out.println(guan.size());
            request.getRequestDispatcher("lw.jsp").forward(request, response);
        }

        /**
         * Converts a stored link into an absolute URL on the paper site.
         * Links that are already absolute, or too short to carry the relative
         * prefix, are returned unchanged; the original unconditional
         * substring(6, ...) threw on short links and mangled absolute ones.
         */
        private static String toAbsoluteLink(String link) {
            if (link.startsWith("http://") || link.startsWith("https://")) {
                return link;
            }
            if (link.length() < RELATIVE_PREFIX_LENGTH) {
                return link;
            }
            return PAPER_BASE_URL + link.substring(RELATIVE_PREFIX_LENGTH);
        }

        /**
         * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
         */
        protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
            // POST is handled exactly like GET.
            doGet(request, response);
        }

    }
  • 相关阅读:
    英文、简繁体中文 IT 词汇对照表
    VB.NET 中的 As New 以及型別指定
    使用 ADO.NET 的 ExecuteScalar 方法返回单一值
    适时调整 SqlDataSource 控件的 DataSourceMode 属性
    ADO.NET 2.0 的并行控制与数据存取冲突侦测
    让 ADO.NET 2.0 的 SqlCommand 和 SqlDataAdapter 合作
    透过 Socket API 让 PDA 和远程 PC 联机
    探讨 .NET 语言的 using statement 与资源释放
    让 user control 中的 Button 也能启用验证
    dotNET 語言中可提升效能的邏輯運算子
  • 原文地址:https://www.cnblogs.com/xjmm/p/13064644.html
Copyright © 2020-2023  润新知