任务:
爬取CVPR2019年所有论文的题目,并提取题目中的关键字,做成按照热度显示大小的热词云。
代码:
爬虫:
# coding=utf-8
"""Crawl the CVPR 2019 open-access listing and store each paper in MySQL.

For every paper page linked from the listing, the title, the first link found
in the content block and the abstract are inserted into the ``data`` table of
the ``paperdata`` database.
"""
import pymysql
import requests
from lxml import etree


class Spider:
    """Crawler that copies CVPR 2019 paper metadata into the ``data`` table."""

    # Prefix used to turn the relative links found on the site into absolute URLs.
    BASE_URL = "http://openaccess.thecvf.com/"
    # Seconds to wait for the server; without a timeout a stalled request blocks forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set up the listing URL, the request headers and the database connection."""
        self.url = "http://openaccess.thecvf.com/CVPR2019.py"
        self.header = {
            "user-agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Mobile Safari/537.36"}
        self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='abc456', db='paperdata',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.html_list = []

    def getHtmlList(self):
        """Fetch the listing page and append every paper page URL to ``self.html_list``."""
        response = requests.get(self.url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
        html_body = etree.HTML(response.text)
        title = html_body.xpath("//dt[@class='ptitle']/a/@href")
        for item in title:
            self.html_list.append(self.BASE_URL + item)

    def getContent(self, url):
        """Download one paper page and insert its title, link and abstract.

        Any failure (network error, missing element, database error) is printed
        and the paper is skipped so that the crawl continues with the next URL.

        :param url: absolute URL of a single paper page.
        """
        try:
            response = requests.get(url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
            body = etree.HTML(response.text)
            # lxml returns str subclasses; convert to plain str before handing them to the driver.
            title = str(body.xpath("//div[@id='papertitle']/text()")[0])
            abstract = str(body.xpath("//div[@id='abstract']/text()")[0])
            down_url = str(body.xpath("//div[@id='content']//a/@href")[0]).replace("../../", self.BASE_URL)
            # Let the driver quote the values: titles and abstracts routinely contain
            # quote characters that would break (or inject into) a string-formatted statement.
            sql = "insert into data values(%s,%s,%s,%s)"
            self.cursor.execute(sql, (0, title, down_url, abstract))
            self.db.commit()
            print(title + "插入成功!")
        except Exception as e:
            print(e)
            # Discard the failed statement so it cannot leak into the next commit.
            try:
                self.db.rollback()
            except pymysql.MySQLError as rollback_error:
                print(rollback_error)

    def close(self):
        """Release the cursor and the database connection."""
        self.cursor.close()
        self.db.close()

    def run(self):
        """Collect all paper page URLs, then store every paper."""
        self.getHtmlList()
        for url in self.html_list:
            self.getContent(url)


if __name__ == '__main__':
    spider = Spider()
    try:
        spider.run()
    finally:
        spider.close()
package dao;

import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;

import pojo.Data;
import utils.DataSourceUtils;

/**
 * Read-only access to the {@code data} table, mapped onto {@link Data} beans.
 *
 * @author connor
 * @version 2020-04-15 10:19:06
 */
public class DataDao {

	/**
	 * Loads every row of the {@code data} table.
	 *
	 * @return all stored papers
	 * @throws SQLException if the query fails
	 */
	public List<Data> getData() throws SQLException {
		QueryRunner queryRunner = new QueryRunner(DataSourceUtils.getDataSource());
		String sql = "select * from data ";
		return queryRunner.query(sql, new BeanListHandler<Data>(Data.class));
	}

	/**
	 * Finds the papers whose {@code papername} contains the given text.
	 *
	 * @param name text to look for inside the paper title; {@code null} matches nothing
	 * @return the matching papers, empty when {@code name} is {@code null}
	 * @throws SQLException if the query fails
	 */
	public List<Data> getLink(String name) throws SQLException {
		if (name == null) {
			// Concatenating null would search for the literal text "null".
			return new ArrayList<Data>();
		}
		QueryRunner queryRunner = new QueryRunner(DataSourceUtils.getDataSource());
		String sql = "select * from data where papername like ?";
		// NOTE(review): '%' and '_' inside name still act as LIKE wildcards.
		return queryRunner.query(sql, new BeanListHandler<Data>(Data.class), "%" + name + "%");
	}
}
package pojo; public class Data { private int id; private String papername; private String paperlink; private String paperabstract; public int getId() { return id; } public void setId(int id) { this.id = id; } public String getPapername() { return papername; } public void setPapername(String papername) { this.papername = papername; } public String getPaperlink() { return paperlink; } public void setPaperlink(String paperlink) { this.paperlink = paperlink; } public String getPaperabstract() { return paperabstract; } public void setPaperabstract(String paperabstract) { this.paperabstract = paperabstract; } }
package pojo; public class Word { private String name; private int value; public String getName() { return name; } public void setName(String name) { this.name = name; } public int getValue() { return value; } public void setValue(int value) { this.value = value; } }
package service;

import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.ArrayUtils;

import dao.DataDao;
import pojo.Data;
import pojo.Word;

/**
 * Business logic behind the word cloud: word frequencies over all paper
 * titles and keyword lookup of papers.
 */
public class DataService {

	/** A word must occur more often than this to be reported. */
	private static final int MIN_OCCURRENCES_EXCLUSIVE = 1;

	/** A word must be longer than this many characters to be reported. */
	private static final int MIN_LENGTH_EXCLUSIVE = 4;

	/**
	 * Counts how often each space-separated word occurs across all paper titles
	 * and returns the words that occur more than once and are longer than four
	 * characters. Matching is case-sensitive; the order of the result is unspecified.
	 *
	 * @return one {@link Word} per qualifying word, with its occurrence count as value
	 * @throws SQLException if the papers cannot be loaded
	 */
	public List<Word> getData() throws SQLException {
		DataDao dao = new DataDao();
		List<Data> dataList = dao.getData();

		// Count while iterating instead of first gluing every title into one
		// huge, null-padded array that had to be re-copied for each paper.
		Map<String, Integer> occurrences = new HashMap<String, Integer>();
		for (Data data : dataList) {
			String title = data.getPapername();
			if (title == null) {
				continue; // a row without a title contributes no words
			}
			for (String token : title.split(" ")) {
				Integer seen = occurrences.get(token);
				occurrences.put(token, seen == null ? 1 : seen + 1);
			}
		}

		List<Word> wordList = new ArrayList<Word>();
		for (Map.Entry<String, Integer> entry : occurrences.entrySet()) {
			String token = entry.getKey();
			int count = entry.getValue();
			if (count > MIN_OCCURRENCES_EXCLUSIVE && token.length() > MIN_LENGTH_EXCLUSIVE) {
				Word word = new Word();
				word.setName(token);
				word.setValue(count);
				wordList.add(word);
			}
		}
		return wordList;
	}

	/**
	 * Looks up the papers whose title contains the given text.
	 *
	 * @param name text to search for in the paper titles
	 * @return the matching papers as returned by {@link DataDao#getLink(String)}
	 * @throws SQLException if the query fails
	 */
	public List<Data> getLink(String name) throws SQLException {
		DataDao dao = new DataDao();
		return dao.getLink(name);
	}
}
package servlet; import java.io.IOException; import java.sql.SQLException; import java.util.List; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import pojo.Data; import service.DataService; /** * Servlet implementation class ClickFunctionServlet */ @WebServlet("/clickFunction") public class ClickFunctionServlet extends HttpServlet { private static final long serialVersionUID = 1L; /** * @see HttpServlet#HttpServlet() */ public ClickFunctionServlet() { super(); // TODO Auto-generated constructor stub } /** * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) */ protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { // TODO Auto-generated method stub request.setCharacterEncoding("utf-8"); response.setContentType("text/html;charset=UTF-8"); String name = request.getParameter("name"); List<Data> dataList =null; DataService service = new DataService(); try { dataList = service.getLink(name); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } request.setAttribute("dataList", dataList); request.getRequestDispatcher("papercloud.jsp").forward(request, response); } /** * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response) */ protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { // TODO Auto-generated method stub doGet(request, response); } }
package servlet; import java.io.IOException; import java.sql.SQLException; import java.util.List; import javax.servlet.ServletException; import javax.servlet.annotation.WebServlet; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import com.google.gson.Gson; import pojo.Word; import service.DataService; /** * Servlet implementation class GetDataServlet */ @WebServlet("/getData") public class GetDataServlet extends HttpServlet { private static final long serialVersionUID = 1L; /** * @see HttpServlet#HttpServlet() */ public GetDataServlet() { super(); // TODO Auto-generated constructor stub } /** * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) */ protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { // TODO Auto-generated method stub request.setCharacterEncoding("utf-8"); response.setContentType("text/html;charset=UTF-8"); List<Word> wordList = null; DataService service = new DataService(); try { wordList = service.getData(); } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } Gson gson = new Gson(); String json = gson.toJson(wordList); response.getWriter().write(json); } /** * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response) */ protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { // TODO Auto-generated method stub doGet(request, response); } }
package utils;

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import javax.sql.DataSource;

import com.mchange.v2.c3p0.ComboPooledDataSource;

/**
 * Static access to the c3p0 connection pool plus helpers that bind one
 * connection to the current thread for the duration of a transaction.
 */
public class DataSourceUtils {

	/** Pool built from the default c3p0 configuration. */
	private static final DataSource dataSource = new ComboPooledDataSource();

	/** Connection bound to the current thread, if any. */
	private static final ThreadLocal<Connection> tl = new ThreadLocal<Connection>();

	/**
	 * @return the shared connection pool
	 */
	public static DataSource getDataSource() {
		return dataSource;
	}

	/**
	 * Returns the connection bound to the current thread, borrowing one from
	 * the pool and binding it on first use.
	 *
	 * @return the current thread's connection
	 * @throws SQLException if no connection can be obtained
	 */
	public static Connection getConnection() throws SQLException {
		Connection con = tl.get();
		if (con == null) {
			con = dataSource.getConnection();
			tl.set(con);
		}
		return con;
	}

	/**
	 * Turns off auto-commit on the current thread's connection.
	 *
	 * @throws SQLException if the connection cannot be obtained or configured
	 */
	public static void startTransaction() throws SQLException {
		Connection con = getConnection();
		if (con != null) {
			con.setAutoCommit(false);
		}
	}

	/**
	 * Rolls back the current thread's connection. Does nothing when no
	 * connection is bound, rather than opening one just to roll it back.
	 *
	 * @throws SQLException if the rollback fails
	 */
	public static void rollback() throws SQLException {
		Connection con = tl.get();
		if (con != null) {
			con.rollback();
		}
	}

	/**
	 * Commits the current thread's connection, then closes and unbinds it.
	 * The connection is closed and unbound even when the commit fails.
	 * Does nothing when no connection is bound.
	 *
	 * @throws SQLException if the commit or the close fails
	 */
	public static void commitAndRelease() throws SQLException {
		Connection con = tl.get();
		if (con == null) {
			return;
		}
		try {
			con.commit();
		} finally {
			release(con);
		}
	}

	/**
	 * Closes and unbinds the current thread's connection, so a later
	 * {@link #getConnection()} does not hand out the closed one.
	 * Does nothing when no connection is bound.
	 *
	 * @throws SQLException if the close fails
	 */
	public static void closeConnection() throws SQLException {
		Connection con = tl.get();
		if (con != null) {
			release(con);
		}
	}

	/**
	 * Closes the statement if it is not {@code null}.
	 *
	 * @param st statement to close, may be {@code null}
	 * @throws SQLException if the close fails
	 */
	public static void closeStatement(Statement st) throws SQLException {
		if (st != null) {
			st.close();
		}
	}

	/**
	 * Closes the result set if it is not {@code null}.
	 *
	 * @param rs result set to close, may be {@code null}
	 * @throws SQLException if the close fails
	 */
	public static void closeResultSet(ResultSet rs) throws SQLException {
		if (rs != null) {
			rs.close();
		}
	}

	/** Closes the connection and always clears the thread binding. */
	private static void release(Connection con) throws SQLException {
		try {
			con.close();
		} finally {
			tl.remove();
		}
	}
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Default c3p0 pool settings for the local paperdata MySQL database. -->
<c3p0-config>
	<default-config>
		<property name="user">root</property>
		<property name="password">0608</property>
		<property name="driverClass">com.mysql.jdbc.Driver</property>
		<!-- Query parameters are separated with the amp entity; a bare ampersand is not well-formed XML. -->
		<property name="jdbcUrl">jdbc:mysql://localhost:3306/paperdata?serverTimezone=GMT%2B8&amp;useUnicode=true&amp;characterEncoding=UTF-8</property>
	</default-config>
</c3p0-config>
<%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
<%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<%--
  Word-cloud page. Renders the cloud from the JSON served by /getData and,
  when reached through /clickFunction, lists the papers found in the
  request attribute "dataList".
--%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>论文云</title>
<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
<script src="./js/echarts-wordcloud.js"></script>
<script src="./js/jquery-1.11.3.min.js"></script>
<!-- Bootstrap core stylesheet -->
<link href="css/bootstrap.css" rel="stylesheet">
<!-- Bootstrap core script -->
<script src="./js/bootstrap.js"></script>
<style>
html, body, #main {
	width: 100%;
	height: 100%;
	margin: 0;
}
</style>
</head>
<body>
	<div id="main"></div>
	<div>
		<table class="table table-hover">
			<thead>
				<tr>
					<td style="font-size: 20px;">论文链接</td>
				</tr>
			</thead>
			<tbody>
				<c:forEach items="${dataList}" var="data" varStatus="vs">
					<tr>
						<%-- c:out escapes the stored values so crawled text cannot inject markup. --%>
						<td><a href="<c:out value='${data.paperlink}'/>"><c:out value='${data.papername}'/></a></td>
					</tr>
				</c:forEach>
			</tbody>
		</table>
	</div>
	<script>
		var chart = echarts.init(document.getElementById('main'));
		var postURL = "/PaperData/getData";

		// One random colour channel, capped at 160 so words stay dark enough to read.
		function randomChannel() {
			return Math.round(Math.random() * 160);
		}

		// Builds the ECharts option object for the given list of {name, value} words.
		function buildOption(words) {
			return {
				tooltip : {},
				series : [ {
					type : 'wordCloud',
					gridSize : 2,
					sizeRange : [ 20, 50 ],
					rotationRange : [ -90, 90 ],
					shape : 'pentagon',
					width : 800,
					height : 600,
					drawOutOfBound : false,
					textStyle : {
						normal : {
							color : function() {
								return 'rgb(' + [ randomChannel(), randomChannel(), randomChannel() ].join(',') + ')';
							}
						},
						emphasis : {
							shadowBlur : 10,
							shadowColor : '#333'
						}
					},
					data : words
				} ]
			};
		}

		// Load the words asynchronously and draw once they arrive, instead of
		// blocking the page with a synchronous request.
		$.post(postURL, {}, function(rs) {
			var dataList = (typeof rs === 'string' ? JSON.parse(rs) : rs) || [];
			var mydata = [];
			for (var i = 0; i < dataList.length; i++) {
				mydata.push({
					name : dataList[i].name,
					value : dataList[i].value
				});
			}
			chart.setOption(buildOption(mydata));
		});

		chart.on('click', function(params) {
			// Encode the word so characters such as '&', '#' or '+' survive the query string.
			window.location.href = "clickFunction?name=" + encodeURIComponent(params.name);
		});
	</script>
</body>
</html>
<!DOCTYPE html>
<!-- Static version of the word-cloud page: renders the cloud from the JSON
     served by /getData and shows a placeholder row in the link table. -->
<html>
<head>
<meta charset="UTF-8">
<title>论文云</title>
<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
<script src="./js/echarts-wordcloud.js"></script>
<script src="./js/jquery-1.11.3.min.js"></script>
<!-- Bootstrap core stylesheet -->
<link href="css/bootstrap.css" rel="stylesheet">
<!-- Bootstrap core script -->
<script src="./js/bootstrap.js"></script>
<style>
html, body, #main {
	width: 100%;
	height: 100%;
	margin: 0;
}
</style>
</head>
<body>
	<div id="main"></div>
	<div>
		<table class="table table-hover">
			<thead>
				<tr>
					<td style="font-size: 20px;">论文链接</td>
				</tr>
			</thead>
			<tbody>
				<tr>
					<td><a>www.baidu.com</a></td>
				</tr>
			</tbody>
		</table>
	</div>
	<script>
		var chart = echarts.init(document.getElementById('main'));
		var postURL = "/PaperData/getData";

		// One random colour channel, capped at 160 so words stay dark enough to read.
		function randomChannel() {
			return Math.round(Math.random() * 160);
		}

		// Builds the ECharts option object for the given list of {name, value} words.
		function buildOption(words) {
			return {
				tooltip : {},
				series : [ {
					type : 'wordCloud',
					gridSize : 2,
					sizeRange : [ 20, 50 ],
					rotationRange : [ -90, 90 ],
					shape : 'pentagon',
					width : 800,
					height : 600,
					drawOutOfBound : false,
					textStyle : {
						normal : {
							color : function() {
								return 'rgb(' + [ randomChannel(), randomChannel(), randomChannel() ].join(',') + ')';
							}
						},
						emphasis : {
							shadowBlur : 10,
							shadowColor : '#333'
						}
					},
					data : words
				} ]
			};
		}

		// Load the words asynchronously and draw once they arrive, instead of
		// blocking the page with a synchronous request.
		$.post(postURL, {}, function(rs) {
			var dataList = (typeof rs === 'string' ? JSON.parse(rs) : rs) || [];
			var mydata = [];
			for (var i = 0; i < dataList.length; i++) {
				mydata.push({
					name : dataList[i].name,
					value : dataList[i].value
				});
			}
			chart.setOption(buildOption(mydata));
		});

		chart.on('click', function(params) {
			// Encode the word so characters such as '&', '#' or '+' survive the query string.
			window.location.href = "clickFunction?name=" + encodeURIComponent(params.name);
		});
	</script>
</body>
</html>
截图: