爬取CVPR2019年所有论文的题目,并提取题目中的关键字,做成按照热度显示大小的热词云。
代码:
# coding=utf-8
import pymysql
import requests
from lxml import etree
class Spider:
    """Crawl the CVPR 2019 open-access listing and store each paper in MySQL.

    For every paper linked from the listing page, the title, the abstract and
    the first link found inside the ``content`` div are inserted as one row of
    the ``data`` table.
    """

    # Prefix used to turn the site-relative links found in the pages into absolute URLs.
    BASE_URL = "http://openaccess.thecvf.com/"
    # Seconds to wait on a request; without a timeout a stalled server blocks the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set up the listing URL, the request headers and the MySQL connection."""
        self.url = "http://openaccess.thecvf.com/CVPR2019.py"
        self.header = {
            "user-agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Mobile Safari/537.36"}
        # NOTE(review): database credentials are hard-coded here; consider moving them to configuration.
        self.db = pymysql.connect(host='localhost', port=3306, user='root', passwd='abc456', db='paperdata',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.html_list = []

    def getHtmlList(self):
        """Fetch the listing page and append every paper page URL to ``self.html_list``."""
        response = requests.get(self.url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
        html_body = etree.HTML(response.text)
        for href in html_body.xpath("//dt[@class='ptitle']/a/@href"):
            self.html_list.append(self.BASE_URL + href)

    def getContent(self, url):
        """Download one paper page and insert its title, link and abstract.

        Any failure (network error, missing element, database error) is
        printed together with the URL and the paper is skipped, so a single
        bad page does not abort the crawl.
        """
        try:
            response = requests.get(url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
            body = etree.HTML(response.text)
            title = str(body.xpath("//div[@id='papertitle']/text()")[0])
            abstract = str(body.xpath("//div[@id='abstract']/text()")[0])
            down_url = str(body.xpath("//div[@id='content']//a/@href")[0]).replace("../../", self.BASE_URL)
            # Let the driver quote the values: titles and abstracts routinely contain
            # quote characters, which broke (and made injectable) the string-built SQL.
            self.cursor.execute("insert into data values(%s,%s,%s,%s)", (0, title, down_url, abstract))
            self.db.commit()
            # Report success only once the row has actually been committed.
            print(title + "插入成功!")
        except Exception as e:
            print(url, e)

    def run(self):
        """Collect all paper URLs, process each one, then release the database handles."""
        try:
            self.getHtmlList()
            for url in self.html_list:
                self.getContent(url)
        finally:
            self.cursor.close()
            self.db.close()
if __name__ == '__main__':
    # Script entry point: build the spider and run the whole crawl.
    Spider().run()
DataDao.java
package dao;
import java.sql.SQLException;
import java.util.List;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import pojo.Data;
import utils.DataSourceUtils;
/**
 * Read-only access to the {@code data} table, mapping each row onto a {@link Data} bean.
 */
public class DataDao {

    /**
     * Loads every row of the {@code data} table.
     *
     * @return one {@link Data} bean per row
     * @throws SQLException if the query fails
     */
    public List<Data> getData() throws SQLException {
        return newRunner().query("select * from data ", rowMapper());
    }

    /**
     * Finds the rows whose {@code papername} contains the given text.
     *
     * @param name text to look for; it is wrapped in {@code %} wildcards for the LIKE match
     * @return the matching rows as {@link Data} beans
     * @throws SQLException if the query fails
     */
    public List<Data> getLink(String name) throws SQLException {
        final String pattern = "%" + name + "%";
        return newRunner().query("select * from data where papername like ?", rowMapper(), pattern);
    }

    /** Creates a query runner on the shared data source. */
    private static QueryRunner newRunner() {
        return new QueryRunner(DataSourceUtils.getDataSource());
    }

    /** Creates the handler that turns a result set into a list of {@link Data} beans. */
    private static BeanListHandler<Data> rowMapper() {
        return new BeanListHandler<Data>(Data.class);
    }
}
Data.java
package pojo;
/**
 * Bean holding one paper record: numeric id, paper name, paper link and abstract.
 */
public class Data {

    /** Numeric identifier of the record. */
    private int id;
    /** Name (title) of the paper. */
    private String papername;
    /** Link associated with the paper. */
    private String paperlink;
    /** Abstract text of the paper. */
    private String paperabstract;

    // ---- readers ----

    /** @return the record id */
    public int getId() {
        return this.id;
    }

    /** @return the paper name */
    public String getPapername() {
        return this.papername;
    }

    /** @return the paper link */
    public String getPaperlink() {
        return this.paperlink;
    }

    /** @return the paper abstract */
    public String getPaperabstract() {
        return this.paperabstract;
    }

    // ---- writers ----

    /** @param id the record id to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @param papername the paper name to store */
    public void setPapername(String papername) {
        this.papername = papername;
    }

    /** @param paperlink the paper link to store */
    public void setPaperlink(String paperlink) {
        this.paperlink = paperlink;
    }

    /** @param paperabstract the paper abstract to store */
    public void setPaperabstract(String paperabstract) {
        this.paperabstract = paperabstract;
    }
}
Word.java
package pojo;
/**
 * One entry of the word cloud: a word ({@code name}) and the number attached to it ({@code value}).
 */
public class Word {

    /** The word itself. */
    private String name;
    /** The number stored for the word. */
    private int value;

    /** @return the word */
    public String getName() {
        return this.name;
    }

    /** @return the number stored for the word */
    public int getValue() {
        return this.value;
    }

    /** @param name the word to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @param value the number to store for the word */
    public void setValue(int value) {
        this.value = value;
    }
}
DataService.java
package service;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import dao.DataDao;
import pojo.Data;
import pojo.Word;
/**
 * Business logic on top of {@link DataDao}: keyword statistics for the word cloud
 * and keyword-based paper lookup.
 */
public class DataService {

    /** A word must be longer than four characters to be reported. */
    private static final int MIN_WORD_LENGTH = 5;

    /** A word must occur more than once across all titles to be reported. */
    private static final int MIN_OCCURRENCES = 2;

    /**
     * Counts how often each space-separated word occurs across all paper names and
     * returns the words that are long enough and frequent enough.
     *
     * <p>Words are tallied directly in a map. The earlier approach of concatenating
     * every title into one pre-sized 100000-slot array re-copied that array for each
     * paper and tallied its null slots as well.
     *
     * @return one {@link Word} per qualifying word, carrying its occurrence count;
     *         the order of the list is unspecified
     * @throws SQLException if the papers cannot be loaded
     */
    public List<Word> getData() throws SQLException {
        List<Data> dataList = new DataDao().getData();

        HashMap<String, Integer> occurrences = new HashMap<String, Integer>();
        for (Data data : dataList) {
            String title = data.getPapername();
            if (title == null) {
                // A row without a paper name contributes no words.
                continue;
            }
            for (String token : title.split(" ")) {
                if (token.length() < MIN_WORD_LENGTH) {
                    // Too short to ever be reported, so it does not need counting.
                    continue;
                }
                Integer seen = occurrences.get(token);
                occurrences.put(token, seen == null ? 1 : seen + 1);
            }
        }

        List<Word> wordList = new ArrayList<Word>();
        for (String token : occurrences.keySet()) {
            int count = occurrences.get(token);
            if (count >= MIN_OCCURRENCES) {
                Word word = new Word();
                word.setName(token);
                word.setValue(count);
                wordList.add(word);
            }
        }
        return wordList;
    }

    /**
     * Looks up the papers whose name contains the given keyword.
     *
     * @param name keyword to search for
     * @return the matching papers
     * @throws SQLException if the query fails
     */
    public List<Data> getLink(String name) throws SQLException {
        return new DataDao().getLink(name);
    }
}
ClickFunctionServlet.java
package servlet;
import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import pojo.Data;
import service.DataService;
/**
* Servlet implementation class ClickFunctionServlet
*/
@WebServlet("/clickFunction")
public class ClickFunctionServlet extends HttpServlet {
private static final long serialVersionUID = 1L;
/**
* @see HttpServlet#HttpServlet()
*/
public ClickFunctionServlet() {
super();
// TODO Auto-generated constructor stub
}
/**
* @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
*/
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
// TODO Auto-generated method stub
request.setCharacterEncoding("utf-8");
response.setContentType("text/html;charset=UTF-8");
String name = request.getParameter("name");
List<Data> dataList =null;
DataService service = new DataService();
try {
dataList = service.getLink(name);
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
request.setAttribute("dataList", dataList);
request.getRequestDispatcher("papercloud.jsp").forward(request, response);
}
/**
* @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
*/
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
// TODO Auto-generated method stub
doGet(request, response);
}
}
GetDataServlet.java
package servlet;
import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import com.google.gson.Gson;
import pojo.Word;
import service.DataService;
/**
* Servlet implementation class GetDataServlet
*/
@WebServlet("/getData")
public class GetDataServlet extends HttpServlet {
private static final long serialVersionUID = 1L;
/**
* @see HttpServlet#HttpServlet()
*/
public GetDataServlet() {
super();
// TODO Auto-generated constructor stub
}
/**
* @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
*/
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
// TODO Auto-generated method stub
request.setCharacterEncoding("utf-8");
response.setContentType("text/html;charset=UTF-8");
List<Word> wordList = null;
DataService service = new DataService();
try {
wordList = service.getData();
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
Gson gson = new Gson();
String json = gson.toJson(wordList);
response.getWriter().write(json);
}
/**
* @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
*/
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
// TODO Auto-generated method stub
doGet(request, response);
}
}
DataSourceUtils.java
package utils;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import javax.sql.DataSource;
import com.mchange.v2.c3p0.ComboPooledDataSource;
/**
 * Static helpers around the shared c3p0 connection pool, plus a per-thread
 * connection for running several statements in one transaction.
 */
public class DataSourceUtils {
    private static final DataSource dataSource = new ComboPooledDataSource();
    /** Connection bound to the current thread, if any. */
    private static final ThreadLocal<Connection> tl = new ThreadLocal<Connection>();

    /**
     * @return the shared pooled data source
     */
    public static DataSource getDataSource() {
        return dataSource;
    }

    /**
     * Returns the connection bound to the current thread, borrowing one from the
     * pool and binding it first if the thread has none.
     *
     * @return the current thread's connection
     * @throws SQLException if a connection cannot be obtained
     */
    public static Connection getConnection() throws SQLException {
        Connection con = tl.get();
        if (con == null) {
            con = dataSource.getConnection();
            tl.set(con);
        }
        return con;
    }

    /**
     * Switches the current thread's connection to manual commit.
     *
     * @throws SQLException if the connection cannot be obtained or configured
     */
    public static void startTransaction() throws SQLException {
        Connection con = getConnection();
        if (con != null) {
            con.setAutoCommit(false);
        }
    }

    /**
     * Rolls back the current thread's connection. Does nothing when the thread
     * has no bound connection (no connection is borrowed just to roll it back).
     *
     * @throws SQLException if the rollback fails
     */
    public static void rollback() throws SQLException {
        Connection con = tl.get();
        if (con != null) {
            con.rollback();
        }
    }

    /**
     * Commits the current thread's connection, closes it and unbinds it.
     * The connection is closed and unbound even when the commit fails, so a
     * failed commit cannot leak it. Does nothing when no connection is bound.
     *
     * @throws SQLException if the commit or the close fails
     */
    public static void commitAndRelease() throws SQLException {
        Connection con = tl.get();
        if (con == null) {
            return;
        }
        try {
            con.commit();
        } finally {
            release(con);
        }
    }

    /**
     * Closes the current thread's connection and unbinds it, so that the next
     * {@link #getConnection()} call does not hand out the closed connection.
     * Does nothing when no connection is bound.
     *
     * @throws SQLException if the close fails
     */
    public static void closeConnection() throws SQLException {
        Connection con = tl.get();
        if (con != null) {
            release(con);
        }
    }

    /**
     * Closes the statement if it is not null.
     *
     * @param st statement to close, may be null
     * @throws SQLException if the close fails
     */
    public static void closeStatement(Statement st) throws SQLException {
        if (st != null) {
            st.close();
        }
    }

    /**
     * Closes the result set if it is not null.
     *
     * @param rs result set to close, may be null
     * @throws SQLException if the close fails
     */
    public static void closeResultSet(ResultSet rs) throws SQLException {
        if (rs != null) {
            rs.close();
        }
    }

    /** Closes the connection and always removes the thread binding. */
    private static void release(Connection con) throws SQLException {
        try {
            con.close();
        } finally {
            tl.remove();
        }
    }
}
c3p0-config.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Default c3p0 pool settings for the local "paperdata" MySQL database. -->
<!-- NOTE(review): the account and password are stored in plain text here. -->
<c3p0-config>
    <default-config>
        <property name="user">root</property>
        <property name="password">0608</property>
        <property name="driverClass">com.mysql.jdbc.Driver</property>
        <!-- The parameter separators must be written as &amp;; a bare ampersand makes this file ill-formed XML. -->
        <property name="jdbcUrl">jdbc:mysql://localhost:3306/paperdata?serverTimezone=GMT%2B8&amp;useUnicode=true&amp;characterEncoding=UTF-8</property>
    </default-config>
</c3p0-config>
papercloud.jsp
<%@ page language="java" contentType="text/html; charset=UTF-8"
pageEncoding="UTF-8"%>
<%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>论文云</title>
<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
<script src="./js/echarts-wordcloud.js"></script>
<script src="./js/jquery-1.11.3.min.js"></script>
<!-- 引入Bootstrap核心样式文件 -->
<link href="css/bootstrap.css" rel="stylesheet">
<!-- 引入BootStrap核心js文件 -->
<script src="./js/bootstrap.js"></script>
<style>
/* Let the chart container fill the viewport. The property name "width" was
   missing, which left a bare "100%;" that is not a valid declaration. */
html, body, #main {
    width: 100%;
    height: 100%;
    margin: 0;
}
</style>
</head>
<body>
<div id="main"></div>
<div>
    <%-- One row per entry of the "dataList" request attribute: the paper name, linked to the paper. --%>
    <table class="table table-hover">
        <thead>
            <tr>
                <td style="font-size: 20px;">论文链接</td>
            </tr>
        </thead>
        <tbody>
            <c:forEach items="${dataList}" var="data">
                <tr>
                    <%-- c:out escapes the database text so it cannot inject markup into the page. --%>
                    <td><a href="<c:out value='${data.paperlink}'/>"><c:out value="${data.papername}"/></a></td>
                </tr>
            </c:forEach>
        </tbody>
    </table>
</div>
<script>
var chart = echarts.init(document.getElementById('main'));
var postURL = "/PaperData/getData";
var mydata = new Array();
$.ajaxSettings.async = false;
$.post(postURL, {}, function(rs) {
var dataList = JSON.parse(rs);
for (var i = 0; i < dataList.length; i++) {
var d = {};
d['name'] = dataList[i].name;
d['value'] = dataList[i].value;
mydata.push(d);
}
});
$.ajaxSettings.async = true;
var option = {
tooltip : {},
series : [ {
type : 'wordCloud',
gridSize : 2,
sizeRange : [ 20, 50 ],
rotationRange : [ -90, 90 ],
shape : 'pentagon',
width : 800,
height : 600,
drawOutOfBound : false,
textStyle : {
normal : {
color : function() {
return 'rgb('
+ [ Math.round(Math.random() * 160),
Math.round(Math.random() * 160),
Math.round(Math.random() * 160) ]
.join(',') + ')';
}
},
emphasis : {
shadowBlur : 10,
shadowColor : '#333'
}
},
data : mydata
} ]
};
chart.setOption(option);
chart.on('click', function(params) {
var url = "clickFunction?name=" + params.name;
window.location.href = url;
});
</script>
</body>
</html>
papercloud.html
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>论文云</title>
<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
<script src="./js/echarts-wordcloud.js"></script>
<script src="./js/jquery-1.11.3.min.js"></script>
<!-- 引入Bootstrap核心样式文件 -->
<link href="css/bootstrap.css" rel="stylesheet">
<!-- 引入BootStrap核心js文件 -->
<script src="./js/bootstrap.js"></script>
<style>
/* Let the chart container fill the viewport. The property name "width" was
   missing, which left a bare "100%;" that is not a valid declaration. */
html, body, #main {
    width: 100%;
    height: 100%;
    margin: 0;
}
</style>
</head>
<body>
<div id="main"></div>
<div>
<!-- Static results table: a header cell and one hard-coded placeholder row. -->
<table class="table table-hover">
<thead>
<tr>
<td style="font-size: 20px;">论文链接</td>
</tr>
</thead>
<tbody>
<tr>
<!-- Placeholder text only: the anchor has no href, so it does not navigate anywhere. -->
<td><a>www.baidu.com</a></td>
</tr>
</tbody>
</table>
</div>
<script>
var chart = echarts.init(document.getElementById('main'));
var postURL = "/PaperData/getData";
var mydata = new Array();
$.ajaxSettings.async = false;
$.post(postURL, {}, function(rs) {
var dataList = JSON.parse(rs);
for (var i = 0; i < dataList.length; i++) {
var d = {};
d['name'] = dataList[i].name;
d['value'] = dataList[i].value;
mydata.push(d);
}
});
$.ajaxSettings.async = true;
var option = {
tooltip : {},
series : [ {
type : 'wordCloud',
gridSize : 2,
sizeRange : [ 20, 50 ],
rotationRange : [ -90, 90 ],
shape : 'pentagon',
width : 800,
height : 600,
drawOutOfBound : false,
textStyle : {
normal : {
color : function() {
return 'rgb('
+ [ Math.round(Math.random() * 160),
Math.round(Math.random() * 160),
Math.round(Math.random() * 160) ]
.join(',') + ')';
}
},
emphasis : {
shadowBlur : 10,
shadowColor : '#333'
}
},
data : mydata
} ]
};
chart.setOption(option);
chart.on('click', function(params) {
var url = "clickFunction?name=" + params.name;
window.location.href = url;
});
</script>
</body>
</html>