一。 爬取博客信息
使用jsoup可以下载网页,并使用类似jquery选择器的语法来解析html。这里先打开 http://blog.csdn.net/liaomin416100569 ,右键查看源代码,可以看到分类部分的源代码为:
声明:该内容为个人学习使用 不做其他用途 如果侵权 请联系以下邮箱 416100569@qq.com
<div id="panel_Category" class="panel">
<ul class="panel_head"><span>文章分类</span></ul>
<ul class="panel_body">
<li>
<a href="/liaomin416100569/article/category/744627" onclick="_gaq.push(['_trackEvent','function', 'onclick', 'blog_articles_wenzhangfenlei']); ">activeMQ</a><span>(3)</span>
</li>
<li>
<a href="/liaomin416100569/article/category/643417" onclick="_gaq.push(['_trackEvent','function', 'onclick', 'blog_articles_wenzhangfenlei']); ">ant</a><span>(3)</span>
</li>
<!-- 其余分类省略 -->
</ul>
</div>
获取所有的分类 通过jquery选择器 #panel_Category a 即可获取所有的超链接 通过 text()获取分类名称 通过 attr("href")获取分类url
同理 获取分类下的所有文章的标题 描述 内容 最后更新时间等 最后通过lucene生成索引 代码如下
》分类实体类
package cn.et.spilder;
/**
 * Simple mutable bean describing one blog category.
 * All three properties are plain strings and default to {@code null}.
 */
public class Category {

    /** Category name. */
    private String name;

    /** Article-count label of the category, kept as text. */
    private String count;

    /** Link to the category page. */
    private String url;

    /** @return the category name, or {@code null} if never set */
    public String getName() {
        return this.name;
    }

    /** @return the article-count label, or {@code null} if never set */
    public String getCount() {
        return this.count;
    }

    /** @return the category link, or {@code null} if never set */
    public String getUrl() {
        return this.url;
    }

    /** @param name the category name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @param count the article-count label to store */
    public void setCount(String count) {
        this.count = count;
    }

    /** @param url the category link to store */
    public void setUrl(String url) {
        this.url = url;
    }
}
》文章实体
package cn.et.spilder;
public class Arcticle {
private String title;
private String description;
private String url;
private String createTime;
private String filePath;
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public String getCreateTime() {
return createTime;
}
public void setCreateTime(String createTime) {
this.createTime = createTime;
}
public String getFilePath() {
return filePath;
}
public void setFilePath(String filePath) {
this.filePath = filePath;
}
}
》lucene工具类
package cn.et.index;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.jsoup.Jsoup;
import org.wltea.analyzer.lucene.IKAnalyzer;
import cn.et.spilder.Arcticle;
import cn.et.spilder.CsdnSpilder;
/**
* lucene操作类
*时间:2017-6-30 上午09:14:48
*作者: LM
*联系方式:973465719@qq.com
*
*/
public class LuceneUtils {
//定义IK分词器
static Analyzer analyzer = new IKAnalyzer();
//批量处理内存存储的最大docuemnt个数
final static int dealCount=50;
//html格式化器 在关键字前后加上红色字体标签
static SimpleHTMLFormatter htmlFormatter=new SimpleHTMLFormatter("<font color="red">","</font>");
/**
* 关闭writer对象
* @param iw
* @throws IOException
*/
public static void close(IndexWriter iw) throws IOException{
iw.close();
}
public static IndexWriter getRamWriter(RAMDirectory ramDirectory) throws IOException{
IndexWriterConfig iwc=new IndexWriterConfig(Version.LUCENE_47,analyzer);
IndexWriter iw=new IndexWriter(ramDirectory, iwc);
return iw;
}
public static IndexWriter getWriter(String dir) throws IOException{
// FSDirectory fsDirectory= FSDirectory.open(Paths.get(dir));
FSDirectory fsDirectory= FSDirectory.open(new File(dir));
IndexWriterConfig iwc=new IndexWriterConfig(Version.LUCENE_47,analyzer);
IndexWriter iw=new IndexWriter(fsDirectory, iwc);
return iw;
}
public static List<Arcticle> search(String indexDir,String keys) throws Exception{
FSDirectory fsDirectory= FSDirectory.open(new File(indexDir));
IndexSearcher searcher=new IndexSearcher(DirectoryReader.open(fsDirectory));
MultiFieldQueryParser qp=new MultiFieldQueryParser(Version.LUCENE_47, new String[]{"title","content"}, analyzer);
Query query=qp.parse(keys);
//初始化高亮器
Highlighter high=new Highlighter(htmlFormatter, new QueryScorer(query));
TopDocs td=searcher.search(query, 10);
ScoreDoc[] sd=td.scoreDocs;
List<Arcticle> arcList=new ArrayList<Arcticle>();
for(ScoreDoc ss:sd){
Document doc=searcher.doc(ss.doc);
Arcticle arc=new Arcticle();
arc.setTitle(doc.getField("title").stringValue());
TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(doc.get("desc")));
String str = high.getBestFragment(tokenStream, doc.get("desc"));
arc.setDescription(str);
arc.setUrl(CsdnSpilder.rootDir+doc.getField("url").stringValue());
arc.setCreateTime(doc.getField("createTime").stringValue());
arcList.add(arc);
}
return arcList;
}
/**
* 优化方式 先将索引写入内存 到一定量后写入磁盘
* @param dir
* @param arc
* @throws IOException
*/
public static void indexs(String dir,List<Arcticle> arc) throws IOException{
RAMDirectory ramDirectory=new RAMDirectory();
IndexWriter ramWriter=getRamWriter(ramDirectory);
new File(dir).mkdirs();
//索引文件存在于目录中 IndexWriter只是个写入对象
IndexWriter iw=getWriter(dir);
for(int i=1;i<=arc.size();i++){
Arcticle srcTmp=arc.get(i-1);
if(i%dealCount==0 || i==arc.size()){
//必须关闭writer才能将目录索引数据写入到其他的writer中
ramWriter.commit();
ramWriter.close();
iw.addIndexes(ramDirectory);
if(i<arc.size())
ramWriter=getRamWriter(ramDirectory);
}else{
index(ramWriter,srcTmp);
}
}
iw.commit();
iw.close();
}
/**
* 直接写入磁盘
* @param writer
* @param arc
* @return
* @throws IOException
*/
public static Document index(IndexWriter writer,Arcticle arc) throws IOException{
Document doc=new Document();
doc.add(new TextField("title", arc.getTitle(), Store.YES));
doc.add(new TextField("url", arc.getUrl(), Store.YES));
doc.add(new TextField("createTime", arc.getCreateTime(), Store.YES));
doc.add(new TextField("desc", arc.getDescription(), Store.YES));
doc.add(new TextField("content", Jsoup.parse(FileUtils.readFileToString(new File(arc.getFilePath()))).text(), Store.YES));
writer.addDocument(doc);
return doc;
}
}
》爬取类
package cn.et.spilder;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import cn.et.index.LuceneUtils;
/**
 * CSDN spider. The selectors used here were derived by inspecting the HTML of
 * the blog pages by hand.
 * Created: 2017-6-29 10:42:47
 * Author: LM
 * Contact: 973465719@qq.com
 */
public class CsdnSpilder {
    // directory the downloaded files are stored in
    public final static String SAVED_DIR = "F:/mycsdn/";
    // root address of the CSDN blog site
    public final static String rootDir = "http://blog.csdn.net/";
    // user name whose blog is crawled
    public final static String userName = "liaomin416100569";
    // path separator
    public final static String separatorChar = "/";
    // index directory
    public final static String indexDir = "F:/myindex/";
    // logger
    public final static Logger logger = Logger.getLogger(CsdnSpilder.class);
    // how often a page download is attempted before giving up
    private final static int MAX_FETCH_ATTEMPTS = 5;
    // pause between two download attempts
    private final static long RETRY_DELAY_MILLIS = 1000L;
    // extracts the page count from pager text such as "共4页"
    private final static java.util.regex.Pattern PAGE_COUNT_PATTERN =
            java.util.regex.Pattern.compile("共\\s*(\\d+)\\s*页");

    /**
     * Entry point of the crawler: reads all categories of the user's blog,
     * downloads the articles of each category and indexes them.
     *
     * @param args unused
     * @throws IOException if saving or indexing an article fails
     */
    public static void main(String[] args) throws IOException {
        logger.debug("开始处理:" + rootDir + userName);
        Document doc = getDoc(rootDir + userName);
        // read all categories
        List<Category> category = getCategory(doc);
        logger.debug("获取到类型个数为::" + category.size());
        for (Category tmCate : category) {
            String curl = tmCate.getUrl();
            logger.debug("开始处理文章分类:" + tmCate.getName() + ",路径:" + curl + ",文章数:" + tmCate.getCount());
            // create the target directory
            new File(SAVED_DIR + curl).mkdirs();
            List<Arcticle> arcList = getArcticle(curl);
            logger.debug("开始索引");
            LuceneUtils.indexs(indexDir, arcList);
            logger.debug("处理完成");
        }
    }

    /**
     * Downloads and parses a page, retrying on I/O failures.
     * At most {@code MAX_FETCH_ATTEMPTS} attempts are made, with a short pause
     * in between, so a permanently failing address can no longer spin forever.
     *
     * @param path absolute URL of the page
     * @return the parsed document
     * @throws IllegalStateException if the page could not be fetched
     */
    public static Document getDoc(String path) {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
            try {
                return Jsoup.connect(path).get();
            } catch (IOException e) {
                lastFailure = e;
                logger.warn("Fetch failed (attempt " + attempt + "/" + MAX_FETCH_ATTEMPTS + "): " + path, e);
            }
            if (attempt < MAX_FETCH_ATTEMPTS) {
                try {
                    Thread.sleep(RETRY_DELAY_MILLIS);
                } catch (InterruptedException ie) {
                    // keep the interrupt flag and stop retrying
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
        throw new IllegalStateException("Unable to fetch " + path, lastFailure);
    }

    /**
     * Manual check: crawls and indexes a single category and prints the number
     * of articles found.
     *
     * @throws IOException if saving or indexing an article fails
     */
    @Test
    public void testArcticle() throws IOException {
        String curl = "/liaomin416100569/article/category/650802";
        // create the target directory
        new File(SAVED_DIR + curl).mkdirs();
        List<Arcticle> arcList = getArcticle(curl);
        LuceneUtils.indexs(indexDir, arcList);
        System.out.println(arcList.size());
    }

    /**
     * Reads all categories from the links inside {@code #panel_Category}.
     * The count is the text of the element following each link; it is empty
     * when there is no such element.
     *
     * @param doc parsed blog home page
     * @return the categories in document order, never {@code null}
     * @throws IOException declared for compatibility with existing callers
     */
    public static List<Category> getCategory(Document doc) throws IOException {
        Elements links = doc.select("#panel_Category a");
        List<Category> cates = new ArrayList<Category>();
        for (Element link : links) {
            Category c = new Category();
            c.setName(link.text().trim());
            Element countElement = link.nextElementSibling();
            c.setCount(countElement == null ? "" : countElement.text());
            c.setUrl(link.attr("href"));
            cates.add(c);
        }
        return cates;
    }

    /**
     * Downloads all articles of a category, following the pager when present.
     * Not recommended for categories with a very large number of articles.
     *
     * @param typeUrl category path relative to {@code rootDir}
     * @return all articles of the category
     * @throws IOException if saving an article fails
     */
    public static List<Arcticle> getArcticle(String typeUrl) throws IOException {
        Document doc = getDoc(rootDir + typeUrl);
        // a pager element is only present when the category has several pages
        Elements pager = doc.select("#papelist");
        int totalPage = pager.isEmpty() ? 1 : parseTotalPages(pager.get(0).text());
        List<Arcticle> arcs = new ArrayList<Arcticle>();
        for (int i = 1; i <= totalPage; i++) {
            arcs.addAll(getArcticle(typeUrl, i));
        }
        return arcs;
    }

    /** Reads the page count out of the pager text; falls back to a single page when it cannot be found. */
    private static int parseTotalPages(String pagerText) {
        java.util.regex.Matcher matcher = PAGE_COUNT_PATTERN.matcher(pagerText);
        if (matcher.find()) {
            try {
                return Integer.parseInt(matcher.group(1));
            } catch (NumberFormatException e) {
                logger.warn("Page count out of range in pager text: " + pagerText, e);
                return 1;
            }
        }
        logger.warn("No page count found in pager text: " + pagerText);
        return 1;
    }

    /**
     * Downloads the articles listed on one page of a category and saves each
     * of them below {@code SAVED_DIR}.
     *
     * @param typeUrl category path relative to {@code rootDir}
     * @param page    1-based page number
     * @return the articles found on that page
     * @throws IOException if saving an article fails
     */
    public static List<Arcticle> getArcticle(String typeUrl, int page) throws IOException {
        // address of the requested page
        String pageUrl = rootDir + typeUrl + separatorChar + page;
        // directory the articles of this page are saved in
        String destPath = SAVED_DIR + typeUrl + separatorChar + page;
        new File(destPath).mkdirs();
        Document doc = getDoc(pageUrl);
        // one .link_title element per article
        Elements titles = doc.select(".link_title");
        List<Arcticle> arcs = new ArrayList<Arcticle>();
        for (Element titleElement : titles) {
            if (titleElement.children().isEmpty()) {
                logger.warn("Skipping title element without link on " + pageUrl);
                continue;
            }
            Element link = titleElement.child(0);
            String url = link.attr("href");
            String title = link.text();
            Arcticle a = new Arcticle();
            a.setUrl(url);
            a.setTitle(title);
            // the description element follows the grandparent of the title element
            Element parent = titleElement.parent();
            Element header = parent == null ? null : parent.parent();
            Element descElement = header == null ? null : header.nextElementSibling();
            String description = descElement == null ? "" : descElement.text();
            a.setDescription(description);
            // the element after the description carries the post date
            Element timeElement = descElement == null ? null : descElement.nextElementSibling();
            String createTime = timeElement == null ? "" : timeElement.select(".link_postdate").text();
            a.setCreateTime(createTime);
            logger.debug(title + url + description + createTime);
            // save the article and remember where it was stored
            a.setFilePath(saveFile(destPath, rootDir + url, a));
            arcs.add(a);
        }
        return arcs;
    }

    /**
     * Saves the HTML of an article as {@code <id>.html} together with an
     * {@code <id>.ini} properties file holding its metadata.
     * The page is only downloaded when there is no saved copy yet, the saved
     * metadata is missing, or the saved creation time is older than the one of
     * the given article.
     *
     * @param destDir target directory
     * @param htmlUrl absolute URL of the article
     * @param arc     article metadata taken from the list page
     * @return path of the saved HTML file
     * @throws IOException if reading the metadata or writing the files fails
     */
    public static String saveFile(String destDir, String htmlUrl, Arcticle arc) throws IOException {
        String fileName = htmlUrl.substring(htmlUrl.lastIndexOf("/") + 1);
        String file = destDir + separatorChar + fileName + ".html";
        File hfile = new File(file);
        File metaFile = new File(destDir + separatorChar + fileName + ".ini");
        String currentTime = nullToEmpty(arc.getCreateTime());
        boolean ifUpdate = true;
        if (hfile.exists()) {
            String savedTime = readSavedCreateTime(metaFile);
            // a saved time older than the crawled one means the article was modified
            ifUpdate = savedTime == null || savedTime.compareTo(currentTime) < 0;
        }
        if (ifUpdate) {
            // download only when the local copy really has to be (re)written
            Document doc = getDoc(htmlUrl);
            Properties p = new Properties();
            p.setProperty("title", nullToEmpty(arc.getTitle()));
            p.setProperty("url", nullToEmpty(arc.getUrl()));
            p.setProperty("description", nullToEmpty(arc.getDescription()));
            p.setProperty("createTime", currentTime);
            FileOutputStream out = new FileOutputStream(metaFile);
            try {
                p.store(out, htmlUrl);
            } finally {
                out.close();
            }
            FileUtils.writeStringToFile(hfile, doc.toString(), "UTF-8");
        }
        return file;
    }

    /** Returns the "createTime" stored in the metadata file, or null when the file or the property is missing. */
    private static String readSavedCreateTime(File metaFile) throws IOException {
        if (!metaFile.exists()) {
            return null;
        }
        Properties p = new Properties();
        FileInputStream in = new FileInputStream(metaFile);
        try {
            p.load(in);
        } finally {
            in.close();
        }
        return p.getProperty("createTime");
    }

    /** Properties reject null values, so missing article properties are stored as empty text. */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }
}
》maven依赖配置(IK分词器 2012停止更新 只支持到lucene4.7.2所以使用该版本)
<!-- HTML download and parsing -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.2</version>
</dependency>
<!-- FileUtils (read/write files) -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-io</artifactId>
<version>1.3.2</version>
</dependency>
<!-- slf4j binding for log4j -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.4.3</version>
</dependency>
<!-- IK Chinese analyzer -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
<!-- Lucene highlighter -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>4.7.2</version>
</dependency>
<!-- CsdnSpilder imports org.junit.Test, so JUnit 4 is needed to compile it -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
》log4j配置 在 src/main/resources 下添加 log4j.properties(Maven 默认不会把 src/main/java 中的非 .java 文件复制到 classpath)
# Root logger: level debug, output goes to the "stdout" appender
log4j.rootLogger=debug,stdout
# "stdout" appender writes to the console via System.out
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
# Line format: time, level, logger name, line number, message
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern =%d{ABSOLUTE} %5p %c :%L - %m%n
执行 CsdnSpilder 自动在f:/myindex目录创建索引 f:/mycsdn备份自己的博客html文件 效果如下
二。 搜索索引文件
使用springboot快速发布web应用 使用angularjs 实现ajax请求和迭代输出
》添加maven依赖
<!-- Maven build descriptor of the search web application -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cn.et</groupId>
<artifactId>spilder</artifactId>
<version>0.0.1-SNAPSHOT</version>
<!-- Dependencies below that declare no version rely on this parent -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.5.4.RELEASE</version>
</parent>
<dependencies>
<!-- Spring Boot web starter -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- JSP support -->
<dependency>
<groupId>org.apache.tomcat.embed</groupId>
<artifactId>tomcat-embed-jasper</artifactId>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>jstl</artifactId>
</dependency>
<!-- IK analyzer, same version as used when the index was built -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
<!-- Lucene highlighter, same version as used when the index was built -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>4.7.2</version>
</dependency>
</dependencies>
</project>
》启动springboot类
package cn.et;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
/**
 * Boot class of the web application.
 */
@SpringBootApplication
public class WebStart {

    /**
     * Builds the Spring application from this class and runs it.
     *
     * @param args command-line arguments, handed to Spring unchanged
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(WebStart.class);
        application.run(args);
    }
}
》添加springmvc控制层类
package cn.et.controller;
import java.util.List;
import javax.servlet.http.HttpServletResponse;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import cn.et.index.LuceneUtils;
import cn.et.spilder.Arcticle;
/**
 * REST endpoint that exposes the index search to the page.
 */
@RestController
public class SearchController {
    // index directory
    public final static String indexDir = "F:/myindex/";

    /**
     * Searches the index for the given keywords.
     * A missing or blank key yields an empty result instead of being passed to
     * the query parser.
     *
     * @param key keywords entered by the user
     * @param rsp current response; its content type is set to UTF-8 JSON
     * @return the matching articles, never {@code null}
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    @RequestMapping("search")
    public List<Arcticle> search(String key, HttpServletResponse rsp) throws Exception {
        rsp.setHeader("Content-Type", "application/json;charset=UTF-8");
        if (key == null || key.trim().length() == 0) {
            // fully qualified so that no additional import is required
            return new java.util.ArrayList<Arcticle>();
        }
        return LuceneUtils.search(indexDir, key);
    }
}
》在 src/main 下添加 webapp 目录,新建 s.html 文件并添加 angularjs 库(angular.min.js)
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>s.html</title>
<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
<meta http-equiv="description" content="this is my page">
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<!--<link rel="stylesheet" type="text/css" href="./styles.css">-->
<script type="text/javascript" src="angular.min.js"></script>
</head>
<body>
<div ng-app="searchApp" ng-controller="searchController">
搜索key:<input type="text" name="key" ng-model="key"> <input type="button" value='查询' ng-click="search()">
<div ng-repeat="x in searchData">
<!-- ng-href keeps the link from pointing at the raw {{...}} expression before angular has evaluated it -->
<a ng-href="{{x.url}}"><font size="5"><titl ng-bind-html="x.title | trustHtml"></titl><br/></font></a>
<desp ng-bind-html="x.description | trustHtml"></desp><hr>
</div>
</div>
<!-- the script is placed inside body: content after the closing body tag is not valid HTML -->
<script type="text/javascript">
var searchApp=angular.module("searchApp",[]);
searchApp.controller("searchController",function($scope,$http){
    // Queries the server for the entered key and publishes the hits as searchData
    $scope.search=function(){
        var key=($scope.key||"").trim();
        // nothing entered: clear the result list instead of querying for "undefined"
        if(!key){
            $scope.searchData=[];
            return;
        }
        $http({
            method:"GET",
            url:"search",
            // passed via params so that angular URL-encodes the value
            params:{key:key}
        }).then(function(res){
            $scope.searchData=res.data;
        },function(res){
            alert("查询失败"+res.status+"-"+res.statusText);
        });
    }
})
// angular escapes HTML content by default; this filter marks a value as trusted HTML
searchApp.filter("trustHtml",function($sce){
    return function(val){
        return $sce.trustAsHtml(val);
    }
});
</script>
</body>
</html>
启动运行Webstart 自动内嵌tomcat并运行 默认tomcat 端口 8080 上下文路径 /
访问 http://localhost:8080/s.html 尝试搜索关键字 点击标题 自动链接到csdn博客
技术学习路线(学习周期 一周)
jsoup 学习jquery用法即可 (http://www.runoob.com/jquery/jquery-tutorial.html)
angularjs学习 (http://www.runoob.com/angularjs/angularjs-tutorial.html)
springboot 学习 (http://docs.spring.io/spring-boot/docs/1.5.4.RELEASE/reference/htmlsingle/)
lucene 官网关于lucene的例子 (http://lucene.apache.org/core/6_6_0/index.html)