转载 http://blog.csdn.net/rongyongfeikai2/article/details/7798838
最近,在看博客园上的文章。希望能够爬取指定的博客园的文章,并保存为WORD文档的形式。所以,趁着周末休息,花了半天时间把它给做了出来。
完整代码下载地址:http://download.csdn.net/detail/rongyongfeikai2/4462085
首先,我们爬取的文章,应该包括三个部分:标题、链接和正文。所以,我们用一个POJO来存储文章。
- package com.BlogCrawler.Model;
- /*
- * author:Tammy Pi
- * function:存储爬取到的文章(标题、内容、链接)的POJO类
- * Email:victory_pj@163.com
- */
/**
 * Plain data holder for one crawled blog article.
 *
 * <p>Carries three independent string properties: the article title, the
 * article body and the link of the article. Every property starts out as
 * {@code null} and is stored exactly as given; no validation or copying is
 * performed.
 */
public class Document {

    /** Link of the article. */
    private String link;

    /** Title of the article. */
    private String title;

    /** Body of the article. */
    private String content;

    /**
     * Stores the link of the article.
     *
     * @param link the link to store; kept as-is
     */
    public void setLink(String link) {
        this.link = link;
    }

    /**
     * @return the stored link, or {@code null} if none was set
     */
    public String getLink() {
        return this.link;
    }

    /**
     * Stores the title of the article.
     *
     * @param title the title to store; kept as-is
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return the stored title, or {@code null} if none was set
     */
    public String getTitle() {
        return this.title;
    }

    /**
     * Stores the body of the article.
     *
     * @param content the body to store; kept as-is
     */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * @return the stored body, or {@code null} if none was set
     */
    public String getContent() {
        return this.content;
    }
}
定义好Document后,我们需要爬取指定URL对应的文章列表,再根据文章列表中文章所对应的链接,提取其对应的文章,并分析文章内容。
目前分析博客园列表结构,它的文章列表是放在class="post"或者class="postTitle"的div中。我们用HttpClient包爬取相应的页面后,再用HtmlParser分析DOM,提取出文章列表。对于博客园正文结构,分析得,它的正文是放在id="cnblogs_post_body"的div中,同样可以先用HttpClient包爬取文章页面,再用HtmlParser将正文提取出来。
提取页面和分析页面的代码为:
- package com.BlogCrawler.Fetcher;
- import java.io.BufferedInputStream;
- import java.io.InputStream;
- import java.util.*;
- import java.util.regex.Pattern;
- import org.apache.commons.httpclient.HttpClient;
- import org.apache.commons.httpclient.HttpStatus;
- import org.apache.commons.httpclient.methods.GetMethod;
- import org.htmlparser.Node;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.HasAttributeFilter;
- import org.htmlparser.filters.TagNameFilter;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- /*
- * author:Tammy Pi
- * function:用于爬取文章的类,针对于博客园
- * Email:victory_pj@163.com
- */
- public class FetcherHelper {
- //根据url爬取url指向的页面的内容,并返回
- public String getPageContent(String url){
- StringBuffer sb = new StringBuffer();
- HttpClient httpClient = new HttpClient();
- GetMethod getMethod = new GetMethod(url);
- BufferedInputStream reader = null;
- try{
- int statusCode = httpClient.executeMethod(getMethod);
- //判断状态
- if(statusCode==HttpStatus.SC_OK){
- InputStream inputStream = getMethod.getResponseBodyAsStream();
- reader = new BufferedInputStream(inputStream);
- int index;
- byte[] buffer = new byte[1024];
- while((index=reader.read(buffer))!=-1){
- sb.append(new String(buffer,0,index,"utf-8"));
- }
- }else{
- System.out.println("爬取出错。错误代码:"+statusCode);
- }
- }catch(Exception ex){
- ex.printStackTrace();
- }
- return sb.toString();
- }
- //利用HTMLParser过滤文章内容
- public String filterContent(String page,String tag,String attr,String value){
- TagNameFilter tagFilter = new TagNameFilter(tag);
- Parser parser = new Parser();
- parser = parser.createParser(page,"utf-8");
- NodeList tagList = null;
- String rtn = "";
- try {
- tagList = parser.parse(tagFilter);
- for(int u=0;u<tagList.size();u++) {
- String html = tagList.elementAt(u).toHtml();
- HasAttributeFilter attrFilter = new HasAttributeFilter(attr,value);
- Parser parser2 = parser.createParser(html,"utf-8");
- NodeList list2 = parser2.parse(attrFilter);
- if(list2.size()>0){
- rtn = list2.elementAt(0).toHtml();
- }
- }
- } catch (ParserException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return rtn;
- }
- //利用HTMLParser分析文章列表
- public List<com.BlogCrawler.Model.Document> htmlFilter(String page,String tag,String attr,String value){
- String rtn = "";
- TagNameFilter tagFilter = new TagNameFilter(tag);
- Parser parser = new Parser();
- parser = parser.createParser(page,"utf-8");
- NodeList tagList = null;
- List<com.BlogCrawler.Model.Document> list = new ArrayList<com.BlogCrawler.Model.Document>();
- try {
- tagList = parser.parse(tagFilter);
- //遍历过滤后的tagList
- HasAttributeFilter attrFilter = new HasAttributeFilter(attr,value);
- NodeList list2 = tagList.extractAllNodesThatMatch(attrFilter);
- //过滤得到h2,针对博客园的过滤
- String html = list2.toHtml();
- Parser parser2 = parser.createParser(html,"utf-8");
- NodeList list3 = null;
- if(value.equals("post")){
- list3 = parser2.parse(new TagNameFilter("h2"));
- }else if(value.endsWith("postTitle")){
- list3 = parser2.parse(new TagNameFilter("a"));
- }
- for(int i=0;i<list3.size();i++) {
- Node node = list3.elementAt(i);
- com.BlogCrawler.Model.Document doc = new com.BlogCrawler.Model.Document();
- doc.setTitle(node.toPlainTextString());
- String html1 = node.toHtml();
- html1 = html1.substring(html1.indexOf("href=\"")+6);
- html1 = html1.substring(0,html1.indexOf("\""));
- doc.setLink(html1);
- //获得内容
- doc.setContent(filterContent(getPageContent(doc.getLink()),"div","id","cnblogs_post_body"));
- list.add(doc);
- }
- } catch (ParserException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return list;
- }
- //用于测试的主函数
- public static void main(String[] args){
- FetcherHelper helper = new FetcherHelper();
- String rtn = helper.getPageContent("http://www.cnblogs.com/passzh/default.html?OnlyTitle=1");
- if(rtn.indexOf("postTitle")==-1){
- helper.htmlFilter(rtn,"div","class", "post");
- }else{
- System.out.println("进入");
- helper.htmlFilter(rtn,"div","class", "postTitle");
- }
- }
- }
最后,我们将文章写入doc文档中,这里用的是iText的jar包,它提供了HTMLWorker类,可以很方便地将HTML按照格式写入Word文档中。写入Word文档的代码为:
- package com.BlogCrawler.DocHelper;
- import java.awt.Color;
- import java.io.*;
- import com.lowagie.text.Document;
- import com.lowagie.text.DocumentException;
- import com.lowagie.text.Element;
- import com.lowagie.text.Font;
- import com.lowagie.text.PageSize;
- import com.lowagie.text.Paragraph;
- import com.lowagie.text.Rectangle;
- import com.lowagie.text.html.simpleparser.HTMLWorker;
- import com.lowagie.text.html.simpleparser.StyleSheet;
- import com.lowagie.text.rtf.RtfWriter2;
- import java.util.*;
- /*
- * author:Tammy Pi
- * function:写入doc文档的类
- * Email:victory_pj@163.com
- */
- public class DocHelper {
- private BufferedWriter writer = null;
- private String path = "c:\\Blog\\";
- //定义A4纸张
- private Rectangle pageSize = new Rectangle(PageSize.A4);
- private Document doc = null;
- public void docHelper(String fileName,List<com.BlogCrawler.Model.Document> list) {
- try {
- //判断路径是否存在
- if(!new File(path).exists()){
- new File(path).mkdir();
- }
- //判断文件是否存在
- if(!new File(path+fileName).exists()){
- new File(path+fileName).createNewFile();
- }
- File file = new File(path+fileName);
- if(!(file.canRead()&&file.canWrite())){
- System.out.println("您不具有此word文档的读写操作权限!");
- return;
- }
- pageSize = pageSize.rotate();
- //创建word文档,并设置纸张大小
- doc = new Document(pageSize,80,80,50,50);
- //创建一个word文档的书写器
- RtfWriter2.getInstance(doc,new FileOutputStream(path + fileName));
- doc.open();
- //设置标题的格式
- Paragraph titleParagraph = null;
- Paragraph contentGraph = null;
- //循环遍历Document
- for(int i=0;i<list.size();i++) {
- com.BlogCrawler.Model.Document document = list.get(i);
- //书写blog文章的头部
- System.out.println(document.getTitle());
- titleParagraph = new Paragraph(document.getTitle(),new Font(Font.NORMAL,18,Font.BOLD,new Color(0,0,0)));
- //标题居中
- titleParagraph.setAlignment(Element.ALIGN_CENTER);
- try {
- doc.add(titleParagraph);
- //书写内容
- StyleSheet ss = new StyleSheet();
- List htmlList = HTMLWorker.parseToList(new StringReader(document.getContent()), ss);
- for(int j=0;j<htmlList.size();j++) {
- Element e = (Element) htmlList.get(j);
- Paragraph par = new Paragraph();
- par.add(e);
- doc.add(par);
- }
- } catch (DocumentException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } finally{
- if(writer!=null){
- //关闭文件输出流
- try {
- writer.close();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- if(doc!=null){
- doc.close();
- }
- }
- }
- public static void main(String[] args){
- DocHelper docHelper = new DocHelper();
- com.BlogCrawler.Model.Document doc1 = new com.BlogCrawler.Model.Document();
- doc1.setTitle("标题1");
- doc1.setContent("内容1");
- com.BlogCrawler.Model.Document doc2 = new com.BlogCrawler.Model.Document();
- doc2.setTitle("标题2");
- doc2.setContent("内容2");
- List<com.BlogCrawler.Model.Document> list = new ArrayList<com.BlogCrawler.Model.Document>();
- list.add(doc1);
- list.add(doc2);
- docHelper.docHelper("卧龙居.doc",list);
- System.out.println("word文档书写完成!");
- }
- }
最后,程序要运行,自然要有一个入口类。代码为:
- package com.BlogCrawler.DocHelper;
- import com.BlogCrawler.Fetcher.FetcherHelper;
- import java.util.*;
- /*
- * author:Tammy Pi
- * function:爬取器命令行运行
- * Email:victory_pj@163.com
- */
- public class CrawlerHelper {
- public boolean doCrawler(String path,String username) {
- FetcherHelper fetcherHelper = new FetcherHelper();
- String html = fetcherHelper.getPageContent(path);
- if(html.equals("")){
- return false;
- }
- List<com.BlogCrawler.Model.Document> list = null;
- if(html.indexOf("postTitle")==-1){
- list = fetcherHelper.htmlFilter(html,"div","class","post");
- }else{
- list = fetcherHelper.htmlFilter(html,"div","class","postTitle");
- }
- DocHelper docHelper = new DocHelper();
- docHelper.docHelper(username+".doc", list);
- return true;
- }
- public static void main(String[] args){
- Scanner scan = new Scanner(System.in);
- System.out.println("请输入博客地址:");
- String path = scan.nextLine();
- System.out.println("请输入用户名:");
- String username = scan.nextLine();
- CrawlerHelper helper = new CrawlerHelper();
- int index = 1;
- helper.doCrawler(path, username);
- System.out.println("写入完成。文档位于c:\\Blog下。");
- }
- }
运行CrawlerHelper,运行效果为:
请输入博客地址:
http://www.cnblogs.com/passzh/
请输入用户名:
风清月明
最近项目开发中需记住的一些经验
JSP的URL页面传参乱码问题的解决
Hibernate的使用
比较重要的三个正则表达式(JAVA)
JAVA中Map按照value值逆序排序
MFC的列表控件的使用
MFC中TAB控件的使用
MFC通过ADO连接MS SQLSERVER数据库
不产生乱码的方法
在Dreamweaver中建立JSP站点的方法
写入完成。文档位于c:\Blog下。
再看看你的C盘Blog文件夹下,是否有个“风清月明.doc”,这篇博客的本页内容,就被写入DOC文档了。大功告成!
PS:对于有人在我的资源中评论说不能爬取的问题,我想说,我自己已经试验过了(首先,你写的地址必须是【博客园】的【具体的人的博客的地址】;其次,大概可以爬60%,有40%由于博客DOM结构没有分析到,所以爬取不了)。如果要用,请自行修改。