1,jsoup简介
jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。jsoup 是基于 MIT 协议发布的,可放心使用于商业项目。
jsoup 的主要功能如下:
1. 从一个 URL,文件或字符串中解析 HTML;
2. 使用 DOM 或 CSS 选择器来查找、取出数据;
3. 可操作 HTML 元素、属性、文本;
2,jsoup使用
1,下载jsoup的jar包:http://jsoup.org/download
2, jsoup英文的开发手册:http://jsoup.org/cookbook/
3,jsoup cookbook中文版:http://www.open-open.com/jsoup/
下面是一个简单例子
1,获取新浪财经的website 以及标题,打印输出。
2,获取1中某个website的正文信息,并打印输出。
代码实现:
package jSoupTesting;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Minimal jsoup example: prints the headline links found on the Sina Finance
 * front page, then prints the paragraph text of one fixed article.
 */
public class GetSinaUrlAndTitle {

    /** Page that is scanned for headline links. */
    private static final String FRONT_PAGE_URL = "http://finance.sina.com.cn/";

    /** Article whose paragraph text is printed. */
    private static final String ARTICLE_URL =
            "http://finance.sina.com.cn/hy/20140823/100220099682.shtml";

    /** Connect/read timeout passed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MS = 10000;

    /**
     * Runs both demos in sequence.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        getUrlAndTitle();
        getTextMes();
    }

    /**
     * Fetches the front page and prints one line per anchor found inside every
     * element whose {@code class} attribute equals {@code fin_tabs0_c0}: the
     * trimmed {@code href}, a space, then the trimmed link text.
     * An {@link IOException} while fetching is printed and otherwise ignored.
     */
    public static void getUrlAndTitle() {
        try {
            Document doc = Jsoup.connect(FRONT_PAGE_URL).timeout(TIMEOUT_MS).get();
            Elements containers = doc.getElementsByAttributeValue("class", "fin_tabs0_c0");
            for (Element container : containers) {
                for (Element link : container.getElementsByTag("a")) {
                    String linkHref = link.attr("href").trim();
                    String linkText = link.text().trim();
                    System.out.println(linkHref + " " + linkText);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the fixed article and prints, on a single line, the trimmed text
     * of every {@code <p>} inside elements whose {@code class} attribute equals
     * {@code blkContainerSblkCon BSHARE_POP}, each followed by a space.
     * An {@link IOException} while fetching is printed and otherwise ignored.
     */
    public static void getTextMes() {
        try {
            Document doc = Jsoup.connect(ARTICLE_URL).timeout(TIMEOUT_MS).get();
            Elements containers =
                    doc.getElementsByAttributeValue("class", "blkContainerSblkCon BSHARE_POP");
            // StringBuilder avoids re-copying the accumulated text for every paragraph.
            StringBuilder textMes = new StringBuilder();
            for (Element container : containers) {
                for (Element paragraph : container.getElementsByTag("p")) {
                    textMes.append(paragraph.text().trim()).append(" ");
                }
            }
            System.out.println(textMes.toString());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
3,新闻抓取要求
新闻筛选过程:(以“新浪财经”为例) http://finance.sina.com.cn/
1. 选择方向
(1)宏观新闻:包括一些重大的国内外宏观调控,我国银监会等监管机构出台的一些文件,或者例如自贸区发展,金砖银行成立等国内重大金融新闻。
(2)公司新闻:包括客户公司或其他大型金融机构的管理层变动,兼并收购,战略转型,新推产品等新闻。
2. 网页选择
1.宏观新闻:进入http://finance.sina.com.cn/ -----》 首页“要闻”
2.公司新闻:进入http://finance.sina.com.cn/ 选择“银行” -》 “要闻”
3,抓取要求
1,要求抓取要闻部分所有网址,标题,关键字。
2,要求抓取1中网址下的正文。
3,前一天已经抓取过的新闻,不能在后一天的抓取结果中重复出现。
4,要求抓好的新闻放在txt文档中。
4,代码实现
package sinaSpider;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls three Sina Finance sections ("yaowen", "chanjing", "bank") and, for
 * each section, writes three files under {@code <working dir>/output/<section>}:
 * the article URLs, the "url title keywords" records, and the article text.
 * URLs listed in the previous day's URL file of the same section are skipped.
 *
 * <p>NOTE(review): the output files are written through {@link FileWriter},
 * which uses the platform default charset, while the URL file is read back as
 * UTF-8. This is harmless for ASCII URLs, but the text/title files will be in
 * the platform charset.
 */
public class GetSinaInfo {

    /** Date pattern embedded in every output file name. */
    private static final String DATE_PATTERN = "yyyy-MM-dd";

    /** Value of the {@code class} attribute that marks an article body container. */
    private static final String ARTICLE_BODY_CLASS = "blkContainerSblkCon BSHARE_POP";

    /** Timeout for fetching a section index page, in milliseconds. */
    private static final int INDEX_TIMEOUT_MS = 5000;

    /** Timeout for fetching a single article, in milliseconds. */
    private static final int ARTICLE_TIMEOUT_MS = 10000;

    /**
     * Entry point; runs the full crawl.
     *
     * @param args ignored
     * @throws IOException declared for compatibility; the crawl itself reports
     *         I/O failures by printing them
     */
    public static void main(String[] args) throws IOException {
        getSinaInforamtion();
    }

    /**
     * Creates the output directories and crawls the three sections in order.
     * An {@link IOException} escaping a section is printed and stops the
     * remaining sections.
     */
    public static void getSinaInforamtion() {
        Map<String, String> pathMap = createNewFiles();
        try {
            getSinaYaoWen(pathMap);
            getSinaChangJing(pathMap);
            getSinaBank(pathMap);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Crawls the front-page headline block ({@code class="fin_tabs0_c0"}).
     *
     * @param pathMap section name to output directory; must contain {@code "yaowen"}
     * @throws IOException if the index page cannot be fetched or an output file
     *         cannot be opened or written
     */
    public static void getSinaYaoWen(Map<String, String> pathMap) throws IOException {
        crawlSection(pathMap.get("yaowen"), "yaowen",
                "http://finance.sina.com.cn/", "fin_tabs0_c0");
    }

    /**
     * Crawls the "chanjing" section ({@code class="blk_03"}).
     *
     * @param pathMap section name to output directory; must contain {@code "chanjing"}
     * @throws IOException if the index page cannot be fetched or an output file
     *         cannot be opened or written
     */
    public static void getSinaChangJing(Map<String, String> pathMap) throws IOException {
        crawlSection(pathMap.get("chanjing"), "chanjing",
                "http://finance.sina.com.cn/chanjing/", "blk_03");
    }

    /**
     * Crawls the "bank" section ({@code class="blk05"}).
     *
     * @param pathMap section name to output directory; must contain {@code "bank"}
     * @throws IOException if the index page cannot be fetched or an output file
     *         cannot be opened or written
     */
    public static void getSinaBank(Map<String, String> pathMap) throws IOException {
        crawlSection(pathMap.get("bank"), "bank",
                "http://finance.sina.com.cn/money/bank/", "blk05");
    }

    /**
     * Shared implementation of the three section crawlers: reads yesterday's
     * URLs, fetches the index page, then stores every new http(s) article.
     */
    private static void crawlSection(String dir, String prefix, String indexUrl,
            String listClass) throws IOException {
        // Format the date once so all three file names agree even across midnight.
        String today = GetDate();
        String textPath = dir + "//" + prefix + today + "outputText.txt";
        String titlePath = dir + "//" + prefix + today + "outputTitle.txt";
        String urlPath = dir + "//" + today + "url.txt";
        String oldUrlPath = dir + "//" + GetYesterday() + "url.txt";

        // Do everything that can fail BEFORE opening the writers: FileWriter
        // truncates, so a failed index fetch must not wipe an earlier run's output.
        String[] oldUrls = GetYesterdayInfo(oldUrlPath);
        Document index = Jsoup.connect(indexUrl).timeout(INDEX_TIMEOUT_MS).get();
        Elements containers = index.getElementsByAttributeValue("class", listClass);

        // Nested try/finally so every writer that was opened is closed, even
        // when a later constructor or a write throws.
        FileWriter urlWriter = new FileWriter(urlPath);
        try {
            FileWriter textWriter = new FileWriter(textPath);
            try {
                FileWriter titleWriter = new FileWriter(titlePath);
                try {
                    for (Element container : containers) {
                        for (Element link : container.getElementsByTag("a")) {
                            String linkHref = link.attr("href").trim();
                            String linkText = link.text().trim();
                            // jsoup only fetches http/https; anchors without a usable
                            // href (empty, "javascript:", relative) are skipped.
                            if (!isHttpUrl(linkHref) || !judgeDup(oldUrls, linkHref)) {
                                continue;
                            }
                            Document article;
                            try {
                                article = fetchArticle(linkHref);
                            } catch (IOException e) {
                                // One unreachable article must not abort the section.
                                System.err.println("Skipping " + linkHref + ": " + e);
                                continue;
                            }
                            // File-write failures are NOT caught here: they propagate.
                            writeArticle(article, linkHref, linkText,
                                    textWriter, titleWriter, urlWriter);
                        }
                    }
                } finally {
                    titleWriter.close();
                }
            } finally {
                textWriter.close();
            }
        } finally {
            urlWriter.close();
        }
    }

    /** Returns true when {@code href} starts with {@code http://} or {@code https://} (any case). */
    private static boolean isHttpUrl(String href) {
        return href.regionMatches(true, 0, "http://", 0, 7)
                || href.regionMatches(true, 0, "https://", 0, 8);
    }

    /** Downloads and parses one article page. */
    private static Document fetchArticle(String url) throws IOException {
        return Jsoup.connect(url).timeout(ARTICLE_TIMEOUT_MS).get();
    }

    /**
     * Fetches {@code url} and, if the page contains an article body container,
     * appends its record to the three writers. Pages without such a container
     * are ignored.
     *
     * @param url         article address
     * @param subTitle    link text used as the article title
     * @param textWriter  receives the record header followed by every paragraph
     * @param titleWriter receives the "url title keywords" record
     * @param urlWriter   receives the URL
     * @throws IOException if the page cannot be fetched or a write fails
     */
    public static void getWebText(String url, String subTitle, FileWriter textWriter,
            FileWriter titleWriter, FileWriter urlWriter) throws IOException {
        writeArticle(fetchArticle(url), url, subTitle, textWriter, titleWriter, urlWriter);
    }

    /** Writes one already-fetched article to the three writers; no-op without a body container. */
    private static void writeArticle(Document doc, String url, String subTitle,
            FileWriter textWriter, FileWriter titleWriter, FileWriter urlWriter)
            throws IOException {
        Elements bodies = doc.getElementsByAttributeValue("class", ARTICLE_BODY_CLASS);
        if (bodies.isEmpty()) {
            return;
        }
        String webTitleKeywords = getTitleAndWebsite(url, subTitle) + getKeyWords(doc);
        System.out.println(webTitleKeywords);
        writeSTK(webTitleKeywords, titleWriter);
        textWriter.write(webTitleKeywords + " ");
        urlWriter.write(url + " ");
        for (Element body : bodies) {
            for (Element paragraph : body.getElementsByTag("p")) {
                textWriter.write(paragraph.text().trim() + " ");
            }
        }
    }

    /**
     * Joins a URL and its title with a single space.
     *
     * @param url      article address
     * @param subTitle article title
     * @return {@code url + " " + subTitle}
     */
    public static String getTitleAndWebsite(String url, String subTitle) {
        return url + " " + subTitle;
    }

    /**
     * Writes one record followed by a space. An {@link IOException} is printed
     * and not rethrown.
     *
     * @param webTitleKeywords record text
     * @param writeWebTitle    destination writer
     */
    public static void writeSTK(String webTitleKeywords, FileWriter writeWebTitle) {
        try {
            writeWebTitle.write(webTitleKeywords + " ");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Collects the text of every anchor inside elements whose {@code class}
     * attribute equals {@code art_keywords}.
     *
     * @param doc parsed article page
     * @return {@code " keywords:"} followed by each keyword and a trailing comma
     */
    public static String getKeyWords(Document doc) {
        StringBuilder keywords = new StringBuilder(" keywords:");
        for (Element block : doc.getElementsByAttributeValue("class", "art_keywords")) {
            for (Element link : block.getElementsByTag("a")) {
                keywords.append(link.text().trim()).append(",");
            }
        }
        return keywords.toString();
    }

    /**
     * @return the current date formatted as {@code yyyy-MM-dd}
     */
    public static String GetDate() {
        return new SimpleDateFormat(DATE_PATTERN).format(new Date());
    }

    /**
     * @return the date one day before now, formatted as {@code yyyy-MM-dd}
     */
    public static String GetYesterday() {
        Calendar calendar = Calendar.getInstance();
        calendar.add(Calendar.DATE, -1);
        return new SimpleDateFormat(DATE_PATTERN).format(calendar.getTime());
    }

    /**
     * Reads the URLs stored in {@code oldFilePath}. If the file does not exist,
     * an empty one is created first, so the result is then an empty array.
     *
     * @param oldFilePath path of the previous day's URL file
     * @return the URLs found in the file
     * @throws IOException if the file cannot be created or read
     */
    public static String[] GetYesterdayInfo(String oldFilePath) throws IOException {
        File file = new File(oldFilePath);
        if (!file.exists()) {
            file.createNewFile();
        }
        return getOldUrls(file, "Utf-8");
    }

    /**
     * Reads every URL from {@code file}. URLs may be separated by line breaks,
     * spaces, tabs or commas; the crawler writes them separated by a single
     * space, so splitting on commas and line breaks alone would never match.
     *
     * @param file     URL file to read
     * @param encoding charset name used to decode the file
     * @return the non-empty tokens in file order; empty array for an empty file
     * @throws IOException if the file cannot be opened or read, or the charset
     *         name is not supported
     */
    public static String[] getOldUrls(File file, String encoding) throws IOException {
        List<String> urls = new ArrayList<String>();
        FileInputStream fis = new FileInputStream(file);
        try {
            BufferedReader input = new BufferedReader(new InputStreamReader(fis, encoding));
            String line;
            while ((line = input.readLine()) != null) {
                for (String token : line.split("[\\s,]+")) {
                    if (token.length() > 0) {
                        urls.add(token);
                    }
                }
            }
        } finally {
            // Closing the underlying stream releases the file handle even if
            // the reader could not be constructed.
            fis.close();
        }
        return urls.toArray(new String[urls.size()]);
    }

    /**
     * Tells whether {@code newUrl} is new, i.e. NOT present in {@code oldUrls}.
     *
     * @param oldUrls previously seen URLs
     * @param newUrl  candidate URL
     * @return {@code false} if {@code newUrl} equals any element of
     *         {@code oldUrls}, {@code true} otherwise
     */
    public static boolean judgeDup(String[] oldUrls, String newUrl) {
        for (String oldUrl : oldUrls) {
            if (newUrl.equals(oldUrl)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Ensures {@code <working dir>/output/<section>} exists for each of
     * "yaowen", "chanjing" and "bank".
     *
     * @return section name mapped to its directory path, with every backslash
     *         replaced by {@code //}
     */
    public static Map<String, String> createNewFiles() {
        String path = getWorkPath() + "//output";
        String[] fileNames = {"yaowen", "chanjing", "bank"};
        Map<String, String> pathMap = new HashMap<String, String>();
        for (String name : fileNames) {
            File dir = new File(path + "//" + name);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            pathMap.put(name, dir.getPath().replace("\\", "//"));
        }
        return pathMap;
    }

    /**
     * Resolves the current working directory.
     *
     * @return the canonical path of the working directory with every backslash
     *         replaced by {@code //}; if canonicalisation fails, the error is
     *         printed and the absolute path is used instead
     */
    public static String getWorkPath() {
        File directory = new File(""); // empty path resolves against the working directory
        String workspacePath;
        try {
            workspacePath = directory.getCanonicalPath();
        } catch (IOException e) {
            e.printStackTrace();
            // Fall back rather than return null, which would yield "null//output".
            workspacePath = directory.getAbsolutePath();
        }
        return workspacePath.replace("\\", "//");
    }
}