1 import java.io.IOException; 2 import java.util.Set; 3 4 import org.htmlparser.Node; 5 import org.htmlparser.Parser; 6 import org.htmlparser.filters.TagNameFilter; 7 import org.htmlparser.util.NodeList; 8 import org.htmlparser.util.ParserException; 9 10 11 public class wikiCrawler { 12 private void initCrawlerWithSeeds(String[] seeds) { 13 for(int i=0;i<seeds.length;i++){ 14 LinkQueue.addUnvisitedUrl(seeds[i]); 15 } 16 } 17 public void crawling(String[] seeds) throws IOException, ParserException{ 18 initCrawlerWithSeeds(seeds); 19 while(!LinkQueue.unVisitedUrlEmpty()&&LinkQueue.getVisitedUrlNum()<1000){ 20 String visitUrl=(String)LinkQueue.unVisitedUrlDequeue(); 21 22 TagNameFilter tagNameFilter=new TagNameFilter("title"); 23 24 25 DownLoadFile downLoadFile=new DownLoadFile("D://"); 26 String filepath=downLoadFile.downloadFile(visitUrl); 27 System.out.println(filepath); 28 if(filepath!=null){ 29 String contentString=HtmlContent.getHtml(filepath); 30 31 NodeList list=new Parser(contentString).extractAllNodesThatMatch(tagNameFilter); 32 String title=((Node)list.elementAt(0)).toPlainTextString(); 33 System.out.println(title); 34 LinkQueue.addVisitedUrl(visitUrl); 35 if(contentString!=null){ 36 Set<String> linksSet=WikiParseHtml.extractLinkSet(contentString); 37 for(Object link:linksSet){ 38 LinkQueue.addUnvisitedUrl((String) link); 39 } 40 } 41 } 42 } 43 } 44 public static void main(String[] args) throws IOException, ParserException{ 45 wikiCrawler crawler=new wikiCrawler(); 46 crawler.crawling(new String[]{"http://free0007.iteye.com"}); 47 } 48 }
html content
1 import java.io.BufferedReader; 2 import java.io.DataOutputStream; 3 import java.io.File; 4 import java.io.FileInputStream; 5 import java.io.FileOutputStream; 6 import java.io.FileReader; 7 import java.io.IOException; 8 import java.io.InputStream; 9 import java.io.InputStreamReader; 10 import java.io.OutputStream; 11 12 import org.apache.http.HttpEntity; 13 import org.apache.http.client.ClientProtocolException; 14 import org.apache.http.client.methods.CloseableHttpResponse; 15 import org.apache.http.client.methods.HttpGet; 16 import org.apache.http.impl.client.CloseableHttpClient; 17 import org.apache.http.impl.client.HttpClients; 18 import org.apache.http.util.EntityUtils; 19 20 21 public class HtmlContent { 22 23 public static String getHtml(String filepath) throws IOException { 24 //File file = new File(filepath); 25 26 /*StringBuffer sb = new StringBuffer(); 27 String s =""; 28 BufferedReader br=new BufferedReader(new InputStreamReader(new FileInputStream(filepath),"UTF-8")); 29 30 while( (s = br.readLine()) != null) { 31 sb.append(s + " "); 32 } 33 34 br.close(); 35 String str = sb.toString(); 36 return str;*/ 37 38 try { 39 BufferedReader bis = new BufferedReader(new InputStreamReader(new FileInputStream( new File(filepath)),"UTF-8") ); 40 StringBuffer stringBuffer=new StringBuffer(250000); 41 String szTemp; 42 43 while ( (szTemp = bis.readLine()) != null) { 44 stringBuffer.append(szTemp+" "); 45 } 46 bis.close(); 47 return stringBuffer.toString(); 48 } 49 catch( Exception e ) { 50 return ""; 51 } 52 } 53 /*public static void main(String[] args){ 54 try { 55 System.out.print(HtmlContent.getHtml("D://zh.wikipedia.org_wiki_Wikipedia_%E9%A6%96%E9%A1%B5.html")); 56 } catch (IOException e) { 57 System.out.print("error"); 58 e.printStackTrace(); 59 } 60 }*/ 61 }
downloadfile
1 import java.io.DataOutputStream; 2 import java.io.File; 3 import java.io.FileNotFoundException; 4 import java.io.FileOutputStream; 5 import java.io.IOException; 6 import java.io.InputStream; 7 import java.io.OutputStreamWriter; 8 9 import org.apache.http.Header; 10 import org.apache.http.HttpEntity; 11 import org.apache.http.HttpStatus; 12 import org.apache.http.client.ClientProtocolException; 13 import org.apache.http.client.HttpClient; 14 import org.apache.http.client.config.RequestConfig; 15 import org.apache.http.client.config.RequestConfig.Builder; 16 import org.apache.http.client.methods.CloseableHttpResponse; 17 import org.apache.http.client.methods.HttpGet; 18 import org.apache.http.impl.client.CloseableHttpClient; 19 import org.apache.http.impl.client.HttpClients; 20 import org.apache.http.util.EntityUtils; 21 22 23 public class DownLoadFile { 24 private String filepath=""; 25 public String getFileNameByUrl(String url, String contentType){ 26 url=url.substring(7); 27 if(contentType.indexOf("html")!=-1){ 28 url=url.replaceAll("[\?/:*|<>"]", "_")+".html"; 29 return url; 30 } 31 //application/pdf 32 else{ 33 return url.replaceAll("[\?/:*|<>"]", "_")+contentType.substring(contentType.indexOf("/")+1); 34 } 35 } 36 public DownLoadFile(String filepath){ 37 this.filepath=filepath; 38 } 39 private void saveToLocal(InputStream is,String filePath) throws IOException{ 40 try { 41 DataOutputStream outputStream=new DataOutputStream(new FileOutputStream(new File(filePath))); 42 int len=0; 43 byte[] buffer=new byte[1024]; 44 while((len=is.read(buffer))!=-1){ 45 outputStream.write(buffer, 0, len); 46 } 47 outputStream.flush(); 48 outputStream.close(); 49 } catch (FileNotFoundException e) { 50 51 e.printStackTrace(); 52 } 53 } 54 public String downloadFile(String url) throws IOException{ 55 String filePathString=null; 56 CloseableHttpClient httpClient=HttpClients.createDefault(); 57 HttpGet httpGet=new HttpGet(url); 58 RequestConfig 
requestConfig=RequestConfig.copy(RequestConfig.custom().build()).setConnectTimeout(5000).build(); 59 httpGet.setConfig(requestConfig); 60 CloseableHttpResponse response=httpClient.execute(httpGet); 61 try { 62 String statusCode=response.getStatusLine().toString(); 63 if(Integer.parseInt(statusCode.split(" ")[1])!=HttpStatus.SC_OK){ 64 System.err.println(url+" Failed:"+statusCode); 65 filePathString=null; 66 } 67 else { 68 HttpEntity entity=response.getEntity(); 69 InputStream input= entity.getContent(); 70 Header header= entity.getContentType(); 71 filePathString=filepath+getFileNameByUrl(url, header.getValue()); 72 73 saveToLocal(input, filePathString); 74 75 76 } 77 78 } catch (ClientProtocolException e) { 79 // TODO Auto-generated catch block 80 e.printStackTrace(); 81 } catch (IOException e) { 82 // TODO Auto-generated catch block 83 e.printStackTrace(); 84 }finally{ 85 response.close(); 86 httpClient.close(); 87 } 88 89 return filePathString; 90 } 91 92 93 }
WikiParseHtml
1 import java.util.ArrayList; 2 import java.util.HashSet; 3 import java.util.Set; 4 5 import org.htmlparser.Node; 6 import org.htmlparser.NodeFilter; 7 import org.htmlparser.Parser; 8 import org.htmlparser.filters.NodeClassFilter; 9 import org.htmlparser.tags.LinkTag; 10 import org.htmlparser.util.NodeList; 11 12 13 public class WikiParseHtml { 14 public static Set<String> extractLinkSet(String content){ 15 Set<String> linksSet=new HashSet<String>(); 16 try { 17 Parser parser= Parser.createParser(content, "utf-8"); 18 NodeClassFilter nodeClassFilter=new NodeClassFilter(LinkTag.class); 19 NodeList list=parser.extractAllNodesThatMatch(nodeClassFilter); 20 for(int i=0;i<list.size();i++){ 21 Node tagNode=list.elementAt(i); 22 if(tagNode instanceof LinkTag){ 23 LinkTag linkTag=(LinkTag)tagNode; 24 String urlString=linkTag.getLink(); 25 if(urlString.startsWith("http://")){ 26 linksSet.add(urlString); 27 } 28 } 29 } 30 } catch (Exception e) { 31 e.printStackTrace(); 32 } 33 return linksSet; 34 35 } 36 }
queue
import java.util.LinkedList;


/** First-in, first-out queue of arbitrary objects backed by a linked list. */
public class Queue {
    /** Elements in arrival order; the head is the next one to leave. */
    private LinkedList<Object> items = new LinkedList<Object>();

    /** Appends {@code t} at the tail of the queue. */
    public void enQueue(Object t) {
        items.add(t);
    }

    /**
     * Removes and returns the element at the head of the queue.
     *
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public Object deQueue() {
        return items.remove();
    }

    /** @return {@code true} when the queue holds no elements */
    public boolean isQueueEmpty() {
        return items.size() == 0;
    }

    /** @return {@code true} when an element equal to {@code t} is queued */
    public boolean contains(Object t) {
        return items.indexOf(t) >= 0;
    }

    /** @return {@code true} when the queue holds no elements */
    public boolean empty() {
        return items.size() == 0;
    }
}
linkqueue
1 import java.util.HashSet; 2 import java.util.Set; 3 4 5 public class LinkQueue { 6 private static Set visitedUrl=new HashSet(); 7 private static Queue unvisitedUrl=new Queue(); 8 public static Queue getUnvisitedUrl(){ 9 return unvisitedUrl; 10 } 11 public static void addVisitedUrl(String url){ 12 visitedUrl.add(url); 13 } 14 public static void removeVisitedUrl(String url){ 15 visitedUrl.remove(url); 16 } 17 public static Object unVisitedUrlDequeue(){ 18 return unvisitedUrl.deQueue(); 19 } 20 public static void addUnvisitedUrl(String url){ 21 if(url!=null||!url.trim().equals("")||!visitedUrl.contains(url)||!unvisitedUrl.contains(url)) 22 unvisitedUrl.enQueue(url); 23 } 24 public static int getVisitedUrlNum(){ 25 return visitedUrl.size(); 26 } 27 public static boolean unVisitedUrlEmpty(){ 28 return unvisitedUrl.isQueueEmpty(); 29 } 30 }