• 搜索网站的多线程爬虫


    package GetUrls;
    
    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.TimeUnit;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    
    /**
     * Multi-threaded crawler for a single site.
     *
     * <p>Starting from {@code http://www.<sitename>.com}, every same-site link is fetched
     * level by level on a fixed thread pool. For each fetched page one row is appended to
     * {@code ./src/GetUrls/<sitename>report.txt}: the page URL followed by one 0/1 column
     * per script name loaded from {@code ./src/GetAllUrljs/jsname} (1 = the name occurs in
     * the page HTML). Pages that fail are collected in {@link #Error} and written to
     * {@code ./src/GetUrls/<sitename>error.txt} at the end of the run.
     *
     * <p>The public static lists are shared between the main thread and the pool workers;
     * all access inside this class is guarded by an internal lock.
     */
    public class GetIt {
    	/** Guards AllUrls, get, tmp and Error, which pool workers touch concurrently. */
    	private static final Object STATE_LOCK = new Object();
    	/** Serializes appends to the report file so rows from different workers never interleave. */
    	private static final Object REPORT_LOCK = new Object();
    	/**
    	 * Link fragments that are never crawled. This list is specific to the "smarter" site;
    	 * adjust it together with {@link #sitename} when targeting another site.
    	 */
    	private static final String[] EXCLUDED_FRAGMENTS = {
    		"/pl--", "/se--", "/sd--", "/sz--", "/cl--", "/scripts/"
    	};
    	private static final int POOL_SIZE = 10;
    	private static final int FETCH_TIMEOUT_MS = 120000;
    
    	/** Every URL that has been handed to {@link #getAll(String)} so far. */
    	public static ArrayList<String> AllUrls=new ArrayList<String>();
    	/** Links discovered during the current crawl level (no duplicates). */
    	public static ArrayList<String> get=new ArrayList<String>();
    	/** Middle part of the host name: the crawl root is {@code http://www.<sitename>.com}. */
    	public static String sitename="smarter";
    	public static GetIt a=new GetIt();
    	/** Script names whose presence is reported for each page; filled once by {@code main}. */
    	public static ArrayList<String> JsName=new ArrayList<String>();
    	/** URLs queued for the next crawl level. */
    	public static ArrayList<String> tmp=new ArrayList<String>();
    	/** One "url TAB failure" entry per page that could not be processed. */
    	public static ArrayList<String> Error=new ArrayList<String>();
    	/** Pool used for the crawl level currently (or most recently) running. */
    	public static ExecutorService p = null;
    
    	/**
    	 * Loads the script names, crawls the site and writes the error file.
    	 *
    	 * @param args unused
    	 * @throws IOException if the script-name file cannot be read or the error file cannot be written
    	 */
    	public static void main(String args[]) throws IOException{
    		loadJsNames(new File("./src/GetAllUrljs/jsname"));
    		String url="http://www."+sitename+".com";
    		a.getAll(url);
    		queueUnvisited();
    		a.getrun();
    		writeErrors();
    	}
    
    	/**
    	 * Crawls the URLs queued in {@link #tmp}, then the links those pages revealed, and so
    	 * on until a level discovers no unvisited URL. Each level runs on its own fixed pool
    	 * and this method blocks until that pool has finished.
    	 *
    	 * <p>If the calling thread is interrupted while waiting, the running pool is stopped,
    	 * the interrupt flag is restored and the method returns early.
    	 */
    	public void getrun() {
    		boolean more;
    		// Iterative rather than recursive so the stack depth does not grow with crawl depth.
    		do {
    			ArrayList<String> batch;
    			synchronized (STATE_LOCK) {
    				batch = new ArrayList<String>(tmp);
    				tmp.clear();
    				get.clear();
    			}
    			p=Executors.newFixedThreadPool(POOL_SIZE);
    			for (String url : batch) {
    				p.execute(new runer(url));
    			}
    			p.shutdown();
    			try {
    				// Block instead of spinning on isTerminated().
    				while (!p.awaitTermination(1, TimeUnit.SECONDS)) {
    					// level still running; keep waiting
    				}
    			} catch (InterruptedException e) {
    				p.shutdownNow();
    				Thread.currentThread().interrupt();
    				return;
    			}
    			more = queueUnvisited();
    		} while (more);
    	}
    
    	/**
    	 * Fetches one page, records its same-site links in {@link #get} and appends the
    	 * page's script-presence row to the report file. Any failure is recorded in
    	 * {@link #Error} instead of being propagated. Safe to call from several threads.
    	 *
    	 * @param url absolute URL of the page to fetch
    	 * @throws IOException declared for compatibility with existing callers; failures are
    	 *         currently caught and recorded in {@link #Error}
    	 */
    	public void getAll(String url) throws IOException {
    		synchronized (STATE_LOCK) {
    			AllUrls.add(url);
    		}
    		try{
    			Document doc = Jsoup.connect(url).timeout(FETCH_TIMEOUT_MS).get();
    			Elements links=doc.select("a[href]");
    			for(Element link :links){
    				String target = toCrawlableUrl(link.attr("href"));
    				if (target == null) {
    					continue;
    				}
    				synchronized (STATE_LOCK) {
    					if (!get.contains(target)) {
    						get.add(target);
    					}
    				}
    			}
    			String html=doc.html();
    			int station[]=new int[JsName.size()];
    			for(int i=0;i<station.length;i++){
    				station[i] = html.contains(JsName.get(i)) ? 1 : 0;
    			}
    			a.witer(url,station);
    		}catch(Exception e){
    			// toString() keeps the exception type; getMessage() alone may be null.
    			synchronized (STATE_LOCK) {
    				Error.add(url+"\t"+e);
    			}
    		}
    	}
    
    	/**
    	 * Appends one report row: the URL followed by every flag, tab separated.
    	 *
    	 * @param url     page the row describes
    	 * @param station one 0/1 flag per entry of {@link #JsName}
    	 * @throws IOException if the report file cannot be written
    	 */
    	private void witer(String url, int[] station) throws IOException {
    		StringBuilder row = new StringBuilder(url);
    		for (int flag : station) {
    			row.append('\t').append(flag);
    		}
    		synchronized (REPORT_LOCK) {
    			File report = new File("./src/GetUrls/"+sitename+"report.txt");
    			BufferedWriter out = new BufferedWriter(new FileWriter(report, true));
    			try {
    				out.write(row.toString());
    				out.newLine();
    			} finally {
    				out.close();
    			}
    		}
    	}
    
    	/** Reads one script name per line from {@code source} into {@link #JsName}. */
    	private static void loadJsNames(File source) throws IOException {
    		BufferedReader in = new BufferedReader(new FileReader(source));
    		try {
    			String line;
    			while ((line = in.readLine()) != null) {
    				JsName.add(line);
    			}
    		} finally {
    			in.close();
    		}
    	}
    
    	/** Appends all recorded failures to the error file; does nothing when there are none. */
    	private static void writeErrors() throws IOException {
    		ArrayList<String> failures;
    		synchronized (STATE_LOCK) {
    			failures = new ArrayList<String>(Error);
    		}
    		if (failures.isEmpty()) {
    			return;
    		}
    		File target = new File("./src/GetUrls/"+sitename+"error.txt");
    		BufferedWriter out = new BufferedWriter(new FileWriter(target, true));
    		try {
    			for (String failure : failures) {
    				out.write(failure);
    				out.newLine();
    			}
    		} finally {
    			out.close();
    		}
    	}
    
    	/**
    	 * Moves every discovered link that has not been visited yet into {@link #tmp}.
    	 *
    	 * @return true if {@link #tmp} holds at least one URL afterwards
    	 */
    	private static boolean queueUnvisited() {
    		synchronized (STATE_LOCK) {
    			for (String found : get) {
    				if (!AllUrls.contains(found)) {
    					tmp.add(found);
    				}
    			}
    			return !tmp.isEmpty();
    		}
    	}
    
    	/**
    	 * Maps an href to the absolute URL to crawl, or null when the link is skipped.
    	 * Site-relative links (leading "/") are prefixed with the site root; absolute links
    	 * are kept only when they start with {@code http://www.<sitename>}.
    	 */
    	private static String toCrawlableUrl(String href) {
    		// The length guard also covers the empty href, which previously threw in substring().
    		if (href == null || href.length() <= 2 || isExcluded(href)) {
    			return null;
    		}
    		if (href.startsWith("/")) {
    			return "http://www."+sitename+".com"+href;
    		}
    		if (href.startsWith("http://www."+sitename)) {
    			return href;
    		}
    		return null;
    	}
    
    	/** Returns true when {@code href} contains any of the excluded fragments. */
    	private static boolean isExcluded(String href) {
    		for (String fragment : EXCLUDED_FRAGMENTS) {
    			if (href.contains(fragment)) {
    				return true;
    			}
    		}
    		return false;
    	}
    }
    

      

    package GetUrls;
    
    import java.io.IOException;
    
    public class runer implements Runnable {
    	String url=null;
    	public runer(String s) {
    		// TODO Auto-generated constructor stub
    		this.url=s;
    	}
    
    	@Override
    	public void run() {
    		// TODO Auto-generated method stub
    		GetIt a=new GetIt();
    		try {
    			a.getAll(url);
    		} catch (IOException e) {
    			// TODO Auto-generated catch block
    			e.printStackTrace();
    		}
    	}
    
    }
    

      

  • 相关阅读:
    【交互稿】sample
    【公开数据】网站
    【交互】规范
    【Flask】https
    【Flask】run with ssl /https
    需求模版
    低功耗蓝牙BLE外围模式(peripheral)-使用BLE作为服务端
    AIDL示例
    Android使用BLE(低功耗蓝牙,Bluetooth Low Energy)
    Android网络访问库
  • 原文地址:https://www.cnblogs.com/leonxiaosi/p/3193793.html
Copyright © 2020-2023  润新知