• 记一次爬取需要登录后才能访问的页面数据的 demo


    一:工程概况

    注意:

    二:涉及到的类

    package com.bigdata.crawler;
    
    import java.io.IOException;
    import java.util.List;
    
    import org.apache.commons.io.IOUtils;
    import org.apache.http.HttpEntity;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.openqa.selenium.By;
    import org.openqa.selenium.Keys;
    import org.openqa.selenium.WebElement;
    import org.openqa.selenium.chrome.ChromeDriver;
    import org.openqa.selenium.firefox.FirefoxDriver;
    import org.openqa.selenium.interactions.Actions;
    
    import com.bigdata.util.DriverCommon;
    
    /**
     * Demo crawler for a page that can only be read after submitting a password.
     *
     * <p>It drives a Chrome browser through Selenium to open {@code baseUrl}, type the
     * password and click the login image, then parses the resulting page source with
     * Jsoup and prints the {@code href} of every {@code a.blue12} link to standard output.
     */
    public class CnzzCrawler {
        /** Pause inserted after each browser interaction, in milliseconds. */
        private static final long PAUSE_MILLIS = 1000L;

        /** Login page that is opened first. */
        private final String baseUrl = "http://new.cnzz.com/v1/login.php?siteid=1262437219";

        /**
         * View password typed into the login form.
         * NOTE(review): the value looks like a redacted placeholder; supply the real one
         * from configuration rather than committing it to source.
         */
        private final String password = "******";

        /** Browser session used for the crawl; {@code null} when the no-arg constructor was used. */
        private final ChromeDriver driver;

        /**
         * Creates a crawler without a browser session. {@link #start()} cannot be used on
         * such an instance; kept only for backward compatibility.
         */
        public CnzzCrawler() {
            this.driver = null;
        }

        /**
         * Creates a crawler that drives the given browser session.
         *
         * @param driver the Chrome session to use; it is shut down when {@link #start()} ends
         */
        public CnzzCrawler(ChromeDriver driver) {
            this.driver = driver;
        }

        /**
         * Logs in, prints every {@code a.blue12} link target found on the page shown
         * afterwards, and shuts the browser session down.
         *
         * @throws IllegalStateException if this crawler was created without a driver
         */
        public void start() {
            if (driver == null) {
                throw new IllegalStateException("CnzzCrawler was created without a ChromeDriver");
            }
            try {
                // Open the login page and type the password.
                driver.get(baseUrl);
                driver.findElement(By.id("password")).sendKeys(password);
                pause();

                // Click the login image. Full path of the element on the page:
                // body > div.pwdmain > div.pwdcheck > div.pwdcheck4 > div:nth-child(1) > form > div:nth-child(2) > img
                driver.findElement(By.cssSelector(
                        "div.pwdcheck4 > div:nth-child(1) > form > div:nth-child(2) > img")).click();
                pause();

                // Parse whatever page the browser shows now and list the link targets.
                Document doc = Jsoup.parse(driver.getPageSource());
                for (Element link : doc.select("a.blue12")) {
                    System.out.println(link.attr("href"));
                }
            } finally {
                // quit() rather than close(): it ends the whole session, and running it in
                // finally keeps a failed lookup from leaving the browser process behind.
                driver.quit();
            }
        }

        /**
         * Sleeps for {@link #PAUSE_MILLIS}; if interrupted, restores the interrupt flag and
         * returns early so the caller can still observe the interruption.
         */
        private static void pause() {
            try {
                Thread.sleep(PAUSE_MILLIS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }

        /**
         * Points Selenium at the chromedriver binary chosen by {@code DriverCommon} for the
         * current operating system, starts Chrome and runs the crawl.
         *
         * @param args unused
         * @throws IOException declared by {@code DriverCommon.getDriverName}
         */
        public static void main(String[] args) throws IOException {
            System.setProperty("webdriver.chrome.driver", DriverCommon.getDriverName(DriverCommon.getOSType()));

            ChromeDriver driver = new ChromeDriver();
            new CnzzCrawler(driver).start();
        }

    }
    

      

    package com.bigdata.util;
    
    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    
    /**
     * Helpers for choosing the chromedriver binary name that matches the host operating system.
     */
    public class DriverCommon {
        /**
         * Detects the operating system type.
         *
         * <p>Returns {@code "mac"} or {@code "win"} when the {@code os.name} system property
         * contains "Mac" or "Win". Otherwise it runs {@code getconf LONG_BIT} and returns
         * {@code "linux64"} when the first output line contains "64", else {@code "linux32"}.
         * If the command cannot be run or prints nothing, {@code "linux64"} is returned.
         *
         * @return one of {@code "mac"}, {@code "win"}, {@code "linux32"}, {@code "linux64"}
         */
        public static String getOSType() {
            String osName = System.getProperty("os.name");
            if (osName.contains("Mac")) {
                return "mac";
            }
            if (osName.contains("Win")) {
                return "win";
            }
            try {
                Process process = Runtime.getRuntime().exec(new String[] {"getconf", "LONG_BIT"});
                // try-with-resources so the pipe to the child process is always closed.
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                        process.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                    String firstLine = reader.readLine();
                    // readLine() yields null when the command printed nothing; use the default then.
                    if (firstLine == null || firstLine.contains("64")) {
                        return "linux64";
                    }
                    return "linux32";
                }
            } catch (IOException e) {
                e.printStackTrace();
                return "linux64"; // default to 64-bit Linux
            }
        }

        /**
         * Returns the chromedriver file name for an operating system type.
         *
         * <p>Both the values produced by {@link #getOSType()} ({@code "linux32"},
         * {@code "linux64"}) and the underscore spellings ({@code "linux_32"},
         * {@code "linux_64"}) are accepted. Any unrecognised value maps to the 64-bit Linux driver.
         *
         * @param os operating system type, may be {@code null}
         * @return the driver file name, or {@code null} when {@code os} is {@code null}
         * @throws IOException never thrown by this implementation; kept in the signature
         *         for compatibility with existing callers
         */
        public static String getDriverName(String os) throws IOException {
            if (os == null) {
                return null;
            }
            switch (os) {
                case "win":
                    return "chromedriver.exe";
                case "mac":
                    return "chromedriver_mac";
                case "linux32":
                case "linux_32":
                    return "chromedriver_linux32";
                case "linux64":
                case "linux_64":
                default:
                    return "chromedriver_linux64";
            }
        }
    }
    

      

  • 相关阅读:
    FTP和SSH的区别
    Hadoop之回收站
    什么是簇?
    linux中环境变量的配置
    windows系统中的系统变量和用户变量,以及配置JDK中各个参数的意义
    linux 中yum和rpm 总结
    ajax请求之async:false/true的作用
    JavaScript eval() 函数的用法
    js模式
    数组的一些操作
  • 原文地址:https://www.cnblogs.com/ipetergo/p/7102290.html
Copyright © 2020-2023  润新知