• webcollector + selenium 爬取空间相册图片


      1 package cn.hb.util;
      2 
      3 import java.io.File;
      4 import java.io.FileNotFoundException;
      5 import java.io.FileWriter;
      6 import java.io.IOException;
      7 import java.util.ArrayList;
      8 import java.util.List;
      9 import java.util.Set;
     10 import java.util.UUID;
     11 import java.util.concurrent.TimeUnit;
     12 
     13 import org.apache.commons.io.IOUtils;
     14 import org.openqa.selenium.By;
     15 import org.openqa.selenium.Cookie;
     16 import org.openqa.selenium.JavascriptExecutor;
     17 import org.openqa.selenium.Keys;
     18 import org.openqa.selenium.WebDriver;
     19 import org.openqa.selenium.WebElement;
     20 import org.openqa.selenium.firefox.FirefoxDriver;
     21 import org.openqa.selenium.firefox.FirefoxOptions;
     22 import org.openqa.selenium.interactions.Actions;
     23 import cn.edu.hfut.dmic.webcollector.conf.Configuration;
     24 import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
     25 import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
     26 import cn.edu.hfut.dmic.webcollector.model.Page;
     27 import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
     28 import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
     29 import cn.edu.hfut.dmic.webcollector.util.FileUtils;
     30 
     31 /**
     32  * 爬取空间图片 selenium登录后提取链接给webcollector处理即可
     33  * 
     34  * @author tele
     35  *
     36  */
     37 public class QZoneCrawler extends BreadthCrawler {
     38     static String url = "https://user.qzone.qq.com/qq号";
     39     static String cookies = "";
     40     static final int pageSize = 98;
     41     static List<String> crawdataList = new ArrayList<String>();
     42     static File baseDir = new File("F:/qz/image");
     43 
     44     public QZoneCrawler(String crawlPath, boolean autoParse) {
     45         super(crawlPath, autoParse);
     46     }
     47 
     48     @Override
     49     public void visit(Page page, CrawlDatums next) {
     50         try {
     51             Thread.sleep(3000);
     52         } catch (InterruptedException e) {
     53             e.printStackTrace();
     54         }
     55         String name = UUID.randomUUID().toString() + ".jpg";
     56         try {
     57             FileUtils.write(new File(baseDir, name), page.content());
     58         } catch (FileNotFoundException e) {
     59             e.printStackTrace();
     60         } catch (IOException e) {
     61             e.printStackTrace();
     62         }
     63     }
     64 
     65     String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0";
     66 
     67     // 设置cookies
     68     @Override
     69     public Page getResponse(CrawlDatum crawlDatum) throws Exception {
     70         HttpRequest request = new HttpRequest(crawlDatum);
     71         request.setCookie(cookies);
     72         request.setUserAgent(userAgent);
     73         return request.responsePage();
     74     }
     75 
     76     public static void main(String[] args) throws Exception {
     77 
     78         QZoneCrawler qz = new QZoneCrawler("F:/qz/image/webcollector", true);
     79 
     80         Configuration conf = Configuration.copyDefault();
     81         conf.setAutoDetectImg(true);
     82         conf.setConnectTimeout(5000);
     83         conf.setReadTimeout(10000);
     84 
     85         // 线程爬取间隔
     86         conf.setExecuteInterval(5000);
     87         qz.setConf(conf);
     88         qz.setThreads(100);
     89 
     90         login();
     91         qz.addSeed(crawdataList);
     92         qz.start(1);
     93 
     94     }
     95 
     96     /**
     97      * 登录
     98      * 
     99      * @throws InterruptedException
    100      * @throws IOException
    101      */
    102     public static void login() throws InterruptedException, IOException {
    103         System.setProperty("webdriver.gecko.driver", "D:/browserdriver/geckodriver.exe");
    104 
    105         FirefoxOptions options = new FirefoxOptions();
    106         options.setBinary("F:/ff/firefox.exe");
    107 
    108         WebDriver driver = new FirefoxDriver(options);
    109         driver.manage().window().maximize();
    110         // 超时
    111         try {
    112             driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS);
    113             driver.manage().timeouts().setScriptTimeout(3, TimeUnit.SECONDS);
    114             driver.get(url);
    115         } catch (Exception e) {
    116             System.out.println("所需元素已出现,停止加载页面");
    117         } finally {
    118             // 切换到登录login
    119             driver.switchTo().frame("login_frame");
    120 
    121             WebElement switcher_plogin = driver.findElement(By.id("switcher_plogin"));
    122             System.out.println(switcher_plogin.getText());
    123             if (switcher_plogin.isDisplayed()) {
    124                 switcher_plogin.click();
    125             }
    126             // 用户名
    127             driver.findElement(By.id("u")).clear();
    128             driver.findElement(By.id("u")).sendKeys("账号");
    129 
    130             // 密码
    131             driver.findElement(By.id("p")).clear();
    132             driver.findElement(By.id("p")).sendKeys("密码");
    133 
    134             // 登录
    135             try {
    136                 driver.findElement(By.id("login_button")).click();
    137                 Thread.sleep(3000);
    138             } catch (Exception e) {
    139                 e.printStackTrace();
    140             } finally {
    141                 if ("https://i.qq.com/".equals(driver.getCurrentUrl())) {
    142                     System.out.println("登录失败!5秒后再次尝试登录");
    143                     Thread.sleep(5000);
    144                     driver.findElement(By.id("login_button")).click();
    145                 }
    146             }
    147 
    148             // 退出frame
    149             driver.switchTo().defaultContent();
    150 
    151             System.out.println(driver.getCurrentUrl());
    152 
    153             JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;
    154 
    155             // 如果有亲密度提示
    156             /*
    157              * try { WebElement fs_guide = driver.findElement(By.xpath(
    158              * "//div[@id='friendship_promote_layer']/table[@class='tbl-fs-guide']//a"
    159              * )); if(fs_guide != null && fs_guide.isDisplayed()) {
    160              * fs_guide.click(); } } catch (Exception e) { e.printStackTrace();
    161              * }finally {
    162              * 
    163              * }
    164              */
    165 
    166             // 点击相册
    167             driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();
    168 
    169             Thread.sleep(2000);
    170 
    171             // 切换到frame
    172             driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
    173 
    174             // 进入图片列表(说说相册)
    175             // driver.findElement(By.xpath("//ul[@class='js-album-list-ul']/li[1]/div[1]/div[1]/a")).click();
    176 
    177             // 拼接cookie
    178             StringBuilder builder = new StringBuilder();
    179             Set<Cookie> cookieSet = driver.manage().getCookies();
    180             cookieSet.forEach(c -> builder.append(c.getName()).append("=").append(c.getValue()).append("; "));
    181             cookies = builder.toString();
    182 
    183             // 获得相册列表
    184             List<WebElement> photoList = driver.findElements(By.xpath("//ul[@class='js-album-list-ul']/li"));
    185             if (photoList == null || photoList.size() == 0) {
    186                 throw new RuntimeException("定位相册列表元素失败!");
    187             }
    188 
    189             // 构造不同相册的xpath路径
    190             List<String> xpathList = new ArrayList<String>();
    191             for (int i = 0; i < photoList.size(); i++) {
    192                 xpathList.add("//ul[@class='js-album-list-ul']/li[" + (i + 1) + "]");
    193             }
    194 
    195             // 窗口句柄
    196             List<String> allHandles = new ArrayList<String>(driver.getWindowHandles());
    197 
    198             // 遍历xpath
    199             String newUrl = driver.getCurrentUrl();
    200             for (int i = 0; i < xpathList.size(); i++) {
    201                 // 打开新标签页
    202                 jsExecutor.executeScript("window.open('" + newUrl + "');");
    203                 allHandles = new ArrayList<String>(driver.getWindowHandles());
    204 
    205                 Thread.sleep(2000);
    206                 String xpath = xpathList.get(i);
    207 
    208                 // 句柄切换需要时间
    209                 driver.switchTo().window(allHandles.get(i + 1));
    210                 Thread.sleep(2000);
    211 
    212                 List<String> urlList = getImageUrl(driver, xpath);
    213                 if (urlList == null) {
    214                     break;
    215                 }
    216                 crawdataList.addAll(urlList);
    217             }
    218 
    219             System.out.println("所有相册图片链接提取完毕,退出浏览器");
    220             driver.quit();
    221 
    222         }
    223     }
    224 
    225     /**
    226      * 提取图片url
    227      * 
    228      * @param driver
    229      * @param xpath
    230      * @throws InterruptedException
    231      * @throws IOException
    232      */
    233     public static List<String> getImageUrl(WebDriver driver, String xpath) throws InterruptedException, IOException {
    234         List<String> urlList = new ArrayList<String>();
    235 
    236         // 点击相册
    237         driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();
    238 
    239         // 切换到图片的frame
    240         driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
    241         Thread.sleep(1000);
    242 
    243         // 获得相册名称
    244         String photo_name = driver.findElement(By.xpath(xpath + "//a[@class='c-tx2 js-album-desc-a']")).getText();
    245 
    246         //// 文件夹检测
    247         File imageUrl = new File("f:/qz/" + photo_name + ".txt");
    248         if (!imageUrl.getParentFile().exists()) {
    249             imageUrl.mkdirs();
    250         } else {
    251             imageUrl.delete();
    252         }
    253 
    254         // 获得图片总数,每页最多98张图片
    255         WebElement span = driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a" + "/span"));
    256         String text = span.getText();
    257         int count = Integer.parseInt(text);
    258 
    259         // 进入列表
    260         driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a")).click();
    261         Thread.sleep(3000);
    262 
    263         // 计算页数
    264         int totalPage = (int) Math.ceil((double) count / (double) pageSize);
    265         System.out.println(photo_name + "图片总数为----" + count + "张,共计---" + totalPage + "页");
    266 
    267         FileWriter fileWriter = new FileWriter(imageUrl, true);
    268      Actions actions = new Actions(driver);
    269         for (int i = 0; i < totalPage; i++) {
    270 
    271             // 模拟按键加载图片
    272      //       Actions actions = new Actions(driver);
    273             for (int j = 0; j < 50; j++) {
    274                 if (j % 5 == 0) {
    275                     Thread.sleep(1000);
    276                 }
    277                 actions.sendKeys(Keys.ARROW_DOWN).perform();
    278             }
    279 
    280             // 提取本页的image链接
    281             List<WebElement> list = driver.findElements(
    282                     By.xpath("//a[@class='item-cover j-pl-photoitem-imgctn']/img[@class='j-pl-photoitem-img']"));
    283             if (list == null || list.size() == 0) {
    284                 // 相册无权限访问或定位失败
    285                 System.out.println("无法提取图片链接!");
    286                 return null;
    287             }
    288             for (WebElement element : list) {
    289                 String src = element.getAttribute("src") + "
    ";
    290                 IOUtils.write(src, fileWriter);
    291                 System.out.println(src);
    292                 // 添加链接
    293                 urlList.add(src);
    294             }
    295             System.out.println("第" + (i + 1) + "页图片链接提取完毕");
    296             Thread.sleep(1000);
    297             // 跳转到下一页
    298             if ((i + 2) <= totalPage) {
    299                 driver.findElement(By.xpath("//a[@id='pager_num_1_" + (i + 2) + "']")).click();
    300                 ;
    301             }
    302         }
    303 
    304         fileWriter.close();
    305         return urlList;
    306     }
    307 
    308 }

    运行环境与上篇博文相同(https://www.cnblogs.com/tele-share/p/9595265.html)。爬取结果:

  • 相关阅读:
    Python之面向对象新式类和经典类
    Python之面向对象继承和派生
    Python之面向对象类和对象
    Python之面向对象的程序设计
    Python之面向对象函数式编程
    Python之内置函数
    列表解析与生成器表达式
    03: 交换机基本原理与配置
    02: 网络布线与数制转换
    01:数据封装解封的过程
  • 原文地址:https://www.cnblogs.com/tele-share/p/9610791.html
Copyright © 2020-2023  润新知