• 爬虫验证码破解任务


    之前爬取的网站中,有一部分需要先输入验证码才能点击下载图片,所以查阅了一些与破解验证码相关的资料。

    思路:先将验证码图片保存到临时文件中,再调用识别工具破解,然后通过 Selenium 将识别出的验证码模拟输入到文本框中,最后模拟点击提交按钮完成验证。

    附上代码:

    // Simulate a click on the "free download" button.
    driver.findElement(By.id("detail_free_download_btn")).click();
    try {
        // Pause briefly before inspecting the page for a captcha dialog.
        TimeUnit.SECONDS.sleep(1);
    } catch (InterruptedException e) {
        e.printStackTrace();
        // Restore the interrupt flag so the calling thread can still observe the interruption.
        Thread.currentThread().interrupt();
    }
    // Check whether a captcha dialog popped up.
    Document checkDoc = Jsoup.parse(driver.getPageSource());
    if (checkDoc != null) {
        Element verifyElement = checkDoc.select("div.visit-tc-main").first();
        if (verifyElement != null) { // the captcha dialog is present
            crackCaptcha(checkDoc, url);
            // Click "free download" again now that the captcha has been handled.
            driver.findElement(By.id("detail_free_download_btn")).click();
            try {
                TimeUnit.SECONDS.sleep(1);
            } catch (InterruptedException e) {
                e.printStackTrace();
                // Restore the interrupt flag so the calling thread can still observe the interruption.
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
    * 破解验证码
    * @param document
    * @param accountUrl
    * @return
    */
    private void crackCaptcha(Document document, String accountUrl) {
    if (document != null && document.text().contains("输入验证码")) {
    File captchaFile = screenshotCaptcha(driver, driver.findElement(By.id("download_verify_code"))); //下载验证码图片到本地
    String captcha = recognizeCaptcha(captchaFile);  //识别验证码
    LOG.info("Recognizing captcha {}", captcha);
    // 输入验证码
    WebElement webElement = driver.findElement(By.id("download-verify-input-code"));
    webElement.sendKeys(captcha);
    // 提交按钮
    driver.findElement(By.id("check-download-verify-code")).click();
    try {
    TimeUnit.SECONDS.sleep(2);
    } catch (InterruptedException e) {
    e.printStackTrace();
    }
    }
    }

    /**
     * Takes a screenshot, crops it to the given element and stores the crop as a PNG file.
     *
     * <p>The file is written to a {@code verify} directory under {@code java.io.tmpdir} with a
     * random UUID name. Every file already present in that directory is deleted first.
     *
     * @param driver  the web driver; must also implement {@code TakesScreenshot}
     * @param element the element (the captcha image) to crop out of the screenshot
     * @return the PNG file containing the cropped element, or {@code null} if the directory could
     *         not be prepared or the screenshot could not be taken, cropped or written
     */
    private File screenshotCaptcha(WebDriver driver, WebElement element) {
        // Use File(parent, child): java.io.tmpdir is not guaranteed to end with a path separator,
        // so plain string concatenation can yield e.g. "/tmpverify".
        File verifyDir = new File(System.getProperty("java.io.tmpdir"), "verify");
        if (!verifyDir.isDirectory() && !verifyDir.mkdirs()) {
            return null;
        }
        // listFiles() returns null on an I/O error, so guard before iterating.
        File[] staleFiles = verifyDir.listFiles();
        if (staleFiles != null) {
            for (File stale : staleFiles) {
                stale.delete();
            }
        }
        try {
            File screenshot = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
            BufferedImage page = ImageIO.read(screenshot);
            if (page == null) {
                // ImageIO.read returns null when no registered reader can decode the file.
                return null;
            }
            // NOTE(review): the crop assumes the element's location is expressed in screenshot
            // pixel coordinates; confirm for scrolled pages and scaled (HiDPI) displays.
            Point origin = element.getLocation();
            int width = element.getSize().getWidth();
            int height = element.getSize().getHeight();
            BufferedImage captcha = page.getSubimage(origin.getX(), origin.getY(), width, height);
            // Write the crop straight to its destination instead of overwriting the screenshot
            // and copying it afterwards.
            File target = new File(verifyDir, UUID.randomUUID().toString() + ".png");
            if (!ImageIO.write(captcha, "png", target)) {
                return null;
            }
            return target;
        } catch (Exception e) {
            // Broad catch kept on purpose: the driver, the image I/O and an out-of-bounds crop
            // can all fail here, and the caller treats a null result as "no captcha image".
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Sends a captcha image to the jisuapi recognition service and returns the recognized text.
     *
     * <p>The image is Base64-encoded and posted as the {@code pic} parameter. The recognized text
     * is read from {@code result.code} of the JSON response.
     *
     * @param file the captcha image file; may be {@code null}
     * @return the recognized captcha text, or an empty string when {@code file} is {@code null},
     *         the request fails, or the response does not contain a textual {@code result.code};
     *         never {@code null}
     */
    @SuppressWarnings("unchecked") // the JSON response is parsed into a raw Map
    public String recognizeCaptcha(File file){
        if (file == null) {
            return "";
        }
        try {
            String base64Image = Base64.encodeBase64String(FileUtils.readFileToByteArray(file));
            // Captcha recognition endpoint of the jisuapi platform (a paid service), e.g.
            // http://api.jisuapi.com/captcha/recognize?appkey=yourappkey&type=n4
            // NOTE(review): this is plain HTTP, so the appkey and the image travel unencrypted;
            // switch to HTTPS if the provider supports it.
            String requestUrl = "http://api.jisuapi.com/captcha/recognize";
            Map<String,String> parameters = Maps.newHashMap();
            parameters.put("appkey", "*******");
            parameters.put("type", "***");
            parameters.put("pic",base64Image);

            // NOTE(review): 5000 and 20000 are presumably connect/read timeouts in ms; confirm against WebUtil.
            String response = WebUtil.post(requestUrl, parameters,5000,20000);
            logger.info("Response is {}",response);
            Map<String,Object> data = JsonUtil.fromJson(response, Map.class);
            if (data == null) {
                return "";
            }
            // Check the runtime type before casting: a response whose "result" is not a JSON
            // object would otherwise raise a ClassCastException that nothing here catches.
            Object payload = data.get("result");
            if (!(payload instanceof Map)) {
                return "";
            }
            Object code = ((Map<?, ?>) payload).get("code");
            if (code instanceof String) {
                return (String) code;
            }
            return "";
        } catch (IOException e) {
            logger.error("Failed to recognize captcha", e);
            return "";
        }
    }

  • 相关阅读:
    GPT(4kb硬盘) 单硬盘装变色龙、GAH61MAD2V、ALC887VD、HD6570成功驱动经验(转)
    unable to dequeue a cell with identifier Cell must register a nib or a class for the identifier or connect a prototype cell in a storyboard'
    2013.5.29
    平等博弈
    组合数学
    哈密顿+欧拉图
    差分约束
    11.11
    如何直接跳出多重循环
    摘要:数组练习与部分字符串练习
  • 原文地址:https://www.cnblogs.com/yzf666/p/7055460.html
Copyright © 2020-2023  润新知