• 使用 selenium 实现谷歌以图搜图爬虫(爬取大图)


    实现思路

    原理非常简单,就是利用selenium去操作浏览器,获取到想要的链接,然后进行图片的下载,和一般的爬虫无异。

    用到的技术:multiprocessing,selenium,xpath,requests

    以下按照代码执行的顺序进行讲解。

    首先导入需要的包

    # coding=utf-8
    import base64
    import hashlib
    import os
    import re
    import shutil
    import time
    from multiprocessing import Pool, cpu_count
    
    import requests
    import tqdm
    from colorama import Fore
    from selenium import webdriver
    from selenium.common.exceptions import (ElementNotVisibleException,
                                            StaleElementReferenceException)
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    

    定义一个 run()函数,作为入口。这里使用多进程技术,同时打开多个浏览器进行图片爬取。

    def run():
        """Entry point: reverse-search every image under ``./upload`` in parallel.

        The image list is split into contiguous chunks and each non-empty chunk
        is handed to ``download_task`` in its own worker process.  At most
        ``cpu_count()`` workers are started, and never more than there are
        images.

        On platforms that spawn worker processes (e.g. Windows) this function
        must be called from under an ``if __name__ == "__main__":`` guard.

        Raises:
            Exception: whatever a worker raised; ``Pool.map`` re-raises it in
                the parent instead of silently discarding it.
        """
        filelist = []
        upload = r"./upload"  # folder holding the images to upload
        getfilelist(upload, filelist)  # recursively collect every image file
        if not filelist:
            print("No image found under {}".format(upload))
            return

        # Every worker opens its own browser window, so do not start workers
        # that would have nothing to do.
        num_process = min(cpu_count(), len(filelist))
        chunks = [chunk for chunk in partition(filelist, num_process) if chunk]

        pool = Pool(len(chunks))
        try:
            # map() blocks until all chunks are done and surfaces worker
            # errors; a fire-and-forget map_async() would swallow them.
            pool.map(download_task, chunks)
        finally:
            pool.close()  # no further tasks may be submitted
            pool.join()  # wait for the workers to exit
    

    其中 getfilelist() 函数用于递归查找文件夹下的所有图片文件,这种写法在工作中很常用。

    # File extensions (lower case, leading dot included) accepted as images.
    EXTEND = [".bmp", ".jpg", ".jpeg", ".tif", ".tiff",
              ".jfif", ".png", ".gif", ".iff", ".ilbm"]


    def is_img(img_path):
        """Return True when *img_path* ends with a known image extension.

        Only the file name suffix is inspected; the file is never opened.
        The comparison is case-insensitive, so ``photo.JPG`` is accepted just
        like ``photo.jpg``.

        Args:
            img_path: file name or path to test.

        Returns:
            bool: True if the extension is listed in ``EXTEND``.
        """
        ext = os.path.splitext(img_path)[1].lower()
        return ext in EXTEND
            
    def getfilelist(path, filelist):
        """Recursively collect image files below *path*.

        Directories are descended into in ``os.listdir`` order; every
        non-directory entry that ``is_img`` accepts is appended to *filelist*
        as ``os.path.join(<its directory>, <its name>)``.

        Args:
            path: directory to scan.
            filelist: list that receives the matching paths (mutated in place).

        Returns:
            None; results are delivered through *filelist*.
        """
        for entry in os.listdir(path):
            full_path = os.path.join(path, entry)
            if os.path.isdir(full_path):
                getfilelist(full_path, filelist)
                continue
            if is_img(entry):
                filelist.append(full_path)
    

    partition()函数用于将一个列表均分为几份,以便实现多进程。

    def partition(ls, size):
        """Split *ls* into exactly *size* contiguous, near-equal chunks.

        Chunk lengths differ by at most one: the first ``len(ls) % size``
        chunks get one extra element.  Concatenating the chunks in order
        reproduces *ls*.  When *ls* has fewer than *size* elements the trailing
        chunks are empty lists.

        Args:
            ls: sequence to split (anything that supports ``len`` and slicing).
            size: number of chunks to produce; must be >= 1.

        Returns:
            list: *size* slices of *ls*.

        Raises:
            ValueError: if *size* is smaller than 1.
        """
        if size < 1:
            raise ValueError("size must be a positive integer")

        base, extra = divmod(len(ls), size)
        result = []
        start = 0
        for i in range(size):
            # Spread the remainder one element at a time over the first
            # chunks instead of piling all of it onto the last one.
            end = start + base + (1 if i < extra else 0)
            result.append(ls[start:end])
            start = end
        return result
    

    download_task()为具体的下载任务,一个task实例化一个GoogleSearcher类,遍历自己的图片列表进行以图搜图。

    def download_task(filelist):
        """Worker job: reverse-search every image in *filelist* with one browser.

        A single ``GoogleSearcher`` (and therefore a single browser window) is
        created for the whole list and its results go to ``./download``.

        A failure on one image is reported and the remaining images are still
        processed.  The browser is always shut down when the list is finished,
        so no Chrome/chromedriver process is left behind.

        Args:
            filelist: iterable of image paths to upload.
        """
        searcher = GoogleSearcher(
            download=r"./download")
        try:
            for file in filelist:
                try:
                    # Upload one image and run the search-by-image flow.
                    searcher.simple_file_run(file)
                except Exception as e:
                    # Task boundary: report and move on to the next image.
                    print("Failed to process {} - {}".format(file, e))
        finally:
            searcher.driver.quit()
    

    GoogleSearcher类比较长,在注释中进行讲解。

    # Login name of the current user, used to build the Chrome profile path.
    # 'USERNAME' is only set on Windows; fall back to 'USER' and finally to an
    # empty string so that importing this module never raises KeyError.
    USERNAME = os.environ.get('USERNAME') or os.environ.get('USER', '')
    class GoogleSearcher:
        """Drive Chrome through Google "search by image" and save the results.

        One instance owns one browser window (``self.driver``).  The typical
        flow is ``simple_file_run(path)``: upload the image, open the "similar
        images" list, walk through the previews collecting the full-size image
        links, then download them.

        The element ids, class names and XPaths used below are tied to the
        Google page markup at the time of writing.
        """

        # Upper bound, in seconds, on waiting for a preview's loading bar to
        # disappear before taking whatever ``src`` the image currently has.
        LOADING_TIMEOUT = 10

        def __init__(self, download="download", sleep_time=1):
            """Create the download folder and start a Chrome browser.

            Args:
                download: root folder that receives the downloaded images.
                sleep_time: seconds to pause after the result page has loaded.
            """
            super().__init__()
            self._download = download  # root download folder
            self.sleep_time = sleep_time  # pause after the result page loads
            # Browser-like headers sent with every image download request.
            self.header = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}

            os.makedirs(self._download, exist_ok=True)  # create download folder

            self.option = webdriver.ChromeOptions()
            # Optional: reuse the local Chrome profile.
            # self.option.add_argument("--user-data-dir=" + f"C:/Users/{USERNAME}/AppData/Local/Google/Chrome/User Data/")
            # Optional: headless mode (the original author notes it may fail).
            # self.option.add_argument("headless")
            self.option.add_argument("disable-gpu")
            self.driver = webdriver.Chrome(options=self.option)

        def _wait_visible(self, locator, timeout):
            """Wait until the element at *locator* is visible and return it."""
            condition = EC.visibility_of_element_located(locator)
            return WebDriverWait(self.driver, timeout=timeout,
                                 poll_frequency=0.5).until(condition)

        def upload_img_get_html(self, file):
            """Upload *file* to Google Images and return the result page HTML.

            Args:
                file: path of the image to upload.

            Returns:
                str: ``page_source`` of the search result page.

            Raises:
                selenium.common.exceptions.TimeoutException: if one of the
                    expected page elements does not become visible in time.
            """
            print(
                f"{Fore.GREEN} Begin to upload image {os.path.split(file)[1]} {Fore.RESET}")
            self.driver.get("https://www.google.com/imghp")

            # Wait for the camera button, then activate it.
            image_button = self._wait_visible((By.CLASS_NAME, "LM8x9c"), 20)
            image_button.send_keys(Keys.ENTER)

            # Wait for the "upload an image" tab and open it.
            self._wait_visible((By.ID, "dRSWfb"), 20)
            upload = self.driver.find_element(
                By.XPATH, '//*[@id="dRSWfb"]/div/a')
            upload.send_keys(Keys.ENTER)

            # The upload control is a plain <input>, so the file path can be
            # typed into it directly.  An absolute path is sent so the upload
            # does not depend on the browser driver's working directory.
            input_ = self._wait_visible((By.ID, 'awyMjb'), 10)
            input_.send_keys(os.path.abspath(file))
            print(f"{Fore.GREEN} uploaded {Fore.RESET}")

            # The browser navigates to the result page; wait for its top bar.
            self._wait_visible((By.XPATH, '//*[@id="top_nav"]'), 20)
            # Give the page a moment to settle.
            time.sleep(self.sleep_time)

            print(f"{Fore.GREEN} Finish download source code{Fore.RESET}")
            return self.driver.page_source

        def highlight(self, element):
            """Visually mark *element* in the browser (debugging aid)."""
            self.driver.execute_script(
                "arguments[0].setAttribute('style', arguments[1]);", element, "background: yellow; border: 2px solid red;")

        def wait_and_click(self, xpath, max_retries=5):
            """Click the element at *xpath*, refreshing the page on failure.

            Clicks sometimes fail for no apparent reason, so after a failure
            the page is refreshed and the click is attempted again.

            Args:
                xpath: XPath of the element to click.
                max_retries: how many times to refresh and retry after the
                    first failed attempt.

            Returns:
                The clicked element.

            Raises:
                Exception: the error of the last attempt once all retries are
                    used up.
            """
            last_error = None
            for attempt in range(max_retries + 1):
                try:
                    waiter = WebDriverWait(self.driver, 15)
                    elem = waiter.until(
                        EC.element_to_be_clickable((By.XPATH, xpath)))
                    elem.click()
                    self.highlight(elem)
                    return elem
                except Exception as e:
                    last_error = e
                    print('Click time out - {}'.format(xpath))
                    if attempt == max_retries:
                        break
                    print('Refreshing browser...')
                    self.driver.refresh()
                    time.sleep(2)
            raise last_error

        def get_extension_from_link(self, link, default='jpg'):
            """Derive a file extension from an image URL.

            Only the path component of the URL is looked at, so query strings
            and fragments (``a.png?w=100``) do not hide the extension.

            Args:
                link: image URL.
                default: value returned when the extension is missing or is
                    not one of jpg/jpeg/gif/png.

            Returns:
                str: ``'jpg'``, ``'gif'``, ``'png'`` or *default*.
            """
            from urllib.parse import urlparse

            url_path = urlparse(str(link)).path
            ext = os.path.splitext(url_path)[1].lstrip('.').lower()
            if ext in ('jpg', 'jpeg'):
                return 'jpg'
            if ext in ('gif', 'png'):
                return ext
            return default

        def base64_to_object(self, src):
            """Decode the payload of a base64 ``data:`` URI.

            Args:
                src: string of the form ``<header>,<base64 data>``.

            Returns:
                bytes: the decoded data.
            """
            _header, encoded = str(src).split(',', 1)
            return base64.decodebytes(encoded.encode('utf-8'))

        def download_images(self, links, download_dir):
            """Save every image in *links* into *download_dir*.

            Files are named after the position of the link (``0000.jpg``,
            ``0001.png``, ...).  Base64 JPEG/PNG ``data:`` URIs are decoded
            locally; everything else is fetched over HTTP with the browser-like
            headers from ``self.header``.  A failing link is reported and
            skipped; it never aborts the remaining downloads.

            Args:
                links: sequence of image URLs or ``data:`` URIs.
                download_dir: existing directory that receives the files.
            """
            total = len(links)
            for index, link in enumerate(links):
                link = str(link)
                # Links can be very long (data URIs); log at most 100 chars.
                print('Downloading {} : {} / {}'.format(
                    link[:100], index + 1, total))
                try:
                    if link.startswith('data:image/jpeg;base64'):
                        # base64-encoded JPEG
                        payload = self.base64_to_object(src=link)
                        ext = 'jpg'
                    elif link.startswith('data:image/png;base64'):
                        # base64-encoded PNG
                        payload = self.base64_to_object(src=link)
                        ext = 'png'
                    else:
                        # ordinary hyperlink, fetched below
                        payload = None
                        ext = self.get_extension_from_link(link=link)

                    path = os.path.join(
                        download_dir, str(index).zfill(4) + "." + ext)

                    if payload is not None:
                        with open(path, "wb") as f:
                            f.write(payload)
                    else:
                        with requests.get(link, headers=self.header,
                                          stream=True, timeout=5) as response:
                            # Do not store an HTTP error page as an image.
                            response.raise_for_status()
                            with open(path, "wb") as f:
                                for chunk in response.iter_content(
                                        chunk_size=8192):
                                    f.write(chunk)
                except requests.RequestException as e:
                    print('Download failed - ', e)
                except OSError as e:
                    print('Save failed - {}'.format(e))
                except Exception as e:
                    print('Download failed - ', e)

        def get_full_resolution_links(self):
            """Walk through the result previews and collect their image links.

            The first thumbnail is clicked to open the preview panel, then the
            RIGHT arrow key is sent repeatedly to step to the next image.  The
            walk stops once the page scroll position has stayed unchanged for
            30 consecutive steps.

            Returns:
                list: the collected ``src`` values, de-duplicated, in the order
                they were first seen.
            """
            print('[Full Resolution Mode]')
            time.sleep(1)
            body = self.driver.find_element(By.TAG_NAME, "body")
            print('Scraping links')
            self.wait_and_click('//div[@data-ri="0"]')
            time.sleep(1)
            links = []
            count = 1
            last_scroll = 0
            scroll_patience = 0
            while True:
                try:
                    # NOTE(review): the two nested lookups below use XPaths
                    # starting with '//', which Selenium evaluates against the
                    # whole document rather than relative to div_box - confirm
                    # this is intended before changing them to './/'.
                    xpath = '//div[@id="islsp"]//div[@class="v4dQwb"]'
                    div_box = self.driver.find_element(By.XPATH, xpath)
                    self.highlight(div_box)
                    xpath = '//img[@class="n3VNCb"]'
                    img = div_box.find_element(By.XPATH, xpath)
                    self.highlight(img)
                    xpath = '//div[@class="k7O2sd"]'
                    loading_bar = div_box.find_element(By.XPATH, xpath)
                    # Wait for the image to finish loading; if it has not, the
                    # src is still a base64-encoded picture.  The wait is
                    # bounded so a stuck loading bar cannot hang the crawl.
                    deadline = time.monotonic() + self.LOADING_TIMEOUT
                    while str(loading_bar.get_attribute('style')) != 'display: none;':
                        if time.monotonic() > deadline:
                            break
                        time.sleep(0.1)
                    src = img.get_attribute('src')
                    if src is not None:
                        links.append(src)
                        # Print only the beginning of very long values.
                        print('%d: %s' % (count, src[:100]))
                        count += 1
                except StaleElementReferenceException:
                    # The element was replaced while being read; skip this
                    # step and carry on with the next one.
                    pass
                except Exception as e:
                    print(
                        '[Exception occurred while collecting links from google_full] {}'.format(e))

                # Current vertical scroll position of the page.
                scroll = self.driver.execute_script("return window.pageYOffset;")
                if scroll == last_scroll:
                    # The page did not move on this step.
                    scroll_patience += 1
                else:
                    scroll_patience = 0
                    last_scroll = scroll
                if scroll_patience >= 30:
                    # No movement for 30 steps in a row: stop.
                    break
                body.send_keys(Keys.RIGHT)

            links = list(dict.fromkeys(links))  # de-duplicate, keep order
            print('Collect links done. Total: {}'.format(len(links)))
            return links

        def simple_file_run(self, img):
            """Upload *img*, collect the similar images and download them.

            Results are stored in
            ``<download>/<parent folder of img>/<img name without extension>``;
            the parent folder name serves as the image's category.

            Args:
                img: path of the image to search for.
            """
            img_name = os.path.splitext(os.path.split(img)[1])[0]  # file stem
            # Name of the folder that contains the image (its category).
            parent_name = os.path.split(os.path.split(img)[0])[-1]
            print("--> Processing image:  {}  ".format(img_name))
            download_dir = os.path.join(self._download, parent_name, img_name)
            os.makedirs(download_dir, exist_ok=True)

            # Upload the image and land on the search result page.
            self.upload_img_get_html(img)
            # Follow the "similar images" link to the image list page.
            similar_img_href = self.driver.find_element(
                By.XPATH, '//div[@class="e2BEnf U7izfe"]/h3/a')
            similar_img_href.click()

            links = self.get_full_resolution_links()  # gather full-size links
            self.download_images(links, download_dir)  # and download them
            print("{}Image {} finished\n{}".format(
                Fore.GREEN, img_name, Fore.RESET))
    

    整个流程与手动在浏览器中操作完全一致。难点在于控制请求速度,避免触发谷歌的反爬机制;一旦出现谷歌验证码,基本无法自动破解,只能手动输入(相当于免费帮它打码)。

    有何用途

    当你需要训练一个图片分类的模型,手头上图片有限,那就可以用这个方法,每一张图都找跟它相似的,轻轻松松就把训练集扩大了几十倍(理想情况,不被反爬的话)。

    参考

    1. https://github.com/YoongiKim/AutoCrawler
    2. https://github.com/Cyberist-Edgar/Google_Image_Searcher
  • 相关阅读:
    ASP.NET ZERO 学习 JTable的ChildTable用法
    ASP.NET ZERO Core Application 学习笔记
    uploadify ASP.net 使用笔记
    金额的加减乘除运算
    利用autoit自动关闭指定标题窗口
    Struts2源代码解读之Action调用
    利用btrace工具监控在线运行java程序
    自己实现的简单MVC框架(类似Struts2+Spring)
    简单实用后台任务执行框架(Struts2+Spring+AJAX前端web界面可以获取进度)
    mybatis源代码分析:mybatis延迟加载机制改进
  • 原文地址:https://www.cnblogs.com/willwell/p/google_image_search.html
Copyright © 2020-2023  润新知