• python+selenium爬取关键字搜索google图片


      1 # -*- coding: utf-8 -*-
      2 
      3 import json
      4 import os
      5 import time
      6 from multiprocessing import Pool
      7 import multiprocessing
      8 import requests
      9 from selenium import webdriver
     10 
     11 
     12 def get_image_links(keyword, num_requested = 1000):
     13     """get image links with selenium
     14     """
     15     number_of_scrolls = int(num_requested/400) + 1 
     16     img_urls = set()#设置为集合,自动去除重复链接
     17     chrome_options = webdriver.ChromeOptions()
     18     # chrome_options.add_argument('--headless')#设置无头浏览器
     19     # chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"')
     20     # chrome_options.add_argument("lang=en_US")#设置语言
     21     # prefs = {"profile.managed_default_content_settings.images":2}
     22     # chrome_options.add_experimental_option("prefs",prefs)#配置不加载图片
     23     driver = webdriver.Chrome(chrome_options=chrome_options)
     24     driver.maximize_window()
     25     search_query = keyword
     26     url = "https://www.google.com/search?q="+search_query+"&source=lnms&tbm=isch"
     27     driver.get(url)
     28     for _ in range(number_of_scrolls):
     29         for i in range(5):
     30             # multiple scrolls needed to show all 400 images
     31             driver.execute_script("window.scrollBy(0, 100000)")
     32             time.sleep(1)
     33         time.sleep(5)#等待页面刷新,否则有可能元素不可见
     34         try:
     35             # driver.find_element_by_xpath("//input[@value='Show more results']").click()#浏览器的中英文版本不同
     36             driver.find_element_by_xpath("//input[@value='显示更多结果']").click()
     37         except Exception as e:
     38             print("reach the end of page ")
     39             break
     40 
     41     # with open('page.html','w') as f:
     42     #     f.write(driver.page_source)
     43     imgs = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')#模糊定位
     44     for i,img in enumerate(imgs):
     45         img_url = json.loads(img.get_attribute('innerHTML'))["ou"]
     46         img_urls.add(img_url)
     47     driver.quit()
     48     print("finish getting all image urls!")
     49 
     50     return img_urls
     51 
     52 def download(urls,download_dir):
     53     '''download images
     54     '''
     55     print("start downloading images!")
     56     for url in urls:
     57         filename=os.path.join(download_dir,os.path.basename(url))
     58         try:
     59             r = requests.get(url, stream=True, timeout=60)
     60             r.raise_for_status()
     61             with open(filename, 'wb') as f:
     62                 f.write(r.content)  
     63         except Exception:
     64             continue
     65     print("finish downloading images!")
     66 
     67 keywords = ['girl','boy']
     68 download_dir = './images/'
     69 download_dirs = []
     70 for keyword in keywords:
     71     path = os.path.join(download_dir,keyword)
     72     download_dirs.append(path)
     73     if not os.path.exists(path):
     74         os.makedirs(path)
     75 
# Sequential (single-process) alternative:
# for keyword, target_dir in zip(keywords, download_dirs):
#     download(get_image_links(keyword), target_dir)
     79 
     80 
     81 ###################################
     82 # get image links/MultiProcess
     83 ################################### 
     84 img_urls=[]
     85 multiprocessing.freeze_support()
     86 p = Pool(4) # default number of process is the number of cores of your CPU, change it by yourself
     87 for keyword in keywords:
     88     img_urls.append(p.apply_async(get_image_links, (keyword,)))
     89 #img_urls:[<multiprocessing.pool.ApplyResult object at 0x7f536925fcc0>, <multiprocessing.pool.ApplyResult object at 0x7f536925fd68>]
     90 for i,urls in enumerate(img_urls):
     91     img_urls[i]=urls.get()
     92 p.close()
     93 p.join()
     94 
     95 
     96 # # ###################################
     97 # # # download images/MultiProcess
     98 # # ###################################
     99 p = Pool(4) # default number of process is the number of cores of your CPU, change it by yourself
    100 for i,urls in enumerate(img_urls):
    101     p.apply_async(download, [urls,download_dirs[i]])
    102 p.close()
    103 p.join()
  • 相关阅读:
    实用 zsh 插件
    laravel 实用扩展包
    laravel Collection mapToDictionary 例子
    laravel mapSpread 例子
    mac 命令行大杂烩
    iview table中 on-view事件点击无效
    github网站打不开了
    iview table 表头样式修改
    $attrs is readonly
    iview中modal如何修改标题颜色
  • 原文地址:https://www.cnblogs.com/buyizhiyou/p/11140128.html
Copyright © 2020-2023  润新知