  • 使用 selenium 模拟爬取谷歌图片(代码实例)


    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    """
    Created on 2019/02/15
    @filename: get_pic_from_google.py
    @author: sdhzdtwhm
    Description:
        1.此脚本为使用selenium模拟访问爬取谷歌图片中的搜索结果
        2.运行环境为python3 需要安装selenium、bs4、requests库
        3.访问谷歌环境需要翻墙
        4.需要下载谷歌浏览器的驱动
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time
    from bs4 import BeautifulSoup as bs
    import uuid
    import requests
    import os
    
    
    class Crawler:
        """Scrape thumbnail images from a Google Images result page via Selenium.

        The class reads four module-level names that the script's ``__main__``
        section assigns before creating an instance:

        * ``base_url_part1`` / ``base_url_part2`` -- prefix and suffix of the
          search URL,
        * ``search_query`` -- the keyword to search for,
        * ``localPath`` -- directory the downloaded images are written to.
        """

        def __init__(self):
            """Build the search URL from the module-level URL parts and keyword."""
            # Function-scope import so the class does not depend on a new
            # file-level import.
            from urllib.parse import quote_plus

            # Percent-encode the keyword so characters such as spaces, '&' or
            # '#' cannot corrupt the query string.
            self.url = base_url_part1 + quote_plus(search_query) + base_url_part2

        def start_brower(self):
            """Launch Chrome, open ``self.url`` and return the driver.

            Returns:
                The ``webdriver.Chrome`` instance with the search page loaded.
            """
            chrome_options = Options()
            chrome_options.add_argument("--disable-infobars")
            # Location of the chromedriver binary. Raw string: the backslashes
            # are path separators, not escape sequences.
            executable_path = r"C:\Windows\System32\chromedriver.exe"
            # ``options=`` is the supported spelling of the deprecated
            # ``chrome_options=`` keyword.
            # NOTE(review): recent Selenium 4 releases also drop
            # ``executable_path`` in favour of a Service object -- confirm
            # against the installed Selenium version.
            driver = webdriver.Chrome(options=chrome_options, executable_path=executable_path)
            try:
                # Maximize the window: only images inside the viewport are
                # picked up on each pass.
                driver.maximize_window()
                driver.get(self.url)
            except Exception:
                # Do not leave a browser/chromedriver process behind when the
                # page cannot be opened.
                driver.quit()
                raise
            return driver

        @staticmethod
        def _scroll_page(driver, steps=5, step_px=50000, pause=3):
            """Scroll the document down in ``steps`` jumps, pausing after each."""
            for step in range(steps):
                driver.execute_script("document.documentElement.scrollTop=%d" % (step * step_px))
                time.sleep(pause)

        @staticmethod
        def _extract_image_urls(img_tags):
            """Return one URL per tag, preferring ``data-src`` over ``src``.

            Tags that carry neither attribute (or only empty values) are
            skipped instead of raising ``KeyError``.
            """
            urls = []
            for tag in img_tags:
                link = tag.get('data-src') or tag.get('src')
                if link:
                    urls.append(link)
            return urls

        def downloadImg(self, driver):
            """Collect the image URLs on the loaded page and download each one.

            Args:
                driver: a Selenium driver whose current page is the search
                    result page.

            Each image is saved as ``<uuid hex>.jpg`` inside ``localPath``.
            A failure on a single image is reported and does not stop the
            remaining downloads.
            """
            from selenium.common.exceptions import WebDriverException

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
                'Connection': 'keep-alive',
                'content-type': 'application/json'
            }
            # Scroll down until the "show more results" button is reached.
            self._scroll_page(driver)
            print("准备点击:加载更多")
            try:
                # ``find_element(by, value)`` replaces ``find_element_by_xpath``.
                driver.find_element("xpath", "./*//input[@value='显示更多结果']").click()
            except WebDriverException as click_err:
                # The button is absent or not clickable (for example when there
                # are few results); carry on with what has already been loaded.
                print('未能点击"显示更多结果":' + str(click_err))
            else:
                time.sleep(2)
                # Scroll again so the newly appended results get loaded.
                self._scroll_page(driver)
            time.sleep(2)

            # Parse the rendered page and pick out the thumbnail <img> tags.
            soup = bs(driver.page_source, "html.parser")
            imglist = soup.find_all('img', {'class': 'rg_ic rg_i'})
            url_list = self._extract_image_urls(imglist)
            print("总共包含图片个数为:" + str(len(url_list)))

            for img_url in url_list:
                # Same value as str(uuid.uuid1()) with the dashes removed.
                name = uuid.uuid1().hex
                try:
                    # Timeout so one stalled connection cannot hang the run.
                    ir = requests.get(img_url, headers=headers, timeout=30)
                    # Do not save an HTTP error page as if it were an image.
                    ir.raise_for_status()
                    print("开始下载图片:%s.jpg" % name)
                    # ``with`` closes the file handle; ``os.path.join`` also
                    # works when ``localPath`` has no trailing separator.
                    with open(os.path.join(localPath, '%s.jpg' % name), 'wb') as out_file:
                        out_file.write(ir.content)
                except (requests.RequestException, OSError) as ex2:
                    print('下载图片出错!!' + str(ex2))

        def run(self):
            """Open the browser, download the images, then shut the browser down."""
            driver = self.start_brower()
            try:
                self.downloadImg(driver)
            finally:
                # ``quit()`` ends the whole session (browser plus chromedriver),
                # including when the download step raised.
                driver.quit()
            print("Download has finished.")
    
    
    if __name__ == '__main__':
        # Welcome banner. The tab/newline escapes are written out explicitly so
        # the literal stays on valid source lines.
        print(
            '\t\t\t**************************************\n'
            '\t\t\t**\t\tWelcome to Use Spider\t\t**\n'
            '\t\t\t**************************************')
        # base_url_part1 and base_url_part2 are fixed and need no changes.
        base_url_part1 = 'https://www.google.com/search?q='
        base_url_part2 = '&source=lnms&tbm=isch'
        # Keywords to crawl.
        search_list = ['管制刀具']
        # Download directory; read by Crawler as a module-level name.
        localPath = 'D:/weijin/'
        try:
            # exist_ok avoids a spurious "already exists" error on reruns.
            os.makedirs(localPath, exist_ok=True)
        except OSError as e:
            # Best effort: report the problem and carry on, as before.
            print(e)
        # ``search_query`` is deliberately the module-level loop variable:
        # Crawler.__init__ reads it when building the search URL.
        for search_query in search_list:
            craw = Crawler()
            craw.run()
    
  • 相关阅读:
    多想一点和多做一步
    js 判断字符是否以汉字开头
    代码片断编辑测试窗
    部署WAR文件到tomcat
    ROS机器人程序设计(原书第2版)补充资料 (柒) 第七章 3D建模与仿真 urdf Gazebo V-Rep Webots Morse
    ROS机器人程序设计(原书第2版)补充资料 (陆) 第六章 点云 PCL
    ROS机器人程序设计(原书第2版)补充资料 (伍) 第五章 计算机视觉
    ROS机器人程序设计(原书第2版)补充资料 (肆) 第四章 在ROS下使用传感器和执行器
    ROS机器人程序设计(原书第2版)补充资料 (叁) 第三章 可视化和调试工具
    ROS机器人程序设计(原书第2版)补充资料 (贰) 第二章 ROS系统架构及概念
  • 原文地址:https://www.cnblogs.com/sdhzdtwhm/p/10437005.html
Copyright © 2020-2023  润新知