• selenium实现百度图片爬取


    因为百度图片是瀑布流式页面,数据通过 ajax 异步加载,所以这里用抓包工具(fiddler)来抓取链接。

    好了,直接上代码:

     1 from selenium import webdriver
     2 from selenium.webdriver.common.by import By
     3 import requests,time
     4 from queue import Queue
     5 from urllib import request
     6 import os,gevent
     7 from lxml import etree
     8 
     9 
    10 
    11 
    def get_img(html):
        """Download every image referenced by one queued page snapshot.

        Args:
            html: A ``queue.Queue`` of page-source strings. The parameter name
                is historical: it is the queue, not the markup itself. At most
                one item is consumed per call.

        Returns:
            None. Images are written into ``./baidupic/``; progress and
            failures are reported on stdout.
        """
        # Local import: the file only imports ``Queue`` from ``queue``.
        from queue import Empty

        # A blocking ``get()`` on an empty queue would never return, and with
        # cooperative greenlets that stalls every other worker as well.
        # Taking the item without blocking lets surplus workers simply exit.
        try:
            page_source = html.get_nowait()
        except Empty:
            return

        tree = etree.HTML(page_source)
        if tree is None:
            # NOTE(review): lxml appears to return None for an empty document;
            # treat that as "nothing to download" - confirm against lxml docs.
            return

        # Only the last result container is read from each snapshot.
        img_urls = tree.xpath('//div[@id="imgid"]/div[last()]//li/@data-objurl')

        path = './baidupic/'
        # exist_ok avoids a race when several workers create the folder at once.
        os.makedirs(path, exist_ok=True)

        for url in img_urls:
            print(url)
            fname = url.split('/')[-1]
            try:
                request.urlretrieve(url, os.path.join(path, fname))
            except Exception as exc:
                # Best effort per image: report and continue with the next URL.
                # ``Exception`` (not a bare except) keeps Ctrl-C and greenlet
                # termination working.
                print('图片不存在', exc)
            else:
                print('下载成功')
    33 
    34 
    def get_page():
        """Search Baidu Images in Chrome and download the images found.

        Opens the Baidu image search page, submits the keyword, scrolls to the
        bottom of the page ten times while queueing a snapshot of the page
        source after each scroll, then runs one ``get_img`` greenlet per
        queued snapshot and waits for all of them.

        Returns:
            None.
        """
        # Queue of page-source snapshots handed to the download workers.
        q = Queue()

        # Baidu image search entry page.
        base_url = 'https://image.baidu.com/'
        # Path to the local chromedriver binary; adjust for your machine.
        browser = webdriver.Chrome(executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
        try:
            browser.get(base_url)
            # Type the search keyword and submit the search.
            browser.find_element(By.ID, 'kw').send_keys('美女')
            browser.find_element(By.CLASS_NAME, 's_search').click()
            for _ in range(10):
                # Scrolling to the bottom asks the page for its next batch.
                browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
                q.put(browser.page_source)
        finally:
            # Always release the browser and driver process, even on failure.
            browser.quit()

        # One worker per queued snapshot: each worker consumes exactly one
        # item, so spawning more workers than items would leave the extras
        # waiting on an empty queue.
        g_list = [gevent.spawn(get_img, q) for _ in range(q.qsize())]
        gevent.joinall(g_list)
    65 
    66 
    67 
    68 
    69 
    70 
    71 
    72 
    73 # browser.save_screenshot('baidupic.png')
    74 # print(browser.page_source)
    75 # browser.find_element(By_)
    76 
    # Script entry point: start the crawl only when this file is run directly.
    if __name__ == "__main__":
        get_page()
  • 相关阅读:
    [node.js学习]为node.js写的一个操作mysql的类
    极光IM简单接入步骤
    windows自带的netsh 端口转发
    nodejs 做的带管理后台的东东,主要学习到 ....我忘了学到什么了
    利用来JS控制页面控件显示和隐藏有两种方法
    phpstudy 出现You don't have permission to access / on this server.
    禁用input自动填充
    一般充值的流程
    jq传输json字符串
    ECSHOP更改后台顶部图片
  • 原文地址:https://www.cnblogs.com/lyxdw/p/9231515.html
Copyright © 2020-2023  润新知