• 某写真网站爬虫


    写了一个很粗糙的某写真网站的小爬虫,有空改改

    from selenium import webdriver
    import re
    import requests
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from pyquery import PyQuery as pq
    from selenium.webdriver.firefox.options import Options
    
    # Page to scrape; login() opens it before and after submitting the login form.
    url = 'http://www.tujidao.com/a/?id=25309'

    # Service arguments for the PhantomJS alternative kept (commented out) below:
    # do not load images, do not use the disk cache.
    PhantomJS_conf = ['--load-images=false','--disk-cache=false']

    # Run Firefox without a visible window.
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    # `firefox_options=` was deprecated in Selenium 3.8 and removed in 4.0;
    # `options=` is the supported keyword in both.
    browser = webdriver.Firefox(options=options)

    # browser = webdriver.PhantomJS(service_args=PhantomJS_conf)
    # browser.set_window_size(1400,900)                               # set the browser window size
    # Shared explicit wait: each wait.until(...) gives up after 10 seconds.
    wait = WebDriverWait(browser,10)
    
    def login(username=0, password=0):
        """Submit the login form and return the HTML of the target page.

        Loads the module-level ``url``, waits for the two form inputs and the
        submit button, types the credentials, clicks the button, then loads
        ``url`` again and returns the resulting page source.

        Args:
            username: Value typed into the first form field. Defaults to 0,
                the value of the original hard-coded ``int()`` placeholder.
            password: Value typed into the second form field. Defaults to 0
                for the same reason.

        Returns:
            ``browser.page_source`` after the second load of ``url``.

        Raises:
            selenium.common.exceptions.TimeoutException: if any of the three
                form elements does not appear within the wait timeout.
        """
        browser.get(url)
        # Account field
        user_input = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.layui-form-item:nth-child(1) > div:nth-child(2) > input:nth-child(1)')))
        # Password field
        pass_input = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.layui-form-item:nth-child(2) > div:nth-child(2) > input:nth-child(1)')))
        # Login button
        submit = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.layui-btn')))

        user_input.send_keys(username)
        pass_input.send_keys(password)
        submit.click()
        # NOTE(review): the page is reloaded immediately after the click with no
        # wait for the login request to finish - confirm the session is
        # established in time on the real site.
        browser.get(url)
        return browser.page_source
    
    def get_image():
        """Log in, parse the returned page and select its gallery images.

        Returns:
            The PyQuery selection of every ``img`` element found under the
            element with id ``kbox`` in the HTML returned by ``login()``.
        """
        page = pq(login())
        # '#kbox img': all <img> descendants of the element whose id is "kbox".
        return page('#kbox img')
    
    def register():
        """Return the ``data-src`` URL of every image selected by ``get_image()``.

        The attribute is read from the parsed elements instead of running a
        regex over the serialized markup. The regex only matched when
        ``data-src`` was the last attribute before ``/>``, could capture across
        several attributes, and returned entity-escaped text (``&amp;``)
        rather than the real URL.

        Returns:
            A list of ``data-src`` values in document order. Images without a
            non-empty ``data-src`` attribute are skipped.
        """
        images = get_image()
        sources = (img.attr('data-src') for img in images.items())
        return [src for src in sources if src]
    
    # Prefix of every saved file; the image index and '.jpg' are appended,
    # giving ...\test\a0.jpg, ...\test\a1.jpg, and so on.
    # NOTE(review): the published path had lost its backslashes and contained a
    # literal TAB where '\t' used to be; 'C:\Users\admin\Desktop\test\a' is the
    # presumed original - confirm and adjust for your machine.
    save_prefix = r'C:\Users\admin\Desktop\test\a'
    try:
        # Download each image URL and write the raw bytes to its own file.
        for count, image_url in enumerate(register()):
            response = requests.get(image_url)
            # print(response.content)
            with open('{}{}.jpg'.format(save_prefix, count), mode='wb') as f:
                f.write(response.content)
    finally:
        # Always shut down the headless Firefox process, even if scraping or
        # a download fails, so it is not left running.
        browser.quit()
  • 相关阅读:
    关于mysql数据库引擎MyIsam的表锁理解
    关于mysql数据库引擎InnoDB事务的表锁和行锁理解
    Linux查看端口、进程情况及kill进程
    linux 用yum安装软件和卸载软件
    VS 2017常用快捷键
    cin作为判断条件时(关于cin的理解)
    uva 101 木块问题
    Uva 10474 sort以及lower_bound的用法
    团队项目-选题报告
    第一次结对编程作业
  • 原文地址:https://www.cnblogs.com/jiuyachun/p/11284311.html
Copyright © 2020-2023  润新知