• 某写真网站爬虫


    写了一个很粗糙的某写真网站的小爬虫,有空改改

    from selenium import webdriver
    import re
    import requests
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from pyquery import PyQuery as pq
    from selenium.webdriver.firefox.options import Options
    
    # Page that login() opens and that the images are scraped from.
    url = 'http://www.tujidao.com/a/?id=25309'

    # PhantomJS switches: do not load images, do not use the disk cache.
    # Only referenced by the commented-out PhantomJS alternative below.
    PhantomJS_conf = ['--load-images=false','--disk-cache=false']

    # Run Firefox without a visible window.
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    # The `firefox_options=` keyword was deprecated in Selenium 3.x and no
    # longer exists in Selenium 4; `options=` is the supported spelling.
    browser = webdriver.Firefox(options=options)

    # browser = webdriver.PhantomJS(service_args=PhantomJS_conf)
    # browser.set_window_size(1400,900)                               # set the browser window size
    # Shared explicit wait: lookups made through it give up after 10 seconds.
    wait = WebDriverWait(browser,10)
    
    def login(username=0, password=0):
        """Submit the login form served at ``url`` and return the page HTML.

        Args:
            username: Text typed into the first form field.
            password: Text typed into the second form field.
                Both default to ``0``, the value (``int()``) the original
                script typed.  NOTE(review): presumably the author's real
                credentials were removed before publishing -- pass your own.

        Returns:
            ``browser.page_source`` after ``url`` has been loaded a second
            time, following the click on the login button.

        Raises:
            selenium.common.exceptions.TimeoutException: if one of the form
                elements does not appear within the shared ``wait`` timeout.
        """
        browser.get(url)
        # Account input
        user_field = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.layui-form-item:nth-child(1) > div:nth-child(2) > input:nth-child(1)')))
        # Password input
        pass_field = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.layui-form-item:nth-child(2) > div:nth-child(2) > input:nth-child(1)')))

        user_field.send_keys(username)
        pass_field.send_keys(password)

        # Login button: wait until it can actually be clicked (visible and
        # enabled), not merely present in the DOM.  Done after the fields are
        # filled so a button that is enabled only then does not time out.
        login_button = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '.layui-btn')))
        login_button.click()

        # NOTE(review): nothing here waits for the login request to finish
        # before the page is reloaded; if the reload races the login, add an
        # explicit wait on an element that only exists once logged in.
        browser.get(url)
        return browser.page_source
    
    def get_image():
        """Return the ``<img>`` elements found under ``#kbox``.

        Obtains the page HTML from ``login()``, parses it with PyQuery and
        returns the selection matching the CSS selector ``#kbox img``.
        """
        page = pq(login())
        return page('#kbox img')
    
    def register():
        """Collect the ``data-src`` value of every image from ``get_image()``.

        Returns:
            list of str: one entry per ``<img>`` that carries a non-empty
            ``data-src`` attribute, in document order.
        """
        # Read the attribute from the parsed elements instead of running a
        # regex over the serialized markup: the old pattern required
        # ``data-src`` to be the last attribute right before ``/>`` and
        # returned the attribute text exactly as serialized.
        sources = []
        for img in get_image().items():
            src = img.attr('data-src')
            if src:
                sources.append(src)
        return sources
    
    # Path prefix of the saved files; the running number and '.jpg' are
    # appended to it.
    # NOTE(review): the published listing had lost its backslashes (and "\t"
    # had become a tab character), so the literal below is a reconstruction --
    # confirm the intended folder / file-name prefix before running.
    save_prefix = r'C:\Users\admin\Desktop\testa'

    # Number of images written so far; also the number used in the next name.
    count = 0
    for image_url in register():
        response = requests.get(image_url)
        if not response.ok:
            # Do not store an HTTP error page under a .jpg name.
            continue
        # print(response.content)
        with open(save_prefix + '{}'.format(count) + '.jpg', mode='wb') as f:
            f.write(response.content)
        count += 1
  • 相关阅读:
    设计模式之构造模式
    设计模式之创建模式
    用mongodb 固定集合实现只保留固定数量的记录,自动淘汰老旧数据
    多线程何如获取返回值
    基于redis的消息订阅与发布
    multiple类型的select option在django后台如何取值
    使用redis分布式锁解决并发线程资源共享问题
    数据库架构
    MongoDB数据库设计中6条重要的经验法则
    【mysql】开启binlog后异常:impossible to write to binary log since BINLOG_FORMAT = STATEMENT
  • 原文地址:https://www.cnblogs.com/jiuyachun/p/11284311.html
Copyright © 2020-2023  润新知