• 用selenium爬取淘宝美食


    '''利用selenium爬取淘宝美食网页内容'''
    
    import re
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from pyquery import PyQuery as pq
    from config import *
    
    # Start a PhantomJS-backed WebDriver, passing the command-line flags
    # defined as SERVICE_ARGS (pulled in by `from config import *`).
    driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
    # Alternative kept for reference: drive Chrome instead of PhantomJS.
    # driver = webdriver.Chrome()
    # Shared explicit wait: every wait.until(...) call gives up after 10 seconds.
    wait = WebDriverWait(driver, 10)
    
    driver.set_window_size(1400,900)            # author's note: with this call the page content can be scraped; without it a TimeOut error occurs
    
    def search():
        """Submit KEYWORD on the Taobao home page and scrape the first result page.

        Loads the home page, types KEYWORD into the search box, clicks the
        search button, waits for the pager's "total" element and then calls
        get_products() for the first page of results.

        Returns:
            The text of the pager's "total" element.

        If any step raises TimeoutException, 'TimeOut' is printed and the whole
        search is started again by calling search() recursively.
        """
        print('正在搜索')
        box_locator = (By.CSS_SELECTOR, '#q')
        button_locator = (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
        total_locator = (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
        try:
            driver.get('http://www.taobao.com')
            search_box = wait.until(EC.presence_of_element_located(box_locator))
            search_button = wait.until(EC.element_to_be_clickable(button_locator))
            search_box.send_keys(KEYWORD)
            search_button.click()
            # The pager only exists on the result page, so its presence shows
            # that the search results have been rendered.
            total_element = wait.until(EC.presence_of_element_located(total_locator))
            get_products()
            return total_element.text
        except TimeoutException:
            print('TimeOut')
            return search()
    
    def next_page(page_number):
        """Jump to result page `page_number` through the pager form and scrape it.

        Clears the pager's page-number input, types `page_number`, clicks the
        submit button, waits until the active pager item shows that number and
        then calls get_products().

        Args:
            page_number: The page to jump to; it is typed into the pager input
                and compared (as a string) with the active pager item's text.

        If any step raises TimeoutException, 'TimeOut' is printed and the same
        page is attempted again by calling next_page() recursively.
        """
        print('正在翻页', page_number)
        pager_form = '#mainsrp-pager > div > div > div > div.form'
        input_locator = (By.CSS_SELECTOR, pager_form + ' > input')
        submit_locator = (By.CSS_SELECTOR, pager_form + ' > span.btn.J_Submit')
        active_locator = (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span')
        try:
            page_box = wait.until(EC.presence_of_element_located(input_locator))
            go_button = wait.until(EC.element_to_be_clickable(submit_locator))
            page_box.clear()
            page_box.send_keys(page_number)
            go_button.click()
            # Wait for the highlighted pager item to show the requested number
            # before reading the page, so the new results are the ones scraped.
            page_is_active = EC.text_to_be_present_in_element(active_locator, str(page_number))
            wait.until(page_is_active)
            get_products()
        except TimeoutException:
            print('TimeOut')
            next_page(page_number)
    
    def get_products():
        """Print one dict per product found on the currently loaded result page.

        Waits until at least one result item is present, parses the page source
        with PyQuery and prints, for every item, a dict with the keys 'image',
        'price', 'deal', 'title', 'shop' and 'location'.

        A TimeoutException raised by the wait is not handled here and
        propagates to the caller.
        """
        item_selector = '#mainsrp-itemlist .items .item'
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
        page = pq(driver.page_source)
        for entry in page(item_selector).items():
            image = entry.find('.pic .img').attr('src')
            price = entry.find('.price').text()
            # Drop the last three characters of the deal-count text
            # (presumably a fixed textual suffix after the number -- verify).
            deal = entry.find('.deal-cnt').text()[:-3]
            title = entry.find('.title').text()
            shop = entry.find('.shop').text()
            location = entry.find('.location').text()
            print({
                'image': image,
                'price': price,
                'deal': deal,
                'title': title,
                'shop': shop,
                'location': location,
            })
    
    
    def main():
        """Run the crawl: search, read the page count, then visit pages 2..N.

        The text returned by search() is scanned for its first run of digits,
        which is taken as the total number of result pages. Every remaining
        page is then loaded with next_page().

        Any exception is printed rather than propagated, and the driver window
        is closed in all cases.
        """
        try:
            total_text = search()
            # Raw string so that \d reaches the regex engine as the digit
            # class; a plain 'd+' would only match literal letters "d".
            match = re.search(r'(\d+)', total_text)
            if match is None:
                raise ValueError('no page count found in %r' % (total_text,))
            page_count = int(match.group(1))
            # Page 1 was already scraped by search(), so start at page 2.
            for num in range(2, page_count + 1):
                next_page(num)
        except Exception as e:
            print(e)
        finally:
            # Always release the browser window, even after an error.
            driver.close()
    
    # Start the crawl only when this file is executed as a script.
    if __name__  == '__main__':
        main()
    View Code

    config文件

    # Command-line flags handed to PhantomJS: do not load images, and enable the disk cache.
    SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
    # Search term typed into the Taobao search box (Chinese for "fine food").
    KEYWORD = '美食'
    View Code
  • 相关阅读:
    openssl windows ndk 编译----0
    openssl windows ndk 编译
    android Android NDK开发2之Windows及L下的gcc手动编译(交叉连编译
    list 和 set 的区别
    Arrays.toString()方法和Arrays类 的一些概念
    几个简单的面试题
    大脑记忆(自我总结)
    DI() T()函数
    Latex设置
    selenium获取元素信息的方法
  • 原文地址:https://www.cnblogs.com/114811yayi/p/7226206.html
Copyright © 2020-2023  润新知