• 使用selenium爬取天猫美食店铺


    '''利用selenium爬取网页内容'''
    
    import re
    
    import time
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from pyquery import PyQuery as pq
    from config import *
    
    # Alternative driver setup kept for reference (SERVICE_ARGS presumably comes
    # from the star-import of config -- confirm):
    # driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
    # Module-level browser session shared by every function below.
    driver = webdriver.Chrome()
    # Shared explicit wait: each wait.until(...) call gives up after 10 seconds
    # and raises TimeoutException.
    wait = WebDriverWait(driver, 10)
    
    # driver.set_window_size(1400,900)            # Author's note: with this line the page content can be scraped; without it a TimeOut error occurs
    
    def search():
        """Search Tmall for KEYWORD, switch to the shop view and scrape page one.

        Opens the Tmall home page, types KEYWORD into the search box, submits
        the search, clicks the shop filter link, waits for the pager form to be
        present, and prints the shops of the first page via get_shopname().

        The whole sequence is retried whenever a wait times out. The retry is
        a loop rather than a recursive call so that a long run of timeouts
        cannot grow the call stack until RecursionError is raised.

        Returns:
            The visible text of the pager form element (main() extracts the
            page count from it).
        """
        while True:
            print('正在搜索')
            try:
                driver.get('http://www.tmall.com')
                search_box = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mq')))
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '#mallSearch > form > fieldset > div > button')))
                search_box.send_keys(KEYWORD)
                submit.click()
                # Switch the result list from the default view to the shop view.
                shop_tab = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_Filter > a.fType-w')))
                shop_tab.click()
                pager_form = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR,
                         '#content > div > div.ui-page > div > b.ui-page-skip > form')))
                get_shopname()
                return pager_form.text
            except TimeoutException:
                # Start over from the home page on the next loop iteration.
                print('TimeOut')
    
    def next_page(page_num):
        """Jump to result page ``page_num`` and print the shops listed on it.

        Types the page number into the pager's "skip to" input, submits it,
        waits until the pager's current-page marker contains that number, and
        then calls get_shopname().

        The attempt is repeated whenever a wait times out. The retry is a loop
        rather than a recursive call so that repeated timeouts cannot exhaust
        the call stack.

        Args:
            page_num: Page number to navigate to.
        """
        pager = '#content > div > div.ui-page > div > '
        while True:
            print('正在翻页', page_num)
            try:
                page_input = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR,
                         pager + 'b.ui-page-skip > form > input.ui-page-skipTo')))
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, pager + 'b.ui-page-skip > form > button')))
                page_input.clear()
                page_input.send_keys(str(page_num))
                submit.click()
                # Only scrape once the pager confirms the requested page is current.
                wait.until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR, pager + 'b.ui-page-num > b.ui-page-cur'),
                        str(page_num)))
                get_shopname()
                return
            except TimeoutException:
                print('TimeOut')
    
    def get_shopname():
        """Print the name text, link and score text of every shop on the page.

        Waits until at least one shop header is present, parses the current
        page source with PyQuery, and prints one dict per shop header with the
        keys 'shopmessage', 'shoplink' and 'shop_score'.

        'shoplink' is the element's href prefixed with 'http:'. It is None
        when the header has no '.sHe-shop' link, instead of raising TypeError
        and aborting the rest of the page.
        """
        header_selector = '#J_ItemList .shopBox .shopHeader'
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, header_selector)))
        doc = pq(driver.page_source)
        for item in doc(header_selector).items():
            href = item.find('.sHe-shop').attr('href')
            shopname = {
                'shopmessage': item.find('.shopHeader-info').text(),
                # attr() yields None when the link or its href is missing.
                'shoplink': 'http:' + href if href else None,
                'shop_score': item.find('.shopDsr-con').text(),
            }
            print(shopname)
    
    # def login():
    #     login_sumbit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_Quick2Static')))
    #     print('点击使用用户名和密码登录')
    #     login_sumbit.click()
    #     user = driver.find_element_by_id('TPL_username_1')
    #     print('输入用户名')
    #     user.send_keys(USER)
    #     password = driver.find_element_by_id('TPL_password_1')
    #     print('输入密码')
    #     password.send_keys(PASSWORD)
    #     sumbit = driver.find_element_by_id('J_SubmitStatic')
    #     sumbit.click()
    #     return driver.page_source
    
    def main():
        """Scrape the first result page, then every remaining page in order.

        search() handles page one and returns the pager form's text; the first
        run of digits in that text is taken as the total number of pages
        (presumably the pager shows the total page count first -- confirm
        against the live page). Pages 2..total are then visited with
        next_page(), pausing two seconds between pages.

        The shared browser is closed when the run ends, whether it finished
        normally or failed.

        Raises:
            ValueError: If the pager text contains no digits.
        """
        try:
            pager_text = search()
            # The pattern must be a raw string with an escaped "d" (digit class);
            # the earlier '(d+)' matched the literal letter "d", never a number.
            match = re.search(r'(\d+)', pager_text)
            if match is None:
                raise ValueError(
                    'no page count found in pager text: %r' % (pager_text,))
            total_pages = int(match.group(1))
            # Page 1 was already scraped inside search().
            for num in range(2, total_pages + 1):
                next_page(num)
                time.sleep(2)
        finally:
            # Release the browser process; it was previously left running.
            driver.quit()
    
    # Run the scraper only when executed as a script, not when imported.
    if __name__ == '__main__':
        main()
    View Code
  • 相关阅读:
    POJ2823 Sliding Window【双端队列】
    初识Identity
    dSploitzANTI渗透教程之启动zANTI工具
    dSploitzANTI渗透教程之安装zANTI工具
    iOS Sprite Kit教程之滚动场景
    iOS Sprite Kit教程之场景的切换
    iOS Sprite Kit教程之场景的设置
    iOS Sprite Kit教程之真机测试以及场景的添加与展示
    iOS Sprite Kit教程之申请和下载证书
    iOS Sprite Kit教程之使用帮助文档以及调试程序
  • 原文地址:https://www.cnblogs.com/114811yayi/p/7226231.html
Copyright © 2020-2023  润新知