• Selenium & PhantomJS & BeautifulSoup practice: a classic example


    # coding = utf-8
    __author__ = 'litao'

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    import re, time, random
    import selenium.common.exceptions
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
    from bs4 import BeautifulSoup
    from selenium.webdriver.common.action_chains import ActionChains
    from pymongo import MongoClient

    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.103 Safari/537.36"
    )  # override the browser's User-Agent request header
    SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']  # don't load images, use the disk cache
    brower = webdriver.PhantomJS(service_args=SERVICE_ARGS, desired_capabilities=dcap)
    brower.set_window_size(1920, 1080)  # set a fixed resolution so click targets are inside the viewport
    wait = WebDriverWait(brower, 10)  # maximum explicit-wait time
    brower.get(url="https://www.taobao.com")

    def search(retry_times):
        # wait = WebDriverWait(brower, 10)  # maximum explicit-wait time
        # brower.get(url="https://www.taobao.com")
        try:
            input_content = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            search_botton = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".btn-search")))
            input_content.send_keys("美食")  # search keyword: "food"
            search_botton.click()
            totle = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".total")))
            print("ok")
            print("1")
            pase_page(1)
            return totle.text
        except selenium.common.exceptions.TimeoutException as e:
            print(e)
            if retry_times > 0:
                retry_times -= 1
                return search(retry_times)  # retry when the wait times out
            return None


    def next_page(page_number, retry_times):
        try:
            input_content = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
            search_botton = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            input_content.clear()
            input_content.send_keys(page_number)
            search_botton.click()
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
            print(str(page_number))
            pase_page(page_number)
        except selenium.common.exceptions.TimeoutException as e:
            print(e)
            if retry_times > 0:
                retry_times -= 1
                return next_page(page_number, retry_times)  # retry when the wait times out


    def pase_page(page_number):
        if page_number == 1:
            # hover over the first personalised item so the full item list is rendered
            list2 = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "#J_itemlistPersonality > div:nth-child(1) > div:nth-child(1)")))
            ActionChains(brower).move_to_element(list2).perform()
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#J_itemlistCont")))
        html = brower.page_source
        # print(html)
        # collapse the multi-word class value into a single token so it can be matched exactly
        html = html.replace("item J_MouserOnverReq ", "item_J_MouserOnverReq")
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.find_all('div', attrs={"class": "item_J_MouserOnverReq"})
        print(len(content))
        for item in content:
            result = {
                "image": item.find('img').get('data-src'),
                "price": item.find(class_="price").text.strip(),
                "deal": item.find(class_="deal-cnt").text.strip()[:-3],
                "title": re.sub(r'\s', '', item.find(class_="title").text.strip()),
                "shop": re.sub(r'\s', '', item.find(class_="shop").text.strip()),
                "location": re.sub(r'\s', '', item.find(class_="location").text.strip())
            }
            print(result)
            save_to_mongodb(result)
        print("**********************************************************************************************")


    def save_to_mongodb(product):
        client = MongoClient('127.0.0.1', 27017)
        db = client.taobao
        db["taobao_meishi"].insert_one(product)  # insert_one replaces the deprecated Collection.insert


    def main():
        try:
            result = search(2)
            if result:
                count_page = int(re.search(r'.*?(\d+).*', result).group(1))  # total page count, e.g. "共 100 页"
                for i in range(2, count_page + 1):
                    time.sleep(random.randint(1, 3))
                    print("-----", i)
                    next_page(i, 2)

        except Exception as e:
            print("An error occurred while the program was running:", e)
        finally:
            brower.close()  # try-except-finally ensures the browser is closed before exit, whatever the cause


    if __name__ == "__main__":
        main()
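
PhantomJS support was dropped in Selenium 4, so the setup above only runs on older Selenium releases. Below is a minimal sketch, assuming Chrome and a matching chromedriver are installed, of the same setup with headless Chrome instead of PhantomJS; the remaining functions can stay unchanged.

    # Possible replacement for the PhantomJS setup above (assumption: Chrome and a
    # matching chromedriver are available on PATH); the crawling code itself is reused as-is.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support.ui import WebDriverWait

    options = Options()
    options.add_argument("--headless")               # run without a visible window, like PhantomJS
    options.add_argument("--window-size=1920,1080")  # same viewport as set_window_size above
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/55.0.2883.103 Safari/537.36"
    )
    brower = webdriver.Chrome(options=options)       # replaces webdriver.PhantomJS(...)
    wait = WebDriverWait(brower, 10)
    brower.get(url="https://www.taobao.com")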
  • Original source: https://www.cnblogs.com/crawer-1/p/7636163.html