• selenium 爬取淘宝网


      1 import re
      2 from selenium import webdriver
      3 from selenium.common.exceptions import TimeoutException
      4 from selenium.webdriver.common.by import By
      5 from selenium.webdriver.support.ui import WebDriverWait
      6 from selenium.webdriver.support import expected_conditions as EC
      7 from pyquery import PyQuery as pq
      8 
      9 import pymongo
     10 
     11 MONGO_URL = 'localhost'
     12 MONGO_DB = 'taobao'
     13 MONGO_TABLE = 'product'
     14 
     15 SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
     16 
     17 KEYWORD = '美食'
     18 
     19 client = pymongo.MongoClient(MONGO_URL)
     20 db = client[MONGO_DB]
     21 
     22 browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
     23 wait = WebDriverWait(browser, 10)
     24 
     25 browser.set_window_size(1400, 900)
     26 
     27 def search():
     28     print('正在搜索')
     29     try:
     30         browser.get('https://www.taobao.com')
     31         input = wait.until(
     32             EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
     33         )
     34         submit = wait.until(
     35             EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
     36         input.send_keys(KEYWORD)
     37         submit.click()
     38         total = wait.until(
     39             EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
     40         get_products()
     41         return total.text
     42     except TimeoutException:
     43         return search()
     44 
     45 
     46 def next_page(page_number):
     47     print('正在翻页', page_number)
     48     try:
     49         input = wait.until(
     50             EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
     51         )
     52         submit = wait.until(EC.element_to_be_clickable(
     53             (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
     54         input.clear()
     55         input.send_keys(page_number)
     56         submit.click()
     57         wait.until(EC.text_to_be_present_in_element(
     58             (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
     59         get_products()
     60     except TimeoutException:
     61         next_page(page_number)
     62 
     63 
     64 def get_products():
     65     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
     66     html = browser.page_source
     67     doc = pq(html)
     68     items = doc('#mainsrp-itemlist .items .item').items()
     69     for item in items:
     70         product = {
     71             'image': item.find('.pic .img').attr('src'),
     72             'price': item.find('.price').text(),
     73             'deal': item.find('.deal-cnt').text()[:-3],
     74             'title': item.find('.title').text(),
     75             'shop': item.find('.shop').text(),
     76             'location': item.find('.location').text()
     77         }
     78         print(product)
     79         save_to_mongo(product)
     80 
     81 
     82 def save_to_mongo(result):
     83     try:
     84         if db[MONGO_TABLE].insert(result):
     85             print('存储到MONGODB成功', result)
     86     except Exception:
     87         print('存储到MONGODB失败', result)
     88 
     89 
     90 def main():
     91     try:
     92         total = search()
     93         total = int(re.compile('(d+)').search(total).group(1))
     94         for i in range(2, total + 1):
     95             next_page(i)
     96     except Exception:
     97         print('出错啦')
     98     finally:
     99         browser.close()
    100 
    101 if __name__ == '__main__':
    102     main()
  • 相关阅读:
    fio工具讲解
    flask迁移数据库报错
    base64编码
    OpenStack学习笔记,未完待续
    docker运行报错
    centos搭建SVN服务器并实现自动同步至web目录 (必定安装成功)
    centos/linux 源码安装mysql详细记录
    centos7.7安装php7.3的lnmp环境和composer详细步骤
    laravel-admin使用ueditor重命名图片名称问题
    nginx 图片防盗链 设置
  • 原文地址:https://www.cnblogs.com/zhongshuiping/p/9714025.html
Copyright © 2020-2023  润新知