• 利用selenium模拟谷歌抓取淘宝商品美食信息并存储到mongodb练习(附详细注释)


     

    在崔庆才老师的视频讲解基础上,添加了模拟登陆步骤:

      1 from selenium import webdriver
      2 from selenium.webdriver.common.by import By
      3 from selenium.webdriver.support.wait import WebDriverWait
      4 from selenium.webdriver.support import expected_conditions as EC
      5 from pyquery import PyQuery as pq
      6 import re
      7 import pymongo
      8 from config import *
      9 
     10 # 连接数据库,并申明数据库
     11 client = pymongo.MongoClient(MONGO_URL)
     12 db = client[MONGO_DB]
     13 
     14 browser = webdriver.Chrome()
     15 wait = WebDriverWait(browser, 10)    # 设置explicily wait
     16 
     17 
     18 # 模拟登陆淘宝
     19 def login():
     20     browser.get('https://login.taobao.com/member/login.jhtml?spm=a21bo.2017.754894437.1.5af911d9lO6yRD&f=top&redirectURL=https%3A%2F%2Fwww.taobao.com%2F')
     21     input1 = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#fm-login-id')))
     22     input1.send_keys('淘宝网账号')
     23     input2 = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#fm-login-password')))
     24     input2.send_keys('淘宝网密码')
     25     submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#login-form > div.fm-btn > button')))
     26     submit.click()
     27 
     28 
     29 # 模拟进入搜索页
     30 def search():
     31     try:
     32         # 定位搜索输入框,并模拟输入搜索关键字
     33         input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
     34         input.send_keys('美食')
     35         # 定位搜索按钮,并模拟点击
     36         submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
     37         submit.click()
     38         # 定位搜索页底部“共100页”
     39         total_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
     40         # 获取第一页商品信息
     41         get_products()
     42         return total_page.text
     43     except TimeoutError:
     44         # 若超时,就重新搜索
     45         return search()
     46 
     47 # 模拟翻页
     48 def next_page(page_num):
     49     try:
     50         # 定位底部的页码输入框
     51         input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
     52         input.clear()
     53         input.send_keys(page_num)
     54         # 定位底部的页码确定按钮
     55         submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
     56         submit.click()
     57         # 判断page_num是否是目前高亮的页码按钮文本
     58         wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_num)))
     59         # 获取第二页以后的商品信息
     60         get_products()
     61     except TimeoutError:
     62         next_page(page_num)
     63 
     64 
     65 # 获取商品信息
     66 def get_products():
     67     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
     68     # 获取网页源代码
     69     html = browser.page_source
     70     doc = pq(html)
     71     items = doc('#mainsrp-itemlist .items .item').items()   # 返回通过CSS选择器过滤的生成器对象
     72     for item in items:    # item为单个商品的PyQuery对象
     73         product = {
     74             'image': item.find('.pic .img').attr('src'),
     75             'price': item.find('.price').text(),
     76             'deal_num': item.find('.deal-cnt').text()[:-3],
     77             'title': item.find('.title').text(),
     78             'shop': item.find('.shop').text(),
    'location': item.find('.location').text()
    79 } 80 save_to_mongodb(product) 81 82 # 保存到mongodb数据库 83 def save_to_mongodb(result): 84 try: 85 if db[MONGO_TABLE].insert_one(result): 86 print('存储成功', result) 87 except Exception: 88 print('存储失败', result) 89 90 91 def main(): 92 login() 93 total_page = search() 94 total_page = int(re.compile('(d+)').search(total_page).group(1)) 95 for i in range(2, total_page+1): 96 next_page(i) 97 browser.close() 98 99 100 if __name__ == '__main__': 101 main()

     顺利抓取到4365条商品信息:

  • 相关阅读:
    使用ATL开发ActiveX控件
    [Silverlight]AutoCompleteBox控件的一个Bug?
    [Silverlight]一个简单的GroupBox控件
    WCF安全之ASP.NET兼容模式
    Mysql 性能优化记录
    【Python+Django+Pytest】数据库异常pymysql.err.InterfaceError: (0, '') 解决方案
    Django在使用logging日志模块时报错无法操作文件 logging error Permission Error [WinError 32]
    isinstance 判断一个对象属于或不属于多种数据类型
    CentOS 系统 查看 cpu核数
    我踩过的python的坑
  • 原文地址:https://www.cnblogs.com/chang2021/p/13782851.html
Copyright © 2020-2023  润新知