• 爬取京东商品信息


    爬取京东商品信息

    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.keys import Keys
    import time
    
    # Build the Chrome launch options. 'disable-infobars' is passed as a
    # command-line switch to the browser (presumably to hide Chrome's
    # "controlled by automated software" info bar — verify against the Chrome
    # version in use).
    option = ChromeOptions()
    option.add_argument('disable-infobars')
    
    
    
    # Start the Chrome session used by the rest of the script.
    # NOTE(review): the chrome_options= keyword is deprecated in later Selenium
    # releases in favour of options= — confirm the installed version before
    # changing it.
    driver = webdriver.Chrome(chrome_options=option)
    
    def get_goods(driver):
        """Scrape every page of the JD search results currently shown by *driver*.

        For each result page the function scrolls down the page, reads the
        name, price, link, image URL and review count of every product, prints
        each record and appends it to ``京东女士内衣数据爬去.txt``. It then clicks
        the "next page" link and repeats.

        Pages are walked with a loop instead of self-recursion, so a long
        result set cannot exhaust Python's recursion limit, and the function
        returns normally once there is no usable "next page" link.

        Args:
            driver: Selenium WebDriver positioned on a JD search result page.
        """
        while True:
            _scroll_down(driver)

            # Container of all products, then the individual product items.
            good_div = driver.find_element_by_id('J_goodsList')
            good_list = good_div.find_elements_by_class_name('gl-item')

            # Open the output file once per page rather than once per product;
            # records are still printed and written one at a time, so a failure
            # on one item does not lose the items already processed.
            with open('京东女士内衣数据爬去.txt', 'a', encoding='utf-8') as f:
                for good in good_list:
                    goods = _describe_good(good)
                    print(goods)
                    f.write(goods + '\n')

            # find_elements_* returns an empty list instead of raising when the
            # link is absent, which gives the loop a clean stop condition.
            next_tags = driver.find_elements_by_class_name('pn-next')
            if not next_tags:
                break
            next_tag = next_tags[0]
            # NOTE(review): assumes the site marks the last page by adding a
            # 'disabled' class to the next-page link — confirm against the
            # live markup.
            if 'disabled' in (next_tag.get_attribute('class') or '').split():
                break

            next_tag.click()
            # Give the next result page time to load before scraping it.
            time.sleep(3)


    def _scroll_down(driver):
        """Scroll the page down in 20 steps of 500px, pausing 0.2s per step."""
        # Stepwise scrolling presumably lets lazily loaded products render
        # before they are read.
        number = 400
        for _ in range(20):
            driver.execute_script('window.scrollTo(0, %s)' % number)
            number += 500
            time.sleep(0.2)


    def _describe_good(good):
        """Return the formatted text record for one product element."""
        # Product name and price, flattened onto a single line.
        good_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
        good_price = good.find_element_by_css_selector('.p-price').text.replace('\n', '')

        # Product page link and thumbnail URL.
        good_link = good.find_element_by_css_selector('.p-img a').get_attribute('href')
        good_img = good.find_element_by_css_selector('.p-img img').get_attribute('src')

        # Review count; line breaks become spaces so the parts stay separated.
        good_commit = good.find_element_by_css_selector('.p-commit').text.replace('\n', ' ')

        return '''
                商品名称: %s
                商品价格: %s
                商品链接: %s
                商品图片: %s
                评价人数: %s
                ''' % (good_name, good_price, good_link, good_img, good_commit)
    
    
    
    # Script entry: search JD for the keyword and scrape all result pages.
    try:
        driver.get('https://www.jd.com/')
        # Let element lookups wait up to 10 seconds for elements to appear.
        driver.implicitly_wait(10)

        # Type the keyword into the search box and submit the search.
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys('女士内衣')
        search_button = driver.find_element_by_class_name('button')
        search_button.click()

        get_goods(driver)

        # Long pause after scraping (presumably to keep the browser open for
        # inspection — confirm whether it is still wanted).
        time.sleep(1000)

    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and leave the driver process running.
        driver.quit()
    

      

  • 相关阅读:
    为什么linux下多线程程序如此消耗虚拟内存【转】
    具体解说Android的图片下载框架UniversialImageLoader之磁盘缓存的扩展(二)
    【leetcode】Longest Common Prefix
    oracle插入特殊字符'&'问题
    tomcat下配置https环境
    .NET--接口设计
    Hibernate知识点总结
    VB.NET中DataGridView控件
    eclipse内存溢出报错:java.lang.OutOfMemoryError:Java heap space
    理论与实际相结合——三层架构解析
  • 原文地址:https://www.cnblogs.com/dengyanchuan/p/11086658.html
Copyright © 2020-2023  润新知