• Scraping product information from JD.com (京东商城)


    0x01 Scraping 8GB RAM module listings from JD.com with Chrome + Selenium

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    import pymongo
    import csv
    import time
    from pyecharts.charts import Bar
    from pyecharts import options as opts
    import pandas as pd
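    # Environment assumptions: a chromedriver build matching the installed Chrome is on PATH,
    # and a MongoDB server is listening on localhost:27017.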
    
    # Connect to the local MongoDB instance and use the JD database
    client=pymongo.MongoClient(host='localhost',port=27017)
    db=client.JD
    
    datalist=[]
    
    
    def spider_data():
        browser=webdriver.Chrome()
        url='https://www.jd.com/'
        browser.get(url)
        browser.find_element(By.ID,'key').send_keys('8G内存条')
        browser.find_element(By.ID,'key').send_keys(Keys.ENTER)
        # Optional explicit wait for the "next page" button to load
        #WebDriverWait(browser,1000).until(EC.presence_of_element_located((By.CLASS_NAME,'pn-next')))
    
        count=0
        while True:
            try:
                count+=1
                # Explicit wait until the product list items are present
                WebDriverWait(browser,1000).until(EC.presence_of_element_located((By.CLASS_NAME,'gl-item')))
                # Scroll to the bottom so lazily loaded items render, then scroll back to the top
                browser.execute_script('document.documentElement.scrollTop=10000')
                time.sleep(3)
                browser.execute_script('document.documentElement.scrollTop=0')
                lists = browser.find_elements(By.CLASS_NAME,'gl-item')
                for li in lists:
                    name=li.find_element(By.XPATH,'.//div[@class="p-name p-name-type-2"]//em').text
                    price=li.find_element(By.XPATH,'.//div[@class="p-price"]//i').text
                    commit=li.find_element(By.XPATH,'.//div[@class="p-commit"]//a').text
                    shop_name=li.find_element(By.XPATH,'.//div[@class="p-shop"]//a').text
    
                    datas={}
                    datas['name']=name
                    datas['price']=price
                    datas['commit']=commit
                    datas['shop_name']=shop_name
    
                    datalist.append(datas)
                    # Insert the record into the MongoDB collection
                    collection=db.datas
                    collection.insert_one(datas)
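                    # insert_one() also adds an '_id' field to datas in place, so the id
                    # ends up in datalist and the CSV; clear_data() drops that column later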
                    print(datas)
    
            except Exception as e:
                print('ERROR:',e)
    
            # Only scrape the first page of results; raise this limit to crawl more pages
            if count==1:
                break
    
            # Click through to the next results page
            next_page=browser.find_element(By.CSS_SELECTOR,'a.pn-next')
            next_page.click()
        print("数据爬取完成")
    
    # Write the scraped data to a CSV file
    def write_data():
    
        with open('E:/data_csv.csv','w',encoding='utf-8',newline='') as f:
            try:
                title=datalist[0].keys()
                writer=csv.DictWriter(f,title)
                writer.writeheader()
                writer.writerows(datalist)
            except Exception as e:
                print('Error:',e)
        print('CSV file written')
    
    
    # Data cleaning
    def clear_data():
    
        data = pd.read_csv('E:/data_csv.csv')
        # Drop the '_id' column added by MongoDB
        data.drop('_id', axis=1, inplace=True)
        # Drop rows whose commit cell is the '去看二手' (see second-hand) link instead of a review count
        data.drop(data[data['commit'].str.contains('去看二手')].index, inplace=True)
        def convert_data(var):
            # Strip the '+' and '万' suffixes from the review count string
            new_value = var.replace('+', '').replace('万', '')
            return float(new_value)
        # Normalize the commit (review count) column
        data['commit'] = data['commit'].apply(convert_data)
        # Drop rows with a normalized commit value of 100 or more
        data.drop(data[data['commit'] >= 100].index, inplace=True)
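        # e.g. '2万+' becomes 2.0 while '500+' becomes 500.0, so only listings whose
        # review counts were given in 万 (tens of thousands) survive this filter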
        # Save the cleaned data as a CSV file
        data.to_csv('E:/clear_data.csv')
    
    def group_data():
        # Split the cleaned data by brand
        data1 = pd.read_csv('E:/clear_data.csv')
        # Keep only Kingston (金士顿) by dropping the other brands
        data1.drop(data1[data1['name'].str.contains('十铨|宇瞻|光威|美商海盗船|威刚|芝奇|三星|金百达|英睿达|联想|佰微')].index, inplace=True)
        # Save as a CSV file
        data1.to_csv('E:/Kingston.csv')
    
        data2 = pd.read_csv('E:/clear_data.csv')
        # Keep only ADATA (威刚) by dropping the other brands
        data2.drop(data2[data2['name'].str.contains('金士顿|十铨|宇瞻|光威|美商海盗船|芝奇|三星|金百达|英睿达|联想|佰微')].index, inplace=True)
        # Save as a CSV file
        data2.to_csv('E:/weigang.csv')
        print('Data cleaning and grouping finished')
    
    # Data visualization: bar charts of price and review count rendered with pyecharts
    def show_data():
    
        data_path=pd.read_csv('E:/clear_data.csv')
        bar=Bar()
        bar.add_xaxis(data_path['name'].tolist())
        bar.add_yaxis('价格',data_path['price'].tolist())
        bar.add_yaxis('评论',data_path['commit'].tolist())
        bar.set_global_opts(title_opts=opts.TitleOpts(title="商品价格"))
        bar.render('all_data.html')
    
        data_path1 = pd.read_csv('E:/Kingston.csv')
        bar1 = Bar()
        bar1.add_xaxis(data_path1['name'].tolist())
        bar1.add_yaxis('价格', data_path1['price'].tolist())
        bar1.add_yaxis('评论', data_path1['commit'].tolist())
        bar1.set_global_opts(title_opts=opts.TitleOpts(title="商品价格"))
        bar1.render('Kingston.html')
    
        data_path2 = pd.read_csv('E:/weigang.csv')
        bar2 = Bar()
        bar2.add_xaxis(data_path2['name'].tolist())
        bar2.add_yaxis('价格', data_path2['price'].tolist())
        bar2.add_yaxis('评论', data_path2['commit'].tolist())
        bar2.set_global_opts(title_opts=opts.TitleOpts(title="商品价格"))
        bar2.render('weigang.html')
    
    
    if __name__=='__main__':
    
        spider_data()
        write_data()
        clear_data()
        group_data()
        show_data()
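
    As a quick sanity check after a run, the MongoDB collection can be queried directly. Below is a minimal sketch, assuming MongoDB is running on localhost:27017 and the scrape above has populated the JD.datas collection:

    import pymongo

    client = pymongo.MongoClient(host='localhost', port=27017)
    collection = client.JD.datas
    print('documents stored:', collection.count_documents({}))   # total number of scraped records
    print('sample record:', collection.find_one({}, {'_id': 0})) # one record without the MongoDB _id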
    
    
    

    0x02 Results


    Reference:
    https://blog.csdn.net/weixin_44024393/article/details/89289694

  • Original post: https://www.cnblogs.com/observering/p/13357429.html