• 【数据分析&数据挖掘】数据合并和拼接案例


     1 import pandas as pd
     2 import numpy as np
     3 
     4 # 加载数据——detail
     5 detail_1 = pd.read_excel("./meal_order_detail.xlsx", sheetname=0)
     6 detail_2 = pd.read_excel("./meal_order_detail.xlsx", sheetname=1)
     7 detail_3 = pd.read_excel("./meal_order_detail.xlsx", sheetname=2)
     8 
     9 print("detail_1 的形状: 
    ", detail_1.shape)
    10 print("detail_1 的列索引: 
    ", detail_1.columns)
    11 print("detail_2 的形状: 
    ", detail_2.shape)
    12 print("detail_2 的列索引: 
    ", detail_2.columns)
    13 print("detail_3 的形状: 
    ", detail_3.shape)
    14 print("detail_3 的列索引: 
    ", detail_3.columns)
    15 
    16 print("~"*60)
    17 # 将detail_2, detail_3直接追加到detaiL_1下面
    18 detail = pd.concat((detail_1, detail_2, detail_3), axis=0, join="inner")
    19 print("detail的形状;
    ", detail.shape)
    20 
    21 
    22 # 加载info
    23 info = pd.read_csv("./meal_order_info.csv", encoding="ansi")
    24 print("info: 
    ", info.shape)
    25 
    26 # info与detail进行主键拼接
    27 res = pd.merge(left=detail, right=info, left_on="order_id", right_on="info_id", how="inner")
    28 res = pd.merge(left=detail, right=info, left_on="order_id", right_on="info_id", how="left")
    29 print("info与detail主键拼接的结果为: 
    ", res.shape)
    30 print("res的列名: 
    ", res.columns)
    31 
    32 
    33 # 加载users
    34 users = pd.read_excel("./users.xlsx")
    35 # info与detail进行主键拼接的结果与users进行主键拼接
    36 res = pd.merge(left=res, right=users, left_on="name", right_on="ACCOUNT", how="inner")
    37 print("最终进行主键拼接的结果: 
    ", res)
    38 print("最终res的列名称: 
    ", res.columns)
    39 
    40 print("name与ACCOUNT对比相同", np.all(res.loc[:, "name"] == res.loc[:, "ACCOUNT"]))
    41 print("order_id与info_id对比相同", np.all(res.loc[:, "order_id"] == res.loc[:, "info_id"]))
    42 print("emp_id_x与emp_id_y对比相同", np.all(res.loc[:, "emp_id_x"] == res.loc[:, "emp_id_y"]))
    43 
    44 res.drop(labels=["ACCOUNT", "info_id", "emp_id_y"], axis=1, inplace=True)
    45 
    46 print("删除3列之后的结果: 
    ", res.shape)
    47 print("删除3列之后的结果: 
    ", res.columns)
    48 
    49 drop_list = []
    50 for column in res.columns:
    51     # 统计每一列的非空数据的数量
    52     res_count = res.loc[:, column].count()
    53     # 如果整列非空数据的数量为0,意味着整列都是空的
    54     if res_count == 0:
    55         drop_list.append(column)
    56 
    57 # 删除整列为空的列
    58 res.drop(labels=drop_list, axis=1, inplace=True)
    59 print("去除整列为空的数据之后的结果: 
    ", res.shape)
    60 print("去除整列为空的数据之后的结果: 
    ", res.columns)
    61 
    62 drop_dup_list = []
    63 # 如果整列数据完全相同——该列, 该属性对于区分各列没有意义
    64 for column in res.columns:
    65     res_ = res.drop_duplicates(subset=column, inplace=False)
    66     if res_.shape[0] == 1:
    67         print("res_.shape[0]: 
    ", res_.shape[0])
    68         drop_dup_list.append(column)
    69 
    70 # 删除全部一样的列
    71 res.drop(labels=drop_dup_list, axis=1, inplace=True)
    72 print("最终的结果: 
    ", res.shape)
    73 print("最终的结果: 
    ", res.columns)
  • 相关阅读:
    AUTOSAR-文档中所使用的UML文件
    开心一下-实现基于Java一个中文编程语言
    HDU 1026 Ignatius and the Princess I
    HDU 2553 (状压) N皇后问题 (2)
    UVa 11825 (状压DP) Hackers' Crackdown
    UVa 10891 (博弈+DP) Game of Sum
    LA 2965 Jurassic Remains
    HDU 2126 (背包方法数) Buy the souvenirs
    POJ 3984 迷宫问题
    POJ 3287 (基础BFS) Catch That Cow
  • 原文地址:https://www.cnblogs.com/Tree0108/p/12116107.html
Copyright © 2020-2023  润新知