• 把字符串离散化


    1.获取字符串的去重后列表
    2.构造全为0的数组(DataFrame), columns为字符串的列表
    3.给全为0的数组赋值
    第一步

    import pandas as pd
    import numpy as np
    
    # Step 1: collect the distinct labels found in the comma-separated column.
    #
    # Demo frame: column 'c' holds comma-separated labels; the other columns
    # are filler and are not used by the discretisation itself.
    df = pd.DataFrame({'a': range(7),
                       'b': range(7, 0, -1),
                       'c': ['one,two,three',
                             'one,two',
                             'two,four',
                             'two,five,four,six',
                             'seven,eight,one',
                             'nine,ten,six,four',
                             'ten,six,two,seven'],
                       'd': list('hjklmno')})
    print('=' * 40)
    print(df['c'])
    # Output:
    # 0        one,two,three
    # 1              one,two
    # 2             two,four
    # 3    two,five,four,six
    # 4      seven,eight,one
    # 5    nine,ten,six,four
    # 6    ten,six,two,seven
    # Name: c, dtype: object

    # Split each string on ',' so every row becomes a list of labels.
    a = df['c'].str.split(',')
    print(a)
    # Output:
    # 0         [one, two, three]
    # 1                [one, two]
    # 2               [two, four]
    # 3    [two, five, four, six]
    # 4       [seven, eight, one]
    # 5    [nine, ten, six, four]
    # 6    [ten, six, two, seven]
    # Name: c, dtype: object
    print('=' * 50)
    # Reuse the already-split Series rather than splitting the column again.
    a_lst = a.tolist()
    print(a_lst)

    # [['one', 'two', 'three'], ['one', 'two'], ['two', 'four'],
    # ['two', 'five', 'four', 'six'], ['seven', 'eight', 'one'],
    # ['nine', 'ten', 'six', 'four'], ['ten', 'six', 'two', 'seven']]

    print('*' * 60)
    # Order-preserving de-duplication: dict keys keep first-insertion order,
    # and the hash lookup replaces a linear `in` scan of a growing list.
    new_lst = list(dict.fromkeys(label for row in a_lst for label in row))
    print(new_lst)
    # ['one', 'two', 'three', 'four', 'five',
    # 'six', 'seven', 'eight', 'nine', 'ten']

    第二步

    # Step 2: allocate an all-zero indicator table with one row per row of
    # `df` and one float column per distinct label in `new_lst`.
    zero_matrix = np.zeros((df.shape[0], len(new_lst)))
    df_zeros = pd.DataFrame(data=zero_matrix, columns=new_lst)
    print(df_zeros)
    # Output:
    #    one  two  three  four  five  six  seven  eight  nine  ten
    # 0  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
    # 1  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
    # 2  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
    # 3  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
    # 4  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
    # 5  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
    # 6  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0

    第三步的方法二(数据量大的情况下使用;按列赋值,与下面"第三步"的按行循环赋值结果相同,二选一即可)

    # Column-wise fill: for every label, flag the rows whose 'c' string
    # contains that label as a whole comma-separated token.
    #
    # Wrapping both the cell text and the label in commas turns the test into
    # an exact token match, so a label that is a substring of another label
    # cannot match by accident; regex=False keeps labels literal.
    padded = ',' + df['c'] + ','
    for label in new_lst:
        # na=False treats missing cells as "label absent" instead of
        # producing a NaN mask that cannot be used for indexing.
        has_label = padded.str.contains(',' + label + ',', regex=False, na=False)
        # A single .loc call writes into df_zeros itself; chained indexing
        # (df_zeros[label][mask] = 1) may assign to a temporary copy.
        # The mask is applied positionally because the rows of df_zeros were
        # created in the same order as the rows of df.
        df_zeros.loc[has_label.to_numpy(), label] = 1
    print(df_zeros)

    第三步

    # Step 3, row-wise fill: for each row position, set to 1 every column
    # named by one of that row's labels.
    for row_label, row_labels in enumerate(a_lst):
        df_zeros.loc[row_label, row_labels] = 1

    print(df_zeros)
    # Output:
    #    one  two  three  four  five  six  seven  eight  nine  ten
    # 0  1.0  1.0    1.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
    # 1  1.0  1.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
    # 2  0.0  1.0    0.0   1.0   0.0  0.0    0.0    0.0   0.0  0.0
    # 3  0.0  1.0    0.0   1.0   1.0  1.0    0.0    0.0   0.0  0.0
    # 4  1.0  0.0    0.0   0.0   0.0  0.0    1.0    1.0   0.0  0.0
    # 5  0.0  0.0    0.0   1.0   0.0  1.0    0.0    0.0   1.0  1.0
    # 6  0.0  1.0    0.0   0.0   0.0  1.0    1.0    0.0   0.0  1.0
  • 相关阅读:
    关于SVN出现 svn working copy locked的原因及解决方法
    安装SVN客户端重启电脑之后,右键未出现SVN选项的原因
    Django—工程创建以及models数据库易错点
    tornado之文件上传的几种形式form,伪ajax(iframe)
    python 收录几种实现线程池的方法
    python 多线程,进程的理解
    python之路 序列化 pickle,json
    collections模块方法详解
    python之路 socket,socketserver初探
    SQL- 约束
  • 原文地址:https://www.cnblogs.com/wt7018/p/11976958.html
Copyright © 2020-2023  润新知