• [Code] kv2sparse


    import tensorflow as tf

    def kv2sparse(lines, num_cols, sep=',', kv_sep=':', hash_key=False):
        """
        Parse data in key-value format.
    
        Parameters
        ----------
        lines : string or string tensor
            the input data in key-value format.
        num_cols : int64
            the number of columns for the output sparse matrix.
        sep : string, optional
            the separator for each key-value pair. The default is ','.
        kv_sep : string, optional
            the separator between key and value. The default is ':'.
        hash_key : bool, optional
            convert the keys to buckets by hash function. The default is False.
    
        Returns
        -------
        X : SparseTensor
            the output sparse matrix.
    
        Examples
        --------
        
        ::
    
            lines = ["12:4,1:5,88:6,1:3,2:100", "12:4,1:5,88:6,1:3,2:100"]
            X1 = kv2sparse(lines, hash_key=True)
            X2 = kv2sparse(lines)
        
        Then X1, X2 will be returned as follows 
        
        ::
    
            X1 = SparseTensorValue(indices=array([[ 0, 12],
             [ 0,  1],
             [ 0, 88],
             [ 0,  1],
             [ 0,  2],
             [ 1, 12],
             [ 1,  1],
             [ 1, 88],
             [ 1,  1],
             [ 1,  2]]), values=array([  4., 5., 6., 3., 100., 4., 5., 6., 3., 100.],
            dtype=float32), dense_shape=array([  2, 100]))
            X2 = SparseTensorValue(indices=array([[ 0, 88],
             [ 0, 49],
             [ 0, 53],
             [ 0, 49],
             [ 0, 59],
             [ 1, 88],
             [ 1, 49],
             [ 1, 53],
             [ 1, 49],
             [ 1, 59]]), values=array([  4., 5., 6., 3., 100., 4., 5., 6., 3., 100.],
            dtype=float32), dense_shape=array([  2, 100]))
        """
    
        # Split each line into "key:value" tokens; `columns` is a SparseTensor
        # whose row indices record which input line each token came from.
        columns = tf.string_split(lines, sep)
        # Split every token into its key and value parts.
        splits = tf.string_split(columns.values, kv_sep)
        # Each token yields exactly two pieces, so reshape to (num_pairs, 2).
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        col_ids, vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        if hash_key:
            # Hash string keys into num_cols buckets.
            col_ids = tf.string_to_hash_bucket_fast(col_ids[:, 0], num_cols)
        else:
            # Keys are already integer column indices; just parse them.
            col_ids = tf.string_to_number(col_ids[:, 0], out_type=tf.int64)
        X = tf.SparseTensor(
            indices=tf.stack((columns.indices[:, 0], col_ids), axis=-1),
            values=tf.string_to_number(vals[:, 0], out_type=tf.float32),
            dense_shape=tf.stack([columns.dense_shape[0], num_cols]))
        return X
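
A minimal usage sketch (not from the original post), assuming a TensorFlow 1.x runtime in graph mode; the variable names below are only illustrative:

    lines = ["12:4,1:5,88:6,1:3,2:100", "12:4,1:5,88:6,1:3,2:100"]
    # Column ids taken directly from the keys vs. hashed into 100 buckets.
    X_plain = kv2sparse(lines, num_cols=100)
    X_hashed = kv2sparse(lines, num_cols=100, hash_key=True)

    # Evaluating the SparseTensors yields SparseTensorValue results like the
    # ones shown in the docstring above.
    with tf.Session() as sess:
        print(sess.run(X_plain))    # dense_shape == [2, 100]
        print(sess.run(X_hashed))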
    
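For reference, a rough TensorFlow 2.x equivalent is sketched below. It is not part of the original post; it simply follows the same parsing steps with the tf.strings ops and assumes well-formed input where every token is exactly one key/value pair:

    import tensorflow as tf

    def kv2sparse_v2(lines, num_cols, sep=',', kv_sep=':', hash_key=False):
        # Split each line into "key:value" tokens; the result is a RaggedTensor.
        columns = tf.strings.split(lines, sep)
        # Split every token into [key, value] and densify to shape (num_pairs, 2).
        kv = tf.strings.split(columns.flat_values, kv_sep).to_tensor()
        keys, vals = kv[:, 0], kv[:, 1]
        if hash_key:
            col_ids = tf.strings.to_hash_bucket_fast(keys, num_cols)
        else:
            col_ids = tf.strings.to_number(keys, out_type=tf.int64)
        # The ragged row partition records which input line each pair came from.
        row_ids = tf.cast(columns.value_rowids(), tf.int64)
        n_rows = tf.cast(tf.shape(lines)[0], tf.int64)
        return tf.SparseTensor(
            indices=tf.stack([row_ids, col_ids], axis=-1),
            values=tf.strings.to_number(vals, out_type=tf.float32),
            dense_shape=tf.stack([n_rows, tf.constant(num_cols, tf.int64)]))
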
  • Original post: https://www.cnblogs.com/bregman/p/13743299.html