• [Code] kv2sparse


    import tensorflow as tf

    def kv2sparse(lines, num_cols, sep=',', kv_sep=':', hash_key=False):
        """
        Parse data in key-value format.
    
        Parameters
        ----------
        lines : string or string tensor
            the input data in key-value format.
        num_cols : int64
            the number of columns for the output sparse matrix.
        sep : string, optional
            the separator for each key-value pair. The default is ','.
        kv_sep : string, optional
            the separator between key and value. The default is ':'.
        hash_key : bool, optional
            convert the keys to buckets by hash function. The default is False.
    
        Returns
        -------
        X : SparseTensor
            the output sparse matrix.
    
        Examples
        --------
        
        ::
    
            lines = ["12:4,1:5,88:6,1:3,2:100", "12:4,1:5,88:6,1:3,2:100"]
            X1 = kv2sparse(lines, hash_key=True)
            X2 = kv2sparse(lines)
        
        Then X1, X2 will be returned as follows 
        
        ::
    
            X1 = SparseTensorValue(indices=array([[ 0, 12],
             [ 0,  1],
             [ 0, 88],
             [ 0,  1],
             [ 0,  2],
             [ 1, 12],
             [ 1,  1],
             [ 1, 88],
             [ 1,  1],
             [ 1,  2]]), values=array([  4., 5., 6., 3., 100., 4., 5., 6., 3., 100.],
            dtype=float32), dense_shape=array([  2, 100]))
            X2 = SparseTensorValue(indices=array([[ 0, 88],
             [ 0, 49],
             [ 0, 53],
             [ 0, 49],
             [ 0, 59],
             [ 1, 88],
             [ 1, 49],
             [ 1, 53],
             [ 1, 49],
             [ 1, 59]]), values=array([  4., 5., 6., 3., 100., 4., 5., 6., 3., 100.],
            dtype=float32), dense_shape=array([  2, 100]))
        """
    
        # Split each line into "key:value" tokens; `columns` is a SparseTensor
        # whose row indices record which input line each token came from.
        columns = tf.string_split(lines, sep)
        # Split every token into its key and value parts.
        splits = tf.string_split(columns.values, kv_sep)
        # Each token yields exactly two pieces, so reshape to (num_pairs, 2).
        id_vals = tf.reshape(splits.values, splits.dense_shape)
        col_ids, vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        if hash_key:
            # Hash string keys into num_cols buckets.
            col_ids = tf.string_to_hash_bucket_fast(col_ids[:, 0], num_cols)
        else:
            # Keys are already integer column indices; just parse them.
            col_ids = tf.string_to_number(col_ids[:, 0], out_type=tf.int64)
        X = tf.SparseTensor(
            indices=tf.stack((columns.indices[:, 0], col_ids), axis=-1),
            values=tf.string_to_number(vals[:, 0], out_type=tf.float32),
            dense_shape=tf.stack([columns.dense_shape[0], num_cols]))
        return X
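
A minimal usage sketch (not from the original post), assuming a TensorFlow 1.x runtime in graph mode; the variable names below are only illustrative:

    lines = ["12:4,1:5,88:6,1:3,2:100", "12:4,1:5,88:6,1:3,2:100"]
    # Column ids taken directly from the keys vs. hashed into 100 buckets.
    X_plain = kv2sparse(lines, num_cols=100)
    X_hashed = kv2sparse(lines, num_cols=100, hash_key=True)

    # Evaluating the SparseTensors yields SparseTensorValue results like the
    # ones shown in the docstring above.
    with tf.Session() as sess:
        print(sess.run(X_plain))    # dense_shape == [2, 100]
        print(sess.run(X_hashed))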
    
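For reference, a rough TensorFlow 2.x equivalent is sketched below. It is not part of the original post; it simply follows the same parsing steps with the tf.strings ops and assumes well-formed input where every token is exactly one key/value pair:

    import tensorflow as tf

    def kv2sparse_v2(lines, num_cols, sep=',', kv_sep=':', hash_key=False):
        # Split each line into "key:value" tokens; the result is a RaggedTensor.
        columns = tf.strings.split(lines, sep)
        # Split every token into [key, value] and densify to shape (num_pairs, 2).
        kv = tf.strings.split(columns.flat_values, kv_sep).to_tensor()
        keys, vals = kv[:, 0], kv[:, 1]
        if hash_key:
            col_ids = tf.strings.to_hash_bucket_fast(keys, num_cols)
        else:
            col_ids = tf.strings.to_number(keys, out_type=tf.int64)
        # The ragged row partition records which input line each pair came from.
        row_ids = tf.cast(columns.value_rowids(), tf.int64)
        n_rows = tf.cast(tf.shape(lines)[0], tf.int64)
        return tf.SparseTensor(
            indices=tf.stack([row_ids, col_ids], axis=-1),
            values=tf.strings.to_number(vals, out_type=tf.float32),
            dense_shape=tf.stack([n_rows, tf.constant(num_cols, tf.int64)]))
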
  • Original post: https://www.cnblogs.com/bregman/p/13743299.html