def kv2sparse(lines, num_cols, sep=',', kv_sep=':', hash_key=False):
    """
    Parse key-value formatted text lines into a sparse matrix.

    Each line is split on ``sep`` into key-value pairs and each pair is
    split on ``kv_sep`` into a key and a value. Line ``i`` becomes row
    ``i`` of the result; the key selects the column and the value is
    stored as a float32 entry.

    Parameters
    ----------
    lines : string or string tensor
        The input data in key-value format, one record per element.
    num_cols : int64
        The number of columns of the output sparse matrix. When
        ``hash_key`` is True it is also the number of hash buckets.
    sep : string, optional
        The separator between key-value pairs. The default is ','.
    kv_sep : string, optional
        The separator between a key and its value. The default is ':'.
    hash_key : bool, optional
        If True, map each key string to a bucket with
        ``tf.string_to_hash_bucket_fast``. If False, parse each key as
        an int64 and use it directly as the column index. The default
        is False.

    Returns
    -------
    X : SparseTensor
        The output sparse matrix, with dense shape
        ``[number of lines, num_cols]``.

    Notes
    -----
    Entries are emitted in input order: indices are not sorted and
    repeated keys within a line are not merged. With
    ``hash_key=False`` the parsed keys are not range-checked against
    ``num_cols``. Every pair is expected to contain exactly one
    ``kv_sep``; pairs with differing field counts make the reshape
    below fail.

    Examples
    --------
    ::

        lines = ["12:4,1:5,88:6,1:3,2:100", "12:4,1:5,88:6,1:3,2:100"]
        X1 = kv2sparse(lines, 100)
        X2 = kv2sparse(lines, 100, hash_key=True)

    Then X1 (keys used directly as column indices) evaluates to

    ::

        SparseTensorValue(
            indices=array([[ 0, 12], [ 0,  1], [ 0, 88], [ 0,  1], [ 0,  2],
                           [ 1, 12], [ 1,  1], [ 1, 88], [ 1,  1], [ 1,  2]]),
            values=array([  4.,   5.,   6.,   3., 100.,
                            4.,   5.,   6.,   3., 100.], dtype=float32),
            dense_shape=array([  2, 100]))

    and X2 (keys hashed into 100 buckets; bucket ids below are the ones
    listed in the original example and were not re-verified) to

    ::

        SparseTensorValue(
            indices=array([[ 0, 88], [ 0, 49], [ 0, 53], [ 0, 49], [ 0, 59],
                           [ 1, 88], [ 1, 49], [ 1, 53], [ 1, 49], [ 1, 59]]),
            values=array([  4.,   5.,   6.,   3., 100.,
                            4.,   5.,   6.,   3., 100.], dtype=float32),
            dense_shape=array([  2, 100]))
    """
    # First level: one sparse row per input line, one entry per "key:value".
    pairs = tf.string_split(lines, sep)

    # Second level: break every pair into its fields and lay them out as a
    # dense [num_pairs, fields_per_pair] string table.
    fields = tf.string_split(pairs.values, kv_sep)
    table = tf.reshape(fields.values, fields.dense_shape)
    key_part, val_part = tf.split(table, num_or_size_splits=2, axis=1)

    keys = key_part[:, 0]
    if hash_key:
        col_ids = tf.string_to_hash_bucket_fast(keys, num_cols)
    else:
        col_ids = tf.string_to_number(keys, out_type=tf.int64)

    # The row of each pair is the index of the line it came from.
    row_ids = pairs.indices[:, 0]
    entry_indices = tf.stack((row_ids, col_ids), axis=-1)
    entry_values = tf.string_to_number(val_part[:, 0], out_type=tf.float32)
    shape = tf.stack([pairs.dense_shape[0], num_cols])

    return tf.SparseTensor(
        indices=entry_indices,
        values=entry_values,
        dense_shape=shape,
    )