• Tensorflow处理变长特征


    处理流程:

    1. 变长特征分割成变长数组
    2. 变长数据填充成规则数组,组成n * m的矩阵 (keras.preprocessing.sequence.pad_sequences)
    3. 每一行数据进行embedding,结果可以按权重求平均、直接求平均、求最大值 得到 n*1结果矩阵

    第3步求平均可以用tf.nn.embedding_lookup_sparse 来做,也可以在Embedding之后再加一层MaxPooling2D或者AveragePooling2D。

    参考:

    # Define the Keras model
    model = Sequential()
    model.add(Embedding(num_distinct_words, embedding_output_dims, input_length=max_sequence_length))
    model.add(Dropout(0.50))
    model.add(Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'))
    model.add(Dropout(0.50))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dropout(0.50))
    model.add(Dense(1, activation='sigmoid'))
    

    处理变长特征代码实现参考

    # -*- encoding: utf-8 -*-
    
    import pandas as pd
    from sklearn import preprocessing
    from tensorflow import keras
    from tensorflow.keras.models import Model, save_model
    from tensorflow.python.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.layers import Input
    import tensorflow as tf
    
    
    class VarLenColumnEmbedding(tf.keras.layers.Embedding):
        """Embedding for a padded variable-length categorical column.

        Looks up an embedding for every position of the padded id sequence,
        optionally applies SpatialDropout1D, then pools across the sequence
        axis ('mean' or 'max') so each row collapses to a single vector.

        Input shape:  (batch, max_len) integer ids.
        Output shape: (batch, 1, output_dim) after pooling over max_len.
        """

        def __init__(self, pooling_strategy='max', dropout_rate=0., **kwargs):
            if pooling_strategy not in ('mean', 'max'):
                # Fixed message: original said "Param strategy should is one of
                # mean, max" — broken grammar and the wrong parameter name.
                raise ValueError("pooling_strategy must be one of: 'mean', 'max'")
            self.pooling_strategy = pooling_strategy
            self.dropout_rate = dropout_rate  # 0 disables dropout entirely
            super(VarLenColumnEmbedding, self).__init__(**kwargs)

        def build(self, input_shape):
            super(VarLenColumnEmbedding, self).build(input_shape)  # Be sure to call this somewhere!
            height = input_shape[1]  # padded sequence length (pooling window)
            if self.pooling_strategy == "mean":
                self._pooling_layer = tf.keras.layers.AveragePooling2D(pool_size=(height, 1))
            else:
                self._pooling_layer = tf.keras.layers.MaxPooling2D(pool_size=(height, 1))

            if self.dropout_rate > 0:
                self._dropout = tf.keras.layers.SpatialDropout1D(self.dropout_rate)
            else:
                self._dropout = None

            self.built = True

        def call(self, inputs):
            # 1. embedding lookup: (batch, max_len) -> (batch, max_len, output_dim)
            embedding_output = super(VarLenColumnEmbedding, self).call(inputs)

            # 2. optional spatial dropout (drops whole embedding channels)
            if self._dropout is not None:
                dropout_output = self._dropout(embedding_output)
            else:
                dropout_output = embedding_output

            # 3. add a channels axis so 2D pooling can reduce the sequence dim
            inputs_4d = tf.expand_dims(dropout_output, 3)

            # 4. pool over the sequence axis
            tensor_pooling = self._pooling_layer(inputs_4d)

            # 5. drop the channels axis again
            return tf.squeeze(tensor_pooling, 3)

        def compute_mask(self, inputs, mask=None):
            # Fix: Keras calls compute_mask(inputs) with no mask argument, so
            # `mask` needs a default. The padding mask is consumed by pooling;
            # do not propagate it downstream.
            return None

        def get_config(self):
            # Fix: include BOTH constructor args so the layer round-trips
            # through serialization (the original omitted dropout_rate, so a
            # reloaded model silently lost its dropout).
            config = {'pooling_strategy': self.pooling_strategy,
                      'dropout_rate': self.dropout_rate}
            base_config = super(VarLenColumnEmbedding, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
    
    
    # ---- Load the sample and split the variable-length `genres` column ----
    data = pd.read_csv("movielens_sample.txt")

    series_genres = data.pop('genres').map(lambda s: s.split("|"))

    # Collect every genre token and track the longest list in a single pass.
    all_genres = []
    gs_max_len = 0
    for tokens in series_genres:
        gs_max_len = max(gs_max_len, len(tokens))
        all_genres.extend(tokens)

    unique_genres = list(set(all_genres))
    gs_n_unique = len(unique_genres)

    # Integer-encode the genre vocabulary.
    lb = preprocessing.LabelEncoder()
    lb.fit(unique_genres)

    y = data['rating'].values

    # Encode each row's genre list, then right-pad to a fixed width,
    # giving an (n_rows, gs_max_len) integer matrix.
    data_geners = pad_sequences(
        series_genres.map(lambda tokens: lb.transform(tokens).tolist()),
        maxlen=gs_max_len,
        padding='post',
    )

    # Integer-encode the remaining (fixed-length) categorical columns in place.
    feature_names = ['user_id', 'movie_id', 'timestamp', 'title', 'gender', 'age', 'occupation', 'zip']
    for col in feature_names:
        data[col] = preprocessing.LabelEncoder().fit_transform(data[col])

    movie_id_uniques = len(set(data['movie_id'].tolist()))

    # ---- Build a tiny classifier on top of the pooled genre embedding ----
    input_movie_genres = Input(shape=(gs_max_len,), name='movie_genres_input')
    layer_embedding_genres = VarLenColumnEmbedding(
        pooling_strategy='max', input_dim=gs_n_unique, output_dim=4,
    )(input_movie_genres)

    dense1 = keras.layers.Dense(512, activation='relu')(layer_embedding_genres)
    output = keras.layers.Dense(6, activation='softmax')(dense1)

    model = Model(inputs=[input_movie_genres], outputs=output)
    model.run_eagerly = True

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()

    model.fit(data_geners, y, epochs=10)
    

    网络结构:

    使用到的数据:
    movielens_sample.zip

  • 相关阅读:
    IntelliJ IDEA配置Tomcat 与安装Tomcat失败原因
    IntelliJ IDEA创建JavaWeb工程及配置Tomcat部署
    IntelliJ IDEA 通过GsonFormat插件将JSONObject格式的String 解析成实体
    as 插件GsonFormat用法(json字符串快速生成javabean)
    AndroidStudio导入项目一直卡在Building gradle project info最快速解决方案
    arraylist和linkedlist
    BNUOJ 3958 MAX Average Problem
    ZOJ 5579 Stean
    HDU 3401 Trade
    HDU 1695 GCD
  • 原文地址:https://www.cnblogs.com/oaks/p/14047575.html
Copyright © 2020-2023  润新知