• Python deep learning: speech recognition


    Problem description

    Speech recognition was a Kaggle competition from a year or two ago. The task is to recognize simple spoken English words, such as bed, cat, and right, recorded in a variety of environments. The training set is provided as .wav audio files, roughly 2 GB after extraction.

    Training set analysis

    Speech recognition is quite a different field from image recognition, so the first step is to understand the basic characteristics of the training set and work out what different recordings have in common.

    Amplitude and frequency

    import gc
    import os
    from os.path import join

    import numpy as np
    import matplotlib.pyplot as plt
    from scipy import signal
    from scipy.io import wavfile


    def log_specgram(audio, sample_rate, window_size=20,
                     step_size=10, eps=1e-10):
        # Window and step are in milliseconds; convert them to sample counts.
        nperseg = int(round(window_size * sample_rate / 1e3))
        noverlap = int(round(step_size * sample_rate / 1e3))
        freqs, times, spec = signal.spectrogram(audio,
                                                fs=sample_rate,
                                                window='hann',
                                                nperseg=nperseg,
                                                noverlap=noverlap,
                                                detrend=False)
        # Log-scale the power spectrogram; eps guards against log(0).
        return freqs, times, np.log(spec.T.astype(np.float32) + eps)
    
    
    def plt_specgram(freqs, times, spectrogram):
        fig = plt.figure(figsize=(14, 8))
        ax1 = fig.add_subplot(211)
        ax1.set_title('Raw wave of ' + filename)
        ax1.set_ylabel('Amplitude')
        # x-axis in seconds: the clip's duration is len(samples) / sample_rate.
        ax1.plot(np.linspace(0, len(samples) / sample_rate, num=len(samples)), samples)
    
        ax2 = fig.add_subplot(212)
        ax2.imshow(spectrogram.T, aspect='auto', origin='lower',
                   extent=[times.min(), times.max(), freqs.min(), freqs.max()])
        ax2.set_yticks(freqs[::16])
        ax2.set_xticks(times[::16])
        ax2.set_title('Spectrogram of ' + filename)
        ax2.set_ylabel('Freqs in Hz')
        ax2.set_xlabel('Seconds')
    
        plt.savefig('./output/yes_0a7c2a8d_nohash_0.png')
        plt.show()
    
    train_audio_path = './input/train/audio/'
    filename = '/yes/0a7c2a8d_nohash_0.wav'
    sample_rate, samples = wavfile.read(str(train_audio_path) + filename)
    # print(sample_rate, samples)
    
    # Spectrogram
    freqs, times, spectrogram = log_specgram(samples, sample_rate)
    plt_specgram(freqs, times, spectrogram)
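
    When exploring the training set it also helps to listen to a clip alongside its plots. A minimal sketch, assuming the code runs in a Jupyter notebook (this snippet is an addition, not part of the original script):

    # Audition the same clip in a notebook (illustrative addition).
    from IPython.display import Audio
    Audio(samples, rate=sample_rate)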
    

    Sound intensity

    import librosa
    import librosa.display


    def plt_spectrogram():
        plt.figure(figsize=(12, 4))
        # log_S is the mel power spectrogram in dB, computed below.
        librosa.display.specshow(log_S, sr=sample_rate, x_axis='time', y_axis='mel')
        plt.title('Mel power spectrogram')
        plt.colorbar(format='%+02.0f dB')
        plt.tight_layout()

        plt.savefig('./output/spectrogram.png')
        plt.show()

    # Mel power spectrogram
    sig = samples / max(abs(samples))  # normalize amplitude to [-1, 1]
    S = librosa.feature.melspectrogram(y=sig, sr=sample_rate, n_mels=128)
    log_S = librosa.power_to_db(S, ref=np.max)
    
    plt_spectrogram()
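
    Mel-frequency cepstral coefficients (MFCCs) are a common compact feature derived from exactly this log-mel representation. A small sketch with librosa, as an aside to the original pipeline (the parameter choices are mine):

    # MFCCs and their deltas from the log-mel spectrogram above (illustrative).
    mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13)
    delta_mfcc = librosa.feature.delta(mfcc)  # first-order temporal differences
    print(mfcc.shape, delta_mfcc.shape)       # (13, n_frames) for each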
    

    3D visualization

    import plotly.graph_objs as go
    import plotly.offline as py


    def spectrogram_3d():
        # Render the log spectrogram as a 3D surface: time x frequency x log amplitude.
        data = [go.Surface(z=spectrogram.T)]

        layout = go.Layout(
            title='Spectrogram of "yes" in 3d',
            scene=dict(
                yaxis=dict(title='Frequencies'),
                xaxis=dict(title='Time'),
                zaxis=dict(title='Log amplitude'),
            ),
        )

        fig = go.Figure(data=data, layout=layout)
        py.plot(fig)
    spectrogram_3d()
    
    

    Recording counts per label

    # Label directories under the training audio path (one folder per word).
    dirs = [d for d in os.listdir(train_audio_path)
            if os.path.isdir(join(train_audio_path, d))]


    def count_summary():
        dirs.sort()
        print('Number of labels: ' + str(len(dirs)))

        number_of_recordings = []
        for direct in dirs:
            waves = [f for f in os.listdir(join(train_audio_path, direct)) if f.endswith('.wav')]
            number_of_recordings.append(len(waves))

        # Map each label to its recording count.
        speech_count = dict(zip(dirs, number_of_recordings))
        print(speech_count)
    count_summary()
    """输出
    {'_background_noise_': 6, 'bed': 1713, 'bird': 1731, 'cat': 1733, 'dog': 1746, 'down': 2359, 'eight': 2352, 'five': 2357, 'four': 2372, 'go': 2372, 'happy': 1742, 'house': 1750, 'left': 2353, 'marvin': 1746, 'nine': 2364, 'no': 2375, 'off': 2357, 'on': 2367, 'one': 2370, 'right': 2367, 'seven': 2377, 'sheila': 1734, 'six': 2369, 'stop': 2380, 'three': 2356, 'tree': 1733, 'two': 2373, 'up': 2375, 'wow': 1745, 'yes': 2377, 'zero': 2376}
    """
    

    Feature analysis for each word

    def mean_fft():
        # Restrict the plots to the 10 command words scored by the competition.
        to_keep = 'yes no up down left right on off stop go'.split()
        dirs_to_keep = [d for d in dirs if d in to_keep]

        print(dirs_to_keep)

        for direct in dirs_to_keep:
            vals_all = []
            spec_all = []
    
            waves = [f for f in os.listdir(join(train_audio_path, direct)) if f.endswith('.wav')]
            for wav in waves:
                sample_rate, samples = wavfile.read(train_audio_path + direct + '/' + wav)
                # Skip clips that are not exactly one second (16000 samples at 16 kHz).
                if samples.shape[0] != 16000:
                    continue
                xf, vals = custom_fft(samples, 16000)
                vals_all.append(vals)
                freqs, times, spec = log_specgram(samples, 16000)
                spec_all.append(spec)
    
            plt.figure(figsize=(14, 4))
            plt.subplot(121)
            plt.title('Mean fft of ' + direct)
            plt.plot(np.mean(np.array(vals_all), axis=0))
            plt.grid()
            plt.subplot(122)
            plt.title('Mean specgram of ' + direct)
            plt.imshow(np.mean(np.array(spec_all), axis=0).T, aspect='auto', origin='lower',
                       extent=[times.min(), times.max(), freqs.min(), freqs.max()])
            plt.yticks(freqs[::16])
            plt.xticks(times[::16])
    
            plt.savefig('./output/mean_fft_' + direct + '.png')
            plt.show()
    mean_fft()
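
    mean_fft relies on a custom_fft helper that the post does not show: the one-sided magnitude spectrum of a real-valued signal. A minimal sketch consistent with the widely shared Kaggle kernel this code follows (treat the exact implementation as an assumption):

    # Plausible reconstruction of custom_fft (assumed, not from the original post).
    from scipy.fftpack import fft

    def custom_fft(y, fs):
        T = 1.0 / fs        # sample period in seconds
        N = y.shape[0]      # number of samples
        yf = fft(y)         # complex spectrum
        xf = np.linspace(0.0, 1.0 / (2.0 * T), N // 2)  # frequencies up to Nyquist
        vals = 2.0 / N * np.abs(yf[0:N // 2])           # one-sided magnitude
        return xf, vals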
    

    The code outputs feature plots for every word; only the two plots for down and yes are shown here.

    Raw data preprocessing

    for label, fname in zip(labels, fnames):
        sample_rate, samples = wavfile.read(os.path.join(train_data_path, label, fname))
        samples = pad_audio(samples)  # zero-pad clips shorter than one second
        if len(samples) > 16000:
            # Longer files (the background noise recordings) are chopped into 1 s pieces.
            n_samples = chop_audio(samples)
        else:
            n_samples = [samples]
        for samples in n_samples:
            # Downsample before computing the log spectrogram to shrink the input.
            resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
            _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
            y_train.append(label)
            x_train.append(specgram)
    
    x_train = np.array(x_train)
    # Add a trailing channel dimension: (n, 99, 81) -> (n, 99, 81, 1).
    x_train = x_train.reshape(tuple(list(x_train.shape) + [1]))
    # One-hot encode the labels; label_transform returns a pandas DataFrame.
    y_train = label_transform(y_train)
    label_index = y_train.columns.values
    y_train = y_train.values
    del labels, fnames
    gc.collect()
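
    This fragment assumes labels, fnames, and train_data_path were gathered in setup code the post omits, and it calls pad_audio, chop_audio, and label_transform, which are also not shown. Below is a sketch of plausible implementations, following the Kaggle kernel this pipeline mirrors; the details, including new_sample_rate = 8000, are assumptions:

    # Plausible reconstructions of the missing helpers (assumed, per the common
    # Kaggle kernel): L is the clip length in samples, new_sample_rate the target rate.
    import pandas as pd

    L = 16000              # one second at the original 16 kHz rate
    new_sample_rate = 8000
    legal_labels = 'yes no up down left right on off stop go silence unknown'.split()

    def pad_audio(samples):
        # Left-pad short clips with zeros so every clip has exactly L samples.
        if len(samples) >= L:
            return samples
        return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant')

    def chop_audio(samples, num=20):
        # Cut `num` random one-second windows out of a longer recording.
        for _ in range(num):
            beg = np.random.randint(0, len(samples) - L)
            yield samples[beg: beg + L]

    def label_transform(labels):
        # Map rare words to 'unknown' and background noise to 'silence',
        # then one-hot encode with pandas.
        nlabels = []
        for label in labels:
            if label == '_background_noise_':
                nlabels.append('silence')
            elif label not in legal_labels:
                nlabels.append('unknown')
            else:
                nlabels.append(label)
        return pd.get_dummies(pd.Series(nlabels))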
    
    

    CNN modeling

    from keras import activations, losses, models, optimizers
    from keras.layers import (BatchNormalization, Convolution2D, Dense,
                              Dropout, Flatten, Input, MaxPooling2D)
    from sklearn.model_selection import train_test_split


    def model_cnn(x_train, y_train):
        input_shape = (99, 81, 1)  # (time frames, frequency bins, channels)
        nclass = 12  # 10 command words + 'unknown' + 'silence'
        inp = Input(shape=input_shape)
        norm_inp = BatchNormalization()(inp)
        img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(norm_inp)
        img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(img_1)
        img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
        img_1 = Dropout(rate=0.2)(img_1)
        img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
        img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
        img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
        img_1 = Dropout(rate=0.2)(img_1)
        img_1 = Convolution2D(32, kernel_size=3, activation=activations.relu)(img_1)
        img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
        img_1 = Dropout(rate=0.2)(img_1)
        img_1 = Flatten()(img_1)
    
        dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(img_1))
        dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(dense_1))
        dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)
    
        model = models.Model(inputs=inp, outputs=dense_1)
        opt = optimizers.Adam()
    
        # binary_crossentropy (kept from the original run) is an odd pairing with
        # a softmax output; categorical_crossentropy would be the standard choice.
        model.compile(optimizer=opt, loss=losses.binary_crossentropy)
        model.summary()
    
        x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=2017)
        model.fit(x_train, y_train, batch_size=16, validation_data=(x_valid, y_valid), epochs=3, shuffle=True, verbose=2)
    
        model.save(os.path.join(model_path, 'cnn.model'))
    
        return model
    
    model = model_cnn(x_train, y_train)
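
    The input_shape of (99, 81, 1) follows from the preprocessing: one second of audio resampled to 8 kHz, split with a 20 ms Hann window and a 10 ms step, gives (8000 - 80) / 80 = 99 time frames and 160 / 2 + 1 = 81 frequency bins. A quick check (assuming new_sample_rate = 8000, as above):

    # Sanity-check the spectrogram shape that feeds the CNN (illustrative).
    freqs, times, spec = log_specgram(np.zeros(8000, dtype=np.float32), 8000)
    print(spec.shape)  # (99, 81), matching input_shape = (99, 81, 1)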
    
    """输出
    Using TensorFlow backend.
    2019-06-15 20:59:45.453845 task begin
    ./input/train/audio
    2019-06-15 20:59:46.634215 xy begin
    /Users/user/Library/Python/3.6/lib/python/site-packages/scipy/io/wavfile.py:273: WavFileWarning: Chunk (non-data) not understood, skipping it.
      WavFileWarning)
    2019-06-15 21:02:35.116550 reshape begin
    2019-06-15 21:02:46.166546 model begin
    WARNING:tensorflow:From /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
    Instructions for updating:
    Colocations handled automatically by placer.
    2019-06-15 21:02:46.278603: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input_1 (InputLayer)         (None, 99, 81, 1)         0
    _________________________________________________________________
    batch_normalization_1 (Batch (None, 99, 81, 1)         4
    _________________________________________________________________
    conv2d_1 (Conv2D)            (None, 98, 80, 8)         40
    _________________________________________________________________
    conv2d_2 (Conv2D)            (None, 97, 79, 8)         264
    _________________________________________________________________
    max_pooling2d_1 (MaxPooling2 (None, 48, 39, 8)         0
    _________________________________________________________________
    dropout_1 (Dropout)          (None, 48, 39, 8)         0
    _________________________________________________________________
    conv2d_3 (Conv2D)            (None, 46, 37, 16)        1168
    _________________________________________________________________
    conv2d_4 (Conv2D)            (None, 44, 35, 16)        2320
    _________________________________________________________________
    max_pooling2d_2 (MaxPooling2 (None, 22, 17, 16)        0
    _________________________________________________________________
    dropout_2 (Dropout)          (None, 22, 17, 16)        0
    _________________________________________________________________
    conv2d_5 (Conv2D)            (None, 20, 15, 32)        4640
    _________________________________________________________________
    max_pooling2d_3 (MaxPooling2 (None, 10, 7, 32)         0
    _________________________________________________________________
    dropout_3 (Dropout)          (None, 10, 7, 32)         0
    _________________________________________________________________
    flatten_1 (Flatten)          (None, 2240)              0
    _________________________________________________________________
    dense_1 (Dense)              (None, 128)               286848
    _________________________________________________________________
    batch_normalization_2 (Batch (None, 128)               512
    _________________________________________________________________
    dense_2 (Dense)              (None, 128)               16512
    _________________________________________________________________
    batch_normalization_3 (Batch (None, 128)               512
    _________________________________________________________________
    dense_3 (Dense)              (None, 12)                1548
    =================================================================
    Total params: 314,368
    Trainable params: 313,854
    Non-trainable params: 514
    _________________________________________________________________
    Instructions for updating:
    Use tf.cast instead.
    Train on 58356 samples, validate on 6485 samples
    Epoch 1/3
     - 737s - loss: 0.1415 - val_loss: 0.0874
    Epoch 2/3
     - 608s - loss: 0.0807 - val_loss: 0.0577
    Epoch 3/3
     - 518s - loss: 0.0636 - val_loss: 0.0499
    2019-06-15 21:33:58.518621 predict begin
    """
    

    Prediction

    # Free the training arrays before streaming the test set.
    del x_train, y_train
    gc.collect()

    index = []
    results = []
    for fnames, imgs in test_data_generator(batch=32):
        predicts = model.predict(imgs)
        predicts = np.argmax(predicts, axis=1)  # most probable class per clip
        predicts = [label_index[p] for p in predicts]  # class index -> label name
        index.extend(fnames)
        results.extend(predicts)
    
    df = pd.DataFrame(columns=['fname', 'label'])
    df['fname'] = index
    df['label'] = results
    df.to_csv(os.path.join(out_path, 'sub.csv'), index=False)
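
    test_data_generator streams the test clips in batches; the post does not include it, so here is a minimal sketch following the same preprocessing as the training loop (the test path and batching details are assumptions):

    # Plausible reconstruction of test_data_generator (assumed details).
    from glob import glob

    def test_data_generator(batch=32):
        fpaths = glob(os.path.join('./input/test/audio/', '*.wav'))  # assumed path
        fnames, imgs = [], []
        for path in fpaths:
            rate, samples = wavfile.read(path)
            samples = pad_audio(samples)
            resampled = signal.resample(samples, int(new_sample_rate / rate * samples.shape[0]))
            _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
            imgs.append(specgram)
            fnames.append(os.path.basename(path))
            if len(imgs) == batch:
                yield fnames, np.array(imgs)[..., np.newaxis]
                fnames, imgs = [], []
        if imgs:  # final partial batch
            yield fnames, np.array(imgs)[..., np.newaxis]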
    
    Kaggle's test set is a 2-3 GB archive that expands to nearly 100,000 audio clips, which is more than my laptop can handle. I therefore tested on only 100 of them; comparing the predicted labels against what I heard when playing the .wav files, the predictions were correct. Since I have not predicted on the full set, the overall accuracy is unknown. I plan to predict on all of the data once I can train on a GPU.

    Full code download

    GitHub source code

• Original article: https://www.cnblogs.com/wanli002/p/11031392.html