• 语音预处理(三):可视化


    一.可视化

    本节主要介绍如何将语音信号可视化,以及如何读取wav和label文件并保存为字典和列表。
    输入:wav文件所对应的数据矩阵wave_data和帧率(即采样率)framerate。
    例:[[1507 1374 1218 ... -78 -127 -43]] ,16000
    输出:可视化图,如波形图,频谱图

    二.代码:

    #coding=utf-8
    
    import os
    import wave
    import numpy as np
    import matplotlib.pyplot as plt
    import math 
    import time
    from python_speech_features import mfcc
    from python_speech_features import delta
    from python_speech_features import logfbank
    from scipy.fftpack import fft
    
    def read_wav_data(filename):
        r"""
        Read a WAV file and return its samples as a (channels, frames) matrix.

        :param filename: path of the WAV file to read,
            e.g. D:\GitHub\wav\dae\train\A2_1.wav
        :return: tuple ``(wave_data, framerate)``. ``wave_data`` is a 2-D array
            with one row per channel, e.g. [[1507 1374 1218 ... -78 -127 -43]];
            ``framerate`` is the value reported by ``getframerate()``.

        NOTE: the raw bytes are always decoded as ``np.short`` (native 16-bit
        integers); the sample width stored in the file is not consulted.
        """
        # The context manager closes the file even if reading raises.
        with wave.open(filename, "rb") as wav:
            num_frame = wav.getnframes()        # total number of frames
            num_channel = wav.getnchannels()    # number of channels
            framerate = wav.getframerate()      # frames per second
            str_data = wav.readframes(num_frame)  # all frames as raw bytes

        # np.frombuffer replaces the deprecated binary mode of np.fromstring.
        # It returns a read-only view, so copy to keep the result writable as
        # it was before.
        samples = np.frombuffer(str_data, dtype=np.short).copy()

        # Frames are interleaved by channel: reshape to (frames, channels),
        # then transpose so that each row holds one channel.
        wave_data = samples.reshape(-1, num_channel).T

        return wave_data, framerate
    # Sample indices 0..399 of one 400-point analysis frame.
    x = np.arange(400, dtype=np.int64)
    # Hamming window over those indices: 0.54 - 0.46 * cos(2*pi*n / (N - 1)), N = 400.
    w = 0.54 - 0.46 * np.cos((2 * np.pi * x) / (400 - 1))
    
    def GetFrequencyFeature(wavsignal, fs):
        """
        Compute a log-magnitude spectrogram from Hamming-windowed frames.

        :param wavsignal: audio matrix with one row per channel,
            e.g. [[1507 1374 1218 ... -78 -127 -43]]; only row 0 is used
        :param fs: sample rate, e.g. 16000
        :return: float array of shape (num_frames, 200); row i holds
            ``log(1 + |FFT(frame_i)| / total_samples)`` for the first 200 bins.
            A signal too short for a single frame yields shape (0, 200).

        NOTE: frame length (400) and frame shift (160) are fixed sample
        counts, i.e. 25 ms and 10 ms at 16 kHz, while the number of frames is
        derived from ``fs``. The two only agree when ``fs`` is 16000.
        """
        time_window = 25   # window duration in ms, used for the frame count
        frame_len = 400    # samples per frame
        frame_shift = 160  # samples between consecutive frame starts
        num_bins = frame_len // 2  # the spectrum of a real signal is symmetric

        wav_arr = np.array(wavsignal)
        wav_length = wav_arr.shape[1]  # samples per channel

        # Number of frames. Clamped at zero so that a signal shorter than one
        # window returns an empty result instead of failing in np.zeros.
        range0_end = max(0, int(wav_length / fs * 1000 - time_window) // 10)

        # Hamming window, built locally so the function is self-contained.
        n = np.arange(frame_len)
        window = 0.54 - 0.46 * np.cos(2 * np.pi * n / (frame_len - 1))

        # The removed ``np.float`` alias is replaced by the builtin ``float``.
        data_input = np.zeros((range0_end, num_bins), dtype=float)

        for i in range(range0_end):
            p_start = i * frame_shift
            frame = wav_arr[0, p_start:p_start + frame_len] * window
            # Magnitudes are scaled by the whole signal length, not the frame length.
            spectrum = np.abs(fft(frame)) / wav_length
            data_input[i] = spectrum[:num_bins]

        return np.log(data_input + 1)
    
    def wav_scale(energy):
        """
        Standardize a signal to zero mean and unit variance.

        :param energy: array exposing ``mean()`` and ``var()`` (e.g. an ndarray)
        :return: ``(energy - mean) / sqrt(variance)``, computed over all elements
        """
        centered = energy - energy.mean()  # remove the overall mean
        spread = math.sqrt(energy.var())   # standard deviation of all elements
        return centered / spread
    
    def wav_show(wave_data, fs):
        """
        Plot a one-dimensional signal against time and display the figure.

        :param wave_data: sequence of sample values; one point is drawn per element
        :param fs: number of samples per second; sample i is drawn at i / fs seconds
        """
        seconds_per_sample = 1.0 / fs
        timestamps = np.arange(0, len(wave_data)) * seconds_per_sample

        # Single full-figure axes holding the waveform.
        plt.subplot(111)
        plt.plot(timestamps, wave_data)
        plt.xlabel('time/s')
        plt.ylabel('value')
        plt.show()
    
    def get_wav_list(filename):
        """
        Load a wav list file into a lookup dict and an ordered list of keys.

        Every non-empty line is split on single spaces: field 0 becomes the
        key and field 1 the value.

        :param filename: path of the list file
        :return: tuple ``(dic_filelist, list_wavmark)``: the key -> value dict
            and the keys in file order
        :raises IndexError: if a non-empty line has no second field
        """
        # The context manager closes the file even if reading fails.
        with open(filename, 'r') as txt_obj:
            txt_lines = txt_obj.read().split('\n')

        dic_filelist = {}
        list_wavmark = []
        for line in txt_lines:
            if line != '':
                fields = line.split(' ')
                dic_filelist[fields[0]] = fields[1]
                list_wavmark.append(fields[0])
        return dic_filelist, list_wavmark
    
    def get_wav_symbol(filename):
        """
        Load the symbol sequence of every entry in a transcript file.

        Every non-empty line is split on single spaces: field 0 becomes the
        key and the remaining fields become its symbol list.

        :param filename: path of the transcript file
        :return: tuple ``(dic_symbol_list, list_symbolmark)``: the
            key -> list-of-symbols dict and the keys in file order

        The list of keys is also printed to standard output.
        """
        # The context manager closes the file even if reading fails.
        with open(filename, 'r') as txt_obj:
            txt_lines = txt_obj.read().split('\n')

        dic_symbol_list = {}
        list_symbolmark = []
        for line in txt_lines:
            if line != '':
                fields = line.split(' ')
                dic_symbol_list[fields[0]] = fields[1:]
                list_symbolmark.append(fields[0])

        print("list_symbolmark:\n{}".format(list_symbolmark))
        return dic_symbol_list, list_symbolmark
    
    def GetSymbolList(datapath):
        """
        Load the list of symbols used for labelling.

        Reads ``dict.txt`` from the current working directory as UTF-8, takes
        the first space-separated field of every non-empty line, and appends
        ``'_'`` as the final entry.

        :param datapath: NOTE(review): currently unused; the file opened is
            always ``dict.txt`` relative to the working directory. Kept
            unchanged so existing callers behave the same. Confirm whether
            the file should be looked up under ``datapath`` instead.
        :return: list of symbols in file order, followed by ``'_'``
        """
        # The context manager closes the file even if reading fails.
        with open('dict.txt', 'r', encoding='UTF-8') as txt_obj:
            txt_lines = txt_obj.read().split('\n')

        list_symbol = [line.split(" ")[0] for line in txt_lines if line != '']
        list_symbol.append('_')
        return list_symbol
        
    if __name__ == '__main__':
        # Demo: plot the raw waveform, the standardized waveform and the spectrogram.
        # The path is a raw string: in a normal literal the "\t" of "\train"
        # would be read as a TAB character and the file would not be found.
        wave_data, fs = read_wav_data(r"D:\GitHub\wav\dae\train\A2_1.wav")
        wav_show(wave_data[0], fs)

        wave_scale = wav_scale(wave_data)
        wav_show(wave_scale[0], fs)

        # Time the spectrogram computation.
        t0 = time.time()
        freimg = GetFrequencyFeature(wave_data, fs)
        t1 = time.time()
        print('time cost:', t1 - t0)

        # Transpose so that frames run along the horizontal axis of the image.
        freimg = freimg.T
        plt.subplot(111)
        plt.imshow(freimg)
        plt.colorbar(cax=None, ax=None, shrink=0.5)
        plt.show()

        # Further usage examples:
        # get_wav_list(r"D:\Code\pycharm\learning\20180903\train.wav.lst")
        # get_wav_symbol(r"D:\Code\pycharm\learning\20180903\train.syllable.txt")
        # GetSymbolList("D:/Code/pycharm/learning/20180910/")
    

    三.程序输出:

    波形图
    波形图(归一化后)
    频谱图

  • 相关阅读:
    按照鬼哥学so变化,四,第一章的例子
    浏览器发送总共下载文件2第二个请求,如何“下载”仅仅记录1次要?
    hdu 3371 Connect the Cities
    自己写RTPserver——大约RTP协议
    cocos2d-x3.2中将XCode发展project转移到VS2010可能会发生错误
    apache kafkac系列lient发展-java
    大约++和--了解运营商
    socket计划——一个简单的例子
    PhotoShop基本工具 -- 移动工具
    ZA7783:MIPI转LVDS/MIPI转RGB888/RGB转LVDS
  • 原文地址:https://www.cnblogs.com/jianxiong0117/p/9617949.html
Copyright © 2020-2023  润新知