• [Audio processing] 数据集生成 & 性别年龄分类训练 Python


    1、重命名。Python处理中文路径时会出现各种错误,所以需要先把所有文件名都改成不含中文的名字(这里统一改成数字编号)。用的是Mac系统,Windows下的命令行批处理没法用,所以用C++来完成

    //  Created by Carl on 16.
    //  Copyright (c) 2016年 Carl. All rights reserved.
    //
    
    #include <iostream>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <dirent.h>
    #include <unistd.h>
    using namespace std;
    
    // Moves every regular file found directly inside sourceDir to
    // targetDir/<index>.wav, where <index> counts up from 0 in the order
    // readdir() returns the entries. Prints "<old path> <new path>" followed by
    // "ok" or "error" for each file. Exits the process with status 1 when the
    // source directory cannot be opened.
    void getFileList()
    {
        const string sourceDir = "/Users/karl/Work/database/rawdata/children_CN/";
        const string targetDir = "/Users/karl/Work/database/rawdata/children/";

        DIR *dir = opendir(sourceDir.c_str());
        if (dir == NULL)
        {
            perror("Open dir error...");
            exit(1);
        }

        int i = 0;
        struct dirent *ptr;
        while ((ptr = readdir(dir)) != NULL)
        {
            // Skip the current-directory and parent-directory entries.
            if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0)
                continue;
            // Only regular files are renamed (DT_REG replaces the magic number 8).
            if (ptr->d_type != DT_REG)
                continue;

            const string oldPath = sourceDir + ptr->d_name;
            const string newPath = targetDir + to_string(i) + ".wav";
            // The index advances on every attempt, successful or not.
            i++;

            printf("%s %s\n", oldPath.c_str(), newPath.c_str());
            if (rename(oldPath.c_str(), newPath.c_str()) < 0)
                cout << "error" << endl;
            else
                cout << "ok" << endl;
        }

        // Release the directory stream opened above.
        closedir(dir);
    }
    
    // Program entry point: runs the batch rename and reports success.
    int main() {
        getFileList();
        // 0 is the conventional success status; getFileList() itself exits with
        // status 1 when the source directory cannot be opened.
        return 0;
    }
    View Code

    2、然后再使用FFMPEG那篇文章写的Python代码,将所有音频文件转成统一格式

    #coding=utf-8
    #!/usr/bin/env python
    '''CREATED:2016-03-08
    Use example of ffmpeg
    '''
    import argparse
    import sys
    import os
    import string
    import subprocess as sp
    
    # Full path of the ffmpeg executable; used as the first element of the
    # command line built in convert().
    FFMPEG_BIN = "/Users/karl/Documents/python/audio/tool/ffmpeg"
    # Directory whose .wav files are read as conversion input.
    sourceDir = "/Users/karl/Work/database/rawdata/male/"
    # Directory the converted files are written to (created if missing).
    targetDir = "/Users/karl/Work/database/age/male/"
    # Value passed to ffmpeg's -ac option (channel count); 1 means mono.
    ac = 1
    # Value passed to ffmpeg's -ar option (sample frequency).
    sf = 16000
    # File extension given to every output file.
    ext = 'wav'
    
    def convert(sourceDir, targetDir, ac, sf, ext):
        """Convert every ``.wav`` file in ``sourceDir`` with ffmpeg.

        Each input is written to ``targetDir`` as ``<index>.<ext>``, where the
        index counts up from 0 over the sorted input file names.

        Args:
            sourceDir: Directory containing the input ``.wav`` files.
            targetDir: Output directory; created when it does not exist.
            ac: Value for ffmpeg's ``-ac`` option.
            sf: Value for ffmpeg's ``-ar`` option.
            ext: Extension (without the dot) of the output files.
        """
        if not os.path.exists(targetDir):
            os.makedirs(targetDir)
        # Sort so that the output numbering is reproducible between runs;
        # os.listdir() returns entries in arbitrary order.
        wav_files = sorted(f for f in os.listdir(sourceDir) if f.endswith('.wav'))
        for i, f in enumerate(wav_files):
            command = [FFMPEG_BIN,
                       '-i', os.path.join(sourceDir, f),
                       '-ac', str(ac),
                       '-ar', str(sf),
                       os.path.join(targetDir, str(i) + "." + ext)]
            print(command)
            # Wait for each conversion to finish. The previous Popen call was
            # never waited on and its stdout pipe was never read, so all ffmpeg
            # processes were launched at once and failures went unnoticed.
            returncode = sp.call(command)
            if returncode != 0:
                print('ffmpeg exited with status %d for %s' % (returncode, f))
    
    # Run the batch conversion with the module-level settings when this file is
    # executed as a script (not when it is imported).
    if __name__ == '__main__':
        convert(sourceDir, targetDir, ac, sf, ext)
    View Code

    3、用时域上RMS去除静音帧(Optional)

    #---Cut the silent head and tail of audio
    def rmsdemo(y):
        """Return the root-mean-square (RMS) value of the samples in ``y``.

        The samples are converted to float64 before squaring so that integer
        input (for example int16 PCM) cannot overflow. Empty input yields 0.0
        instead of NaN.

        Args:
            y: Array-like of numeric samples.

        Returns:
            The RMS of ``y`` as a float.
        """
        samples = np.asarray(y, dtype=np.float64)
        if samples.size == 0:
            return 0.0
        return np.sqrt(np.mean(samples ** 2))
    
    def cutheadntail(y, winlen, threshold):
        """Trim low-energy windows from the head and tail of a signal.

        ``y`` is split into consecutive windows of ``winlen`` samples (the last
        window may be shorter). The result spans from the first window whose
        RMS exceeds ``threshold`` to the last such window, inclusive.

        Args:
            y: 1-D NumPy array of samples (indexed along its first axis).
            winlen: Window length in samples; must be a positive integer.
            threshold: RMS level a window must exceed to be kept as an edge.

        Returns:
            The trimmed slice of ``y``. When no window exceeds ``threshold``
            an empty slice is returned.

        Raises:
            ValueError: If ``winlen`` is not positive.
        """
        if winlen <= 0:
            raise ValueError("winlen must be a positive integer")
        totallen = y.shape[0]
        # Ceiling division so a trailing partial window is examined as well.
        num = (totallen + winlen - 1) // winlen
        # Slice ends are exclusive, so (k + 1) * winlen covers the whole window.
        loud = [k for k in range(num)
                if rmsdemo(y[k * winlen:(k + 1) * winlen]) > threshold]
        if not loud:
            return y[:0]
        return y[loud[0] * winlen:(loud[-1] + 1) * winlen]
    View Code

    4、用librosa提取特征,包括MFCC、DMFCC

    from __future__ import print_function
    import argparse
    import sys
    import os
    import pprint
    import sklearn as sl
    import numpy as np
    import librosa
    import librosa.feature.spectral as f
    import svmutil
    
    #---Feature extraction and store, including MFCC, DMFCC
    def mfcclist(data_dir, count=300, mfcc_out="TrainFemaleMFCC",
                 dmfcc_out="TrainFemaleDMFCC"):
        """Extract features from numbered .wav files and save them as text.

        Files ``0.wav`` .. ``<count - 1>.wav`` inside ``data_dir`` are passed to
        ``mfccfile``; the two feature vectors it returns per file are collected
        and written with ``np.savetxt``, one file per line.

        Args:
            data_dir: Directory containing the numbered ``.wav`` files.
            count: Number of files to process, starting at ``0.wav``.
            mfcc_out: Output path for the first feature vector of each file.
            dmfcc_out: Output path for the second feature vector of each file.
        """
        m = []
        dm = []
        for i in range(count):
            filepath = os.path.join(data_dir, str(i) + '.wav')
            print(filepath)
            am, adm = mfccfile(filepath)
            m.append(am)
            dm.append(adm)
        np.savetxt(mfcc_out, m, fmt='%s', newline='\n')
        np.savetxt(dmfcc_out, dm, fmt='%s', newline='\n')
    
    def mfccfile(input_file):
        """Return mean MFCC and mean delta-MFCC vectors for one audio file.

        The file is loaded with ``librosa.load`` and 13 MFCCs are computed. The
        delta is the difference between adjacent columns of the MFCC matrix;
        both matrices are averaged along axis 1.

        Args:
            input_file: Path of the audio file to analyse.

        Returns:
            Tuple ``(am, adm)``: the axis-1 mean of the MFCC matrix and the
            axis-1 mean of its adjacent-column differences.
        """
        print('Loading ', input_file)
        # NOTE(review): no sr argument is given, so librosa's default sample
        # rate applies rather than the file's own rate; confirm this is wanted.
        y, sr = librosa.load(input_file)
        # Keyword arguments keep the call valid on librosa versions where the
        # parameters of mfcc() are keyword-only.
        M = f.mfcc(y=y, sr=sr, n_mfcc=13)
        # Equivalent to M[:, 1:] - M[:, :-1].
        DM = np.diff(M, axis=1)
        am = np.mean(M, axis=1)
        adm = np.mean(DM, axis=1)
        return (am, adm)
    
    #---Loading stored features file
    def loadfeatures(features_file):
        """Load a whitespace-separated feature file into a list of float rows.

        Blank lines are skipped. The parsed rows are printed and returned.

        Args:
            features_file: Path of the text file to read.

        Returns:
            A list with one list of floats per non-blank line.

        Raises:
            ValueError: If a token cannot be converted to float.
        """
        # The context manager closes the file even when parsing fails.
        with open(features_file, 'r') as fin:
            # split() without an argument tolerates repeated spaces and tabs;
            # building real lists (not map objects) keeps Python 3 usable.
            features = [[float(tok) for tok in ln.split()]
                        for ln in fin if ln.strip()]
        print(features)
        return features
    View Code

    5、用libsvm训练和预测,包括归一化

    #---SVM training and predicting process
    def svmtraindemo(x, modelname, scalar, labels=None):
        """Scale the features, train a libsvm model, save it and self-test it.

        Args:
            x: Feature matrix accepted by ``scalar.transform``.
            modelname: Prefix of the model file; the model is saved to
                ``modelname + "0"``.
            scalar: Fitted scaler object providing ``transform``.
            labels: Optional list of class labels, one per row of ``x``.
                Defaults to 600 rows labelled 1.0 followed by 600 rows
                labelled -1.0.

        Returns:
            The trained libsvm model.
        """
        x = scalar.transform(x).tolist()
        print(x)
        if labels is None:
            labels = [1.0] * 600 + [-1.0] * 600
        # The file imports the module as ``import svmutil``, so its functions
        # must be referenced through the module name.
        model = svmutil.svm_train(labels, x, '-b 1')
        svmutil.svm_save_model(modelname + str(0), model)
        # Predict on the training data itself to report the training accuracy.
        p_label, p_acc, p_val = svmutil.svm_predict(labels, x, model, '-b 1')
        return model
    
    def svmpredictdemo(x, modelname, scalar, labels=None):
        """Scale the features and evaluate them with a saved libsvm model.

        Args:
            x: Feature matrix accepted by ``scalar.transform``.
            modelname: Prefix of the model file; the model is loaded from
                ``modelname + "0"``.
            scalar: Fitted scaler object providing ``transform``.
            labels: Optional list of true labels. Defaults to 200 rows
                labelled 1.0 followed by 200 rows labelled -1.0; in that case
                only the first 400 rows of ``x`` are evaluated.

        Returns:
            Tuple ``(p_label, p_acc, p_val)`` as returned by ``svm_predict``.
        """
        x = scalar.transform(x).tolist()
        print(len(x))
        if labels is None:
            labels = [1.0] * 200 + [-1.0] * 200
            x = x[:400]
        # The file imports the module as ``import svmutil``, so its functions
        # must be referenced through the module name.
        m = svmutil.svm_load_model(modelname + str(0))
        p_label, p_acc, p_val = svmutil.svm_predict(labels, x, m, '-b 1')
        # Print only after the prediction has assigned p_label; printing first
        # raised UnboundLocalError.
        print(p_label)
        return p_label, p_acc, p_val
    View Code

     附:

    1、经过试验,发现用无监督的方式,准确来说是基于规则的方式分辨男、女、小孩的声音还是不太靠谱,频域上的分布还是用有监督的方式自己学习应该更可靠。

    2、用有噪音的小孩数据训练、去预测无噪音的小孩数据,准确率约80%;反过来用无噪音的数据训练、去预测有噪音的数据,准确率只有60%多。所以训练最好还是用噪音环境下的数据集,之前以为训练应该用无噪音的样本,还是太天真了。其实把两者混合起来训练,效果还不错。

    3、男女的准确率也就80%,样本分布还是比较好,而且均有噪音,估计在实际应用中效果也不会比80%差太远。

  • 相关阅读:
    Docker安装 Redis Stack(开发适配提供 Redis Stack 服务器和RedisInsight可视化) (6.2.2v3版本)
    Python从入门到入土第6课——列表
    深度学习基础基于Numpy的多层前馈神经网络(FFN)的构建和反向传播训练
    【JMeter】启动时报错Uncaught Exception java.lang.IllegalAccessError
    【JMeter】JMeter连接Mysql8.x数据库的坑
    【Selenium】报错 'ascii' codec can't decode byte 0xe7 in position 0: ordinal not in range(128)
    【MYSQL】彻底卸载无安装版mysql
    【MYSQL转】Mysql8.0修改数据库密码
    【MYSQL】MYSQL常用命令
    【环境】如何搭建PHP开发环境(PHP+Apache+MySQL)
  • 原文地址:https://www.cnblogs.com/littletail/p/5255290.html
Copyright © 2020-2023  润新知