1、重命名,Python处理中文路径时会出现各种错误,所以需要先将所有文件的路径名全都改成非中文的(这里统一改成数字编号)。用的是MAC系统,WIN下的命令行批处理没法用,所以用C++来完成
// Created by Carl on 16.
// Copyright (c) 2016 Carl. All rights reserved.
//
#include <iostream>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <unistd.h>

using namespace std;

// Moves every regular file found directly inside sourceDir into targetDir,
// renaming the files to "0.wav", "1.wav", ... in the order readdir() yields
// them. For each file the source path, the destination path and "ok"/"error"
// are printed. The counter advances even when a rename fails, so a failed
// rename leaves a gap in the numbering instead of reusing the index.
//
// Exits the process with status 1 if sourceDir cannot be opened.
// targetDir is expected to exist already; rename() fails otherwise.
void getFileList()
{
    const string sourceDir = "/Users/karl/Work/database/rawdata/children_CN/";
    const string targetDir = "/Users/karl/Work/database/rawdata/children/";

    DIR *dir = opendir(sourceDir.c_str());
    if (dir == NULL)
    {
        perror("Open dir error...");
        exit(1);
    }

    int i = 0;
    struct dirent *ptr;
    while ((ptr = readdir(dir)) != NULL)
    {
        // Skip the current-directory and parent-directory entries.
        if (strcmp(ptr->d_name, ".") == 0 || strcmp(ptr->d_name, "..") == 0)
            continue;

        // Only regular files are renamed (DT_REG replaces the magic number 8).
        if (ptr->d_type != DT_REG)
            continue;

        const string from = sourceDir + ptr->d_name;
        const string to = targetDir + to_string(i++) + ".wav";

        printf("%s %s ", from.c_str(), to.c_str());
        if (rename(from.c_str(), to.c_str()) < 0)
            cout << "error" << endl;
        else
            cout << "ok" << endl;
    }

    // Release the directory handle; the original leaked it.
    closedir(dir);
}

int main()
{
    getFileList();
    // 0 signals success to the shell; the original returned 1 (failure).
    return 0;
}
2、然后再使用FFMPEG那篇文章写的Python代码,将所有音频文件转成统一格式
#!/usr/bin/env python
# coding=utf-8
'''CREATED:2016-03-08
Use example of ffmpeg
'''
import argparse
import sys
import os
import string
import subprocess as sp

# Full path of ffmpeg
FFMPEG_BIN = "/Users/karl/Documents/python/audio/tool/ffmpeg"
# Full path of sourceDir
sourceDir = "/Users/karl/Work/database/rawdata/male/"
# Full path of targetDir
targetDir = "/Users/karl/Work/database/age/male/"
# Channel setting 1 for mono
ac = 1
# Sample frequency
sf = 16000
# Extension setting
ext = 'wav'


def convert(sourceDir, targetDir, ac, sf, ext):
    """Convert every ``.wav`` file in ``sourceDir`` with ffmpeg.

    Outputs are written to ``targetDir`` as ``0.<ext>``, ``1.<ext>``, ...
    Files are numbered in sorted file-name order so that repeated runs give
    the same numbering (``os.listdir`` alone returns an arbitrary order).

    Args:
        sourceDir: Directory scanned (non-recursively) for ``.wav`` files.
        targetDir: Output directory; created, including parents, if missing.
        ac: Channel count passed to ffmpeg ``-ac``.
        sf: Sample rate in Hz passed to ffmpeg ``-ar``.
        ext: Extension (without the dot) of the output files.

    Raises:
        OSError: If ``sourceDir`` cannot be listed or ``FFMPEG_BIN`` cannot
            be executed.
    """
    # makedirs also handles nested target paths, which os.mkdir does not.
    if not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    wav_files = sorted(f for f in os.listdir(sourceDir) if f.endswith('.wav'))
    for i, f in enumerate(wav_files):
        command = [FFMPEG_BIN,
                   '-i', os.path.join(sourceDir, f),
                   '-ac', str(ac),
                   '-ar', str(sf),
                   os.path.join(targetDir, str(i) + "." + ext)]
        # Parenthesised print works on both Python 2 and Python 3.
        print(command)
        # Wait for each conversion before starting the next one. The original
        # spawned all ffmpeg processes at once and never reaped them.
        proc = sp.Popen(command, stdout=sp.PIPE)
        proc.communicate()
        if proc.returncode != 0:
            # Best effort: report the failure and continue with the rest.
            print('ffmpeg failed (exit code %d) for %s' % (proc.returncode, f))


if __name__ == '__main__':
    convert(sourceDir, targetDir, ac, sf, ext)
3、用时域上RMS去除静音帧(Optional)
#---Cut the silent head and tail of audio
def rmsdemo(y):
    """Return the root-mean-square value of the samples in ``y``.

    The samples are converted to float64 first so that squaring integer PCM
    data (e.g. int16) cannot overflow.
    """
    samples = np.asarray(y, dtype=np.float64)
    return np.sqrt((samples ** 2).mean())


def cutheadntail(y, winlen, threshold):
    """Trim leading and trailing low-energy frames from ``y``.

    ``y`` is split into consecutive frames of ``winlen`` samples; a shorter
    final frame is kept as a frame of its own. A frame counts as non-silent
    when its RMS is strictly greater than ``threshold``.

    Args:
        y: 1-D NumPy array of samples.
        winlen: Frame length in samples; must be a positive integer.
        threshold: RMS level above which a frame is considered non-silent.

    Returns:
        The slice of ``y`` from the start of the first non-silent frame to
        the end of the last non-silent frame. If no frame exceeds the
        threshold, an empty slice of ``y`` is returned.

    Raises:
        ValueError: If ``winlen`` is not positive.
    """
    if winlen <= 0:
        raise ValueError("winlen must be a positive integer")

    totallen = y.shape[0]
    # Ceiling division so a trailing partial frame is also examined.
    num = -(-totallen // winlen)

    # Slice ends are exclusive, so [k * winlen, (k + 1) * winlen) covers the
    # whole frame; the original "- 1" dropped the last sample of each window.
    loud = [k for k in range(num)
            if rmsdemo(y[k * winlen:(k + 1) * winlen]) > threshold]
    if not loud:
        return y[:0]

    first, last = loud[0], loud[-1]
    return y[first * winlen:(last + 1) * winlen]
4、用librosa提取特征,包括MFCC、DMFCC
from __future__ import print_function
import argparse
import sys
import os
import pprint
import sklearn as sl
import numpy as np
import librosa
import librosa.feature.spectral as f
import svmutil


#---Feature extraction and store, including MFCC, DMFCC
def mfcclist(data_dir, count=300, mfcc_out="TrainFemaleMFCC",
             dmfcc_out="TrainFemaleDMFCC"):
    """Extract per-file mean MFCC / delta-MFCC vectors and save them as text.

    Reads ``0.wav`` .. ``<count - 1>.wav`` from ``data_dir`` and writes one
    feature vector per line to each output file.

    Args:
        data_dir: Directory containing the numbered ``.wav`` files.
        count: Number of files to process (default 300, as before).
        mfcc_out: Output path for the mean MFCC vectors.
        dmfcc_out: Output path for the mean delta-MFCC vectors.
    """
    m = []
    dm = []
    for i in range(count):
        filepath = os.path.join(data_dir, str(i) + '.wav')
        print(filepath)
        am, adm = mfccfile(filepath)
        m.append(am)
        dm.append(adm)
    # One row per line; newline=' ' merged every file's features into a
    # single line, which loadfeatures() then read back as one huge row.
    np.savetxt(mfcc_out, m, fmt='%s', newline='\n')
    np.savetxt(dmfcc_out, dm, fmt='%s', newline='\n')


def mfccfile(input_file):
    """Return ``(mean MFCC, mean delta-MFCC)`` for one audio file.

    Computes 13 MFCC coefficients per frame, takes the first-order
    difference between consecutive frames as the delta, and averages both
    over the frame axis.

    Args:
        input_file: Path of the audio file to load with ``librosa.load``.

    Returns:
        A tuple of two 1-D arrays, one value per coefficient.
    """
    print('Loading ', input_file)
    y, sr = librosa.load(input_file)
    # Keyword arguments: current librosa releases reject positional audio
    # arguments, while older releases accept the keyword form as well.
    M = f.mfcc(y=y, sr=sr, n_mfcc=13)
    # Frame-to-frame difference along the time axis.
    DM = np.diff(M, axis=1)
    am = np.mean(M, axis=1)
    adm = np.mean(DM, axis=1)
    return (am, adm)


#---Loading stored features file
def loadfeatures(features_file):
    """Load whitespace-separated feature rows from a text file.

    Blank lines are skipped. The parsed rows are printed and returned.

    Args:
        features_file: Path of a text file with one feature vector per line.

    Returns:
        A list of rows, each row a list of floats.
    """
    with open(features_file, 'r') as fin:
        features = [[float(v) for v in ln.split()]
                    for ln in fin if ln.strip()]
    print(features)
    return features
5、用libsvm训练和预测,包括归一化
#---SVM training and predicting process
def svmtraindemo(x, modelname, scalar):
    """Train a libsvm model on 1200 samples and evaluate it on the same data.

    The first 600 rows of ``x`` are labelled +1 and the next 600 rows -1.
    The model is trained with probability estimates (``-b 1``) and saved to
    ``modelname + "0"``.

    Args:
        x: Feature matrix accepted by ``scalar.transform``.
        modelname: Prefix of the model file to write.
        scalar: Fitted scaler object providing ``transform``.

    Returns:
        The ``(p_label, p_acc, p_val)`` tuple from ``svm_predict`` on the
        training data.
    """
    # The file only does ``import svmutil``; import the names actually used.
    from svmutil import svm_train, svm_save_model, svm_predict

    x = scalar.transform(x).tolist()
    print(x)
    y = [1.0] * 600 + [-1.0] * 600
    model = svm_train(y, x, '-b 1')
    svm_save_model(modelname + str(0), model)
    p_label, p_acc, p_val = svm_predict(y[:1200], x[:1200], model, '-b 1')
    return p_label, p_acc, p_val


def svmpredictdemo(x, modelname, scalar):
    """Load the saved model and predict labels for 400 test samples.

    The first 200 rows of ``x`` are expected to be +1 and the next 200 rows
    -1; these reference labels are what ``svm_predict`` scores against.

    Args:
        x: Feature matrix accepted by ``scalar.transform``.
        modelname: Prefix of the model file written by ``svmtraindemo``.
        scalar: Fitted scaler object providing ``transform``.

    Returns:
        The ``(p_label, p_acc, p_val)`` tuple from ``svm_predict``.
    """
    from svmutil import svm_load_model, svm_predict

    x = scalar.transform(x).tolist()
    print(len(x))
    y = [1.0] * 200 + [-1.0] * 200
    m = svm_load_model(modelname + str(0))
    # Predict first, then print: the original printed p_label before it was
    # assigned, which raised UnboundLocalError.
    p_label, p_acc, p_val = svm_predict(y[:400], x[:400], m, '-b 1')
    print(p_label)
    return p_label, p_acc, p_val
附:
1、经过试验,发现用无监督的方式,准确来说是基于规则的方式分辨男、女、小孩的声音还是不太靠谱,频域上的分布还是用有监督的方式自己学习应该更可靠。
2、用有噪音的数据训练、去预测无噪音的小孩语音,准确率约80%;反过来用无噪音的数据训练、去预测有噪音的,准确率才60%多。所以训练最好还是用噪音环境下的数据集,之前认为训练应该用无噪音样本的想法还是太天真了。其实把两者混合起来训练,效果还不错。
3、男女的准确率也就80%,样本分布还是比较好,而且均有噪音,估计在实际应用中效果也不会比80%差太远。