DSP

matlab自带的VAD检测程序

2019-07-13 16:36发布

在matlab的DSP工具箱中,自带了VAD程序,本文主要记录如何使用这个程序。 clear all close all clc %% 设置音频源,读入一个speaker.wav文件,并对读入的文件进行分帧,每帧80个采样点 audioSource = dsp.AudioFileReader('SamplesPerFrame',80,... 'Filename','D:matlabworkVADspeaker.wav',... 'OutputDataType', 'single'); %% 设置一个示波器,示波器的时间范围30S,正好等于读取的上面读取的文件的长度,有两个信号输入 scope = dsp.TimeScope(2,... 'SampleRate', [8000/80 8000], ... 'BufferLength', 240000, ... 'YLimits', [-0.3 1.1], ... 'TimeSpan',30,... 'ShowGrid', true, ... 'Title','Decision speech and speech data', ... 'TimeSpanOverrunAction','Scroll'); %% 对VAD进行配置 VAD_cst_param = vadInitCstParams; clear vadG729 % 每帧数据长度10ms,计算3000帧 = 30S numTSteps = 3000; while(numTSteps) % 从audioSource中读取一帧 = 10ms 的数据 speech = audioSource(); % 对数据进行VAD判断 decision = vadG729(speech, VAD_cst_param); % 将读取的语音信号波形以及VAD判断结果显示在示波器上 scope(decision, speech); numTSteps = numTSteps - 1; end release(scope); 以下为运行结果 对于程序中的vadInitCstParams的设置函数,实现代码如下: function VAD_cst_param = vadInitCstParams() %% Algorithm Constants Initialization VAD_cst_param.Fs = single(8000); % Sampling Frequency VAD_cst_param.M = single(10); % Order of LP filter VAD_cst_param.NP = single(12); % Increased LPC order VAD_cst_param.No = single(128); % Number of frames for long-term minimum energy calculation VAD_cst_param.Ni = single(32); % Number of frames for initialization of running averages VAD_cst_param.INIT_COUNT = single(20); % High Pass Filter that is used to preprocess the signal applied to the VAD VAD_cst_param.HPF_sos = single([0.92727435, -1.8544941, 0.92727435, 1, -1.9059465, 0.91140240]); VAD_cst_param.L_WINDOW = single(240); % Window size in LP analysis VAD_cst_param.L_NEXT = single(40); % Lookahead in LP analysis VAD_cst_param.L_FRAME = single(80); % Frame size L1 = VAD_cst_param.L_NEXT; L2 = VAD_cst_param.L_WINDOW - VAD_cst_param.L_NEXT; VAD_cst_param.hamwindow = single([0.54 - 0.46*cos(2*pi*(0:L2-1)'/(2*L2-1)); cos(2*pi*(0:L1-1)'/(4*L1-1))]); % LP analysis, lag window applied to autocorrelation coefficients VAD_cst_param.lagwindow = single([1.0001; exp(-1/2 * ((2 * pi * 60 / VAD_cst_param.Fs) * (1:VAD_cst_param.NP)').^2)]); % Correlation for a lowpass filter (3 dB point on the power spectrum is % at about 2 kHz) VAD_cst_param.lbf_corr = single([0.24017939691329, 0.21398822343783, 0.14767692339633, ... 0.07018811903116, 0.00980856433051,-0.02015934721195, ... -0.02388269958005,-0.01480076155002,-0.00503292155509, ... 0.00012141366508, 0.00119354245231, 0.00065908718613, ... 0.00015015782285]'); vadInitCstParams函数的执行结果如下: Fs: 8000 M: 10 NP: 12 No: 128 Ni: 32 INIT_COUNT: 20 HPF_sos: [0.9273 -1.8545 0.9273 1 -1.9059 0.9114] L_WINDOW: 240 L_NEXT: 40 L_FRAME: 80 hamwindow: [240×1 single] lagwindow: [13×1 single] lbf_corr: [13×1 single] VAD 函数实现代码如下: function vad_flag = vadG729(speech, VAD_cst_param) % VADG729 Implement the Voice Activity Detection Algorithm. % Note that although G.729 VAD operates on pre-processed speech data, this % function is a standalone version, i.e. the pre-processing (highpass % filtering and linear predictive analysis) is also included. % % This function is in support of the 'G.729 Voice Activity Detection' % example and may change in a future release % Copyright 2015-2016 The MathWorks, Inc. %% Algorithm Components Initialization persistent biquad autocorr levSolver1 levSolver2 lpc2lsf zerodet VAD_var_param if isempty(biquad) % Create a IIR digital filter used for pre-processing biquad = dsp.BiquadFilter('SOSMatrix', VAD_cst_param.HPF_sos); % Create an autocorrelator and set its properties to compute the lags % in the range [0:NP]. autocorr = dsp.Autocorrelator('MaximumLagSource', 'Property', ... 'MaximumLag', VAD_cst_param.NP); % Create a Levinson solver which compute the reflection coefficients % from auto-correlation function using the Levinson-Durbin recursion. % The first object is configured to output polynomial coefficients and % the second object is configured to output reflection coefficients. levSolver1 = dsp.LevinsonSolver('AOutputPort', true, ... 'KOutputPort', false); levSolver2 = dsp.LevinsonSolver('AOutputPort', false, ... 'KOutputPort', true); % Create a converter from linear prediction coefficients (LPC) to line % spectral frequencies (LSF) lpc2lsf = dsp.LPCToLSF; % Create a zero crossing detector zerodet = dsp.ZeroCrossingDetector; % initialize variable parameters VAD_var_param.window_buffer = single(zeros(VAD_cst_param.L_WINDOW - VAD_cst_param.L_FRAME, 1)); VAD_var_param.frm_count = single(0); VAD_var_param.MeanLSF = single(zeros(VAD_cst_param.M, 1)); VAD_var_param.MeanSE = single(0); VAD_var_param.MeanSLE = single(0); VAD_var_param.MeanE = single(0); VAD_var_param.MeanSZC = single(0); VAD_var_param.count_sil = single(0); VAD_var_param.count_update = single(0); VAD_var_param.count_ext = single(0); VAD_var_param.less_count = single(0); VAD_var_param.flag = single(1); VAD_var_param.prev_markers = single([1, 1]); VAD_var_param.prev_energy = single(0); VAD_var_param.Prev_Min = single(Inf); VAD_var_param.Next_Min = single(Inf); VAD_var_param.Min_buffer = single(Inf * ones(1, VAD_cst_param.No)); end %% Constants Initialization NOISE = single(0); VOICE = single(1); v_flag = single(0);%#ok VAD_var_param.frm_count = single(VAD_var_param.frm_count + 1); %% Pre-processing % Filter the speech frame: this high-pass filter serves as a precaution % against undesired low-frequency components. speech_hp = biquad(32768*speech); % Store filtered data to the pre-processed speech buffer speech_buf = [VAD_var_param.window_buffer; speech_hp]; % LPC analysis % Windowing of signal speech_win = VAD_cst_param.hamwindow .* speech_buf; % Autocorrelation r = autocorr(speech_win) .* VAD_cst_param.lagwindow; % LSF a = levSolver1(r(1:VAD_cst_param.M+1)); LSF = lpc2lsf(a) / (2 * pi); % Reflection coefficients rc = levSolver2(r(1:3)); %% VAD starts here %% Parameters extraction % Full-band energy Ef = 10 * log10(r(1) / VAD_cst_param.L_WINDOW); % Low-band energy El = r(1) * VAD_cst_param.lbf_corr(1) + 2 * sum(r(2:end) .* VAD_cst_param.lbf_corr(2:end)); El = 10 * log10(El / VAD_cst_param.L_WINDOW); % Spectral Distortion SD = sum((LSF-VAD_var_param.MeanLSF).^2); % Zero-crossing rate idx = VAD_cst_param.L_WINDOW - VAD_cst_param.L_NEXT - VAD_cst_param.L_FRAME + 1; ZC = double(zerodet(speech_buf(idx:idx+VAD_cst_param.L_FRAME)))/VAD_cst_param.L_FRAME; % Long-term minimum energy VAD_var_param.Next_Min = min(Ef, VAD_var_param.Next_Min); Min = min(VAD_var_param.Prev_Min, VAD_var_param.Next_Min); if (mod(VAD_var_param.frm_count, single(8)) == 0) VAD_var_param.Min_buffer = [VAD_var_param.Min_buffer(2:end), VAD_var_param.Next_Min]; VAD_var_param.Prev_Min = min(VAD_var_param.Min_buffer); VAD_var_param.Next_Min = single(Inf); end if (VAD_var_param.frm_count < VAD_cst_param.Ni) %% Initialization of running averages if frame number is less than 32 if (Ef < 21) VAD_var_param.less_count = VAD_var_param.less_count + 1; marker = NOISE; else % include only the frames that have an energy Ef greater than 21 marker = VOICE; NE = (VAD_var_param.frm_count - 1) - VAD_var_param.less_count; VAD_var_param.MeanE = (VAD_var_param.MeanE * NE + Ef) / (NE+1); VAD_var_param.MeanSZC = (VAD_var_param.MeanSZC * NE + ZC) / (NE+1); VAD_var_param.MeanLSF = (VAD_var_param.MeanLSF * NE + LSF) / (NE+1); end else %% Start calculating the characteristic energies of background noise if (VAD_var_param.frm_count == VAD_cst_param.Ni) VAD_var_param.MeanSE = VAD_var_param.MeanE - 10; VAD_var_param.MeanSLE = VAD_var_param.MeanE - 12; end % Difference measures between current frame parameters and running % averages of background noise characteristics dSE = VAD_var_param.MeanSE - Ef; dSLE = VAD_var_param.MeanSLE - El; dSZC = VAD_var_param.MeanSZC - ZC; %% Initial VAD decision if (Ef < 21) marker = NOISE; else marker = vad_decision(dSLE, dSE, SD, dSZC); end v_flag = single(0); %% Voice activity decision smoothing % from energy considerations and neighboring past frame decisions % Step 1 if ((VAD_var_param.prev_markers(1) == VOICE) && (marker == NOISE) ... && (Ef > VAD_var_param.MeanSE + 2) && (Ef > 21)) marker = VOICE; v_flag = single(1); end % Step 2 if (VAD_var_param.flag == 1) if ((VAD_var_param.prev_markers(2) == VOICE) ... && (VAD_var_param.prev_markers(1) == VOICE) ... && (marker == NOISE) ... && (abs(Ef - VAD_var_param.prev_energy) <= 3)) VAD_var_param.count_ext = VAD_var_param.count_ext + 1; marker = VOICE; v_flag = single(1); if (VAD_var_param.count_ext <= 4) VAD_var_param.flag = single(1); else VAD_var_param.count_ext = single(0); VAD_var_param.flag = single(0); end end else VAD_var_param.flag = single(1); end if (marker == NOISE) VAD_var_param.count_sil = VAD_var_param.count_sil + 1; end % Step 3 if ((marker == VOICE) && (VAD_var_param.count_sil > 10) ... && (Ef - VAD_var_param.prev_energy <= 3)) marker = NOISE; VAD_var_param.count_sil = single(0); end if (marker == VOICE) VAD_var_param.count_sil = single(0); end % Step 4 if ((Ef < VAD_var_param.MeanSE + 3) && (VAD_var_param.frm_count > VAD_cst_param.No) ... && (v_flag == 0) && (rc(2) < 0.6)) marker = NOISE; end %% Update running averages only in the presence of background noise if ((Ef < VAD_var_param.MeanSE + 3) && (rc(2) < 0.75) && (SD < 0.002532959)) VAD_var_param.count_update = VAD_var_param.count_update + 1; % Modify update speed coefficients if (VAD_var_param.count_update < VAD_cst_param.INIT_COUNT) COEF = single(0.75); COEFZC = single(0.8); COEFSD = single(0.6); elseif (VAD_var_param.count_update < VAD_cst_param.INIT_COUNT + 10) COEF = single(0.95); COEFZC = single(0.92); COEFSD = single(0.65); elseif (VAD_var_param.count_update < VAD_cst_param.INIT_COUNT + 20) COEF = single(0.97); COEFZC = single(0.94); COEFSD = single(0.70); elseif (VAD_var_param.count_update < VAD_cst_param.INIT_COUNT + 30) COEF = single(0.99); COEFZC = single(0.96); COEFSD = single(0.75); elseif (VAD_var_param.count_update < VAD_cst_param.INIT_COUNT + 40) COEF = single(0.995); COEFZC = single(0.99); COEFSD = single(0.75); else COEF = single(0.995); COEFZC = single(0.998); COEFSD = single(0.75); end % Update mean of parameters LSF, SE, SLE, SZC VAD_var_param.MeanSE = COEF * VAD_var_param.MeanSE + (1-COEF) * Ef; VAD_var_param.MeanSLE = COEF * VAD_var_param.MeanSLE + (1-COEF) * El; VAD_var_param.MeanSZC = COEFZC * VAD_var_param.MeanSZC + (1-COEFZC) * ZC; VAD_var_param.MeanLSF = COEFSD * VAD_var_param.MeanLSF + (1-COEFSD) * LSF; end if ((VAD_var_param.frm_count > VAD_cst_param.No) && ((VAD_var_param.MeanSE < Min) && (SD < 0.002532959)) || (VAD_var_param.MeanSE > Min + 10 )) VAD_var_param.MeanSE = Min; VAD_var_param.count_update = single(0); end end %% Update parameters for next frame VAD_var_param.prev_energy = Ef; VAD_var_param.prev_markers = [marker, VAD_var_param.prev_markers(1)]; idx = VAD_cst_param.L_FRAME + 1; VAD_var_param.window_buffer = speech_buf(idx:end); %% Return final decision vad_flag = marker; return function dec = vad_decision(dSLE, dSE, SD, dSZC) % Active voice decision using multi-boundary decision regions in the space % of the 4 difference measures a = single([0.00175, -0.004545455, -25, 20, 0, ... 8800, 0, 25, -29.09091, 0, ... 14000, 0.928571, -1.5, 0.714285]); b = single([0.00085, 0.001159091, -5, -6, -4.7, ... -12.2, 0.0009, -7.0, -4.8182, -5.3, ... -15.5, 1.14285, -9, -2.1428571]); dec = single(0); if SD > a(1)*dSZC+b(1) dec = single(1); return; end if SD > a(2)*dSZC+b(2) dec = single(1); return; end if dSE < a(3)*dSZC+b(3) dec = single(1); return; end if dSE < a(4)*dSZC+b(4) dec = single(1); return; end if dSE < b(5) dec = single(1); return; end if dSE < a(6)*SD+b(6) dec = single(1); return; end if SD > b(7) dec = single(1); return; end if dSLE < a(8)*dSZC+b(8) dec = single(1); return; end if dSLE < a(9)*dSZC+b(9) dec = single(1); return; end if dSLE < b(10) dec = single(1); return; end if dSLE < a(11)*SD+b(11) dec = single(1); return; end if dSLE > a(12)*dSE+b(12) dec = single(1); return end if dSLE < a(13)*dSE+b(13) dec = single(1); return; end if dSLE < a(14)*dSE+b(14) dec = single(1); return; end return