在matlab的DSP工具箱中,自带了VAD程序,本文主要记录如何使用这个程序。
clear all
close all
clc
%% 设置音频源,读入一个speaker.wav文件,并对读入的文件进行分帧,每帧80个采样点
audioSource = dsp.AudioFileReader('SamplesPerFrame',80,...
'Filename','D:matlabworkVADspeaker.wav',...
'OutputDataType', 'single');
%% 设置一个示波器,示波器的时间范围30S,正好等于读取的上面读取的文件的长度,有两个信号输入
scope = dsp.TimeScope(2,...
'SampleRate', [8000/80 8000], ...
'BufferLength', 240000, ...
'YLimits', [-0.3 1.1], ...
'TimeSpan',30,...
'ShowGrid', true, ...
'Title','Decision speech and speech data', ...
'TimeSpanOverrunAction','Scroll');
%% 对VAD进行配置
VAD_cst_param = vadInitCstParams;
clear vadG729
% 每帧数据长度10ms,计算3000帧 = 30S
numTSteps = 3000;
while(numTSteps)
% 从audioSource中读取一帧 = 10ms 的数据
speech = audioSource();
% 对数据进行VAD判断
decision = vadG729(speech, VAD_cst_param);
% 将读取的语音信号波形以及VAD判断结果显示在示波器上
scope(decision, speech);
numTSteps = numTSteps - 1;
end
release(scope);
以下为运行结果
对于程序中的vadInitCstParams的设置函数,实现代码如下:
function VAD_cst_param = vadInitCstParams()
%% Algorithm Constants Initialization
VAD_cst_param.Fs = single(8000); % Sampling Frequency
VAD_cst_param.M = single(10); % Order of LP filter
VAD_cst_param.NP = single(12); % Increased LPC order
VAD_cst_param.No = single(128); % Number of frames for long-term minimum energy calculation
VAD_cst_param.Ni = single(32); % Number of frames for initialization of running averages
VAD_cst_param.INIT_COUNT = single(20);
% High Pass Filter that is used to preprocess the signal applied to the VAD
VAD_cst_param.HPF_sos = single([0.92727435, -1.8544941, 0.92727435, 1, -1.9059465, 0.91140240]);
VAD_cst_param.L_WINDOW = single(240); % Window size in LP analysis
VAD_cst_param.L_NEXT = single(40); % Lookahead in LP analysis
VAD_cst_param.L_FRAME = single(80); % Frame size
L1 = VAD_cst_param.L_NEXT;
L2 = VAD_cst_param.L_WINDOW - VAD_cst_param.L_NEXT;
VAD_cst_param.hamwindow = single([0.54 - 0.46*cos(2*pi*(0:L2-1)'/(2*L2-1));
cos(2*pi*(0:L1-1)'/(4*L1-1))]);
% LP analysis, lag window applied to autocorrelation coefficients
VAD_cst_param.lagwindow = single([1.0001; exp(-1/2 * ((2 * pi * 60 / VAD_cst_param.Fs) * (1:VAD_cst_param.NP)').^2)]);
% Correlation for a lowpass filter (3 dB point on the power spectrum is
% at about 2 kHz)
VAD_cst_param.lbf_corr = single([0.24017939691329, 0.21398822343783, 0.14767692339633, ...
0.07018811903116, 0.00980856433051,-0.02015934721195, ...
-0.02388269958005,-0.01480076155002,-0.00503292155509, ...
0.00012141366508, 0.00119354245231, 0.00065908718613, ...
0.00015015782285]');
vadInitCstParams函数的执行结果如下:
Fs: 8000
M: 10
NP: 12
No: 128
Ni: 32
INIT_COUNT: 20
HPF_sos: [0.9273 -1.8545 0.9273 1 -1.9059 0.9114]
L_WINDOW: 240
L_NEXT: 40
L_FRAME: 80
hamwindow: [240×1 single]
lagwindow: [13×1 single]
lbf_corr: [13×1 single]
VAD 函数实现代码如下:
function vad_flag = vadG729(speech, VAD_cst_param)
% VADG729 Implement the Voice Activity Detection Algorithm.
% Note that although G.729 VAD operates on pre-processed speech data, this
% function is a standalone version, i.e. the pre-processing (highpass
% filtering and linear predictive analysis) is also included.
%
% This function is in support of the 'G.729 Voice Activity Detection'
% example and may change in a future release
% Copyright 2015-2016 The MathWorks, Inc.
%% Algorithm Components Initialization
persistent biquad autocorr levSolver1 levSolver2 lpc2lsf zerodet VAD_var_param
if isempty(biquad)
% Create a IIR digital filter used for pre-processing
biquad = dsp.BiquadFilter('SOSMatrix', VAD_cst_param.HPF_sos);
% Create an autocorrelator and set its properties to compute the lags
% in the range [0:NP].
autocorr = dsp.Autocorrelator('MaximumLagSource', 'Property', ...
'MaximumLag', VAD_cst_param.NP);
% Create a Levinson solver which compute the reflection coefficients
% from auto-correlation function using the Levinson-Durbin recursion.
% The first object is configured to output polynomial coefficients and
% the second object is configured to output reflection coefficients.
levSolver1 = dsp.LevinsonSolver('AOutputPort', true, ...
'KOutputPort', false);
levSolver2 = dsp.LevinsonSolver('AOutputPort', false, ...
'KOutputPort', true);
% Create a converter from linear prediction coefficients (LPC) to line
% spectral frequencies (LSF)
lpc2lsf = dsp.LPCToLSF;
% Create a zero crossing detector
zerodet = dsp.ZeroCrossingDetector;
% initialize variable parameters
VAD_var_param.window_buffer = single(zeros(VAD_cst_param.L_WINDOW - VAD_cst_param.L_FRAME, 1));
VAD_var_param.frm_count = single(0);
VAD_var_param.MeanLSF = single(zeros(VAD_cst_param.M, 1));
VAD_var_param.MeanSE = single(0);
VAD_var_param.MeanSLE = single(0);
VAD_var_param.MeanE = single(0);
VAD_var_param.MeanSZC = single(0);
VAD_var_param.count_sil = single(0);
VAD_var_param.count_update = single(0);
VAD_var_param.count_ext = single(0);
VAD_var_param.less_count = single(0);
VAD_var_param.flag = single(1);
VAD_var_param.prev_markers = single([1, 1]);
VAD_var_param.prev_energy = single(0);
VAD_var_param.Prev_Min = single(Inf);
VAD_var_param.Next_Min = single(Inf);
VAD_var_param.Min_buffer = single(Inf * ones(1, VAD_cst_param.No));
end
%% Constants Initialization
NOISE = single(0);
VOICE = single(1);
v_flag = single(0);%#ok
VAD_var_param.frm_count = single(VAD_var_param.frm_count + 1);
%% Pre-processing
% Filter the speech frame: this high-pass filter serves as a precaution
% against undesired low-frequency components.
speech_hp = biquad(32768*speech);
% Store filtered data to the pre-processed speech buffer
speech_buf = [VAD_var_param.window_buffer; speech_hp];
% LPC analysis
% Windowing of signal
speech_win = VAD_cst_param.hamwindow .* speech_buf;
% Autocorrelation
r = autocorr(speech_win) .* VAD_cst_param.lagwindow;
% LSF
a = levSolver1(r(1:VAD_cst_param.M+1));
LSF = lpc2lsf(a) / (2 * pi);
% Reflection coefficients
rc = levSolver2(r(1:3));
%% VAD starts here
%% Parameters extraction
% Full-band energy
Ef = 10 * log10(r(1) / VAD_cst_param.L_WINDOW);
% Low-band energy
El = r(1) * VAD_cst_param.lbf_corr(1) + 2 * sum(r(2:end) .* VAD_cst_param.lbf_corr(2:end));
El = 10 * log10(El / VAD_cst_param.L_WINDOW);
% Spectral Distortion
SD = sum((LSF-VAD_var_param.MeanLSF).^2);
% Zero-crossing rate
idx = VAD_cst_param.L_WINDOW - VAD_cst_param.L_NEXT - VAD_cst_param.L_FRAME + 1;
ZC = double(zerodet(speech_buf(idx:idx+VAD_cst_param.L_FRAME)))/VAD_cst_param.L_FRAME;
% Long-term minimum energy
VAD_var_param.Next_Min = min(Ef, VAD_var_param.Next_Min);
Min = min(VAD_var_param.Prev_Min, VAD_var_param.Next_Min);
if (mod(VAD_var_param.frm_count, single(8)) == 0)
VAD_var_param.Min_buffer = [VAD_var_param.Min_buffer(2:end), VAD_var_param.Next_Min];
VAD_var_param.Prev_Min = min(VAD_var_param.Min_buffer);
VAD_var_param.Next_Min = single(Inf);
end
if (VAD_var_param.frm_count < VAD_cst_param.Ni)
%% Initialization of running averages if frame number is less than 32
if (Ef < 21)
VAD_var_param.less_count = VAD_var_param.less_count + 1;
marker = NOISE;
else
% include only the frames that have an energy Ef greater than 21
marker = VOICE;
NE = (VAD_var_param.frm_count - 1) - VAD_var_param.less_count;
VAD_var_param.MeanE = (VAD_var_param.MeanE * NE + Ef) / (NE+1);
VAD_var_param.MeanSZC = (VAD_var_param.MeanSZC * NE + ZC) / (NE+1);
VAD_var_param.MeanLSF = (VAD_var_param.MeanLSF * NE + LSF) / (NE+1);
end
else
%% Start calculating the characteristic energies of background noise
if (VAD_var_param.frm_count == VAD_cst_param.Ni)
VAD_var_param.MeanSE = VAD_var_param.MeanE - 10;
VAD_var_param.MeanSLE = VAD_var_param.MeanE - 12;
end
% Difference measures between current frame parameters and running
% averages of background noise characteristics
dSE = VAD_var_param.MeanSE - Ef;
dSLE = VAD_var_param.MeanSLE - El;
dSZC = VAD_var_param.MeanSZC - ZC;
%% Initial VAD decision
if (Ef < 21)
marker = NOISE;
else
marker = vad_decision(dSLE, dSE, SD, dSZC);
end
v_flag = single(0);
%% Voice activity decision smoothing
% from energy considerations and neighboring past frame decisions
% Step 1
if ((VAD_var_param.prev_markers(1) == VOICE) && (marker == NOISE) ...
&& (Ef > VAD_var_param.MeanSE + 2) && (Ef > 21))
marker = VOICE;
v_flag = single(1);
end
% Step 2
if (VAD_var_param.flag == 1)
if ((VAD_var_param.prev_markers(2) == VOICE) ...
&& (VAD_var_param.prev_markers(1) == VOICE) ...
&& (marker == NOISE) ...
&& (abs(Ef - VAD_var_param.prev_energy) <= 3))
VAD_var_param.count_ext = VAD_var_param.count_ext + 1;
marker = VOICE;
v_flag = single(1);
if (VAD_var_param.count_ext <= 4)
VAD_var_param.flag = single(1);
else
VAD_var_param.count_ext = single(0);
VAD_var_param.flag = single(0);
end
end
else
VAD_var_param.flag = single(1);
end
if (marker == NOISE)
VAD_var_param.count_sil = VAD_var_param.count_sil + 1;
end
% Step 3
if ((marker == VOICE) && (VAD_var_param.count_sil > 10) ...
&& (Ef - VAD_var_param.prev_energy <= 3))
marker = NOISE;
VAD_var_param.count_sil = single(0);
end
if (marker == VOICE)
VAD_var_param.count_sil = single(0);
end
% Step 4
if ((Ef < VAD_var_param.MeanSE + 3) && (VAD_var_param.frm_count > VAD_cst_param.No) ...
&& (v_flag == 0) && (rc(2) < 0.6))
marker = NOISE;
end
%% Update running averages only in the presence of background noise
if ((Ef < VAD_var_param.MeanSE + 3) && (rc(2) < 0.75) && (SD < 0.002532959))
VAD_var_param.count_update = VAD_var_param.count_update + 1;
% Modify update speed coefficients
if (VAD_var_param.count_update < VAD_cst_param.INIT_COUNT)
COEF = single(0.75);
COEFZC = single(0.8);
COEFSD = single(0.6);
elseif (VAD_var_param.count_update < VAD_cst_param.INIT_COUNT + 10)
COEF = single(0.95);
COEFZC = single(0.92);
COEFSD = single(0.65);
elseif (VAD_var_param.count_update < VAD_cst_param.INIT_COUNT + 20)
COEF = single(0.97);
COEFZC = single(0.94);
COEFSD = single(0.70);
elseif (VAD_var_param.count_update < VAD_cst_param.INIT_COUNT + 30)
COEF = single(0.99);
COEFZC = single(0.96);
COEFSD = single(0.75);
elseif (VAD_var_param.count_update < VAD_cst_param.INIT_COUNT + 40)
COEF = single(0.995);
COEFZC = single(0.99);
COEFSD = single(0.75);
else
COEF = single(0.995);
COEFZC = single(0.998);
COEFSD = single(0.75);
end
% Update mean of parameters LSF, SE, SLE, SZC
VAD_var_param.MeanSE = COEF * VAD_var_param.MeanSE + (1-COEF) * Ef;
VAD_var_param.MeanSLE = COEF * VAD_var_param.MeanSLE + (1-COEF) * El;
VAD_var_param.MeanSZC = COEFZC * VAD_var_param.MeanSZC + (1-COEFZC) * ZC;
VAD_var_param.MeanLSF = COEFSD * VAD_var_param.MeanLSF + (1-COEFSD) * LSF;
end
if ((VAD_var_param.frm_count > VAD_cst_param.No) && ((VAD_var_param.MeanSE < Min) && (SD < 0.002532959)) || (VAD_var_param.MeanSE > Min + 10 ))
VAD_var_param.MeanSE = Min;
VAD_var_param.count_update = single(0);
end
end
%% Update parameters for next frame
VAD_var_param.prev_energy = Ef;
VAD_var_param.prev_markers = [marker, VAD_var_param.prev_markers(1)];
idx = VAD_cst_param.L_FRAME + 1;
VAD_var_param.window_buffer = speech_buf(idx:end);
%% Return final decision
vad_flag = marker;
return
function dec = vad_decision(dSLE, dSE, SD, dSZC)
% Active voice decision using multi-boundary decision regions in the space
% of the 4 difference measures
a = single([0.00175, -0.004545455, -25, 20, 0, ...
8800, 0, 25, -29.09091, 0, ...
14000, 0.928571, -1.5, 0.714285]);
b = single([0.00085, 0.001159091, -5, -6, -4.7, ...
-12.2, 0.0009, -7.0, -4.8182, -5.3, ...
-15.5, 1.14285, -9, -2.1428571]);
dec = single(0);
if SD > a(1)*dSZC+b(1)
dec = single(1);
return;
end
if SD > a(2)*dSZC+b(2)
dec = single(1);
return;
end
if dSE < a(3)*dSZC+b(3)
dec = single(1);
return;
end
if dSE < a(4)*dSZC+b(4)
dec = single(1);
return;
end
if dSE < b(5)
dec = single(1);
return;
end
if dSE < a(6)*SD+b(6)
dec = single(1);
return;
end
if SD > b(7)
dec = single(1);
return;
end
if dSLE < a(8)*dSZC+b(8)
dec = single(1);
return;
end
if dSLE < a(9)*dSZC+b(9)
dec = single(1);
return;
end
if dSLE < b(10)
dec = single(1);
return;
end
if dSLE < a(11)*SD+b(11)
dec = single(1);
return;
end
if dSLE > a(12)*dSE+b(12)
dec = single(1);
return
end
if dSLE < a(13)*dSE+b(13)
dec = single(1);
return;
end
if dSLE < a(14)*dSE+b(14)
dec = single(1);
return;
end
return