Abstract:
The previous article analyzed the many shortcomings of the VAD algorithm in unimrcp. Is there a better algorithm to replace it? Two approaches are common today: 1. GMM, 2. DNN.
The well-known WebRTC VAD uses a GMM-based algorithm for voice activity detection. This article focuses on the WebRTC VAD algorithm; a later article
will dissect the application of DNNs to VAD. The sections below describe WebRTC's detection principle.
Principle:
First, we need to understand the spectral ranges of the human voice and of musical instruments. The figure below shows the audio spectrum.
(Figure: audio frequency spectrum; image sourced from the web)
Based on this spectrum, the 0-4 kHz range is divided into six sub-bands: 80 Hz~250 Hz, 250 Hz~500 Hz, 500 Hz~1 kHz, 1 kHz~2 kHz, 2 kHz~3 kHz, and 3 kHz~4 kHz, and a feature is computed for each sub-band.
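To make the band layout concrete, here is a minimal sketch that lists the six sub-band edges. It is purely illustrative: kBandEdgesHz is a made-up name, and WebRTC derives these bands implicitly through its split filters rather than from a table like this.

#include <stdio.h>

/* Illustrative only: the six VAD sub-bands as edge frequencies in Hz. */
static const int kBandEdgesHz[7] = {80, 250, 500, 1000, 2000, 3000, 4000};

int main(void) {
    for (int i = 0; i < 6; i++) {
        printf("sub-band %d: %d Hz - %d Hz\n", i, kBandEdgesHz[i], kBandEdgesHz[i + 1]);
    }
    return 0;
}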
Steps:
1. Preparation
1.1 WebRTC supports four detection modes:
0: Normal, 1: Low Bitrate, 2: Aggressive, 3: Very Aggressive. Aggressiveness increases with the numeric value, and the mode can be configured at initialization according to the actual use case. Internally, each mode simply selects a different set of threshold tables, as the function below shows; a usage sketch of the public API follows the listing.
// Set aggressiveness mode
int WebRtcVad_set_mode_core(VadInstT *self, int mode) {
    int return_value = 0;
    switch (mode) {
        case 0:
            // Quality (normal) mode.
            memcpy(self->over_hang_max_1, kOverHangMax1Q,
                   sizeof(self->over_hang_max_1));
            memcpy(self->over_hang_max_2, kOverHangMax2Q,
                   sizeof(self->over_hang_max_2));
            memcpy(self->individual, kLocalThresholdQ,
                   sizeof(self->individual));
            memcpy(self->total, kGlobalThresholdQ,
                   sizeof(self->total));
            break;
        case 1:
            // Low bitrate mode.
            memcpy(self->over_hang_max_1, kOverHangMax1LBR,
                   sizeof(self->over_hang_max_1));
            memcpy(self->over_hang_max_2, kOverHangMax2LBR,
                   sizeof(self->over_hang_max_2));
            memcpy(self->individual, kLocalThresholdLBR,
                   sizeof(self->individual));
            memcpy(self->total, kGlobalThresholdLBR,
                   sizeof(self->total));
            break;
        case 2:
            // Aggressive mode.
            memcpy(self->over_hang_max_1, kOverHangMax1AGG,
                   sizeof(self->over_hang_max_1));
            memcpy(self->over_hang_max_2, kOverHangMax2AGG,
                   sizeof(self->over_hang_max_2));
            memcpy(self->individual, kLocalThresholdAGG,
                   sizeof(self->individual));
            memcpy(self->total, kGlobalThresholdAGG,
                   sizeof(self->total));
            break;
        case 3:
            // Very aggressive mode.
            memcpy(self->over_hang_max_1, kOverHangMax1VAG,
                   sizeof(self->over_hang_max_1));
            memcpy(self->over_hang_max_2, kOverHangMax2VAG,
                   sizeof(self->over_hang_max_2));
            memcpy(self->individual, kLocalThresholdVAG,
                   sizeof(self->individual));
            memcpy(self->total, kGlobalThresholdVAG,
                   sizeof(self->total));
            break;
        default:
            return_value = -1;
            break;
    }
    return return_value;
}
Listing: set-mode code
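For reference, here is a minimal sketch of setting the mode through the public API declared in webrtc_vad.h (WebRtcVad_Create, WebRtcVad_Init, WebRtcVad_set_mode, WebRtcVad_Free). Exact signatures vary slightly between WebRTC releases; older releases take an output pointer in WebRtcVad_Create.

#include "webrtc_vad.h"  // Public VAD API from WebRTC's common_audio.

int setup_vad(VadInst **vad_out) {
    VadInst *vad = WebRtcVad_Create();
    if (vad == NULL) return -1;
    if (WebRtcVad_Init(vad) != 0) {  // Must be called before first use.
        WebRtcVad_Free(vad);
        return -1;
    }
    // Mode 0..3: higher values are more aggressive, i.e. more frames
    // are classified as non-speech.
    if (WebRtcVad_set_mode(vad, 3) != 0) {
        WebRtcVad_Free(vad);
        return -1;
    }
    *vad_out = vad;
    return 0;
}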
1.2 The VAD supports three frame lengths: 80 samples/10 ms, 160 samples/20 ms, and 240 samples/30 ms.
These frame lengths are dictated by the nature of speech: a speech signal is only short-term stationary, and a 10 ms-30 ms window can be treated as stationary. Signal-processing methods such as Gaussian and Markov models are built on the assumption that the signal is stationary. A validity-check sketch is shown below.
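A caller can verify a rate/frame-length combination up front with WebRtcVad_ValidRateAndFrameLength from the public header, which returns 0 when the pair is supported. A minimal sketch:

#include <stddef.h>
#include "webrtc_vad.h"

// Returns 1 if |rate| (Hz) and |frame_length| (samples) form a supported
// pair, e.g. 8000 Hz with 80/160/240 samples (10/20/30 ms).
int is_supported(int rate, size_t frame_length) {
    return WebRtcVad_ValidRateAndFrameLength(rate, frame_length) == 0;
}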
1.3 Supported sampling rates: 8 kHz, 16 kHz, 32 kHz, 48 kHz.
WebRTC accepts audio at 8 kHz, 16 kHz, 32 kHz, and 48 kHz, but input at 16 kHz, 32 kHz, or 48 kHz is first downsampled to 8 kHz before any processing:
int16_t speech_nb[240];  // 30 ms at 8 kHz.
const size_t kFrameLen10ms = (size_t) (fs / 100);
const size_t kFrameLen10ms8khz = 80;
size_t num_10ms_frames = frame_length / kFrameLen10ms;
size_t i = 0;
for (i = 0; i < num_10ms_frames; i++) {
    // Resample each 10 ms chunk of the input down to 8 kHz.
    resampleData(&audio_frame[i * kFrameLen10ms], fs, kFrameLen10ms,
                 &speech_nb[i * kFrameLen10ms8khz], 8000);
}
size_t new_frame_length = frame_length * 8000 / fs;
// Do VAD on an 8 kHz signal
vad = WebRtcVad_CalcVad8khz(self, speech_nb, new_frame_length);
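End to end, a typical caller never touches this resampling directly: it feeds fixed 10/20/30 ms frames to WebRtcVad_Process, which downsamples internally as above. A minimal sketch (the function and variable names here are illustrative):

#include <stdint.h>
#include <stddef.h>
#include "webrtc_vad.h"

// Classify a 16 kHz stream frame by frame; 160 samples = 10 ms at 16 kHz.
// Returns the number of frames judged to contain speech.
size_t count_speech_frames(VadInst *vad, const int16_t *samples, size_t total) {
    const int kRate = 16000;
    const size_t kFrame = 160;
    size_t speech = 0;
    for (size_t i = 0; i + kFrame <= total; i += kFrame) {
        // WebRtcVad_Process returns 1 for speech, 0 for non-speech, -1 on error.
        if (WebRtcVad_Process(vad, kRate, &samples[i], kFrame) == 1) {
            speech++;
        }
    }
    return speech;
}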
2. Compute the sub-band energies and, with the Gaussian models, the probabilities of noise and speech.
WebRtcVad_CalcVad8khz first computes the feature vector via WebRtcVad_CalculateFeatures; the features are the log energies of the six sub-bands, stored in feature_vector.
int16_t WebRtcVad_CalculateFeatures(VadInstT *self, const int16_t *data_in,
                                    size_t data_length, int16_t *features) {
    int16_t total_energy = 0;
    // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to
    // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will
    // have at most 120 samples after the first split and at most 60 samples after
    // the second split.
    int16_t hp_120[120], lp_120[120];
    int16_t hp_60[60], lp_60[60];
    const size_t half_data_length = data_length >> 1;
    size_t length = half_data_length;  // |data_length| / 2, corresponds to
                                       // bandwidth = 2000 Hz after downsampling.
    // Initialize variables for the first SplitFilter().
    int frequency_band = 0;
    const int16_t *in_ptr = data_in;   // [0 - 4000] Hz.
    int16_t *hp_out_ptr = hp_120;      // [2000 - 4000] Hz.
    int16_t *lp_out_ptr = lp_120;      // [0 - 2000] Hz.
    RTC_DCHECK_LE(data_length, 240);
    RTC_DCHECK_LT(4, kNumChannels - 1);  // Checking maximum |frequency_band|.
    // Split at 2000 Hz and downsample.
    SplitFilter(in_ptr, data_length, &self->upper_state[frequency_band],
                &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
    // For the upper band (2000 Hz - 4000 Hz) split at 3000 Hz and downsample.
    frequency_band = 1;
    in_ptr = hp_120;      // [2000 - 4000] Hz.
    hp_out_ptr = hp_60;   // [3000 - 4000] Hz.
    lp_out_ptr = lp_60;   // [2000 - 3000] Hz.
    SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
                &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
    // Energy in 3000 Hz - 4000 Hz.
    length >>= 1;  // |data_length| / 4 <=> bandwidth = 1000 Hz.
    LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]);
    // Energy in 2000 Hz - 3000 Hz.
    LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]);
    // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample.
    frequency_band = 2;
    in_ptr = lp_120;      // [0 - 2000] Hz.
    hp_out_ptr = hp_60;   // [1000 - 2000] Hz.
    lp_out_ptr = lp_60;   // [0 - 1000] Hz.
    length = half_data_length;  // |data_length| / 2 <=> bandwidth = 2000 Hz.
    SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
                &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
    // Energy in 1000 Hz - 2000 Hz.
    length >>= 1;  // |data_length| / 4 <=> bandwidth = 1000 Hz.
    LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]);
    // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample.
    frequency_band = 3;
    in_ptr = lp_60;       // [0 - 1000] Hz.
    hp_out_ptr = hp_120;  // [500 - 1000] Hz.
    lp_out_ptr = lp_120;  // [0 - 500] Hz.
    SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
                &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
    // Energy in 500 Hz - 1000 Hz.
    length >>= 1;  // |data_length| / 8 <=> bandwidth = 500 Hz.
    LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]);
    // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample.
    frequency_band = 4;
    in_ptr = lp_120;      // [0 - 500] Hz.
    hp_out_ptr = hp_60;   // [250 - 500] Hz.
    lp_out_ptr = lp_60;   // [0 - 250] Hz.
    SplitFilter(in_ptr, length, &self->upper_state[frequency_band],
                &self->lower_state[frequency_band], hp_out_ptr, lp_out_ptr);
    // Energy in 250 Hz - 500 Hz.
    length >>= 1;  // |data_length| / 16 <=> bandwidth = 250 Hz.
    LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]);
    // Remove 0 Hz - 80 Hz by high pass filtering the lower band.
    HighPassFilter(lp_60, length, self->hp_filter_state, hp_120);
    // Energy in 80 Hz - 250 Hz.
    LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]);
    return total_energy;
}
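For intuition: LogOfEnergy stores, for each band, roughly log2(band energy) in Q4 plus a per-band offset from kOffsetVector, and accumulates the total energy. A float-domain sketch of what it approximates (illustrative only; the real implementation is pure fixed point, using bit scans and a small correction table instead of log2()):

#include <math.h>
#include <stdint.h>
#include <stddef.h>

// Float sketch of LogOfEnergy: feature ~= offset + log2(energy) in Q4,
// where 1.0 corresponds to 16 in Q4.
static int16_t log_energy_sketch(const int16_t *band, size_t n,
                                 int16_t offset_q4) {
    double energy = 0.0;
    for (size_t i = 0; i < n; i++) {
        energy += (double) band[i] * band[i];
    }
    if (energy <= 0.0) return offset_q4;  // Silence: offset only.
    return (int16_t) (16.0 * log2(energy)) + offset_q4;
}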
WebRtcVad_GaussianProbability evaluates the Gaussian densities that make up the noise and speech models. For each of the six features the algorithm forms a log likelihood ratio between the speech mixture and the noise mixture, plus a weighted sum of the ratios across all channels; the frame is declared speech if any single channel's ratio exceeds its individual threshold, or if the weighted total exceeds the global threshold. A float sketch of this decision rule follows.
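Before the fixed-point density code, here is an illustrative float sketch of the decision logic in GmmProbability (the names are made up; the real code weights each channel by kSpectrumWeight, works with log2-domain shifts, and takes its thresholds from self->individual and self->total, filled in by the mode tables above):

#include <math.h>

// h1[ch] / h0[ch]: two-Gaussian mixture likelihoods of speech / noise
// for one channel. Returns 1 for speech, 0 for non-speech.
static int vad_decision_sketch(const double h1[6], const double h0[6],
                               const double individual[6], double total_thr) {
    double sum_llr = 0.0;
    int vadflag = 0;
    for (int ch = 0; ch < 6; ch++) {
        double llr = log(h1[ch] / h0[ch]);
        if (llr > individual[ch]) vadflag = 1;  // One strong channel suffices.
        sum_llr += llr;  // The real code uses a weighted sum here.
    }
    if (sum_llr > total_thr) vadflag = 1;  // Global test across channels.
    return vadflag;
}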
int32_t WebRtcVad_GaussianProbability(int16_t input,
                                      int16_t mean,
                                      int16_t std,
                                      int16_t *delta) {
    int16_t tmp16, inv_std, inv_std2, exp_value = 0;
    int32_t tmp32;
    // Calculate |inv_std| = 1 / s, in Q10.
    // 131072 = 1 in Q17, and (|std| >> 1) is for rounding instead of truncation.
    // Q-domain: Q17 / Q7 = Q10.
    tmp32 = (int32_t) 131072 + (int32_t) (std >> 1);
    inv_std = (int16_t) DivW32W16(tmp32, std);
    // Calculate |inv_std2| = 1 / s^2, in Q14.
    tmp16 = (inv_std >> 2);  // Q10 -> Q8.
    // Q-domain: (Q8 * Q8) >> 2 = Q14.
    inv_std2 = (int16_t) ((tmp16 * tmp16) >> 2);
    // TODO(bjornv): Investigate if changing to
    // inv_std2 = (int16_t)((inv_std * inv_std) >> 6);
    // gives better accuracy.
    tmp16 = (input << 3);  // Q4 -> Q7
    tmp16 = tmp16 - mean;  // Q7 - Q7 = Q7
    // To be used later, when updating noise/speech model.
    // |delta| = (x - m) / s^2, in Q11.
    // Q-domain: (Q14 * Q7) >> 10 = Q11.
    *delta = (int16_t) ((inv_std2 * tmp16) >> 10);
    // Calculate the exponent |tmp32| = (x - m)^2 / (2 * s^2), in Q10. Replacing
    // division by two with one shift.
    // Q-domain: (Q11 * Q7) >> 8 = Q10.
    tmp32 = (*delta * tmp16) >> 9;
    // If the exponent is small enough to give a non-zero probability we calculate
    // |exp_value| ~= exp(-(x - m)^2 / (2 * s^2))
    //             ~= exp2(-log2(exp(1)) * |tmp32|).
    if (tmp32 < kCompVar) {
        // Calculate |tmp16| = log2(exp(1)) * |tmp32|, in Q10.
        // Q-domain: (Q12 * Q10) >> 12 = Q10.
        tmp16 = (int16_t) ((kLog2Exp * tmp32) >> 12);
        tmp16 = -tmp16;
        exp_value = (0x0400 | (tmp16 & 0x03FF));
        tmp16 ^= 0xFFFF;
        tmp16 >>= 10;
        tmp16 += 1;
        // Get |exp_value| = exp(-|tmp32|) in Q10.
        exp_value >>= tmp16;
    }
    // Calculate and return (1 / s) * exp(-(x - m)^2 / (2 * s^2)), in Q20.
    // Q-domain: Q10 * Q10 = Q20.
    return inv_std * exp_value;
}
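In float terms, the function above computes the unnormalized Gaussian density (the 1/sqrt(2*pi) factor is folded into the mixture weights elsewhere) and, as a side product, the delta term that the model update reuses. A float reference sketch:

#include <math.h>

// Float reference for WebRtcVad_GaussianProbability: returns
// (1 / s) * exp(-(x - m)^2 / (2 * s^2)) and writes delta = (x - m) / s^2,
// mirroring the Q-format code above. Sketch only; the real code stays in
// Q4/Q7/Q10/Q20 and approximates exp() with exp2 bit shifts.
static double gaussian_probability_sketch(double x, double m, double s,
                                          double *delta) {
    *delta = (x - m) / (s * s);
    return (1.0 / s) * exp(-(x - m) * (x - m) / (2.0 * s * s));
}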
3. Finally, update the model parameters.
3.1 WebRtcVad_FindMinimum tracks the minimum feature value over a history window; this long-term minimum, together with a weighted average of the noise means, drives a long-term correction of the noise model.
3.2 Update the model parameters: the noise-model means, the speech-model means, the noise-model standard deviations, and the speech-model standard deviations. A float sketch of the mean-update rule follows the listing below.
// Update the model parameters.
maxspe = 12800;
for (channel = 0; channel < kNumChannels; channel++) {
    // Get minimum value in past which is used for long term correction in Q4.
    feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);
    // Compute the "global" mean, that is the sum of the two means weighted.
    noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                        &kNoiseDataWeights[channel]);
    tmp1_s16 = (int16_t) (noise_global_mean >> 6);  // Q8
    for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;
        nmk = self->noise_means[gaussian];
        smk = self->speech_means[gaussian];
        nsk = self->noise_stds[gaussian];
        ssk = self->speech_stds[gaussian];
        // Update noise mean vector if the frame consists of noise only.
        nmk2 = nmk;
        if (!vadflag) {
            // deltaN = (x-mu)/sigma^2
            // ngprvec[k] = |noise_probability[k]| /
            //   (|noise_probability[0]| + |noise_probability[1]|)
            // (Q14 * Q11 >> 11) = Q14.
            delt = (int16_t) ((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
            // Q7 + (Q14 * Q15 >> 22) = Q7.
            nmk2 = nmk + (int16_t) ((delt * kNoiseUpdateConst) >> 22);
        }
        // Long term correction of the noise mean.
        // Q8 - Q8 = Q8.
        ndelt = (feature_minimum << 4) - tmp1_s16;
        // Q7 + (Q8 * Q8) >> 9 = Q7.
        nmk3 = nmk2 + (int16_t) ((ndelt * kBackEta) >> 9);
        // Control that the noise mean does not drift too much.
        tmp_s16 = (int16_t) ((k + 5) << 7);
        if (nmk3 < tmp_s16) {
            nmk3 = tmp_s16;
        }
        tmp_s16 = (int16_t) ((72 + k - channel) << 7);
        if (nmk3 > tmp_s16) {
            nmk3 = tmp_s16;
        }
        self->noise_means[gaussian] = nmk3;
        if (vadflag) {
            // Update speech mean vector:
            // |deltaS| = (x-mu)/sigma^2
            // sgprvec[k] = |speech_probability[k]| /
            //   (|speech_probability[0]| + |speech_probability[1]|)
            // (Q14 * Q11) >> 11 = Q14.
            delt = (int16_t) ((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
            // Q14 * Q15 >> 21 = Q8.
            tmp_s16 = (int16_t) ((delt * kSpeechUpdateConst) >> 21);
            // Q7 + (Q8 >> 1) = Q7. With rounding.
            smk2 = smk + ((tmp_s16 + 1) >> 1);
            // Control that the speech mean does not drift too much.
            maxmu = maxspe + 640;
            if (smk2 < kMinimumMean[k]) {
                smk2 = kMinimumMean[k];
            }
            if (smk2 > maxmu) {
                smk2 = maxmu;
            }
            self->speech_means[gaussian] = smk2;  // Q7.
            // (Q7 >> 3) = Q4. With rounding.
            tmp_s16 = ((smk + 4) >> 3);
            tmp_s16 = features[channel] - tmp_s16;  // Q4
            // (Q11 * Q4 >> 3) = Q12.
            tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
            tmp2_s32 = tmp1_s32 - 4096;
            tmp_s16 = sgprvec[gaussian] >> 2;
            // (Q14 >> 2) * Q12 = Q24.
            tmp1_s32 = tmp_s16 * tmp2_s32;
            tmp2_s32 = tmp1_s32 >> 4;  // Q20
            // 0.1 * Q20 / Q7 = Q13.
            if (tmp2_s32 > 0) {
                tmp_s16 = (int16_t) DivW32W16(tmp2_s32, ssk * 10);
            } else {
                tmp_s16 = (int16_t) DivW32W16(-tmp2_s32, ssk * 10);
                tmp_s16 = -tmp_s16;
            }
            // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
            // Note that division by 4 equals shift by 2, hence,
            // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
            tmp_s16 += 128;  // Rounding.
            ssk += (tmp_s16 >> 8);
            if (ssk < kMinStd) {
                ssk = kMinStd;
            }
            self->speech_stds[gaussian] = ssk;
        } else {
            // Update GMM variance vectors.
            // deltaN * (features[channel] - nmk) - 1
            // Q4 - (Q7 >> 3) = Q4.
            tmp_s16 = features[channel] - (nmk >> 3);
            // (Q11 * Q4 >> 3) = Q12.
            tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
            tmp1_s32 -= 4096;
            // (Q14 >> 2) * Q12 = Q24.
            tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
            tmp2_s32 = (tmp_s16 * tmp1_s32);
            // tmp2_s32 = OverflowingMulS16ByS32ToS32(tmp_s16, tmp1_s32);
            // Q20 * approx 0.001 (2^-10=0.0009766), hence,
            // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
            tmp1_s32 = tmp2_s32 >> 14;
            // Q20 / Q7 = Q13.
            if (tmp1_s32 > 0) {
                tmp_s16 = (int16_t) DivW32W16(tmp1_s32, nsk);
            } else {
                tmp_s16 = (int16_t) DivW32W16(-tmp1_s32, nsk);
                tmp_s16 = -tmp_s16;
            }
            tmp_s16 += 32;  // Rounding
            nsk += tmp_s16 >> 6;  // Q13 >> 6 = Q7.
            if (nsk < kMinStd) {
                nsk = kMinStd;
            }
            self->noise_stds[gaussian] = nsk;
        }
    }
    // Separate models if they are too close.
    // |noise_global_mean| in Q14 (= Q7 * Q7).
    noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                        &kNoiseDataWeights[channel]);
    // |speech_global_mean| in Q14 (= Q7 * Q7).
    speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
                                         &kSpeechDataWeights[channel]);
    // |diff| = "global" speech mean - "global" noise mean.
    // (Q14 >> 9) - (Q14 >> 9) = Q5.
    diff = (int16_t) (speech_global_mean >> 9) -
           (int16_t) (noise_global_mean >> 9);
    if (diff < kMinimumDifference[channel]) {
        tmp_s16 = kMinimumDifference[channel] - diff;
        // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
        // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
        tmp1_s16 = (int16_t) ((13 * tmp_s16) >> 2);
        tmp2_s16 = (int16_t) ((3 * tmp_s16) >> 2);
        // Move Gaussian means for speech model by |tmp1_s16| and update
        // |speech_global_mean|. Note that |self->speech_means[channel]| is
        // changed after the call.
        speech_global_mean = WeightedAverage(&self->speech_means[channel],
                                             tmp1_s16,
                                             &kSpeechDataWeights[channel]);
        // Move Gaussian means for noise model by -|tmp2_s16| and update
        // |noise_global_mean| in the same way.
        noise_global_mean = WeightedAverage(&self->noise_means[channel],
                                            -tmp2_s16,
                                            &kNoiseDataWeights[channel]);
    }
    // Control that the speech & noise means do not drift too much.
    maxspe = kMaximumSpeech[channel];
    tmp2_s16 = (int16_t) (speech_global_mean >> 7);
    if (tmp2_s16 > maxspe) {
        // Upper limit of speech model.
        tmp2_s16 -= maxspe;
        for (k = 0; k < kNumGaussians; k++) {
            self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
        }
    }
    tmp2_s16 = (int16_t) (noise_global_mean >> 7);
    if (tmp2_s16 > kMaximumNoise[channel]) {
        tmp2_s16 -= kMaximumNoise[channel];
        for (k = 0; k < kNumGaussians; k++) {
            self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
        }
    }
}
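Stripped of the Q-format bookkeeping, the mean update above is a standard gradient-style GMM adaptation plus a long-term pull toward the tracked noise floor. A float sketch of the noise-mean update (the constants are illustrative stand-ins for kNoiseUpdateConst and kBackEta):

// Float sketch of one Gaussian's noise-mean update. |resp| is the
// responsibility ngprvec of this Gaussian, |feat_min| the long-term
// minimum from WebRtcVad_FindMinimum, |global_mu| the weighted average
// of the channel's noise means.
static double update_noise_mean_sketch(double mu, double global_mu, double x,
                                       double sigma, double resp,
                                       double feat_min, int frame_is_speech) {
    const double kUpdate = 0.02;  // Stand-in for kNoiseUpdateConst.
    const double kBack = 0.002;   // Stand-in for kBackEta.
    if (!frame_is_speech) {
        // Gradient step toward the sample, weighted by the responsibility.
        mu += kUpdate * resp * (x - mu) / (sigma * sigma);
    }
    // Long-term correction: nudge toward the tracked feature minimum.
    mu += kBack * (feat_min - global_mu);
    return mu;
}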