- 參考pyAudioAnalysis、openSmile以及語音信號處理實驗教程(MATLAB源代碼)
- Introduction to Audio Analysis–A Matlab Approach
- 完整測試文件
- 注意,以下代碼不在genFeatures.py內的,可在pyAudioAnalysis.audioFeatureExtraction文件內觀察得到
1.過零率
zero crossing rate
每幀信號內,信號過零點的次數,體現的是頻率特性。
import numpy as np
def stZCR(frame):
    """Zero-crossing rate of one frame.

    Counts sign changes between consecutive samples and normalizes by the
    number of sample transitions (len(frame) - 1).
    """
    n_samples = len(frame)
    # Each sign change makes |diff(sign)| contribute 2, hence the / 2.
    n_crossings = np.sum(np.abs(np.diff(np.sign(frame)))) / 2
    return np.float64(n_crossings) / np.float64(n_samples - 1)
2.能量
energy
短時能量,即每幀信號的平方和,體現的是信號能量的強弱
import numpy as np
def stEnergy(frame):
    """Short-time energy: the mean of the squared samples in the frame."""
    total = np.sum(np.square(frame))
    return total / np.float64(len(frame))
2.1 振幅擾動度-分貝形式
shimmer in DB
import numpy as np
def stShimmerDB(frame):
    '''
    Amplitude shimmer in decibels: the mean absolute dB ratio between
    consecutive peak-to-peak amplitudes.
    [3]
    Relies on a module-level `eps` to guard the division.
    '''
    n = len(frame)
    total = 0.0
    for i in range(n - 1):
        ratio = frame[i + 1] / (frame[i] + eps)
        total += np.abs(20 * np.log10(np.abs(ratio)))
    return np.float64(total) / np.float64(n - 1)
2.2 振幅擾動度-百分數形式
def stShimmerRelative(frame):
    '''
    Relative shimmer: average absolute difference between the amplitudes of
    consecutive periods, divided by the average amplitude (a percentage-style
    ratio).
    [3]
    Relies on a module-level `eps` to guard the division.
    '''
    n = len(frame)
    mags = np.abs(frame)
    diff_acc = 0
    amp_acc = 0
    for i, mag in enumerate(mags):
        if i < n - 1:
            diff_acc += np.abs(mag - mags[i + 1])
        amp_acc += mag
    return np.float64(diff_acc / (n - 1)) / np.float64(amp_acc / n + eps)
3. 聲強/響度
intensity / loudness
- intensity: mean of squared input values multiplied by a Hamming window
聲強和響度是對應的概念,參考openSmile程序
###################
##
## from opensimle
##
#####################
def stIntensity(frame):
    '''
    Intensity and loudness of one frame, after openSMILE's cIntensity:
    intensity is the Hamming-window-weighted mean of the squared samples,
    and loudness = (intensity / I0) ** 0.3 with I0 = 1e-6 (the normalized
    hearing threshold used by openSMILE).

    :param frame: 1-D array of samples
    :return: (intensity, loudness)

    Bug fixes vs. the original: the loop overwrote the accumulator each
    iteration (`Im =` instead of `Im +=`), so only the last sample ever
    contributed; and loudness was computed from the unnormalized sum
    instead of the window-normalized intensity.
    '''
    fn = len(frame)
    ham_win = np.hamming(fn)
    win_sum = np.sum(ham_win)
    if win_sum <= 0.0:
        win_sum = 1.0  # guard against a degenerate window
    I0 = 0.000001
    # Weighted sum of squared samples (vectorized accumulation).
    Im = np.sum(ham_win * frame ** 2)
    intensity = Im / win_sum
    loudness = (intensity / I0) ** .3
    return intensity, loudness
4. 基頻
計算基頻的方法包括倒譜法、短時自相關法和線性預測法。本文采用短時自相關法
1)基音檢測預處理:語音端點檢測
由於語音的頭部和尾部不具有週期性,因此爲了提高基音檢測的準確性,在基音檢測時採用了端點檢測。使用譜熵法進行端點檢測。
語音信號時域波形爲 $x(n)$,加窗分幀後第 $i$ 幀語音信號爲 $x_i(m)$,其FFT表示爲 $X_i(k)$,$k$ 表示第 $k$ 條譜線。該語音幀在頻域中的短時能量爲 $E_i = \sum_{k=0}^{N/2} |X_i(k)|^2$
$N$ 爲FFT長度,只取正頻率部分。
某一譜線 $k$ 的能量譜爲 $Y_i(k) = |X_i(k)|^2$,則每個頻率分量的歸一化譜概率密度函數定義爲 $p_i(k) = Y_i(k) \big/ \sum_{l=0}^{N/2} Y_i(l)$
該語音幀短時譜熵定義爲 $H_i = -\sum_{k=0}^{N/2} p_i(k) \log p_i(k)$
$H_i$ 是第 $i$ 幀的譜熵,它表示譜的能量變化,在不同水平噪聲環境中譜熵參數具有一定穩健性,但每一譜點的幅值易受噪聲的污染而影響端點檢測。
在說話區間內的譜熵值小於噪聲區段內的譜熵值,因此能量與譜熵的比值(稱爲能熵比)$EEF_i = \sqrt{1 + |E_i / H_i|}$ 在說話區間大於噪聲區間。
設置閾值 $T_1$,當 $EEF_i$ 不小於 $T_1$ 時,判定爲說話區間。
2)自相關法檢測基頻
爲了減少共振峯的干擾,基頻檢測的頻率範圍一般爲60Hz~500Hz,此時相應的基音週期樣本點範圍爲 $P_{\min} = f_s/500$ 到 $P_{\max} = f_s/60$(注:下方代碼實際以 $f_s/27.5$ 作爲上限)。
將一幀信號進行自相關,當延遲量等於基音週期時,此時信號的值最大。因此,檢測基頻的具體做法即爲:將信號進行自相關運算後,找出最大值對應的採樣點數。
####################################
##
## calculate pitch with correlation
## calPitch() JitterAbsolute() JitterRelative()
##
####################################
class voiceSegment:
    """A contiguous run of voiced frames: begin/end frame indices plus its
    length.

    Note: 'duratioin' (sic) keeps the original misspelled attribute name
    because other functions in this file read it.
    """

    def __init__(self, in1=0, in2=0, duratioin=0):
        # Parameter names in1/in2 are kept for positional/keyword callers.
        self.duratioin = duratioin
        self.end = in2
        self.begin = in1
def pitch_vad(x, win, step, T1, miniL):
    """Voice activity detection (endpoint detection) via the
    energy-to-spectral-entropy ratio.

    :param x: 1-D input signal
    :param win: frame length in samples
    :param step: frame shift in samples
    :param T1: threshold on the normalized energy-entropy ratio
    :param miniL: minimum voiced-segment length, in frames
    :return: (voiceseg, vosl, SF, Ef) -- list of voiceSegment objects, its
             length, a 0/1 voiced flag per frame, and the normalized ratio.

    NOTE(review): relies on module-level `enframe` and `eps` defined
    elsewhere in the file -- confirm they are in scope.
    """
    # Endpoint detection
    y = enframe(x, win, step).T
    fn = len(y[0, :])
    print(fn)  # NOTE(review): leftover debug output
    Esum = []  # per-frame energy
    H = []  # per-frame spectral entropy
    for i in range(fn):
        Sp = np.abs(np.fft.fft(y[:, i]))
        Sp = Sp[:int(win / 2)]  # keep the positive-frequency half
        Esum.append(np.sum(Sp ** 2))  # energy
        prob = Sp / (np.sum(Sp))  # normalized spectral probability
        H.append(- np.sum(prob * np.log(prob + eps)))
    H = np.array(H)
    hindex = np.where(H < 0.1)
    # Clamp near-zero entropies so the energy/entropy ratio stays bounded.
    H[hindex] = np.max(H)
    # Energy-to-entropy ratio ("energy entropy percentage"); larger in
    # speech regions than in noise regions.
    Ef = np.sqrt(1 + np.abs(Esum / H))
    Ef = Ef / np.max(Ef)  # normalize to [0, 1]
    zindex = np.where(Ef >= T1)  # frames whose ratio reaches the threshold
    zseg = findSegment(zindex)  # merge those frames into contiguous segments
    zsl = len(zseg)  # number of candidate segments
    j = 0
    SF = np.zeros(fn)
    voiceseg = []
    for k in range(zsl):
        if zseg[k].duratioin >= miniL:  # keep only segments long enough
            j = j + 1
            in1 = zseg[k].begin
            in2 = zseg[k].end
            voiceseg.append(zseg[k])
            SF[in1:in2] = 1
    vosl = len(voiceseg)  # number of voiced segments
    return voiceseg, vosl, SF, Ef
def findSegment(express):
    """Group consecutive indices into contiguous voiceSegment runs.

    :param express: array-like of frame indices (e.g. the tuple returned by
                    np.where); assumed non-empty and sorted ascending
    :return: list of voiceSegment with begin/end/duratioin filled in
    """
    indices = np.array(express).flatten()
    segments = [voiceSegment(indices[0])]
    for prev, cur in zip(indices[:-1], indices[1:]):
        if cur - prev > 1:
            # Gap found: close the current run and open a new one.
            segments[-1].end = prev
            segments.append(voiceSegment(cur))
    segments[-1].end = indices[-1]
    for seg in segments:
        seg.duratioin = seg.end - seg.begin + 1
    return segments
def pitch_Corr(x, win, step, T1, sr, miniL=10):
    """Pitch-period estimation by short-time autocorrelation, restricted to
    voiced segments found by pitch_vad.

    :param x: 1-D input signal
    :param win: frame length in samples
    :param step: frame shift in samples
    :param T1: VAD threshold forwarded to pitch_vad
    :param sr: sampling rate in Hz
    :param miniL: minimum voiced-segment length in frames
    :return: (vseg, vsl, SF, Ef, period) -- period[i] is the estimated pitch
             period of frame i in samples, 0 for unvoiced frames.

    NOTE(review): relies on module-level `enframe` defined elsewhere.
    """
    win = int(win);
    step = int(step)
    vseg, vsl, SF, Ef = pitch_vad(x, win, step, T1, miniL)
    y = enframe(x, win, step).T
    fn = len(SF)
    # Autocorrelation lag search range: sr/500 (500 Hz) to sr/27.5 (27.5 Hz).
    # NOTE(review): the accompanying text states a 60-500 Hz range; the
    # 27.5 Hz lower bound here widens it -- confirm which is intended.
    lmin = int(sr / 500)
    lmax = int(sr / 27.5)
    period = np.zeros(fn)
    for i in range(vsl):
        ixb = vseg[i].begin
        ixe = vseg[i].end  # NOTE(review): unused
        ixd = vseg[i].duratioin
        for k in range(ixd):
            u = y[:, k + ixb]  # one voiced frame
            ru = np.correlate(u, u, mode='full')
            ru = ru[win - 1:]  # keep the non-negative lags
            # Lag of the autocorrelation maximum within [lmin, lmax).
            # NOTE(review): if the maximum occurs at several lags, tloc has
            # more than one element and the scalar assignment below raises.
            tloc = np.array(np.where(ru[lmin:lmax] == np.max(ru[lmin:lmax]))).flatten()
            period[k + ixb] = lmin + tloc - 1
    return vseg, vsl, SF, Ef, period
def calPitch(y, win, step, sr):
    '''
    Estimate the pitch contour of a signal.
    :param y: data of wav file
    :param win: frame length in samples
    :param step: frame shift in samples
    :param sr: sampling rate of the wav file
    :return: pitch in Hz per frame, pitch period in samples
    '''
    threshold = 0.05  # VAD threshold T1
    _, _, _, _, period = pitch_Corr(y, win, step, threshold, sr)
    # Unvoiced frames have period 0; eps keeps the division finite, and the
    # resulting absurdly high "pitches" are zeroed just below.
    pitch = sr / (period + eps)
    pitch[np.where(pitch > 5000)] = 0
    return pitch, period
4.1 頻率抖動度-分貝形式
jitter
def JitterAbsolute(pitch):
    """Absolute jitter: the average absolute difference between consecutive
    pitch periods.

    :param pitch: per-frame pitch values in Hz (0 for unvoiced frames)
    :return: mean |T[i] - T[i+1]| over consecutive frames

    Bug fix: the original assigned (`sigma =`) instead of accumulating
    (`sigma +=`) inside the loop, so only the last pair of frames
    contributed to the average.
    """
    period = 1 / (pitch + eps)
    # Unvoiced frames (pitch ~ 0) map to absurdly long periods; zero them.
    pindex = np.where(period > 5000)
    period[pindex] = 0
    n = len(period)
    sigma = 0
    for i in range(n - 1):
        sigma += np.abs(period[i] - period[i + 1])
    jitter_absolute = sigma / (n - 1)
    return jitter_absolute
4.2 頻率抖動度-百分數形式
jitter
def JitterRelative(pitch):
    """Relative jitter: absolute jitter divided by the mean pitch period.

    :param pitch: per-frame pitch values in Hz (0 for unvoiced frames)
    :return: JitterAbsolute(pitch) / mean(period)

    Bug fix: the original computed jitter_relative but never returned it,
    so the function always returned None.
    """
    period = 1 / (pitch + eps)
    pindex = np.where(period > 5000)
    period[pindex] = 0  # zero out unvoiced frames
    n = len(period)
    jitter_relative = JitterAbsolute(pitch) / (np.sum(period) / n)
    return jitter_relative
4.3 諧噪比HNR
harmonic to noise ratio
指基音頻率
def stHNR(frame, period):
    '''
    Harmonics-to-noise ratio of one frame, in dB:
        HNR = 10 * log10( ACF(T0) / (ACF(0) - ACF(T0)) )
    where ACF is the autocorrelation and T0 the pitch period.

    :param frame: one frame of the signal
    :param period: pitch period of the frame, in samples
    :return: HNR in dB, or 0 when the frame is unvoiced (period == 0)

    Fixes vs. the original: removed leftover debug prints, and switched
    np.log -> np.log10 (the quantity is in decibels; np.log is the natural
    logarithm).
    '''
    period = int(period)
    if period == 0:
        return 0  # unvoiced frame: no pitch period to evaluate
    ru = np.correlate(frame, frame, mode='full')
    win = len(frame)
    ru = ru[win - 1:]  # keep non-negative lags; ru[0] is the frame energy
    HNR = 10 * np.log10(ru[period] / (ru[0] - ru[period]))
    return HNR
5. 共振峯
formant
共振峯可用倒譜法與LPC法估計得出。其中LPC法是通過FFT對任意頻率求得其功率譜幅值響應,並從幅值響應中找到共振峯,相應的方法包括拋物線內插法和線性預測係數求複數根法。
本文采用拋物線內插法。
聲道可以看成具有非均勻截面的聲管,在發音時起到共鳴器的作用。當準週期激勵進入聲道時會引起共振特性,產生一組共振頻率,稱爲共振峯頻率。
語音產生模型是將輻射、聲道以及聲門激勵的全部效應簡化爲一個時變的數字濾波器,其傳遞函數爲
這種表現形式稱爲階線性預測模型,這是一個全極點模型。
令,則功率譜可表示爲
利用FFT對任意頻率求功率譜幅值響應,從幅值響應中找出共振峯。相應的求解方法包括拋物線內插法和線性預測係數求復根法。
拋物線內插法,見《語音信號處理實驗教程》p105-107
#############################################
##
## calculate formant frequency and bandwidth
##
#############################################
# def Formant_Interpolation(u, sr, p=12):
def stFormant(u, sr, p=12):
    '''
    Estimate formant frequencies and bandwidths via LPC and parabolic
    interpolation of the LPC power-spectrum peaks.
    F: formant frequency
    Bw: formant bandwith
    u: one frame of signal
    p: number of LPC
    sr: sampling rate
    return
    [1] formant frequency array (Hz)
    [2] formant bandwidth array (Hz)

    NOTE(review): `lpc` and `signal` are imported elsewhere in the file --
    presumably audiolazy's lpc and scipy.signal; confirm before reuse.
    '''
    ### calculate lpc begin
    a_filter = lpc.autocor(u, p)
    a_filter_num = a_filter.numdict
    # numdict maps polynomial power -> coefficient and may omit zero terms;
    # rebuild a dense coefficient array, filling the gaps with zeros.
    i = 0
    a = []
    for k, v in a_filter_num.items():
        if i != k:
            while (i != k):
                a.append(0)
                i = i + 1
        a.append(v)
        i = i + 1
    a = np.array(a)
    ### calculate lpc end
    U = lpcar2pf(a, 255)  # power-spectrum curve from the LPC coefficients
    df = sr / 512  # frequency resolution (Hz per spectral bin, 512-pt FFT)
    Loc, Mdict = signal.find_peaks(U)  # find peaks in U
    nFormant = len(Loc)
    F = np.zeros(nFormant)  # formant frequencies
    Bw = np.zeros(nFormant)  # formant bandwidths
    # Parabolic interpolation around each spectral peak
    # (see "Speech Signal Processing Experiment Tutorial", pp. 105-107).
    i = 0
    for m in Loc:
        m1 = m - 1;
        m2 = m + 1
        p = U[m];
        p1 = U[m1];
        p2 = U[m2]
        # Fit y = aa*x^2 + bb*x + cc through the three points around the peak.
        aa = (p1 + p2) / 2 - p
        bb = (p2 - p1) / 2
        cc = p
        dm = - bb / (2 * aa)  # sub-bin offset of the parabola's maximum
        pp = -bb ** 2 / (4 * aa) + cc  # power at the interpolated centre frequency
        bf = - np.sqrt(bb ** 2 - 4 * aa * (cc - 0.5 * pp)) / (2 * aa)  # half-power x offset
        F[i] = (m + dm) * df
        Bw[i] = 2 * bf * df
        i = i + 1
    return F, Bw
def lpcar2pf(ar, npoints):
    '''
    Power-spectrum curve of an all-pole (AR/LPC) model: |A(w)|^-2.
    ar : LPC coefficient array
    npoints : spectrum range; the FFT size is 2 * npoints + 2
    return : power-spectrum curve (npoints + 2 points)
    '''
    spectrum = np.fft.rfft(ar, 2 * npoints + 2)
    magnitude = np.abs(spectrum)
    return magnitude ** (-2)
def pre_emphasis(y, coefficient=0.99):
    '''
    High-pass pre-emphasis filter: out[n] = y[n] - coefficient * y[n-1],
    with the first sample passed through unchanged.
    y : original signal
    coefficient: emphasis coefficient
    '''
    emphasized = y[1:] - coefficient * y[:-1]
    return np.append(y[0], emphasized)
6. Entropy of Energy:能量熵
跟頻譜的譜熵(Spectral Entropy)有點類似,不過它描述的是信號的時域分佈情況,體現的是連續性。也是短時特徵。在第 $i$ 幀信號內,將信號分爲 $K$ 個子塊。
第 $j$ 個子塊與一幀信號總能量的比值爲 $e_j = E_j / E_{total}$,其中 $E_j$ 的值與子塊 $j$ 的總能量對應,$E_{total}$ 爲整幀能量。
則一幀信號的能量熵值爲 $H_i = -\sum_{j=1}^{K} e_j \log_2 e_j$
def stEnergyEntropy(frame, n_short_blocks=10):
    """Entropy of energy: entropy of the frame's energy distributed over
    n_short_blocks sub-frames (a measure of abrupt energy changes)."""
    total_energy = numpy.sum(frame ** 2)  # whole-frame energy
    n_samples = len(frame)
    sub_win_len = int(numpy.floor(n_samples / n_short_blocks))
    # Drop trailing samples that do not fill a whole sub-frame.
    if n_samples != sub_win_len * n_short_blocks:
        frame = frame[0:sub_win_len * n_short_blocks]
    # Column j holds sub-frame j (Fortran order fills columns first).
    sub_wins = frame.reshape(sub_win_len, n_short_blocks, order='F').copy()
    # Each sub-frame's share of the total energy ...
    shares = numpy.sum(sub_wins ** 2, axis=0) / (total_energy + eps)
    # ... and the entropy of those shares.
    return -numpy.sum(shares * numpy.log2(shares + eps))
7. Spectral Centroid:頻譜質心
又稱爲頻譜一階矩,頻譜中心的值越小,表明越多的頻譜能量集中在低頻範圍內,如:voice與music相比,通常spectral centroid較低。第 $i$ 幀信號的頻譜質心:$C_i = \dfrac{\sum_{k=1}^{N} k\,X_i(k)}{\sum_{k=1}^{N} X_i(k)}$
$X_i(k)$ 爲第 $i$ 幀信號的第 $k$ 根頻譜線,$N$ 爲一幀信號頻譜的譜線數
def stSpectralCentroidAndSpread(X, fs):
    """Spectral centroid and spread of a frame, given its magnitude spectrum
    abs(FFT); both values are normalized by the Nyquist frequency fs/2."""
    num_bins = len(X)
    # Centre frequency (Hz) of each spectral line (1-based bin index).
    freqs = (numpy.arange(1, num_bins + 1)) * (fs / (2.0 * num_bins))
    spectrum = X.copy()
    spectrum = spectrum / spectrum.max()  # scale the spectrum to 0..1
    weight_sum = numpy.sum(spectrum) + eps
    # Centroid: magnitude-weighted mean frequency.
    centroid = numpy.sum(freqs * spectrum) / weight_sum
    # Spread: magnitude-weighted standard deviation around the centroid.
    spread = numpy.sqrt(numpy.sum(((freqs - centroid) ** 2) * spectrum) / weight_sum)
    half_fs = fs / 2.0
    return (centroid / half_fs, spread / half_fs)
8. Spectral Spread:頻譜延展度
又稱爲頻譜二階中心矩,它描述了信號在頻譜中心周圍的分佈狀況。第i幀信號的頻譜延展度
程序同7.Spectral Centroid
9. Spectral Entropy:譜熵
根據熵的特性可以知道,分佈越均勻,熵越大,能量熵反應了每一幀信號的均勻程度,如說話人頻譜由於共振峯存在顯得不均勻,而白噪聲的頻譜就更加均勻,藉此進行VAD便是應用之一。
與計算能量熵類似,譜熵是將FFT後的頻譜劃分爲個子塊,分別計算每個子塊與頻譜總能量的比例,最後將所有子塊的譜熵相加,即爲第幀信號的譜熵。
第個子塊的比例:
第幀譜熵
def stSpectralEntropy(X, n_short_blocks=10):
    """Spectral entropy: entropy of the spectrum's energy distributed over
    n_short_blocks sub-bands."""
    n_bins = len(X)  # number of spectral samples
    total_energy = numpy.sum(X ** 2)  # total spectral energy
    block_len = int(numpy.floor(n_bins / n_short_blocks))  # sub-band length
    if n_bins != block_len * n_short_blocks:
        X = X[0:block_len * n_short_blocks]  # drop the ragged tail
    # One sub-band per column (Fortran-order reshape).
    blocks = X.reshape(block_len, n_short_blocks, order='F').copy()
    # Each sub-band's share of the total spectral energy.
    shares = numpy.sum(blocks ** 2, axis=0) / (total_energy + eps)
    return -numpy.sum(shares * numpy.log2(shares + eps))
10. Spectral Flux:頻譜通量
描述的是相鄰幀頻譜的變化情況。計算了頻譜歸一化之後,兩幀頻譜差的平方的總和:$Fl_i = \sum_{k=1}^{N} \big(EN_i(k) - EN_{i-1}(k)\big)^2$
其中,$EN_i(k) = X_i(k) \big/ \sum_{l=1}^{N} X_i(l)$,
$X_i(k)$ 指第 $i$ 幀的第 $k$ 根頻譜線。
def stSpectralFlux(X, X_prev):
    """
    Spectral flux: the summed squared difference between the normalized
    magnitude spectra of the current and previous frames.
    ARGUMENTS:
        X: the abs(fft) of the current frame
        X_prev: the abs(fft) of the previous frame
    """
    # Normalize each spectrum by its own total magnitude before comparing.
    cur = X / numpy.sum(X + eps)
    prev = X_prev / numpy.sum(X_prev + eps)
    return numpy.sum((cur - prev) ** 2)
11. Spectral Rolloff:頻譜滾降點
頻譜的能量在一定頻率範圍內是集中的。當累積頻譜能量達到一確切百分比 $c$(通常爲90%左右),即 $\sum_{k=1}^{m} X_i(k)^2 \ge c \sum_{k=1}^{N} X_i(k)^2$ 時,相應的DFT座標 $m$ 即爲滾降點的座標。然後將滾降點座標除以FFT長度歸一化,得 $m/N$。
$m$ 爲滾降點座標,$N$ 爲FFT長度,$X_i(k)$ 爲第 $k$ 根譜線。
def stSpectralRollOff(X, c, fs):
    """Spectral roll-off: the bin position (normalized by the FFT length)
    where the cumulative spectral energy first exceeds fraction c of the
    total spectral energy. `fs` is unused but kept for interface parity."""
    power = X ** 2
    threshold = c * numpy.sum(power)
    # Cumulative energy per bin; locate the first threshold crossing.
    cum_energy = numpy.cumsum(power) + eps
    [hits, ] = numpy.nonzero(cum_energy > threshold)
    if len(hits) > 0:
        return numpy.float64(hits[0]) / (float(len(X)))
    return 0.0
12. MFCCs:梅爾倒譜系數
梅爾頻率倒譜系數基於人的聽覺特性機理,即根據人的聽覺實驗結果分析語音頻譜。與實際頻率的關係爲
- 梅爾濾波器
梅爾頻率相當於在語音的頻譜範圍內設置若干帶通濾波器,,爲濾波器的個數,自己設定。每個濾波器具有三角形濾波特性,中心頻率爲,在Mel頻率範圍內,這些濾波器等帶寬。每個帶通濾波器的傳遞函數爲
梅爾濾波器的中心頻率爲
其中,和是濾波器組的最高頻率和最低頻率,爲採樣頻率,是濾波器組的數目,爲FFT變換的點數 - MFCC係數計算
(1)預處理
預加重、分幀、加窗,得第幀信號爲
(2)FFT
(3)計算譜線能量
(4)計算通過梅爾濾波器的能量
將一幀能量譜的每一根譜線分別與梅爾濾波器的頻域響應相乘並相加。由於梅爾濾波器有個,因此一幀內通過梅爾濾波器能量有個
(5)計算DCT倒譜
[我的想法。到了這一步,是將傅里葉頻域變換到梅爾頻域內。梅爾頻域的譜線與梅爾濾波器相對應]使用DCT計算梅爾頻域的各譜線能量
def mfccInitFilterBanks(fs, nfft):
    """
    Computes the triangular filterbank for MFCC computation
    (used in the stFeatureExtraction function before the stMFCC function call)
    This function is taken from the scikits.talkbox library (MIT Licence):
    https://pypi.python.org/pypi/scikits.talkbox

    :param fs: sampling rate (Hz)
    :param nfft: number of FFT bins the filters are sampled on
    :return: (fbank, freqs) -- the [nFiltTotal x nfft] filterbank matrix and
             the nFiltTotal + 2 triangle corner frequencies (Hz)

    Fixes vs. the original: the fs < 8000 branch assigned the unused name
    `nlogfil`, so the reduced log-filter count never took effect (and the
    full 27-filter bank then indexed bins beyond Nyquist); `numpy.int` was
    removed in NumPy 1.20+, replaced with the builtin int.
    """
    # Filter bank params: 13 linearly spaced filters followed by
    # geometrically (log) spaced ones.
    lowfreq = 133.33
    linsc = 200/3.
    logsc = 1.0711703
    numLinFiltTotal = 13
    numLogFilt = 27
    if fs < 8000:
        # Fewer log filters at low sampling rates so the bank stays below
        # Nyquist (bug fix: was the dead assignment `nlogfil = 5`).
        numLogFilt = 5
    # Total number of filters
    nFiltTotal = numLinFiltTotal + numLogFilt
    # Compute frequency points of the triangles:
    freqs = numpy.zeros(nFiltTotal + 2)
    freqs[:numLinFiltTotal] = lowfreq + numpy.arange(numLinFiltTotal) * linsc
    freqs[numLinFiltTotal:] = freqs[numLinFiltTotal - 1] * logsc ** numpy.arange(1, numLogFilt + 3)
    heights = 2. / (freqs[2:] - freqs[0:-2])
    # Compute filterbank coeff (in fft domain, in bins)
    fbank = numpy.zeros((nFiltTotal, nfft))
    nfreqs = numpy.arange(nfft) / (1. * nfft) * fs
    for i in range(nFiltTotal):
        lowTrFreq = freqs[i]
        cenTrFreq = freqs[i + 1]
        highTrFreq = freqs[i + 2]
        # Rising edge: bins between the low and centre corner frequencies.
        lid = numpy.arange(numpy.floor(lowTrFreq * nfft / fs) + 1,
                           numpy.floor(cenTrFreq * nfft / fs) + 1,
                           dtype=int)
        lslope = heights[i] / (cenTrFreq - lowTrFreq)
        # Falling edge: bins between the centre and high corner frequencies.
        rid = numpy.arange(numpy.floor(cenTrFreq * nfft / fs) + 1,
                           numpy.floor(highTrFreq * nfft / fs) + 1,
                           dtype=int)
        rslope = heights[i] / (highTrFreq - cenTrFreq)
        fbank[i][lid] = lslope * (nfreqs[lid] - lowTrFreq)
        fbank[i][rid] = rslope * (highTrFreq - nfreqs[rid])
    return fbank, freqs
def stMFCC(X, fbank, n_mfcc_feats):
    """
    Computes the MFCCs of a frame, given the fft mag
    ARGUMENTS:
        X: fft magnitude abs(FFT)
        fbank: filter bank (see mfccInitFilterBanks)
        n_mfcc_feats: number of cepstral coefficients to keep
    RETURN
        ceps: MFCCs (n_mfcc_feats element vector)
    Note: MFCC calculation is, in general, taken from the
        scikits.talkbox library (MIT Licence),
        with a small number of modifications to make it more
        compact and suitable for the pyAudioAnalysis Lib
    """
    # Filterbank energies in the log10 domain, then an orthonormal
    # type-II DCT yields the cepstral coefficients.
    filter_energies = numpy.dot(X, fbank.T) + eps
    log_energies = numpy.log10(filter_energies)
    ceps = dct(log_energies, type=2, norm='ortho', axis=-1)[:n_mfcc_feats]
    return ceps