這個國慶節作死,答應別人在七天內做2個項目。智能客服項目是其中一個,當時是答應給隔壁兄弟團隊做的。他們說實在搞不定了,節後要上線,我就因爲隨口說了一句:“這有什麼難的”,結果禍從口出,這事情就落我頭上了。
錄音識別是智能客服大項目計劃裏面的一部分,簡單說就是客服在跟用戶聊天的時候,實時從聲卡上抓取音頻數據,然後發送給阿里雲-智能語音交互識別成句子文本後,再發送給我們的智能客服助手服務器,生成提示展示給客服人員。聽起來很簡單,我當時也這麼認爲,但做完這個之後,我覺得以後沒啥事還是少去隔壁團隊那兒串門,^^。
這個小程序,步驟上分3步:錄音、識別、提交。
1. 錄音
本來是想着用java來寫,開發能快點,而且java也有相應的AudioSystem音頻處理模塊,應該能輕鬆搞定。但是後來發現AudioSystem只能從麥克風這種音頻輸入設備讀取數據,要想從聲卡抓取音頻數據只能用C++調用WASAPI來獲取。網上也有不少例子,可以拿來借鑑。但這兒當時遇到的最大問題是,我抓取的音頻格式是PCM FLOAT 32位,做音頻處理的時候不方便,而且有些音頻處理程序不支持(比如: java的AudioSystem),所以我轉換成PCM SIGNED 16位。另外,音頻數據在發送給識別服務之前要按照要求轉換成單軌方式,採樣率也要調整成16000Hz。音頻處理比較頭疼,開始打算用ffmpeg來處理,但是這個庫太重,用起來太複雜,後來就自己上網查資料,寫算法轉換的。
另外,網上也有種說法,就是說要在Windows系統上進行錄音,需要打開立體聲混音(Stereo Mix)設備。其實是不需要的,除非你想同時進行聲卡和麥克風錄音。僅僅從聲卡錄音,通過WASAPI就夠了。
2. 識別
將聲卡抓取的音頻數據調整成能被識別的格式之後,就可以調用阿里雲提供的SDK進行發送了。因爲文檔比較齊全,所以這塊兒還是挺順利的。
3. 提交
因爲智能服務接口是RESTful的,所以我這兒只用libcurl來進行HTTP處理。libcurl當時在編譯成靜態庫後,連接時總是報「找不到函數入口」。開始以爲是忘記加extern "C",加了之後還是報錯。最後,查了下資料,按照這個網頁(https://blog.csdn.net/libaineu2004/article/details/79736921)的指導操作了下就好了。
4. 參考代碼
其他模塊都沒什麼難度,主要是音頻處理部分,包括,位深轉換,採樣率轉換,單軌調整。主要還是基於網上的代碼做了少量的修改。
Capture.h
#pragma once
#pragma comment(lib,"avrt.lib")

#include <cstdint>
#include <iostream>
#include <vector>

#include <Audioclient.h>
#include <avrt.h>
#include <mmdeviceapi.h>
// RIFF chunk descriptor of a canonical WAV file.
// Fixed-width types guarantee the 12-byte on-disk layout on every platform;
// the original `unsigned long` is 64 bits on LP64 systems, which would
// corrupt the header.
typedef struct WAVE_HEADER {
    char     fccID[4];   // chunk id, must be "RIFF"
    uint32_t dwSize;     // total file size - 8; filled in last, once the data size is known
    char     fccType[4]; // format tag, must be "WAVE"
} WAVE_HEADER;
// "fmt " sub-chunk: describes the PCM encoding of the samples.
// Fixed-width types keep the required 24-byte layout portable (the original
// `unsigned long` fields are 64-bit on LP64 systems).
typedef struct WAVE_FMT {
    char     fccID[4];         // sub-chunk id, must be "fmt "
    uint32_t dwSize;           // byte size of the rest of this sub-chunk: 16 for PCM
    uint16_t wFormatTag;       // audio format: 1 = integer PCM
    uint16_t wChannels;        // channel count: mono = 1, stereo = 2
    uint32_t dwSamplesPerSec;  // sample rate in Hz
    uint32_t dwAvgBytesPerSec; // == dwSamplesPerSec * wChannels * uiBitsPerSample / 8
    uint16_t wBlockAlign;      // == wChannels * uiBitsPerSample / 8
    uint16_t uiBitsPerSample;  // bits per sample point: 8 or 16
} WAVE_FMT;
// "data" sub-chunk header: immediately precedes the raw sample bytes.
// Fixed-width type keeps the 8-byte layout portable.
typedef struct WAVE_DATA {
    char     fccID[4]; // sub-chunk id, must be "data"
    uint32_t dwSize;   // == NumSamples * wChannels * uiBitsPerSample / 8
} WAVE_DATA;
// Loopback recorder built on WASAPI: captures whatever the default render
// endpoint (the sound card output) is playing, switches the shared-mode mix
// format to signed 16-bit PCM, and post-processes the raw bytes (naive
// resampling, stereo-to-mono) for a speech-recognition backend.
//
// Usage: start() -> repeated cap()/wav() -> stop().
// NOTE(review): no destructor is declared, so the COM interfaces and the
// mix-format allocation leak unless the caller invokes stop() -- confirm.
class Capture
{
public:
// Sets every member to its "not started" value; does no COM work.
Capture();
// Opens the default render device in shared loopback mode and starts the
// capture stream. Returns 0 on success, -1 on any COM failure.
int start();
// Stops the stream and releases everything acquired by start(). Returns 0.
int stop();
// Reads the currently available packet, resamples it to `rate` Hz and
// collapses it to mono, appending/replacing into `buffer`.
// NOTE(review): `channels` is not used by the implementation -- confirm.
int cap(std::vector<BYTE> &buffer, int rate, int channels);
// Wraps the raw PCM bytes in `buffer` with a canonical WAV header in place.
int wav(std::vector<BYTE>& buffer, int rate, int channels);
private:
// In-place rewrite of the mix format from 32-bit float to 16-bit PCM.
bool adjustFormatTo16Bits(WAVEFORMATEX *pwfx);
// Appends one captured packet's bytes to `buffer`; returns bytes added or -1.
int read(std::vector<BYTE> &buffer);
// Integer-ratio resample of 16-bit samples to `rate` Hz.
int resample(std::vector<BYTE> &buffer, int rate);
// Keeps the first channel of each interleaved stereo frame.
int singleChannel(std::vector<BYTE> &buffer);
IAudioCaptureClient * m_pAudioCaptureClient; // packet reader obtained via GetService()
IAudioClient * m_pAudioClient;               // shared-mode loopback client
WAVEFORMATEX * m_pwfx;                       // mix format (CoTaskMem-allocated, freed in stop())
IMMDevice* m_pMMDevice;                      // default render endpoint
size_t m_FrameSize;                          // bytes per frame: (bits/8) * channels
int m_SampleRate;                            // device sample rate cached by start(); -1 before
int m_Channels;                              // device channel count cached by start(); -1 before
};
Capture.cpp
#include "Capture.h"
// Early-return helpers for the COM call sequences below: on failure they
// balance the method's CoInitialize() with CoUninitialize() and report -1.
#define RETURN_ON_ERROR(hr) if(FAILED(hr)){CoUninitialize();return -1;}
// Same, but for a null pointer result.
#define RETURN_ON_NULL(p) if(p==NULL){CoUninitialize();return -1;}
// Same, but for a false boolean result.
#define RETURN_ON_FALSE(b) if(!b){CoUninitialize();return -1;}
// Rewrites the shared-mode mix format in place so the engine delivers
// signed 16-bit PCM instead of 32-bit IEEE float. Handles both the plain
// WAVEFORMATEX tag and the WAVE_FORMAT_EXTENSIBLE wrapper.
// Returns true when a conversion was applied, false when the format was
// left untouched (already PCM, or an unrecognized tag).
bool Capture::adjustFormatTo16Bits(WAVEFORMATEX *pwfx)
{
    if (pwfx->wFormatTag == WAVE_FORMAT_IEEE_FLOAT)
    {
        pwfx->wFormatTag = WAVE_FORMAT_PCM;
    }
    else if (pwfx->wFormatTag == WAVE_FORMAT_EXTENSIBLE)
    {
        PWAVEFORMATEXTENSIBLE pExt = reinterpret_cast<PWAVEFORMATEXTENSIBLE>(pwfx);
        if (!IsEqualGUID(KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, pExt->SubFormat))
            return false;
        pExt->SubFormat = KSDATAFORMAT_SUBTYPE_PCM;
        pExt->Samples.wValidBitsPerSample = 16;
    }
    else
    {
        return false;
    }
    // Common tail: shrink the sample width and recompute the derived fields.
    pwfx->wBitsPerSample = 16;
    pwfx->nBlockAlign = pwfx->nChannels * pwfx->wBitsPerSample / 8;
    pwfx->nAvgBytesPerSec = pwfx->nBlockAlign * pwfx->nSamplesPerSec;
    return true;
}
// Construct an idle capturer: every COM interface pointer starts unset and
// the cached stream parameters are invalid (-1 / 0) until start() runs.
Capture::Capture()
    : m_pAudioCaptureClient(NULL)
    , m_pAudioClient(NULL)
    , m_pwfx(NULL)
    , m_pMMDevice(NULL)
    , m_FrameSize(0)
    , m_SampleRate(-1)
    , m_Channels(-1)
{
}
/*
 * Opens the default render endpoint (the sound card output) in WASAPI
 * shared loopback mode, converts the mix format to signed 16-bit PCM,
 * starts the capture stream and caches frame size / sample rate / channel
 * count for the post-processing helpers.
 * Returns 0 on success, -1 on any COM failure.
 */
int Capture::start() {
    CoInitialize(NULL);
    IMMDeviceEnumerator *pMMDeviceEnumerator = NULL;
    HRESULT hr = CoCreateInstance(__uuidof(MMDeviceEnumerator), NULL, CLSCTX_ALL,
        __uuidof(IMMDeviceEnumerator), (void**)&pMMDeviceEnumerator);
    RETURN_ON_ERROR(hr);
    hr = pMMDeviceEnumerator->GetDefaultAudioEndpoint(eRender, eConsole, &m_pMMDevice);
    /* Release the enumerator before checking hr: the original leaked it
     * whenever GetDefaultAudioEndpoint failed. It is not needed again. */
    pMMDeviceEnumerator->Release();
    RETURN_ON_ERROR(hr);
    hr = m_pMMDevice->Activate(__uuidof(IAudioClient), CLSCTX_ALL, NULL, (void**)&m_pAudioClient);
    RETURN_ON_ERROR(hr);
    hr = m_pAudioClient->GetMixFormat(&m_pwfx);
    RETURN_ON_ERROR(hr);
    /* Convert the float mix format to signed 16-bit PCM in place; a false
     * return means the format was already integer PCM, which is fine.
     * (A dead GetDevicePeriod() query was removed -- its result was never
     * used; Initialize() below passes 0/0 to let the engine pick defaults.) */
    adjustFormatTo16Bits(m_pwfx);
    m_FrameSize = (m_pwfx->wBitsPerSample / 8) * m_pwfx->nChannels;
    hr = m_pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_LOOPBACK, 0, 0, m_pwfx, 0);
    RETURN_ON_ERROR(hr);
    hr = m_pAudioClient->GetService(__uuidof(IAudioCaptureClient), (void**)&m_pAudioCaptureClient);
    RETURN_ON_ERROR(hr);
    hr = m_pAudioClient->Start();
    RETURN_ON_ERROR(hr);
    /* NOTE(review): calling CoUninitialize() while the COM objects above
     * are still in use is suspect -- it works only if something else keeps
     * COM initialized on this thread. Kept as-is to preserve behavior;
     * confirm and consider moving it to stop(). */
    CoUninitialize();
    m_Channels = m_pwfx->nChannels;
    m_SampleRate = m_pwfx->nSamplesPerSec;
    return 0;
}
// Tear down the capture session: stop the audio stream, then drop the COM
// references and free the mix-format block allocated by GetMixFormat().
// Safe to call repeatedly or before start(); always reports success (0).
int Capture::stop() {
    if (m_pAudioClient != NULL) {
        m_pAudioClient->Stop();
        m_pAudioClient->Release();
        m_pAudioClient = NULL;
    }
    if (m_pwfx) {
        CoTaskMemFree(m_pwfx);   // GetMixFormat() allocates with CoTaskMemAlloc
        m_pwfx = NULL;
    }
    if (m_pAudioCaptureClient) {
        m_pAudioCaptureClient->Release();
        m_pAudioCaptureClient = NULL;
    }
    return 0;
}
/*
 * One capture step: append the currently available packet to `buffer`,
 * resample to `rate` Hz and collapse to mono.
 * Returns the resulting byte count, or -1 on failure (the original
 * ignored the error codes of all three stages and returned stale data).
 * NOTE(review): `channels` is unused -- the output is always mono. Kept in
 * the signature for compatibility; confirm with callers.
 */
int Capture::cap(std::vector<BYTE> &buffer, int rate, int channels)
{
    if (read(buffer) < 0) return -1;
    if (resample(buffer, rate) < 0) return -1;
    if (singleChannel(buffer) < 0) return -1;
    return (int)buffer.size();
}
/*
 * Pulls the next available packet from the capture client and appends its
 * raw PCM bytes to `buffer`.
 * Returns the number of bytes appended (0 when no packet was ready), or -1
 * on a COM failure. (Unused locals dwWaitResult / nNextPacketSize removed.)
 */
int Capture::read(std::vector<BYTE> &buffer) {
    BYTE *pData = NULL;
    UINT32 framesAvailable = 0;
    DWORD flags = 0;
    CoInitialize(NULL);
    HRESULT hr = m_pAudioCaptureClient->GetBuffer(&pData, &framesAvailable, &flags, NULL, NULL);
    RETURN_ON_ERROR(hr);
    if (0 != framesAvailable)
    {
        /* NOTE(review): AUDCLNT_BUFFERFLAGS_SILENT in `flags` is not
         * checked; silent packets are copied verbatim -- confirm the
         * recognizer tolerates that. */
        buffer.insert(buffer.end(), pData, pData + framesAvailable * m_FrameSize);
    }
    m_pAudioCaptureClient->ReleaseBuffer(framesAvailable);
    CoUninitialize();
    return framesAvailable * m_FrameSize;
}
int Capture::resample(std::vector<BYTE>& buffer, int rate)
{
if (m_SampleRate == rate)return buffer.size();
if (m_pwfx == nullptr)return -1;
std::vector<BYTE> resultBuffer;
int bytes = m_pwfx->wBitsPerSample/8;
int sampleCount = buffer.size() / bytes;
int srcRate = m_pwfx->nSamplesPerSec;
int dstRate = rate;
int rateLen = srcRate / dstRate;
if (rateLen == 1) return buffer.size();
if (rateLen > 0) {
short tempRead = 0;
short tempSum = 0;
int flag = 0;
for (int i = 0; i < sampleCount; i++) {
memcpy(&tempRead, buffer.data()+i*bytes, bytes);
tempSum = tempSum + tempRead;
flag++;
if (flag == rateLen)
{
flag = 0;
tempSum = tempSum / rateLen;
resultBuffer.insert(resultBuffer.end(), ((BYTE*)&tempSum), ((BYTE*)&tempSum) + bytes);
tempSum = 0;
}
}
}
else {
rateLen = dstRate / srcRate;
int tempRead1;
int tempRead2;
int tempSum;
int tempAvgDiff;
int tempWrite;
int flag;
for (int i = 0; i < (sampleCount-1); i++) {
memcpy(&tempRead1, buffer.data() + i * bytes, bytes);
memcpy(&tempRead2, buffer.data() + i * bytes+ bytes, bytes);
tempSum = tempRead2 - tempRead1;
tempAvgDiff = tempSum / rateLen;
tempWrite = tempRead1;
flag = rateLen;
do
{
tempWrite += tempAvgDiff;
resultBuffer.insert(resultBuffer.end(), ((BYTE*)&tempWrite), ((BYTE*)&tempWrite) + bytes);
} while (--flag);
}
}
buffer.swap(resultBuffer);
return buffer.size();
}
int Capture::singleChannel(std::vector<BYTE>& buffer)
{
if (m_Channels == 1) return buffer.size();
size_t len = buffer.size() / 2;
int bytes = m_pwfx->wBitsPerSample / 8;
//std::vector<BYTE> singleBuffer(len);
BYTE *singleBuffer = new BYTE[len];
//singleBuffer.reserve(len);
for (int i = 0; i < len/bytes; i++) {
//singleBuffer.insert(singleBuffer.end(), buffer.data() + i*bytes * 2, buffer.data() + i*bytes * 2 + bytes);
memcpy(singleBuffer+i*bytes, buffer.data()+i*(2*bytes), bytes);
}
buffer.assign(singleBuffer, singleBuffer + len);
delete[] singleBuffer;
return buffer.size();
}
int Capture::wav(std::vector<BYTE>& buffer, int rate, int channels)
{
std::vector<BYTE> wavBuffer;
WAVE_HEADER pcmHEADER;
WAVE_FMT pcmFMT;
WAVE_DATA pcmDATA;
unsigned short m_pcmData;
int dataSize = buffer.size();
/* WAVE_HEADER */
memcpy(pcmHEADER.fccID, "RIFF", strlen("RIFF"));
memcpy(pcmHEADER.fccType, "WAVE", strlen("WAVE"));
pcmHEADER.dwSize = 36 + dataSize;
/* WAVE_FMT */
memcpy(pcmFMT.fccID, "fmt ", strlen("fmt "));
pcmFMT.dwSize = 16;
pcmFMT.wFormatTag = 1;
pcmFMT.wChannels = channels;
pcmFMT.dwSamplesPerSec = rate;
pcmFMT.uiBitsPerSample = 16;
/* ==dwSamplesPerSec*wChannels*uiBitsPerSample/8 */
pcmFMT.dwAvgBytesPerSec = pcmFMT.dwSamplesPerSec*pcmFMT.wChannels*pcmFMT.uiBitsPerSample / 8;
/* ==wChannels*uiBitsPerSample/8 */
pcmFMT.wBlockAlign = pcmFMT.wChannels*pcmFMT.uiBitsPerSample / 8;
/* WAVE_DATA */
memcpy(pcmDATA.fccID, "data", strlen("data"));
pcmDATA.dwSize = dataSize;
wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmHEADER, ((BYTE*)&pcmHEADER) + sizeof(WAVE_HEADER));
wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmFMT, ((BYTE*)&pcmFMT) + sizeof(WAVE_FMT));
wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmDATA, ((BYTE*)&pcmDATA) + sizeof(WAVE_DATA));
wavBuffer.insert(wavBuffer.end(), buffer.begin(), buffer.end());
buffer.swap(wavBuffer);
return buffer.size();
}