OpenCV機器學習(1):貝葉斯分類器實現代碼分析

OpenCV的機器學習類定義在ml.hpp文件中,基礎類是CvStatModel,其他各種分類器從這裏繼承而來。

今天研究CvNormalBayesClassifier分類器。

1.類定義

在ml.hpp中有以下類定義:

// Normal Bayes classifier declared in ml.hpp; derives from CvStatModel.
// The API comes in two flavours (legacy CvMat* and cv::Mat): construct,
// train, predict, plus clear() and the write()/read() serialization hooks.
class CV_EXPORTS_W CvNormalBayesClassifier : public CvStatModel
{
public:
    // Creates an empty (untrained) model.
    CV_WRAP CvNormalBayesClassifier();
    virtual ~CvNormalBayesClassifier();

    // Constructs the model and immediately trains it on the given data
    // (the implementation forwards to train()).
    CvNormalBayesClassifier( const CvMat* trainData, const CvMat* responses,
        const CvMat* varIdx=0, const CvMat* sampleIdx=0 );

    // Trains the model, or updates an already trained one when update=true.
    // varIdx / sampleIdx are optional (default 0).
    virtual bool train( const CvMat* trainData, const CvMat* responses,
        const CvMat* varIdx = 0, const CvMat* sampleIdx=0, bool update=false );

    // Classifies every row of `samples`. The return value is the label of
    // row 0; `results` is required when more than one row is passed.
    virtual float predict( const CvMat* samples, CV_OUT CvMat* results=0 ) const;
    CV_WRAP virtual void clear();

    // cv::Mat counterparts of the constructor / train / predict above.
    CV_WRAP CvNormalBayesClassifier( const cv::Mat& trainData, const cv::Mat& responses,
                            const cv::Mat& varIdx=cv::Mat(), const cv::Mat& sampleIdx=cv::Mat() );
    CV_WRAP virtual bool train( const cv::Mat& trainData, const cv::Mat& responses,
                       const cv::Mat& varIdx = cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(),
                       bool update=false );
    CV_WRAP virtual float predict( const cv::Mat& samples, CV_OUT cv::Mat* results=0 ) const;

    // Serialization to / from a CvFileStorage node.
    virtual void write( CvFileStorage* storage, const char* name ) const;
    virtual void read( CvFileStorage* storage, CvFileNode* node );

protected:
    // var_count: number of features used by the model (size of the per-class
    // matrices). var_all: column count that predict() requires of `samples`.
    int     var_count, var_all;
    // Optional index list mapping used feature j to a column of the input sample.
    CvMat*  var_idx;
    // 1 x nclasses integer row: original label for each internal class index.
    CvMat*  cls_labels;
    // The six arrays below each hold one CvMat* per class; train() allocates
    // them as one contiguous pointer block starting at `count`.
    CvMat** count;            // per-feature number of accumulated samples
    CvMat** sum;              // per-feature sum of values
    CvMat** productsum;       // var_count x var_count sums of products x_i * x_j
    CvMat** avg;              // per-feature mean
    CvMat** inv_eigen_values; // reciprocals of the (clamped) SVD values of the covariance
    CvMat** cov_rotate_mats;  // matrix returned by cvSVD of the covariance (CV_SVD_U_T)
    // 1 x nclasses row: log-determinant term of each class covariance.
    CvMat*  c;
};

2.示例

此類使用方法如下(以下代碼引用自他人,出處已忘記,在此致歉):

//openCV中貝葉斯分類器的API函數用法舉例
//運行環境:win7 + VS2005 + openCV2.4.5

#include "global_include.h"

using namespace std;
using namespace cv;

// Training set: 10 samples, one per row. Column 0 holds the class label of the
// sample; columns 1..12 hold its 12-dimensional feature vector.
double inputArr[10][13] =
{
    {  1, 0.708333,  1,  1,        -0.320755, -0.105023, -1,  1, -0.419847, -1, -0.225806,  0,  1        },
    { -1, 0.583333, -1,  0.333333, -0.603774,  1,        -1,  1,  0.358779, -1, -0.483871,  0, -1        },
    {  1, 0.166667,  1, -0.333333, -0.433962, -0.383562, -1, -1,  0.0687023, -1, -0.903226, -1, -1       },
    { -1, 0.458333,  1,  1,        -0.358491, -0.374429, -1, -1, -0.480916,  1, -0.935484,  0, -0.333333 },
    { -1, 0.875,    -1, -0.333333, -0.509434, -0.347032, -1,  1, -0.236641,  1, -0.935484, -1, -0.333333 },
    { -1, 0.5,       1,  1,        -0.509434, -0.767123, -1, -1,  0.0534351, -1, -0.870968, -1, -1       },
    {  1, 0.125,     1,  0.333333, -0.320755, -0.406393,  1,  1,  0.0839695,  1, -0.806452,  0, -0.333333 },
    {  1, 0.25,      1,  1,        -0.698113, -0.484018, -1,  1,  0.0839695,  1, -0.612903,  0, -0.333333 },
    {  1, 0.291667,  1,  1,        -0.132075, -0.237443, -1,  1,  0.51145,  -1, -0.612903,  0,  0.333333 },
    {  1, 0.416667, -1,  1,         0.0566038, 0.283105, -1,  1,  0.267176, -1,  0.290323,  0,  1        }
};

// Feature vector of the single test sample (12 values, no label column).
double testArr[12] =
{
    0.25, 1, 1,
    -0.226415, -0.506849, -1,
    -1, 0.374046, -1,
    -0.83871, 0, -1
};


int _tmain(int argc, _TCHAR* argv[])
{
    Mat trainData(10, 12, CV_32FC1);//構建訓練樣本的特徵向量
    for (int i=0; i<10; i++)
    {
        for (int j=0; j<12; j++)
        {
            trainData.at<float>(i, j) = inputArr[i][j+1];
        }
    }

    Mat trainResponse(10, 1, CV_32FC1);//構建訓練樣本的類別標籤
    for (int i=0; i<10; i++)
    {
        trainResponse.at<float>(i, 0) = inputArr[i][0];
    }

    CvNormalBayesClassifier nbc;
    bool trainFlag = nbc.train(trainData, trainResponse);//進行貝葉斯分類器訓練
    if (trainFlag)
    {
        cout<<"train over..."<<endl;
        nbc.save("normalBayes.txt");
    }
    else
    {
        cout<<"train error..."<<endl;
        system("pause");
        exit(-1);
    }


    CvNormalBayesClassifier testNbc;
    testNbc.load("normalBayes.txt");

    Mat testSample(1, 12, CV_32FC1);//構建測試樣本
    for (int i=0; i<12; i++)
    {
        testSample.at<float>(0, i) = testArr[i];
    }

    float flag = testNbc.predict(testSample);//進行測試
    cout<<"flag = "<<flag<<endl;

    system("pause");
    return 0;
}

3.步驟

兩步走:

1.調用train函數訓練分類器;

2.調用predict函數,判定測試樣本的類別。

以上示例代碼還演示了怎樣使用save和load函數,使得訓練好的分類器可以保存到文本文件中。

4.初始化

接下來,看CvNormalBayesClassifier類的無參數初始化:

// Default constructor: produces an untrained model. Every counter is zero and
// every matrix pointer is null; the default model name is set to "my_nb".
CvNormalBayesClassifier::CvNormalBayesClassifier()
    : var_count(0),
      var_all(0),
      var_idx(0),
      cls_labels(0),
      count(0),
      sum(0),
      productsum(0),
      avg(0),
      inv_eigen_values(0),
      cov_rotate_mats(0),
      c(0)
{
    default_model_name = "my_nb";
}
還有另一種帶參數的初始化形式:
// Training constructor: puts the object into the same empty state as the
// default constructor, then trains it on the supplied data. The result of
// train() is not inspected here.
CvNormalBayesClassifier::CvNormalBayesClassifier(
    const CvMat* _train_data, const CvMat* _responses,
    const CvMat* _var_idx, const CvMat* _sample_idx )
    : var_count(0),
      var_all(0),
      var_idx(0),
      cls_labels(0),
      count(0),
      sum(0),
      productsum(0),
      avg(0),
      inv_eigen_values(0),
      cov_rotate_mats(0),
      c(0)
{
    default_model_name = "my_nb";

    train( _train_data, _responses, _var_idx, _sample_idx );
}
可見,帶參數形式糅合了類的初始化和train函數。

另外,以Mat參數形式的對應函數版本,功能是一致的,只不過爲了體現2.0以後版本的C++特性罷了。如下:

    // cv::Mat overload of the training constructor; varIdx and sampleIdx
    // default to empty matrices.
    CV_WRAP CvNormalBayesClassifier( const cv::Mat& trainData, const cv::Mat& responses,
                            const cv::Mat& varIdx=cv::Mat(), const cv::Mat& sampleIdx=cv::Mat() );
    // cv::Mat overload of train(); varIdx and sampleIdx default to empty
    // matrices and update defaults to false.
    CV_WRAP virtual bool train( const cv::Mat& trainData, const cv::Mat& responses,
                       const cv::Mat& varIdx = cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(),
                       bool update=false );
    // cv::Mat overload of predict(); results is an optional output (default 0).
    CV_WRAP virtual float predict( const cv::Mat& samples, CV_OUT cv::Mat* results=0 ) const;

5.訓練

下面開始分析train函數,這裏以CvMat格式參數的版本爲例,即:

// Signature of the legacy CvMat* overload analysed below. Only trainData and
// responses are mandatory; varIdx, sampleIdx and update have defaults.
bool train( const CvMat* trainData, const CvMat* responses,const CvMat* varIdx = 0, const CvMat* sampleIdx=0, bool update=false );

在進入該函數之前,還要先回頭看看CvNormalBayesClassifier類有哪些數據成員:

protected:
    // var_count: number of features the model actually uses (the per-class
    // matrices are sized by it). var_all: number of columns predict() requires
    // of an input sample; the two coincide when no varIdx is given.
    int     var_count, var_all;
    // Index list selecting the feature subset: predict() reads column
    // var_idx[j] of the sample for used feature j (null = use columns as-is).
    CvMat*  var_idx;
    // 1 x nclasses integer row holding the original label of each internal
    // class index (this is the label list, not the number of classes).
    CvMat*  cls_labels;
    // count[0..nclasses-1]: each a 1 x var_count CV_32SC1 matrix holding, per
    // feature, the number of training samples accumulated for that class.
    CvMat** count;
    // sum[0..nclasses-1]: each a 1 x var_count CV_64FC1 matrix holding, per
    // feature, the sum of the training values of that class.
    CvMat** sum;
    // productsum[0..nclasses-1]: each a var_count x var_count CV_64FC1 matrix
    // holding the sums of products x_i * x_j over the samples of that class.
    CvMat** productsum;
    // avg[0..nclasses-1]: each a 1 x var_count CV_64FC1 matrix holding the
    // per-feature mean of that class (sum / count).
    CvMat** avg;
    // inv_eigen_values[0..nclasses-1]: each a 1 x var_count CV_64FC1 matrix
    // holding the reciprocals of the SVD values of the class covariance
    // matrix, after those values were clamped from below to FLT_EPSILON.
    CvMat** inv_eigen_values;
    // cov_rotate_mats[0..nclasses-1]: each a var_count x var_count CV_64FC1
    // matrix filled by cvSVD of the class covariance with the CV_SVD_U_T flag.
    CvMat** cov_rotate_mats;
    // 1 x nclasses CV_64FC1 row: log of the product of the clamped SVD values
    // of each class covariance (-700 if that product is not positive).
    CvMat*  c;

這些數據成員,怎樣使用呢?在train函數中見分曉:

bool CvNormalBayesClassifier::train( const CvMat* _train_data, const CvMat* _responses,
									const CvMat* _var_idx, const CvMat* _sample_idx, bool update )
{
	const float min_variation = FLT_EPSILON;
	bool result = false;
	CvMat* responses   = 0;
	const float** train_data = 0;
	CvMat* __cls_labels = 0;
	CvMat* __var_idx = 0;
	CvMat* cov = 0;

	CV_FUNCNAME( "CvNormalBayesClassifier::train" );

	__BEGIN__;

	int cls, nsamples = 0, _var_count = 0, _var_all = 0, nclasses = 0;
	int s, c1, c2;
	const int* responses_data;

	//1.整理訓練數據
	CV_CALL( cvPrepareTrainData( 0,
		_train_data, CV_ROW_SAMPLE, _responses, CV_VAR_CATEGORICAL,
		_var_idx, _sample_idx, false, &train_data,
		&nsamples, &_var_count, &_var_all, &responses,
		&__cls_labels, &__var_idx ));

	if( !update )	//如果是初始訓練數據
	{
		const size_t mat_size = sizeof(CvMat*);
		size_t data_size;

		clear();

		var_idx = __var_idx;
		cls_labels = __cls_labels;
		__var_idx = __cls_labels = 0;
		var_count = _var_count;
		var_all = _var_all;

		nclasses = cls_labels->cols;
		data_size = nclasses*6*mat_size;

		CV_CALL( count = (CvMat**)cvAlloc( data_size ));
		memset( count, 0, data_size );			//count[cls]存儲第cls類每個屬性變量個數
									  
		sum             = count      + nclasses;//sum[cls]存儲第cls類每個屬性取值的累加和
		productsum      = sum        + nclasses;//productsum[cls]存儲第cls類的協方差矩陣的乘積項sum(XiXj),cov(Xi,Xj)=sum(XiXj)-sum(Xi)E(Xj)
		avg             = productsum + nclasses;//avg[cls]存儲第cls類的每個變量均值
		inv_eigen_values= avg        + nclasses;//inv_eigen_values[cls]存儲第cls類的協方差矩陣的特徵值
		cov_rotate_mats = inv_eigen_values         + nclasses;//存儲第cls類的矩陣的特徵值對應的特徵向量

		CV_CALL( c = cvCreateMat( 1, nclasses, CV_64FC1 ));
		
		for( cls = 0; cls < nclasses; cls++ )	//對所有類別
		{
			CV_CALL(count[cls]            = cvCreateMat( 1, var_count, CV_32SC1 ));
			CV_CALL(sum[cls]              = cvCreateMat( 1, var_count, CV_64FC1 ));
			CV_CALL(productsum[cls]       = cvCreateMat( var_count, var_count, CV_64FC1 ));
			CV_CALL(avg[cls]              = cvCreateMat( 1, var_count, CV_64FC1 ));
			CV_CALL(inv_eigen_values[cls] = cvCreateMat( 1, var_count, CV_64FC1 ));
			CV_CALL(cov_rotate_mats[cls]  = cvCreateMat( var_count, var_count, CV_64FC1 ));
			CV_CALL(cvZero( count[cls] ));
			CV_CALL(cvZero( sum[cls] ));
			CV_CALL(cvZero( productsum[cls] ));
			CV_CALL(cvZero( avg[cls] ));
			CV_CALL(cvZero( inv_eigen_values[cls] ));
			CV_CALL(cvZero( cov_rotate_mats[cls] ));
		}
	}
	else	//如果是更新訓練數據
	{
		// check that the new training data has the same dimensionality etc.
		if( _var_count != var_count || _var_all != var_all || !((!_var_idx && !var_idx) ||
			(_var_idx && var_idx && cvNorm(_var_idx,var_idx,CV_C) < DBL_EPSILON)) )
			CV_ERROR( CV_StsBadArg,
			"The new training data is inconsistent with the original training data" );

		if( cls_labels->cols != __cls_labels->cols ||
			cvNorm(cls_labels, __cls_labels, CV_C) > DBL_EPSILON )
			CV_ERROR( CV_StsNotImplemented,
			"In the current implementation the new training data must have absolutely "
			"the same set of class labels as used in the original training data" );

		nclasses = cls_labels->cols;
	}

	responses_data = responses->data.i;
	CV_CALL( cov = cvCreateMat( _var_count, _var_count, CV_64FC1 ));

	//2.處理訓練數據,計算每一類的
	// process train data (count, sum , productsum) 
	for( s = 0; s < nsamples; s++ )
	{
		cls = responses_data[s];
		int* count_data = count[cls]->data.i;
		double* sum_data = sum[cls]->data.db;
		double* prod_data = productsum[cls]->data.db;
		const float* train_vec = train_data[s];

		for( c1 = 0; c1 < _var_count; c1++, prod_data += _var_count )
		{
			double val1 = train_vec[c1];
			sum_data[c1] += val1;
			count_data[c1]++;
			for( c2 = c1; c2 < _var_count; c2++ )
				prod_data[c2] += train_vec[c2]*val1;
		}
	}

	//計算每一類的每個屬性平均值、協方差矩陣
	// calculate avg, covariance matrix, c
	for( cls = 0; cls < nclasses; cls++ )	//對每一類
	{
		double det = 1;
		int i, j;
		CvMat* w = inv_eigen_values[cls];
		int* count_data = count[cls]->data.i;
		double* avg_data = avg[cls]->data.db;
		double* sum1 = sum[cls]->data.db;

		cvCompleteSymm( productsum[cls], 0 );

		for( j = 0; j < _var_count; j++ )	//計算當前類別cls的每個變量屬性值的平均值
		{
			int n = count_data[j];
			avg_data[j] = n ? sum1[j] / n : 0.;
		}

		count_data = count[cls]->data.i;
		avg_data = avg[cls]->data.db;
		sum1 = sum[cls]->data.db;

		for( i = 0; i < _var_count; i++ )//計算當前類別cls的變量協方差矩陣,矩陣大小爲_var_count * _var_count,注意協方差矩陣對稱。
		{
			double* avg2_data = avg[cls]->data.db;
			double* sum2 = sum[cls]->data.db;
			double* prod_data = productsum[cls]->data.db + i*_var_count;
			double* cov_data = cov->data.db + i*_var_count;
			double s1val = sum1[i];
			double avg1 = avg_data[i];
			int _count = count_data[i];

			for( j = 0; j <= i; j++ )
			{
				double avg2 = avg2_data[j];
				double cov_val = prod_data[j] - avg1 * sum2[j] - avg2 * s1val + avg1 * avg2 * _count;
				cov_val = (_count > 1) ? cov_val / (_count - 1) : cov_val;
				cov_data[j] = cov_val;
			}
		}

		CV_CALL( cvCompleteSymm( cov, 1 ));
		CV_CALL( cvSVD( cov, w, cov_rotate_mats[cls], 0, CV_SVD_U_T ));
		CV_CALL( cvMaxS( w, min_variation, w ));
		for( j = 0; j < _var_count; j++ )
			det *= w->data.db[j];

		CV_CALL( cvDiv( NULL, w, w ));
		c->data.db[cls] = det > 0 ? log(det) : -700;
	}

	result = true;

	__END__;

	if( !result || cvGetErrStatus() < 0 )
		clear();

	cvReleaseMat( &cov );
	cvReleaseMat( &__cls_labels );
	cvReleaseMat( &__var_idx );
	cvFree( &train_data );

	return result;
}
訓練部分就此完成。

6.預測

下面看用於預測的predict函數的實現代碼:

float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results ) const
{
    float value = 0;

    if( !CV_IS_MAT(samples) || CV_MAT_TYPE(samples->type) != CV_32FC1 || samples->cols != var_all )
        CV_Error( CV_StsBadArg,
        "The input samples must be 32f matrix with the number of columns = var_all" );

    if( samples->rows > 1 && !results )
        CV_Error( CV_StsNullPtr,
        "When the number of input samples is >1, the output vector of results must be passed" );

    if( results )
    {
        if( !CV_IS_MAT(results) || (CV_MAT_TYPE(results->type) != CV_32FC1 &&
        CV_MAT_TYPE(results->type) != CV_32SC1) ||
        (results->cols != 1 && results->rows != 1) ||
        results->cols + results->rows - 1 != samples->rows )
        CV_Error( CV_StsBadArg, "The output array must be integer or floating-point vector "
        "with the number of elements = number of rows in the input matrix" );
    }

    const int* vidx = var_idx ? var_idx->data.i : 0;

    cv::parallel_for(cv::BlockedRange(0, samples->rows), predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
                                                                      vidx, cls_labels, results, &value, var_count
    ));

    return value;
}
可以發現,預測部分核心代碼是:
// Runs predict_body over the row range [0, samples->rows). The functor is
// given the model matrices, the samples, the optional results vector and a
// pointer to `value`, which receives the label of row 0.
cv::parallel_for(cv::BlockedRange(0, samples->rows), predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
                                                                      vidx, cls_labels, results, &value, var_count));
parallel_for是用於並行支持的,可能會調用tbb模塊。predict_body則是一個結構體,內部的()符號被重載,實現預測功能。其完整定義如下:

// Functor handed to cv::parallel_for by CvNormalBayesClassifier::predict.
// operator() classifies the sample rows [range.begin(), range.end()).
//
// For every class a score is computed as
//     c[cls] + sum_j d_j^2 * inv_eigen_values[cls][j]
// where d is the (mean - sample) difference multiplied by cov_rotate_mats[cls]
// via cvGEMM. The class with the smallest score wins. No other per-class term
// (such as a class prior) enters the score.
struct predict_body
{
    // Stores the borrowed model/sample/result pointers; nothing is copied.
    predict_body(CvMat* _c, CvMat** _cov_rotate_mats, CvMat** _inv_eigen_values, CvMat** _avg,
                const CvMat* _samples, const int* _vidx, CvMat* _cls_labels,
                CvMat* _results, float* _value, int _var_count1)
        : c(_c),
          cov_rotate_mats(_cov_rotate_mats),
          inv_eigen_values(_inv_eigen_values),
          avg(_avg),
          samples(_samples),
          vidx(_vidx),
          cls_labels(_cls_labels),
          results(_results),
          value(_value),
          var_count1(_var_count1)
    {
    }

    CvMat* c;                   // per-class constant term of the score
    CvMat** cov_rotate_mats;    // per-class matrix applied to the difference vector
    CvMat** inv_eigen_values;   // per-class weights of the squared components
    CvMat** avg;                // per-class mean vectors
    const CvMat* samples;       // input samples, one per row
    const int* vidx;            // optional feature -> sample column map (may be null)
    CvMat* cls_labels;          // class index -> label

    CvMat* results;             // optional output vector (may be null)
    float* value;               // receives the label of sample row 0
    int var_count1;             // number of features used by the model

    void operator()( const cv::BlockedRange& range ) const
    {
        const int class_count = cls_labels->cols;
        const int feature_count = avg[0]->cols;

        // Element type and element stride of the output vector, if any.
        int out_type = 0, out_step = 0;
        if (results)
        {
            out_type = CV_MAT_TYPE(results->type);
            out_step = CV_IS_MAT_CONT(results->type) ? 1 : results->step/CV_ELEM_SIZE(out_type);
        }

        // Scratch row vector that holds the difference to the class mean.
        cv::AutoBuffer<double> buffer(class_count + var_count1);
        CvMat diff = cvMat( 1, var_count1, CV_64FC1, &buffer[0] );

        // NOTE(review): the winning class index is deliberately kept outside
        // the sample loop and is not reset per sample, as in the original.
        int best_cls = -1;

        for (int row = range.begin(); row < range.end(); row += 1)
        {
            const float* x = (const float*)(samples->data.ptr + samples->step*row);
            double best_score = FLT_MAX;

            for (int cls = 0; cls < class_count; cls++)
            {
                const double* mean = avg[cls]->data.db;
                const double* weight = inv_eigen_values[cls]->data.db;
                double score = c->data.db[cls];

                // cov = u w u'  -->  cov^(-1) = u w^(-1) u'
                // Difference between the class mean and the sample.
                for (int j = 0; j < feature_count; j++)
                    diff.data.db[j] = mean[j] - x[vidx ? vidx[j] : j];

                cvGEMM( &diff, cov_rotate_mats[cls], 1, 0, 0, &diff, CV_GEMM_B_T );

                // Weighted sum of squared components.
                for (int j = 0; j < feature_count; j++)
                {
                    const double d = diff.data.db[j];
                    score += d*d*weight[j];
                }

                // Smaller score is better.
                // probability = exp( -0.5 * score )
                if (score < best_score)
                {
                    best_cls = cls;
                    best_score = score;
                }
            }

            const int label = cls_labels->data.i[best_cls];
            if (results)
            {
                if (out_type == CV_32SC1)
                    results->data.i[row*out_step] = label;
                else
                    results->data.fl[row*out_step] = (float)label;
            }
            if (row == 0)
                *value = (float)label;
        }
    }
};
好啦,預測部分至此完成。

但有一個小小疑問:predict部分的實現代碼中似乎沒有先驗概率參與計算。按貝葉斯公式,後驗概率應爲 $p(w \mid x) = \dfrac{p(w)\,p(x \mid w)}{p(x)}$,但這裏只看到了計算 $p(x \mid w)$ 的部分,沒有 $p(w)$ 的身影,不知道爲何,盼高人指點。

貝葉斯代碼分析完成。









發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章