基於樸素貝葉斯分類器的文本分類算法(C語言)

#include <stdint.h> //intptr_t
#include <stdio.h>
#include <stdlib.h> //_MAX_PATH, system()
#include <string.h>

#include <direct.h> //_getcwd(), _chdir()
#include <io.h> //_finddata_t, _findfirst(), _findnext(), _findclose()


/*
 * Words of the text being classified, filled in by SplitToWord().
 * Holds at most 1000 words of at most 19 characters each (plus terminator).
 */
char vocabulary[1000][20];

/*
 * Splits the text to classify into words and stores them in vocabulary.
 *
 * Words are separated by any of: comma, space, full stop, newline.
 * The input buffer is modified in place (strtok writes terminators into it).
 *
 * A word longer than one vocabulary row is truncated to fit, and splitting
 * stops once every vocabulary row is in use, so the table is never overrun.
 *
 * @param text  NUL-terminated, writable text; NULL is treated as empty.
 * @return      number of words stored in vocabulary.
 */
int SplitToWord(char text[])
{
    const size_t maxWords = sizeof vocabulary / sizeof vocabulary[0];
    const char seps[] = ", .\n";
    size_t count = 0;
    char *word;

    if (text == NULL)
        return 0;

    for (word = strtok(text, seps);
         word != NULL && count < maxWords;
         word = strtok(NULL, seps))
    {
        /* snprintf always terminates and truncates instead of overflowing. */
        snprintf(vocabulary[count], sizeof vocabulary[count], "%s", word);
        count++;
    }

    return (int)count;
}




/*
 * Counts the entries in the current working directory whose names match
 * the pattern "*.txt".
 *
 * @return  number of matching entries; 0 when nothing matches or the
 *          search cannot be started.
 */
int CountDirectory()
{
    int count = 0; /* number of matches seen so far */
    struct _finddata_t fileinfo;
    /* _findfirst returns an intptr_t handle; a long would truncate it on
       64-bit Windows. */
    intptr_t hFile = _findfirst("*.txt", &fileinfo);

    if (hFile == -1)
        return 0;

    do
    {
        count++;
    } while (_findnext(hFile, &fileinfo) == 0);

    /* Release the search handle; the original leaked one per call. */
    _findclose(hFile);
    return count;
}




/*
 * Reads the next word from fp into buffer.
 *
 * Words are delimited by space, tab, or newline; leading delimiters are
 * skipped so that runs of whitespace never produce empty or
 * whitespace-prefixed words. Characters that do not fit in the buffer are
 * consumed and dropped.
 *
 * @return  1 if a word was stored in buffer, 0 at end of file.
 */
static int ReadTrainingWord(FILE *fp, char *buffer, size_t capacity)
{
    size_t length = 0;
    int ch; /* int, not char, so EOF is distinguishable from every byte value */

    do
    {
        ch = fgetc(fp);
    } while (ch == ' ' || ch == '\t' || ch == '\n');

    if (ch == EOF)
        return 0;

    do
    {
        if (length + 1 < capacity)
            buffer[length++] = (char)ch;
        ch = fgetc(fp);
    } while (ch != EOF && ch != ' ' && ch != '\t' && ch != '\n');

    buffer[length] = '\0';
    return 1;
}

/*
 * Computes the scaled product of P(word | class) over the first wordCount
 * entries of vocabulary, using every "*.txt" file in the current working
 * directory as the training text of the class.
 *
 * Each factor is (occurrences of the word + 1) / (wordCount + total words
 * in the training files), multiplied by 300. The factor 300 is applied
 * once per word, so for a given wordCount it scales every result equally.
 * NOTE(review): the product of many float factors can still overflow or
 * underflow for long texts; accumulating logarithms would be more robust.
 *
 * Every training file is read once and each of its words is compared
 * against all vocabulary entries.
 *
 * @param wordCount  number of valid entries in vocabulary; values larger
 *                   than the vocabulary table are clamped to its size.
 * @return           the scaled product; 1 when wordCount is not positive.
 *
 * Prints a message and terminates the process with a failure status if a
 * matched training file cannot be opened.
 */
float CalculateWordProbability(int wordCount)
{
    const float scale = 300.0f;
    const int maxWords = (int)(sizeof vocabulary / sizeof vocabulary[0]);
    int countSame[sizeof vocabulary / sizeof vocabulary[0]] = {0}; /* per-word occurrences in the training text */
    int countAll = 0;            /* total words in the training text */
    float wordProbability = 1;   /* running product */
    char keyword[1024];
    struct _finddata_t fileinfo;
    intptr_t hFile;
    int j;

    if (wordCount > maxWords)
        wordCount = maxWords;

    hFile = _findfirst("*.txt", &fileinfo);
    if (hFile != -1)
    {
        do
        {
            FILE *fp = fopen(fileinfo.name, "r");

            if (fp == NULL)
            {
                printf("Sorry!Cannot open the file!\n");
                exit(EXIT_FAILURE);
            }

            while (ReadTrainingWord(fp, keyword, sizeof keyword))
            {
                countAll++;
                for (j = 0; j < wordCount; j++)
                {
                    if (strcmp(keyword, vocabulary[j]) == 0)
                        countSame[j]++;
                }
            }
            fclose(fp);
        } while (_findnext(hFile, &fileinfo) == 0);

        _findclose(hFile);
    }

    /* Add-one smoothing keeps a word that never occurs from zeroing the product. */
    for (j = 0; j < wordCount; j++)
        wordProbability *= (float)(countSame[j] + 1) / (float)(wordCount + countAll) * scale;

    return wordProbability;
}
  


enum { CLASS_COUNT = 10, CLASS_NAME_SIZE = 20 };

/*
 * Loads newline-separated class names from "ClassList.txt" in the current
 * working directory into classList, one name per row.
 *
 * At most maxClasses rows are filled and names are truncated to fit a row.
 * A final line without a trailing newline is still terminated. The caller
 * is expected to pass a zero-initialised table so unused rows are empty.
 *
 * @return  0 on success, -1 if the file cannot be opened.
 */
static int LoadClassList(char classList[][CLASS_NAME_SIZE], int maxClasses)
{
    FILE *fp = fopen("ClassList.txt", "r");
    int row = 0;
    int col = 0;
    int ch; /* int so that EOF is distinguishable from every byte value */

    if (fp == NULL)
        return -1;

    while (row < maxClasses && (ch = fgetc(fp)) != EOF)
    {
        if (ch == '\n')
        {
            row++;
            col = 0;
        }
        else if (col < CLASS_NAME_SIZE - 1)
        {
            classList[row][col++] = (char)ch;
            classList[row][col] = '\0';
        }
    }

    fclose(fp);
    return 0;
}

/*
 * Scores the text held in vocabulary against each of the ten training
 * classes and prints the per-class score and the best-scoring class.
 *
 * Class names come from "ClassList.txt" (read before the working directory
 * is changed). Training texts for class k (1..10) are the "*.txt" files in
 * F:\SogouC\Sample\k. The score of a class is
 *   (files in the class / files in all classes) * CalculateWordProbability().
 * A class whose directory cannot be entered is reported and scored as 0.
 *
 * Side effect: the process working directory is left at the last class
 * directory that could be entered.
 *
 * @param wordCount  number of valid entries in vocabulary.
 */
void CalculateProbability(int wordCount)
{
    char classList[CLASS_COUNT][CLASS_NAME_SIZE] = {{0}}; /* class names */
    int txtCount[CLASS_COUNT] = {0};          /* training files per class */
    float wordProbability[CLASS_COUNT] = {0}; /* word-likelihood product per class */
    int countAll = 0;                         /* training files over all classes */
    float max = 0;
    int classNo = 0;
    int i;

    if (LoadClassList(classList, CLASS_COUNT) != 0)
    {
        printf("Failed to open the file: ClassList.txt.\n");
        return; /* the original went on to read from the NULL stream */
    }

    for (i = 0; i < CLASS_COUNT; i++)
    {
        char path[32];

        snprintf(path, sizeof path, "F:\\SogouC\\Sample\\%d", i + 1);
        if (_chdir(path))
        {
            printf("系統找不到指定路徑!\n");
            continue; /* txtCount[i] and wordProbability[i] stay 0 */
        }

        txtCount[i] = CountDirectory();
        countAll += txtCount[i];
        wordProbability[i] = CalculateWordProbability(wordCount);
    }

    for (i = 0; i < CLASS_COUNT; i++)
    {
        /* Prior = share of training files in this class; 0 when there are none at all. */
        float priorProbability = countAll > 0 ? (float)txtCount[i] / (float)countAll : 0.0f;
        float finalProbability = priorProbability * wordProbability[i];

        if (finalProbability > max)
        {
            max = finalProbability;
            classNo = i;
        }
        printf("該文本爲類別%s的概率爲:%.5e\n", classList[i], finalProbability);
    }
    printf("\n經分析,該文本最有可能爲%s類文本!\n", classList[classNo]);
}




/*
 * Classifies a text with the naive Bayes classifier.
 *
 * The text is first split into words (stored in vocabulary), then the
 * per-class scores are computed and printed.
 *
 * @param text  writable, NUL-terminated text to classify; it is modified
 *              by the word-splitting step.
 */
void NaiveBayesClassifier(char text[])
{
    /* Number of words extracted from the text. */
    const int wordTotal = SplitToWord(text);

    CalculateProbability(wordTotal);
}




/*
 * Program entry point: classifies a built-in English sample text.
 *
 * @return  0 on normal completion (the original returned 1, which signals
 *          failure to the calling environment).
 */
int main()
{
    /* Must be a writable array, not a pointer to a literal: the classifier
       tokenises the text in place. */
    char text[]="Microsoft offered 44.6 billion dollars to buy Yahoo.February 1st network reported the Associated Press news, Microsoft offered 44.6 billion dollars in cash and stock to buy Yahoo search site.Microsoft offered to pay 31 dollars per share for Yahoo.Microsoft's acquisition offer on Jan. 31 premium of 62% than Yahoo's closing price of 19.18 dollars.Microsoft said that Yahoo shareholders can choose cash or stock transactions. Microsoft and Yahoo have sought cooperation in late 2006 and early 2007.The last two years, Yahoo has been in a dilemma: the market share decline,poor operating performance,stock prices tumbled sharply.Trying to make a difference for Microsoft in the Internet market, the acquisition of Yahoo is a shortcut, because the two sides have very strong complementarity.";

    NaiveBayesClassifier(text);
    return 0;
}


 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章