分類

//
// new_keyword_extract1.0.cpp : 定義控制檯應用程序的入口點。
//

#include "stdafx.h"



//boost庫的靜態鏈接問題
//#include <boost/regex.hpp>
#include <iostream>
#include <string>
#include <regex>
#include <fstream>  

#include <sstream>
#include <utility>
#include <algorithm>
#include <string>
#include <cctype>
#include <set>
#include <queue>

#include <regex>
#include <hash_map>
#include "Term.h"

using namespace std;
//using namespace boost;

// A word paired with its final score; element type of the ranked result
// array built by keyword_extra_entropy and ordered by Quick_sort.
struct Node
{
	std::string word;      // the term text
	double final_value;    // score the array is sorted on

	// Give every Node a defined score so that entries which are never
	// assigned can still be compared safely while sorting.
	Node() : word(), final_value(0.0) {}
};


//struct cmp  //重載比較函數
//{
//	bool operator()(const Node &t1,const Node &t2)
//	{
//		 return t1.final_value<t2.final_value;
//			//相當於less,小頂堆   
//	}
//};
// ---- Forward declarations for the functions defined below ----

// Document loading and text clean-up.
std::string LoadDoc(std::string address);
std::string pre_treatment(std::string text);

// Keyword scoring; returns an array of Node allocated with new[].
Node* keyword_extra_entropy(std::string text);

// Quick-sort helpers working on arrays of Node.
void Swap(Node &p, Node &q);
int Partition(Node ArrayInput[], int nLow, int nHigh);
void Quick_sort(Node ArrayInput[], int nLow, int nHigh);

void main(){


	string text=LoadDoc("D:\\test.txt");
	text=pre_treatment(text);


	int i;
	Node *result;
	result=keyword_extra_entropy(text);
	for(i=0;i<30;i++)
		cout<<endl<<result[i].word<<"-"<<result[i].final_value;

	system("pause");
}




// Exchanges the contents of two Node objects.
void Swap(Node &p, Node &q)
{
	const Node held(q);   // keep q's value while it is overwritten
	q = p;
	p = held;
}

// Partition step of the quick sort.
// Uses ArrayInput[nHigh] as the pivot and rearranges ArrayInput[nLow..nHigh]
// so that every element whose final_value is >= the pivot's comes before the
// pivot. Returns the index at which the pivot ends up.
int Partition(Node ArrayInput[], int nLow, int nHigh)
{
	const double pivotValue = ArrayInput[nHigh].final_value;

	// Next slot to receive an element that belongs in front of the pivot.
	int boundary = nLow;

	for (int scan = nLow; scan < nHigh; ++scan)
	{
		// Written as a negated ">=" on purpose: a NaN value fails the
		// comparison and is therefore left behind the pivot.
		if (!(ArrayInput[scan].final_value >= pivotValue))
			continue;

		if (scan != boundary)
			Swap(ArrayInput[scan], ArrayInput[boundary]);
		++boundary;
	}

	// Drop the pivot into its final position.
	Swap(ArrayInput[boundary], ArrayInput[nHigh]);
	return boundary;
}

// Recursive quick sort of ArrayInput[nLow..nHigh] (both bounds inclusive).
// Because Partition moves values >= the pivot to the front, the range ends up
// ordered from the largest final_value to the smallest.
void Quick_sort(Node ArrayInput[], int nLow, int nHigh)
{
	// Ranges with fewer than two elements are already sorted.
	if (nLow >= nHigh)
		return;

	const int pivotPos = Partition(ArrayInput, nLow, nHigh);
	Quick_sort(ArrayInput, nLow, pivotPos - 1);
	Quick_sort(ArrayInput, pivotPos + 1, nHigh);
}

// Cleans raw text before tokenisation.
//   1. Every character that is not an ASCII letter, a space, or in the
//      \u4e00-\u9fa5 range is replaced by a single space.
//   2. Runs of two or more whitespace characters are collapsed to one space.
// Returns the cleaned copy of `text`.
//
// Uppercase letters are kept here: the tokeniser lower-cases each word
// afterwards, and stripping them (as the previous "[^a-z ...]" pattern did)
// turned e.g. "Hello" into " ello".
std::string pre_treatment(std::string text){
	// NOTE(review): std::regex over char matches byte by byte, so how the
	// \u4e00-\u9fa5 range behaves depends on the execution character set the
	// compiler uses for this literal — confirm on the target toolchain.
	const std::regex unwanted("[^a-zA-Z \u4e00-\u9fa5]");
	const std::regex blank_run("\\s{2,}");

	text = std::regex_replace(text, unwanted, std::string(" "));
	text = std::regex_replace(text, blank_run, std::string(" "));

	return text;
}

// Scores every distinct word of `text` and returns them ranked.
//
// The text is split on whitespace and each token is lower-cased. For every
// distinct word the 1-based positions of its occurrences are collected and
// handed to a Term, whose Cal_Distance / divide_Mode / Cal_Entropy / CAL_geo /
// Cal_EDnor steps are run in that order; the resulting EDnor is the score.
//
// Returns an array allocated with new[] holding one slot per distinct word;
// the caller owns it and must delete[] it. Words whose score is NaN are left
// out of the ranking: the scored words occupy the front of the array, sorted
// from highest to lowest score, and any remaining slots at the end keep an
// empty word and a score of 0. The array length is not reported to the caller.
Node* keyword_extra_entropy(std::string text){

	// word -> positions (1-based running word index) of each occurrence
	hash_map<std::string, std::vector<int> > word_loc;

	std::stringstream tokens(text);
	std::string token;
	int sum = 0;                      // total number of words seen
	while (tokens >> token) {
		// std::tolower requires a value representable as unsigned char;
		// passing a negative char directly is undefined behaviour.
		std::transform(token.begin(), token.end(), token.begin(),
			[](unsigned char c) { return static_cast<char>(std::tolower(c)); });

		++sum;
		word_loc[token].push_back(sum);
	}

	// Build one Term per distinct word.
	std::vector<Term> Term_list;
	Term_list.reserve(word_loc.size());
	for (auto it = word_loc.begin(); it != word_loc.end(); ++it) {
		Term term;
		term.Set_Term(it->first, it->second, it->second.size(), sum);
		Term_list.push_back(term);
	}

	const std::vector<Term>::size_type count = Term_list.size();

	// Value-initialise so that slots which are never filled hold a defined
	// score instead of an indeterminate double.
	Node *result = new Node[count]();

	int ranked = 0;                   // number of slots filled with a valid score
	for (std::vector<Term>::size_type i = 0; i < count; i++) {
		Term &term = Term_list[i];
		term.Cal_Distance(sum);
		term.divide_Mode();
		term.Cal_Entropy();
		term.CAL_geo(sum);
		term.Cal_EDnor();

		// A NaN score cannot be ordered; skip the word without leaving a gap.
		if (_isnan(term.EDnor))
			continue;

		result[ranked].word = term.word;
		result[ranked].final_value = term.EDnor;
		++ranked;
	}

	// Sort only the filled prefix; with ranked == 0 the range is empty and
	// Quick_sort returns immediately.
	Quick_sort(result, 0, ranked - 1);

	return result;

}

// Reads the whole file at `address` and returns its bytes as a string.
// The file is opened in binary mode so the content is returned unmodified.
// Returns an empty string when the file cannot be opened.
std::string LoadDoc(std::string address){

	std::ifstream file(address.c_str(), std::ios::binary);
	if (!file)
		return std::string();   // a failed open previously led to a negative-size allocation

	// Streaming the file buffer copies exactly the bytes in the file. This
	// avoids the manually sized char buffer, which was never NUL-terminated
	// before being converted to a string and was never released.
	std::ostringstream contents;
	contents << file.rdbuf();

	return contents.str();
}


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章