package muyanmoyang.text_classify.Classify;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import muyanmoyang.text_classify.toMysql.DBUtil;
import muyanmoyang.text_classify.toMysql.DocBean;
import ICTCLAS.I3S.AC.ICTCLAS50;
/**
 * Text preprocessing for the Sogou corpus: stopword handling, fetching the corpus texts
 * stored in the MySQL database, word segmentation, wide/narrow string conversion,
 * initial filtering of each article into a word set, building the bag-of-words model, and so on.
 * @author moyang
 * @mail [email protected]
 */
public class DataPreProcess {
public static void main(String[] args) throws IOException, SQLException {
getStopwordsSet("D:/¥菸酒僧/文本挖掘/stopwords.txt") ;
getSogouTextFromMySQL() ;
}
/**
 * Stopword handling: load the stopword list into a set
 * @param stopFileDir path of the stopword file
 * @return the set of stopwords
 * @throws IOException
 */
public static Set<String> getStopwordsSet(String stopFileDir) throws IOException
{
	Set<String> stopwordsSet = new HashSet<String>();
	BufferedReader reader = new BufferedReader(new FileReader(new File(stopFileDir)));
	String stopword;
	// each line of the stopword file holds one stopword
	while((stopword = reader.readLine()) != null)
	{
		stopwordsSet.add(stopword);
	}
	reader.close();
	return stopwordsSet;
}
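/*
 * Sketch (not in the original code): one way the stopword set loaded above could be
 * applied to a tokenised article to produce the "word set" mentioned in the class
 * comment. The method name filterStopwords and its parameters are hypothetical and
 * only illustrate the intended filtering step.
 */
public static Set<String> filterStopwords(List<String> tokens, Set<String> stopwords)
{
	Set<String> wordSet = new HashSet<String>();
	for (String token : tokens) {
		String t = token.trim();
		// keep a token only if it is non-empty and not a stopword
		if (!t.isEmpty() && !stopwords.contains(t)) {
			wordSet.add(t);
		}
	}
	return wordSet;
}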
/**
 * Fetch the corpus texts from the MySQL database and return them as a Map<Integer,String[]>
 * @throws SQLException
 * @throws IOException
 */
public static Map<Integer,String[]> getSogouTextFromMySQL() throws SQLException, IOException
{
	// map holding every corpus record selected from the database, keyed by row number
	Map<Integer,String[]> sougouTextMapFromSql = new HashMap<Integer, String[]>();
	FileWriter fileWriter = new FileWriter(new File("F:/hadoop_workspace/Text-Classify/sogouTxtMap.txt"));
	String url = "jdbc:mysql://localhost:3306/sogou?useUnicode=true&characterEncoding=UTF-8";
	String username = "root";
	String password = "123456";
	Connection conn = DBUtil.getConnection(url, username, password);
	Statement stmt = conn.createStatement();
	// query the corpus table
	ResultSet result = stmt.executeQuery("select docno,newstitle,newscontent,newsurl,category from sohunews_reduced");
	int count = 0; // current row number of the ResultSet
	while(result.next())
	{
		String docno = result.getString("docno");
		String newstitle = result.getString("newstitle");
		String newscontent = result.getString("newscontent");
		String newsurl = result.getString("newsurl");
		String category = result.getString("category");
		count++;
		System.out.println("row " + count);
		// store the record as a String array keyed by its row number
		String[] textStringArray = {docno, newstitle, newscontent, newsurl, category};
		sougouTextMapFromSql.put(count, textStringArray);
	}
	System.out.println("total rows: " + count);
	for(int i = 1; i <= count; i++)
	{
		String[] str = sougouTextMapFromSql.get(i);
		System.out.println("row " + i + ": " + str[0] + " | " + str[1] + " | " + str[2] + " | " + str[3] + " | " + str[4]);
		// also write each record to the text file for inspection
		fileWriter.write("row " + i + ": " + str[0] + " | " + str[1] + " | " + str[2] + " | " + str[3] + " | " + str[4] + "\n");
	}
	fileWriter.flush();
	fileWriter.close();
	result.close();
	stmt.close();
	conn.close(); // close the database connection
	return sougouTextMapFromSql;
}
/**
 * Word segmentation with ICTCLAS
 * @throws UnsupportedEncodingException
 */
public static String segmentation() throws UnsupportedEncodingException
{
ICTCLAS50 testICTCLAS50 = new ICTCLAS50();
String argu = ".";
if(!testICTCLAS50.ICTCLAS_Init(argu.getBytes("GB2312"))){
	System.out.println("Init Fail");
}else{
	System.out.println("Init Succeed!");
}
//TODO
return null ;
}
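/*
 * Sketch (not in the original code): a possible continuation of the segmentation step,
 * following the call sequence shown in the ICTCLAS50 Java demo. The method name
 * segmentText, the GB2312 code-type value 0 and the POS-tagging flag 1 are assumptions;
 * check them against the API bundled with your copy of ICTCLAS50.
 */
public static String segmentText(ICTCLAS50 ictclas, String text) throws UnsupportedEncodingException
{
	// the demo passes GB2312 byte arrays in and receives GB2312 byte arrays back
	byte[] nativeBytes = ictclas.ICTCLAS_ParagraphProcess(text.getBytes("GB2312"), 0, 1);
	String segmented = new String(nativeBytes, 0, nativeBytes.length, "GB2312");
	// call ictclas.ICTCLAS_Exit() once all documents have been processed
	return segmented;
}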
}
Only part of the code is done so far; to be continued....
For the theory behind text mining and an introduction to the corpus, see these two blog posts: 1. http://www.cnblogs.com/finallyliuyu/archive/2010/10/04/1842261.html 2. http://www.cnblogs.com/finallyliuyu/archive/2010/09/18/1830444.html
The corpus is Sogou's 2008 news_sohusite_xml.smarty.zip and the database is MySQL; the table structure is shown in the figure below. Importing the corpus into MySQL is not covered here: I used a Java project from CSDN (link: http://download.csdn.net/detail/raindreams/3348889) and simply adapted it to my own database fields and actual data.
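For reference, the query in getSogouTextFromMySQL() expects a table named sohunews_reduced with the columns docno, newstitle, newscontent, newsurl and category. Below is a minimal, hypothetical sketch of creating such a table over JDBC; the column types and lengths are my own assumptions and should be adjusted to your actual data, as noted above.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateSohunewsTable {
	public static void main(String[] args) throws Exception {
		// connection settings match the ones used in DataPreProcess; change them as needed
		String url = "jdbc:mysql://localhost:3306/sogou?useUnicode=true&characterEncoding=UTF-8";
		Class.forName("com.mysql.jdbc.Driver"); // older Connector/J; newer drivers register automatically
		Connection conn = DriverManager.getConnection(url, "root", "123456");
		Statement stmt = conn.createStatement();
		// column names follow the SELECT in getSogouTextFromMySQL(); types/lengths are assumptions
		stmt.executeUpdate(
			"CREATE TABLE IF NOT EXISTS sohunews_reduced (" +
			"  docno VARCHAR(64) PRIMARY KEY," +
			"  newstitle VARCHAR(255)," +
			"  newscontent TEXT," +
			"  newsurl VARCHAR(255)," +
			"  category VARCHAR(64)" +
			") DEFAULT CHARSET=utf8");
		stmt.close();
		conn.close();
	}
}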