對於文本信息的向量化,Mahout 已經提供了工具類,它基於 Lucene 給出了對文本信息進行分析,然後創建文本向量。mahout提供下面兩個命令來將文本轉成向量形式(轉化成向量後可以聚類):
1.mahout seqdirectory:將文本文件轉成SequenceFile文件,SequenceFile文件是一種二進制存儲的key-value鍵值對,對應的源文件是org.apache.mahout.text.SequenceFilesFromDirectory.java
2.mahout seq2sparse:將SequenceFile轉成向量文件,對應的源文件是org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles.java
我是將mahout源碼導入到eclipse中,對以上的兩個源文件分別進行運行(運行時必須配置參數,有輸入、輸出、字符編碼)轉化的,生成的向量文件目錄結構是:
df-count 目錄:保存着文本的頻率信息
tf-vectors 目錄:保存着以 TF 作爲權值的文本向量
tfidf-vectors 目錄:保存着以 TFIDF 作爲權值的文本向量
tokenized-documents 目錄:保存着分詞過後的文本信息
wordcount 目錄:保存着全局的詞彙出現的次數
dictionary.file-0 目錄:保存着這些文本的詞彙表
frequency.file-0 目錄 : 保存着詞彙表對應的頻率信息。
查看轉化結果:
mahout seqdumper:將SequenceFile文件轉成文本形式,對應的源文件是org.apache.mahout.utils.SequenceFileDumper.java
mahout vectordump:將向量文件轉成可讀的文本形式,對應的源文件是org.apache.mahout.utils.vectors.VectorDumper.java
mahout clusterdump:分析最後聚類的輸出結果,對應的源文件是org.apache.mahout.utils.clustering.ClusterDumper.java。具體每種命令如何用及參數如何選擇,在命令行後面加-h或-help可以查看
下面是我在項目中用到的一些源碼
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.utils.io.ChunkedWriter;
import com.google.common.io.Closeables;
public class WriteToSequenceFileForBayesian extends AbstractJob{//使用聚類的文件,所以要傳入一個num,比如2000,表示以2000爲單位,訓練集的劃分規範。
public static void main(String args[]) throws Exception{
ToolRunner.run(new WriteToSequenceFileForBayesian(), args);
}
@Override
public int run(String[] arg0) throws Exception {
String inputPath=arg0[0];//
String outputpoints=arg0[1];//
int k = Integer.parseInt(arg0[2]);//
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path inPath = new Path(inputPath );
FSDataInputStream dis = fs.open(inPath);
LineReader in = new LineReader(dis,conf);
ChunkedWriter writer = new ChunkedWriter(conf, 64, new Path(outputpoints));
Text line = new Text();
//按行讀取
long recNum = 0;
StringBuilder ss=new StringBuilder();
while(in.readLine(line) > 0){
String aline=line.toString();
String[] strs=aline.split(" ");
if (recNum-2&&Double.parseDouble(strs[0])<4) {
ss.append("one_first");
}else if (Double.parseDouble(strs[0])<-2) {
ss.append("low_first");
}else if (Double.parseDouble(strs[0])>4) {
ss.append("high_first");
}
ss.append(",");
//處理第2個數
if (Double.parseDouble(strs[1])>-3&&Double.parseDouble(strs[1])<3) {
ss.append("zero_second");
}else if (Double.parseDouble(strs[1])<-3) {
ss.append("low_second");
}else if (Double.parseDouble(strs[1])>3) {
ss.append("high_second");
}
ss.append(",");
//處理第3個數
if (Double.parseDouble(strs[2])>-2&&Double.parseDouble(strs[2])<4) {
ss.append("one_third");
}else if (Double.parseDouble(strs[2])<-2) {
ss.append("low_third");
}else if (Double.parseDouble(strs[2])>4) {
ss.append("high_third");
}
ss.append(",");
//處理第4個數
if (Double.parseDouble(strs[3])>-1&&Double.parseDouble(strs[3])<5) {
ss.append("two_fourth");
}else if (Double.parseDouble(strs[3])<-1) {
ss.append("low_fourth");
}else if (Double.parseDouble(strs[3])>5) {
ss.append("high_fourth");
}
ss.append(",");
//處理第5個數
if (Double.parseDouble(strs[4])>-2&&Double.parseDouble(strs[4])<4) {
ss.append("one_fifth");
}else if (Double.parseDouble(strs[4])<-2) {
ss.append("low_fifth");
}else if (Double.parseDouble(strs[4])>4) {
ss.append("high_fifth");
}
writer.write("first", ss.toString());
}else if (recNum1.5&&Double.parseDouble(strs[0])<2.5) {
ss.append("two_first");
}else if (Double.parseDouble(strs[0])<1.5) {
ss.append("low_first");
}else if (Double.parseDouble(strs[0])>2.5) {
ss.append("high_first");
}
ss.append(",");
//處理第2個數
if (Double.parseDouble(strs[1])>0.5&&Double.parseDouble(strs[1])<1.5) {
ss.append("one_second");
}else if (Double.parseDouble(strs[1])<0.5) {
ss.append("low_second");
}else if (Double.parseDouble(strs[1])>1.5) {
ss.append("high_second");
}
ss.append(",");
//處理第3個數
if (Double.parseDouble(strs[2])>-0.5&&Double.parseDouble(strs[2])<0.5) {
ss.append("zero_third");
}else if (Double.parseDouble(strs[2])<-0.5) {
ss.append("low_third");
}else if (Double.parseDouble(strs[2])>0.5) {
ss.append("high_third");
}
ss.append(",");
//處理第4個數
if (Double.parseDouble(strs[3])>0.5&&Double.parseDouble(strs[3])<1.5) {
ss.append("one_fourth");
}else if (Double.parseDouble(strs[3])<0.5) {
ss.append("low_fourth");
}else if (Double.parseDouble(strs[3])>1.5) {
ss.append("high_fourth");
}
ss.append(",");
//處理第5個數
if (Double.parseDouble(strs[4])>0.5&&Double.parseDouble(strs[4])<1.5) {
ss.append("one_fifth");
}else if (Double.parseDouble(strs[4])<0.5) {
ss.append("low_fifth");
}else if (Double.parseDouble(strs[4])>1.5) {
ss.append("high_fifth");
}
writer.write("second", ss.toString());
}else if (recNum0.9&&Double.parseDouble(strs[0])<1.1) {
ss.append("one_first");
}else if (Double.parseDouble(strs[0])<0.9) {
ss.append("low_first");
}else if (Double.parseDouble(strs[0])>1.1) {
ss.append("high_first");
}
ss.append(",");
//處理第2個數
if (Double.parseDouble(strs[1])>0.9&&Double.parseDouble(strs[1])<1.1) {
ss.append("one_second");
}else if (Double.parseDouble(strs[1])<0.9) {
ss.append("low_second");
}else if (Double.parseDouble(strs[1])>1.1) {
ss.append("high_second");
}
ss.append(",");
//處理第3個數
if (Double.parseDouble(strs[2])>1.9&&Double.parseDouble(strs[2])<2.1) {
ss.append("two_third");
}else if (Double.parseDouble(strs[2])<1.9) {
ss.append("low_third");
}else if (Double.parseDouble(strs[2])>2.1) {
ss.append("high_third");
}
ss.append(",");
//處理第4個數
if (Double.parseDouble(strs[3])>-0.1&&Double.parseDouble(strs[3])<0.1) {
ss.append("zero_fourth");
}else if (Double.parseDouble(strs[3])<-0.1) {
ss.append("low_fourth");
}else if (Double.parseDouble(strs[3])>0.1) {
ss.append("high_fourth");
}
ss.append(",");
//處理第5個數
if (Double.parseDouble(strs[4])>0.9&&Double.parseDouble(strs[4])<1.1) {
ss.append("one_fifth");
}else if (Double.parseDouble(strs[4])<0.9) {
ss.append("low_fifth");
}else if (Double.parseDouble(strs[4])>1.1) {
ss.append("high_fifth");
}
writer.write("third", ss.toString());
}else if (recNum-1&&Double.parseDouble(strs[0])<3) {
ss.append("one_first");
}else if (Double.parseDouble(strs[0])<-1) {
ss.append("low_first");
}else if (Double.parseDouble(strs[0])>3) {
ss.append("high_first");
}
ss.append(",");
//處理第2個數
if (Double.parseDouble(strs[1])>0&&Double.parseDouble(strs[1])<4) {
ss.append("two_second");
}else if (Double.parseDouble(strs[1])<0) {
ss.append("low_second");
}else if (Double.parseDouble(strs[1])>4) {
ss.append("high_second");
}
ss.append(",");
//處理第3個數
if (Double.parseDouble(strs[2])>-1&&Double.parseDouble(strs[2])<3) {
ss.append("one_third");
}else if (Double.parseDouble(strs[2])<-1) {
ss.append("low_third");
}else if (Double.parseDouble(strs[2])>3) {
ss.append("high_third");
}
ss.append(",");
//處理第4個數
if (Double.parseDouble(strs[3])>-1&&Double.parseDouble(strs[3])<3) {
ss.append("one_fourth");
}else if (Double.parseDouble(strs[3])<-1) {
ss.append("low_fourth");
}else if (Double.parseDouble(strs[3])>3) {
ss.append("high_fourth");
}
ss.append(",");
//處理第5個數
if (Double.parseDouble(strs[4])>-2&&Double.parseDouble(strs[4])<2) {
ss.append("zero_fifth");
}else if (Double.parseDouble(strs[4])<-2) {
ss.append("low_fifth");
}else if (Double.parseDouble(strs[4])>2) {
ss.append("high_fifth");
}
writer.write("fourth", ss.toString());
}else if (recNum-1&&Double.parseDouble(strs[0])<1) {
ss.append("zero_first");
}else if (Double.parseDouble(strs[0])<-1) {
ss.append("low_first");
}else if (Double.parseDouble(strs[0])>1) {
ss.append("high_first");
}
ss.append(",");
//處理第2個數
if (Double.parseDouble(strs[1])>0&&Double.parseDouble(strs[1])<2) {
ss.append("one_second");
}else if (Double.parseDouble(strs[1])<0) {
ss.append("low_second");
}else if (Double.parseDouble(strs[1])>2) {
ss.append("high_second");
}
ss.append(",");
//處理第3個數
if (Double.parseDouble(strs[2])>0&&Double.parseDouble(strs[2])<2) {
ss.append("one_third");
}else if (Double.parseDouble(strs[2])<0) {
ss.append("low_third");
}else if (Double.parseDouble(strs[2])>2) {
ss.append("high_third");
}
ss.append(",");
//處理第4個數
if (Double.parseDouble(strs[3])>0&&Double.parseDouble(strs[3])<2) {
ss.append("one_fourth");
}else if (Double.parseDouble(strs[3])<0) {
ss.append("low_fourth");
}else if (Double.parseDouble(strs[3])>2) {
ss.append("high_fourth");
}
ss.append(",");
//處理第5個數
if (Double.parseDouble(strs[4])>1&&Double.parseDouble(strs[4])<3) {
ss.append("two_fifth");
}else if (Double.parseDouble(strs[4])<1) {
ss.append("low_fifth");
}else if (Double.parseDouble(strs[4])>3) {
ss.append("high_fifth");
}
writer.write("fifth", ss.toString());
}
}
Closeables.close(writer, false);
dis.close();
in.close();
return 0;
}
}
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.mahout.clustering.kmeans.Kluster;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.text.SequenceFilesFromDirectory;
public class WriteToSequenceFile {
public static void main(String args[]) throws Exception {
String inputPath=args[0];//文本數據文件輸入目錄
String outputpoints=args[1];//sequenceFile中的point數據輸出目錄
String outputclusters=args[2];//sequenceFile中的cluster數據輸出目錄
int k = Integer.parseInt(args[3]);//k箇中心
List vectors = new ArrayList();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path inPath = new Path(inputPath );
FSDataInputStream dis = fs.open(inPath);
LineReader in = new LineReader(dis,conf);
Text line = new Text();
SequenceFile.Writer pointwriter = new SequenceFile.Writer(fs, conf, new Path(outputpoints), LongWritable.class, VectorWritable.class);
//按行讀取
long recNum = 0;
VectorWritable vecWrite = new VectorWritable();
while(in.readLine(line) > 0){
String aline=line.toString();
String[] strs=aline.split(" ");
double[] fr = new double[5];
for (int i = 0; i < strs.length; i++) {
fr[i]=Double.parseDouble(strs[i]);
}
Vector vec = new RandomAccessSparseVector(fr.length);
vec.assign(fr);
vecWrite.set(vec);
pointwriter.append(new LongWritable(recNum++), vecWrite);
if (vectors.size()