Mahout: converting data into SequenceFiles and sparse vectors

For vectorizing text, Mahout already provides utility classes: it uses Lucene to analyse the text and then builds the text vectors. Mahout offers the following two commands to turn text into vector form (once vectorized, the data can be clustered; a minimal example of running both steps follows the list):
1. mahout seqdirectory: converts plain-text files into a SequenceFile, a binary key-value storage format. The corresponding source file is org.apache.mahout.text.SequenceFilesFromDirectory.java

2. mahout seq2sparse: converts the SequenceFile into vector files. The corresponding source file is org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles.java
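
Since both drivers extend AbstractJob (and therefore implement Hadoop's Tool), they can also be invoked programmatically, much like running them from Eclipse. The sketch below is only an illustration: the paths are placeholders, and it assumes the usual -i/-o options plus -c for the character encoding of seqdirectory.

import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.text.SequenceFilesFromDirectory;
import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;

public class TextVectorizationExample {
    public static void main(String[] args) throws Exception {
        // Step 1: plain-text files -> SequenceFile (key = document name, value = document content)
        ToolRunner.run(new SequenceFilesFromDirectory(), new String[] {
                "-i", "/data/text-input",   // placeholder: directory with the raw text files
                "-o", "/data/seq-output",   // placeholder: SequenceFile output directory
                "-c", "UTF-8"               // character encoding
        });
        // Step 2: SequenceFile -> sparse TF / TF-IDF vectors
        ToolRunner.run(new SparseVectorsFromSequenceFiles(), new String[] {
                "-i", "/data/seq-output",
                "-o", "/data/vector-output"
        });
    }
}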

I imported the Mahout source code into Eclipse and ran the two source files above directly to do the conversion (when running them, the parameters must be configured: input, output, and character encoding). The directory structure of the generated vector files is:
df-count directory: stores the document-frequency information of the terms
tf-vectors directory: stores the text vectors weighted by TF
tfidf-vectors directory: stores the text vectors weighted by TF-IDF
tokenized-documents directory: stores the tokenized text
wordcount directory: stores the global occurrence counts of the terms
dictionary.file-0: stores the dictionary (vocabulary) of the texts
frequency.file-0: stores the frequency information for the dictionary terms.
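
For orientation, each part file under tf-vectors or tfidf-vectors is itself a SequenceFile whose key is the document id (Text) and whose value is a VectorWritable. A minimal sketch for reading one of them follows; the path is a placeholder, and the key/value types assume the standard seq2sparse output.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.math.VectorWritable;

public class ReadTfidfVectors {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // placeholder path: a part file under the tfidf-vectors directory
        Path path = new Path("/data/vector-output/tfidf-vectors/part-r-00000");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        Text key = new Text();                        // document id
        VectorWritable value = new VectorWritable();  // sparse TF-IDF vector
        while (reader.next(key, value)) {
            System.out.println(key + " -> " + value.get().asFormatString());
        }
        reader.close();
    }
}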

Inspecting the conversion results:

mahout seqdumper: dumps a SequenceFile as plain text; the corresponding source file is org.apache.mahout.utils.SequenceFileDumper.java
mahout vectordump: dumps a vector file in readable text form; the corresponding source file is org.apache.mahout.utils.vectors.VectorDumper.java
mahout clusterdump: analyses the final clustering output; the corresponding source file is org.apache.mahout.utils.clustering.ClusterDumper.java. For the exact usage and parameters of each command, append -h or -help on the command line. A small sketch of driving the first two dumpers from Java follows.
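
The dumpers can also be called from Java through their main methods. This is only a sketch under assumptions: the paths and part file names are placeholders that depend on the actual job output, and it assumes the usual -i/-o options plus -d/-dt for resolving term indexes through the dictionary produced by seq2sparse.

import org.apache.mahout.utils.SequenceFileDumper;
import org.apache.mahout.utils.vectors.VectorDumper;

public class DumpResultsExample {
    public static void main(String[] args) throws Exception {
        // Dump a SequenceFile (e.g. the tokenized documents) as plain text.
        SequenceFileDumper.main(new String[] {
                "-i", "/data/vector-output/tokenized-documents/part-m-00000",  // placeholder
                "-o", "/tmp/tokenized-dump.txt"
        });
        // Dump the TF-IDF vectors in readable form, mapping term indexes back to words.
        VectorDumper.main(new String[] {
                "-i", "/data/vector-output/tfidf-vectors/part-r-00000",        // placeholder
                "-d", "/data/vector-output/dictionary.file-0",
                "-dt", "sequencefile",
                "-o", "/tmp/tfidf-dump.txt"
        });
    }
}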

Below is some of the source code I used in my project.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.utils.io.ChunkedWriter;

import com.google.common.io.Closeables;


public class WriteToSequenceFileForBayesian extends AbstractJob { // uses the same data file as the clustering step, so a num (e.g. 2000) must be passed in: the training set is partitioned into classes in units of num records
	public static void main(String args[]) throws Exception{
		ToolRunner.run(new WriteToSequenceFileForBayesian(), args);
    }
	@Override
	public int run(String[] arg0) throws Exception {
	    String inputPath = arg0[0];        // input text file (the same data used for clustering)
    	String outputpoints = arg0[1];     // output directory for the labelled SequenceFile chunks
        int k = Integer.parseInt(arg0[2]); // bucket size (e.g. 2000): every k records form one class
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path inPath = new Path(inputPath );
        FSDataInputStream dis = fs.open(inPath);
        LineReader in = new LineReader(dis,conf);  
        ChunkedWriter writer = new ChunkedWriter(conf, 64, new Path(outputpoints));
        Text line = new Text();
        // read the input line by line
        long recNum = 0;
        StringBuilder ss=new StringBuilder();
        while(in.readLine(line) > 0){
        	String aline=line.toString();
        	String[] strs=aline.split(" ");
        	ss.setLength(0);    // start a fresh label string for each record
        	if (recNum < k) {   // assumed bucketing: every k records form one class ("first" .. "fifth")
        		// process the 1st value
        		if (Double.parseDouble(strs[0]) > -2 && Double.parseDouble(strs[0]) < 4) {
					ss.append("one_first");
				}else if (Double.parseDouble(strs[0])<-2) {
					ss.append("low_first");
				}else if (Double.parseDouble(strs[0])>4) {
					ss.append("high_first");
				}
        		ss.append(",");
        		// process the 2nd value
        		if (Double.parseDouble(strs[1])>-3&&Double.parseDouble(strs[1])<3) {
					ss.append("zero_second");
				}else if (Double.parseDouble(strs[1])<-3) {
					ss.append("low_second");
				}else if (Double.parseDouble(strs[1])>3) {
					ss.append("high_second");
				}
        		ss.append(",");
        		// process the 3rd value
        		if (Double.parseDouble(strs[2])>-2&&Double.parseDouble(strs[2])<4) {
					ss.append("one_third");
				}else if (Double.parseDouble(strs[2])<-2) {
					ss.append("low_third");
				}else if (Double.parseDouble(strs[2])>4) {
					ss.append("high_third");
				}
        		ss.append(",");
        		// process the 4th value
        		if (Double.parseDouble(strs[3])>-1&&Double.parseDouble(strs[3])<5) {
					ss.append("two_fourth");
				}else if (Double.parseDouble(strs[3])<-1) {
					ss.append("low_fourth");
				}else if (Double.parseDouble(strs[3])>5) {
					ss.append("high_fourth");
				}
        		ss.append(",");
        		// process the 5th value
        		if (Double.parseDouble(strs[4])>-2&&Double.parseDouble(strs[4])<4) {
					ss.append("one_fifth");
				}else if (Double.parseDouble(strs[4])<-2) {
					ss.append("low_fifth");
				}else if (Double.parseDouble(strs[4])>4) {
					ss.append("high_fifth");
				}
        		writer.write("first", ss.toString());
			} else if (recNum < k * 2) {
        		// process the 1st value
        		if (Double.parseDouble(strs[0]) > 1.5 && Double.parseDouble(strs[0]) < 2.5) {
					ss.append("two_first");
				}else if (Double.parseDouble(strs[0])<1.5) {
					ss.append("low_first");
				}else if (Double.parseDouble(strs[0])>2.5) {
					ss.append("high_first");
				}
        		ss.append(",");
        		// process the 2nd value
        		if (Double.parseDouble(strs[1])>0.5&&Double.parseDouble(strs[1])<1.5) {
					ss.append("one_second");
				}else if (Double.parseDouble(strs[1])<0.5) {
					ss.append("low_second");
				}else if (Double.parseDouble(strs[1])>1.5) {
					ss.append("high_second");
				}
        		ss.append(",");
        		// process the 3rd value
        		if (Double.parseDouble(strs[2])>-0.5&&Double.parseDouble(strs[2])<0.5) {
					ss.append("zero_third");
				}else if (Double.parseDouble(strs[2])<-0.5) {
					ss.append("low_third");
				}else if (Double.parseDouble(strs[2])>0.5) {
					ss.append("high_third");
				}
        		ss.append(",");
        		// process the 4th value
        		if (Double.parseDouble(strs[3])>0.5&&Double.parseDouble(strs[3])<1.5) {
					ss.append("one_fourth");
				}else if (Double.parseDouble(strs[3])<0.5) {
					ss.append("low_fourth");
				}else if (Double.parseDouble(strs[3])>1.5) {
					ss.append("high_fourth");
				}
        		ss.append(",");
        		// process the 5th value
        		if (Double.parseDouble(strs[4])>0.5&&Double.parseDouble(strs[4])<1.5) {
					ss.append("one_fifth");
				}else if (Double.parseDouble(strs[4])<0.5) {
					ss.append("low_fifth");
				}else if (Double.parseDouble(strs[4])>1.5) {
					ss.append("high_fifth");
				}
        		writer.write("second", ss.toString());
			} else if (recNum < k * 3) {
        		// process the 1st value
        		if (Double.parseDouble(strs[0]) > 0.9 && Double.parseDouble(strs[0]) < 1.1) {
					ss.append("one_first");
				}else if (Double.parseDouble(strs[0])<0.9) {
					ss.append("low_first");
				}else if (Double.parseDouble(strs[0])>1.1) {
					ss.append("high_first");
				}
        		ss.append(",");
        		// process the 2nd value
        		if (Double.parseDouble(strs[1])>0.9&&Double.parseDouble(strs[1])<1.1) {
					ss.append("one_second");
				}else if (Double.parseDouble(strs[1])<0.9) {
					ss.append("low_second");
				}else if (Double.parseDouble(strs[1])>1.1) {
					ss.append("high_second");
				}
        		ss.append(",");
        		// process the 3rd value
        		if (Double.parseDouble(strs[2])>1.9&&Double.parseDouble(strs[2])<2.1) {
					ss.append("two_third");
				}else if (Double.parseDouble(strs[2])<1.9) {
					ss.append("low_third");
				}else if (Double.parseDouble(strs[2])>2.1) {
					ss.append("high_third");
				}
        		ss.append(",");
        		// process the 4th value
        		if (Double.parseDouble(strs[3])>-0.1&&Double.parseDouble(strs[3])<0.1) {
					ss.append("zero_fourth");
				}else if (Double.parseDouble(strs[3])<-0.1) {
					ss.append("low_fourth");
				}else if (Double.parseDouble(strs[3])>0.1) {
					ss.append("high_fourth");
				}
        		ss.append(",");
        		// process the 5th value
        		if (Double.parseDouble(strs[4])>0.9&&Double.parseDouble(strs[4])<1.1) {
					ss.append("one_fifth");
				}else if (Double.parseDouble(strs[4])<0.9) {
					ss.append("low_fifth");
				}else if (Double.parseDouble(strs[4])>1.1) {
					ss.append("high_fifth");
				}
        		writer.write("third", ss.toString());
			} else if (recNum < k * 4) {
        		// process the 1st value
        		if (Double.parseDouble(strs[0]) > -1 && Double.parseDouble(strs[0]) < 3) {
					ss.append("one_first");
				}else if (Double.parseDouble(strs[0])<-1) {
					ss.append("low_first");
				}else if (Double.parseDouble(strs[0])>3) {
					ss.append("high_first");
				}
        		ss.append(",");
        		// process the 2nd value
        		if (Double.parseDouble(strs[1])>0&&Double.parseDouble(strs[1])<4) {
					ss.append("two_second");
				}else if (Double.parseDouble(strs[1])<0) {
					ss.append("low_second");
				}else if (Double.parseDouble(strs[1])>4) {
					ss.append("high_second");
				}
        		ss.append(",");
        		// process the 3rd value
        		if (Double.parseDouble(strs[2])>-1&&Double.parseDouble(strs[2])<3) {
					ss.append("one_third");
				}else if (Double.parseDouble(strs[2])<-1) {
					ss.append("low_third");
				}else if (Double.parseDouble(strs[2])>3) {
					ss.append("high_third");
				}
        		ss.append(",");
        		// process the 4th value
        		if (Double.parseDouble(strs[3])>-1&&Double.parseDouble(strs[3])<3) {
					ss.append("one_fourth");
				}else if (Double.parseDouble(strs[3])<-1) {
					ss.append("low_fourth");
				}else if (Double.parseDouble(strs[3])>3) {
					ss.append("high_fourth");
				}
        		ss.append(",");
        		// process the 5th value
        		if (Double.parseDouble(strs[4])>-2&&Double.parseDouble(strs[4])<2) {
					ss.append("zero_fifth");
				}else if (Double.parseDouble(strs[4])<-2) {
					ss.append("low_fifth");
				}else if (Double.parseDouble(strs[4])>2) {
					ss.append("high_fifth");
				}
        		writer.write("fourth", ss.toString());
			} else if (recNum < k * 5) {
        		// process the 1st value
        		if (Double.parseDouble(strs[0]) > -1 && Double.parseDouble(strs[0]) < 1) {
					ss.append("zero_first");
				}else if (Double.parseDouble(strs[0])<-1) {
					ss.append("low_first");
				}else if (Double.parseDouble(strs[0])>1) {
					ss.append("high_first");
				}
        		ss.append(",");
        		// process the 2nd value
        		if (Double.parseDouble(strs[1])>0&&Double.parseDouble(strs[1])<2) {
					ss.append("one_second");
				}else if (Double.parseDouble(strs[1])<0) {
					ss.append("low_second");
				}else if (Double.parseDouble(strs[1])>2) {
					ss.append("high_second");
				}
        		ss.append(",");
        		// process the 3rd value
        		if (Double.parseDouble(strs[2])>0&&Double.parseDouble(strs[2])<2) {
					ss.append("one_third");
				}else if (Double.parseDouble(strs[2])<0) {
					ss.append("low_third");
				}else if (Double.parseDouble(strs[2])>2) {
					ss.append("high_third");
				}
        		ss.append(",");
        		// process the 4th value
        		if (Double.parseDouble(strs[3])>0&&Double.parseDouble(strs[3])<2) {
					ss.append("one_fourth");
				}else if (Double.parseDouble(strs[3])<0) {
					ss.append("low_fourth");
				}else if (Double.parseDouble(strs[3])>2) {
					ss.append("high_fourth");
				}
        		ss.append(",");
        		// process the 5th value
        		if (Double.parseDouble(strs[4])>1&&Double.parseDouble(strs[4])<3) {
					ss.append("two_fifth");
				}else if (Double.parseDouble(strs[4])<1) {
					ss.append("low_fifth");
				}else if (Double.parseDouble(strs[4])>3) {
					ss.append("high_fifth");
				}
        		writer.write("fifth", ss.toString());
			}
        	recNum++;   // advance the record counter so later lines fall into the next bucket
        }
        Closeables.close(writer, false);
        dis.close();
        in.close();
		return 0;
	}
}
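
For reference, a hypothetical way to launch the job above; the paths are placeholders and the bucket size 2000 only mirrors the unit mentioned in the class comment.

import org.apache.hadoop.util.ToolRunner;

public class RunBayesianPrep {
    public static void main(String[] args) throws Exception {
        // args: <input text file> <output SequenceFile directory> <bucket size>
        ToolRunner.run(new WriteToSequenceFileForBayesian(), new String[] {
                "/data/points.txt",   // placeholder: one 5-value record per line
                "/data/bayes-input",  // placeholder: output directory for the labelled chunks
                "2000"                // every 2000 records form one class
        });
    }
}

The second class below prepares k-means input from the same kind of data: each line of text becomes a VectorWritable point in a SequenceFile.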

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
import org.apache.mahout.clustering.kmeans.Kluster;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.text.SequenceFilesFromDirectory;

public class WriteToSequenceFile {
    public static void main(String args[]) throws Exception {
    	String inputPath=args[0];      // input directory of the plain-text data file
    	String outputpoints=args[1];   // output path of the point data in the SequenceFile
    	String outputclusters=args[2]; // output path of the cluster data in the SequenceFile
        int k = Integer.parseInt(args[3]); // number of cluster centres, k
        List<Vector> vectors = new ArrayList<Vector>();   // collects points for seeding the initial clusters
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path inPath = new Path(inputPath );
        FSDataInputStream dis = fs.open(inPath);
        LineReader in = new LineReader(dis,conf);  
        Text line = new Text();
        SequenceFile.Writer pointwriter = new SequenceFile.Writer(fs, conf, new Path(outputpoints), LongWritable.class, VectorWritable.class);
        // read the input line by line
        long recNum = 0;
        VectorWritable vecWrite = new VectorWritable();
        while(in.readLine(line) > 0){
        	String aline=line.toString();
        	String[] strs=aline.split(" ");
        	double[] fr = new double[5];
        	for (int i = 0; i < strs.length; i++) {
				fr[i]=Double.parseDouble(strs[i]);
			}
            Vector vec = new RandomAccessSparseVector(fr.length);
            vec.assign(fr);
            vecWrite.set(vec);
            pointwriter.append(new LongWritable(recNum++), vecWrite);
            if (vectors.size() < k) {
                vectors.add(vec);   // keep the first k points as candidate initial centres
            }
        }
        pointwriter.close();
        // Assumed completion: seed k-means by writing the first k points as initial Kluster centres.
        SequenceFile.Writer clusterwriter = new SequenceFile.Writer(fs, conf,
                new Path(outputclusters), Text.class, Kluster.class);
        for (int i = 0; i < k; i++) {
            Kluster cluster = new Kluster(vectors.get(i), i, new EuclideanDistanceMeasure());
            clusterwriter.append(new Text(cluster.getIdentifier()), cluster);
        }
        clusterwriter.close();
        in.close();
        dis.close();
    }
}
