I recently came across a K-means implementation on MapReduce: http://blog.csdn.net/jshayzf/article/details/22739063
The example itself is decent, but it has too few comments and too many parameters, which makes it hard for newcomers to follow. So I wrote a simpler version based on my own understanding and added detailed comments.
The overall procedure is:
1. For each record it reads, the Map task compares the record against every center, finds the closest one, and emits the record with that center's ID as the key and the record itself as the value.
2. The Reduce task groups the records that share a key (i.e., that were assigned to the same center), averages them element-wise, and outputs the averages as the new centers.
3. Compare the averages produced by reduce with the old centers. If they differ, clear the old centers file and write the reduce results into it (the centers are stored in a file on HDFS), delete the reduce output directory so the next iteration can write to it again, and rerun the job.
4. If the averages match the old centers, delete the reduce output directory and run one final pass without the averaging reducer, which outputs each center ID paired with its records.
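Before the code, a note on the file layout it assumes (inferred from Utils.textToArray and Utils.getCentersFromHDFS below; the concrete numbers are invented for illustration): the data file holds one comma-separated numeric record per line, e.g. 13.2,1.78,2.14, and the centers file holds k lines in the same format, one initial center per line.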
package MyKmeans;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MapReduce {
public static class Map extends Mapper<LongWritable, Text, IntWritable, Text>{
//the set of cluster centers
ArrayList<ArrayList<Double>> centers = null;
//k, the number of centers
int k = 0;
//load the centers from HDFS before any map() calls
protected void setup(Context context) throws IOException,
InterruptedException {
centers = Utils.getCentersFromHDFS(context.getConfiguration().get("centersPath"),false);
k = centers.size();
}
/**
* 1. Compare each record read against every center and assign it to the closest one
* 2. Emit the center's ID as the key and the record as the value
* (e.g. key 1, value 0.2: 1 is the cluster center's ID, 0.2 is a record assigned to it)
*/
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
//parse the current line into a list of doubles
ArrayList<Double> fileds = Utils.textToArray(value);
int sizeOfFileds = fileds.size();
double minDistance = Double.MAX_VALUE;
int centerIndex = 0;
//compute the distance from this record to each of the k centers
for(int i=0;i<k;i++){
double currentDistance = 0;
for(int j=0;j<sizeOfFileds;j++){
double centerPoint = Math.abs(centers.get(i).get(j));
double filed = Math.abs(fileds.get(j));
currentDistance += Math.pow((centerPoint - filed) / (centerPoint + filed), 2);
}
//keep track of the closest center seen so far
if(currentDistance<minDistance){
minDistance = currentDistance;
centerIndex = i;
}
}
//emit the record unchanged, keyed by the 1-based index of its closest center
context.write(new IntWritable(centerIndex+1), value);
}
}
//reduce groups together all records assigned to the same center
public static class Reduce extends Reducer<IntWritable, Text, NullWritable, Text>{
/**
* 1. The key is a cluster center's ID; the values are the records assigned to that center
* 2. Average the records element-wise to obtain the new center
*/
protected void reduce(IntWritable key, Iterable<Text> value,Context context)
throws IOException, InterruptedException {
ArrayList<ArrayList<Double>> filedsList = new ArrayList<ArrayList<Double>>();
//collect the grouped records, one ArrayList<Double> per line
for(Iterator<Text> it =value.iterator();it.hasNext();){
ArrayList<Double> tempList = Utils.textToArray(it.next());
filedsList.add(tempList);
}
//compute the new center
//number of elements in each record
int filedSize = filedsList.get(0).size();
double[] avg = new double[filedSize];
for(int i=0;i<filedSize;i++){
//average column i across all records
double sum = 0;
int size = filedsList.size();
for(int j=0;j<size;j++){
sum += filedsList.get(j).get(i);
}
avg[i] = sum / size;
}
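//Arrays.toString renders the vector as "a, b, c"; stripping the brackets
//leaves ", "-separated values, which Utils.textToArray can still parse
//because Double.parseDouble ignores leading whitespace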
context.write(NullWritable.get() , new Text(Arrays.toString(avg).replace("[", "").replace("]", "")));
}
}
@SuppressWarnings("deprecation")
public static void run(String centerPath,String dataPath,String newCenterPath,boolean runReduce) throws IOException, ClassNotFoundException, InterruptedException{
Configuration conf = new Configuration();
conf.set("centersPath", centerPath);
Job job = new Job(conf, "mykmeans");
job.setJarByClass(MapReduce.class);
job.setMapperClass(Map.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
if(runReduce){
//the final pass only writes out the assignments, so the averaging reducer is not needed
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
}
FileInputFormat.addInputPath(job, new Path(dataPath));
FileOutputFormat.setOutputPath(job, new Path(newCenterPath));
System.out.println(job.waitForCompletion(true));
}
public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
if(args.length < 3){
throw new IllegalArgumentException("Three arguments are required: the centers file name, the source data file name, and the output directory");
}
String centerPath = args[0];
String dataPath = args[1];
String newCenterPath = args[2];
centerPath = FileUtil.loadFile(newCenterPath, "MyKmeans", centerPath);
dataPath = FileUtil.loadFile(newCenterPath, "MyKmeans", dataPath);
FileUtil.deleteFile(newCenterPath);
int count = 0;
while(true){
run(centerPath,dataPath,newCenterPath,true);
System.out.println("Iteration " + ++count);
if(Utils.compareCenters(centerPath,newCenterPath )){
run(centerPath,dataPath,newCenterPath,false);
break;
}
}
}
}
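A note on the metric before moving on to the helper classes: the mapper does not use plain Euclidean distance. For each dimension it accumulates ((|c| - |x|) / (|c| + |x|))^2, a squared relative difference reminiscent of the Canberra distance, so dimensions with large absolute values do not dominate. A minimal standalone sketch of the same computation (the sample values are invented for illustration):
public class DistanceDemo {
    public static void main(String[] args) {
        double[] center = {14.0, 2.0}; //hypothetical center
        double[] record = {13.5, 2.5}; //hypothetical record
        double distance = 0;
        for (int j = 0; j < record.length; j++) {
            double c = Math.abs(center[j]);
            double x = Math.abs(record[j]);
            //per-dimension squared relative difference; each term lies in [0, 1]
            distance += Math.pow((c - x) / (c + x), 2);
        }
        System.out.println(distance); //smaller means closer to this center
    }
}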
package MyKmeans;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;
public class Utils {
//read the centers from a file (or from every file in a directory) on HDFS
public static ArrayList<ArrayList<Double>> getCentersFromHDFS(String centersPath,boolean isDirectory) throws IOException{
ArrayList<ArrayList<Double>> result = new ArrayList<ArrayList<Double>>();
Path path = new Path(centersPath);
Configuration conf = new Configuration();
FileSystem fileSystem = path.getFileSystem(conf);
if(isDirectory){
FileStatus[] listFile = fileSystem.listStatus(path);
for (int i = 0; i < listFile.length; i++) {
result.addAll(getCentersFromHDFS(listFile[i].getPath().toString(),false));
}
return result;
}
FSDataInputStream fsis = fileSystem.open(path);
LineReader lineReader = new LineReader(fsis, conf);
Text line = new Text();
while(lineReader.readLine(line) > 0){
ArrayList<Double> tempList = textToArray(line);
result.add(tempList);
}
lineReader.close();
return result;
}
//delete a path on HDFS, recursively
public static void deletePath(String pathStr) throws IOException{
Configuration conf = new Configuration();
Path path = new Path(pathStr);
FileSystem hdfs = path.getFileSystem(conf);
hdfs.delete(path ,true);
}
public static ArrayList<Double> textToArray(Text text){
ArrayList<Double> list = new ArrayList<Double>();
String[] fileds = text.toString().split(",");
for(int i=0;i<fileds.length;i++){
list.add(Double.parseDouble(fileds[i]));
}
return list;
}
public static boolean compareCenters(String centerPath,String newPath) throws IOException{
List<ArrayList<Double>> oldCenters = Utils.getCentersFromHDFS(centerPath,false);
List<ArrayList<Double>> newCenters = Utils.getCentersFromHDFS(newPath,true);
int size = oldCenters.size();
int fildSize = oldCenters.get(0).size();
double distance = 0;
for(int i=0;i<size;i++){
for(int j=0;j<fildSize;j++){
double t1 = Math.abs(oldCenters.get(i).get(j));
double t2 = Math.abs(newCenters.get(i).get(j));
distance += Math.pow((t1 - t2) / (t1 + t2), 2);
}
}
if(distance == 0.0){
//delete the new-centers directory so the final assignment pass can write its output there
Utils.deletePath(newPath);
return true;
}else{
//overwrite the old centers file with the new centers, then delete the new-centers directory
Configuration conf = new Configuration();
Path outPath = new Path(centerPath);
FileSystem fileSystem = outPath.getFileSystem(conf);
//create (truncate) the centers file once, then concatenate every reduce part file into it
FSDataOutputStream out = fileSystem.create(outPath, true);
Path inPath = new Path(newPath);
FileStatus[] listFiles = inPath.getFileSystem(conf).listStatus(inPath);
for (int i = 0; i < listFiles.length; i++) {
if (listFiles[i].getPath().getName().contains("_SUCCESS")){
continue;
}
FSDataInputStream in = fileSystem.open(listFiles[i].getPath());
//copy this part file, keeping the output stream open for the next part
IOUtils.copyBytes(in, out, 4096, false);
in.close();
}
out.close();
//delete the new-centers directory so the next iteration can write its output there
Utils.deletePath(newPath);
}
return false;
}
}
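One caveat on the convergence test above: compareCenters only stops when the accumulated distance is exactly 0.0, i.e. when the centers are bit-identical across iterations. A common, more forgiving variant (an assumption on my part, not something from the original post) is to stop once the centers move less than a small tolerance:
public class Convergence {
    //assumed threshold; tune it for the dataset at hand
    static final double EPSILON = 1e-10;
    //hypothetical replacement for compareCenters' exact "distance == 0.0" test
    static boolean converged(double distance) {
        return distance < EPSILON;
    }
}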
package MyKmeans;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
*
* @author zx
*
*/
public class FileUtil {
/**
* Upload a local data file to HDFS
* @param inputPath the job output path; its parent directory on HDFS is the upload target
* @param folder the local folder (relative to the class files) that contains the file
* @param fileName the name of the file to upload
* @return the full HDFS path of the uploaded file
* @throws IOException
*/
public static String loadFile(String inputPath,String folder,String fileName) throws IOException{
//build the full local path of the data file
if(null != folder && !"".equals(folder)){
folder = folder + "/";
}
String srcPathDir = FileUtil.class.getProtectionDomain().getCodeSource().getLocation()
.getFile() + folder + fileName;
Path srcpath = new Path("file:///" + srcPathDir);
Path dstPath = new Path(getJobRootPath(inputPath) + fileName);
Configuration conf = new Configuration();
FileSystem fs = dstPath.getFileSystem(conf);
fs.delete(dstPath, true);
fs.copyFromLocalFile(srcpath, dstPath);
fs.close();
return getJobRootPath(inputPath) + fileName;
}
/**
* Return the parent directory of the given path, with a trailing "/"
* (e.g. "hdfs://host/user/out" yields "hdfs://host/user/")
* @param path
* @return
*/
public static String getJobRootPath(String path){
if(path.lastIndexOf("/") == path.length()-1){
path = path.substring(0, path.lastIndexOf("/"));
}
return path.substring(0, path.lastIndexOf("/")+1);
}
public static void deleteFile(String ...filePath) throws IOException{
Configuration conf = new Configuration();
for (int i = 0; i < filePath.length; i++) {
Path path = new Path(filePath[i]);
FileSystem fs = path.getFileSystem(conf);
fs.delete(path,true);
}
}
}
Dataset: http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
The results can be compared against those at http://blog.csdn.net/jshayzf/article/details/22739063 (provided the same initial centers are used).
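For completeness, a hypothetical way to launch the driver (the jar name and HDFS path are assumptions, not from the original post); the three arguments are the centers file name, the source data file name, and the output directory:
hadoop jar mykmeans.jar MyKmeans.MapReduce centers.txt wine.data /user/hadoop/mykmeans/output
FileUtil.loadFile then copies the two local files (looked up in a MyKmeans folder next to the class files) into the parent directory of the output path on HDFS before the iterations begin.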