MapReduce的TopK統計加排序

《Hadoop技術內幕》中指出,Top K算法分兩步:一是統計詞頻,二是找出詞頻最高的前K個詞。在網上找了很多MapReduce的Top K案例,這些案例都只有排序功能,所以自己寫了個案例。(求出頻率最高的前K個詞原本不需要全排序,這裏想完全使用MapReduce的功能,所以藉助shuffle階段對reduce輸入key的排序來求前K個。不需要排序的案例在《MapReduce TopK TreeMap》中介紹。)

這個案例分兩個步驟:第一步就是WordCount案例(統計詞頻),第二步是根據單詞詞頻排序,獲取前K個詞。

一,統計詞頻

package TopK;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * First stage of the Top-K pipeline: a word-count job that writes one
 * {@code word<TAB>count} line per distinct word.
 *
 * @author zx
 */
public class WordCount {
	
	/**
	 * Splits every input line on whitespace, strips double quotes, apostrophes
	 * and periods from each token, and emits {@code (word, 1)}.
	 *
	 * @author zx
	 */
	public static class Map extends Mapper<Object,Text,Text,IntWritable>{

		// Constant "one occurrence" value emitted for every token.
		IntWritable count = new IntWritable(1);

		// Reused output key; context.write serializes it immediately, so reuse is safe.
		Text word = new Text();
		
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			StringTokenizer st = new StringTokenizer(value.toString());
			while(st.hasMoreTokens()){
				// Literal (non-regex) replacements; same characters stripped as before.
				String token = st.nextToken().replace("\"", "").replace("'", "").replace(".", "");
				// A token made only of stripped punctuation would otherwise be
				// counted as an empty "word" and corrupt the next stage's input.
				if(token.isEmpty()){
					continue;
				}
				word.set(token);
				context.write(word, count);
			}
		}
		
	}
	
	/**
	 * Adds up the partial counts of one word and emits {@code (word, total)}.
	 * Because it sums the values (instead of counting how many values arrive)
	 * it is also correct when used as a combiner.
	 *
	 * @author zx
	 */
	public static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable>{

		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable partial : values) {
				sum += partial.get();
			}
			context.write(key,new IntWritable(sum));
		}
		
	}
	
	/**
	 * Configures and runs the word-count job, blocking until it finishes.
	 *
	 * @param in  input path of the text to count
	 * @param out output directory for the {@code word<TAB>count} result
	 * @return {@code true} if the job completed successfully
	 * @throws IOException            if the job cannot be set up or submitted
	 * @throws ClassNotFoundException if a job class cannot be loaded
	 * @throws InterruptedException   if waiting for the job is interrupted
	 */
	@SuppressWarnings("deprecation")
	public static boolean run(String in,String out) throws IOException, ClassNotFoundException, InterruptedException{
		
		Configuration conf = new Configuration();
		
		Job job = new Job(conf,"WordCount");
		job.setJarByClass(WordCount.class);
		job.setMapperClass(Map.class);
		// Summing is associative, so the reducer can pre-aggregate on the map side.
		job.setCombinerClass(Reduce.class);
		job.setReducerClass(Reduce.class);
		
		// Map output types
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);

		// Reduce output types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		// Input and output locations
		FileInputFormat.addInputPath(job, new Path(in));
		FileOutputFormat.setOutputPath(job, new Path(out));
		
		return job.waitForCompletion(true);
	}
	
}


二,排序 並求出頻率最高的前K個詞

package TopK;

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Second stage of the Top-K pipeline: re-keys the word-count output by
 * frequency, lets the shuffle sort the keys in descending order, writes the
 * fully sorted list, and stores the first K words in a separate file.
 *
 * @author zx
 */
public class Sort {

	/**
	 * Parses one {@code word<whitespace>count} line and emits
	 * {@code (count, word)} so that the framework sorts by frequency.
	 *
	 * @author zx
	 */
	public static class Map extends Mapper<Object, Text, IntWritable, Text> {

		// Output key: the word frequency.
		IntWritable outKey = new IntWritable();
		// Output value: the word itself.
		Text outValue = new Text();

		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {

			String line = value.toString().trim();

			// The count is always the LAST field. Deciding by "looks like a number"
			// would mistake a numeric word such as "1963" for the frequency.
			int sep = Math.max(line.lastIndexOf('\t'), line.lastIndexOf(' '));
			if (sep < 0) {
				// No separator: not a word/count record (e.g. an empty word). Skip it
				// instead of emitting a value left over from the previous record.
				context.getCounter("Sort", "MALFORMED_LINES").increment(1);
				return;
			}

			String word = line.substring(0, sep).trim();
			String freq = line.substring(sep + 1);
			try {
				outKey.set(Integer.parseInt(freq));
			} catch (NumberFormatException e) {
				context.getCounter("Sort", "MALFORMED_LINES").increment(1);
				return;
			}
			outValue.set(word);

			context.write(outKey, outValue);
		}

	}

	/**
	 * Pass-through combiner: re-emits every {@code (count, word)} pair unchanged.
	 * It does not aggregate anything; it only shows where a combiner plugs in.
	 */
	public static class Combine extends Reducer<IntWritable, Text, IntWritable,Text>{

		@Override
		protected void reduce(IntWritable freq, Iterable<Text> words,Context context)
				throws IOException, InterruptedException {
			for (Text word : words) {
				context.write(freq ,word);
			}
		}
		
	}
	
	/**
	 * Writes {@code (word, count)} in the order the keys arrive and remembers
	 * the first K words seen, which are written to the "topKout" file in
	 * {@link #cleanup}.
	 * <p>
	 * The remembered words are per reducer task, so the result is the global
	 * top K only when the job runs a single reducer (see {@link Sort#run}).
	 *
	 * @author zx
	 */
	public static class Reduce extends
			Reducer<IntWritable, Text, Text, IntWritable> {
		
		// First K words seen by this reducer; only indices [0, count) are filled.
		String[] topK = null;
		int count = 0;
		
		@Override
		protected void setup(Context context) throws IOException,
				InterruptedException {
			Configuration conf = context.getConfiguration();
			// A missing or negative "k" yields an empty top-K list, not a crash.
			topK= new String[Math.max(0, conf.getInt("k", 0))];
		}

		
		/*
		 * The job installs a descending key comparator (Comp1), so keys arrive
		 * from highest to lowest frequency and the first K values are the top K.
		 */
		@Override
		protected void reduce(IntWritable key, Iterable<Text> values,
				Context context) throws IOException, InterruptedException {
			for (Text text : values) {
				context.write(text, key);
				// Bound by the configured K, not by a hard-coded 10.
				if(count < topK.length){
					topK[count++] = text.toString();
				}
			}
		}

		@Override
		protected void cleanup(Context context) throws IOException, InterruptedException {
			Configuration conf = context.getConfiguration();
			String topKout = conf.get("topKout");
			Path topKoutPath = new Path(topKout);
			FileSystem fs = topKoutPath.getFileSystem(conf);
			FSDataOutputStream fsDOS = fs.create(topKoutPath, true);
			try {
				// Only the filled slots: there may be fewer than K distinct words.
				for (int i = 0; i < count; i++) {
					// Write the whole encoded byte array; the char length differs
					// from the byte length for non-ASCII words.
					fsDOS.write(topK[i].getBytes("UTF-8"));
					// One word per line (a bare "\r" is not a line break on HDFS/Unix tools).
					fsDOS.write('\n');
				}
				fsDOS.flush();
			} finally {
				fsDOS.close();
			}
		}
		
		
	}

	/**
	 * Key comparator that inverts the natural {@link IntWritable} order so the
	 * shuffle delivers the highest frequency first.
	 */
	public static class Comp1 extends IntWritable.Comparator {

		@Override
		public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
			return -super.compare(b1, s1, l1, b2, s2, l2);
		}
		
	}
	
	/**
	 * Configures and runs the sort job, then terminates the JVM with exit
	 * code 0 on success or 1 on failure.
	 *
	 * @param in      directory holding the word-count output
	 * @param out     output directory for the list sorted by frequency
	 * @param topKout file that receives the K most frequent words, one per line
	 * @param k       number of words to keep
	 * @throws IOException            if the job cannot be set up or submitted
	 * @throws ClassNotFoundException if a job class cannot be loaded
	 * @throws InterruptedException   if waiting for the job is interrupted
	 */
	@SuppressWarnings("deprecation")
	public static void run(String in, String out,String topKout,int k) throws IOException,
			ClassNotFoundException, InterruptedException {

		Path outPath = new Path(out);

		Configuration conf = new Configuration();
		
		// Where the top-K words go, and how many to keep; read back by Reduce.
		conf.set("topKout",topKout);
		conf.set("k",k+"");
		
		Job job = new Job(conf, "Sort");
		
		job.setJarByClass(Sort.class);
		job.setMapperClass(Map.class);
		job.setCombinerClass(Combine.class);
		job.setReducerClass(Reduce.class);

		// Keys are only sorted within one reducer, and every reducer writes the
		// same topKout file, so a global top K requires exactly one reducer.
		job.setNumReduceTasks(1);

		// Map output types
		job.setMapOutputKeyClass(IntWritable.class);
		job.setMapOutputValueClass(Text.class);

		// Reduce output types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		// Sort keys from highest to lowest frequency.
		job.setSortComparatorClass(Comp1.class);
		
		// Input and output locations
		FileInputFormat.addInputPath(job, new Path(in));
		FileOutputFormat.setOutputPath(job, outPath);
		
		System.exit(job.waitForCompletion(true)?0:1);

	}

}

package TopK;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Helpers for staging the job's input file on the target file system.
 *
 * @author zx
 */
public class FileUtil {

	/**
	 * Copies a data file that sits next to the compiled classes (or jar) to the
	 * parent directory of {@code inputPath} on the destination file system,
	 * replacing any existing file of the same name.
	 *
	 * @param inputPath a path whose parent directory receives the file
	 * @param folder    sub-folder, relative to the code location, that contains
	 *                  the file; {@code null} or empty means no sub-folder
	 * @param fileName  name of the file to upload
	 * @return the destination path of the uploaded file
	 * @throws IOException if the delete or the copy fails
	 */
	public static String loadFile(String inputPath,String folder,String fileName) throws IOException{
		
		// A null folder must not end up in the path as the literal text "null".
		String folderPrefix = (null == folder || "".equals(folder)) ? "" : folder + "/";
		
		// Full local path of the data file, resolved from where this class was loaded.
		String srcPathDir = FileUtil.class.getProtectionDomain().getCodeSource().getLocation()
				.getFile() + folderPrefix + fileName;
		
		Path srcpath = new Path("file:///" + srcPathDir);
		
		String destination = getJobRootPath(inputPath) + fileName;
		Path dstPath = new Path(destination);
		
		Configuration conf = new Configuration();
		
		// Deliberately not closed: getFileSystem returns a cached instance that
		// other code in this JVM may share; Hadoop closes it at shutdown.
		FileSystem fs = dstPath.getFileSystem(conf);
		
		fs.delete(dstPath, true);
		
		fs.copyFromLocalFile(srcpath, dstPath);
		
		return destination;
	}
	
	/**
	 * Returns the parent directory of {@code path}, including its trailing
	 * "/". One trailing "/" on {@code path} is ignored first, so
	 * {@code "/a/b"} and {@code "/a/b/"} both give {@code "/a/"}. A path with
	 * no "/" (including the empty string) gives {@code ""}.
	 *
	 * @param path the path to take the parent of
	 * @return the parent directory with a trailing "/", or {@code ""}
	 */
	public static String getJobRootPath(String path){
		// endsWith (rather than comparing indices) also copes with an empty path.
		String trimmed = path.endsWith("/") ? path.substring(0, path.length() - 1) : path;
		return trimmed.substring(0, trimmed.lastIndexOf("/") + 1);
	}
	
	
}

package TopK;

import java.io.IOException;

/**
 * Entry point of the Top-K pipeline: uploads the input file, runs the
 * word-count job and, if it succeeds, the sort / top-K job.
 *
 * @author zx
 */
public class TopK {
	/**
	 * Runs the pipeline.
	 *
	 * @param args five arguments: 1) name of the text file to count,
	 *             2) word-count output directory, 3) sorted output directory,
	 *             4) path of the top-K result file, 5) k
	 * @throws IllegalArgumentException if fewer than five arguments are given
	 *                                  or k is not a positive integer
	 */
	public static void main(String args[]) throws ClassNotFoundException, IOException, InterruptedException{
		
		if(args.length < 5){
			throw new IllegalArgumentException("要有5個參數:1,要統計的文本文件。2,統計後的結果。3,排序後的結果。4,前k個詞存放的結果路徑。5,k");
		}
		
		// Name of the text file whose words are counted
		String in = args[0];
		
		// Output directory of the word-count job
		String wordCout = args[1];
		
		// Output directory of the sort job
		String sort = args[2];
		
		// File that receives the top K words
		String topK = args[3];
		
		// Validate k before any file is uploaded or job is started.
		int k = Integer.parseInt(args[4]);
		if(k <= 0){
			throw new IllegalArgumentException("k必須是正整數:" + k);
		}

		in = FileUtil.loadFile(wordCout, "TopK", in);
		
		// Sort only after the word-count job has succeeded
		if(WordCount.run(in, wordCout)){
			Sort.run(wordCout, sort, topK, k);
		} else {
			// Report the failed first stage instead of exiting with status 0.
			System.exit(1);
		}
		
	}
}

數據

I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation.
Five score years ago, a great American, in whose symbolic shadow we stand today, signed the Emancipation Proclamation. This momentous decree came as a great beacon light of hope to millions of Negro slaves who had been seared in the flames of withering injustice. It came as a joyous daybreak to end the long night of their captivity.
But one hundred years later, the Negro still is not free. One hundred years later, the life of the Negro is still sadly crippled by the manacles of segregation and the chains of discrimination. One hundred years later, the Negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity. One hundred years later, the Negro is still languished in the corners of American society and finds himself an exile in his own land. And so we've come here today to dramatize a shameful condition.
In a sense we've come to our nation's capital to cash a check. When the architects of our republic wrote the magnificent words of the Constitution and the Declaration of Independence, they were signing a promissory note to which every American was to fall heir. This note was a promise that all men, yes, black men as well as white men, would be guaranteed the "unalienable Rights" of "Life, Liberty and the pursuit of Happiness." It is obvious today that America has defaulted on this promissory note, insofar as her citizens of color are concerned. Instead of honoring this sacred obligation, America has given the Negro people a bad check, a check which has come back marked "insufficient funds."
But we refuse to believe that the bank of justice is bankrupt. We refuse to believe that there are insufficient funds in the great vaults of opportunity of this nation. And so, we've come to cash this check, a check that will give us upon demand the riches of freedom and the security of justice.
We have also come to this hallowed spot to remind America of the fierce urgency of Now. This is no time to engage in the luxury of cooling off or to take the tranquilizing drug of gradualism. Now is the time to make real the promises of democracy. Now is the time to rise from the dark and desolate valley of segregation to the sunlit path of racial justice. Now is the time to lift our nation from the quicksands of racial injustice to the solid rock of brotherhood. Now is the time to make justice a reality for all of God's children.
It would be fatal for the nation to overlook the urgency of the moment. This sweltering summer of the Negro's legitimate discontent will not pass until there is an invigorating autumn of freedom and equality. Nineteen sixty-three is not an end, but a beginning. And those who hope that the Negro needed to blow off steam and will now be content will have a rude awakening if the nation returns to business as usual. And there will be neither rest nor tranquility in America until the Negro is granted his citizenship rights. The whirlwinds of revolt will continue to shake the foundations of our nation until the bright day of justice emerges.
But there is something that I must say to my people, who stand on the warm threshold which leads into the palace of justice: In the process of gaining our rightful place, we must not be guilty of wrongful deeds. Let us not seek to satisfy our thirst for freedom by drinking from the cup of bitterness and hatred. We must forever conduct our struggle on the high plane of dignity and discipline. We must not allow our creative protest to degenerate into physical violence. Again and again, we must rise to the majestic heights of meeting physical force with soul force.
The marvelous new militancy which has engulfed the Negro community must not lead us to a distrust of all white people, for many of our white brothers, as evidenced by their presence here today, have come to realize that their destiny is tied up with our destiny. And they have come to realize that their freedom is inextricably bound to our freedom.
We cannot walk alone.
And as we walk, we must make the pledge that we shall always march ahead.
We cannot turn back.
There are those who are asking the devotees of civil rights, "When will you be satisfied?" We can never be satisfied as long as the Negro is the victim of the unspeakable horrors of police brutality. We can never be satisfied as long as our bodies, heavy with the fatigue of travel, cannot gain lodging in the motels of the highways and the hotels of the cities. We cannot be satisfied as long as the Negro's basic mobility is from a smaller ghetto to a larger one. We can never be satisfied as long as our children are stripped of their selfhood and robbed of their dignity by signs stating "for whites only." We cannot be satisfied as long as a Negro in Mississippi cannot vote and a Negro in New York believes he has nothing for which to vote. No, no, we are not satisfied, and we will not be satisfied until "justice rolls down like waters, and righteousness like a mighty stream."
I am not unmindful that some of you have come here out of great trials and tribulations. Some of you have come fresh from narrow jail cells. And some of you have come from areas where your quest -- quest for freedom left you battered by the storms of persecution and staggered by the winds of police brutality. You have been the veterans of creative suffering. Continue to work with the faith that unearned suffering is redemptive. Go back to Mississippi, go back to Alabama, go back to South Carolina, go back to Georgia, go back to Louisiana, go back to the slums and ghettos of our northern cities, knowing that somehow this situation can and will be changed.
Let us not wallow in the valley of despair, I say to you today, my friends.
And so even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream.
I have a dream that one day this nation will rise up and live out the true meaning of its creed: "We hold these truths to be self-evident, that all men are created equal."
I have a dream that one day on the red hills of Georgia, the sons of former slaves and the sons of former slave owners will be able to sit down together at the table of brotherhood.
I have a dream that one day even the state of Mississippi, a state sweltering with the heat of injustice, sweltering with the heat of oppression, will be transformed into an oasis of freedom and justice.
I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character.
I have a dream today!
I have a dream that one day, down in Alabama, with its vicious racists, with its governor having his lips dripping with the words of "interposition" and "nullification" -- one day right there in Alabama little black boys and black girls will be able to join hands with little white boys and white girls as sisters and brothers.
I have a dream today!
I have a dream that one day every valley shall be exalted, and every hill and mountain shall be made low, the rough places will be made plain, and the crooked places will be made straight; "and the glory of the Lord shall be revealed and all flesh shall see it together."?
This is our hope, and this is the faith that I go back to the South with.
With this faith, we will be able to hew out of the mountain of despair a stone of hope. With this faith, we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith, we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day.
And this will be the day -- this will be the day when all of God's children will be able to sing with new meaning:
My country 'tis of thee, sweet land of liberty, of thee I sing.
Land where my fathers died, land of the Pilgrim's pride,
From every mountainside, let freedom ring!
And if America is to be a great nation, this must become true.
And so let freedom ring from the prodigious hilltops of New Hampshire.
Let freedom ring from the mighty mountains of New York.
Let freedom ring from the heightening Alleghenies of
Pennsylvania.
Let freedom ring from the snow-capped Rockies of Colorado.
Let freedom ring from the curvaceous slopes of California.
But not only that:
Let freedom ring from Stone Mountain of Georgia.
Let freedom ring from Lookout Mountain of Tennessee.
Let freedom ring from every hill and molehill of Mississippi.
From every mountainside, let freedom ring.
zhang xian
And when this happens, when we allow freedom ring, when we let it ring from every village and every hamlet, from every state and every city, we will be able to speed up that day when all of God's children, black men and white men, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the old Negro spiritual:
Free at last! Free at last!
Thank God Almighty, we are free at last!

結果


發佈了29 篇原創文章 · 獲贊 3 · 訪問量 3萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章