Spark核心編程:高級編程之二次排序

1.案例:

1>按照文件中的第一列排序。
2>如果第一列相同,則按照第二列排序。
3>文件部分數據:
(原文此處為示例數據截圖,圖片已缺失)
4>代碼:
Java版:
1)自定義二次排序類:

package cn.spark.study.core;

import java.io.Serializable;

import scala.math.Ordered;

/*
 * 自定義的二次排序
 */
/*
 * Custom secondary-sort key: orders primarily by {@code first}, breaking ties
 * with {@code second}. Implements scala.math.Ordered so Spark's sortByKey can
 * compare keys, and Serializable so keys can be shipped across the cluster.
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable{

    private static final long serialVersionUID = 1L;
    // The columns that participate in the sort, in priority order.
    private int first;
    private int second;


    public SecondarySortKey(int first, int second) {
        super();
        this.first = first;
        this.second = second;
    }

    // All four Scala comparison operators delegate to compareTo so the
    // ordering is defined in exactly one place and cannot drift apart.
    @Override
    public boolean $greater(SecondarySortKey other) {
        return this.compareTo(other) > 0;
    }

    @Override
    public boolean $greater$eq(SecondarySortKey other) {
        return this.compareTo(other) >= 0;
    }

    @Override
    public boolean $less(SecondarySortKey other) {
        return this.compareTo(other) < 0;
    }

    @Override
    public boolean $less$eq(SecondarySortKey other) {
        return this.compareTo(other) <= 0;
    }

    @Override
    public int compare(SecondarySortKey other) {
        return this.compareTo(other);
    }

    @Override
    public int compareTo(SecondarySortKey other) {
        // Integer.compare avoids the overflow bug of "this.first - other.first":
        // subtraction wraps around for operands near Integer.MIN_VALUE/MAX_VALUE
        // and silently inverts the ordering.
        int byFirst = Integer.compare(this.first, other.getFirst());
        if (byFirst != 0) {
            return byFirst;
        }
        return Integer.compare(this.second, other.getSecond());
    }

    // Getters/setters for the sort columns, plus hashCode/equals so the key
    // behaves correctly in shuffles and hash-based collections.
    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + second;
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        SecondarySortKey other = (SecondarySortKey) obj;
        if (first != other.first)
            return false;
        if (second != other.second)
            return false;
        return true;
    }


}

2)案例實現類:

package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;


/*
 * 二次排序
 * 1,實現自定義的key,要實現Ordered接口和Serializable接口,在key中實現自己對多列的排序算法
 * 2.將包含文本的RDD,映射成key爲自定義key,value爲文本的JavaPairRDD
 * 3.使用sortByKey算子按照自定義的key進行排序
 * 4.再次映射,剔除自定義的key,只保留文本行
 */
/*
 * Secondary-sort driver:
 * 1. Implement a custom key (Ordered + Serializable) that encodes the multi-column ordering.
 * 2. Map each text line to a (SecondarySortKey, line) pair.
 * 3. Sort the pairs with sortByKey, which uses the key's ordering.
 * 4. Map back to the plain text line, dropping the key.
 */
public class SecondarySort {
    public static void main(String[] args) {
        // Allow the input path to be passed as the first CLI argument; fall
        // back to the original hard-coded local path for backward compatibility.
        final String inputPath = args.length > 0
                ? args[0]
                : "G://SparkDevel//test//wordCount//data//sort.txt";
        SparkConf conf = new SparkConf()
                .setAppName("SecondarySort")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // try/finally guarantees the context is closed even when the job
        // throws; the original leaked it on failure.
        try {
            JavaRDD<String> lines = sc.textFile(inputPath);
            JavaPairRDD<SecondarySortKey, String> pairs = lines.mapToPair(
                    new PairFunction<String, SecondarySortKey, String>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
                            // Expected format: "<first> <second>" per line.
                            String[] lineSplited = line.split(" ");
                            SecondarySortKey key = new SecondarySortKey(
                                    Integer.valueOf(lineSplited[0]),
                                    Integer.valueOf(lineSplited[1]));
                            return new Tuple2<SecondarySortKey, String>(key, line);
                        }
            });
            JavaPairRDD<SecondarySortKey, String> sortedPairs = pairs.sortByKey();
            JavaRDD<String> sortedLines = sortedPairs.map(new Function<Tuple2<SecondarySortKey,String>, String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public String call(Tuple2<SecondarySortKey, String> v1) throws Exception {
                    // Keep only the original text line; the key has done its job.
                    return v1._2;
                }
            });
            sortedLines.foreach(new VoidFunction<String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public void call(String v1) throws Exception {
                    System.out.println(v1);
                }
            });
        } finally {
            sc.close();
        }
    }
}

Scala版:
1)自定義二次排序類:

package cn.spark.study.core

/**
 * @author Administrator
 */
/**
 * Custom secondary-sort key: orders by `first`, breaking ties with `second`.
 * Extends `Ordered` so Spark's `sortByKey` can compare keys, and
 * `Serializable` so instances can be shipped across the cluster.
 *
 * @param first  primary sort column
 * @param second secondary sort column (tie-breaker)
 */
class SecondSortKey(val first: Int, val second: Int) 
    extends Ordered[SecondSortKey] with Serializable {

  def compare(that: SecondSortKey): Int = {
    // Integer.compare avoids the overflow bug of "this.first - that.first":
    // subtraction wraps for operands near Int.MinValue/MaxValue and silently
    // inverts the ordering.
    val byFirst = Integer.compare(this.first, that.first)
    if (byFirst != 0) byFirst
    else Integer.compare(this.second, that.second)
  }

}

2)案例實現類:

package cn.spark.study.core

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * @author Administrator
 */
/**
 * Secondary-sort driver: pair each line with a composite SecondSortKey,
 * sort by that key, then strip the key and print the ordered lines.
 */
object SecondSort {

  def main(args: Array[String]): Unit = {
    // Input path may be supplied as the first argument; defaults to the
    // original hard-coded path for backward compatibility.
    val inputPath =
      if (args.nonEmpty) args(0)
      else "G://SparkDevel//test//wordCount//data//sort.txt"

    val conf = new SparkConf()
        .setAppName("SecondSort")
        .setMaster("local")
    val sc = new SparkContext(conf)
    // try/finally guarantees the context is stopped even when the job throws;
    // the original never released it.
    try {
      val lines = sc.textFile(inputPath, 1)
      // Split each line once (the original split every line twice) and pair
      // it with a key carrying both sort columns.
      val pairs = lines.map { line =>
        val fields = line.split(" ")
        (new SecondSortKey(fields(0).toInt, fields(1).toInt), line)
      }
      val sortedLines = pairs.sortByKey().map(_._2)
      sortedLines.foreach(println)
    } finally {
      sc.stop()
    }
  }

}

結果:
(原文此處為運行結果截圖,圖片已缺失)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章