數據分區詳解

數據分區的五種常用方式：

1、隨機分區

優點：數據分佈均勻
缺點：無法保證具有相同特點的數據被分配到相同的分區

2、Hash分區

優點：具有相同特點的數據保證被分配到相同的分區
缺點：可能會產生數據傾斜

3、範圍分區

優點：提高查詢速度，相鄰的數據都在相同的分區
缺點：部分分區的數據量會超出其他的分區，需要進行裂變以保持所有分區的數據量是均勻的。如果每個分區不排序，那麼裂變就會非常困難

4、輪詢分區

負載均衡算法的一種
優點：確保一定不會出現數據傾斜
缺點：無法根據存儲/計算能力分配存儲/計算壓力

5、自定義分區

請參考Flink的分區規則：
// Partitioning strategies; the surrounding article attributes this enum to Flink.
public static enum PartitionMethod {
   REBALANCE,       // round-robin partitioning
   HASH,           // hash partitioning
   RANGE,           // range partitioning
   CUSTOM;           // user-defined (custom) partitioning
}

請看MapReduce的自定義分區的Partitioner抽象類的定義：

/**
 * Partitions the key space.
 *
 * <p><code>Partitioner</code> controls the partitioning of the keys of the
 * intermediate map-outputs. The key (or a subset of the key) is used to derive
 * the partition, typically by a hash function. The total number of partitions
 * is the same as the number of reduce tasks for the job. Hence this controls
 * which of the <code>m</code> reduce tasks the intermediate key (and hence the
 * record) is sent to for reduction.</p>
 *
 * Note: If you require your Partitioner class to obtain the Job's configuration
 * object, implement the {@link Configurable} interface.
 *
 * @param <KEY> the type of the intermediate key being partitioned.
 * @param <VALUE> the type of the value that accompanies the key.
 * @see Reducer
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class Partitioner<KEY, VALUE> {
  
  /**
   * Get the partition number for a given key (hence record) given the total
   * number of partitions, i.e. the number of reduce-tasks for the job.
   *
   * <p>Typically a hash function on all or a subset of the key.</p>
   *
   * @param key the key to be partitioned.
   * @param value the entry value.
   * @param numPartitions the total number of partitions.
   * @return the partition number for the <code>key</code>.
   */
  public abstract int getPartition(KEY key, VALUE value, int numPartitions);
  
}

請看Flink的自定義分區的接口Partitioner的定義：

/**
 * Function to implement a custom partition assignment for keys.
 *
 * <p>The interface is annotated as a functional interface and declares a single
 * method, {@code partition}, which maps a key to a partition index.
 *
 * @param <K> The type of the key to be partitioned.
 */
@Public
@FunctionalInterface
public interface Partitioner<K> extends java.io.Serializable, Function {

	/**
	 * Computes the partition for the given key.
	 *
	 * @param key The key.
	 * @param numPartitions The number of partitions to partition into.
	 * @return The partition index.
	 */
	int partition(K key, int numPartitions);
}

有個共同的特點就是：

你把元素交給這個分區器，由這個分區器的某個方法的邏輯來決定這個元素被分發到哪個分區。

6、測試代碼

package com.aura.funny.partition;

import java.util.*;

/**
 * Author: 馬中華   https://blog.csdn.net/zhongqi2513
 * Date:   2019/6/27 14:02
 * Description:
 *      Demo code for data partitioning. A list of strings is distributed over a
 *      fixed number of partitions using hash, random, round-robin, range and
 *      custom strategies, and the resulting partitions are printed.
 */
public class PartitionTest02 {

  /**
   * Runs every partitioning strategy against the same sample data set and
   * prints the content of each resulting partition.
   *
   * @param args unused
   */
  public static void main(String[] args) {

    // Data set to be partitioned; it contains many duplicates of "a" and "b".
    List<String> data = Arrays.asList(
            "a", "b", "c", "d", "e", "f", "g",
            "h", "i", "j", "k", "l", "m", "n",
            "o", "p", "q", "r", "s", "t",
            "u", "v", "w", "x", "y", "z",
            "a", "a", "a", "a",
            "a", "a", "a", "a",
            "a", "a", "a", "a",
            "b", "b", "b", "b");

    // Number of partitions.
    int partitionNumber = 5;

    // Strategy 1: hash partitioning.
    System.out.println("\n---------第一招：Hash散列------------");
    List<List<String>> partitionList1 = partitionData(
            data,
            // Clear the sign bit so a negative hashCode can never produce a negative index.
            (item, numPartitions) -> (item.hashCode() & Integer.MAX_VALUE) % numPartitions,
            partitionNumber);
    printPartitionedData(partitionList1);

    // Strategy 2: random partitioning.
    System.out.println("\n---------第二招：隨機分區------------");
    Random random = new Random();
    List<List<String>> partitionList2 = partitionData(
            data,
            (item, numPartitions) -> random.nextInt(numPartitions),
            partitionNumber);
    printPartitionedData(partitionList2, false);

    // Strategy 3: round-robin partitioning.
    System.out.println("\n---------第三招：輪詢散列------------");
    List<List<String>> partitionList3 = partitionData(data, new Partitioner() {
      // Index of the partition that receives the next item.
      private int next = 0;

      @Override
      public int getPartition(String item, int numPartitions) {
        // The modulo keeps the index valid even if numPartitions differs between calls.
        int partitionIndex = next % numPartitions;
        next = partitionIndex + 1;
        return partitionIndex;
      }
    }, partitionNumber);
    printPartitionedData(partitionList3, false);

    // Strategy 4: range partitioning.
    System.out.println("\n---------第四招：範圍分區------------");
    // The sorted distinct keys define the range boundaries; build them once
    // instead of once per item.
    List<String> sortedDistinct = new ArrayList<>(new TreeSet<>(data));
    List<List<String>> partitionList4 = partitionData(data, (item, numPartitions) -> {
      // "+ 1" guarantees index / step stays below numPartitions.
      int step = sortedDistinct.size() / numPartitions + 1;
      int index = Collections.binarySearch(sortedDistinct, item);
      // A key that is not in the boundary list goes to partition 0.
      return index < 0 ? 0 : index / step;
    }, partitionNumber);
    printPartitionedData(partitionList4);

    // Strategy 5: custom partitioning.
    System.out.println("\n---------第五招：自定義分區------------");
    List<List<String>> partitionList5 = partitionData(
            data,
            // Put any custom logic here to decide which partition receives the item.
            (item, numPartitions) -> 0,
            partitionNumber);
    printPartitionedData(partitionList5, false);

  }

  /**
   * Distributes the items of {@code data} over {@code numPartitions} partitions.
   *
   * @param data          the items to partition
   * @param partitioner   decides the partition index of each item
   * @param numPartitions the number of partitions to create
   * @return one list per partition, in partition-index order; items keep their
   *         encounter order inside each partition
   * @throws IndexOutOfBoundsException if the partitioner returns an index outside
   *         {@code [0, numPartitions)}
   */
  public static List<List<String>> partitionData(List<String> data, Partitioner partitioner, int numPartitions){
    List<List<String>> partitionList = initPartitionContext(numPartitions);
    for (String item : data) {
      // The partitioner, not necessarily a hash, chooses the target partition.
      int partitionNum = partitioner.getPartition(item, numPartitions);
      if (partitionNum < 0 || partitionNum >= numPartitions) {
        throw new IndexOutOfBoundsException(
                "Partition index " + partitionNum + " for item '" + item
                        + "' is outside [0, " + numPartitions + ")");
      }
      partitionList.get(partitionNum).add(item);
    }
    return partitionList;
  }

  /**
   * Creates the container that holds the partitioned data.
   *
   * @param numPartitions the number of partitions
   * @return a list holding {@code numPartitions} empty, mutable lists (an empty
   *         list when {@code numPartitions} is not positive)
   */
  public static List<List<String>> initPartitionContext(int numPartitions){
    List<List<String>> partitionList = new ArrayList<>(Math.max(numPartitions, 0));
    for (int i = 0; i < numPartitions; i++) {
      partitionList.add(new ArrayList<>());
    }
    return partitionList;
  }

  /**
   * Prints every partition on its own line with the items of each partition sorted.
   *
   * @param partitionResult the partitions to print
   */
  public static void printPartitionedData(List<List<String>> partitionResult){
    printPartitionedData(partitionResult, true);
  }

  /**
   * Prints every partition on its own line as a comma-separated list. A null or
   * empty partition is reported with a placeholder message.
   *
   * <p>The given lists are never modified: when sorting is requested, a copy of
   * each partition is sorted for display.
   *
   * @param partitionResult the partitions to print
   * @param sort            whether the items of each partition are printed in sorted order
   */
  public static void printPartitionedData(List<List<String>> partitionResult, boolean sort){
    for (List<String> partition : partitionResult) {
      // Check for null before touching the list.
      if (partition == null || partition.isEmpty()) {
        System.out.println("該分區的數據爲空");
        continue;
      }

      List<String> toPrint = partition;
      if (sort) {
        // Sort a copy so printing has no side effect on the caller's data.
        toPrint = new ArrayList<>(partition);
        Collections.sort(toPrint);
      }
      System.out.println(String.join(",", toPrint));
    }
  }

}

/**
 * Defines the partitioning logic: maps an item to the index of the partition
 * that should receive it.
 */
@FunctionalInterface
interface Partitioner{

  /**
   * Chooses the partition for an item.
   *
   * @param item          the item to place
   * @param numPartitions the total number of partitions
   * @return the partition index; {@code PartitionTest02.partitionData} requires it
   *         to be in {@code [0, numPartitions)}
   */
  int getPartition(String item, int numPartitions);
}

各位把代碼拿下去，直接就可以運行看效果！

7、效果

在這裏，我也給大家貼一份代碼執行的效果

---------第一招：Hash散列------------
d,i,n,s,x
e,j,o,t,y
a,a,a,a,a,a,a,a,a,a,a,a,a,f,k,p,u,z
b,b,b,b,b,g,l,q,v
c,h,m,r,w

---------第二招：隨機分區------------
b,c,f,h,j,o,u,y,a,a,a,b
d,v,a,a
a,e,l,n,r,t,w,x,z,a,a,a,b,b
k,p,q,s,a,b
g,i,m,a,a,a

---------第三招：輪詢散列------------
a,f,k,p,u,z,a,a,b
b,g,l,q,v,a,a,a,b
c,h,m,r,w,a,a,a
d,i,n,s,x,a,a,b
e,j,o,t,y,a,a,b

---------第四招：範圍分區------------
a,a,a,a,a,a,a,a,a,a,a,a,a,b,b,b,b,b,c,d,e,f
g,h,i,j,k,l
m,n,o,p,q,r
s,t,u,v,w,x
y,z

---------第五招：自定義分區------------
a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,a,a,a,a,a,a,a,a,a,a,a,a,b,b,b,b
該分區的數據爲空
該分區的數據爲空
該分區的數據爲空
該分區的數據爲空

數據分區詳解

數據分區詳解

1、隨機分區

2、Hash分區

3、範圍分區

4、輪詢分區

5、自定義分區

6、測試代碼

7、效果

Hive的SQL編譯源碼詳解

Spark的任務提交和執行流程詳解

Hive--筆試題05_2--求TopN

Python全詳解--大綱（全網最清晰學習思路）

四百多篇博客專欄歸類讓你直接晉級老手

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結