數據分區詳解
數據分區的五種常用方式:
1、隨機分區
優點:數據分佈均勻
缺點:具有相同特點的數據不會保證被分配到相同的分區
2、Hash分區
優點:具有相同特點的數據保證被分配到相同的分區
缺點:可能會產生數據傾斜
3、範圍分區
優點:提高查詢速度,相鄰的數據都在相同的分區
缺點:部分分區的數據量會超出其他的分區,需要進行裂變以保持所有分區的數據量是均勻的。如果每個分區不排序,那麼裂變就會非常困難
4、輪詢分區
負載均衡算法的一種
優點:確保一定不會出現數據傾斜
缺點:無法根據存儲/計算能力分配存儲/計算壓力
5、自定義分區
請參考Flink的分區規則:
// Partitioning methods, as listed in the Flink excerpt quoted by this article.
public static enum PartitionMethod {
REBALANCE, // round-robin partitioning
HASH, // hash partitioning
RANGE, // range partitioning
CUSTOM; // custom (user-defined) partitioning
}
請看MapReduce的自定義分區的Partitioner接口的定義
/**
* Partitions the key space.
*
* <p><code>Partitioner</code> controls the partitioning of the keys of the
* intermediate map-outputs. The key (or a subset of the key) is used to derive
* the partition, typically by a hash function. The total number of partitions
* is the same as the number of reduce tasks for the job. Hence this controls
* which of the <code>m</code> reduce tasks the intermediate key (and hence the
* record) is sent for reduction.</p>
*
* Note: If you require your Partitioner class to obtain the Job's configuration
* object, implement the {@link Configurable} interface.
*
* @param <KEY> the type of the key handed to {@link #getPartition}.
* @param <VALUE> the type of the value handed to {@link #getPartition}.
* @see Reducer
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class Partitioner<KEY, VALUE> {
/**
* Get the partition number for a given key (hence record) given the total
* number of partitions i.e. number of reduce-tasks for the job.
*
* <p>Typically a hash function on all or a subset of the key.</p>
*
* @param key the key to be partitioned.
* @param value the entry value.
* @param numPartitions the total number of partitions.
* @return the partition number for the <code>key</code>.
*/
public abstract int getPartition(KEY key, VALUE value, int numPartitions);
}
請看Flink的自定義分區的接口Partitioner的定義:
/**
* Function to implement a custom partition assignment for keys.
*
* <p>The interface declares a single abstract method, {@link #partition}, and
* is marked as a functional interface.</p>
*
* @param <K> The type of the key to be partitioned.
*/
@Public
@FunctionalInterface
public interface Partitioner<K> extends java.io.Serializable, Function {
/**
* Computes the partition for the given key.
*
* <p>NOTE(review): the result is presumably expected to lie in
* {@code [0, numPartitions)} - confirm against the Flink documentation.</p>
*
* @param key The key.
* @param numPartitions The number of partitions to partition into.
* @return The partition index.
*/
int partition(K key, int numPartitions);
}
有個共同的特點就是:
你把元素交給這個分區器,這個分區器的一個方法邏輯來決定這個元素被分發到哪個分區。
6、測試代碼
package com.aura.funny.partition;
import java.util.*;
/**
 * Demonstrates five ways of distributing a data set over a fixed number of
 * partitions: hash, random, round-robin, range and custom partitioning.
 *
 * <p>Original author: 馬中華 https://blog.csdn.net/zhongqi2513 (2019/6/27 14:02).</p>
 */
public class PartitionTest02 {

    /**
     * Runs every partitioning strategy against the same sample data and prints
     * the resulting partitions to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Data set to partition: the letters a-z once, then twelve extra "a" and four extra "b".
        final List<String> data = Arrays.asList(
                "a", "b", "c", "d", "e", "f", "g",
                "h", "i", "j", "k", "l", "m", "n",
                "o", "p", "q", "r", "s", "t",
                "u", "v", "w", "x", "y", "z",
                "a", "a", "a", "a",
                "a", "a", "a", "a",
                "a", "a", "a", "a",
                "b", "b", "b", "b");

        // Number of partitions.
        final int partitionNumber = 5;

        // Strategy 1: hash partitioning.
        System.out.println("\n---------第一招:Hash散列------------");
        List<List<String>> partitionList1 = partitionData(data, new Partitioner() {
            @Override
            public int getPartition(String item, int numPartitions) {
                // Mask off the sign bit so a negative hash code cannot yield a negative index.
                return (item.hashCode() & Integer.MAX_VALUE) % numPartitions;
            }
        }, partitionNumber);
        printPartitionedData(partitionList1);

        // Strategy 2: random partitioning.
        System.out.println("\n---------第二招:隨機分區------------");
        List<List<String>> partitionList2 = partitionData(data, new Partitioner() {
            private final Random random = new Random();

            @Override
            public int getPartition(String item, int numPartitions) {
                return random.nextInt(numPartitions);
            }
        }, partitionNumber);
        printPartitionedData(partitionList2, false);

        // Strategy 3: round-robin partitioning.
        System.out.println("\n---------第三招:輪詢散列------------");
        List<List<String>> partitionList3 = partitionData(data, new Partitioner() {
            private int counter = 0;

            @Override
            public int getPartition(String item, int numPartitions) {
                int partitionIndex = counter;
                // Advance to the next partition, wrapping around after the last one.
                counter = (counter + 1) % numPartitions;
                return partitionIndex;
            }
        }, partitionNumber);
        printPartitionedData(partitionList3, false);

        // Strategy 4: range partitioning.
        System.out.println("\n---------第四招:範圍分區------------");
        // The range boundaries depend only on the data set, so the sorted list of
        // distinct items is built once here instead of once per element.
        final List<String> distinctItemList = new ArrayList<>(new TreeSet<>(data));
        List<List<String>> partitionList4 = partitionData(data, new Partitioner() {
            @Override
            public int getPartition(String item, int numPartitions) {
                // Ceiling division keeps the ranges as even as possible and avoids
                // leaving trailing partitions empty when the distinct-item count
                // is an exact multiple of numPartitions.
                int step = Math.max(1, (distinctItemList.size() + numPartitions - 1) / numPartitions);
                // Every item is present because the list was built from the same data.
                int index = distinctItemList.indexOf(item);
                return index / step;
            }
        }, partitionNumber);
        printPartitionedData(partitionList4);

        // Strategy 5: custom partitioning.
        System.out.println("\n---------第五招:自定義分區------------");
        List<List<String>> partitionList5 = partitionData(data, new Partitioner() {
            @Override
            public int getPartition(String item, int numPartitions) {
                // Put the custom logic here to decide which partition the item goes to.
                return 0;
            }
        }, partitionNumber);
        printPartitionedData(partitionList5, false);
    }

    /**
     * Distributes every element of {@code data} into one of {@code numPartitions}
     * partitions, as chosen by {@code partitioner}.
     *
     * @param data          the elements to distribute
     * @param partitioner   decides the target partition of each element
     * @param numPartitions the number of partitions to create
     * @return one list per partition, in partition-index order
     * @throws IndexOutOfBoundsException if the partitioner returns an index
     *                                   outside {@code [0, numPartitions)}
     */
    public static List<List<String>> partitionData(List<String> data, Partitioner partitioner, int numPartitions) {
        List<List<String>> partitionList = initPartitionContext(numPartitions);
        for (String item : data) {
            // The partitioner, not necessarily a hash, picks the target partition.
            int partitionNum = partitioner.getPartition(item, numPartitions);
            if (partitionNum < 0 || partitionNum >= partitionList.size()) {
                throw new IndexOutOfBoundsException("Partitioner returned partition " + partitionNum
                        + " for item '" + item + "' but only " + partitionList.size()
                        + " partition(s) exist");
            }
            partitionList.get(partitionNum).add(item);
        }
        return partitionList;
    }

    /**
     * Creates the container that holds the partitioned data.
     *
     * @param numPartitions the number of partitions to create
     * @return a list containing {@code numPartitions} empty, mutable lists
     */
    public static List<List<String>> initPartitionContext(int numPartitions) {
        List<List<String>> partitionList = new ArrayList<>();
        // Create one backing list per partition up front.
        for (int i = 0; i < numPartitions; i++) {
            partitionList.add(new ArrayList<String>());
        }
        return partitionList;
    }

    /**
     * Prints the partitioned data, sorting each partition first.
     *
     * @param partitionResult the partitions to print
     */
    public static void printPartitionedData(List<List<String>> partitionResult) {
        printPartitionedData(partitionResult, true);
    }

    /**
     * Prints one line per partition: the items joined by commas, or a
     * placeholder message when the partition is {@code null} or empty.
     *
     * @param partitionResult the partitions to print
     * @param sort            whether to sort each partition (in place) before printing
     */
    public static void printPartitionedData(List<List<String>> partitionResult, boolean sort) {
        for (List<String> partition : partitionResult) {
            // Test for null before touching the list; the reverse order would
            // throw a NullPointerException on a null partition.
            if (partition == null || partition.isEmpty()) {
                System.out.println("該分區的數據爲空");
                continue;
            }
            if (sort) {
                // Sorting only makes the printed result easier to read.
                Collections.sort(partition);
            }
            System.out.println(String.join(",", partition));
        }
    }
}

/**
 * Defines the logic that assigns an element to a partition.
 */
interface Partitioner {
    /**
     * Chooses the partition for {@code item}.
     *
     * @param item          the element to place
     * @param numPartitions the total number of partitions
     * @return the index of the partition the element goes to; callers in this
     *         file use it as an index into a list of {@code numPartitions} partitions
     */
    int getPartition(String item, int numPartitions);
}
各位把代碼拿下去,直接就可以運行看效果!
7、效果
在這裏,我也給大家貼一份代碼執行的效果
---------第一招:Hash散列------------
d,i,n,s,x
e,j,o,t,y
a,a,a,a,a,a,a,a,a,a,a,a,a,f,k,p,u,z
b,b,b,b,b,g,l,q,v
c,h,m,r,w
---------第二招:隨機分區------------
b,c,f,h,j,o,u,y,a,a,a,b
d,v,a,a
a,e,l,n,r,t,w,x,z,a,a,a,b,b
k,p,q,s,a,b
g,i,m,a,a,a
---------第三招:輪詢散列------------
a,f,k,p,u,z,a,a,b
b,g,l,q,v,a,a,a,b
c,h,m,r,w,a,a,a
d,i,n,s,x,a,a,b
e,j,o,t,y,a,a,b
---------第四招:範圍分區------------
a,a,a,a,a,a,a,a,a,a,a,a,a,b,b,b,b,b,c,d,e,f
g,h,i,j,k,l
m,n,o,p,q,r
s,t,u,v,w,x
y,z
---------第五招:自定義分區------------
a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,a,a,a,a,a,a,a,a,a,a,a,a,b,b,b,b
該分區的數據爲空
該分區的數據爲空
該分區的數據爲空
該分區的數據爲空