B+樹
先說紅黑樹。紅黑樹是平衡二叉查找樹。它通常用於Map結構的實現,可以根據Key快速找到Value。Java中的TreeMap就是通過紅黑樹實現的。它是一種二叉樹,一個節點最多只能有兩個子節點。
當二叉查找樹(BST)的存儲結構放在硬盤上時,就出現問題了。因爲二叉樹的高度比較大,尋找一個葉子節點需要較多次數訪問硬盤。我們知道訪問硬盤的速度是非常慢的,所以BST不太適合用在硬盤上。然而B+樹可以解決這個問題。
B+樹針對這種情況做了改良,修改了BST中每個節點最多只有兩個子節點的限制,變成了每個節點最多有M個子節點,M要大於等於3,可以是很大的值如M=1000。這裏通過幾個例子分別介紹增、刪、查操作。你可能聽說過二三樹,它本質上就是B樹(不是B+樹)在M=3時的特殊形態。
下面通過一個具體的例子展示B+樹大致的模樣。例子中M=5。圖片通過工具生成:https://www.cs.usfca.edu/~galles/visualization/BPlusTree.html。
第一步:我們往空白的B+樹中放入一個key=0,那麼跟二叉樹一樣得到如下結果:
第二步:放入一條數據key=20,由於B+樹的一個節點可以放多個值,這個例子中M=5,所以最多可以放5個,得到如下結果:
第三步:放入一條數據key=10,由於B+樹的key要保持順序,所以插到了中間,得到如下結果:
第四步:放入一條數據key=30,得到如下結果:
第五步:放入一條數據key=40,得到如下結果:
由於一個節點放了5條數據,容量已滿,需要進行分割。分割的辦法就是以中間的key作爲分界點,分割成兩個節點,並把分界的key提到上一層。
注意分割之後key=20出現了兩次:一次在葉子節點,一次在中間節點。兩者的區別在於葉子節點有value數據,而中間節點沒有value數據只有key。分割之後的兩個葉子節點之間出現了一個箭頭,表示指針,用於遍歷樹的時候快速跳轉到下一個區塊。
以上就是B+樹插入數據的大概樣子。末尾附上筆者實現的B+樹代碼,可以瞭解其原理:(依據Wiki的介紹實現:https://en.wikipedia.org/wiki/B%2B_tree)
B+樹的數據節點之間像鏈表一樣相連,因此遍歷樹的操作不需要再去訪問中間節點,在硬盤上做遍歷操作是比較快的。但是數據節點在硬盤上不一定緊挨在一起,有可能相隔很遠,因此MySQL中提供了優化表OPTIMIZE TABLE的操作,其原理是將數據重寫一遍,讓數據能夠在硬盤上儘量連續。於是掃描操作就變成了硬盤的連續讀取操作,能夠提升查詢速度。
package btree;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
/**
 * Demo driver: fills a B+ tree with shuffled integer keys and prints every lookup.
 */
public class BPlusTree {

    /**
     * Inserts the keys 0..999999 in a seeded pseudo-random order, each paired with a
     * generated "str&lt;number&gt;" value, then prints the value found for every key in
     * ascending key order.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final int total = 1000000;

        List<Integer> shuffledKeys = new ArrayList<Integer>();
        int next = 0;
        while (next < total) {
            shuffledKeys.add(next);
            next++;
        }

        // A single fixed-seed generator drives both the shuffle and the values,
        // so every run produces the same output.
        Random rng = new Random(1);
        Collections.shuffle(shuffledKeys, rng);

        // insert() returns the node to use as the root from now on, so keep re-assigning it.
        Node<String> root = new LeafNode<String>(5);
        for (Integer k : shuffledKeys) {
            String payload = "str" + rng.nextInt(10000000);
            root = root.insert(k, payload);
        }

        int key = 0;
        while (key < total) {
            System.out.println(key + ": " + root.get(key));
            key++;
        }
    }
}
/**
 * Base class for B+ tree nodes: a fixed-capacity array of int keys plus the array
 * helpers shared by the node implementations.
 *
 * @param <T> type of the values stored in the tree
 */
abstract class Node<T> {
    /** Keys of this node; only the first {@code keyCount} slots are in use, in ascending order. */
    protected int[] keys;
    /** Number of slots of {@code keys} currently in use. */
    protected int keyCount;
    /** Capacity of {@code keys}; the node reports itself full once {@code keyCount} reaches it. */
    protected int maxKeys;

    /**
     * Creates an empty node.
     *
     * @param m capacity of the key array; must be at least 2, because a node is split as
     *          soon as it holds {@code m} keys and smaller capacities cannot be split
     *          into usable halves
     * @throws IllegalArgumentException if {@code m} is less than 2
     */
    public Node(int m) {
        if (m < 2) {
            throw new IllegalArgumentException("maxKeys must be at least 2, got " + m);
        }
        this.maxKeys = m;
        this.keys = new int[m];
    }

    /**
     * Finds the index of the first stored key that is strictly greater than {@code key}.
     * Relies on the used part of {@code keys} being sorted in ascending order.
     *
     * @param key key to position
     * @return index in {@code [0, keyCount]}; {@code keyCount} when no stored key is greater
     */
    protected int locate(int key) {
        int lo = 0;
        int hi = keyCount;
        // Upper-bound binary search: invariant is keys[0..lo) <= key < keys[hi..keyCount).
        while (lo < hi) {
            int mid = (lo + hi) >>> 1;
            if (key < keys[mid]) {
                hi = mid;
            } else {
                lo = mid + 1;
            }
        }
        return lo;
    }

    /**
     * Inserts {@code data} at {@code pos}, shifting elements {@code pos..size-1} one slot right.
     *
     * @param size  number of elements currently in use; the array needs room for one more
     * @param array array to modify in place
     * @param pos   index at which {@code data} is stored
     * @param data  element to insert
     */
    public static <V> void arrayInsert(int size, V[] array, int pos, V data) {
        // Guarded so that pos >= size simply stores the element without shifting.
        if (size > pos) {
            System.arraycopy(array, pos, array, pos + 1, size - pos);
        }
        array[pos] = data;
    }

    /**
     * Primitive-int variant of {@link #arrayInsert(int, Object[], int, Object)}.
     *
     * @param size  number of elements currently in use; the array needs room for one more
     * @param array array to modify in place
     * @param pos   index at which {@code data} is stored
     * @param data  element to insert
     */
    public static void arrayInsert(int size, int[] array, int pos, int data) {
        if (size > pos) {
            System.arraycopy(array, pos, array, pos + 1, size - pos);
        }
        array[pos] = data;
    }

    /** Returns a copy of {@code array[start..end)}. */
    public static <V> V[] arraySlice(V[] array, int start, int end) {
        return Arrays.copyOfRange(array, start, end);
    }

    /** Returns a copy of {@code array} from {@code start} to the end of the array. */
    public static <V> V[] arraySlice(V[] array, int start) {
        return arraySlice(array, start, array.length);
    }

    /** Returns a copy of {@code array[start..end)}. */
    public static int[] arraySlice(int[] array, int start, int end) {
        return Arrays.copyOfRange(array, start, end);
    }

    /** Returns a copy of {@code array} from {@code start} to the end of the array. */
    public static int[] arraySlice(int[] array, int start) {
        return arraySlice(array, start, array.length);
    }

    /** Returns whether the node holds {@code maxKeys} keys or more. */
    protected boolean isFull() {
        return keyCount >= maxKeys;
    }

    /**
     * Looks up the value stored for {@code key}.
     *
     * @param key key to look up
     * @return the value for {@code key}
     */
    public abstract T get(int key);

    /**
     * Stores {@code value} under {@code key}.
     *
     * @param key   key to store
     * @param value value to associate with the key
     * @return the node the caller should use in place of this one after the insertion
     */
    public abstract Node<T> insert(int key, T value);
}
/**
 * Non-leaf node: holds separator keys and one more child than it has keys.
 * Lookups and insertions are routed to the child at the index returned by {@code locate}.
 *
 * @param <T> type of the values stored in the tree
 */
class InternalNode<T> extends Node<T> {
    /** Child subtrees; the first {@code keyCount + 1} slots are in use. */
    private Node<T>[] children;

    /**
     * Builds a node from {@code keys.length} separator keys and {@code keys.length + 1} children.
     *
     * @param maxKeys  key capacity of the node
     * @param keys     separator keys, copied into the node
     * @param children child nodes, copied into the node; must hold at least
     *                 {@code keys.length + 1} entries
     */
    public InternalNode(int maxKeys, int[] keys, Node<T>[] children) {
        super(maxKeys);
        this.children = new Node[maxKeys + 1];
        final int n = keys.length;
        keyCount = n;
        System.arraycopy(keys, 0, this.keys, 0, n);
        // There is always one more child than there are keys.
        System.arraycopy(children, 0, this.children, 0, n + 1);
    }

    /**
     * Delegates the lookup to the child responsible for {@code key}.
     *
     * @param key key to look up
     * @return whatever the responsible child returns for {@code key}
     */
    @Override
    public T get(int key) {
        Node<T> subtree = children[locate(key)];
        return subtree.get(key);
    }

    /**
     * Inserts into the responsible child and, if that child was split, absorbs the
     * promoted key and the two halves into this node.
     *
     * @param key   key to store
     * @param value value to associate with the key
     * @return this node, or a new one-key parent when this node itself had to be split
     * @throws RuntimeException if the child returned a replacement node without exactly one key
     */
    @Override
    public InternalNode<T> insert(int key, T value) {
        final int slot = locate(key);
        final Node<T> outcome = children[slot].insert(key, value);

        // A different node coming back means the child was split.
        if (outcome != children[slot]) {
            if (outcome.keyCount != 1) {
                throw new RuntimeException("Expect only one key");
            }
            InternalNode<T> promoted = (InternalNode<T>) outcome;

            // The promoted key goes into this node; its two children replace the old child.
            arrayInsert(keyCount, keys, slot, promoted.keys[0]);
            children[slot] = promoted.children[0];
            arrayInsert(keyCount + 1, children, slot + 1, promoted.children[1]);
            keyCount++;

            if (isFull()) {
                return split();
            }
        }
        return this;
    }

    /**
     * Splits this node around its middle key. The middle key moves up into a new parent
     * and is not kept in either half.
     *
     * @return a new node with the middle key and the two halves as its children
     */
    private InternalNode<T> split() {
        final int mid = keyCount / 2;

        InternalNode<T> left = new InternalNode<T>(
                maxKeys, arraySlice(keys, 0, mid), arraySlice(children, 0, mid + 1));
        InternalNode<T> right = new InternalNode<T>(
                maxKeys, arraySlice(keys, mid + 1), arraySlice(children, mid + 1));

        return new InternalNode<T>(maxKeys, new int[]{keys[mid]}, new Node[]{left, right});
    }
}
/**
 * Leaf node: stores the values, with {@code values[i]} belonging to {@code keys[i]}.
 *
 * @param <T> type of the stored values
 */
class LeafNode<T> extends Node<T> {
    /** Values parallel to {@code keys}; only the first {@code keyCount} slots are in use. */
    private T[] values;

    /**
     * Creates an empty leaf.
     *
     * @param maxKeys key capacity of the leaf
     */
    @SuppressWarnings("unchecked") // the Object[] never leaves this class typed as anything but T[]
    public LeafNode(int maxKeys) {
        super(maxKeys);
        this.values = (T[]) new Object[maxKeys];
    }

    /**
     * Creates a leaf pre-filled with the given entries.
     *
     * @param maxKeys key capacity of the leaf
     * @param keys    keys to copy in, in ascending order
     * @param values  values to copy in; {@code values[i]} belongs to {@code keys[i]}
     */
    public LeafNode(int maxKeys, int[] keys, T[] values) {
        this(maxKeys);
        this.keyCount = keys.length;
        for (int i = 0; i < keys.length; i++) {
            this.keys[i] = keys[i];
            this.values[i] = values[i];
        }
    }

    /**
     * Returns the value stored for {@code key}.
     *
     * @param key key to look up
     * @return the stored value, or {@code null} if the key is not in this leaf
     */
    @Override
    public T get(int key) {
        int i = indexOf(key);
        return i < 0 ? null : values[i];
    }

    /**
     * Stores {@code value} under {@code key}. If the key is already present its value is
     * replaced in place; otherwise a new entry is inserted at its sorted position.
     *
     * @param key   key to store
     * @param value value to associate with the key
     * @return this leaf, or a new one-key parent holding the two halves when the
     *         insertion filled the leaf and it had to be split
     */
    @Override
    public Node<T> insert(int key, T value) {
        // Overwrite instead of adding a duplicate entry: get() returns the first match,
        // so a second entry for the same key would be shadowed by the old value.
        int existing = indexOf(key);
        if (existing >= 0) {
            values[existing] = value;
            return this;
        }

        int i = locate(key);
        arrayInsert(keyCount, keys, i, key);
        arrayInsert(keyCount, values, i, value);
        keyCount++;
        if (!isFull()) {
            return this;
        }
        return split();
    }

    /** Returns the index of the first slot holding {@code key}, or -1 if it is absent. */
    private int indexOf(int key) {
        for (int i = 0; i < keyCount; i++) {
            if (keys[i] == key) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Splits this leaf into a lower and an upper half.
     *
     * @return a new internal node whose single key is the first key of the upper half
     *         and whose children are the two halves
     */
    private InternalNode<T> split() {
        int splitPoint = keyCount / 2;
        int[] keys1 = arraySlice(keys, 0, splitPoint);
        int[] keys2 = arraySlice(keys, splitPoint);
        T[] values1 = arraySlice(values, 0, splitPoint);
        T[] values2 = arraySlice(values, splitPoint);
        LeafNode<T> node1 = new LeafNode<T>(maxKeys, keys1, values1);
        LeafNode<T> node2 = new LeafNode<T>(maxKeys, keys2, values2);
        // The separator stays in the upper leaf as well, because leaves hold the values.
        return new InternalNode<T>(maxKeys, new int[]{keys2[0]}, new Node[]{node1, node2});
    }
}