目錄
Page磁盤讀取過程
buf_page_get_gen
| | ==> rw_lock_s_lock(hash_lock);
| | ==> block = (buf_block_t*) buf_page_hash_get_low(buf_pool, space, offset, fold);
| | ==> if (block == NULL) rw_lock_s_unlock(hash_lock);
| | ==> //(MM)從存儲讀取PAGE到BP
| | ==> buf_read_page(space, zip_size, offset)
| | ==> count = buf_read_page_low(&err, true, BUF_READ_ANY_PAGE, space,
zip_size, FALSE, tablespace_version, offset);
| | ==> //從BP中申請空閒page空間
| | ==> bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip, tablespace_version, offset);
| | ==> block = NULL;
| | ==> buf_pool_mutex_enter(buf_pool);
| | ==> rw_lock_x_lock(hash_lock);
| | ==> watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
| | ==> //釋放了hashlock,還持有BP lock
| | ==> rw_lock_x_unlock(hash_lock);
| | ==> //申請空閒內存塊
| | ==> data = buf_buddy_alloc(buf_pool, zip_size, &lru);
| | ==> buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size), lru));
| | ==> //從Free List中獲取內存塊
| | ==> block = buf_LRU_get_free_only(buf_pool);
| | ==> //free list中沒有,從LRU中獲取,此時要釋放BP鎖,因爲從LRU獲取過程中可能會休眠
| | ==> buf_pool_mutex_exit(buf_pool);
| |
| |
| | ==> block = buf_LRU_get_free_block(buf_pool);
| | ==> //標識從LRU鏈表中獲取
| | ==> *lru = TRUE;
| | ==> //獲取成功後重新加鎖
| | ==> buf_pool_mutex_enter(buf_pool);
| | ==> rw_lock_x_lock(hash_lock);
| | ==> //如果從LRU獲取的話,需要重新CHECK hash表中是否有該Page,因爲獲取過程中會釋放BP MUTEX
| | ==> /* If buf_buddy_alloc() allocated storage from the LRU list, it released and reacquired buf_pool->mutex. Thus, we must check the page_hash again, as it may have been modified. */
| | ==> if (UNIV_UNLIKELY(lru)) {
| | ==> watch_page = buf_page_hash_get_low(buf_pool, space, offset, fold);
| | ==> if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) rw_lock_x_unlock(hash_lock); watch_page = NULL; buf_buddy_free(buf_pool, data, zip_size);
| | ==> //(MM)插入到HASH表
| | ==> HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
| | ==> rw_lock_x_unlock(hash_lock);
| | ==> //(MM)插入到LRU鏈表
| | ==> buf_LRU_add_block(bpage, TRUE/* to old blocks */);
| | ==> buf_pool_mutex_exit(buf_pool);
| | ==> //增加引用計數
| | ==> buf_block_fix(fix_block);
| | ==> #ifdef PAGE_ATOMIC_REF_COUNT
| | ==> os_atomic_increment_uint32(&block->page.buf_fix_count, 1);
| | ==> #else
| | ==> ib_mutex_t* block_mutex = buf_page_get_mutex(&block->page);
| | ==> mutex_enter(block_mutex);
| | ==> ++block->page.buf_fix_count;
| | ==> mutex_exit(block_mutex);
| | ==> rw_lock_s_unlock(hash_lock);
| | ==> buf_wait_for_read(fix_block);
過程是:
- 對Page_Hash加s鎖,判斷是否存在。
- 如果不存在,釋放s鎖,加bp的互斥鎖,再加Page_Hash的x鎖。再次判斷是否存在。
- 如果不存在,釋放x鎖,申請空閒Block。
- 然後加x鎖,判斷是否存在,如果不存在,插入空閒BLock。釋放x鎖
- 把block加入lru鏈表。釋放bp的互斥鎖。
步驟3釋放x鎖的原因是:線程持有bp的互斥鎖,而對LRU鏈表、Free List的操作都依靠該互斥鎖保護,因此可以通過bp的互斥鎖阻塞其他線程申請空閒Block。但是在從LRU申請Block的過程中,可能因爲休眠等待而需要釋放互斥鎖,導致其他線程進入,因此第4步根據空閒Block的來源判斷是否需要重新Check。
可以看到,從磁盤讀取過程中,對hash的互斥成本是比較高的,加了2次x鎖。這個地方後續版本中有優化。
申請空閒PAGE空間
buf_LRU_get_free_block
| | ==> buf_pool_mutex_enter(buf_pool);
| | ==> //(MM)從free_list中獲取Page空間
| | ==> block = buf_LRU_get_free_only(buf_pool);
| | ==> UT_LIST_REMOVE(list, buf_pool->free, (&block->page));
| | ==> //(MM)如果沒有獲取到,從LRU鏈表中獲取非髒的PAGE空間
| | ==> freed = buf_LRU_scan_and_free_block(buf_pool, n_iterations > 0);
| | ==> //(MM)如果沒有,從LRU刷一個髒頁下去
| | ==> buf_flush_single_page_from_LRU(buf_pool)
buf_LRU_scan_and_free_block(buf_pool, n_iterations > 0)
| | ==> //嘗試從解壓縮頁中獲取空間
| | ==> buf_LRU_free_from_unzip_LRU_list(buf_pool, scan_all)
| | ==> //從LRU中獲取非髒頁面
| | ==> buf_LRU_free_from_common_LRU_list(buf_pool, scan_all)
| | ==> //從後向前遍歷LRU鏈表
| | ==> for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1, freed = FALSE;
bpage != NULL && !freed && (scan_all || scanned < srv_LRU_scan_depth); ++scanned) {
| | ==> accessed = buf_page_is_accessed(bpage);
| | ==> //嘗試從LRU鏈表中轉移一個PAGE到FREE LIST
| | ==> freed = buf_LRU_free_page(bpage, true);
| | ==> //判斷頁面是否有引用或者在IO過程中
| | ==> if (!buf_page_can_relocate(bpage)) exit
| | ==> //判斷頁面是否有修改
| | ==> if (bpage->oldest_modification) exit
| | ==> //從LRU鏈表中刪除一條PAGE空間,如果是BUF_BLOCK_FILE_PAGE類型的PAGE,要在外面調用函數加入FREELIST
| | ==> buf_LRU_block_remove_hashed(bpage, zip)
| | ==> buf_LRU_remove_block(bpage);
| | ==> UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
| | ==> //從HASH表中刪除
| | ==> HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
| | ==> //把Page加入到FREE LIST
| | ==> buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
| | ==> buf_LRU_block_free_non_file_page(block);
| | ==> memset
| | ==> UT_LIST_ADD_FIRST(list, buf_pool->free, (&block->page));
| | ==> }
buf_flush_single_page_from_LRU(buf_pool)
| | ==> for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1;
bpage != NULL;
bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) {
| | ==> //判斷頁面是否有修改且是否在IO中
| | ==> if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)
| | ==> //刷髒頁(同步IO)
| | ==> buf_flush_page(buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
| | ==> buf_page_set_io_fix(bpage, BUF_IO_WRITE);
| | ==> //如果是BUF_FLUSH_LIST,可能會走下面函數,此處需同時滿足三個條件
| | ==> buf_dblwr_flush_buffered_writes();
| | ==> buf_flush_write_block_low(bpage, flush_type, sync);
| | ==> if (flush_type == BUF_FLUSH_SINGLE_PAGE) buf_dblwr_write_single_page(bpage, sync);
| | ==> //持久化dblwr
| | ==> fil_flush(TRX_SYS_SPACE);
| | ==> //持久化數據頁
| | ==> buf_dblwr_write_block_to_datafile(bpage, sync);
| | ==> //調IO接口
| | ==> fil_io(flags, sync, buf_block_get_space(block), 0,
buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
(void*) block->frame, (void*) block)
| | ==> //調AIO接口
| | ==> ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, offset, len, node, message);
| | ==> os_aio_func
| | ==> //如果同步IO,讀調用os_file_read_func,寫調用os_file_write_func
| | ==> //如果異步IO,走異步IO系統
| | ==> os_aio_simulated_wake_handler_thread
| | ==> //如果是同步IO,進行IO_WAIT
| | ==> fil_node_complete_io(node, fil_system, type);
| | ==> //關閉DBLWR場景
| | ==> fil_io
| | ==> //其他場景,通過dblwr持久化數據頁
| | ==> buf_dblwr_add_to_batch(bpage);
| | ==> //同步IO場景,執行IO完成後操作
| | ==> if (sync) {
| | ==> fil_flush(buf_page_get_space(bpage));
| | ==> buf_page_io_complete(bpage);
| | ==> buf_flush_write_complete(bpage)
| | ==> //從FLUSH LIST鏈表中移出
| | ==> buf_flush_remove(bpage);
| | ==> }
| | ==> }
| | ==> //因爲剛刷完髒頁,重新遍歷LRU鏈表,嘗試轉移一個PAGE到FREE_LIST
| | ==> ready = buf_flush_ready_for_replace(bpage);
| | ==> freed = buf_LRU_free_page(bpage, evict_zip);
刷髒頁和LRU鏈表
buf_flush_page_cleaner_thread
| | ==> //依次遍歷每個Buffer pool instance,從LRU尾部開始掃描,直到第srv_LRU_scan_depth個page停止,
| | ==> //按批次刷LRU,每次期望刷100個page(PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE), 每個Bp會進行srv_LRU_scan_depth/100次循環
| | ==> buf_flush_LRU_tail
| | ==> for (ulint i = 0; i < srv_buf_pool_instances; i++) {
| | ==> scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
| | ==> //LRU釋放PAGE到FREE LIST
| | ==> buf_flush_LRU(buf_pool, PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE, &n_flushed)
| | ==> buf_flush_start(buf_pool, BUF_FLUSH_LRU)
| |
| |
| | ==> buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);
| | ==> case BUF_FLUSH_LRU: buf_do_LRU_batch(buf_pool, min_n);
| | ==> if (buf_LRU_evict_from_unzip_LRU(buf_pool)) buf_free_from_unzip_LRU_list_batch(buf_pool, max);
| | ==> //
| | ==> buf_flush_LRU_list_batch(buf_pool, max - count);
| | ==> //沒有引用、沒有IO、沒有變更
| | ==> evict = buf_flush_ready_for_replace(bpage);
| | ==> //嘗試從LRU鏈表中轉移PAGE到FREE LIST
| | ==> if (evict) buf_LRU_free_page(bpage, true);
| | ==> //把這個PAGE持久化
| | ==> else buf_flush_page_and_try_neighbors(bpage, BUF_FLUSH_LRU, max, &count);
| | ==> //判斷這個PAGE是否需要持久化
| | ==> buf_flush_ready_for_flush(bpage, flush_type)
| | ==> //持久化這個PAGE
| | ==> buf_flush_try_neighbors(space, offset, flush_type, *count, n_to_flush)
| | ==> buf_flush_page(buf_pool, bpage, flush_type, false)
| | ==> buf_flush_end(buf_pool, BUF_FLUSH_LRU);
| | ==> }
| | ==> //判斷是否需要刷髒頁
| | ==> page_cleaner_flush_pages_if_needed
| | ==> //number of pages that we should attempt to flush, LSN up to which flushing must happen
| | ==> page_cleaner_do_flush_batch(ulint n_to_flush, lsn_t lsn_limit)
| | ==> buf_flush_list(n_to_flush, lsn_limit, &n_flushed);
| | ==> //依次爲每個BP執行
| | ==> buf_flush_batch(buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit)
| | ==> count = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
| | ==> //從FLUSH LIST鏈表的尾部開始遍歷,跳出條件是刷髒的PAGE數量和LSN
| | ==> buf_flush_set_hp(buf_pool, prev);
| | ==> buf_flush_list_mutex_exit(buf_pool);
| | ==> buf_flush_page_and_try_neighbors(bpage, BUF_FLUSH_LIST, min_n, &count);
| | ==> buf_flush_list_mutex_enter(buf_pool);
| | ==> buf_flush_is_hp(buf_pool, prev)
Page內存讀取和Page淘汰的互斥
buf_flush_batch
| | ==> //加LRU LOCK
| | ==> buf_pool_mutex_enter(buf_pool);
| | ==> count = buf_do_LRU_batch(buf_pool, min_n);
| | ==> buf_flush_LRU_list_batch(buf_pool, max - count);
| | ==> bpage = UT_LIST_GET_LAST(buf_pool->LRU);
| | ==> mutex_enter(block_mutex);
| | ==> evict = buf_flush_ready_for_replace(bpage);
| | ==> bpage->buf_fix_count == 0
| | ==> mutex_exit(block_mutex);
| | ==> if (evict) buf_LRU_free_page
| | ==> rw_lock_x_lock(hash_lock);
| | ==> mutex_enter(block_mutex);
| | ==> //重新判斷引用計數
| | ==> buf_page_can_relocate
| | ==> //從LRU和HASH表刪除
| | ==> buf_LRU_block_remove_hashed
| | ==> buf_LRU_remove_block(bpage);
| | ==> HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
| | ==> rw_lock_x_unlock(hash_lock);
| | ==> buf_pool_mutex_exit(buf_pool);
| | ==> buf_pool_mutex_enter(buf_pool);
| | ==> buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
| | ==> buf_LRU_block_free_non_file_page(block);
| | ==> UT_LIST_ADD_FIRST(list, buf_pool->free, (&block->page));
| | ==> buf_pool_mutex_exit(buf_pool);
Page淘汰過程如下:
- 加BP的互斥鎖
- 從LRU鏈表的尾部獲取最老的數據頁
- 確認Page是否可以被淘汰
- 如果可以淘汰,加Page_Hash的x鎖並重新判斷:前述判斷只是在block mutex保護下進行的,無法攔截其他線程對Page的訪問,因此需要在持有Page_Hash的x鎖之後再次檢查引用計數,才能得到準確的結果。
- 如果確實可以淘汰,從Page_Hash中刪除,釋放hash的x鎖。
- 將Page加入空閒鏈表