數據結構
storage/innobase/include/data0type.h
行結構中,除了用戶定義的列外還有3個隱藏系統列:DATA_ROW_ID、DATA_TRX_ID、DATA_ROLL_PTR
如果表沒有定義主鍵那么DATA_ROW_ID作為主鍵列,否則行結構中沒有DATA_ROW_ID列
具體的隱藏列插入過程在dict_table_add_system_columns函數中
無論是聚簇索引,還是二級索引,其每條記錄都包含了一個DELETED BIT位,用于標識該記錄是否是刪除記錄,真正意義的刪除是在commit的時候,聚簇索引設置記錄deleted bit時,會同時更新DATA_TRX_ID列。老版本DATA_TRX_ID進入undo表空間;二級索引設置deleted bit時,不寫入undo log
/* System column identifiers and their stored lengths in bytes. The three
identifiers below take the consecutive values 0, 1 and 2, which matches
DATA_N_SYS_COLS == 3. */
#define DATA_ROW_ID 0 /* row id: a 48-bit integer */
#define DATA_ROW_ID_LEN 6 /* stored length for row id, in bytes (6 * 8 = 48 bits) */
/** Transaction id: 6 bytes */
constexpr size_t DATA_TRX_ID = 1;
/** Transaction ID type size in bytes. */
constexpr size_t DATA_TRX_ID_LEN = 6;
/** Rollback data pointer: 7 bytes */
constexpr size_t DATA_ROLL_PTR = 2;
/** Rollback data pointer type size in bytes. */
constexpr size_t DATA_ROLL_PTR_LEN = 7;
#define DATA_N_SYS_COLS 3 /* number of system columns defined above */
#define DATA_ITT_N_SYS_COLS 2 /* number of system columns for intrinsic
temporary table; NOTE(review): which two columns these are is not visible
here - confirm against the intrinsic-table code */
storage/innobase/include/page0page.h
而對于二級索引記錄,是不包含上面這兩個隱藏字段信息的,但對于二級索引,會在頁中記錄一個PAGE_MAX_TRX_ID,表示對該頁數據修改過的最大事務id
/* Page field holding the highest id of a transaction that may have modified
a record on the page. Per the inline comment it is only defined for secondary
index pages and the insert buffer tree.
NOTE(review): the value 18 is presumably the field's byte offset within the
page header - confirm against the neighbouring PAGE_* definitions. */
#define PAGE_MAX_TRX_ID \
18 /* highest id of a trx which may have modified \
a record on the page; trx_id_t; defined only \
in secondary indexes and in the insert buffer \
tree */
MySQL中的事務在開始到提交這段過程中,都會被保存到一個叫trx_sys的事務鏈表中,這是一個基本的鏈表結構,在客戶端執行命令:show engine innodb status就能看到事務的鏈表
storage/innobase/trx/trx0sys.cc
/** Global handle to the transaction system; initially null. */
trx_sys_t *trx_sys = nullptr;
ReadView類維護了為了實現事務一致性讀所需要的事務ID列表,用來保證活動事務中一些對數據庫的修改對當前事務不可見,其主要內容如下:
storage/innobase/include/read0types.h
/** Snapshot used for consistent (MVCC) reads. It records the transaction id
limits and the set of transactions that were active when the view was created,
so that changes made by those transactions can be hidden from the view. */
class ReadView {
  /** Vector-like array of transaction ids. Members such as empty(), data()
  and size(), which changes_visible() calls, are omitted from this excerpt. */
  class ids_t {
   public:
    /* Public so that ReadView can name ids_t::value_type. */
    typedef trx_ids_t::value_type value_type;
  };

 public:
  /** Check whether the changes made by a transaction are visible in this view.
  @param[in]  id    id of the transaction to check; must be greater than 0
  @param[in]  name  table name, forwarded to check_trx_id_sanity()
  @return true if the view sees the changes made by transaction id */
  bool changes_visible(trx_id_t id, const table_name_t &name) const
      MY_ATTRIBUTE((warn_unused_result)) {
    ut_ad(id > 0);

    /* Below the low water mark, or made by the view's own creator:
    always visible. */
    if (id < m_up_limit_id || id == m_creator_trx_id) {
      return (true);
    }

    check_trx_id_sanity(id, name);

    if (id >= m_low_limit_id) {
      /* At or above the high water mark: never visible. */
      return (false);
    } else if (m_ids.empty()) {
      /* Between the two water marks, and no transaction was active when the
      view was created: visible. */
      return (true);
    }

    const ids_t::value_type *p = m_ids.data();

    /* A transaction found in the snapshot of active ids was still running
    when the view was created, so its changes must stay hidden; one that is
    absent had already finished, so its changes are visible.
    std::binary_search assumes m_ids is sorted in ascending order. */
    return (!std::binary_search(p, p + m_ids.size(), id));
  }

 private:
  /** High water mark: changes of any transaction with an id greater than or
  equal to this value are invisible to the view. */
  trx_id_t m_low_limit_id;

  /** Low water mark: changes of any transaction with an id smaller than this
  value are visible to the view. */
  trx_id_t m_up_limit_id;

  /** Id of the transaction that created the view; set to TRX_ID_MAX to mark
  the view as released. */
  trx_id_t m_creator_trx_id;

  /** Set of transactions that were active when the snapshot was taken. */
  ids_t m_ids;

  /** The view does not need the undo logs of transactions whose number is
  smaller than this value, so the purge thread may remove them. */
  trx_id_t m_low_limit_no;

  /** AC-NL-RO transaction view that has been "closed". */
  bool m_closed;

  typedef UT_LIST_NODE_T(ReadView) node_t;

  /** Padding sized so that pad1 and the list node together occupy 64 bytes.
  NOTE(review): presumably one cache line - confirm. */
  byte pad1[64 - sizeof(node_t)];

  /** Node linking this view into the list of read views in trx_sys. */
  node_t m_view_list;
};
MVCC類作為ReadView的管理類,主要起到打開、關閉、釋放、回收、重用ReadView的作用,主要功能如下:
storage/innobase/include/read0read.h
/** Manager class for ReadView objects: it opens, closes, releases, recycles
and reuses views. */
class MVCC {
public:
/** Open a view.
@param[in,out]  view  view pointer, updated in place
@param[in]      trx   transaction the view is opened for */
void view_open(ReadView *&view, trx_t *trx);
/** Close a view.
@param[in,out]  view       view to close
@param[in]      own_mutex  NOTE(review): presumably true when the caller
                           already holds trx_sys->mutex - confirm */
void view_close(ReadView *&view, bool own_mutex);
/** Release a view that is inactive but has not been closed. The caller must
hold the trx_sys_t::mutex. */
void view_release(ReadView *&view);
/** Used for pre-allocation: clones the oldest view into the view the
argument points to. It also tries to move views marked for deletion to the
freed list. No explicit call to close is needed. */
void clone_oldest_view(ReadView *view);
/** Views created by RW transactions must have their creator id set through
this function. */
static void set_view_creator_trx_id(ReadView *view, trx_id_t id);
private:
/** Get a free view from the active list, creating a new one if none is
available. It also tries to move views marked for deletion to the freed
list. */
inline ReadView *get_view();
/** Get the oldest view in the system, and move views marked for deletion to
the freed list. */
inline ReadView *get_oldest_view() const;
/** NOTE(review): presumably returns the view whose creator transaction id
equals trx_id - confirm against the definition. */
ReadView *get_view_created_by_trx_id(trx_id_t trx_id) const;
private:
typedef UT_LIST_BASE_NODE_T(ReadView) view_list_t;
/** Recycled views, waiting to be reused. */
view_list_t m_free;
/** Active or closed views. A closed view has its creator trx id set to
TRX_ID_MAX. */
view_list_t m_views;
};
ReadView創建
start transaction和begin語句執行后并沒有在innodb層分配事務ID、回滾段、將事務放到讀寫事務鏈表等,這些操作需要第一個SQL語句調用函數trx_start_if_not_started_xa->trx_start_low來完成,然后調用trx_assign_read_view方法為每個事務分配唯一的ReadView
storage/innobase/trx/trx0trx.cc
/** Return the read view of an active transaction, opening one first when the
transaction has no active view.
@param[in,out]  trx  active transaction
@return the transaction's read view, or a null pointer when the server runs in
read-only mode */
ReadView *trx_assign_read_view(trx_t *trx) {
  ut_ad(trx->state == TRX_STATE_ACTIVE);

  /* No view is handed out in read-only mode. */
  if (srv_read_only_mode) {
    ut_ad(trx->read_view == nullptr);
    return nullptr;
  }

  const bool has_active_view = MVCC::is_view_active(trx->read_view);
  if (!has_active_view) {
    trx_sys->mvcc->view_open(trx->read_view, trx);
  }

  return trx->read_view;
}
InnoDB默認的是RR級別,在這種級別下,相當于事務開啟后,事務鏈中所有的事務,它們在事務處理期間的一切改變對我們當前開啟的事務而言都是不可見的,也可以相當于看作 m_up_limit_id == m_low_limit_id
storage/innobase/read/read0read.cc
/** Open (or re-open) a read view for a transaction.
A non-null view is treated as a previously closed view to be reused; otherwise
a view is obtained from get_view(). The view is then initialised with
prepare(trx->id) and added at the head of m_views.
@param[in,out]  view  view to reuse, or NULL to obtain one; updated in place
@param[in]      trx   transaction whose id is passed to ReadView::prepare() */
void MVCC::view_open(ReadView *&view, trx_t *trx) {
ut_ad(!srv_read_only_mode);
/** If no new RW transaction has been started since the last view
was created then reuse the existing view. */
if (view != NULL) {
/* Clear the lowest bit of the pointer value before dereferencing it.
NOTE(review): presumably that bit tags a closed view - confirm against
view_close(). */
uintptr_t p = reinterpret_cast<uintptr_t>(view);
view = reinterpret_cast<ReadView *>(p & ~1);
ut_ad(view->m_closed);
/* Fast path without trx_sys->mutex, attempted only for autocommit
non-locking transactions whose view reports empty(). */
if (trx_is_autocommit_non_locking(trx) && view->empty()) {
/* Tentatively mark the view as open before checking whether it is stale. */
view->m_closed = false;
/* The view's m_low_limit_id still equals the current max trx id, so the
view is reused unchanged. */
if (view->m_low_limit_id == trx_sys_get_max_trx_id()) {
return;
} else {
/* The view is stale: mark it closed again and take the slow path. */
view->m_closed = true;
}
}
/* Slow path: take the mutex and unlink the view so that it can be prepared
again and re-inserted below. */
mutex_enter(&trx_sys->mutex);
UT_LIST_REMOVE(m_views, view);
} else {
mutex_enter(&trx_sys->mutex);
view = get_view();
}
/* trx_sys->mutex has been acquired on both branches at this point. */
if (view != NULL) {
view->prepare(trx->id);
UT_LIST_ADD_FIRST(m_views, view);
ut_ad(!view->is_closed());
ut_ad(validate());
}
/* NOTE(review): presumably releases the trx_sys->mutex acquired above -
confirm. */
trx_sys_mutex_exit();
}
/** Initialise the view from the current state of trx_sys.
The caller must hold trx_sys->mutex (asserted in debug builds).
@param[in]  id  id of the transaction creating the view */
void ReadView::prepare(trx_id_t id) {
  ut_ad(mutex_own(&trx_sys->mutex));

  m_creator_trx_id = id;

  /* All three limits start at the current max trx id; it is read once. */
  m_up_limit_id = trx_sys->max_trx_id;
  m_low_limit_id = m_up_limit_id;
  m_low_limit_no = m_low_limit_id;

  /* Snapshot the ids of the read-write transactions, if there are any. */
  if (trx_sys->rw_trx_ids.empty()) {
    m_ids.clear();
  } else {
    copy_trx_ids(trx_sys->rw_trx_ids);
  }

  ut_ad(m_up_limit_id <= m_low_limit_id);

  /* Lower m_low_limit_no to the number of the first transaction in the
  serialisation list when that number is smaller. */
  if (UT_LIST_GET_LEN(trx_sys->serialisation_list) > 0) {
    const trx_t *first_serialised =
        UT_LIST_GET_FIRST(trx_sys->serialisation_list);

    if (first_serialised->no < m_low_limit_no) {
      m_low_limit_no = first_serialised->no;
    }
  }

  m_closed = false;
}
ReadView銷毀
- RR隔離級別下,在每個事務開始的時候,會將當前系統中的所有的活躍事務拷貝到ReadView中。SQL語句結束后不會刪除read_view,從而下一個SQL語句時,使用上次申請的,這樣保證事務中的read view都一樣,從而實現可重復讀的隔離級別
storage/innobase/trx/trx0trx.cc
/** Commits a transaction in memory.
This listing is abridged: each "..." line stands for code that is not shown.
In the parts shown, the transaction's read view is closed for autocommit
non-locking transactions, and for transactions that are read-only or have no
redo rollback segment. */
static void trx_commit_in_memory(
trx_t *trx, /*!< in/out: transaction */
const mtr_t *mtr, /*!< in: mini-transaction of
trx_write_serialisation_history(), or NULL if
the transaction did not modify anything */
bool serialised)
/*!< in: true if serialisation log was
written */
{
trx->must_flush_log_later = false;
trx->ddl_must_flush = false;
if (trx_is_autocommit_non_locking(trx)) {
...
/* Close the view if the transaction has one. NOTE(review): own_mutex is
false here, presumably because trx_sys->mutex is not held - confirm. */
if (trx->read_view != NULL) {
trx_sys->mvcc->view_close(trx->read_view, false);
}
...
} else {
...
/* Read-only transaction, or one without a redo rollback segment: counted as
a read-only commit, and its view (if any) is closed. */
if (trx->read_only || trx->rsegs.m_redo.rseg == NULL) {
MONITOR_INC(MONITOR_TRX_RO_COMMIT);
if (trx->read_view != NULL) {
trx_sys->mvcc->view_close(trx->read_view, false);
}
} else {
/* Counted as a read-write commit; such a transaction must have an id. */
ut_ad(trx->id > 0);
MONITOR_INC(MONITOR_TRX_RW_COMMIT);
}
}
}
- RC隔離級別下,在事務中的每個語句開始時,會將當前系統中的所有的活躍事務拷貝到ReadView中,每次SQL語句結束后都會調用view_close將ReadView從事務中刪除,這樣在下一個SQL語句啟動時,會判斷trx->read_view為NULL,從而重新創建ReadView
storage/innobase/handler/ha_innodb.cc
/* Abridged listing of ha_innobase::external_lock(): the "..." line stands for
code that is not shown. The part shown handles the end of an SQL statement:
outside an explicit transaction it commits a started transaction, and at READ
COMMITTED or lower isolation it closes the transaction's active read view.
Returns 0 on the path shown. */
int ha_innobase::external_lock(THD *thd, /*!< in: handle to the user thread */
int lock_type) /*!< in: lock type */
{
...
/* If the MySQL lock count drops to zero we know that the current SQL
statement has ended */
if (trx->n_mysql_tables_in_use == 0) {
trx->mysql_n_tables_locked = 0;
m_prebuilt->used_in_HANDLER = FALSE;
/* Neither OPTION_NOT_AUTOCOMMIT nor OPTION_BEGIN is set. */
if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
if (trx_is_started(trx)) {
innobase_commit(ht, thd, TRUE);
} else {
/* Since the trx state is TRX_NOT_STARTED,
trx_commit() will not be called. Reset
trx->is_dd_trx here */
ut_d(trx->is_dd_trx = false);
}
/* Inside a transaction at READ COMMITTED or lower isolation: close the
active view under trx_sys->mutex (own_mutex is true). Presumably this lets
the next statement open a fresh view - confirm. */
} else if (trx->isolation_level <= TRX_ISO_READ_COMMITTED &&
MVCC::is_view_active(trx->read_view)) {
mutex_enter(&trx_sys->mutex);
trx_sys->mvcc->view_close(trx->read_view, true);
mutex_exit(&trx_sys->mutex);
}
}
/* The transaction is not started, this is not an unlock request, and a select
lock type other than LOCK_NONE is set: increment trx->will_lock. */
if (!trx_is_started(trx) && lock_type != F_UNLCK &&
(m_prebuilt->select_lock_type != LOCK_NONE ||
m_stored_select_lock_type != LOCK_NONE)) {
++trx->will_lock;
}
DBUG_RETURN(0);
}
可見性判斷
- 聚集索引會調用ReadView的changes_visible函數判斷是否可見,如果lock_clust_rec_cons_read_sees返回的是false,那么row_search_mvcc就會調用row_sel_build_prev_vers_for_mysql,該函數則調用row_vers_build_for_consistent_read,該函數根據回滾段中的信息不斷構建前一個版本信息直至當前事務可見
storage/innobase/lock/lock0lock.cc
/** Checks that a record is seen in a consistent read.
@return true if sees, or false if an earlier version of the record
should be retrieved */
bool lock_clust_rec_cons_read_sees(
const rec_t *rec, /*!< in: user record which should be read or
passed over by a read cursor */
dict_index_t *index, /*!< in: clustered index */
const ulint *offsets, /*!< in: rec_get_offsets(rec, index) */
ReadView *view) /*!< in: consistent read view */
{
ut_ad(index->is_clustered());
ut_ad(page_rec_is_user_rec(rec));
ut_ad(rec_offs_validate(rec, index, offsets));
/* Temp-tables are not shared across connections and multiple
transactions from different connections cannot simultaneously
operate on same temp-table and so read of temp-table is
always consistent read. */
if (srv_read_only_mode || index->table->is_temporary()) {
ut_ad(view == 0 || index->table->is_temporary());
return (true);
}
/* NOTE that we call this function while holding the search
system latch. */
trx_id_t trx_id = row_get_rec_trx_id(rec, index, offsets);
return (view->changes_visible(trx_id, index->table->name));
}
- 由于InnoDB的二級索引只保存page最后更新的trx_id,當利用二級索引進行查詢的時候,如果page的trx_id小于up_limit_id,可以直接判斷page的所有記錄對于當前view是可見的,否則需要回clustered索引進行判斷。如果記錄對于view不可見,需要通過記錄的DB_ROLL_PTR指針遍歷history list構造當前view可見版本數據
storage/innobase/lock/lock0lock.cc
/** Checks that a non-clustered index record is seen in a consistent read.
NOTE that a non-clustered index page contains so little information on
its modifications that also in the case false, the present version of
rec may be the right, but we must check this from the clustered index
record.
@return true if certainly sees, or false if an earlier version of the
clustered index record might be needed */
bool lock_sec_rec_cons_read_sees(
const rec_t *rec, /*!< in: user record which
should be read or passed over
by a read cursor */
const dict_index_t *index, /*!< in: index */
const ReadView *view) /*!< in: consistent read view */
{
ut_ad(page_rec_is_user_rec(rec));
/* NOTE that we might call this function while holding the search
system latch. */
if (recv_recovery_is_on()) {
return (false);
} else if (index->table->is_temporary()) {
/* Temp-tables are not shared across connections and multiple
transactions from different connections cannot simultaneously
operate on same temp-table and so read of temp-table is
always consistent read. */
return (true);
}
trx_id_t max_trx_id = page_get_max_trx_id(page_align(rec));
ut_ad(max_trx_id > 0);
return (view->sees(max_trx_id));
}
/** ReadView member, shown here outside its class body.
@param[in]  id  transaction id to check
@return true if id is strictly smaller than m_up_limit_id */
bool sees(trx_id_t id) const { return id < m_up_limit_id; }
事務的執行過程
- 更新記錄時會先用排他鎖鎖定該行
- 然后記錄redo log
- 原記錄將被放入到undo表空間中
- 填寫當前行的值以及新的事務編號,并通過DB_ROLL_PTR指向undo表空間中的原記錄
- 當插入的是一條新數據時,記錄上對應的回滾段指針為NULL
流程圖
總結
實質上MySQL所實現的MVCC機制就是水位線機制,或者說窗口機制。在其他系統里也有很多類似的設計思想,比如:
- Kafka的副本同步機制中的LEO、HW以及日志清理時的LW
- Java G1垃圾回收器的prevTAMS指針與nextTAMS指針
- TCP通信協議中的滑動窗口