acquire_locks是在MDS的各種請求流程中都必經的過程,下面對其進行分析
源碼文件:src/mds/Locker.cc
加鎖函數:
/* If this function returns false, the mdr has been placed
* on the appropriate wait list */
bool Locker::acquire_locks(MDRequestRef& mdr,
set<SimpleLock*> &rdlocks,
set<SimpleLock*> &wrlocks,
set<SimpleLock*> &xlocks,
map<SimpleLock*,mds_rank_t> *remote_wrlocks,
CInode *auth_pin_freeze,
bool auth_pin_nonblock)
針對此函數參數說明:
- mdr: 客戶端請求
- rdlocks: 需要加的讀鎖
- wrlocks: 需要加的本地寫鎖
- xlocks: 需要加的互斥鎖
- remote_wrlocks: 需要加的遠程寫鎖,默認為NULL
- auth_pin_freeze: 需要被freeze的Inode,默認為NULL
- auth_pin_nonblock: 默認為false,只有在dispatch_fragment_dir和dispatch_export_dir時為true
remote_wrlocks和auth_pin_freeze參數只會在一個流程中使用到,即rename流程
分析rename的流程可知,僅在跨目錄做rename操作,并且源端和目標端的父目錄分別屬于不同的RANK時,才會產生remote_wrlocks(遠程寫鎖)
代碼如下:
// Excerpt of the rename handler showing how the lock sets passed to
// Locker::acquire_locks are built; "..." marks code elided by the article.
void Server::handle_client_rename(MDRequestRef& mdr)
{
...
// -- locks --
map<SimpleLock*, mds_rank_t> remote_wrlocks;
...
// If the source directory's auth rank (srcdirauth) is not this MDS, the
// filelock and nestlock of the source dentry's parent directory inode are
// requested as remote wrlocks on that rank. Otherwise the same two locks are
// taken as local wrlocks.
// NOTE(article): the author describes "this MDS" as the MDS of the
// destination directory; that is not visible in this excerpt.
if (srcdirauth != mds->get_nodeid()) {
dout(10) << " will remote_wrlock srcdir scatterlocks on mds." << srcdirauth << dendl;
remote_wrlocks[&srcdn->get_dir()->inode->filelock] = srcdirauth;
remote_wrlocks[&srcdn->get_dir()->inode->nestlock] = srcdirauth;
if (srci->is_dir())
rdlocks.insert(&srci->dirfragtreelock);
} else {
wrlocks.insert(&srcdn->get_dir()->inode->filelock);
wrlocks.insert(&srcdn->get_dir()->inode->nestlock);
}
...
// The source inode is passed as auth_pin_freeze only when the source dentry
// is not auth on this MDS and its linkage is primary; otherwise NULL.
CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL;
// A false return means the request could not take all locks now; the handler
// simply returns.
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks,
&remote_wrlocks, auth_pin_freeze))
return;
...
}
接下來分析acquire_locks的處理流程,由于過程較為復雜,進行分段分析
源碼文件:src/mds/Locker.cc
函數: acquire_locks
第一步,整理出所有需要加的鎖并排序,整理出mustpin列表
排序非常重要,是防止死鎖的關鍵
// Step 1 of acquire_locks (excerpt): gather every lock to be taken into one
// ordered set, and collect the cache objects that must be auth-pinned.
set<SimpleLock*, SimpleLock::ptr_lt> sorted; // sort everything we will lock
set<MDSCacheObject*> mustpin; // items to authpin
// Walk the xlocks (exclusive locks).
for (set<SimpleLock*>::iterator p = xlocks.begin(); p != xlocks.end(); ++p) {
SimpleLock *lock = *p;
if ((lock->get_type() == CEPH_LOCK_ISNAP ||
lock->get_type() == CEPH_LOCK_IPOLICY) &&
mds->is_cluster_degraded() &&
mdr->is_master() &&
!mdr->is_queued_for_replay()) {
// Snap/policy xlock on a master request while the cluster is degraded and
// the request is not queued for replay: locking cannot succeed now. Drop
// the locks and local auth pins already held, retry the request once the
// cluster has recovered, and return false.
// NOTE(review): the declaration/computation of `wait` is not part of this
// excerpt.
if (wait) {
dout(10) << " must xlock " << *lock << " " << *lock->get_parent()
<< ", waiting for cluster recovered" << dendl;
mds->locker->drop_locks(mdr.get(), NULL);
mdr->drop_local_auth_pins();
mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
return false;
}
}
// Add the lock to the sorted set and its parent object to mustpin.
sorted.insert(lock);
mustpin.insert(lock->get_parent());
// If the xlock is a dentry lock (CEPH_LOCK_DN), the dentry's versionlock is
// locked as well: wrlock for a master request, xlock for a slave request.
// NOTE(article): the author states the goal is to keep updates to the same
// dentry ordered when they are committed to the journal.
if ((*p)->get_type() == CEPH_LOCK_DN) {
CDentry *dn = (CDentry*)lock->get_parent();
if (!dn->is_auth())
continue;
if (xlocks.count(&dn->versionlock))
continue; // we're xlocking the versionlock too; don't wrlock it!
if (mdr->is_master()) {
// master. wrlock versionlock so we can pipeline dentry updates to journal.
wrlocks.insert(&dn->versionlock);
} else {
// slave. exclusively lock the dentry version (i.e. block other journal updates).
// this makes rollback safe.
xlocks.insert(&dn->versionlock);
sorted.insert(&dn->versionlock);
}
}
// Same treatment for inode locks (type greater than CEPH_LOCK_IVERSION):
// the inode's versionlock is wrlocked for a master request and xlocked for a
// slave request.
// NOTE(article): the author states the goal is to keep updates to the same
// inode ordered when they are committed to the journal.
if (lock->get_type() > CEPH_LOCK_IVERSION) {
// inode version lock?
CInode *in = (CInode*)lock->get_parent();
if (!in->is_auth())
continue;
if (mdr->is_master()) {
// master. wrlock versionlock so we can pipeline inode updates to journal.
wrlocks.insert(&in->versionlock);
} else {
// slave. exclusively lock the inode version (i.e. block other journal updates).
// this makes rollback safe.
xlocks.insert(&in->versionlock);
sorted.insert(&in->versionlock);
}
}
}
// At this point the xlocks (plus any versionlocks added as xlocks above) are
// in sorted, and the xlocks' parent objects are in mustpin. Versionlocks
// added to wrlocks are picked up by the wrlocks loop below.
// Walk the wrlocks.
for (set<SimpleLock*>::iterator p = wrlocks.begin(); p != wrlocks.end(); ++p) {
MDSCacheObject *object = (*p)->get_parent();
sorted.insert(*p);
if (object->is_auth())
// Object is auth on this rank: must be auth-pinned.
mustpin.insert(object);
else if (!object->is_auth() &&
!(*p)->can_wrlock(client) && // we might have to request a scatter
!mdr->is_slave()) { // if we are slave (remote_wrlock), the master already authpinned
// Not auth here and the wrlock cannot be taken right now: pin it too.
// NOTE(article): the author reads this as the lock being held (xlock or
// wrlock) by another client.
mustpin.insert(object);
}
}
// Walk the remote wrlocks, if any: each lock goes into sorted and its parent
// object into mustpin unconditionally.
if (remote_wrlocks) {
for (map<SimpleLock*,mds_rank_t>::iterator p = remote_wrlocks->begin(); p != remote_wrlocks->end(); ++p) {
MDSCacheObject *object = p->first->get_parent();
dout(20) << " must remote_wrlock on mds." << p->second << " "
<< *p->first << " " << *object << dendl;
sorted.insert(p->first);
mustpin.insert(object);
}
}
// Walk the rdlocks.
for (set<SimpleLock*>::iterator p = rdlocks.begin();
p != rdlocks.end();
++p) {
MDSCacheObject *object = (*p)->get_parent();
sorted.insert(*p);
if (object->is_auth())
// Object is auth on this rank: must be auth-pinned.
mustpin.insert(object);
else if (!object->is_auth() &&
!(*p)->can_rdlock(client)) { // we might have to request an rdlock
// Not auth here and the rdlock cannot be taken right now: pin it too.
// NOTE(article): the author reads this as the lock being held (xlock or
// wrlock) by another client.
mustpin.insert(object);
}
}
第二步,Local auth pin處理
// Step 2 of acquire_locks (excerpt): check that every object in mustpin can
// be auth-pinned, sort non-auth objects by their auth rank into
// mustpin_remote, then take the local auth pins.
map<mds_rank_t, set<MDSCacheObject*> > mustpin_remote; // mds -> (object set)
// can i auth pin them all now?
marker.message = "failed to authpin local pins";
// Walk the mustpin set built in step 1.
for (set<MDSCacheObject*>::iterator p = mustpin.begin();
p != mustpin.end();
++p) {
MDSCacheObject *object = *p;
dout(10) << " must authpin " << *object << dendl;
if (mdr->is_auth_pinned(object)) {
// Already pinned by this request: nothing more to do unless this object is
// the inode that must be frozen.
if (object != (MDSCacheObject*)auth_pin_freeze)
continue;
if (mdr->more()->is_remote_frozen_authpin) {
if (mdr->more()->rename_inode == auth_pin_freeze)
continue;
// unfreeze auth pin for the wrong inode
// operator[] is used only to create an (initially empty) entry for that
// rank; the size() result is discarded.
mustpin_remote[mdr->more()->rename_inode->authority().first].size();
}
}
// Object is not auth on this rank: drop any locks already held.
// - If its authority is ambiguous, wait for a single authority, drop the
//   local auth pins and retry the request later.
// - Otherwise file the object under its auth rank in mustpin_remote for the
//   remote auth-pin step.
if (!object->is_auth()) {
if (!mdr->locks.empty())
drop_locks(mdr.get());
if (object->is_ambiguous_auth()) {
// wait
marker.message = "waiting for single auth, object is being migrated";
dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl;
object->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
mdr->drop_local_auth_pins();
return false;
}
mustpin_remote[object->authority().first].insert(object);
continue;
}
int err = 0;
// NOTE(article): per the author, can_auth_pin fails (non-zero err) when the
// dir/inode is not auth on this rank, or when it or a node in its subtree is
// freezing or frozen. can_auth_pin itself is not shown here.
if (!object->can_auth_pin(&err)) {
// Retry the request once the object is unfrozen.
object->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
if (!mdr->remote_auth_pins.empty())
notify_freeze_waiter(object);
return false;
}
}
// Reaching here, every mustpin object is already pinned by this request, can
// be auth-pinned locally, or has been filed into mustpin_remote.
// Take the local auth pins.
for (set<MDSCacheObject*>::iterator p = mustpin.begin();
p != mustpin.end();
++p) {
MDSCacheObject *object = *p;
if (mdr->is_auth_pinned(object)) {
dout(10) << " already auth_pinned " << *object << dendl;
} else if (object->is_auth()) {
dout(10) << " auth_pinning " << *object << dendl;
mdr->auth_pin(object);
}
}
第三步,Remote auth pin處理
// Step 3 of acquire_locks (excerpt): send OP_AUTHPIN slave requests to the
// ranks that are auth for the objects in mustpin_remote, then return false.
// request remote auth_pins
if (!mustpin_remote.empty()) {
marker.message = "requesting remote authpins";
// For objects this request has already auth-pinned remotely
// (mdr->remote_auth_pins) and that are still in mustpin, add them to the
// mustpin_remote set of their rank, but only when that rank already has an
// entry, i.e. a request is going to be sent to it anyway.
for (map<MDSCacheObject*,mds_rank_t>::iterator p = mdr->remote_auth_pins.begin();
p != mdr->remote_auth_pins.end();
++p) {
if (mustpin.count(p->first)) {
assert(p->second == p->first->authority().first);
map<mds_rank_t, set<MDSCacheObject*> >::iterator q = mustpin_remote.find(p->second);
if (q != mustpin_remote.end())
q->second.insert(p->first);
}
}
// mustpin_remote now holds, per rank, the non-auth objects from mustpin that
// still need a pin plus the already-pinned ones merged in above.
for (map<mds_rank_t, set<MDSCacheObject*> >::iterator p = mustpin_remote.begin();
p != mustpin_remote.end();
++p) {
dout(10) << "requesting remote auth_pins from mds." << p->first << dendl;
// If the cluster is degraded and the target rank is not in
// clientreplay/active/stopping state, wait for that peer to become active
// and retry the request.
if (mds->is_cluster_degraded() &&
!mds->mdsmap->is_clientreplay_or_active_or_stopping(p->first)) {
dout(10) << " mds." << p->first << " is not active" << dendl;
if (mdr->more()->waiting_on_slave.empty())
mds->wait_for_active_peer(p->first, new C_MDS_RetryRequest(mdcache, mdr));
return false;
}
// Target rank is usable: build an MMDSSlaveRequest with op OP_AUTHPIN.
MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
MMDSSlaveRequest::OP_AUTHPIN);
// All objects belonging to the same rank travel in one request.
for (set<MDSCacheObject*>::iterator q = p->second.begin();
q != p->second.end();
++q) {
dout(10) << " req remote auth_pin of " << **q << dendl;
MDSCacheObjectInfo info;
(*q)->set_object_info(info);
req->get_authpins().push_back(info);
// Also record the object as the one to freeze when it is auth_pin_freeze.
if (*q == auth_pin_freeze)
(*q)->set_object_info(req->get_authpin_freeze());
mdr->pin(*q);
}
if (auth_pin_nonblock)
req->mark_nonblock();
mds->send_message_mds(req, p->first);
// After sending, record the rank in the request's waiting_on_slave set.
assert(mdr->more()->waiting_on_slave.count(p->first) == 0);
mdr->more()->waiting_on_slave.insert(p->first);
}
// Locks are not acquired in this pass; the caller sees false.
return false;
}
OP_AUTHPIN請求會被對端MDS的Server::handle_slave_request處理
最終分發給Server::handle_slave_auth_pin處理,其處理過程概要如下:
- 取出請求中Master端發送的auth_pin objects
- 若object或者其子樹節點存在freeze狀態,則等待unfreeze后重試
- 有指定auth_pin_freeze的情況(rename的情況),則等待對應inode上的其它操作全部完成以后,對其做freeze_auth_pin操作,將其freeze
- 對所有objects做auth_pin操作
- 返回應答消息MMDSSlaveRequest,op類型為OP_AUTHPINACK,消息中包含了本次auth_pin成功的object列表
回到Master源端,OP_AUTHPINACK操作會分發到Server::handle_slave_auth_pin_ack處理
其處理流程概要如下:
- 從ack消息取出objects,加入到pinned列表
- 將objects從remote_auth_pins移除
- 將RANK從waiting_on_slave中移除
- waiting_on_slave列表中所有rank都返回了ack,都從waiting_on_slave中移除之后,重新分發請求 mdcache->dispatch_request(mdr)
第三步很可能因為remote auth pin失敗而重復執行,最終全部成功以后,繼續接下來的第四步操作
第四步,加鎖操作
// Step 4 of acquire_locks (excerpt): walk the sorted lock set, reconcile it
// with the locks the request already holds, and acquire what is missing.
// "..." marks code elided by the article.
// caps i'll need to issue
set<CInode*> issue_set;
bool result = false;
// acquire locks.
// make sure they match currently acquired locks.
set<SimpleLock*, SimpleLock::ptr_lt>::iterator existing = mdr->locks.begin();
for (set<SimpleLock*, SimpleLock::ptr_lt>::iterator p = sorted.begin();
p != sorted.end();
++p) {
bool need_wrlock = !!wrlocks.count(*p);
bool need_remote_wrlock = !!(remote_wrlocks && remote_wrlocks->count(*p));
// The request already holds this lock.
if (existing != mdr->locks.end() && *existing == *p) {
// right kind?
SimpleLock *have = *existing;
++existing;
...
if (mdr->remote_wrlocks.count(have)) {
if (!need_remote_wrlock ||
mdr->remote_wrlocks[have] != (*remote_wrlocks)[have]) {
// Held remote wrlock is not wanted, or is held on a different rank than
// requested; the release code is elided in this excerpt.
// NOTE(article): the author is unsure how such a mismatch arises.
}
}
...
continue;
}
// Per the article, two cases remain here:
// 1. stray locks: wrlocks held by mdr that do not match what sorted asks
//    for (the author is unsure how this happens);
// 2. the lock is not held yet and must be acquired.
if (existing != mdr->locks.end() && *existing == *p) {
assert(need_wrlock || need_remote_wrlock);
SimpleLock *lock = *existing;
if (mdr->wrlocks.count(lock)) {
// Release the stray wrlock.
// NOTE(review): need_issue is declared in code elided from this excerpt.
wrlock_finish(lock, mdr.get(), &need_issue);
...
}
++existing;
}
// Every remaining lock held by the request beyond this point is released
// (release code elided) and re-acquired later.
// NOTE(article): the author infers this keeps acquisition strictly in
// sorted order to avoid deadlock.
while (existing != mdr->locks.end()) {
SimpleLock *stray = *existing;
++existing;
// Release whichever kind of lock is held.
...
}
// Cancel an in-progress locking operation if it targets a different lock
// than the one being acquired now.
if (mdr->locking && *p != mdr->locking) {
cancel_locking(mdr.get(), &issue_set);
}
if (xlocks.count(*p)) {
// The lock is wanted as an xlock.
// NOTE(article): author's summary of xlock_start (not shown here):
// 1. CEPH_LOCK_IVERSION / CEPH_LOCK_DVERSION are xlocked locally;
// 2. with no conflict the xlock is taken and the call returns;
// 3. for CEPH_LOCK_IFILE on a recovering inode, mdcache raises the inode's
//    recovery priority;
// 4. if the lock is not stable, the request waits for stable and is retried
//    back to this point;
// 5. if the parent object is not auth on this rank, an
//    MMDSSlaveRequest::OP_XLOCK is sent to the auth MDS and the request is
//    re-dispatched to this point on ack.
if (!xlock_start(*p, mdr))
goto out;
dout(10) << " got xlock on " << **p << " " << *(*p)->get_parent() << dendl;
} else if (need_wrlock || need_remote_wrlock) {
// wrlock and remote wrlock handling.
if (need_remote_wrlock && !mdr->remote_wrlocks.count(*p)) {
// Remote wrlock not held yet: start the remote wrlock request and leave.
// NOTE(article): this sends MMDSSlaveRequest::OP_WRLOCK and the request
// is re-dispatched to this point on ack.
marker.message = "waiting for remote wrlocks";
remote_wrlock_start(*p, (*remote_wrlocks)[*p], mdr);
goto out;
}
if (need_wrlock && !mdr->wrlocks.count(*p)) {
// Local wrlock handling.
if (need_remote_wrlock && !(*p)->can_wrlock(mdr->get_client())) {
// The local wrlock cannot be taken now: release the remote wrlock already
// held and request it again.
// NOTE(article): the author explains this as a conflict with a gather in
// progress; releasing lets the gather finish.
remote_wrlock_finish(*p, mdr->remote_wrlocks[*p], mdr.get());
remote_wrlock_start(*p, (*remote_wrlocks)[*p], mdr);
goto out;
}
// NOTE(article): author's summary of wrlock_start (not shown here):
// 1. CEPH_LOCK_IVERSION / CEPH_LOCK_DVERSION are wrlocked locally;
// 2. with no conflict the wrlock is taken and the call returns;
// 3. on conflict while a remote wrlock is held, it returns false;
// 4. if the lock is not stable while a remote wrlock is held, it returns
//    false;
// 5. if the lock is not stable and no remote wrlock is involved, the request
//    waits for stable and is retried back to this point;
// 6. if the parent object is not auth on this rank, an MLock request is sent
//    to the auth MDS and the request is re-dispatched to this point on ack.
if (!wrlock_start(*p, mdr, need_remote_wrlock))
goto out;
dout(10) << " got wrlock on " << **p << " " << *(*p)->get_parent() << dendl;
}
} else {
// rdlock handling.
assert(mdr->is_master());
if ((*p)->needs_recover()) {
if (mds->is_cluster_degraded()) {
if (!mdr->is_queued_for_replay()) {
// Cluster is degraded: wait for recovery, then the request is
// re-dispatched to this point (code elided).
...
}
} else {
(*p)->clear_need_recover();
}
}
// NOTE(article): author's summary of rdlock_start (not shown here):
// 1. with no conflict the rdlock is taken and the call returns;
// 2. on conflict it loops, syncing the lock state and retrying;
// 3. if the lock is not stable, the request waits for stable and is retried
//    back to this point;
// 4. if the parent object is not auth on this rank, an MLock request is sent
//    to the auth MDS and the request is re-dispatched to this point on ack.
if (!rdlock_start(*p, mdr))
goto out;
dout(10) << " got rdlock on " << **p << " " << *(*p)->get_parent() << dendl;
}
}
// Reaching here, every lock in sorted has been acquired.
// Any locks the request still holds beyond those are released.
// NOTE(article): the author is unsure why stray locks can remain here.
while (existing != mdr->locks.end()) {
SimpleLock *stray = *existing;
++existing;
dout(10) << " unlocking extra " << *stray << " " << *stray->get_parent() << dendl;
bool need_issue = false;
if (mdr->xlocks.count(stray)) {
xlock_finish(stray, mdr.get(), &need_issue);
} else if (mdr->rdlocks.count(stray)) {
rdlock_finish(stray, mdr.get(), &need_issue);
} else {
// may have acquired both wrlock and remote wrlock
if (mdr->wrlocks.count(stray))
wrlock_finish(stray, mdr.get(), &need_issue);
if (mdr->remote_wrlocks.count(stray))
remote_wrlock_finish(stray, mdr->remote_wrlocks[stray], mdr.get());
}
if (need_issue)
issue_set.insert(static_cast<CInode*>(stray->get_parent()));
}
// Locking finished: mark the request and report success.
mdr->done_locking = true;
mdr->set_mds_stamp(ceph_clock_now());
result = true;
marker.message = "acquired locks";
// Common exit: issue caps for the inodes collected in issue_set, on both the
// success path and the early-exit paths above.
out:
issue_caps_set(issue_set);
return result;
小結
整個加鎖過程可以概括為如下4個步驟:
- 羅列需要加的鎖以及需要auth pin的objects(dir/dentry/inode)
  對于dentry和inode的獨占鎖,還需將其version_lock加寫鎖,以保證journal的保序性
- 本地objects的auth_pin操作,增加其auth pin計數,inode、dentry和dir都會做auth_pin
- 遠程objects的auth_pin操作,主要涉及到需要遠程wrlock的對象,以及路徑上不屬于本RANK的object,但被其它客戶端鎖定的情況,此種對象存在跨MDS數據一致性風險,所以也需要做auth_pin操作
- 按照sorted順序循環加鎖,可能是獨占鎖、讀鎖或寫鎖,每個鎖的加鎖操作都可能遇到這些情況:
  - MDS recover狀態引起的等待和重試
  - lock狀態非stable引起的等待和重試
  - 關聯對象不屬于本RANK而引起的遠程鎖請求和重新分發原請求
  - 沖突而導致的循環等待
  - 不可解決的問題,導致加鎖失敗退出
  - 非常順利的無沖突加鎖成功
一點不成熟的想法:
加鎖過程非常的復雜,若目錄層級深,RANK較多,特別在開啟動態子樹分區的情況下,很可能會產生非常多的跨MDS遠程鎖請求,這將嚴重影響文件系統的元數據性能。
實踐中,盡量將目錄結構扁平化,多活MDS環境下,使用靜態分區可能是更好的選擇