RocksDB源碼分析 Read(一)內存讀取

Get

SuperVersion* sv = GetAndRefSuperVersion(cfd);
SequenceNumber snapshot;
// Obtain the snapshot sequence number (per the article: the current largest
// sequence). The code that does this is elided in this excerpt.
...
// Excerpt of the read path: probe the in-memory tables before anything else.
// `done` becomes true as soon as one of them produces the final answer.
bool done = false;
  if (!skip_memtable) {
    // Get value associated with key
    if (get_impl_options.get_value) {
      // Query the active memtable first.
      if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s,
                       &merge_context, &max_covering_tombstone_seq,
                       read_options, get_impl_options.callback,
                       get_impl_options.is_blob_index)) {
        done = true;
        get_impl_options.value->PinSelf();
        RecordTick(stats_, MEMTABLE_HIT);
      // Only fall back to the immutable memtables (sv->imm) while the status
      // is still OK or a merge is in progress.
      } else if ((s.ok() || s.IsMergeInProgress()) &&
                 sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s,
                              &merge_context, &max_covering_tombstone_seq,
                              read_options, get_impl_options.callback,
                              get_impl_options.is_blob_index)) {
        done = true;
        get_impl_options.value->PinSelf();
        RecordTick(stats_, MEMTABLE_HIT);
      }
    } 
...

memtable get

存在memtable里的key是key+(type and sequence)，其中type和seq合起來占8字節

// Consult the memtable bloom filter first; may_contain records whether the
// key could be present.
if (bloom_filter_) {
  // when both memtable_whole_key_filtering and prefix_extractor_ are set,
  // only do whole key filtering for Get() to save CPU
  if (moptions_.memtable_whole_key_filtering) {
    // Whole-key filtering: the timestamp is stripped from the user key before
    // the filter is probed.
    may_contain =
        bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz));
  } else {
    assert(prefix_extractor_);
    // Prefix filtering: a key outside the extractor's domain is treated as
    // "may contain"; otherwise the filter is probed with the key's prefix.
    may_contain =
        !prefix_extractor_->InDomain(user_key) ||
        bloom_filter_->MayContain(prefix_extractor_->Transform(user_key));
  }
}

// Fetch the entry from the memtable itself; results are reported through the
// out-parameters (value, s, seq, found_final_value, merge_in_progress).
GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback,
                 is_blob_index, value, s, merge_context, seq,
                 &found_final_value, &merge_in_progress);
 
// Looks up `key` in the underlying memtable representation (table_).
// The parameter list is elided ("...") in this excerpt; the names used below
// (s, found_final_value, value, callback, ...) are the function's arguments.
void MemTable::GetFromTable(...){
  // Build the Saver: the bundle of inputs and outputs that is handed to the
  // SaveValue callback for each entry the lookup visits.
  Saver saver;
  saver.status = s;
  saver.found_final_value = found_final_value;
  saver.merge_in_progress = merge_in_progress;
  saver.key = &key;
  saver.value = value;
  // Starts at kMaxSequenceNumber; SaveValue overwrites it with the sequence
  // number of the entry it accepts.
  saver.seq = kMaxSequenceNumber;
  saver.mem = this;
  saver.merge_context = merge_context;
  saver.max_covering_tombstone_seq = max_covering_tombstone_seq;
  saver.merge_operator = moptions_.merge_operator;
  saver.logger = moptions_.info_log;
  saver.inplace_update_support = moptions_.inplace_update_support;
  saver.statistics = moptions_.statistics;
  saver.env_ = env_;
  saver.callback_ = callback;
  saver.is_blob_index = is_blob_index;
  saver.do_merge = do_merge;
  // Run the lookup; SaveValue receives &saver as its opaque argument.
  table_->Get(key, &saver, SaveValue);
} 


// Seeks to lookup key `k` and feeds entries to `callback_func` one by one.
//
// k              the key to look up.
// callback_args  opaque pointer passed through to callback_func unchanged.
// callback_func  invoked with (callback_args, entry) for each entry starting
//                at the seek position; iteration stops as soon as it returns
//                false or the iterator becomes invalid.
void MemTableRep::Get(const LookupKey& k, void* callback_args,
                      bool (*callback_func)(void* arg, const char* entry)) {
  auto iter = GetDynamicPrefixIterator();
  // Search the underlying structure (a skip list, per the article). The loop
  // body is empty on purpose: all the work happens in the loop condition.
  for (iter->Seek(k.internal_key(), k.memtable_key().data());
       iter->Valid() && callback_func(callback_args, iter->key());
       iter->Next()) {
  }
}

// Positions the iterator on the first node whose key is >= target.
// (The template header of this member function is not part of the excerpt.)
inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) {
  // Find the node that matches both the key and the sequence we are after.
  // Returns the earliest node with a key >= key.
  // Return nullptr if there is no such node.
  // NOTE(review): the article states that keys are sorted "from large to
  // small" in the skip list. More likely entries are ordered by ascending
  // user key and, for equal user keys, by descending sequence number, so that
  // walking forward from the seek position only visits sequence numbers <=
  // the requested one -- confirm against the internal key comparator.
  node_ = list_->FindGreaterOrEqual(target);
}



// Callback invoked for each memtable entry visited by the lookup (abridged
// excerpt: the body of the switch is elided).
//
// arg    opaque pointer to the Saver that carries the lookup's inputs/outputs.
// entry  pointer to the encoded memtable entry (format described below).
//
// Returns true to make the caller continue with the next entry, false to stop
// the iteration.
static bool SaveValue(void* arg, const char* entry) {
  Saver* s = reinterpret_cast<Saver*>(arg);
  assert(s != nullptr);
  MergeContext* merge_context = s->merge_context;
  SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
  const MergeOperator* merge_operator = s->merge_operator;

  assert(merge_context != nullptr);

  // entry format is:
  //    klength  varint32
  //    userkey  char[klength-8]
  //    tag      uint64
  //    vlength  varint32
  //    value    char[vlength]
  // Check that it belongs to same user key.  We do not check the
  // sequence number since the Seek() call above should have skipped
  // all entries with overly large sequence numbers.
  uint32_t key_length;
  // Decode klength; key_ptr then points at the user key. (The entry + 5 limit
  // presumably reflects the maximum encoded size of a varint32.)
  const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
  // The last 8 bytes of the key are the tag, so the user key is klength - 8.
  Slice user_key_slice = Slice(key_ptr, key_length - 8);
  // Seek may have landed on a key greater than the one we want, so compare
  // the user keys here; an entry with a different user key is not processed.
  if (s->mem->GetInternalKeyComparator()
          .user_comparator()
          ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) {
    // Correct user key
    const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
    ValueType type;
    SequenceNumber seq;
    UnPackSequenceAndType(tag, &seq, &type);
    // If the value is not in the snapshot, skip it
    if (!s->CheckCallback(seq)) {
      return true;  // to continue to the next seq
    }

    s->seq = seq;

    // A value/merge/blob-index entry older than the covering range tombstone
    // is handled as if it were a range deletion.
    if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
        max_covering_tombstone_seq > seq) {
      type = kTypeRangeDeletion;
    }
    switch (type) {
      ...
      // Handle the key/value according to its type (cases elided here).
  }

  // s->state could be Corrupt, merge or notfound
  return false;
}

ThreadLocalSuperVersion

Rocksdb利用線程局部緩存和atomic來替換掉原先leveldb的version加鎖的邏輯

// Before a read, obtain an up-to-date SuperVersion (described by the article
// as the latest version set). Takes the SuperVersion cached in this thread's
// local storage when it is still current; otherwise takes a fresh reference
// to super_version_ under the DB mutex. Never returns nullptr.
SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
  // Swap kSVInUse into the thread-local slot to take out the cached
  // SuperVersion (every thread replaces its TLS entry with the in-use marker).
  // If no write happens in between, the slot keeps holding the in-use marker
  // until ReturnThreadLocalSuperVersion is called.
  void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
  // Invariant:
  // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
  // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
  // should only keep kSVInUse before ReturnThreadLocalSuperVersion call
  // (if no Scrape happens).
  assert(ptr != SuperVersion::kSVInUse);
  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
  // If the SuperVersion we just took out is already stale, drop it and fetch
  // the current one under the DB mutex instead.
  if (sv == SuperVersion::kSVObsolete ||
      sv->version_number != super_version_number_.load()) {
    RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES);
    SuperVersion* sv_to_delete = nullptr;

    // NOTE(review): kSVObsolete is presumably a null pointer, which is why a
    // plain `sv &&` check is enough to skip the obsolete marker -- confirm.
    if (sv && sv->Unref()) {
      RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS);
      db->mutex()->Lock();
      // NOTE: underlying resources held by superversion (sst files) might
      // not be released until the next background job.
      sv->Cleanup();
      if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
        db->AddSuperVersionsToFreeQueue(sv);
        db->SchedulePurge();
      } else {
        // Deleted below, after the mutex has been released.
        sv_to_delete = sv;
      }
    } else {
      db->mutex()->Lock();
    }
    // The mutex must be held here so that a background thread cannot change
    // super_version_ while we take a reference to the current global one.
    sv = super_version_->Ref();
    db->mutex()->Unlock();

    delete sv_to_delete;
  }
  assert(sv != nullptr);
  return sv;
}

ReturnAndCleanupSuperVersion

// Hands `sv` back to the thread-local cache of `cfd`; if that is not possible
// the SuperVersion is released through CleanupSuperVersion instead.
void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
                                          SuperVersion* sv) {
  if (!cfd->ReturnThreadLocalSuperVersion(sv)) {
    // Returning the SuperVersion to thread-local storage failed: the
    // compare-and-swap could not put it back, which means the slot was changed
    // in the meantime (per the article, a writer reset every thread's slot to
    // nullptr). Release the stale SuperVersion we are still holding.
    CleanupSuperVersion(sv);
  }
}

// Tries to store `sv` back into this thread's local slot.
//
// sv  the SuperVersion to put back; must not be nullptr.
//
// Returns true when the slot still held kSVInUse and `sv` was stored back;
// returns false when the compare-and-swap failed, in which case the caller
// remains responsible for `sv`.
bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
  assert(sv != nullptr);
  // Put the SuperVersion back
  void* expected = SuperVersion::kSVInUse;
  if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
    // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
    // storage has not been altered and no Scrape has happened. The
    // SuperVersion is still current.
    return true;
  } else {
    // ThreadLocal scrape happened in the process of this GetImpl call (after
    // thread local Swap() at the beginning and before CompareAndSwap()).
    // This means the SuperVersion it holds is obsolete.
    assert(expected == SuperVersion::kSVObsolete);
  }
  return false;
}

InstallSuperVersion

InstallSuperVersionAndScheduleWork->
// Makes the SuperVersion prepared in `sv_context` the current one for this
// column family and retires the previous one.
//
// sv_context          supplies new_superversion and collects follow-up work
//                     (write-stall notifications, SuperVersions to free).
// db_mutex            stored in the new SuperVersion.
// mutable_cf_options  options copied into the new SuperVersion and used to
//                     recalculate the write-stall condition.
void ColumnFamilyData::InstallSuperVersion(
    SuperVersionContext* sv_context, InstrumentedMutex* db_mutex,
    const MutableCFOptions& mutable_cf_options) {
  // Per the article, the caller already holds the lock at this point.
  SuperVersion* new_superversion = sv_context->new_superversion.release();
  new_superversion->db_mutex = db_mutex;
  new_superversion->mutable_cf_options = mutable_cf_options;
  new_superversion->Init(mem_, imm_.current(), current_);
  SuperVersion* old_superversion = super_version_;
  // Install the new SuperVersion and bump the version counter that readers
  // compare their cached copy against.
  super_version_ = new_superversion;
  ++super_version_number_;
  super_version_->version_number = super_version_number_;
  super_version_->write_stall_condition =
      RecalculateWriteStallConditions(mutable_cf_options);

  if (old_superversion != nullptr) {
    // Reset SuperVersions cached in thread local storage.
    // This should be done before old_superversion->Unref(). That's to ensure
    // that local_sv_ never holds the last reference to SuperVersion, since
    // it has no means to safely do SuperVersion cleanup.
    // Per the article, this sets the other threads' thread-local slots to
    // nullptr (SuperVersion::kSVObsolete).
    ResetThreadLocalSuperVersions();

    if (old_superversion->mutable_cf_options.write_buffer_size !=
        mutable_cf_options.write_buffer_size) {
      mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
    }
    if (old_superversion->write_stall_condition !=
        new_superversion->write_stall_condition) {
      sv_context->PushWriteStallNotification(
          old_superversion->write_stall_condition,
          new_superversion->write_stall_condition, GetName(), ioptions());
    }
    // If this was the last reference to old_superversion, clean it up and
    // queue it in sv_context to be freed.
    if (old_superversion->Unref()) {
      old_superversion->Cleanup();
      sv_context->superversions_to_free.push_back(old_superversion);
    }
  }
}

rocksdb對leveldb的讀優(yōu)化

Mutex用時也是Atomic的3倍。

rocksdb就是將leveldb里Get()實現中一上來就mutex加鎖的操作換成atomic+線程私有存儲的方式來進行優化，優化后讀操作基本很少再會有互斥，性能提高不少

©著作權歸作者所有,轉載或內容合作請聯系作者
  • 序言:七十年代末,一起剝皮案震驚了整個濱河市弟塞,隨后出現(xiàn)的幾起案子凭峡,更是在濱河造成了極大的恐慌,老刑警劉巖决记,帶你破解...
    沈念sama閱讀 206,311評論 6 481
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件摧冀,死亡現(xiàn)場離奇詭異,居然都是意外死亡系宫,警方通過查閱死者的電腦和手機索昂,發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 88,339評論 2 382
  • 文/潘曉璐 我一進店門,熙熙樓的掌柜王于貴愁眉苦臉地迎上來扩借,“玉大人椒惨,你說我怎么就攤上這事〕弊铮” “怎么了康谆?”我有些...
    開封第一講書人閱讀 152,671評論 0 342
  • 文/不壞的土叔 我叫張陵,是天一觀的道長错洁。 經(jīng)常有香客問我秉宿,道長,這世上最難降的妖魔是什么屯碴? 我笑而不...
    開封第一講書人閱讀 55,252評論 1 279
  • 正文 為了忘掉前任描睦,我火速辦了婚禮,結(jié)果婚禮上导而,老公的妹妹穿的比我還像新娘忱叭。我一直安慰自己,他們只是感情好今艺,可當(dāng)我...
    茶點故事閱讀 64,253評論 5 371
  • 文/花漫 我一把揭開白布韵丑。 她就那樣靜靜地躺著,像睡著了一般虚缎。 火紅的嫁衣襯著肌膚如雪撵彻。 梳的紋絲不亂的頭發(fā)上,一...
    開封第一講書人閱讀 49,031評論 1 285
  • 那天实牡,我揣著相機與錄音陌僵,去河邊找鬼。 笑死创坞,一個胖子當(dāng)著我的面吹牛碗短,可吹牛的內(nèi)容都是我干的。 我是一名探鬼主播题涨,決...
    沈念sama閱讀 38,340評論 3 399
  • 文/蒼蘭香墨 我猛地睜開眼偎谁,長吁一口氣:“原來是場噩夢啊……” “哼总滩!你這毒婦竟也來了?” 一聲冷哼從身側(cè)響起巡雨,我...
    開封第一講書人閱讀 36,973評論 0 259
  • 序言:老撾萬榮一對情侶失蹤闰渔,失蹤者是張志新(化名)和其女友劉穎,沒想到半個月后鸯隅,有當(dāng)?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體澜建,經(jīng)...
    沈念sama閱讀 43,466評論 1 300
  • 正文 獨居荒郊野嶺守林人離奇死亡,尸身上長有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點故事閱讀 35,937評論 2 323
  • 正文 我和宋清朗相戀三年蝌以,在試婚紗的時候發(fā)現(xiàn)自己被綠了。 大學(xué)時的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片何之。...
    茶點故事閱讀 38,039評論 1 333
  • 序言:一個原本活蹦亂跳的男人離奇死亡跟畅,死狀恐怖,靈堂內(nèi)的尸體忽然破棺而出溶推,到底是詐尸還是另有隱情徊件,我是刑警寧澤,帶...
    沈念sama閱讀 33,701評論 4 323
  • 正文 年R本政府宣布蒜危,位于F島的核電站虱痕,受9級特大地震影響,放射性物質(zhì)發(fā)生泄漏辐赞。R本人自食惡果不足惜部翘,卻給世界環(huán)境...
    茶點故事閱讀 39,254評論 3 307
  • 文/蒙蒙 一、第九天 我趴在偏房一處隱蔽的房頂上張望响委。 院中可真熱鬧新思,春花似錦、人聲如沸赘风。這莊子的主人今日做“春日...
    開封第一講書人閱讀 30,259評論 0 19
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽邀窃。三九已至荸哟,卻和暖如春,著一層夾襖步出監(jiān)牢的瞬間瞬捕,已是汗流浹背鞍历。 一陣腳步聲響...
    開封第一講書人閱讀 31,485評論 1 262
  • 我被黑心中介騙來泰國打工, 沒想到剛下飛機就差點兒被人妖公主榨干…… 1. 我叫王不留山析,地道東北人堰燎。 一個月前我還...
    沈念sama閱讀 45,497評論 2 354
  • 正文 我出身青樓,卻偏偏與公主長得像笋轨,于是被迫代替她去往敵國和親秆剪。 傳聞我的和親對象是個殘疾皇子赊淑,可洞房花燭夜當(dāng)晚...
    茶點故事閱讀 42,786評論 2 345

推薦閱讀更多精彩內(nèi)容

  • MemTable MemTable是一個內(nèi)存中數(shù)據(jù)結(jié)構(gòu),用來保存新寫入的還沒有flush到SST文件中的數(shù)據(jù)仅讽。 讀...
    周肅閱讀 4,905評論 1 5
  • 久違的晴天陶缺,家長會。 家長大會開好到教室時洁灵,離放學(xué)已經(jīng)沒多少時間了饱岸。班主任說已經(jīng)安排了三個家長分享經(jīng)驗。 放學(xué)鈴聲...
    飄雪兒5閱讀 7,493評論 16 22
  • 今天感恩節(jié)哎徽千,感謝一直在我身邊的親朋好友苫费。感恩相遇!感恩不離不棄双抽。 中午開了第一次的黨會百框,身份的轉(zhuǎn)變要...
    迷月閃星情閱讀 10,551評論 0 11
  • 可愛進取,孤獨成精牍汹。努力飛翔铐维,天堂翱翔。戰(zhàn)爭美好慎菲,孤獨進取嫁蛇。膽大飛翔,成就輝煌露该。努力進取睬棚,遙望,和諧家園有决。可愛游走...
    趙原野閱讀 2,716評論 1 1
  • 在妖界我有個名頭叫胡百曉书幕,無論是何事新荤,只要找到胡百曉即可有解決的辦法。因為是只狐貍大家以訛傳訛叫我“傾城百曉”台汇,...
    貓九0110閱讀 3,255評論 7 3