RocksDB寫入資料過程DBImpl::Write()原始碼分析

阿新 • • 發佈：2019-01-06

// Applies the updates in `my_batch` to the database.
//
// The calling thread registers itself as a WriteThread::Writer, waits for its
// turn in the writer queue, and then either finds that its batch was already
// written on its behalf by another writer, or becomes the writer that
// performs the write. In the latter case it makes room if needed (switching
// memtables / scheduling flushes), honors write stalls, groups queued
// batches together, appends the group to the WAL (unless disabled), inserts
// it into the memtables, and publishes the new last sequence number.
//
// Parameters:
//   write_options - per-write settings read here: sync, disableWAL,
//                   timeout_hint_us, ignore_missing_column_families.
//   my_batch      - the updates to apply; must not be nullptr.
//
// Returns:
//   Status::Corruption if my_batch is nullptr; Status::TimedOut if the
//   timeout expires while queued or before the write starts; the status
//   recorded in the writer when another thread completed the write;
//   otherwise the result of the WAL append / sync / memtable insert.
Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
  if (my_batch == nullptr) {
    return Status::Corruption("Batch is nullptr!");
  }
  PERF_TIMER_GUARD(write_pre_and_post_process_time);
  // WriteThread::Writer is the abstraction of one write task and represents a
  // single user write. Its batch field holds the data to be written; sync
  // says whether the WAL must be fsync/fdatasync'ed for this write;
  // disableWAL says whether the WAL is skipped; done is set once the write
  // has been completed; timeout_hint_us is the time budget for the write.
  // in_batch_group is the interesting one: RocksDB batches user writes
  // together where it can. write_thread_ keeps a queue of
  // WriteThread::Writer*; when the writer at the head of the queue is about
  // to write, BuildBatchGroup() tries to pull the writers queued right behind
  // it into one batch group that is written to the database in one go.
  WriteThread::Writer w(&mutex_);
  w.batch = my_batch;
  w.sync = write_options.sync;
  w.disableWAL = write_options.disableWAL;
  w.in_batch_group = false;
  w.done = false;
  w.timeout_hint_us = write_options.timeout_hint_us;

  // A timeout hint of 0 means "no timeout"; otherwise compute the absolute
  // deadline once so every later check compares against the same instant.
  uint64_t expiration_time = 0;
  bool has_timeout = false;
  if (w.timeout_hint_us == 0) {
    w.timeout_hint_us = WriteThread::kNoTimeOut;
  } else {
    expiration_time = env_->NowMicros() + w.timeout_hint_us;
    has_timeout = true;
  }

  if (!write_options.disableWAL) {
    RecordTick(stats_, WRITE_WITH_WAL);
  }

  // Scratch state for this write. Its schedule_bg_work_ flag is set below
  // whenever a flush is queued, and is checked before returning to decide
  // whether to call MaybeScheduleFlushOrCompaction().
  WriteContext context;
  mutex_.Lock();

  if (!write_options.disableWAL) {
    default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1);
  }

  // Enqueue this write task (w) and sleep on mutex_ until one of:
  //   1) a timeout was set for the write and it expired; or
  //   2) all tasks ahead of w have finished and w is at the head of the
  //      queue; or
  //   3) w was completed by another writer thread.
  // Case 3 means w was merged into an earlier writer's batch group, in which
  // case w is marked in_batch_group. Per the original author's notes, if
  // EnterWriteThread() wakes up on a timeout and finds in_batch_group set, it
  // keeps waiting, because another thread has already taken w into its batch
  // group and is about to write it.
  Status status = write_thread_.EnterWriteThread(&w, expiration_time);
  assert(status.ok() || status.IsTimedOut());
  if (status.IsTimedOut()) {
    mutex_.Unlock();
    RecordTick(stats_, WRITE_TIMEDOUT);
    return Status::TimedOut();
  }
  if (w.done) {  // write was done by someone else
    default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
                                           1);
    mutex_.Unlock();
    RecordTick(stats_, WRITE_DONE_BY_OTHER);
    return w.status;
  }

  RecordTick(stats_, WRITE_DONE_BY_SELF);
  default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);

  // Once reaches this point, the current writer "w" will try to do its write
  // job.  It may also pick up some of the remaining writers in the "writers_"
  // when it finds suitable, and finish them in the same write batch.
  // This is how a write job could be done by the other writer.
  assert(!single_column_family_mode_ ||
         versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1);

  // A configured max_total_wal_size of 0 falls back to four times the total
  // in-memory state.
  uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0)
                                    ? 4 * max_total_in_memory_state_
                                    : db_options_.max_total_wal_size;
  if (UNLIKELY(!single_column_family_mode_) &&
      alive_log_files_.begin()->getting_flushed == false &&
      total_log_size_ > max_total_wal_size) {
    // There are multiple column families, the oldest alive WAL is not yet
    // being flushed, and the total WAL size exceeds the limit: switch every
    // column family that still depends on that WAL to a new memtable and
    // request a flush of the old (now immutable) memtable to disk.
    uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number;
    alive_log_files_.begin()->getting_flushed = true;
    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
        "Flushing all column families with data in WAL number %" PRIu64
        ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
        flush_column_family_if_log_file, total_log_size_, max_total_wal_size);
    // no need to refcount because drop is happening in write thread, so can't
    // happen while we're in the write thread
    for (auto cfd : *versions_->GetColumnFamilySet()) {
      if (cfd->IsDropped()) {
        continue;
      }
      if (cfd->GetLogNumber() <= flush_column_family_if_log_file) {
        status = SetNewMemtableAndNewLogFile(cfd, &context);
        if (!status.ok()) {
          break;
        }
        cfd->imm()->FlushRequested();
        SchedulePendingFlush(cfd);
        context.schedule_bg_work_ = true;
      }
    }
  } else if (UNLIKELY(write_buffer_.ShouldFlush())) {
    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
        "Flushing all column families. Write buffer is using %" PRIu64
        " bytes out of a total of %" PRIu64 ".",
        write_buffer_.memory_usage(), write_buffer_.buffer_size());
    // no need to refcount because drop is happening in write thread, so can't
    // happen while we're in the write thread
    for (auto cfd : *versions_->GetColumnFamilySet()) {
      if (cfd->IsDropped()) {
        continue;
      }
      if (!cfd->mem()->IsEmpty()) {
        status = SetNewMemtableAndNewLogFile(cfd, &context);
        if (!status.ok()) {
          break;
        }
        cfd->imm()->FlushRequested();
        SchedulePendingFlush(cfd);
        context.schedule_bg_work_ = true;
      }
    }
    MaybeScheduleFlushOrCompaction();
  }

  // A previously recorded background error fails this write.
  if (UNLIKELY(status.ok() && !bg_error_.ok())) {
    status = bg_error_;
  }

  if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
    status = ScheduleFlushes(&context);
  }

  if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
                               write_controller_.GetDelay() > 0))) {
    // If writer is stopped, we need to get it going,
    // so schedule flushes/compactions
    if (context.schedule_bg_work_) {
      MaybeScheduleFlushOrCompaction();
    }
    status = DelayWrite(expiration_time);
  }

  // Re-check the deadline after the steps above, before doing the write.
  if (UNLIKELY(status.ok() && has_timeout &&
               env_->NowMicros() > expiration_time)) {
    status = Status::TimedOut();
  }

  uint64_t last_sequence = versions_->LastSequence();
  WriteThread::Writer* last_writer = &w;
  if (status.ok()) {
    autovector<WriteBatch*> write_batch_group;
    write_thread_.BuildBatchGroup(&last_writer, &write_batch_group);

    // Add to log and apply to memtable.  We can release the lock
    // during this phase since &w is currently responsible for logging
    // and protects against concurrent loggers and concurrent writes
    // into memtables
    {
      mutex_.Unlock();
      // A single batch is written as-is; several batches are concatenated
      // into the reusable tmp_batch_ so they are logged and applied together.
      WriteBatch* updates = nullptr;
      if (write_batch_group.size() == 1) {
        updates = write_batch_group[0];
      } else {
        updates = &tmp_batch_;
        for (size_t i = 0; i < write_batch_group.size(); ++i) {
          WriteBatchInternal::Append(updates, write_batch_group[i]);
        }
      }

      const SequenceNumber current_sequence = last_sequence + 1;
      WriteBatchInternal::SetSequence(updates, current_sequence);
      int my_batch_count = WriteBatchInternal::Count(updates);
      last_sequence += my_batch_count;
      const uint64_t batch_size = WriteBatchInternal::ByteSize(updates);
      // Record statistics
      RecordTick(stats_, NUMBER_KEYS_WRITTEN, my_batch_count);
      RecordTick(stats_, BYTES_WRITTEN, batch_size);
      if (write_options.disableWAL) {
        flush_on_destroy_ = true;
      }
      PERF_TIMER_STOP(write_pre_and_post_process_time);

      uint64_t log_size = 0;
      if (!write_options.disableWAL) {
        PERF_TIMER_GUARD(write_wal_time);
        Slice log_entry = WriteBatchInternal::Contents(updates);
        status = log_->AddRecord(log_entry);
        total_log_size_ += log_entry.size();
        alive_log_files_.back().AddSize(log_entry.size());
        log_empty_ = false;
        log_size = log_entry.size();
        RecordTick(stats_, WAL_FILE_BYTES, log_size);
        if (status.ok() && write_options.sync) {
          RecordTick(stats_, WAL_FILE_SYNCED);
          StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
          if (db_options_.use_fsync) {
            status = log_->file()->Fsync();
          } else {
            status = log_->file()->Sync();
          }
          if (status.ok() && !log_dir_synced_) {
            // We only sync WAL directory the first time WAL syncing is
            // requested, so that in case users never turn on WAL sync,
            // we can avoid the disk I/O in the write code path.
            status = directories_.GetWalDir()->Fsync();
          }
          log_dir_synced_ = true;
        }
      }
      if (status.ok()) {
        PERF_TIMER_GUARD(write_memtable_time);

        status = WriteBatchInternal::InsertInto(
            updates, column_family_memtables_.get(),
            write_options.ignore_missing_column_families, 0, this, false);
        // A non-OK status here indicates iteration failure (either in-memory
        // writebatch corruption (very bad), or the client specified invalid
        // column family).  This will later on trigger bg_error_.
        //
        // Note that existing logic was not sound. Any partial failure writing
        // into the memtable would result in a state that some write ops might
        // have succeeded in memtable but Status reports error for all writes.

        SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence);
      }
      PERF_TIMER_START(write_pre_and_post_process_time);
      if (updates == &tmp_batch_) {
        tmp_batch_.Clear();
      }
      mutex_.Lock();
      // internal stats
      default_cf_internal_stats_->AddDBStats(
          InternalStats::BYTES_WRITTEN, batch_size);
      default_cf_internal_stats_->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN,
                                             my_batch_count);
      if (!write_options.disableWAL) {
        // Count a WAL sync only when this write requested one, so this stat
        // is not bumped for unsynced WAL writes (the WAL_FILE_SYNCED ticker
        // above is likewise only recorded for sync writes).
        if (write_options.sync) {
          default_cf_internal_stats_->AddDBStats(
              InternalStats::WAL_FILE_SYNCED, 1);
        }
        default_cf_internal_stats_->AddDBStats(
            InternalStats::WAL_FILE_BYTES, log_size);
      }
      if (status.ok()) {
        versions_->SetLastSequence(last_sequence);
      }
    }
  }
  if (db_options_.paranoid_checks && !status.ok() &&
      !status.IsTimedOut() && bg_error_.ok()) {
    bg_error_ = status; // stop compaction & fail any further writes
  }

  // Leave the writer queue, passing along the last writer of the group and
  // the final status.
  write_thread_.ExitWriteThread(&w, last_writer, status);

  if (context.schedule_bg_work_) {
    MaybeScheduleFlushOrCompaction();
  }
  mutex_.Unlock();

  if (status.IsTimedOut()) {
    RecordTick(stats_, WRITE_TIMEDOUT);
  }

  return status;
}

RocksDB寫入資料過程DBImpl::Write()原始碼分析

Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { if (my_batch == nullptr) { return Status::Corru

Android應用程式內部啟動Activity過程 startActivity 的原始碼分析

上文介紹了Android應用程式的啟動過程，即應用程式預設Activity的啟動過程，一般來說，這種預設Activity是在新的程序和任務中啟動的；本文將繼續分析在應用程式內部啟動非預設Activity的過程的原始碼，這種非預設Activity一般是在原來的程序

Java資料結構----棧(Stack)原始碼分析和個人簡單實現

一、Stack原始碼分析 1.繼承結構　棧是資料結構中一種很重要的資料結構型別，因為棧的後進先出功能是實際的開發中有很多的應用場景。Java API中提供了棧（Stack)的實現。 Stack類繼承了Vector類，而Vector類繼承了AbstractList抽象

基礎資料結構之LinkedHashMap原始碼分析

1.LinkedHashMap資料結構是雙向迴圈連結串列 LinkedHashMap節點中得知相關內容，Node節點中存在before，after，雙向連結串列特性原始碼如下 /** * LinkedHashMap entry. */ private static

Golang package輕量級KV資料快取——go-cache原始碼分析

作者：Moon-Light-Dream 出處：https://www.cnblogs.com/Moon-Light-Dream/ 轉載：歡迎轉載，但未經作者同意，必須保留此段宣告；必須在文章中給出原文連線；否則必究法律責任 ## 什麼是go-cache KV儲存引擎有很多，常用的如redis，rocks

資料結構——ArrayList的原始碼分析(你所有的疑問，都會被解答)

一．首先來看一下ArrayList的類圖： 1，實現了RandomAccess介面，可以達到隨機訪問的效果。 2，實現了Serializable介面，可以用來序列化或者反序列化。 3，實現了List介面，是List的實現類之一 4，實現了Collection介面，是Collection家族的

圖解Janusgraph系列-圖資料底層序列化原始碼分析（Data Serialize）

# 圖解Janusgraph系列-圖資料底層序列化原始碼分析（Data Serialize）大家好，我是`洋仔`，JanusGraph圖解系列文章，`實時更新`~ #### 圖資料庫文章總目錄： * **整理所有圖相關文章，請移步(超鏈)：**[圖資料庫系列-文章總目錄 ](https://li

Netty原始碼分析第5章(ByteBuf)---->第10節: SocketChannel讀取資料過程

Netty原始碼分析第五章: ByteBuf 第十節: SocketChannel讀取資料過程我們第三章分析過客戶端接入的流程, 這一小節帶大家剖析客戶端傳送資料, Server讀取資料的流程: 首先溫馨提示, 這一小節高度耦合第三章的第1, 2節的內容, 很

【Netty原始碼分析】資料讀取過程

首先客戶端連線到服務端時服務端會開啟一個執行緒，不斷的監聽客戶端的操作。這個執行緒的執行操作在NioEventLoop的run方法中，其實操作是在processSelectedKeys中，監聽是否進行讀操作protected void run() { for

Android 資料Parcel序列化過程原始碼分析

在Android系統中，所有的服務都必須註冊到ServiceManager中，當客戶程序需要請求某一服務時，首先從服務管家ServiceManager中查找出該服務，然後通過RPC遠端呼叫的方式使用該服務。服務在註冊到ServiceManager時，需要將該服務物件傳送到Ser

Parquet 寫資料過程及原始碼分析

Parquet寫資料過程及原始碼分析本文主要從parquet寫資料的角度進行分析，主要涉及parquet從拿到資料模型到最終將一條記錄經過計算、編碼、壓縮等過程寫入記憶體的過程（暫時沒有包括寫入檔案的過程，後續補充）。主要從以下幾個方面進行介紹：

Cassandra原始碼分析：資料寫入流程

org.apache.cassandra.thrift.CassandraServer類的add方法將接受客戶端的請求，該函式定義如下： public void add(ByteBuffer key, ColumnParent column_parent, Counter

【Netty原始碼分析】傳送資料過程

future.channel().writeAndFlush("Hello Netty Server ,I am a common client"); 呼叫AbstractChannel的writeAndFlush函式@Override public ChannelFutu

Netty原始碼分析（八）----- write過程原始碼分析

上一篇文章主要講了netty的read過程，本文主要分析一下write和writeAndFlush。主要內容本文分以下幾個部分闡述一個java物件最後是如何轉變成位元組流，寫到socket緩衝區中去的 pipeline中的標準連結串列結構 java物件編碼過程 write：寫佇列 flus

Flume NG原始碼分析（三）使用Event介面表示資料流

Flume NG有4個主要的元件： Event表示在Flume各個Agent之間傳遞的資料流 Source表示從外部源接收Event資料流，然後傳遞給Channel Channel表示對從Source傳遞的Event資料流的臨時儲存 Sink表示從Channel中接收儲存的Event

兄弟連區塊鏈教程Fabric1.0原始碼分析ledgerID資料

1、idStore概述 Fabric支援建立多個Ledger，不同Ledger以ledgerID區分。多個ledgerID及其創世區塊儲存在idStore資料庫中，idStore資料庫基於leveldb實現。 idStore預設使用路徑：/var/hyperledger/production

VS2013 c++連結資料庫，應用儲存過程，向資料庫中寫入資料

// ConsoleApplication1.cpp : 定義控制檯應用程式的入口點。 // #include "stdafx.h" #include "iomanip" using namespace std; #import "c:\Program Files\Common Files\S

Netty NioEventLoop 啟動過程原始碼分析

原文連結：https://wangwei.one/posts/netty-nioeventloop-analyse-for-startup.html 前面，我們分析了NioEventLoop的建立過程，接下來我們開始分析NioEventLoop的啟動和執行邏輯。

Netty NioEventLoop 建立過程原始碼分析

原文：https://wangwei.one/posts/netty-nioeventloop-analyse-for-create.html 前面，我們分析了Netty中的Channel元件，本篇我們來介紹一下與Channel關聯的另一個核心的元件 —— EventLo

zigbee 之ZStack-2.5.1a原始碼分析（三）無線資料傳送和接收

前面說過SampleApp_Init和SampleApp_ProcessEvent是我們重點關注的函式，接下來分析無線傳送和接收相關的程式碼：在SampleApp_ProcessEvent函式中： if ( events & SYS_EVENT_MSG ) { &nbs

RocksDB寫入資料過程DBImpl::Write()原始碼分析

相關推薦