内容简介:对于Ceph全新的存储引擎BlueStore来说,RocksDB的意义很大,它存储了BlueStore相关的元数据信息,对它的理解有助于更好的理解BlueStore的实现,分析之后遇到的问题;BlueStore的架构图如下,还是被广泛使用的一张:
对于Ceph全新的存储引擎BlueStore来说,RocksDB的意义很大,它存储了BlueStore相关的元数据信息,对它的理解有助于更好的理解BlueStore的实现,分析之后遇到的问题;
BlueStore架构
BlueStore的架构图如下,还是被广泛使用的一张:
如上图所示,BlueStore的几个关键组件中,RocksDB对接了BlueStore的metadata信息,本文抛开别的组件,详细描述RocksDB在这里存储的信息以及其实现;
BlueStore结构体定义
Ceph里BlueStore的定义和主要数据成员如下:
// Excerpt of the BlueStore class showing the data members through which it
// reaches its main components. Parts elided from the original class are
// marked with "...".
class BlueStore : public ObjectStore, public md_config_obs_t {
...
private:
BlueFS *bluefs = nullptr; ///< allocated in _open_db() when do_bluefs is set
unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing
bool bluefs_single_shared_device = true; ///< cleared in _open_db() when block.db or block.wal exists
utime_t bluefs_last_balance;
KeyValueDB *db = nullptr; ///< key-value store, created via KeyValueDB::create() in _open_db()
BlockDevice *bdev = nullptr; ///< device opened from $path/block in _open_bdev()
std::string freelist_type; ///< type name handed to FreelistManager::create()
FreelistManager *fm = nullptr; ///< created in _open_fm()
Allocator *alloc = nullptr;
uuid_d fsid;
int path_fd = -1; ///< open handle to $path
int fsid_fd = -1; ///< open handle (locked) to $path/fsid
bool mounted = false; ///< set to true at the end of _mount()
vector<Cache*> cache_shards;
std::mutex osr_lock; ///< protect osr_set (the original comment said "osd_set"; presumably a typo)
std::set<OpSequencerRef> osr_set; ///< set of all OpSequencers
...
};
几个关键的数据成员如下:
1) BlueFS
定义: BlueFS *bluefs = nullptr;
支持RocksDB的定制FS,只实现了RocksEnv需要的API接口;
代码里在_open_db()里对其初始化:
// Heavily abridged view of _open_db(): only the point where the BlueFS
// instance is allocated is shown. The rest of the function (including its
// return statement) is elided in this excerpt.
int BlueStore::_open_db(bool create)
{
rocksdb::Env *env = NULL;
// BlueFS is only instantiated when do_bluefs is set.
if (do_bluefs) {
bluefs = new BlueFS(cct);
}
}
2) RocksDB
定义: KeyValueDB *db = nullptr;
BlueStore的元数据和OMap都通过DB存储,这里使用的是RocksDB,它的初始化也是在_open_db()函数中:
// Abridged _open_db(): selects the kv backend, optionally sets up BlueFS and
// registers its block devices (block.db, the shared "block" device and
// block.wal), then creates and opens the key-value DB.
//
// create: true on the mkfs path (settings come from the config and the DB is
//         created), false when opening an existing store (settings are read
//         back with read_meta()).
//
// Several locals (r, bfn, st, fn, options, err, ...) are used without a
// visible declaration, and the values stored in r are never checked; those
// parts were elided from this excerpt.
int BlueStore::_open_db(bool create)
{
// determine which kv backend to use
string kv_backend;
if (create) {
kv_backend = cct->_conf->bluestore_kvbackend;
} else {
r = read_meta("kv_backend", &kv_backend);
}
// mkfs also goes through here; on create, whether bluefs is used comes from the config
if (create) {
do_bluefs = cct->_conf->bluestore_bluefs;
} else {
string s;
r = read_meta("bluefs", &s);
}
rocksdb::Env *env = NULL;
// create bluefs
if (do_bluefs) {
bluefs = new BlueFS(cct);
bfn = path + "/block.db";
if (::stat(bfn.c_str(), &st) == 0) {
// A dedicated block.db exists: register it as the DB device.
r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
r = _check_or_set_bdev_label(
bfn,
bluefs->get_block_device_size(BlueFS::BDEV_DB),
"bluefs db", create);
}
if (create) {
// Give bluefs the whole device except the first SUPER_RESERVED bytes.
bluefs->add_block_extent(
BlueFS::BDEV_DB,
SUPER_RESERVED,
bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
}
// With a dedicated DB device, the shared device takes the SLOW slot.
bluefs_shared_bdev = BlueFS::BDEV_SLOW;
bluefs_single_shared_device = false;
} else {
// stat() failed; if lstat() fails as well, the shared device is used
// in the DB slot instead.
if (::lstat(bfn.c_str(), &st) == -1) {
bluefs_shared_bdev = BlueFS::BDEV_DB;
}
}
// shared device
bfn = path + "/block";
r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
bfn = path + "/block.wal";
if (::stat(bfn.c_str(), &st) == 0) {
// A dedicated block.wal exists: register it as the WAL device.
r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
r = _check_or_set_bdev_label(
bfn,
bluefs->get_block_device_size(BlueFS::BDEV_WAL),
"bluefs wal", create);
}
if (create) {
// Give bluefs the whole device except the first BDEV_LABEL_BLOCK_SIZE bytes.
bluefs->add_block_extent(
BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
BDEV_LABEL_BLOCK_SIZE);
}
cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
bluefs_single_shared_device = false;
}
}
// create the key-value DB (RocksDB when kv_backend is "rocksdb")
// NOTE(review): env is still NULL here in this excerpt; the code that
// assigns it is presumably among the elided lines.
db = KeyValueDB::create(cct,
kv_backend,
fn,
static_cast<void*>(env));
// Merge operators must be registered before the DB is opened below.
FreelistManager::setup_merge_operators(db);
db->set_merge_operator(PREFIX_STAT, merge_op);
db->set_cache_size(cache_size * cache_kv_ratio);
if (kv_backend == "rocksdb")
options = cct->_conf->bluestore_rocksdb_options;
db->init(options);
if (create)
r = db->create_and_open(err);
else
r = db->open(err);
}
3) BlockDevice
定义: BlockDevice *bdev = nullptr;
底层存储BlueStore Data / db / wal的块设备,有如下几种:
- KernelDevice
- NVMEDevice
- PMEMDevice
代码中对其初始化如下:
// Abridged _open_bdev(): creates and opens the main block device at
// $path/block, checks or writes its label when the device supports labels,
// and caches the block geometry in block_size / block_mask /
// block_size_order.
//
// create: forwarded to _check_or_set_bdev_label().
// Returns 0; the values stored in r are not checked in this excerpt.
int BlueStore::_open_bdev(bool create)
{
string p = path + "/block";
bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
int r = bdev->open(p);
if (bdev->supported_bdev_label()) {
r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
}
// initialize global block parameters
block_size = bdev->get_block_size();
// Mask that clears the low bits of an offset; assumes block_size is a
// power of two -- TODO confirm.
block_mask = ~(block_size - 1);
// presumably log2(block_size), given a power-of-two block size
block_size_order = ctz(block_size);
r = _set_cache_sizes();
return 0;
}
4) FreelistManager
定义: FreelistManager *fm = nullptr;
管理BlueStore块设备上的空闲空间(即尚未分配出去的磁盘区间);
默认使用的是:BitmapFreelistManager
// Abridged _open_fm(): creates the FreelistManager of type freelist_type on
// top of the key-value DB under PREFIX_ALLOC and initializes it with the
// size of the main block device. The handling of `create`, the check of r
// and the return statement are elided in this excerpt.
int BlueStore::_open_fm(bool create){
fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);
int r = fm->init(bdev->get_size());
}
5) Allocator
定义: Allocator *alloc = nullptr;
BlueStore的磁盘空间分配器(负责为blob等数据分配块设备上的空间),支持如下几种:
- BitmapAllocator
- StupidAllocator
默认使用的是 StupidAllocator;
6) 总结:BlueStore的mount过程
在BlueStore的 mount过程中,会调用各个函数来初始化其使用的各个组件,顺序如下:
// Abridged _mount(): verifies that the store type is "bluestore", then opens
// each component in order (path, fsid, block device, DB, super metadata,
// freelist manager, allocator, collections), starts the kv machinery,
// replays deferred writes and marks the store mounted.
//
// kv_only: when true, returns 0 right after the DB has been opened.
// Returns -EIO if the recorded type is not "bluestore", otherwise 0; the
// values stored in r are not checked in this excerpt.
int BlueStore::_mount(bool kv_only)
{
int r = read_meta("type", &type);
if (type != "bluestore") {
return -EIO;
}
...
// NOTE(review): r is declared a second time here; presumably the first
// declaration lives in a nested scope that was elided from this excerpt.
int r = _open_path();
r = _open_fsid(false);
r = _read_fsid(&fsid);
r = _lock_fsid();
r = _open_bdev(false);
r = _open_db(false);
// Callers that only need the key-value DB stop here.
if (kv_only)
return 0;
r = _open_super_meta();
r = _open_fm(false);
r = _open_alloc();
r = _open_collections();
r = _reload_logger();
// Only relevant when BlueFS was instantiated in _open_db().
if (bluefs) {
r = _reconcile_bluefs_freespace();
}
_kv_start();
r = _deferred_replay();
mempool_thread.init();
mounted = true;
return 0;
}
RocksDB的定义
RocksDB的定义如下,基于KeyValueDB实现接口:
/**
 * Uses RocksDB to implement the KeyValueDB interface
 *
 * Excerpt only: elided parts of the class are marked with "...".
 */
class RocksDBStore : public KeyValueDB {
...
string path;
void *priv;
rocksdb::DB *db;   // underlying RocksDB handle
rocksdb::Env *env;
std::shared_ptr<rocksdb::Statistics> dbstats;
rocksdb::BlockBasedTableOptions bbt_opts;
string options_str;
uint64_t cache_size = 0;
...
// manage async compactions
Mutex compact_queue_lock;
Cond compact_queue_cond;
list< pair<string,string> > compact_queue;
bool compact_queue_stop;
// Thread whose entry point simply runs the owning store's
// compact_thread_entry().
class CompactThread : public Thread {
RocksDBStore *db;
public:
explicit CompactThread(RocksDBStore *d) : db(d) {}
void *entry() override {
db->compact_thread_entry();
return NULL;
}
friend class RocksDBStore;
} compact_thread;
...
// WriteBatch handler; only its two data members are shown in this excerpt.
struct RocksWBHandler: public rocksdb::WriteBatch::Handler {
std::string seen ;
int num_seen = 0;
};
// Transaction implementation backed by a rocksdb::WriteBatch.
class RocksDBTransactionImpl : public KeyValueDB::TransactionImpl {
public:
rocksdb::WriteBatch bat;
RocksDBStore *db;
};
// Concrete implementation of the DB iterator; this one is important.
// It wraps a rocksdb::Iterator and overrides the
// KeyValueDB::WholeSpaceIteratorImpl operations.
class RocksDBWholeSpaceIteratorImpl :
public KeyValueDB::WholeSpaceIteratorImpl {
protected:
rocksdb::Iterator *dbiter;
public:
explicit RocksDBWholeSpaceIteratorImpl(rocksdb::Iterator *iter) :
dbiter(iter) { }
//virtual ~RocksDBWholeSpaceIteratorImpl() { }
~RocksDBWholeSpaceIteratorImpl() override;
// positioning
int seek_to_first() override;
int seek_to_first(const string &prefix) override;
int seek_to_last() override;
int seek_to_last(const string &prefix) override;
int upper_bound(const string &prefix, const string &after) override;
int lower_bound(const string &prefix, const string &to) override;
// traversal
bool valid() override;
int next() override;
int prev() override;
// access to the current entry
string key() override;
pair<string,string> raw_key() override;
bool raw_key_is_prefixed(const string &prefix) override;
bufferlist value() override;
bufferptr value_as_ptr() override;
int status() override;
size_t key_size() override;
size_t value_size() override;
};
...
};
基类 KeyValueDB 的定义如下,只罗列了几个关键的基类定义:
/**
 * Defines virtual interface to be implemented by key value store
 *
 * Kyoto Cabinet or LevelDB should implement this
 *
 * Excerpt only: elided parts of the class are marked with "...".
 */
class KeyValueDB {
public:
class TransactionImpl {
...
};
typedef ceph::shared_ptr< TransactionImpl > Transaction;
class WholeSpaceIteratorImpl {
...
};
typedef ceph::shared_ptr< WholeSpaceIteratorImpl > WholeSpaceIterator;
// Prefix-scoped iterator: forwards every operation to a whole-space iterator
// while passing along the fixed prefix it was constructed with.
class IteratorImpl : public GenericIteratorImpl {
const std::string prefix;
WholeSpaceIterator generic_iter;
...
int seek_to_first() override {
return generic_iter->seek_to_first(prefix);
}
// NOTE(review): unlike its siblings this one is not marked `override` --
// confirm against the base class.
int seek_to_last() {
return generic_iter->seek_to_last(prefix);
}
int upper_bound(const std::string &after) override {
return generic_iter->upper_bound(prefix, after);
}
int lower_bound(const std::string &to) override {
return generic_iter->lower_bound(prefix, to);
}
// Valid only while the underlying iterator is valid AND its current raw
// key still carries this iterator's prefix.
bool valid() override {
if (!generic_iter->valid())
return false;
return generic_iter->raw_key_is_prefixed(prefix);
}
};
typedef ceph::shared_ptr< IteratorImpl > Iterator;
// Iterator over the whole key space; delegates to _get_iterator().
WholeSpaceIterator get_iterator() {
return _get_iterator();
}
// Iterator restricted to keys under `prefix`, built on top of a whole-space
// iterator.
Iterator get_iterator(const std::string &prefix) {
return std::make_shared<IteratorImpl>(prefix, get_iterator());
}
};
在代码中,使用RocksDB的常用方法如下:
// Typical usage pattern (pseudo-code: "/" separates alternative calls).
KeyValueDB::Iterator it;
it = db->get_iterator(PREFIX_OBJ); // choose the key prefix to iterate under
it->lower_bound(key); / it->upper_bound(key); // position the iterator relative to key
while (it->valid()) { // check that the iterator is still valid
...
it->key() / it->value();; // read the key or the value at the current position
it->next(); // advance to the next position
}
RocksDB里KV分类
BlueStore里所有的kv数据都可以存储在RocksDB里,实现中通过数据的前缀分类,如下:
// kv store prefixes
//
// Each constant is the key prefix under which BlueStore keeps one category of
// records in the key-value DB. Keep one declaration per line: when they are
// collapsed onto a single line, the first "//" comments out every declaration
// that follows it.
const std::string PREFIX_SUPER = "S";       // field -> value
const std::string PREFIX_STAT = "T";        // field -> value(int64 array)
const std::string PREFIX_COLL = "C";        // collection name -> cnode_t
const std::string PREFIX_OBJ = "O";         // object name -> onode_t
const std::string PREFIX_OMAP = "M";        // u64 + keyname -> value
const std::string PREFIX_DEFERRED = "L";    // id -> deferred_transaction_t
const std::string PREFIX_ALLOC = "B";       // u64 offset -> u64 length (freelist)
const std::string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
下面针对每一类前缀做详细介绍:
1) PREFIX_SUPER
BlueStore的超级块信息,里面保存了BlueStore自身的元数据信息,比如:
S blobid_max
S bluefs_extents
S freelist_type
S min_alloc_size
S min_compat_ondisk_format
S nid_max
S ondisk_format
2) PREFIX_STAT
bluestore_statfs 信息
// Excerpt of BlueStore showing the volatile_statfs counters.
// NOTE: the excerpt stops after the nested struct; the closing of the
// enclosing class is not shown.
class BlueStore : public ObjectStore,
public md_config_obs_t {
...
struct volatile_statfs {
// Indices into values[]; STATFS_LAST is not a counter itself but the
// number of counters, and is used as the array size below.
enum {
STATFS_ALLOCATED = 0,
STATFS_STORED,
STATFS_COMPRESSED_ORIGINAL,
STATFS_COMPRESSED,
STATFS_COMPRESSED_ALLOCATED,
STATFS_LAST
};
int64_t values[STATFS_LAST]; // one int64 counter per enum entry above
...
};
设置地方:
// Folds a transaction's statfs delta into the store-wide statistics: adds it
// to the in-memory vstatfs under vstatfs_lock, queues a merge of the encoded
// delta under PREFIX_STAT / "bluestore_statfs" in the transaction's kv batch,
// and finally resets the delta. Does nothing when the delta is empty.
void BlueStore::_txc_update_store_statfs(TransContext *txc)
{
if (txc->statfs_delta.is_empty())
return;
...
{
// The lock is held only while the in-memory totals are updated.
std::lock_guard<std::mutex> l(vstatfs_lock);
vstatfs += txc->statfs_delta;
}
bufferlist bl;
txc->statfs_delta.encode(bl);
// merge (not set): only the delta is written; a merge operator for
// PREFIX_STAT is registered in _open_db().
txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
txc->statfs_delta.reset();
}
3) PREFIX_COLL
Collection的元数据信息,Collection对应逻辑上的PG,每个ObjectStore都会实现自己的Collection;
BlueStore存储一个PG,就会存储一个Collection的kv到RocksDB;
// Excerpt of BlueStore showing the in-memory Collection structure; elided
// parts are marked with "...".
class BlueStore : public ObjectStore,
public md_config_obs_t {
...
typedef boost::intrusive_ptr<Collection> CollectionRef;
struct Collection : public CollectionImpl {
BlueStore *store;
Cache *cache; ///< our cache shard
coll_t cid;
bluestore_cnode_t cnode; // presumably the cnode_t value stored under PREFIX_COLL -- confirm
RWLock lock;
bool exists;
SharedBlobSet shared_blob_set; ///< open SharedBlobs
// cache onodes on a per-collection basis to avoid lock
// contention.
OnodeSpace onode_map;
//pool options
pool_opts_t pool_opts;
...
};
}
4) PREFIX_OBJ
Object的元数据信息,对于存在BlueStore里的任何一个Object,都会把它的struct Onode信息(以及其他信息)作为value写入RocksDB;
需要访问该Object时,先查询RocksDB,构造出其内存数据结构Onode,再访问之;
// Excerpt of BlueStore showing the in-memory Onode structure; elided parts
// are marked with "...".
class BlueStore : public ObjectStore,
public md_config_obs_t {
...
/// an in-memory object
struct Onode {
std::atomic_int nref; ///< reference count
Collection *c;
ghobject_t oid;
/// key under PREFIX_OBJ where we are stored
mempool::bluestore_cache_other::string key;
boost::intrusive::list_member_hook<> lru_item;
bluestore_onode_t onode; ///< metadata stored as value in kv store
bool exists; ///< true if object logically exists
ExtentMap extent_map;
...
};
// Intrusive smart pointer to an Onode; presumably counts references via
// nref above -- confirm.
typedef boost::intrusive_ptr<Onode> OnodeRef;
}
5) PREFIX_OMAP
Object的OMap信息,之前存储在Object的attr和Map信息,都通过PREFIX_OMAP前缀保存在RocksDB里;
6) PREFIX_DEFERRED
BlueStore Deferred transaction的信息,对应数据结构定义如下:
/// writeahead-logged transaction
///
/// Holds a sequence number, the list of deferred ops and the set of
/// allocations to release once the transaction is done. The DENC body
/// encodes the three fields in the order seq, ops, released.
struct bluestore_deferred_transaction_t {
uint64_t seq = 0;
list<bluestore_deferred_op_t> ops;
interval_set<uint64_t> released; ///< allocations to release after tx
// seq(0) repeats the in-class initializer above.
bluestore_deferred_transaction_t() : seq(0) {}
DENC(bluestore_deferred_transaction_t, v, p) {
DENC_START(1, 1, p); // presumably (version, compat version) -- confirm
denc(v.seq, p);
denc(v.ops, p);
denc(v.released, p);
DENC_FINISH(p);
}
void dump(Formatter *f) const;
static void generate_test_instances(list<bluestore_deferred_transaction_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_transaction_t)
7) PREFIX_ALLOC
FreelistManager相关,默认使用BitmapFreelistManager;
B blocks
B blocks_per_key
B bytes_per_block
B size
8) PREFIX_SHARED_BLOB
Shared blob的元数据信息;因为blob的size比较大,同一个blob有可能被上层的多个extent map映射(共享);
RocksDB tool
ceph提供了一个命令来获取一个kvstore里的数据:ceph-kvstore-tool,help如下:
root@ceph6:~# ceph-kvstore-tool -h
Usage: ceph-kvstore-tool <leveldb|rocksdb|bluestore-kv> <store path> command [args...]
Commands:
  list [prefix]
  list-crc [prefix]
  exists <prefix> [key]
  get <prefix> <key> [out <file>]
  crc <prefix> <key>
  get-size [<prefix> <key>]
  set <prefix> <key> [ver <N>|in <file>]
  rm <prefix> <key>
  rm-prefix <prefix>
  store-copy <path> [num-keys-per-tx] [leveldb|rocksdb|...]
  store-crc <path>
  compact
  compact-prefix <prefix>
  compact-range <prefix> <start> <end>
  repair
使用示例:
root@ceph6:~# systemctl stop ceph-osd@20.service
root@ceph6:~# ceph-kvstore-tool bluestore-kv /var/lib/ceph/osd/ceph-20/ list B > list-B
2018-09-21 11:43:42.679 7f4ec14deb80 1 bluestore(/var/lib/ceph/osd/ceph-20/) _mount path /var/lib/ceph/osd/ceph-20/
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev create path /var/lib/ceph/osd/ceph-20//block type kernel
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev(0x55ddf4e58000 /var/lib/ceph/osd/ceph-20//block) open path /var/lib/ceph/osd/ceph-20//block
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev(0x55ddf4e58000 /var/lib/ceph/osd/ceph-20//block) open size 4000783007744 (0x3a381400000, 3.6 TiB) block_size 4096 (4 KiB) rotational
2018-09-21 11:43:42.679 7f4ec14deb80 1 bluestore(/var/lib/ceph/osd/ceph-20/) _set_cache_sizes cache_size 1073741824 meta 0.5 kv 0.5 data 0
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev create path /var/lib/ceph/osd/ceph-20//block.db type kernel
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev(0x55ddf4e58380 /var/lib/ceph/osd/ceph-20//block.db) open path /var/lib/ceph/osd/ceph-20//block.db
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev(0x55ddf4e58380 /var/lib/ceph/osd/ceph-20//block.db) open size 3221225472 (0xc0000000, 3 GiB) block_size 4096 (4 KiB) non-rotational
2018-09-21 11:43:42.679 7f4ec14deb80 1 bluefs add_block_device bdev 1 path /var/lib/ceph/osd/ceph-20//block.db size 3 GiB
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev create path /var/lib/ceph/osd/ceph-20//block type kernel
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev(0x55ddf4e58700 /var/lib/ceph/osd/ceph-20//block) open path /var/lib/ceph/osd/ceph-20//block
2018-09-21 11:43:42.683 7f4ec14deb80 1 bdev(0x55ddf4e58700 /var/lib/ceph/osd/ceph-20//block) open size 4000783007744 (0x3a381400000, 3.6 TiB) block_size 4096 (4 KiB) rotational
2018-09-21 11:43:42.683 7f4ec14deb80 1 bluefs add_block_device bdev 2 path /var/lib/ceph/osd/ceph-20//block size 3.6 TiB
2018-09-21 11:43:42.683 7f4ec14deb80 1 bdev create path /var/lib/ceph/osd/ceph-20//block.wal type kernel
2018-09-21 11:43:42.683 7f4ec14deb80 1 bdev(0x55ddf4e58a80 /var/lib/ceph/osd/ceph-20//block.wal) open path /var/lib/ceph/osd/ceph-20//block.wal
2018-09-21 11:43:42.683 7f4ec14deb80 1 bdev(0x55ddf4e58a80 /var/lib/ceph/osd/ceph-20//block.wal) open size 3221225472 (0xc0000000, 3 GiB) block_size 4096 (4 KiB) non-rotational
2018-09-21 11:43:42.683 7f4ec14deb80 1 bluefs add_block_device bdev 0 path /var/lib/ceph/osd/ceph-20//block.wal size 3 GiB
2018-09-21 11:43:42.683 7f4ec14deb80 1 bluefs mount
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option compaction_readahead_size = 2097152
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option compression = kNoCompression
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option max_write_buffer_number = 4
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option min_write_buffer_number_to_merge = 1
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option recycle_log_file_num = 4
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option writable_file_max_buffer_size = 0
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option write_buffer_size = 268435456
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option compaction_readahead_size = 2097152
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option compression = kNoCompression
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option max_write_buffer_number = 4
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option min_write_buffer_number_to_merge = 1
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option recycle_log_file_num = 4
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option writable_file_max_buffer_size = 0
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option write_buffer_size = 268435456
2018-09-21 11:43:42.691 7f4ec14deb80 1 rocksdb: do_open column families: [default]
2018-09-21 11:43:42.699 7f4ec14deb80 1 bluestore(/var/lib/ceph/osd/ceph-20/) _open_db opened rocksdb path db options compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456,writable_file_max_buffer_size=0,compaction_readahead_size=2097152
2018-09-21 11:43:42.703 7f4ec14deb80 1 bluestore(/var/lib/ceph/osd/ceph-20/) umount
2018-09-21 11:43:42.703 7f4ec14deb80 1 bluefs umount
2018-09-21 11:43:42.703 7f4ec14deb80 1 stupidalloc 0x0x55ddf4a92a70 shutdown
2018-09-21 11:43:42.703 7f4ec14deb80 1 stupidalloc 0x0x55ddf4a92ae0 shutdown
2018-09-21 11:43:42.703 7f4ec14deb80 1 stupidalloc 0x0x55ddf4a92b50 shutdown
2018-09-21 11:43:42.703 7f4ec14deb80 1 bdev(0x55ddf4e58a80 /var/lib/ceph/osd/ceph-20//block.wal) close
2018-09-21 11:43:42.991 7f4ec14deb80 1 bdev(0x55ddf4e58380 /var/lib/ceph/osd/ceph-20//block.db) close
2018-09-21 11:43:43.227 7f4ec14deb80 1 bdev(0x55ddf4e58700 /var/lib/ceph/osd/ceph-20//block) close
2018-09-21 11:43:43.463 7f4ec14deb80 1 bdev(0x55ddf4e58000 /var/lib/ceph/osd/ceph-20//block) close
root@ceph6:~# systemctl start ceph-osd@20.service
以上就是本文的全部内容,希望本文的内容对大家的学习或者工作能带来一定的帮助,也希望大家多多支持 码农网
猜你喜欢:本站部分资源来源于网络,本站转载出于传递更多信息之目的,版权归原作者或者来源机构所有,如转载稿涉及版权问题,请联系我们。