For Ceph's new storage engine BlueStore, RocksDB matters a great deal: it stores BlueStore's metadata. Understanding it helps in understanding BlueStore's implementation and in analyzing problems encountered later.
BlueStore Architecture
BlueStore's architecture diagram is shown below (still the most widely circulated one):
As the figure shows, among BlueStore's key components it is RocksDB that handles BlueStore's metadata. Setting the other components aside, this article describes in detail what RocksDB stores here and how it is implemented.
BlueStore Structure Definition
The definition and main data members of BlueStore in Ceph are as follows:
class BlueStore : public ObjectStore, public md_config_obs_t {
...
private:
BlueFS *bluefs = nullptr;
unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing
bool bluefs_single_shared_device = true;
utime_t bluefs_last_balance;
KeyValueDB *db = nullptr;
BlockDevice *bdev = nullptr;
std::string freelist_type;
FreelistManager *fm = nullptr;
Allocator *alloc = nullptr;
uuid_d fsid;
int path_fd = -1; ///< open handle to $path
int fsid_fd = -1; ///< open handle (locked) to $path/fsid
bool mounted = false;
vector<Cache*> cache_shards;
std::mutex osr_lock; ///< protect osd_set
std::set<OpSequencerRef> osr_set; ///< set of all OpSequencers
...
};
The key data members are:
1) BlueFS
Definition: BlueFS *bluefs = nullptr;
A filesystem tailored to RocksDB; it implements only the API surface that rocksdb::Env requires.
It is initialized in _open_db():
int BlueStore::_open_db(bool create)
{
rocksdb::Env *env = NULL;
if (do_bluefs) {
bluefs = new BlueFS(cct);
}
}
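The env pointer is what ties RocksDB to BlueFS: once bluefs is mounted, _open_db wraps it in a BlueRocksEnv (the rocksdb::Env implementation backed by BlueFS) and later hands that env to KeyValueDB::create(). Abridged from the same function:
// abridged: after bluefs->mount() succeeds, RocksDB is pointed at BlueFS
env = new BlueRocksEnv(bluefs);  // rocksdb::Env implemented on top of BlueFS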
2) RocksDB
Definition: KeyValueDB *db = nullptr;
BlueStore's metadata and omap data are both stored through this DB; RocksDB is used here, and it too is initialized in _open_db():
int BlueStore::_open_db(bool create)
{
// determine which kv backend to use
string kv_backend;
if (create) {
kv_backend = cct->_conf->bluestore_kvbackend;
} else {
r = read_meta("kv_backend", &kv_backend);
}
// mkfs also goes through this path; on create, decide from config whether to use bluefs
if (create) {
do_bluefs = cct->_conf->bluestore_bluefs;
} else {
string s;
r = read_meta("bluefs", &s);
}
rocksdb::Env *env = NULL;
// create bluefs
if (do_bluefs) {
bluefs = new BlueFS(cct);
bfn = path + "/block.db";
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
if (bluefs->bdev_support_label(BlueFS::BDEV_DB)) {
r = _check_or_set_bdev_label(
bfn,
bluefs->get_block_device_size(BlueFS::BDEV_DB),
"bluefs db", create);
}
if (create) {
bluefs->add_block_extent(
BlueFS::BDEV_DB,
SUPER_RESERVED,
bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
}
bluefs_shared_bdev = BlueFS::BDEV_SLOW;
bluefs_single_shared_device = false;
} else {
if (::lstat(bfn.c_str(), &st) == -1) {
bluefs_shared_bdev = BlueFS::BDEV_DB;
}
}
// shared device
bfn = path + "/block";
r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
bfn = path + "/block.wal";
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
if (bluefs->bdev_support_label(BlueFS::BDEV_WAL)) {
r = _check_or_set_bdev_label(
bfn,
bluefs->get_block_device_size(BlueFS::BDEV_WAL),
"bluefs wal", create);
}
if (create) {
bluefs->add_block_extent(
BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
BDEV_LABEL_BLOCK_SIZE);
}
cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
bluefs_single_shared_device = false;
}
}
// create the RocksDB instance
db = KeyValueDB::create(cct,
kv_backend,
fn,
static_cast<void*>(env));
FreelistManager::setup_merge_operators(db);
db->set_merge_operator(PREFIX_STAT, merge_op);
db->set_cache_size(cache_size * cache_kv_ratio);
if (kv_backend == "rocksdb")
options = cct->_conf->bluestore_rocksdb_options;
db->init(options);
if (create)
r = db->create_and_open(err);
else
r = db->open(err);
}
3) BlockDevice
Definition: BlockDevice *bdev = nullptr;
The block device underneath BlueStore's data / db / wal; the following kinds exist:
- KernelDevice
- NVMEDevice
- PMEMDevice
It is initialized as follows:
int BlueStore::_open_bdev(bool create)
{
string p = path + "/block";
bdev = BlockDevice::create(cct, p, aio_cb, static_cast<void*>(this));
int r = bdev->open(p);
if (bdev->supported_bdev_label()) {
r = _check_or_set_bdev_label(p, bdev->get_size(), "main", create);
}
// initialize global block parameters
block_size = bdev->get_block_size();
block_mask = ~(block_size - 1);
block_size_order = ctz(block_size);
r = _set_cache_sizes();
return 0;
}
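As a worked illustration of the block parameters above (not code from BlueStore), with the usual 4 KiB block size:
// block_size       = 4096
// block_size_order = ctz(4096) = 12        (4096 == 1 << 12)
// block_mask       = ~(4096 - 1) = ~0xfff  (clears the low 12 bits)
// so (offset & block_mask) rounds offset down to a block boundary,
// and (offset & ~block_mask) is the offset within the block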
4) FreelistManager
Definition: FreelistManager *fm = nullptr;
Tracks BlueStore's free space, i.e. which extents of the block device are not allocated.
The default implementation is BitmapFreelistManager:
int BlueStore::_open_fm(bool create){
fm = FreelistManager::create(cct, freelist_type, db, PREFIX_ALLOC);
int r = fm->init(bdev->get_size());
}
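BitmapFreelistManager keeps a bitmap of the device under PREFIX_ALLOC and flips bits through an XOR merge operator, so allocating and freeing a range are the same kv operation (flipping twice restores the previous state). A conceptual sketch of that merge, hand-written here for illustration rather than copied from BitmapFreelistManager.cc:
#include <cassert>
#include <string>

struct XorMergeSketch {
  // XOR the stored value (ldata) with the merge operand (rdata)
  void merge(const char *ldata, size_t llen,
             const char *rdata, size_t rlen,
             std::string *new_value) {
    assert(llen == rlen);
    new_value->assign(ldata, llen);
    for (size_t i = 0; i < rlen; ++i)
      (*new_value)[i] ^= rdata[i];
  }
};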
5) Allocator
Definition: Allocator *alloc = nullptr;
BlueStore's allocator, which hands out raw block-device extents for new blobs; the following kinds are supported:
- BitmapAllocator
- StupidAllocator
The default is StupidAllocator.
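StupidAllocator keeps free extents in a small set of bins indexed by roughly log2(length) and serves a request from the smallest bin that can satisfy it. A toy sketch of that binning idea, with made-up names and no coalescing (the real implementation is considerably more careful):
#include <cstdint>
#include <map>

struct StupidAllocSketch {
  static const int NUM_BINS = 10;
  std::map<uint64_t, uint64_t> free_bins[NUM_BINS];  // offset -> length

  int bin_for(uint64_t len) {  // bin b holds lengths in [2^b, 2^(b+1))
    int b = 0;
    while (b + 1 < NUM_BINS && (2ull << b) <= len)
      ++b;
    return b;
  }
  void release(uint64_t off, uint64_t len) {
    free_bins[bin_for(len)][off] = len;  // toy version: no merging of neighbors
  }
  bool allocate(uint64_t want, uint64_t *off) {
    for (int b = bin_for(want); b < NUM_BINS; ++b) {
      for (auto it = free_bins[b].begin(); it != free_bins[b].end(); ++it) {
        if (it->second < want)
          continue;                      // extent too small, keep looking
        *off = it->first;
        uint64_t rem_off = it->first + want;
        uint64_t rem_len = it->second - want;
        free_bins[b].erase(it);
        if (rem_len)
          release(rem_off, rem_len);     // give back the unused tail
        return true;
      }
    }
    return false;
  }
};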
6) Summary: the BlueStore mount sequence
During BlueStore's mount, the functions above are called to initialize each component it uses, in the following order:
int BlueStore::_mount(bool kv_only)
{
int r = read_meta("type", &type);
if (type != "bluestore") {
return -EIO;
}
...
int r = _open_path();
r = _open_fsid(false);
r = _read_fsid(&fsid);
r = _lock_fsid();
r = _open_bdev(false);
r = _open_db(false);
if (kv_only)
return 0;
r = _open_super_meta();
r = _open_fm(false);
r = _open_alloc();
r = _open_collections();
r = _reload_logger();
if (bluefs) {
r = _reconcile_bluefs_freespace();
}
_kv_start();
r = _deferred_replay();
mempool_thread.init();
mounted = true;
return 0;
}
RocksDB Definition
RocksDBStore is defined as follows, implementing the KeyValueDB interface:
/**
* Uses RocksDB to implement the KeyValueDB interface
*/
class RocksDBStore : public KeyValueDB {
...
string path;
void *priv;
rocksdb::DB *db;
rocksdb::Env *env;
std::shared_ptr<rocksdb::Statistics> dbstats;
rocksdb::BlockBasedTableOptions bbt_opts;
string options_str;
uint64_t cache_size = 0;
...
// manage async compactions
Mutex compact_queue_lock;
Cond compact_queue_cond;
list< pair<string,string> > compact_queue;
bool compact_queue_stop;
class CompactThread : public Thread {
RocksDBStore *db;
public:
explicit CompactThread(RocksDBStore *d) : db(d) {}
void *entry() override {
db->compact_thread_entry();
return NULL;
}
friend class RocksDBStore;
} compact_thread;
...
struct RocksWBHandler: public rocksdb::WriteBatch::Handler {
std::string seen ;
int num_seen = 0;
};
class RocksDBTransactionImpl : public KeyValueDB::TransactionImpl {
public:
rocksdb::WriteBatch bat;
RocksDBStore *db;
};
// the concrete DB iterator implementation; fairly important
class RocksDBWholeSpaceIteratorImpl :
public KeyValueDB::WholeSpaceIteratorImpl {
protected:
rocksdb::Iterator *dbiter;
public:
explicit RocksDBWholeSpaceIteratorImpl(rocksdb::Iterator *iter) :
dbiter(iter) { }
//virtual ~RocksDBWholeSpaceIteratorImpl() { }
~RocksDBWholeSpaceIteratorImpl() override;
int seek_to_first() override;
int seek_to_first(const string &prefix) override;
int seek_to_last() override;
int seek_to_last(const string &prefix) override;
int upper_bound(const string &prefix, const string &after) override;
int lower_bound(const string &prefix, const string &to) override;
bool valid() override;
int next() override;
int prev() override;
string key() override;
pair<string,string> raw_key() override;
bool raw_key_is_prefixed(const string &prefix) override;
bufferlist value() override;
bufferptr value_as_ptr() override;
int status() override;
size_t key_size() override;
size_t value_size() override;
};
...
};
The base class KeyValueDB is defined as follows; only a few key parts are listed:
/**
* Defines virtual interface to be implemented by key value store
*
* Kyoto Cabinet or LevelDB should implement this
*/
class KeyValueDB {
public:
class TransactionImpl {
...
};
typedef ceph::shared_ptr< TransactionImpl > Transaction;
class WholeSpaceIteratorImpl {
...
};
typedef ceph::shared_ptr< WholeSpaceIteratorImpl > WholeSpaceIterator;
class IteratorImpl : public GenericIteratorImpl {
const std::string prefix;
WholeSpaceIterator generic_iter;
...
int seek_to_first() override {
return generic_iter->seek_to_first(prefix);
}
int seek_to_last() {
return generic_iter->seek_to_last(prefix);
}
int upper_bound(const std::string &after) override {
return generic_iter->upper_bound(prefix, after);
}
int lower_bound(const std::string &to) override {
return generic_iter->lower_bound(prefix, to);
}
bool valid() override {
if (!generic_iter->valid())
return false;
return generic_iter->raw_key_is_prefixed(prefix);
}
};
typedef ceph::shared_ptr< IteratorImpl > Iterator;
WholeSpaceIterator get_iterator() {
return _get_iterator();
}
Iterator get_iterator(const std::string &prefix) {
return std::make_shared<IteratorImpl>(prefix, get_iterator());
}
};
The usual way the DB is used in the code looks like this:
KeyValueDB::Iterator it;
it = db->get_iterator(PREFIX_OBJ);  // iterator restricted to one key prefix
it->lower_bound(key);               // or it->upper_bound(key): position at the key
while (it->valid()) {               // is the iterator still in range?
  ...
  it->key();                        // current key
  it->value();                      // current value
  it->next();                       // advance to the next entry
}
KV Categories in RocksDB
All of BlueStore's kv data is stored in RocksDB. The implementation separates the data into categories by key prefix, as follows:
// kv store prefixes
const string PREFIX_SUPER = "S";        // field -> value
const string PREFIX_STAT = "T";         // field -> value(int64 array)
const string PREFIX_COLL = "C";         // collection name -> cnode_t
const string PREFIX_OBJ = "O";          // object name -> onode_t
const string PREFIX_OMAP = "M";         // u64 + keyname -> value
const string PREFIX_DEFERRED = "L";     // id -> deferred_transaction_t
const string PREFIX_ALLOC = "B";        // u64 offset -> u64 length (freelist)
const string PREFIX_SHARED_BLOB = "X";  // u64 offset -> shared_blob_t
Each prefix is described in detail below:
1) PREFIX_SUPER
The BlueStore superblock: metadata about BlueStore itself, for example:
S blobid_max
S bluefs_extents
S freelist_type
S min_alloc_size
S min_compat_ondisk_format
S nid_max
S ondisk_format
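Reading one of these fields follows the usual KeyValueDB pattern; a minimal sketch, mirroring what BlueStore::_open_super_meta() does (decode details vary between releases):
bufferlist bl;
int r = db->get(PREFIX_SUPER, "nid_max", &bl);
if (r >= 0) {
  auto p = bl.begin();
  uint64_t nid_max;
  ::decode(nid_max, p);  // high-water mark for onode ids
}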
2) PREFIX_STAT
Holds the bluestore_statfs information:
class BlueStore : public ObjectStore,
public md_config_obs_t {
...
struct volatile_statfs {
enum {
STATFS_ALLOCATED = 0,
STATFS_STORED,
STATFS_COMPRESSED_ORIGINAL,
STATFS_COMPRESSED,
STATFS_COMPRESSED_ALLOCATED,
STATFS_LAST
};
int64_t values[STATFS_LAST];
...
};
Where it gets set:
void BlueStore::_txc_update_store_statfs(TransContext *txc)
{
if (txc->statfs_delta.is_empty())
return;
...
{
std::lock_guard<std::mutex> l(vstatfs_lock);
vstatfs += txc->statfs_delta;
}
bufferlist bl;
txc->statfs_delta.encode(bl);
txc->t->merge(PREFIX_STAT, "bluestore_statfs", bl);
txc->statfs_delta.reset();
}
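The merge_op registered for PREFIX_STAT earlier treats the value as an int64 array and adds element-wise, so each transaction's statfs delta folds into the stored totals without a read-modify-write. A conceptual sketch of such a merge, hand-written for illustration:
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

// element-wise int64 addition of the stored value (ldata) and the operand (rdata)
void merge_int64_array(const char *ldata, size_t llen,
                       const char *rdata, size_t rlen,
                       std::string *new_value) {
  assert(llen == rlen && llen % sizeof(int64_t) == 0);
  new_value->assign(ldata, llen);
  for (size_t i = 0; i < llen; i += sizeof(int64_t)) {
    int64_t l, r;
    memcpy(&l, ldata + i, sizeof(l));
    memcpy(&r, rdata + i, sizeof(r));
    l += r;
    memcpy(&(*new_value)[i], &l, sizeof(l));
  }
}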
3) PREFIX_COLL
Collection metadata. A Collection corresponds to a logical PG, and each ObjectStore implements its own Collection.
For every PG it stores, BlueStore writes one Collection kv entry into RocksDB:
class BlueStore : public ObjectStore,
public md_config_obs_t {
...
typedef boost::intrusive_ptr<Collection> CollectionRef;
struct Collection : public CollectionImpl {
BlueStore *store;
Cache *cache; ///< our cache shard
coll_t cid;
bluestore_cnode_t cnode;
RWLock lock;
bool exists;
SharedBlobSet shared_blob_set; ///< open SharedBlobs
// cache onodes on a per-collection basis to avoid lock
// contention.
OnodeSpace onode_map;
//pool options
pool_opts_t pool_opts;
...
};
}
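When a collection is created, its cnode is encoded and written under PREFIX_COLL keyed by the collection name. A sketch of that write, abridged and hedged against version differences (cf. BlueStore::_create_collection):
bufferlist bl;
::encode(c->cnode, bl);                        // bluestore_cnode_t -> bytes
txc->t->set(PREFIX_COLL, stringify(cid), bl);  // key "C" + collection name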
4) PREFIX_OBJ
Object metadata. For every object stored in BlueStore, its struct Onode information (plus associated data) is written to RocksDB as the value.
When the object needs to be accessed, RocksDB is queried first, the in-memory Onode structure is built from the result, and the object is then accessed through it:
class BlueStore : public ObjectStore,
public md_config_obs_t {
...
/// an in-memory object
struct Onode {
std::atomic_int nref; ///< reference count
Collection *c;
ghobject_t oid;
/// key under PREFIX_OBJ where we are stored
mempool::bluestore_cache_other::string key;
boost::intrusive::list_member_hook<> lru_item;
bluestore_onode_t onode; ///< metadata stored as value in kv store
bool exists; ///< true if object logically exists
ExtentMap extent_map;
...
};
typedef boost::intrusive_ptr<Onode> OnodeRef;
}
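Loading an object therefore means building the kv key for its ghobject_t, fetching the value under PREFIX_OBJ, and decoding it; a hedged sketch of that path (cf. BlueStore::Collection::get_onode):
string key;
get_object_key(store->cct, oid, &key);  // ghobject_t -> ordered kv key
bufferlist v;
int r = store->db->get(PREFIX_OBJ, key, &v);
if (r >= 0) {
  OnodeRef o = new Onode(this, oid, key);
  auto p = v.begin();
  ::decode(o->onode, p);                // bluestore_onode_t from the value
}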
5) PREFIX_OMAP
The object's omap information. The attr/map data formerly kept alongside the object is saved in RocksDB under the PREFIX_OMAP prefix; the key layout is sketched below.
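A rough sketch of the key layout (hedged: the exact encoding lives in Onode::get_omap_key and may differ between releases):
// prefix "M", key = <u64 onode nid, big-endian> + <separator> + <user omap key>
// so all omap entries of one object sort together and can be walked with
// get_iterator(PREFIX_OMAP) plus lower_bound() on the encoded nid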
6) PREFIX_DEFERRED
Information about BlueStore's deferred transactions; the corresponding data structure is defined as follows:
/// writeahead-logged transaction
struct bluestore_deferred_transaction_t {
uint64_t seq = 0;
list<bluestore_deferred_op_t> ops;
interval_set<uint64_t> released; ///< allocations to release after tx
bluestore_deferred_transaction_t() : seq(0) {}
DENC(bluestore_deferred_transaction_t, v, p) {
DENC_START(1, 1, p);
denc(v.seq, p);
denc(v.ops, p);
denc(v.released, p);
DENC_FINISH(p);
}
void dump(Formatter *f) const;
static void generate_test_instances(list<bluestore_deferred_transaction_t*>& o);
};
WRITE_CLASS_DENC(bluestore_deferred_transaction_t)
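At mount time these records are replayed; a minimal sketch using the iterator pattern shown earlier (cf. BlueStore::_deferred_replay):
auto it = db->get_iterator(PREFIX_DEFERRED);
for (it->lower_bound(string()); it->valid(); it->next()) {
  bluestore_deferred_transaction_t dtxn;
  bufferlist bl = it->value();
  auto p = bl.begin();
  ::decode(dtxn, p);  // then re-queue dtxn.ops against the block device
}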
7) PREFIX_ALLOC
FreelistManager-related keys; the default BitmapFreelistManager stores its metadata here:
B blocks
B blocks_per_key
B bytes_per_block
B size
8) PREFIX_SHARED_BLOB
Metadata for shared blobs. A blob can be fairly large, so multiple extents from the extent maps above may map into the same blob; once a blob is referenced from more than one place it has to be tracked as shared. The record layout is sketched below.
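A rough sketch of the record layout (hedged; see bluestore_shared_blob_t for the real definition):
// prefix "X", key = <u64 sbid, big-endian>
// value = bluestore_shared_blob_t, which carries a ref-count map of the
//         blob's ranges so clones can tell which parts are still referenced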
RocksDB tool
Ceph ships a command for inspecting the data in a kvstore: ceph-kvstore-tool. Its help output:
root@ceph6:~# ceph-kvstore-tool -h
Usage: ceph-kvstore-tool <leveldb|rocksdb|bluestore-kv> <store path> command [args...]
Commands:
  list [prefix]
  list-crc [prefix]
  exists <prefix> [key]
  get <prefix> <key> [out <file>]
  crc <prefix> <key>
  get-size [<prefix> <key>]
  set <prefix> <key> [ver <N>|in <file>]
  rm <prefix> <key>
  rm-prefix <prefix>
  store-copy <path> [num-keys-per-tx] [leveldb|rocksdb|...]
  store-crc <path>
  compact
  compact-prefix <prefix>
  compact-range <prefix> <start> <end>
  repair
Usage example:
root@ceph6:~# systemctl stop ceph-osd@20.service
root@ceph6:~# ceph-kvstore-tool bluestore-kv /var/lib/ceph/osd/ceph-20/ list B > list-B
2018-09-21 11:43:42.679 7f4ec14deb80 1 bluestore(/var/lib/ceph/osd/ceph-20/) _mount path /var/lib/ceph/osd/ceph-20/
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev create path /var/lib/ceph/osd/ceph-20//block type kernel
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev(0x55ddf4e58000 /var/lib/ceph/osd/ceph-20//block) open path /var/lib/ceph/osd/ceph-20//block
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev(0x55ddf4e58000 /var/lib/ceph/osd/ceph-20//block) open size 4000783007744 (0x3a381400000, 3.6 TiB) block_size 4096 (4 KiB) rotational
2018-09-21 11:43:42.679 7f4ec14deb80 1 bluestore(/var/lib/ceph/osd/ceph-20/) _set_cache_sizes cache_size 1073741824 meta 0.5 kv 0.5 data 0
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev create path /var/lib/ceph/osd/ceph-20//block.db type kernel
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev(0x55ddf4e58380 /var/lib/ceph/osd/ceph-20//block.db) open path /var/lib/ceph/osd/ceph-20//block.db
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev(0x55ddf4e58380 /var/lib/ceph/osd/ceph-20//block.db) open size 3221225472 (0xc0000000, 3 GiB) block_size 4096 (4 KiB) non-rotational
2018-09-21 11:43:42.679 7f4ec14deb80 1 bluefs add_block_device bdev 1 path /var/lib/ceph/osd/ceph-20//block.db size 3 GiB
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev create path /var/lib/ceph/osd/ceph-20//block type kernel
2018-09-21 11:43:42.679 7f4ec14deb80 1 bdev(0x55ddf4e58700 /var/lib/ceph/osd/ceph-20//block) open path /var/lib/ceph/osd/ceph-20//block
2018-09-21 11:43:42.683 7f4ec14deb80 1 bdev(0x55ddf4e58700 /var/lib/ceph/osd/ceph-20//block) open size 4000783007744 (0x3a381400000, 3.6 TiB) block_size 4096 (4 KiB) rotational
2018-09-21 11:43:42.683 7f4ec14deb80 1 bluefs add_block_device bdev 2 path /var/lib/ceph/osd/ceph-20//block size 3.6 TiB
2018-09-21 11:43:42.683 7f4ec14deb80 1 bdev create path /var/lib/ceph/osd/ceph-20//block.wal type kernel
2018-09-21 11:43:42.683 7f4ec14deb80 1 bdev(0x55ddf4e58a80 /var/lib/ceph/osd/ceph-20//block.wal) open path /var/lib/ceph/osd/ceph-20//block.wal
2018-09-21 11:43:42.683 7f4ec14deb80 1 bdev(0x55ddf4e58a80 /var/lib/ceph/osd/ceph-20//block.wal) open size 3221225472 (0xc0000000, 3 GiB) block_size 4096 (4 KiB) non-rotational
2018-09-21 11:43:42.683 7f4ec14deb80 1 bluefs add_block_device bdev 0 path /var/lib/ceph/osd/ceph-20//block.wal size 3 GiB
2018-09-21 11:43:42.683 7f4ec14deb80 1 bluefs mount
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option compaction_readahead_size = 2097152
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option compression = kNoCompression
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option max_write_buffer_number = 4
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option min_write_buffer_number_to_merge = 1
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option recycle_log_file_num = 4
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option writable_file_max_buffer_size = 0
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option write_buffer_size = 268435456
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option compaction_readahead_size = 2097152
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option compression = kNoCompression
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option max_write_buffer_number = 4
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option min_write_buffer_number_to_merge = 1
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option recycle_log_file_num = 4
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option writable_file_max_buffer_size = 0
2018-09-21 11:43:42.691 7f4ec14deb80 0 set rocksdb option write_buffer_size = 268435456
2018-09-21 11:43:42.691 7f4ec14deb80 1 rocksdb: do_open column families: [default]
2018-09-21 11:43:42.699 7f4ec14deb80 1 bluestore(/var/lib/ceph/osd/ceph-20/) _open_db opened rocksdb path db options compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456,writable_file_max_buffer_size=0,compaction_readahead_size=2097152
2018-09-21 11:43:42.703 7f4ec14deb80 1 bluestore(/var/lib/ceph/osd/ceph-20/) umount
2018-09-21 11:43:42.703 7f4ec14deb80 1 bluefs umount
2018-09-21 11:43:42.703 7f4ec14deb80 1 stupidalloc 0x0x55ddf4a92a70 shutdown
2018-09-21 11:43:42.703 7f4ec14deb80 1 stupidalloc 0x0x55ddf4a92ae0 shutdown
2018-09-21 11:43:42.703 7f4ec14deb80 1 stupidalloc 0x0x55ddf4a92b50 shutdown
2018-09-21 11:43:42.703 7f4ec14deb80 1 bdev(0x55ddf4e58a80 /var/lib/ceph/osd/ceph-20//block.wal) close
2018-09-21 11:43:42.991 7f4ec14deb80 1 bdev(0x55ddf4e58380 /var/lib/ceph/osd/ceph-20//block.db) close
2018-09-21 11:43:43.227 7f4ec14deb80 1 bdev(0x55ddf4e58700 /var/lib/ceph/osd/ceph-20//block) close
2018-09-21 11:43:43.463 7f4ec14deb80 1 bdev(0x55ddf4e58000 /var/lib/ceph/osd/ceph-20//block) close
root@ceph6:~# systemctl start ceph-osd@20.service