Rocksdb,为了加速查询,基于bloom filter 设计了Filter。
FilterBlockBuilder FilterBlockBuilder
是个基类,用于为特定的table
构建filter,调用::Finish
函数后会以字符串形式返回生成filter。它也有三个子类:
BlockBasedFilterBlockBuilder
FullFilterBlockBuilder
PartitionedFilterBlockBuilder
FilterBlockBuilder
的成员函数的调用顺序,要符合正则表达式: (StartBlock Add*)* Finish
。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 class FilterBlockBuilder { public : explicit FilterBlockBuilder () {} virtual ~FilterBlockBuilder () {} FilterBlockBuilder (const FilterBlockBuilder&) = delete ; void operator =(const FilterBlockBuilder&) = delete ; virtual bool IsBlockBased () = 0 ; virtual void StartBlock (uint64_t block_offset) = 0 ; virtual void Add (const Slice& key_without_ts) = 0 ; virtual bool IsEmpty () const = 0 ; virtual size_t EstimateEntriesAdded () = 0 ; Slice Finish () { const BlockHandle empty_handle; Status dont_care_status; auto ret = Finish (empty_handle, &dont_care_status); assert (dont_care_status.ok ()); return ret; } virtual Slice Finish (const BlockHandle& tmp, Status* status) = 0 ; };
BlockBasedFilterBlockBuilder BlockBasedFilterBlockBuilder
调用::Finish
函数最后生成的字符串数据格式,如下:
1 2 3 4 5 6 7 8 9 filter_1 filter_2 ... filter_N ... filter_1_offset filter_2_offset ... filter_N_offset
下面先大致看下结构。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { public : BlockBasedFilterBlockBuilder (const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt); BlockBasedFilterBlockBuilder (const BlockBasedFilterBlockBuilder&) = delete ; void operator =(const BlockBasedFilterBlockBuilder&) = delete ; virtual bool IsBlockBased () override { return true ; } virtual void StartBlock (uint64_t block_offset) override ; virtual void Add (const Slice& key_without_ts) override ; virtual bool IsEmpty () const override { return start_.empty () && filter_offsets_.empty (); } virtual size_t EstimateEntriesAdded () override { return filter_bits_builder_->EstimateEntriesAdded (); } virtual Slice Finish (const BlockHandle& tmp, Status* status) override ; using FilterBlockBuilder::Finish; private : void AddKey (const Slice& key) ; void AddPrefix (const Slice& key) ; void GenerateFilter () ; const FilterPolicy* policy_; const SliceTransform* prefix_extractor_; bool whole_key_filtering_; size_t prev_prefix_start_; size_t prev_prefix_size_; std::string entries_; std::vector<size_t > start_; std::string result_; std::vector<Slice> tmp_entries_; std::vector<uint32_t > filter_offsets_; uint64_t total_added_in_built_; };
Add BlockBasedFilterBlockBuilder
在添加key
时:
如果能提取出待添加key
的前缀,则会添加前缀
如果 whole_key_filtering_ == true
,则也会添加整个key
源码简单如斯。
1 2 3 4 5 6 7 8 9 10 11 void BlockBasedFilterBlockBuilder::Add (const Slice& key_without_ts) { if (prefix_extractor_ && prefix_extractor_->InDomain (key_without_ts)) { AddPrefix (key_without_ts); } if (whole_key_filtering_) { AddKey (key_without_ts); } }
AddKey 在 BlockBasedFilterBlockBuilder
中,entries_
用于存储每次添加的key
,start_
存储这个key
在entries_
中偏移量。
1 2 3 4 inline void BlockBasedFilterBlockBuilder::AddKey (const Slice& key) { start_.push_back (entries_.size ()); entries_.append (key.data (), key.size ()); }
AddPrefix 非首次添加前缀时,都会先获得上一次添加key的前缀如果本次与上次不同,才添加本次key的前缀。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 inline void BlockBasedFilterBlockBuilder::AddPrefix (const Slice& key) { Slice prev; if (prev_prefix_size_ > 0 ) { prev = Slice (entries_.data () + prev_prefix_start_, prev_prefix_size_); } Slice prefix = prefix_extractor_->Transform (key); if (prev.size () == 0 || prefix != prev) { prev_prefix_start_ = entries_.size (); prev_prefix_size_ = prefix.size (); AddKey (prefix); } }
StartBlock StartBlock
函数用于构建filter block
。
BlockBasedFilterBlockBuilder
是每 2kb的数据创建一个filter block
,传入的参数block_offset
是表示最终生成的filter block
在result_
中的偏移量。
而 filter_offsets_
中记录的是已有filter block
在result_
中的偏移量。
1 2 3 4 5 6 7 8 9 10 static const size_t kFilterBaseLg = 11 ;static const size_t kFilterBase = 1 << kFilterBaseLg; void BlockBasedFilterBlockBuilder::StartBlock (uint64_t block_offset) { uint64_t filter_index = (block_offset / kFilterBase); assert (filter_index >= filter_offsets_.size ()); while (filter_index > filter_offsets_.size ()) { GenerateFilter (); } }
GenerateFilter 每个filter block
的数据都记录在entries_
和 start_
,当调用 GenerateFilter
函数时就会将entries_
、 start_
中的数据添加到result_
中,而这个filter block
的起始地址记录在filter_offsets_
中。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 void BlockBasedFilterBlockBuilder::GenerateFilter () { const size_t num_entries = start_.size (); if (num_entries == 0 ) { filter_offsets_.push_back (static_cast <uint32_t >(result_.size ())); return ; } total_added_in_built_ += num_entries; start_.push_back (entries_.size ()); tmp_entries_.resize (num_entries); for (size_t i = 0 ; i < num_entries; i++) { const char * base = entries_.data () + start_[i]; size_t length = start_[i + 1 ] - start_[i]; tmp_entries_[i] = Slice (base, length); } filter_offsets_.push_back (static_cast <uint32_t >(result_.size ())); policy_->CreateFilter (&tmp_entries_[0 ], static_cast <int >(num_entries), &result_); tmp_entries_.clear (); entries_.clear (); start_.clear (); prev_prefix_start_ = 0 ; prev_prefix_size_ = 0 ; }
Finish 在调用::Finish
函数时,需要result_
向result_
中写入三部分:
每个key
的偏移量KEY_OFFSET
key
的数量FILTER_BLOCK_NUM
结束标志 kFilterBaseLg
代码简单如此。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 Slice BlockBasedFilterBlockBuilder::Finish (const BlockHandle& , Status* status) { *status = Status::OK (); if (!start_.empty ()) { GenerateFilter (); } const uint32_t array_offset = static_cast <uint32_t >(result_.size ()); for (size_t i = 0 ; i < filter_offsets_.size (); i++) { PutFixed32 (&result_, filter_offsets_[i]); } PutFixed32 (&result_, array_offset); result_.push_back (kFilterBaseLg); return Slice (result_); }
FullFilterBlockBuilder 他就是将全部的的key都放在了一起、。
TODO:设计点
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 class FullFilterBlockBuilder : public FilterBlockBuilder { public : explicit FullFilterBlockBuilder (const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder) ; FullFilterBlockBuilder (const FullFilterBlockBuilder&) = delete ; void operator =(const FullFilterBlockBuilder&) = delete ; ~FullFilterBlockBuilder () {} virtual bool IsBlockBased () override { return false ; } virtual void StartBlock (uint64_t ) override {} virtual bool IsEmpty () const override { return !any_added_; } virtual size_t EstimateEntriesAdded () override ; virtual void Add (const Slice& key_without_ts) override ; using FilterBlockBuilder::Finish; virtual Slice Finish (const BlockHandle& tmp, Status* status) override ; protected : virtual void AddKey (const Slice& key) ; virtual void Reset () ; void AddPrefix (const Slice& key) ; const SliceTransform* prefix_extractor () { return prefix_extractor_; } const std::string& last_prefix_str () const { return last_prefix_str_; } std::unique_ptr<FilterBitsBuilder> filter_bits_builder_; private : const SliceTransform* prefix_extractor_; bool whole_key_filtering_; bool last_whole_key_recorded_; std::string last_whole_key_str_; bool last_prefix_recorded_; std::string last_prefix_str_; bool last_key_in_domain_; bool any_added_; std::unique_ptr<const char []> filter_data_; };
Add
根据whole_key_filtering_
来确定是否添加整个key_without_ts
根据add_prefix
,来看是否添加key_without_ts
的前缀
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 void FullFilterBlockBuilder::Add (const Slice& key_without_ts) { const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain (key_without_ts); if (!last_prefix_recorded_ && last_key_in_domain_) { AddKey (last_prefix_str_); last_prefix_recorded_ = true ; } if (whole_key_filtering_) { if (!add_prefix) { AddKey (key_without_ts); } else { Slice last_whole_key = Slice (last_whole_key_str_); if (!last_whole_key_recorded_ || last_whole_key.compare (key_without_ts) != 0 ) { AddKey (key_without_ts); last_whole_key_recorded_ = true ; last_whole_key_str_.assign (key_without_ts.data (), key_without_ts.size ()); } } } if (add_prefix) { last_key_in_domain_ = true ; AddPrefix (key_without_ts); } else { last_key_in_domain_ = false ; } }
AddKey 1 2 3 4 inline void FullFilterBlockBuilder::AddKey (const Slice& key) { filter_bits_builder_->AddKey (key); any_added_ = true ; }
AddPrefix
last_prefix_str_
:记录上一次添加的前缀
如果whole_key_filtering_ == true
,那么,从FullFilterBlockBuilder::Add
函数运行至FullFilterBlockBuilder::AddPrefix
函数结束,添加的数据是:
但是如果prefix
和上一个添加的前缀相同,则忽略本次的前缀,最终添加的内容只有
如果whole_key_filtering_ == false
,添加的数据只有prefix
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 void FullFilterBlockBuilder::AddPrefix (const Slice& key) { assert (prefix_extractor_ && prefix_extractor_->InDomain (key)); Slice prefix = prefix_extractor_->Transform (key); if (whole_key_filtering_) { Slice last_prefix = Slice (last_prefix_str_); if (!last_prefix_recorded_ || last_prefix.compare (prefix) != 0 ) { AddKey (prefix); last_prefix_recorded_ = true ; last_prefix_str_.assign (prefix.data (), prefix.size ()); } } else { AddKey (prefix); } }
Finish 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 void FullFilterBlockBuilder::Reset () { last_whole_key_recorded_ = false ; last_prefix_recorded_ = false ; } Slice FullFilterBlockBuilder::Finish (const BlockHandle& , Status* status) { Reset (); *status = Status::OK (); if (any_added_) { any_added_ = false ; return filter_bits_builder_->Finish (&filter_data_); } return Slice (); }
PartitionedFilterBlockBuilder 对应于 PartitionedIndexBuilder
,rocksdb 现在是一个partition filter block
对应一个partition index block
。
因此,每生成一个 partition filter block
也会请求 PartitionedIndexBuilder
生成一个 partition index block
,来保证一一对应。
两种原理都差不多。
在PartitionedFilterBlockBuilder
内部,也有个FilterEntry
结构体,类似于PartitionedIndexBuilder::Entry
,每个value
都是一个filter
。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { public : explicit PartitionedFilterBlockBuilder ( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size) ; virtual ~PartitionedFilterBlockBuilder (); void AddKey (const Slice& key) override ; void Add (const Slice& key) override ; size_t EstimateEntriesAdded () override ; virtual Slice Finish (const BlockHandle& last_partition_block_handle, Status* status) override ; private : BlockBuilder index_on_filter_block_builder_; BlockBuilder index_on_filter_block_builder_without_seq_; struct FilterEntry { std::string key; Slice filter; }; std::list<FilterEntry> filters; std::unique_ptr<IndexBuilder> value; std::vector<std::unique_ptr<const char []>> filter_gc; bool finishing_filters = false ; void MaybeCutAFilterBlock (const Slice* next_key) ; PartitionedIndexBuilder* const p_index_builder_; uint32_t keys_per_partition_; uint32_t keys_added_to_partition_; uint64_t total_added_in_built_; BlockHandle last_encoded_handle_; };
MaybeCutAFilterBlock MaybeCutAFilterBlock
函数,用于判断当前是否需要生成一个filter block
: 当前 partition
添加的key的数量 keys_added_to_partition_
已经到达阈值 keys_per_partition_
。
由于partition filter block
和 partition index block
的数目是一一对应的。因此,需要请求 PartitionedIndexBuilder
对象 p_index_builder_
也生成一个partition index block
。此外,生成的partition idnex block
和 partition filter block
对应的key是相同的。
1 2 3 4 5 6 7 8 9 struct PartitionedIndexBuilder ::Entry { std::string key; std::unique_ptr<ShortenedIndexBuilder> value; }; struct PartitionedFilterBlockBuilderFilterEntry { std::string key; Slice filter; };
MaybeCutAFilterBlock
的函数分为三步:
如果当前 partition filter block
的keys_added_to_partition_
已达到阈值,则请求对应的partiiton index block
也创建完毕,否则就阻塞等待;
将待添加的next_key
的前缀next_key_prefix
添加到partition filer block
;
生成partition filter block
,并添加到filters
中。
顺着源码更加详细地去理解这个过程。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock (const Slice* next_key) { if (keys_added_to_partition_ == keys_per_partition_) { p_index_builder_->RequestPartitionCut (); } if (!p_index_builder_->ShouldCutFilterBlock ()) { return ; } filter_gc.push_back (std::unique_ptr <const char []>(nullptr )); const bool maybe_add_prefix = next_key && prefix_extractor () && prefix_extractor ()->InDomain (*next_key); if (maybe_add_prefix) { const Slice next_key_prefix = prefix_extractor ()->Transform (*next_key); if (next_key_prefix.compare (last_prefix_str ()) != 0 ) { AddKey (next_key_prefix); } } total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded (); Slice filter = filter_bits_builder_->Finish (&filter_gc.back ()); std::string& index_key = p_index_builder_->GetPartitionKey (); filters.push_back ({index_key, filter}); keys_added_to_partition_ = 0 ; Reset (); }
AddKey 添加 key
至当前partition filter block
中。
1 2 3 4 void PartitionedFilterBlockBuilder::AddKey (const Slice& key) { FullFilterBlockBuilder::AddKey (key); keys_added_to_partition_++; }
Add 由于PartitionedFilterBlockBuilder
继承自FullFilterBlockBuilder
,唯一的区别就在于将一整个Filter Block
,划分为多个partitions
。
因此,在每次Add
新的key
,都先判断下是是否要需要生成新的partition filter block
,然后将key
添加到当前partiton filter block
。
1 2 3 4 void PartitionedFilterBlockBuilder::Add (const Slice& key) { MaybeCutAFilterBlock (&key); FullFilterBlockBuilder::Add (key); }
Finish 类似于PartitionIndexBuiler
, 这里要将fiters
中记录的partitions
都写入到SST中,并记录下每个partition
在SST中的偏移量offset,根据partiton
的大小size和offset生成 TopLevelIndex
。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 Slice PartitionedFilterBlockBuilder::Finish ( const BlockHandle& last_partition_block_handle, Status* status) { if (finishing_filters == true ) { FilterEntry& last_entry = filters.front (); std::string handle_encoding; last_partition_block_handle.EncodeTo (&handle_encoding); std::string handle_delta_encoding; PutVarsignedint64 ( &handle_delta_encoding, last_partition_block_handle.size () - last_encoded_handle_.size ()); last_encoded_handle_ = last_partition_block_handle; const Slice handle_delta_encoding_slice (handle_delta_encoding) ; index_on_filter_block_builder_.Add (last_entry.key, handle_encoding, &handle_delta_encoding_slice); if (!p_index_builder_->seperator_is_key_plus_seq ()) { index_on_filter_block_builder_without_seq_.Add ( ExtractUserKey (last_entry.key), handle_encoding, &handle_delta_encoding_slice); } filters.pop_front (); } else { MaybeCutAFilterBlock (nullptr ); } if (UNLIKELY (filters.empty ())) { *status = Status::OK (); if (finishing_filters) { total_added_in_built_ = 0 ; if (p_index_builder_->seperator_is_key_plus_seq ()) { return index_on_filter_block_builder_.Finish (); } else { return index_on_filter_block_builder_without_seq_.Finish (); } } else { return Slice (); } } else { *status = Status::Incomplete (); finishing_filters = true ; return filters.front ().filter; } }