FilterBlock 源码分析

Rocksdb,为了加速查询,基于bloom filter 设计了Filter。

FilterBlockBuilder

FilterBlockBuilder 是个基类,用于为特定的table构建filter,调用::Finish函数后会以字符串形式返回生成filter。它也有三个子类:

  • BlockBasedFilterBlockBuilder
  • FullFilterBlockBuilder
  • PartitionedFilterBlockBuilder

FilterBlockBuilder 的成员函数的调用顺序,要符合正则表达式: (StartBlock Add*)* Finish

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
class FilterBlockBuilder {
public:
explicit FilterBlockBuilder() {}
virtual ~FilterBlockBuilder() {}

FilterBlockBuilder(const FilterBlockBuilder&) = delete;
void operator=(const FilterBlockBuilder&) = delete;

virtual bool IsBlockBased() = 0; // If is blockbased filter
virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter
virtual void Add(const Slice& key_without_ts) = 0; // Add a key to current filter
virtual bool IsEmpty() const = 0; // Empty == none added
/// For reporting stats on how many entries the builder considered unique
virtual size_t EstimateEntriesAdded() = 0;
/// Generate Filter
Slice Finish() {
const BlockHandle empty_handle;
Status dont_care_status;
auto ret = Finish(empty_handle, &dont_care_status);
assert(dont_care_status.ok());
return ret;
}
virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0;
};

BlockBasedFilterBlockBuilder

BlockBasedFilterBlockBuilder 调用::Finish函数最后生成的字符串数据格式,如下:

1
2
3
4
5
6
7
8
9
filter_1
filter_2
...
filter_N
...
filter_1_offset
filter_2_offset
...
filter_N_offset

下面先大致看下结构。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
class BlockBasedFilterBlockBuilder : public FilterBlockBuilder {
public:
BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor,
const BlockBasedTableOptions& table_opt);
// No copying allowed
BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&) = delete;
void operator=(const BlockBasedFilterBlockBuilder&) = delete;

virtual bool IsBlockBased() override { return true; }
virtual void StartBlock(uint64_t block_offset) override;
virtual void Add(const Slice& key_without_ts) override;
virtual bool IsEmpty() const override {
return start_.empty() && filter_offsets_.empty();
}

virtual size_t EstimateEntriesAdded() override {
return filter_bits_builder_->EstimateEntriesAdded();
}

virtual Slice Finish(const BlockHandle& tmp, Status* status) override;
using FilterBlockBuilder::Finish;

private:
void AddKey(const Slice& key);
void AddPrefix(const Slice& key);
void GenerateFilter();

const FilterPolicy* policy_;
const SliceTransform* prefix_extractor_;
bool whole_key_filtering_;

size_t prev_prefix_start_; // 上次添加到 entries_ 中的 key_prefix 的起始地址
size_t prev_prefix_size_; // 该 key_prefix 的大小

std::string entries_; // 将所有的key打平后存储在 entries 中
std::vector<size_t> start_; // 每个entry的的索引存储在 start_

std::string result_; // Filter data computed so far
std::vector<Slice> tmp_entries_; // policy_->CreateFilter() argument
std::vector<uint32_t> filter_offsets_;
uint64_t total_added_in_built_; // Total keys added to filters built so far
};

Add

BlockBasedFilterBlockBuilder 在添加key时:

  • 如果能提取出待添加key的前缀,则会添加前缀
  • 如果 whole_key_filtering_ == true ,则也会添加整个key

源码简单如斯。

1
2
3
4
5
6
7
8
9
10
11
void BlockBasedFilterBlockBuilder::Add(const Slice& key_without_ts) {
// 是能提取出 key_without_ts 的前缀,则添加前缀
if (prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts)) {
AddPrefix(key_without_ts);
}

// 是否添加添加整个 key
if (whole_key_filtering_) {
AddKey(key_without_ts);
}
}

AddKey

BlockBasedFilterBlockBuilder 中,entries_ 用于存储每次添加的keystart_ 存储这个keyentries_中偏移量。

1
2
3
4
inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) {
start_.push_back(entries_.size()); // entries_.size() 即下一个 key 在 entries_ 中的起始偏移量
entries_.append(key.data(), key.size());
}

AddPrefix

非首次添加前缀时,都会先获得上一次添加key的前缀如果本次与上次不同,才添加本次key的前缀。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) {

Slice prev;
if (prev_prefix_size_ > 0) {
// 上一个添加的 key 的前缀
prev = Slice(entries_.data() + prev_prefix_start_, prev_prefix_size_);
}

// 本次添加的key的前缀
Slice prefix = prefix_extractor_->Transform(key);
// 如果当前前缀 prefix 和之前的 prev 不同,则插入
if (prev.size() == 0 || prefix != prev) {
prev_prefix_start_ = entries_.size(); // 待插入的 key 在entries_中的起始地址
prev_prefix_size_ = prefix.size(); // 待插入的key的前缀
AddKey(prefix);
}
}

StartBlock

StartBlock 函数用于构建filter block

BlockBasedFilterBlockBuilder 是每 2kb的数据创建一个filter block,传入的参数block_offset是表示最终生成的filter blockresult_中的偏移量。

filter_offsets_中记录的是已有filter blockresult_ 中的偏移量。

1
2
3
4
5
6
7
8
9
10
static const size_t kFilterBaseLg = 11;
static const size_t kFilterBase = 1 << kFilterBaseLg; // 2kb

void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) {
uint64_t filter_index = (block_offset / kFilterBase);
assert(filter_index >= filter_offsets_.size());
while (filter_index > filter_offsets_.size()) {
GenerateFilter();
}
}

GenerateFilter

每个filter block的数据都记录在entries_start_,当调用 GenerateFilter 函数时就会将entries_start_中的数据添加到result_中,而这个filter block 的起始地址记录在filter_offsets_中。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
void BlockBasedFilterBlockBuilder::GenerateFilter() {
const size_t num_entries = start_.size();
if (num_entries == 0) {
// 如果这个 filter block 没有key,直接存储上次生成的滤波器的起始地址
filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
return;
}
total_added_in_built_ += num_entries;

// 为便于下面使用 start_[i + 1] - start_[i] 计算每个entry的长度
start_.push_back(entries_.size());
tmp_entries_.resize(num_entries);
for (size_t i = 0; i < num_entries; i++) {
const char* base = entries_.data() + start_[i]; // 每个entry的起始地址
size_t length = start_[i + 1] - start_[i]; // 这个entry的长度
tmp_entries_[i] = Slice(base, length); // slice
}

// filter_offsets_ 中保存着每个 filter block 的起始地址,即 result.size()
filter_offsets_.push_back(static_cast<uint32_t>(result_.size()));
// 基于 tmp_entries 的内容创建 filter,结果存储至 result_
policy_->CreateFilter(&tmp_entries_[0],
static_cast<int>(num_entries),
&result_);

// 清零,为下个 filter block 准备
tmp_entries_.clear();
entries_.clear();
start_.clear();
prev_prefix_start_ = 0;
prev_prefix_size_ = 0;
}

Finish

在调用::Finish函数时,需要result_result_中写入三部分:

  • 每个key的偏移量KEY_OFFSET
  • key的数量FILTER_BLOCK_NUM
  • 结束标志 kFilterBaseLg

代码简单如此。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/,
Status* status) {
*status = Status::OK();
if (!start_.empty()) {
GenerateFilter();
}

const uint32_t array_offset = static_cast<uint32_t>(result_.size());
for (size_t i = 0; i < filter_offsets_.size(); i++) {
// 把每个 filter block 的偏移量添加到result
PutFixed32(&result_, filter_offsets_[i]);
}

// 添加 filter block 的数量
PutFixed32(&result_, array_offset);
// 结束标志位
result_.push_back(kFilterBaseLg);
return Slice(result_);
}

FullFilterBlockBuilder

他就是将全部的的key都放在了一起、。

TODO:设计点

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
class FullFilterBlockBuilder : public FilterBlockBuilder {
public:
explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor,
bool whole_key_filtering,
FilterBitsBuilder* filter_bits_builder);
FullFilterBlockBuilder(const FullFilterBlockBuilder&) = delete;
void operator=(const FullFilterBlockBuilder&) = delete;

~FullFilterBlockBuilder() {}

virtual bool IsBlockBased() override { return false; } // 区分 BlockBasedFilterBlockBuilder
virtual void StartBlock(uint64_t) override {}
virtual bool IsEmpty() const override { return !any_added_; }

virtual size_t EstimateEntriesAdded() override;
virtual void Add(const Slice& key_without_ts) override;

using FilterBlockBuilder::Finish;
virtual Slice Finish(const BlockHandle& tmp, Status* status) override;

protected:
virtual void AddKey(const Slice& key);
virtual void Reset();
void AddPrefix(const Slice& key);

const SliceTransform* prefix_extractor() { return prefix_extractor_; }
const std::string& last_prefix_str() const { return last_prefix_str_; }

std::unique_ptr<FilterBitsBuilder> filter_bits_builder_;
private:
const SliceTransform* prefix_extractor_; // 前缀提取器
bool whole_key_filtering_;
bool last_whole_key_recorded_;
std::string last_whole_key_str_;
bool last_prefix_recorded_;
std::string last_prefix_str_;
bool last_key_in_domain_;
bool any_added_;
std::unique_ptr<const char[]> filter_data_;
};

Add

  1. 根据whole_key_filtering_ 来确定是否添加整个key_without_ts
  2. 根据add_prefix,来看是否添加key_without_ts的前缀
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
void FullFilterBlockBuilder::Add(const Slice& key_without_ts) {
const bool add_prefix =
prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts);

if (!last_prefix_recorded_ && last_key_in_domain_) {
AddKey(last_prefix_str_);
last_prefix_recorded_ = true;
}

if (whole_key_filtering_) {
// 如果没前缀,则直接添加key
if (!add_prefix) {
AddKey(key_without_ts);
} else {
// 第一个 key or 遇到新的 key
Slice last_whole_key = Slice(last_whole_key_str_);
if (!last_whole_key_recorded_ ||
last_whole_key.compare(key_without_ts) != 0) {
AddKey(key_without_ts);
last_whole_key_recorded_ = true;
last_whole_key_str_.assign(key_without_ts.data(),
key_without_ts.size());
}
}
}
if (add_prefix) {
last_key_in_domain_ = true;
AddPrefix(key_without_ts);
} else {
last_key_in_domain_ = false;
}
}

AddKey

1
2
3
4
inline void FullFilterBlockBuilder::AddKey(const Slice& key) {
filter_bits_builder_->AddKey(key);
any_added_ = true;
}

AddPrefix

  • last_prefix_str_:记录上一次添加的前缀

  • 如果whole_key_filtering_ == true,那么,从FullFilterBlockBuilder::Add函数运行至FullFilterBlockBuilder::AddPrefix 函数结束,添加的数据是:

    1
    2
    key_without_ts
    prefix

    但是如果prefix和上一个添加的前缀相同,则忽略本次的前缀,最终添加的内容只有

    1
    key_without_ts
  • 如果whole_key_filtering_ == false,添加的数据只有prefix

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
void FullFilterBlockBuilder::AddPrefix(const Slice& key) {
assert(prefix_extractor_ && prefix_extractor_->InDomain(key));
Slice prefix = prefix_extractor_->Transform(key);
if (whole_key_filtering_) {
// 前一个添加的前缀
Slice last_prefix = Slice(last_prefix_str_); // 前缀
// 遇到新的前缀
if (!last_prefix_recorded_ || last_prefix.compare(prefix) != 0) {
// 记录前缀
AddKey(prefix);
last_prefix_recorded_ = true;
last_prefix_str_.assign(prefix.data(), prefix.size());
}
} else {
// 直接添加 prefix
AddKey(prefix);
}
}

Finish

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
void FullFilterBlockBuilder::Reset() {
last_whole_key_recorded_ = false;
last_prefix_recorded_ = false;
}

Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/,
Status* status) {
Reset();
// In this impl we ignore BlockHandle
*status = Status::OK();
if (any_added_) {
any_added_ = false;
return filter_bits_builder_->Finish(&filter_data_);
}
return Slice();
}

PartitionedFilterBlockBuilder

对应于 PartitionedIndexBuilder,rocksdb 现在是一个partition filter block 对应一个partition index block

因此,每生成一个 partition filter block 也会请求 PartitionedIndexBuilder 生成一个 partition index block,来保证一一对应。

两种原理都差不多。

PartitionedFilterBlockBuilder内部,也有个FilterEntry结构体,类似于PartitionedIndexBuilder::Entry,每个value都是一个filter

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
public:
explicit PartitionedFilterBlockBuilder(
const SliceTransform* prefix_extractor,
bool whole_key_filtering,
FilterBitsBuilder* filter_bits_builder,
int index_block_restart_interval,
const bool use_value_delta_encoding,
PartitionedIndexBuilder* const p_index_builder,
const uint32_t partition_size);

virtual ~PartitionedFilterBlockBuilder();

void AddKey(const Slice& key) override;
void Add(const Slice& key) override;
size_t EstimateEntriesAdded() override;

virtual Slice Finish(const BlockHandle& last_partition_block_handle,
Status* status) override;

private:

BlockBuilder index_on_filter_block_builder_; // top-level index builder
BlockBuilder index_on_filter_block_builder_without_seq_;
struct FilterEntry {
std::string key;
Slice filter;
};
std::list<FilterEntry> filters; // list of partitioned indexes and their keys
std::unique_ptr<IndexBuilder> value; // 可以存储 PartitionedIndexBuilder
std::vector<std::unique_ptr<const char[]>> filter_gc;
bool finishing_filters = false; // 为true,表示正调用 Finish 函数,尚未返回 Status::OK()

void MaybeCutAFilterBlock(const Slice* next_key);

PartitionedIndexBuilder* const p_index_builder_;
uint32_t keys_per_partition_; // 每个 partition 中 key 的数量
uint32_t keys_added_to_partition_;// 最近一个partion添加的key的数量
uint64_t total_added_in_built_; // 总共添加的key的数量
BlockHandle last_encoded_handle_;
};

MaybeCutAFilterBlock

MaybeCutAFilterBlock 函数,用于判断当前是否需要生成一个filter block当前 partition添加的key的数量 keys_added_to_partition_ 已经到达阈值 keys_per_partition_

由于partition filter blockpartition index block 的数目是一一对应的。因此,需要请求 PartitionedIndexBuilder 对象 p_index_builder_ 也生成一个partition index block。此外,生成的partition idnex blockpartition filter block 对应的key是相同的。

1
2
3
4
5
6
7
8
9
struct PartitionedIndexBuilder::Entry {
std::string key; // sub_index_last_key_;
std::unique_ptr<ShortenedIndexBuilder> value; // partition index block
};

struct PartitionedFilterBlockBuilderFilterEntry {
std::string key; // sub_index_last_key_
Slice filter; // partition filter block
};

MaybeCutAFilterBlock 的函数分为三步:

  1. 如果当前 partition filter blockkeys_added_to_partition_已达到阈值,则请求对应的partiiton index block也创建完毕,否则就阻塞等待;
  2. 将待添加的next_key的前缀next_key_prefix添加到partition filer block
  3. 生成partition filter block,并添加到filters 中。

顺着源码更加详细地去理解这个过程。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(const Slice* next_key) {

// 当前 parition 的 key 达到阈值
if (keys_added_to_partition_ == keys_per_partition_) {
// 请求 p_index_builder_ 生成一个 partition
p_index_builder_->RequestPartitionCut();
}

if (!p_index_builder_->ShouldCutFilterBlock()) {
return;
}

/// p_index_builder_ 已生成一个 partition

filter_gc.push_back(std::unique_ptr<const char[]>(nullptr));

// 判断是否需要添加 next_key 的前缀
const bool maybe_add_prefix =
next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key);
if (maybe_add_prefix) {
const Slice next_key_prefix = prefix_extractor()->Transform(*next_key);
// 与之前的前缀不同,则添加
if (next_key_prefix.compare(last_prefix_str()) != 0) {
AddKey(next_key_prefix);
}
}

total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded();
// 生成一个 filter block
Slice filter = filter_bits_builder_->Finish(&filter_gc.back());
// index_key 是 partition index block 存储的时的 key
std::string& index_key = p_index_builder_->GetPartitionKey();
// {index_key, PartitionFilter}
filters.push_back({index_key, filter});
// 当前 partition 已添加的key数清零
keys_added_to_partition_ = 0;
Reset();
}

AddKey

添加 key 至当前partition filter block 中。

1
2
3
4
void PartitionedFilterBlockBuilder::AddKey(const Slice& key) {
FullFilterBlockBuilder::AddKey(key);
keys_added_to_partition_++;
}

Add

由于PartitionedFilterBlockBuilder 继承自FullFilterBlockBuilder,唯一的区别就在于将一整个Filter Block,划分为多个partitions

因此,在每次Add新的key,都先判断下是是否要需要生成新的partition filter block,然后将key添加到当前partiton filter block

1
2
3
4
void PartitionedFilterBlockBuilder::Add(const Slice& key) {
MaybeCutAFilterBlock(&key);
FullFilterBlockBuilder::Add(key);
}

Finish

类似于PartitionIndexBuiler, 这里要将fiters 中记录的partitions 都写入到SST中,并记录下每个partition 在SST中的偏移量offset,根据partiton的大小size和offset生成 TopLevelIndex

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
Slice PartitionedFilterBlockBuilder::Finish(
const BlockHandle& last_partition_block_handle,
Status* status) {

if (finishing_filters == true) {
FilterEntry& last_entry = filters.front();
std::string handle_encoding;
last_partition_block_handle.EncodeTo(&handle_encoding);
std::string handle_delta_encoding;
PutVarsignedint64(
&handle_delta_encoding,
last_partition_block_handle.size() - last_encoded_handle_.size());
last_encoded_handle_ = last_partition_block_handle;
const Slice handle_delta_encoding_slice(handle_delta_encoding);
index_on_filter_block_builder_.Add(last_entry.key, // key 相同
handle_encoding, // block_handle
&handle_delta_encoding_slice);
if (!p_index_builder_->seperator_is_key_plus_seq()) {
index_on_filter_block_builder_without_seq_.Add(
ExtractUserKey(last_entry.key), handle_encoding,
&handle_delta_encoding_slice);
}
filters.pop_front();
} else {
// 第一次调用 ::Finish 函数,此时没有 next_key,
// 仅仅是为了生成最后一个 partition fiter block
MaybeCutAFilterBlock(nullptr);
}

if (UNLIKELY(filters.empty())) {
*status = Status::OK();
if (finishing_filters) {
/// 最后一次调用 ::Finish
total_added_in_built_ = 0;
// 返回partitions的元信息
if (p_index_builder_->seperator_is_key_plus_seq()) {
return index_on_filter_block_builder_.Finish();
} else {
return index_on_filter_block_builder_without_seq_.Finish();
}
} else {
// This is the rare case where no key was added to the filter
return Slice();
}
} else {
// 返回当前 partition 的信息
*status = Status::Incomplete();
finishing_filters = true;
return filters.front().filter;
}
}