aboutsummaryrefslogtreecommitdiff
path: root/icing/index/lite/lite-index.cc
diff options
context:
space:
mode:
Diffstat (limited to 'icing/index/lite/lite-index.cc')
-rw-r--r--icing/index/lite/lite-index.cc275
1 files changed, 173 insertions, 102 deletions
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index bf54dec..3f9cc93 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -36,6 +36,8 @@
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/hit/hit.h"
#include "icing/index/lite/lite-index-header.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/core/icing-timer.h"
@@ -44,10 +46,13 @@
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mmapper.h"
#include "icing/proto/debug.pb.h"
+#include "icing/proto/scoring.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/store/namespace-id.h"
+#include "icing/store/suggestion-result-checker.h"
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
@@ -160,10 +165,11 @@ libtextclassifier3::Status LiteIndex::Initialize() {
}
// Set up header.
- header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
+ header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size());
header_ = std::make_unique<LiteIndex_HeaderImpl>(
reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
- header_mmap_.address()));
+ header_mmap_.address()),
+ options_.include_property_existence_metadata_hits);
header_->Reset();
if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
@@ -175,10 +181,11 @@ libtextclassifier3::Status LiteIndex::Initialize() {
UpdateChecksum();
} else {
- header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
+ header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size());
header_ = std::make_unique<LiteIndex_HeaderImpl>(
reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
- header_mmap_.address()));
+ header_mmap_.address()),
+ options_.include_property_existence_metadata_hits);
if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
sizeof(TermIdHitPair::Value), header_->cur_size(),
@@ -352,6 +359,73 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
return tvi;
}
+void LiteIndex::ScoreAndAppendFetchedHit(
+ const Hit& hit, SectionIdMask section_id_mask,
+ bool only_from_prefix_sections,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
+ const SuggestionResultChecker* suggestion_result_checker,
+ DocumentId& last_document_id, bool& is_last_document_desired,
+ int& total_score_out, std::vector<DocHitInfo>* hits_out,
+ std::vector<Hit::TermFrequencyArray>* term_frequency_out) const {
+ // Check sections.
+ if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) {
+ return;
+ }
+ // Check prefix section only.
+ if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
+ return;
+ }
+ // Check whether this Hit is desired.
+ // TODO(b/230553264) Move common logic into helper function once we support
+ // score term by prefix_hit in lite_index.
+ DocumentId document_id = hit.document_id();
+ bool is_new_document = document_id != last_document_id;
+ if (is_new_document) {
+ last_document_id = document_id;
+ is_last_document_desired =
+ suggestion_result_checker == nullptr ||
+ suggestion_result_checker->BelongsToTargetResults(document_id,
+ hit.section_id());
+ }
+ if (!is_last_document_desired) {
+ // The document is removed or expired or not desired.
+ return;
+ }
+
+ // Score the hit by the strategy
+ switch (score_by) {
+ case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE:
+ total_score_out = 1;
+ break;
+ case SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT:
+ if (is_new_document) {
+ ++total_score_out;
+ }
+ break;
+ case SuggestionScoringSpecProto::SuggestionRankingStrategy::TERM_FREQUENCY:
+ if (hit.has_term_frequency()) {
+ total_score_out += hit.term_frequency();
+ } else {
+ ++total_score_out;
+ }
+ break;
+ }
+
+ // Append the Hit or update hit section to the output vector.
+ if (is_new_document && hits_out != nullptr) {
+ hits_out->push_back(DocHitInfo(document_id));
+ if (term_frequency_out != nullptr) {
+ term_frequency_out->push_back(Hit::TermFrequencyArray());
+ }
+ }
+ if (hits_out != nullptr) {
+ hits_out->back().UpdateSection(hit.section_id());
+ if (term_frequency_out != nullptr) {
+ term_frequency_out->back()[hit.section_id()] = hit.term_frequency();
+ }
+ }
+}
+
int LiteIndex::FetchHits(
uint32_t term_id, SectionIdMask section_id_mask,
bool only_from_prefix_sections,
@@ -359,19 +433,38 @@ int LiteIndex::FetchHits(
const SuggestionResultChecker* suggestion_result_checker,
std::vector<DocHitInfo>* hits_out,
std::vector<Hit::TermFrequencyArray>* term_frequency_out) {
- int score = 0;
- DocumentId last_document_id = kInvalidDocumentId;
- // Record whether the last document belongs to the given namespaces.
- bool is_last_document_desired = false;
-
- if (NeedSort()) {
- // Transition from shared_lock in NeedSort to unique_lock here is safe
- // because it doesn't hurt to sort again if sorting was done already by
- // another thread after NeedSort is evaluated. NeedSort is called before
- // sorting to improve concurrency as threads can avoid acquiring the unique
- // lock if no sorting is needed.
+ bool need_sort_at_querying = false;
+ {
+ absl_ports::shared_lock l(&mutex_);
+
+ // We sort here when:
+ // 1. We don't enable sorting at indexing time (i.e. we sort at querying
+ // time), and there is an unsorted tail portion. OR
+ // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold,
+ // regardless of whether or not hit_buffer_sort_at_indexing is enabled.
+ // This is more of a sanity check. We should not really be encountering
+ // this case.
+ need_sort_at_querying = NeedSortAtQuerying();
+ }
+ if (need_sort_at_querying) {
absl_ports::unique_lock l(&mutex_);
- SortHits();
+ IcingTimer timer;
+
+ // Transition from shared_lock to unique_lock is safe here because it
+ // doesn't hurt to sort again if sorting was done already by another thread
+ // after need_sort_at_querying is evaluated.
+ // We check need_sort_at_querying to improve query concurrency as threads
+ // can avoid acquiring the unique lock if no sorting is needed.
+ SortHitsImpl();
+
+ if (options_.hit_buffer_sort_at_indexing) {
+ // This is the second case for sort. Log as this should be a very rare
+ // occasion.
+ ICING_LOG(WARNING) << "Sorting HitBuffer at querying time when "
+ "hit_buffer_sort_at_indexing is enabled. Sort and "
+ "merge HitBuffer in "
+ << timer.Elapsed() * 1000 << " ms.";
+ }
}
// This downgrade from an unique_lock to a shared_lock is safe because we're
@@ -379,75 +472,72 @@ int LiteIndex::FetchHits(
// only in Seek().
// Any operations that might execute in between the transition of downgrading
// the lock here are guaranteed not to alter the searchable section (or the
- // LiteIndex due to a global lock in IcingSearchEngine).
+ // LiteIndex) due to a global lock in IcingSearchEngine.
absl_ports::shared_lock l(&mutex_);
- for (uint32_t idx = Seek(term_id); idx < header_->searchable_end(); idx++) {
- TermIdHitPair term_id_hit_pair =
- hit_buffer_.array_cast<TermIdHitPair>()[idx];
- if (term_id_hit_pair.term_id() != term_id) break;
-
- const Hit& hit = term_id_hit_pair.hit();
- // Check sections.
- if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) {
- continue;
- }
- // Check prefix section only.
- if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
- continue;
- }
- // TODO(b/230553264) Move common logic into helper function once we support
- // score term by prefix_hit in lite_index.
- // Check whether this Hit is desired.
- DocumentId document_id = hit.document_id();
- bool is_new_document = document_id != last_document_id;
- if (is_new_document) {
- last_document_id = document_id;
- is_last_document_desired =
- suggestion_result_checker == nullptr ||
- suggestion_result_checker->BelongsToTargetResults(document_id,
- hit.section_id());
- }
- if (!is_last_document_desired) {
- // The document is removed or expired or not desired.
- continue;
- }
- // Score the hit by the strategy
- switch (score_by) {
- case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE:
- score = 1;
- break;
- case SuggestionScoringSpecProto::SuggestionRankingStrategy::
- DOCUMENT_COUNT:
- if (is_new_document) {
- ++score;
- }
- break;
- case SuggestionScoringSpecProto::SuggestionRankingStrategy::
- TERM_FREQUENCY:
- if (hit.has_term_frequency()) {
- score += hit.term_frequency();
- } else {
- ++score;
- }
- break;
- }
+ // Search in the HitBuffer array for Hits with the corresponding term_id.
+ // Hits are added in increasing order of doc ids, so hits that get appended
+ // later have larger docIds. This means that:
+ // 1. Hits in the unsorted tail will have larger docIds than hits in the
+ // sorted portion.
+ // 2. Hits at the end of the unsorted tail will have larger docIds than hits
+ // in the front of the tail.
+ // We want to retrieve hits in descending order of docIds. Therefore we should
+ // search by doing:
+ // 1. Linear search first in reverse iteration order over the unsorted tail
+ // portion.
+ // 2. Followed by binary search on the sorted portion.
+ const TermIdHitPair* array = hit_buffer_.array_cast<TermIdHitPair>();
- // Append the Hit or update hit section to the output vector.
- if (is_new_document && hits_out != nullptr) {
- hits_out->push_back(DocHitInfo(document_id));
- if (term_frequency_out != nullptr) {
- term_frequency_out->push_back(Hit::TermFrequencyArray());
+ DocumentId last_document_id = kInvalidDocumentId;
+ // Record whether the last document belongs to the given namespaces.
+ bool is_last_document_desired = false;
+ int total_score = 0;
+
+ // Linear search over unsorted tail in reverse iteration order.
+ // This should only be performed when hit_buffer_sort_at_indexing is enabled.
+ // When disabled, the entire HitBuffer should be sorted already and only
+ // binary search is needed.
+ if (options_.hit_buffer_sort_at_indexing) {
+ uint32_t unsorted_length = header_->cur_size() - header_->searchable_end();
+ for (uint32_t i = 1; i <= unsorted_length; ++i) {
+ TermIdHitPair term_id_hit_pair = array[header_->cur_size() - i];
+ if (term_id_hit_pair.term_id() == term_id) {
+ // We've found a matched hit.
+ const Hit& matched_hit = term_id_hit_pair.hit();
+ // Score the hit and add to total_score. Also add the hits and its term
+ // frequency info to hits_out and term_frequency_out if the two vectors
+ // are non-null.
+ ScoreAndAppendFetchedHit(matched_hit, section_id_mask,
+ only_from_prefix_sections, score_by,
+ suggestion_result_checker, last_document_id,
+ is_last_document_desired, total_score,
+ hits_out, term_frequency_out);
}
}
- if (hits_out != nullptr) {
- hits_out->back().UpdateSection(hit.section_id());
- if (term_frequency_out != nullptr) {
- term_frequency_out->back()[hit.section_id()] = hit.term_frequency();
- }
+ }
+
+ // Do binary search over the sorted section and repeat the above steps.
+ TermIdHitPair target_term_id_hit_pair(
+ term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency));
+ for (const TermIdHitPair* ptr = std::lower_bound(
+ array, array + header_->searchable_end(), target_term_id_hit_pair);
+ ptr < array + header_->searchable_end(); ++ptr) {
+ if (ptr->term_id() != term_id) {
+ // We've processed all matches. Stop iterating further.
+ break;
}
+
+ const Hit& matched_hit = ptr->hit();
+ // Score the hit and add to total_score. Also add the hits and its term
+ // frequency info to hits_out and term_frequency_out if the two vectors are
+ // non-null.
+ ScoreAndAppendFetchedHit(
+ matched_hit, section_id_mask, only_from_prefix_sections, score_by,
+ suggestion_result_checker, last_document_id, is_last_document_desired,
+ total_score, hits_out, term_frequency_out);
}
- return score;
+ return total_score;
}
libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits(
@@ -455,9 +545,9 @@ libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits(
SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
const SuggestionResultChecker* suggestion_result_checker) {
return FetchHits(term_id, kSectionIdMaskAll,
- /*only_from_prefix_sections=*/false, score_by,
- suggestion_result_checker,
- /*hits_out=*/nullptr);
+ /*only_from_prefix_sections=*/false, score_by,
+ suggestion_result_checker,
+ /*hits_out=*/nullptr);
}
bool LiteIndex::is_full() const {
@@ -515,7 +605,7 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo(
return storage_info;
}
-void LiteIndex::SortHits() {
+void LiteIndex::SortHitsImpl() {
// Make searchable by sorting by hit buffer.
uint32_t sort_len = header_->cur_size() - header_->searchable_end();
if (sort_len <= 0) {
@@ -546,25 +636,6 @@ void LiteIndex::SortHits() {
UpdateChecksum();
}
-uint32_t LiteIndex::Seek(uint32_t term_id) const {
- // Binary search for our term_id. Make sure we get the first
- // element. Using kBeginSortValue ensures this for the hit value.
- TermIdHitPair term_id_hit_pair(
- term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency));
-
- const TermIdHitPair::Value* array =
- hit_buffer_.array_cast<TermIdHitPair::Value>();
- if (header_->searchable_end() != header_->cur_size()) {
- ICING_LOG(WARNING) << "Lite index: hit buffer searchable end != current "
- << "size during Seek(): "
- << header_->searchable_end() << " vs "
- << header_->cur_size();
- }
- const TermIdHitPair::Value* ptr = std::lower_bound(
- array, array + header_->searchable_end(), term_id_hit_pair.value());
- return ptr - array;
-}
-
libtextclassifier3::Status LiteIndex::Optimize(
const std::vector<DocumentId>& document_id_old_to_new,
const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) {
@@ -575,7 +646,7 @@ libtextclassifier3::Status LiteIndex::Optimize(
}
// Sort the hits so that hits with the same term id will be grouped together,
// which helps later to determine which terms will be unused after compaction.
- SortHits();
+ SortHitsImpl();
uint32_t new_size = 0;
uint32_t curr_term_id = 0;
uint32_t curr_tvi = 0;