Diffstat (limited to 'icing/index/lite/lite-index.cc')
-rw-r--r-- | icing/index/lite/lite-index.cc | 275
1 file changed, 173 insertions(+), 102 deletions(-)
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
index bf54dec..3f9cc93 100644
--- a/icing/index/lite/lite-index.cc
+++ b/icing/index/lite/lite-index.cc
@@ -36,6 +36,8 @@
 #include "icing/index/hit/doc-hit-info.h"
 #include "icing/index/hit/hit.h"
 #include "icing/index/lite/lite-index-header.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/term-id-codec.h"
 #include "icing/index/term-property-id.h"
 #include "icing/legacy/core/icing-string-util.h"
 #include "icing/legacy/core/icing-timer.h"
@@ -44,10 +46,13 @@
 #include "icing/legacy/index/icing-filesystem.h"
 #include "icing/legacy/index/icing-mmapper.h"
 #include "icing/proto/debug.pb.h"
+#include "icing/proto/scoring.pb.h"
 #include "icing/proto/storage.pb.h"
 #include "icing/proto/term.pb.h"
 #include "icing/schema/section.h"
 #include "icing/store/document-id.h"
+#include "icing/store/namespace-id.h"
+#include "icing/store/suggestion-result-checker.h"
 #include "icing/util/crc32.h"
 #include "icing/util/logging.h"
 #include "icing/util/status-macros.h"
@@ -160,10 +165,11 @@ libtextclassifier3::Status LiteIndex::Initialize() {
   }
 
   // Set up header.
-  header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
+  header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size());
   header_ = std::make_unique<LiteIndex_HeaderImpl>(
       reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
-          header_mmap_.address()));
+          header_mmap_.address()),
+      options_.include_property_existence_metadata_hits);
   header_->Reset();
 
   if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
@@ -175,10 +181,11 @@ libtextclassifier3::Status LiteIndex::Initialize() {
 
     UpdateChecksum();
   } else {
-    header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
+    header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size());
     header_ = std::make_unique<LiteIndex_HeaderImpl>(
         reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
-            header_mmap_.address()));
+            header_mmap_.address()),
+        options_.include_property_existence_metadata_hits);
 
     if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
                           sizeof(TermIdHitPair::Value), header_->cur_size(),
@@ -352,6 +359,73 @@ libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
   return tvi;
 }
 
+void LiteIndex::ScoreAndAppendFetchedHit(
+    const Hit& hit, SectionIdMask section_id_mask,
+    bool only_from_prefix_sections,
+    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
+    const SuggestionResultChecker* suggestion_result_checker,
+    DocumentId& last_document_id, bool& is_last_document_desired,
+    int& total_score_out, std::vector<DocHitInfo>* hits_out,
+    std::vector<Hit::TermFrequencyArray>* term_frequency_out) const {
+  // Check sections.
+  if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) {
+    return;
+  }
+  // Check prefix section only.
+  if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
+    return;
+  }
+  // Check whether this Hit is desired.
+  // TODO(b/230553264) Move common logic into helper function once we support
+  // score term by prefix_hit in lite_index.
+  DocumentId document_id = hit.document_id();
+  bool is_new_document = document_id != last_document_id;
+  if (is_new_document) {
+    last_document_id = document_id;
+    is_last_document_desired =
+        suggestion_result_checker == nullptr ||
+        suggestion_result_checker->BelongsToTargetResults(document_id,
+                                                          hit.section_id());
+  }
+  if (!is_last_document_desired) {
+    // The document is removed or expired or not desired.
+    return;
+  }
+
+  // Score the hit by the strategy
+  switch (score_by) {
+    case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE:
+      total_score_out = 1;
+      break;
+    case SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT:
+      if (is_new_document) {
+        ++total_score_out;
+      }
+      break;
+    case SuggestionScoringSpecProto::SuggestionRankingStrategy::TERM_FREQUENCY:
+      if (hit.has_term_frequency()) {
+        total_score_out += hit.term_frequency();
+      } else {
+        ++total_score_out;
+      }
+      break;
+  }
+
+  // Append the Hit or update hit section to the output vector.
+  if (is_new_document && hits_out != nullptr) {
+    hits_out->push_back(DocHitInfo(document_id));
+    if (term_frequency_out != nullptr) {
+      term_frequency_out->push_back(Hit::TermFrequencyArray());
+    }
+  }
+  if (hits_out != nullptr) {
+    hits_out->back().UpdateSection(hit.section_id());
+    if (term_frequency_out != nullptr) {
+      term_frequency_out->back()[hit.section_id()] = hit.term_frequency();
+    }
+  }
+}
+
 int LiteIndex::FetchHits(
     uint32_t term_id, SectionIdMask section_id_mask,
     bool only_from_prefix_sections,
@@ -359,19 +433,38 @@ int LiteIndex::FetchHits(
     const SuggestionResultChecker* suggestion_result_checker,
     std::vector<DocHitInfo>* hits_out,
     std::vector<Hit::TermFrequencyArray>* term_frequency_out) {
-  int score = 0;
-  DocumentId last_document_id = kInvalidDocumentId;
-  // Record whether the last document belongs to the given namespaces.
-  bool is_last_document_desired = false;
-
-  if (NeedSort()) {
-    // Transition from shared_lock in NeedSort to unique_lock here is safe
-    // because it doesn't hurt to sort again if sorting was done already by
-    // another thread after NeedSort is evaluated. NeedSort is called before
-    // sorting to improve concurrency as threads can avoid acquiring the unique
-    // lock if no sorting is needed.
+  bool need_sort_at_querying = false;
+  {
+    absl_ports::shared_lock l(&mutex_);
+
+    // We sort here when:
+    // 1. We don't enable sorting at indexing time (i.e. we sort at querying
+    //    time), and there is an unsorted tail portion. OR
+    // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold,
+    //    regardless of whether or not hit_buffer_sort_at_indexing is enabled.
+    //    This is more of a sanity check. We should not really be encountering
+    //    this case.
+    need_sort_at_querying = NeedSortAtQuerying();
+  }
+  if (need_sort_at_querying) {
     absl_ports::unique_lock l(&mutex_);
-    SortHits();
+    IcingTimer timer;
+
+    // Transition from shared_lock to unique_lock is safe here because it
+    // doesn't hurt to sort again if sorting was done already by another thread
+    // after need_sort_at_querying is evaluated.
+    // We check need_sort_at_querying to improve query concurrency as threads
+    // can avoid acquiring the unique lock if no sorting is needed.
+    SortHitsImpl();
+
+    if (options_.hit_buffer_sort_at_indexing) {
+      // This is the second case for sort. Log as this should be a very rare
+      // occasion.
+      ICING_LOG(WARNING) << "Sorting HitBuffer at querying time when "
+                            "hit_buffer_sort_at_indexing is enabled. Sort and "
+                            "merge HitBuffer in "
+                         << timer.Elapsed() * 1000 << " ms.";
+    }
   }
 
   // This downgrade from an unique_lock to a shared_lock is safe because we're
@@ -379,75 +472,72 @@ int LiteIndex::FetchHits(
   // only in Seek().
   // Any operations that might execute in between the transition of downgrading
   // the lock here are guaranteed not to alter the searchable section (or the
-  // LiteIndex due to a global lock in IcingSearchEngine).
+  // LiteIndex) due to a global lock in IcingSearchEngine.
   absl_ports::shared_lock l(&mutex_);
 
-  for (uint32_t idx = Seek(term_id); idx < header_->searchable_end(); idx++) {
-    TermIdHitPair term_id_hit_pair =
-        hit_buffer_.array_cast<TermIdHitPair>()[idx];
-    if (term_id_hit_pair.term_id() != term_id) break;
-
-    const Hit& hit = term_id_hit_pair.hit();
-    // Check sections.
-    if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) {
-      continue;
-    }
-    // Check prefix section only.
-    if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
-      continue;
-    }
-    // TODO(b/230553264) Move common logic into helper function once we support
-    // score term by prefix_hit in lite_index.
-    // Check whether this Hit is desired.
-    DocumentId document_id = hit.document_id();
-    bool is_new_document = document_id != last_document_id;
-    if (is_new_document) {
-      last_document_id = document_id;
-      is_last_document_desired =
-          suggestion_result_checker == nullptr ||
-          suggestion_result_checker->BelongsToTargetResults(document_id,
-                                                            hit.section_id());
-    }
-    if (!is_last_document_desired) {
-      // The document is removed or expired or not desired.
-      continue;
-    }
-    // Score the hit by the strategy
-    switch (score_by) {
-      case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE:
-        score = 1;
-        break;
-      case SuggestionScoringSpecProto::SuggestionRankingStrategy::
-          DOCUMENT_COUNT:
-        if (is_new_document) {
-          ++score;
-        }
-        break;
-      case SuggestionScoringSpecProto::SuggestionRankingStrategy::
-          TERM_FREQUENCY:
-        if (hit.has_term_frequency()) {
-          score += hit.term_frequency();
-        } else {
-          ++score;
-        }
-        break;
-    }
+  // Search in the HitBuffer array for Hits with the corresponding term_id.
+  // Hits are added in increasing order of doc ids, so hits that get appended
+  // later have larger docIds. This means that:
+  // 1. Hits in the unsorted tail will have larger docIds than hits in the
+  //    sorted portion.
+  // 2. Hits at the end of the unsorted tail will have larger docIds than hits
+  //    in the front of the tail.
+  // We want to retrieve hits in descending order of docIds. Therefore we should
+  // search by doing:
+  // 1. Linear search first in reverse iteration order over the unsorted tail
+  //    portion.
+  // 2. Followed by binary search on the sorted portion.
+  const TermIdHitPair* array = hit_buffer_.array_cast<TermIdHitPair>();
 
-    // Append the Hit or update hit section to the output vector.
-    if (is_new_document && hits_out != nullptr) {
-      hits_out->push_back(DocHitInfo(document_id));
-      if (term_frequency_out != nullptr) {
-        term_frequency_out->push_back(Hit::TermFrequencyArray());
+  DocumentId last_document_id = kInvalidDocumentId;
+  // Record whether the last document belongs to the given namespaces.
+  bool is_last_document_desired = false;
+  int total_score = 0;
+
+  // Linear search over unsorted tail in reverse iteration order.
+  // This should only be performed when hit_buffer_sort_at_indexing is enabled.
+  // When disabled, the entire HitBuffer should be sorted already and only
+  // binary search is needed.
+  if (options_.hit_buffer_sort_at_indexing) {
+    uint32_t unsorted_length = header_->cur_size() - header_->searchable_end();
+    for (uint32_t i = 1; i <= unsorted_length; ++i) {
+      TermIdHitPair term_id_hit_pair = array[header_->cur_size() - i];
+      if (term_id_hit_pair.term_id() == term_id) {
+        // We've found a matched hit.
+        const Hit& matched_hit = term_id_hit_pair.hit();
+        // Score the hit and add to total_score. Also add the hits and its term
+        // frequency info to hits_out and term_frequency_out if the two vectors
+        // are non-null.
+        ScoreAndAppendFetchedHit(matched_hit, section_id_mask,
+                                 only_from_prefix_sections, score_by,
+                                 suggestion_result_checker, last_document_id,
+                                 is_last_document_desired, total_score,
+                                 hits_out, term_frequency_out);
       }
     }
-    if (hits_out != nullptr) {
-      hits_out->back().UpdateSection(hit.section_id());
-      if (term_frequency_out != nullptr) {
-        term_frequency_out->back()[hit.section_id()] = hit.term_frequency();
-      }
+  }
+
+  // Do binary search over the sorted section and repeat the above steps.
+  TermIdHitPair target_term_id_hit_pair(
+      term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency));
+  for (const TermIdHitPair* ptr = std::lower_bound(
+           array, array + header_->searchable_end(), target_term_id_hit_pair);
+       ptr < array + header_->searchable_end(); ++ptr) {
+    if (ptr->term_id() != term_id) {
+      // We've processed all matches. Stop iterating further.
+      break;
     }
+
+    const Hit& matched_hit = ptr->hit();
+    // Score the hit and add to total_score. Also add the hits and its term
+    // frequency info to hits_out and term_frequency_out if the two vectors are
+    // non-null.
+    ScoreAndAppendFetchedHit(
+        matched_hit, section_id_mask, only_from_prefix_sections, score_by,
+        suggestion_result_checker, last_document_id, is_last_document_desired,
+        total_score, hits_out, term_frequency_out);
   }
-  return score;
+  return total_score;
 }
 
 libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits(
@@ -455,9 +545,9 @@ libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits(
     SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
     const SuggestionResultChecker* suggestion_result_checker) {
   return FetchHits(term_id, kSectionIdMaskAll,
-                   /*only_from_prefix_sections=*/false, score_by,
-                   suggestion_result_checker,
-                   /*hits_out=*/nullptr);
+                    /*only_from_prefix_sections=*/false, score_by,
+                    suggestion_result_checker,
+                    /*hits_out=*/nullptr);
 }
 
 bool LiteIndex::is_full() const {
@@ -515,7 +605,7 @@ IndexStorageInfoProto LiteIndex::GetStorageInfo(
   return storage_info;
 }
 
-void LiteIndex::SortHits() {
+void LiteIndex::SortHitsImpl() {
   // Make searchable by sorting by hit buffer.
   uint32_t sort_len = header_->cur_size() - header_->searchable_end();
   if (sort_len <= 0) {
@@ -546,25 +636,6 @@ void LiteIndex::SortHits() {
   UpdateChecksum();
 }
 
-uint32_t LiteIndex::Seek(uint32_t term_id) const {
-  // Binary search for our term_id. Make sure we get the first
-  // element. Using kBeginSortValue ensures this for the hit value.
-  TermIdHitPair term_id_hit_pair(
-      term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency));
-
-  const TermIdHitPair::Value* array =
-      hit_buffer_.array_cast<TermIdHitPair::Value>();
-  if (header_->searchable_end() != header_->cur_size()) {
-    ICING_LOG(WARNING) << "Lite index: hit buffer searchable end != current "
-                       << "size during Seek(): "
-                       << header_->searchable_end() << " vs "
-                       << header_->cur_size();
-  }
-  const TermIdHitPair::Value* ptr = std::lower_bound(
-      array, array + header_->searchable_end(), term_id_hit_pair.value());
-  return ptr - array;
-}
-
 libtextclassifier3::Status LiteIndex::Optimize(
     const std::vector<DocumentId>& document_id_old_to_new,
     const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) {
@@ -575,7 +646,7 @@ libtextclassifier3::Status LiteIndex::Optimize(
   }
   // Sort the hits so that hits with the same term id will be grouped together,
   // which helps later to determine which terms will be unused after compaction.
-  SortHits();
+  SortHitsImpl();
   uint32_t new_size = 0;
   uint32_t curr_term_id = 0;
   uint32_t curr_tvi = 0;