icing/index/iterator/doc-hit-info-iterator-or.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282

// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/index/iterator/doc-hit-info-iterator-or.h"

#include <cstdint>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/store/document-id.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

namespace {

// When combining Or iterators, n-ary operator has better performance when
// number of operands > 2 according to benchmark cl/243321264
constexpr int kBinaryOrIteratorPerformanceThreshold = 2;

}  // namespace

std::unique_ptr<DocHitInfoIterator> CreateOrIterator(
    std::vector<std::unique_ptr<DocHitInfoIterator>> iterators) {
  if (iterators.size() == 1) {
    return std::move(iterators.at(0));
  }

  std::unique_ptr<DocHitInfoIterator> iterator;
  if (iterators.size() == kBinaryOrIteratorPerformanceThreshold) {
    iterator = std::make_unique<DocHitInfoIteratorOr>(std::move(iterators[0]),
                                                      std::move(iterators[1]));
  } else {
    // If the vector is too small, the OrNary iterator can handle it and return
    // an error on the Advance call
    iterator = std::make_unique<DocHitInfoIteratorOrNary>(std::move(iterators));
  }

  return iterator;
}

DocHitInfoIteratorOr::DocHitInfoIteratorOr(
    std::unique_ptr<DocHitInfoIterator> left_it,
    std::unique_ptr<DocHitInfoIterator> right_it)
    : left_(std::move(left_it)), right_(std::move(right_it)) {}

libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
DocHitInfoIteratorOr::TrimRightMostNode() && {
  // Trim the whole OR iterator. Only keep the prefix of the right iterator.
  //
  // The OR operator has higher priority, it is not possible that we have an
  // unfinished prefix in the nested iterator right-most child we need to search
  // suggestion for.
  //
  // eg: `foo OR (bar baz)` is not valid for search suggestion since there is no
  // unfinished last term to be filled.
  //
  // If we need to trim a OR iterator for search suggestion, the right child
  // must be the last term. We don't need left side information to
  // generate suggestion for the right side.
  ICING_ASSIGN_OR_RETURN(TrimmedNode trimmed_right,
                         std::move(*right_).TrimRightMostNode());
  trimmed_right.iterator_ = nullptr;
  return trimmed_right;
}

libtextclassifier3::Status DocHitInfoIteratorOr::Advance() {
  // Cache the document_id of the left iterator for comparison to the right.
  DocumentId orig_left_document_id = left_document_id_;

  // Advance the left iterator if necessary.
  if (left_document_id_ != kInvalidDocumentId) {
    if (right_document_id_ == kInvalidDocumentId ||
        left_document_id_ >= right_document_id_) {
      if (left_->Advance().ok()) {
        left_document_id_ = left_->doc_hit_info().document_id();
      } else {
        left_document_id_ = kInvalidDocumentId;
      }
    }
  }

  // Advance the right iterator if necessary, by comparing to the original
  // left document_id (not the one which may have been updated).
  if (right_document_id_ != kInvalidDocumentId) {
    if (orig_left_document_id == kInvalidDocumentId ||
        right_document_id_ >= orig_left_document_id) {
      if (right_->Advance().ok()) {
        right_document_id_ = right_->doc_hit_info().document_id();
      } else {
        right_document_id_ = kInvalidDocumentId;
      }
    }
  }

  // Done, we either found a match or we reached the end of potential
  // DocHitInfos
  if (left_document_id_ == kInvalidDocumentId &&
      right_document_id_ == kInvalidDocumentId) {
    // Reached the end, set these to invalid values and return
    doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
    hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
    return absl_ports::ResourceExhaustedError(
        "No more DocHitInfos in iterator");
  }

  // Now chose the best one that is not invalid.
  DocHitInfoIterator* chosen;
  if (left_document_id_ == kInvalidDocumentId) {
    chosen = right_.get();
  } else if (right_document_id_ == kInvalidDocumentId) {
    chosen = left_.get();
  } else if (left_document_id_ < right_document_id_) {
    chosen = right_.get();
  } else {
    chosen = left_.get();
  }
  current_ = chosen;

  doc_hit_info_ = chosen->doc_hit_info();
  hit_intersect_section_ids_mask_ = chosen->hit_intersect_section_ids_mask();

  // If equal, combine.
  if (left_document_id_ == right_document_id_) {
    doc_hit_info_.MergeSectionsFrom(
        right_->doc_hit_info().hit_section_ids_mask());
    hit_intersect_section_ids_mask_ &= right_->hit_intersect_section_ids_mask();
  }

  return libtextclassifier3::Status::OK;
}

int32_t DocHitInfoIteratorOr::GetNumBlocksInspected() const {
  return left_->GetNumBlocksInspected() + right_->GetNumBlocksInspected();
}

int32_t DocHitInfoIteratorOr::GetNumLeafAdvanceCalls() const {
  return left_->GetNumLeafAdvanceCalls() + right_->GetNumLeafAdvanceCalls();
}

std::string DocHitInfoIteratorOr::ToString() const {
  return absl_ports::StrCat("(", left_->ToString(), " OR ", right_->ToString(),
                            ")");
}

DocHitInfoIteratorOrNary::DocHitInfoIteratorOrNary(
    std::vector<std::unique_ptr<DocHitInfoIterator>> iterators)
    : iterators_(std::move(iterators)) {}

libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
DocHitInfoIteratorOrNary::TrimRightMostNode() && {
  // Trim the whole OR iterator.
  //
  // The OR operator has higher priority, it is not possible that we have an
  // unfinished prefix in the nested iterator right-most child we need to search
  // suggestion for.
  //
  // eg: `foo OR (bar baz)` is not valid for search suggestion since there is no
  // unfinished last term to be filled.
  //
  // If we need to trim a OR iterator for search suggestion, the right-most
  // child must be the last term. We don't need left side information to
  // generate suggestion for the right side.
  ICING_ASSIGN_OR_RETURN(TrimmedNode trimmed_right,
                         std::move(*iterators_.back()).TrimRightMostNode());
  trimmed_right.iterator_ = nullptr;
  return trimmed_right;
}

libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() {
  current_iterators_.clear();
  if (iterators_.size() < 2) {
    return absl_ports::InvalidArgumentError(
        "Not enough iterators to OR together");
  }

  if (doc_hit_info_.document_id() == 0) {
    // 0 is the smallest (last) DocumentId, can't advance further. Reset to
    // invalid values and return directly
    doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
    hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
    return absl_ports::ResourceExhaustedError(
        "No more DocHitInfos in iterator");
  }
  // The maximum possible doc id for the current Advance() call.
  const DocumentId next_document_id_max = doc_hit_info_.document_id() - 1;
  doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
  DocumentId next_document_id = kInvalidDocumentId;
  // Go through the iterators and try to find the maximum document_id that is
  // equal to or smaller than next_document_id_max
  for (const auto& iterator : iterators_) {
    if (iterator->doc_hit_info().document_id() > next_document_id_max) {
      // Advance the iterator until its value is equal to or smaller than
      // next_document_id_max
      if (!AdvanceTo(iterator.get(), next_document_id_max).ok()) {
        continue;
      }
    }
    // Now iterator->get_document_id() <= next_document_id_max
    if (next_document_id == kInvalidDocumentId) {
      next_document_id = iterator->doc_hit_info().document_id();
    } else {
      next_document_id =
          std::max(next_document_id, iterator->doc_hit_info().document_id());
    }
  }
  if (next_document_id == kInvalidDocumentId) {
    // None of the iterators had a next document_id, reset to invalid values and
    // return
    doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
    hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
    return absl_ports::ResourceExhaustedError(
        "No more DocHitInfos in iterator");
  }

  // Found the next hit DocumentId, now calculate the section info.
  hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
  for (const auto& iterator : iterators_) {
    if (iterator->doc_hit_info().document_id() == next_document_id) {
      current_iterators_.push_back(iterator.get());
      if (doc_hit_info_.document_id() == kInvalidDocumentId) {
        doc_hit_info_ = iterator->doc_hit_info();
        hit_intersect_section_ids_mask_ =
            iterator->hit_intersect_section_ids_mask();
      } else {
        doc_hit_info_.MergeSectionsFrom(
            iterator->doc_hit_info().hit_section_ids_mask());
        hit_intersect_section_ids_mask_ &=
            iterator->hit_intersect_section_ids_mask();
      }
    }
  }
  return libtextclassifier3::Status::OK;
}

int32_t DocHitInfoIteratorOrNary::GetNumBlocksInspected() const {
  int32_t blockCount = 0;
  for (const auto& iter : iterators_) {
    blockCount += iter->GetNumBlocksInspected();
  }
  return blockCount;
}

int32_t DocHitInfoIteratorOrNary::GetNumLeafAdvanceCalls() const {
  int32_t leafCount = 0;
  for (const auto& iter : iterators_) {
    leafCount += iter->GetNumLeafAdvanceCalls();
  }
  return leafCount;
}

std::string DocHitInfoIteratorOrNary::ToString() const {
  std::string ret = "(";

  for (int i = 0; i < iterators_.size(); ++i) {
    absl_ports::StrAppend(&ret, iterators_.at(i)->ToString());
    if (i != iterators_.size() - 1) {
      // Not the last element in vector
      absl_ports::StrAppend(&ret, " OR ");
    }
  }

  absl_ports::StrAppend(&ret, ")");
  return ret;
}

}  // namespace lib
}  // namespace icing