aboutsummaryrefslogtreecommitdiff
path: root/icing/join/join-processor.cc
diff options
context:
space:
mode:
Diffstat (limited to 'icing/join/join-processor.cc')
-rw-r--r--icing/join/join-processor.cc128
1 files changed, 117 insertions, 11 deletions
diff --git a/icing/join/join-processor.cc b/icing/join/join-processor.cc
index e27b1ea..1b7ca0d 100644
--- a/icing/join/join-processor.cc
+++ b/icing/join/join-processor.cc
@@ -29,6 +29,7 @@
#include "icing/join/aggregation-scorer.h"
#include "icing/join/doc-join-info.h"
#include "icing/join/join-children-fetcher.h"
+#include "icing/join/qualified-id-join-index.h"
#include "icing/join/qualified-id.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
@@ -37,6 +38,7 @@
#include "icing/scoring/scored-document-hit.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -53,17 +55,121 @@ JoinProcessor::GetChildrenFetcher(
"Parent property expression must be ", kQualifiedIdExpr));
}
- std::sort(
- child_scored_document_hits.begin(), child_scored_document_hits.end(),
- ScoredDocumentHitComparator(
- /*is_descending=*/join_spec.nested_spec().scoring_spec().order_by() ==
- ScoringSpecProto::Order::DESC));
-
- // TODO(b/256022027):
- // - Optimization
- // - Cache property to speed up property retrieval.
- // - If there is no cache, then we still have the flexibility to fetch it
- // from actual docs via DocumentStore.
+ ScoredDocumentHitComparator score_comparator(
+ /*is_descending=*/join_spec.nested_spec().scoring_spec().order_by() ==
+ ScoringSpecProto::Order::DESC);
+
+ if (qualified_id_join_index_->is_v2()) {
+ // v2
+ // Step 1a: sort child ScoredDocumentHits in document id descending order.
+ std::sort(child_scored_document_hits.begin(),
+ child_scored_document_hits.end(),
+ [](const ScoredDocumentHit& lhs, const ScoredDocumentHit& rhs) {
+ return lhs.document_id() > rhs.document_id();
+ });
+
+ // Step 1b: group all child ScoredDocumentHits by the document's
+ // schema_type_id.
+ std::unordered_map<SchemaTypeId, std::vector<ScoredDocumentHit>>
+ schema_to_child_scored_doc_hits_map;
+ for (const ScoredDocumentHit& child_scored_document_hit :
+ child_scored_document_hits) {
+ std::optional<DocumentFilterData> child_doc_filter_data =
+ doc_store_->GetAliveDocumentFilterData(
+ child_scored_document_hit.document_id(), current_time_ms_);
+ if (!child_doc_filter_data) {
+ continue;
+ }
+
+ schema_to_child_scored_doc_hits_map[child_doc_filter_data
+ ->schema_type_id()]
+ .push_back(child_scored_document_hit);
+ }
+
+ // Step 1c: for each schema_type_id, lookup QualifiedIdJoinIndexImplV2 to
+ // fetch all child join data from posting list(s). Convert all
+ // child join data to referenced parent document ids and bucketize
+ // child ScoredDocumentHits by it.
+ std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>>
+ parent_to_child_docs_map;
+ for (auto& [schema_type_id, grouped_child_scored_doc_hits] :
+ schema_to_child_scored_doc_hits_map) {
+ // Get joinable_property_id of this schema.
+ ICING_ASSIGN_OR_RETURN(
+ const JoinablePropertyMetadata* metadata,
+ schema_store_->GetJoinablePropertyMetadata(
+ schema_type_id, join_spec.child_property_expression()));
+ if (metadata == nullptr ||
+ metadata->value_type != JoinableConfig::ValueType::QUALIFIED_ID) {
+ // Currently we only support qualified id, so skip other types.
+ continue;
+ }
+
+ // Lookup QualifiedIdJoinIndexImplV2.
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase>
+ join_index_iter,
+ qualified_id_join_index_->GetIterator(
+ schema_type_id, /*joinable_property_id=*/metadata->id));
+
+ // - Join index contains all join data of schema_type_id and
+ // join_index_iter will return all of them in (child) document id
+ // descending order.
+ // - But we only need join data of child document ids which appear in
+ // grouped_child_scored_doc_hits. Also grouped_child_scored_doc_hits
+ // contain ScoredDocumentHits in (child) document id descending order.
+ // - Therefore, we advance 2 iterators to intersect them and get desired
+ // join data.
+ auto child_scored_doc_hits_iter = grouped_child_scored_doc_hits.cbegin();
+ while (join_index_iter->Advance().ok() &&
+ child_scored_doc_hits_iter !=
+ grouped_child_scored_doc_hits.cend()) {
+ // Advance child_scored_doc_hits_iter until it points to a
+ // ScoredDocumentHit with document id <= the one pointed by
+ // join_index_iter.
+ while (child_scored_doc_hits_iter !=
+ grouped_child_scored_doc_hits.cend() &&
+ child_scored_doc_hits_iter->document_id() >
+ join_index_iter->GetCurrent().document_id()) {
+ ++child_scored_doc_hits_iter;
+ }
+
+ if (child_scored_doc_hits_iter !=
+ grouped_child_scored_doc_hits.cend() &&
+ child_scored_doc_hits_iter->document_id() ==
+ join_index_iter->GetCurrent().document_id()) {
+ // We get a join data whose child document id exists in both join
+ // index and grouped_child_scored_doc_hits. Convert its join info to
+ // referenced parent document ids and bucketize ScoredDocumentHits by
+ // it (putting into parent_to_child_docs_map).
+ const NamespaceFingerprintIdentifier& ref_ns_id =
+ join_index_iter->GetCurrent().join_info();
+ libtextclassifier3::StatusOr<DocumentId> ref_parent_doc_id_or =
+ doc_store_->GetDocumentId(ref_ns_id);
+ if (ref_parent_doc_id_or.ok()) {
+ parent_to_child_docs_map[std::move(ref_parent_doc_id_or)
+ .ValueOrDie()]
+ .push_back(*child_scored_doc_hits_iter);
+ }
+ }
+ }
+ }
+
+ // Step 1d: finally, sort each parent's joined child ScoredDocumentHits by
+ // score.
+ for (auto& [parent_doc_id, bucketized_child_scored_hits] :
+ parent_to_child_docs_map) {
+ std::sort(bucketized_child_scored_hits.begin(),
+ bucketized_child_scored_hits.end(), score_comparator);
+ }
+
+ return JoinChildrenFetcher(join_spec, std::move(parent_to_child_docs_map));
+ }
+
+ // v1
+ // TODO(b/275121148): deprecate this part after rollout v2.
+ std::sort(child_scored_document_hits.begin(),
+ child_scored_document_hits.end(), score_comparator);
// Step 1: group child documents by parent documentId. Currently we only
// support QualifiedId joining, so fetch the qualified id content of