diff options
Diffstat (limited to 'icing/join/join-processor.cc')
-rw-r--r-- | icing/join/join-processor.cc | 128 |
1 files changed, 117 insertions, 11 deletions
diff --git a/icing/join/join-processor.cc b/icing/join/join-processor.cc index e27b1ea..1b7ca0d 100644 --- a/icing/join/join-processor.cc +++ b/icing/join/join-processor.cc @@ -29,6 +29,7 @@ #include "icing/join/aggregation-scorer.h" #include "icing/join/doc-join-info.h" #include "icing/join/join-children-fetcher.h" +#include "icing/join/qualified-id-join-index.h" #include "icing/join/qualified-id.h" #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" @@ -37,6 +38,7 @@ #include "icing/scoring/scored-document-hit.h" #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" +#include "icing/store/namespace-fingerprint-identifier.h" #include "icing/util/status-macros.h" namespace icing { @@ -53,17 +55,121 @@ JoinProcessor::GetChildrenFetcher( "Parent property expression must be ", kQualifiedIdExpr)); } - std::sort( - child_scored_document_hits.begin(), child_scored_document_hits.end(), - ScoredDocumentHitComparator( - /*is_descending=*/join_spec.nested_spec().scoring_spec().order_by() == - ScoringSpecProto::Order::DESC)); - - // TODO(b/256022027): - // - Optimization - // - Cache property to speed up property retrieval. - // - If there is no cache, then we still have the flexibility to fetch it - // from actual docs via DocumentStore. + ScoredDocumentHitComparator score_comparator( + /*is_descending=*/join_spec.nested_spec().scoring_spec().order_by() == + ScoringSpecProto::Order::DESC); + + if (qualified_id_join_index_->is_v2()) { + // v2 + // Step 1a: sort child ScoredDocumentHits in document id descending order. + std::sort(child_scored_document_hits.begin(), + child_scored_document_hits.end(), + [](const ScoredDocumentHit& lhs, const ScoredDocumentHit& rhs) { + return lhs.document_id() > rhs.document_id(); + }); + + // Step 1b: group all child ScoredDocumentHits by the document's + // schema_type_id. + std::unordered_map<SchemaTypeId, std::vector<ScoredDocumentHit>> + schema_to_child_scored_doc_hits_map; + for (const ScoredDocumentHit& child_scored_document_hit : + child_scored_document_hits) { + std::optional<DocumentFilterData> child_doc_filter_data = + doc_store_->GetAliveDocumentFilterData( + child_scored_document_hit.document_id(), current_time_ms_); + if (!child_doc_filter_data) { + continue; + } + + schema_to_child_scored_doc_hits_map[child_doc_filter_data + ->schema_type_id()] + .push_back(child_scored_document_hit); + } + + // Step 1c: for each schema_type_id, lookup QualifiedIdJoinIndexImplV2 to + // fetch all child join data from posting list(s). Convert all + // child join data to referenced parent document ids and bucketize + // child ScoredDocumentHits by it. + std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>> + parent_to_child_docs_map; + for (auto& [schema_type_id, grouped_child_scored_doc_hits] : + schema_to_child_scored_doc_hits_map) { + // Get joinable_property_id of this schema. + ICING_ASSIGN_OR_RETURN( + const JoinablePropertyMetadata* metadata, + schema_store_->GetJoinablePropertyMetadata( + schema_type_id, join_spec.child_property_expression())); + if (metadata == nullptr || + metadata->value_type != JoinableConfig::ValueType::QUALIFIED_ID) { + // Currently we only support qualified id, so skip other types. + continue; + } + + // Lookup QualifiedIdJoinIndexImplV2. + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase> + join_index_iter, + qualified_id_join_index_->GetIterator( + schema_type_id, /*joinable_property_id=*/metadata->id)); + + // - Join index contains all join data of schema_type_id and + // join_index_iter will return all of them in (child) document id + // descending order. + // - But we only need join data of child document ids which appear in + // grouped_child_scored_doc_hits. Also grouped_child_scored_doc_hits + // contain ScoredDocumentHits in (child) document id descending order. + // - Therefore, we advance 2 iterators to intersect them and get desired + // join data. + auto child_scored_doc_hits_iter = grouped_child_scored_doc_hits.cbegin(); + while (join_index_iter->Advance().ok() && + child_scored_doc_hits_iter != + grouped_child_scored_doc_hits.cend()) { + // Advance child_scored_doc_hits_iter until it points to a + // ScoredDocumentHit with document id <= the one pointed by + // join_index_iter. + while (child_scored_doc_hits_iter != + grouped_child_scored_doc_hits.cend() && + child_scored_doc_hits_iter->document_id() > + join_index_iter->GetCurrent().document_id()) { + ++child_scored_doc_hits_iter; + } + + if (child_scored_doc_hits_iter != + grouped_child_scored_doc_hits.cend() && + child_scored_doc_hits_iter->document_id() == + join_index_iter->GetCurrent().document_id()) { + // We get a join data whose child document id exists in both join + // index and grouped_child_scored_doc_hits. Convert its join info to + // referenced parent document ids and bucketize ScoredDocumentHits by + // it (putting into parent_to_child_docs_map). + const NamespaceFingerprintIdentifier& ref_ns_id = + join_index_iter->GetCurrent().join_info(); + libtextclassifier3::StatusOr<DocumentId> ref_parent_doc_id_or = + doc_store_->GetDocumentId(ref_ns_id); + if (ref_parent_doc_id_or.ok()) { + parent_to_child_docs_map[std::move(ref_parent_doc_id_or) + .ValueOrDie()] + .push_back(*child_scored_doc_hits_iter); + } + } + } + } + + // Step 1d: finally, sort each parent's joined child ScoredDocumentHits by + // score. + for (auto& [parent_doc_id, bucketized_child_scored_hits] : + parent_to_child_docs_map) { + std::sort(bucketized_child_scored_hits.begin(), + bucketized_child_scored_hits.end(), score_comparator); + } + + return JoinChildrenFetcher(join_spec, std::move(parent_to_child_docs_map)); + } + + // v1 + // TODO(b/275121148): deprecate this part after rollout v2. + std::sort(child_scored_document_hits.begin(), + child_scored_document_hits.end(), score_comparator); // Step 1: group child documents by parent documentId. Currently we only // support QualifiedId joining, so fetch the qualified id content of |