diff options
author | Tim Barron <tjbarron@google.com> | 2023-03-14 09:57:47 -0700 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2023-03-14 09:57:47 -0700 |
commit | c1e7edff54723138756063ee4b7948c1ee91277e (patch) | |
tree | b2a55e543a6c9396631feaab459bfd671a8bc400 | |
parent | 140aaee3e7b269f02599310e42d6172090ce02d2 (diff) | |
parent | d5c81ae0c41ae9c1aefb3601f3836570b9f686c7 (diff) | |
download | icing-c1e7edff54723138756063ee4b7948c1ee91277e.tar.gz |
Merge remote-tracking branch 'goog/upstream-master' into androidx-platform-dev
* goog/upstream-master:
Update Icing from upstream.
Update Icing from upstream.
Descriptions:
========================================================================
Cache an instance of UBreakIterator to reduce unnecessary creations.
========================================================================
Cap number of individual IntegerIndexStorages that IntegerIndex creates.
========================================================================
Change error in trimRightMostNode from Unimplemented to InvalidArgument.
========================================================================
Add detection for new language features of List Filters Query Language.
========================================================================
Add option to control threshold to rebuild index during optimize by flag
========================================================================
Add option to control use of namespace id to build urimapper by flag.
========================================================================
Enforce schema validation for joinable config.
========================================================================
Adopt bucket splitting for IntegerIndexStorage.
========================================================================
Implement bucket splitting function.
========================================================================
Add Icing initialization unit tests for QualifiedIdTypeJoinableIndex.
========================================================================
Add Icing schema change unit tests for QualifiedIdTypeJoinableIndex.
========================================================================
Add Icing optimization unit tests for QualifiedIdTypeJoinableIndex.
========================================================================
Integrate QualifiedIdTypeJoinableIndex into IcingSearchEngine.
========================================================================
Implement QualifiedIdJoinablePropertyIndexingHandler.
========================================================================
Change QualifiedIdTypeJoinableIndex to store raw qualified id string.
========================================================================
Pass info about unnormalized query terms through lexer/parser/visitor.
========================================================================
Integrate Advanced Query w/ Suggest, make ADVANCED_QUERY default parser.
========================================================================
Bug: 208654892
Bug: 263890397
Bug: 259743562
Bug: 272145329
Bug: 227356108
Change-Id: I44de5853bb6c55b42800ae34d8071016be6c87cd
85 files changed, 10741 insertions, 1692 deletions
diff --git a/icing/file/file-backed-proto.h b/icing/file/file-backed-proto.h index 8deb7a6..8c5743b 100644 --- a/icing/file/file-backed-proto.h +++ b/icing/file/file-backed-proto.h @@ -22,6 +22,7 @@ #ifndef ICING_FILE_FILE_BACKED_PROTO_H_ #define ICING_FILE_FILE_BACKED_PROTO_H_ +#include <algorithm> #include <cstdint> #include <memory> #include <string> @@ -37,6 +38,7 @@ #include "icing/legacy/core/icing-string-util.h" #include "icing/util/crc32.h" #include "icing/util/logging.h" +#include "icing/util/status-macros.h" namespace icing { namespace lib { @@ -74,6 +76,13 @@ class FileBackedProto { file_path_ = swapped_to_file_path; } + // Computes the checksum of the proto stored in this file and returns it. + // RETURNS: + // - the checksum of the proto or 0 if the file is empty/non-existent + // - INTERNAL_ERROR if an IO error or a corruption was encountered. + libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const + ICING_LOCKS_EXCLUDED(mutex_); + // Returns a reference to the proto read from the file. It // internally caches the read proto so that future calls are fast. // @@ -103,6 +112,11 @@ class FileBackedProto { FileBackedProto& operator=(const FileBackedProto&) = delete; private: + // Internal method to handle reading the proto from disk. + // Requires the caller to hold an exclusive lock on mutex_. + libtextclassifier3::StatusOr<const ProtoT*> ReadInternal() const + ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Upper bound of file-size that is supported. static constexpr int32_t kMaxFileSize = 1 * 1024 * 1024; // 1 MiB. 
@@ -113,6 +127,8 @@ class FileBackedProto { std::string file_path_; mutable std::unique_ptr<ProtoT> cached_proto_ ICING_GUARDED_BY(mutex_); + + mutable std::unique_ptr<Header> cached_header_ ICING_GUARDED_BY(mutex_); }; template <typename ProtoT> @@ -124,12 +140,35 @@ FileBackedProto<ProtoT>::FileBackedProto(const Filesystem& filesystem, : filesystem_(&filesystem), file_path_(file_path) {} template <typename ProtoT> +libtextclassifier3::StatusOr<Crc32> FileBackedProto<ProtoT>::ComputeChecksum() + const { + absl_ports::unique_lock l(&mutex_); + if (cached_proto_ == nullptr) { + auto read_status = ReadInternal(); + if (!read_status.ok()) { + if (absl_ports::IsNotFound(read_status.status())) { + // File doesn't exist. So simply return 0. + return Crc32(); + } + return read_status.status(); + } + } + return Crc32(cached_header_->proto_checksum); +} + +template <typename ProtoT> libtextclassifier3::StatusOr<const ProtoT*> FileBackedProto<ProtoT>::Read() const { ICING_VLOG(1) << "Reading proto from file: " << file_path_; absl_ports::unique_lock l(&mutex_); + return ReadInternal(); +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<const ProtoT*> +FileBackedProto<ProtoT>::ReadInternal() const { // Return cached proto if we've already read from disk. 
if (cached_proto_ != nullptr) { ICING_VLOG(1) << "Reusing cached proto for file: " << file_path_; @@ -157,8 +196,7 @@ libtextclassifier3::StatusOr<const ProtoT*> FileBackedProto<ProtoT>::Read() << " of size: " << file_size; Header header; - if (!filesystem_->PRead(fd.get(), &header, sizeof(Header), - /*offset=*/0)) { + if (!filesystem_->PRead(fd.get(), &header, sizeof(Header), /*offset=*/0)) { return absl_ports::InternalError( absl_ports::StrCat("Unable to read header of: ", file_path_)); } @@ -193,6 +231,7 @@ libtextclassifier3::StatusOr<const ProtoT*> FileBackedProto<ProtoT>::Read() ICING_VLOG(1) << "Successfully read proto from file: " << file_path_; cached_proto_ = std::move(proto); + cached_header_ = std::make_unique<Header>(std::move(header)); return cached_proto_.get(); } @@ -253,6 +292,7 @@ libtextclassifier3::Status FileBackedProto<ProtoT>::Write( ICING_VLOG(1) << "Successfully wrote proto to file: " << file_path_; cached_proto_ = std::move(new_proto); + cached_header_ = std::make_unique<Header>(std::move(header)); return libtextclassifier3::Status::OK; } diff --git a/icing/file/persistent-hash-map.cc b/icing/file/persistent-hash-map.cc index 14a1251..ce8310b 100644 --- a/icing/file/persistent-hash-map.cc +++ b/icing/file/persistent-hash-map.cc @@ -147,7 +147,9 @@ PersistentHashMap::Create(const Filesystem& filesystem, !filesystem.FileExists( GetKeyValueStorageFilePath(working_path).c_str())) { // Discard working_path if any of them is missing, and reinitialize. 
- ICING_RETURN_IF_ERROR(Discard(filesystem, working_path)); + if (filesystem.DirectoryExists(working_path.c_str())) { + ICING_RETURN_IF_ERROR(Discard(filesystem, working_path)); + } return InitializeNewFiles(filesystem, std::move(working_path), std::move(options)); } diff --git a/icing/file/posting_list/flash-index-storage.cc b/icing/file/posting_list/flash-index-storage.cc index 657bd96..2ba24a3 100644 --- a/icing/file/posting_list/flash-index-storage.cc +++ b/icing/file/posting_list/flash-index-storage.cc @@ -37,22 +37,6 @@ namespace icing { namespace lib { -namespace { - -uint32_t SelectBlockSize() { - // This should be close to the flash page size. - static constexpr uint32_t kMinBlockSize = 4096; - - // Determine a good block size. - uint32_t page_size = getpagesize(); - uint32_t block_size = std::max(kMinBlockSize, page_size); - - // Align up to the nearest page size. - return math_util::RoundUpTo(block_size, page_size); -} - -} // namespace - libtextclassifier3::StatusOr<FlashIndexStorage> FlashIndexStorage::Create( std::string index_filename, const Filesystem* filesystem, PostingListSerializer* serializer, bool in_memory) { @@ -75,6 +59,18 @@ FlashIndexStorage::~FlashIndexStorage() { } } +/* static */ uint32_t FlashIndexStorage::SelectBlockSize() { + // This should be close to the flash page size. + static constexpr uint32_t kMinBlockSize = 4096; + + // Determine a good block size. + uint32_t page_size = getpagesize(); + uint32_t block_size = std::max(kMinBlockSize, page_size); + + // Align up to the nearest page size. 
+ return math_util::RoundUpTo(block_size, page_size); +} + bool FlashIndexStorage::Init() { storage_sfd_ = ScopedFd(filesystem_->OpenForWrite(index_filename_.c_str())); if (!storage_sfd_.is_valid()) { diff --git a/icing/file/posting_list/flash-index-storage.h b/icing/file/posting_list/flash-index-storage.h index 1813637..05feb08 100644 --- a/icing/file/posting_list/flash-index-storage.h +++ b/icing/file/posting_list/flash-index-storage.h @@ -105,6 +105,9 @@ class FlashIndexStorage { ~FlashIndexStorage(); + // Selects block size to use. + static uint32_t SelectBlockSize(); + // Retrieves the PostingList referred to by PostingListIdentifier. This // posting list must have been previously allocated by a prior call to // AllocatePostingList. diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index 1b193af..7800e7e 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -41,6 +41,8 @@ #include "icing/index/numeric/integer-index.h" #include "icing/index/string-section-indexing-handler.h" #include "icing/join/join-processor.h" +#include "icing/join/qualified-id-joinable-property-indexing-handler.h" +#include "icing/join/qualified-id-type-joinable-index.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/portable/endian.h" #include "icing/proto/debug.pb.h" @@ -96,6 +98,8 @@ namespace { constexpr std::string_view kDocumentSubfolderName = "document_dir"; constexpr std::string_view kIndexSubfolderName = "index_dir"; constexpr std::string_view kIntegerIndexSubfolderName = "integer_index_dir"; +constexpr std::string_view kQualifiedIdJoinIndexSubfolderName = + "qualified_id_join_index_dir"; constexpr std::string_view kSchemaSubfolderName = "schema_dir"; constexpr std::string_view kSetSchemaMarkerFilename = "set_schema_marker"; constexpr std::string_view kInitMarkerFilename = "init_marker"; @@ -240,6 +244,14 @@ std::string MakeIntegerIndexWorkingPath(const std::string& base_dir) { return absl_ports::StrCat(base_dir, 
"/", kIntegerIndexSubfolderName); } +// Working path for qualified id join index. It is derived from +// PersistentStorage and it will take full ownership of this working path, +// including creation/deletion. See PersistentStorage for more details about +// working path. +std::string MakeQualifiedIdJoinIndexWorkingPath(const std::string& base_dir) { + return absl_ports::StrCat(base_dir, "/", kQualifiedIdJoinIndexSubfolderName); +} + // SchemaStore files are in a standalone subfolder for easier file management. // We can delete and recreate the subfolder and not touch/affect anything // else. @@ -347,15 +359,12 @@ libtextclassifier3::Status RetrieveAndAddDocumentInfo( return libtextclassifier3::Status::OK; } -bool ShouldRebuildIndex(const OptimizeStatsProto& optimize_stats) { +bool ShouldRebuildIndex(const OptimizeStatsProto& optimize_stats, + float optimize_rebuild_index_threshold) { int num_invalid_documents = optimize_stats.num_deleted_documents() + optimize_stats.num_expired_documents(); - // Rebuilding the index could be faster than optimizing the index if we have - // removed most of the documents. - // Based on benchmarks, 85%~95% seems to be a good threshold for most cases. - // TODO(b/238236206): Try using the number of remaining hits in this - // condition, and allow clients to configure the threshold. - return num_invalid_documents >= optimize_stats.num_original_documents() * 0.9; + return num_invalid_documents >= optimize_stats.num_original_documents() * + optimize_rebuild_index_threshold; } // Useful method to get RankingStrategy if advanced scoring is enabled. 
When the @@ -428,6 +437,7 @@ void IcingSearchEngine::ResetMembers() { normalizer_.reset(); index_.reset(); integer_index_.reset(); + qualified_id_join_index_.reset(); } libtextclassifier3::Status IcingSearchEngine::CheckInitMarkerFile( @@ -559,12 +569,17 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir()); const std::string integer_index_dir = MakeIntegerIndexWorkingPath(options_.base_dir()); + const std::string qualified_id_join_index_dir = + MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir()); if (!filesystem_->DeleteDirectoryRecursively(doc_store_dir.c_str()) || !filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) || - !IntegerIndex::Discard(*filesystem_, integer_index_dir).ok()) { - return absl_ports::InternalError( - absl_ports::StrCat("Could not delete directories: ", index_dir, ", ", - integer_index_dir, " and", doc_store_dir)); + !IntegerIndex::Discard(*filesystem_, integer_index_dir).ok() || + !QualifiedIdTypeJoinableIndex::Discard(*filesystem_, + qualified_id_join_index_dir) + .ok()) { + return absl_ports::InternalError(absl_ports::StrCat( + "Could not delete directories: ", index_dir, ", ", integer_index_dir, + ", ", qualified_id_join_index_dir, " and ", doc_store_dir)); } ICING_RETURN_IF_ERROR(InitializeDocumentStore( /*force_recovery_and_revalidate_documents=*/false, initialize_stats)); @@ -601,6 +616,16 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( integer_index_, IntegerIndex::Create(*filesystem_, std::move(integer_index_dir))); + // Discard qualified id join index directory and instantiate a new one. 
+ std::string qualified_id_join_index_dir = + MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir()); + ICING_RETURN_IF_ERROR(QualifiedIdTypeJoinableIndex::Discard( + *filesystem_, qualified_id_join_index_dir)); + ICING_ASSIGN_OR_RETURN( + qualified_id_join_index_, + QualifiedIdTypeJoinableIndex::Create( + *filesystem_, std::move(qualified_id_join_index_dir))); + std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer(); IndexRestorationResult restore_result = RestoreIndexIfNeeded(); index_init_status = std::move(restore_result.status); @@ -621,6 +646,8 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC); initialize_stats->set_integer_index_restoration_cause( InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC); + initialize_stats->set_qualified_id_join_index_restoration_cause( + InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC); } else { ICING_RETURN_IF_ERROR(InitializeDocumentStore( /*force_recovery_and_revalidate_documents=*/false, initialize_stats)); @@ -673,9 +700,11 @@ libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore( } ICING_ASSIGN_OR_RETURN( DocumentStore::CreateResult create_result, - DocumentStore::Create( - filesystem_.get(), document_dir, clock_.get(), schema_store_.get(), - force_recovery_and_revalidate_documents, initialize_stats)); + DocumentStore::Create(filesystem_.get(), document_dir, clock_.get(), + schema_store_.get(), + force_recovery_and_revalidate_documents, + options_.document_store_namespace_id_fingerprint(), + initialize_stats)); document_store_ = std::move(create_result.document_store); return libtextclassifier3::Status::OK; @@ -693,6 +722,7 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( } Index::Options index_options(index_dir, options_.index_merge_size()); + // Term index InitializeStatsProto::RecoveryCause index_recovery_cause; auto index_or = Index::Create(index_options, filesystem_.get(), icing_filesystem_.get()); @@ -717,6 
+747,7 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( index_recovery_cause = InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH; } + // Integer index std::string integer_index_dir = MakeIntegerIndexWorkingPath(options_.base_dir()); InitializeStatsProto::RecoveryCause integer_index_recovery_cause; @@ -740,10 +771,38 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH; } + // Qualified id join index + std::string qualified_id_join_index_dir = + MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir()); + InitializeStatsProto::RecoveryCause qualified_id_join_index_recovery_cause; + auto qualified_id_join_index_or = QualifiedIdTypeJoinableIndex::Create( + *filesystem_, qualified_id_join_index_dir); + if (!qualified_id_join_index_or.ok()) { + ICING_RETURN_IF_ERROR(QualifiedIdTypeJoinableIndex::Discard( + *filesystem_, qualified_id_join_index_dir)); + + qualified_id_join_index_recovery_cause = InitializeStatsProto::IO_ERROR; + + // Try recreating it from scratch and rebuild everything. + ICING_ASSIGN_OR_RETURN( + qualified_id_join_index_, + QualifiedIdTypeJoinableIndex::Create( + *filesystem_, std::move(qualified_id_join_index_dir))); + } else { + // Qualified id join index was created fine. + qualified_id_join_index_ = + std::move(qualified_id_join_index_or).ValueOrDie(); + // If a recover does have to happen, then it must be because the index is + // out of sync with the document store. 
+ qualified_id_join_index_recovery_cause = + InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH; + } + std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer(); IndexRestorationResult restore_result = RestoreIndexIfNeeded(); if (restore_result.index_needed_restoration || - restore_result.integer_index_needed_restoration) { + restore_result.integer_index_needed_restoration || + restore_result.qualified_id_join_index_needed_restoration) { initialize_stats->set_index_restoration_latency_ms( restore_timer->GetElapsedMilliseconds()); @@ -754,6 +813,10 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( initialize_stats->set_integer_index_restoration_cause( integer_index_recovery_cause); } + if (restore_result.qualified_id_join_index_needed_restoration) { + initialize_stats->set_qualified_id_join_index_restoration_cause( + qualified_id_join_index_recovery_cause); + } } return restore_result.status; } @@ -863,20 +926,25 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( } } - if (lost_previous_schema || join_incompatible) { - // TODO(b/256022027): rebuild joinable cache if not join compatible. This - // should be done together with index (see RestoreIndexIfNeeded) because - // we want to "replay" documents only once to cover all rebuild. + if (lost_previous_schema || index_incompatible) { + // Clears search indices + status = ClearSearchIndices(); + if (!status.ok()) { + TransformStatus(status, result_status); + return result_proto; + } } - if (lost_previous_schema || index_incompatible) { - // Clears all indices - status = ClearIndices(); + if (lost_previous_schema || join_incompatible) { + // Clears join indices + status = ClearJoinIndices(); if (!status.ok()) { TransformStatus(status, result_status); return result_proto; } + } + if (lost_previous_schema || index_incompatible || join_incompatible) { IndexRestorationResult restore_result = RestoreIndexIfNeeded(); // DATA_LOSS means that we have successfully re-added content to the // index. 
Some indexed content was lost, but otherwise the index is in a @@ -996,12 +1064,12 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) { auto index_status = index_processor.IndexDocument( tokenized_document, document_id, put_document_stats); // Getting an internal error from the index could possibly mean that the index - // is broken. Try to rebuild the index to recover. + // is broken. Try to rebuild them to recover. if (absl_ports::IsInternal(index_status)) { ICING_LOG(ERROR) << "Got an internal error from the index. Trying to " "rebuild the index!\n" << index_status.error_message(); - index_status = ClearIndices(); + index_status = ClearAllIndices(); if (index_status.ok()) { index_status = RestoreIndexIfNeeded().status; if (!index_status.ok()) { @@ -1009,8 +1077,8 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) { "indexing a document."; } } else { - ICING_LOG(ERROR) << "Failed to clear the index after a failure of " - "indexing a document."; + ICING_LOG(ERROR) + << "Failed to clear indices after a failure of indexing a document."; } } @@ -1411,7 +1479,9 @@ OptimizeResultProto IcingSearchEngine::Optimize() { // guaranteed to work, so we update index according to the new document store. 
std::unique_ptr<Timer> optimize_index_timer = clock_->GetNewTimer(); bool should_rebuild_index = - !document_id_old_to_new_or.ok() || ShouldRebuildIndex(*optimize_stats); + !document_id_old_to_new_or.ok() || + ShouldRebuildIndex(*optimize_stats, + options_.optimize_rebuild_index_threshold()); if (!should_rebuild_index) { optimize_stats->set_index_restoration_mode( OptimizeStatsProto::INDEX_TRANSLATION); @@ -1432,6 +1502,17 @@ OptimizeResultProto IcingSearchEngine::Optimize() { << integer_index_optimize_status.error_message(); should_rebuild_index = true; } + + libtextclassifier3::Status qualified_id_join_index_optimize_status = + qualified_id_join_index_->Optimize( + document_id_old_to_new_or.ValueOrDie(), + document_store_->last_added_document_id()); + if (!qualified_id_join_index_optimize_status.ok()) { + ICING_LOG(WARNING) + << "Failed to optimize qualified id join index. Error: " + << qualified_id_join_index_optimize_status.error_message(); + should_rebuild_index = true; + } } // If we received a DATA_LOSS error from OptimizeDocumentStore, we have a // valid document store, but it might be the old one or the new one. 
So throw @@ -1445,7 +1526,7 @@ OptimizeResultProto IcingSearchEngine::Optimize() { OptimizeStatsProto::FULL_INDEX_REBUILD); ICING_LOG(WARNING) << "Clearing the entire index!"; - libtextclassifier3::Status index_clear_status = ClearIndices(); + libtextclassifier3::Status index_clear_status = ClearAllIndices(); if (!index_clear_status.ok()) { status = absl_ports::Annotate( absl_ports::InternalError("Failed to clear index."), @@ -1652,6 +1733,7 @@ libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk( ICING_RETURN_IF_ERROR(document_store_->PersistToDisk(PersistType::FULL)); ICING_RETURN_IF_ERROR(index_->PersistToDisk()); ICING_RETURN_IF_ERROR(integer_index_->PersistToDisk()); + ICING_RETURN_IF_ERROR(qualified_id_join_index_->PersistToDisk()); return libtextclassifier3::Status::OK; } @@ -1714,7 +1796,8 @@ SearchResultProto IcingSearchEngine::Search( return result_proto; } - JoinProcessor join_processor(document_store_.get()); + JoinProcessor join_processor(document_store_.get(), schema_store_.get(), + qualified_id_join_index_.get()); // Building a JoinChildrenFetcher where child documents are grouped by // their joinable values. 
libtextclassifier3::StatusOr<JoinChildrenFetcher> join_children_fetcher_or = @@ -1756,7 +1839,8 @@ SearchResultProto IcingSearchEngine::Search( std::unique_ptr<ScoredDocumentHitsRanker> ranker; if (join_children_fetcher != nullptr) { // Join 2 scored document hits - JoinProcessor join_processor(document_store_.get()); + JoinProcessor join_processor(document_store_.get(), schema_store_.get(), + qualified_id_join_index_.get()); libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>> joined_result_document_hits_or = join_processor.Join( join_spec, std::move(query_scoring_results.scored_document_hits), @@ -2042,9 +2126,10 @@ IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) { // Tries to rebuild document store if swapping fails, to avoid leaving the // system in the broken state for future operations. - auto create_result_or = - DocumentStore::Create(filesystem_.get(), current_document_dir, - clock_.get(), schema_store_.get()); + auto create_result_or = DocumentStore::Create( + filesystem_.get(), current_document_dir, clock_.get(), + schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false, + options_.document_store_namespace_id_fingerprint()); // TODO(b/144458732): Implement a more robust version of // TC_ASSIGN_OR_RETURN that can support error logging. if (!create_result_or.ok()) { @@ -2068,9 +2153,10 @@ IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) { } // Recreates the doc store instance - auto create_result_or = - DocumentStore::Create(filesystem_.get(), current_document_dir, - clock_.get(), schema_store_.get()); + auto create_result_or = DocumentStore::Create( + filesystem_.get(), current_document_dir, clock_.get(), + schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false, + options_.document_store_namespace_id_fingerprint()); if (!create_result_or.ok()) { // Unable to create DocumentStore from the new file. Mark as uninitialized // and return INTERNAL. 
@@ -2098,34 +2184,37 @@ IcingSearchEngine::RestoreIndexIfNeeded() { DocumentId last_stored_document_id = document_store_->last_added_document_id(); if (last_stored_document_id == index_->last_added_document_id() && - last_stored_document_id == integer_index_->last_added_document_id()) { + last_stored_document_id == integer_index_->last_added_document_id() && + last_stored_document_id == + qualified_id_join_index_->last_added_document_id()) { // No need to recover. - return {libtextclassifier3::Status::OK, false, false}; + return {libtextclassifier3::Status::OK, false, false, false}; } if (last_stored_document_id == kInvalidDocumentId) { // Document store is empty but index is not. Clear the index. - return {ClearIndices(), false, false}; + return {ClearAllIndices(), false, false, false}; } // Truncate indices first. auto truncate_result_or = TruncateIndicesTo(last_stored_document_id); if (!truncate_result_or.ok()) { - return {std::move(truncate_result_or).status(), false, false}; + return {std::move(truncate_result_or).status(), false, false, false}; } TruncateIndexResult truncate_result = std::move(truncate_result_or).ValueOrDie(); if (truncate_result.first_document_to_reindex > last_stored_document_id) { // Nothing to restore. Just return. 
- return {libtextclassifier3::Status::OK, false, false}; + return {libtextclassifier3::Status::OK, false, false, false}; } auto data_indexing_handlers_or = CreateDataIndexingHandlers(); if (!data_indexing_handlers_or.ok()) { return {data_indexing_handlers_or.status(), truncate_result.index_needed_restoration, - truncate_result.integer_index_needed_restoration}; + truncate_result.integer_index_needed_restoration, + truncate_result.qualified_id_join_index_needed_restoration}; } // By using recovery_mode for IndexProcessor, we're able to replay documents // from smaller document id and it will skip documents that are already been @@ -2151,7 +2240,8 @@ IcingSearchEngine::RestoreIndexIfNeeded() { } else { // Returns other errors return {document_or.status(), truncate_result.index_needed_restoration, - truncate_result.integer_index_needed_restoration}; + truncate_result.integer_index_needed_restoration, + truncate_result.qualified_id_join_index_needed_restoration}; } } DocumentProto document(std::move(document_or).ValueOrDie()); @@ -2163,7 +2253,8 @@ IcingSearchEngine::RestoreIndexIfNeeded() { if (!tokenized_document_or.ok()) { return {tokenized_document_or.status(), truncate_result.index_needed_restoration, - truncate_result.integer_index_needed_restoration}; + truncate_result.integer_index_needed_restoration, + truncate_result.qualified_id_join_index_needed_restoration}; } TokenizedDocument tokenized_document( std::move(tokenized_document_or).ValueOrDie()); @@ -2174,7 +2265,8 @@ IcingSearchEngine::RestoreIndexIfNeeded() { if (!absl_ports::IsDataLoss(status)) { // Real error. Stop recovering and pass it up. return {status, truncate_result.index_needed_restoration, - truncate_result.integer_index_needed_restoration}; + truncate_result.integer_index_needed_restoration, + truncate_result.qualified_id_join_index_needed_restoration}; } // FIXME: why can we skip data loss error here? // Just a data loss. 
Keep trying to add the remaining docs, but report the @@ -2184,7 +2276,8 @@ IcingSearchEngine::RestoreIndexIfNeeded() { } return {overall_status, truncate_result.index_needed_restoration, - truncate_result.integer_index_needed_restoration}; + truncate_result.integer_index_needed_restoration, + truncate_result.qualified_id_join_index_needed_restoration}; } libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() { @@ -2228,7 +2321,14 @@ IcingSearchEngine::CreateDataIndexingHandlers() { clock_.get(), integer_index_.get())); handlers.push_back(std::move(integer_section_indexing_handler)); - // TODO(b/263890397): add QualifiedIdJoinablePropertyIndexingHandler + // Qualified id joinable property index handler + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> + qualified_id_joinable_property_indexing_handler, + QualifiedIdJoinablePropertyIndexingHandler::Create( + clock_.get(), qualified_id_join_index_.get())); + handlers.push_back( + std::move(qualified_id_joinable_property_indexing_handler)); return handlers; } @@ -2265,11 +2365,12 @@ IcingSearchEngine::TruncateIndicesTo(DocumentId last_stored_document_id) { // starting from integer_index_last_added_document_id + 1. Also use std::min // since we might need to replay even smaller doc ids for term index. integer_index_needed_restoration = true; - first_document_to_reindex = - integer_index_last_added_document_id != kInvalidDocumentId - ? 
std::min(first_document_to_reindex, - integer_index_last_added_document_id + 1) - : kMinDocumentId; + if (integer_index_last_added_document_id != kInvalidDocumentId) { + first_document_to_reindex = std::min( + first_document_to_reindex, integer_index_last_added_document_id + 1); + } else { + first_document_to_reindex = kMinDocumentId; + } } else if (last_stored_document_id < integer_index_last_added_document_id) { // Clear the entire integer index if last_stored_document_id is smaller than // integer_index_last_added_document_id, because there is no way to remove @@ -2283,17 +2384,62 @@ IcingSearchEngine::TruncateIndicesTo(DocumentId last_stored_document_id) { first_document_to_reindex = kMinDocumentId; } + // Attempt to truncate qualified id join index + bool qualified_id_join_index_needed_restoration = false; + DocumentId qualified_id_join_index_last_added_document_id = + qualified_id_join_index_->last_added_document_id(); + if (qualified_id_join_index_last_added_document_id == kInvalidDocumentId || + last_stored_document_id > + qualified_id_join_index_last_added_document_id) { + // If last_stored_document_id is greater than + // qualified_id_join_index_last_added_document_id, then we only have to + // replay docs starting from (qualified_id_join_index_last_added_document_id + // + 1). Also use std::min since we might need to replay even smaller doc + // ids for other components. 
+ qualified_id_join_index_needed_restoration = true; + if (qualified_id_join_index_last_added_document_id != kInvalidDocumentId) { + first_document_to_reindex = + std::min(first_document_to_reindex, + qualified_id_join_index_last_added_document_id + 1); + } else { + first_document_to_reindex = kMinDocumentId; + } + } else if (last_stored_document_id < + qualified_id_join_index_last_added_document_id) { + // Clear the entire qualified id join index if last_stored_document_id is + // smaller than qualified_id_join_index_last_added_document_id, because + // there is no way to remove data with doc_id > last_stored_document_id from + // join index efficiently and we have to rebuild. + ICING_RETURN_IF_ERROR(qualified_id_join_index_->Clear()); + + // Since the entire qualified id join index is discarded, we start to + // rebuild it by setting first_document_to_reindex to kMinDocumentId. + qualified_id_join_index_needed_restoration = true; + first_document_to_reindex = kMinDocumentId; + } + return TruncateIndexResult(first_document_to_reindex, index_needed_restoration, - integer_index_needed_restoration); + integer_index_needed_restoration, + qualified_id_join_index_needed_restoration); } -libtextclassifier3::Status IcingSearchEngine::ClearIndices() { +libtextclassifier3::Status IcingSearchEngine::ClearSearchIndices() { ICING_RETURN_IF_ERROR(index_->Reset()); ICING_RETURN_IF_ERROR(integer_index_->Clear()); return libtextclassifier3::Status::OK; } +libtextclassifier3::Status IcingSearchEngine::ClearJoinIndices() { + return qualified_id_join_index_->Clear(); +} + +libtextclassifier3::Status IcingSearchEngine::ClearAllIndices() { + ICING_RETURN_IF_ERROR(ClearSearchIndices()); + ICING_RETURN_IF_ERROR(ClearJoinIndices()); + return libtextclassifier3::Status::OK; +} + ResetResultProto IcingSearchEngine::Reset() { absl_ports::unique_lock l(&mutex_); return ResetInternal(); diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h index 678fc77..3e85f69 100644 --- 
a/icing/icing-search-engine.h +++ b/icing/icing-search-engine.h @@ -31,6 +31,7 @@ #include "icing/index/numeric/numeric-index.h" #include "icing/jni/jni-cache.h" #include "icing/join/join-children-fetcher.h" +#include "icing/join/qualified-id-type-joinable-index.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/performance-configuration.h" #include "icing/proto/debug.pb.h" @@ -474,10 +475,13 @@ class IcingSearchEngine { std::unique_ptr<Index> index_ ICING_GUARDED_BY(mutex_); // Storage for all hits of numeric contents from the document store. - // TODO(b/249829533): integrate more functions with integer_index_. std::unique_ptr<NumericIndex<int64_t>> integer_index_ ICING_GUARDED_BY(mutex_); + // Storage for all join qualified ids from the document store. + std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index_ + ICING_GUARDED_BY(mutex_); + // Pointer to JNI class references const std::unique_ptr<const JniCache> jni_cache_; @@ -550,8 +554,8 @@ class IcingSearchEngine { InitializeStatsProto* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Do any initialization/recovery necessary to create a DocumentStore - // instance. + // Do any initialization/recovery necessary to create term index, integer + // index, and qualified id join index instances. // // Returns: // OK on success @@ -640,9 +644,10 @@ class IcingSearchEngine { OptimizeStatsProto* optimize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Helper method to restore missing document data in index_. All documents - // will be reindexed. This does not clear the index, so it is recommended to - // call Index::Reset first. + // Helper method to restore missing document data in index_, integer_index_, + // and qualified_id_join_index_. All documents will be reindexed. This does + // not clear the index, so it is recommended to call ClearAllIndices, + // ClearSearchIndices, or ClearJoinIndices first if needed. 
// // Returns: // On success, OK and a bool indicating whether or not restoration was @@ -657,6 +662,7 @@ class IcingSearchEngine { libtextclassifier3::Status status; bool index_needed_restoration; bool integer_index_needed_restoration; + bool qualified_id_join_index_needed_restoration; }; IndexRestorationResult RestoreIndexIfNeeded() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); @@ -674,13 +680,18 @@ class IcingSearchEngine { ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Helper method to create all types of data indexing handlers to index term, - // integer, and joinable qualified ids. + // integer, and join qualified ids. libtextclassifier3::StatusOr< std::vector<std::unique_ptr<DataIndexingHandler>>> CreateDataIndexingHandlers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Helper method to discard parts of (term, integer) indices if they contain - // data for document ids greater than last_stored_document_id. + // Helper method to discard parts of (term, integer, qualified id join) + // indices if they contain data for document ids greater than + // last_stored_document_id. + // + // REQUIRES: last_stored_document_id is valid (!= kInvalidDocumentId). Note: + // if we want to truncate everything in the index, then please call + // ClearSearchIndices/ClearJoinIndices/ClearAllIndices instead. 
// // Returns: // On success, a DocumentId indicating the first document to start for @@ -691,25 +702,45 @@ class IcingSearchEngine { DocumentId first_document_to_reindex; bool index_needed_restoration; bool integer_index_needed_restoration; + bool qualified_id_join_index_needed_restoration; - explicit TruncateIndexResult(DocumentId first_document_to_reindex_in, - bool index_needed_restoration_in, - bool integer_index_needed_restoration_in) + explicit TruncateIndexResult( + DocumentId first_document_to_reindex_in, + bool index_needed_restoration_in, + bool integer_index_needed_restoration_in, + bool qualified_id_join_index_needed_restoration_in) : first_document_to_reindex(first_document_to_reindex_in), index_needed_restoration(index_needed_restoration_in), - integer_index_needed_restoration( - integer_index_needed_restoration_in) {} + integer_index_needed_restoration(integer_index_needed_restoration_in), + qualified_id_join_index_needed_restoration( + qualified_id_join_index_needed_restoration_in) {} }; libtextclassifier3::StatusOr<TruncateIndexResult> TruncateIndicesTo( DocumentId last_stored_document_id) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Helper method to discard the entire (term, integer) indices. + // Helper method to discard search (term, integer) indices. + // + // Returns: + // OK on success + // INTERNAL_ERROR on any I/O errors + libtextclassifier3::Status ClearSearchIndices() + ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Helper method to discard join (qualified id) indices. + // + // Returns: + // OK on success + // INTERNAL_ERROR on any I/O errors + libtextclassifier3::Status ClearJoinIndices() + ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Helper method to discard all search and join indices. 
// // Returns: // OK on success // INTERNAL_ERROR on any I/O errors - libtextclassifier3::Status ClearIndices() + libtextclassifier3::Status ClearAllIndices() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); }; diff --git a/icing/icing-search-engine_initialization_test.cc b/icing/icing-search-engine_initialization_test.cc index f51abdf..6ba1737 100644 --- a/icing/icing-search-engine_initialization_test.cc +++ b/icing/icing-search-engine_initialization_test.cc @@ -28,6 +28,9 @@ #include "icing/index/index.h" #include "icing/index/numeric/integer-index.h" #include "icing/jni/jni-cache.h" +#include "icing/join/doc-join-info.h" +#include "icing/join/join-processor.h" +#include "icing/join/qualified-id-type-joinable-index.h" #include "icing/legacy/index/icing-mock-filesystem.h" #include "icing/portable/endian.h" #include "icing/portable/equals-proto.h" @@ -66,10 +69,12 @@ namespace { using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::_; +using ::testing::AtLeast; using ::testing::DoDefault; using ::testing::EndsWith; using ::testing::Eq; using ::testing::HasSubstr; +using ::testing::IsEmpty; using ::testing::Matcher; using ::testing::Return; using ::testing::SizeIs; @@ -162,6 +167,10 @@ std::string GetIntegerIndexDir() { return GetTestBaseDir() + "/integer_index_dir"; } +std::string GetQualifiedIdJoinIndexDir() { + return GetTestBaseDir() + "/qualified_id_join_index_dir"; +} + std::string GetSchemaDir() { return GetTestBaseDir() + "/schema_dir"; } std::string GetHeaderFilename() { @@ -239,6 +248,10 @@ ScoringSpecProto GetDefaultScoringSpec() { return scoring_spec; } +// TODO(b/272145329): create SearchSpecBuilder, JoinSpecBuilder, +// SearchResultProtoBuilder and ResultProtoBuilder for unit tests and build all +// instances by them. 
+ TEST_F(IcingSearchEngineInitializationTest, UninitializedInstanceFailsSafely) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); @@ -876,21 +889,73 @@ TEST_F(IcingSearchEngineInitializationTest, TEST_F(IcingSearchEngineInitializationTest, RecoverFromInconsistentDocumentStore) { - // Test the following scenario: document store is ahead of term and integer - // index. IcingSearchEngine should be able to recover term index. Several - // additional behaviors are also tested: + // Test the following scenario: document store is ahead of term, integer and + // qualified id join index. IcingSearchEngine should be able to recover all + // indices. Several additional behaviors are also tested: // - Index directory handling: // - Term index directory should be unaffected. // - Integer index directory should be unaffected. + // - Qualified id join index directory should be unaffected. // - Truncate indices: // - "TruncateTo()" for term index shouldn't take effect. // - "Clear()" shouldn't be called for integer index, i.e. no integer index // storage sub directories (path_expr = "*/integer_index_dir/*") should be // discarded. + // - "Clear()" shouldn't be called for qualified id join index, i.e. no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. // - Still, we need to replay and reindex documents. 
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); - DocumentProto document2 = CreateMessageDocument("namespace", "uri2"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message1 = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message2 = + DocumentBuilder() + .SetKey("namespace", "message/2") + .SetSchema("Message") + .AddStringProperty("body", "message body two") + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); { // Initializes folder and schema, index one document @@ -899,8 +964,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::make_unique<IcingFilesystem>(), 
std::make_unique<FakeClock>(), GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); - EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk()); + EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message1).status(), ProtoIsOk()); } // This should shut down IcingSearchEngine and persist anything it needs to { @@ -910,7 +976,7 @@ TEST_F(IcingSearchEngineInitializationTest, SchemaStore::Create(filesystem(), GetSchemaDir(), &fake_clock)); ICING_EXPECT_OK(schema_store->SetSchema(CreateMessageSchema())); - // Puts a second document into DocumentStore but doesn't index it. + // Puts message2 into DocumentStore but doesn't index it. ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(filesystem(), GetDocumentDir(), &fake_clock, @@ -918,11 +984,10 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr<DocumentStore> document_store = std::move(create_result.document_store); - ICING_EXPECT_OK(document_store->Put(document2)); + ICING_EXPECT_OK(document_store->Put(message2)); } - // Mock filesystem to observe and check the behavior of term index and - // integer index. + // Mock filesystem to observe and check the behavior of all indices. auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) .WillRepeatedly(DoDefault()); @@ -939,6 +1004,15 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) .Times(0); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). 
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); TestIcingSearchEngine icing(GetDefaultIcingOptions(), std::move(mock_filesystem), @@ -953,27 +1027,30 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT( initialize_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH)); GetResultProto expected_get_result_proto; expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_result_proto.mutable_document() = document1; + *expected_get_result_proto.mutable_document() = message1; // DocumentStore kept the additional document - EXPECT_THAT( - icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); + EXPECT_THAT(icing.Get("namespace", "message/1", + GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); - *expected_get_result_proto.mutable_document() = document2; - EXPECT_THAT( - icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); + *expected_get_result_proto.mutable_document() = message2; + EXPECT_THAT(icing.Get("namespace", "message/2", + GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); SearchResultProto expected_search_result_proto; expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - document2; + message2; *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - document1; + message1; // We indexed the additional document in all 
indices. // Verify term search @@ -998,6 +1075,40 @@ TEST_F(IcingSearchEngineInitializationTest, ResultSpecProto::default_instance()); EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores( expected_search_result_proto)); + + // Verify join search: join a query for `name:person` with a child query for + // `body:message` based on the child's `senderQualifiedId` field. + SearchSpecProto search_spec3; + search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec3.set_query("name:person"); + JoinSpecProto* join_spec = search_spec3.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:message"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto expected_join_search_result_proto; + expected_join_search_result_proto.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto = + expected_join_search_result_proto.mutable_results()->Add(); + *result_proto->mutable_document() = person; + *result_proto->mutable_joined_results()->Add()->mutable_document() = message2; + *result_proto->mutable_joined_results()->Add()->mutable_document() = message1; + + SearchResultProto search_result_proto3 = + icing.Search(search_spec3, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + EXPECT_THAT(search_result_proto3, 
EqualsSearchResultIgnoreStatsAndScores( + expected_join_search_result_proto)); } TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIndex) { @@ -1008,20 +1119,67 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIndex) { // - Should discard the entire term index directory and start it from // scratch. // - Integer index directory should be unaffected. + // - Qualified id join index directory should be unaffected. // - Truncate indices: // - "TruncateTo()" for term index shouldn't take effect since we start it // from scratch. // - "Clear()" shouldn't be called for integer index, i.e. no integer index // storage sub directories (path_expr = "*/integer_index_dir/*") should be // discarded. + // - "Clear()" shouldn't be called for qualified id join index, i.e. no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + 
.SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + SearchSpecProto search_spec; - search_spec.set_query("message"); + search_spec.set_query("body:message"); search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); SearchResultProto expected_search_result_proto; expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - CreateMessageDocument("namespace", "uri"); + message; { // Initializes folder and schema, index one document @@ -1030,9 +1188,9 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIndex) { std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); - EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), - ProtoIsOk()); + EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); @@ -1048,8 +1206,7 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIndex) { ASSERT_TRUE(filesystem()->Write(fd.get(), "1234", 4)); } - // Mock filesystem to observe and check the behavior of term index and integer - // index. + // Mock filesystem to observe and check the behavior of all indices. 
auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) .WillRepeatedly(DoDefault()); @@ -1066,6 +1223,15 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIndex) { EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) .Times(0); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); TestIcingSearchEngine icing(GetDefaultIcingOptions(), std::move(mock_filesystem), @@ -1078,6 +1244,9 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIndex) { EXPECT_THAT( initialize_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); // Check that our index is ok by searching over the restored index SearchResultProto search_result_proto = @@ -1095,11 +1264,58 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIntegerIndex) { // - Term index directory should be unaffected. // - Should discard the entire integer index directory and start it from // scratch. + // - Qualified id join index directory should be unaffected. // - Truncate indices: // - "TruncateTo()" for term index shouldn't take effect. // - "Clear()" shouldn't be called for integer index, i.e. no integer index // storage sub directories (path_expr = "*/integer_index_dir/*") should be // discarded, since we start it from scratch. + // - "Clear()" shouldn't be called for qualified id join index, i.e. 
no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + SearchSpecProto search_spec; search_spec.set_query("indexableInteger == 123"); search_spec.set_search_type( @@ -1109,7 +1325,7 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIntegerIndex) { SearchResultProto expected_search_result_proto; expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - CreateMessageDocument("namespace", "uri"); + message; { // Initializes folder and schema, index one document @@ -1118,9 
+1334,9 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIntegerIndex) { std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); - EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), - ProtoIsOk()); + EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); @@ -1138,8 +1354,7 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIntegerIndex) { ASSERT_TRUE(filesystem()->Write(fd.get(), "1234", 4)); } - // Mock filesystem to observe and check the behavior of term index and integer - // index. + // Mock filesystem to observe and check the behavior of all indices. auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) .WillRepeatedly(DoDefault()); @@ -1156,6 +1371,15 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIntegerIndex) { EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) .Times(0); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). 
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); TestIcingSearchEngine icing(GetDefaultIcingOptions(), std::move(mock_filesystem), @@ -1168,6 +1392,175 @@ TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIntegerIndex) { EXPECT_THAT( initialize_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::IO_ERROR)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + + // Check that our index is ok by searching over the restored index + SearchResultProto search_result_proto = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineInitializationTest, + RecoverFromCorruptQualifiedIdJoinIndex) { + // Test the following scenario: qualified id join index is corrupted (e.g. + // checksum doesn't match). IcingSearchEngine should be able to recover + // qualified id join index. Several additional behaviors are also tested: + // - Index directory handling: + // - Term index directory should be unaffected. + // - Integer index directory should be unaffected. + // - Should discard the entire qualified id join index directory and start + // it from scratch. + // - Truncate indices: + // - "TruncateTo()" for term index shouldn't take effect. + // - "Clear()" shouldn't be called for integer index, i.e. no integer index + // storage sub directories (path_expr = "*/integer_index_dir/*") should be + // discarded. + // - "Clear()" shouldn't be called for qualified id join index, i.e. 
no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded, since we start + // it from scratch. + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // Prepare join search spec to join a query for `name:person` with a child + // query for `body:message` based on the child's `senderQualifiedId` field. 
+ SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_query("name:person"); + JoinSpecProto* join_spec = search_spec.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:message"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto = + expected_search_result_proto.mutable_results()->Add(); + *result_proto->mutable_document() = person; + *result_proto->mutable_joined_results()->Add()->mutable_document() = message; + + { + // Initializes folder and schema, index one document + TestIcingSearchEngine icing( + GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), + GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); + EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + SearchResultProto search_result_proto = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + 
expected_search_result_proto)); + } // This should shut down IcingSearchEngine and persist anything it needs to + + // Manually corrupt qualified id join index + { + const std::string qualified_id_join_index_metadata_file = + GetQualifiedIdJoinIndexDir() + "/metadata"; + ScopedFd fd(filesystem()->OpenForWrite( + qualified_id_join_index_metadata_file.c_str())); + ASSERT_TRUE(fd.is_valid()); + ASSERT_TRUE(filesystem()->Write(fd.get(), "1234", 4)); + } + + // Mock filesystem to observe and check the behavior of all indices. + auto mock_filesystem = std::make_unique<MockFilesystem>(); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) + .WillRepeatedly(DoDefault()); + // Ensure term index directory should never be discarded. + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/index_dir"))) + .Times(0); + // Ensure integer index directory should never be discarded, and Clear() + // should never be called (i.e. storage sub directory "*/integer_index_dir/*" + // should never be discarded). + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/integer_index_dir"))) + .Times(0); + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) + .Times(0); + // Ensure qualified id join index directory should be discarded once, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). 
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(1); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); + + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), GetTestJniCache()); + InitializeResultProto initialize_result = icing.Initialize(); + EXPECT_THAT(initialize_result.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result.initialize_stats().integer_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::IO_ERROR)); // Check that our index is ok by searching over the restored index SearchResultProto search_result_proto = @@ -1185,19 +1578,60 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseTermIndex) { // - Term index directory should not be discarded since we've already lost // it. Start it from scratch. // - Integer index directory should be unaffected. + // - Qualified id join index directory should be unaffected. // - Truncate indices: // - "TruncateTo()" for term index shouldn't take effect since we start it // from scratch. // - "Clear()" shouldn't be called for integer index, i.e. no integer index // storage sub directories (path_expr = "*/integer_index_dir/*") should be // discarded. - DocumentProto document = DocumentBuilder() - .SetKey("icing", "fake_type/0") - .SetSchema("Message") - .AddStringProperty("body", kIpsumText) - .AddInt64Property("indexableInteger", 123) - .Build(); - // 1. Create an index with 3 documents. + // - "Clear()" shouldn't be called for qualified id join index, i.e. 
no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // 1. Create an index with 3 message documents. 
{ TestIcingSearchEngine icing( GetDefaultIcingOptions(), std::make_unique<Filesystem>(), @@ -1205,13 +1639,14 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseTermIndex) { GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); - document = DocumentBuilder(document).SetUri("fake_type/1").Build(); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); - document = DocumentBuilder(document).SetUri("fake_type/2").Build(); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/2").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/3").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); } // 2. Delete the term index directory to trigger RestoreIndexIfNeeded. @@ -1220,8 +1655,7 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseTermIndex) { // 3. Create the index again. This should trigger index restoration. { - // Mock filesystem to observe and check the behavior of term index and - // integer index. + // Mock filesystem to observe and check the behavior of all indices. auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) .WillRepeatedly(DoDefault()); @@ -1239,6 +1673,16 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseTermIndex) { EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) .Times(0); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. 
storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); TestIcingSearchEngine icing( GetDefaultIcingOptions(), std::move(mock_filesystem), @@ -1251,10 +1695,13 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseTermIndex) { EXPECT_THAT( initialize_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); // Verify term index works normally SearchSpecProto search_spec1; - search_spec1.set_query("consectetur"); + search_spec1.set_query("body:consectetur"); search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY); SearchResultProto results1 = icing.Search(search_spec1, ScoringSpecProto::default_instance(), @@ -1263,9 +1710,9 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseTermIndex) { EXPECT_THAT(results1.next_page_token(), Eq(0)); // All documents should be retrievable. 
ASSERT_THAT(results1.results(), SizeIs(3)); - EXPECT_THAT(results1.results(0).document().uri(), Eq("fake_type/2")); - EXPECT_THAT(results1.results(1).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results1.results(2).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1")); // Verify integer index works normally SearchSpecProto search_spec2; @@ -1278,9 +1725,43 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseTermIndex) { icing.Search(search_spec2, ScoringSpecProto::default_instance(), ResultSpecProto::default_instance()); ASSERT_THAT(results2.results(), SizeIs(3)); - EXPECT_THAT(results2.results(0).document().uri(), Eq("fake_type/2")); - EXPECT_THAT(results2.results(1).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results2.results(2).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1")); + + // Verify qualified id join index works normally: join a query for + // `name:person` with a child query for `body:consectetur` based on the + // child's `senderQualifiedId` field. 
+ SearchSpecProto search_spec3; + search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec3.set_query("name:person"); + JoinSpecProto* join_spec = search_spec3.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:consectetur"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto results3 = + icing.Search(search_spec3, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results3.results(), SizeIs(1)); + EXPECT_THAT(results3.results(0).document().uri(), Eq("person")); + EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3)); + EXPECT_THAT(results3.results(0).joined_results(0).document().uri(), + Eq("message/3")); + EXPECT_THAT(results3.results(0).joined_results(1).document().uri(), + Eq("message/2")); + EXPECT_THAT(results3.results(0).joined_results(2).document().uri(), + Eq("message/1")); } } @@ -1292,18 +1773,59 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseIntegerIndex) { // - Term index directory should be unaffected. // - Integer index directory should not be discarded since we've already // lost it. Start it from scratch. + // - Qualified id join index directory should be unaffected. // - Truncate indices: // - "TruncateTo()" for term index shouldn't take effect. // - "Clear()" shouldn't be called for integer index, i.e. 
no integer index // storage sub directories (path_expr = "*/integer_index_dir/*") should be // discarded, since we start it from scratch. - DocumentProto document = DocumentBuilder() - .SetKey("icing", "fake_type/0") - .SetSchema("Message") - .AddStringProperty("body", kIpsumText) - .AddInt64Property("indexableInteger", 123) - .Build(); - // 1. Create an index with 3 documents. + // - "Clear()" shouldn't be called for qualified id join index, i.e. no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // 1. Create an index with 3 message documents. 
{ TestIcingSearchEngine icing( GetDefaultIcingOptions(), std::make_unique<Filesystem>(), @@ -1311,13 +1833,14 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseIntegerIndex) { GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); - document = DocumentBuilder(document).SetUri("fake_type/1").Build(); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); - document = DocumentBuilder(document).SetUri("fake_type/2").Build(); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/2").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/3").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); } // 2. Delete the integer index file to trigger RestoreIndexIfNeeded. @@ -1326,8 +1849,7 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseIntegerIndex) { // 3. Create the index again. This should trigger index restoration. { - // Mock filesystem to observe and check the behavior of term index and - // integer index. + // Mock filesystem to observe and check the behavior of all indices. auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) .WillRepeatedly(DoDefault()); @@ -1345,6 +1867,16 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseIntegerIndex) { EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) .Times(0); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. 
storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); TestIcingSearchEngine icing( GetDefaultIcingOptions(), std::move(mock_filesystem), @@ -1357,10 +1889,13 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseIntegerIndex) { EXPECT_THAT( initialize_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); // Verify term index works normally SearchSpecProto search_spec1; - search_spec1.set_query("consectetur"); + search_spec1.set_query("body:consectetur"); search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY); SearchResultProto results1 = icing.Search(search_spec1, ScoringSpecProto::default_instance(), @@ -1369,9 +1904,9 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseIntegerIndex) { EXPECT_THAT(results1.next_page_token(), Eq(0)); // All documents should be retrievable. 
ASSERT_THAT(results1.results(), SizeIs(3)); - EXPECT_THAT(results1.results(0).document().uri(), Eq("fake_type/2")); - EXPECT_THAT(results1.results(1).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results1.results(2).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1")); // Verify integer index works normally SearchSpecProto search_spec2; @@ -1384,9 +1919,239 @@ TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseIntegerIndex) { icing.Search(search_spec2, ScoringSpecProto::default_instance(), ResultSpecProto::default_instance()); ASSERT_THAT(results2.results(), SizeIs(3)); - EXPECT_THAT(results2.results(0).document().uri(), Eq("fake_type/2")); - EXPECT_THAT(results2.results(1).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results2.results(2).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1")); + + // Verify qualified id join index works normally: join a query for + // `name:person` with a child query for `body:consectetur` based on the + // child's `senderQualifiedId` field. 
+ SearchSpecProto search_spec3; + search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec3.set_query("name:person"); + JoinSpecProto* join_spec = search_spec3.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:consectetur"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto results3 = + icing.Search(search_spec3, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results3.results(), SizeIs(1)); + EXPECT_THAT(results3.results(0).document().uri(), Eq("person")); + EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3)); + EXPECT_THAT(results3.results(0).joined_results(0).document().uri(), + Eq("message/3")); + EXPECT_THAT(results3.results(0).joined_results(1).document().uri(), + Eq("message/2")); + EXPECT_THAT(results3.results(0).joined_results(2).document().uri(), + Eq("message/1")); + } +} + +TEST_F(IcingSearchEngineInitializationTest, + RestoreIndexLoseQualifiedIdJoinIndex) { + // Test the following scenario: losing the entire qualified id join index + // directory. IcingSearchEngine should be able to recover qualified id join + // index. Several additional behaviors are also tested: + // - Index directory handling: + // - Term index directory should be unaffected. + // - Integer index directory should be unaffected. 
+ // - Qualified id join index directory should not be discarded since we've + // already lost it. Start it from scratch. + // - Truncate indices: + // - "TruncateTo()" for term index shouldn't take effect. + // - "Clear()" shouldn't be called for integer index, i.e. no integer index + // storage sub directories (path_expr = "*/integer_index_dir/*") should be + // discarded. + // - "Clear()" shouldn't be called for qualified id join index, i.e. no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded, since we start + // it from scratch. + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // 1. Create an index with 3 message documents. 
+ { + TestIcingSearchEngine icing( + GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), + GetTestJniCache()); + + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/2").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/3").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + } + + // 2. Delete the qualified id join index file to trigger RestoreIndexIfNeeded. + std::string qualified_id_join_index_dir = GetQualifiedIdJoinIndexDir(); + filesystem()->DeleteDirectoryRecursively(qualified_id_join_index_dir.c_str()); + + // 3. Create the index again. This should trigger index restoration. + { + // Mock filesystem to observe and check the behavior of all indices. + auto mock_filesystem = std::make_unique<MockFilesystem>(); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) + .WillRepeatedly(DoDefault()); + // Ensure term index directory should never be discarded. + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/index_dir"))) + .Times(0); + // Ensure integer index directory should never be discarded since we've + // already lost it, and Clear() should never be called (i.e. storage sub + // directory "*/integer_index_dir/*" should never be discarded). + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/integer_index_dir"))) + .Times(0); + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) + .Times(0); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. 
storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded) + // since we start it from scratch. + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); + + TestIcingSearchEngine icing( + GetDefaultIcingOptions(), std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), + GetTestJniCache()); + InitializeResultProto initialize_result = icing.Initialize(); + ASSERT_THAT(initialize_result.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result.initialize_stats().integer_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH)); + + // Verify term index works normally + SearchSpecProto search_spec1; + search_spec1.set_query("body:consectetur"); + search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY); + SearchResultProto results1 = + icing.Search(search_spec1, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results1.status(), ProtoIsOk()); + EXPECT_THAT(results1.next_page_token(), Eq(0)); + // All documents should be retrievable. 
+ ASSERT_THAT(results1.results(), SizeIs(3)); + EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1")); + + // Verify integer index works normally + SearchSpecProto search_spec2; + search_spec2.set_query("indexableInteger == 123"); + search_spec2.set_search_type( + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY); + search_spec2.add_enabled_features(std::string(kNumericSearchFeature)); + + SearchResultProto results2 = + icing.Search(search_spec2, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results2.results(), SizeIs(3)); + EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1")); + + // Verify qualified id join index works normally: join a query for + // `name:person` with a child query for `body:consectetur` based on the + // child's `senderQualifiedId` field. 
+ SearchSpecProto search_spec3; + search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec3.set_query("name:person"); + JoinSpecProto* join_spec = search_spec3.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:consectetur"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto results3 = + icing.Search(search_spec3, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results3.results(), SizeIs(1)); + EXPECT_THAT(results3.results(0).document().uri(), Eq("person")); + EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3)); + EXPECT_THAT(results3.results(0).joined_results(0).document().uri(), + Eq("message/3")); + EXPECT_THAT(results3.results(0).joined_results(1).document().uri(), + Eq("message/2")); + EXPECT_THAT(results3.results(0).joined_results(2).document().uri(), + Eq("message/1")); } } @@ -1398,6 +2163,7 @@ TEST_F(IcingSearchEngineInitializationTest, // - Index directory handling: // - Term index directory should be unaffected. // - Integer index directory should be unaffected. + // - Qualified id join index directory should be unaffected. // - Truncate indices: // - "TruncateTo()" for term index should take effect and throw out the // entire lite index. 
This should be sufficient to make term index @@ -1405,39 +2171,82 @@ TEST_F(IcingSearchEngineInitializationTest, // - "Clear()" shouldn't be called for integer index, i.e. no integer index // storage sub directories (path_expr = "*/integer_index_dir/*") should be // discarded. - DocumentProto document = DocumentBuilder() - .SetKey("icing", "fake_type/0") - .SetSchema("Message") - .AddStringProperty("body", kIpsumText) - .AddInt64Property("indexableInteger", 123) - .Build(); - // 1. Create an index with a LiteIndex that will only allow one document - // before needing a merge. + // - "Clear()" shouldn't be called for qualified id join index, i.e. no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddInt64Property("indexableInteger", 123) + 
.AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // 1. Create an index with a LiteIndex that will only allow a person and a + // message document before needing a merge. { IcingSearchEngineOptions options = GetDefaultIcingOptions(); - options.set_index_merge_size(document.ByteSizeLong()); + options.set_index_merge_size(person.ByteSizeLong() + + message.ByteSizeLong()); TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(), std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); - // Add two documents. These should get merged into the main index. - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); - document = DocumentBuilder(document).SetUri("fake_type/1").Build(); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + // Add two message documents. These should get merged into the main index. + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/2").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); } // 2. Manually add some data into term lite index and increment // last_added_document_id, but don't merge into the main index. This will // cause mismatched last_added_document_id with term index. 
- // - Document store: [0, 1] + // - Document store: [0, 1, 2] // - Term index - // - Main index: [0, 1] - // - Lite index: [2] - // - Integer index: [0, 1] + // - Main index: [0, 1, 2] + // - Lite index: [3] + // - Integer index: [0, 1, 2] + // - Qualified id join index: [0, 1, 2] { Filesystem filesystem; IcingFilesystem icing_filesystem; @@ -1445,7 +2254,7 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr<Index> index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/document.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong()), &filesystem, &icing_filesystem)); DocumentId original_last_added_doc_id = index->last_added_document_id(); index->set_last_added_document_id(original_last_added_doc_id + 1); @@ -1458,8 +2267,7 @@ TEST_F(IcingSearchEngineInitializationTest, // 3. Create the index again. { - // Mock filesystem to observe and check the behavior of term index and - // integer index. + // Mock filesystem to observe and check the behavior of all indices. auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) .WillRepeatedly(DoDefault()); @@ -1477,9 +2285,19 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) .Times(0); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). 
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); IcingSearchEngineOptions options = GetDefaultIcingOptions(); - options.set_index_merge_size(document.ByteSizeLong()); + options.set_index_merge_size(message.ByteSizeLong()); TestIcingSearchEngine icing(options, std::move(mock_filesystem), std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), @@ -1493,10 +2311,13 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT( initialize_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); // Verify term index works normally SearchSpecProto search_spec1; - search_spec1.set_query("consectetur"); + search_spec1.set_query("body:consectetur"); search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY); SearchResultProto results1 = icing.Search(search_spec1, ScoringSpecProto::default_instance(), @@ -1505,8 +2326,8 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT(results1.next_page_token(), Eq(0)); // Only the documents that were in the main index should be retrievable. 
ASSERT_THAT(results1.results(), SizeIs(2)); - EXPECT_THAT(results1.results(0).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results1.results(1).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results1.results(0).document().uri(), Eq("message/2")); + EXPECT_THAT(results1.results(1).document().uri(), Eq("message/1")); // Verify integer index works normally SearchSpecProto search_spec2; @@ -1519,11 +2340,43 @@ TEST_F(IcingSearchEngineInitializationTest, icing.Search(search_spec2, ScoringSpecProto::default_instance(), ResultSpecProto::default_instance()); ASSERT_THAT(results2.results(), SizeIs(2)); - EXPECT_THAT(results2.results(0).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results2.results(1).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results2.results(0).document().uri(), Eq("message/2")); + EXPECT_THAT(results2.results(1).document().uri(), Eq("message/1")); + + // Verify qualified id join index works normally: join a query for + // `name:person` with a child query for `body:consectetur` based on the + // child's `senderQualifiedId` field. 
+ SearchSpecProto search_spec3; + search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec3.set_query("name:person"); + JoinSpecProto* join_spec = search_spec3.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:consectetur"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto results3 = + icing.Search(search_spec3, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results3.results(), SizeIs(1)); + EXPECT_THAT(results3.results(0).document().uri(), Eq("person")); + EXPECT_THAT(results3.results(0).joined_results(), SizeIs(2)); + EXPECT_THAT(results3.results(0).joined_results(0).document().uri(), + Eq("message/2")); + EXPECT_THAT(results3.results(0).joined_results(1).document().uri(), + Eq("message/1")); } - // 4. Since document 2 doesn't exist, testing query = "foo" is not enough to + // 4. Since document 3 doesn't exist, testing query = "foo" is not enough to // verify the correctness of term index restoration. Instead, we have to check // hits for "foo" should not be found in term index. 
{ @@ -1533,7 +2386,7 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr<Index> index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/document.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong()), &filesystem, &icing_filesystem)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter, @@ -1553,6 +2406,7 @@ TEST_F(IcingSearchEngineInitializationTest, // - Index directory handling: // - Term index directory should be unaffected. // - Integer index directory should be unaffected. + // - Qualified id join index directory should be unaffected. // - Truncate indices: // - "TruncateTo()" for term index should take effect and throw out the // entire lite index. However, some valid data in term lite index were @@ -1561,42 +2415,84 @@ TEST_F(IcingSearchEngineInitializationTest, // - "Clear()" shouldn't be called for integer index, i.e. no integer index // storage sub directories (path_expr = "*/integer_index_dir/*") should be // discarded. - DocumentProto document = DocumentBuilder() - .SetKey("icing", "fake_type/0") - .SetSchema("Message") - .AddStringProperty("body", kIpsumText) - .AddInt64Property("indexableInteger", 123) - .Build(); - // 1. Create an index with a LiteIndex that will only allow one document - // before needing a merge. + // - "Clear()" shouldn't be called for qualified id join index, i.e. no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. 
+ + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // 1. Create an index with a LiteIndex that will only allow a person and a + // message document before needing a merge. 
{ IcingSearchEngineOptions options = GetDefaultIcingOptions(); - options.set_index_merge_size(document.ByteSizeLong()); + options.set_index_merge_size(message.ByteSizeLong()); TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(), std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); - // Add two documents. These should get merged into the main index. - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); - document = DocumentBuilder(document).SetUri("fake_type/1").Build(); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + // Add two message documents. These should get merged into the main index. + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/2").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); // Add one document. This one should get remain in the lite index. - document = DocumentBuilder(document).SetUri("fake_type/2").Build(); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/3").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); } // 2. Manually add some data into term lite index and increment // last_added_document_id, but don't merge into the main index. This will // cause mismatched last_added_document_id with term index. 
- // - Document store: [0, 1, 2] + // - Document store: [0, 1, 2, 3] // - Term index - // - Main index: [0, 1] - // - Lite index: [2, 3] - // - Integer index: [0, 1, 2] + // - Main index: [0, 1, 2] + // - Lite index: [3, 4] + // - Integer index: [0, 1, 2, 3] + // - Qualified id join index: [0, 1, 2, 3] { Filesystem filesystem; IcingFilesystem icing_filesystem; @@ -1604,7 +2500,7 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr<Index> index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/document.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong()), &filesystem, &icing_filesystem)); DocumentId original_last_added_doc_id = index->last_added_document_id(); index->set_last_added_document_id(original_last_added_doc_id + 1); @@ -1617,8 +2513,7 @@ TEST_F(IcingSearchEngineInitializationTest, // 3. Create the index again. { - // Mock filesystem to observe and check the behavior of term index and - // integer index. + // Mock filesystem to observe and check the behavior of all indices. auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) .WillRepeatedly(DoDefault()); @@ -1636,9 +2531,19 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) .Times(0); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). 
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); IcingSearchEngineOptions options = GetDefaultIcingOptions(); - options.set_index_merge_size(document.ByteSizeLong()); + options.set_index_merge_size(message.ByteSizeLong()); TestIcingSearchEngine icing(options, std::move(mock_filesystem), std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), @@ -1653,10 +2558,13 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT( initialize_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); // Verify term index works normally SearchSpecProto search_spec1; - search_spec1.set_query("consectetur"); + search_spec1.set_query("body:consectetur"); search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY); SearchResultProto results1 = icing.Search(search_spec1, ScoringSpecProto::default_instance(), @@ -1665,9 +2573,9 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT(results1.next_page_token(), Eq(0)); // Only the documents that were in the main index should be retrievable. 
ASSERT_THAT(results1.results(), SizeIs(3)); - EXPECT_THAT(results1.results(0).document().uri(), Eq("fake_type/2")); - EXPECT_THAT(results1.results(1).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results1.results(2).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1")); // Verify integer index works normally SearchSpecProto search_spec2; @@ -1680,12 +2588,46 @@ TEST_F(IcingSearchEngineInitializationTest, icing.Search(search_spec2, ScoringSpecProto::default_instance(), ResultSpecProto::default_instance()); ASSERT_THAT(results2.results(), SizeIs(3)); - EXPECT_THAT(results2.results(0).document().uri(), Eq("fake_type/2")); - EXPECT_THAT(results2.results(1).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results2.results(2).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1")); + + // Verify qualified id join index works normally: join a query for + // `name:person` with a child query for `body:consectetur` based on the + // child's `senderQualifiedId` field. 
+ SearchSpecProto search_spec3; + search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec3.set_query("name:person"); + JoinSpecProto* join_spec = search_spec3.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:consectetur"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto results3 = + icing.Search(search_spec3, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results3.results(), SizeIs(1)); + EXPECT_THAT(results3.results(0).document().uri(), Eq("person")); + EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3)); + EXPECT_THAT(results3.results(0).joined_results(0).document().uri(), + Eq("message/3")); + EXPECT_THAT(results3.results(0).joined_results(1).document().uri(), + Eq("message/2")); + EXPECT_THAT(results3.results(0).joined_results(2).document().uri(), + Eq("message/1")); } - // 4. Since document 3 doesn't exist, testing query = "foo" is not enough to + // 4. Since document 4 doesn't exist, testing query = "foo" is not enough to // verify the correctness of term index restoration. Instead, we have to check // hits for "foo" should not be found in term index. 
{ @@ -1695,7 +2637,7 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr<Index> index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/document.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong()), &filesystem, &icing_filesystem)); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter, @@ -1715,14 +2657,18 @@ TEST_F(IcingSearchEngineInitializationTest, // - Index directory handling: // - Term index directory should be unaffected. // - Integer index directory should be unaffected. + // - Qualified id join index directory should be unaffected. // - Truncate indices: // - "TruncateTo()" for term index should take effect and throw out the // entire lite and main index. This should be sufficient to make term // index consistent with document store (in this case, document store is // empty as well), so reindexing should not take place. - // - "Clear()" shouldn't be called for integer index, i.e. no integer index - // storage sub directories (path_expr = "*/integer_index_dir/*") should be + // - "Clear()" should be called for integer index. It is a special case when + // document store has no document. Since there is no integer index storage + // sub directories (path_expr = "*/integer_index_dir/*"), nothing will be // discarded. + // - "Clear()" should be called for qualified id join index. It is a special + // case when document store has no document. // 1. Create an index with no document. { @@ -1744,6 +2690,7 @@ TEST_F(IcingSearchEngineInitializationTest, // - Main index: [0] // - Lite index: [1] // - Integer index: [] + // - Qualified id join index: [] { Filesystem filesystem; IcingFilesystem icing_filesystem; @@ -1774,8 +2721,7 @@ TEST_F(IcingSearchEngineInitializationTest, // 3. Create the index again. This should throw out the lite and main index. { - // Mock filesystem to observe and check the behavior of term index and - // integer index. 
+ // Mock filesystem to observe and check the behavior of all indices. auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) .WillRepeatedly(DoDefault()); @@ -1784,15 +2730,25 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(EndsWith("/index_dir"))) .Times(0); - // Ensure integer index directory should never be discarded, and Clear() - // should never be called (i.e. storage sub directory - // "*/integer_index_dir/*" should never be discarded). + // Ensure integer index directory should never be discarded. Even though + // Clear() was called, it shouldn't take effect since there is no storage + // sub directory ("*/integer_index_dir/*") and nothing will be discarded. EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(EndsWith("/integer_index_dir"))) .Times(0); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) .Times(0); + // Ensure qualified id join index directory should never be discarded. + // Clear() was called and should discard and reinitialize the underlying + // mapper. + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(AtLeast(1)); TestIcingSearchEngine icing( GetDefaultIcingOptions(), std::move(mock_filesystem), @@ -1808,6 +2764,9 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT( initialize_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); } // 4. 
Since document 0, 1 don't exist, testing queries = "foo", "bar" are not @@ -1846,6 +2805,7 @@ TEST_F(IcingSearchEngineInitializationTest, // - Index directory handling: // - Term index directory should be unaffected. // - Integer index directory should be unaffected. + // - Qualified id join index directory should be unaffected. // - In RestoreIndexIfNecessary(): // - "TruncateTo()" for term index should take effect and throw out the // entire lite and main index. However, some valid data in term main index @@ -1854,13 +2814,53 @@ TEST_F(IcingSearchEngineInitializationTest, // - "Clear()" shouldn't be called for integer index, i.e. no integer index // storage sub directories (path_expr = "*/integer_index_dir/*") should be // discarded. - DocumentProto document = DocumentBuilder() - .SetKey("icing", "fake_type/0") - .SetSchema("Message") - .AddStringProperty("body", kIpsumText) - .AddInt64Property("indexableInteger", 123) - .Build(); - // 1. Create an index with 3 documents. + // - "Clear()" shouldn't be called for qualified id join index, i.e. no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. 
+ + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // 1. Create an index with 3 message documents. 
{ TestIcingSearchEngine icing( GetDefaultIcingOptions(), std::make_unique<Filesystem>(), @@ -1868,24 +2868,26 @@ TEST_F(IcingSearchEngineInitializationTest, GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); - document = DocumentBuilder(document).SetUri("fake_type/1").Build(); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); - document = DocumentBuilder(document).SetUri("fake_type/2").Build(); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/2").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/3").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); } // 2. Manually add some data into term lite index and increment // last_added_document_id. Merge some of them into the main index and keep // others in the lite index. This will cause mismatched document id with // document store. - // - Document store: [0, 1, 2] + // - Document store: [0, 1, 2, 3] // - Term index - // - Main index: [0, 1, 2, 3] - // - Lite index: [4] - // - Integer index: [0, 1, 2] + // - Main index: [0, 1, 2, 3, 4] + // - Lite index: [5] + // - Integer index: [0, 1, 2, 3] + // - Qualified id join index: [0, 1, 2, 3] { Filesystem filesystem; IcingFilesystem icing_filesystem; @@ -1893,9 +2895,9 @@ TEST_F(IcingSearchEngineInitializationTest, std::unique_ptr<Index> index, Index::Create( Index::Options(GetIndexDir(), - /*index_merge_size=*/document.ByteSizeLong()), + /*index_merge_size=*/message.ByteSizeLong()), &filesystem, &icing_filesystem)); - // Add hits for document 3 and merge. + // Add hits for document 4 and merge. 
DocumentId original_last_added_doc_id = index->last_added_document_id(); index->set_last_added_document_id(original_last_added_doc_id + 1); Index::Editor editor = @@ -1905,7 +2907,7 @@ TEST_F(IcingSearchEngineInitializationTest, ICING_ASSERT_OK(editor.IndexAllBufferedTerms()); ICING_ASSERT_OK(index->Merge()); - // Add hits for document 4 and don't merge. + // Add hits for document 5 and don't merge. index->set_last_added_document_id(original_last_added_doc_id + 2); editor = index->Edit(original_last_added_doc_id + 2, /*section_id=*/0, TermMatchType::EXACT_ONLY, /*namespace_id=*/0); @@ -1916,8 +2918,7 @@ TEST_F(IcingSearchEngineInitializationTest, // 3. Create the index again. This should throw out the lite and main index // and trigger index restoration. { - // Mock filesystem to observe and check the behavior of term index and - // integer index. + // Mock filesystem to observe and check the behavior of all indices. auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) .WillRepeatedly(DoDefault()); @@ -1935,6 +2936,16 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) .Times(0); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). 
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); TestIcingSearchEngine icing( GetDefaultIcingOptions(), std::move(mock_filesystem), @@ -1950,10 +2961,13 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT( initialize_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); // Verify term index works normally SearchSpecProto search_spec1; - search_spec1.set_query("consectetur"); + search_spec1.set_query("body:consectetur"); search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY); SearchResultProto results1 = icing.Search(search_spec1, ScoringSpecProto::default_instance(), @@ -1962,9 +2976,9 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT(results1.next_page_token(), Eq(0)); // Only the first document should be retrievable. 
ASSERT_THAT(results1.results(), SizeIs(3)); - EXPECT_THAT(results1.results(0).document().uri(), Eq("fake_type/2")); - EXPECT_THAT(results1.results(1).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results1.results(2).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1")); // Verify integer index works normally SearchSpecProto search_spec2; @@ -1977,12 +2991,46 @@ TEST_F(IcingSearchEngineInitializationTest, icing.Search(search_spec2, ScoringSpecProto::default_instance(), ResultSpecProto::default_instance()); ASSERT_THAT(results2.results(), SizeIs(3)); - EXPECT_THAT(results2.results(0).document().uri(), Eq("fake_type/2")); - EXPECT_THAT(results2.results(1).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results2.results(2).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1")); + + // Verify qualified id join index works normally: join a query for + // `name:person` with a child query for `body:consectetur` based on the + // child's `senderQualifiedId` field. 
+ SearchSpecProto search_spec3; + search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec3.set_query("name:person"); + JoinSpecProto* join_spec = search_spec3.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:consectetur"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto results3 = + icing.Search(search_spec3, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results3.results(), SizeIs(1)); + EXPECT_THAT(results3.results(0).document().uri(), Eq("person")); + EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3)); + EXPECT_THAT(results3.results(0).joined_results(0).document().uri(), + Eq("message/3")); + EXPECT_THAT(results3.results(0).joined_results(1).document().uri(), + Eq("message/2")); + EXPECT_THAT(results3.results(0).joined_results(2).document().uri(), + Eq("message/1")); } - // 4. Since document 3, 4 don't exist, testing queries = "foo", "bar" are not + // 4. Since document 4, 5 don't exist, testing queries = "foo", "bar" are not // enough to verify the correctness of term index restoration. Instead, we // have to check hits for "foo", "bar" should not be found in term index. { @@ -2018,6 +3066,7 @@ TEST_F(IcingSearchEngineInitializationTest, // - Index directory handling: // - Term index directory should be unaffected. 
// - Integer index directory should be unaffected. + // - Qualified id join index directory should be unaffected. // - Truncate indices: // - "TruncateTo()" for term index shouldn't take effect. // - "Clear()" should be called for integer index and throw out all integer @@ -2025,6 +3074,8 @@ TEST_F(IcingSearchEngineInitializationTest, // "*/integer_index_dir/*") should be discarded. This should be sufficient // to make integer index consistent with document store (in this case, // document store is empty as well), so reindexing should not take place. + // - "Clear()" should be called for qualified id join index. It is a special + // case when document store has no document. // 1. Create an index with no document. { @@ -2043,6 +3094,7 @@ TEST_F(IcingSearchEngineInitializationTest, // - Document store: [] // - Term index: [] // - Integer index: [0] + // - Qualified id join index: [] { Filesystem filesystem; ICING_ASSERT_OK_AND_ASSIGN( @@ -2060,8 +3112,7 @@ TEST_F(IcingSearchEngineInitializationTest, // 3. Create the index again. This should trigger index restoration. { - // Mock filesystem to observe and check the behavior of term index and - // integer index. + // Mock filesystem to observe and check the behavior of all indices. auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) .WillRepeatedly(DoDefault()); @@ -2078,6 +3129,16 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) .Times(1); + // Ensure qualified id join index directory should never be discarded. + // Clear() was called and should discard and reinitialize the underlying + // mapper. 
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(AtLeast(1)); TestIcingSearchEngine icing( GetDefaultIcingOptions(), std::move(mock_filesystem), @@ -2092,23 +3153,36 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT( initialize_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); - } + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); - // 4. Since document 0 doesn't exist, testing numeric query - // "indexableInteger == 123" is not enough to verify the correctness of - // integer index restoration. Instead, we have to check hits for 123 should - // not be found in integer index. - { - Filesystem filesystem; - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<IntegerIndex> integer_index, - IntegerIndex::Create(filesystem, GetIntegerIndexDir())); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter, - integer_index->GetIterator(/*property_path=*/"indexableInteger", - /*key_lower=*/123, /*key_upper=*/123)); - EXPECT_THAT(doc_hit_info_iter->Advance(), - StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); + // Verify that numeric query safely wiped out the pre-existing hit for + // 'indexableInteger' == 123. Add a new document without that value for + // 'indexableInteger' that will take docid=0. If the integer index was not + // rebuilt correctly, then it will still have the previously added hit for + // 'indexableInteger' == 123 for docid 0 and incorrectly return this new + // doc in a query. 
+ DocumentProto another_message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddInt64Property("indexableInteger", 456) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + EXPECT_THAT(icing.Put(another_message).status(), ProtoIsOk()); + // Verify integer index works normally + SearchSpecProto search_spec; + search_spec.set_query("indexableInteger == 123"); + search_spec.set_search_type( + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY); + search_spec.add_enabled_features(std::string(kNumericSearchFeature)); + + SearchResultProto results = + icing.Search(search_spec, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.results(), IsEmpty()); } } @@ -2120,6 +3194,7 @@ TEST_F(IcingSearchEngineInitializationTest, // - Index directory handling: // - Term index directory should be unaffected. // - Integer index directory should be unaffected. + // - Qualified id join index directory should be unaffected. // - Truncate indices: // - "TruncateTo()" for term index shouldn't take effect. // - "Clear()" should be called for integer index and throw out all integer @@ -2127,13 +3202,53 @@ TEST_F(IcingSearchEngineInitializationTest, // "*/integer_index_dir/*") should be discarded. However, some valid data // in integer index were discarded together, so reindexing should still // take place to recover them after clearing. - DocumentProto document = DocumentBuilder() - .SetKey("icing", "fake_type/0") - .SetSchema("Message") - .AddStringProperty("body", kIpsumText) - .AddInt64Property("indexableInteger", 123) - .Build(); - // 1. Create an index with 3 documents. + // - "Clear()" shouldn't be called for qualified id join index, i.e. no + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. 
+ + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // 1. Create an index with message 3 documents. 
{ TestIcingSearchEngine icing( GetDefaultIcingOptions(), std::make_unique<Filesystem>(), @@ -2141,27 +3256,29 @@ TEST_F(IcingSearchEngineInitializationTest, GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); - document = DocumentBuilder(document).SetUri("fake_type/1").Build(); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); - document = DocumentBuilder(document).SetUri("fake_type/2").Build(); - EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/2").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/3").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); } // 2. Manually add some data into integer index and increment // last_added_document_id. This will cause mismatched document id with // document store. - // - Document store: [0, 1, 2] - // - Term index: [0, 1, 2] - // - Integer index: [0, 1, 2, 3] + // - Document store: [0, 1, 2, 3] + // - Term index: [0, 1, 2, 3] + // - Integer index: [0, 1, 2, 3, 4] + // - Qualified id join index: [0, 1, 2, 3] { Filesystem filesystem; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<IntegerIndex> integer_index, IntegerIndex::Create(filesystem, GetIntegerIndexDir())); - // Add hits for document 3. + // Add hits for document 4. DocumentId original_last_added_doc_id = integer_index->last_added_document_id(); integer_index->set_last_added_document_id(original_last_added_doc_id + 1); @@ -2174,8 +3291,7 @@ TEST_F(IcingSearchEngineInitializationTest, // 3. Create the index again. This should trigger index restoration. 
{ - // Mock filesystem to observe and check the behavior of term index and - // integer index. + // Mock filesystem to observe and check the behavior of all indices. auto mock_filesystem = std::make_unique<MockFilesystem>(); EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) .WillRepeatedly(DoDefault()); @@ -2192,6 +3308,16 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) .Times(1); + // Ensure qualified id join index directory should never be discarded, and + // Clear() should never be called (i.e. storage sub directory + // "*/qualified_id_join_index_dir/*" should never be discarded). + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(0); TestIcingSearchEngine icing( GetDefaultIcingOptions(), std::move(mock_filesystem), @@ -2204,10 +3330,13 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT( initialize_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); // Verify term index works normally SearchSpecProto search_spec1; - search_spec1.set_query("consectetur"); + search_spec1.set_query("body:consectetur"); search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY); SearchResultProto results1 = icing.Search(search_spec1, ScoringSpecProto::default_instance(), @@ -2216,9 +3345,9 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT(results1.next_page_token(), Eq(0)); // All documents should be retrievable. 
ASSERT_THAT(results1.results(), SizeIs(3)); - EXPECT_THAT(results1.results(0).document().uri(), Eq("fake_type/2")); - EXPECT_THAT(results1.results(1).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results1.results(2).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1")); // Verify integer index works normally SearchSpecProto search_spec2; @@ -2231,26 +3360,425 @@ TEST_F(IcingSearchEngineInitializationTest, icing.Search(search_spec2, ScoringSpecProto::default_instance(), ResultSpecProto::default_instance()); ASSERT_THAT(results2.results(), SizeIs(3)); - EXPECT_THAT(results2.results(0).document().uri(), Eq("fake_type/2")); - EXPECT_THAT(results2.results(1).document().uri(), Eq("fake_type/1")); - EXPECT_THAT(results2.results(2).document().uri(), Eq("fake_type/0")); + EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1")); + + // Verify qualified id join index works normally: join a query for + // `name:person` with a child query for `body:consectetur` based on the + // child's `senderQualifiedId` field. 
+ SearchSpecProto search_spec3; + search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec3.set_query("name:person"); + JoinSpecProto* join_spec = search_spec3.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:consectetur"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto results3 = + icing.Search(search_spec3, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results3.results(), SizeIs(1)); + EXPECT_THAT(results3.results(0).document().uri(), Eq("person")); + EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3)); + EXPECT_THAT(results3.results(0).joined_results(0).document().uri(), + Eq("message/3")); + EXPECT_THAT(results3.results(0).joined_results(1).document().uri(), + Eq("message/2")); + EXPECT_THAT(results3.results(0).joined_results(2).document().uri(), + Eq("message/1")); + + // Verify that numeric index safely wiped out the pre-existing hit for + // 'indexableInteger' == 456. Add a new document without that value for + // 'indexableInteger' that will take docid=0. If the integer index was not + // rebuilt correctly, then it will still have the previously added hit for + // 'indexableInteger' == 456 for docid 0 and incorrectly return this new + // doc in a query. 
+ DocumentProto another_message = + DocumentBuilder() + .SetKey("namespace", "message/4") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + EXPECT_THAT(icing.Put(another_message).status(), ProtoIsOk()); + // Verify integer index works normally + SearchSpecProto search_spec; + search_spec.set_query("indexableInteger == 456"); + search_spec.set_search_type( + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY); + search_spec.add_enabled_features(std::string(kNumericSearchFeature)); + + SearchResultProto results = + icing.Search(search_spec, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results.results(), IsEmpty()); + } +} + +TEST_F(IcingSearchEngineInitializationTest, + RestoreIndexTruncateQualifiedIdJoinIndexWithoutReindexing) { + // Test the following scenario: qualified id join index is *completely* ahead + // of document store. IcingSearchEngine should be able to recover qualified id + // join index. Several additional behaviors are also tested: + // - Index directory handling: + // - Term index directory should be unaffected. + // - Integer index directory should be unaffected. + // - Qualified id join index directory should be unaffected. + // - Truncate indices: + // - "TruncateTo()" for term index shouldn't take effect. + // - "Clear()" should be called for integer index. It is a special case when + // document store has no document. Since there is no integer index storage + // sub directories (path_expr = "*/integer_index_dir/*"), nothing will be + // discarded. + // - "Clear()" should be called for qualified id join index and throw out + // all data, i.e. discarding the underlying mapper (path_expr = + // "*/qualified_id_join_index_dir/*") and reinitialize. 
This should be + // sufficient to make qualified id join index consistent with document + // store (in this case, document store is empty as well), so reindexing + // should not take place. + + // 1. Create an index with no document. + { + TestIcingSearchEngine icing( + GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), + GetTestJniCache()); + + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); } - // 4. Since document 3 doesn't exist, testing numeric query - // "indexableInteger == 456" is not enough to verify the correctness of - // integer index restoration. Instead, we have to check hits for 456 should - // not be found in integer index. + // 2. Manually add some data into integer index and increment + // last_added_document_id. This will cause mismatched document id with + // document store. + // - Document store: [] + // - Term index: [] + // - Integer index: [] + // - Qualified id join index: [0] { Filesystem filesystem; ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<IntegerIndex> integer_index, - IntegerIndex::Create(filesystem, GetIntegerIndexDir())); + std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index, + QualifiedIdTypeJoinableIndex::Create(filesystem, + GetQualifiedIdJoinIndexDir())); + // Add data for document 0. + ASSERT_THAT(qualified_id_join_index->last_added_document_id(), + kInvalidDocumentId); + qualified_id_join_index->set_last_added_document_id(0); + ICING_ASSERT_OK(qualified_id_join_index->Put( + DocJoinInfo(/*document_id=*/0, /*joinable_property_id=*/0), + /*ref_qualified_id_str=*/"namespace#person")); + } + + // 3. Create the index again. This should trigger index restoration. + { + // Mock filesystem to observe and check the behavior of all indices. 
+ auto mock_filesystem = std::make_unique<MockFilesystem>(); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) + .WillRepeatedly(DoDefault()); + // Ensure term index directory should never be discarded. + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/index_dir"))) + .Times(0); + // Ensure integer index directory should never be discarded. Even though + // Clear() was called, it shouldn't take effect since there is no storage + // sub directory ("*/integer_index_dir/*") and nothing will be discarded. + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/integer_index_dir"))) + .Times(0); + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) + .Times(0); + // Ensure qualified id join index directory should never be discarded. + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + // Clear() should be called to truncate qualified id join index and thus + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. + EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(AtLeast(1)); + + TestIcingSearchEngine icing( + GetDefaultIcingOptions(), std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), + GetTestJniCache()); + InitializeResultProto initialize_result = icing.Initialize(); + ASSERT_THAT(initialize_result.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result.initialize_stats().integer_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + // Since truncating qualified id join index is sufficient to make it + // consistent with document store, replaying documents or reindexing + // shouldn't take place. 
+ EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + } + + // 4. Since document 0 doesn't exist, testing join query is not enough to + // verify the correctness of qualified id join index restoration. Instead, we + // have to check the previously added data should not be found in qualified id + // join index. + { + Filesystem filesystem; ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter, - integer_index->GetIterator(/*property_path=*/"indexableInteger", - /*key_lower=*/456, /*key_upper=*/456)); - EXPECT_THAT(doc_hit_info_iter->Advance(), - StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); + std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index, + QualifiedIdTypeJoinableIndex::Create(filesystem, + GetQualifiedIdJoinIndexDir())); + EXPECT_THAT(qualified_id_join_index->Get( + DocJoinInfo(/*document_id=*/0, /*joinable_property_id=*/0)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + } +} + +TEST_F(IcingSearchEngineInitializationTest, + RestoreIndexTruncateQualifiedIdJoinIndexWithReindexing) { + // Test the following scenario: qualified id join index is *partially* ahead + // of document store. IcingSearchEngine should be able to recover qualified id + // join index. Several additional behaviors are also tested: + // - Index directory handling: + // - Term index directory should be unaffected. + // - Integer index directory should be unaffected. + // - Qualified id join index directory should be unaffected. + // - Truncate indices: + // - "TruncateTo()" for term index shouldn't take effect. + // - "Clear()" shouldn't be called for integer index, i.e. no integer index + // storage sub directories (path_expr = "*/integer_index_dir/*") should be + // discarded. + // - "Clear()" should be called for qualified id join index and throw out + // all data, i.e. 
discarding the underlying mapper (path_expr = + // "*/qualified_id_join_index_dir/*") and reinitialize. However, some + // valid data in qualified id join index were discarded together, so + // reindexing should still take place to recover them after clearing. + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // 1. Create an index with message 3 documents. 
+ { + TestIcingSearchEngine icing( + GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), + GetTestJniCache()); + + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/2").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + message = DocumentBuilder(message).SetUri("message/3").Build(); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + } + + DocJoinInfo additional_data_key; + // 2. Manually add some data into qualified id join index and increment + // last_added_document_id. This will cause mismatched document id with + // document store. + // - Document store: [0, 1, 2, 3] + // - Term index: [0, 1, 2, 3] + // - Integer index: [0, 1, 2, 3] + // - Qualified id join index: [0, 1, 2, 3, 4] + { + Filesystem filesystem; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index, + QualifiedIdTypeJoinableIndex::Create(filesystem, + GetQualifiedIdJoinIndexDir())); + // Add data for document 4. + DocumentId original_last_added_doc_id = + qualified_id_join_index->last_added_document_id(); + qualified_id_join_index->set_last_added_document_id( + original_last_added_doc_id + 1); + additional_data_key = + DocJoinInfo(/*document_id=*/original_last_added_doc_id + 1, + /*joinable_property_id=*/0); + ICING_ASSERT_OK(qualified_id_join_index->Put( + additional_data_key, + /*ref_qualified_id_str=*/"namespace#person")); + } + + // 3. Create the index again. This should trigger index restoration. + { + // Mock filesystem to observe and check the behavior of all indices. 
+ auto mock_filesystem = std::make_unique<MockFilesystem>(); + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_)) + .WillRepeatedly(DoDefault()); + // Ensure term index directory should never be discarded. + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/index_dir"))) + .Times(0); + // Ensure integer index directory should never be discarded, and Clear() + // should never be called (i.e. storage sub directory + // "*/integer_index_dir/*" should never be discarded). + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(EndsWith("/integer_index_dir"))) + .Times(0); + EXPECT_CALL(*mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/"))) + .Times(0); + // Ensure qualified id join index directory should never be discarded. + EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively( + EndsWith("/qualified_id_join_index_dir"))) + .Times(0); + // Clear() should be called to truncate qualified id join index and thus + // underlying storage sub directory (path_expr = + // "*/qualified_id_join_index_dir/*") should be discarded. 
+ EXPECT_CALL( + *mock_filesystem, + DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/"))) + .Times(AtLeast(1)); + + TestIcingSearchEngine icing( + GetDefaultIcingOptions(), std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(), + GetTestJniCache()); + InitializeResultProto initialize_result = icing.Initialize(); + ASSERT_THAT(initialize_result.status(), ProtoIsOk()); + EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result.initialize_stats().integer_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH)); + + // Verify term index works normally + SearchSpecProto search_spec1; + search_spec1.set_query("body:consectetur"); + search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY); + SearchResultProto results1 = + icing.Search(search_spec1, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + EXPECT_THAT(results1.status(), ProtoIsOk()); + EXPECT_THAT(results1.next_page_token(), Eq(0)); + // All documents should be retrievable. 
+ ASSERT_THAT(results1.results(), SizeIs(3)); + EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1")); + + // Verify integer index works normally + SearchSpecProto search_spec2; + search_spec2.set_query("indexableInteger == 123"); + search_spec2.set_search_type( + SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY); + search_spec2.add_enabled_features(std::string(kNumericSearchFeature)); + + SearchResultProto results2 = + icing.Search(search_spec2, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results2.results(), SizeIs(3)); + EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3")); + EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2")); + EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1")); + + // Verify qualified id join index works normally: join a query for + // `name:person` with a child query for `body:consectetur` based on the + // child's `senderQualifiedId` field. + + // Add document 4 without "senderQualifiedId". If joinable index is not + // rebuilt correctly, then it will still have the previously added + // senderQualifiedId for document 4 and include document 4 incorrectly in + // the right side. 
+ DocumentProto another_message = + DocumentBuilder() + .SetKey("namespace", "message/4") + .SetSchema("Message") + .AddStringProperty("body", kIpsumText) + .AddInt64Property("indexableInteger", 123) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + EXPECT_THAT(icing.Put(another_message).status(), ProtoIsOk()); + + SearchSpecProto search_spec3; + search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec3.set_query("name:person"); + JoinSpecProto* join_spec = search_spec3.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:consectetur"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto results3 = + icing.Search(search_spec3, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results3.results(), SizeIs(1)); + EXPECT_THAT(results3.results(0).document().uri(), Eq("person")); + EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3)); + EXPECT_THAT(results3.results(0).joined_results(0).document().uri(), + Eq("message/3")); + EXPECT_THAT(results3.results(0).joined_results(1).document().uri(), + Eq("message/2")); + EXPECT_THAT(results3.results(0).joined_results(2).document().uri(), + Eq("message/1")); } } @@ -2308,6 +3836,9 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT( init_result.initialize_stats().integer_index_restoration_cause(), 
Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); } } @@ -2332,14 +3863,21 @@ TEST_F(IcingSearchEngineInitializationTest, .AddProperty(PropertyConfigBuilder() .SetName("indexableInteger") .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); - // Set a schema for a single type that has no indexed contents. + // Set a schema for a single type that has no term, integer, join indexed + // contents. ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); // Add a document that contains: // - No valid indexed string content - just punctuation // - No integer content - since it is an optional property + // - No qualified id content - since it is an optional property DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/0") .SetSchema("Message") @@ -2364,6 +3902,9 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT( init_result.initialize_stats().integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); } } @@ -2454,6 +3995,9 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT(initialize_result_proto.initialize_stats() .integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); EXPECT_THAT( initialize_result_proto.initialize_stats().index_restoration_latency_ms(), Eq(0)); @@ -2525,6 +4069,9 @@ TEST_F(IcingSearchEngineInitializationTest, .integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); 
EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), Eq(0)); EXPECT_THAT(initialize_result_proto.initialize_stats() @@ -2616,6 +4163,9 @@ TEST_F(IcingSearchEngineInitializationTest, .integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), Eq(0)); EXPECT_THAT(initialize_result_proto.initialize_stats() @@ -2667,6 +4217,9 @@ TEST_F(IcingSearchEngineInitializationTest, .integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), Eq(10)); EXPECT_THAT(initialize_result_proto.initialize_stats() @@ -2728,6 +4281,111 @@ TEST_F( .integer_index_restoration_cause(), Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH)); EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .index_restoration_latency_ms(), + Eq(10)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .document_store_recovery_latency_ms(), + Eq(0)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .schema_store_recovery_cause(), + 
Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .schema_store_recovery_latency_ms(), + Eq(0)); + } +} + +TEST_F( + IcingSearchEngineInitializationTest, + InitializeShouldLogRecoveryCauseQualifiedIdJoinIndexInconsistentWithGroundTruth) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + { + // Initialize and put documents. 
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + } + + { + // Delete the qualified id join index file to trigger RestoreIndexIfNeeded. + std::string qualified_id_join_index_dir = GetQualifiedIdJoinIndexDir(); + filesystem()->DeleteDirectoryRecursively( + qualified_id_join_index_dir.c_str()); + } + + { + // Index is empty but ground truth is not. Index should be restored due to + // the inconsistency. + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(10); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .integer_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), Eq(10)); EXPECT_THAT(initialize_result_proto.initialize_stats() @@ -2808,6 +4466,9 @@ TEST_F(IcingSearchEngineInitializationTest, .integer_index_restoration_cause(), Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC)); EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC)); + 
EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), Eq(10)); EXPECT_THAT(initialize_result_proto.initialize_stats() @@ -2844,6 +4505,9 @@ TEST_F(IcingSearchEngineInitializationTest, .integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), Eq(0)); EXPECT_THAT(initialize_result_proto.initialize_stats() @@ -2906,6 +4570,9 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT(initialize_result_proto.initialize_stats() .integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); EXPECT_THAT( initialize_result_proto.initialize_stats().index_restoration_latency_ms(), Eq(10)); @@ -2966,6 +4633,113 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT(initialize_result_proto.initialize_stats() .integer_index_restoration_cause(), Eq(InitializeStatsProto::IO_ERROR)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_latency_ms(), + Eq(10)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .document_store_recovery_latency_ms(), + Eq(0)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + 
.schema_store_recovery_latency_ms(), + Eq(0)); +} + +TEST_F(IcingSearchEngineInitializationTest, + InitializeShouldLogRecoveryCauseQualifiedIdJoinIndexIOError) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message/1") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + { + // Initialize and put documents. 
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(person).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(message).status(), ProtoIsOk()); + } + + std::string qualified_id_join_index_metadata_file = + absl_ports::StrCat(GetQualifiedIdJoinIndexDir(), "/metadata"); + auto mock_filesystem = std::make_unique<MockFilesystem>(); + EXPECT_CALL(*mock_filesystem, PRead(A<const char*>(), _, _, _)) + .WillRepeatedly(DoDefault()); + // This fails QualifiedIdTypeJoinableIndex::Create() once. + EXPECT_CALL( + *mock_filesystem, + PRead(Matcher<const char*>(Eq(qualified_id_join_index_metadata_file)), _, + _, _)) + .WillOnce(Return(false)) + .WillRepeatedly(DoDefault()); + + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(10); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); + + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .integer_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::IO_ERROR)); EXPECT_THAT( initialize_result_proto.initialize_stats().index_restoration_latency_ms(), Eq(10)); @@ -3038,6 +4812,9 @@ TEST_F(IcingSearchEngineInitializationTest, EXPECT_THAT(initialize_result_proto.initialize_stats() .integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + 
.qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); EXPECT_THAT( initialize_result_proto.initialize_stats().index_restoration_latency_ms(), Eq(0)); @@ -3095,6 +4872,9 @@ TEST_F(IcingSearchEngineInitializationTest, .integer_index_restoration_cause(), Eq(InitializeStatsProto::NONE)); EXPECT_THAT(initialize_result_proto.initialize_stats() + .qualified_id_join_index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), Eq(0)); } diff --git a/icing/icing-search-engine_optimize_test.cc b/icing/icing-search-engine_optimize_test.cc index b2c7a62..0c5cb7a 100644 --- a/icing/icing-search-engine_optimize_test.cc +++ b/icing/icing-search-engine_optimize_test.cc @@ -28,6 +28,7 @@ #include "icing/file/mock-filesystem.h" #include "icing/icing-search-engine.h" #include "icing/jni/jni-cache.h" +#include "icing/join/join-processor.h" #include "icing/portable/endian.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" @@ -123,46 +124,46 @@ IcingSearchEngineOptions GetDefaultIcingOptions() { return icing_options; } -DocumentProto CreateMessageDocument(std::string name_space, std::string uri) { - return DocumentBuilder() - .SetKey(std::move(name_space), std::move(uri)) - .SetSchema("Message") - .AddStringProperty("body", "message body") - .AddInt64Property("indexableInteger", 123) - .SetCreationTimestampMs(kDefaultCreationTimestampMs) - .Build(); -} - -SchemaProto CreateMessageSchema() { - return SchemaBuilder() - .AddType(SchemaTypeConfigBuilder() - .SetType("Message") - .AddProperty(PropertyConfigBuilder() - .SetName("body") - .SetDataTypeString(TERM_MATCH_PREFIX, - TOKENIZER_PLAIN) - .SetCardinality(CARDINALITY_REQUIRED)) - .AddProperty(PropertyConfigBuilder() - .SetName("indexableInteger") - .SetDataTypeInt64(NUMERIC_MATCH_RANGE) - .SetCardinality(CARDINALITY_REQUIRED))) - .Build(); -} - ScoringSpecProto GetDefaultScoringSpec() { 
ScoringSpecProto scoring_spec; scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); return scoring_spec; } +// TODO(b/272145329): create SearchSpecBuilder, JoinSpecBuilder, +// SearchResultProtoBuilder and ResultProtoBuilder for unit tests and build all +// instances by them. + TEST_F(IcingSearchEngineOptimizeTest, AllPageTokensShouldBeInvalidatedAfterOptimization) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body two") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); - DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); - DocumentProto document2 = CreateMessageDocument("namespace", "uri2"); ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); @@ -205,9 +206,24 @@ TEST_F(IcingSearchEngineOptimizeTest, } TEST_F(IcingSearchEngineOptimizeTest, OptimizationShouldRemoveDeletedDocs) { - IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + 
.SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); - DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); + IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); GetResultProto expected_get_result_proto; expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND); @@ -216,7 +232,7 @@ TEST_F(IcingSearchEngineOptimizeTest, OptimizationShouldRemoveDeletedDocs) { { IcingSearchEngine icing(icing_options, GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); // Deletes document1 @@ -247,10 +263,19 @@ TEST_F(IcingSearchEngineOptimizeTest, OptimizationShouldRemoveDeletedDocs) { TEST_F(IcingSearchEngineOptimizeTest, OptimizationShouldDeleteTemporaryDirectory) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); IcingSearchEngine icing(icing_options, GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); // Create a tmp dir that will be used in Optimize() to swap files, // this validates that any tmp dirs will be deleted before using. 
@@ -271,12 +296,26 @@ TEST_F(IcingSearchEngineOptimizeTest, } TEST_F(IcingSearchEngineOptimizeTest, GetOptimizeInfoHasCorrectStats) { - DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); DocumentProto document2 = DocumentBuilder() .SetKey("namespace", "uri2") .SetSchema("Message") - .AddStringProperty("body", "message body") - .AddInt64Property("indexableInteger", 456) + .AddStringProperty("body", "message body two") .SetCreationTimestampMs(100) .SetTtlMs(500) .Build(); @@ -298,7 +337,7 @@ TEST_F(IcingSearchEngineOptimizeTest, GetOptimizeInfoHasCorrectStats) { EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); EXPECT_THAT(optimize_info.time_since_last_optimize_ms(), Eq(0)); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); // Only have active documents, nothing is optimizable yet. 
@@ -356,11 +395,50 @@ TEST_F(IcingSearchEngineOptimizeTest, GetOptimizeInfoHasCorrectStats) { } TEST_F(IcingSearchEngineOptimizeTest, GetAndPutShouldWorkAfterOptimization) { - DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); - DocumentProto document2 = CreateMessageDocument("namespace", "uri2"); - DocumentProto document3 = CreateMessageDocument("namespace", "uri3"); - DocumentProto document4 = CreateMessageDocument("namespace", "uri4"); - DocumentProto document5 = CreateMessageDocument("namespace", "uri5"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body two") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document3 = + DocumentBuilder() + .SetKey("namespace", "uri3") + .SetSchema("Message") + .AddStringProperty("body", "message body three") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document4 = + DocumentBuilder() + .SetKey("namespace", "uri4") + .SetSchema("Message") + .AddStringProperty("body", "message body four") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document5 = + DocumentBuilder() + .SetKey("namespace", "uri5") + .SetSchema("Message") + .AddStringProperty("body", "message body five") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); GetResultProto expected_get_result_proto; 
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); @@ -368,7 +446,7 @@ TEST_F(IcingSearchEngineOptimizeTest, GetAndPutShouldWorkAfterOptimization) { { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); @@ -474,12 +552,34 @@ TEST_F(IcingSearchEngineOptimizeTest, } TEST_F(IcingSearchEngineOptimizeTest, DeleteShouldWorkAfterOptimization) { - DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); - DocumentProto document2 = CreateMessageDocument("namespace", "uri2"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body two") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); ASSERT_THAT(icing.Optimize().status(), ProtoIsOk()); @@ -557,13 +657,14 @@ 
TEST_F(IcingSearchEngineOptimizeTest, OptimizationFailureUninitializesIcing) { ASSERT_THAT(icing.Optimize().status(), ProtoStatusIs(StatusProto::INTERNAL)); // Ordinary operations should fail safely. - SchemaProto simple_schema; - auto type = simple_schema.add_types(); - type->set_schema_type("type0"); - auto property = type->add_properties(); - property->set_property_name("prop0"); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + SchemaProto simple_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("type0").AddProperty( + PropertyConfigBuilder() + .SetName("prop0") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); DocumentProto simple_doc = DocumentBuilder() .SetKey("namespace0", "uri0") @@ -606,27 +707,30 @@ TEST_F(IcingSearchEngineOptimizeTest, OptimizationFailureUninitializesIcing) { TEST_F(IcingSearchEngineOptimizeTest, SetSchemaShouldWorkAfterOptimization) { // Creates 3 test schemas - SchemaProto schema1 = SchemaProto(CreateMessageSchema()); + SchemaProto schema1 = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); SchemaProto schema2 = SchemaProto(schema1); - auto new_property2 = schema2.mutable_types(0)->add_properties(); - new_property2->set_property_name("property2"); - new_property2->set_data_type(PropertyConfigProto::DataType::STRING); - new_property2->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - new_property2->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - new_property2->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + *schema2.mutable_types(0)->add_properties() = + PropertyConfigBuilder() + .SetName("property2") + 
.SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL) + .Build(); SchemaProto schema3 = SchemaProto(schema2); - auto new_property3 = schema3.mutable_types(0)->add_properties(); - new_property3->set_property_name("property3"); - new_property3->set_data_type(PropertyConfigProto::DataType::STRING); - new_property3->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - new_property3->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - new_property3->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + *schema3.mutable_types(0)->add_properties() = + PropertyConfigBuilder() + .SetName("property3") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL) + .Build(); { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); @@ -644,7 +748,29 @@ TEST_F(IcingSearchEngineOptimizeTest, SetSchemaShouldWorkAfterOptimization) { } TEST_F(IcingSearchEngineOptimizeTest, SearchShouldWorkAfterOptimization) { - DocumentProto document = CreateMessageDocument("namespace", "uri"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto document = + DocumentBuilder() + .SetKey("namespace", "uri") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .AddInt64Property("indexableInteger", 123) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); SearchSpecProto search_spec1; search_spec1.set_term_match_type(TermMatchType::PREFIX); @@ -664,7 +790,7 @@ TEST_F(IcingSearchEngineOptimizeTest, 
SearchShouldWorkAfterOptimization) { { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); ASSERT_THAT(icing.Optimize().status(), ProtoIsOk()); @@ -703,14 +829,308 @@ TEST_F(IcingSearchEngineOptimizeTest, SearchShouldWorkAfterOptimization) { } TEST_F(IcingSearchEngineOptimizeTest, + JoinShouldWorkAfterOptimizationDeleteParent) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person1 = + DocumentBuilder() + .SetKey("namespace", "person1") + .SetSchema("Person") + .AddStringProperty("name", "person one") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto person2 = + DocumentBuilder() + .SetKey("namespace", "person2") + .SetSchema("Person") + .AddStringProperty("name", "person two") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + DocumentProto message1 = + DocumentBuilder() + .SetKey("namespace", "message1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .AddStringProperty("senderQualifiedId", "namespace#person1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message2 
= + DocumentBuilder() + .SetKey("namespace", "message2") + .SetSchema("Message") + .AddStringProperty("body", "message body two") + .AddStringProperty("senderQualifiedId", "namespace#person1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message3 = + DocumentBuilder() + .SetKey("namespace", "message3") + .SetSchema("Message") + .AddStringProperty("body", "message body three") + .AddStringProperty("senderQualifiedId", "namespace#person2") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // Prepare join search spec to join a query for `name:person` with a child + // query for `body:message` based on the child's `senderQualifiedId` field. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_query("name:person"); + JoinSpecProto* join_spec = search_spec.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:message"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + // Person1 is going to be deleted below. Only person2 which is joined with + // message3 should match the query. 
+ SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto = + expected_search_result_proto.mutable_results()->Add(); + *result_proto->mutable_document() = person2; + *result_proto->mutable_joined_results()->Add()->mutable_document() = message3; + + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(message1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(message2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(message3).status(), ProtoIsOk()); + // Delete parent document: person1 + ASSERT_THAT(icing.Delete("namespace", "person1").status(), ProtoIsOk()); + ASSERT_THAT(icing.Optimize().status(), ProtoIsOk()); + + // Validates that join search query works right after Optimize() + SearchResultProto search_result_proto = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + } // Destroys IcingSearchEngine to make sure nothing is cached. 
+ + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); + + SearchResultProto search_result_proto = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineOptimizeTest, + JoinShouldWorkAfterOptimizationDeleteChild) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person1 = + DocumentBuilder() + .SetKey("namespace", "person1") + .SetSchema("Person") + .AddStringProperty("name", "person one") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto person2 = + DocumentBuilder() + .SetKey("namespace", "person2") + .SetSchema("Person") + .AddStringProperty("name", "person two") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + DocumentProto message1 = + DocumentBuilder() + .SetKey("namespace", "message1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .AddStringProperty("senderQualifiedId", "namespace#person1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message2 = + DocumentBuilder() + .SetKey("namespace", "message2") + .SetSchema("Message") + .AddStringProperty("body", "message body two") 
+ .AddStringProperty("senderQualifiedId", "namespace#person1") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto message3 = + DocumentBuilder() + .SetKey("namespace", "message3") + .SetSchema("Message") + .AddStringProperty("body", "message body three") + .AddStringProperty("senderQualifiedId", "namespace#person2") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + // Prepare join search spec to join a query for `name:person` with a child + // query for `body:message` based on the child's `senderQualifiedId` field. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_query("name:person"); + JoinSpecProto* join_spec = search_spec.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:message"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + // Message1 and message3 are going to be deleted below. Both person1 and + // person2 should be included even though person2 has no child (since we're + // doing left join). 
+ SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto1 = + expected_search_result_proto.mutable_results()->Add(); + *result_proto1->mutable_document() = person1; + *result_proto1->mutable_joined_results()->Add()->mutable_document() = + message2; + SearchResultProto::ResultProto* result_proto2 = + expected_search_result_proto.mutable_results()->Add(); + *result_proto2->mutable_document() = person2; + + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(message1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(message2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(message3).status(), ProtoIsOk()); + // Delete child documents: message1 and message3 + ASSERT_THAT(icing.Delete("namespace", "message1").status(), ProtoIsOk()); + ASSERT_THAT(icing.Delete("namespace", "message3").status(), ProtoIsOk()); + ASSERT_THAT(icing.Optimize().status(), ProtoIsOk()); + + // Validates that join search query works right after Optimize() + SearchResultProto search_result_proto = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + } // Destroys IcingSearchEngine to make sure nothing is cached. 
+ + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); + + SearchResultProto search_result_proto = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineOptimizeTest, IcingShouldWorkFineIfOptimizationIsAborted) { - DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto person = + DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .AddStringProperty("name", "person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + DocumentProto message1 = + DocumentBuilder() + .SetKey("namespace", "message1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); { // Initializes a normal icing to create files needed IcingSearchEngine icing(GetDefaultIcingOptions(), 
GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(person).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(message1).status(), ProtoIsOk()); } // Creates a mock filesystem in which DeleteDirectoryRecursively() always @@ -733,25 +1153,33 @@ TEST_F(IcingSearchEngineOptimizeTest, GetResultProto expected_get_result_proto; expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_result_proto.mutable_document() = document1; - EXPECT_THAT( - icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); + *expected_get_result_proto.mutable_document() = message1; + EXPECT_THAT(icing.Get("namespace", "message1", + GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); - DocumentProto document2 = CreateMessageDocument("namespace", "uri2"); + DocumentProto message2 = + DocumentBuilder() + .SetKey("namespace", "message2") + .SetSchema("Message") + .AddStringProperty("body", "message body two") + .AddInt64Property("indexableInteger", 123) + .AddStringProperty("senderQualifiedId", "namespace#person") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); - EXPECT_THAT(icing.Put(document2).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message2).status(), ProtoIsOk()); SearchResultProto expected_search_result_proto; expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - document2; + message2; *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - document1; + message1; // Verify term search SearchSpecProto search_spec1; - search_spec1.set_query("m"); + search_spec1.set_query("body:m"); 
 search_spec1.set_term_match_type(TermMatchType::PREFIX); SearchResultProto search_result_proto1 = @@ -772,10 +1200,68 @@ TEST_F(IcingSearchEngineOptimizeTest, ResultSpecProto::default_instance()); EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores( expected_search_result_proto)); + + // Verify join search: join a query for `name:person` with a child query for + // `body:message` based on the child's `senderQualifiedId` field. + SearchSpecProto search_spec3; + search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec3.set_query("name:person"); + JoinSpecProto* join_spec = search_spec3.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("body:message"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto expected_join_search_result_proto; + expected_join_search_result_proto.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto = + expected_join_search_result_proto.mutable_results()->Add(); + *result_proto->mutable_document() = person; + *result_proto->mutable_joined_results()->Add()->mutable_document() = message2; + *result_proto->mutable_joined_results()->Add()->mutable_document() = message1; + + SearchResultProto search_result_proto3 = + icing.Search(search_spec3, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + 
EXPECT_THAT(search_result_proto3, EqualsSearchResultIgnoreStatsAndScores( + expected_join_search_result_proto)); } TEST_F(IcingSearchEngineOptimizeTest, OptimizationShouldRecoverIfFileDirectoriesAreMissing) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto document = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .AddInt64Property("indexableInteger", 123) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + // Creates a mock filesystem in which SwapFiles() always fails and deletes the // directories. This will fail IcingSearchEngine::OptimizeDocumentStore(). 
auto mock_filesystem = std::make_unique<MockFilesystem>(); @@ -793,9 +1279,8 @@ TEST_F(IcingSearchEngineOptimizeTest, std::make_unique<FakeClock>(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), - ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); // Optimize() fails due to filesystem error OptimizeResultProto result = icing.Optimize(); @@ -873,6 +1358,30 @@ TEST_F(IcingSearchEngineOptimizeTest, TEST_F(IcingSearchEngineOptimizeTest, OptimizationShouldRecoverIfDataFilesAreMissing) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto document = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body") + .AddInt64Property("indexableInteger", 123) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + // Creates a mock filesystem in which SwapFiles() always fails and empties the // directories. This will fail IcingSearchEngine::OptimizeDocumentStore(). 
auto mock_filesystem = std::make_unique<MockFilesystem>(); @@ -892,9 +1401,8 @@ TEST_F(IcingSearchEngineOptimizeTest, std::make_unique<FakeClock>(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), - ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); // Optimize() fails due to filesystem error OptimizeResultProto result = icing.Optimize(); @@ -969,23 +1477,61 @@ TEST_F(IcingSearchEngineOptimizeTest, expected_search_result_proto)); } -TEST_F(IcingSearchEngineOptimizeTest, OptimizeStatsProtoTest) { +TEST_F(IcingSearchEngineOptimizeTest, OptimizeThresholdTest) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .AddInt64Property("indexableInteger", 1) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body two") + .AddInt64Property("indexableInteger", 2) + .SetCreationTimestampMs(9000) + .SetTtlMs(500) + .Build(); + DocumentProto document3 = + DocumentBuilder() + .SetKey("namespace", "uri3") + .SetSchema("Message") + .AddStringProperty("body", "message body three") + .AddInt64Property("indexableInteger", 3) + 
.SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + auto fake_clock = std::make_unique<FakeClock>(); fake_clock->SetTimerElapsedMilliseconds(5); fake_clock->SetSystemTimeMilliseconds(10000); + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + // Set the threshold to 0.9 to test that the threshold works. + options.set_optimize_rebuild_index_threshold(0.9); auto icing = std::make_unique<TestIcingSearchEngine>( - GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + options, std::make_unique<Filesystem>(), std::make_unique<IcingFilesystem>(), std::move(fake_clock), GetTestJniCache()); ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing->SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - - // Create three documents. - DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); - DocumentProto document2 = CreateMessageDocument("namespace", "uri2"); - document2.set_creation_timestamp_ms(9000); - document2.set_ttl_ms(500); - DocumentProto document3 = CreateMessageDocument("namespace", "uri3"); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + // Add three documents. 
ASSERT_THAT(icing->Put(document1).status(), ProtoIsOk()); ASSERT_THAT(icing->Put(document2).status(), ProtoIsOk()); ASSERT_THAT(icing->Put(document3).status(), ProtoIsOk()); @@ -1022,7 +1568,7 @@ TEST_F(IcingSearchEngineOptimizeTest, OptimizeStatsProtoTest) { fake_clock->SetTimerElapsedMilliseconds(5); fake_clock->SetSystemTimeMilliseconds(20000); icing = std::make_unique<TestIcingSearchEngine>( - GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + options, std::make_unique<Filesystem>(), std::make_unique<IcingFilesystem>(), std::move(fake_clock), GetTestJniCache()); ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); @@ -1069,6 +1615,144 @@ TEST_F(IcingSearchEngineOptimizeTest, OptimizeStatsProtoTest) { EXPECT_THAT(result.optimize_stats(), EqualsProto(expected)); } +TEST_F(IcingSearchEngineOptimizeTest, OptimizeStatsProtoTest) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("indexableInteger") + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", "message body one") + .AddInt64Property("indexableInteger", 1) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace", "uri2") + .SetSchema("Message") + .AddStringProperty("body", "message body two") + .AddInt64Property("indexableInteger", 2) + .SetCreationTimestampMs(9000) + .SetTtlMs(500) + .Build(); + DocumentProto document3 = + DocumentBuilder() + .SetKey("namespace", "uri3") + .SetSchema("Message") + .AddStringProperty("body", "message body three") + .AddInt64Property("indexableInteger", 3) + 
.SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(5); + fake_clock->SetSystemTimeMilliseconds(10000); + // Use the default Icing options, so that a change to the default value will + // require updating this test. + auto icing = std::make_unique<TestIcingSearchEngine>( + GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), std::move(fake_clock), + GetTestJniCache()); + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + // Add three documents. + ASSERT_THAT(icing->Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing->Put(document2).status(), ProtoIsOk()); + ASSERT_THAT(icing->Put(document3).status(), ProtoIsOk()); + + // Delete the first document. + ASSERT_THAT(icing->Delete(document1.namespace_(), document1.uri()).status(), + ProtoIsOk()); + ASSERT_THAT(icing->PersistToDisk(PersistType::FULL).status(), ProtoIsOk()); + + OptimizeStatsProto expected; + expected.set_latency_ms(5); + expected.set_document_store_optimize_latency_ms(5); + expected.set_index_restoration_latency_ms(5); + expected.set_num_original_documents(3); + expected.set_num_deleted_documents(1); + expected.set_num_expired_documents(1); + expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD); + + // Run Optimize + OptimizeResultProto result = icing->Optimize(); + // Depending on how many blocks the documents end up spread across, it's + // possible that Optimize can remove documents without shrinking storage. The + // first Optimize call will also write the OptimizeStatusProto for the first + // time which will take up 1 block. So make sure that before_size is no less + // than after_size - 1 block. 
+ uint32_t page_size = getpagesize(); + EXPECT_THAT(result.optimize_stats().storage_size_before(), + Ge(result.optimize_stats().storage_size_after() - page_size)); + result.mutable_optimize_stats()->clear_storage_size_before(); + result.mutable_optimize_stats()->clear_storage_size_after(); + EXPECT_THAT(result.optimize_stats(), EqualsProto(expected)); + + fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(5); + fake_clock->SetSystemTimeMilliseconds(20000); + // Use the default Icing options, so that a change to the default value will + // require updating this test. + icing = std::make_unique<TestIcingSearchEngine>( + GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), std::move(fake_clock), + GetTestJniCache()); + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + + expected = OptimizeStatsProto(); + expected.set_latency_ms(5); + expected.set_document_store_optimize_latency_ms(5); + expected.set_index_restoration_latency_ms(5); + expected.set_num_original_documents(1); + expected.set_num_deleted_documents(0); + expected.set_num_expired_documents(0); + expected.set_time_since_last_optimize_ms(10000); + expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD); + + // Run Optimize + result = icing->Optimize(); + EXPECT_THAT(result.optimize_stats().storage_size_before(), + Eq(result.optimize_stats().storage_size_after())); + result.mutable_optimize_stats()->clear_storage_size_before(); + result.mutable_optimize_stats()->clear_storage_size_after(); + EXPECT_THAT(result.optimize_stats(), EqualsProto(expected)); + + // Delete the last document. 
+ ASSERT_THAT(icing->Delete(document3.namespace_(), document3.uri()).status(), + ProtoIsOk()); + + expected = OptimizeStatsProto(); + expected.set_latency_ms(5); + expected.set_document_store_optimize_latency_ms(5); + expected.set_index_restoration_latency_ms(5); + expected.set_num_original_documents(1); + expected.set_num_deleted_documents(1); + expected.set_num_expired_documents(0); + expected.set_time_since_last_optimize_ms(0); + expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD); + + // Run Optimize + result = icing->Optimize(); + EXPECT_THAT(result.optimize_stats().storage_size_before(), + Ge(result.optimize_stats().storage_size_after())); + result.mutable_optimize_stats()->clear_storage_size_before(); + result.mutable_optimize_stats()->clear_storage_size_after(); + EXPECT_THAT(result.optimize_stats(), EqualsProto(expected)); +} + } // namespace } // namespace lib } // namespace icing diff --git a/icing/icing-search-engine_schema_test.cc b/icing/icing-search-engine_schema_test.cc index 38a0464..7081ba2 100644 --- a/icing/icing-search-engine_schema_test.cc +++ b/icing/icing-search-engine_schema_test.cc @@ -26,6 +26,7 @@ #include "icing/file/mock-filesystem.h" #include "icing/icing-search-engine.h" #include "icing/jni/jni-cache.h" +#include "icing/join/join-processor.h" #include "icing/portable/endian.h" #include "icing/portable/equals-proto.h" #include "icing/portable/platform.h" @@ -153,6 +154,10 @@ ScoringSpecProto GetDefaultScoringSpec() { return scoring_spec; } +// TODO(b/272145329): create SearchSpecBuilder, JoinSpecBuilder, +// SearchResultProtoBuilder and ResultProtoBuilder for unit tests and build all +// instances by them. + TEST_F(IcingSearchEngineSchemaTest, CircularReferenceCreateSectionManagerReturnsInvalidArgument) { // Create a type config with a circular reference. 
@@ -1077,7 +1082,203 @@ TEST_F(IcingSearchEngineSchemaTest, } TEST_F(IcingSearchEngineSchemaTest, - ForceSetSchemaPropertyDeletionTriggersIndexRestorationAndReturnsOk) { + SetSchemaNewJoinablePropertyTriggersIndexRestorationAndReturnsOk) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // Create "Message" schema with 3 properties: + // - "subject": string type, non-joinable. No joinable property id assigned. + // It is indexed and used for searching only. + // - "receiverQualifiedId": string type, non-joinable. No joinable property id + // assigned. + // - "senderQualifiedId": string type, Qualified Id type joinable. Joinable + // property id = 0. + SchemaProto schema_one = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("receiverQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_NONE) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + SetSchemaResultProto set_schema_result = icing.SetSchema(schema_one); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_new_schema_types()->Add("Message"); + expected_set_schema_result.mutable_new_schema_types()->Add("Person"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + DocumentProto person1 = + DocumentBuilder() + .SetKey("namespace", "person1") + .SetSchema("Person") + .AddStringProperty("name", "person one") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto person2 = + DocumentBuilder() + .SetKey("namespace", "person2") + .SetSchema("Person") + .AddStringProperty("name", "person two") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + DocumentProto message = + DocumentBuilder() + .SetKey("namespace", "message1") + .SetSchema("Message") + .AddStringProperty("subject", "message") + .AddStringProperty("receiverQualifiedId", "namespace#person1") + .AddStringProperty("senderQualifiedId", "namespace#person2") + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + EXPECT_THAT(icing.Put(person1).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(person2).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(message).status(), ProtoIsOk()); + + // Verify join search: join a query for `name:person` with a child query for + // `subject:message` based on the child's `receiverQualifiedId` field. + // Since "receiverQualifiedId" is not JOINABLE_VALUE_TYPE_QUALIFIED_ID, + // joining on that property should only return the "left-side" (`name:person`) + // of the join. 
+ SearchSpecProto search_spec_join_by_receiver; + search_spec_join_by_receiver.set_query("name:person"); + search_spec_join_by_receiver.set_term_match_type(TermMatchType::EXACT_ONLY); + JoinSpecProto* join_spec = search_spec_join_by_receiver.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("receiverQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("subject:message"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto expected_empty_child_search_result_proto; + expected_empty_child_search_result_proto.mutable_status()->set_code( + StatusProto::OK); + *expected_empty_child_search_result_proto.mutable_results() + ->Add() + ->mutable_document() = person2; + *expected_empty_child_search_result_proto.mutable_results() + ->Add() + ->mutable_document() = person1; + SearchResultProto actual_results = + icing.Search(search_spec_join_by_receiver, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_empty_child_search_result_proto)); + + // Verify join search: join a query for `name:person` with a child query for + // `subject:message` based on the child's `senderQualifiedId` field. + // Since "senderQualifiedId" is JOINABLE_VALUE_TYPE_QUALIFIED_ID, joining on + // that property should return both "left-side" (`name:person`) and + // "right-side" (`subject:message`) of the join. 
+ SearchSpecProto search_spec_join_by_sender = search_spec_join_by_receiver; + join_spec = search_spec_join_by_sender.mutable_join_spec(); + join_spec->set_child_property_expression("senderQualifiedId"); + + SearchResultProto expected_join_by_sender_search_result_proto; + expected_join_by_sender_search_result_proto.mutable_status()->set_code( + StatusProto::OK); + SearchResultProto::ResultProto* result_proto = + expected_join_by_sender_search_result_proto.mutable_results()->Add(); + *result_proto->mutable_document() = person2; + *result_proto->mutable_joined_results()->Add()->mutable_document() = message; + *expected_join_by_sender_search_result_proto.mutable_results() + ->Add() + ->mutable_document() = person1; + actual_results = + icing.Search(search_spec_join_by_sender, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_join_by_sender_search_result_proto)); + + // Change "Message" schema to: + // - "subject": string type, non-joinable. No joinable property id assigned. + // - "receiverQualifiedId": string type, Qualified Id joinable. Joinable + // property id = 0. + // - "senderQualifiedId": string type, Qualified Id joinable. Joinable + // property id = 1. + SchemaProto schema_two = schema_one; + schema_two.mutable_types(1) + ->mutable_properties(1) + ->mutable_joinable_config() + ->set_value_type(JOINABLE_VALUE_TYPE_QUALIFIED_ID); + // Index restoration should be triggered here because new schema requires more + // joinable properties. Also new joinable property ids will be reassigned and + // index restoration should use new joinable property ids to rebuild. + set_schema_result = icing.SetSchema(schema_two); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_join_incompatible_changed_schema_types() + ->Add("Message"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Verify join search: join a query for `name:person` with a child query for + // `subject:message` based on the child's `receiverQualifiedId` field. + // Since we've changed "receiverQualifiedId" to be + // JOINABLE_VALUE_TYPE_QUALIFIED_ID, joining on that property should return + // should return both "left-side" (`name:person`) and "right-side" + // (`subject:message`) of the join now. + SearchResultProto expected_join_by_receiver_search_result_proto; + expected_join_by_receiver_search_result_proto.mutable_status()->set_code( + StatusProto::OK); + result_proto = + expected_join_by_receiver_search_result_proto.mutable_results()->Add(); + *result_proto->mutable_document() = person1; + *result_proto->mutable_joined_results()->Add()->mutable_document() = message; + *expected_join_by_receiver_search_result_proto.mutable_results() + ->Add() + ->mutable_document() = person2; + actual_results = + icing.Search(search_spec_join_by_receiver, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores( + expected_join_by_receiver_search_result_proto)); + + // Verify join search: join a query for `name:person` with a child query for + // `subject:message` based on the child's `senderQualifiedId` field. We should + // get the same set of result since `senderQualifiedId` is unchanged. 
+ actual_results = + icing.Search(search_spec_join_by_sender, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_join_by_sender_search_result_proto)); +} + +TEST_F( + IcingSearchEngineSchemaTest, + ForceSetSchemaIndexedPropertyDeletionTriggersIndexRestorationAndReturnsOk) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -1221,9 +1422,161 @@ TEST_F(IcingSearchEngineSchemaTest, expected_search_result_proto)); } +TEST_F(IcingSearchEngineSchemaTest, + ForceSetSchemaJoinablePropertyDeletionTriggersIndexRestoration) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // Create "Email" schema with 2 joinable properties: + // - "receiverQualifiedId": qualified id joinable. Joinable property id = 0. + // - "senderQualifiedId": qualified id joinable. Joinable property id = 1. 
+ SchemaProto email_with_receiver_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("receiverQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SetSchemaResultProto set_schema_result = + icing.SetSchema(email_with_receiver_schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + expected_set_schema_result.mutable_new_schema_types()->Add("Person"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + DocumentProto person = DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .SetCreationTimestampMs(1000) + .AddStringProperty("name", "person") + .Build(); + // Create an email document with only "senderQualifiedId" joinable property. 
+ DocumentProto email = + DocumentBuilder() + .SetKey("namespace", "email") + .SetSchema("Email") + .SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .AddStringProperty("senderQualifiedId", "namespace#person") + .Build(); + + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(email).status(), ProtoIsOk()); + + // Verify join search: join a query for `name:person` with a child query for + // `subject:tps` based on the child's `senderQualifiedId` field. We should be + // able to join person and email documents by this property. + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto = + expected_search_result_proto.mutable_results()->Add(); + *result_proto->mutable_document() = person; + *result_proto->mutable_joined_results()->Add()->mutable_document() = email; + + SearchSpecProto search_spec; + search_spec.set_query("name:person"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + JoinSpecProto* join_spec = search_spec.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("subject:tps"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + 
ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + + // Now update the schema to remove "receiverQualifiedId" fields. This is + // backwards incompatible, but document should be preserved because it doesn't + // contain "receiverQualifiedId" field. Also since it is join incompatible, we + // have to rebuild join index. + // - "senderQualifiedId": qualified id joinable. Joinable property id = 0. + // + // If the index is not correctly rebuilt, then the joinable data of + // "senderQualifiedId" in the joinable index will still have old joinable + // property id of 1 and therefore won't take effect for join search query. + SchemaProto email_without_receiver_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Although we've just deleted an existing property "receiverQualifiedId" from + // schema "Email", some email documents will still be preserved because they + // don't have "receiverQualifiedId" property. + set_schema_result = + icing.SetSchema(email_without_receiver_schema, + /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email"); + expected_set_schema_result.mutable_join_incompatible_changed_schema_types() + ->Add("Email"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Verify join search: join a query for `name:person` with a child query for + // `subject:tps` based on the child's `senderQualifiedId` field. We should + // still be able to join person and email documents by this property. + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + TEST_F( IcingSearchEngineSchemaTest, - ForceSetSchemaPropertyDeletionAndAdditionTriggersIndexRestorationAndReturnsOk) { + ForceSetSchemaIndexedPropertyDeletionAndAdditionTriggersIndexRestorationAndReturnsOk) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -1368,6 +1721,161 @@ TEST_F( expected_search_result_proto)); } +TEST_F( + IcingSearchEngineSchemaTest, + ForceSetSchemaJoinablePropertyDeletionAndAdditionTriggersIndexRestorationAndReturnsOk) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // Create "Email" schema with 2 joinable properties: + // - "receiverQualifiedId": qualified id joinable. Joinable property id = 0. + // - "senderQualifiedId": qualified id joinable. Joinable property id = 1. 
+ SchemaProto email_with_body_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("receiverQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SetSchemaResultProto set_schema_result = + icing.SetSchema(email_with_body_schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + expected_set_schema_result.mutable_new_schema_types()->Add("Person"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + DocumentProto person = DocumentBuilder() + .SetKey("namespace", "person") + .SetSchema("Person") + .SetCreationTimestampMs(1000) + .AddStringProperty("name", "person") + .Build(); + // Create an email document with only subject and senderQualifiedId + // properties. 
+ DocumentProto email = + DocumentBuilder() + .SetKey("namespace", "email") + .SetSchema("Email") + .SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .AddStringProperty("senderQualifiedId", "namespace#person") + .Build(); + + EXPECT_THAT(icing.Put(person).status(), ProtoIsOk()); + EXPECT_THAT(icing.Put(email).status(), ProtoIsOk()); + + // Verify join search: join a query for `name:person` with a child query for + // `subject:tps` based on the child's `senderQualifiedId` field. We should be + // able to join person and email documents by this property. + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + SearchResultProto::ResultProto* result_proto = + expected_search_result_proto.mutable_results()->Add(); + *result_proto->mutable_document() = person; + *result_proto->mutable_joined_results()->Add()->mutable_document() = email; + + SearchSpecProto search_spec; + search_spec.set_query("name:person"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + JoinSpecProto* join_spec = search_spec.mutable_join_spec(); + join_spec->set_max_joined_child_count(100); + join_spec->set_parent_property_expression( + std::string(JoinProcessor::kQualifiedIdExpr)); + join_spec->set_child_property_expression("senderQualifiedId"); + join_spec->set_aggregation_scoring_strategy( + JoinSpecProto::AggregationScoringStrategy::COUNT); + JoinSpecProto::NestedSpecProto* nested_spec = + join_spec->mutable_nested_spec(); + SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec(); + nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY); + nested_search_spec->set_query("subject:tps"); + *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec(); + *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance(); + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + 
ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); + + // Now update the schema to remove the "receiverQualifiedId" field and add + // "zQualifiedId". This is backwards incompatible, but document should + // be preserved because it doesn't contain a "receiverQualifiedId" field and + // "zQualifiedId" is optional. + // - "senderQualifiedId": qualified id joinable. Joinable property id = 0. + // - "zQualifiedId": qualified id joinable. Joinable property id = 1. + // + // If the index is not correctly rebuilt, then the joinable data of + // "senderQualifiedId" in the joinable index will still have old joinable + // property id of 1 and therefore won't take effect for join search query. + SchemaProto email_no_body_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(TERM_MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("zQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("senderQualifiedId") + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + set_schema_result = icing.SetSchema( + email_no_body_schema, /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email"); + expected_set_schema_result.mutable_join_incompatible_changed_schema_types() + ->Add("Email"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Verify join search: join a query for `name:person` with a child query for + // `subject:tps` based on the child's `senderQualifiedId` field. We should + // still be able to join person and email documents by this property. + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + TEST_F(IcingSearchEngineSchemaTest, ForceSetSchemaIncompatibleNestedDocsAreDeleted) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); @@ -1485,9 +1993,6 @@ TEST_F(IcingSearchEngineSchemaTest, EXPECT_THAT(get_result.status(), ProtoStatusIs(StatusProto::NOT_FOUND)); } -// TODO(b/256022027): add unit tests for join incompatible schema change to make -// sure the joinable cache is rebuilt correctly. 
- TEST_F(IcingSearchEngineSchemaTest, SetSchemaRevalidatesDocumentsAndReturnsOk) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); diff --git a/icing/icing-search-engine_search_test.cc b/icing/icing-search-engine_search_test.cc index 5648184..e953d71 100644 --- a/icing/icing-search-engine_search_test.cc +++ b/icing/icing-search-engine_search_test.cc @@ -4388,6 +4388,96 @@ TEST_P(IcingSearchEngineSearchTest, LatinSnippetTest) { ASSERT_THAT(match, Eq("ḞÖÖ")); } +TEST_P(IcingSearchEngineSearchTest, + DocumentStoreNamespaceIdFingerprintCompatible) { + DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); + DocumentProto document2 = CreateMessageDocument("namespace", "uri2"); + DocumentProto document3 = CreateMessageDocument("namespace", "uri3"); + + // Initialize with some documents with document_store_namespace_id_fingerprint + // being false. + { + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_document_store_namespace_id_fingerprint(false); + IcingSearchEngine icing(options, GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // Creates and inserts 3 documents + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); + } + + // Reinitialize with document_store_namespace_id_fingerprint being true, + // and test that we are still able to read/query docs. 
+ { + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_document_store_namespace_id_fingerprint(true); + IcingSearchEngine icing(options, GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + ASSERT_THAT( + icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()) + .status(), + ProtoIsOk()); + ASSERT_THAT( + icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()) + .status(), + ProtoIsOk()); + ASSERT_THAT( + icing.Get("namespace", "uri3", GetResultSpecProto::default_instance()) + .status(), + ProtoIsOk()); + + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); + search_spec.set_search_type(GetParam()); + SearchResultProto results = + icing.Search(search_spec, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results.results(), SizeIs(3)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document3)); + EXPECT_THAT(results.results(1).document(), EqualsProto(document2)); + EXPECT_THAT(results.results(2).document(), EqualsProto(document1)); + } + + // Reinitialize with document_store_namespace_id_fingerprint being false, + // and test that we are still able to read/query docs. 
+ { + IcingSearchEngineOptions options = GetDefaultIcingOptions(); + options.set_document_store_namespace_id_fingerprint(false); + IcingSearchEngine icing(options, GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + ASSERT_THAT( + icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()) + .status(), + ProtoIsOk()); + ASSERT_THAT( + icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()) + .status(), + ProtoIsOk()); + ASSERT_THAT( + icing.Get("namespace", "uri3", GetResultSpecProto::default_instance()) + .status(), + ProtoIsOk()); + + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); + search_spec.set_search_type(GetParam()); + SearchResultProto results = + icing.Search(search_spec, ScoringSpecProto::default_instance(), + ResultSpecProto::default_instance()); + ASSERT_THAT(results.results(), SizeIs(3)); + EXPECT_THAT(results.results(0).document(), EqualsProto(document3)); + EXPECT_THAT(results.results(1).document(), EqualsProto(document2)); + EXPECT_THAT(results.results(2).document(), EqualsProto(document1)); + } +} + INSTANTIATE_TEST_SUITE_P( IcingSearchEngineSearchTest, IcingSearchEngineSearchTest, testing::Values( diff --git a/icing/icing-search-engine_suggest_test.cc b/icing/icing-search-engine_suggest_test.cc index 6973ad0..b3aeafc 100644 --- a/icing/icing-search-engine_suggest_test.cc +++ b/icing/icing-search-engine_suggest_test.cc @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "icing/icing-search-engine.h" - #include <cstdint> #include <limits> #include <memory> @@ -25,6 +23,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" +#include "icing/icing-search-engine.h" #include "icing/jni/jni-cache.h" #include "icing/portable/endian.h" #include "icing/portable/equals-proto.h" @@ -1508,6 +1507,95 @@ TEST_F(IcingSearchEngineSuggestTest, UnorderedElementsAre(EqualsProto(suggestionBarCatSubjectFoo))); } +TEST_F(IcingSearchEngineSuggestTest, SearchSuggestionsTest_InvalidPrefixTest) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "bar fo") // "bar fo" + .AddStringProperty("body", "fool") + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace1", "uri2") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "bar cat foo") // "bar cat fool" + .AddStringProperty("body", "fool") + .Build(); + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace1", "uri3") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "fool") // "fool" + .AddStringProperty("body", "fool") + .Build(); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); + + // Search for "f OR" + SuggestionSpecProto suggestion_spec; + suggestion_spec.set_prefix("f OR"); + suggestion_spec.set_num_to_return(10); + suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( + TermMatchType::PREFIX); + suggestion_spec.mutable_scoring_spec()->set_rank_by( + 
SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT); + + SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + EXPECT_THAT(response.status(), ProtoIsOk()); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } else { + EXPECT_THAT(response.status(), + ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } + + // TODO(b/208654892): Update handling for hyphens to only consider it a hyphen + // within a TEXT token (rather than a MINUS token) when surrounded on both + // sides by TEXT rather than just preceded by TEXT. + // Search for "f-" + suggestion_spec.set_prefix("f-"); + response = icing.SearchSuggestions(suggestion_spec); + EXPECT_THAT(response.status(), ProtoIsOk()); + EXPECT_THAT(response.suggestions(), IsEmpty()); + + // Search for "f:" + suggestion_spec.set_prefix("f:"); + response = icing.SearchSuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + EXPECT_THAT(response.status(), ProtoIsOk()); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } else { + EXPECT_THAT(response.status(), + ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } + + // Search for "OR OR - :" + suggestion_spec.set_prefix("OR OR - :"); + response = icing.SearchSuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + EXPECT_THAT(response.status(), ProtoIsOk()); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } else { + EXPECT_THAT(response.status(), + ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } +} + } // namespace } // namespace lib } // namespace icing diff --git a/icing/index/index-processor_test.cc 
b/icing/index/index-processor_test.cc index 3a9b4ee..47baabe 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -40,6 +40,8 @@ #include "icing/index/numeric/numeric-index.h" #include "icing/index/string-section-indexing-handler.h" #include "icing/index/term-property-id.h" +#include "icing/join/qualified-id-joinable-property-indexing-handler.h" +#include "icing/join/qualified-id-type-joinable-index.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" #include "icing/portable/platform.h" @@ -51,6 +53,7 @@ #include "icing/schema/schema-util.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/store/document-store.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/icu-data-file-helper.h" @@ -160,7 +163,9 @@ class IndexProcessorTest : public Test { index_dir_ = base_dir_ + "/index"; integer_index_dir_ = base_dir_ + "/integer_index"; + qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index"; schema_store_dir_ = base_dir_ + "/schema_store"; + doc_store_dir_ = base_dir_ + "/doc_store"; Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024); ICING_ASSERT_OK_AND_ASSIGN( @@ -169,6 +174,10 @@ class IndexProcessorTest : public Test { ICING_ASSERT_OK_AND_ASSIGN( integer_index_, IntegerIndex::Create(filesystem_, integer_index_dir_)); + ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_, + QualifiedIdTypeJoinableIndex::Create( + filesystem_, qualified_id_join_index_dir_)); + language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); ICING_ASSERT_OK_AND_ASSIGN( lang_segmenter_, @@ -260,6 +269,13 @@ class IndexProcessorTest : public Test { .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + 
DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, + schema_store_.get())); + doc_store_ = std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<StringSectionIndexingHandler> string_section_indexing_handler, @@ -269,9 +285,16 @@ class IndexProcessorTest : public Test { integer_section_indexing_handler, IntegerSectionIndexingHandler::Create( &fake_clock_, integer_index_.get())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> + qualified_id_joinable_property_indexing_handler, + QualifiedIdJoinablePropertyIndexingHandler::Create( + &fake_clock_, qualified_id_join_index_.get())); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; handlers.push_back(std::move(string_section_indexing_handler)); handlers.push_back(std::move(integer_section_indexing_handler)); + handlers.push_back( + std::move(qualified_id_joinable_property_indexing_handler)); index_processor_ = std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_); @@ -281,9 +304,11 @@ class IndexProcessorTest : public Test { void TearDown() override { index_processor_.reset(); + doc_store_.reset(); schema_store_.reset(); normalizer_.reset(); lang_segmenter_.reset(); + qualified_id_join_index_.reset(); integer_index_.reset(); index_.reset(); @@ -298,13 +323,17 @@ class IndexProcessorTest : public Test { std::string base_dir_; std::string index_dir_; std::string integer_index_dir_; + std::string qualified_id_join_index_dir_; std::string schema_store_dir_; + std::string doc_store_dir_; std::unique_ptr<Index> index_; std::unique_ptr<NumericIndex<int64_t>> integer_index_; + std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index_; std::unique_ptr<LanguageSegmenter> lang_segmenter_; std::unique_ptr<Normalizer> normalizer_; std::unique_ptr<SchemaStore> schema_store_; + std::unique_ptr<DocumentStore> doc_store_; std::unique_ptr<IndexProcessor> index_processor_; }; @@ -788,9 +817,16 @@ 
TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) { integer_section_indexing_handler, IntegerSectionIndexingHandler::Create( &fake_clock_, integer_index_.get())); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> + qualified_id_joinable_property_indexing_handler, + QualifiedIdJoinablePropertyIndexingHandler::Create( + &fake_clock_, qualified_id_join_index_.get())); std::vector<std::unique_ptr<DataIndexingHandler>> handlers; handlers.push_back(std::move(string_section_indexing_handler)); handlers.push_back(std::move(integer_section_indexing_handler)); + handlers.push_back( + std::move(qualified_id_joinable_property_indexing_handler)); IndexProcessor index_processor(std::move(handlers), &fake_clock_, /*recovery_mode=*/true); @@ -1506,10 +1542,10 @@ TEST_F(IndexProcessorTest, IndexableIntegerProperty) { EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DocHitInfoIterator> itr, - integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/1, - /*key_upper=*/5)); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + integer_index_->GetIterator( + kIndexableIntegerProperty, /*key_lower=*/1, + /*key_upper=*/5, *doc_store_, *schema_store_)); EXPECT_THAT( GetHits(std::move(itr)), @@ -1535,10 +1571,10 @@ TEST_F(IndexProcessorTest, IndexableIntegerPropertyNoMatch) { EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<DocHitInfoIterator> itr, - integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/-1, - /*key_upper=*/0)); + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr, + integer_index_->GetIterator( + kIndexableIntegerProperty, /*key_lower=*/-1, + /*key_upper=*/0, *doc_store_, *schema_store_)); EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); } diff --git 
a/icing/index/iterator/doc-hit-info-iterator-not.cc b/icing/index/iterator/doc-hit-info-iterator-not.cc index 1818f08..38b1ded 100644 --- a/icing/index/iterator/doc-hit-info-iterator-not.cc +++ b/icing/index/iterator/doc-hit-info-iterator-not.cc @@ -63,8 +63,8 @@ libtextclassifier3::Status DocHitInfoIteratorNot::Advance() { libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode> DocHitInfoIteratorNot::TrimRightMostNode() && { // Don't generate suggestion if the last operator is NOT. - return absl_ports::UnimplementedError( - "Cannot trim right most node in NOT operator."); + return absl_ports::InvalidArgumentError( + "Cannot generate suggestion if the last term is NOT operator."); } int32_t DocHitInfoIteratorNot::GetNumBlocksInspected() const { diff --git a/icing/index/iterator/doc-hit-info-iterator-not_test.cc b/icing/index/iterator/doc-hit-info-iterator-not_test.cc index 54d6c36..5a8ce2c 100644 --- a/icing/index/iterator/doc-hit-info-iterator-not_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-not_test.cc @@ -163,7 +163,7 @@ TEST(DocHitInfoIteratorNotTest, TrimNotIterator) { DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator), /*document_id_limit=*/5); EXPECT_THAT(std::move(not_iterator).TrimRightMostNode(), - StatusIs(libtextclassifier3::StatusCode::UNIMPLEMENTED)); + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } } // namespace diff --git a/icing/index/numeric/doc-hit-info-iterator-numeric.h b/icing/index/numeric/doc-hit-info-iterator-numeric.h index bf990d1..fc66a1d 100644 --- a/icing/index/numeric/doc-hit-info-iterator-numeric.h +++ b/icing/index/numeric/doc-hit-info-iterator-numeric.h @@ -49,8 +49,8 @@ class DocHitInfoIteratorNumeric : public DocHitInfoIterator { } libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override { - return absl_ports::UnimplementedError( - "Cannot trim right most node in numeric operator."); + return absl_ports::InvalidArgumentError( + "Cannot generate suggestion if 
the last term is numeric operator."); } int32_t GetNumBlocksInspected() const override { return 0; } diff --git a/icing/index/numeric/dummy-numeric-index.h b/icing/index/numeric/dummy-numeric-index.h index 164866c..7cfb102 100644 --- a/icing/index/numeric/dummy-numeric-index.h +++ b/icing/index/numeric/dummy-numeric-index.h @@ -70,7 +70,8 @@ class DummyNumericIndex : public NumericIndex<T> { } libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator( - std::string_view property_path, T key_lower, T key_upper) const override; + std::string_view property_path, T key_lower, T key_upper, + const DocumentStore&, const SchemaStore&) const override; libtextclassifier3::Status Optimize( const std::vector<DocumentId>& document_id_old_to_new, @@ -93,6 +94,8 @@ class DummyNumericIndex : public NumericIndex<T> { } } + int num_property_indices() const override { return storage_.size(); } + private: class Editor : public NumericIndex<T>::Editor { public: @@ -176,7 +179,6 @@ class DummyNumericIndex : public NumericIndex<T> { DocHitInfo doc_hit_info_; }; - private: explicit DummyNumericIndex(const Filesystem& filesystem, std::string&& working_path) : NumericIndex<T>(filesystem, std::move(working_path), @@ -265,7 +267,8 @@ libtextclassifier3::Status DummyNumericIndex<T>::Iterator::Advance() { template <typename T> libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> DummyNumericIndex<T>::GetIterator(std::string_view property_path, T key_lower, - T key_upper) const { + T key_upper, const DocumentStore&, + const SchemaStore&) const { if (key_lower > key_upper) { return absl_ports::InvalidArgumentError( "key_lower should not be greater than key_upper"); diff --git a/icing/index/numeric/integer-index-bucket-util.cc b/icing/index/numeric/integer-index-bucket-util.cc new file mode 100644 index 0000000..a05baab --- /dev/null +++ b/icing/index/numeric/integer-index-bucket-util.cc @@ -0,0 +1,205 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under 
the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/index/numeric/integer-index-bucket-util.h" + +#include <algorithm> +#include <cstdint> +#include <iterator> +#include <limits> +#include <utility> +#include <vector> + +#include "icing/index/numeric/integer-index-data.h" + +namespace icing { +namespace lib { + +namespace integer_index_bucket_util { + +namespace { + +// Helper function to determine if data slice [start, end) forms a "full +// single-range bucket". +// +// Full single-range bucket: keys of all data are identical and # of them exceed +// num_data_threshold. +// +// REQUIRES: data slice [start, end) are sorted by key. +inline bool WouldBeFullSingleRangeBucket( + const std::vector<IntegerIndexData>::iterator& start, + const std::vector<IntegerIndexData>::iterator& end, + int32_t num_data_threshold) { + return std::distance(start, end) > num_data_threshold && + start->key() == (end - 1)->key(); +} + +// Helper function to determine if a bucket is full single-range. +// +// REQUIRES: +// bucket.key_lower <= [bucket.start, bucket.end)->key() <= bucket.key_upper +inline bool IsFullSingleRangeBucket(const DataRangeAndBucketInfo& bucket, + int32_t num_data_threshold) { + return bucket.key_lower == bucket.key_upper && + WouldBeFullSingleRangeBucket(bucket.start, bucket.end, + num_data_threshold); +} + +// Helper function to append new bucket(s) with corresponding data slice for +// range [curr_key_lower, last_key] where last_key = (it_end - 1)->key(). 
+// +// Also it handles an edge case: +// If data slice [it_start, it_end) forms a "full single-range bucket" (see +// WouldBeFullSingleRangeBucket for definition), then we have to put them into a +// single range bucket [last_key, last_key] instead of [curr_key_lower, +// last_key]. Also we have to deal with range [curr_key_lower, last_key - 1]: +// - If the previous bucket exists and it is not a "full single-range bucket", +// then merge [curr_key_lower, last_key - 1] into the previous bucket, i.e. +// change the previous bucket's key_upper to (last_key - 1). Then we will end +// up having: +// - [prev_bucket.key_lower, last_key - 1] +// - [last_key, last_key] +// - Otherwise, we have to create [curr_key_lower, last_key - 1] with +// empty data. Then we will end up having (Note: prev_bucket.key_upper == +// curr_key_lower - 1): +// - [prev_bucket.key_lower, curr_key_lower - 1] +// - [curr_key_lower, last_key - 1] +// - [last_key, last_key] +// This will avoid split bucket being called too frequently. +// For example, original_key_lower = 0, original_key_upper = 50. If we have +// (num_data_threshold + 1) data with key = 20 and another data with key = 40: +// - Without this part, we will split them into [[0, 20], [21, 50]]. Then when +// adding data with key = 10 next round, we will invoke split again and split +// [0, 20] to [[0, 10], [11, 20]]. +// - With this part, we will split them into [[0, 19], [20, 20], [21, 50]], +// which will avoid splitting in the next round for key = 20. 
+// +// REQUIRES: it_start < it_end +void AppendNewBuckets(const std::vector<IntegerIndexData>::iterator& it_start, + const std::vector<IntegerIndexData>::iterator& it_end, + int64_t curr_key_lower, int32_t num_data_threshold, + std::vector<DataRangeAndBucketInfo>& results) { + int64_t last_key = (it_end - 1)->key(); + if (curr_key_lower < last_key && + WouldBeFullSingleRangeBucket(it_start, it_end, num_data_threshold)) { + if (!results.empty() && + !IsFullSingleRangeBucket(results.back(), num_data_threshold)) { + // Previous bucket is not full single-range, so merge it to now hold the + // range [prev_bucket.key_lower, last_key - 1]. + results.back().key_upper = last_key - 1; + } else { + // There is either no previous bucket or the previous bucket is full + // single-range. So add an empty bucket for the range [curr_key_lower, + // last_key - 1]. + results.push_back(DataRangeAndBucketInfo(it_start, it_start, + curr_key_lower, last_key - 1)); + } + curr_key_lower = last_key; + } + results.push_back( + DataRangeAndBucketInfo(it_start, it_end, curr_key_lower, last_key)); +} + +} // namespace + +std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data, + int64_t original_key_lower, + int64_t original_key_upper, + int32_t num_data_threshold) { + // Early return if there is no need to split. + if (data.size() <= num_data_threshold) { + return {DataRangeAndBucketInfo(data.begin(), data.end(), original_key_lower, + original_key_upper)}; + } + + // Sort data by key. + std::sort( + data.begin(), data.end(), + [](const IntegerIndexData& lhs, const IntegerIndexData& rhs) -> bool { + return lhs.key() < rhs.key(); + }); + + std::vector<DataRangeAndBucketInfo> results; + int64_t curr_key_lower = original_key_lower; + // Sliding window [it_start, it_end) to separate data into different buckets. 
+ auto it_start = data.begin(); + auto it_end = data.begin(); + while (it_end != data.end()) { + // Attempt to extend it_end by 1, but we have to include all data with the + // same key since they cannot be separated into different buckets. Also use + // extend_it_end to avoid modifying it_end directly. For some edge cases, + // the extension in a single round is extremely large (i.e. a lot of data + // have the same key), and we want to separate them. For example: + // - key = 0: 5 data + // - key = 1: num_data_threshold - 1 data + // In the second round, # of data in the sliding window will exceed the + // threshold. We want to separate all data with key = 0 into a single bucket + // instead of putting key = 0 and key = 1 together. Therefore, using + // extend_it_end allow us to preserve it_end of the previous round and be + // able to deal with this case. + auto extend_it_end = it_end + 1; + while (extend_it_end != data.end() && + it_end->key() == extend_it_end->key()) { + ++extend_it_end; + } + + if (std::distance(it_start, extend_it_end) > num_data_threshold && + it_start != it_end) { + // Split data between [it_start, it_end) into range [curr_key_lower, + // (it_end - 1)->key()]. + AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold, + results); + + // it_end at this moment won't be data.end(), so the last element of the + // new bucket can't have key == INT64_MAX. Therefore, it is safe to set + // curr_key_lower as ((it_end - 1)->key() + 1). + curr_key_lower = (it_end - 1)->key() + 1; + it_start = it_end; + } + it_end = extend_it_end; + } + + // Handle the final range [curr_key_lower, original_key_upper]. + if (curr_key_lower <= original_key_upper) { + if (it_start != it_end) { + AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold, + results); + + // AppendNewBuckets only handles range [curr_key_lower, (it_end - + // 1)->key()], so we have to handle range [(it_end - 1)->key() + 1, + // original_key_upper] if needed. 
+ int64_t last_key = (it_end - 1)->key(); + if (last_key != std::numeric_limits<int64_t>::max() && + last_key + 1 <= original_key_upper) { + if (!results.empty() && + !IsFullSingleRangeBucket(results.back(), num_data_threshold)) { + results.back().key_upper = original_key_upper; + } else { + results.push_back(DataRangeAndBucketInfo( + it_start, it_start, last_key + 1, original_key_upper)); + } + } + } else { + results.push_back(DataRangeAndBucketInfo(it_start, it_end, curr_key_lower, + original_key_upper)); + } + } + + return results; +} + +} // namespace integer_index_bucket_util + +} // namespace lib +} // namespace icing diff --git a/icing/index/numeric/integer-index-bucket-util.h b/icing/index/numeric/integer-index-bucket-util.h new file mode 100644 index 0000000..863bd01 --- /dev/null +++ b/icing/index/numeric/integer-index-bucket-util.h @@ -0,0 +1,81 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_ +#define ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_ + +#include <cstdint> +#include <utility> +#include <vector> + +#include "icing/index/numeric/integer-index-data.h" + +namespace icing { +namespace lib { + +namespace integer_index_bucket_util { + +// A wrapper struct that contains information of a bucket. +// - The bucket contains data within the iterator [start, end). 
+// - Bucket range is [key_lower, key_upper], and all data within [start, end) +// should have keys in the bucket range. +// +// Note: the caller should make sure the lifecycle of data vector is longer than +// instances of this wrapper struct. +struct DataRangeAndBucketInfo { + std::vector<IntegerIndexData>::iterator start; + std::vector<IntegerIndexData>::iterator end; + int64_t key_lower; + int64_t key_upper; + + explicit DataRangeAndBucketInfo( + std::vector<IntegerIndexData>::iterator start_in, + std::vector<IntegerIndexData>::iterator end_in, int64_t key_lower_in, + int64_t key_upper_in) + : start(std::move(start_in)), + end(std::move(end_in)), + key_lower(key_lower_in), + key_upper(key_upper_in) {} +}; + +// Helper function to split data (that are originally in a bucket with range +// [original_key_lower, original_key_upper]) into different buckets according to +// num_data_threshold. +// - The input vector `data` will be sorted by key in ascending order (unless +// there's no need to split in which case data is returned unmodified) +// - Data with the same key will be in the same bucket even if # of them exceed +// num_data_threshold. +// - Range of all buckets will be disjoint, and the range union will be +// [original_key_lower, original_key_upper]. +// - Data slice (i.e. [start, end)) can be empty. +// +// REQUIRES: +// - original_key_lower <= original_key_upper +// - num_data_threshold > 0 +// - Keys of all data are in range [original_key_lower, original_key_upper] +// +// Returns: a vector of DataRangeAndBucketInfo that contain all bucket info +// after splitting. Also the returned vector should contain at least one +// bucket, otherwise it is considered an error. 
+std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data, + int64_t original_key_lower, + int64_t original_key_upper, + int32_t num_data_threshold); + +} // namespace integer_index_bucket_util + +} // namespace lib +} // namespace icing + +#endif // ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_ diff --git a/icing/index/numeric/integer-index-bucket-util_test.cc b/icing/index/numeric/integer-index-bucket-util_test.cc new file mode 100644 index 0000000..82c593e --- /dev/null +++ b/icing/index/numeric/integer-index-bucket-util_test.cc @@ -0,0 +1,1112 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/index/numeric/integer-index-bucket-util.h" + +#include <limits> +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/index/numeric/integer-index-data.h" +#include "icing/schema/section.h" +#include "icing/store/document-id.h" + +namespace icing { +namespace lib { +namespace integer_index_bucket_util { + +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::Ne; +using ::testing::SizeIs; + +static constexpr DocumentId kDefaultDocumentId = 123; +static constexpr SectionId kDefaultSectionId = 31; + +TEST(IntegerIndexBucketUtilTest, Split_numDataNotDivisibleByThreshold) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)}; + int64_t key_lower = -10; + int64_t key_upper = 10; + int32_t num_data_threshold = 3; + ASSERT_THAT(data.size() % num_data_threshold, Ne(0)); + + // Keys = [-10, -3, -2, 0, 1, 2, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, key_lower, key_upper, num_data_threshold); + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -3, -2]. 
+ EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); + // Bucket 1: key lower = -1, key upper = 2, keys = [0, 1, 2]. + EXPECT_THAT(results[1].key_lower, Eq(-1)); + EXPECT_THAT(results[1].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 3, key upper = 10, keys = [10]. + EXPECT_THAT(results[2].key_lower, Eq(3)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, Split_numDataDivisibleByThreshold) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)}; + int64_t key_lower = -10; + int64_t key_upper = 10; + int32_t num_data_threshold = 3; + ASSERT_THAT(data.size() % num_data_threshold, Eq(0)); + + // Keys = [-10, -3, -2, 0, 2, 10]. 
+ std::vector<DataRangeAndBucketInfo> results = + Split(data, key_lower, key_upper, num_data_threshold); + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -3, -2]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); + // Bucket 1: key lower = -1, key upper = 2, keys = [0, 2, 10]. + EXPECT_THAT(results[1].key_lower, Eq(-1)); + EXPECT_THAT(results[1].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, Split_shouldIncludeOriginalKeyRange) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)}; + int64_t key_lower = -1000; + int64_t key_upper = 1000; + int32_t num_data_threshold = 3; + + // Keys = [-10, -3, -2, 0, 1, 2, 10]. + // Split should include the original key_lower and key_upper even if there is + // no key at boundary. 
+ std::vector<DataRangeAndBucketInfo> results = + Split(data, key_lower, key_upper, num_data_threshold); + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -1000, key upper = -2, keys = [-10, -3, -2]. + EXPECT_THAT(results[0].key_lower, Eq(-1000)); + EXPECT_THAT(results[0].key_upper, Eq(-2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); + // Bucket 1: key lower = -1, key upper = 2, keys = [0, 1, 2]. + EXPECT_THAT(results[1].key_lower, Eq(-1)); + EXPECT_THAT(results[1].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 3, key upper = 1000, keys = [10]. 
+ EXPECT_THAT(results[2].key_lower, Eq(3)); + EXPECT_THAT(results[2].key_upper, Eq(1000)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, Split_singleBucketWithoutSplitting) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)}; + int64_t key_lower = -1000; + int64_t key_upper = 1000; + int32_t num_data_threshold = 100; + + // Keys = [-10, -3, -2, 0, 1, 2, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, key_lower, key_upper, num_data_threshold); + ASSERT_THAT(results, SizeIs(1)); + // Bucket 0: key lower = -1000, key upper = 1000, keys = [-10, -3, -2, 0, 1, + // 2, 10]. Since # of data <= threshold, data vector won't be sorted and thus + // [start, end) will have data with the original order. 
+ EXPECT_THAT(results[0].key_lower, Eq(-1000)); + EXPECT_THAT(results[0].key_upper, Eq(1000)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); +} + +TEST(IntegerIndexBucketUtilTest, Split_emptyData) { + std::vector<IntegerIndexData> empty_data; + std::vector<DataRangeAndBucketInfo> results = + Split(empty_data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(1)); + // Bucket 0: key lower = -10, key upper = 10, keys = []. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(10)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end), + IsEmpty()); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_firstBucket_keyEqualsKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -10, -10, -10, -10, 0, 3, 5, 10]. 
+ std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = -10 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = 0, 3, .... + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10, -10, -10, + // -10]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre( + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10))); + // Bucket 1: key lower = -9, key upper = 5, keys = [0, 3, 5]. + EXPECT_THAT(results[1].key_lower, Eq(-9)); + EXPECT_THAT(results[1].key_upper, Eq(5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5))); + // Bucket 2: key lower = 6, key upper = 10, keys = [10]. 
+ EXPECT_THAT(results[2].key_lower, Eq(6)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_firstBucket_keyGreaterThanKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-7, -7, -7, -7, -7, 0, 3, 5, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = -7 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = 0, 3, .... + // - They should be in a single range bucket [-7, -7], and another bucket + // [-10, -8] with empty data should be created before it. + ASSERT_THAT(results, SizeIs(4)); + // Bucket 0: key lower = -10, key upper = -8, keys = []. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-8)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end), + IsEmpty()); + // Bucket 1: key lower = -7, key upper = -7, keys = [-7, -7, -7, -7, -7]. 
+ EXPECT_THAT(results[1].key_lower, Eq(-7)); + EXPECT_THAT(results[1].key_upper, Eq(-7)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7))); + // Bucket 2: key lower = -6, key upper = 5, keys = [0, 3, 5]. + EXPECT_THAT(results[2].key_lower, Eq(-6)); + EXPECT_THAT(results[2].key_upper, Eq(5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5))); + // Bucket 3: key lower = 6, key upper = 10, keys = [10]. + EXPECT_THAT(results[3].key_lower, Eq(6)); + EXPECT_THAT(results[3].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[3].start, results[3].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_midBucket_keyEqualsKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -5, -4, -4, -4, -4, 
-4, 5, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = -4 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = -10, -5, 5, 10. + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = -5, keys = [-10, -5]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5))); + // Bucket 1: key lower = -4, key upper = -4, keys = [-4, -4, -4, -4, -4]. + EXPECT_THAT(results[1].key_lower, Eq(-4)); + EXPECT_THAT(results[1].key_upper, Eq(-4)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4))); + // Bucket 2: key lower = -3, key upper = 10, keys = [5, 10]. 
+ EXPECT_THAT(results[2].key_lower, Eq(-3)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_midBucket_keyGreaterThanKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -5, -1, -1, -1, -1, -1, 5, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = -1 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = -10, -5, 5, 10. + // - They should be in a single range bucket [-1, -1], and range [-4, -2] + // should be merged into the previous bucket. + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -5]. 
+ EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5))); + // Bucket 1: key lower = -1, key upper = -1, keys = [-1, -1, -1, -1, -1]. + EXPECT_THAT(results[1].key_lower, Eq(-1)); + EXPECT_THAT(results[1].key_upper, Eq(-1)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1))); + // Bucket 2: key lower = 0, key upper = 10, keys = [5, 10]. + EXPECT_THAT(results[2].key_lower, Eq(0)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_lastBucket_keyEqualsKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3)}; + + // Keys = [-10, -3, 0, 
2, 3, 3, 3, 3, 3]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = 3 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = -10, -3, 0, 2. + // - They should be in a single range bucket [3, 3], and another bucket + // [4, 10] with empty data should be created after it. + ASSERT_THAT(results, SizeIs(4)); + // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(0)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0))); + // Bucket 1: key lower = 1, key upper = 2, keys = [2]. + EXPECT_THAT(results[1].key_lower, Eq(1)); + EXPECT_THAT(results[1].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 3, key upper = 10, keys = [3, 3, 3, 3, 3]. + EXPECT_THAT(results[2].key_lower, Eq(3)); + EXPECT_THAT(results[2].key_upper, Eq(3)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3))); + // Bucket 3: key lower = 4, key upper = 10, keys = []. 
+ EXPECT_THAT(results[3].key_lower, Eq(4)); + EXPECT_THAT(results[3].key_upper, Eq(10)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[3].start, results[3].end), + IsEmpty()); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_lastBucket_keyWithinKeyLowerAndUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6)}; + + // Keys = [-10, -3, 0, 2, 6, 6, 6, 6, 6]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = 6 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = -10, -3, 0, 2. + // - They should be in a single range bucket [6, 6]. Range [3, 5] should be + // merged into the previous bucket. and another bucket [7, 10] with empty + // data should be created after it. + ASSERT_THAT(results, SizeIs(4)); + // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(0)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0))); + // Bucket 1: key lower = 1, key upper = 5, keys = [2]. 
+ EXPECT_THAT(results[1].key_lower, Eq(1)); + EXPECT_THAT(results[1].key_upper, Eq(5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 6, key upper = 6, keys = [6, 6, 6, 6, 6]. + EXPECT_THAT(results[2].key_lower, Eq(6)); + EXPECT_THAT(results[2].key_upper, Eq(6)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6))); + // Bucket 3: key lower = 7, key upper = 10, keys = []. + EXPECT_THAT(results[3].key_lower, Eq(7)); + EXPECT_THAT(results[3].key_upper, Eq(10)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[3].start, results[3].end), + IsEmpty()); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_lastBucket_keyEqualsKeyUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -3, 0, 2, 10, 10, 10, 10, 10]. 
+ std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = 10 exceeds the threshold, they should + // still be in the same bucket. + // - They should be separated from key = -10, -3, 0, 2. + // - They should be in a single range bucket [10, 10], and range [3, 9] should + // be merged into the previous bucket. + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(0)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0))); + // Bucket 1: key lower = 1, key upper = 9, keys = [2]. + EXPECT_THAT(results[1].key_lower, Eq(1)); + EXPECT_THAT(results[1].key_upper, Eq(9)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 10, key upper = 10, keys = [10, 10, 10, 10, 10]. 
+ EXPECT_THAT(results[2].key_lower, Eq(10)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_shouldNotMergeIntoPreviousBucket) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -2, -2, -2, -2, -2, 5, 5, 5, 5, 5, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Data with key = -2 and 5 should be put into a single bucket respectively. + // - When dealing with key = 5, range [-1, 4] should not be merged into the + // previous bucket [-2, -2] because [-2, -2] also contains single key data + // exceeding the threshold. Instead, we should create bucket [-1, 4] with + // empty data. 
+ ASSERT_THAT(results, SizeIs(5)); + // Bucket 0: key lower = -10, key upper = -3, keys = [-10]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-3)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, + kDefaultDocumentId, -10))); + // Bucket 1: key lower = -2, key upper = -2, keys = [-2, -2, -2, -2, -2]. + EXPECT_THAT(results[1].key_lower, Eq(-2)); + EXPECT_THAT(results[1].key_upper, Eq(-2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); + // Bucket 2: key lower = -1, key upper = 4, keys = []. + EXPECT_THAT(results[2].key_lower, Eq(-1)); + EXPECT_THAT(results[2].key_upper, Eq(4)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[2].start, results[2].end), + IsEmpty()); + // Bucket 3: key lower = 5, key upper = 5, keys = [5, 5, 5, 5, 5]. + EXPECT_THAT(results[3].key_lower, Eq(5)); + EXPECT_THAT(results[3].key_upper, Eq(5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[3].start, results[3].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5))); + // Bucket 4: key lower = 6, key upper = 10, keys = [10]. 
+ EXPECT_THAT(results[4].key_lower, Eq(6)); + EXPECT_THAT(results[4].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[4].start, results[4].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_shouldMergeIntoPreviousBucket) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -8), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -8, -3, -2, -2, -2, 5, 5, 5, 5, 5, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Data with key = 5 should be put into a single bucket. + // - When dealing with key = 5, range [-1, 4] should be merged into the + // previous bucket [-2, -2] because # of data in [-2, -2] doesn't exceed the + // threshold. + ASSERT_THAT(results, SizeIs(4)); + // Bucket 0: key lower = -10, key upper = -3, keys = [-10, -8, -3]. 
+ EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-3)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -8), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3))); + // Bucket 1: key lower = -2, key upper = 4, keys = [-2, -2, -2]. + EXPECT_THAT(results[1].key_lower, Eq(-2)); + EXPECT_THAT(results[1].key_upper, Eq(4)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2))); + // Bucket 2: key lower = 5, key upper = 5, keys = [5, 5, 5, 5, 5]. + EXPECT_THAT(results[2].key_lower, Eq(5)); + EXPECT_THAT(results[2].key_upper, Eq(5)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5))); + // Bucket 3: key lower = 6, key upper = 10, keys = [10]. 
+ EXPECT_THAT(results[3].key_lower, Eq(6)); + EXPECT_THAT(results[3].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[3].start, results[3].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_singleBucket_keyEqualsKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10)}; + + // Keys = [-10, -10, -10, -10, -10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = -10 exceeds the threshold, they should + // still be in the same bucket. + // - They should be in a single range bucket [-10, -10], and another bucket + // [-9, 10] with empty data should be created after it. + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10, -10, -10, + // -10]. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre( + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10))); + // Bucket 1: key lower = -9, key upper = 10, keys = []. 
+ EXPECT_THAT(results[1].key_lower, Eq(-9)); + EXPECT_THAT(results[1].key_upper, Eq(10)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[1].start, results[1].end), + IsEmpty()); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_singleBucket_keyWithinKeyLowerAndUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)}; + + // Keys = [0, 0, 0, 0, 0]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = 0 exceeds the threshold, they should + // still be in the same bucket. + // - They should be in a single range bucket [0, 0]. Another bucket [-10, -1] + // with empty data should be created before it, and another bucket [1, 10] + // with empty data should be created after it. + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = -10, key upper = -1, keys = []. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-1)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end), + IsEmpty()); + // Bucket 1: key lower = 0, key upper = 0, keys = [0, 0, 0, 0, 0]. 
+ EXPECT_THAT(results[1].key_lower, Eq(0)); + EXPECT_THAT(results[1].key_upper, Eq(0)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0))); + // Bucket 2: key lower = 1, key upper = 10, keys = []. + EXPECT_THAT(results[2].key_lower, Eq(1)); + EXPECT_THAT(results[2].key_upper, Eq(10)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[2].start, results[2].end), + IsEmpty()); +} + +TEST(IntegerIndexBucketUtilTest, + Split_sameKeysExceedingThreshold_singleBucket_keyEqualsKeyUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [10, 10, 10, 10, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // - Even though # of data with key = 10 exceeds the threshold, they should + // still be in the same bucket. + // - They should be in a single range bucket [10, 10], and another bucket + // [-10, 9] with empty data should be created before it. + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = -10, key upper = 9, keys = []. + EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(9)); + EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end), + IsEmpty()); + // Bucket 1: key lower = -10, key upper = 10, keys = [10, 10, 10, 10, 10]. 
+ EXPECT_THAT(results[1].key_lower, Eq(10)); + EXPECT_THAT(results[1].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_adjacentKeysTotalNumDataExceedThreshold) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [-10, -10, -1, -1, 2, 2, 10, 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10, + /*num_data_threshold=*/3); + // Even though # of data with the same key is within the threshold, since + // total # of data of adjacent keys exceed the threshold, they should be + // separated into different buckets. + ASSERT_THAT(results, SizeIs(4)); + // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10]. 
+ EXPECT_THAT(results[0].key_lower, Eq(-10)); + EXPECT_THAT(results[0].key_upper, Eq(-10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre( + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10))); + // Bucket 1: key lower = -9, key upper = -1, keys = [-1, -1]. + EXPECT_THAT(results[1].key_lower, Eq(-9)); + EXPECT_THAT(results[1].key_upper, Eq(-1)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1))); + // Bucket 2: key lower = 0, key upper = 2, keys = [2, 2]. + EXPECT_THAT(results[2].key_lower, Eq(0)); + EXPECT_THAT(results[2].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 3: key lower = 3, key upper = 10, keys = [10, 10]. + EXPECT_THAT(results[3].key_lower, Eq(3)); + EXPECT_THAT(results[3].key_upper, Eq(10)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[3].start, results[3].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyLowerEqualsIntMin_smallestKeyGreaterThanKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min() + 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [INT64_MIN + 1, -10, -1, 2, 10]. 
+ std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = INT64_MIN, key upper = -1, keys = [INT64_MIN + 1, + // -10, -1]. + EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(-1)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min() + 1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1))); + // Bucket 1: key lower = 0, key upper = INT64_MAX, keys = [2, 10]. + EXPECT_THAT(results[1].key_lower, Eq(0)); + EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyLowerEqualsIntMin_smallestKeyEqualsKeyLower) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [INT64_MIN, -10, -1, 2, 10]. 
+ std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = INT64_MIN, key upper = -1, keys = [INT64_MIN, -10, + // -1]. + EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(-1)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1))); + // Bucket 1: key lower = 0, key upper = INT64_MAX, keys = [2, 10]. + EXPECT_THAT(results[1].key_lower, Eq(0)); + EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyLowerEqualsIntMin_keyIntMinExceedingThreshold) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, 
kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)}; + + // Keys = [INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN, -10, -1, 2, + // 10]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = INT64_MIN, key upper = INT64_MIN, keys = [INT64_MIN, + // INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN]. + EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::min()))); + // Bucket 1: key lower = INT64_MIN + 1, key upper = 2, keys = [-10, -1, 2]. + EXPECT_THAT(results[1].key_lower, + Eq(std::numeric_limits<int64_t>::min() + 1)); + EXPECT_THAT(results[1].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 2: key lower = 3, key upper = INT64_MAX, keys = [10]. 
+ EXPECT_THAT(results[2].key_lower, Eq(3)); + EXPECT_THAT(results[2].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyUpperEqualsIntMax_largestKeySmallerThanKeyUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max() - 1), + }; + + // Keys = [-10, -1, 2, 10, INT64_MAX - 1]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2]. + EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 1: key lower = 3, key upper = INT64_MAX, keys = [10, INT64_MAX - 1]. 
+ EXPECT_THAT(results[1].key_lower, Eq(3)); + EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max() - 1))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyUpperEqualsIntMax_largestKeyEqualsKeyUpper) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + }; + + // Keys = [-10, -1, 2, 10, INT64_MAX]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(2)); + // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2]. + EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 1: key lower = 3, key upper = INT64_MAX, keys = [10, INT64_MAX]. 
+ EXPECT_THAT(results[1].key_lower, Eq(3)); + EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()))); +} + +TEST(IntegerIndexBucketUtilTest, + Split_keyUpperEqualsIntMax_keyIntMaxExceedingThreshold) { + std::vector<IntegerIndexData> data = { + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max())}; + + // Keys = [-10, -1, 2, 10, INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX, + // INT64_MAX]. + std::vector<DataRangeAndBucketInfo> results = + Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(), + /*original_key_upper=*/std::numeric_limits<int64_t>::max(), + /*num_data_threshold=*/3); + ASSERT_THAT(results, SizeIs(3)); + // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2]. 
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(results[0].key_upper, Eq(2)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[0].start, results[0].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2))); + // Bucket 1: key lower = 3, key upper = INT_MAX - 1, keys = [10]. + EXPECT_THAT(results[1].key_lower, Eq(3)); + EXPECT_THAT(results[1].key_upper, + Eq(std::numeric_limits<int64_t>::max() - 1)); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[1].start, results[1].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10))); + // Bucket 2: key lower = INT64_MAX, key upper = INT64_MAX, keys = [INT64_MAX, + // INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX]. + EXPECT_THAT(results[2].key_lower, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT(results[2].key_upper, Eq(std::numeric_limits<int64_t>::max())); + EXPECT_THAT( + std::vector<IntegerIndexData>(results[2].start, results[2].end), + ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()), + IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, + std::numeric_limits<int64_t>::max()))); +} + +} // namespace + +} // namespace integer_index_bucket_util +} // namespace lib +} // namespace icing diff --git a/icing/index/numeric/integer-index-storage.cc b/icing/index/numeric/integer-index-storage.cc index 22ef8bd..db1983c 100644 --- a/icing/index/numeric/integer-index-storage.cc +++ b/icing/index/numeric/integer-index-storage.cc @@ -17,6 +17,7 @@ 
#include <algorithm> #include <cstdint> #include <functional> +#include <iterator> #include <limits> #include <memory> #include <queue> @@ -37,6 +38,7 @@ #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/index/numeric/doc-hit-info-iterator-numeric.h" +#include "icing/index/numeric/integer-index-bucket-util.h" #include "icing/index/numeric/integer-index-data.h" #include "icing/index/numeric/numeric-index.h" #include "icing/index/numeric/posting-list-integer-index-accessor.h" @@ -50,6 +52,41 @@ namespace lib { namespace { +// Helper function to flush data between [it_start, it_end) into posting list(s) +// and return posting list id. +// Note: it will sort data between [it_start, it_end) by basic hit value, so the +// caller should be aware that the data order will be changed after calling this +// function. +libtextclassifier3::StatusOr<PostingListIdentifier> FlushDataIntoPostingLists( + FlashIndexStorage* flash_index_storage, + PostingListIntegerIndexSerializer* posting_list_serializer, + const std::vector<IntegerIndexData>::iterator& it_start, + const std::vector<IntegerIndexData>::iterator& it_end) { + if (it_start == it_end) { + return PostingListIdentifier::kInvalid; + } + + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<PostingListIntegerIndexAccessor> new_pl_accessor, + PostingListIntegerIndexAccessor::Create(flash_index_storage, + posting_list_serializer)); + + std::sort(it_start, it_end); + for (auto it = it_end - 1; it >= it_start; --it) { + ICING_RETURN_IF_ERROR(new_pl_accessor->PrependData(*it)); + } + + PostingListAccessor::FinalizeResult result = + std::move(*new_pl_accessor).Finalize(); + if (!result.status.ok()) { + return result.status; + } + if (!result.id.is_valid()) { + return absl_ports::InternalError("Fail to flush data into posting list(s)"); + } + return result.id; +} + // The following 4 methods are helper functions to get the correct file path of // 
metadata/sorted_buckets/unsorted_buckets/flash_index_storage, according to // the given working directory. @@ -510,9 +547,12 @@ libtextclassifier3::Status IntegerIndexStorage::AddKeys( mutable_new_arr.SetArray(/*idx=*/0, new_buckets.data(), new_buckets.size()); } - // Step 4: merge the unsorted bucket array into the sorted bucket array if the - // length of the unsorted bucket array exceeds the threshold. - // TODO(b/259743562): [Optimization 1] implement merge + // Step 4: sort and merge the unsorted bucket array into the sorted bucket + // array if the length of the unsorted bucket array exceeds the + // threshold. + if (unsorted_buckets_->num_elements() > kUnsortedBucketsLengthThreshold) { + ICING_RETURN_IF_ERROR(SortBuckets()); + } info().num_data += new_keys.size(); @@ -679,29 +719,23 @@ IntegerIndexStorage::InitializeNewFiles( absl_ports::StrCat("Failed to create directory: ", working_path)); } - // TODO(b/259743562): [Optimization 1] decide max # buckets, unsorted buckets - // threshold // Initialize sorted_buckets int32_t pre_mapping_mmap_size = sizeof(Bucket) * (1 << 10); - int32_t max_file_size = - pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; ICING_ASSIGN_OR_RETURN( std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, FileBackedVector<Bucket>::Create( filesystem, GetSortedBucketsFilePath(working_path), - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, - pre_mapping_mmap_size)); + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size)); // Initialize unsorted_buckets - pre_mapping_mmap_size = sizeof(Bucket) * 100; - max_file_size = - pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; + pre_mapping_mmap_size = sizeof(Bucket) * kUnsortedBucketsLengthThreshold; ICING_ASSIGN_OR_RETURN( std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, FileBackedVector<Bucket>::Create( filesystem, GetUnsortedBucketsFilePath(working_path), - 
MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, - pre_mapping_mmap_size)); + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size)); // Initialize flash_index_storage ICING_ASSIGN_OR_RETURN( @@ -785,29 +819,23 @@ IntegerIndexStorage::InitializeExistingFiles( /*pre_mapping_file_offset=*/0, /*pre_mapping_mmap_size=*/kMetadataFileSize)); - // TODO(b/259743562): [Optimization 1] decide max # buckets, unsorted buckets - // threshold // Initialize sorted_buckets int32_t pre_mapping_mmap_size = sizeof(Bucket) * (1 << 10); - int32_t max_file_size = - pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; ICING_ASSIGN_OR_RETURN( std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, FileBackedVector<Bucket>::Create( filesystem, GetSortedBucketsFilePath(working_path), - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, - pre_mapping_mmap_size)); + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size)); // Initialize unsorted_buckets - pre_mapping_mmap_size = sizeof(Bucket) * 100; - max_file_size = - pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize; + pre_mapping_mmap_size = sizeof(Bucket) * kUnsortedBucketsLengthThreshold; ICING_ASSIGN_OR_RETURN( std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, FileBackedVector<Bucket>::Create( filesystem, GetUnsortedBucketsFilePath(working_path), - MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size, - pre_mapping_mmap_size)); + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + FileBackedVector<Bucket>::kMaxFileSize, pre_mapping_mmap_size)); // Initialize flash_index_storage ICING_ASSIGN_OR_RETURN( @@ -845,28 +873,13 @@ IntegerIndexStorage::FlushDataIntoNewSortedBucket( } ICING_ASSIGN_OR_RETURN( - std::unique_ptr<PostingListIntegerIndexAccessor> new_pl_accessor, - PostingListIntegerIndexAccessor::Create( - 
storage->flash_index_storage_.get(), - storage->posting_list_serializer_)); - - std::sort(data.begin(), data.end()); - for (auto itr = data.rbegin(); itr != data.rend(); ++itr) { - ICING_RETURN_IF_ERROR(new_pl_accessor->PrependData(*itr)); - } - - PostingListAccessor::FinalizeResult result = - std::move(*new_pl_accessor).Finalize(); - if (!result.status.ok()) { - return result.status; - } - if (!result.id.is_valid()) { - return absl_ports::InternalError("Fail to flush data into posting list"); - } + PostingListIdentifier pl_id, + FlushDataIntoPostingLists(storage->flash_index_storage_.get(), + storage->posting_list_serializer_, data.begin(), + data.end())); storage->info().num_data += data.size(); - return storage->sorted_buckets_->Append( - Bucket(key_lower, key_upper, result.id)); + return storage->sorted_buckets_->Append(Bucket(key_lower, key_upper, pl_id)); } libtextclassifier3::Status IntegerIndexStorage::PersistStoragesToDisk() { @@ -921,21 +934,80 @@ IntegerIndexStorage::AddKeysIntoBucketAndSplitIfNecessary( } for (auto it = it_start; it != it_end; ++it) { - // TODO(b/259743562): [Optimization 1] implement split bucket if pl is full - // and the bucket is splittable + if (mutable_bucket.Get().key_lower() < mutable_bucket.Get().key_upper() && + pl_accessor->WantsSplit()) { + // If the bucket needs split (max size and full) and is splittable, then + // we perform bucket splitting. + + // 1. Finalize the current posting list accessor. + PostingListAccessor::FinalizeResult result = + std::move(*pl_accessor).Finalize(); + if (!result.status.ok()) { + return result.status; + } + + // 2. Create another posting list accessor instance. Read all data and + // free all posting lists. + ICING_ASSIGN_OR_RETURN( + pl_accessor, + PostingListIntegerIndexAccessor::CreateFromExisting( + flash_index_storage_.get(), posting_list_serializer_, result.id)); + ICING_ASSIGN_OR_RETURN(std::vector<IntegerIndexData> all_data, + pl_accessor->GetAllDataAndFree()); + + // 3. 
Append all remaining new data. + all_data.reserve(all_data.size() + std::distance(it, it_end)); + for (; it != it_end; ++it) { + all_data.push_back(IntegerIndexData(section_id, document_id, *it)); + } + + // 4. Run bucket splitting algorithm to decide new buckets and dispatch + // data. + std::vector<integer_index_bucket_util::DataRangeAndBucketInfo> + new_bucket_infos = integer_index_bucket_util::Split( + all_data, mutable_bucket.Get().key_lower(), + mutable_bucket.Get().key_upper(), + kNumDataThresholdForBucketSplit); + if (new_bucket_infos.empty()) { + ICING_LOG(WARNING) + << "No buckets after splitting. This should not happen."; + return absl_ports::InternalError("Split error"); + } + + // 5. Flush data. + std::vector<Bucket> new_buckets; + for (int i = 0; i < new_bucket_infos.size(); ++i) { + ICING_ASSIGN_OR_RETURN( + PostingListIdentifier pl_id, + FlushDataIntoPostingLists( + flash_index_storage_.get(), posting_list_serializer_, + new_bucket_infos[i].start, new_bucket_infos[i].end)); + if (i == 0) { + // Reuse mutable_bucket + mutable_bucket.Get().set_key_lower(new_bucket_infos[i].key_lower); + mutable_bucket.Get().set_key_upper(new_bucket_infos[i].key_upper); + mutable_bucket.Get().set_posting_list_identifier(pl_id); + } else { + new_buckets.push_back(Bucket(new_bucket_infos[i].key_lower, + new_bucket_infos[i].key_upper, pl_id)); + } + } + + return new_buckets; + } + ICING_RETURN_IF_ERROR(pl_accessor->PrependData( IntegerIndexData(section_id, document_id, *it))); } - // TODO(b/259743562): [Optimization 1] implement split and return new buckets. - // We will change the original bucket (mutable_bucket) - // in-place to one of the new buckets, and the rest will - // be returned and added into unsorted buckets in AddKeys. 
PostingListAccessor::FinalizeResult result = std::move(*pl_accessor).Finalize(); if (!result.status.ok()) { return result.status; } + if (!result.id.is_valid()) { + return absl_ports::InternalError("Fail to flush data into posting list(s)"); + } mutable_bucket.Get().set_posting_list_identifier(result.id); diff --git a/icing/index/numeric/integer-index-storage.h b/icing/index/numeric/integer-index-storage.h index be0add9..ddd9231 100644 --- a/icing/index/numeric/integer-index-storage.h +++ b/icing/index/numeric/integer-index-storage.h @@ -30,6 +30,7 @@ #include "icing/file/posting_list/flash-index-storage.h" #include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/index/numeric/integer-index-data.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" @@ -117,6 +118,10 @@ class IntegerIndexStorage : public PersistentStorage { int64_t key_upper() const { return key_upper_; } + void set_key_lower(int64_t key_lower) { key_lower_ = key_lower; } + + void set_key_upper(int64_t key_upper) { key_upper_ = key_upper; } + PostingListIdentifier posting_list_identifier() const { return posting_list_identifier_; } @@ -176,14 +181,29 @@ class IntegerIndexStorage : public PersistentStorage { WorkingPathType::kDirectory; static constexpr std::string_view kFilePrefix = "integer_index_storage"; - // # of data threshold for bucket merging. If total # data of adjacent buckets - // exceed this value, then flush the accumulated data. Otherwise merge - // buckets and their data. + // # of data threshold for bucket merging during optimization (TransferIndex). + // If total # data of adjacent buckets exceed this value, then flush the + // accumulated data. Otherwise merge buckets and their data. 
// // Calculated by: 0.7 * (kMaxPostingListSize / sizeof(IntegerIndexData)), // where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)). static constexpr int32_t kNumDataThresholdForBucketMerge = 240; + // # of data threshold for bucket splitting during indexing (AddKeys). + // When the posting list of a bucket is full, we will try to split data into + // multiple buckets according to their keys. In order to achieve good + // (amortized) time complexity, we want # of data in new buckets to be at most + // half # of elements in a full posting list. + // + // Calculated by: 0.5 * (kMaxPostingListSize / sizeof(IntegerIndexData)), + // where kMaxPostingListSize = (kPageSize - sizeof(IndexBlock::BlockHeader)). + static constexpr int32_t kNumDataThresholdForBucketSplit = 170; + + // Length threshold to sort and merge unsorted buckets into sorted buckets. If + // the length of unsorted_buckets exceed the threshold, then call + // SortBuckets(). + static constexpr int32_t kUnsortedBucketsLengthThreshold = 50; + // Creates a new IntegerIndexStorage instance to index integers (for a single // property). If any of the underlying file is missing, then delete the whole // working_path and (re)initialize with new ones. Otherwise initialize and @@ -370,7 +390,6 @@ class IntegerIndexStorage : public PersistentStorage { // into several new buckets with new ranges, and split the data (according // to their keys and the range of new buckets) of the original posting // list into several new posting lists. - // TODO(b/259743562): [Optimization 1] implement split // - Otherwise, just simply add a new key into it, and PostingListAccessor // mechanism will automatically create a new max size posting list and // chain them. 
diff --git a/icing/index/numeric/integer-index-storage_benchmark.cc b/icing/index/numeric/integer-index-storage_benchmark.cc index d150f2d..54b19c3 100644 --- a/icing/index/numeric/integer-index-storage_benchmark.cc +++ b/icing/index/numeric/integer-index-storage_benchmark.cc @@ -57,6 +57,7 @@ namespace lib { namespace { using ::testing::Eq; +using ::testing::IsEmpty; using ::testing::SizeIs; static constexpr SectionId kDefaultSectionId = 12; @@ -237,18 +238,24 @@ void BM_ExactQuery(benchmark::State& state) { std::unique_ptr<DocHitInfoIterator> iterator, storage->GetIterator(/*query_key_lower=*/exact_query_key, /*query_key_upper=*/exact_query_key)); - int cnt = 0; + std::vector<DocHitInfo> data; while (iterator->Advance().ok()) { - benchmark::DoNotOptimize(iterator->doc_hit_info()); - ++cnt; + data.push_back(iterator->doc_hit_info()); } + state.PauseTiming(); const auto it = keys.find(exact_query_key); if (it == keys.end()) { - ASSERT_THAT(cnt, Eq(0)); + ASSERT_THAT(data, IsEmpty()); } else { - ASSERT_THAT(it->second, SizeIs(cnt)); + ASSERT_THAT(data, SizeIs(it->second.size())); + std::reverse(data.begin(), data.end()); + for (int i = 0; i < data.size(); ++i) { + ASSERT_THAT(data[i].document_id(), Eq(it->second[i])); + ASSERT_THAT(data[i].hit_section_ids_mask(), Eq(1 << kDefaultSectionId)); + } } + state.ResumeTiming(); } } BENCHMARK(BM_ExactQuery) diff --git a/icing/index/numeric/integer-index-storage_test.cc b/icing/index/numeric/integer-index-storage_test.cc index 9d6864c..ed7d5db 100644 --- a/icing/index/numeric/integer-index-storage_test.cc +++ b/icing/index/numeric/integer-index-storage_test.cc @@ -14,6 +14,8 @@ #include "icing/index/numeric/integer-index-storage.h" +#include <unistd.h> + #include <cstdint> #include <limits> #include <memory> @@ -26,7 +28,10 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/file/file-backed-vector.h" +#include "icing/file/filesystem.h" #include "icing/file/persistent-storage.h" +#include 
"icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/index-block.h" #include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -42,14 +47,17 @@ namespace lib { namespace { +using ::testing::Contains; using ::testing::ElementsAre; using ::testing::ElementsAreArray; using ::testing::Eq; +using ::testing::Ge; using ::testing::Gt; using ::testing::HasSubstr; using ::testing::IsEmpty; using ::testing::IsFalse; using ::testing::IsTrue; +using ::testing::Key; using ::testing::Le; using ::testing::Ne; using ::testing::Not; @@ -1186,6 +1194,150 @@ TEST_F(IntegerIndexStorageTest, EqualsDocHitInfo(kDefaultDocumentId, expected_sections)))); } +TEST_F(IntegerIndexStorageTest, SplitBuckets) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, working_path_, Options(), + serializer_.get())); + + uint32_t block_size = FlashIndexStorage::SelectBlockSize(); + uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( + block_size, serializer_->GetDataTypeBytes()); + uint32_t max_num_data_before_split = + max_posting_list_bytes / serializer_->GetDataTypeBytes(); + + // Add max_num_data_before_split + 1 keys to invoke bucket splitting. + // Keys: max_num_data_before_split to 0 + // Document ids: 0 to max_num_data_before_split + std::unordered_map<int64_t, DocumentId> data; + int64_t key = max_num_data_before_split; + DocumentId document_id = 0; + for (int i = 0; i < max_num_data_before_split + 1; ++i) { + data[key] = document_id; + ICING_ASSERT_OK( + storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key})); + ++document_id; + --key; + } + ICING_ASSERT_OK(storage->PersistToDisk()); + + // Manually check sorted and unsorted buckets. + { + // Check sorted buckets. 
+ const std::string sorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, sorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + + EXPECT_THAT(sorted_buckets->num_elements(), Eq(1)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bucket1, + sorted_buckets->Get(/*idx=*/0)); + EXPECT_THAT(bucket1->key_lower(), Eq(std::numeric_limits<int64_t>::min())); + EXPECT_THAT(bucket1->key_upper(), Ne(std::numeric_limits<int64_t>::max())); + + int64_t sorted_bucket_key_upper = bucket1->key_upper(); + + // Check unsorted buckets. + const std::string unsorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, unsorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + + EXPECT_THAT(unsorted_buckets->num_elements(), Ge(1)); + ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bucket2, + unsorted_buckets->Get(/*idx=*/0)); + EXPECT_THAT(bucket2->key_lower(), Eq(sorted_bucket_key_upper + 1)); + } + + // Ensure that search works normally. 
+ std::vector<SectionId> expected_sections = {kDefaultSectionId}; + for (int64_t key = max_num_data_before_split; key >= 0; key--) { + ASSERT_THAT(data, Contains(Key(key))); + DocumentId expected_document_id = data[key]; + EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key), + IsOkAndHolds(ElementsAre(EqualsDocHitInfo(expected_document_id, + expected_sections)))); + } +} + +TEST_F(IntegerIndexStorageTest, SplitBucketsTriggerSortBuckets) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndexStorage> storage, + IntegerIndexStorage::Create(filesystem_, working_path_, Options(), + serializer_.get())); + + uint32_t block_size = FlashIndexStorage::SelectBlockSize(); + uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( + block_size, serializer_->GetDataTypeBytes()); + uint32_t max_num_data_before_split = + max_posting_list_bytes / serializer_->GetDataTypeBytes(); + + // Add IntegerIndexStorage::kUnsortedBucketsLengthThreshold keys. For each + // key, add max_num_data_before_split + 1 data. Then we will get: + // - Bucket splitting will create kUnsortedBucketsLengthThreshold + 1 unsorted + // buckets [[50, 50], [49, 49], ..., [1, 1], [51, INT64_MAX]]. + // - Since there are kUnsortedBucketsLengthThreshold + 1 unsorted buckets, we + // should sort and merge buckets. + std::unordered_map<int64_t, std::vector<DocumentId>> data; + int64_t key = IntegerIndexStorage::kUnsortedBucketsLengthThreshold; + DocumentId document_id = 0; + for (int i = 0; i < IntegerIndexStorage::kUnsortedBucketsLengthThreshold; + ++i) { + for (int j = 0; j < max_num_data_before_split + 1; ++j) { + data[key].push_back(document_id); + ICING_ASSERT_OK( + storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key})); + ++document_id; + } + --key; + } + ICING_ASSERT_OK(storage->PersistToDisk()); + + // Manually check sorted and unsorted buckets. + { + // Check unsorted buckets. 
+ const std::string unsorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, unsorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + EXPECT_THAT(unsorted_buckets->num_elements(), Eq(0)); + + // Check sorted buckets. + const std::string sorted_buckets_file_path = absl_ports::StrCat( + working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets, + FileBackedVector<Bucket>::Create( + filesystem_, sorted_buckets_file_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + EXPECT_THAT(sorted_buckets->num_elements(), Gt(1)); + } + + // Ensure that search works normally. + for (key = 1; key <= IntegerIndexStorage::kUnsortedBucketsLengthThreshold; + ++key) { + ASSERT_THAT(data, Contains(Key(key))); + + std::vector<DocHitInfo> expected_doc_hit_infos; + for (DocumentId doc_id : data[key]) { + expected_doc_hit_infos.push_back(DocHitInfo( + doc_id, /*hit_section_ids_mask=*/UINT64_C(1) << kDefaultSectionId)); + } + EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key), + IsOkAndHolds(ElementsAreArray(expected_doc_hit_infos.rbegin(), + expected_doc_hit_infos.rend()))); + } +} + TEST_F(IntegerIndexStorageTest, TransferIndex) { // We use predefined custom buckets to initialize new integer index storage // and create some test keys accordingly. 
diff --git a/icing/index/numeric/integer-index.cc b/icing/index/numeric/integer-index.cc index a2d40f1..2f876e4 100644 --- a/icing/index/numeric/integer-index.cc +++ b/icing/index/numeric/integer-index.cc @@ -14,10 +14,12 @@ #include "icing/index/numeric/integer-index.h" +#include <algorithm> #include <cstdint> #include <memory> #include <string> #include <string_view> +#include <utility> #include <vector> #include "icing/text_classifier/lib3/utils/base/status.h" @@ -27,6 +29,7 @@ #include "icing/file/destructible-directory.h" #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" +#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h" #include "icing/index/numeric/doc-hit-info-iterator-numeric.h" #include "icing/index/numeric/integer-index-storage.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" @@ -50,6 +53,17 @@ std::string GetMetadataFilePath(std::string_view working_path) { return absl_ports::StrCat(working_path, "/", GetMetadataFileName()); } +constexpr std::string_view kWildcardPropertyIndexFileName = + "wildcard_property_index"; + +constexpr std::string_view kWildcardPropertyStorageFileName = + "wildcard_property_storage"; + +std::string GetWildcardPropertyStorageFilePath(std::string_view working_path) { + return absl_ports::StrCat(working_path, "/", + kWildcardPropertyStorageFileName); +} + // Helper function to get the sub working (directory) path of // IntegerIndexStorage according to the given working directory and property // path. 
@@ -64,8 +78,9 @@ libtextclassifier3::StatusOr<std::vector<std::string>> GetAllExistingPropertyPaths(const Filesystem& filesystem, const std::string& working_path) { std::vector<std::string> property_paths; - if (!filesystem.ListDirectory(working_path.c_str(), - /*exclude=*/{GetMetadataFileName()}, + std::unordered_set<std::string> excludes = { + GetMetadataFileName(), std::string(kWildcardPropertyStorageFileName)}; + if (!filesystem.ListDirectory(working_path.c_str(), excludes, /*recursive=*/false, &property_paths)) { return absl_ports::InternalError("Failed to list directory"); } @@ -81,6 +96,9 @@ GetPropertyIntegerIndexStorageMap( IntegerIndex::PropertyToStorageMapType property_to_storage_map; for (const std::string& property_path : property_paths) { + if (property_path == kWildcardPropertyIndexFileName) { + continue; + } std::string storage_working_path = GetPropertyIndexStoragePath(working_path, property_path); ICING_ASSIGN_OR_RETURN( @@ -95,16 +113,61 @@ GetPropertyIntegerIndexStorageMap( return property_to_storage_map; } +// RETURNS: +// - On success, an unordered_set representing the list of property paths +// stored in the WildcardPropertyStorage managed by property_storage +// - INTERNAL_ERROR on any failure to successfully read the underlying proto. 
+libtextclassifier3::StatusOr<std::unordered_set<std::string>> CreatePropertySet( + const FileBackedProto<WildcardPropertyStorage>& property_storage) { + std::unordered_set<std::string> wildcard_properties_set; + auto wildcard_properties_or = property_storage.Read(); + if (!wildcard_properties_or.ok()) { + if (absl_ports::IsNotFound(wildcard_properties_or.status())) { + return wildcard_properties_set; + } + return wildcard_properties_or.status(); + } + + const WildcardPropertyStorage* wildcard_properties = + wildcard_properties_or.ValueOrDie(); + wildcard_properties_set.reserve(wildcard_properties->property_entries_size()); + for (const std::string& property : wildcard_properties->property_entries()) { + wildcard_properties_set.insert(property); + } + return wildcard_properties_set; +} + +} // namespace libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && { auto iter = integer_index_.property_to_storage_map_.find(property_path_); IntegerIndexStorage* target_storage = nullptr; + // 1. Check if this property already has its own individual index. if (iter != integer_index_.property_to_storage_map_.end()) { target_storage = iter->second.get(); + // 2. Check if this property was added to wildcard storage. + } else if (integer_index_.wildcard_properties_set_.find(property_path_) != + integer_index_.wildcard_properties_set_.end()) { + target_storage = integer_index_.wildcard_index_storage_.get(); + // 3. Check if we've reached the limit of individual property storages. + } else if (integer_index_.property_to_storage_map_.size() >= + kMaxPropertyStorages) { + // 3a. Create the wildcard storage if it doesn't exist.
+ if (integer_index_.wildcard_index_storage_ == nullptr) { + ICING_ASSIGN_OR_RETURN( + integer_index_.wildcard_index_storage_, + IntegerIndexStorage::Create( + integer_index_.filesystem_, + GetPropertyIndexStoragePath(integer_index_.working_path_, + kWildcardPropertyIndexFileName), + IntegerIndexStorage::Options(), + integer_index_.posting_list_serializer_.get())); + } + ICING_RETURN_IF_ERROR( + integer_index_.AddPropertyToWildcardStorage(property_path_)); + target_storage = integer_index_.wildcard_index_storage_.get(); + // 4. Create a new individual storage for this new property. } else { - // A new property path. Create a new storage instance and insert into the - // map. ICING_ASSIGN_OR_RETURN( std::unique_ptr<IntegerIndexStorage> new_storage, IntegerIndexStorage::Create( @@ -144,15 +207,45 @@ IntegerIndex::~IntegerIndex() { libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> IntegerIndex::GetIterator(std::string_view property_path, int64_t key_lower, - int64_t key_upper) const { - auto iter = property_to_storage_map_.find(std::string(property_path)); - if (iter == property_to_storage_map_.end()) { - // Return an empty iterator. 
- return std::make_unique<DocHitInfoIteratorNumeric<int64_t>>( - /*numeric_index_iter=*/nullptr); + int64_t key_upper, + const DocumentStore& document_store, + const SchemaStore& schema_store) const { + std::string property_path_str(property_path); + auto iter = property_to_storage_map_.find(property_path_str); + if (iter != property_to_storage_map_.end()) { + return iter->second->GetIterator(key_lower, key_upper); + } + + if (wildcard_properties_set_.find(property_path_str) != + wildcard_properties_set_.end()) { + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<DocHitInfoIterator> delegate, + wildcard_index_storage_->GetIterator(key_lower, key_upper)); + std::set<std::string> property_paths = {std::move(property_path_str)}; + return std::make_unique<DocHitInfoIteratorSectionRestrict>( + std::move(delegate), &document_store, &schema_store, + std::move(property_paths)); + } + + // Return an empty iterator. + return std::make_unique<DocHitInfoIteratorNumeric<int64_t>>( + /*numeric_index_iter=*/nullptr); +} + +libtextclassifier3::Status IntegerIndex::AddPropertyToWildcardStorage( + const std::string& property_path) { + WildcardPropertyStorage wildcard_properties; + wildcard_properties.mutable_property_entries()->Reserve( + wildcard_properties_set_.size()); + for (const std::string& property_path : wildcard_properties_set_) { + wildcard_properties.add_property_entries(property_path); } + ICING_RETURN_IF_ERROR(wildcard_property_storage_->Write( + std::make_unique<WildcardPropertyStorage>( + std::move(wildcard_properties)))); - return iter->second->GetIterator(key_lower, key_upper); + wildcard_properties_set_.insert(property_path); + return libtextclassifier3::Status::OK; } libtextclassifier3::Status IntegerIndex::Optimize( @@ -183,6 +276,8 @@ libtextclassifier3::Status IntegerIndex::Optimize( // Destruct current storage instances to safely swap directories. 
metadata_mmapped_file_.reset(); property_to_storage_map_.clear(); + wildcard_index_storage_.reset(); + wildcard_property_storage_.reset(); if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(), working_path_.c_str())) { return absl_ports::InternalError( @@ -190,9 +285,10 @@ libtextclassifier3::Status IntegerIndex::Optimize( } // Reinitialize the integer index. + std::string metadata_file_path = GetMetadataFilePath(working_path_); ICING_ASSIGN_OR_RETURN( MemoryMappedFile metadata_mmapped_file, - MemoryMappedFile::Create(filesystem_, GetMetadataFilePath(working_path_), + MemoryMappedFile::Create(filesystem_, metadata_file_path, MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, /*max_file_size=*/kMetadataFileSize, /*pre_mapping_file_offset=*/0, @@ -200,6 +296,25 @@ libtextclassifier3::Status IntegerIndex::Optimize( metadata_mmapped_file_ = std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)); + // Recreate all of the data structures tracking the wildcard storage. + std::string wildcard_property_path = + GetWildcardPropertyStorageFilePath(working_path_); + wildcard_property_storage_ = + std::make_unique<FileBackedProto<WildcardPropertyStorage>>( + filesystem_, wildcard_property_path); + + ICING_ASSIGN_OR_RETURN(wildcard_properties_set_, + CreatePropertySet(*wildcard_property_storage_)); + if (!wildcard_properties_set_.empty()) { + ICING_ASSIGN_OR_RETURN( + wildcard_index_storage_, + IntegerIndexStorage::Create( + filesystem_, + GetPropertyIndexStoragePath(working_path_, + kWildcardPropertyIndexFileName), + IntegerIndexStorage::Options(), posting_list_serializer_.get())); + } + // Initialize all existing integer index storages. ICING_ASSIGN_OR_RETURN( property_to_storage_map_, @@ -212,6 +327,7 @@ libtextclassifier3::Status IntegerIndex::Optimize( libtextclassifier3::Status IntegerIndex::Clear() { // Step 1: clear property_to_storage_map_. 
property_to_storage_map_.clear(); + wildcard_index_storage_.reset(); // Step 2: delete all IntegerIndexStorages. It is safe because there is no // active IntegerIndexStorage after clearing the map. @@ -224,6 +340,15 @@ libtextclassifier3::Status IntegerIndex::Clear() { GetPropertyIndexStoragePath(working_path_, property_path))); } + // Step 3: Delete the wildcard property storage + std::string wildcard_property_path = + GetWildcardPropertyStorageFilePath(working_path_); + if (filesystem_.FileExists(wildcard_property_path.c_str()) || + !filesystem_.DeleteFile(wildcard_property_path.c_str())) { + return absl_ports::InternalError(absl_ports::StrCat( + "Unable to delete file at path ", wildcard_property_path)); + } + info().last_added_document_id = kInvalidDocumentId; return libtextclassifier3::Status::OK; } @@ -249,12 +374,20 @@ IntegerIndex::InitializeNewFiles(const Filesystem& filesystem, ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary( /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize)); + std::string wildcard_property_path = + GetWildcardPropertyStorageFilePath(working_path); + auto wildcard_property_storage = + std::make_unique<FileBackedProto<WildcardPropertyStorage>>( + filesystem, wildcard_property_path); + // Create instance. auto new_integer_index = std::unique_ptr<IntegerIndex>(new IntegerIndex( filesystem, std::move(working_path), std::make_unique<PostingListIntegerIndexSerializer>(), std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)), - /*property_to_storage_map=*/{})); + /*property_to_storage_map=*/{}, std::move(wildcard_property_storage), + /*wildcard_properties_set=*/{}, /*wildcard_index_storage=*/nullptr)); + // Initialize info content by writing mapped memory directly. 
Info& info_ref = new_integer_index->info(); info_ref.magic = Info::kMagic; @@ -287,11 +420,33 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, GetPropertyIntegerIndexStorageMap(filesystem, working_path, posting_list_serializer.get())); + std::string wildcard_property_path = + GetWildcardPropertyStorageFilePath(working_path); + auto wildcard_property_storage = + std::make_unique<FileBackedProto<WildcardPropertyStorage>>( + filesystem, wildcard_property_path); + + ICING_ASSIGN_OR_RETURN( + std::unordered_set<std::string> wildcard_properties_set, + CreatePropertySet(*wildcard_property_storage)); + + std::unique_ptr<IntegerIndexStorage> wildcard_index_storage; + if (!wildcard_properties_set.empty()) { + ICING_ASSIGN_OR_RETURN( + wildcard_index_storage, + IntegerIndexStorage::Create( + filesystem, + GetPropertyIndexStoragePath(working_path, + kWildcardPropertyIndexFileName), + IntegerIndexStorage::Options(), posting_list_serializer.get())); + } + // Create instance. auto integer_index = std::unique_ptr<IntegerIndex>(new IntegerIndex( filesystem, std::move(working_path), std::move(posting_list_serializer), std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)), - std::move(property_to_storage_map))); + std::move(property_to_storage_map), std::move(wildcard_property_storage), + std::move(wildcard_properties_set), std::move(wildcard_index_storage))); // Initialize existing PersistentStorage. Checksums will be validated. 
ICING_RETURN_IF_ERROR(integer_index->InitializeExistingStorage()); @@ -303,31 +458,78 @@ IntegerIndex::InitializeExistingFiles(const Filesystem& filesystem, return integer_index; } -libtextclassifier3::Status IntegerIndex::TransferIndex( +libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> +IntegerIndex::TransferIntegerIndexStorage( const std::vector<DocumentId>& document_id_old_to_new, + const IntegerIndexStorage* old_storage, const std::string& property_path, IntegerIndex* new_integer_index) const { - for (const auto& [property_path, old_storage] : property_to_storage_map_) { - std::string new_storage_working_path = GetPropertyIndexStoragePath( - new_integer_index->working_path_, property_path); - ICING_ASSIGN_OR_RETURN( - std::unique_ptr<IntegerIndexStorage> new_storage, - IntegerIndexStorage::Create( - new_integer_index->filesystem_, new_storage_working_path, - IntegerIndexStorage::Options(), - new_integer_index->posting_list_serializer_.get())); + std::string new_storage_working_path = GetPropertyIndexStoragePath( + new_integer_index->working_path_, property_path); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<IntegerIndexStorage> new_storage, + IntegerIndexStorage::Create( + new_integer_index->filesystem_, new_storage_working_path, + IntegerIndexStorage::Options(), + new_integer_index->posting_list_serializer_.get())); + + ICING_RETURN_IF_ERROR( + old_storage->TransferIndex(document_id_old_to_new, new_storage.get())); + if (new_storage->num_data() == 0) { + new_storage.reset(); ICING_RETURN_IF_ERROR( - old_storage->TransferIndex(document_id_old_to_new, new_storage.get())); + IntegerIndexStorage::Discard(filesystem_, new_storage_working_path)); + } + return new_storage; +} + +libtextclassifier3::Status IntegerIndex::TransferWildcardStorage( + IntegerIndex* new_integer_index) const { + auto property_storage = std::make_unique<WildcardPropertyStorage>(); + property_storage->mutable_property_entries()->Reserve( + wildcard_properties_set_.size()); + 
for (const std::string& property : wildcard_properties_set_) { + property_storage->add_property_entries(property); + } + + ICING_RETURN_IF_ERROR(new_integer_index->wildcard_property_storage_->Write( + std::move(property_storage))); + new_integer_index->wildcard_properties_set_ = wildcard_properties_set_; + return libtextclassifier3::Status::OK; +} - if (new_storage->num_data() == 0) { - new_storage.reset(); - ICING_RETURN_IF_ERROR( - IntegerIndexStorage::Discard(filesystem_, new_storage_working_path)); - } else { +libtextclassifier3::Status IntegerIndex::TransferIndex( + const std::vector<DocumentId>& document_id_old_to_new, + IntegerIndex* new_integer_index) const { + // Transfer over the integer index storages + std::unique_ptr<IntegerIndexStorage> new_storage; + for (const auto& [property_path, old_storage] : property_to_storage_map_) { + ICING_ASSIGN_OR_RETURN( + new_storage, + TransferIntegerIndexStorage(document_id_old_to_new, old_storage.get(), + property_path, new_integer_index)); + if (new_storage != nullptr) { new_integer_index->property_to_storage_map_.insert( - std::make_pair(property_path, std::move(new_storage))); + {property_path, std::move(new_storage)}); } } + if (wildcard_index_storage_ != nullptr) { + ICING_ASSIGN_OR_RETURN( + new_storage, + TransferIntegerIndexStorage( + document_id_old_to_new, wildcard_index_storage_.get(), + std::string(kWildcardPropertyIndexFileName), new_integer_index)); + if (new_storage != nullptr) { + new_integer_index->wildcard_index_storage_ = std::move(new_storage); + + // The only time we need to copy over the list of properties using + // wildcard storage is if wildcard_index_storage and new_storage are both + // non-null. Otherwise, the new wildcard index storage won't have any + // data. 
+ ICING_RETURN_IF_ERROR(TransferWildcardStorage(new_integer_index)); + } + } + return libtextclassifier3::Status::OK; } @@ -335,6 +537,11 @@ libtextclassifier3::Status IntegerIndex::PersistStoragesToDisk() { for (auto& [_, storage] : property_to_storage_map_) { ICING_RETURN_IF_ERROR(storage->PersistToDisk()); } + // No need to persist wildcard_properties_storage_. All calls to + // FileBackedProto::Write are fully written through at the time of the call. + if (wildcard_index_storage_) { + ICING_RETURN_IF_ERROR(wildcard_index_storage_->PersistToDisk()); + } return libtextclassifier3::Status::OK; } @@ -350,8 +557,8 @@ libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeInfoChecksum() { } libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeStoragesChecksum() { - // XOR all crcs of all storages. Since XOR is commutative and associative, the - // order doesn't matter. + // XOR all crcs of all storages. Since XOR is commutative and associative, + // the order doesn't matter. uint32_t storages_checksum = 0; for (auto& [property_path, storage] : property_to_storage_map_) { ICING_ASSIGN_OR_RETURN(Crc32 storage_crc, storage->UpdateChecksums()); @@ -359,6 +566,17 @@ libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeStoragesChecksum() { storages_checksum ^= storage_crc.Get(); } + + if (wildcard_index_storage_ != nullptr) { + ICING_ASSIGN_OR_RETURN(Crc32 storage_crc, + wildcard_index_storage_->UpdateChecksums()); + storages_checksum ^= storage_crc.Get(); + } + + ICING_ASSIGN_OR_RETURN(Crc32 wildcard_properties_crc, + wildcard_property_storage_->ComputeChecksum()); + storages_checksum ^= wildcard_properties_crc.Get(); + return Crc32(storages_checksum); } diff --git a/icing/index/numeric/integer-index.h b/icing/index/numeric/integer-index.h index 050a143..303bb41 100644 --- a/icing/index/numeric/integer-index.h +++ b/icing/index/numeric/integer-index.h @@ -23,12 +23,16 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include 
"icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/file/file-backed-proto.h" #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" #include "icing/index/numeric/integer-index-storage.h" #include "icing/index/numeric/numeric-index.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" +#include "icing/index/numeric/wildcard-property-storage.pb.h" +#include "icing/schema/schema-store.h" #include "icing/store/document-id.h" +#include "icing/store/document-store.h" #include "icing/util/crc32.h" namespace icing { @@ -46,6 +50,11 @@ class IntegerIndex : public NumericIndex<int64_t> { using PropertyToStorageMapType = std::unordered_map<std::string, std::unique_ptr<IntegerIndexStorage>>; + // Maximum number of individual property storages that this index will allow + // before falling back to placing hits for any new properties into the + // 'wildcard' storage. + static constexpr int kMaxPropertyStorages = 32; + struct Info { static constexpr int32_t kMagic = 0x238a3dcb; @@ -125,8 +134,9 @@ class IntegerIndex : public NumericIndex<int64_t> { // - NOT_FOUND_ERROR if the given property_path doesn't exist // - Any IntegerIndexStorage errors libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator( - std::string_view property_path, int64_t key_lower, - int64_t key_upper) const override; + std::string_view property_path, int64_t key_lower, int64_t key_upper, + const DocumentStore& document_store, + const SchemaStore& schema_store) const override; // Reduces internal file sizes by reclaiming space and ids of deleted // documents. Integer index will convert all data (hits) to the new document @@ -165,6 +175,11 @@ class IntegerIndex : public NumericIndex<int64_t> { } } + int num_property_indices() const override { + return property_to_storage_map_.size() + + ((wildcard_index_storage_ == nullptr) ? 
0 : 1); + } + private: class Editor : public NumericIndex<int64_t>::Editor { public: @@ -191,17 +206,24 @@ class IntegerIndex : public NumericIndex<int64_t> { IntegerIndex& integer_index_; // Does not own. }; - explicit IntegerIndex(const Filesystem& filesystem, - std::string&& working_path, - std::unique_ptr<PostingListIntegerIndexSerializer> - posting_list_serializer, - std::unique_ptr<MemoryMappedFile> metadata_mmapped_file, - PropertyToStorageMapType&& property_to_storage_map) + explicit IntegerIndex( + const Filesystem& filesystem, std::string&& working_path, + std::unique_ptr<PostingListIntegerIndexSerializer> + posting_list_serializer, + std::unique_ptr<MemoryMappedFile> metadata_mmapped_file, + PropertyToStorageMapType&& property_to_storage_map, + std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> + wildcard_property_storage, + std::unordered_set<std::string> wildcard_properties_set, + std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage) : NumericIndex<int64_t>(filesystem, std::move(working_path), kWorkingPathType), posting_list_serializer_(std::move(posting_list_serializer)), metadata_mmapped_file_(std::move(metadata_mmapped_file)), - property_to_storage_map_(std::move(property_to_storage_map)) {} + property_to_storage_map_(std::move(property_to_storage_map)), + wildcard_property_storage_(std::move(wildcard_property_storage)), + wildcard_properties_set_(std::move(wildcard_properties_set)), + wildcard_index_storage_(std::move(wildcard_index_storage)) {} static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path); @@ -210,6 +232,17 @@ class IntegerIndex : public NumericIndex<int64_t> { InitializeExistingFiles(const Filesystem& filesystem, std::string&& working_path); + // Adds the property path to the list of properties using wildcard storage. 
+ // This will both update the in-memory list (wildcard_properties_set_) and + // the persistent list (wildcard_property_storage_). + // + // RETURNS: + // - OK on success + // - INTERNAL_ERROR if unable to successfully persist updated properties + // list in wildcard_property_storage_. + libtextclassifier3::Status AddPropertyToWildcardStorage( + const std::string& property_path); + // Transfers integer index data from the current integer index to // new_integer_index. // @@ -222,6 +255,29 @@ class IntegerIndex : public NumericIndex<int64_t> { const std::vector<DocumentId>& document_id_old_to_new, IntegerIndex* new_integer_index) const; + // Transfers integer index data from old_storage to new_integer_index. + // + // Returns: + // - OK on success + // - INTERNAL_ERROR on I/O error. This could potentially leave the storages + // in an invalid state and the caller should handle it properly (e.g. + // discard and rebuild) + libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>> + TransferIntegerIndexStorage( + const std::vector<DocumentId>& document_id_old_to_new, + const IntegerIndexStorage* old_storage, const std::string& property_path, + IntegerIndex* new_integer_index) const; + + // Transfers the persistent and in-memory list of properties using the + // wildcard storage from old_storage to new_integer_index. + // + // RETURNS: + // - OK on success + // - INTERNAL_ERROR if unable to successfully persist updated properties + // list in new_integer_index. + libtextclassifier3::Status TransferWildcardStorage( + IntegerIndex* new_integer_index) const; + // Flushes contents of all storages to underlying files. // // Returns: @@ -277,6 +333,19 @@ class IntegerIndex : public NumericIndex<int64_t> { // Property path to integer index storage map. PropertyToStorageMapType property_to_storage_map_; + + // Persistent list of properties that have added content to + // wildcard_index_storage_. 
+ std::unique_ptr<FileBackedProto<WildcardPropertyStorage>> + wildcard_property_storage_; + + // In-memory list of properties that have added content to + // wildcard_index_storage_. + std::unordered_set<std::string> wildcard_properties_set_; + + // The index storage that is used once we have already created + // kMaxPropertyStorages in property_to_storage_map. + std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage_; }; } // namespace lib diff --git a/icing/index/numeric/integer-index_test.cc b/icing/index/numeric/integer-index_test.cc index c6cf855..c4dacb8 100644 --- a/icing/index/numeric/integer-index_test.cc +++ b/icing/index/numeric/integer-index_test.cc @@ -25,6 +25,7 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator.h" @@ -32,6 +33,9 @@ #include "icing/index/numeric/integer-index-storage.h" #include "icing/index/numeric/numeric-index.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" @@ -68,9 +72,25 @@ class NumericIndexIntegerTest : public ::testing::Test { IsTrue()); working_path_ = base_dir_ + "/numeric_index_integer_test"; + std::string schema_dir = base_dir_ + "/schema_test"; + + ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(schema_dir.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + schema_store_, SchemaStore::Create(&filesystem_, schema_dir, &clock_)); + + std::string document_store_dir = base_dir_ + "/doc_store_test"; + ASSERT_TRUE( + filesystem_.CreateDirectoryRecursively(document_store_dir.c_str())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult 
doc_store_create_result, + DocumentStore::Create(&filesystem_, document_store_dir, &clock_, + schema_store_.get())); + doc_store_ = std::move(doc_store_create_result.document_store); } void TearDown() override { + doc_store_.reset(); + schema_store_.reset(); filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); } @@ -92,9 +112,67 @@ class NumericIndexIntegerTest : public ::testing::Test { return IntegerIndex::Create(filesystem_, working_path_); } + template <typename NotIntegerIndexType> + bool is_integer_index() const { + return false; + } + + template <> + bool is_integer_index<IntegerIndex>() const { + return true; + } + + libtextclassifier3::StatusOr<std::vector<DocumentId>> CompactDocStore() { + std::string document_store_dir = base_dir_ + "/doc_store_test"; + std::string document_store_compact_dir = + base_dir_ + "/doc_store_compact_test"; + if (!filesystem_.CreateDirectoryRecursively( + document_store_compact_dir.c_str())) { + return absl_ports::InternalError("Unable to create compact directory"); + } + ICING_ASSIGN_OR_RETURN( + std::vector<DocumentId> docid_map, + doc_store_->OptimizeInto(document_store_compact_dir, nullptr)); + + doc_store_.reset(); + if (!filesystem_.SwapFiles(document_store_dir.c_str(), + document_store_compact_dir.c_str())) { + return absl_ports::InternalError("Unable to swap directories."); + } + if (!filesystem_.DeleteDirectoryRecursively( + document_store_compact_dir.c_str())) { + return absl_ports::InternalError("Unable to delete compact directory"); + } + + ICING_ASSIGN_OR_RETURN( + DocumentStore::CreateResult doc_store_create_result, + DocumentStore::Create(&filesystem_, document_store_dir, &clock_, + schema_store_.get())); + doc_store_ = std::move(doc_store_create_result.document_store); + return docid_map; + } + + libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query( + const NumericIndex<int64_t>* integer_index, + std::string_view property_path, int64_t key_lower, int64_t key_upper) { + ICING_ASSIGN_OR_RETURN( + 
std::unique_ptr<DocHitInfoIterator> iter, + integer_index->GetIterator(property_path, key_lower, key_upper, + *doc_store_, *schema_store_)); + + std::vector<DocHitInfo> result; + while (iter->Advance().ok()) { + result.push_back(iter->doc_hit_info()); + } + return result; + } + Filesystem filesystem_; std::string base_dir_; std::string working_path_; + std::unique_ptr<SchemaStore> schema_store_; + std::unique_ptr<DocumentStore> doc_store_; + Clock clock_; }; void Index(NumericIndex<int64_t>* integer_index, std::string_view property_path, @@ -109,20 +187,6 @@ void Index(NumericIndex<int64_t>* integer_index, std::string_view property_path, ICING_EXPECT_OK(std::move(*editor).IndexAllBufferedKeys()); } -libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query( - const NumericIndex<int64_t>* integer_index, std::string_view property_path, - int64_t key_lower, int64_t key_upper) { - ICING_ASSIGN_OR_RETURN( - std::unique_ptr<DocHitInfoIterator> iter, - integer_index->GetIterator(property_path, key_lower, key_upper)); - - std::vector<DocHitInfo> result; - while (iter->Advance().ok()) { - result.push_back(iter->doc_hit_info()); - } - return result; -} - using TestTypes = ::testing::Types<DummyNumericIndex<int64_t>, IntegerIndex>; TYPED_TEST_SUITE(NumericIndexIntegerTest, TestTypes); @@ -180,8 +244,8 @@ TYPED_TEST(NumericIndexIntegerTest, SingleKeyExactQuery) { int64_t query_key = 2; std::vector<SectionId> expected_sections = {kDefaultSectionId}; - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/query_key, /*key_upper=*/query_key), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/query_key, /*key_upper=*/query_key), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/5, expected_sections), EqualsDocHitInfo(/*document_id=*/2, expected_sections)))); @@ -206,8 +270,8 @@ TYPED_TEST(NumericIndexIntegerTest, SingleKeyRangeQuery) { kDefaultSectionId, /*keys=*/{2}); std::vector<SectionId> 
expected_sections = {kDefaultSectionId}; - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/1, /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/1, /*key_upper=*/3), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/5, expected_sections), EqualsDocHitInfo(/*document_id=*/2, expected_sections), @@ -215,6 +279,258 @@ TYPED_TEST(NumericIndexIntegerTest, SingleKeyRangeQuery) { EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); } +TYPED_TEST(NumericIndexIntegerTest, WildcardStorageQuery) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<NumericIndex<int64_t>> integer_index, + this->template CreateIntegerIndex<TypeParam>()); + + // This test sets its schema assuming that max property storages == 32. + ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32)); + + PropertyConfigProto int_property_config = + PropertyConfigBuilder() + .SetName("otherProperty1") + .SetCardinality(CARDINALITY_REPEATED) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .Build(); + // Create a schema with two types: + // - TypeA has 34 properties: + // 'desiredProperty', 'otherProperty'*, 'undesiredProperty' + // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty' + // 1. The 32 'otherProperty's will consume all of the individual storages + // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned + // SectionId = 0 for their respective types. 
+ SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeA") + .AddProperty(int_property_config) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty2")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty3")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty4")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty5")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty6")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty7")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty8")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty9")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty10")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty11")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty12")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty13")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty14")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty15")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty16")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty17")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty18")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty19")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty20")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty21")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty22")) + 
.AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty23")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty24")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty25")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty26")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty27")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty28")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty29")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty30")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty31")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty32")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("undesiredProperty"))) + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeB") + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("anotherProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty"))) + .Build(); + ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + + // Put 11 docs of "TypeA" into the document store. 
+ DocumentProto doc = + DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build())); + + // Put 5 docs of "TypeB" into the document store. 
+ doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build())); + + // Ids are assigned alphabetically, so the property ids are: + // TypeA.desiredProperty = 0 + // TypeA.otherPropertyN = N + // TypeA.undesiredProperty = 33 + // TypeB.anotherProperty = 0 + // TypeB.desiredProperty = 1 + SectionId typea_desired_prop_id = 0; + SectionId typea_undesired_prop_id = 33; + SectionId typeb_another_prop_id = 0; + SectionId typeb_desired_prop_id = 1; + + // Index numeric content for other properties to force our property into the + // wildcard storage. 
+ std::string other_property_path = "otherProperty"; + for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) { + Index(integer_index.get(), + absl_ports::StrCat(other_property_path, std::to_string(i)), + /*document_id=*/0, /*section_id=*/i, /*keys=*/{i}); + } + + // Index numeric content for TypeA.desiredProperty + std::string desired_property = "desiredProperty"; + Index(integer_index.get(), desired_property, /*document_id=*/0, + typea_desired_prop_id, /*keys=*/{1}); + Index(integer_index.get(), desired_property, /*document_id=*/1, + typea_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/2, + typea_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/3, + typea_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/4, + typea_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/5, + typea_desired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeA.undesiredProperty + std::string undesired_property = "undesiredProperty"; + Index(integer_index.get(), undesired_property, /*document_id=*/6, + typea_undesired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), undesired_property, /*document_id=*/7, + typea_undesired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), undesired_property, /*document_id=*/8, + typea_undesired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), undesired_property, /*document_id=*/9, + typea_undesired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), undesired_property, /*document_id=*/10, + typea_undesired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeB.anotherProperty + std::string another_property = "anotherProperty"; + Index(integer_index.get(), another_property, /*document_id=*/11, + typeb_another_prop_id, /*keys=*/{3}); + Index(integer_index.get(), another_property, /*document_id=*/12, + typeb_another_prop_id, 
/*keys=*/{2}); + Index(integer_index.get(), another_property, /*document_id=*/13, + typeb_another_prop_id, /*keys=*/{0}); + Index(integer_index.get(), another_property, /*document_id=*/14, + typeb_another_prop_id, /*keys=*/{4}); + Index(integer_index.get(), another_property, /*document_id=*/15, + typeb_another_prop_id, /*keys=*/{2}); + + // Finally, index the same numeric content for TypeB.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/16, + typeb_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/17, + typeb_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/18, + typeb_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/19, + typeb_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/20, + typeb_desired_prop_id, /*keys=*/{2}); + + if (this->template is_integer_index<TypeParam>()) { + EXPECT_THAT(integer_index->num_property_indices(), Eq(33)); + } else { + EXPECT_THAT(integer_index->num_property_indices(), Eq(35)); + } + + // Only the hits for 'desired_prop_id' should be returned. 
+ std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id}; + std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id}; + EXPECT_THAT( + this->Query(integer_index.get(), desired_property, + /*key_lower=*/2, /*key_upper=*/2), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea)))); + + EXPECT_THAT( + this->Query(integer_index.get(), desired_property, + /*key_lower=*/1, /*key_upper=*/3), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/16, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea)))); +} + TYPED_TEST(NumericIndexIntegerTest, EmptyResult) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<NumericIndex<int64_t>> integer_index, @@ -233,11 +549,11 @@ TYPED_TEST(NumericIndexIntegerTest, EmptyResult) { Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5, kDefaultSectionId, /*keys=*/{2}); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/10, /*key_upper=*/10), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/10, /*key_upper=*/10), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/100, /*key_upper=*/200), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/100, /*key_upper=*/200), IsOkAndHolds(IsEmpty())); } @@ -252,8 +568,8 @@ 
TYPED_TEST(NumericIndexIntegerTest, Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0, kDefaultSectionId, /*keys=*/{1}); - EXPECT_THAT(Query(integer_index.get(), kAnotherPropertyPath, - /*key_lower=*/100, /*key_upper=*/200), + EXPECT_THAT(this->Query(integer_index.get(), kAnotherPropertyPath, + /*key_lower=*/100, /*key_upper=*/200), IsOkAndHolds(IsEmpty())); } @@ -286,8 +602,8 @@ TYPED_TEST(NumericIndexIntegerTest, kDefaultSectionId, /*keys=*/{4, -1000}); std::vector<SectionId> expected_sections = {kDefaultSectionId}; - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/1, /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/1, /*key_upper=*/3), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/6, expected_sections), EqualsDocHitInfo(/*document_id=*/5, expected_sections), @@ -326,39 +642,39 @@ TYPED_TEST(NumericIndexIntegerTest, EdgeNumericValues) { std::vector<SectionId> expected_sections = {kDefaultSectionId}; // Negative key - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/-100, /*key_upper=*/-70), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/-100, /*key_upper=*/-70), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/2, expected_sections), EqualsDocHitInfo(/*document_id=*/1, expected_sections)))); // INT64_MAX key - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/std::numeric_limits<int64_t>::max(), - /*key_upper=*/std::numeric_limits<int64_t>::max()), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/std::numeric_limits<int64_t>::max(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/7, expected_sections), EqualsDocHitInfo(/*document_id=*/3, expected_sections)))); // INT64_MIN key - EXPECT_THAT(Query(integer_index.get(), 
kDefaultTestPropertyPath, - /*key_lower=*/std::numeric_limits<int64_t>::min(), - /*key_upper=*/std::numeric_limits<int64_t>::min()), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::min()), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/9, expected_sections), EqualsDocHitInfo(/*document_id=*/4, expected_sections)))); // Key = 0 - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/0, /*key_upper=*/0), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/0, /*key_upper=*/0), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/8, expected_sections), EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); // All keys from INT64_MIN to INT64_MAX - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/std::numeric_limits<int64_t>::min(), - /*key_upper=*/std::numeric_limits<int64_t>::max()), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/9, expected_sections), EqualsDocHitInfo(/*document_id=*/8, expected_sections), @@ -404,8 +720,9 @@ TYPED_TEST(NumericIndexIntegerTest, /*section_id=*/3, /*keys=*/{5}); EXPECT_THAT( - Query(integer_index.get(), kDefaultTestPropertyPath, /*key_lower=*/1, - /*key_upper=*/3), + this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/1, + /*key_upper=*/3), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/2, std::vector<SectionId>{4, 5}), EqualsDocHitInfo(/*document_id=*/1, std::vector<SectionId>{1, 2}), @@ -433,8 +750,8 @@ TYPED_TEST(NumericIndexIntegerTest, NonRelevantPropertyShouldNotBeIncluded) { kDefaultSectionId, /*keys=*/{2}); std::vector<SectionId> expected_sections = 
{kDefaultSectionId}; - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/1, /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/1, /*key_upper=*/3), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/5, expected_sections), EqualsDocHitInfo(/*document_id=*/1, expected_sections), @@ -460,8 +777,8 @@ TYPED_TEST(NumericIndexIntegerTest, Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5, kDefaultSectionId, /*keys=*/{2}); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/3, /*key_upper=*/1), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/3, /*key_upper=*/1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -499,30 +816,30 @@ TYPED_TEST(NumericIndexIntegerTest, Optimize) { // Verify index and query API still work normally after Optimize(). std::vector<SectionId> expected_sections = {kDefaultSectionId}; - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/1, /*key_upper=*/1), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/1, /*key_upper=*/1), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/0, expected_sections)))); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/3, /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/3, /*key_upper=*/3), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/1, expected_sections)))); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/0, /*key_upper=*/0), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/0, /*key_upper=*/0), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/4, /*key_upper=*/4), + 
EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/4, /*key_upper=*/4), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/2, expected_sections)))); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/2, /*key_upper=*/2), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/2, /*key_upper=*/2), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/3, expected_sections)))); Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5, kDefaultSectionId, /*keys=*/{123}); - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/123, /*key_upper=*/123), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/123, /*key_upper=*/123), IsOkAndHolds(ElementsAre( EqualsDocHitInfo(/*document_id=*/5, expected_sections)))); } @@ -581,40 +898,40 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeMultiplePropertyPaths) { // Verify index and query API still work normally after Optimize(). 
// Key = 1 - EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/1, - /*key_upper=*/1), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/1, + /*key_upper=*/1), IsOkAndHolds(IsEmpty())); - EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/1, - /*key_upper=*/1), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/1, + /*key_upper=*/1), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/0, std::vector<SectionId>{kSectionId2})))); // key = 2 - EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/2, - /*key_upper=*/2), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/2, + /*key_upper=*/2), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/0, std::vector<SectionId>{kSectionId1})))); - EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/2, - /*key_upper=*/2), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/2, + /*key_upper=*/2), IsOkAndHolds(IsEmpty())); // key = 3 - EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/3, - /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/3, + /*key_upper=*/3), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/1, std::vector<SectionId>{kSectionId1})))); - EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/3, - /*key_upper=*/3), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/3, + /*key_upper=*/3), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/2, std::vector<SectionId>{kSectionId2})))); // key = 4 - EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/4, - /*key_upper=*/4), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/4, + /*key_upper=*/4), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/3, std::vector<SectionId>{kSectionId1})))); - 
EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/4, - /*key_upper=*/4), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/4, + /*key_upper=*/4), IsOkAndHolds(IsEmpty())); } @@ -655,9 +972,9 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeShouldDiscardEmptyPropertyStorage) { // All data in "prop2" as well as the underlying storage should be deleted, so // when querying "prop2", we should get empty result. - EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, - /*key_lower=*/std::numeric_limits<int64_t>::min(), - /*key_upper=*/std::numeric_limits<int64_t>::max()), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), IsOkAndHolds(IsEmpty())); if (std::is_same_v<IntegerIndex, TypeParam>) { std::string prop2_storage_working_path = @@ -670,8 +987,8 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeShouldDiscardEmptyPropertyStorage) { // Verify we can still index and query for "prop2". Index(integer_index.get(), kPropertyPath2, /*document_id=*/100, kSectionId2, /*keys=*/{123}); - EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, - /*key_lower=*/123, /*key_upper=*/123), + EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, + /*key_lower=*/123, /*key_upper=*/123), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/100, std::vector<SectionId>{kSectionId2})))); } @@ -697,9 +1014,9 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeOutOfRangeDocumentId) { EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId)); // Verify all data are discarded after Optimize(). 
- EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/std::numeric_limits<int64_t>::min(), - /*key_upper=*/std::numeric_limits<int64_t>::max()), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), IsOkAndHolds(IsEmpty())); } @@ -731,9 +1048,9 @@ TYPED_TEST(NumericIndexIntegerTest, OptimizeDeleteAll) { EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId)); // Verify all data are discarded after Optimize(). - EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath, - /*key_lower=*/std::numeric_limits<int64_t>::min(), - /*key_upper=*/std::numeric_limits<int64_t>::max()), + EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath, + /*key_lower=*/std::numeric_limits<int64_t>::min(), + /*key_upper=*/std::numeric_limits<int64_t>::max()), IsOkAndHolds(IsEmpty())); } @@ -750,13 +1067,13 @@ TYPED_TEST(NumericIndexIntegerTest, Clear) { ASSERT_THAT(integer_index->last_added_document_id(), Eq(1)); ASSERT_THAT( - Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1, - /*key_upper=*/1), + this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1, + /*key_upper=*/1), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/0, std::vector<SectionId>{kDefaultSectionId})))); ASSERT_THAT( - Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3, - /*key_upper=*/3), + this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3, + /*key_upper=*/3), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/1, std::vector<SectionId>{kDefaultSectionId})))); @@ -764,12 +1081,14 @@ TYPED_TEST(NumericIndexIntegerTest, Clear) { // kInvalidDocumentId, and the previous added keys should be deleted. 
ICING_ASSERT_OK(integer_index->Clear()); EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId)); - EXPECT_THAT(Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1, - /*key_upper=*/1), - IsOkAndHolds(IsEmpty())); - EXPECT_THAT(Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3, - /*key_upper=*/3), - IsOkAndHolds(IsEmpty())); + EXPECT_THAT( + this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1, + /*key_upper=*/1), + IsOkAndHolds(IsEmpty())); + EXPECT_THAT( + this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3, + /*key_upper=*/3), + IsOkAndHolds(IsEmpty())); // Integer index should be able to work normally after Clear(). Index(integer_index.get(), /*property_path=*/"A", /*document_id=*/3, @@ -780,13 +1099,13 @@ TYPED_TEST(NumericIndexIntegerTest, Clear) { EXPECT_THAT(integer_index->last_added_document_id(), Eq(4)); EXPECT_THAT( - Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/123, - /*key_upper=*/123), + this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/123, + /*key_upper=*/123), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/3, std::vector<SectionId>{kDefaultSectionId})))); EXPECT_THAT( - Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/456, - /*key_upper=*/456), + this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/456, + /*key_upper=*/456), IsOkAndHolds(ElementsAre(EqualsDocHitInfo( /*document_id=*/4, std::vector<SectionId>{kDefaultSectionId})))); } @@ -1066,6 +1385,260 @@ TEST_F(IntegerIndexTest, HasSubstr("Invalid storages crc")); } } + +TEST_F(IntegerIndexTest, WildcardStoragePersistenceQuery) { + // This test sets its schema assuming that max property storages == 32. 
+ ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32)); + + PropertyConfigProto int_property_config = + PropertyConfigBuilder() + .SetName("otherProperty1") + .SetCardinality(CARDINALITY_REPEATED) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .Build(); + // Create a schema with two types: + // - TypeA has 34 properties: + // 'desiredProperty', 'otherProperty'*, 'undesiredProperty' + // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty' + // 1. The 32 'otherProperty's will consume all of the individual storages + // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned + // SectionId = 0 for their respective types. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeA") + .AddProperty(int_property_config) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty2")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty3")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty4")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty5")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty6")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty7")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty8")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty9")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty10")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty11")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty12")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty13")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty14")) + .AddProperty(PropertyConfigBuilder(int_property_config) + 
.SetName("otherProperty15")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty16")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty17")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty18")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty19")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty20")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty21")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty22")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty23")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty24")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty25")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty26")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty27")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty28")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty29")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty30")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty31")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty32")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("undesiredProperty"))) + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeB") + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("anotherProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty"))) + .Build(); + 
ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + + // Ids are assigned alphabetically, so the property ids are: + // TypeA.desiredProperty = 0 + // TypeA.otherPropertyN = N + // TypeA.undesiredProperty = 33 + // TypeB.anotherProperty = 0 + // TypeB.desiredProperty = 1 + SectionId typea_desired_prop_id = 0; + SectionId typea_undesired_prop_id = 33; + SectionId typeb_another_prop_id = 0; + SectionId typeb_desired_prop_id = 1; + std::string desired_property = "desiredProperty"; + std::string undesired_property = "undesiredProperty"; + std::string another_property = "anotherProperty"; + + // Put 11 docs of "TypeA" into the document store. + DocumentProto doc = + DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build())); + + // Put 10 docs of "TypeB" into the document store. 
+ doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build())); + + { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + // Index numeric content for other properties to force our property into the + // wildcard storage. 
+ std::string other_property_path = "otherProperty"; + for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) { + Index(integer_index.get(), + absl_ports::StrCat(other_property_path, std::to_string(i)), + /*document_id=*/0, /*section_id=*/i, /*keys=*/{i}); + } + + // Index numeric content for TypeA.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/0, + typea_desired_prop_id, /*keys=*/{1}); + Index(integer_index.get(), desired_property, /*document_id=*/1, + typea_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/2, + typea_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/3, + typea_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/4, + typea_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/5, + typea_desired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeA.undesiredProperty + Index(integer_index.get(), undesired_property, /*document_id=*/6, + typea_undesired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), undesired_property, /*document_id=*/7, + typea_undesired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), undesired_property, /*document_id=*/8, + typea_undesired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), undesired_property, /*document_id=*/9, + typea_undesired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), undesired_property, /*document_id=*/10, + typea_undesired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeB.undesiredProperty + Index(integer_index.get(), another_property, /*document_id=*/11, + typeb_another_prop_id, /*keys=*/{3}); + Index(integer_index.get(), another_property, /*document_id=*/12, + typeb_another_prop_id, /*keys=*/{2}); + Index(integer_index.get(), another_property, /*document_id=*/13, + typeb_another_prop_id, /*keys=*/{0}); + Index(integer_index.get(), 
another_property, /*document_id=*/14, + typeb_another_prop_id, /*keys=*/{4}); + Index(integer_index.get(), another_property, /*document_id=*/15, + typeb_another_prop_id, /*keys=*/{2}); + + // Finally, index the same numeric content for TypeB.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/16, + typeb_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/17, + typeb_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/18, + typeb_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/19, + typeb_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/20, + typeb_desired_prop_id, /*keys=*/{2}); + } + + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + EXPECT_THAT(integer_index->num_property_indices(), Eq(33)); + + // Only the hits for 'desired_prop_id' should be returned. 
+ std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id}; + std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id}; + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/2, /*key_upper=*/2), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea)))); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/1, /*key_upper=*/3), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/16, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea)))); +} + TEST_F(IntegerIndexTest, IntegerIndexShouldWorkAfterOptimizeAndReinitialization) { constexpr std::string_view kPropertyPath1 = "prop1"; @@ -1183,6 +1756,550 @@ TEST_F(IntegerIndexTest, } } +TEST_F(IntegerIndexTest, WildcardStorageWorksAfterOptimize) { + // This test sets its schema assuming that max property storages == 32. + ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32)); + + PropertyConfigProto int_property_config = + PropertyConfigBuilder() + .SetName("otherProperty1") + .SetCardinality(CARDINALITY_REPEATED) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .Build(); + // Create a schema with two types: + // - TypeA has 34 properties: + // 'desiredProperty', 'otherProperty'*, 'undesiredProperty' + // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty' + // 1. The 32 'otherProperty's will consume all of the individual storages + // 2. 
TypeA.desiredProperty and TypeB.anotherProperty will both be assigned + // SectionId = 0 for their respective types. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeA") + .AddProperty(int_property_config) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty2")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty3")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty4")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty5")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty6")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty7")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty8")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty9")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty10")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty11")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty12")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty13")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty14")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty15")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty16")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty17")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty18")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty19")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty20")) + .AddProperty(PropertyConfigBuilder(int_property_config) + 
.SetName("otherProperty21")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty22")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty23")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty24")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty25")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty26")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty27")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty28")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty29")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty30")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty31")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty32")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("undesiredProperty"))) + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeB") + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("anotherProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty"))) + .Build(); + ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + + // Ids are assigned alphabetically, so the property ids are: + // TypeA.desiredProperty = 0 + // TypeA.otherPropertyN = N + // TypeA.undesiredProperty = 33 + // TypeB.anotherProperty = 0 + // TypeB.desiredProperty = 1 + SectionId typea_desired_prop_id = 0; + SectionId typea_undesired_prop_id = 33; + SectionId typeb_another_prop_id = 0; + SectionId typeb_desired_prop_id = 1; + std::string desired_property = "desiredProperty"; + std::string undesired_property = "undesiredProperty"; + std::string 
another_property = "anotherProperty"; + + // Only the hits for 'desired_prop_id' should be returned. + std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id}; + std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id}; + + // Put 11 docs of "TypeA" into the document store. + DocumentProto doc = + DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build())); + + // Put 10 docs of "TypeB" into the document store. 
+ doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build())); + + { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + // Index numeric content for other properties to force our property into the + // wildcard storage. 
+ std::string other_property_path = "otherProperty"; + for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) { + Index(integer_index.get(), + absl_ports::StrCat(other_property_path, std::to_string(i)), + /*document_id=*/0, /*section_id=*/i, /*keys=*/{i}); + } + + // Index numeric content for TypeA.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/0, + typea_desired_prop_id, /*keys=*/{1}); + Index(integer_index.get(), desired_property, /*document_id=*/1, + typea_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/2, + typea_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/3, + typea_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/4, + typea_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/5, + typea_desired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeA.undesiredProperty + Index(integer_index.get(), undesired_property, /*document_id=*/6, + typea_undesired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), undesired_property, /*document_id=*/7, + typea_undesired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), undesired_property, /*document_id=*/8, + typea_undesired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), undesired_property, /*document_id=*/9, + typea_undesired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), undesired_property, /*document_id=*/10, + typea_undesired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeB.undesiredProperty + Index(integer_index.get(), another_property, /*document_id=*/11, + typeb_another_prop_id, /*keys=*/{3}); + Index(integer_index.get(), another_property, /*document_id=*/12, + typeb_another_prop_id, /*keys=*/{2}); + Index(integer_index.get(), another_property, /*document_id=*/13, + typeb_another_prop_id, /*keys=*/{0}); + Index(integer_index.get(), 
another_property, /*document_id=*/14, + typeb_another_prop_id, /*keys=*/{4}); + Index(integer_index.get(), another_property, /*document_id=*/15, + typeb_another_prop_id, /*keys=*/{2}); + + // Finally, index the same numeric content for TypeB.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/16, + typeb_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/17, + typeb_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/18, + typeb_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/19, + typeb_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/20, + typeb_desired_prop_id, /*keys=*/{2}); + + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/3)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/5)); + // Delete doc id = 3, 5, compress and keep the rest. + ICING_ASSERT_OK_AND_ASSIGN(std::vector<DocumentId> document_id_old_to_new, + CompactDocStore()); + + DocumentId new_last_added_document_id = 18; + EXPECT_THAT(integer_index->Optimize(document_id_old_to_new, + new_last_added_document_id), + IsOk()); + EXPECT_THAT(integer_index->last_added_document_id(), + Eq(new_last_added_document_id)); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/2, /*key_upper=*/2), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea)))); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/1, /*key_upper=*/3), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/16 - 2, expected_sections_typeb), + 
EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea)))); + } + + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + EXPECT_THAT(integer_index->num_property_indices(), Eq(33)); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/2, /*key_upper=*/2), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea)))); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/1, /*key_upper=*/3), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/16 - 2, expected_sections_typeb), + EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea)))); +} + +// This test covers the situation where Optimize causes us to throw out some of +// the individual index storages (because they don't have any hits anymore). +// In this case, any properties that added content to the wildcard storage (even +// if all of their content was also deleted) should still be placed in the +// wilcard storage. +TEST_F(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) { + // This test sets its schema assuming that max property storages == 32. 
+ ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32)); + + PropertyConfigProto int_property_config = + PropertyConfigBuilder() + .SetName("otherProperty1") + .SetCardinality(CARDINALITY_REPEATED) + .SetDataTypeInt64(NUMERIC_MATCH_RANGE) + .Build(); + // Create a schema with two types: + // - TypeA has 34 properties: + // 'desiredProperty', 'otherProperty'*, 'undesiredProperty' + // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty' + // 1. The 32 'otherProperty's will consume all of the individual storages + // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned + // SectionId = 0 for their respective types. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("TypeA") + .AddProperty(int_property_config) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty2")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty3")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty4")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty5")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty6")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty7")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty8")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty9")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty10")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty11")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty12")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty13")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty14")) + .AddProperty(PropertyConfigBuilder(int_property_config) + 
.SetName("otherProperty15")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty16")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty17")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty18")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty19")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty20")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty21")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty22")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty23")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty24")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty25")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty26")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty27")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty28")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty29")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty30")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty31")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("otherProperty32")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("desiredProperty")) + .AddProperty(PropertyConfigBuilder(int_property_config) + .SetName("undesiredProperty"))) + .Build(); + ICING_ASSERT_OK(this->schema_store_->SetSchema(schema)); + + // Ids are assigned alphabetically, so the property ids are: + // TypeA.desiredProperty = 0 + // TypeA.otherPropertyN = N + // TypeA.undesiredProperty = 33 + // TypeB.anotherProperty = 0 + // 
TypeB.desiredProperty = 1 + SectionId typea_desired_prop_id = 0; + SectionId typea_undesired_prop_id = 33; + SectionId typea_other1_prop_id = 1; + std::string desired_property = "desiredProperty"; + std::string undesired_property = "undesiredProperty"; + std::string another_property = "anotherProperty"; + std::string other_property_1 = "otherProperty1"; + + // Only the hits for 'desired_prop_id' should be returned. + std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id}; + + // Put 11 docs of "TypeA" into the document store. + DocumentProto doc = + DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build())); + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build())); + + { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + // Index numeric content for other properties to force our property into the + // wildcard storage. 
+ std::string other_property_path = "otherProperty"; + for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) { + Index(integer_index.get(), + absl_ports::StrCat(other_property_path, std::to_string(i)), + /*document_id=*/0, /*section_id=*/i, /*keys=*/{i}); + } + + // Index numeric content for TypeA.desiredProperty + Index(integer_index.get(), desired_property, /*document_id=*/0, + typea_desired_prop_id, /*keys=*/{1}); + Index(integer_index.get(), desired_property, /*document_id=*/1, + typea_desired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), desired_property, /*document_id=*/2, + typea_desired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), desired_property, /*document_id=*/3, + typea_desired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), desired_property, /*document_id=*/4, + typea_desired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), desired_property, /*document_id=*/5, + typea_desired_prop_id, /*keys=*/{2}); + + // Index the same numeric content for TypeA.undesiredProperty + Index(integer_index.get(), undesired_property, /*document_id=*/6, + typea_undesired_prop_id, /*keys=*/{3}); + Index(integer_index.get(), undesired_property, /*document_id=*/7, + typea_undesired_prop_id, /*keys=*/{2}); + Index(integer_index.get(), undesired_property, /*document_id=*/8, + typea_undesired_prop_id, /*keys=*/{0}); + Index(integer_index.get(), undesired_property, /*document_id=*/9, + typea_undesired_prop_id, /*keys=*/{4}); + Index(integer_index.get(), undesired_property, /*document_id=*/10, + typea_undesired_prop_id, /*keys=*/{2}); + + // Delete all the docs that had hits in otherProperty* and + // undesiredProperty. 
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/0)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/6)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/7)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/8)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/9)); + ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/10)); + // Delete doc id = 0, 6, 7, 8, 9, 10. Compress and keep the rest. + ICING_ASSERT_OK_AND_ASSIGN(std::vector<DocumentId> document_id_old_to_new, + CompactDocStore()); + + DocumentId new_last_added_document_id = 5 - 1; + EXPECT_THAT(integer_index->Optimize(document_id_old_to_new, + new_last_added_document_id), + IsOk()); + EXPECT_THAT(integer_index->last_added_document_id(), + Eq(new_last_added_document_id)); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/2, /*key_upper=*/2), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/5 - 1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2 - 1, expected_sections_typea)))); + + EXPECT_THAT( + Query(integer_index.get(), desired_property, + /*key_lower=*/1, /*key_upper=*/3), + IsOkAndHolds(ElementsAre( + EqualsDocHitInfo(/*document_id=*/5 - 1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/2 - 1, expected_sections_typea), + EqualsDocHitInfo(/*document_id=*/1 - 1, expected_sections_typea)))); + } + + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerIndex> integer_index, + IntegerIndex::Create(filesystem_, working_path_)); + + EXPECT_THAT(integer_index->num_property_indices(), Eq(1)); + + // Add a new doc (docid==5) and a hit for desiredProperty. This should still + // be placed into the wildcard integer storage. 
+ doc = DocumentBuilder().SetKey("ns1", "uri11").SetSchema("TypeA").Build(); + ICING_ASSERT_OK(this->doc_store_->Put(doc)); + Index(integer_index.get(), desired_property, /*document_id=*/5, + typea_desired_prop_id, /*keys=*/{12}); + EXPECT_THAT(integer_index->num_property_indices(), Eq(1)); + + EXPECT_THAT(Query(integer_index.get(), desired_property, + /*key_lower=*/12, /*key_upper=*/12), + IsOkAndHolds(ElementsAre(EqualsDocHitInfo( + /*document_id=*/5, expected_sections_typea)))); + + // Add a new doc (docid==6) and a hit for undesiredProperty. This should still + // be placed into the wildcard integer storage. + ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build())); + Index(integer_index.get(), undesired_property, /*document_id=*/6, + typea_undesired_prop_id, /*keys=*/{3}); + EXPECT_THAT(integer_index->num_property_indices(), Eq(1)); + + expected_sections_typea = {typea_undesired_prop_id}; + EXPECT_THAT(Query(integer_index.get(), undesired_property, + /*key_lower=*/3, /*key_upper=*/3), + IsOkAndHolds(ElementsAre(EqualsDocHitInfo( + /*document_id=*/6, expected_sections_typea)))); + + // Add a new doc (docid==7) and a hit for otherProperty1. This should be given + // its own individual storage. 
+ ICING_ASSERT_OK( + this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build())); + Index(integer_index.get(), other_property_1, /*document_id=*/7, + typea_other1_prop_id, /*keys=*/{3}); + EXPECT_THAT(integer_index->num_property_indices(), Eq(2)); + + expected_sections_typea = {typea_other1_prop_id}; + EXPECT_THAT(Query(integer_index.get(), other_property_1, + /*key_lower=*/3, /*key_upper=*/3), + IsOkAndHolds(ElementsAre(EqualsDocHitInfo( + /*document_id=*/7, expected_sections_typea)))); +} + } // namespace } // namespace lib diff --git a/icing/index/numeric/numeric-index.h b/icing/index/numeric/numeric-index.h index 347260a..28640ca 100644 --- a/icing/index/numeric/numeric-index.h +++ b/icing/index/numeric/numeric-index.h @@ -23,8 +23,10 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/file/persistent-storage.h" #include "icing/index/iterator/doc-hit-info-iterator.h" +#include "icing/schema/schema-store.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" +#include "icing/store/document-store.h" namespace icing { namespace lib { @@ -126,8 +128,9 @@ class NumericIndex : public PersistentStorage { // - INVALID_ARGUMENT_ERROR if key_lower > key_upper // - Any other errors, depending on the actual implementation virtual libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> - GetIterator(std::string_view property_path, T key_lower, - T key_upper) const = 0; + GetIterator(std::string_view property_path, T key_lower, T key_upper, + const DocumentStore& document_store, + const SchemaStore& schema_store) const = 0; // Reduces internal file sizes by reclaiming space and ids of deleted // documents. Numeric index will convert all data (hits) to the new document @@ -162,6 +165,10 @@ class NumericIndex : public PersistentStorage { // last_added_document_id() or last_added_document_id() is invalid. 
virtual void set_last_added_document_id(DocumentId document_id) = 0; + // The number of individual indices that the NumericIndex has created to + // search over all indexed properties thus far. + virtual int num_property_indices() const = 0; + protected: explicit NumericIndex(const Filesystem& filesystem, std::string&& working_path, diff --git a/icing/index/numeric/posting-list-integer-index-accessor.cc b/icing/index/numeric/posting-list-integer-index-accessor.cc index 220b240..af2aea4 100644 --- a/icing/index/numeric/posting-list-integer-index-accessor.cc +++ b/icing/index/numeric/posting-list-integer-index-accessor.cc @@ -64,6 +64,58 @@ PostingListIntegerIndexAccessor::CreateFromExisting( // Returns the next batch of integer index data for the provided posting list. libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> PostingListIntegerIndexAccessor::GetNextDataBatch() { + return GetNextDataBatchImpl(/*free_posting_list=*/false); +} + +libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> +PostingListIntegerIndexAccessor::GetAllDataAndFree() { + if (preexisting_posting_list_ == nullptr) { + return absl_ports::FailedPreconditionError( + "Cannot retrieve data from a PostingListIntegerIndexAccessor that " + "was not created from a preexisting posting list."); + } + + std::vector<IntegerIndexData> all_data; + while (true) { + ICING_ASSIGN_OR_RETURN(std::vector<IntegerIndexData> batch, + GetNextDataBatchImpl(/*free_posting_list=*/true)); + if (batch.empty()) { + break; + } + std::move(batch.begin(), batch.end(), std::back_inserter(all_data)); + } + + return all_data; +} + +libtextclassifier3::Status PostingListIntegerIndexAccessor::PrependData( + const IntegerIndexData& data) { + PostingListUsed& active_pl = (preexisting_posting_list_ != nullptr) + ? 
preexisting_posting_list_->posting_list + : in_memory_posting_list_; + libtextclassifier3::Status status = + serializer_->PrependData(&active_pl, data); + if (!absl_ports::IsResourceExhausted(status)) { + return status; + } + // There is no more room to add data to this current posting list! Therefore, + // we need to either move those data to a larger posting list or flush this + // posting list and create another max-sized posting list in the chain. + if (preexisting_posting_list_ != nullptr) { + ICING_RETURN_IF_ERROR(FlushPreexistingPostingList()); + } else { + ICING_RETURN_IF_ERROR(FlushInMemoryPostingList()); + } + + // Re-add data. Should always fit since we just cleared + // in_memory_posting_list_. It's fine to explicitly reference + // in_memory_posting_list_ here because there's no way of reaching this line + // while preexisting_posting_list_ is still in use. + return serializer_->PrependData(&in_memory_posting_list_, data); +} + +libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> +PostingListIntegerIndexAccessor::GetNextDataBatchImpl(bool free_posting_list) { if (preexisting_posting_list_ == nullptr) { if (has_reached_posting_list_chain_end_) { return std::vector<IntegerIndexData>(); @@ -85,6 +137,11 @@ PostingListIntegerIndexAccessor::GetNextDataBatch() { next_block_index = preexisting_posting_list_->next_block_index; } + if (free_posting_list) { + ICING_RETURN_IF_ERROR( + storage_->FreePostingList(std::move(*preexisting_posting_list_))); + } + if (next_block_index != kInvalidBlockIndex) { // Since we only have to deal with next block for max-sized posting list // block, max_num_posting_lists is 1 and posting_list_index_bits is @@ -103,31 +160,5 @@ PostingListIntegerIndexAccessor::GetNextDataBatch() { return batch; } -libtextclassifier3::Status PostingListIntegerIndexAccessor::PrependData( - const IntegerIndexData& data) { - PostingListUsed& active_pl = (preexisting_posting_list_ != nullptr) - ? 
preexisting_posting_list_->posting_list - : in_memory_posting_list_; - libtextclassifier3::Status status = - serializer_->PrependData(&active_pl, data); - if (!absl_ports::IsResourceExhausted(status)) { - return status; - } - // There is no more room to add data to this current posting list! Therefore, - // we need to either move those data to a larger posting list or flush this - // posting list and create another max-sized posting list in the chain. - if (preexisting_posting_list_ != nullptr) { - ICING_RETURN_IF_ERROR(FlushPreexistingPostingList()); - } else { - ICING_RETURN_IF_ERROR(FlushInMemoryPostingList()); - } - - // Re-add data. Should always fit since we just cleared - // in_memory_posting_list_. It's fine to explicitly reference - // in_memory_posting_list_ here because there's no way of reaching this line - // while preexisting_posting_list_ is still in use. - return serializer_->PrependData(&in_memory_posting_list_, data); -} - } // namespace lib } // namespace icing diff --git a/icing/index/numeric/posting-list-integer-index-accessor.h b/icing/index/numeric/posting-list-integer-index-accessor.h index 4c1eced..f0d3d25 100644 --- a/icing/index/numeric/posting-list-integer-index-accessor.h +++ b/icing/index/numeric/posting-list-integer-index-accessor.h @@ -50,7 +50,7 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor { Create(FlashIndexStorage* storage, PostingListIntegerIndexSerializer* serializer); - // Create a PostingListIntegerIndexAccessor with an existing posting list + // Creates a PostingListIntegerIndexAccessor with an existing posting list // identified by existing_posting_list_id. // // RETURNS: @@ -64,17 +64,30 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor { PostingListSerializer* GetSerializer() override { return serializer_; } - // Retrieve the next batch of data in the posting list chain + // Retrieves the next batch of data in the posting list chain. 
// // RETURNS: // - On success, a vector of integer index data in the posting list chain - // - INTERNAL if called on an instance that was created via Create, if - // unable to read the next posting list in the chain or if the posting - // list has been corrupted somehow. + // - FAILED_PRECONDITION_ERROR if called on an instance that was created via + // Create. + // - INTERNAL_ERROR if unable to read the next posting list in the chain or + // if the posting list has been corrupted somehow. libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> GetNextDataBatch(); - // Prepend one data. This may result in flushing the posting list to disk (if + // Retrieves all data from the posting list chain and frees all posting + // list(s). + // + // RETURNS: + // - On success, a vector of integer index data in the posting list chain + // - FAILED_PRECONDITION_ERROR if called on an instance that was created via + // Create. + // - INTERNAL_ERROR if unable to read the next posting list in the chain or + // if the posting list has been corrupted somehow. + libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> + GetAllDataAndFree(); + + // Prepends one data. This may result in flushing the posting list to disk (if // the PostingListIntegerIndexAccessor holds a max-sized posting list that // is full) or freeing a pre-existing posting list if it is too small to fit // all data necessary. @@ -87,7 +100,15 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor { // posting list. libtextclassifier3::Status PrependData(const IntegerIndexData& data); - // TODO(b/259743562): [Optimization 1] add GetAndClear, IsFull for split + bool WantsSplit() const { + const PostingListUsed* current_pl = + preexisting_posting_list_ != nullptr + ? &preexisting_posting_list_->posting_list + : &in_memory_posting_list_; + // Only max-sized PLs get split. Smaller PLs just get copied to larger PLs. 
+ return current_pl->size_in_bytes() == storage_->max_posting_list_bytes() && + serializer_->IsFull(current_pl); + } private: explicit PostingListIntegerIndexAccessor( @@ -96,6 +117,20 @@ class PostingListIntegerIndexAccessor : public PostingListAccessor { : PostingListAccessor(storage, std::move(in_memory_posting_list)), serializer_(serializer) {} + // Retrieves the next batch of data in the posting list chain. + // + // - free_posting_list: a boolean flag indicating whether freeing all posting + // lists after retrieving batch data. + // + // RETURNS: + // - On success, a vector of integer index data in the posting list chain + // - FAILED_PRECONDITION_ERROR if called on an instance that was created via + // Create. + // - INTERNAL_ERROR if unable to read the next posting list in the chain or + // if the posting list has been corrupted somehow. + libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> + GetNextDataBatchImpl(bool free_posting_list); + PostingListIntegerIndexSerializer* serializer_; // Does not own. 
}; diff --git a/icing/index/numeric/posting-list-integer-index-accessor_test.cc b/icing/index/numeric/posting-list-integer-index-accessor_test.cc index 48221b9..f655fea 100644 --- a/icing/index/numeric/posting-list-integer-index-accessor_test.cc +++ b/icing/index/numeric/posting-list-integer-index-accessor_test.cc @@ -25,6 +25,7 @@ #include "gtest/gtest.h" #include "icing/file/filesystem.h" #include "icing/file/posting_list/flash-index-storage.h" +#include "icing/file/posting_list/posting-list-common.h" #include "icing/file/posting_list/posting-list-identifier.h" #include "icing/index/numeric/integer-index-data.h" #include "icing/index/numeric/posting-list-integer-index-serializer.h" @@ -42,6 +43,7 @@ using ::testing::ElementsAre; using ::testing::ElementsAreArray; using ::testing::Eq; using ::testing::Lt; +using ::testing::Ne; using ::testing::SizeIs; class PostingListIntegerIndexAccessorTest : public ::testing::Test { @@ -402,6 +404,131 @@ TEST_F(PostingListIntegerIndexAccessorTest, EXPECT_THAT(result2.status, IsOk()); } +TEST_F(PostingListIntegerIndexAccessorTest, GetAllDataAndFree) { + IntegerIndexData data1(/*section_id=*/3, /*document_id=*/1, /*key=*/123); + IntegerIndexData data2(/*section_id=*/3, /*document_id=*/2, /*key=*/456); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor1, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); + // Add 2 data. 
+ ICING_ASSERT_OK(pl_accessor1->PrependData(data1)); + ICING_ASSERT_OK(pl_accessor1->PrependData(data2)); + PostingListAccessor::FinalizeResult result1 = + std::move(*pl_accessor1).Finalize(); + ICING_ASSERT_OK(result1.status); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor2, + PostingListIntegerIndexAccessor::CreateFromExisting( + flash_index_storage_.get(), serializer_.get(), result1.id)); + EXPECT_THAT(pl_accessor2->GetAllDataAndFree(), + IsOkAndHolds(ElementsAre(data2, data1))); + + // Allocate a new posting list with same size again. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor3, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); + // Add 2 data. + ICING_ASSERT_OK(pl_accessor3->PrependData(data1)); + ICING_ASSERT_OK(pl_accessor3->PrependData(data2)); + PostingListAccessor::FinalizeResult result3 = + std::move(*pl_accessor3).Finalize(); + ICING_ASSERT_OK(result3.status); + // We should get the same id if the previous one has been freed correctly by + // GetAllDataAndFree. + EXPECT_THAT(result3.id, Eq(result1.id)); +} + +TEST_F(PostingListIntegerIndexAccessorTest, GetAllDataAndFreePostingListChain) { + uint32_t block_size = FlashIndexStorage::SelectBlockSize(); + uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes( + block_size, serializer_->GetDataTypeBytes()); + uint32_t max_num_data_single_posting_list = + max_posting_list_bytes / serializer_->GetDataTypeBytes(); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor1, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); + + // Prepend max_num_data_single_posting_list + 1 data. 
+ std::vector<IntegerIndexData> data_vec; + for (uint32_t i = 0; i < max_num_data_single_posting_list + 1; ++i) { + IntegerIndexData data(/*section_id=*/3, static_cast<DocumentId>(i), + /*key=*/i); + ICING_ASSERT_OK(pl_accessor1->PrependData(data)); + data_vec.push_back(data); + } + + // This will cause: + // - Allocate the first max-sized posting list at block index = 1, storing + // max_num_data_single_posting_list data. + // - Allocate the second max-sized posting list at block index = 2, storing 1 + // data. Also its next_block_index is 1. + // - IOW, we will get 2 -> 1 and result1.id points to 2. + PostingListAccessor::FinalizeResult result1 = + std::move(*pl_accessor1).Finalize(); + ICING_ASSERT_OK(result1.status); + + uint32_t first_pl_block_index = kInvalidBlockIndex; + { + // result1.id points at the second (max-sized) PL, and next_block_index of + // the second PL points to the first PL's block. Fetch the first PL's block + // index manually. + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder pl_holder, + flash_index_storage_->GetPostingList(result1.id)); + first_pl_block_index = pl_holder.next_block_index; + } + ASSERT_THAT(first_pl_block_index, Ne(kInvalidBlockIndex)); + + // Call GetAllDataAndFree. This will free block 2 and block 1. + // Free block list: 1 -> 2 (since free block list is LIFO). + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor2, + PostingListIntegerIndexAccessor::CreateFromExisting( + flash_index_storage_.get(), serializer_.get(), result1.id)); + EXPECT_THAT( + pl_accessor2->GetAllDataAndFree(), + IsOkAndHolds(ElementsAreArray(data_vec.rbegin(), data_vec.rend()))); + pl_accessor2.reset(); + + // Allocate a new posting list with same size again. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor3, + PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(), + serializer_.get())); + // Add same set of data. 
+ for (uint32_t i = 0; i < max_num_data_single_posting_list + 1; ++i) { + ICING_ASSERT_OK(pl_accessor3->PrependData(data_vec[i])); + } + + // This will cause: + // - Allocate the first max-sized posting list from the free block list, which + // is block index = 1, storing max_num_data_single_posting_list data. + // - Allocate the second max-sized posting list from the next block in free + // block list, which is block index = 2, storing 1 data. Also its + // next_block_index should be 1. + PostingListAccessor::FinalizeResult result3 = + std::move(*pl_accessor3).Finalize(); + ICING_ASSERT_OK(result3.status); + // We should get the same id if the previous one has been freed correctly by + // GetAllDataAndFree. + EXPECT_THAT(result3.id, Eq(result1.id)); + // Also the first PL should be the same if it has been freed correctly by + // GetAllDataAndFree. Since it is a max-sized posting list, we just need to + // verify the block index. + { + ICING_ASSERT_OK_AND_ASSIGN( + PostingListHolder pl_holder, + flash_index_storage_->GetPostingList(result3.id)); + EXPECT_THAT(pl_holder.next_block_index, Eq(first_pl_block_index)); + } +} + } // namespace } // namespace lib diff --git a/icing/index/numeric/posting-list-integer-index-serializer.h b/icing/index/numeric/posting-list-integer-index-serializer.h index 9cfdb7a..ea2f2da 100644 --- a/icing/index/numeric/posting-list-integer-index-serializer.h +++ b/icing/index/numeric/posting-list-integer-index-serializer.h @@ -111,6 +111,12 @@ class PostingListIntegerIndexSerializer : public PostingListSerializer { libtextclassifier3::Status PopFrontData(PostingListUsed* posting_list_used, uint32_t num_data) const; + // Helper function to determine if posting list is full. 
+ bool IsFull(const PostingListUsed* posting_list_used) const { + return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() && + GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); + } + private: // Posting list layout formats: // @@ -228,11 +234,6 @@ class PostingListIntegerIndexSerializer : public PostingListSerializer { // +-----------------+-----------------+---+--------+-----+--------+--------+ // Helpers to determine what state the posting list is in. - bool IsFull(const PostingListUsed* posting_list_used) const { - return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() && - GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); - } - bool IsAlmostFull(const PostingListUsed* posting_list_used) const { return !GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() && GetSpecialData(posting_list_used, /*index=*/1).data().is_valid(); diff --git a/icing/join/join-processor.cc b/icing/join/join-processor.cc index ab32850..da0e5d2 100644 --- a/icing/join/join-processor.cc +++ b/icing/join/join-processor.cc @@ -27,9 +27,10 @@ #include "icing/join/qualified-id.h" #include "icing/proto/scoring.pb.h" #include "icing/proto/search.pb.h" +#include "icing/schema/joinable-property.h" #include "icing/scoring/scored-document-hit.h" #include "icing/store/document-id.h" -#include "icing/util/snippet-helpers.h" +#include "icing/util/status-macros.h" namespace icing { namespace lib { @@ -67,34 +68,20 @@ JoinProcessor::GetChildrenFetcher( std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>> map_joinable_qualified_id; for (const ScoredDocumentHit& child : child_scored_document_hits) { - std::string property_content = FetchPropertyExpressionValue( - child.document_id(), join_spec.child_property_expression()); - - // Parse qualified id. 
- libtextclassifier3::StatusOr<QualifiedId> qualified_id_or = - QualifiedId::Parse(property_content); - if (!qualified_id_or.ok()) { - ICING_VLOG(2) << "Skip content with invalid format of QualifiedId"; - continue; - } - QualifiedId qualified_id = std::move(qualified_id_or).ValueOrDie(); - - // Lookup parent DocumentId. - libtextclassifier3::StatusOr<DocumentId> parent_doc_id_or = - doc_store_->GetDocumentId(qualified_id.name_space(), - qualified_id.uri()); - if (!parent_doc_id_or.ok()) { - // Skip the document if getting errors. + ICING_ASSIGN_OR_RETURN( + DocumentId ref_doc_id, + FetchReferencedQualifiedId(child.document_id(), + join_spec.child_property_expression())); + if (ref_doc_id == kInvalidDocumentId) { continue; } - DocumentId parent_doc_id = std::move(parent_doc_id_or).ValueOrDie(); // Since we've already sorted child_scored_document_hits, just simply omit // if the map_joinable_qualified_id[parent_doc_id].size() has reached max // joined child count. - if (map_joinable_qualified_id[parent_doc_id].size() < + if (map_joinable_qualified_id[ref_doc_id].size() < join_spec.max_joined_child_count()) { - map_joinable_qualified_id[parent_doc_id].push_back(child); + map_joinable_qualified_id[ref_doc_id].push_back(child); } } return JoinChildrenFetcher(join_spec, std::move(map_joinable_qualified_id)); @@ -127,20 +114,49 @@ JoinProcessor::Join( return joined_scored_document_hits; } -std::string JoinProcessor::FetchPropertyExpressionValue( - const DocumentId& document_id, - const std::string& property_expression) const { - // TODO(b/256022027): Add caching of document_id -> {expression -> value} - libtextclassifier3::StatusOr<DocumentProto> document_or = - doc_store_->Get(document_id); - if (!document_or.ok()) { - // Skip the document if getting errors. 
- return ""; +libtextclassifier3::StatusOr<DocumentId> +JoinProcessor::FetchReferencedQualifiedId( + const DocumentId& document_id, const std::string& property_path) const { + std::optional<DocumentFilterData> filter_data = + doc_store_->GetAliveDocumentFilterData(document_id); + if (!filter_data) { + return kInvalidDocumentId; + } + + ICING_ASSIGN_OR_RETURN(const JoinablePropertyMetadata* metadata, + schema_store_->GetJoinablePropertyMetadata( + filter_data->schema_type_id(), property_path)); + if (metadata == nullptr || + metadata->value_type != JoinableConfig::ValueType::QUALIFIED_ID) { + // Currently we only support qualified id. + return kInvalidDocumentId; } - DocumentProto document = std::move(document_or).ValueOrDie(); + DocJoinInfo info(document_id, metadata->id); + libtextclassifier3::StatusOr<std::string_view> ref_qualified_id_str_or = + qualified_id_join_index_->Get(info); + if (!ref_qualified_id_str_or.ok()) { + if (absl_ports::IsNotFound(ref_qualified_id_str_or.status())) { + return kInvalidDocumentId; + } + return std::move(ref_qualified_id_str_or).status(); + } - return std::string(GetString(&document, property_expression)); + libtextclassifier3::StatusOr<QualifiedId> ref_qualified_id_or = + QualifiedId::Parse(std::move(ref_qualified_id_str_or).ValueOrDie()); + if (!ref_qualified_id_or.ok()) { + // This shouldn't happen because we've validated it during indexing and only + // put valid qualified id strings into qualified id join index. 
+ return kInvalidDocumentId; + } + QualifiedId qualified_id = std::move(ref_qualified_id_or).ValueOrDie(); + + libtextclassifier3::StatusOr<DocumentId> ref_document_id_or = + doc_store_->GetDocumentId(qualified_id.name_space(), qualified_id.uri()); + if (!ref_document_id_or.ok()) { + return kInvalidDocumentId; + } + return std::move(ref_document_id_or).ValueOrDie(); } } // namespace lib diff --git a/icing/join/join-processor.h b/icing/join/join-processor.h index 9d5ee11..497787f 100644 --- a/icing/join/join-processor.h +++ b/icing/join/join-processor.h @@ -21,7 +21,9 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/join/join-children-fetcher.h" +#include "icing/join/qualified-id-type-joinable-index.h" #include "icing/proto/search.pb.h" +#include "icing/schema/schema-store.h" #include "icing/scoring/scored-document-hit.h" #include "icing/store/document-store.h" @@ -32,8 +34,12 @@ class JoinProcessor { public: static constexpr std::string_view kQualifiedIdExpr = "this.qualifiedId()"; - explicit JoinProcessor(const DocumentStore* doc_store) - : doc_store_(doc_store) {} + explicit JoinProcessor( + const DocumentStore* doc_store, const SchemaStore* schema_store, + const QualifiedIdTypeJoinableIndex* qualified_id_join_index) + : doc_store_(doc_store), + schema_store_(schema_store), + qualified_id_join_index_(qualified_id_join_index) {} // Get a JoinChildrenFetcher used to fetch all children documents by a parent // document id. @@ -52,23 +58,25 @@ class JoinProcessor { const JoinChildrenFetcher& join_children_fetcher); private: - // Loads a document and uses a property expression to fetch the value of the - // property from the document. The property expression may refer to nested - // document properties. - // Note: currently we only support single joining, so we use the first element - // (index 0) for any repeated values. + // Fetches referenced document id of the given document under the given + // property path. 
// // TODO(b/256022027): validate joinable property (and its upper-level) should // not have REPEATED cardinality. // // Returns: - // "" on document load error. - // "" if the property path is not found in the document. - std::string FetchPropertyExpressionValue( - const DocumentId& document_id, - const std::string& property_expression) const; + // - A valid referenced document id on success + // - kInvalidDocumentId if the given document is not found, doesn't have + // qualified id joinable type for the given property_path, or doesn't have + // joinable value (an optional property) + // - Any other QualifiedIdTypeJoinableIndex errors + libtextclassifier3::StatusOr<DocumentId> FetchReferencedQualifiedId( + const DocumentId& document_id, const std::string& property_path) const; const DocumentStore* doc_store_; // Does not own. + const SchemaStore* schema_store_; // Does not own. + const QualifiedIdTypeJoinableIndex* + qualified_id_join_index_; // Does not own. }; } // namespace lib diff --git a/icing/join/join-processor_test.cc b/icing/join/join-processor_test.cc index 00f2b1c..25d4cfc 100644 --- a/icing/join/join-processor_test.cc +++ b/icing/join/join-processor_test.cc @@ -22,6 +22,8 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" +#include "icing/join/qualified-id-joinable-property-indexing-handler.h" +#include "icing/join/qualified-id-type-joinable-index.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" @@ -33,7 +35,14 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" +#include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" +#include "icing/util/status-macros.h" +#include 
"icing/util/tokenized-document.h" +#include "unicode/uloc.h" namespace icing { namespace lib { @@ -41,16 +50,37 @@ namespace lib { namespace { using ::testing::ElementsAre; +using ::testing::IsTrue; class JoinProcessorTest : public ::testing::Test { protected: void SetUp() override { test_dir_ = GetTestTempDir() + "/icing_join_processor_test"; - filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); + ASSERT_THAT(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()), + IsTrue()); + schema_store_dir_ = test_dir_ + "/schema_store"; + doc_store_dir_ = test_dir_ + "/doc_store"; + qualified_id_join_index_dir_ = test_dir_ + "/qualified_id_join_index"; + + if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //icing/BUILD. + icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + language_segmenter_factory::SegmenterOptions options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + lang_segmenter_, + language_segmenter_factory::Create(std::move(options))); + + ASSERT_THAT( + filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()), + IsTrue()); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, - SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); SchemaProto schema = SchemaBuilder() @@ -75,24 +105,51 @@ class JoinProcessorTest : public ::testing::Test { .Build(); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); + ASSERT_THAT(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str()), + IsTrue()); ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get())); doc_store_ = std::move(create_result.document_store); + + ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_, + QualifiedIdTypeJoinableIndex::Create( + 
filesystem_, qualified_id_join_index_dir_)); } void TearDown() override { + qualified_id_join_index_.reset(); doc_store_.reset(); schema_store_.reset(); + lang_segmenter_.reset(); filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); } + libtextclassifier3::StatusOr<DocumentId> PutAndIndexDocument( + const DocumentProto& document) { + ICING_ASSIGN_OR_RETURN(DocumentId document_id, doc_store_->Put(document)); + ICING_ASSIGN_OR_RETURN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> handler, + QualifiedIdJoinablePropertyIndexingHandler::Create( + &fake_clock_, qualified_id_join_index_.get())); + ICING_RETURN_IF_ERROR(handler->Handle(tokenized_document, document_id, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr)); + return document_id; + } + libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>> Join( const JoinSpecProto& join_spec, std::vector<ScoredDocumentHit>&& parent_scored_document_hits, std::vector<ScoredDocumentHit>&& child_scored_document_hits) { - JoinProcessor join_processor(doc_store_.get()); + JoinProcessor join_processor(doc_store_.get(), schema_store_.get(), + qualified_id_join_index_.get()); ICING_ASSIGN_OR_RETURN( JoinChildrenFetcher join_children_fetcher, join_processor.GetChildrenFetcher( @@ -104,8 +161,15 @@ class JoinProcessorTest : public ::testing::Test { Filesystem filesystem_; std::string test_dir_; + std::string schema_store_dir_; + std::string doc_store_dir_; + std::string qualified_id_join_index_dir_; + + std::unique_ptr<LanguageSegmenter> lang_segmenter_; std::unique_ptr<SchemaStore> schema_store_; std::unique_ptr<DocumentStore> doc_store_; + std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index_; + FakeClock fake_clock_; }; @@ -144,11 +208,16 @@ TEST_F(JoinProcessorTest, JoinByQualifiedId) { .AddStringProperty("sender", 
"pkg$db/namespace#person1") .Build(); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, doc_store_->Put(person1)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, doc_store_->Put(person2)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, doc_store_->Put(email1)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, doc_store_->Put(email2)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5, doc_store_->Put(email3)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + PutAndIndexDocument(person1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + PutAndIndexDocument(person2)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, + PutAndIndexDocument(email1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, + PutAndIndexDocument(email2)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5, + PutAndIndexDocument(email3)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -216,9 +285,12 @@ TEST_F(JoinProcessorTest, ShouldIgnoreChildDocumentsWithoutJoiningProperty) { .AddStringProperty("subject", "test subject 2") .Build(); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, doc_store_->Put(person1)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, doc_store_->Put(email1)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, doc_store_->Put(email2)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + PutAndIndexDocument(person1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + PutAndIndexDocument(email1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, + PutAndIndexDocument(email2)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -290,10 +362,14 @@ TEST_F(JoinProcessorTest, ShouldIgnoreChildDocumentsWithInvalidQualifiedId) { R"(pkg$db/namespace\#person1)") // invalid format .Build(); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, doc_store_->Put(person1)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, 
doc_store_->Put(email1)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, doc_store_->Put(email2)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, doc_store_->Put(email3)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + PutAndIndexDocument(person1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + PutAndIndexDocument(email1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, + PutAndIndexDocument(email2)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, + PutAndIndexDocument(email3)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -356,9 +432,12 @@ TEST_F(JoinProcessorTest, LeftJoinShouldReturnParentWithoutChildren) { R"(pkg$db/name\#space\\\\#person2)") // escaped .Build(); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, doc_store_->Put(person1)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, doc_store_->Put(person2)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, doc_store_->Put(email1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + PutAndIndexDocument(person1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + PutAndIndexDocument(person2)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, + PutAndIndexDocument(email1)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -430,10 +509,14 @@ TEST_F(JoinProcessorTest, ShouldSortChildDocumentsByRankingStrategy) { .AddStringProperty("sender", "pkg$db/namespace#person1") .Build(); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, doc_store_->Put(person1)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, doc_store_->Put(email1)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, doc_store_->Put(email2)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, doc_store_->Put(email3)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + PutAndIndexDocument(person1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + 
PutAndIndexDocument(email1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, + PutAndIndexDocument(email2)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, + PutAndIndexDocument(email3)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -519,12 +602,18 @@ TEST_F(JoinProcessorTest, R"(pkg$db/name\#space\\\\#person2)") // escaped .Build(); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, doc_store_->Put(person1)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, doc_store_->Put(person2)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, doc_store_->Put(email1)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, doc_store_->Put(email2)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5, doc_store_->Put(email3)); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id6, doc_store_->Put(email4)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + PutAndIndexDocument(person1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + PutAndIndexDocument(person2)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, + PutAndIndexDocument(email1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, + PutAndIndexDocument(email2)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5, + PutAndIndexDocument(email3)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id6, + PutAndIndexDocument(email4)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); @@ -587,7 +676,8 @@ TEST_F(JoinProcessorTest, ShouldAllowSelfJoining) { .AddStringProperty("sender", "pkg$db/namespace#email1") .Build(); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, doc_store_->Put(email1)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + PutAndIndexDocument(email1)); ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone, /*score=*/0.0); diff --git a/icing/join/qualified-id-joinable-property-indexing-handler.cc 
b/icing/join/qualified-id-joinable-property-indexing-handler.cc new file mode 100644 index 0000000..0b28444 --- /dev/null +++ b/icing/join/qualified-id-joinable-property-indexing-handler.cc @@ -0,0 +1,96 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/join/qualified-id-joinable-property-indexing-handler.h" + +#include <memory> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/join/doc-join-info.h" +#include "icing/join/qualified-id-type-joinable-index.h" +#include "icing/join/qualified-id.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/proto/logging.pb.h" +#include "icing/store/document-id.h" +#include "icing/util/logging.h" +#include "icing/util/tokenized-document.h" + +namespace icing { +namespace lib { + +/* static */ libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler>> +QualifiedIdJoinablePropertyIndexingHandler::Create( + const Clock* clock, QualifiedIdTypeJoinableIndex* qualified_id_join_index) { + ICING_RETURN_ERROR_IF_NULL(clock); + ICING_RETURN_ERROR_IF_NULL(qualified_id_join_index); + + return std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler>( + new QualifiedIdJoinablePropertyIndexingHandler(clock, + qualified_id_join_index)); +} + +libtextclassifier3::Status QualifiedIdJoinablePropertyIndexingHandler::Handle( + const 
TokenizedDocument& tokenized_document, DocumentId document_id, + bool recovery_mode, PutDocumentStatsProto* put_document_stats) { + // TODO(b/263890397): set qualified id join index processing latency and other + // stats. + + if (qualified_id_join_index_.last_added_document_id() != kInvalidDocumentId && + document_id <= qualified_id_join_index_.last_added_document_id()) { + if (recovery_mode) { + // Skip the document if document_id <= last_added_document_id in recovery + // mode without returning an error. + return libtextclassifier3::Status::OK; + } + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "DocumentId %d must be greater than last added document_id %d", + document_id, qualified_id_join_index_.last_added_document_id())); + } + qualified_id_join_index_.set_last_added_document_id(document_id); + + for (const JoinableProperty<std::string_view>& qualified_id_property : + tokenized_document.qualified_id_join_properties()) { + if (qualified_id_property.values.empty()) { + continue; + } + + DocJoinInfo info(document_id, qualified_id_property.metadata.id); + // Currently we only support single (non-repeated) joinable value under a + // property. + std::string_view ref_qualified_id_str = qualified_id_property.values[0]; + + // Attempt to parse qualified id string to make sure the format is correct. + if (!QualifiedId::Parse(ref_qualified_id_str).ok()) { + // Skip incorrect format of qualified id string to save disk space. 
+ continue; + } + + libtextclassifier3::Status status = + qualified_id_join_index_.Put(info, ref_qualified_id_str); + if (!status.ok()) { + ICING_LOG(WARNING) + << "Failed to add data into qualified id join index due to: " + << status.error_message(); + return status; + } + } + + return libtextclassifier3::Status::OK; +} + +} // namespace lib +} // namespace icing diff --git a/icing/join/qualified-id-joinable-property-indexing-handler.h b/icing/join/qualified-id-joinable-property-indexing-handler.h new file mode 100644 index 0000000..111526e --- /dev/null +++ b/icing/join/qualified-id-joinable-property-indexing-handler.h @@ -0,0 +1,71 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_JOIN_QUALIFIED_ID_JOINABLE_PROPERTY_INDEXING_HANDLER_H_ +#define ICING_JOIN_QUALIFIED_ID_JOINABLE_PROPERTY_INDEXING_HANDLER_H_ + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/index/data-indexing-handler.h" +#include "icing/join/qualified-id-type-joinable-index.h" +#include "icing/proto/logging.pb.h" +#include "icing/store/document-id.h" +#include "icing/util/clock.h" +#include "icing/util/tokenized-document.h" + +namespace icing { +namespace lib { + +class QualifiedIdJoinablePropertyIndexingHandler : public DataIndexingHandler { + public: + // Creates a QualifiedIdJoinablePropertyIndexingHandler instance which does + // not take ownership of any input components. 
All pointers must refer to + // valid objects that outlive the created + // QualifiedIdJoinablePropertyIndexingHandler instance. + // + // Returns: + // - A QualifiedIdJoinablePropertyIndexingHandler instance on success + // - FAILED_PRECONDITION_ERROR if any of the input pointer is null + static libtextclassifier3::StatusOr< + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler>> + Create(const Clock* clock, + QualifiedIdTypeJoinableIndex* qualified_id_join_index); + + ~QualifiedIdJoinablePropertyIndexingHandler() override = default; + + // Handles the joinable qualified id data indexing process: add data into the + // qualified id type joinable cache. + // + /// Returns: + // - OK on success + // - INVALID_ARGUMENT_ERROR if document_id is less than or equal to the + // document_id of a previously indexed document in non recovery mode. + // - INTERNAL_ERROR if any other errors occur. + // - Any QualifiedIdTypeJoinableIndex errors. + libtextclassifier3::Status Handle( + const TokenizedDocument& tokenized_document, DocumentId document_id, + bool recovery_mode, PutDocumentStatsProto* put_document_stats) override; + + private: + explicit QualifiedIdJoinablePropertyIndexingHandler( + const Clock* clock, QualifiedIdTypeJoinableIndex* qualified_id_join_index) + : DataIndexingHandler(clock), + qualified_id_join_index_(*qualified_id_join_index) {} + + QualifiedIdTypeJoinableIndex& qualified_id_join_index_; // Does not own. 
+}; + +} // namespace lib +} // namespace icing + +#endif // ICING_JOIN_QUALIFIED_ID_JOINABLE_PROPERTY_INDEXING_HANDLER_H_ diff --git a/icing/join/qualified-id-joinable-property-indexing-handler_test.cc b/icing/join/qualified-id-joinable-property-indexing-handler_test.cc new file mode 100644 index 0000000..aa5624c --- /dev/null +++ b/icing/join/qualified-id-joinable-property-indexing-handler_test.cc @@ -0,0 +1,332 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/join/qualified-id-joinable-property-indexing-handler.h" + +#include <memory> +#include <string> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/join/qualified-id-type-joinable-index.h" +#include "icing/join/qualified-id.h" +#include "icing/portable/platform.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/schema-builder.h" +#include "icing/schema/joinable-property.h" +#include "icing/schema/schema-store.h" +#include "icing/store/document-id.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/fake-clock.h" +#include "icing/testing/icu-data-file-helper.h" +#include "icing/testing/test-data.h" +#include "icing/testing/tmp-directory.h" +#include "icing/tokenization/language-segmenter-factory.h" +#include "icing/tokenization/language-segmenter.h" +#include "icing/util/tokenized-document.h" +#include "unicode/uloc.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::IsEmpty; +using ::testing::IsTrue; + +// Schema type for referenced documents: ReferencedType +static constexpr std::string_view kReferencedType = "ReferencedType"; +static constexpr std::string_view kPropertyName = "name"; + +// Joinable properties and joinable property id. Joinable property id is +// determined by the lexicographical order of joinable property path. 
+// Schema type with joinable property: FakeType +static constexpr std::string_view kFakeType = "FakeType"; +static constexpr std::string_view kPropertyQualifiedId = "qualifiedId"; + +static constexpr JoinablePropertyId kQualifiedIdJoinablePropertyId = 0; + +// Schema type with nested joinable properties: NestedType +static constexpr std::string_view kNestedType = "NestedType"; +static constexpr std::string_view kPropertyNestedDoc = "nested"; +static constexpr std::string_view kPropertyQualifiedId2 = "qualifiedId2"; + +static constexpr JoinablePropertyId kNestedQualifiedIdJoinablePropertyId = 0; +static constexpr JoinablePropertyId kQualifiedId2JoinablePropertyId = 1; + +static constexpr DocumentId kDefaultDocumentId = 3; + +class QualifiedIdJoinablePropertyIndexingHandlerTest : public ::testing::Test { + protected: + void SetUp() override { + if (!IsCfStringTokenization() && !IsReverseJniTokenization()) { + ICING_ASSERT_OK( + // File generated via icu_data_file rule in //icing/BUILD. + icu_data_file_helper::SetUpICUDataFile( + GetTestFilePath("icing/icu.dat"))); + } + + base_dir_ = GetTestTempDir() + "/icing_test"; + ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()), + IsTrue()); + + qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index"; + schema_store_dir_ = base_dir_ + "/schema_store"; + + ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_, + QualifiedIdTypeJoinableIndex::Create( + filesystem_, qualified_id_join_index_dir_)); + + language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US); + ICING_ASSERT_OK_AND_ASSIGN( + lang_segmenter_, + language_segmenter_factory::Create(std::move(segmenter_options))); + + ASSERT_THAT( + filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()), + IsTrue()); + ICING_ASSERT_OK_AND_ASSIGN( + schema_store_, + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + SchemaProto schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + 
.SetType(kReferencedType) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyName) + .SetDataTypeString(TERM_MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType(kFakeType).AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyQualifiedId) + .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kNestedType) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyNestedDoc) + .SetDataTypeDocument( + kFakeType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyQualifiedId2) + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema(schema)); + } + + void TearDown() override { + schema_store_.reset(); + lang_segmenter_.reset(); + qualified_id_join_index_.reset(); + + filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()); + } + + Filesystem filesystem_; + FakeClock fake_clock_; + std::string base_dir_; + std::string qualified_id_join_index_dir_; + std::string schema_store_dir_; + + std::unique_ptr<QualifiedIdTypeJoinableIndex> qualified_id_join_index_; + std::unique_ptr<LanguageSegmenter> lang_segmenter_; + std::unique_ptr<SchemaStore> schema_store_; +}; + +TEST_F(QualifiedIdJoinablePropertyIndexingHandlerTest, + CreationWithNullPointerShouldFail) { + EXPECT_THAT(QualifiedIdJoinablePropertyIndexingHandler::Create( + /*clock=*/nullptr, qualified_id_join_index_.get()), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + + EXPECT_THAT(QualifiedIdJoinablePropertyIndexingHandler::Create( + &fake_clock_, /*qualified_id_join_index=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); +} + +TEST_F(QualifiedIdJoinablePropertyIndexingHandlerTest, HandleJoinableProperty) 
{ + DocumentProto referenced_document = + DocumentBuilder() + .SetKey("pkg$db/ns", "ref_type/1") + .SetSchema(std::string(kReferencedType)) + .AddStringProperty(std::string(kPropertyName), "one") + .Build(); + + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyQualifiedId), + "pkg$db/ns#ref_type/1") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + + // Handle document. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> handler, + QualifiedIdJoinablePropertyIndexingHandler::Create( + &fake_clock_, qualified_id_join_index_.get())); + EXPECT_THAT( + handler->Handle(tokenized_document, kDefaultDocumentId, + /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::OK)); + + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), + IsOkAndHolds("pkg$db/ns#ref_type/1")); +} + +TEST_F(QualifiedIdJoinablePropertyIndexingHandlerTest, + HandleNestedJoinableProperty) { + DocumentProto referenced_document1 = + DocumentBuilder() + .SetKey("pkg$db/ns", "ref_type/1") + .SetSchema(std::string(kReferencedType)) + .AddStringProperty(std::string(kPropertyName), "one") + .Build(); + DocumentProto referenced_document2 = + DocumentBuilder() + .SetKey("pkg$db/ns", "ref_type/2") + .SetSchema(std::string(kReferencedType)) + .AddStringProperty(std::string(kPropertyName), "two") + .Build(); + + DocumentProto nested_document = + DocumentBuilder() + .SetKey("pkg$db/ns", "nested_type/1") + .SetSchema(std::string(kNestedType)) + .AddDocumentProperty( + std::string(kPropertyNestedDoc), + DocumentBuilder() + .SetKey("pkg$db/ns", 
"nested_fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyQualifiedId), + "pkg$db/ns#ref_type/2") + .Build()) + .AddStringProperty(std::string(kPropertyQualifiedId2), + "pkg$db/ns#ref_type/1") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + nested_document)); + + // Handle nested_document. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> handler, + QualifiedIdJoinablePropertyIndexingHandler::Create( + &fake_clock_, qualified_id_join_index_.get())); + EXPECT_THAT(handler->Handle(tokenized_document, kDefaultDocumentId, + /*recovery_mode=*/false, + /*put_document_stats=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::OK)); + + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kNestedQualifiedIdJoinablePropertyId)), + IsOkAndHolds("pkg$db/ns#ref_type/2")); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedId2JoinablePropertyId)), + IsOkAndHolds("pkg$db/ns#ref_type/1")); +} + +TEST_F(QualifiedIdJoinablePropertyIndexingHandlerTest, + HandleShouldSkipInvalidFormatQualifiedId) { + static constexpr std::string_view kInvalidFormatQualifiedId = + "invalid_format_qualified_id"; + ASSERT_THAT(QualifiedId::Parse(kInvalidFormatQualifiedId), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kPropertyQualifiedId), + std::string(kInvalidFormatQualifiedId)) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + + // Handle document. 
Handle() should ignore invalid format qualified id. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> handler, + QualifiedIdJoinablePropertyIndexingHandler::Create( + &fake_clock_, qualified_id_join_index_.get())); + EXPECT_THAT( + handler->Handle(tokenized_document, kDefaultDocumentId, + /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::OK)); + + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(QualifiedIdJoinablePropertyIndexingHandlerTest, + HandleShouldSkipEmptyQualifiedId) { + // Create a document without any qualified id. + DocumentProto document = DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + ASSERT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty()); + + // Handle document. Handle() should add nothing since the document has no qualified id. 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdJoinablePropertyIndexingHandler> handler, + QualifiedIdJoinablePropertyIndexingHandler::Create( + &fake_clock_, qualified_id_join_index_.get())); + EXPECT_THAT( + handler->Handle(tokenized_document, kDefaultDocumentId, + /*recovery_mode=*/false, /*put_document_stats=*/nullptr), + StatusIs(libtextclassifier3::StatusCode::OK)); + + EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), + Eq(kDefaultDocumentId)); + EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo( + kDefaultDocumentId, kQualifiedIdJoinablePropertyId)), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/join/qualified-id-type-joinable-index.cc b/icing/join/qualified-id-type-joinable-index.cc index 231e78a..9c25e62 100644 --- a/icing/join/qualified-id-type-joinable-index.cc +++ b/icing/join/qualified-id-type-joinable-index.cc @@ -14,6 +14,7 @@ #include "icing/join/qualified-id-type-joinable-index.h" +#include <cstring> #include <memory> #include <string> #include <string_view> @@ -24,7 +25,9 @@ #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/str_cat.h" #include "icing/file/destructible-directory.h" +#include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" +#include "icing/file/memory-mapped-file.h" #include "icing/join/doc-join-info.h" #include "icing/store/document-id.h" #include "icing/store/key-mapper.h" @@ -49,13 +52,15 @@ DocumentId GetNewDocumentId( } std::string GetMetadataFilePath(std::string_view working_path) { - return absl_ports::StrCat(working_path, "/", - QualifiedIdTypeJoinableIndex::kFilePrefix, ".m"); + return absl_ports::StrCat(working_path, "/metadata"); } -std::string GetDocumentToQualifiedIdMapperPath(std::string_view working_path) { - return absl_ports::StrCat( - working_path, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, "_mapper"); +std::string 
GetDocJoinInfoMapperPath(std::string_view working_path) { + return absl_ports::StrCat(working_path, "/doc_join_info_mapper"); +} + +std::string GetQualifiedIdStoragePath(std::string_view working_path) { + return absl_ports::StrCat(working_path, "/qualified_id_storage"); } } // namespace @@ -66,9 +71,12 @@ QualifiedIdTypeJoinableIndex::Create(const Filesystem& filesystem, std::string working_path) { if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) || !filesystem.DirectoryExists( - GetDocumentToQualifiedIdMapperPath(working_path).c_str())) { + GetDocJoinInfoMapperPath(working_path).c_str()) || + !filesystem.FileExists(GetQualifiedIdStoragePath(working_path).c_str())) { // Discard working_path if any file/directory is missing, and reinitialize. - ICING_RETURN_IF_ERROR(Discard(filesystem, working_path)); + if (filesystem.DirectoryExists(working_path.c_str())) { + ICING_RETURN_IF_ERROR(Discard(filesystem, working_path)); + } return InitializeNewFiles(filesystem, std::move(working_path)); } return InitializeExistingFiles(filesystem, std::move(working_path)); @@ -83,29 +91,44 @@ QualifiedIdTypeJoinableIndex::~QualifiedIdTypeJoinableIndex() { } libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Put( - const DocJoinInfo& doc_join_info, DocumentId ref_document_id) { + const DocJoinInfo& doc_join_info, std::string_view ref_qualified_id_str) { if (!doc_join_info.is_valid()) { return absl_ports::InvalidArgumentError( "Cannot put data for an invalid DocJoinInfo"); } - ICING_RETURN_IF_ERROR(document_to_qualified_id_mapper_->Put( - encode_util::EncodeIntToCString(doc_join_info.value()), ref_document_id)); + int32_t qualified_id_index = qualified_id_storage_->num_elements(); + ICING_ASSIGN_OR_RETURN( + FileBackedVector<char>::MutableArrayView mutable_arr, + qualified_id_storage_->Allocate(ref_qualified_id_str.size() + 1)); + mutable_arr.SetArray(/*idx=*/0, ref_qualified_id_str.data(), + ref_qualified_id_str.size()); + 
mutable_arr.SetArray(/*idx=*/ref_qualified_id_str.size(), /*arr=*/"\0", + /*arr_len=*/1); + + ICING_RETURN_IF_ERROR(doc_join_info_mapper_->Put( + encode_util::EncodeIntToCString(doc_join_info.value()), + qualified_id_index)); // TODO(b/268521214): add data into delete propagation storage return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<DocumentId> QualifiedIdTypeJoinableIndex::Get( - const DocJoinInfo& doc_join_info) const { +libtextclassifier3::StatusOr<std::string_view> +QualifiedIdTypeJoinableIndex::Get(const DocJoinInfo& doc_join_info) const { if (!doc_join_info.is_valid()) { return absl_ports::InvalidArgumentError( "Cannot get data for an invalid DocJoinInfo"); } - return document_to_qualified_id_mapper_->Get( - encode_util::EncodeIntToCString(doc_join_info.value())); + ICING_ASSIGN_OR_RETURN( + int32_t qualified_id_index, + doc_join_info_mapper_->Get( + encode_util::EncodeIntToCString(doc_join_info.value()))); + + const char* data = qualified_id_storage_->array() + qualified_id_index; + return std::string_view(data, strlen(data)); } libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Optimize( @@ -137,7 +160,8 @@ libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Optimize( // Destruct current index's storage instances to safely swap directories. 
// TODO(b/268521214): handle delete propagation storage - document_to_qualified_id_mapper_.reset(); + doc_join_info_mapper_.reset(); + qualified_id_storage_.reset(); if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(), working_path_.c_str())) { @@ -153,24 +177,37 @@ libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Optimize( return absl_ports::InternalError("Fail to read metadata file"); } ICING_ASSIGN_OR_RETURN( - document_to_qualified_id_mapper_, - PersistentHashMapKeyMapper<DocumentId>::Create( - filesystem_, GetDocumentToQualifiedIdMapperPath(working_path_))); + doc_join_info_mapper_, + PersistentHashMapKeyMapper<int32_t>::Create( + filesystem_, GetDocJoinInfoMapperPath(working_path_))); + + ICING_ASSIGN_OR_RETURN( + qualified_id_storage_, + FileBackedVector<char>::Create( + filesystem_, GetQualifiedIdStoragePath(working_path_), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + FileBackedVector<char>::kMaxFileSize, + /*pre_mapping_mmap_size=*/1024 * 1024)); return libtextclassifier3::Status::OK; } libtextclassifier3::Status QualifiedIdTypeJoinableIndex::Clear() { - document_to_qualified_id_mapper_.reset(); - // Discard and reinitialize document to qualified id mapper. - std::string document_to_qualified_id_mapper_path = - GetDocumentToQualifiedIdMapperPath(working_path_); - ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<DocumentId>::Delete( - filesystem_, document_to_qualified_id_mapper_path)); + doc_join_info_mapper_.reset(); + // Discard and reinitialize doc join info mapper. 
+ std::string doc_join_info_mapper_path = + GetDocJoinInfoMapperPath(working_path_); + ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<int32_t>::Delete( + filesystem_, doc_join_info_mapper_path)); ICING_ASSIGN_OR_RETURN( - document_to_qualified_id_mapper_, - PersistentHashMapKeyMapper<DocumentId>::Create( - filesystem_, std::move(document_to_qualified_id_mapper_path))); + doc_join_info_mapper_, + PersistentHashMapKeyMapper<int32_t>::Create( + filesystem_, std::move(doc_join_info_mapper_path))); + + // Clear qualified_id_storage_. + if (qualified_id_storage_->num_elements() > 0) { + ICING_RETURN_IF_ERROR(qualified_id_storage_->TruncateTo(0)); + } // TODO(b/268521214): clear delete propagation storage @@ -188,26 +225,34 @@ QualifiedIdTypeJoinableIndex::InitializeNewFiles(const Filesystem& filesystem, absl_ports::StrCat("Failed to create directory: ", working_path)); } - // Initialize document_to_qualified_id_mapper + // Initialize doc_join_info_mapper // TODO(b/263890397): decide PersistentHashMapKeyMapper size ICING_ASSIGN_OR_RETURN( - std::unique_ptr<KeyMapper<DocumentId>> document_to_qualified_id_mapper, - PersistentHashMapKeyMapper<DocumentId>::Create( - filesystem, GetDocumentToQualifiedIdMapperPath(working_path))); + std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper, + PersistentHashMapKeyMapper<int32_t>::Create( + filesystem, GetDocJoinInfoMapperPath(working_path))); + + // Initialize qualified_id_storage + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<FileBackedVector<char>> qualified_id_storage, + FileBackedVector<char>::Create( + filesystem, GetQualifiedIdStoragePath(working_path), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + FileBackedVector<char>::kMaxFileSize, + /*pre_mapping_mmap_size=*/1024 * 1024)); // Create instance. 
auto new_index = std::unique_ptr<QualifiedIdTypeJoinableIndex>( new QualifiedIdTypeJoinableIndex( filesystem, std::move(working_path), /*metadata_buffer=*/std::make_unique<uint8_t[]>(kMetadataFileSize), - std::move(document_to_qualified_id_mapper))); + std::move(doc_join_info_mapper), std::move(qualified_id_storage))); // Initialize info content. new_index->info().magic = Info::kMagic; new_index->info().last_added_document_id = kInvalidDocumentId; // Initialize new PersistentStorage. The initial checksums will be computed - // and set via InitializeNewStorage. Also write them into disk as well. + // and set via InitializeNewStorage. ICING_RETURN_IF_ERROR(new_index->InitializeNewStorage()); - ICING_RETURN_IF_ERROR(new_index->PersistMetadataToDisk()); return new_index; } @@ -224,17 +269,26 @@ QualifiedIdTypeJoinableIndex::InitializeExistingFiles( return absl_ports::InternalError("Fail to read metadata file"); } - // Initialize document_to_qualified_id_mapper + // Initialize doc_join_info_mapper ICING_ASSIGN_OR_RETURN( - std::unique_ptr<KeyMapper<DocumentId>> document_to_qualified_id_mapper, - PersistentHashMapKeyMapper<DocumentId>::Create( - filesystem, GetDocumentToQualifiedIdMapperPath(working_path))); + std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper, + PersistentHashMapKeyMapper<int32_t>::Create( + filesystem, GetDocJoinInfoMapperPath(working_path))); + + // Initialize qualified_id_storage + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<FileBackedVector<char>> qualified_id_storage, + FileBackedVector<char>::Create( + filesystem, GetQualifiedIdStoragePath(working_path), + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, + FileBackedVector<char>::kMaxFileSize, + /*pre_mapping_mmap_size=*/1024 * 1024)); // Create instance. 
auto type_joinable_index = std::unique_ptr<QualifiedIdTypeJoinableIndex>( new QualifiedIdTypeJoinableIndex( filesystem, std::move(working_path), std::move(metadata_buffer), - std::move(document_to_qualified_id_mapper))); + std::move(doc_join_info_mapper), std::move(qualified_id_storage))); // Initialize existing PersistentStorage. Checksums will be validated. ICING_RETURN_IF_ERROR(type_joinable_index->InitializeExistingStorage()); @@ -249,25 +303,25 @@ QualifiedIdTypeJoinableIndex::InitializeExistingFiles( libtextclassifier3::Status QualifiedIdTypeJoinableIndex::TransferIndex( const std::vector<DocumentId>& document_id_old_to_new, QualifiedIdTypeJoinableIndex* new_index) const { - std::unique_ptr<KeyMapper<DocumentId>::Iterator> iter = - document_to_qualified_id_mapper_->GetIterator(); + std::unique_ptr<KeyMapper<int32_t>::Iterator> iter = + doc_join_info_mapper_->GetIterator(); while (iter->Advance()) { DocJoinInfo old_doc_join_info( encode_util::DecodeIntFromCString(iter->GetKey())); - DocumentId old_ref_document_id = iter->GetValue(); + int32_t qualified_id_index = iter->GetValue(); + + const char* data = qualified_id_storage_->array() + qualified_id_index; + std::string_view ref_qualified_id_str(data, strlen(data)); - // Translate to new doc ids. + // Translate to new doc id. 
DocumentId new_document_id = GetNewDocumentId( document_id_old_to_new, old_doc_join_info.document_id()); - DocumentId new_ref_document_id = - GetNewDocumentId(document_id_old_to_new, old_ref_document_id); - if (new_document_id != kInvalidDocumentId && - new_ref_document_id != kInvalidDocumentId) { + if (new_document_id != kInvalidDocumentId) { ICING_RETURN_IF_ERROR( new_index->Put(DocJoinInfo(new_document_id, old_doc_join_info.joinable_property_id()), - new_ref_document_id)); + ref_qualified_id_str)); } } @@ -299,7 +353,9 @@ QualifiedIdTypeJoinableIndex::PersistMetadataToDisk() { libtextclassifier3::Status QualifiedIdTypeJoinableIndex::PersistStoragesToDisk() { - return document_to_qualified_id_mapper_->PersistToDisk(); + ICING_RETURN_IF_ERROR(doc_join_info_mapper_->PersistToDisk()); + ICING_RETURN_IF_ERROR(qualified_id_storage_->PersistToDisk()); + return libtextclassifier3::Status::OK; } libtextclassifier3::StatusOr<Crc32> @@ -309,7 +365,12 @@ QualifiedIdTypeJoinableIndex::ComputeInfoChecksum() { libtextclassifier3::StatusOr<Crc32> QualifiedIdTypeJoinableIndex::ComputeStoragesChecksum() { - return document_to_qualified_id_mapper_->ComputeChecksum(); + ICING_ASSIGN_OR_RETURN(Crc32 doc_join_info_mapper_crc, + doc_join_info_mapper_->ComputeChecksum()); + ICING_ASSIGN_OR_RETURN(Crc32 qualified_id_storage_crc, + qualified_id_storage_->ComputeChecksum()); + + return Crc32(doc_join_info_mapper_crc.Get() ^ qualified_id_storage_crc.Get()); } } // namespace lib diff --git a/icing/join/qualified-id-type-joinable-index.h b/icing/join/qualified-id-type-joinable-index.h index 794f33f..1127641 100644 --- a/icing/join/qualified-id-type-joinable-index.h +++ b/icing/join/qualified-id-type-joinable-index.h @@ -23,6 +23,7 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/persistent-storage.h" #include 
"icing/join/doc-join-info.h" @@ -59,8 +60,6 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { static constexpr WorkingPathType kWorkingPathType = WorkingPathType::kDirectory; - static constexpr std::string_view kFilePrefix = - "qualified_id_type_joinable_index"; // Creates a QualifiedIdTypeJoinableIndex instance to store qualified ids for // future joining search. If any of the underlying file is missing, then @@ -112,24 +111,26 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { ~QualifiedIdTypeJoinableIndex() override; // Puts a new data into index: DocJoinInfo (DocumentId, JoinablePropertyId) - // references to ref_document_id. + // references to ref_qualified_id_str (the identifier of another document). + // + // REQUIRES: ref_qualified_id_str contains no '\0'. // // Returns: // - OK on success // - INVALID_ARGUMENT_ERROR if doc_join_info is invalid // - Any KeyMapper errors libtextclassifier3::Status Put(const DocJoinInfo& doc_join_info, - DocumentId ref_document_id); + std::string_view ref_qualified_id_str); - // Gets the referenced DocumentId by DocJoinInfo. + // Gets the referenced document's qualified id string by DocJoinInfo. 
// // Returns: - // - DocumentId referenced by the given DocJoinInfo (DocumentId, + // - A qualified id string referenced by the given DocJoinInfo (DocumentId, // JoinablePropertyId) on success // - INVALID_ARGUMENT_ERROR if doc_join_info is invalid // - NOT_FOUND_ERROR if doc_join_info doesn't exist // - Any KeyMapper errors - libtextclassifier3::StatusOr<DocumentId> Get( + libtextclassifier3::StatusOr<std::string_view> Get( const DocJoinInfo& doc_join_info) const; // Reduces internal file sizes by reclaiming space and ids of deleted @@ -158,7 +159,7 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { // - INTERNAL_ERROR on I/O error libtextclassifier3::Status Clear(); - int32_t size() const { return document_to_qualified_id_mapper_->num_keys(); } + int32_t size() const { return doc_join_info_mapper_->num_keys(); } bool empty() const { return size() == 0; } @@ -178,11 +179,13 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { explicit QualifiedIdTypeJoinableIndex( const Filesystem& filesystem, std::string&& working_path, std::unique_ptr<uint8_t[]> metadata_buffer, - std::unique_ptr<KeyMapper<DocumentId>> key_mapper) + std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper, + std::unique_ptr<FileBackedVector<char>> qualified_id_storage) : PersistentStorage(filesystem, std::move(working_path), kWorkingPathType), metadata_buffer_(std::move(metadata_buffer)), - document_to_qualified_id_mapper_(std::move(key_mapper)) {} + doc_join_info_mapper_(std::move(doc_join_info_mapper)), + qualified_id_storage_(std::move(qualified_id_storage)) {} static libtextclassifier3::StatusOr< std::unique_ptr<QualifiedIdTypeJoinableIndex>> @@ -255,9 +258,12 @@ class QualifiedIdTypeJoinableIndex : public PersistentStorage { std::unique_ptr<uint8_t[]> metadata_buffer_; // Persistent KeyMapper for mapping (encoded) DocJoinInfo (DocumentId, - // JoinablePropertyId) to another referenced DocumentId (converted from - // qualified id string). 
- std::unique_ptr<KeyMapper<DocumentId>> document_to_qualified_id_mapper_; + // JoinablePropertyId) to another referenced document's qualified id string + // index in qualified_id_storage_. + std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper_; + + // Storage for qualified id strings. + std::unique_ptr<FileBackedVector<char>> qualified_id_storage_; // TODO(b/268521214): add delete propagation storage }; diff --git a/icing/join/qualified-id-type-joinable-index_test.cc b/icing/join/qualified-id-type-joinable-index_test.cc index 6cbc9e4..745b0c1 100644 --- a/icing/join/qualified-id-type-joinable-index_test.cc +++ b/icing/join/qualified-id-type-joinable-index_test.cc @@ -16,10 +16,12 @@ #include <memory> #include <string> +#include <string_view> #include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/persistent-storage.h" #include "icing/join/doc-join-info.h" @@ -89,8 +91,8 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, InitializeNewFiles) { // Metadata file should be initialized correctly for both info and crcs // sections. - const std::string metadata_file_path = absl_ports::StrCat( - working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m"); + const std::string metadata_file_path = + absl_ports::StrCat(working_path_, "/metadata"); auto metadata_buffer = std::make_unique<uint8_t[]>( QualifiedIdTypeJoinableIndex::kMetadataFileSize); ASSERT_THAT( @@ -134,13 +136,13 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, // Insert some data. 
ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); + /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20), - /*ref_document_id=*/2)); + /*ref_qualified_id_str=*/"namespace#uriB")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20), - /*ref_document_id=*/4)); + /*ref_qualified_id_str=*/"namespace#uriC")); // Without calling PersistToDisk, checksums will not be recomputed or synced // to disk, so initializing another instance on the same files should fail. @@ -158,13 +160,13 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, // Insert some data. ICING_ASSERT_OK( index1->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); + /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK( index1->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20), - /*ref_document_id=*/2)); + /*ref_qualified_id_str=*/"namespace#uriB")); ICING_ASSERT_OK( index1->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20), - /*ref_document_id=*/4)); + /*ref_qualified_id_str=*/"namespace#uriC")); ASSERT_THAT(index1, Pointee(SizeIs(3))); // After calling PersistToDisk, all checksums should be recomputed and synced @@ -178,13 +180,13 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, EXPECT_THAT(index2, Pointee(SizeIs(3))); EXPECT_THAT( index2->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20)), - IsOkAndHolds(0)); + IsOkAndHolds(/*ref_qualified_id_str=*/"namespace#uriA")); EXPECT_THAT( index2->Get(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20)), - IsOkAndHolds(2)); + IsOkAndHolds(/*ref_qualified_id_str=*/"namespace#uriB")); EXPECT_THAT( index2->Get(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20)), - IsOkAndHolds(4)); + IsOkAndHolds(/*ref_qualified_id_str=*/"namespace#uriC")); } TEST_F(QualifiedIdTypeJoinableIndexTest, @@ -198,13 
+200,13 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, // Insert some data. ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); + /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20), - /*ref_document_id=*/2)); + /*ref_qualified_id_str=*/"namespace#uriB")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20), - /*ref_document_id=*/4)); + /*ref_qualified_id_str=*/"namespace#uriC")); ASSERT_THAT(index, Pointee(SizeIs(3))); } @@ -219,13 +221,13 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, EXPECT_THAT(index, Pointee(SizeIs(3))); EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20)), - IsOkAndHolds(0)); + IsOkAndHolds("namespace#uriA")); EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20)), - IsOkAndHolds(2)); + IsOkAndHolds("namespace#uriB")); EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20)), - IsOkAndHolds(4)); + IsOkAndHolds("namespace#uriC")); } } @@ -238,15 +240,15 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); + /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK(index->PersistToDisk()); } { // Manually change magic and update checksum - const std::string metadata_file_path = absl_ports::StrCat( - working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m"); + const std::string metadata_file_path = + absl_ports::StrCat(working_path_, "/metadata"); ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); @@ -290,14 +292,14 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); 
ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); + /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK(index->PersistToDisk()); } { - const std::string metadata_file_path = absl_ports::StrCat( - working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m"); + const std::string metadata_file_path = + absl_ports::StrCat(working_path_, "/metadata"); ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); @@ -337,14 +339,14 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); + /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK(index->PersistToDisk()); } { - const std::string metadata_file_path = absl_ports::StrCat( - working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, ".m"); + const std::string metadata_file_path = + absl_ports::StrCat(working_path_, "/metadata"); ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str())); ASSERT_THAT(metadata_sfd.is_valid(), IsTrue()); @@ -376,9 +378,8 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, HasSubstr("Invalid info crc"))); } -TEST_F( - QualifiedIdTypeJoinableIndexTest, - InitializeExistingFilesWithCorruptedDocumentToQualifiedIdMapperShouldFail) { +TEST_F(QualifiedIdTypeJoinableIndexTest, + InitializeExistingFilesWithCorruptedDocJoinInfoMapperShouldFail) { { // Create new qualified id type joinable index ICING_ASSERT_OK_AND_ASSIGN( @@ -386,19 +387,18 @@ TEST_F( QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), - /*ref_document_id=*/0)); + /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK(index->PersistToDisk()); } { - // Corrupt 
document_to_qualified_id_mapper manually. - std::string mapper_working_path = absl_ports::StrCat( - working_path_, "/", QualifiedIdTypeJoinableIndex::kFilePrefix, - "_mapper"); + // Corrupt doc_join_info_mapper manually. + std::string mapper_working_path = + absl_ports::StrCat(working_path_, "/doc_join_info_mapper"); ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<PersistentHashMapKeyMapper<DocumentId>> mapper, - PersistentHashMapKeyMapper<DocumentId>::Create( + std::unique_ptr<PersistentHashMapKeyMapper<int32_t>> mapper, + PersistentHashMapKeyMapper<int32_t>::Create( filesystem_, std::move(mapper_working_path))); ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc, mapper->ComputeChecksum()); ICING_ASSERT_OK(mapper->Put("foo", 12345)); @@ -408,7 +408,47 @@ TEST_F( } // Attempt to create the qualified id type joinable index with corrupted - // document_to_qualified_id_mapper. This should fail. + // doc_join_info_mapper. This should fail. + EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, + HasSubstr("Invalid storages crc"))); +} + +TEST_F(QualifiedIdTypeJoinableIndexTest, + InitializeExistingFilesWithCorruptedQualifiedIdStorageShouldFail) { + { + // Create new qualified id type joinable index + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<QualifiedIdTypeJoinableIndex> index, + QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); + ICING_ASSERT_OK( + index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20), + /*ref_qualified_id_str=*/"namespace#uriA")); + + ICING_ASSERT_OK(index->PersistToDisk()); + } + + { + // Corrupt qualified_id_storage manually. 
+ std::string qualified_id_storage_path = + absl_ports::StrCat(working_path_, "/qualified_id_storage"); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> qualified_id_storage, + FileBackedVector<char>::Create( + filesystem_, qualified_id_storage_path, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc, + qualified_id_storage->ComputeChecksum()); + ICING_ASSERT_OK(qualified_id_storage->Append('a')); + ICING_ASSERT_OK(qualified_id_storage->Append('b')); + ICING_ASSERT_OK(qualified_id_storage->PersistToDisk()); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc, + qualified_id_storage->ComputeChecksum()); + ASSERT_THAT(old_crc, Not(Eq(new_crc))); + } + + // Attempt to create the qualified id type joinable index with corrupted + // qualified_id_storage. This should fail. EXPECT_THAT(QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION, HasSubstr("Invalid storages crc"))); @@ -421,8 +461,9 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, InvalidPut) { QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); DocJoinInfo default_invalid; - EXPECT_THAT(index->Put(default_invalid, /*ref_document_id=*/0), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT( + index->Put(default_invalid, /*ref_qualified_id_str=*/"namespace#uriA"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST_F(QualifiedIdTypeJoinableIndexTest, InvalidGet) { @@ -438,13 +479,13 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, InvalidGet) { TEST_F(QualifiedIdTypeJoinableIndexTest, PutAndGet) { DocJoinInfo target_info1(/*document_id=*/1, /*joinable_property_id=*/20); - DocumentId ref_document1 = 0; + std::string_view ref_qualified_id_str_a = "namespace#uriA"; DocJoinInfo target_info2(/*document_id=*/3, /*joinable_property_id=*/13); - DocumentId ref_document2 = 2; + std::string_view ref_qualified_id_str_b = "namespace#uriB"; 
DocJoinInfo target_info3(/*document_id=*/4, /*joinable_property_id=*/4); - DocumentId ref_document3 = ref_document1; + std::string_view ref_qualified_id_str_c = "namespace#uriC"; { // Create new qualified id type joinable index @@ -452,17 +493,14 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, PutAndGet) { std::unique_ptr<QualifiedIdTypeJoinableIndex> index, QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); - EXPECT_THAT(index->Put(target_info1, /*ref_document_id=*/ref_document1), - IsOk()); - EXPECT_THAT(index->Put(target_info2, /*ref_document_id=*/ref_document2), - IsOk()); - EXPECT_THAT(index->Put(target_info3, /*ref_document_id=*/ref_document3), - IsOk()); + EXPECT_THAT(index->Put(target_info1, ref_qualified_id_str_a), IsOk()); + EXPECT_THAT(index->Put(target_info2, ref_qualified_id_str_b), IsOk()); + EXPECT_THAT(index->Put(target_info3, ref_qualified_id_str_c), IsOk()); EXPECT_THAT(index, Pointee(SizeIs(3))); - EXPECT_THAT(index->Get(target_info1), IsOkAndHolds(ref_document1)); - EXPECT_THAT(index->Get(target_info2), IsOkAndHolds(ref_document2)); - EXPECT_THAT(index->Get(target_info3), IsOkAndHolds(ref_document3)); + EXPECT_THAT(index->Get(target_info1), IsOkAndHolds(ref_qualified_id_str_a)); + EXPECT_THAT(index->Get(target_info2), IsOkAndHolds(ref_qualified_id_str_b)); + EXPECT_THAT(index->Get(target_info3), IsOkAndHolds(ref_qualified_id_str_c)); ICING_ASSERT_OK(index->PersistToDisk()); } @@ -472,15 +510,15 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, PutAndGet) { std::unique_ptr<QualifiedIdTypeJoinableIndex> index, QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); EXPECT_THAT(index, Pointee(SizeIs(3))); - EXPECT_THAT(index->Get(target_info1), IsOkAndHolds(ref_document1)); - EXPECT_THAT(index->Get(target_info2), IsOkAndHolds(ref_document2)); - EXPECT_THAT(index->Get(target_info3), IsOkAndHolds(ref_document3)); + EXPECT_THAT(index->Get(target_info1), IsOkAndHolds(ref_qualified_id_str_a)); + EXPECT_THAT(index->Get(target_info2), 
IsOkAndHolds(ref_qualified_id_str_b)); + EXPECT_THAT(index->Get(target_info3), IsOkAndHolds(ref_qualified_id_str_c)); } TEST_F(QualifiedIdTypeJoinableIndexTest, GetShouldReturnNotFoundErrorIfNotExist) { DocJoinInfo target_info(/*document_id=*/1, /*joinable_property_id=*/20); - DocumentId ref_document = 0; + std::string_view ref_qualified_id_str = "namespace#uriA"; // Create new qualified id type joinable index ICING_ASSERT_OK_AND_ASSIGN( @@ -491,8 +529,8 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, EXPECT_THAT(index->Get(target_info), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - ICING_ASSERT_OK(index->Put(target_info, /*ref_document_id=*/ref_document)); - ASSERT_THAT(index->Get(target_info), IsOkAndHolds(ref_document)); + ICING_ASSERT_OK(index->Put(target_info, ref_qualified_id_str)); + ASSERT_THAT(index->Get(target_info), IsOkAndHolds(ref_qualified_id_str)); // Get another non-existing entry. This should get NOT_FOUND_ERROR. DocJoinInfo another_target_info(/*document_id=*/2, @@ -542,34 +580,30 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, Optimize) { ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/10), - /*ref_document_id=*/0)); + /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/3), - /*ref_document_id=*/0)); + /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/8, /*joinable_property_id=*/9), - /*ref_document_id=*/2)); + /*ref_qualified_id_str=*/"namespace#uriB")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/13, /*joinable_property_id=*/4), - /*ref_document_id=*/12)); + /*ref_qualified_id_str=*/"namespace#uriC")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/21, /*joinable_property_id=*/12), - /*ref_document_id=*/12)); + /*ref_qualified_id_str=*/"namespace#uriC")); index->set_last_added_document_id(21); ASSERT_THAT(index, Pointee(SizeIs(5))); - // Used doc id: 0, 2, 3, 
5, 8, 12, 13, 21. - // Delete doc id = 2, 5, compress and keep the rest. + // Delete doc id = 5, 8, compress and keep the rest. std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId); - document_id_old_to_new[0] = 0; - document_id_old_to_new[3] = 1; - document_id_old_to_new[8] = 2; - document_id_old_to_new[12] = 3; - document_id_old_to_new[13] = 4; - document_id_old_to_new[21] = 5; - - DocumentId new_last_added_document_id = 5; + document_id_old_to_new[3] = 0; + document_id_old_to_new[13] = 1; + document_id_old_to_new[21] = 2; + + DocumentId new_last_added_document_id = 2; EXPECT_THAT( index->Optimize(document_id_old_to_new, new_last_added_document_id), IsOk()); @@ -577,48 +611,43 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, Optimize) { EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id)); // Verify Put and Get API still work normally after Optimize(). - // (old_doc_id=3, joinable_property_id=10) had old referenced doc_id = 0, - // which is now (doc_id=1, joinable_property_id=10) and referenced doc_id = 0. + // (old_doc_id=3, joinable_property_id=10), which is now (doc_id=0, + // joinable_property_id=10), has referenced qualified id str = + // "namespace#uriA". EXPECT_THAT( - index->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/10)), - IsOkAndHolds(0)); - - // (old_doc_id=5, joinable_property_id=3) had old referenced doc_id = 0, - // which is now not found since we've deleted old_doc_id = 5. It is not - // testable via Get() because there is no valid doc_id mapping for old_doc_id - // = 5 and we cannot generate a valid DocJoinInfo for it. - - // (old_doc_id=8, joinable_property_id=9) had old referenced doc_id = 2, - // which is now (doc_id=2, joinable_property_id=9), but since we've deleted - // old referenced doc_id = 2, this data should not be found after - // optimization. 
- EXPECT_THAT( - index->Get(DocJoinInfo(/*document_id=*/2, /*joinable_property_id=*/9)), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + index->Get(DocJoinInfo(/*document_id=*/0, /*joinable_property_id=*/10)), + IsOkAndHolds("namespace#uriA")); + + // (old_doc_id=5, joinable_property_id=3) and (old_doc_id=8, + // joinable_property_id=9) are now not found since we've deleted old_doc_id = + // 5, 8. It is not testable via Get() because there is no valid doc_id mapping + // for old_doc_id = 5, 8 and we cannot generate a valid DocJoinInfo for it. - // (old_doc_id=13, joinable_property_id=4) had old referenced doc_id = 12, - // which is now (doc_id=4, joinable_property_id=4) and referenced doc_id = 3. + // (old_doc_id=13, joinable_property_id=4), which is now (doc_id=1, + // joinable_property_id=4), has referenced qualified id str = + // "namespace#uriC". EXPECT_THAT( - index->Get(DocJoinInfo(/*document_id=*/4, /*joinable_property_id=*/4)), - IsOkAndHolds(3)); + index->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/4)), + IsOkAndHolds("namespace#uriC")); - // (old_doc_id=21, joinable_property_id=12) had old referenced doc_id = 12, - // which is now (doc_id=5, joinable_property_id=12) and referenced doc_id = 3. + // (old_doc_id=21, joinable_property_id=12), which is now (doc_id=2, + // joinable_property_id=12), has referenced qualified id str = + // "namespace#uriC". EXPECT_THAT( - index->Get(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/12)), - IsOkAndHolds(3)); + index->Get(DocJoinInfo(/*document_id=*/2, /*joinable_property_id=*/12)), + IsOkAndHolds("namespace#uriC")); // Joinable index should be able to work normally after Optimize(). 
ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/99, /*joinable_property_id=*/2), - /*ref_document_id=*/90)); + /*ref_qualified_id_str=*/"namespace#uriD")); index->set_last_added_document_id(99); EXPECT_THAT(index, Pointee(SizeIs(4))); EXPECT_THAT(index->last_added_document_id(), Eq(99)); EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/99, /*joinable_property_id=*/2)), - IsOkAndHolds(90)); + IsOkAndHolds("namespace#uriD")); } TEST_F(QualifiedIdTypeJoinableIndexTest, OptimizeOutOfRangeDocumentId) { @@ -628,7 +657,7 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, OptimizeOutOfRangeDocumentId) { ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/99, /*joinable_property_id=*/10), - /*ref_document_id=*/91)); + /*ref_qualified_id_str=*/"namespace#uriA")); index->set_last_added_document_id(99); // Create document_id_old_to_new with size = 1. Optimize should handle out of @@ -653,19 +682,19 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, OptimizeDeleteAll) { ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/10), - /*ref_document_id=*/0)); + /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/3), - /*ref_document_id=*/0)); + /*ref_qualified_id_str=*/"namespace#uriA")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/8, /*joinable_property_id=*/9), - /*ref_document_id=*/2)); + /*ref_qualified_id_str=*/"namespace#uriB")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/13, /*joinable_property_id=*/4), - /*ref_document_id=*/12)); + /*ref_qualified_id_str=*/"namespace#uriC")); ICING_ASSERT_OK( index->Put(DocJoinInfo(/*document_id=*/21, /*joinable_property_id=*/12), - /*ref_document_id=*/12)); + /*ref_qualified_id_str=*/"namespace#uriC")); index->set_last_added_document_id(21); // Delete all documents. 
@@ -690,9 +719,12 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, Clear) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QualifiedIdTypeJoinableIndex> index, QualifiedIdTypeJoinableIndex::Create(filesystem_, working_path_)); - ICING_ASSERT_OK(index->Put(target_info1, /*ref_document_id=*/0)); - ICING_ASSERT_OK(index->Put(target_info2, /*ref_document_id=*/2)); - ICING_ASSERT_OK(index->Put(target_info3, /*ref_document_id=*/4)); + ICING_ASSERT_OK( + index->Put(target_info1, /*ref_qualified_id_str=*/"namespace#uriA")); + ICING_ASSERT_OK( + index->Put(target_info2, /*ref_qualified_id_str=*/"namespace#uriB")); + ICING_ASSERT_OK( + index->Put(target_info3, /*ref_qualified_id_str=*/"namespace#uriC")); ASSERT_THAT(index, Pointee(SizeIs(3))); index->set_last_added_document_id(6); ASSERT_THAT(index->last_added_document_id(), Eq(6)); @@ -711,11 +743,12 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, Clear) { // Joinable index should be able to work normally after Clear(). DocJoinInfo target_info4(/*document_id=*/2, /*joinable_property_id=*/19); - ICING_ASSERT_OK(index->Put(target_info4, /*ref_document_id=*/0)); + ICING_ASSERT_OK( + index->Put(target_info4, /*ref_qualified_id_str=*/"namespace#uriD")); index->set_last_added_document_id(2); EXPECT_THAT(index->last_added_document_id(), Eq(2)); - EXPECT_THAT(index->Get(target_info4), IsOkAndHolds(0)); + EXPECT_THAT(index->Get(target_info4), IsOkAndHolds("namespace#uriD")); ICING_ASSERT_OK(index->PersistToDisk()); index.reset(); @@ -730,7 +763,7 @@ TEST_F(QualifiedIdTypeJoinableIndexTest, Clear) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(index->Get(target_info3), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - EXPECT_THAT(index->Get(target_info4), IsOkAndHolds(0)); + EXPECT_THAT(index->Get(target_info4), IsOkAndHolds("namespace#uriD")); } } // namespace diff --git a/icing/join/qualified-id.cc b/icing/join/qualified-id.cc index 2a30c44..42e080c 100644 --- a/icing/join/qualified-id.cc +++ 
b/icing/join/qualified-id.cc @@ -40,9 +40,14 @@ bool IsSpecialCharacter(char c) { // A valid index of the separator on success. // std::string::npos if the escape format of content is incorrect. // std::string::npos if the content contains 0 or more than 1 separators. +// std::string::npos if the content contains '\0'. size_t VerifyFormatAndGetSeparatorPosition(std::string_view content) { size_t separator_pos = std::string::npos; for (size_t i = 0; i < content.length(); ++i) { + if (content[i] == '\0') { + return std::string::npos; + } + if (content[i] == QualifiedId::kEscapeChar) { // Advance to the next character. ++i; diff --git a/icing/join/qualified-id_test.cc b/icing/join/qualified-id_test.cc index 0c3750a..92bf63e 100644 --- a/icing/join/qualified-id_test.cc +++ b/icing/join/qualified-id_test.cc @@ -135,6 +135,24 @@ TEST(QualifiedIdTest, InvalidQualifiedIdWithWrongNumberOfSeparators) { StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } +TEST(QualifiedIdTest, InvalidQualifiedIdWithStringTerminator) { + const char invalid_qualified_id1[] = "names\0pace#uri"; + EXPECT_THAT(QualifiedId::Parse(std::string_view(invalid_qualified_id1, 14)), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + const char invalid_qualified_id2[] = "namespace#ur\0i"; + EXPECT_THAT(QualifiedId::Parse(std::string_view(invalid_qualified_id2, 14)), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + const char invalid_qualified_id3[] = "\0namespace#uri"; + EXPECT_THAT(QualifiedId::Parse(std::string_view(invalid_qualified_id3, 14)), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + const char invalid_qualified_id4[] = "namespace#uri\0"; + EXPECT_THAT(QualifiedId::Parse(std::string_view(invalid_qualified_id4, 14)), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + } // namespace } // namespace lib diff --git a/icing/monkey_test/icing-monkey-test-runner.cc b/icing/monkey_test/icing-monkey-test-runner.cc index 
db518bd..89b8e89 100644 --- a/icing/monkey_test/icing-monkey-test-runner.cc +++ b/icing/monkey_test/icing-monkey-test-runner.cc @@ -453,9 +453,15 @@ void IcingMonkeyTestRunner::DoOptimize() { } void IcingMonkeyTestRunner::CreateIcingSearchEngine() { + std::uniform_int_distribution<> dist(0, 1); + IcingSearchEngineOptions icing_options; icing_options.set_index_merge_size(config_.index_merge_size); icing_options.set_base_dir(icing_dir_->dir()); + // The method will be called every time when we ReloadFromDisk(), so randomly + // flip this flag to test document store's compatibility. + icing_options.set_document_store_namespace_id_fingerprint( + (bool)dist(random_)); icing_ = std::make_unique<IcingSearchEngine>(icing_options); ASSERT_THAT(icing_->Initialize().status(), ProtoIsOk()); } diff --git a/icing/query/advanced_query_parser/abstract-syntax-tree.h b/icing/query/advanced_query_parser/abstract-syntax-tree.h index d18f6ea..67049ad 100644 --- a/icing/query/advanced_query_parser/abstract-syntax-tree.h +++ b/icing/query/advanced_query_parser/abstract-syntax-tree.h @@ -17,6 +17,7 @@ #include <memory> #include <string> +#include <string_view> #include <utility> #include <vector> @@ -52,24 +53,29 @@ class Node { class TerminalNode : public Node { public: - explicit TerminalNode(std::string value, bool is_prefix) - : value_(std::move(value)), is_prefix_(is_prefix) {} + explicit TerminalNode(std::string value, std::string_view raw_value, + bool is_prefix) + : value_(std::move(value)), + raw_value_(raw_value), + is_prefix_(is_prefix) {} const std::string& value() const& { return value_; } std::string value() && { return std::move(value_); } bool is_prefix() const { return is_prefix_; } + std::string_view raw_value() const { return raw_value_; } + private: std::string value_; + std::string_view raw_value_; bool is_prefix_; }; class FunctionNameNode : public TerminalNode { public: explicit FunctionNameNode(std::string value) - : TerminalNode(std::move(value), 
/*is_prefix=*/false) {} - + : TerminalNode(std::move(value), /*raw_value=*/"", /*is_prefix=*/false) {} void Accept(AbstractSyntaxTreeVisitor* visitor) const override { visitor->VisitFunctionName(this); } @@ -77,9 +83,9 @@ class FunctionNameNode : public TerminalNode { class StringNode : public TerminalNode { public: - explicit StringNode(std::string value, bool is_prefix = false) - : TerminalNode(std::move(value), is_prefix) {} - + explicit StringNode(std::string value, std::string_view raw_value, + bool is_prefix = false) + : TerminalNode(std::move(value), raw_value, is_prefix) {} void Accept(AbstractSyntaxTreeVisitor* visitor) const override { visitor->VisitString(this); } @@ -87,9 +93,9 @@ class StringNode : public TerminalNode { class TextNode : public TerminalNode { public: - explicit TextNode(std::string value, bool is_prefix = false) - : TerminalNode(std::move(value), is_prefix) {} - + explicit TextNode(std::string value, std::string_view raw_value, + bool is_prefix = false) + : TerminalNode(std::move(value), raw_value, is_prefix) {} void Accept(AbstractSyntaxTreeVisitor* visitor) const override { visitor->VisitText(this); } diff --git a/icing/query/advanced_query_parser/abstract-syntax-tree_test.cc b/icing/query/advanced_query_parser/abstract-syntax-tree_test.cc index a8599fd..5e28278 100644 --- a/icing/query/advanced_query_parser/abstract-syntax-tree_test.cc +++ b/icing/query/advanced_query_parser/abstract-syntax-tree_test.cc @@ -27,8 +27,8 @@ namespace { using ::testing::ElementsAre; TEST(AbstractSyntaxTreeTest, Simple) { - // foo - std::unique_ptr<Node> root = std::make_unique<TextNode>("foo"); + std::string_view query = "foo"; + std::unique_ptr<Node> root = std::make_unique<TextNode>("foo", query); SimpleVisitor visitor; root->Accept(&visitor); @@ -37,16 +37,16 @@ TEST(AbstractSyntaxTreeTest, Simple) { } TEST(AbstractSyntaxTreeTest, Composite) { - // (foo bar) OR baz + std::string_view query = "(foo bar) OR baz"; std::vector<std::unique_ptr<Node>> 
and_args; - and_args.push_back(std::make_unique<TextNode>("foo")); - and_args.push_back(std::make_unique<TextNode>("bar")); + and_args.push_back(std::make_unique<TextNode>("foo", query.substr(1, 3))); + and_args.push_back(std::make_unique<TextNode>("bar", query.substr(5, 3))); auto and_node = std::make_unique<NaryOperatorNode>("AND", std::move(and_args)); std::vector<std::unique_ptr<Node>> or_args; or_args.push_back(std::move(and_node)); - or_args.push_back(std::make_unique<TextNode>("baz")); + or_args.push_back(std::make_unique<TextNode>("baz", query.substr(13, 3))); std::unique_ptr<Node> root = std::make_unique<NaryOperatorNode>("OR", std::move(or_args)); @@ -72,9 +72,9 @@ TEST(AbstractSyntaxTreeTest, Function) { ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName), EqualsNodeInfo("", NodeType::kFunction))); - // foo("bar") + std::string_view query = "foo(\"bar\")"; std::vector<std::unique_ptr<Node>> args; - args.push_back(std::make_unique<StringNode>("bar")); + args.push_back(std::make_unique<StringNode>("bar", query.substr(5, 3))); root = std::make_unique<FunctionNode>( std::make_unique<FunctionNameNode>("foo"), std::move(args)); visitor = SimpleVisitor(); @@ -85,9 +85,9 @@ TEST(AbstractSyntaxTreeTest, Function) { EqualsNodeInfo("bar", NodeType::kString), EqualsNodeInfo("", NodeType::kFunction))); - // foo(bar("baz")) + query = "foo(bar(\"baz\"))"; std::vector<std::unique_ptr<Node>> inner_args; - inner_args.push_back(std::make_unique<StringNode>("baz")); + inner_args.push_back(std::make_unique<StringNode>("baz", query.substr(9, 3))); args.clear(); args.push_back(std::make_unique<FunctionNode>( std::make_unique<FunctionNameNode>("bar"), std::move(inner_args))); @@ -105,14 +105,16 @@ TEST(AbstractSyntaxTreeTest, Function) { } TEST(AbstractSyntaxTreeTest, Restriction) { - // sender.name:(IMPORTANT OR URGENT) + std::string_view query = "sender.name:(IMPORTANT OR URGENT)"; std::vector<std::unique_ptr<TextNode>> member_args; - 
member_args.push_back(std::make_unique<TextNode>("sender")); - member_args.push_back(std::make_unique<TextNode>("name")); + member_args.push_back( + std::make_unique<TextNode>("sender", query.substr(0, 6))); + member_args.push_back(std::make_unique<TextNode>("name", query.substr(7, 4))); std::vector<std::unique_ptr<Node>> or_args; - or_args.push_back(std::make_unique<TextNode>("IMPORTANT")); - or_args.push_back(std::make_unique<TextNode>("URGENT")); + or_args.push_back( + std::make_unique<TextNode>("IMPORTANT", query.substr(13, 9))); + or_args.push_back(std::make_unique<TextNode>("URGENT", query.substr(26, 6))); std::vector<std::unique_ptr<Node>> has_args; has_args.push_back(std::make_unique<MemberNode>(std::move(member_args), diff --git a/icing/query/advanced_query_parser/function_test.cc b/icing/query/advanced_query_parser/function_test.cc index 3b3ca40..afd4e04 100644 --- a/icing/query/advanced_query_parser/function_test.cc +++ b/icing/query/advanced_query_parser/function_test.cc @@ -63,10 +63,10 @@ TEST(FunctionTest, ParamNotWrongTypeFails) { Function function, Function::Create(/*return_type=*/DataType::kString, "foo", /*params=*/{Param(DataType::kString)}, TrivialEval())); - // foo(bar) + std::string_view query = "foo(bar)"; std::vector<PendingValue> args; args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(4, 3), /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -77,10 +77,10 @@ TEST(FunctionTest, ParamRequiredArgSucceeds) { Function::Create(/*return_type=*/DataType::kString, "foo", /*params=*/{Param(DataType::kString)}, TrivialEval())); - // foo("bar") + std::string_view query = R"(foo("bar"))"; std::vector<PendingValue> args; args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); 
ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); } @@ -136,19 +136,19 @@ TEST(FunctionTest, MultipleArgsTrailingOptionalSucceeds) { Param(DataType::kString, Cardinality::kOptional)}, TrivialEval())); - // foo("bar") + std::string_view query = R"(foo("bar"))"; std::vector<PendingValue> args; args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar", "baz") + query = R"(foo("bar", "baz"))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(12, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); } @@ -162,30 +162,30 @@ TEST(FunctionTest, MultipleArgsTrailingVariableSucceeds) { Param(DataType::kString, Cardinality::kVariable)}, TrivialEval())); - // foo("bar") + std::string_view query = R"(foo("bar"))"; std::vector<PendingValue> args; args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar", "baz") + query = R"(foo("bar", "baz"))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); 
args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(12, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar", "baz", "bat") + query = R"(foo("bar", "baz", "bat"))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(12, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bat", /*is_prefix_val=*/false})); + QueryTerm{"bat", query.substr(19, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); } @@ -214,26 +214,26 @@ TEST(FunctionTest, MultipleArgsOptionalBeforeOptionalSucceeds) { ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar") + std::string_view query = R"(foo("bar"))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar", baz) + query = R"(foo("bar", baz))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(11, 3), 
/*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo(baz) + query = R"(foo(baz))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(4, 3), /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -252,46 +252,46 @@ TEST(FunctionTest, MultipleArgsOptionalBeforeVariableSucceeds) { ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar") + std::string_view query = R"(foo("bar"))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar", baz) + query = R"(foo("bar", baz))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(11, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar", baz, bat) + query = R"(foo("bar", baz, bat))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", 
query.substr(11, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"bat", /*is_prefix_val=*/false})); + QueryTerm{"bat", query.substr(16, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo(baz) + query = R"(foo(baz))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(4, 3), /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - // foo(baz, bat) + query = R"(foo(baz, bat))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(4, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"bat", /*is_prefix_val=*/false})); + QueryTerm{"bat", query.substr(9, 3), /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } diff --git a/icing/query/advanced_query_parser/lexer.cc b/icing/query/advanced_query_parser/lexer.cc index 6cddd96..0dd0bb0 100644 --- a/icing/query/advanced_query_parser/lexer.cc +++ b/icing/query/advanced_query_parser/lexer.cc @@ -38,12 +38,13 @@ bool Lexer::ConsumeWhitespace() { } bool Lexer::ConsumeQuerySingleChar() { + std::string_view original_text = query_.substr(current_index_, 1); switch (current_char_) { case ':': - tokens_.push_back({":", TokenType::COMPARATOR}); + tokens_.push_back({":", original_text, TokenType::COMPARATOR}); break; case '*': - tokens_.push_back({"", TokenType::STAR}); + tokens_.push_back({"", original_text, TokenType::STAR}); break; case '-': if (in_text_) { @@ -51,7 +52,7 @@ bool Lexer::ConsumeQuerySingleChar() { // in the middle of a TEXT segment (ex. 
`foo-bar`). return false; } - tokens_.push_back({"", TokenType::MINUS}); + tokens_.push_back({"", original_text, TokenType::MINUS}); break; default: return false; @@ -61,18 +62,19 @@ bool Lexer::ConsumeQuerySingleChar() { } bool Lexer::ConsumeScoringSingleChar() { + std::string_view original_text = query_.substr(current_index_, 1); switch (current_char_) { case '+': - tokens_.push_back({"", TokenType::PLUS}); + tokens_.push_back({"", original_text, TokenType::PLUS}); break; case '*': - tokens_.push_back({"", TokenType::TIMES}); + tokens_.push_back({"", original_text, TokenType::TIMES}); break; case '/': - tokens_.push_back({"", TokenType::DIV}); + tokens_.push_back({"", original_text, TokenType::DIV}); break; case '-': - tokens_.push_back({"", TokenType::MINUS}); + tokens_.push_back({"", original_text, TokenType::MINUS}); break; default: return false; @@ -82,18 +84,19 @@ bool Lexer::ConsumeScoringSingleChar() { } bool Lexer::ConsumeGeneralSingleChar() { + std::string_view original_text = query_.substr(current_index_, 1); switch (current_char_) { case ',': - tokens_.push_back({"", TokenType::COMMA}); + tokens_.push_back({"", original_text, TokenType::COMMA}); break; case '.': - tokens_.push_back({"", TokenType::DOT}); + tokens_.push_back({"", original_text, TokenType::DOT}); break; case '(': - tokens_.push_back({"", TokenType::LPAREN}); + tokens_.push_back({"", original_text, TokenType::LPAREN}); break; case ')': - tokens_.push_back({"", TokenType::RPAREN}); + tokens_.push_back({"", original_text, TokenType::RPAREN}); break; default: return false; @@ -124,13 +127,17 @@ bool Lexer::ConsumeComparator() { // Matching for '<=', '>=', '!=', or '=='. char next_char = PeekNext(1); if (next_char == '=') { - tokens_.push_back({{current_char_, next_char}, TokenType::COMPARATOR}); + tokens_.push_back({{current_char_, next_char}, + query_.substr(current_index_, 2), + TokenType::COMPARATOR}); Advance(2); return true; } // Now, next_char must not be '='. 
Let's match for '<' and '>'. if (current_char_ == '<' || current_char_ == '>') { - tokens_.push_back({{current_char_}, TokenType::COMPARATOR}); + tokens_.push_back({{current_char_}, + query_.substr(current_index_, 1), + TokenType::COMPARATOR}); Advance(); return true; } @@ -145,10 +152,11 @@ bool Lexer::ConsumeAndOr() { if (current_char_ != next_char) { return false; } + std::string_view original_text = query_.substr(current_index_, 2); if (current_char_ == '&') { - tokens_.push_back({"", TokenType::AND}); + tokens_.push_back({"", original_text, TokenType::AND}); } else { - tokens_.push_back({"", TokenType::OR}); + tokens_.push_back({"", original_text, TokenType::OR}); } Advance(2); return true; @@ -158,37 +166,42 @@ bool Lexer::ConsumeStringLiteral() { if (current_char_ != '"') { return false; } - std::string text; Advance(); + int32_t unnormalized_start_pos = current_index_; while (current_char_ != '\0' && current_char_ != '"') { // When getting a backslash, we will always match the next character, even // if the next character is a quotation mark if (current_char_ == '\\') { - text.push_back(current_char_); Advance(); if (current_char_ == '\0') { // In this case, we are missing a terminating quotation mark. 
break; } } - text.push_back(current_char_); Advance(); } if (current_char_ == '\0') { SyntaxError("missing terminating \" character"); return false; } - tokens_.push_back({text, TokenType::STRING}); + int32_t unnormalized_length = current_index_ - unnormalized_start_pos; + std::string_view raw_token_text = + query_.substr(unnormalized_start_pos, unnormalized_length); + std::string token_text(raw_token_text); + tokens_.push_back({std::move(token_text), raw_token_text, TokenType::STRING}); Advance(); return true; } -bool Lexer::Text() { +bool Lexer::ConsumeText() { if (current_char_ == '\0') { return false; } - tokens_.push_back({"", TokenType::TEXT}); + tokens_.push_back({"", query_.substr(current_index_, 0), TokenType::TEXT}); int token_index = tokens_.size() - 1; + + int32_t unnormalized_start_pos = current_index_; + int32_t unnormalized_end_pos = current_index_; while (!ConsumeNonText() && current_char_ != '\0') { in_text_ = true; // When getting a backslash in TEXT, unescape it by accepting its following @@ -203,14 +216,18 @@ bool Lexer::Text() { } tokens_[token_index].text.push_back(current_char_); Advance(); - if (current_char_ == '(') { - // A TEXT followed by a LPAREN is a FUNCTION_NAME. - tokens_.back().type = TokenType::FUNCTION_NAME; - // No need to break, since NonText() must be true at this point. - } + unnormalized_end_pos = current_index_; } in_text_ = false; + tokens_[token_index].original_text = query_.substr( + unnormalized_start_pos, unnormalized_end_pos - unnormalized_start_pos); + if (unnormalized_end_pos < query_.length() && + query_[unnormalized_end_pos] == '(') { + // A TEXT followed by a LPAREN is a FUNCTION_NAME. + tokens_[token_index].type = TokenType::FUNCTION_NAME; + } + if (language_ == Lexer::Language::QUERY) { std::string &text = tokens_[token_index].text; TokenType &type = tokens_[token_index].type; @@ -234,7 +251,7 @@ Lexer::ExtractTokens() { // Clear out any non-text before matching a Text. 
while (ConsumeNonText()) { } - Text(); + ConsumeText(); } if (!error_.empty()) { return absl_ports::InvalidArgumentError( diff --git a/icing/query/advanced_query_parser/lexer.h b/icing/query/advanced_query_parser/lexer.h index f7f06dc..b313fa7 100644 --- a/icing/query/advanced_query_parser/lexer.h +++ b/icing/query/advanced_query_parser/lexer.h @@ -48,7 +48,9 @@ class Lexer { AND, // 'AND' | '&&' Not allowed in SCORING language. OR, // 'OR' | '||' Not allowed in SCORING language. NOT, // 'NOT' Not allowed in SCORING language. - STRING, // String literal surrounded by quotation marks + STRING, // String literal surrounded by quotation marks. The + // original_text of a STRING token will not include quotation + // marks. TEXT, // A sequence of chars that are not any above-listed operator FUNCTION_NAME, // A TEXT followed by LPAREN. // Whitespaces not inside a string literal will be skipped. @@ -69,6 +71,10 @@ class Lexer { // For other types, this field will be empty. std::string text; + // Lifecycle is dependent on the lifecycle of the string pointed to by + // query_. + std::string_view original_text; + // The type of the token. TokenType type; }; @@ -141,8 +147,9 @@ class Lexer { } // Try to match TEXT, FUNCTION_NAME, 'AND', 'OR' and 'NOT'. - // Should make sure that NonText() is false before calling into this method. - bool Text(); + // REQUIRES: ConsumeNonText() must be called immediately before calling this + // function. 
+ bool ConsumeText(); std::string_view query_; std::string error_; diff --git a/icing/query/advanced_query_parser/parser.cc b/icing/query/advanced_query_parser/parser.cc index 0e4c78d..fd74561 100644 --- a/icing/query/advanced_query_parser/parser.cc +++ b/icing/query/advanced_query_parser/parser.cc @@ -55,7 +55,8 @@ libtextclassifier3::StatusOr<std::unique_ptr<TextNode>> Parser::ConsumeText() { if (!Match(Lexer::TokenType::TEXT)) { return absl_ports::InvalidArgumentError("Unable to consume token as TEXT."); } - auto text_node = std::make_unique<TextNode>(std::move(current_token_->text)); + auto text_node = std::make_unique<TextNode>(std::move(current_token_->text), + current_token_->original_text); ++current_token_; return text_node; } @@ -81,6 +82,7 @@ Parser::ConsumeStringElement() { "Unable to consume token as STRING."); } std::string text = std::move(current_token_->text); + std::string_view raw_text = current_token_->original_text; ++current_token_; bool is_prefix = false; @@ -89,7 +91,7 @@ Parser::ConsumeStringElement() { ++current_token_; } - return std::make_unique<StringNode>(std::move(text), is_prefix); + return std::make_unique<StringNode>(std::move(text), raw_text, is_prefix); } libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() { @@ -115,7 +117,9 @@ Parser::ConsumeMember() { // at this point. So check for 'STAR' to differentiate the two cases. 
if (Match(Lexer::TokenType::STAR)) { Consume(Lexer::TokenType::STAR); - text_node = std::make_unique<TextNode>(std::move(*text_node).value(), + std::string_view raw_text = text_node->raw_value(); + std::string text = std::move(*text_node).value(); + text_node = std::make_unique<TextNode>(std::move(text), raw_text, /*is_prefix=*/true); children.push_back(std::move(text_node)); } else { diff --git a/icing/query/advanced_query_parser/parser_test.cc b/icing/query/advanced_query_parser/parser_test.cc index 502dbd3..824c2ce 100644 --- a/icing/query/advanced_query_parser/parser_test.cc +++ b/icing/query/advanced_query_parser/parser_test.cc @@ -46,9 +46,9 @@ TEST(ParserTest, EmptyScoring) { } TEST(ParserTest, SingleTerm) { - // Query: "foo" + std::string_view query = "foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}}; + {"foo", query, Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -67,9 +67,10 @@ TEST(ParserTest, SingleTerm) { } TEST(ParserTest, ImplicitAnd) { - // Query: "foo bar" + std::string_view query = "foo bar"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {"bar", Lexer::TokenType::TEXT}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"bar", query.substr(4, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -93,11 +94,11 @@ TEST(ParserTest, ImplicitAnd) { } TEST(ParserTest, Or) { - // Query: "foo OR bar" + std::string_view query = "foo OR bar"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::OR}, - {"bar", Lexer::TokenType::TEXT}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(4, 2), Lexer::TokenType::OR}, + {"bar", query.substr(7, 3), 
Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -121,11 +122,11 @@ TEST(ParserTest, Or) { } TEST(ParserTest, And) { - // Query: "foo AND bar" + std::string_view query = "foo AND bar"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::AND}, - {"bar", Lexer::TokenType::TEXT}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(4, 3), Lexer::TokenType::AND}, + {"bar", query.substr(8, 4), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -149,9 +150,10 @@ TEST(ParserTest, And) { } TEST(ParserTest, Not) { - // Query: "NOT foo" + std::string_view query = "NOT foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"", Lexer::TokenType::NOT}, {"foo", Lexer::TokenType::TEXT}}; + {"", query.substr(0, 3), Lexer::TokenType::NOT}, + {"foo", query.substr(4, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -173,9 +175,10 @@ TEST(ParserTest, Not) { } TEST(ParserTest, Minus) { - // Query: "-foo" + std::string_view query = "-foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"", Lexer::TokenType::MINUS}, {"foo", Lexer::TokenType::TEXT}}; + {"", query.substr(0, 1), Lexer::TokenType::MINUS}, + {"foo", query.substr(1, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -197,11 +200,11 @@ TEST(ParserTest, Minus) { } TEST(ParserTest, Has) { - // Query: "subject:foo" + std::string_view query = "subject:foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"subject", Lexer::TokenType::TEXT}, - {":", 
Lexer::TokenType::COMPARATOR}, - {"foo", Lexer::TokenType::TEXT}}; + {"subject", query.substr(0, 7), Lexer::TokenType::TEXT}, + {":", query.substr(7, 1), Lexer::TokenType::COMPARATOR}, + {"foo", query.substr(8, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -225,13 +228,13 @@ TEST(ParserTest, Has) { } TEST(ParserTest, HasNested) { - // Query: "sender.name:foo" + std::string_view query = "sender.name:foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"sender", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DOT}, - {"name", Lexer::TokenType::TEXT}, - {":", Lexer::TokenType::COMPARATOR}, - {"foo", Lexer::TokenType::TEXT}}; + {"sender", query.substr(0, 6), Lexer::TokenType::TEXT}, + {"", query.substr(6, 1), Lexer::TokenType::DOT}, + {"name", query.substr(7, 4), Lexer::TokenType::TEXT}, + {":", query.substr(11, 1), Lexer::TokenType::COMPARATOR}, + {"foo", query.substr(12, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -256,11 +259,11 @@ TEST(ParserTest, HasNested) { } TEST(ParserTest, EmptyFunction) { - // Query: "foo()" + std::string_view query = "foo()"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"", query.substr(4, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -279,12 +282,12 @@ TEST(ParserTest, EmptyFunction) { } TEST(ParserTest, FunctionSingleArg) { - // Query: "foo("bar")" + std::string_view query = "foo(\"bar\")"; 
std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"bar", Lexer::TokenType::STRING}, - {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(5, 3), Lexer::TokenType::STRING}, + {"", query.substr(8, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -304,11 +307,14 @@ TEST(ParserTest, FunctionSingleArg) { } TEST(ParserTest, FunctionMultiArg) { - // Query: "foo("bar", "baz")" + std::string_view query = "foo(\"bar\", \"baz\")"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, {"", Lexer::TokenType::LPAREN}, - {"bar", Lexer::TokenType::STRING}, {"", Lexer::TokenType::COMMA}, - {"baz", Lexer::TokenType::STRING}, {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(5, 3), Lexer::TokenType::STRING}, + {"", query.substr(9, 1), Lexer::TokenType::COMMA}, + {"baz", query.substr(12, 3), Lexer::TokenType::STRING}, + {"", query.substr(16, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -329,11 +335,14 @@ TEST(ParserTest, FunctionMultiArg) { } TEST(ParserTest, FunctionNested) { - // Query: "foo(bar())" + std::string_view query = "foo(bar())"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, {"", Lexer::TokenType::LPAREN}, - {"bar", Lexer::TokenType::FUNCTION_NAME}, {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::RPAREN}, {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", 
query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(4, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(7, 1), Lexer::TokenType::LPAREN}, + {"", query.substr(8, 1), Lexer::TokenType::RPAREN}, + {"", query.substr(9, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -356,13 +365,13 @@ TEST(ParserTest, FunctionNested) { } TEST(ParserTest, FunctionWithTrailingSequence) { - // Query: "foo() OR bar" + std::string_view query = "foo() OR bar"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::OR}, - {"bar", Lexer::TokenType::TEXT}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"", query.substr(4, 1), Lexer::TokenType::RPAREN}, + {"", query.substr(6, 2), Lexer::TokenType::OR}, + {"bar", query.substr(9, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -386,11 +395,14 @@ TEST(ParserTest, FunctionWithTrailingSequence) { } TEST(ParserTest, Composite) { - // Query: "foo OR (bar baz)" + std::string_view query = "foo OR (bar baz)"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::OR}, - {"", Lexer::TokenType::LPAREN}, {"bar", Lexer::TokenType::TEXT}, - {"baz", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(4, 2), Lexer::TokenType::OR}, + {"", query.substr(7, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(8, 3), Lexer::TokenType::TEXT}, + {"baz", query.substr(12, 3), Lexer::TokenType::TEXT}, + {"", query.substr(15, 1), Lexer::TokenType::RPAREN}}; Parser 
parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -419,11 +431,14 @@ TEST(ParserTest, Composite) { } TEST(ParserTest, CompositeWithTrailingSequence) { - // Query: "(bar baz) OR foo" + std::string_view query = "(bar baz) OR foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"", Lexer::TokenType::LPAREN}, {"bar", Lexer::TokenType::TEXT}, - {"baz", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::OR}, {"foo", Lexer::TokenType::TEXT}}; + {"", query.substr(0, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(1, 3), Lexer::TokenType::TEXT}, + {"baz", query.substr(5, 3), Lexer::TokenType::TEXT}, + {"", query.substr(8, 1), Lexer::TokenType::RPAREN}, + {"", query.substr(10, 2), Lexer::TokenType::OR}, + {"foo", query.substr(13, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -452,17 +467,17 @@ TEST(ParserTest, CompositeWithTrailingSequence) { } TEST(ParserTest, Complex) { - // Query: "foo bar:baz OR pal("bat")" + std::string_view query = R"(foo bar:baz OR pal("bat"))"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, - {"bar", Lexer::TokenType::TEXT}, - {":", Lexer::TokenType::COMPARATOR}, - {"baz", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::OR}, - {"pal", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"bat", Lexer::TokenType::STRING}, - {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"bar", query.substr(4, 3), Lexer::TokenType::TEXT}, + {":", query.substr(7, 1), Lexer::TokenType::COMPARATOR}, + {"baz", query.substr(8, 3), Lexer::TokenType::TEXT}, + {"", query.substr(12, 2), Lexer::TokenType::OR}, + {"pal", query.substr(15, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(18, 1), Lexer::TokenType::LPAREN}, + 
{"bat", query.substr(20, 3), Lexer::TokenType::STRING}, + {"", query.substr(24, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -498,107 +513,116 @@ TEST(ParserTest, Complex) { } TEST(ParserTest, InvalidHas) { - // Query: "foo:" No right hand operand to : + std::string_view query = "foo:"; // No right hand operand to : std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {":", Lexer::TokenType::COMPARATOR}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {":", query.substr(3, 1), Lexer::TokenType::COMPARATOR}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidComposite) { - // Query: "(foo bar" No terminating RPAREN + std::string_view query = "(foo bar"; // No terminating RPAREN std::vector<Lexer::LexerToken> lexer_tokens = { - {"", Lexer::TokenType::LPAREN}, - {"foo", Lexer::TokenType::TEXT}, - {"bar", Lexer::TokenType::TEXT}}; + {"", query.substr(0, 1), Lexer::TokenType::LPAREN}, + {"foo", query.substr(1, 3), Lexer::TokenType::TEXT}, + {"bar", query.substr(5, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidMember) { - // Query: "foo." 
DOT must have succeeding TEXT + std::string_view query = "foo."; // DOT must have succeeding TEXT std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DOT}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(3, 1), Lexer::TokenType::DOT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidOr) { - // Query: "foo OR" No right hand operand to OR + std::string_view query = "foo OR"; // No right hand operand to OR std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::OR}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(3, 2), Lexer::TokenType::OR}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidAnd) { - // Query: "foo AND" No right hand operand to AND + std::string_view query = "foo AND"; // No right hand operand to AND std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::AND}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(4, 3), Lexer::TokenType::AND}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidNot) { - // Query: "NOT" No right hand operand to NOT - std::vector<Lexer::LexerToken> lexer_tokens = {{"", Lexer::TokenType::NOT}}; + std::string_view query = "NOT"; // No right hand operand to NOT + std::vector<Lexer::LexerToken> lexer_tokens = { + {"", query.substr(0, 3), Lexer::TokenType::NOT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } 
TEST(ParserTest, InvalidMinus) { - // Query: "-" No right hand operand to - - std::vector<Lexer::LexerToken> lexer_tokens = {{"", Lexer::TokenType::MINUS}}; + std::string_view query = "-"; // No right hand operand to - + std::vector<Lexer::LexerToken> lexer_tokens = { + {"", query.substr(0, 1), Lexer::TokenType::MINUS}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidFunctionCallNoRparen) { - // Query: "foo(" No terminating RPAREN + std::string_view query = "foo("; // No terminating RPAREN std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, {"", Lexer::TokenType::LPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 0), Lexer::TokenType::LPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidFunctionCallNoLparen) { - // Query: "foo bar" foo labeled FUNCTION_NAME despite no LPAREN + std::string_view query = + "foo bar"; // foo labeled FUNCTION_NAME despite no LPAREN std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"bar", Lexer::TokenType::FUNCTION_NAME}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"bar", query.substr(4, 3), Lexer::TokenType::FUNCTION_NAME}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidFunctionArgsHangingComma) { - // Query: "foo("bar",)" no valid arg following COMMA + std::string_view query = R"(foo("bar",))"; // no valid arg following COMMA std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"bar", Lexer::TokenType::STRING}, - 
{"", Lexer::TokenType::COMMA}, - {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(5, 3), Lexer::TokenType::STRING}, + {"", query.substr(9, 1), Lexer::TokenType::COMMA}, + {"", query.substr(10, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, ScoringPlus) { - // Scoring: "1 + 1 + 1" - std::vector<Lexer::LexerToken> lexer_tokens = {{"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, - {"1", Lexer::TokenType::TEXT}}; + std::string_view scoring_exp = "1 + 1 + 1"; + std::vector<Lexer::LexerToken> lexer_tokens = { + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS}, + {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(6, 1), Lexer::TokenType::PLUS}, + {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -622,12 +646,13 @@ TEST(ParserTest, ScoringPlus) { } TEST(ParserTest, ScoringMinus) { - // Scoring: "1 - 1 - 1" - std::vector<Lexer::LexerToken> lexer_tokens = {{"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::MINUS}, - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::MINUS}, - {"1", Lexer::TokenType::TEXT}}; + std::string_view scoring_exp = "1 - 1 - 1"; + std::vector<Lexer::LexerToken> lexer_tokens = { + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::MINUS}, + {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(6, 1), Lexer::TokenType::MINUS}, + {"1", scoring_exp.substr(8, 
1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -651,11 +676,14 @@ TEST(ParserTest, ScoringMinus) { } TEST(ParserTest, ScoringUnaryMinus) { - // Scoring: "1 + -1 + 1" + std::string_view scoring_exp = "1 + -1 + 1"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"1", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::PLUS}, - {"", Lexer::TokenType::MINUS}, {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, {"1", Lexer::TokenType::TEXT}}; + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS}, + {"", scoring_exp.substr(4, 1), Lexer::TokenType::MINUS}, + {"1", scoring_exp.substr(5, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(7, 1), Lexer::TokenType::PLUS}, + {"1", scoring_exp.substr(9, 1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -682,12 +710,15 @@ TEST(ParserTest, ScoringUnaryMinus) { } TEST(ParserTest, ScoringPlusMinus) { - // Scoring: "11 + 12 - 13 + 14" + std::string_view scoring_exp = "11 + 12 - 13 + 14"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"11", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::PLUS}, - {"12", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::MINUS}, - {"13", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::PLUS}, - {"14", Lexer::TokenType::TEXT}}; + {"11", scoring_exp.substr(0, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(3, 1), Lexer::TokenType::PLUS}, + {"12", scoring_exp.substr(5, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(6, 1), Lexer::TokenType::MINUS}, + {"13", scoring_exp.substr(8, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(11, 1), Lexer::TokenType::PLUS}, + {"14", scoring_exp.substr(13, 2), Lexer::TokenType::TEXT}}; Parser parser = 
Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -719,12 +750,13 @@ TEST(ParserTest, ScoringPlusMinus) { } TEST(ParserTest, ScoringTimes) { - // Scoring: "1 * 1 * 1" - std::vector<Lexer::LexerToken> lexer_tokens = {{"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::TIMES}, - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::TIMES}, - {"1", Lexer::TokenType::TEXT}}; + std::string_view scoring_exp = "1 * 1 * 1"; + std::vector<Lexer::LexerToken> lexer_tokens = { + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::TIMES}, + {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(6, 1), Lexer::TokenType::TIMES}, + {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -748,12 +780,13 @@ TEST(ParserTest, ScoringTimes) { } TEST(ParserTest, ScoringDiv) { - // Scoring: "1 / 1 / 1" - std::vector<Lexer::LexerToken> lexer_tokens = {{"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DIV}, - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DIV}, - {"1", Lexer::TokenType::TEXT}}; + std::string_view scoring_exp = "1 / 1 / 1"; + std::vector<Lexer::LexerToken> lexer_tokens = { + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::DIV}, + {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(6, 1), Lexer::TokenType::DIV}, + {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -777,13 +810,17 @@ TEST(ParserTest, ScoringDiv) { } TEST(ParserTest, ScoringTimesDiv) { - // Scoring: "11 / 12 * 13 / 14 / 15" + std::string_view 
scoring_exp = "11 / 12 * 13 / 14 / 15"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"11", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DIV}, - {"12", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::TIMES}, - {"13", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DIV}, - {"14", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DIV}, - {"15", Lexer::TokenType::TEXT}}; + {"11", scoring_exp.substr(0, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(3, 1), Lexer::TokenType::DIV}, + {"12", scoring_exp.substr(5, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(8, 1), Lexer::TokenType::TIMES}, + {"13", scoring_exp.substr(10, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(13, 1), Lexer::TokenType::DIV}, + {"14", scoring_exp.substr(15, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(18, 1), Lexer::TokenType::DIV}, + {"15", scoring_exp.substr(20, 2), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -817,29 +854,29 @@ TEST(ParserTest, ScoringTimesDiv) { } TEST(ParserTest, ComplexScoring) { - // Scoring: "1 + pow((2 * sin(3)), 4) + -5 / 6" + std::string_view scoring_exp = "1 + pow((2 * sin(3)), 4) + -5 / 6"; // With parentheses in function arguments. 
std::vector<Lexer::LexerToken> lexer_tokens = { - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, - {"pow", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::LPAREN}, - {"2", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::TIMES}, - {"sin", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"3", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::COMMA}, - {"4", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::PLUS}, - {"", Lexer::TokenType::MINUS}, - {"5", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DIV}, - {"6", Lexer::TokenType::TEXT}, + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS}, + {"pow", scoring_exp.substr(4, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(7, 1), Lexer::TokenType::LPAREN}, + {"", scoring_exp.substr(8, 1), Lexer::TokenType::LPAREN}, + {"2", scoring_exp.substr(9, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(11, 1), Lexer::TokenType::TIMES}, + {"sin", scoring_exp.substr(13, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(16, 1), Lexer::TokenType::LPAREN}, + {"3", scoring_exp.substr(17, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(18, 1), Lexer::TokenType::RPAREN}, + {"", scoring_exp.substr(19, 1), Lexer::TokenType::RPAREN}, + {"", scoring_exp.substr(20, 1), Lexer::TokenType::COMMA}, + {"4", scoring_exp.substr(22, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(23, 1), Lexer::TokenType::RPAREN}, + {"", scoring_exp.substr(25, 1), Lexer::TokenType::PLUS}, + {"", scoring_exp.substr(27, 1), Lexer::TokenType::MINUS}, + {"5", scoring_exp.substr(28, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(30, 1), Lexer::TokenType::DIV}, + {"6", scoring_exp.substr(32, 1), Lexer::TokenType::TEXT}, }; Parser parser = 
Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, @@ -869,27 +906,27 @@ TEST(ParserTest, ComplexScoring) { EqualsNodeInfo("DIV", NodeType::kNaryOperator), EqualsNodeInfo("PLUS", NodeType::kNaryOperator))); - // Scoring: "1 + pow(2 * sin(3), 4) + -5 / 6" + scoring_exp = "1 + pow(2 * sin(3), 4) + -5 / 6"; // Without parentheses in function arguments. lexer_tokens = { - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, - {"pow", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"2", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::TIMES}, - {"sin", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"3", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::COMMA}, - {"4", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::PLUS}, - {"", Lexer::TokenType::MINUS}, - {"5", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DIV}, - {"6", Lexer::TokenType::TEXT}, + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS}, + {"pow", scoring_exp.substr(4, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(7, 1), Lexer::TokenType::LPAREN}, + {"2", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(10, 1), Lexer::TokenType::TIMES}, + {"sin", scoring_exp.substr(12, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(15, 1), Lexer::TokenType::LPAREN}, + {"3", scoring_exp.substr(16, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(17, 1), Lexer::TokenType::RPAREN}, + {"", scoring_exp.substr(18, 1), Lexer::TokenType::COMMA}, + {"4", scoring_exp.substr(20, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(21, 1), Lexer::TokenType::RPAREN}, + {"", scoring_exp.substr(23, 1), Lexer::TokenType::PLUS}, + {"", scoring_exp.substr(25, 1), Lexer::TokenType::MINUS}, + {"5", scoring_exp.substr(26, 1), 
Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(28, 1), Lexer::TokenType::DIV}, + {"6", scoring_exp.substr(30, 1), Lexer::TokenType::TEXT}, }; parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(tree_root, parser.ConsumeScoring()); @@ -899,13 +936,14 @@ TEST(ParserTest, ComplexScoring) { } TEST(ParserTest, ScoringMemberFunction) { - // Scoring: this.CreationTimestamp() + std::string_view scoring_exp = "this.CreationTimestamp()"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"this", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DOT}, - {"CreationTimestamp", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::RPAREN}}; + {"this", scoring_exp.substr(0, 4), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(4, 1), Lexer::TokenType::DOT}, + {"CreationTimestamp", scoring_exp.substr(5, 17), + Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(22, 1), Lexer::TokenType::LPAREN}, + {"", scoring_exp.substr(23, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -927,13 +965,13 @@ TEST(ParserTest, ScoringMemberFunction) { } TEST(ParserTest, QueryMemberFunction) { - // Query: this.foo() + std::string_view query = "this.foo()"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"this", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DOT}, - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::RPAREN}}; + {"this", query.substr(0, 4), Lexer::TokenType::TEXT}, + {"", query.substr(4, 1), Lexer::TokenType::DOT}, + {"foo", query.substr(5, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(8, 1), Lexer::TokenType::LPAREN}, + {"", query.substr(9, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ 
-954,18 +992,18 @@ TEST(ParserTest, QueryMemberFunction) { } TEST(ParserTest, ScoringComplexMemberFunction) { - // Scoring: a.b.fun(c, d) + std::string_view scoring_exp = "a.b.fun(c, d)"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"a", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DOT}, - {"b", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DOT}, - {"fun", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"c", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::COMMA}, - {"d", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::RPAREN}}; + {"a", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(1, 1), Lexer::TokenType::DOT}, + {"b", scoring_exp.substr(2, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(3, 1), Lexer::TokenType::DOT}, + {"fun", scoring_exp.substr(4, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(7, 1), Lexer::TokenType::LPAREN}, + {"c", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(9, 1), Lexer::TokenType::COMMA}, + {"d", scoring_exp.substr(11, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(12, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -993,13 +1031,18 @@ TEST(ParserTest, ScoringComplexMemberFunction) { } TEST(ParserTest, QueryComplexMemberFunction) { - // Query: this.abc.fun(def, ghi) + std::string_view query = "this.abc.fun(def, ghi)"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"this", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DOT}, - {"abc", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DOT}, - {"fun", Lexer::TokenType::FUNCTION_NAME}, {"", Lexer::TokenType::LPAREN}, - {"def", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::COMMA}, - {"ghi", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::RPAREN}}; + {"this", query.substr(0, 4), Lexer::TokenType::TEXT}, + {"", query.substr(4, 1), 
Lexer::TokenType::DOT}, + {"abc", query.substr(5, 3), Lexer::TokenType::TEXT}, + {"", query.substr(8, 1), Lexer::TokenType::DOT}, + {"fun", query.substr(9, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(12, 1), Lexer::TokenType::LPAREN}, + {"def", query.substr(13, 3), Lexer::TokenType::TEXT}, + {"", query.substr(16, 1), Lexer::TokenType::COMMA}, + {"ghi", query.substr(17, 3), Lexer::TokenType::TEXT}, + {"", query.substr(20, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -1027,11 +1070,12 @@ TEST(ParserTest, QueryComplexMemberFunction) { } TEST(ParserTest, InvalidScoringToken) { - // Scoring: "1 + NOT 1" - std::vector<Lexer::LexerToken> lexer_tokens = {{"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, - {"", Lexer::TokenType::NOT}, - {"1", Lexer::TokenType::TEXT}}; + std::string_view scoring_exp = "1 + NOT 1"; + std::vector<Lexer::LexerToken> lexer_tokens = { + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS}, + {"", scoring_exp.substr(4, 3), Lexer::TokenType::NOT}, + {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeScoring(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); diff --git a/icing/query/advanced_query_parser/pending-value.cc b/icing/query/advanced_query_parser/pending-value.cc index 903e12f..67bdc3a 100644 --- a/icing/query/advanced_query_parser/pending-value.cc +++ b/icing/query/advanced_query_parser/pending-value.cc @@ -36,7 +36,7 @@ libtextclassifier3::Status PendingValue::ParseInt() { "Unable to parse \"", query_term_.term, "\" as number.")); } data_type_ = DataType::kLong; - query_term_ = {"", false}; + query_term_ = {/*term=*/"", /*raw_term=*/"", /*is_prefix_val=*/false}; return libtextclassifier3::Status::OK; } diff --git 
a/icing/query/advanced_query_parser/pending-value.h b/icing/query/advanced_query_parser/pending-value.h index d18789d..1a6717e 100644 --- a/icing/query/advanced_query_parser/pending-value.h +++ b/icing/query/advanced_query_parser/pending-value.h @@ -38,17 +38,18 @@ enum class DataType { struct QueryTerm { std::string term; + std::string_view raw_term; bool is_prefix_val; }; // A holder for intermediate results when processing child nodes. struct PendingValue { - static PendingValue CreateStringPendingValue(QueryTerm query_term) { - return PendingValue(std::move(query_term), DataType::kString); + static PendingValue CreateStringPendingValue(QueryTerm str) { + return PendingValue(std::move(str), DataType::kString); } - static PendingValue CreateTextPendingValue(QueryTerm query_term) { - return PendingValue(std::move(query_term), DataType::kText); + static PendingValue CreateTextPendingValue(QueryTerm text) { + return PendingValue(std::move(text), DataType::kText); } PendingValue() : data_type_(DataType::kNone) {} @@ -125,7 +126,7 @@ struct PendingValue { private: explicit PendingValue(QueryTerm query_term, DataType data_type) - : query_term_({std::move(query_term)}), data_type_(data_type) {} + : query_term_(std::move(query_term)), data_type_(data_type) {} libtextclassifier3::Status CheckDataType(DataType required_data_type) const { if (data_type_ == required_data_type) { @@ -141,7 +142,7 @@ struct PendingValue { // iterator_ will be populated when data_type_ is kDocumentIterator. std::unique_ptr<DocHitInfoIterator> iterator_; - // string_vals_ will be populated when data_type_ is kStringList. + // string_vals_ will be populated when data_type_ kStringList. 
std::vector<std::string> string_vals_; // query_term_ will be populated when data_type_ is kString or kText diff --git a/icing/query/advanced_query_parser/query-visitor.cc b/icing/query/advanced_query_parser/query-visitor.cc index a1a9c38..31223a5 100644 --- a/icing/query/advanced_query_parser/query-visitor.cc +++ b/icing/query/advanced_query_parser/query-visitor.cc @@ -37,9 +37,12 @@ #include "icing/query/advanced_query_parser/lexer.h" #include "icing/query/advanced_query_parser/param.h" #include "icing/query/advanced_query_parser/parser.h" +#include "icing/query/advanced_query_parser/pending-value.h" +#include "icing/query/advanced_query_parser/util/string-util.h" #include "icing/query/query-features.h" #include "icing/schema/property-util.h" #include "icing/schema/section.h" +#include "icing/tokenization/token.h" #include "icing/tokenization/tokenizer.h" #include "icing/util/status-macros.h" @@ -54,32 +57,13 @@ struct CreateList { std::vector<std::string> values; values.reserve(args.size()); for (PendingValue& arg : args) { - QueryTerm val = std::move(arg).string_val().ValueOrDie(); - values.push_back(std::move(val.term)); + QueryTerm string_val = std::move(arg).string_val().ValueOrDie(); + values.push_back(std::move(string_val.term)); } return PendingValue(std::move(values)); } }; -libtextclassifier3::StatusOr<std::string> UnescapeStringValue( - std::string_view value) { - std::string result; - bool in_escape = false; - for (char c : value) { - if (in_escape) { - in_escape = false; - } else if (c == '\\') { - in_escape = true; - continue; - } else if (c == '"') { - return absl_ports::InvalidArgumentError( - "Encountered an unescaped quotation mark!"); - } - result += c; - } - return result; -} - bool IsNumericComparator(std::string_view operator_text) { if (operator_text.length() < 1 || operator_text.length() > 2) { return false; @@ -168,8 +152,14 @@ void QueryVisitor::PendingPropertyRestricts::AddValidRestricts( } 
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> -QueryVisitor::CreateTermIterator(QueryTerm query_term) { +QueryVisitor::CreateTermIterator(const QueryTerm& query_term) { + if (query_term.is_prefix_val) { + // '*' prefix operator was added in list filters + features_.insert(kListFilterQueryLanguageFeature); + } TermMatchType::Code match_type = GetTermMatchType(query_term.is_prefix_val); + int unnormalized_term_start = + query_term.raw_term.data() - raw_query_text_.data(); if (!processing_not_) { // 1. Add term to property_query_terms_map if (pending_property_restricts_.has_active_property_restricts()) { @@ -183,13 +173,11 @@ QueryVisitor::CreateTermIterator(QueryTerm query_term) { // 2. If needed add term iterator to query_term_iterators_ map. if (needs_term_frequency_info_) { - // TODO(b/152934343) Save "term start index" into Node and PendingValue and - // pass it into index.GetIterator ICING_ASSIGN_OR_RETURN( std::unique_ptr<DocHitInfoIterator> term_iterator, - index_.GetIterator(query_term.term, /*term_start_index=*/0, - /*unnormalized_term_length=*/0, kSectionIdMaskAll, - match_type, needs_term_frequency_info_)); + index_.GetIterator(query_term.term, unnormalized_term_start, + query_term.raw_term.length(), kSectionIdMaskAll, + match_type_, needs_term_frequency_info_)); query_term_iterators_[query_term.term] = std::make_unique<DocHitInfoIteratorFilter>( std::move(term_iterator), &document_store_, &schema_store_, @@ -198,10 +186,8 @@ QueryVisitor::CreateTermIterator(QueryTerm query_term) { } // 3. Add the term iterator. 
- // TODO(b/152934343) Save "term start index" into Node and PendingValue and - // pass it into index.GetIterator - return index_.GetIterator(query_term.term, /*term_start_index=*/0, - /*unnormalized_term_length=*/0, kSectionIdMaskAll, + return index_.GetIterator(query_term.term, unnormalized_term_start, + query_term.raw_term.length(), kSectionIdMaskAll, match_type, needs_term_frequency_info_); } @@ -266,7 +252,7 @@ libtextclassifier3::StatusOr<PendingValue> QueryVisitor::SearchFunction( } else { QueryVisitor query_visitor(&index_, &numeric_index_, &document_store_, &schema_store_, &normalizer_, &tokenizer_, - filter_options_, match_type_, + query->raw_term, filter_options_, match_type_, needs_term_frequency_info_, pending_property_restricts_, processing_not_); tree_root->Accept(&query_visitor); @@ -353,24 +339,44 @@ QueryVisitor::PopPendingIterator() { // The tokenizer will produce 1+ tokens out of the text. The prefix operator // only applies to the final token. bool reached_final_token = !token_itr->Advance(); + // raw_text is the portion of text_value.raw_term that hasn't yet been + // matched to any of the tokens that we've processed. escaped_token will + // hold the portion of raw_text that corresponds to the current token that + // is being processed. + std::string_view raw_text = text_value.raw_term; + std::string_view raw_token; while (!reached_final_token) { std::vector<Token> tokens = token_itr->GetTokens(); - reached_final_token = !token_itr->Advance(); + if (tokens.size() > 1) { + // The tokenizer iterator iterates between token groups. In practice, + // the tokenizer used with QueryVisitor (PlainTokenizer) will always + // only produce a single token per token group. + return absl_ports::InvalidArgumentError( + "Encountered unexpected token group with >1 tokens."); + } - // The tokenizer iterator iterates between token groups. 
In practice, the - // tokenizer used with QueryVisitor (PlainTokenizer) will always only - // produce a single token per token group. - // For simplicity, we will apply the prefix operator to *all* tokens - // in the final token group. - for (const Token& token : tokens) { - normalized_term = normalizer_.NormalizeTerm(token.text); - ICING_ASSIGN_OR_RETURN( - std::unique_ptr<DocHitInfoIterator> iterator, - CreateTermIterator( - QueryTerm{std::move(normalized_term), - reached_final_token && text_value.is_prefix_val})); - iterators.push_back(std::move(iterator)); + reached_final_token = !token_itr->Advance(); + const Token& token = tokens.at(0); + if (reached_final_token && token.text.length() == raw_text.length()) { + // Unescaped tokens are strictly smaller than their escaped counterparts + // This means that if we're at the final token and token.length equals + // raw_text, then all of raw_text must correspond to this token. + raw_token = raw_text; + } else { + ICING_ASSIGN_OR_RETURN(raw_token, string_util::FindEscapedToken( + raw_text, token.text)); } + normalized_term = normalizer_.NormalizeTerm(token.text); + QueryTerm term_value{std::move(normalized_term), raw_token, + reached_final_token && text_value.is_prefix_val}; + ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator, + CreateTermIterator(std::move(term_value))); + iterators.push_back(std::move(iterator)); + + // Remove escaped_token from raw_text now that we've processed + // raw_text. + const char* escaped_token_end = raw_token.data() + raw_token.length(); + raw_text = raw_text.substr(escaped_token_end - raw_text.data()); } // Finally, create an And Iterator. 
If there's only a single term here, then @@ -452,7 +458,8 @@ libtextclassifier3::Status QueryVisitor::ProcessNumericComparator( GetInt64Range(node->operator_text(), int_value)); ICING_ASSIGN_OR_RETURN( std::unique_ptr<DocHitInfoIterator> iterator, - numeric_index_.GetIterator(text_value.term, range.low, range.high)); + numeric_index_.GetIterator(text_value.term, range.low, range.high, + document_store_, schema_store_)); features_.insert(kNumericSearchFeature); pending_values_.push(PendingValue(std::move(iterator))); @@ -618,22 +625,23 @@ void QueryVisitor::VisitFunctionName(const FunctionNameNode* node) { void QueryVisitor::VisitString(const StringNode* node) { // A STRING node can only be a term. Create the iterator now. - auto unescaped_string_or = UnescapeStringValue(node->value()); + auto unescaped_string_or = string_util::UnescapeStringValue(node->value()); if (!unescaped_string_or.ok()) { pending_error_ = std::move(unescaped_string_or).status(); return; } std::string unescaped_string = std::move(unescaped_string_or).ValueOrDie(); - pending_values_.push(PendingValue::CreateStringPendingValue( - QueryTerm{std::move(unescaped_string), node->is_prefix()})); + QueryTerm val{std::move(unescaped_string), node->raw_value(), + node->is_prefix()}; + pending_values_.push(PendingValue::CreateStringPendingValue(std::move(val))); } void QueryVisitor::VisitText(const TextNode* node) { // TEXT nodes could either be a term (and will become DocHitInfoIteratorTerm) // or a property name. As such, we just push the TEXT value into pending // values and determine which it is at a later point. 
- pending_values_.push(PendingValue::CreateTextPendingValue( - QueryTerm{std::move(node->value()), node->is_prefix()})); + QueryTerm val{std::move(node->value()), node->raw_value(), node->is_prefix()}; + pending_values_.push(PendingValue::CreateTextPendingValue(std::move(val))); } void QueryVisitor::VisitMember(const MemberNode* node) { @@ -668,6 +676,8 @@ void QueryVisitor::VisitMember(const MemberNode* node) { libtextclassifier3::StatusOr<QueryTerm> member_or; std::vector<std::string> members; QueryTerm text_val; + const char* start = nullptr; + const char* end = nullptr; while (!pending_values_.empty() && !pending_values_.top().is_placeholder()) { member_or = PopPendingTextValue(); @@ -681,11 +691,19 @@ void QueryVisitor::VisitMember(const MemberNode* node) { "Cannot use prefix operator '*' within a property name!"); return; } + if (start == nullptr) { + start = text_val.raw_term.data(); + end = text_val.raw_term.data() + text_val.raw_term.length(); + } else { + start = std::min(start, text_val.raw_term.data()); + end = std::max(end, text_val.raw_term.data() + text_val.raw_term.length()); + } members.push_back(std::move(text_val.term)); } QueryTerm member; member.term = absl_ports::StrJoin(members.rbegin(), members.rend(), property_util::kPropertyPathSeparator); + member.raw_term = std::string_view(start, end - start); member.is_prefix_val = false; pending_value = PendingValue::CreateTextPendingValue(std::move(member)); } @@ -739,6 +757,9 @@ void QueryVisitor::VisitFunction(const FunctionNode* node) { // 5. Pop placeholder in pending_values and add the result of our function. pending_values_.pop(); pending_values_.push(std::move(eval_result).ValueOrDie()); + + // Support for custom functions was added in list filters. + features_.insert(kListFilterQueryLanguageFeature); } // TODO(b/265312785) Clarify handling of the interaction between HAS and NOT. 
@@ -771,6 +792,15 @@ void QueryVisitor::VisitUnaryOperator(const UnaryOperatorNode* node) { if (!status.ok()) { pending_error_ = std::move(status); } + + if (!is_minus || + pending_property_restricts_.has_active_property_restricts() || + processing_not_) { + // 'NOT' operator was added in list filters. + // Likewise, mixing property restricts and NOTs were made valid in list + // filters. + features_.insert(kListFilterQueryLanguageFeature); + } } void QueryVisitor::VisitNaryOperator(const NaryOperatorNode* node) { @@ -780,6 +810,13 @@ void QueryVisitor::VisitNaryOperator(const NaryOperatorNode* node) { return; } + if (pending_property_restricts_.has_active_property_restricts() || + processing_not_) { + // Likewise, mixing property restricts and NOT with compound statements was + // added in list filters. + features_.insert(kListFilterQueryLanguageFeature); + } + if (node->operator_text() == ":") { libtextclassifier3::Status status = ProcessHasOperator(node); if (!status.ok()) { diff --git a/icing/query/advanced_query_parser/query-visitor.h b/icing/query/advanced_query_parser/query-visitor.h index 7498457..9fcaec0 100644 --- a/icing/query/advanced_query_parser/query-visitor.h +++ b/icing/query/advanced_query_parser/query-visitor.h @@ -49,12 +49,12 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { Index* index, const NumericIndex<int64_t>* numeric_index, const DocumentStore* document_store, const SchemaStore* schema_store, const Normalizer* normalizer, const Tokenizer* tokenizer, + std::string_view raw_query_text, DocHitInfoIteratorFilter::Options filter_options, TermMatchType::Code match_type, bool needs_term_frequency_info) : QueryVisitor(index, numeric_index, document_store, schema_store, - normalizer, tokenizer, filter_options, match_type, - needs_term_frequency_info, - + normalizer, tokenizer, raw_query_text, filter_options, + match_type, needs_term_frequency_info, PendingPropertyRestricts(), /*processing_not=*/false) {} @@ -105,9 +105,9 @@ class 
QueryVisitor : public AbstractSyntaxTreeVisitor { Index* index, const NumericIndex<int64_t>* numeric_index, const DocumentStore* document_store, const SchemaStore* schema_store, const Normalizer* normalizer, const Tokenizer* tokenizer, + std::string_view raw_query_text, DocHitInfoIteratorFilter::Options filter_options, TermMatchType::Code match_type, bool needs_term_frequency_info, - PendingPropertyRestricts pending_property_restricts, bool processing_not) : index_(*index), numeric_index_(*numeric_index), @@ -115,6 +115,7 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { schema_store_(*schema_store), normalizer_(*normalizer), tokenizer_(*tokenizer), + raw_query_text_(raw_query_text), filter_options_(std::move(filter_options)), match_type_(match_type), needs_term_frequency_info_(needs_term_frequency_info), @@ -133,7 +134,7 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { // - On success, a DocHitInfoIterator for the provided term // - INVALID_ARGUMENT if unable to create an iterator for the term. libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> - CreateTermIterator(QueryTerm term); + CreateTermIterator(const QueryTerm& term); // Processes the PendingValue at the top of pending_values_, parses it into a // int64_t and pops the top. @@ -279,6 +280,7 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { const Normalizer& normalizer_; // Does not own! const Tokenizer& tokenizer_; // Does not own! + std::string_view raw_query_text_; DocHitInfoIteratorFilter::Options filter_options_; TermMatchType::Code match_type_; // Whether or not term_frequency information is needed. 
This affects: diff --git a/icing/query/advanced_query_parser/query-visitor_test.cc b/icing/query/advanced_query_parser/query-visitor_test.cc index 033e86b..b560d52 100644 --- a/icing/query/advanced_query_parser/query-visitor_test.cc +++ b/icing/query/advanced_query_parser/query-visitor_test.cc @@ -17,6 +17,7 @@ #include <cstdint> #include <limits> #include <memory> +#include <string_view> #include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" @@ -225,14 +226,20 @@ TEST_P(QueryVisitorTest, SimpleLessThan) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kNumericSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature)); + } // "price" is a property restrict here and "2" isn't a "term" - its a numeric // value. So QueryTermIterators should be empty. 
EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); @@ -262,14 +269,20 @@ TEST_P(QueryVisitorTest, SimpleLessThanEq) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kNumericSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature)); + } // "price" is a property restrict here and "1" isn't a "term" - its a numeric // value. So QueryTermIterators should be empty. 
EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); @@ -299,14 +312,20 @@ TEST_P(QueryVisitorTest, SimpleEqual) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kNumericSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature)); + } // "price" is a property restrict here and "2" isn't a "term" - its a numeric // value. So QueryTermIterators should be empty. 
EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); @@ -336,14 +355,20 @@ TEST_P(QueryVisitorTest, SimpleGreaterThanEq) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kNumericSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature)); + } // "price" is a property restrict here and "1" isn't a "term" - its a numeric // value. So QueryTermIterators should be empty. 
EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); @@ -373,14 +398,20 @@ TEST_P(QueryVisitorTest, SimpleGreaterThan) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kNumericSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature)); + } // "price" is a property restrict here and "1" isn't a "term" - its a numeric // value. So QueryTermIterators should be empty. 
EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); @@ -411,14 +442,20 @@ TEST_P(QueryVisitorTest, IntMinLessThanEqual) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kNumericSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature)); + } // "price" is a property restrict here and int_min isn't a "term" - its a // numeric value. So QueryTermIterators should be empty. 
EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); @@ -449,14 +486,20 @@ TEST_P(QueryVisitorTest, IntMaxGreaterThanEqual) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kNumericSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature)); + } // "price" is a property restrict here and int_max isn't a "term" - its a // numeric value. So QueryTermIterators should be empty. 
EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); @@ -488,14 +531,20 @@ TEST_P(QueryVisitorTest, NestedPropertyLessThan) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kNumericSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature)); + } // "subscription.price" is a property restrict here and int_max isn't a "term" // - its a numeric value. So QueryTermIterators should be empty. 
EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); @@ -510,7 +559,7 @@ TEST_P(QueryVisitorTest, IntParsingError) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -524,7 +573,7 @@ TEST_P(QueryVisitorTest, NotEqualsUnsupported) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -552,16 +601,19 @@ TEST_P(QueryVisitorTest, LessThanTooManyOperandsInvalid) { // Create an invalid AST for the query '3 < subscription.price 25' where '<' // has three operands - auto property_node = std::make_unique<TextNode>("subscription"); - auto subproperty_node = std::make_unique<TextNode>("price"); + std::string_view query = "3 < subscription.price 25"; + auto property_node = + std::make_unique<TextNode>("subscription", query.substr(4, 12)); + auto subproperty_node = + std::make_unique<TextNode>("price", query.substr(17, 5)); std::vector<std::unique_ptr<TextNode>> member_args; member_args.push_back(std::move(property_node)); member_args.push_back(std::move(subproperty_node)); auto member_node = std::make_unique<MemberNode>(std::move(member_args), /*function=*/nullptr); - auto value_node = std::make_unique<TextNode>("3"); - auto extra_value_node = std::make_unique<TextNode>("25"); + auto value_node = std::make_unique<TextNode>("3", query.substr(0, 1)); + auto extra_value_node = std::make_unique<TextNode>("25", query.substr(23, 
2)); std::vector<std::unique_ptr<Node>> args; args.push_back(std::move(value_node)); args.push_back(std::move(member_node)); @@ -569,7 +621,7 @@ TEST_P(QueryVisitorTest, LessThanTooManyOperandsInvalid) { auto root_node = std::make_unique<NaryOperatorNode>("<", std::move(args)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -580,8 +632,11 @@ TEST_P(QueryVisitorTest, LessThanTooManyOperandsInvalid) { TEST_P(QueryVisitorTest, LessThanTooFewOperandsInvalid) { // Create an invalid AST for the query 'subscription.price <' where '<' // has a single operand - auto property_node = std::make_unique<TextNode>("subscription"); - auto subproperty_node = std::make_unique<TextNode>("price"); + std::string_view query = "subscription.price <"; + auto property_node = + std::make_unique<TextNode>("subscription", query.substr(0, 12)); + auto subproperty_node = + std::make_unique<TextNode>("price", query.substr(13, 5)); std::vector<std::unique_ptr<TextNode>> member_args; member_args.push_back(std::move(property_node)); member_args.push_back(std::move(subproperty_node)); @@ -593,7 +648,7 @@ TEST_P(QueryVisitorTest, LessThanTooFewOperandsInvalid) { auto root_node = std::make_unique<NaryOperatorNode>("<", std::move(args)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -624,14 +679,20 @@ TEST_P(QueryVisitorTest, LessThanNonExistentPropertyNotFound) { ParseQueryHelper(query)); QueryVisitor 
query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kNumericSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature)); + } EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); EXPECT_THAT(query_results.query_terms, IsEmpty()); EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty()); @@ -640,7 +701,7 @@ TEST_P(QueryVisitorTest, LessThanNonExistentPropertyNotFound) { TEST_P(QueryVisitorTest, NeverVisitedReturnsInvalid) { QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), "", DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); EXPECT_THAT(std::move(query_visitor).ConsumeResults(), @@ -669,7 +730,7 @@ TEST_P(QueryVisitorTest, IntMinLessThanInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -699,7 +760,7 @@ TEST_P(QueryVisitorTest, IntMaxGreaterThanInvalid) 
{ ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -714,7 +775,7 @@ TEST_P(QueryVisitorTest, NumericComparisonPropertyStringIsInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -776,14 +837,20 @@ TEST_P(QueryVisitorTest, NumericComparatorDoesntAffectLaterTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kNumericSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kNumericSearchFeature)); + } EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); EXPECT_THAT(query_results.query_terms, IsEmpty()); EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), @@ -813,7 +880,7 @@ TEST_P(QueryVisitorTest, 
SingleTermTermFrequencyEnabled) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -865,7 +932,7 @@ TEST_P(QueryVisitorTest, SingleTermTermFrequencyDisabled) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/false); root_node->Accept(&query_visitor); @@ -917,7 +984,7 @@ TEST_P(QueryVisitorTest, SingleTermPrefix) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -933,7 +1000,7 @@ TEST_P(QueryVisitorTest, SingleTermPrefix) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -953,7 +1020,7 @@ TEST_P(QueryVisitorTest, PrefixOperatorAfterPropertyReturnsInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), 
normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -967,7 +1034,7 @@ TEST_P(QueryVisitorTest, PrefixOperatorAfterNumericValueReturnsInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -981,7 +1048,7 @@ TEST_P(QueryVisitorTest, PrefixOperatorAfterPropertyRestrictReturnsInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1019,7 +1086,7 @@ TEST_P(QueryVisitorTest, SegmentationWithPrefix) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1042,7 +1109,7 @@ TEST_P(QueryVisitorTest, SegmentationWithPrefix) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), 
TERM_MATCH_EXACT, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -1079,14 +1146,20 @@ TEST_P(QueryVisitorTest, SingleVerbatimTerm) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kVerbatimSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature)); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo:bar(baz)")); @@ -1120,14 +1193,15 @@ TEST_P(QueryVisitorTest, SingleVerbatimTermPrefix) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); EXPECT_THAT(query_results.features_in_use, - ElementsAre(kVerbatimSearchFeature)); + UnorderedElementsAre(kVerbatimSearchFeature, + kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], 
UnorderedElementsAre("foo:bar(")); EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), @@ -1172,14 +1246,20 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingQuote) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kVerbatimSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature)); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre(R"(foobar")")); @@ -1218,14 +1298,20 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingEscape) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kVerbatimSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + 
EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature)); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre(R"(foobar\)")); @@ -1266,14 +1352,20 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingNonSpecialChar) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kVerbatimSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature)); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre(R"(foobary)")); @@ -1287,14 +1379,20 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingNonSpecialChar) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_two).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - 
ElementsAre(kVerbatimSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature)); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre(R"(foobar\y)")); @@ -1336,14 +1434,20 @@ TEST_P(QueryVisitorTest, VerbatimTermNewLine) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kVerbatimSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature)); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foobar\n")); EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), @@ -1356,14 +1460,20 @@ TEST_P(QueryVisitorTest, VerbatimTermNewLine) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), 
TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_two).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kVerbatimSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature)); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre(R"(foobar\n)")); @@ -1399,14 +1509,20 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingComplex) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, - ElementsAre(kVerbatimSearchFeature)); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature, + kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kVerbatimSearchFeature)); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre(R"(foo\"bar\nbaz")")); @@ -1450,7 +1566,7 @@ TEST_P(QueryVisitorTest, SingleMinusTerm) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), 
normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1458,7 +1574,12 @@ TEST_P(QueryVisitorTest, SingleMinusTerm) { std::move(query_visitor).ConsumeResults()); EXPECT_THAT(ExtractKeys(query_results.query_terms), IsEmpty()); EXPECT_THAT(query_results.query_term_iterators, IsEmpty()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, IsEmpty()); + } EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), ElementsAre(kDocumentId2)); } @@ -1497,14 +1618,15 @@ TEST_P(QueryVisitorTest, SingleNotTerm) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); EXPECT_THAT(query_results.query_terms, IsEmpty()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(query_results.query_term_iterators, IsEmpty()); EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), ElementsAre(kDocumentId2)); @@ -1549,13 +1671,14 @@ TEST_P(QueryVisitorTest, NestedNotTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), 
normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo", "baz")); @@ -1615,13 +1738,14 @@ TEST_P(QueryVisitorTest, DeeplyNestedNotTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("bar", "baz", "bat")); @@ -1653,13 +1777,18 @@ TEST_P(QueryVisitorTest, ImplicitAndTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - 
EXPECT_THAT(query_results.features_in_use, IsEmpty()); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, IsEmpty()); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo", "bar")); @@ -1691,13 +1820,18 @@ TEST_P(QueryVisitorTest, ExplicitAndTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, IsEmpty()); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo", "bar")); @@ -1729,13 +1863,18 @@ TEST_P(QueryVisitorTest, OrTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + if 
(GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, IsEmpty()); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo", "bar")); @@ -1769,13 +1908,18 @@ TEST_P(QueryVisitorTest, AndOrTermPrecedence) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, IsEmpty()); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo", "bar", "baz")); @@ -1789,13 +1933,18 @@ TEST_P(QueryVisitorTest, AndOrTermPrecedence) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_two).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + if 
(GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, IsEmpty()); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo", "bar", "baz")); @@ -1808,13 +1957,18 @@ TEST_P(QueryVisitorTest, AndOrTermPrecedence) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_three( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_three); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_three).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, IsEmpty()); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo", "bar", "baz")); @@ -1863,13 +2017,14 @@ TEST_P(QueryVisitorTest, AndOrNotPrecedence) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + 
EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo", "baz")); @@ -1882,13 +2037,14 @@ TEST_P(QueryVisitorTest, AndOrNotPrecedence) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_two).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo")); EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), @@ -1943,7 +2099,7 @@ TEST_P(QueryVisitorTest, PropertyFilter) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1954,7 +2110,12 @@ TEST_P(QueryVisitorTest, PropertyFilter) { EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("foo")); EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), UnorderedElementsAre("foo")); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + if (GetParam() == QueryType::kSearch) { + 
EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, IsEmpty()); + } EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), ElementsAre(kDocumentId1, kDocumentId0)); } @@ -2011,7 +2172,7 @@ TEST_F(QueryVisitorTest, MultiPropertyFilter) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2023,7 +2184,8 @@ TEST_F(QueryVisitorTest, MultiPropertyFilter) { EXPECT_THAT(query_results.query_terms["prop2"], UnorderedElementsAre("foo")); EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), UnorderedElementsAre("foo")); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), ElementsAre(kDocumentId1, kDocumentId0)); } @@ -2051,7 +2213,7 @@ TEST_P(QueryVisitorTest, PropertyFilterStringIsInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2105,7 +2267,7 @@ TEST_P(QueryVisitorTest, PropertyFilterNonNormalized) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, 
DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2116,7 +2278,12 @@ TEST_P(QueryVisitorTest, PropertyFilterNonNormalized) { EXPECT_THAT(query_results.query_terms["PROP1"], UnorderedElementsAre("foo")); EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), UnorderedElementsAre("foo")); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, IsEmpty()); + } EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), ElementsAre(kDocumentId1, kDocumentId0)); } @@ -2168,13 +2335,14 @@ TEST_P(QueryVisitorTest, PropertyFilterWithGrouping) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop1")); EXPECT_THAT(query_results.query_terms["prop1"], @@ -2231,13 +2399,14 @@ TEST_P(QueryVisitorTest, ValidNestedPropertyFilter) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, 
/*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop1")); EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("foo")); @@ -2251,13 +2420,14 @@ TEST_P(QueryVisitorTest, ValidNestedPropertyFilter) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_two).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop1")); EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("foo")); @@ -2313,13 +2483,14 @@ TEST_P(QueryVisitorTest, InvalidNestedPropertyFilter) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, 
IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), IsEmpty()); EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty()); @@ -2333,13 +2504,14 @@ TEST_P(QueryVisitorTest, InvalidNestedPropertyFilter) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_two).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), IsEmpty()); EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty()); EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty()); @@ -2395,13 +2567,14 @@ TEST_P(QueryVisitorTest, NotWithPropertyFilter) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); 
EXPECT_THAT(ExtractKeys(query_results.query_terms), IsEmpty()); EXPECT_THAT(query_results.query_term_iterators, IsEmpty()); EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), @@ -2415,13 +2588,14 @@ TEST_P(QueryVisitorTest, NotWithPropertyFilter) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_two).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), IsEmpty()); EXPECT_THAT(query_results.query_term_iterators, IsEmpty()); EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), @@ -2478,14 +2652,15 @@ TEST_P(QueryVisitorTest, PropertyFilterWithNot) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop1")); EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("bar")); @@ -2501,13 +2676,14 
@@ TEST_P(QueryVisitorTest, PropertyFilterWithNot) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_two).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop1")); EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("bar")); @@ -2579,13 +2755,18 @@ TEST_P(QueryVisitorTest, SegmentationTest) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, IsEmpty()); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("")); if (IsCfStringTokenization()) { EXPECT_THAT(query_results.query_terms[""], @@ -2692,13 +2873,18 @@ TEST_P(QueryVisitorTest, PropertyRestrictsPopCorrectly) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), 
numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + if (GetParam() == QueryType::kSearch) { + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); + } else { + EXPECT_THAT(query_results.features_in_use, IsEmpty()); + } EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("", "prop1")); EXPECT_THAT(query_results.query_terms[""], @@ -2802,13 +2988,14 @@ TEST_P(QueryVisitorTest, UnsatisfiablePropertyRestrictsPopCorrectly) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop0", "prop2")); EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("val0")); @@ -2825,7 +3012,7 @@ TEST_F(QueryVisitorTest, UnsupportedFunctionReturnsInvalidArgument) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), 
normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2839,7 +3026,7 @@ TEST_F(QueryVisitorTest, SearchFunctionTooFewArgumentsReturnsInvalidArgument) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2853,7 +3040,7 @@ TEST_F(QueryVisitorTest, SearchFunctionTooManyArgumentsReturnsInvalidArgument) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2869,7 +3056,7 @@ TEST_F(QueryVisitorTest, ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2881,7 +3068,7 @@ TEST_F(QueryVisitorTest, ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ 
-2897,7 +3084,7 @@ TEST_F(QueryVisitorTest, ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2909,7 +3096,7 @@ TEST_F(QueryVisitorTest, ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -2924,7 +3111,7 @@ TEST_F(QueryVisitorTest, ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2984,14 +3171,15 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedFunctionCalls) { ParseQueryHelper(level_two_query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_two_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + 
UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop1")); EXPECT_THAT(query_results.query_terms["prop1"], @@ -3008,13 +3196,14 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedFunctionCalls) { QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), schema_store_.get(), normalizer_.get(), tokenizer_.get(), - DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + level_three_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_two).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop1")); EXPECT_THAT(query_results.query_terms["prop1"], @@ -3031,13 +3220,14 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedFunctionCalls) { QueryVisitor query_visitor_three( index_.get(), numeric_index_.get(), document_store_.get(), schema_store_.get(), normalizer_.get(), tokenizer_.get(), - DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + level_four_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_three); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_three).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop1")); EXPECT_THAT(query_results.query_terms["prop1"], @@ -3148,14 +3338,15 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsNarrowing) { ParseQueryHelper(level_one_query)); QueryVisitor 
query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_one_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop0", "prop1", "prop2", "prop3", "prop4", "prop5", "prop6", "prop7")); @@ -3179,14 +3370,15 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsNarrowing) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(level_two_query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_two_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_two).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop0", "prop2", "prop4", "prop6")); EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("foo")); @@ -3205,13 +3397,14 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsNarrowing) { QueryVisitor query_visitor_three( index_.get(), numeric_index_.get(), document_store_.get(), schema_store_.get(), normalizer_.get(), tokenizer_.get(), - 
DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + level_three_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_three); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_three).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop0", "prop6")); EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("foo")); @@ -3224,7 +3417,7 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsNarrowing) { // This test will nest `search` calls together with the set of restricts // narrowing at each level so that the set of docs matching the query shrinks. -TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsExpandinging) { +TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsExpanding) { PropertyConfigProto prop = PropertyConfigBuilder() .SetName("prop0") @@ -3322,14 +3515,15 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsExpandinging) { ParseQueryHelper(level_one_query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_one_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results, std::move(query_visitor).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop0", "prop6")); EXPECT_THAT(query_results.query_terms["prop0"], 
UnorderedElementsAre("foo")); @@ -3345,14 +3539,15 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsExpandinging) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(level_two_query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_two_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_two).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop0", "prop6")); EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("foo")); @@ -3370,13 +3565,14 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsExpandinging) { QueryVisitor query_visitor_three( index_.get(), numeric_index_.get(), document_store_.get(), schema_store_.get(), normalizer_.get(), tokenizer_.get(), - DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + level_three_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_three); ICING_ASSERT_OK_AND_ASSIGN(query_results, std::move(query_visitor_three).ConsumeResults()); - EXPECT_THAT(query_results.features_in_use, IsEmpty()); + EXPECT_THAT(query_results.features_in_use, + UnorderedElementsAre(kListFilterQueryLanguageFeature)); EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre("prop0", "prop6")); EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("foo")); diff --git a/icing/query/advanced_query_parser/util/string-util.cc 
b/icing/query/advanced_query_parser/util/string-util.cc new file mode 100644 index 0000000..9af2ed6 --- /dev/null +++ b/icing/query/advanced_query_parser/util/string-util.cc @@ -0,0 +1,106 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/query/advanced_query_parser/util/string-util.h" + +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" + +namespace icing { +namespace lib { + +namespace string_util { + +libtextclassifier3::StatusOr<std::string> UnescapeStringValue( + std::string_view value) { + std::string result; + bool in_escape = false; + for (char c : value) { + if (in_escape) { + in_escape = false; + } else if (c == '\\') { + in_escape = true; + continue; + } else if (c == '"') { + return absl_ports::InvalidArgumentError( + "Encountered an unescaped quotation mark!"); + } + result += c; + } + return result; +} + +libtextclassifier3::StatusOr<std::string_view> FindEscapedToken( + std::string_view escaped_string, std::string_view unescaped_token) { + if (unescaped_token.empty()) { + return absl_ports::InvalidArgumentError( + "Cannot find escaped token in empty unescaped token."); + } + + // Find the start of unescaped_token within the escaped_string + const char* esc_string_end = escaped_string.data() + escaped_string.length(); + size_t pos = escaped_string.find(unescaped_token[0]); + const char* esc_token_start = (pos == std::string_view::npos) + ? 
esc_string_end + : escaped_string.data() + pos; + const char* esc_token_cur = esc_token_start; + const char* possible_next_start = nullptr; + bool is_escaped = false; + int i = 0; + for (; i < unescaped_token.length() && esc_token_cur < esc_string_end; + ++esc_token_cur) { + if (esc_token_cur != esc_token_start && + *esc_token_cur == unescaped_token[0] && + possible_next_start == nullptr) { + possible_next_start = esc_token_cur; + } + + // Every char in unescaped_token should either be an escape or match the + // next char in unescaped_token. + if (!is_escaped && *esc_token_cur == '\\') { + is_escaped = true; + } else if (*esc_token_cur == unescaped_token[i]) { + is_escaped = false; + ++i; + } else { + // No match. If we don't have a possible_next_start, then try to find one. + if (possible_next_start == nullptr) { + pos = escaped_string.find(unescaped_token[0], + esc_token_cur - escaped_string.data()); + if (pos == std::string_view::npos) { + break; + } + esc_token_start = escaped_string.data() + pos; + } else { + esc_token_start = possible_next_start; + possible_next_start = nullptr; + } + // esc_token_start has been reset to a char that equals unescaped_token[0] + // The for loop above will advance esc_token_cur so set i to 1. + i = 1; + esc_token_cur = esc_token_start; + } + } + if (i != unescaped_token.length()) { + return absl_ports::InvalidArgumentError( + absl_ports::StrCat("Couldn't match chars at token=", unescaped_token, + ") and raw_text=", escaped_string)); + } + return std::string_view(esc_token_start, esc_token_cur - esc_token_start); +} + +} // namespace string_util + +} // namespace lib +} // namespace icing
\ No newline at end of file diff --git a/icing/query/advanced_query_parser/util/string-util.h b/icing/query/advanced_query_parser/util/string-util.h new file mode 100644 index 0000000..09fb451 --- /dev/null +++ b/icing/query/advanced_query_parser/util/string-util.h @@ -0,0 +1,49 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER__STRING_UTIL_H_ +#define ICING_QUERY_ADVANCED_QUERY_PARSER__STRING_UTIL_H_ + +#include <string> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" + +namespace icing { +namespace lib { + +namespace string_util { + +// Returns: +// - On success, value with the escapes removed. +// - INVALID_ARGUMENT if an non-escaped quote is encountered. +// Ex. "fo\\\\o" -> "fo\\o" +libtextclassifier3::StatusOr<std::string> UnescapeStringValue( + std::string_view value); + +// Returns: +// - On success, string_view pointing to the segment of escaped_string that, +// if unescaped, would match unescaped_token. +// - INVALID_ARGUMENT +// Ex. 
escaped_string="foo b\\a\\\"r baz", unescaped_token="ba\"r" +// returns "b\\a\\\"r" +libtextclassifier3::StatusOr<std::string_view> FindEscapedToken( + std::string_view escaped_string, std::string_view unescaped_token); + +} // namespace string_util + +} // namespace lib +} // namespace icing + +#endif // ICING_QUERY_ADVANCED_QUERY_PARSER__STRING_UTIL_H_ diff --git a/icing/query/advanced_query_parser/util/string-util_test.cc b/icing/query/advanced_query_parser/util/string-util_test.cc new file mode 100644 index 0000000..a7ccf3e --- /dev/null +++ b/icing/query/advanced_query_parser/util/string-util_test.cc @@ -0,0 +1,125 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/query/advanced_query_parser/util/string-util.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/testing/common-matchers.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::IsEmpty; + +TEST(StringUtilTest, UnescapeStringEmptyString) { + EXPECT_THAT(string_util::UnescapeStringValue(""), IsOkAndHolds(IsEmpty())); +} + +TEST(StringUtilTest, UnescapeStringStringWithNoEscapes) { + EXPECT_THAT(string_util::UnescapeStringValue("foo"), IsOkAndHolds("foo")); + EXPECT_THAT(string_util::UnescapeStringValue("f o o"), IsOkAndHolds("f o o")); + EXPECT_THAT(string_util::UnescapeStringValue("f\to\to"), + IsOkAndHolds("f\to\to")); + EXPECT_THAT(string_util::UnescapeStringValue("f.o.o"), IsOkAndHolds("f.o.o")); +} + +TEST(StringUtilTest, UnescapeStringStringWithEscapes) { + EXPECT_THAT(string_util::UnescapeStringValue("f\\oo"), IsOkAndHolds("foo")); + EXPECT_THAT(string_util::UnescapeStringValue("f\\\\oo"), + IsOkAndHolds("f\\oo")); + EXPECT_THAT(string_util::UnescapeStringValue("f\\\"oo"), + IsOkAndHolds("f\"oo")); + EXPECT_THAT(string_util::UnescapeStringValue("foo\\"), IsOkAndHolds("foo")); + EXPECT_THAT(string_util::UnescapeStringValue("foo b\\a\\\"r baz"), + IsOkAndHolds("foo ba\"r baz")); + EXPECT_THAT(string_util::UnescapeStringValue("bar b\\aar bar\\s bart"), + IsOkAndHolds("bar baar bars bart")); + EXPECT_THAT(string_util::UnescapeStringValue("\\\\\\\\a"), + IsOkAndHolds("\\\\a")); +} + +TEST(StringUtilTest, UnescapeStringQuoteWithoutEscape) { + EXPECT_THAT(string_util::UnescapeStringValue("f\\o\"o"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(string_util::UnescapeStringValue("f\"oo"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST(StringUtilTest, FindEscapedTokenEmptyUnescapedToken) { + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", ""), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + 
+TEST(StringUtilTest, FindEscapedTokenTokenNotPresent) { + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "elephant"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "bat"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "taz"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "bazz"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST(StringUtilTest, FindEscapedTokenMatchInMiddleToken) { + EXPECT_THAT(string_util::FindEscapedToken("babar", "bar"), + IsOkAndHolds("bar")); +} + +TEST(StringUtilTest, FindEscapedTokenMatches) { + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "ba\"r"), + IsOkAndHolds("b\\a\\\"r")); + EXPECT_THAT(string_util::FindEscapedToken("\\\\\\\\a", "\\\\a"), + IsOkAndHolds("\\\\\\\\a")); +} + +TEST(StringUtilTest, FindEscapedTokenTraversesThroughEscapedText) { + std::string_view escaped_text = "bar b\\aar bar\\s bart"; + ICING_ASSERT_OK_AND_ASSIGN( + std::string_view result, + string_util::FindEscapedToken(escaped_text, "bar")); + // escaped_text = "bar b\\aar bar\\s bart"; + // escaped_token ^ ^ + EXPECT_THAT(result, Eq("bar")); + + // escaped_text = "b\\aar bar\\s bart"; + // escaped_token ^ ^ + const char* result_end = result.data() + result.length(); + escaped_text = escaped_text.substr(result_end - escaped_text.data()); + ICING_ASSERT_OK_AND_ASSIGN( + result, string_util::FindEscapedToken(escaped_text, "bar")); + EXPECT_THAT(result, Eq("bar")); + + // escaped_text = "\\s bart"; + // escaped_token ^ ^ + result_end = result.data() + result.length(); + escaped_text = escaped_text.substr(result_end - escaped_text.data()); + ICING_ASSERT_OK_AND_ASSIGN( + result, string_util::FindEscapedToken(escaped_text, "bar")); + EXPECT_THAT(result, Eq("bar")); + + 
result_end = result.data() + result.length(); + escaped_text = escaped_text.substr(result_end - escaped_text.data()); + EXPECT_THAT(string_util::FindEscapedToken(escaped_text, "bar"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +} // namespace + +} // namespace lib +} // namespace icing
\ No newline at end of file diff --git a/icing/query/query-features.h b/icing/query/query-features.h index 1471063..9fafba5 100644 --- a/icing/query/query-features.h +++ b/icing/query/query-features.h @@ -36,8 +36,14 @@ constexpr Feature kNumericSearchFeature = constexpr Feature kVerbatimSearchFeature = "VERBATIM_SEARCH"; // Features#VERBATIM_SEARCH -// TODO(b/208654892): Add this as an enabled feature in the query visitor when -// it gets invoked. +// This feature covers all additions (other than numeric search and verbatim +// search) to the query language to bring it into better alignment with the list +// filters spec. +// This includes: +// - support for function calls +// - expanding support for negation and property restriction expressions +// - prefix operator '*' +// - 'NOT' operator constexpr Feature kListFilterQueryLanguageFeature = "LIST_FILTER_QUERY_LANGUAGE"; // Features#LIST_FILTER_QUERY_LANGUAGE diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc index 9b03a0e..6760fad 100644 --- a/icing/query/query-processor.cc +++ b/icing/query/query-processor.cc @@ -203,8 +203,8 @@ libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseAdvancedQuery( ranking_strategy == ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE; QueryVisitor query_visitor( &index_, &numeric_index_, &document_store_, &schema_store_, &normalizer_, - plain_tokenizer.get(), std::move(options), search_spec.term_match_type(), - needs_term_frequency_info); + plain_tokenizer.get(), search_spec.query(), std::move(options), + search_spec.term_match_type(), needs_term_frequency_info); tree_root->Accept(&query_visitor); return std::move(query_visitor).ConsumeResults(); } diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc index e161099..d4ecec0 100644 --- a/icing/query/suggestion-processor_test.cc +++ b/icing/query/suggestion-processor_test.cc @@ -14,6 +14,9 @@ #include "icing/query/suggestion-processor.h" +#include 
<string> +#include <vector> + #include "gmock/gmock.h" #include "icing/document-builder.h" #include "icing/index/numeric/dummy-numeric-index.h" @@ -36,10 +39,19 @@ namespace lib { namespace { using ::testing::IsEmpty; -using ::testing::SizeIs; using ::testing::Test; using ::testing::UnorderedElementsAre; +std::vector<std::string> RetrieveSuggestionsText( + const std::vector<TermMetadata>& terms) { + std::vector<std::string> suggestions; + suggestions.reserve(terms.size()); + for (const TermMetadata& term : terms) { + suggestions.push_back(term.content); + } + return suggestions; +} + class SuggestionProcessorTest : public Test { protected: SuggestionProcessorTest() @@ -181,8 +193,7 @@ TEST_F(SuggestionProcessorTest, MultipleTermsTest_And) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms.at(0).content, "bar foo"); - EXPECT_THAT(terms, SizeIs(1)); + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("bar foo")); } TEST_F(SuggestionProcessorTest, MultipleTermsTest_AndNary) { @@ -228,8 +239,8 @@ TEST_F(SuggestionProcessorTest, MultipleTermsTest_AndNary) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms.at(0).content, "bar cat foo"); - EXPECT_THAT(terms, SizeIs(1)); + EXPECT_THAT(RetrieveSuggestionsText(terms), + UnorderedElementsAre("bar cat foo")); } TEST_F(SuggestionProcessorTest, MultipleTermsTest_Or) { @@ -277,11 +288,7 @@ TEST_F(SuggestionProcessorTest, MultipleTermsTest_Or) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - std::vector<std::string> suggestions; - for (TermMetadata term : terms) { - suggestions.push_back(term.content); - } - EXPECT_THAT(suggestions, + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("bar OR cat fo", "bar OR cat foo")); } @@ -340,14 +347,11 @@ 
TEST_F(SuggestionProcessorTest, MultipleTermsTest_OrNary) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - std::vector<std::string> suggestions; - for (TermMetadata term : terms) { - suggestions.push_back(term.content); - } // "fo" in document1, "foo" in document2 and "fool" in document3 could match. - EXPECT_THAT(suggestions, UnorderedElementsAre("bar OR cat OR lot fo", - "bar OR cat OR lot foo", - "bar OR cat OR lot fool")); + EXPECT_THAT( + RetrieveSuggestionsText(terms), + UnorderedElementsAre("bar OR cat OR lot fo", "bar OR cat OR lot foo", + "bar OR cat OR lot fool")); } TEST_F(SuggestionProcessorTest, MultipleTermsTest_NormalizedTerm) { @@ -394,22 +398,17 @@ TEST_F(SuggestionProcessorTest, MultipleTermsTest_NormalizedTerm) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - std::vector<std::string> suggestions; - for (TermMetadata term : terms) { - suggestions.push_back(term.content); - } // The term is normalized. - EXPECT_THAT(suggestions, UnorderedElementsAre("bar foo", "bar fool")); - suggestions.clear(); + EXPECT_THAT(RetrieveSuggestionsText(terms), + UnorderedElementsAre("bar foo", "bar fool")); + // Search for "bar AND ḞÖ" suggestion_spec.set_prefix("bar ḞÖ"); ICING_ASSERT_OK_AND_ASSIGN( terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - for (TermMetadata term : terms) { - suggestions.push_back(term.content); - } // The term is normalized. 
- EXPECT_THAT(suggestions, UnorderedElementsAre("bar foo", "bar fool")); + EXPECT_THAT(RetrieveSuggestionsText(terms), + UnorderedElementsAre("bar foo", "bar fool")); } TEST_F(SuggestionProcessorTest, NonExistentPrefixTest) { @@ -441,7 +440,6 @@ TEST_F(SuggestionProcessorTest, NonExistentPrefixTest) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms, IsEmpty()); } @@ -474,7 +472,6 @@ TEST_F(SuggestionProcessorTest, PrefixTrailingSpaceTest) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms, IsEmpty()); } @@ -506,23 +503,22 @@ TEST_F(SuggestionProcessorTest, NormalizePrefixTest) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms.at(0).content, "foo"); + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo")); suggestion_spec.set_prefix("fO"); ICING_ASSERT_OK_AND_ASSIGN( terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms.at(0).content, "foo"); + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo")); suggestion_spec.set_prefix("Fo"); ICING_ASSERT_OK_AND_ASSIGN( terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms.at(0).content, "foo"); + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo")); suggestion_spec.set_prefix("FO"); ICING_ASSERT_OK_AND_ASSIGN( terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - - EXPECT_THAT(terms.at(0).content, "foo"); + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo")); } TEST_F(SuggestionProcessorTest, ParenthesesOperatorPrefixTest) { @@ -593,20 +589,34 @@ TEST_F(SuggestionProcessorTest, OtherSpecialPrefixTest) { suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( TermMatchType::PREFIX); - 
ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms, IsEmpty()); + auto terms_or = suggestion_processor_->QuerySuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or); + EXPECT_THAT(terms, IsEmpty()); + } else { + EXPECT_THAT(terms_or, + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + } + // TODO(b/208654892): Update handling for hyphens to only consider it a hyphen + // within a TEXT token (rather than a MINUS token) when surrounded on both + // sides by TEXT rather than just preceded by TEXT. suggestion_spec.set_prefix("f-"); - ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); + terms_or = suggestion_processor_->QuerySuggestions(suggestion_spec); + ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or); EXPECT_THAT(terms, IsEmpty()); suggestion_spec.set_prefix("f OR"); - ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms, IsEmpty()); + terms_or = suggestion_processor_->QuerySuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or); + EXPECT_THAT(terms, IsEmpty()); + } else { + EXPECT_THAT(terms_or, + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + } } TEST_F(SuggestionProcessorTest, InvalidPrefixTest) { @@ -635,10 +645,15 @@ TEST_F(SuggestionProcessorTest, InvalidPrefixTest) { suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( TermMatchType::PREFIX); - ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms, 
IsEmpty()); + auto terms_or = suggestion_processor_->QuerySuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or); + EXPECT_THAT(terms, IsEmpty()); + } else { + EXPECT_THAT(terms_or, + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + } } } // namespace diff --git a/icing/schema/joinable-property-manager.cc b/icing/schema/joinable-property-manager.cc index 3977b6b..1606abb 100644 --- a/icing/schema/joinable-property-manager.cc +++ b/icing/schema/joinable-property-manager.cc @@ -156,8 +156,7 @@ JoinablePropertyManager::GetJoinablePropertyMetadata( .property_path_to_id_map.find(property_path); if (iter == joinable_property_metadata_cache_[schema_type_id] .property_path_to_id_map.end()) { - return absl_ports::NotFoundError( - "Property path is not joinable or doesn't exist"); + return nullptr; } JoinablePropertyId joinable_property_id = iter->second; diff --git a/icing/schema/joinable-property-manager.h b/icing/schema/joinable-property-manager.h index c7038ce..3ee5963 100644 --- a/icing/schema/joinable-property-manager.h +++ b/icing/schema/joinable-property-manager.h @@ -100,9 +100,9 @@ class JoinablePropertyManager { // // Returns: // - Valid pointer to JoinablePropertyMetadata on success + // - nullptr if property_path doesn't exist (or is not joinable) in the + // joinable metadata list of the schema // - INVALID_ARGUMENT_ERROR if schema type id is invalid - // - NOT_FOUND_ERROR if property_path doesn't exist (or is not joinable) in - // the joinable metadata list of the schema libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> GetJoinablePropertyMetadata(SchemaTypeId schema_type_id, const std::string& property_path) const; diff --git a/icing/schema/joinable-property-manager_test.cc b/icing/schema/joinable-property-manager_test.cc index d9a3841..ceaaa18 100644 --- 
a/icing/schema/joinable-property-manager_test.cc +++ b/icing/schema/joinable-property-manager_test.cc @@ -42,6 +42,7 @@ namespace lib { namespace { using ::testing::ElementsAre; +using ::testing::IsNull; using ::testing::Pointee; using ::testing::SizeIs; @@ -491,11 +492,11 @@ TEST_F(JoinablePropertyManagerTest, GetJoinablePropertyMetadataByPathNotExist) { EXPECT_THAT( schema_type_manager->joinable_property_manager() .GetJoinablePropertyMetadata(/*schema_type_id=*/0, "nonExistingPath"), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + IsOkAndHolds(IsNull())); EXPECT_THAT(schema_type_manager->joinable_property_manager() .GetJoinablePropertyMetadata(/*schema_type_id=*/1, "emails.nonExistingPath"), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + IsOkAndHolds(IsNull())); } // Note: valid GetMetadataList has been tested in diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index 0e0c917..065157e 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -369,11 +369,6 @@ SchemaStore::SetSchema(SchemaProto&& new_schema, bool ignore_errors_and_delete_documents) { ICING_ASSIGN_OR_RETURN(SchemaUtil::DependentMap new_dependent_map, SchemaUtil::Validate(new_schema)); - // TODO(b/256022027): validate and extract joinable properties. - // - Joinable config in non-string properties should be ignored, since - // currently we only support string joining. - // - If set joinable, the property itself and all of its nested properties - // should not have REPEATED cardinality. 
SetSchemaResult result; diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h index 601d22a..5ad714e 100644 --- a/icing/schema/schema-store.h +++ b/icing/schema/schema-store.h @@ -225,10 +225,10 @@ class SchemaStore { // // Returns: // Valid pointer to JoinablePropertyMetadata on success + // nullptr if property_path doesn't exist (or is not joinable) in the + // joinable metadata list of the schema // FAILED_PRECONDITION if schema hasn't been set yet // INVALID_ARGUMENT if schema type id is invalid - // NOT_FOUND if property_path doesn't exist (or is not joinable) in the - // joinable metadata list of the schema libtextclassifier3::StatusOr<const JoinablePropertyMetadata*> GetJoinablePropertyMetadata(SchemaTypeId schema_type_id, const std::string& property_path) const; diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc index 0589ada..ea0d85a 100644 --- a/icing/schema/schema-util.cc +++ b/icing/schema/schema-util.cc @@ -15,6 +15,7 @@ #include "icing/schema/schema-util.h" #include <cstdint> +#include <queue> #include <string> #include <string_view> #include <unordered_map> @@ -143,7 +144,7 @@ void AddIncompatibleChangeToDelta( auto dependent_types_itr = new_schema_dependent_map.find(old_type_config.schema_type()); if (dependent_types_itr != new_schema_dependent_map.end()) { - for (std::string_view dependent_type : dependent_types_itr->second) { + for (const auto& [dependent_type, _] : dependent_types_itr->second) { // The types from new_schema that depend on the current // old_type_config may not present in old_schema. // Those types will be listed at schema_delta.schema_types_new @@ -176,14 +177,15 @@ libtextclassifier3::Status ExpandTranstiveDependents( return libtextclassifier3::Status::OK; } pending_expansions->insert(type); - std::unordered_set<std::string_view> expanded_dependents; + std::unordered_map<std::string_view, std::vector<const PropertyConfigProto*>> + expanded_dependents; // Add all of the direct dependents. 
expanded_dependents.reserve(itr->second.size()); expanded_dependents.insert(itr->second.begin(), itr->second.end()); // Iterate through each direct dependent and add their indirect dependents. - for (std::string_view dep : itr->second) { + for (const auto& [dep, _] : itr->second) { // 1. Check if we're in the middle of expanding this type - IOW there's a // cycle! if (pending_expansions->count(dep) > 0) { @@ -206,8 +208,12 @@ libtextclassifier3::Status ExpandTranstiveDependents( auto dep_expanded_itr = expanded_dependent_map->find(dep); expanded_dependents.reserve(expanded_dependents.size() + dep_expanded_itr->second.size()); - expanded_dependents.insert(dep_expanded_itr->second.begin(), - dep_expanded_itr->second.end()); + for (const auto& [dep_dependent, _] : dep_expanded_itr->second) { + // Insert a transitive dependent `dep_dependent` for `type`. Also since + // there is no direct edge between `type` and `dep_dependent`, the direct + // edge (i.e. PropertyConfigProto*) vector is empty. + expanded_dependents.insert({dep_dependent, {}}); + } } expanded_dependent_map->insert({type, std::move(expanded_dependents)}); pending_expansions->erase(type); @@ -283,7 +289,8 @@ BuildTransitiveDependentGraph(const SchemaProto& schema) { if (known_types.count(property_schema_type) == 0) { unknown_types.insert(property_schema_type); } - dependent_map[property_schema_type].insert(schema_type); + dependent_map[property_schema_type][schema_type].push_back( + &property_config); } } } @@ -305,6 +312,9 @@ libtextclassifier3::StatusOr<SchemaUtil::DependentMap> SchemaUtil::Validate( // already. std::unordered_set<std::string_view> known_property_names; + // Tracks PropertyConfigs containing joinable properties. + std::unordered_set<std::string_view> schema_types_with_joinable_property; + // 2. Validate the properties of each type. 
for (const auto& type_config : schema.types()) { std::string_view schema_type(type_config.schema_type()); @@ -351,6 +361,55 @@ libtextclassifier3::StatusOr<SchemaUtil::DependentMap> SchemaUtil::Validate( property_config.string_indexing_config(), data_type, schema_type, property_name)); } + + ICING_RETURN_IF_ERROR(ValidateJoinableConfig( + property_config.joinable_config(), data_type, + property_config.cardinality(), schema_type, property_name)); + if (property_config.joinable_config().value_type() != + JoinableConfig::ValueType::NONE) { + schema_types_with_joinable_property.insert(schema_type); + } + } + } + + // BFS traverse the dependent graph to make sure that no nested levels + // (properties with DOCUMENT data type) have REPEATED cardinality while + // depending on schema types with joinable property. + std::queue<std::string_view> frontier; + for (const auto& schema_type : schema_types_with_joinable_property) { + frontier.push(schema_type); + } + std::unordered_set<std::string_view> traversed = + std::move(schema_types_with_joinable_property); + while (!frontier.empty()) { + std::string_view schema_type = frontier.front(); + frontier.pop(); + + const auto it = dependent_map.find(schema_type); + if (it == dependent_map.end()) { + continue; + } + + // Check every type that has a property of type schema_type. + for (const auto& [next_schema_type, property_configs] : it->second) { + // Check all properties in "next_schema_type" that are of type + // "schema_type". 
+ for (const PropertyConfigProto* property_config : property_configs) { + if (property_config != nullptr && + property_config->cardinality() == + PropertyConfigProto::Cardinality::REPEATED) { + return absl_ports::InvalidArgumentError(absl_ports::StrCat( + "Schema type '", next_schema_type, + "' cannot have REPEATED nested document property '", + property_config->property_name(), + "' while connecting to some joinable properties")); + } + } + + if (traversed.count(next_schema_type) == 0) { + traversed.insert(next_schema_type); + frontier.push(next_schema_type); + } } } @@ -440,6 +499,35 @@ libtextclassifier3::Status SchemaUtil::ValidateStringIndexingConfig( return libtextclassifier3::Status::OK; } +libtextclassifier3::Status SchemaUtil::ValidateJoinableConfig( + const JoinableConfig& config, PropertyConfigProto::DataType::Code data_type, + PropertyConfigProto::Cardinality::Code cardinality, + std::string_view schema_type, std::string_view property_name) { + if (config.value_type() == JoinableConfig::ValueType::QUALIFIED_ID) { + if (data_type != PropertyConfigProto::DataType::STRING) { + return absl_ports::InvalidArgumentError( + absl_ports::StrCat("Qualified id joinable property '", property_name, + "' is required to have STRING data type")); + } + + if (cardinality == PropertyConfigProto::Cardinality::REPEATED) { + return absl_ports::InvalidArgumentError( + absl_ports::StrCat("Qualified id joinable property '", property_name, + "' cannot have REPEATED cardinality")); + } + } + + if (config.propagate_delete() && + config.value_type() != JoinableConfig::ValueType::QUALIFIED_ID) { + return absl_ports::InvalidArgumentError( + absl_ports::StrCat("Field 'property_name' '", property_name, + "' is required to have QUALIFIED_ID joinable " + "value type with delete propagation enabled")); + } + + return libtextclassifier3::Status::OK; +} + void SchemaUtil::BuildTypeConfigMap( const SchemaProto& schema, SchemaUtil::TypeConfigMap* type_config_map) { type_config_map->clear(); 
diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h index e5747bb..47bb76b 100644 --- a/icing/schema/schema-util.h +++ b/icing/schema/schema-util.h @@ -34,9 +34,19 @@ class SchemaUtil { std::unordered_map<std::string, const SchemaTypeConfigProto>; // If A -> B is indicated in the map, then type A must be built before - // building type B, i.e. B depends on A. - using DependentMap = std::unordered_map<std::string_view, - std::unordered_set<std::string_view>>; + // building type B, i.e. B has a property of type A. Also include all + // PropertyConfigProto (with DOCUMENT data_type) pointers which directly + // connects type A and B. IOW, this vector of PropertyConfigProto* are "direct + // edges" connecting A and B directly. It will be an empty vector if A and B + // are not "directly" connected, but instead via another intermediate level of + // schema type. For example, the actual dependency is A -> C -> B, so there + // will be A -> C and C -> B with valid PropertyConfigProto* respectively in + // this map, but we will also expand transitive dependents: add A -> B into + // dependent map with empty vector of "edges". + using DependentMap = std::unordered_map< + std::string_view, + std::unordered_map<std::string_view, + std::vector<const PropertyConfigProto*>>>; struct SchemaDelta { // Which schema types were present in the old schema, but were deleted from @@ -113,6 +123,11 @@ class SchemaUtil { // itself, thus creating an infinite loop. // 13. Two SchemaTypeConfigProtos cannot have properties that reference each // other's schema_type, thus creating an infinite loop. + // 14. PropertyConfigProtos.joinable_config must be valid. See + // ValidateJoinableConfig for more details. + // 15. Any PropertyConfigProtos with nested DOCUMENT data type must not have + // REPEATED cardinality if they reference a schema type containing + // joinable property. 
// // TODO(b/171996137): Clarify 12 and 13 are only for indexed properties, once // document properties can be opted out of indexing. @@ -121,7 +136,7 @@ class SchemaUtil { // On success, a dependent map from each types to their dependent types // that depend on it directly or indirectly. // ALREADY_EXISTS for case 1 and 2 - // INVALID_ARGUMENT for 3-13 + // INVALID_ARGUMENT for 3-15 static libtextclassifier3::StatusOr<DependentMap> Validate( const SchemaProto& schema); @@ -145,6 +160,8 @@ class SchemaUtil { // `SchemaDelta.schema_types_deleted` // 3. A schema type's new definition would mean any existing data of the old // definition is now incompatible. + // 4. The derived join index would be incompatible. This is held in + // `SchemaDelta.join_incompatible`. // // For case 1, the two schemas would result in an incompatible index if: // 1.1. The new SchemaProto has a different set of indexed properties than @@ -167,6 +184,11 @@ class SchemaUtil { // scale defined as: // LEAST <REPEATED - OPTIONAL - REQUIRED> MOST // + // For case 4, the two schemas would result in an incompatible join if: + // 4.1. A SchematypeConfig exists in the new SchemaProto that has a + // different set of joinable properties than it did in the old + // SchemaProto. + // // A property is defined by the combination of the // SchemaTypeConfig.schema_type and the PropertyConfigProto.property_name. // @@ -227,6 +249,22 @@ class SchemaUtil { const StringIndexingConfig& config, PropertyConfigProto::DataType::Code data_type, std::string_view schema_type, std::string_view property_name); + + // Checks that the 'joinable_config' satisfies the following rules: + // 1. If the data type matches joinable value type + // a. Only STRING data types can use QUALIFIED_ID joinable value type + // 2. Only QUALIFIED_ID joinable value type can have delete propagation + // enabled + // 3. 
Any joinable property should have non-REPEATED cardinality + // + // Returns: + // INVALID_ARGUMENT if any of the rules are not followed + // OK on success + static libtextclassifier3::Status ValidateJoinableConfig( + const JoinableConfig& config, + PropertyConfigProto::DataType::Code data_type, + PropertyConfigProto::Cardinality::Code cardinality, + std::string_view schema_type, std::string_view property_name); }; } // namespace lib diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc index 44d8def..2d1e683 100644 --- a/icing/schema/schema-util_test.cc +++ b/icing/schema/schema-util_test.cc @@ -30,9 +30,13 @@ namespace icing { namespace lib { namespace { +using portable_equals_proto::EqualsProto; using ::testing::Eq; using ::testing::HasSubstr; using ::testing::IsEmpty; +using ::testing::Pair; +using ::testing::Pointee; +using ::testing::UnorderedElementsAre; // Properties/fields in a schema type constexpr char kEmailType[] = "EmailMessage"; @@ -118,12 +122,32 @@ TEST(SchemaUtilTest, DependentGraphAlphabeticalOrder) { ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map, SchemaUtil::Validate(schema)); EXPECT_THAT(d_map, testing::SizeIs(5)); - EXPECT_THAT(d_map["F"], - testing::UnorderedElementsAre("A", "B", "C", "D", "E")); - EXPECT_THAT(d_map["E"], testing::UnorderedElementsAre("A", "B", "C", "D")); - EXPECT_THAT(d_map["D"], testing::UnorderedElementsAre("A", "B")); - EXPECT_THAT(d_map["C"], testing::UnorderedElementsAre("A", "B")); - EXPECT_THAT(d_map["B"], testing::UnorderedElementsAre("A")); + EXPECT_THAT( + d_map["F"], + UnorderedElementsAre(Pair("A", IsEmpty()), Pair("B", IsEmpty()), + Pair("C", IsEmpty()), Pair("D", IsEmpty()), + Pair("E", UnorderedElementsAre(Pointee( + EqualsProto(type_e.properties(0))))))); + EXPECT_THAT(d_map["E"], + UnorderedElementsAre( + Pair("A", IsEmpty()), Pair("B", IsEmpty()), + Pair("C", UnorderedElementsAre( + Pointee(EqualsProto(type_c.properties(0))))), + Pair("D", UnorderedElementsAre( + 
Pointee(EqualsProto(type_d.properties(0))))))); + EXPECT_THAT( + d_map["D"], + UnorderedElementsAre(Pair("A", IsEmpty()), + Pair("B", UnorderedElementsAre(Pointee( + EqualsProto(type_b.properties(1))))))); + EXPECT_THAT( + d_map["C"], + UnorderedElementsAre(Pair("A", IsEmpty()), + Pair("B", UnorderedElementsAre(Pointee( + EqualsProto(type_b.properties(0))))))); + EXPECT_THAT(d_map["B"], UnorderedElementsAre(Pair( + "A", UnorderedElementsAre(Pointee( + EqualsProto(type_a.properties(0))))))); } TEST(SchemaUtilTest, DependentGraphReverseAlphabeticalOrder) { @@ -206,12 +230,32 @@ TEST(SchemaUtilTest, DependentGraphReverseAlphabeticalOrder) { ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map, SchemaUtil::Validate(schema)); EXPECT_THAT(d_map, testing::SizeIs(5)); - EXPECT_THAT(d_map["F"], - testing::UnorderedElementsAre("A", "B", "C", "D", "E")); - EXPECT_THAT(d_map["E"], testing::UnorderedElementsAre("A", "B", "C", "D")); - EXPECT_THAT(d_map["D"], testing::UnorderedElementsAre("A", "B")); - EXPECT_THAT(d_map["C"], testing::UnorderedElementsAre("A", "B")); - EXPECT_THAT(d_map["B"], testing::UnorderedElementsAre("A")); + EXPECT_THAT( + d_map["F"], + UnorderedElementsAre(Pair("A", IsEmpty()), Pair("B", IsEmpty()), + Pair("C", IsEmpty()), Pair("D", IsEmpty()), + Pair("E", UnorderedElementsAre(Pointee( + EqualsProto(type_e.properties(0))))))); + EXPECT_THAT(d_map["E"], + UnorderedElementsAre( + Pair("A", IsEmpty()), Pair("B", IsEmpty()), + Pair("C", UnorderedElementsAre( + Pointee(EqualsProto(type_c.properties(0))))), + Pair("D", UnorderedElementsAre( + Pointee(EqualsProto(type_d.properties(0))))))); + EXPECT_THAT( + d_map["D"], + UnorderedElementsAre(Pair("A", IsEmpty()), + Pair("B", UnorderedElementsAre(Pointee( + EqualsProto(type_b.properties(1))))))); + EXPECT_THAT( + d_map["C"], + UnorderedElementsAre(Pair("A", IsEmpty()), + Pair("B", UnorderedElementsAre(Pointee( + EqualsProto(type_b.properties(0))))))); + EXPECT_THAT(d_map["B"], UnorderedElementsAre(Pair( + 
"A", UnorderedElementsAre(Pointee( + EqualsProto(type_a.properties(0))))))); } TEST(SchemaUtilTest, DependentGraphMixedOrder) { @@ -293,12 +337,32 @@ TEST(SchemaUtilTest, DependentGraphMixedOrder) { ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map, SchemaUtil::Validate(schema)); EXPECT_THAT(d_map, testing::SizeIs(5)); - EXPECT_THAT(d_map["F"], - testing::UnorderedElementsAre("A", "B", "C", "D", "E")); - EXPECT_THAT(d_map["E"], testing::UnorderedElementsAre("A", "B", "C", "D")); - EXPECT_THAT(d_map["D"], testing::UnorderedElementsAre("A", "B")); - EXPECT_THAT(d_map["C"], testing::UnorderedElementsAre("A", "B")); - EXPECT_THAT(d_map["B"], testing::UnorderedElementsAre("A")); + EXPECT_THAT( + d_map["F"], + UnorderedElementsAre(Pair("A", IsEmpty()), Pair("B", IsEmpty()), + Pair("C", IsEmpty()), Pair("D", IsEmpty()), + Pair("E", UnorderedElementsAre(Pointee( + EqualsProto(type_e.properties(0))))))); + EXPECT_THAT(d_map["E"], + UnorderedElementsAre( + Pair("A", IsEmpty()), Pair("B", IsEmpty()), + Pair("C", UnorderedElementsAre( + Pointee(EqualsProto(type_c.properties(0))))), + Pair("D", UnorderedElementsAre( + Pointee(EqualsProto(type_d.properties(0))))))); + EXPECT_THAT( + d_map["D"], + UnorderedElementsAre(Pair("A", IsEmpty()), + Pair("B", UnorderedElementsAre(Pointee( + EqualsProto(type_b.properties(1))))))); + EXPECT_THAT( + d_map["C"], + UnorderedElementsAre(Pair("A", IsEmpty()), + Pair("B", UnorderedElementsAre(Pointee( + EqualsProto(type_b.properties(0))))))); + EXPECT_THAT(d_map["B"], UnorderedElementsAre(Pair( + "A", UnorderedElementsAre(Pointee( + EqualsProto(type_a.properties(0))))))); } TEST(SchemaUtilTest, TopLevelCycle) { @@ -888,7 +952,8 @@ TEST(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) { SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_incompatible.emplace(kEmailType); // kEmailType depends on kMessageType - SchemaUtil::DependentMap dependents_map = {{kMessageType, {kEmailType}}}; + SchemaUtil::DependentMap dependents_map 
= { + {kMessageType, {{kEmailType, {}}}}}; SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta( old_schema, new_schema, dependents_map); EXPECT_THAT(actual, Eq(schema_delta)); @@ -1403,7 +1468,7 @@ TEST(SchemaUtilTest, IndexNestedDocumentsIndexIncompatible) { // unaffected. SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_index_incompatible.emplace(kPersonType); - SchemaUtil::DependentMap dependents_map = {{kEmailType, {kPersonType}}}; + SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}}; SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta( no_nested_index_schema, nested_index_schema, dependents_map); EXPECT_THAT(actual, Eq(schema_delta)); @@ -1466,6 +1531,547 @@ TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) { EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); } +TEST(SchemaUtilTest, + ValidateJoinablePropertyTypeQualifiedIdShouldHaveStringDataType) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_INT64) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + // Error if data type is not STRING for qualified id joinable value type. + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + // Passes once we set STRING as the data type. 
+ schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); +} + +TEST(SchemaUtilTest, ValidateJoinablePropertyShouldNotHaveRepeatedCardinality) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + // Error if using REPEATED cardinality for joinable property. + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + // Passes once we use OPTIONAL cardinality with joinable property. + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); + + // Passes once we use REQUIRED cardinality with joinable property. + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); + + // Passes once we use REPEATED cardinality with non-joinable property. 
+ schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_NONE, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); +} + +TEST(SchemaUtilTest, + ValidateJoinablePropertyWithDeletePropagationShouldHaveTypeQualifiedId) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_NONE, + /*propagate_delete=*/true) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + // Error if enabling delete propagation with non qualified id joinable value + // type. + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + // Passes once we set qualified id joinable value type with delete propagation + // enabled. + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/true) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); + + // Passes once we disable delete propagation. 
+ schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_NONE, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); +} + +TEST(SchemaUtilTest, + ValidateNestedJoinablePropertyShouldNotHaveNestedRepeatedCardinality) { + // Dependency and nested document property cardinality: + // "C" --(REPEATED)--> "B" --(OPTIONAL)--> "A" + // where "A" contains joinable property. This should not be allowed. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetDataTypeDocument("B", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + // Passes once we use non-REPEATED cardinality for "C.b", i.e. 
the dependency + // and nested document property cardinality becomes: + // "C" --(OPTIONAL)--> "B" --(OPTIONAL)--> "A" + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetDataTypeDocument("B", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); +} + +TEST( + SchemaUtilTest, + ValidateNestedJoinablePropertyShouldAllowRepeatedCardinalityIfNoJoinableProperty) { + // Dependency and nested document property cardinality: + // "C" --(OPTIONAL)--> "B" --(REPEATED)--> "A" + // where only "B" contains joinable property. This should be allowed. 
+ SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_NONE, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty(PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument( + "A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() + .SetName("Bar") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetDataTypeDocument("B", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Passes since nested schema type with REPEATED cardinality doesn't have + // joinable property. + EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); +} + +TEST(SchemaUtilTest, + ValidateNestedJoinablePropertyMultiplePropertiesWithSameSchema) { + // Dependency and nested document property cardinality: + // --(a1: OPTIONAL)-- + // / \ + // B -- --> A + // \ / + // --(a2: REPEATED)-- + // where "A" contains joinable property. This should not be allowed. 
+ SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty(PropertyConfigBuilder() + .SetName("a1") + .SetDataTypeDocument( + "A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("a2") + .SetDataTypeDocument( + "A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + // Passes once we use non-REPEATED cardinality for "B.a2", i.e. the dependency + // and nested document property cardinality becomes: + // --(a1: OPTIONAL)-- + // / \ + // B -- --> A + // \ / + // --(a2: OPTIONAL)-- + schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty(PropertyConfigBuilder() + .SetName("a1") + .SetDataTypeDocument( + "A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("a2") + .SetDataTypeDocument( + "A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); +} + +TEST(SchemaUtilTest, ValidateNestedJoinablePropertyDiamondRelationship) { + // Dependency and nested document property cardinality: + // B + // / \ + // (OPTIONAL) (OPTIONAL) + // / \ + // D --- --> A + // \ / + // (OPTIONAL) (OPTIONAL) + 
// \ / + // C + // where "A" contains joinable property. This should be allowed. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("D") + .AddProperty(PropertyConfigBuilder() + .SetName("b") + .SetDataTypeDocument( + "B", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("c") + .SetDataTypeDocument( + "C", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); + + // Fails once we change any of edge to REPEATED cardinality. 
+ // B + // / \ + // (REPEATED) (OPTIONAL) + // / \ + // D --- --> A + // \ / + // (OPTIONAL) (OPTIONAL) + // \ / + // C + schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("D") + .AddProperty(PropertyConfigBuilder() + .SetName("b") + .SetDataTypeDocument( + "B", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("c") + .SetDataTypeDocument( + "C", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + // B + // / \ + // (OPTIONAL) (REPEATED) + // / \ + // D --- --> A + // \ / + // (OPTIONAL) (OPTIONAL) + // \ / + // C + schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + 
.AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("D") + .AddProperty(PropertyConfigBuilder() + .SetName("b") + .SetDataTypeDocument( + "B", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("c") + .SetDataTypeDocument( + "C", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + // B + // / \ + // (OPTIONAL) (OPTIONAL) + // / \ + // D --- --> A + // \ / + // (REPEATED) (OPTIONAL) + // \ / + // C + schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("D") + .AddProperty(PropertyConfigBuilder() + .SetName("b") + .SetDataTypeDocument( + "B", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("c") + .SetDataTypeDocument( + "C", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + 
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + // B + // / \ + // (OPTIONAL) (OPTIONAL) + // / \ + // D --- --> A + // \ / + // (OPTIONAL) (REPEATED) + // \ / + // C + schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataType(TYPE_STRING) + .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, + /*propagate_delete=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeDocument("A", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("D") + .AddProperty(PropertyConfigBuilder() + .SetName("b") + .SetDataTypeDocument( + "B", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("c") + .SetDataTypeDocument( + "C", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + TEST(SchemaUtilTest, MultipleReferencesToSameNestedSchemaOk) { SchemaProto schema = SchemaBuilder() diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 35ee172..710ff58 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -89,17 +89,6 @@ constexpr int32_t kUriMapperMaxSize = 36 * 1024 * 1024; // 36 MiB constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024; // 384 KiB constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024; // 384 KiB -// Whether to use namespace id or namespace name to build up fingerprint for -// 
document_key_mapper_ and corpus_mapper_. -// Note: Changing this flag will require a reconstruction of the internal -// mappers in the document store. A easy way to trigger a rebuild is to change -// the kMagic value. -// -// TODO(b/259969017) Flip this flag to true at the time when we switch to use -// persistent hash map for document_key_mapper_ so that we just need one -// reconstruction of the internal mappers. -constexpr bool kNamespaceIdFingerprint = false; - DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) { DocumentWrapper document_wrapper; *document_wrapper.mutable_document() = std::move(document); @@ -157,23 +146,6 @@ std::string EncodeNamespaceId(NamespaceId namespace_id) { return encoding; } -std::string MakeFingerprint(NamespaceId namespace_id, - std::string_view namespace_, - std::string_view uri_or_schema) { - if (!kNamespaceIdFingerprint) { - // Using a 64-bit fingerprint to represent the key could lead to collisions. - // But, even with 200K unique keys, the probability of collision is about - // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack). - uint64_t fprint = tc3farmhash::Fingerprint64( - absl_ports::StrCat(namespace_, uri_or_schema)); - return fingerprint_util::GetFingerprintString(fprint); - } else { - return absl_ports::StrCat(EncodeNamespaceId(namespace_id), - encode_util::EncodeIntToCString( - tc3farmhash::Fingerprint64(uri_or_schema))); - } -} - int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms, int64_t ttl_ms) { if (ttl_ms == 0) { @@ -236,15 +208,34 @@ std::unordered_map<NamespaceId, std::string> GetNamespaceIdsToNamespaces( } // namespace +std::string DocumentStore::MakeFingerprint( + NamespaceId namespace_id, std::string_view namespace_, + std::string_view uri_or_schema) const { + if (!namespace_id_fingerprint_) { + // Using a 64-bit fingerprint to represent the key could lead to collisions. 
+ // But, even with 200K unique keys, the probability of collision is about + // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack). + uint64_t fprint = tc3farmhash::Fingerprint64( + absl_ports::StrCat(namespace_, uri_or_schema)); + return fingerprint_util::GetFingerprintString(fprint); + } else { + return absl_ports::StrCat(EncodeNamespaceId(namespace_id), + encode_util::EncodeIntToCString( + tc3farmhash::Fingerprint64(uri_or_schema))); + } +} + DocumentStore::DocumentStore(const Filesystem* filesystem, const std::string_view base_dir, const Clock* clock, - const SchemaStore* schema_store) + const SchemaStore* schema_store, + bool namespace_id_fingerprint) : filesystem_(filesystem), base_dir_(base_dir), clock_(*clock), schema_store_(schema_store), - document_validator_(schema_store) {} + document_validator_(schema_store), + namespace_id_fingerprint_(namespace_id_fingerprint) {} libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( const DocumentProto& document, int32_t num_tokens, @@ -271,14 +262,14 @@ DocumentStore::~DocumentStore() { libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( const Filesystem* filesystem, const std::string& base_dir, const Clock* clock, const SchemaStore* schema_store, - bool force_recovery_and_revalidate_documents, + bool force_recovery_and_revalidate_documents, bool namespace_id_fingerprint, InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(filesystem); ICING_RETURN_ERROR_IF_NULL(clock); ICING_RETURN_ERROR_IF_NULL(schema_store); - auto document_store = std::unique_ptr<DocumentStore>( - new DocumentStore(filesystem, base_dir, clock, schema_store)); + auto document_store = std::unique_ptr<DocumentStore>(new DocumentStore( + filesystem, base_dir, clock, schema_store, namespace_id_fingerprint)); ICING_ASSIGN_OR_RETURN( DataLoss data_loss, document_store->Initialize(force_recovery_and_revalidate_documents, @@ -386,7 +377,8 @@ libtextclassifier3::Status 
DocumentStore::InitializeExistingDerivedFiles() { absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_))); } - if (header.magic != DocumentStore::Header::kMagic) { + if (header.magic != + DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_)) { return absl_ports::InternalError(absl_ports::StrCat( "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_))); } @@ -859,7 +851,8 @@ bool DocumentStore::HeaderExists() { libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) { // Write the header DocumentStore::Header header; - header.magic = DocumentStore::Header::kMagic; + header.magic = + DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_); header.checksum = checksum.Get(); // This should overwrite the header. diff --git a/icing/store/document-store.h b/icing/store/document-store.h index 3e02636..7c414d7 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -59,13 +59,19 @@ namespace lib { class DocumentStore { public: struct Header { - static constexpr int32_t kMagic = 0x746f7265; + static int32_t GetCurrentMagic(bool namespace_id_fingerprint) { + return namespace_id_fingerprint ? kNewMagic : kOldMagic; + } // Holds the magic as a quick sanity check against file corruption. int32_t magic; // Checksum of the DocumentStore's sub-component's checksums. uint32_t checksum; + + private: + static constexpr int32_t kOldMagic = 0x746f7265; + static constexpr int32_t kNewMagic = 0x1b99c8b0; }; struct OptimizeInfo { @@ -136,6 +142,7 @@ class DocumentStore { const Filesystem* filesystem, const std::string& base_dir, const Clock* clock, const SchemaStore* schema_store, bool force_recovery_and_revalidate_documents = false, + bool namespace_id_fingerprint = false, InitializeStatsProto* initialize_stats = nullptr); // Returns the maximum DocumentId that the DocumentStore has assigned. If @@ -472,7 +479,8 @@ class DocumentStore { private: // Use DocumentStore::Create() to instantiate. 
DocumentStore(const Filesystem* filesystem, std::string_view base_dir, - const Clock* clock, const SchemaStore* schema_store); + const Clock* clock, const SchemaStore* schema_store, + bool namespace_id_fingerprint); const Filesystem* const filesystem_; const std::string base_dir_; @@ -485,6 +493,10 @@ class DocumentStore { // Used to validate incoming documents DocumentValidator document_validator_; + // Whether to use namespace id or namespace name to build up fingerprint for + // document_key_mapper_ and corpus_mapper_. + bool namespace_id_fingerprint_; + // A log used to store all documents, it serves as a ground truth of doc // store. key_mapper_ and document_id_mapper_ can be regenerated from it. std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_; @@ -733,6 +745,13 @@ class DocumentStore { libtextclassifier3::StatusOr< google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>> CollectCorpusInfo() const; + + // Build fingerprint for the keys of document_key_mapper_ and corpus_mapper_. + // Note that namespace_id_fingerprint_ controls the way that a fingerprint is + // built. 
+ std::string MakeFingerprint(NamespaceId namespace_id, + std::string_view namespace_, + std::string_view uri_or_schema) const; }; } // namespace lib diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index a115e11..81da191 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -207,7 +207,8 @@ class DocumentStoreTest : public ::testing::Test { const std::string header_file = absl_ports::StrCat(document_store_dir_, "/document_store_header"); DocumentStore::Header header; - header.magic = DocumentStore::Header::kMagic; + header.magic = DocumentStore::Header::GetCurrentMagic( + /*namespace_id_fingerprint=*/false); header.checksum = 10; // Arbitrary garbage checksum filesystem_.DeleteFile(header_file.c_str()); filesystem_.Write(header_file.c_str(), &header, sizeof(header)); @@ -3285,10 +3286,10 @@ TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get(), - /*force_recovery_and_revalidate_documents=*/false, - &initialize_stats)); + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); // The document log is using the legacy v0 format so that a migration is @@ -3489,10 +3490,10 @@ TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/true, - &initialize_stats)); + DocumentStore::Create( + 
&filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true, + /*namespace_id_fingerprint=*/false, &initialize_stats)); std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); @@ -3875,10 +3876,10 @@ TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir, &fake_clock_, - schema_store.get(), - /*force_recovery_and_revalidate_documents=*/false, - &initialize_stats)); + DocumentStore::Create( + &filesystem_, document_store_dir, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false, + /*namespace_id_fingerprint=*/false, &initialize_stats)); std::unique_ptr<DocumentStore> document_store = std::move(create_result.document_store); diff --git a/icing/tokenization/icu/icu-language-segmenter-factory.cc b/icing/tokenization/icu/icu-language-segmenter-factory.cc index 363bc6d..7b095b4 100644 --- a/icing/tokenization/icu/icu-language-segmenter-factory.cc +++ b/icing/tokenization/icu/icu-language-segmenter-factory.cc @@ -47,7 +47,7 @@ libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( << " not supported. 
Converting to locale " << ULOC_US; options.locale = ULOC_US; } - return std::make_unique<IcuLanguageSegmenter>(std::move(options.locale)); + return IcuLanguageSegmenter::Create(std::move(options.locale)); } } // namespace language_segmenter_factory diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index fd790cf..59bcc18 100644 --- a/icing/tokenization/icu/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -24,6 +24,7 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/mutex.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/util/character-iterator.h" #include "icing/util/i18n-utils.h" @@ -48,9 +49,11 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // INTERNAL_ERROR if unable to create static libtextclassifier3::StatusOr< std::unique_ptr<LanguageSegmenter::Iterator>> - Create(std::string_view text, std::string_view locale) { + Create(const IcuLanguageSegmenter* creator, UBreakIterator* break_iterator, + std::string_view text, std::string_view locale) { std::unique_ptr<IcuLanguageSegmenterIterator> iterator( - new IcuLanguageSegmenterIterator(text, locale)); + new IcuLanguageSegmenterIterator(creator, break_iterator, text, + locale)); if (iterator->Initialize()) { return iterator; } @@ -58,8 +61,8 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { } ~IcuLanguageSegmenterIterator() { - ubrk_close(break_iterator_); utext_close(u_text_); + creator_.ReturnBreakIterator(break_iterator_); } // Advances to the next term. Returns false if it has reached the end. 
@@ -244,9 +247,12 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { } private: - explicit IcuLanguageSegmenterIterator(std::string_view text, + explicit IcuLanguageSegmenterIterator(const IcuLanguageSegmenter* creator, + UBreakIterator* break_iterator, + std::string_view text, std::string_view locale) - : break_iterator_(nullptr), + : creator_(*creator), + break_iterator_(break_iterator), text_(text), locale_(locale), u_text_(nullptr), @@ -256,13 +262,14 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Returns true on success bool Initialize() { + if (break_iterator_ == nullptr) { + return false; + } UErrorCode status = U_ZERO_ERROR; u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status); if (u_text_ == nullptr) { return false; } - break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr, - /*textLength=*/0, &status); ubrk_setUText(break_iterator_, u_text_, &status); return !U_FAILURE(status); } @@ -290,9 +297,11 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { term_start_index_ = 0; } + const IcuLanguageSegmenter& creator_; // Does not own. + // The underlying class that does the segmentation, ubrk_close() must be // called after using. 
- UBreakIterator* break_iterator_; + UBreakIterator* break_iterator_; // Does not own // Text to be segmented std::string_view text_; @@ -321,19 +330,62 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { int term_end_index_exclusive_; }; -IcuLanguageSegmenter::IcuLanguageSegmenter(std::string locale) - : locale_(std::move(locale)) {} +/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IcuLanguageSegmenter>> +IcuLanguageSegmenter::Create(std::string&& locale) { + UErrorCode status = U_ZERO_ERROR; + UBreakIterator* break_iterator = ubrk_open( + UBRK_WORD, locale.c_str(), /*text=*/nullptr, /*textLength=*/0, &status); + if (U_FAILURE(status) || break_iterator == nullptr) { + return absl_ports::AbortedError( + "Unable to create ICU break_iterator for language segmentation"); + } + return std::unique_ptr<IcuLanguageSegmenter>( + new IcuLanguageSegmenter(std::move(locale), break_iterator)); +} + +UBreakIterator* IcuLanguageSegmenter::ProduceBreakIterator() const { + UBreakIterator* itr = nullptr; + { + absl_ports::unique_lock l(&mutex_); + if (cached_break_iterator_ != nullptr) { + itr = cached_break_iterator_; + cached_break_iterator_ = nullptr; + } + } + if (itr == nullptr) { + UErrorCode status = U_ZERO_ERROR; + itr = ubrk_open(UBRK_WORD, locale_.c_str(), /*text=*/nullptr, + /*textLength=*/0, &status); + if (U_FAILURE(status)) { + itr = nullptr; + } + } + return itr; +} + +void IcuLanguageSegmenter::ReturnBreakIterator(UBreakIterator* itr) const { + { + absl_ports::unique_lock l(&mutex_); + if (cached_break_iterator_ == nullptr) { + cached_break_iterator_ = itr; + return; + } + } + ubrk_close(itr); +} libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> IcuLanguageSegmenter::Segment(const std::string_view text, LanguageSegmenter::AccessType) const { - return IcuLanguageSegmenterIterator::Create(text, locale_); + return IcuLanguageSegmenterIterator::Create(this, ProduceBreakIterator(), + text, locale_); } 
libtextclassifier3::StatusOr<std::vector<std::string_view>> IcuLanguageSegmenter::GetAllTerms(const std::string_view text) const { - ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator, - IcuLanguageSegmenterIterator::Create(text, locale_)); + ICING_ASSIGN_OR_RETURN( + std::unique_ptr<LanguageSegmenter::Iterator> iterator, + Segment(text, LanguageSegmenter::AccessType::kForwardIterator)); std::vector<std::string_view> terms; while (iterator->Advance()) { terms.push_back(iterator->GetTerm()); diff --git a/icing/tokenization/icu/icu-language-segmenter.h b/icing/tokenization/icu/icu-language-segmenter.h index f9cfbcb..e22c5d2 100644 --- a/icing/tokenization/icu/icu-language-segmenter.h +++ b/icing/tokenization/icu/icu-language-segmenter.h @@ -22,7 +22,9 @@ #include <vector> #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/mutex.h" #include "icing/tokenization/language-segmenter.h" +#include "unicode/ubrk.h" namespace icing { namespace lib { @@ -41,7 +43,14 @@ namespace lib { // class. Other special tokenization logic will be in each tokenizer. class IcuLanguageSegmenter : public LanguageSegmenter { public: - explicit IcuLanguageSegmenter(std::string locale); + static libtextclassifier3::StatusOr<std::unique_ptr<IcuLanguageSegmenter>> + Create(std::string&& locale); + + ~IcuLanguageSegmenter() override { + if (cached_break_iterator_ != nullptr) { + ubrk_close(cached_break_iterator_); + } + } IcuLanguageSegmenter(const IcuLanguageSegmenter&) = delete; IcuLanguageSegmenter& operator=(const IcuLanguageSegmenter&) = delete; @@ -69,8 +78,32 @@ class IcuLanguageSegmenter : public LanguageSegmenter { std::string_view text) const override; private: + // Declared a friend so that it can call AcceptBreakIterator. 
+ friend class IcuLanguageSegmenterIterator; + + explicit IcuLanguageSegmenter(std::string&& locale, UBreakIterator* iterator) + : locale_(std::move(locale)), cached_break_iterator_(iterator) {} + + // Returns a UBreakIterator that the caller owns. + // If cached_break_iterator_ is non-null, transfers ownership to caller and + // sets cached_break_iterator_ to null. + // If cached_break_iterator is null, creates a new UBreakIterator and + // transfers ownership to caller. + UBreakIterator* ProduceBreakIterator() const; + + // Caller transfers ownership of itr to IcuLanguageSegmenter. + // If cached_break_iterator_ is null, itr becomes the cached_break_iterator_ + // If cached_break_iterator_ is non-null, then itr will be closed. + void ReturnBreakIterator(UBreakIterator* itr) const; + // Used to help segment text const std::string locale_; + + // The underlying class that does the segmentation, ubrk_close() must be + // called after using. + mutable UBreakIterator* cached_break_iterator_ ICING_LOCKS_EXCLUDED(mutex_); + + mutable absl_ports::shared_mutex mutex_; }; } // namespace lib diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index c88b992..d1bf5c6 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -1352,6 +1352,53 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, QuerySyntax) { "subproperty2", ":", "term3")); } +TEST_P(IcuLanguageSegmenterAllLocalesTest, MultipleLangSegmentersTest) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> iterator_one, + language_segmenter->Segment( + "foo bar baz", LanguageSegmenter::AccessType::kForwardIterator)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<LanguageSegmenter::Iterator> iterator_two, + 
language_segmenter->Segment( + "abra kadabra alakazam", + LanguageSegmenter::AccessType::kForwardIterator)); + + ASSERT_TRUE(iterator_one->Advance()); + ASSERT_TRUE(iterator_two->Advance()); + EXPECT_THAT(iterator_one->GetTerm(), Eq("foo")); + EXPECT_THAT(iterator_two->GetTerm(), Eq("abra")); + + ASSERT_TRUE(iterator_one->Advance()); + ASSERT_TRUE(iterator_two->Advance()); + EXPECT_THAT(iterator_one->GetTerm(), Eq(" ")); + EXPECT_THAT(iterator_two->GetTerm(), Eq(" ")); + + ASSERT_TRUE(iterator_one->Advance()); + EXPECT_THAT(iterator_one->GetTerm(), Eq("bar")); + EXPECT_THAT(iterator_two->GetTerm(), Eq(" ")); + ASSERT_TRUE(iterator_two->Advance()); + EXPECT_THAT(iterator_one->GetTerm(), Eq("bar")); + EXPECT_THAT(iterator_two->GetTerm(), Eq("kadabra")); + + ASSERT_TRUE(iterator_one->Advance()); + ASSERT_TRUE(iterator_two->Advance()); + EXPECT_THAT(iterator_one->GetTerm(), Eq(" ")); + EXPECT_THAT(iterator_two->GetTerm(), Eq(" ")); + + ASSERT_TRUE(iterator_two->Advance()); + ASSERT_TRUE(iterator_one->Advance()); + EXPECT_THAT(iterator_one->GetTerm(), Eq("baz")); + EXPECT_THAT(iterator_two->GetTerm(), Eq("alakazam")); + + ASSERT_FALSE(iterator_two->Advance()); + ASSERT_FALSE(iterator_one->Advance()); +} + INSTANTIATE_TEST_SUITE_P( LocaleName, IcuLanguageSegmenterAllLocalesTest, testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH, diff --git a/icing/tokenization/rfc822-tokenizer_test.cc b/icing/tokenization/rfc822-tokenizer_test.cc index 6b95a07..e1a7fc8 100644 --- a/icing/tokenization/rfc822-tokenizer_test.cc +++ b/icing/tokenization/rfc822-tokenizer_test.cc @@ -21,10 +21,7 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/jni-test-helpers.h" -#include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" -#include "unicode/uloc.h" namespace icing { namespace lib { @@ -32,21 +29,7 @@ namespace { using ::testing::ElementsAre; using 
::testing::IsEmpty; -class Rfc822TokenizerTest : public testing::Test { - protected: - void SetUp() override { - jni_cache_ = GetTestJniCache(); - language_segmenter_factory::SegmenterOptions options(ULOC_US, - jni_cache_.get()); - ICING_ASSERT_OK_AND_ASSIGN( - language_segmenter_, - language_segmenter_factory::Create(std::move(options))); - } - std::unique_ptr<const JniCache> jni_cache_; - std::unique_ptr<LanguageSegmenter> language_segmenter_; -}; - -TEST_F(Rfc822TokenizerTest, StartingState) { +TEST(Rfc822TokenizerTest, StartingState) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "a@g.c"; auto token_iterator = @@ -59,7 +42,7 @@ TEST_F(Rfc822TokenizerTest, StartingState) { ASSERT_THAT(token_iterator->GetTokens(), Not(IsEmpty())); } -TEST_F(Rfc822TokenizerTest, EmptyMiddleToken) { +TEST(Rfc822TokenizerTest, EmptyMiddleToken) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string s("<alex>,,<tom>"); @@ -77,7 +60,7 @@ TEST_F(Rfc822TokenizerTest, EmptyMiddleToken) { EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "tom")))); } -TEST_F(Rfc822TokenizerTest, Simple) { +TEST(Rfc822TokenizerTest, Simple) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string_view s("<你alex@google.com>"); @@ -94,7 +77,7 @@ TEST_F(Rfc822TokenizerTest, Simple) { EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com")))); } -TEST_F(Rfc822TokenizerTest, Small) { +TEST(Rfc822TokenizerTest, Small) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string s = "\"a\""; @@ -127,7 +110,7 @@ TEST_F(Rfc822TokenizerTest, Small) { EqualsToken(Token::Type::RFC822_COMMENT, "a")))); } -TEST_F(Rfc822TokenizerTest, PB) { +TEST(Rfc822TokenizerTest, PB) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string_view s("peanut (comment) butter, <alex@google.com>"); @@ -154,7 +137,7 @@ TEST_F(Rfc822TokenizerTest, PB) { EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com")))); } -TEST_F(Rfc822TokenizerTest, 
NoBrackets) { +TEST(Rfc822TokenizerTest, NoBrackets) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string_view s("alex@google.com"); @@ -171,7 +154,7 @@ TEST_F(Rfc822TokenizerTest, NoBrackets) { EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com")))); } -TEST_F(Rfc822TokenizerTest, TwoAddresses) { +TEST(Rfc822TokenizerTest, TwoAddresses) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string_view s("<你alex@google.com>; <alexsav@gmail.com>"); @@ -195,7 +178,7 @@ TEST_F(Rfc822TokenizerTest, TwoAddresses) { EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com")))); } -TEST_F(Rfc822TokenizerTest, Comment) { +TEST(Rfc822TokenizerTest, Comment) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string_view s("(a comment) <alex@google.com>"); @@ -214,7 +197,7 @@ TEST_F(Rfc822TokenizerTest, Comment) { EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com")))); } -TEST_F(Rfc822TokenizerTest, NameAndComment) { +TEST(Rfc822TokenizerTest, NameAndComment) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string_view s("\"a name\" also a name <alex@google.com>"); @@ -237,7 +220,7 @@ TEST_F(Rfc822TokenizerTest, NameAndComment) { } // Test from tokenizer_test.cc. -TEST_F(Rfc822TokenizerTest, Rfc822SanityCheck) { +TEST(Rfc822TokenizerTest, Rfc822SanityCheck) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string addr1("A name (A comment) <address@domain.com>"); @@ -297,7 +280,7 @@ TEST_F(Rfc822TokenizerTest, Rfc822SanityCheck) { } // Tests from rfc822 converter. 
-TEST_F(Rfc822TokenizerTest, SimpleRfcText) { +TEST(Rfc822TokenizerTest, SimpleRfcText) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string test_string = "foo@google.com,bar@google.com,baz@google.com,foo+hello@google.com,baz@" @@ -349,7 +332,7 @@ TEST_F(Rfc822TokenizerTest, SimpleRfcText) { EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "corp.google.com")))); } -TEST_F(Rfc822TokenizerTest, ComplicatedRfcText) { +TEST(Rfc822TokenizerTest, ComplicatedRfcText) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string test_string = R"raw("Weird, But&(Also)\\Valid" Name (!With, "an" \\odd\\ cmt too¡) <Foo B(a)r,Baz@g.co> @@ -390,7 +373,7 @@ TEST_F(Rfc822TokenizerTest, ComplicatedRfcText) { EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com")))); } -TEST_F(Rfc822TokenizerTest, FromHtmlBugs) { +TEST(Rfc822TokenizerTest, FromHtmlBugs) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); // This input used to cause HTML parsing exception. We don't do HTML parsing // any more (b/8388100) so we are just checking that it does not crash and @@ -422,7 +405,7 @@ TEST_F(Rfc822TokenizerTest, FromHtmlBugs) { EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com")))); } -TEST_F(Rfc822TokenizerTest, EmptyComponentsTest) { +TEST(Rfc822TokenizerTest, EmptyComponentsTest) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); EXPECT_THAT(rfc822_tokenizer.TokenizeAll(""), IsOkAndHolds(testing::IsEmpty())); @@ -463,7 +446,7 @@ TEST_F(Rfc822TokenizerTest, EmptyComponentsTest) { EqualsToken(Token::Type::RFC822_COMMENT, "comment")))); } -TEST_F(Rfc822TokenizerTest, NameTest) { +TEST(Rfc822TokenizerTest, NameTest) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); // Name spread between address or comment. 
@@ -529,7 +512,7 @@ TEST_F(Rfc822TokenizerTest, NameTest) { EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "addr")))); } -TEST_F(Rfc822TokenizerTest, CommentEscapeTest) { +TEST(Rfc822TokenizerTest, CommentEscapeTest) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); // '(', ')', '\\' chars should be escaped. All other escaped chars should be // unescaped. @@ -564,7 +547,7 @@ TEST_F(Rfc822TokenizerTest, CommentEscapeTest) { EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "name")))); } -TEST_F(Rfc822TokenizerTest, QuoteEscapeTest) { +TEST(Rfc822TokenizerTest, QuoteEscapeTest) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); // All names that include non-alphanumeric chars must be quoted and have '\\' // and '"' chars escaped. @@ -593,7 +576,7 @@ TEST_F(Rfc822TokenizerTest, QuoteEscapeTest) { EqualsToken(Token::Type::RFC822_HOST_ADDRESS, R"(n\\a\m\"e)")))); } -TEST_F(Rfc822TokenizerTest, UnterminatedComponentTest) { +TEST(Rfc822TokenizerTest, UnterminatedComponentTest) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); EXPECT_THAT( @@ -661,7 +644,7 @@ TEST_F(Rfc822TokenizerTest, UnterminatedComponentTest) { EqualsToken(Token::Type::RFC822_COMMENT, "comment")))); } -TEST_F(Rfc822TokenizerTest, Tokenize) { +TEST(Rfc822TokenizerTest, Tokenize) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = @@ -714,7 +697,7 @@ TEST_F(Rfc822TokenizerTest, Tokenize) { EqualsToken(Token::Type::RFC822_COMMENT, "something")))); } -TEST_F(Rfc822TokenizerTest, EdgeCases) { +TEST(Rfc822TokenizerTest, EdgeCases) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); // Text to trigger the scenario where you have a non-alphabetic followed @@ -776,7 +759,7 @@ TEST_F(Rfc822TokenizerTest, EdgeCases) { EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com")))); } -TEST_F(Rfc822TokenizerTest, NumberInAddress) { +TEST(Rfc822TokenizerTest, NumberInAddress) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = 
"<3alex@google.com>"; EXPECT_THAT( @@ -791,7 +774,7 @@ TEST_F(Rfc822TokenizerTest, NumberInAddress) { EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com")))); } -TEST_F(Rfc822TokenizerTest, DoubleQuoteDoubleSlash) { +TEST(Rfc822TokenizerTest, DoubleQuoteDoubleSlash) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = R"("alex\"")"; EXPECT_THAT( @@ -813,7 +796,7 @@ TEST_F(Rfc822TokenizerTest, DoubleQuoteDoubleSlash) { EqualsToken(Token::Type::RFC822_HOST_ADDRESS, R"(alex\\\a)")))); } -TEST_F(Rfc822TokenizerTest, TwoEmails) { +TEST(Rfc822TokenizerTest, TwoEmails) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "tjbarron@google.com alexsav@google.com"; EXPECT_THAT( @@ -835,7 +818,7 @@ TEST_F(Rfc822TokenizerTest, TwoEmails) { EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com")))); } -TEST_F(Rfc822TokenizerTest, BackSlashes) { +TEST(Rfc822TokenizerTest, BackSlashes) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = R"("\name")"; EXPECT_THAT( @@ -859,7 +842,7 @@ TEST_F(Rfc822TokenizerTest, BackSlashes) { EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "foo\\@gmail")))); } -TEST_F(Rfc822TokenizerTest, BigWhitespace) { +TEST(Rfc822TokenizerTest, BigWhitespace) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "\"quoted\" <address>"; EXPECT_THAT( @@ -872,7 +855,7 @@ TEST_F(Rfc822TokenizerTest, BigWhitespace) { EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "address")))); } -TEST_F(Rfc822TokenizerTest, AtSignFirst) { +TEST(Rfc822TokenizerTest, AtSignFirst) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "\"@foo\""; EXPECT_THAT( @@ -884,7 +867,7 @@ TEST_F(Rfc822TokenizerTest, AtSignFirst) { EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "foo")))); } -TEST_F(Rfc822TokenizerTest, SlashThenUnicode) { +TEST(Rfc822TokenizerTest, SlashThenUnicode) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = 
R"("quoted\你cjk")"; EXPECT_THAT( @@ -897,7 +880,7 @@ TEST_F(Rfc822TokenizerTest, SlashThenUnicode) { EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "quoted\\你cjk")))); } -TEST_F(Rfc822TokenizerTest, AddressEmptyAddress) { +TEST(Rfc822TokenizerTest, AddressEmptyAddress) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "<address> <> Name"; EXPECT_THAT( @@ -910,7 +893,7 @@ TEST_F(Rfc822TokenizerTest, AddressEmptyAddress) { EqualsToken(Token::Type::RFC822_NAME, "Name")))); } -TEST_F(Rfc822TokenizerTest, ProperComment) { +TEST(Rfc822TokenizerTest, ProperComment) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "(comment)alex@google.com"; EXPECT_THAT( @@ -926,7 +909,7 @@ TEST_F(Rfc822TokenizerTest, ProperComment) { EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com")))); } -TEST_F(Rfc822TokenizerTest, SmallNameToEmail) { +TEST(Rfc822TokenizerTest, SmallNameToEmail) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "a@g.c,b@g.c"; EXPECT_THAT(rfc822_tokenizer.TokenizeAll(text), @@ -958,7 +941,7 @@ TEST_F(Rfc822TokenizerTest, SmallNameToEmail) { EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "g.c")))); } -TEST_F(Rfc822TokenizerTest, AtSignLast) { +TEST(Rfc822TokenizerTest, AtSignLast) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string_view text("<alex@>, tim@"); EXPECT_THAT( @@ -974,13 +957,13 @@ TEST_F(Rfc822TokenizerTest, AtSignLast) { EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "tim")))); } -TEST_F(Rfc822TokenizerTest, Commas) { +TEST(Rfc822TokenizerTest, Commas) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = ",,,,,,,,,,,,,,,,,,,,,,,,,,;"; EXPECT_THAT(rfc822_tokenizer.TokenizeAll(text), IsOkAndHolds(IsEmpty())); } -TEST_F(Rfc822TokenizerTest, ResetToTokenStartingAfter) { +TEST(Rfc822TokenizerTest, ResetToTokenStartingAfter) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "a@g.c,b@g.c"; auto token_iterator = @@ -999,7 
+982,7 @@ TEST_F(Rfc822TokenizerTest, ResetToTokenStartingAfter) { ASSERT_FALSE(token_iterator->ResetToTokenStartingAfter(6)); } -TEST_F(Rfc822TokenizerTest, ResetToTokenEndingBefore) { +TEST(Rfc822TokenizerTest, ResetToTokenEndingBefore) { Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer(); std::string text = "a@g.c,b@g.c"; auto token_iterator = diff --git a/icing/util/tokenized-document.cc b/icing/util/tokenized-document.cc index 1c11c3c..004181e 100644 --- a/icing/util/tokenized-document.cc +++ b/icing/util/tokenized-document.cc @@ -20,6 +20,7 @@ #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/proto/document.pb.h" +#include "icing/schema/joinable-property.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" #include "icing/tokenization/language-segmenter.h" @@ -74,6 +75,9 @@ TokenizedDocument::Create(const SchemaStore* schema_store, ICING_ASSIGN_OR_RETURN(SectionGroup section_group, schema_store->ExtractSections(document)); + ICING_ASSIGN_OR_RETURN(JoinablePropertyGroup joinable_property_group, + schema_store->ExtractJoinableProperties(document)); + // Tokenize string sections ICING_ASSIGN_OR_RETURN( std::vector<TokenizedSection> tokenized_string_sections, @@ -82,7 +86,8 @@ TokenizedDocument::Create(const SchemaStore* schema_store, return TokenizedDocument(std::move(document), std::move(tokenized_string_sections), - std::move(section_group.integer_sections)); + std::move(section_group.integer_sections), + std::move(joinable_property_group)); } } // namespace lib diff --git a/icing/util/tokenized-document.h b/icing/util/tokenized-document.h index 5729df2..7cc34e3 100644 --- a/icing/util/tokenized-document.h +++ b/icing/util/tokenized-document.h @@ -21,6 +21,7 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/proto/document.pb.h" +#include "icing/schema/joinable-property.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" #include 
"icing/tokenization/language-segmenter.h" @@ -62,19 +63,27 @@ class TokenizedDocument { return integer_sections_; } + const std::vector<JoinableProperty<std::string_view>>& + qualified_id_join_properties() const { + return joinable_property_group_.qualified_id_properties; + } + private: // Use TokenizedDocument::Create() to instantiate. explicit TokenizedDocument( DocumentProto&& document, std::vector<TokenizedSection>&& tokenized_string_sections, - std::vector<Section<int64_t>>&& integer_sections) + std::vector<Section<int64_t>>&& integer_sections, + JoinablePropertyGroup&& joinable_property_group) : document_(std::move(document)), tokenized_string_sections_(std::move(tokenized_string_sections)), - integer_sections_(std::move(integer_sections)) {} + integer_sections_(std::move(integer_sections)), + joinable_property_group_(std::move(joinable_property_group)) {} DocumentProto document_; std::vector<TokenizedSection> tokenized_string_sections_; std::vector<Section<int64_t>> integer_sections_; + JoinablePropertyGroup joinable_property_group_; }; } // namespace lib diff --git a/icing/util/tokenized-document_test.cc b/icing/util/tokenized-document_test.cc index 3497bef..f2a9214 100644 --- a/icing/util/tokenized-document_test.cc +++ b/icing/util/tokenized-document_test.cc @@ -27,6 +27,7 @@ #include "icing/proto/schema.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema-builder.h" +#include "icing/schema/joinable-property.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" #include "icing/testing/common-matchers.h" @@ -50,19 +51,29 @@ using ::testing::IsEmpty; using ::testing::SizeIs; // schema types -constexpr std::string_view kFakeType = "FakeType"; +static constexpr std::string_view kFakeType = "FakeType"; // Indexable properties and section Id. Section Id is determined by the // lexicographical order of indexable property path. 
-constexpr std::string_view kIndexableIntegerProperty1 = "indexableInteger1"; -constexpr std::string_view kIndexableIntegerProperty2 = "indexableInteger2"; -constexpr std::string_view kStringExactProperty = "stringExact"; -constexpr std::string_view kStringPrefixProperty = "stringPrefix"; - -constexpr SectionId kIndexableInteger1SectionId = 0; -constexpr SectionId kIndexableInteger2SectionId = 1; -constexpr SectionId kStringExactSectionId = 2; -constexpr SectionId kStringPrefixSectionId = 3; +static constexpr std::string_view kIndexableIntegerProperty1 = + "indexableInteger1"; +static constexpr std::string_view kIndexableIntegerProperty2 = + "indexableInteger2"; +static constexpr std::string_view kStringExactProperty = "stringExact"; +static constexpr std::string_view kStringPrefixProperty = "stringPrefix"; + +static constexpr SectionId kIndexableInteger1SectionId = 0; +static constexpr SectionId kIndexableInteger2SectionId = 1; +static constexpr SectionId kStringExactSectionId = 2; +static constexpr SectionId kStringPrefixSectionId = 3; + +// Joinable properties and joinable property id. Joinable property id is +// determined by the lexicographical order of joinable property path. +static constexpr std::string_view kQualifiedId1 = "qualifiedId1"; +static constexpr std::string_view kQualifiedId2 = "qualifiedId2"; + +static constexpr JoinablePropertyId kQualifiedId1JoinablePropertyId = 0; +static constexpr JoinablePropertyId kQualifiedId2JoinablePropertyId = 1; const SectionMetadata kIndexableInteger1SectionMetadata( kIndexableInteger1SectionId, TYPE_INT64, TOKENIZER_NONE, TERM_MATCH_UNKNOWN, @@ -80,7 +91,15 @@ const SectionMetadata kStringPrefixSectionMetadata( kStringPrefixSectionId, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_PREFIX, NUMERIC_MATCH_UNKNOWN, std::string(kStringPrefixProperty)); -// Other non-indexable properties. 
+const JoinablePropertyMetadata kQualifiedId1JoinablePropertyMetadata( + kQualifiedId1JoinablePropertyId, TYPE_STRING, + JOINABLE_VALUE_TYPE_QUALIFIED_ID, std::string(kQualifiedId1)); + +const JoinablePropertyMetadata kQualifiedId2JoinablePropertyMetadata( + kQualifiedId2JoinablePropertyId, TYPE_STRING, + JOINABLE_VALUE_TYPE_QUALIFIED_ID, std::string(kQualifiedId2)); + +// Other non-indexable/joinable properties. constexpr std::string_view kUnindexedStringProperty = "unindexedString"; constexpr std::string_view kUnindexedIntegerProperty = "unindexedInteger"; @@ -137,6 +156,16 @@ class TokenizedDocumentTest : public ::testing::Test { .SetName(kStringPrefixProperty) .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kQualifiedId1) + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kQualifiedId2) + .SetDataTypeJoinableString( + JOINABLE_VALUE_TYPE_QUALIFIED_ID) .SetCardinality(CARDINALITY_OPTIONAL))) .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); @@ -177,6 +206,8 @@ TEST_F(TokenizedDocumentTest, CreateAll) { .AddInt64Property(std::string(kUnindexedIntegerProperty), 789) .AddInt64Property(std::string(kIndexableIntegerProperty1), 1, 2, 3) .AddInt64Property(std::string(kIndexableIntegerProperty2), 456) + .AddStringProperty(std::string(kQualifiedId1), "pkg$db/ns#uri1") + .AddStringProperty(std::string(kQualifiedId2), "pkg$db/ns#uri2") .Build(); ICING_ASSERT_OK_AND_ASSIGN( @@ -210,6 +241,17 @@ TEST_F(TokenizedDocumentTest, CreateAll) { Eq(kIndexableInteger2SectionMetadata)); EXPECT_THAT(tokenized_document.integer_sections().at(1).content, ElementsAre(456)); + + // Qualified id join properties + EXPECT_THAT(tokenized_document.qualified_id_join_properties(), SizeIs(2)); + EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(0).metadata, + 
Eq(kQualifiedId1JoinablePropertyMetadata)); + EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(0).values, + ElementsAre("pkg$db/ns#uri1")); + EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(1).metadata, + Eq(kQualifiedId2JoinablePropertyMetadata)); + EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(1).values, + ElementsAre("pkg$db/ns#uri2")); } TEST_F(TokenizedDocumentTest, CreateNoIndexableIntegerProperties) { @@ -233,6 +275,9 @@ TEST_F(TokenizedDocumentTest, CreateNoIndexableIntegerProperties) { // integer sections EXPECT_THAT(tokenized_document.integer_sections(), IsEmpty()); + + // Qualified id join properties + EXPECT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty()); } TEST_F(TokenizedDocumentTest, CreateMultipleIndexableIntegerProperties) { @@ -266,6 +311,9 @@ TEST_F(TokenizedDocumentTest, CreateMultipleIndexableIntegerProperties) { Eq(kIndexableInteger2SectionMetadata)); EXPECT_THAT(tokenized_document.integer_sections().at(1).content, ElementsAre(456)); + + // Qualified id join properties + EXPECT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty()); } TEST_F(TokenizedDocumentTest, CreateNoIndexableStringProperties) { @@ -290,6 +338,9 @@ TEST_F(TokenizedDocumentTest, CreateNoIndexableStringProperties) { // integer sections EXPECT_THAT(tokenized_document.integer_sections(), IsEmpty()); + + // Qualified id join properties + EXPECT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty()); } TEST_F(TokenizedDocumentTest, CreateMultipleIndexableStringProperties) { @@ -327,6 +378,73 @@ TEST_F(TokenizedDocumentTest, CreateMultipleIndexableStringProperties) { // integer sections EXPECT_THAT(tokenized_document.integer_sections(), IsEmpty()); + + // Qualified id join properties + EXPECT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty()); +} + +TEST_F(TokenizedDocumentTest, CreateNoJoinQualifiedIdProperties) { + DocumentProto document = + DocumentBuilder() + 
.SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kUnindexedStringProperty), + "hello world unindexed") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + + EXPECT_THAT(tokenized_document.document(), EqualsProto(document)); + EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(0)); + + // string sections + EXPECT_THAT(tokenized_document.tokenized_string_sections(), IsEmpty()); + + // integer sections + EXPECT_THAT(tokenized_document.integer_sections(), IsEmpty()); + + // Qualified id join properties + EXPECT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty()); +} + +TEST_F(TokenizedDocumentTest, CreateMultipleJoinQualifiedIdProperties) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kUnindexedStringProperty), + "hello world unindexed") + .AddStringProperty(std::string(kQualifiedId1), "pkg$db/ns#uri1") + .AddStringProperty(std::string(kQualifiedId2), "pkg$db/ns#uri2") + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + + EXPECT_THAT(tokenized_document.document(), EqualsProto(document)); + EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(0)); + + // string sections + EXPECT_THAT(tokenized_document.tokenized_string_sections(), IsEmpty()); + + // integer sections + EXPECT_THAT(tokenized_document.integer_sections(), IsEmpty()); + + // Qualified id join properties + EXPECT_THAT(tokenized_document.qualified_id_join_properties(), SizeIs(2)); + EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(0).metadata, + Eq(kQualifiedId1JoinablePropertyMetadata)); + EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(0).values, + 
ElementsAre("pkg$db/ns#uri1")); + EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(1).metadata, + Eq(kQualifiedId2JoinablePropertyMetadata)); + EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(1).values, + ElementsAre("pkg$db/ns#uri2")); } } // namespace diff --git a/proto/icing/index/numeric/wildcard-property-storage.proto b/proto/icing/index/numeric/wildcard-property-storage.proto new file mode 100644 index 0000000..7f02b77 --- /dev/null +++ b/proto/icing/index/numeric/wildcard-property-storage.proto @@ -0,0 +1,22 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package icing.lib; + +// Next tag: 2 +message WildcardPropertyStorage { + repeated string property_entries = 1; +} diff --git a/proto/icing/proto/initialize.proto b/proto/icing/proto/initialize.proto index 7fe1e6f..40a0d0c 100644 --- a/proto/icing/proto/initialize.proto +++ b/proto/icing/proto/initialize.proto @@ -23,7 +23,7 @@ option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; option objc_class_prefix = "ICNG"; -// Next tag: 5 +// Next tag: 7 message IcingSearchEngineOptions { // Directory to persist files for Icing. Required. // If Icing was previously initialized with this directory, it will reload @@ -58,6 +58,23 @@ message IcingSearchEngineOptions { // Optional. 
optional int32 index_merge_size = 4 [default = 1048576]; // 1 MiB + // Whether to use namespace id or namespace name to build up fingerprint for + // document_key_mapper_ and corpus_mapper_ in document store. + // TODO(b/259969017) Flip the default value of this flag to true at the time + // when we switch to use persistent hash map for document_key_mapper_ so that + // we just need one reconstruction of the internal mappers. + optional bool document_store_namespace_id_fingerprint = 5; + + // The threshold of the percentage of invalid documents to rebuild index + // during optimize, i.e. we rebuild index if and only if + // |invalid_documents| / |all_documents| >= optimize_rebuild_index_threshold + // + // Rebuilding the index could be faster than optimizing the index if we have + // removed most of the documents. + // Based on benchmarks, 85%~95% seems to be a good threshold for most cases. + // + // Default to 0 for better rollout of the new index optimize. + optional float optimize_rebuild_index_threshold = 6 [default = 0.0]; reserved 2; } diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto index feb2643..edfcf40 100644 --- a/proto/icing/proto/logging.proto +++ b/proto/icing/proto/logging.proto @@ -23,7 +23,7 @@ option java_multiple_files = true; option objc_class_prefix = "ICNG"; // Stats of the top-level function IcingSearchEngine::Initialize(). -// Next tag: 13 +// Next tag: 14 message InitializeStatsProto { // Overall time used for the function call. optional int32 latency_ms = 1; @@ -105,6 +105,12 @@ message InitializeStatsProto { // - SCHEMA_CHANGES_OUT_OF_SYNC // - IO_ERROR optional RecoveryCause integer_index_restoration_cause = 12; + + // Possible recovery causes for qualified id join index: + // - INCONSISTENT_WITH_GROUND_TRUTH + // - SCHEMA_CHANGES_OUT_OF_SYNC + // - IO_ERROR + optional RecoveryCause qualified_id_join_index_restoration_cause = 13; } // Stats of the top-level function IcingSearchEngine::Put(). 
diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto index c9e2b1d..8bdbf0c 100644 --- a/proto/icing/proto/search.proto +++ b/proto/icing/proto/search.proto @@ -85,7 +85,8 @@ message SearchSpecProto { // enable testing. // TODO(b/208654892) Remove this field once EXPERIMENTAL_ICING_ADVANCED_QUERY // is fully supported. - optional SearchType.Code search_type = 6 [default = ICING_RAW_QUERY]; + optional SearchType.Code search_type = 6 + [default = EXPERIMENTAL_ICING_ADVANCED_QUERY]; // OPTIONAL: If this field is present, join documents based on a nested // SearchSpec. diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index 232fbe0..5ff4997 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=-514555603) +set(synced_AOSP_CL_number=516534290) |