diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2021-08-22 10:17:49 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2021-08-22 10:17:49 +0000 |
commit | 20437efd05ffb505b36624f092e3e2d6aa834ed7 (patch) | |
tree | 55370a66f2896116815c92e4d897336cca30ea5e | |
parent | bbbb1f6b786dd46354a81bb88710ab8120240043 (diff) | |
parent | 14ee9a8eb8f3ed47f68117208626045878c943ac (diff) | |
download | icing-androidx-wear-wear-phone-interactions-release.tar.gz |
Snap for 7663505 from 14ee9a8eb8f3ed47f68117208626045878c943ac to androidx-wear-wear-phone-interactions-releaseandroidx-wear-wear-phone-interactions-release
Change-Id: I91670a47e2493c2712a3a4f9bd7a9f9a6e3d1ddc
158 files changed, 18201 insertions, 6773 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index a740924..01ee8eb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,9 @@ cmake_minimum_required(VERSION 3.10.2) add_definitions("-DICING_REVERSE_JNI_SEGMENTATION=1") +set(VERSION_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/icing/jni.lds") +set(CMAKE_SHARED_LINKER_FLAGS + "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gc-sections -Wl,--version-script=${VERSION_SCRIPT}") set( Protobuf_PREBUILTS_DIR @@ -45,7 +48,7 @@ add_subdirectory("${Protobuf_SOURCE_DIR}/cmake" ${Protobuf_TARGET_BINARY_DIR}) # Compile libandroidicu set(ICU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../icu/libandroidicu") set(ICU_TARGET_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/icu-target") -add_subdirectory(${ICU_SOURCE_DIR} ${ICU_TARGET_BINARY_DIR}) +add_subdirectory("${ICU_SOURCE_DIR}/static_shim" ${ICU_TARGET_BINARY_DIR}) # Glob Icing proto sources. Results look like this: icing/proto/document.proto file( diff --git a/icing/file/destructible-file.h b/icing/file/destructible-file.h new file mode 100644 index 0000000..006dcb4 --- /dev/null +++ b/icing/file/destructible-file.h @@ -0,0 +1,72 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_FILE_DESTRUCTIBLE_FILE_H_ +#define ICING_FILE_DESTRUCTIBLE_FILE_H_ + +#include <unistd.h> + +#include <string> + +#include "icing/file/filesystem.h" +#include "icing/util/logging.h" + +namespace icing { +namespace lib { + +// A convenient RAII class which will open the specified file path for write and +// delete the underlying file upon destruction. +class DestructibleFile { + public: + explicit DestructibleFile(const std::string& filepath, + const Filesystem* filesystem) + : filesystem_(filesystem), filepath_(filepath) { + fd_ = filesystem_->OpenForWrite(filepath_.c_str()); + } + + DestructibleFile(const DestructibleFile&) = delete; + DestructibleFile(DestructibleFile&& other) : filesystem_(nullptr), fd_(-1) { + *this = std::move(other); + } + + DestructibleFile& operator=(const DestructibleFile&) = delete; + DestructibleFile& operator=(DestructibleFile&& other) { + std::swap(fd_, other.fd_); + std::swap(filesystem_, other.filesystem_); + std::swap(filepath_, other.filepath_); + return *this; + } + + ~DestructibleFile() { + if (is_valid()) { + close(fd_); + if (!filesystem_->DeleteFile(filepath_.c_str())) { + ICING_VLOG(1) << "Failed to delete file " << filepath_; + } + } + } + + bool is_valid() const { return fd_ >= 0; } + int get_fd() const { return fd_; } + + private: + const Filesystem* filesystem_; + std::string filepath_; + int fd_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_FILE_DESTRUCTIBLE_FILE_H_ diff --git a/icing/file/destructible-file_test.cc b/icing/file/destructible-file_test.cc new file mode 100644 index 0000000..61316d1 --- /dev/null +++ b/icing/file/destructible-file_test.cc @@ -0,0 +1,117 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/file/destructible-file.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/file/filesystem.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +TEST(DestructibleFileTest, DeletesFileProperly) { + Filesystem filesystem; + std::string filepath1 = GetTestTempDir() + "/file1"; + + { + // 1. Create the file + ScopedFd sfd(filesystem.OpenForWrite(filepath1.c_str())); + ASSERT_TRUE(sfd.is_valid()); + int i = 127; + ASSERT_TRUE(filesystem.Write(sfd.get(), &i, sizeof(i))); + } + + { + // 2. Open with a Destructible file. + DestructibleFile destructible(filepath1, &filesystem); + ASSERT_TRUE(destructible.is_valid()); + } + + // 3. Ensure that the file doesn't exist. + EXPECT_FALSE(filesystem.FileExists(filepath1.c_str())); +} + +TEST(DestructibleFileTest, MoveAssignDeletesFileProperly) { + Filesystem filesystem; + std::string filepath1 = GetTestTempDir() + "/file1"; + std::string filepath2 = GetTestTempDir() + "/file2"; + + // 1. Create file1 + DestructibleFile destructible1(filepath1, &filesystem); + ASSERT_TRUE(destructible1.is_valid()); + int i = 127; + ASSERT_TRUE(filesystem.Write(destructible1.get_fd(), &i, sizeof(i))); + + { + // 2. Create file2 + DestructibleFile destructible2(filepath2, &filesystem); + ASSERT_TRUE(destructible2.is_valid()); + i = 458; + ASSERT_TRUE(filesystem.Write(destructible2.get_fd(), &i, sizeof(i))); + + // Move assign destructible2 into destructible1 + destructible1 = std::move(destructible2); + } + + // 3. 
file1 shouldn't exist because it was destroyed when destructible1 was + // move assigned to. + EXPECT_FALSE(filesystem.FileExists(filepath1.c_str())); + + // 4. file2 should still exist because it moved into destructible1 from + // destructible2. + EXPECT_TRUE(filesystem.FileExists(filepath2.c_str())); +} + +TEST(DestructibleFileTest, MoveConstructionDeletesFileProperly) { + Filesystem filesystem; + std::string filepath1 = GetTestTempDir() + "/file1"; + + // 1. Create destructible1, it'll be reconstructed soon anyways. + std::unique_ptr<DestructibleFile> destructible1; + { + // 2. Create file1 + DestructibleFile destructible2(filepath1, &filesystem); + ASSERT_TRUE(destructible2.is_valid()); + int i = 458; + ASSERT_TRUE(filesystem.Write(destructible2.get_fd(), &i, sizeof(i))); + + // Move construct destructible1 from destructible2 + destructible1 = + std::make_unique<DestructibleFile>(std::move(destructible2)); + } + + // 3. file1 should still exist because it moved into destructible1 from + // destructible2. + ASSERT_TRUE(destructible1->is_valid()); + EXPECT_TRUE(filesystem.FileExists(filepath1.c_str())); + + { + // 4. Move construct destructible3 from destructible1 + DestructibleFile destructible3(std::move(*destructible1)); + ASSERT_TRUE(destructible3.is_valid()); + } + + // 5. file1 shouldn't exist because it was destroyed when destructible3 was + // destroyed. 
+ EXPECT_FALSE(filesystem.FileExists(filepath1.c_str())); +} + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h index 763c93b..b2b37e8 100644 --- a/icing/file/file-backed-proto-log.h +++ b/icing/file/file-backed-proto-log.h @@ -70,6 +70,7 @@ #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" #include "icing/legacy/core/icing-string-util.h" +#include "icing/portable/platform.h" #include "icing/portable/zlib.h" #include "icing/util/crc32.h" #include "icing/util/data-loss.h" @@ -79,23 +80,6 @@ namespace icing { namespace lib { -namespace { - -bool IsEmptyBuffer(const char* buffer, int size) { - return std::all_of(buffer, buffer + size, - [](const char byte) { return byte == 0; }); -} - -// Helper function to get stored proto size from the metadata. -// Metadata format: 8 bits magic + 24 bits size -int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; } - -// Helper function to get stored proto magic from the metadata. -// Metadata format: 8 bits magic + 24 bits size -uint8_t GetProtoMagic(int metadata) { return metadata >> 24; } - -} // namespace - template <typename ProtoT> class FileBackedProtoLog { public: @@ -401,6 +385,28 @@ class FileBackedProtoLog { const Filesystem* filesystem, const std::string& file_path, Crc32 initial_crc, int64_t start, int64_t end); + static bool IsEmptyBuffer(const char* buffer, int size) { + return std::all_of(buffer, buffer + size, + [](const char byte) { return byte == 0; }); + } + + // Helper function to get stored proto size from the metadata. + // Metadata format: 8 bits magic + 24 bits size + static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; } + + // Helper function to get stored proto magic from the metadata. 
+ // Metadata format: 8 bits magic + 24 bits size + static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; } + + // Reads out the metadata of a proto located at file_offset from the file. + // + // Returns: + // Proto's metadata on success + // OUT_OF_RANGE_ERROR if file_offset exceeds file_size + // INTERNAL_ERROR if the metadata is invalid or any IO errors happen + static libtextclassifier3::StatusOr<int> ReadProtoMetadata( + MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size); + // Magic number added in front of every proto. Used when reading out protos // as a first check for corruption in each entry in the file. Even if there is // a corruption, the best we can do is roll back to our last recovery point @@ -422,20 +428,12 @@ class FileBackedProtoLog { static constexpr int kDeflateCompressionLevel = 3; // Chunks of the file to mmap at a time, so we don't mmap the entire file. - static constexpr int kMmapChunkSize = 4 * 1024; + // Only used on 32-bit devices + static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB ScopedFd fd_; const Filesystem* const filesystem_; const std::string file_path_; - - // Reads out the metadata of a proto located at file_offset from the file. - // - // Returns: - // Proto's metadata on success - // OUT_OF_RANGE_ERROR if file_offset exceeds file_size - // INTERNAL_ERROR if the metadata is invalid or any IO errors happen - static libtextclassifier3::StatusOr<int> ReadProtoMetadata( - MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size); std::unique_ptr<Header> header_; }; @@ -571,6 +569,7 @@ FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem, ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum, ComputeChecksum(filesystem, file_path, Crc32(), sizeof(Header), file_size)); + // Double check that the log checksum is the same as the one that was // persisted last time. If not, we start recovery logic. 
if (header->log_checksum != calculated_log_checksum.Get()) { @@ -631,6 +630,14 @@ libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum( file_path.c_str(), static_cast<long long>(start))); } + if (end < start) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Ending checksum offset of file '%s' must be greater than start " + "'%lld', was '%lld'", + file_path.c_str(), static_cast<long long>(start), + static_cast<long long>(end))); + } + int64_t file_size = filesystem->GetFileSize(file_path.c_str()); if (end > file_size) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( @@ -640,17 +647,41 @@ libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum( static_cast<long long>(end))); } - for (int i = start; i < end; i += kMmapChunkSize) { - // Don't read past the file size. - int next_chunk_size = kMmapChunkSize; - if ((i + kMmapChunkSize) >= end) { - next_chunk_size = end - i; + Architecture architecture = GetArchitecture(); + switch (architecture) { + case Architecture::BIT_64: { + // Don't mmap in chunks here since mmapping can be harmful on 64-bit + // devices where mmap/munmap calls need the mmap write semaphore, which + // blocks mmap/munmap/mprotect and all page faults from executing while + // they run. On 64-bit devices, this doesn't actually load into memory, it + // just makes the file faultable. So the whole file should be ok. + // b/185822878. + ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start)); + auto mmap_str = std::string_view(mmapped_file.region(), end - start); + new_crc.Append(mmap_str); + break; + } + case Architecture::BIT_32: + [[fallthrough]]; + case Architecture::UNKNOWN: { + // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too + // much memory at once. If we're unknown, then also chunk it because we're + // not sure what the device can handle. 
+ for (int i = start; i < end; i += kMmapChunkSize) { + // Don't read past the file size. + int next_chunk_size = kMmapChunkSize; + if ((i + kMmapChunkSize) >= end) { + next_chunk_size = end - i; + } + + ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size)); + + auto mmap_str = + std::string_view(mmapped_file.region(), next_chunk_size); + new_crc.Append(mmap_str); + } + break; } - - ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size)); - - auto mmap_str = std::string_view(mmapped_file.region(), next_chunk_size); - new_crc.Append(mmap_str); } return new_crc; @@ -670,7 +701,8 @@ libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::WriteProto( static_cast<long long>(proto_size), header_->max_proto_size)); } - // At this point, we've guaranteed that proto_size is under kMaxProtoSize (see + // At this point, we've guaranteed that proto_size is under kMaxProtoSize + // (see // ::Create), so we can safely store it in an int. int final_size = 0; @@ -735,8 +767,8 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto( MemoryMappedFile mmapped_file(*filesystem_, file_path_, MemoryMappedFile::Strategy::READ_ONLY); if (file_offset >= file_size) { - // file_size points to the next byte to write at, so subtract one to get the - // inclusive, actual size of file. + // file_size points to the next byte to write at, so subtract one to get + // the inclusive, actual size of file. return absl_ports::OutOfRangeError( IcingStringUtil::StringPrintf("Trying to read from a location, %lld, " "out of range of the file size, %lld", @@ -778,8 +810,8 @@ libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto( int64_t file_offset) { int64_t file_size = filesystem_->GetFileSize(fd_.get()); if (file_offset >= file_size) { - // file_size points to the next byte to write at, so subtract one to get the - // inclusive, actual size of file. 
+ // file_size points to the next byte to write at, so subtract one to get + // the inclusive, actual size of file. return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( "Trying to erase data at a location, %lld, " "out of range of the file size, %lld", @@ -798,12 +830,12 @@ libtextclassifier3::Status FileBackedProtoLog<ProtoT>::EraseProto( ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata), GetProtoSize(metadata))); - // We need to update the crc checksum if the erased area is before the rewind - // position. + // We need to update the crc checksum if the erased area is before the + // rewind position. if (file_offset + sizeof(metadata) < header_->rewind_offset) { // We need to calculate [original string xor 0s]. - // The xored string is the same as the original string because 0 xor 0 = 0, - // 1 xor 0 = 1. + // The xored string is the same as the original string because 0 xor 0 = + // 0, 1 xor 0 = 1. const std::string_view xored_str(mmapped_file.region(), mmapped_file.region_size()); @@ -896,7 +928,8 @@ int64_t FileBackedProtoLog<ProtoT>::Iterator::GetOffset() { template <typename ProtoT> typename FileBackedProtoLog<ProtoT>::Iterator FileBackedProtoLog<ProtoT>::GetIterator() { - return Iterator(*filesystem_, file_path_, /*initial_offset=*/sizeof(Header)); + return Iterator(*filesystem_, file_path_, + /*initial_offset=*/sizeof(Header)); } template <typename ProtoT> @@ -959,7 +992,8 @@ libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() { header_->header_checksum = header_->CalculateHeaderChecksum(); if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(), - sizeof(Header))) { + sizeof(Header)) || + !filesystem_->DataSync(fd_.get())) { return absl_ports::InternalError( absl_ports::StrCat("Failed to update header to: ", file_path_)); } diff --git a/icing/file/file-backed-proto-log_benchmark.cc b/icing/file/file-backed-proto-log_benchmark.cc index 26e0fb0..c09fd5a 100644 --- 
a/icing/file/file-backed-proto-log_benchmark.cc +++ b/icing/file/file-backed-proto-log_benchmark.cc @@ -164,6 +164,88 @@ BENCHMARK(BM_Read) // 16MiB, and we need some extra space for the // rest of the document properties +static void BM_Erase(benchmark::State& state) { + const Filesystem filesystem; + const std::string file_path = IcingStringUtil::StringPrintf( + "%s%s", GetTestTempDir().c_str(), "/proto.log"); + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. + filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = + FileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + std::default_random_engine random; + const std::string rand_str = RandomString(kAlNumAlphabet, /*len=*/1, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + for (auto _ : state) { + state.PauseTiming(); + ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset, + proto_log->WriteProto(document)); + state.ResumeTiming(); + + testing::DoNotOptimize(proto_log->EraseProto(write_offset)); + } + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_Erase); + +static void BM_ComputeChecksum(benchmark::State& state) { + const Filesystem filesystem; + const std::string file_path = GetTestTempDir() + "/proto.log"; + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. 
+ filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = + FileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + // Make each document 1KiB + int string_length = 1024; + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + int num_docs = state.range(0); + for (int i = 0; i < num_docs; ++i) { + ICING_ASSERT_OK(proto_log->WriteProto(document)); + } + + for (auto _ : state) { + testing::DoNotOptimize(proto_log->ComputeChecksum()); + } + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20); + } // namespace } // namespace lib } // namespace icing diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h index 3ecef54..0989935 100644 --- a/icing/file/file-backed-vector.h +++ b/icing/file/file-backed-vector.h @@ -56,6 +56,7 @@ #ifndef ICING_FILE_FILE_BACKED_VECTOR_H_ #define ICING_FILE_FILE_BACKED_VECTOR_H_ +#include <inttypes.h> #include <stdint.h> #include <sys/mman.h> @@ -175,7 +176,27 @@ class FileBackedVector { // synced by the system and the checksum will be updated. ~FileBackedVector(); - // Accesses the element at idx. + // Gets a copy of the element at idx. + // + // This is useful if you think the FileBackedVector may grow before you need + // to access this return value. When the FileBackedVector grows, the + // underlying mmap will be unmapped and remapped, which will invalidate any + // pointers to the previously mapped region. Getting a copy will avoid + // referencing the now-invalidated region. 
+ // + // Returns: + // OUT_OF_RANGE_ERROR if idx < 0 or > num_elements() + libtextclassifier3::StatusOr<T> GetCopy(int32_t idx) const; + + // Gets a pointer to the element at idx. + // + // WARNING: Subsequent calls to Set may invalidate the pointer returned by + // Get. + // + // This is useful if you do not think the FileBackedVector will grow before + // you need to reference this value, and you want to avoid a copy. When the + // FileBackedVector grows, the underlying mmap will be unmapped and remapped, + // which will invalidate this pointer to the previously mapped region. // // Returns: // OUT_OF_RANGE_ERROR if idx < 0 or > num_elements() @@ -183,6 +204,10 @@ class FileBackedVector { // Writes the value at idx. // + // May grow the underlying file and mmapped region as needed to fit the new + // value. If it does grow, then any pointers to previous values returned + // from Get() may be invalidated. + // // Returns: // OUT_OF_RANGE_ERROR if idx < 0 or file cannot be grown idx size libtextclassifier3::Status Set(int32_t idx, const T& value); @@ -399,13 +424,6 @@ FileBackedVector<T>::InitializeExistingFile( absl_ports::StrCat("Invalid header kMagic for ", file_path)); } - // Mmap the content of the vector, excluding the header so its easier to - // access elements from the mmapped region - auto mmapped_file = - std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy); - ICING_RETURN_IF_ERROR( - mmapped_file->Remap(sizeof(Header), file_size - sizeof(Header))); - // Check header if (header->header_checksum != header->CalculateHeaderChecksum()) { return absl_ports::FailedPreconditionError( @@ -418,6 +436,20 @@ FileBackedVector<T>::InitializeExistingFile( header->element_size)); } + int64_t min_file_size = header->num_elements * sizeof(T) + sizeof(Header); + if (min_file_size > file_size) { + return absl_ports::InternalError(IcingStringUtil::StringPrintf( + "Inconsistent file size, expected %" PRId64 ", actual %" PRId64, + min_file_size, 
file_size)); + } + + // Mmap the content of the vector, excluding the header so its easier to + // access elements from the mmapped region + auto mmapped_file = + std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy); + ICING_RETURN_IF_ERROR( + mmapped_file->Remap(sizeof(Header), file_size - sizeof(Header))); + // Check vector contents Crc32 vector_checksum; std::string_view vector_contents( @@ -468,6 +500,13 @@ FileBackedVector<T>::~FileBackedVector() { } template <typename T> +libtextclassifier3::StatusOr<T> FileBackedVector<T>::GetCopy( + int32_t idx) const { + ICING_ASSIGN_OR_RETURN(const T* value, Get(idx)); + return *value; +} + +template <typename T> libtextclassifier3::StatusOr<const T*> FileBackedVector<T>::Get( int32_t idx) const { if (idx < 0) { @@ -492,8 +531,6 @@ libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx, IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx)); } - int32_t start_byte = idx * sizeof(T); - ICING_RETURN_IF_ERROR(GrowIfNecessary(idx + 1)); if (idx + 1 > header_->num_elements) { @@ -518,6 +555,8 @@ libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx, changes_end_ = 0; header_->vector_checksum = 0; } else { + int32_t start_byte = idx * sizeof(T); + changes_.push_back(idx); saved_original_buffer_.append( reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte, @@ -560,9 +599,24 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary( least_file_size_needed = math_util::RoundUpTo( least_file_size_needed, int64_t{FileBackedVector<T>::kGrowElements * sizeof(T)}); - if (!filesystem_->Grow(file_path_.c_str(), least_file_size_needed)) { - return absl_ports::InternalError( - absl_ports::StrCat("Couldn't grow file ", file_path_)); + + // We use PWrite here rather than Grow because Grow doesn't actually allocate + // an underlying disk block. 
This can lead to problems with mmap because mmap + // has no effective way to signal that it was impossible to allocate the disk + // block and ends up crashing instead. PWrite will force the allocation of + // these blocks, which will ensure that any failure to grow will surface here. + int64_t page_size = getpagesize(); + auto buf = std::make_unique<uint8_t[]>(page_size); + int64_t size_to_write = page_size - (current_file_size % page_size); + ScopedFd sfd(filesystem_->OpenForWrite(file_path_.c_str())); + while (current_file_size < least_file_size_needed) { + if (!filesystem_->PWrite(sfd.get(), current_file_size, buf.get(), + size_to_write)) { + return absl_ports::InternalError( + absl_ports::StrCat("Couldn't grow file ", file_path_)); + } + current_file_size += size_to_write; + size_to_write = page_size - (current_file_size % page_size); } ICING_RETURN_IF_ERROR(mmapped_file_->Remap( diff --git a/icing/file/file-backed-vector_test.cc b/icing/file/file-backed-vector_test.cc index bc2fef6..b05ce2d 100644 --- a/icing/file/file-backed-vector_test.cc +++ b/icing/file/file-backed-vector_test.cc @@ -32,6 +32,7 @@ #include "icing/util/logging.h" using ::testing::Eq; +using ::testing::IsTrue; using ::testing::Pointee; namespace icing { @@ -278,7 +279,6 @@ TEST_F(FileBackedVectorTest, Grow) { filesystem_, file_path_, MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0))); - EXPECT_THAT(vector->Set(kMaxNumElts + 11, 'a'), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); EXPECT_THAT(vector->Set(-1, 'a'), @@ -318,25 +318,32 @@ TEST_F(FileBackedVectorTest, GrowsInChunks) { filesystem_, file_path_, MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); - // Our initial file size should just be the size of the header - EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), - Eq(sizeof(FileBackedVector<char>::Header))); + // Our initial file size should just be the size of the header. 
Disk usage + // will indicate that one block has been allocated, which contains the header. + int header_size = sizeof(FileBackedVector<char>::Header); + int page_size = getpagesize(); + EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(header_size)); + EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(page_size)); - // Once we add something though, we'll grow to kGrowElements big + // Once we add something though, we'll grow to be kGrowElements big. From this + // point on, file size and disk usage should be the same because Growing will + // explicitly allocate the number of blocks needed to accomodate the file. Insert(vector.get(), 0, "a"); - EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), - Eq(kGrowElements * sizeof(int))); + int file_size = kGrowElements * sizeof(int); + EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size)); + EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size)); // Should still be the same size, don't need to grow underlying file Insert(vector.get(), 1, "b"); - EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), - Eq(kGrowElements * sizeof(int))); + EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size)); + EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size)); // Now we grow by a kGrowElements chunk, so the underlying file is 2 // kGrowElements big + file_size *= 2; Insert(vector.get(), 2, std::string(kGrowElements, 'c')); - EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), - Eq(kGrowElements * 2 * sizeof(int))); + EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size)); + EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size)); // Destroy/persist the contents. vector.reset(); @@ -463,6 +470,174 @@ TEST_F(FileBackedVectorTest, TruncateAndReReadFile) { } } +TEST_F(FileBackedVectorTest, InitFileTooSmallForHeaderFails) { + { + // 1. Create a vector with a few elements. 
+ ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + // 2. Shrink the file to be smaller than the header. + filesystem_.Truncate(fd_, sizeof(FileBackedVector<char>::Header) - 1); + + { + // 3. Attempt to create the file and confirm that it fails. + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + } +} + +TEST_F(FileBackedVectorTest, InitWrongDataSizeFails) { + { + // 1. Create a vector with a few elements. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + { + // 2. Attempt to create the file with a different element size and confirm + // that it fails. + EXPECT_THAT(FileBackedVector<int>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + } +} + +TEST_F(FileBackedVectorTest, InitCorruptHeaderFails) { + { + // 1. Create a vector with a few elements. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + // 2. Modify the header, but don't update the checksum. This would be similar + // to corruption of the header. 
+ FileBackedVector<char>::Header header; + ASSERT_THAT(filesystem_.PRead(fd_, &header, sizeof(header), /*offset=*/0), + IsTrue()); + header.num_elements = 1; + ASSERT_THAT(filesystem_.PWrite(fd_, /*offset=*/0, &header, sizeof(header)), + IsTrue()); + + { + // 3. Attempt to create the file with a header that doesn't match its + // checksum and confirm that it fails. + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + } +} + +TEST_F(FileBackedVectorTest, InitHeaderElementSizeTooBigFails) { + { + // 1. Create a vector with a few elements. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + // 2. Modify the header so that the number of elements exceeds the actual size + // of the underlying file. + FileBackedVector<char>::Header header; + ASSERT_THAT(filesystem_.PRead(fd_, &header, sizeof(header), /*offset=*/0), + IsTrue()); + int64_t file_size = filesystem_.GetFileSize(fd_); + int64_t allocated_elements_size = file_size - sizeof(header); + header.num_elements = (allocated_elements_size / sizeof(char)) + 1; + header.header_checksum = header.CalculateHeaderChecksum(); + ASSERT_THAT(filesystem_.PWrite(fd_, /*offset=*/0, &header, sizeof(header)), + IsTrue()); + + { + // 3. Attempt to create the file with num_elements that is larger than the + // underlying file and confirm that it fails. + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + } +} + +TEST_F(FileBackedVectorTest, InitCorruptElementsFails) { + { + // 1. 
Create a vector with a few elements. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + // 2. Overwrite the values of the first two elements. + std::string corrupted_content = "BY"; + ASSERT_THAT( + filesystem_.PWrite(fd_, /*offset=*/sizeof(FileBackedVector<char>::Header), + corrupted_content.c_str(), corrupted_content.length()), + IsTrue()); + + { + // 3. Attempt to create the file with elements that don't match their + // checksum and confirm that it fails. + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + } +} + +TEST_F(FileBackedVectorTest, InitNormalSucceeds) { + { + // 1. Create a vector with a few elements. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + { + // 2. Attempt to create the file with a completely valid header and elements + // region. This should succeed. 
+ EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + IsOk()); + } +} + } // namespace } // namespace lib diff --git a/icing/file/filesystem.cc b/icing/file/filesystem.cc index 6a596f5..0655cb9 100644 --- a/icing/file/filesystem.cc +++ b/icing/file/filesystem.cc @@ -466,7 +466,13 @@ bool Filesystem::Write(const char* filename, const void* data, bool Filesystem::CopyFile(const char* src, const char* dst) const { ScopedFd src_fd(OpenForRead(src)); + + std::string dir = GetDirname(dst); + if (!CreateDirectoryRecursively(dir.c_str())) { + return false; + } ScopedFd dst_fd(OpenForWrite(dst)); + if (!src_fd.is_valid() || !dst_fd.is_valid()) { return false; } @@ -478,6 +484,49 @@ bool Filesystem::CopyFile(const char* src, const char* dst) const { return Write(*dst_fd, buf.get(), size); } +bool Filesystem::CopyDirectory(const char* src_dir, const char* dst_dir, + bool recursive) const { + DIR* dir = opendir(src_dir); + if (!dir) { + LogOpenError("Unable to open directory ", src_dir, ": ", errno); + return false; + } + + dirent* p; + // readdir's implementation seems to be thread safe. + while ((p = readdir(dir)) != nullptr) { + std::string file_name(p->d_name); + if (file_name == "." || file_name == "..") { + continue; + } + + std::string full_src_path = absl_ports::StrCat(src_dir, "/", p->d_name); + std::string full_dst_path = absl_ports::StrCat(dst_dir, "/", p->d_name); + + // Directories are copied when writing a non-directory file, so no + // explicit copying of a directory is required. + if (p->d_type != DT_DIR) { + if (!CopyFile(full_src_path.c_str(), full_dst_path.c_str())) { + return false; + } + } + + // Recurse down directories, if requested. 
+ if (recursive && (p->d_type == DT_DIR)) { + std::string src_sub_dir = absl_ports::StrCat(src_dir, "/", p->d_name); + std::string dst_sub_dir = absl_ports::StrCat(dst_dir, "/", p->d_name); + if (!CopyDirectory(src_sub_dir.c_str(), dst_sub_dir.c_str(), recursive)) { + return false; + } + } + } + if (closedir(dir) != 0) { + ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Error closing %s: %s", + src_dir, strerror(errno)); + } + return true; +} + bool Filesystem::PWrite(int fd, off_t offset, const void* data, size_t data_size) const { size_t write_len = data_size; diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h index d3c7787..6bed8e6 100644 --- a/icing/file/filesystem.h +++ b/icing/file/filesystem.h @@ -86,8 +86,12 @@ class Filesystem { // Copies the src file to the dst file. virtual bool CopyFile(const char* src, const char* dst) const; + // Copies the src directory and its contents to the dst dir. + virtual bool CopyDirectory(const char* src_dir, const char* dst_dir, + bool recursive) const; + // Returns true if a file exists. False if the file doesn't exist. - // If there is an error getting stat on the file, it logs the error and // + // If there is an error getting stat on the file, it logs the error and // asserts. 
virtual bool FileExists(const char* file_name) const; diff --git a/icing/file/filesystem_test.cc b/icing/file/filesystem_test.cc index 492a50d..214180e 100644 --- a/icing/file/filesystem_test.cc +++ b/icing/file/filesystem_test.cc @@ -38,6 +38,7 @@ using ::testing::Gt; using ::testing::Le; using ::testing::Ne; using ::testing::UnorderedElementsAre; +using ::testing::UnorderedElementsAreArray; namespace icing { namespace lib { @@ -450,5 +451,47 @@ TEST_F(FilesystemTest, ReadWrite) { EXPECT_THAT(hello, Eq("hello")); } +TEST_F(FilesystemTest, CopyDirectory) { + Filesystem filesystem; + + // File structure: + // <temp_dir>/ + // src_dir/ + // file1 + // file2 + // sub_dir/ + // file3 + const std::string src_dir = temp_dir_ + "/src_dir"; + const std::string sub_dir = "sub_dir"; + const std::string sub_dir_path = src_dir + "/" + sub_dir; + vector<std::string> some_files = {"file1", "file2", sub_dir + "/file3"}; + + // Make sure there is no pre-existing test-dir structure + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(src_dir.c_str())); + + // Setup a test-dir structure + ASSERT_TRUE(filesystem.CreateDirectoryRecursively( + sub_dir_path.c_str())); // deepest path for test + CreateTestFiles(some_files, src_dir); + + const std::string dst_dir = temp_dir_ + "/dst_dir"; + EXPECT_TRUE(filesystem.CopyDirectory(src_dir.c_str(), dst_dir.c_str(), + /*recursive=*/true)); + + vector<std::string> src_dir_files; + EXPECT_TRUE(filesystem.ListDirectory(src_dir.c_str(), /*exclude=*/{}, + /*recursive=*/true, &src_dir_files)); + + vector<std::string> dst_dir_files; + EXPECT_TRUE(filesystem.ListDirectory(dst_dir.c_str(), /*exclude=*/{}, + /*recursive=*/true, &dst_dir_files)); + + EXPECT_THAT(dst_dir_files, UnorderedElementsAreArray(src_dir_files)); + + // Clean up + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(src_dir.c_str())); + ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(dst_dir.c_str())); +} + } // namespace lib } // namespace icing diff --git 
a/icing/file/mock-filesystem.h b/icing/file/mock-filesystem.h index 88475cd..32817d4 100644 --- a/icing/file/mock-filesystem.h +++ b/icing/file/mock-filesystem.h @@ -44,6 +44,17 @@ class MockFilesystem : public Filesystem { return real_filesystem_.DeleteDirectoryRecursively(dir_name); }); + ON_CALL(*this, CopyFile) + .WillByDefault([this](const char* src, const char* dst) { + return real_filesystem_.CopyFile(src, dst); + }); + + ON_CALL(*this, CopyDirectory) + .WillByDefault( + [this](const char* src, const char* dst, bool recursive) { + return real_filesystem_.CopyDirectory(src, dst, recursive); + }); + ON_CALL(*this, FileExists).WillByDefault([this](const char* file_name) { return real_filesystem_.FileExists(file_name); }); @@ -227,6 +238,9 @@ class MockFilesystem : public Filesystem { MOCK_METHOD(bool, CopyFile, (const char* src, const char* dst), (const)); + MOCK_METHOD(bool, CopyDirectory, + (const char* src, const char* dst, bool recursive), (const)); + MOCK_METHOD(bool, FileExists, (const char* file_name), (const)); MOCK_METHOD(bool, DirectoryExists, (const char* dir_name), (const)); diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h new file mode 100644 index 0000000..99b8941 --- /dev/null +++ b/icing/file/portable-file-backed-proto-log.h @@ -0,0 +1,1241 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// File-backed log of protos with append-only writes and position based reads. +// +// There should only be one instance of a PortableFileBackedProtoLog of the same +// file at a time; using multiple instances at the same time may lead to +// undefined behavior. +// +// The entire checksum is computed on initialization to verify the contents are +// valid. On failure, the log will be truncated to the last verified state when +// PersistToDisk() was called. If the log cannot successfully restore the last +// state due to disk corruption or some other inconsistency, then the entire log +// will be lost. +// +// Each proto written to the file will have a metadata written just before it. +// The metadata consists of +// { +// 1 bytes of kProtoMagic; +// 3 bytes of the proto size +// n bytes of the proto itself +// } +// +// All metadata is written in a portable format, encoded with htonl before +// writing to file and decoded with ntohl when reading from file. +// +// Example usage: +// ICING_ASSERT_OK_AND_ASSIGN(auto create_result, +// PortableFileBackedProtoLog<DocumentProto>::Create(filesystem, +// file_path_, +// options)); +// auto proto_log = create_result.proto_log; +// +// Document document; +// document.set_namespace("com.google.android.example"); +// document.set_uri("www.google.com"); +// +// int64_t document_offset = proto_log->WriteProto(document)); +// Document same_document = proto_log->ReadProto(document_offset)); +// proto_log->PersistToDisk(); + +#ifndef ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_ +#define ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_ + +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <memory> +#include <string> +#include <string_view> +#include <utility> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include <google/protobuf/io/gzip_stream.h> +#include <google/protobuf/io/zero_copy_stream_impl_lite.h> +#include 
"icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/file/filesystem.h" +#include "icing/file/memory-mapped-file.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/portable/endian.h" +#include "icing/portable/platform.h" +#include "icing/portable/zlib.h" +#include "icing/util/bit-util.h" +#include "icing/util/crc32.h" +#include "icing/util/data-loss.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +template <typename ProtoT> +class PortableFileBackedProtoLog { + public: + struct Options { + // Whether to compress each proto before writing to the proto log. + bool compress; + + // Byte-size limit for each proto written to the store. This does not + // include the bytes needed for the metadata of each proto. + // + // NOTE: Currently, we only support protos up to 16MiB. We store the proto + // size in 3 bytes within the metadata. + // + // NOTE: This limit is only enforced for future writes. If the store + // previously had a higher limit, then reading older entries could return + // larger protos. + // + // NOTE: The max_proto_size is the upper limit for input protos into the + // ProtoLog. Even if the proto is larger than max_proto_size, but compresses + // to a smaller size, ProtoLog will not accept it. Protos that result in a + // compressed size larger than max_proto_size are also not accepted. + const int32_t max_proto_size; + + // Must specify values for options. + Options() = delete; + explicit Options(bool compress_in, + const int32_t max_proto_size_in = kMaxProtoSize) + : compress(compress_in), max_proto_size(max_proto_size_in) {} + }; + + // Number of bytes we reserve for the heading at the beginning of the proto + // log. We reserve this so the header can grow without running into the + // contents of the proto log, triggering an unnecessary migration of the data. 
+ static constexpr int kHeaderReservedBytes = 256; + + // Header stored at the beginning of the file before the rest of the log + // contents. Stores metadata on the log. + class Header { + public: + static constexpr int32_t kMagic = 0xf4c6f67a; + + static constexpr int32_t kFileFormatVersion = 0; + + uint32_t CalculateHeaderChecksum() const { + Crc32 crc; + + // Get a string_view of all the fields of the Header, excluding the + // magic_nbytes_ and header_checksum_nbytes_ + std::string_view header_str( + reinterpret_cast<const char*>(this) + + offsetof(Header, header_checksum_nbytes_) + + sizeof(header_checksum_nbytes_), + sizeof(Header) - sizeof(magic_nbytes_) - + sizeof(header_checksum_nbytes_)); + crc.Append(header_str); + return crc.Get(); + } + + int32_t GetMagic() const { return GNetworkToHostL(magic_nbytes_); } + + void SetMagic(int32_t magic_in) { + magic_nbytes_ = GHostToNetworkL(magic_in); + } + + int32_t GetFileFormatVersion() const { + return GNetworkToHostL(file_format_version_nbytes_); + } + + void SetFileFormatVersion(int32_t file_format_version_in) { + file_format_version_nbytes_ = GHostToNetworkL(file_format_version_in); + } + + int32_t GetMaxProtoSize() const { + return GNetworkToHostL(max_proto_size_nbytes_); + } + + void SetMaxProtoSize(int32_t max_proto_size_in) { + max_proto_size_nbytes_ = GHostToNetworkL(max_proto_size_in); + } + + int32_t GetLogChecksum() const { + return GNetworkToHostL(log_checksum_nbytes_); + } + + void SetLogChecksum(int32_t log_checksum_in) { + log_checksum_nbytes_ = GHostToNetworkL(log_checksum_in); + } + + int64_t GetRewindOffset() const { + return GNetworkToHostLL(rewind_offset_nbytes_); + } + + void SetRewindOffset(int64_t rewind_offset_in) { + rewind_offset_nbytes_ = GHostToNetworkLL(rewind_offset_in); + } + + int32_t GetHeaderChecksum() const { + return GNetworkToHostL(header_checksum_nbytes_); + } + + void SetHeaderChecksum(int32_t header_checksum_in) { + header_checksum_nbytes_ = 
GHostToNetworkL(header_checksum_in); + } + + bool GetCompressFlag() const { return GetFlag(kCompressBit); } + + void SetCompressFlag(bool compress) { SetFlag(kCompressBit, compress); } + + bool GetDirtyFlag() const { return GetFlag(kDirtyBit); } + + void SetDirtyFlag(bool dirty) { SetFlag(kDirtyBit, dirty); } + + private: + // The least-significant bit offset at which the compress flag is stored in + // 'flags_nbytes_'. Represents whether the protos in the log are compressed + // or not. + static constexpr int32_t kCompressBit = 0; + + // The least-significant bit offset at which the dirty flag is stored in + // 'flags'. Represents whether the checksummed portion of the log has been + // modified after the last checksum was computed. + static constexpr int32_t kDirtyBit = 1; + + bool GetFlag(int offset) const { + return bit_util::BitfieldGet(flags_, offset, /*len=*/1); + } + + void SetFlag(int offset, bool value) { + bit_util::BitfieldSet(value, offset, /*len=*/1, &flags_); + } + + // Holds the magic as a quick sanity check against file corruption. + // + // Field is in network-byte order. + int32_t magic_nbytes_ = GHostToNetworkL(kMagic); + + // Must be at the beginning after kMagic. Contains the crc checksum of + // the following fields. + // + // Field is in network-byte order. + uint32_t header_checksum_nbytes_ = 0; + + // Last known good offset at which the log and its checksum were updated. + // If we crash between writing to the log and updating the checksum, we can + // try to rewind the log to this offset and verify the checksum is still + // valid instead of throwing away the entire log. + // + // Field is in network-byte order. + int64_t rewind_offset_nbytes_ = GHostToNetworkLL(kHeaderReservedBytes); + + // Version number tracking how we serialize the file to disk. If we change + // how/what we write to disk, this version should be updated and this class + // should handle a migration. + // + // Currently at kFileFormatVersion. 
+ // + // Field is in network-byte order. + int32_t file_format_version_nbytes_ = 0; + + // The maximum proto size that can be written to the log. + // + // Field is in network-byte order. + int32_t max_proto_size_nbytes_ = 0; + + // Checksum of the log elements, doesn't include the header fields. + // + // Field is in network-byte order. + uint32_t log_checksum_nbytes_ = 0; + + // Bits are used to hold various flags. + // Lowest bit is whether the protos are compressed or not. + // + // Field is only 1 byte, so is byte-order agnostic. + uint8_t flags_ = 0; + + // NOTE: New fields should *almost always* be added to the end here. Since + // this class may have already been written to disk, appending fields + // increases the chances that changes are backwards-compatible. + }; + static_assert(sizeof(Header) <= kHeaderReservedBytes, + "Header has grown past our reserved bytes!"); + + struct CreateResult { + // A successfully initialized log. + std::unique_ptr<PortableFileBackedProtoLog<ProtoT>> proto_log; + + // The data status after initializing from a previous state. Data loss can + // happen if the file is corrupted or some previously added data was + // unpersisted. This may be used to signal that any derived data off of the + // proto log may need to be regenerated. + DataLoss data_loss = DataLoss::NONE; + + // Whether the proto log had to recalculate the checksum to check its + // integrity. This can be avoided if no changes were made or the log was + // able to update its checksum before shutting down. But it may have to + // recalculate if it's unclear if we crashed after updating the log, but + // before updating our checksum. + bool recalculated_checksum = false; + + bool has_data_loss() { + return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE; + } + }; + + // Factory method to create, initialize, and return a + // PortableFileBackedProtoLog. Will create the file if it doesn't exist. 
+ // + // If on re-initialization the log detects disk corruption or some previously + // added data was unpersisted, the log will rewind to the last-good state. The + // log saves these checkpointed "good" states when PersistToDisk() is called + // or the log is safely destructed. If the log rewinds successfully to the + // last-good state, then the returned CreateResult.data_loss indicates + // whether it has a data loss and what kind of data loss it is (partial or + // complete) so that any derived data may know that it needs to be updated. If + // the log re-initializes successfully without any data loss, + // CreateResult.data_loss will be NONE. + // + // Params: + // filesystem: Handles system level calls + // file_path: Path of the underlying file. Directory of the file should + // already exist + // options: Configuration options for the proto log + // + // Returns: + // PortableFileBackedProtoLog::CreateResult on success + // INVALID_ARGUMENT on an invalid option + // INTERNAL_ERROR on IO error + static libtextclassifier3::StatusOr<CreateResult> Create( + const Filesystem* filesystem, const std::string& file_path, + const Options& options); + + // Not copyable + PortableFileBackedProtoLog(const PortableFileBackedProtoLog&) = delete; + PortableFileBackedProtoLog& operator=(const PortableFileBackedProtoLog&) = + delete; + + // This will update the checksum of the log as well. + ~PortableFileBackedProtoLog(); + + // Writes the serialized proto to the underlying file. Writes are applied + // directly to the underlying file. Users do not need to sync the file after + // writing. + // + // Returns: + // Offset of the newly appended proto in file on success + // INVALID_ARGUMENT if proto is too large, as decided by + // Options.max_proto_size + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto); + + // Reads out a proto located at file_offset from the file. 
+ // + // Returns: + // A proto on success + // NOT_FOUND if the proto at the given offset has been erased + // OUT_OF_RANGE_ERROR if file_offset exceeds file size + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const; + + // Erases the data of a proto located at file_offset from the file. + // + // Returns: + // OK on success + // OUT_OF_RANGE_ERROR if file_offset exceeds file size + // INTERNAL_ERROR on IO error + libtextclassifier3::Status EraseProto(int64_t file_offset); + + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. + // + // Returns: + // Disk usage on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + + // Returns the file size of all the elements held in the log. File size is in + // bytes. This excludes the size of any internal metadata of the log, e.g. the + // log's header. + // + // Returns: + // File size on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const; + + // An iterator helping to find offsets of all the protos in file. + // Example usage: + // + // while (iterator.Advance().ok()) { + // int64_t offset = iterator.GetOffset(); + // // Do something + // } + class Iterator { + public: + Iterator(const Filesystem& filesystem, const std::string& file_path, + int64_t initial_offset); + + // Advances to the position of next proto whether it has been erased or not. + // + // Returns: + // OK on success + // OUT_OF_RANGE_ERROR if it reaches the end + // INTERNAL_ERROR on IO error + libtextclassifier3::Status Advance(); + + // Returns the file offset of current proto. 
+ int64_t GetOffset(); + + private: + static constexpr int64_t kInvalidOffset = -1; + // Used to read proto metadata + MemoryMappedFile mmapped_file_; + // Offset of first proto + int64_t initial_offset_; + int64_t current_offset_; + int64_t file_size_; + }; + + // Returns an iterator of current proto log. The caller needs to keep the + // proto log unchanged while using the iterator, otherwise unexpected + // behaviors could happen. + Iterator GetIterator(); + + // Persists all changes since initialization or the last call to + // PersistToDisk(). Any changes that aren't persisted may be lost if the + // system fails to close safely. + // + // Example use case: + // + // Document document; + // document.set_namespace("com.google.android.example"); + // document.set_uri("www.google.com"); + // + // { + // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, + // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem, + // file_path, + // options)); + // auto proto_log = std::move(create_result.proto_log); + // + // int64_t document_offset = proto_log->WriteProto(document)); + // + // // We lose the document here since it wasn't persisted. + // // *SYSTEM CRASH* + // } + // + // { + // // Can still successfully create after a crash since the log can + // // rewind/truncate to recover into a previously good state + // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, + // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem, + // file_path, + // options)); + // auto proto_log = std::move(create_result.proto_log); + // + // // Lost the proto since we didn't PersistToDisk before the crash + // proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error + // + // int64_t document_offset = proto_log->WriteProto(document)); + // + // // Persisted this time, so we should be ok. 
+ // ICING_ASSERT_OK(proto_log->PersistToDisk()); + // } + // + // { + // ICING_ASSERT_OK_AND_ASSIGN(auto create_result, + // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem, + // file_path, + // options)); + // auto proto_log = std::move(create_result.proto_log); + // + // // SUCCESS + // Document same_document = proto_log->ReadProto(document_offset)); + // } + // + // NOTE: Since all protos are already written to the file directly, this + // just updates the checksum and rewind position. Without these updates, + // future initializations will truncate the file and discard unpersisted + // changes. + // + // Returns: + // OK on success + // INTERNAL_ERROR on IO error + libtextclassifier3::Status PersistToDisk(); + + // Calculates the checksum of the log contents. Excludes the header content. + // + // Returns: + // Crc of the log content + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<Crc32> ComputeChecksum(); + + private: + // Object can only be instantiated via the ::Create factory. + PortableFileBackedProtoLog(const Filesystem* filesystem, + const std::string& file_path, + std::unique_ptr<Header> header); + + // Initializes a new proto log. + // + // Returns: + // std::unique_ptr<CreateResult> on success + // INTERNAL_ERROR on IO error + static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile( + const Filesystem* filesystem, const std::string& file_path, + const Options& options); + + // Verifies that the existing proto log is in a good state. If not in a good + // state, then the proto log may be truncated to the last good state and + // content will be lost. 
+ // + // Returns: + // std::unique_ptr<CreateResult> on success + // INTERNAL_ERROR on IO error or internal inconsistencies in the file + // INVALID_ARGUMENT_ERROR if options aren't consistent with previous + // instances + static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile( + const Filesystem* filesystem, const std::string& file_path, + const Options& options, int64_t file_size); + + // Takes an initial checksum and updates it with the content between `start` + // and `end` offsets in the file. + // + // Returns: + // Crc of the content between `start`, inclusive, and `end`, exclusive. + // INTERNAL_ERROR on IO error + // INVALID_ARGUMENT_ERROR if start and end aren't within the file size + static libtextclassifier3::StatusOr<Crc32> ComputeChecksum( + const Filesystem* filesystem, const std::string& file_path, + Crc32 initial_crc, int64_t start, int64_t end); + + // Reads out the metadata of a proto located at file_offset from the file. + // Metadata will be returned in host byte order endianness. + // + // Returns: + // Proto's metadata on success + // OUT_OF_RANGE_ERROR if file_offset exceeds file_size + // INTERNAL_ERROR if the metadata is invalid or any IO errors happen + static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata( + MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size); + + // Writes metadata of a proto to the fd. Takes in a host byte order endianness + // metadata and converts it into a portable metadata before writing. + // + // Returns: + // OK on success + // INTERNAL_ERROR on any IO errors + static libtextclassifier3::Status WriteProtoMetadata( + const Filesystem* filesystem, int fd, int32_t host_order_metadata); + + static bool IsEmptyBuffer(const char* buffer, int size) { + return std::all_of(buffer, buffer + size, + [](const char byte) { return byte == 0; }); + } + + // Helper function to get stored proto size from the metadata. 
+ // Metadata format: 8 bits magic + 24 bits size + static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; } + + // Helper function to get stored proto magic from the metadata. + // Metadata format: 8 bits magic + 24 bits size + static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; } + + // Magic number added in front of every proto. Used when reading out protos + // as a first check for corruption in each entry in the file. Even if there is + // a corruption, the best we can do is roll back to our last recovery point + // and throw away un-flushed data. We can discard/reuse this byte if needed so + // that we have 4 bytes to store the size of protos, and increase the size of + // protos we support. + static constexpr uint8_t kProtoMagic = 0x5C; + + // Our internal max for protos. + // + // WARNING: Changing this to a larger number may invalidate our assumption + // that that proto size can safely be stored in the last 3 bytes of the proto + // header. + static constexpr int kMaxProtoSize = (1 << 24) - 1; // 16MiB + static_assert(kMaxProtoSize <= 0x00FFFFFF, + "kMaxProtoSize doesn't fit in 3 bytes"); + + // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9 + static constexpr int kDeflateCompressionLevel = 3; + + // Chunks of the file to mmap at a time, so we don't mmap the entire file. 
+ // Only used on 32-bit devices + static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB + + ScopedFd fd_; + const Filesystem* const filesystem_; + const std::string file_path_; + std::unique_ptr<Header> header_; +}; + +template <typename ProtoT> +constexpr uint8_t PortableFileBackedProtoLog<ProtoT>::kProtoMagic; + +template <typename ProtoT> +PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog( + const Filesystem* filesystem, const std::string& file_path, + std::unique_ptr<Header> header) + : filesystem_(filesystem), + file_path_(file_path), + header_(std::move(header)) { + fd_.reset(filesystem_->OpenForAppend(file_path.c_str())); +} + +template <typename ProtoT> +PortableFileBackedProtoLog<ProtoT>::~PortableFileBackedProtoLog() { + if (!PersistToDisk().ok()) { + ICING_LOG(WARNING) << "Error persisting to disk during destruction of " + "PortableFileBackedProtoLog: " + << file_path_; + } +} + +template <typename ProtoT> +libtextclassifier3::StatusOr< + typename PortableFileBackedProtoLog<ProtoT>::CreateResult> +PortableFileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem, + const std::string& file_path, + const Options& options) { + if (options.max_proto_size <= 0) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "options.max_proto_size must be greater than 0, was %d", + options.max_proto_size)); + } + + // Since we store the proto_size in 3 bytes, we can only support protos of up + // to 16MiB. 
+ if (options.max_proto_size > kMaxProtoSize) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "options.max_proto_size must be under 16MiB, was %d", + options.max_proto_size)); + } + + if (!filesystem->FileExists(file_path.c_str())) { + return InitializeNewFile(filesystem, file_path, options); + } + + int64_t file_size = filesystem->GetFileSize(file_path.c_str()); + if (file_size == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + absl_ports::StrCat("Bad file size '", file_path, "'")); + } + + if (file_size == 0) { + return InitializeNewFile(filesystem, file_path, options); + } + + return InitializeExistingFile(filesystem, file_path, options, file_size); +} + +template <typename ProtoT> +libtextclassifier3::StatusOr< + typename PortableFileBackedProtoLog<ProtoT>::CreateResult> +PortableFileBackedProtoLog<ProtoT>::InitializeNewFile( + const Filesystem* filesystem, const std::string& file_path, + const Options& options) { + // Grow to the minimum reserved bytes for the header. 
+ if (!filesystem->Truncate(file_path.c_str(), kHeaderReservedBytes)) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to initialize file size: ", file_path)); + } + + // Create the header + std::unique_ptr<Header> header = std::make_unique<Header>(); + header->SetCompressFlag(options.compress); + header->SetMaxProtoSize(options.max_proto_size); + header->SetHeaderChecksum(header->CalculateHeaderChecksum()); + + if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to write header for file: ", file_path)); + } + + CreateResult create_result = { + std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>( + new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path, + std::move(header))), + /*data_loss=*/DataLoss::NONE, /*recalculated_checksum=*/false}; + + return create_result; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr< + typename PortableFileBackedProtoLog<ProtoT>::CreateResult> +PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile( + const Filesystem* filesystem, const std::string& file_path, + const Options& options, int64_t file_size) { + bool header_changed = false; + if (file_size < kHeaderReservedBytes) { + return absl_ports::InternalError( + absl_ports::StrCat("File header too short for: ", file_path)); + } + + std::unique_ptr<Header> header = std::make_unique<Header>(); + if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header), + /*offset=*/0)) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to read header for file: ", file_path)); + } + + // Make sure the header is still valid before we use any of its values. This + // is covered by the header_checksum check below, but this is a quick check + // that can save us from an extra crc computation. 
+ if (header->GetMagic() != Header::kMagic) { + return absl_ports::InternalError( + absl_ports::StrCat("Invalid header kMagic for file: ", file_path)); + } + + if (header->GetHeaderChecksum() != header->CalculateHeaderChecksum()) { + return absl_ports::InternalError( + absl_ports::StrCat("Invalid header checksum for: ", file_path)); + } + + if (header->GetFileFormatVersion() != Header::kFileFormatVersion) { + // If this changes, we might need to handle a migration rather than throwing + // an error. + return absl_ports::InternalError( + absl_ports::StrCat("Invalid header file format version: ", file_path)); + } + + if (header->GetCompressFlag() != options.compress) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Inconsistent compress option, expected %d, actual %d", + header->GetCompressFlag(), options.compress)); + } + + int32_t existing_max_proto_size = header->GetMaxProtoSize(); + if (existing_max_proto_size > options.max_proto_size) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Max proto size cannot be smaller than previous " + "instantiations, previous size %d, wanted size %d", + header->GetMaxProtoSize(), options.max_proto_size)); + } else if (existing_max_proto_size < options.max_proto_size) { + // It's fine if our new max size is greater than our previous one. Existing + // data is still valid. + header->SetMaxProtoSize(options.max_proto_size); + header_changed = true; + } + + DataLoss data_loss = DataLoss::NONE; + + // If we have any documents in our tail, get rid of them since they're not in + // our checksum. Our checksum reflects content up to the rewind offset. 
+ if (file_size > header->GetRewindOffset()) { + if (!filesystem->Truncate(file_path.c_str(), header->GetRewindOffset())) { + return absl_ports::InternalError(IcingStringUtil::StringPrintf( + "Failed to truncate '%s' to size %lld", file_path.data(), + static_cast<long long>(header->GetRewindOffset()))); + }; + data_loss = DataLoss::PARTIAL; + } + + bool recalculated_checksum = false; + + // If our dirty flag is set, that means we might have crashed in the middle of + // erasing a proto. This could have happened anywhere between: + // A. Set dirty flag to true and update header checksum + // B. Erase the proto + // C. Set dirty flag to false, update log checksum, update header checksum + // + // Scenario 1: We went down between A and B. Maybe our dirty flag is a + // false alarm and we can keep all our data. + // + // Scenario 2: We went down between B and C. Our data is compromised and + // we need to throw everything out. + if (header->GetDirtyFlag()) { + // Recompute the log's checksum to detect which scenario we're in. + ICING_ASSIGN_OR_RETURN( + Crc32 calculated_log_checksum, + ComputeChecksum(filesystem, file_path, Crc32(), + /*start=*/kHeaderReservedBytes, /*end=*/file_size)); + + if (header->GetLogChecksum() != calculated_log_checksum.Get()) { + // Still doesn't match, we're in Scenario 2. Throw out all our data now + // and initialize as a new instance. + ICING_ASSIGN_OR_RETURN(CreateResult create_result, + InitializeNewFile(filesystem, file_path, options)); + create_result.data_loss = DataLoss::COMPLETE; + create_result.recalculated_checksum = true; + return create_result; + } + // Otherwise we're good, checksum matches our contents so continue + // initializing like normal. + recalculated_checksum = true; + + // Update our header. 
+ header->SetDirtyFlag(false); + header_changed = true; + } + + if (header_changed) { + header->SetHeaderChecksum(header->CalculateHeaderChecksum()); + + if (!filesystem->PWrite(file_path.c_str(), /*offset=*/0, header.get(), + sizeof(Header))) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to update header to: ", file_path)); + } + } + + CreateResult create_result = { + std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>( + new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path, + std::move(header))), + data_loss, recalculated_checksum}; + + return create_result; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<Crc32> +PortableFileBackedProtoLog<ProtoT>::ComputeChecksum( + const Filesystem* filesystem, const std::string& file_path, + Crc32 initial_crc, int64_t start, int64_t end) { + auto mmapped_file = MemoryMappedFile(*filesystem, file_path, + MemoryMappedFile::Strategy::READ_ONLY); + Crc32 new_crc(initial_crc.Get()); + + if (start < 0) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Starting checksum offset of file '%s' must be greater than 0, was " + "%lld", + file_path.c_str(), static_cast<long long>(start))); + } + + if (end < start) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Ending checksum offset of file '%s' must be greater than start " + "'%lld', was '%lld'", + file_path.c_str(), static_cast<long long>(start), + static_cast<long long>(end))); + } + + int64_t file_size = filesystem->GetFileSize(file_path.c_str()); + if (end > file_size) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Ending checksum offset of file '%s' must be within " + "file size of %lld, was %lld", + file_path.c_str(), static_cast<long long>(file_size), + static_cast<long long>(end))); + } + + Architecture architecture = GetArchitecture(); + switch (architecture) { + case Architecture::BIT_64: { + // Don't mmap in chunks here since mmapping can be 
harmful on 64-bit + // devices where mmap/munmap calls need the mmap write semaphore, which + // blocks mmap/munmap/mprotect and all page faults from executing while + // they run. On 64-bit devices, this doesn't actually load into memory, it + // just makes the file faultable. So the whole file should be ok. + // b/185822878. + ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start)); + auto mmap_str = std::string_view(mmapped_file.region(), end - start); + new_crc.Append(mmap_str); + break; + } + case Architecture::BIT_32: + [[fallthrough]]; + case Architecture::UNKNOWN: { + // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too + // much memory at once. If we're unknown, then also chunk it because we're + // not sure what the device can handle. + for (int i = start; i < end; i += kMmapChunkSize) { + // Don't read past the file size. + int next_chunk_size = kMmapChunkSize; + if ((i + kMmapChunkSize) >= end) { + next_chunk_size = end - i; + } + + ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size)); + + auto mmap_str = + std::string_view(mmapped_file.region(), next_chunk_size); + new_crc.Append(mmap_str); + } + break; + } + } + + return new_crc; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<int64_t> +PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) { + int64_t proto_size = proto.ByteSizeLong(); + int32_t host_order_metadata; + int64_t current_position = filesystem_->GetCurrentPosition(fd_.get()); + + if (proto_size > header_->GetMaxProtoSize()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "proto_size, %lld, was too large to write. Max is %d", + static_cast<long long>(proto_size), header_->GetMaxProtoSize())); + } + + // At this point, we've guaranteed that proto_size is under kMaxProtoSize + // (see + // ::Create), so we can safely store it in an int. 
+ int final_size = 0; + + std::string proto_str; + google::protobuf::io::StringOutputStream proto_stream(&proto_str); + + if (header_->GetCompressFlag()) { + google::protobuf::io::GzipOutputStream::Options options; + options.format = google::protobuf::io::GzipOutputStream::ZLIB; + options.compression_level = kDeflateCompressionLevel; + + google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream, + options); + + bool success = proto.SerializeToZeroCopyStream(&compressing_stream) && + compressing_stream.Close(); + + if (!success) { + return absl_ports::InternalError("Error compressing proto."); + } + + final_size = proto_str.size(); + + // In case the compressed proto is larger than the original proto, we also + // can't write it. + if (final_size > header_->GetMaxProtoSize()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Compressed proto size, %d, was greater than " + "max_proto_size, %d", + final_size, header_->GetMaxProtoSize())); + } + } else { + // Serialize the proto directly into the write buffer at an offset of the + // metadata. + proto.SerializeToZeroCopyStream(&proto_stream); + final_size = proto_str.size(); + } + + // 1st byte for magic, next 3 bytes for proto size. 
+ host_order_metadata = (kProtoMagic << 24) | final_size; + + // Actually write metadata, has to be done after we know the possibly + // compressed proto size + ICING_RETURN_IF_ERROR( + WriteProtoMetadata(filesystem_, fd_.get(), host_order_metadata)); + + // Write the serialized proto + if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to write proto to: ", file_path_)); + } + + return current_position; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<ProtoT> +PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const { + int64_t file_size = filesystem_->GetFileSize(fd_.get()); + MemoryMappedFile mmapped_file(*filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_ONLY); + if (file_offset >= file_size) { + // file_size points to the next byte to write at, so subtract one to get + // the inclusive, actual size of file. + return absl_ports::OutOfRangeError( + IcingStringUtil::StringPrintf("Trying to read from a location, %lld, " + "out of range of the file size, %lld", + static_cast<long long>(file_offset), + static_cast<long long>(file_size - 1))); + } + + // Read out the metadata + ICING_ASSIGN_OR_RETURN( + int32_t metadata, + ReadProtoMetadata(&mmapped_file, file_offset, file_size)); + + // Copy out however many bytes it says the proto is + int stored_size = GetProtoSize(metadata); + + ICING_RETURN_IF_ERROR( + mmapped_file.Remap(file_offset + sizeof(metadata), stored_size)); + + if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) { + return absl_ports::NotFoundError("The proto data has been erased."); + } + + google::protobuf::io::ArrayInputStream proto_stream( + mmapped_file.mutable_region(), stored_size); + + // Deserialize proto + ProtoT proto; + if (header_->GetCompressFlag()) { + google::protobuf::io::GzipInputStream decompress_stream(&proto_stream); + proto.ParseFromZeroCopyStream(&decompress_stream); + } else { + 
proto.ParseFromZeroCopyStream(&proto_stream); + } + + return proto; +} + +template <typename ProtoT> +libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto( + int64_t file_offset) { + int64_t file_size = filesystem_->GetFileSize(fd_.get()); + if (file_offset >= file_size) { + // file_size points to the next byte to write at, so subtract one to get + // the inclusive, actual size of file. + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "Trying to erase data at a location, %lld, " + "out of range of the file size, %lld", + static_cast<long long>(file_offset), + static_cast<long long>(file_size - 1))); + } + + MemoryMappedFile mmapped_file( + *filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC); + + // Read out the metadata + ICING_ASSIGN_OR_RETURN( + int32_t metadata, + ReadProtoMetadata(&mmapped_file, file_offset, file_size)); + + ICING_RETURN_IF_ERROR(mmapped_file.Remap(file_offset + sizeof(metadata), + GetProtoSize(metadata))); + + // We need to update the crc checksum if the erased area is before the + // rewind position. + int32_t new_crc; + int64_t erased_proto_offset = file_offset + sizeof(metadata); + if (erased_proto_offset < header_->GetRewindOffset()) { + // Set to "dirty" before we start writing anything. + header_->SetDirtyFlag(true); + header_->SetHeaderChecksum(header_->CalculateHeaderChecksum()); + if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(), + sizeof(Header))) { + return absl_ports::InternalError(absl_ports::StrCat( + "Failed to update dirty bit of header to: ", file_path_)); + } + + // We need to calculate [original string xor 0s]. + // The xored string is the same as the original string because 0 xor 0 = + // 0, 1 xor 0 = 1. 
+ const std::string_view xored_str(mmapped_file.region(), + mmapped_file.region_size()); + + Crc32 crc(header_->GetLogChecksum()); + ICING_ASSIGN_OR_RETURN( + new_crc, crc.UpdateWithXor( + xored_str, + /*full_data_size=*/header_->GetRewindOffset() - + kHeaderReservedBytes, + /*position=*/erased_proto_offset - kHeaderReservedBytes)); + } + + // Clear the region. + memset(mmapped_file.mutable_region(), '\0', mmapped_file.region_size()); + + // If we cleared something in our checksummed area, we should update our + // checksum and reset our dirty bit. + if (erased_proto_offset < header_->GetRewindOffset()) { + header_->SetDirtyFlag(false); + header_->SetLogChecksum(new_crc); + header_->SetHeaderChecksum(header_->CalculateHeaderChecksum()); + + if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(), + sizeof(Header))) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to update header to: ", file_path_)); + } + } + + return libtextclassifier3::Status::OK; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<int64_t> +PortableFileBackedProtoLog<ProtoT>::GetDiskUsage() const { + int64_t size = filesystem_->GetDiskUsage(file_path_.c_str()); + if (size == Filesystem::kBadFileSize) { + return absl_ports::InternalError("Failed to get disk usage of proto log"); + } + return size; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<int64_t> +PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const { + int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str()); + if (total_file_size == Filesystem::kBadFileSize) { + return absl_ports::InternalError( + "Failed to get file size of elments in the proto log"); + } + return total_file_size - kHeaderReservedBytes; +} + +template <typename ProtoT> +PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator( + const Filesystem& filesystem, const std::string& file_path, + int64_t initial_offset) + : mmapped_file_(filesystem, file_path, + MemoryMappedFile::Strategy::READ_ONLY), 
+ initial_offset_(initial_offset), + current_offset_(kInvalidOffset), + file_size_(filesystem.GetFileSize(file_path.c_str())) { + if (file_size_ == Filesystem::kBadFileSize) { + // Fails all Advance() calls + file_size_ = 0; + } +} + +template <typename ProtoT> +libtextclassifier3::Status +PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() { + if (current_offset_ == kInvalidOffset) { + // First Advance() call + current_offset_ = initial_offset_; + } else { + // Jumps to the next proto position + ICING_ASSIGN_OR_RETURN( + int32_t metadata, + ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_)); + current_offset_ += sizeof(metadata) + GetProtoSize(metadata); + } + + if (current_offset_ < file_size_) { + return libtextclassifier3::Status::OK; + } else { + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "The next proto offset, %lld, is out of file range [0, %lld)", + static_cast<long long>(current_offset_), + static_cast<long long>(file_size_))); + } +} + +template <typename ProtoT> +int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() { + return current_offset_; +} + +template <typename ProtoT> +typename PortableFileBackedProtoLog<ProtoT>::Iterator +PortableFileBackedProtoLog<ProtoT>::GetIterator() { + return Iterator(*filesystem_, file_path_, + /*initial_offset=*/kHeaderReservedBytes); +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<int32_t> +PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata( + MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size) { + // Checks file_offset + if (file_offset >= file_size) { + return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf( + "offset, %lld, is out of file range [0, %lld)", + static_cast<long long>(file_offset), + static_cast<long long>(file_size))); + } + int32_t portable_metadata; + int metadata_size = sizeof(portable_metadata); + if (file_offset + metadata_size >= file_size) { + return 
absl_ports::InternalError(IcingStringUtil::StringPrintf( + "Wrong metadata offset %lld, metadata doesn't fit in " + "with file range [0, %lld)", + static_cast<long long>(file_offset), + static_cast<long long>(file_size))); + } + + // Reads metadata + ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size)); + memcpy(&portable_metadata, mmapped_file->region(), metadata_size); + + // Need to switch it back to host order endianness after reading from disk. + int32_t host_order_metadata = GNetworkToHostL(portable_metadata); + + // Checks magic number + uint8_t stored_k_proto_magic = GetProtoMagic(host_order_metadata); + if (stored_k_proto_magic != kProtoMagic) { + return absl_ports::InternalError(IcingStringUtil::StringPrintf( + "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic, + stored_k_proto_magic)); + } + + return host_order_metadata; +} + +template <typename ProtoT> +libtextclassifier3::Status +PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata( + const Filesystem* filesystem, int fd, int32_t host_order_metadata) { + // Convert it into portable endian format before writing to disk + int32_t portable_metadata = GHostToNetworkL(host_order_metadata); + int portable_metadata_size = sizeof(portable_metadata); + + // Write metadata + if (!filesystem->Write(fd, &portable_metadata, portable_metadata_size)) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to write proto metadata.")); + } + + return libtextclassifier3::Status::OK; +} + +template <typename ProtoT> +libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() { + int64_t file_size = filesystem_->GetFileSize(file_path_.c_str()); + if (file_size == header_->GetRewindOffset()) { + // No new protos appended, don't need to update the checksum. 
+ return libtextclassifier3::Status::OK; + } + + ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum()); + + header_->SetLogChecksum(crc.Get()); + header_->SetRewindOffset(file_size); + header_->SetHeaderChecksum(header_->CalculateHeaderChecksum()); + + if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(), + sizeof(Header)) || + !filesystem_->DataSync(fd_.get())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to update header to: ", file_path_)); + } + + return libtextclassifier3::Status::OK; +} + +template <typename ProtoT> +libtextclassifier3::StatusOr<Crc32> +PortableFileBackedProtoLog<ProtoT>::ComputeChecksum() { + int64_t file_size = filesystem_->GetFileSize(file_path_.c_str()); + int64_t new_content_size = file_size - header_->GetRewindOffset(); + Crc32 crc; + if (new_content_size == 0) { + // No new protos appended, return cached checksum + return Crc32(header_->GetLogChecksum()); + } else if (new_content_size < 0) { + // File shrunk, recalculate the entire checksum. + ICING_ASSIGN_OR_RETURN( + crc, + ComputeChecksum(filesystem_, file_path_, Crc32(), + /*start=*/kHeaderReservedBytes, /*end=*/file_size)); + } else { + // Append new changes to the existing checksum. + ICING_ASSIGN_OR_RETURN( + crc, ComputeChecksum( + filesystem_, file_path_, Crc32(header_->GetLogChecksum()), + /*start=*/header_->GetRewindOffset(), /*end=*/file_size)); + } + return crc; +} + +} // namespace lib +} // namespace icing + +#endif // ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_ diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc new file mode 100644 index 0000000..f83ccd6 --- /dev/null +++ b/icing/file/portable-file-backed-proto-log_benchmark.cc @@ -0,0 +1,343 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cstdint> +#include <random> + +#include "testing/base/public/benchmark.h" +#include "gmock/gmock.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/proto/document.pb.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/random-string.h" +#include "icing/testing/tmp-directory.h" + +// go/microbenchmarks +// +// To build and run on a local machine: +// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt +// icing/file:portable-file-backed-proto-log_benchmark +// +// $ blaze-bin/icing/file/portable-file-backed-proto-log_benchmark +// --benchmarks=all +// +// +// To build and run on an Android device (must be connected and rooted): +// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" +// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt +// icing/file:portable-file-backed-proto-log_benchmark +// +// $ adb root +// +// $ adb push +// blaze-bin/icing/file/portable-file-backed-proto-log_benchmark +// /data/local/tmp/ +// +// $ adb shell /data/local/tmp/portable-file-backed-proto-log-benchmark +// --benchmarks=all + +namespace icing { +namespace lib { + +namespace { + +static void BM_Write(benchmark::State& state) { + const Filesystem filesystem; + int string_length = state.range(0); + const std::string file_path = IcingStringUtil::StringPrintf( + "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log"); + int max_proto_size = (1 << 24) - 1; 
// 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. + filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + for (auto _ : state) { + testing::DoNotOptimize(proto_log->WriteProto(document)); + } + state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * + string_length); + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_Write) + ->Arg(1) + ->Arg(32) + ->Arg(512) + ->Arg(1024) + ->Arg(4 * 1024) + ->Arg(8 * 1024) + ->Arg(16 * 1024) + ->Arg(32 * 1024) + ->Arg(256 * 1024) + ->Arg(2 * 1024 * 1024) + ->Arg(8 * 1024 * 1024) + ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is + // 16MiB, and we need some extra space for the + // rest of the document properties + +static void BM_Read(benchmark::State& state) { + const Filesystem filesystem; + int string_length = state.range(0); + const std::string file_path = IcingStringUtil::StringPrintf( + "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log"); + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. 
+ filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset, + proto_log->WriteProto(document)); + + for (auto _ : state) { + testing::DoNotOptimize(proto_log->ReadProto(write_offset)); + } + state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * + string_length); + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_Read) + ->Arg(1) + ->Arg(32) + ->Arg(512) + ->Arg(1024) + ->Arg(4 * 1024) + ->Arg(8 * 1024) + ->Arg(16 * 1024) + ->Arg(32 * 1024) + ->Arg(256 * 1024) + ->Arg(2 * 1024 * 1024) + ->Arg(8 * 1024 * 1024) + ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is + // 16MiB, and we need some extra space for the + // rest of the document properties + // +static void BM_Erase(benchmark::State& state) { + const Filesystem filesystem; + const std::string file_path = IcingStringUtil::StringPrintf( + "%s%s", GetTestTempDir().c_str(), "/proto.log"); + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. 
+ filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + std::default_random_engine random; + const std::string rand_str = RandomString(kAlNumAlphabet, /*len=*/1, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + for (auto _ : state) { + state.PauseTiming(); + ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset, + proto_log->WriteProto(document)); + state.ResumeTiming(); + + testing::DoNotOptimize(proto_log->EraseProto(write_offset)); + } + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_Erase); + +static void BM_ComputeChecksum(benchmark::State& state) { + const Filesystem filesystem; + const std::string file_path = GetTestTempDir() + "/proto.log"; + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. 
+ filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + // Make each document 1KiB + int string_length = 1024; + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + int num_docs = state.range(0); + for (int i = 0; i < num_docs; ++i) { + ICING_ASSERT_OK(proto_log->WriteProto(document)); + } + + for (auto _ : state) { + testing::DoNotOptimize(proto_log->ComputeChecksum()); + } + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20); + +static void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) { + const Filesystem filesystem; + const std::string file_path = GetTestTempDir() + "/proto.log"; + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. 
+ filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + // Make the document 1KiB + int string_length = 1024; + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + // Write some content and persist. This should update our cached checksum to + // include the document. + ICING_ASSERT_OK(proto_log->WriteProto(document)); + ICING_ASSERT_OK(proto_log->PersistToDisk()); + + // This ComputeChecksum call shouldn't need to do any computation since we can + // reuse our cached checksum. + for (auto _ : state) { + testing::DoNotOptimize(proto_log->ComputeChecksum()); + } + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_ComputeChecksumWithCachedChecksum); + +static void BM_ComputeChecksumOnlyForTail(benchmark::State& state) { + const Filesystem filesystem; + const std::string file_path = GetTestTempDir() + "/proto.log"; + int max_proto_size = (1 << 24) - 1; // 16 MiB + bool compress = true; + + // Make sure it doesn't already exist. 
+ filesystem.DeleteFile(file_path.c_str()); + + auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem, file_path, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress, max_proto_size)) + .ValueOrDie() + .proto_log; + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + // Make the document 1KiB + int string_length = 1024; + std::default_random_engine random; + const std::string rand_str = + RandomString(kAlNumAlphabet, string_length, &random); + + auto document_properties = document.add_properties(); + document_properties->set_name("string property"); + document_properties->add_string_values(rand_str); + + // Write some content and persist. This should update our cached checksum to + // include the document. + ICING_ASSERT_OK(proto_log->WriteProto(document)); + ICING_ASSERT_OK(proto_log->PersistToDisk()); + + // Write another proto into the tail, but it's not included in our cached + // checksum since we didn't call persist. + ICING_ASSERT_OK(proto_log->WriteProto(document)); + + // ComputeChecksum should be calculating the checksum of the tail and adding + // it to the cached checksum we have. + for (auto _ : state) { + testing::DoNotOptimize(proto_log->ComputeChecksum()); + } + + // Cleanup after ourselves + filesystem.DeleteFile(file_path.c_str()); +} +BENCHMARK(BM_ComputeChecksumOnlyForTail); + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc new file mode 100644 index 0000000..b5fee4b --- /dev/null +++ b/icing/file/portable-file-backed-proto-log_test.cc @@ -0,0 +1,1071 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/file/portable-file-backed-proto-log.h" + +#include <cstdint> +#include <cstdlib> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/file/mock-filesystem.h" +#include "icing/portable/equals-proto.h" +#include "icing/proto/document.pb.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" + +namespace icing { +namespace lib { + +namespace { + +using ::icing::lib::portable_equals_proto::EqualsProto; +using ::testing::A; +using ::testing::Eq; +using ::testing::Gt; +using ::testing::HasSubstr; +using ::testing::Not; +using ::testing::NotNull; +using ::testing::Pair; +using ::testing::Return; + +using Header = PortableFileBackedProtoLog<DocumentProto>::Header; + +Header ReadHeader(Filesystem filesystem, const std::string& file_path) { + Header header; + filesystem.PRead(file_path.c_str(), &header, sizeof(Header), + /*offset=*/0); + return header; +} + +void WriteHeader(Filesystem filesystem, const std::string& file_path, + Header& header) { + filesystem.Write(file_path.c_str(), &header, sizeof(Header)); +} + +class PortableFileBackedProtoLogTest : public ::testing::Test { + protected: + // Adds a user-defined default construct because a const member variable may + // make the compiler accidentally delete the default constructor. 
+ // https://stackoverflow.com/a/47368753 + PortableFileBackedProtoLogTest() {} + + void SetUp() override { + file_path_ = GetTestTempDir() + "/proto_log"; + filesystem_.DeleteFile(file_path_.c_str()); + } + + void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); } + + const Filesystem filesystem_; + std::string file_path_; + bool compress_ = true; + int64_t max_proto_size_ = 256 * 1024; // 256 KiB +}; + +TEST_F(PortableFileBackedProtoLogTest, Initialize) { + // max_proto_size must be greater than 0 + int invalid_max_proto_size = 0; + ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, invalid_max_proto_size)), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + EXPECT_THAT(create_result.proto_log, NotNull()); + EXPECT_FALSE(create_result.has_data_loss()); + EXPECT_FALSE(create_result.recalculated_checksum); + + // Can't recreate the same file with different options. + ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + !compress_, max_proto_size_)), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(PortableFileBackedProtoLogTest, ReservedSpaceForHeader) { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + + // With no protos written yet, the log should be minimum the size of the + // reserved header space. 
+ ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()), + PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes); +} + +TEST_F(PortableFileBackedProtoLogTest, WriteProtoTooLarge) { + int max_proto_size = 1; + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + // Proto is too large for the max_proto_size_in + ASSERT_THAT(proto_log->WriteProto(document), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST_F(PortableFileBackedProtoLogTest, ReadProtoWrongKProtoMagic) { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write a proto + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + + ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset, + proto_log->WriteProto(document)); + + // The 4 bytes of metadata that just doesn't have the same kProtoMagic + // specified in file-backed-proto-log.h + uint32_t wrong_magic = 0x7E000000; + + // Sanity check that we opened the file correctly + int fd = filesystem_.OpenForWrite(file_path_.c_str()); + ASSERT_GT(fd, 0); + + // Write the wrong kProtoMagic in, kProtoMagics are stored at the beginning of + // a proto entry. 
+ filesystem_.PWrite(fd, file_offset, &wrong_magic, sizeof(wrong_magic)); + + ASSERT_THAT(proto_log->ReadProto(file_offset), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); +} + +TEST_F(PortableFileBackedProtoLogTest, ReadWriteUncompressedProto) { + int last_offset; + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/false, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write the first proto + DocumentProto document1 = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + ICING_ASSERT_OK_AND_ASSIGN(int written_position, + proto_log->WriteProto(document1)); + + int document1_offset = written_position; + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document1))); + + // Write a second proto that's close to the max size. Leave some room for + // the rest of the proto properties. + std::string long_str(max_proto_size_ - 1024, 'a'); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .AddStringProperty("long_str", long_str) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(written_position, + proto_log->WriteProto(document2)); + + int document2_offset = written_position; + last_offset = written_position; + ASSERT_GT(document2_offset, document1_offset); + + // Check the second proto + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document2))); + + ICING_ASSERT_OK(proto_log->PersistToDisk()); + } + + { + // Make a new proto_log with the same file_path, and make sure we + // can still write to the same underlying file. 
+ ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/false, max_proto_size_))); + auto recreated_proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write a third proto + DocumentProto document3 = + DocumentBuilder().SetKey("namespace3", "uri3").Build(); + + ASSERT_THAT(recreated_proto_log->WriteProto(document3), + IsOkAndHolds(Gt(last_offset))); + } +} + +TEST_F(PortableFileBackedProtoLogTest, ReadWriteCompressedProto) { + int last_offset; + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/true, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write the first proto + DocumentProto document1 = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + ICING_ASSERT_OK_AND_ASSIGN(int written_position, + proto_log->WriteProto(document1)); + + int document1_offset = written_position; + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document1))); + + // Write a second proto that's close to the max size. Leave some room for + // the rest of the proto properties. 
+ std::string long_str(max_proto_size_ - 1024, 'a'); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace2", "uri2") + .AddStringProperty("long_str", long_str) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(written_position, + proto_log->WriteProto(document2)); + + int document2_offset = written_position; + last_offset = written_position; + ASSERT_GT(document2_offset, document1_offset); + + // Check the second proto + ASSERT_THAT(proto_log->ReadProto(written_position), + IsOkAndHolds(EqualsProto(document2))); + + ICING_ASSERT_OK(proto_log->PersistToDisk()); + } + + { + // Make a new proto_log with the same file_path, and make sure we + // can still write to the same underlying file. + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + /*compress_in=*/true, max_proto_size_))); + auto recreated_proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write a third proto + DocumentProto document3 = + DocumentBuilder().SetKey("namespace3", "uri3").Build(); + + ASSERT_THAT(recreated_proto_log->WriteProto(document3), + IsOkAndHolds(Gt(last_offset))); + } +} + +TEST_F(PortableFileBackedProtoLogTest, CorruptHeader) { + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto recreated_proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + } + + int corrupt_checksum = 24; + + // Write the corrupted header + Header header = ReadHeader(filesystem_, file_path_); + header.SetHeaderChecksum(corrupt_checksum); + WriteHeader(filesystem_, file_path_, header); + + { + // Reinitialize 
the same proto_log + ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_)), + StatusIs(libtextclassifier3::StatusCode::INTERNAL, + HasSubstr("Invalid header checksum"))); + } +} + +TEST_F(PortableFileBackedProtoLogTest, DifferentMagic) { + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto recreated_proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + + // Corrupt the magic that's stored at the beginning of the header. + int invalid_magic = -1; + ASSERT_THAT(invalid_magic, Not(Eq(Header::kMagic))); + + // Write the corrupted header + Header header = ReadHeader(filesystem_, file_path_); + header.SetMagic(invalid_magic); + WriteHeader(filesystem_, file_path_, header); + } + + { + // Reinitialize the same proto_log + ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_)), + StatusIs(libtextclassifier3::StatusCode::INTERNAL, + HasSubstr("Invalid header kMagic"))); + } +} + +TEST_F(PortableFileBackedProtoLogTest, + UnableToDetectCorruptContentWithoutDirtyBit) { + // This is intentional that we can't detect corruption. We're trading off + // earlier corruption detection for lower initialization latency. By not + // calculating the checksum on initialization, we can initialize much faster, + // but at the cost of detecting corruption. Note that even if we did detect + // corruption, there was nothing we could've done except throw an error to + // clients. 
We'll still do that, but at some later point when the log is + // attempting to be accessed and we can't actually deserialize a proto from + // it. See the description in cl/374278280 for more details. + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + + DocumentProto document = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + // Write and persist an document. + ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset, + proto_log->WriteProto(document)); + ICING_ASSERT_OK(proto_log->PersistToDisk()); + + // "Corrupt" the content written in the log. + document.set_uri("invalid"); + std::string serialized_document = document.SerializeAsString(); + ASSERT_TRUE(filesystem_.PWrite(file_path_.c_str(), document_offset, + serialized_document.data(), + serialized_document.size())); + } + + { + // We can recover, and we don't have data loss. + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE)); + EXPECT_FALSE(create_result.recalculated_checksum); + + // We still have the corrupted content in our file, we didn't throw + // everything out. 
+ EXPECT_THAT( + filesystem_.GetFileSize(file_path_.c_str()), + Gt(PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes)); + } +} + +TEST_F(PortableFileBackedProtoLogTest, + DetectAndThrowOutCorruptContentWithDirtyBit) { + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + DocumentProto document = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .AddStringProperty("string_property", "foo", "bar") + .Build(); + + // Write and persist the protos + ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset, + proto_log->WriteProto(document)); + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(document_offset), + IsOkAndHolds(EqualsProto(document))); + } + + { + // "Corrupt" the content written in the log. Make the corrupt document + // smaller than our original one so we don't accidentally write past our + // file. + DocumentProto document = + DocumentBuilder().SetKey("invalid_namespace", "invalid_uri").Build(); + std::string serialized_document = document.SerializeAsString(); + ASSERT_TRUE(filesystem_.PWrite( + file_path_.c_str(), + PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes, + serialized_document.data(), serialized_document.size())); + + Header header = ReadHeader(filesystem_, file_path_); + + // Set dirty bit to true to reflect that something changed in the log. 
+ header.SetDirtyFlag(true); + header.SetHeaderChecksum(header.CalculateHeaderChecksum()); + + WriteHeader(filesystem_, file_path_, header); + } + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + EXPECT_TRUE(create_result.has_data_loss()); + EXPECT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); + + // We had to recalculate the checksum to detect the corruption. + EXPECT_TRUE(create_result.recalculated_checksum); + + // We lost everything, file size is back down to the header. + EXPECT_THAT( + filesystem_.GetFileSize(file_path_.c_str()), + Eq(PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes)); + + // At least the log is no longer dirty. + Header header = ReadHeader(filesystem_, file_path_); + EXPECT_FALSE(header.GetDirtyFlag()); + } +} + +TEST_F(PortableFileBackedProtoLogTest, DirtyBitFalseAlarmKeepsData) { + DocumentProto document = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + int64_t document_offset; + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write and persist the first proto + ICING_ASSERT_OK_AND_ASSIGN(document_offset, + proto_log->WriteProto(document)); + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(document_offset), + IsOkAndHolds(EqualsProto(document))); + } + + { + Header header = ReadHeader(filesystem_, file_path_); + + // Simulate the dirty flag set as true, but no data 
has been changed yet. + // Maybe we crashed between writing the dirty flag and erasing a proto. + header.SetDirtyFlag(true); + header.SetHeaderChecksum(header.CalculateHeaderChecksum()); + + WriteHeader(filesystem_, file_path_, header); + } + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + + // Even though nothing changed, the false alarm dirty bit should have + // triggered us to recalculate our checksum. + EXPECT_TRUE(create_result.recalculated_checksum); + + // Check that our document still exists even though dirty bit was true. + EXPECT_THAT(proto_log->ReadProto(document_offset), + IsOkAndHolds(EqualsProto(document))); + + Header header = ReadHeader(filesystem_, file_path_); + EXPECT_FALSE(header.GetDirtyFlag()); + } +} + +TEST_F(PortableFileBackedProtoLogTest, + PersistToDiskKeepsPersistedDataAndTruncatesExtraData) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace2", "uri2").Build(); + int document1_offset, document2_offset; + int log_size; + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Write and persist the first proto + ICING_ASSERT_OK_AND_ASSIGN(document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK(proto_log->PersistToDisk()); + + // Write, but don't explicitly persist the second 
proto + ICING_ASSERT_OK_AND_ASSIGN(document2_offset, + proto_log->WriteProto(document2)); + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(document1_offset), + IsOkAndHolds(EqualsProto(document1))); + ASSERT_THAT(proto_log->ReadProto(document2_offset), + IsOkAndHolds(EqualsProto(document2))); + + log_size = filesystem_.GetFileSize(file_path_.c_str()); + ASSERT_GT(log_size, 0); + + // PersistToDisk happens implicitly during the destructor. + } + + { + // The header rewind position and checksum aren't updated in this "system + // crash" scenario. + + std::string bad_proto = + "some incomplete proto that we didn't finish writing before the " + "system crashed"; + filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(), + bad_proto.size()); + + // Double check that we actually wrote something to the underlying file + ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size); + } + + { + // We can recover, but we have data loss + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_TRUE(create_result.has_data_loss()); + ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL)); + ASSERT_FALSE(create_result.recalculated_checksum); + + // Check that everything was persisted across instances + ASSERT_THAT(proto_log->ReadProto(document1_offset), + IsOkAndHolds(EqualsProto(document1))); + ASSERT_THAT(proto_log->ReadProto(document2_offset), + IsOkAndHolds(EqualsProto(document2))); + + // We correctly rewound to the last good state. 
+ ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str())); + } +} + +TEST_F(PortableFileBackedProtoLogTest, + DirtyBitIsFalseAfterPutAndPersistToDisk) { + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + DocumentProto document = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + // Write and persist the first proto + ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset, + proto_log->WriteProto(document)); + ICING_ASSERT_OK(proto_log->PersistToDisk()); + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(document_offset), + IsOkAndHolds(EqualsProto(document))); + } + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + + // We previously persisted to disk so everything should be in a perfect + // state. 
+ EXPECT_FALSE(create_result.has_data_loss()); + EXPECT_FALSE(create_result.recalculated_checksum); + + Header header = ReadHeader(filesystem_, file_path_); + EXPECT_FALSE(header.GetDirtyFlag()); + } +} + +TEST_F(PortableFileBackedProtoLogTest, + DirtyBitIsFalseAfterDeleteAndPersistToDisk) { + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + DocumentProto document = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + // Write, delete, and persist the first proto + ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset, + proto_log->WriteProto(document)); + ICING_ASSERT_OK(proto_log->EraseProto(document_offset)); + ICING_ASSERT_OK(proto_log->PersistToDisk()); + + // The proto has been erased. + ASSERT_THAT(proto_log->ReadProto(document_offset), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + } + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + + // We previously persisted to disk so everything should be in a perfect + // state. 
+ EXPECT_FALSE(create_result.has_data_loss()); + EXPECT_FALSE(create_result.recalculated_checksum); + + Header header = ReadHeader(filesystem_, file_path_); + EXPECT_FALSE(header.GetDirtyFlag()); + } +} + +TEST_F(PortableFileBackedProtoLogTest, DirtyBitIsFalseAfterPutAndDestructor) { + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + DocumentProto document = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + // Write and persist the first proto + ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset, + proto_log->WriteProto(document)); + + // Check that what we read is what we wrote + ASSERT_THAT(proto_log->ReadProto(document_offset), + IsOkAndHolds(EqualsProto(document))); + + // PersistToDisk is implicitly called as part of the destructor and + // PersistToDisk will clear the dirty bit. + } + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + + // We previously persisted to disk so everything should be in a perfect + // state. 
+ EXPECT_FALSE(create_result.has_data_loss()); + EXPECT_FALSE(create_result.recalculated_checksum); + + Header header = ReadHeader(filesystem_, file_path_); + EXPECT_FALSE(header.GetDirtyFlag()); + } +} + +TEST_F(PortableFileBackedProtoLogTest, + DirtyBitIsFalseAfterDeleteAndDestructor) { + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + DocumentProto document = + DocumentBuilder().SetKey("namespace1", "uri1").Build(); + + // Write, delete, and persist the first proto + ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset, + proto_log->WriteProto(document)); + ICING_ASSERT_OK(proto_log->EraseProto(document_offset)); + + // The proto has been erased. + ASSERT_THAT(proto_log->ReadProto(document_offset), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + // PersistToDisk is implicitly called as part of the destructor and + // PersistToDisk will clear the dirty bit. + } + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + + // We previously persisted to disk so everything should be in a perfect + // state. 
+ EXPECT_FALSE(create_result.has_data_loss()); + EXPECT_FALSE(create_result.recalculated_checksum); + + Header header = ReadHeader(filesystem_, file_path_); + EXPECT_FALSE(header.GetDirtyFlag()); + } +} + +TEST_F(PortableFileBackedProtoLogTest, Iterator) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "uri2").Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + { + // Empty iterator + auto iterator = proto_log->GetIterator(); + ASSERT_THAT(iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + } + + { + // Iterates through some documents + ICING_ASSERT_OK(proto_log->WriteProto(document1)); + ICING_ASSERT_OK(proto_log->WriteProto(document2)); + auto iterator = proto_log->GetIterator(); + // 1st proto + ICING_ASSERT_OK(iterator.Advance()); + ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()), + IsOkAndHolds(EqualsProto(document1))); + // 2nd proto + ICING_ASSERT_OK(iterator.Advance()); + ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()), + IsOkAndHolds(EqualsProto(document2))); + // Tries to advance + ASSERT_THAT(iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + } + + { + // Iterator with bad filesystem + MockFilesystem mock_filesystem; + ON_CALL(mock_filesystem, GetFileSize(A<const char*>())) + .WillByDefault(Return(Filesystem::kBadFileSize)); + PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator( + mock_filesystem, file_path_, /*initial_offset=*/0); + ASSERT_THAT(bad_iterator.Advance(), + StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + } +} + 
+TEST_F(PortableFileBackedProtoLogTest, ComputeChecksum) { + DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); + Crc32 checksum; + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + ICING_EXPECT_OK(proto_log->WriteProto(document)); + + ICING_ASSERT_OK_AND_ASSIGN(checksum, proto_log->ComputeChecksum()); + + // Calling it twice with no changes should get us the same checksum + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); + } + + { + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Checksum should be consistent across instances + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); + + // PersistToDisk shouldn't affect the checksum value + ICING_EXPECT_OK(proto_log->PersistToDisk()); + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); + + // Check that modifying the log leads to a different checksum + ICING_EXPECT_OK(proto_log->WriteProto(document)); + EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum)))); + } +} + +TEST_F(PortableFileBackedProtoLogTest, EraseProtoShouldSetZero) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + 
PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Writes and erases proto + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + // Checks if the erased area is set to 0. + int64_t file_size = filesystem_.GetFileSize(file_path_.c_str()); + MemoryMappedFile mmapped_file(filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_ONLY); + + // document1_offset + sizeof(int) is the start byte of the proto where + // sizeof(int) is the size of the proto metadata. + mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1); + for (size_t i = 0; i < mmapped_file.region_size(); ++i) { + ASSERT_THAT(mmapped_file.region()[i], Eq(0)); + } +} + +TEST_F(PortableFileBackedProtoLogTest, EraseProtoShouldReturnNotFound) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "uri2").Build(); + + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options(compress_, + max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Writes 2 protos + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset, + proto_log->WriteProto(document2)); + + // Erases the first proto + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + // The first proto has been erased. 
+ ASSERT_THAT(proto_log->ReadProto(document1_offset), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + // The second proto should be returned. + ASSERT_THAT(proto_log->ReadProto(document2_offset), + IsOkAndHolds(EqualsProto(document2))); +} + +TEST_F(PortableFileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) { + DocumentProto document1 = + DocumentBuilder().SetKey("namespace", "uri1").Build(); + DocumentProto document2 = + DocumentBuilder().SetKey("namespace", "uri2").Build(); + DocumentProto document3 = + DocumentBuilder().SetKey("namespace", "uri3").Build(); + DocumentProto document4 = + DocumentBuilder().SetKey("namespace", "uri4").Build(); + + int64_t document2_offset; + int64_t document3_offset; + + { + // Erase data after the rewind position. This won't update the checksum + // immediately. + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Writes 3 protos + ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset, + proto_log->WriteProto(document1)); + ICING_ASSERT_OK_AND_ASSIGN(document2_offset, + proto_log->WriteProto(document2)); + ICING_ASSERT_OK_AND_ASSIGN(document3_offset, + proto_log->WriteProto(document3)); + + // Erases the 1st proto, checksum won't be updated immediately because the + // rewind position is 0. + ICING_ASSERT_OK(proto_log->EraseProto(document1_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(2175574628)))); + } // New checksum is updated in destructor. + + { + // Erase data before the rewind position. This will update the checksum + // immediately. 
+ ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Erases the 2nd proto that is now before the rewind position. Checksum + // is updated. + ICING_ASSERT_OK(proto_log->EraseProto(document2_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(790877774)))); + } + + { + // Append data and erase data before the rewind position. This will update + // the checksum twice: in EraseProto() and destructor. + ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + ASSERT_FALSE(create_result.has_data_loss()); + + // Append a new document which is after the rewind position. + ICING_ASSERT_OK(proto_log->WriteProto(document4)); + + // Erases the 3rd proto that is now before the rewind position. Checksum + // is updated. + ICING_ASSERT_OK(proto_log->EraseProto(document3_offset)); + + EXPECT_THAT(proto_log->ComputeChecksum(), + IsOkAndHolds(Eq(Crc32(2344803210)))); + } // Checksum is updated with the newly appended document. + + { + // A successful creation means that the checksum matches. 
+ ICING_ASSERT_OK_AND_ASSIGN( + PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result, + PortableFileBackedProtoLog<DocumentProto>::Create( + &filesystem_, file_path_, + PortableFileBackedProtoLog<DocumentProto>::Options( + compress_, max_proto_size_))); + auto proto_log = std::move(create_result.proto_log); + EXPECT_FALSE(create_result.has_data_loss()); + } +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/icing-search-engine-with-icu-file_test.cc b/icing/icing-search-engine-with-icu-file_test.cc index 5a9327e..48e81e5 100644 --- a/icing/icing-search-engine-with-icu-file_test.cc +++ b/icing/icing-search-engine-with-icu-file_test.cc @@ -27,6 +27,7 @@ #include "icing/proto/search.pb.h" #include "icing/proto/status.pb.h" #include "icing/proto/term.pb.h" +#include "icing/schema-builder.h" #include "icing/testing/common-matchers.h" #include "icing/testing/tmp-directory.h" @@ -36,6 +37,14 @@ namespace { using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::Eq; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; + +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; + std::string GetTestBaseDir() { return GetTestTempDir() + "/icing_with_icu_files"; } @@ -55,23 +64,6 @@ DocumentProto CreateMessageDocument(std::string name_space, std::string uri) { .Build(); } -SchemaProto CreateMessageSchema() { - SchemaProto schema; - auto type = schema.add_types(); - type->set_schema_type("Message"); - - auto body = type->add_properties(); - body->set_property_name("body"); - body->set_data_type(PropertyConfigProto::DataType::STRING); - body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - 
body->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - - return schema; -} - ScoringSpecProto GetDefaultScoringSpec() { ScoringSpecProto scoring_spec; scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); @@ -81,15 +73,31 @@ ScoringSpecProto GetDefaultScoringSpec() { TEST(IcingSearchEngineWithIcuFileTest, ShouldInitialize) { IcingSearchEngine icing(GetDefaultIcingOptions()); EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); - EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(), - Eq(StatusProto::OK)); + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK)); } TEST(IcingSearchEngineWithIcuFileTest, ShouldIndexAndSearch) { IcingSearchEngine icing(GetDefaultIcingOptions()); ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK)); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(), - Eq(StatusProto::OK)); + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + ASSERT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK)); DocumentProto document_one = CreateMessageDocument("namespace", "uri1"); ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK)); @@ -115,8 +123,8 @@ TEST(IcingSearchEngineWithIcuFileTest, ShouldIndexAndSearch) { // The token is a random number so we don't verify it. 
expected_search_result_proto.set_next_page_token( search_result_proto.next_page_token()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } } // namespace diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc index 791368a..1b7bd89 100644 --- a/icing/icing-search-engine.cc +++ b/icing/icing-search-engine.cc @@ -27,14 +27,18 @@ #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/mutex.h" #include "icing/absl_ports/str_cat.h" +#include "icing/file/destructible-file.h" +#include "icing/file/file-backed-proto.h" #include "icing/file/filesystem.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/index-processor.h" #include "icing/index/index.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/portable/endian.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" +#include "icing/proto/internal/optimize.pb.h" #include "icing/proto/logging.pb.h" #include "icing/proto/optimize.pb.h" #include "icing/proto/persist.pb.h" @@ -73,8 +77,14 @@ namespace { constexpr std::string_view kDocumentSubfolderName = "document_dir"; constexpr std::string_view kIndexSubfolderName = "index_dir"; constexpr std::string_view kSchemaSubfolderName = "schema_dir"; -constexpr std::string_view kIcingSearchEngineHeaderFilename = - "icing_search_engine_header"; +constexpr std::string_view kSetSchemaMarkerFilename = "set_schema_marker"; +constexpr std::string_view kInitMarkerFilename = "init_marker"; +constexpr std::string_view kOptimizeStatusFilename = "optimize_status"; + +// The maximum number of unsuccessful initialization attempts from the current +// state that we will tolerate before deleting all data and starting from a +// fresh state. 
+constexpr int kMaxUnsuccessfulInitAttempts = 5; libtextclassifier3::Status ValidateOptions( const IcingSearchEngineOptions& options) { @@ -94,6 +104,21 @@ libtextclassifier3::Status ValidateResultSpec( return absl_ports::InvalidArgumentError( "ResultSpecProto.num_per_page cannot be negative."); } + std::unordered_set<std::string> unique_namespaces; + for (const ResultSpecProto::ResultGrouping& result_grouping : + result_spec.result_groupings()) { + if (result_grouping.max_results() <= 0) { + return absl_ports::InvalidArgumentError( + "Cannot specify a result grouping with max results <= 0."); + } + for (const std::string& name_space : result_grouping.namespaces()) { + if (unique_namespaces.count(name_space) > 0) { + return absl_ports::InvalidArgumentError( + "Namespaces must be unique across result groups."); + } + unique_namespaces.insert(name_space); + } + } return libtextclassifier3::Status::OK; } @@ -119,10 +144,6 @@ IndexProcessor::Options CreateIndexProcessorOptions( return index_processor_options; } -std::string MakeHeaderFilename(const std::string& base_dir) { - return absl_ports::StrCat(base_dir, "/", kIcingSearchEngineHeaderFilename); -} - // Document store files are in a standalone subfolder for easier file // management. We can delete and recreate the subfolder and not touch/affect // anything else. 
@@ -151,6 +172,14 @@ std::string MakeSchemaDirectoryPath(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName); } +std::string MakeSetSchemaMarkerFilePath(const std::string& base_dir) { + return absl_ports::StrCat(base_dir, "/", kSetSchemaMarkerFilename); +} + +std::string MakeInitMarkerFilePath(const std::string& base_dir) { + return absl_ports::StrCat(base_dir, "/", kInitMarkerFilename); +} + void TransformStatus(const libtextclassifier3::Status& internal_status, StatusProto* status_proto) { StatusProto::Code code; @@ -238,15 +267,13 @@ IcingSearchEngine::IcingSearchEngine( filesystem_(std::move(filesystem)), icing_filesystem_(std::move(icing_filesystem)), clock_(std::move(clock)), - result_state_manager_(performance_configuration_.max_num_hits_per_query, - performance_configuration_.max_num_cache_results), jni_cache_(std::move(jni_cache)) { ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir(); } IcingSearchEngine::~IcingSearchEngine() { if (initialized_) { - if (PersistToDisk().status().code() != StatusProto::OK) { + if (PersistToDisk(PersistType::FULL).status().code() != StatusProto::OK) { ICING_LOG(ERROR) << "Error persisting to disk in IcingSearchEngine destructor"; } @@ -261,6 +288,66 @@ InitializeResultProto IcingSearchEngine::Initialize() { return InternalInitialize(); } +void IcingSearchEngine::ResetMembers() { + schema_store_.reset(); + document_store_.reset(); + language_segmenter_.reset(); + normalizer_.reset(); + index_.reset(); +} + +libtextclassifier3::Status IcingSearchEngine::CheckInitMarkerFile( + InitializeStatsProto* initialize_stats) { + // Check to see if the marker file exists and if we've already passed our max + // number of init attempts. 
+ std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir()); + bool file_exists = filesystem_->FileExists(marker_filepath.c_str()); + int network_init_attempts = 0; + int host_init_attempts = 0; + + // Read the number of previous failed init attempts from the file. If it + // fails, then just assume the value is zero (the most likely reason for + // failure would be non-existence because the last init was successful + // anyways). + ScopedFd marker_file_fd(filesystem_->OpenForWrite(marker_filepath.c_str())); + libtextclassifier3::Status status; + if (file_exists && + filesystem_->PRead(marker_file_fd.get(), &network_init_attempts, + sizeof(network_init_attempts), /*offset=*/0)) { + host_init_attempts = GNetworkToHostL(network_init_attempts); + if (host_init_attempts > kMaxUnsuccessfulInitAttempts) { + // We're tried and failed to init too many times. We need to throw + // everything out and start from scratch. + ResetMembers(); + if (!filesystem_->DeleteDirectoryRecursively( + options_.base_dir().c_str())) { + return absl_ports::InternalError("Failed to delete icing base dir!"); + } + status = absl_ports::DataLossError( + "Encountered failed initialization limit. Cleared all data."); + host_init_attempts = 0; + } + } + + // Use network_init_attempts here because we might have set host_init_attempts + // to 0 if it exceeded the max threshold. + initialize_stats->set_num_previous_init_failures( + GNetworkToHostL(network_init_attempts)); + + ++host_init_attempts; + network_init_attempts = GHostToNetworkL(host_init_attempts); + // Write the updated number of attempts before we get started. 
+ if (!filesystem_->PWrite(marker_file_fd.get(), /*offset=*/0, + &network_init_attempts, + sizeof(network_init_attempts)) || + !filesystem_->DataSync(marker_file_fd.get())) { + return absl_ports::InternalError( + "Failed to write and sync init marker file"); + } + + return status; +} + InitializeResultProto IcingSearchEngine::InternalInitialize() { ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: " << options_.base_dir(); @@ -270,8 +357,8 @@ InitializeResultProto IcingSearchEngine::InternalInitialize() { InitializeResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); - NativeInitializeStats* initialize_stats = - result_proto.mutable_native_initialize_stats(); + InitializeStatsProto* initialize_stats = + result_proto.mutable_initialize_stats(); if (initialized_) { // Already initialized. result_status->set_code(StatusProto::OK); @@ -281,87 +368,42 @@ InitializeResultProto IcingSearchEngine::InternalInitialize() { return result_proto; } - // Releases result / query cache if any - result_state_manager_.InvalidateAllResultStates(); - + // Now go ahead and try to initialize. libtextclassifier3::Status status = InitializeMembers(initialize_stats); - if (!status.ok()) { - TransformStatus(status, result_status); - initialize_stats->set_latency_ms( - initialize_timer->GetElapsedMilliseconds()); - return result_proto; - } - - // Even if each subcomponent initialized fine independently, we need to - // check if they're consistent with each other. - if (!CheckConsistency().ok()) { - // The total checksum doesn't match the stored value, it could be one of the - // following cases: - // 1. Icing is initialized the first time in this directory. - // 2. Non-checksumed changes have been made to some files. - if (index_->last_added_document_id() == kInvalidDocumentId && - document_store_->last_added_document_id() == kInvalidDocumentId && - absl_ports::IsNotFound(schema_store_->GetSchema().status())) { - // First time initialize. 
Not recovering but creating all the files. - // We need to explicitly clear the recovery-related fields because some - // sub-components may not be able to tell if the storage is being - // initialized the first time or has lost some files. Sub-components may - // already have set these fields in earlier steps. - *initialize_stats = NativeInitializeStats(); - status = RegenerateDerivedFiles(); + if (status.ok() || absl_ports::IsDataLoss(status)) { + // We successfully initialized. We should delete the init marker file to + // indicate a successful init. + std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir()); + if (!filesystem_->DeleteFile(marker_filepath.c_str())) { + status = absl_ports::InternalError("Failed to delete init marker file!"); } else { - ICING_VLOG(1) - << "IcingSearchEngine in inconsistent state, regenerating all " - "derived data"; - // Total checksum mismatch may not be the root cause of document store - // recovery. Preserve the root cause that was set by the document store. - bool should_log_document_store_recovery_cause = - initialize_stats->document_store_recovery_cause() == - NativeInitializeStats::NONE; - if (should_log_document_store_recovery_cause) { - initialize_stats->set_document_store_recovery_cause( - NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH); - } - initialize_stats->set_index_restoration_cause( - NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH); - status = RegenerateDerivedFiles(initialize_stats, - should_log_document_store_recovery_cause); - } - } else { - DocumentId last_stored_document_id = - document_store_->last_added_document_id(); - DocumentId last_indexed_document_id = index_->last_added_document_id(); - if (last_stored_document_id != last_indexed_document_id) { - if (last_stored_document_id == kInvalidDocumentId) { - // Document store is empty but index is not. Reset the index. 
- status = index_->Reset(); - } else { - // Index is inconsistent with the document store, we need to restore the - // index. - initialize_stats->set_index_restoration_cause( - NativeInitializeStats::INCONSISTENT_WITH_GROUND_TRUTH); - std::unique_ptr<Timer> index_restore_timer = clock_->GetNewTimer(); - status = RestoreIndexIfNeeded(); - initialize_stats->set_index_restoration_latency_ms( - index_restore_timer->GetElapsedMilliseconds()); - } + initialized_ = true; } } - - if (status.ok() || absl_ports::IsDataLoss(status)) { - initialized_ = true; - } TransformStatus(status, result_status); initialize_stats->set_latency_ms(initialize_timer->GetElapsedMilliseconds()); return result_proto; } libtextclassifier3::Status IcingSearchEngine::InitializeMembers( - NativeInitializeStats* initialize_stats) { + InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(initialize_stats); - ICING_RETURN_IF_ERROR(InitializeOptions()); + ICING_RETURN_IF_ERROR(ValidateOptions(options_)); + + // Make sure the base directory exists + if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) { + return absl_ports::InternalError(absl_ports::StrCat( + "Could not create directory: ", options_.base_dir())); + } + + // Check to see if the marker file exists and if we've already passed our max + // number of init attempts. + libtextclassifier3::Status status = CheckInitMarkerFile(initialize_stats); + if (!status.ok() && !absl_ports::IsDataLoss(status)) { + return status; + } + ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats)); - ICING_RETURN_IF_ERROR(InitializeDocumentStore(initialize_stats)); // TODO(b/156383798) : Resolve how to specify the locale. 
language_segmenter_factory::SegmenterOptions segmenter_options( @@ -372,25 +414,86 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers( TC3_ASSIGN_OR_RETURN(normalizer_, normalizer_factory::Create(options_.max_token_length())); - ICING_RETURN_IF_ERROR(InitializeIndex(initialize_stats)); + std::string marker_filepath = + MakeSetSchemaMarkerFilePath(options_.base_dir()); + libtextclassifier3::Status index_init_status; + if (absl_ports::IsNotFound(schema_store_->GetSchema().status())) { + // The schema was either lost or never set before. Wipe out the doc store + // and index directories and initialize them from scratch. + const std::string doc_store_dir = + MakeDocumentDirectoryPath(options_.base_dir()); + const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir()); + if (!filesystem_->DeleteDirectoryRecursively(doc_store_dir.c_str()) || + !filesystem_->DeleteDirectoryRecursively(index_dir.c_str())) { + return absl_ports::InternalError(absl_ports::StrCat( + "Could not delete directories: ", index_dir, " and ", doc_store_dir)); + } + ICING_RETURN_IF_ERROR(InitializeDocumentStore( + /*force_recovery_and_revalidate_documents=*/false, initialize_stats)); + index_init_status = InitializeIndex(initialize_stats); + if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) { + return index_init_status; + } + } else if (filesystem_->FileExists(marker_filepath.c_str())) { + // If the marker file is still around then something wonky happened when we + // last tried to set the schema. + ICING_RETURN_IF_ERROR(InitializeDocumentStore( + /*force_recovery_and_revalidate_documents=*/true, initialize_stats)); + initialize_stats->set_document_store_recovery_cause( + InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC); + + // We're going to need to build the index from scratch. So just delete its + // files now. 
+ const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir()); + Index::Options index_options(index_dir, options_.index_merge_size()); + if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) || + !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) { + return absl_ports::InternalError( + absl_ports::StrCat("Could not recreate directory: ", index_dir)); + } + ICING_ASSIGN_OR_RETURN(index_, + Index::Create(index_options, filesystem_.get(), + icing_filesystem_.get())); - return libtextclassifier3::Status::OK; -} + std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer(); + IndexRestorationResult restore_result = RestoreIndexIfNeeded(); + index_init_status = std::move(restore_result.status); + // DATA_LOSS means that we have successfully initialized and re-added + // content to the index. Some indexed content was lost, but otherwise the + // index is in a valid state and can be queried. + if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) { + return index_init_status; + } -libtextclassifier3::Status IcingSearchEngine::InitializeOptions() { - ICING_RETURN_IF_ERROR(ValidateOptions(options_)); + // Delete the marker file to indicate that everything is now in sync with + // whatever changes were made to the schema. 
+ filesystem_->DeleteFile(marker_filepath.c_str()); - // Make sure the base directory exists - if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) { - return absl_ports::InternalError(absl_ports::StrCat( - "Could not create directory: ", options_.base_dir())); + initialize_stats->set_index_restoration_latency_ms( + restore_timer->GetElapsedMilliseconds()); + initialize_stats->set_index_restoration_cause( + InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC); + } else { + ICING_RETURN_IF_ERROR(InitializeDocumentStore( + /*force_recovery_and_revalidate_documents=*/false, initialize_stats)); + index_init_status = InitializeIndex(initialize_stats); + if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) { + return index_init_status; + } } - return libtextclassifier3::Status::OK; + if (status.ok()) { + status = index_init_status; + } + + result_state_manager_ = std::make_unique<ResultStateManager>( + performance_configuration_.max_num_total_hits, *document_store_); + + return status; } libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore( - NativeInitializeStats* initialize_stats) { + InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(initialize_stats); const std::string schema_store_dir = @@ -408,7 +511,8 @@ libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore( } libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore( - NativeInitializeStats* initialize_stats) { + bool force_recovery_and_revalidate_documents, + InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(initialize_stats); const std::string document_dir = @@ -420,15 +524,16 @@ libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore( } ICING_ASSIGN_OR_RETURN( DocumentStore::CreateResult create_result, - DocumentStore::Create(filesystem_.get(), document_dir, clock_.get(), - schema_store_.get(), initialize_stats)); + DocumentStore::Create( + filesystem_.get(), document_dir, 
clock_.get(), schema_store_.get(), + force_recovery_and_revalidate_documents, initialize_stats)); document_store_ = std::move(create_result.document_store); return libtextclassifier3::Status::OK; } libtextclassifier3::Status IcingSearchEngine::InitializeIndex( - NativeInitializeStats* initialize_stats) { + InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(initialize_stats); const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir()); @@ -439,6 +544,7 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( } Index::Options index_options(index_dir, options_.index_merge_size()); + InitializeStatsProto::RecoveryCause recovery_cause; auto index_or = Index::Create(index_options, filesystem_.get(), icing_filesystem_.get()); if (!index_or.ok()) { @@ -448,88 +554,28 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex( absl_ports::StrCat("Could not recreate directory: ", index_dir)); } - initialize_stats->set_index_restoration_cause( - NativeInitializeStats::IO_ERROR); + recovery_cause = InitializeStatsProto::IO_ERROR; // Try recreating it from scratch and re-indexing everything. ICING_ASSIGN_OR_RETURN(index_, Index::Create(index_options, filesystem_.get(), icing_filesystem_.get())); - - std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer(); - ICING_RETURN_IF_ERROR(RestoreIndexIfNeeded()); - initialize_stats->set_index_restoration_latency_ms( - restore_timer->GetElapsedMilliseconds()); } else { // Index was created fine. index_ = std::move(index_or).ValueOrDie(); + // If a recover does have to happen, then it must be because the index is + // out of sync with the document store. 
+ recovery_cause = InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH; } - return libtextclassifier3::Status::OK; -} - -libtextclassifier3::Status IcingSearchEngine::CheckConsistency() { - if (!HeaderExists()) { - // Without a header file, we have no checksum and can't even detect - // inconsistencies - return absl_ports::NotFoundError("No header file found."); - } - - // Header does exist, verify that the header looks fine. - IcingSearchEngine::Header header; - if (!filesystem_->Read(MakeHeaderFilename(options_.base_dir()).c_str(), - &header, sizeof(header))) { - return absl_ports::InternalError(absl_ports::StrCat( - "Couldn't read: ", MakeHeaderFilename(options_.base_dir()))); - } - - if (header.magic != IcingSearchEngine::Header::kMagic) { - return absl_ports::InternalError( - absl_ports::StrCat("Invalid header kMagic for file: ", - MakeHeaderFilename(options_.base_dir()))); - } - - ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum()); - if (checksum.Get() != header.checksum) { - return absl_ports::InternalError( - "IcingSearchEngine checksum doesn't match"); - } - - return libtextclassifier3::Status::OK; -} - -libtextclassifier3::Status IcingSearchEngine::RegenerateDerivedFiles( - NativeInitializeStats* initialize_stats, bool log_document_store_stats) { - // Measure the latency of the data recovery. The cause of the recovery should - // be logged by the caller. - std::unique_ptr<Timer> timer = clock_->GetNewTimer(); - ICING_RETURN_IF_ERROR( - document_store_->UpdateSchemaStore(schema_store_.get())); - if (initialize_stats != nullptr && log_document_store_stats) { - initialize_stats->set_document_store_recovery_latency_ms( - timer->GetElapsedMilliseconds()); - } - // Restart timer. 
- timer = clock_->GetNewTimer(); - ICING_RETURN_IF_ERROR(index_->Reset()); - ICING_RETURN_IF_ERROR(RestoreIndexIfNeeded()); - if (initialize_stats != nullptr) { + std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer(); + IndexRestorationResult restore_result = RestoreIndexIfNeeded(); + if (restore_result.needed_restoration) { initialize_stats->set_index_restoration_latency_ms( - timer->GetElapsedMilliseconds()); - } - - const std::string header_file = - MakeHeaderFilename(options_.base_dir().c_str()); - if (HeaderExists()) { - if (!filesystem_->DeleteFile(header_file.c_str())) { - return absl_ports::InternalError( - absl_ports::StrCat("Unable to delete file: ", header_file)); - } + restore_timer->GetElapsedMilliseconds()); + initialize_stats->set_index_restoration_cause(recovery_cause); } - ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum()); - ICING_RETURN_IF_ERROR(UpdateHeader(checksum)); - - return libtextclassifier3::Status::OK; + return restore_result.status; } SetSchemaResultProto IcingSearchEngine::SetSchema( @@ -545,33 +591,40 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( StatusProto* result_status = result_proto.mutable_status(); absl_ports::unique_lock l(&mutex_); + std::unique_ptr<Timer> timer = clock_->GetNewTimer(); if (!initialized_) { result_status->set_code(StatusProto::FAILED_PRECONDITION); result_status->set_message("IcingSearchEngine has not been initialized!"); - return result_proto; - } - - libtextclassifier3::Status status = SchemaUtil::Validate(new_schema); - if (!status.ok()) { - TransformStatus(status, result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } auto lost_previous_schema_or = LostPreviousSchema(); if (!lost_previous_schema_or.ok()) { TransformStatus(lost_previous_schema_or.status(), result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } bool lost_previous_schema = lost_previous_schema_or.ValueOrDie(); + std::string 
marker_filepath = + MakeSetSchemaMarkerFilePath(options_.base_dir()); + // Create the marker file indicating that we are going to apply a schema + // change. No need to write anything to the marker file - its existence is the + // only thing that matters. The marker file is used to indicate if we + // encountered a crash or a power loss while updating the schema and other + // files. So set it up to be deleted as long as we return from this function. + DestructibleFile marker_file(marker_filepath, filesystem_.get()); + auto set_schema_result_or = schema_store_->SetSchema( std::move(new_schema), ignore_errors_and_delete_documents); if (!set_schema_result_or.ok()) { TransformStatus(set_schema_result_or.status(), result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } - const SchemaStore::SetSchemaResult set_schema_result = - set_schema_result_or.ValueOrDie(); + SchemaStore::SetSchemaResult set_schema_result = + std::move(set_schema_result_or).ValueOrDie(); for (const std::string& deleted_type : set_schema_result.schema_types_deleted_by_name) { @@ -583,6 +636,26 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( result_proto.add_incompatible_schema_types(incompatible_type); } + for (const std::string& new_type : + set_schema_result.schema_types_new_by_name) { + result_proto.add_new_schema_types(std::move(new_type)); + } + + for (const std::string& compatible_type : + set_schema_result.schema_types_changed_fully_compatible_by_name) { + result_proto.add_fully_compatible_changed_schema_types( + std::move(compatible_type)); + } + + bool index_incompatible = + !set_schema_result.schema_types_index_incompatible_by_name.empty(); + for (const std::string& index_incompatible_type : + set_schema_result.schema_types_index_incompatible_by_name) { + result_proto.add_index_incompatible_changed_schema_types( + std::move(index_incompatible_type)); + } + + libtextclassifier3::Status status; if (set_schema_result.success) { if 
(lost_previous_schema) { // No previous schema to calculate a diff against. We have to go through @@ -590,6 +663,7 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( status = document_store_->UpdateSchemaStore(schema_store_.get()); if (!status.ok()) { TransformStatus(status, result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } } else if (!set_schema_result.old_schema_type_ids_changed.empty() || @@ -599,21 +673,28 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( set_schema_result); if (!status.ok()) { TransformStatus(status, result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } } - if (lost_previous_schema || set_schema_result.index_incompatible) { + if (lost_previous_schema || index_incompatible) { // Clears all index files status = index_->Reset(); if (!status.ok()) { TransformStatus(status, result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } - status = RestoreIndexIfNeeded(); - if (!status.ok()) { + IndexRestorationResult restore_result = RestoreIndexIfNeeded(); + // DATA_LOSS means that we have successfully re-added content to the + // index. Some indexed content was lost, but otherwise the index is in a + // valid state and can be queried. 
+ if (!restore_result.status.ok() && + !absl_ports::IsDataLoss(restore_result.status)) { TransformStatus(status, result_status); + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } } @@ -623,6 +704,8 @@ SetSchemaResultProto IcingSearchEngine::SetSchema( result_status->set_code(StatusProto::FAILED_PRECONDITION); result_status->set_message("Schema is incompatible."); } + + result_proto.set_latency_ms(timer->GetElapsedMilliseconds()); return result_proto; } @@ -682,8 +765,8 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) { PutResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); - NativePutDocumentStats* put_document_stats = - result_proto.mutable_native_put_document_stats(); + PutDocumentStatsProto* put_document_stats = + result_proto.mutable_put_document_stats(); // Lock must be acquired before validation because the DocumentStore uses // the schema file to validate, and the schema could be changed in @@ -833,8 +916,8 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space, return result_proto; } - NativeDeleteStats* delete_stats = result_proto.mutable_delete_stats(); - delete_stats->set_delete_type(NativeDeleteStats::DeleteType::SINGLE); + DeleteStatsProto* delete_stats = result_proto.mutable_delete_stats(); + delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE); std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer(); // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR @@ -867,8 +950,8 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace( return delete_result; } - NativeDeleteStats* delete_stats = delete_result.mutable_delete_stats(); - delete_stats->set_delete_type(NativeDeleteStats::DeleteType::NAMESPACE); + DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats(); + delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE); std::unique_ptr<Timer> delete_timer = 
clock_->GetNewTimer(); // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR @@ -901,8 +984,8 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType( return delete_result; } - NativeDeleteStats* delete_stats = delete_result.mutable_delete_stats(); - delete_stats->set_delete_type(NativeDeleteStats::DeleteType::SCHEMA_TYPE); + DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats(); + delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE); std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer(); // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR @@ -937,8 +1020,13 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( return result_proto; } - NativeDeleteStats* delete_stats = result_proto.mutable_delete_stats(); - delete_stats->set_delete_type(NativeDeleteStats::DeleteType::QUERY); + DeleteByQueryStatsProto* delete_stats = + result_proto.mutable_delete_by_query_stats(); + delete_stats->set_query_length(search_spec.query().length()); + delete_stats->set_num_namespaces_filtered( + search_spec.namespace_filters_size()); + delete_stats->set_num_schema_types_filtered( + search_spec.schema_type_filters_size()); std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer(); libtextclassifier3::Status status = @@ -948,10 +1036,11 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( return result_proto; } + std::unique_ptr<Timer> component_timer = clock_->GetNewTimer(); // Gets unordered results from query processor auto query_processor_or = QueryProcessor::Create( index_.get(), language_segmenter_.get(), normalizer_.get(), - document_store_.get(), schema_store_.get(), clock_.get()); + document_store_.get(), schema_store_.get()); if (!query_processor_or.ok()) { TransformStatus(query_processor_or.status(), result_status); return result_proto; @@ -966,9 +1055,13 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( } QueryProcessor::QueryResults query_results 
= std::move(query_results_or).ValueOrDie(); + delete_stats->set_parse_query_latency_ms( + component_timer->GetElapsedMilliseconds()); ICING_VLOG(2) << "Deleting the docs that matched the query."; int num_deleted = 0; + + component_timer = clock_->GetNewTimer(); while (query_results.root_iterator->Advance().ok()) { ICING_VLOG(3) << "Deleting doc " << query_results.root_iterator->doc_hit_info().document_id(); @@ -980,6 +1073,14 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( return result_proto; } } + delete_stats->set_document_removal_latency_ms( + component_timer->GetElapsedMilliseconds()); + int term_count = 0; + for (const auto& section_and_terms : query_results.query_terms) { + term_count += section_and_terms.second.size(); + } + delete_stats->set_num_terms(term_count); + if (num_deleted > 0) { result_proto.mutable_status()->set_code(StatusProto::OK); } else { @@ -992,7 +1093,8 @@ DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery( return result_proto; } -PersistToDiskResultProto IcingSearchEngine::PersistToDisk() { +PersistToDiskResultProto IcingSearchEngine::PersistToDisk( + PersistType::Code persist_type) { ICING_VLOG(1) << "Persisting data to disk"; PersistToDiskResultProto result_proto; @@ -1005,7 +1107,7 @@ PersistToDiskResultProto IcingSearchEngine::PersistToDisk() { return result_proto; } - auto status = InternalPersistToDisk(); + auto status = InternalPersistToDisk(persist_type); TransformStatus(status, result_status); return result_proto; } @@ -1029,11 +1131,18 @@ OptimizeResultProto IcingSearchEngine::Optimize() { return result_proto; } - // Releases result / query cache if any - result_state_manager_.InvalidateAllResultStates(); + std::unique_ptr<Timer> optimize_timer = clock_->GetNewTimer(); + OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats(); + int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str()); + if (before_size != Filesystem::kBadFileSize) { + 
optimize_stats->set_storage_size_before(before_size); + } else { + // Set -1 as a sentinel value when failures occur. + optimize_stats->set_storage_size_before(-1); + } // Flushes data to disk before doing optimization - auto status = InternalPersistToDisk(); + auto status = InternalPersistToDisk(PersistType::FULL); if (!status.ok()) { TransformStatus(status, result_status); return result_proto; @@ -1041,7 +1150,11 @@ OptimizeResultProto IcingSearchEngine::Optimize() { // TODO(b/143646633): figure out if we need to optimize index and doc store // at the same time. - libtextclassifier3::Status optimization_status = OptimizeDocumentStore(); + std::unique_ptr<Timer> optimize_doc_store_timer = clock_->GetNewTimer(); + libtextclassifier3::Status optimization_status = + OptimizeDocumentStore(optimize_stats); + optimize_stats->set_document_store_optimize_latency_ms( + optimize_doc_store_timer->GetElapsedMilliseconds()); if (!optimization_status.ok() && !absl_ports::IsDataLoss(optimization_status)) { @@ -1055,6 +1168,7 @@ OptimizeResultProto IcingSearchEngine::Optimize() { // The status is either OK or DATA_LOSS. The optimized document store is // guaranteed to work, so we update index according to the new document store. + std::unique_ptr<Timer> optimize_index_timer = clock_->GetNewTimer(); libtextclassifier3::Status index_reset_status = index_->Reset(); if (!index_reset_status.ok()) { status = absl_ports::Annotate( @@ -1064,17 +1178,52 @@ OptimizeResultProto IcingSearchEngine::Optimize() { return result_proto; } - libtextclassifier3::Status index_restoration_status = RestoreIndexIfNeeded(); - if (!index_restoration_status.ok()) { + IndexRestorationResult index_restoration_status = RestoreIndexIfNeeded(); + optimize_stats->set_index_restoration_latency_ms( + optimize_index_timer->GetElapsedMilliseconds()); + // DATA_LOSS means that we have successfully re-added content to the index. 
+ // Some indexed content was lost, but otherwise the index is in a valid state + // and can be queried. + if (!index_restoration_status.status.ok() && + !absl_ports::IsDataLoss(index_restoration_status.status)) { status = absl_ports::Annotate( absl_ports::InternalError( "Failed to reindex documents after optimization."), - index_restoration_status.error_message()); + index_restoration_status.status.error_message()); TransformStatus(status, result_status); return result_proto; } + // Read the optimize status to get the time that we last ran. + std::string optimize_status_filename = + absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename); + FileBackedProto<OptimizeStatusProto> optimize_status_file( + *filesystem_, optimize_status_filename); + auto optimize_status_or = optimize_status_file.Read(); + int64_t current_time = clock_->GetSystemTimeMilliseconds(); + if (optimize_status_or.ok()) { + // If we have trouble reading the status or this is the first time that + // we've ever run, don't set this field. + optimize_stats->set_time_since_last_optimize_ms( + current_time - optimize_status_or.ValueOrDie() + ->last_successful_optimize_run_time_ms()); + } + + // Update the status for this run and write it. + auto optimize_status = std::make_unique<OptimizeStatusProto>(); + optimize_status->set_last_successful_optimize_run_time_ms(current_time); + optimize_status_file.Write(std::move(optimize_status)); + + int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str()); + if (after_size != Filesystem::kBadFileSize) { + optimize_stats->set_storage_size_after(after_size); + } else { + // Set -1 as a sentinel value when failures occur. 
+ optimize_stats->set_storage_size_after(-1); + } + optimize_stats->set_latency_ms(optimize_timer->GetElapsedMilliseconds()); + TransformStatus(optimization_status, result_status); return result_proto; } @@ -1092,6 +1241,22 @@ GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() { return result_proto; } + // Read the optimize status to get the time that we last ran. + std::string optimize_status_filename = + absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename); + FileBackedProto<OptimizeStatusProto> optimize_status_file( + *filesystem_, optimize_status_filename); + auto optimize_status_or = optimize_status_file.Read(); + int64_t current_time = clock_->GetSystemTimeMilliseconds(); + + if (optimize_status_or.ok()) { + // If we have trouble reading the status or this is the first time that + // we've ever run, don't set this field. + result_proto.set_time_since_last_optimize_ms( + current_time - optimize_status_or.ValueOrDie() + ->last_successful_optimize_run_time_ms()); + } + // Get stats from DocumentStore auto doc_store_optimize_info_or = document_store_->GetOptimizeInfo(); if (!doc_store_optimize_info_or.ok()) { @@ -1127,74 +1292,41 @@ GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() { return result_proto; } -libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk() { - ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk()); - ICING_RETURN_IF_ERROR(document_store_->PersistToDisk()); - ICING_RETURN_IF_ERROR(index_->PersistToDisk()); - - // Update the combined checksum and write to header file. - ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum()); - ICING_RETURN_IF_ERROR(UpdateHeader(checksum)); - - return libtextclassifier3::Status::OK; -} - -libtextclassifier3::StatusOr<Crc32> IcingSearchEngine::ComputeChecksum() { - Crc32 total_checksum; - // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN - // that can support error logging. 
- auto checksum_or = schema_store_->ComputeChecksum(); - if (!checksum_or.ok()) { - ICING_LOG(ERROR) << checksum_or.status().error_message() - << "Failed to compute checksum of SchemaStore"; - return checksum_or.status(); - } - - Crc32 schema_store_checksum = std::move(checksum_or).ValueOrDie(); - - // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN - // that can support error logging. - checksum_or = document_store_->ComputeChecksum(); - if (!checksum_or.ok()) { - ICING_LOG(ERROR) << checksum_or.status().error_message() - << "Failed to compute checksum of DocumentStore"; - return checksum_or.status(); +StorageInfoResultProto IcingSearchEngine::GetStorageInfo() { + StorageInfoResultProto result; + absl_ports::shared_lock l(&mutex_); + if (!initialized_) { + result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION); + result.mutable_status()->set_message( + "IcingSearchEngine has not been initialized!"); + return result; } - Crc32 document_store_checksum = std::move(checksum_or).ValueOrDie(); - total_checksum.Append(std::to_string(document_store_checksum.Get())); - total_checksum.Append(std::to_string(schema_store_checksum.Get())); - - return total_checksum; + int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str()); + if (index_size != Filesystem::kBadFileSize) { + result.mutable_storage_info()->set_total_storage_size(index_size); + } else { + result.mutable_storage_info()->set_total_storage_size(-1); + } + *result.mutable_storage_info()->mutable_document_storage_info() = + document_store_->GetStorageInfo(); + *result.mutable_storage_info()->mutable_schema_store_storage_info() = + schema_store_->GetStorageInfo(); + *result.mutable_storage_info()->mutable_index_storage_info() = + index_->GetStorageInfo(); + result.mutable_status()->set_code(StatusProto::OK); + return result; } -bool IcingSearchEngine::HeaderExists() { - if (!filesystem_->FileExists( - MakeHeaderFilename(options_.base_dir()).c_str())) { - return 
false; +libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk( + PersistType::Code persist_type) { + if (persist_type == PersistType::LITE) { + return document_store_->PersistToDisk(persist_type); } + ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk()); + ICING_RETURN_IF_ERROR(document_store_->PersistToDisk(PersistType::FULL)); + ICING_RETURN_IF_ERROR(index_->PersistToDisk()); - int64_t file_size = - filesystem_->GetFileSize(MakeHeaderFilename(options_.base_dir()).c_str()); - - // If it's been truncated to size 0 before, we consider it to be a new file - return file_size != 0 && file_size != Filesystem::kBadFileSize; -} - -libtextclassifier3::Status IcingSearchEngine::UpdateHeader( - const Crc32& checksum) { - // Write the header - IcingSearchEngine::Header header; - header.magic = IcingSearchEngine::Header::kMagic; - header.checksum = checksum.Get(); - - // This should overwrite the header. - if (!filesystem_->Write(MakeHeaderFilename(options_.base_dir()).c_str(), - &header, sizeof(header))) { - return absl_ports::InternalError( - absl_ports::StrCat("Failed to write IcingSearchEngine header: ", - MakeHeaderFilename(options_.base_dir()))); - } return libtextclassifier3::Status::OK; } @@ -1211,7 +1343,8 @@ SearchResultProto IcingSearchEngine::Search( return result_proto; } - NativeQueryStats* query_stats = result_proto.mutable_query_stats(); + QueryStatsProto* query_stats = result_proto.mutable_query_stats(); + query_stats->set_query_length(search_spec.query().length()); std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer(); libtextclassifier3::Status status = ValidateResultSpec(result_spec); @@ -1237,7 +1370,7 @@ SearchResultProto IcingSearchEngine::Search( // Gets unordered results from query processor auto query_processor_or = QueryProcessor::Create( index_.get(), language_segmenter_.get(), normalizer_.get(), - document_store_.get(), schema_store_.get(), clock_.get()); + document_store_.get(), schema_store_.get()); if 
(!query_processor_or.ok()) { TransformStatus(query_processor_or.status(), result_status); return result_proto; @@ -1289,9 +1422,9 @@ SearchResultProto IcingSearchEngine::Search( component_timer = clock_->GetNewTimer(); // Ranks and paginates results libtextclassifier3::StatusOr<PageResultState> page_result_state_or = - result_state_manager_.RankAndPaginate(ResultState( + result_state_manager_->RankAndPaginate(ResultState( std::move(result_document_hits), std::move(query_results.query_terms), - search_spec, scoring_spec, result_spec)); + search_spec, scoring_spec, result_spec, *document_store_)); if (!page_result_state_or.ok()) { TransformStatus(page_result_state_or.status(), result_status); return result_proto; @@ -1307,7 +1440,7 @@ SearchResultProto IcingSearchEngine::Search( ResultRetriever::Create(document_store_.get(), schema_store_.get(), language_segmenter_.get(), normalizer_.get()); if (!result_retriever_or.ok()) { - result_state_manager_.InvalidateResultState( + result_state_manager_->InvalidateResultState( page_result_state.next_page_token); TransformStatus(result_retriever_or.status(), result_status); return result_proto; @@ -1318,7 +1451,7 @@ SearchResultProto IcingSearchEngine::Search( libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>> results_or = result_retriever->RetrieveResults(page_result_state); if (!results_or.ok()) { - result_state_manager_.InvalidateResultState( + result_state_manager_->InvalidateResultState( page_result_state.next_page_token); TransformStatus(results_or.status(), result_status); return result_proto; @@ -1340,7 +1473,7 @@ SearchResultProto IcingSearchEngine::Search( query_stats->set_latency_ms(overall_timer->GetElapsedMilliseconds()); query_stats->set_num_results_returned_current_page( result_proto.results_size()); - query_stats->set_num_results_snippeted( + query_stats->set_num_results_with_snippets( std::min(result_proto.results_size(), result_spec.snippet_spec().num_to_snippet())); return result_proto; 
@@ -1359,12 +1492,12 @@ SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) { return result_proto; } - NativeQueryStats* query_stats = result_proto.mutable_query_stats(); + QueryStatsProto* query_stats = result_proto.mutable_query_stats(); query_stats->set_is_first_page(false); std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer(); libtextclassifier3::StatusOr<PageResultState> page_result_state_or = - result_state_manager_.GetNextPage(next_page_token); + result_state_manager_->GetNextPage(next_page_token); if (!page_result_state_or.ok()) { if (absl_ports::IsNotFound(page_result_state_or.status())) { @@ -1424,7 +1557,7 @@ SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) { std::max(page_result_state.snippet_context.snippet_spec.num_to_snippet() - page_result_state.num_previously_returned, 0); - query_stats->set_num_results_snippeted( + query_stats->set_num_results_with_snippets( std::min(result_proto.results_size(), num_left_to_snippet)); return result_proto; } @@ -1435,10 +1568,11 @@ void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) { ICING_LOG(ERROR) << "IcingSearchEngine has not been initialized!"; return; } - result_state_manager_.InvalidateResultState(next_page_token); + result_state_manager_->InvalidateResultState(next_page_token); } -libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() { +libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore( + OptimizeStatsProto* optimize_stats) { // Gets the current directory path and an empty tmp directory path for // document store optimization. 
const std::string current_document_dir = @@ -1455,7 +1589,7 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() { // Copies valid document data to tmp directory auto optimize_status = document_store_->OptimizeInto( - temporary_document_dir, language_segmenter_.get()); + temporary_document_dir, language_segmenter_.get(), optimize_stats); // Handles error if any if (!optimize_status.ok()) { @@ -1465,7 +1599,9 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() { optimize_status.error_message()); } - // Resets before swapping + // result_state_manager_ depends on document_store_. So we need to reset it at + // the same time that we reset the document_store_. + result_state_manager_.reset(); document_store_.reset(); // When swapping files, always put the current working directory at the @@ -1502,6 +1638,8 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() { create_result_or.status().error_message()); } document_store_ = std::move(create_result_or.ValueOrDie().document_store); + result_state_manager_ = std::make_unique<ResultStateManager>( + performance_configuration_.max_num_total_hits, *document_store_); // Potential data loss // TODO(b/147373249): Find a way to detect true data loss error @@ -1522,6 +1660,8 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() { "instance can't be created"); } document_store_ = std::move(create_result_or.ValueOrDie().document_store); + result_state_manager_ = std::make_unique<ResultStateManager>( + performance_configuration_.max_num_total_hits, *document_store_); // Deletes tmp directory if (!filesystem_->DeleteDirectoryRecursively( @@ -1529,23 +1669,23 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() { ICING_LOG(ERROR) << "Document store has been optimized, but it failed to " "delete temporary file directory"; } - return libtextclassifier3::Status::OK; } -libtextclassifier3::Status IcingSearchEngine::RestoreIndexIfNeeded() { 
+IcingSearchEngine::IndexRestorationResult +IcingSearchEngine::RestoreIndexIfNeeded() { DocumentId last_stored_document_id = document_store_->last_added_document_id(); DocumentId last_indexed_document_id = index_->last_added_document_id(); if (last_stored_document_id == last_indexed_document_id) { // No need to recover. - return libtextclassifier3::Status::OK; + return {libtextclassifier3::Status::OK, false}; } if (last_stored_document_id == kInvalidDocumentId) { // Document store is empty but index is not. Reset the index. - return index_->Reset(); + return {index_->Reset(), false}; } // TruncateTo ensures that the index does not hold any data that is not @@ -1554,17 +1694,29 @@ libtextclassifier3::Status IcingSearchEngine::RestoreIndexIfNeeded() { // lost documents. If the index does not contain any hits for documents with // document id greater than last_stored_document_id, then TruncateTo will have // no effect. - ICING_RETURN_IF_ERROR(index_->TruncateTo(last_stored_document_id)); + auto status = index_->TruncateTo(last_stored_document_id); + if (!status.ok()) { + return {status, false}; + } + // Last indexed document id may have changed thanks to TruncateTo. + last_indexed_document_id = index_->last_added_document_id(); DocumentId first_document_to_reindex = (last_indexed_document_id != kInvalidDocumentId) ? index_->last_added_document_id() + 1 : kMinDocumentId; + if (first_document_to_reindex > last_stored_document_id) { + // Nothing to restore. Just return. 
+ return {libtextclassifier3::Status::OK, false}; + } - ICING_ASSIGN_OR_RETURN( - std::unique_ptr<IndexProcessor> index_processor, - IndexProcessor::Create(normalizer_.get(), index_.get(), - CreateIndexProcessorOptions(options_), - clock_.get())); + auto index_processor_or = IndexProcessor::Create( + normalizer_.get(), index_.get(), CreateIndexProcessorOptions(options_), + clock_.get()); + if (!index_processor_or.ok()) { + return {index_processor_or.status(), true}; + } + std::unique_ptr<IndexProcessor> index_processor = + std::move(index_processor_or).ValueOrDie(); ICING_VLOG(1) << "Restoring index by replaying documents from document id " << first_document_to_reindex << " to document id " @@ -1582,7 +1734,7 @@ libtextclassifier3::Status IcingSearchEngine::RestoreIndexIfNeeded() { continue; } else { // Returns other errors - return document_or.status(); + return {document_or.status(), true}; } } DocumentProto document(std::move(document_or).ValueOrDie()); @@ -1592,7 +1744,7 @@ libtextclassifier3::Status IcingSearchEngine::RestoreIndexIfNeeded() { language_segmenter_.get(), std::move(document)); if (!tokenized_document_or.ok()) { - return tokenized_document_or.status(); + return {tokenized_document_or.status(), true}; } TokenizedDocument tokenized_document( std::move(tokenized_document_or).ValueOrDie()); @@ -1602,7 +1754,7 @@ libtextclassifier3::Status IcingSearchEngine::RestoreIndexIfNeeded() { if (!status.ok()) { if (!absl_ports::IsDataLoss(status)) { // Real error. Stop recovering and pass it up. - return status; + return {status, true}; } // Just a data loss. Keep trying to add the remaining docs, but report the // data loss when we're done. 
@@ -1610,7 +1762,7 @@ libtextclassifier3::Status IcingSearchEngine::RestoreIndexIfNeeded() { } } - return overall_status; + return {overall_status, true}; } libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() { @@ -1642,24 +1794,14 @@ ResetResultProto IcingSearchEngine::Reset() { ResetResultProto result_proto; StatusProto* result_status = result_proto.mutable_status(); - int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str()); - + absl_ports::unique_lock l(&mutex_); + initialized_ = false; + ResetMembers(); if (!filesystem_->DeleteDirectoryRecursively(options_.base_dir().c_str())) { - int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str()); - if (after_size != before_size) { - // Our filesystem doesn't atomically delete. If we have a discrepancy in - // size, then that means we may have deleted some files, but not others. - // So our data is in an invalid state now. - result_status->set_code(StatusProto::INTERNAL); - return result_proto; - } - - result_status->set_code(StatusProto::ABORTED); + result_status->set_code(StatusProto::INTERNAL); return result_proto; } - absl_ports::unique_lock l(&mutex_); - initialized_ = false; if (InternalInitialize().status().code() != StatusProto::OK) { // We shouldn't hit the following Initialize errors: // NOT_FOUND: all data was cleared, we aren't expecting anything diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h index a899131..65960a3 100644 --- a/icing/icing-search-engine.h +++ b/icing/icing-search-engine.h @@ -37,6 +37,7 @@ #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" #include "icing/proto/search.pb.h" +#include "icing/proto/storage.pb.h" #include "icing/proto/usage.pb.h" #include "icing/result/result-state-manager.h" #include "icing/schema/schema-store.h" @@ -52,16 +53,6 @@ namespace lib { // TODO(cassiewang) Top-level comments and links to design-doc. 
class IcingSearchEngine { public: - struct Header { - static constexpr int32_t kMagic = 0x6e650d0a; - - // Holds the magic as a quick sanity check against file corruption. - int32_t magic; - - // Checksum of the IcingSearchEngine's sub-component's checksums. - uint32_t checksum; - }; - // Note: It is only required to provide a pointer to a valid instance of // JniCache if this instance needs to perform reverse-jni calls. Users on // Linux and iOS should always provide a nullptr. @@ -187,6 +178,7 @@ class IcingSearchEngine { // // Returns: // OK on success + // OUT_OF_SPACE if exceeds maximum number of allowed documents // FAILED_PRECONDITION if a schema has not been set yet, IcingSearchEngine // has not been initialized yet. // NOT_FOUND if there is no SchemaTypeConfig in the SchemaProto that matches @@ -328,12 +320,26 @@ class IcingSearchEngine { // Invalidates the next-page token so that no more results of the related // query can be returned. - void InvalidateNextPageToken(uint64_t next_page_token); + void InvalidateNextPageToken(uint64_t next_page_token) + ICING_LOCKS_EXCLUDED(mutex_); // Makes sure that every update/delete received till this point is flushed // to disk. If the app crashes after a call to PersistToDisk(), Icing // would be able to fully recover all data written up to this point. // + // If persist_type is PersistType::LITE, then only the ground truth will be + // synced. This should be relatively lightweight to do (order of microseconds) + // and ensures that there will be no data loss. At worst, Icing may need to + // recover internal data structures by replaying the document log upon the + // next startup. Clients should call PersistToDisk(LITE) after each batch of + // mutations. + // + // If persist_type is PersistType::FULL, then all internal data structures in + // Icing will be synced. This is a heavier operation (order of milliseconds). 
+ // It ensures that Icing will not need to recover internal data structures + // upon the next startup. Clients should call PersistToDisk(FULL) before their + // process dies. + // // NOTE: It is not necessary to call PersistToDisk() to read back data // that was recently written. All read APIs will include the most recent // updates/deletes regardless of the data being flushed to disk. @@ -342,7 +348,8 @@ class IcingSearchEngine { // OK on success // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet // INTERNAL on I/O error - PersistToDiskResultProto PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_); + PersistToDiskResultProto PersistToDisk(PersistType::Code persist_type) + ICING_LOCKS_EXCLUDED(mutex_); // Allows Icing to run tasks that are too expensive and/or unnecessary to be // executed in real-time, but are useful to keep it fast and be @@ -378,6 +385,12 @@ class IcingSearchEngine { // INTERNAL_ERROR on IO error GetOptimizeInfoResultProto GetOptimizeInfo() ICING_LOCKS_EXCLUDED(mutex_); + // Calculates the StorageInfo for Icing. + // + // If an IO error occurs while trying to calculate the value for a field, then + // that field will be set to -1. + StorageInfoResultProto GetStorageInfo() ICING_LOCKS_EXCLUDED(mutex_); + // Clears all data from Icing and re-initializes. Clients DO NOT need to call // Initialize again. // @@ -416,7 +429,8 @@ class IcingSearchEngine { // acquired first in order to adhere to the global lock ordering: // 1. mutex_ // 2. result_state_manager_.lock_ - ResultStateManager result_state_manager_ ICING_GUARDED_BY(mutex_); + std::unique_ptr<ResultStateManager> result_state_manager_ + ICING_GUARDED_BY(mutex_); // Used to provide reader and writer locks absl_ports::shared_mutex mutex_; @@ -438,12 +452,27 @@ class IcingSearchEngine { // Pointer to JNI class references const std::unique_ptr<const JniCache> jni_cache_; + // Resets all members that are created during Initialize. 
+ void ResetMembers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Checks for the existence of the init marker file. If the failed init count + // exceeds kMaxUnsuccessfulInitAttempts, all data is deleted and the index is + // initialized from scratch. The updated count (original failed init count + 1 + // ) is written to the marker file. + // + // RETURNS + // OK on success + // INTERNAL if an IO error occurs while trying to update the marker file. + libtextclassifier3::Status CheckInitMarkerFile( + InitializeStatsProto* initialize_stats) + ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + // Helper method to do the actual work to persist data to disk. We need this // separate method so that other public methods don't need to call // PersistToDisk(). Public methods calling each other may cause deadlock // issues. - libtextclassifier3::Status InternalPersistToDisk() - ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + libtextclassifier3::Status InternalPersistToDisk( + PersistType::Code persist_type) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Helper method to the actual work to Initialize. We need this separate // method so that other public methods don't need to call Initialize(). Public @@ -460,16 +489,7 @@ class IcingSearchEngine { // NOT_FOUND if some Document's schema type is not in the SchemaStore // INTERNAL on any I/O errors libtextclassifier3::Status InitializeMembers( - NativeInitializeStats* initialize_stats) - ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Do any validation/setup required for the given IcingSearchEngineOptions - // - // Returns: - // OK on success - // INVALID_ARGUMENT if options has invalid values - // INTERNAL on I/O error - libtextclassifier3::Status InitializeOptions() + InitializeStatsProto* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Do any initialization/recovery necessary to create a SchemaStore instance. 
@@ -479,18 +499,22 @@ class IcingSearchEngine { // FAILED_PRECONDITION if initialize_stats is null // INTERNAL on I/O error libtextclassifier3::Status InitializeSchemaStore( - NativeInitializeStats* initialize_stats) + InitializeStatsProto* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Do any initialization/recovery necessary to create a DocumentStore // instance. // + // See comments on DocumentStore::Create for explanation of + // force_recovery_and_revalidate_documents. + // // Returns: // OK on success // FAILED_PRECONDITION if initialize_stats is null // INTERNAL on I/O error libtextclassifier3::Status InitializeDocumentStore( - NativeInitializeStats* initialize_stats) + bool force_recovery_and_revalidate_documents, + InitializeStatsProto* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Do any initialization/recovery necessary to create a DocumentStore @@ -503,7 +527,7 @@ class IcingSearchEngine { // NOT_FOUND if some Document's schema type is not in the SchemaStore // INTERNAL on I/O error libtextclassifier3::Status InitializeIndex( - NativeInitializeStats* initialize_stats) + InitializeStatsProto* initialize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Many of the internal components rely on other components' derived data. 
@@ -527,7 +551,7 @@ class IcingSearchEngine { // OK on success // INTERNAL_ERROR on any IO errors libtextclassifier3::Status RegenerateDerivedFiles( - NativeInitializeStats* initialize_stats = nullptr, + InitializeStatsProto* initialize_stats = nullptr, bool log_document_store_stats = false) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); @@ -545,7 +569,8 @@ class IcingSearchEngine { // document store is still available // INTERNAL_ERROR on any IO errors or other errors that we can't recover // from - libtextclassifier3::Status OptimizeDocumentStore() + libtextclassifier3::Status OptimizeDocumentStore( + OptimizeStatsProto* optimize_stats) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Helper method to restore missing document data in index_. All documents @@ -553,29 +578,19 @@ class IcingSearchEngine { // call Index::Reset first. // // Returns: - // OK on success + // On success, OK and a bool indicating whether or not restoration was + // needed. + // DATA_LOSS, if an error during index merging caused us to lose indexed + // data in the main index. Despite the data loss, this is still considered + // a successful run and needed_restoration will be set to true. // RESOURCE_EXHAUSTED if the index fills up before finishing indexing // NOT_FOUND if some Document's schema type is not in the SchemaStore // INTERNAL_ERROR on any IO errors - libtextclassifier3::Status RestoreIndexIfNeeded() - ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Computes the combined checksum of the IcingSearchEngine - includes all its - // subcomponents - // - // Returns: - // Combined checksum on success - // INTERNAL_ERROR on compute error - libtextclassifier3::StatusOr<Crc32> ComputeChecksum() - ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Checks if the header exists already. This does not create the header file - // if it doesn't exist. - bool HeaderExists() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Update and replace the header file. Creates the header file if it doesn't - // exist. 
- libtextclassifier3::Status UpdateHeader(const Crc32& checksum) + struct IndexRestorationResult { + libtextclassifier3::Status status; + bool needed_restoration; + }; + IndexRestorationResult RestoreIndexIfNeeded() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // If we lost the schema during a previous failure, it may "look" the same as diff --git a/icing/icing-search-engine_benchmark.cc b/icing/icing-search-engine_benchmark.cc index 9d33a82..ba9aed1 100644 --- a/icing/icing-search-engine_benchmark.cc +++ b/icing/icing-search-engine_benchmark.cc @@ -39,6 +39,7 @@ #include "icing/proto/search.pb.h" #include "icing/proto/status.pb.h" #include "icing/proto/term.pb.h" +#include "icing/schema-builder.h" #include "icing/testing/common-matchers.h" #include "icing/testing/document-generator.h" #include "icing/testing/random-string.h" @@ -69,6 +70,7 @@ namespace lib { namespace { using ::testing::Eq; +using ::testing::HasSubstr; // Icing GMSCore has, on average, 17 corpora on a device and 30 corpora at the // 95th pct. Most clients use a single type. This is a function of Icing's @@ -462,6 +464,287 @@ BENCHMARK(BM_MutlipleIndices) ->ArgPair(10, 32768) ->ArgPair(10, 131072); +void BM_SearchNoStackOverflow(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TermMatchType::PREFIX, + StringIndexingConfig::TokenizerType::PLAIN) + .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))) + .Build(); + + // Create the index. 
+ IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + // Create a document that has the term "foo" + DocumentProto base_document = DocumentBuilder() + .SetSchema("Message") + .SetNamespace("namespace") + .AddStringProperty("body", "foo") + .Build(); + + // Insert a lot of documents with the term "foo" + int64_t num_docs = state.range(0); + for (int64_t i = 0; i < num_docs; ++i) { + DocumentProto document = + DocumentBuilder(base_document).SetUri(std::to_string(i)).Build(); + ASSERT_THAT(icing->Put(document).status(), ProtoIsOk()); + } + + // Do a query and exclude documents with the term "foo". The way this is + // currently implemented is that we'll iterate over all the documents in the + // index, then apply the exclusion check. Since all our documents have "foo", + // we'll consider it a "miss". Previously with recursion, we would have + // recursed until we got a success, which would never happen causing us to + // recurse through all the documents and trigger a stack overflow. With + // the iterative implementation, we should avoid this. + SearchSpecProto search_spec; + search_spec.set_query("-foo"); + search_spec.set_term_match_type(TermMatchType::PREFIX); + + ResultSpecProto result_spec; + ScoringSpecProto scoring_spec; + for (auto s : state) { + icing->Search(search_spec, scoring_spec, result_spec); + } +} +// For other reasons, we hit a limit when inserting the ~350,000th document. So +// cap the limit to 1 << 18. +BENCHMARK(BM_SearchNoStackOverflow) + ->Range(/*start=*/1 << 10, /*limit=*/1 << 18); + +// Added for b/184373205. Ensure that we can repeatedly put documents even if +// the underlying mmapped areas grow past a few page sizes. 
+void BM_RepeatedPut(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TermMatchType::PREFIX, + StringIndexingConfig::TokenizerType::PLAIN) + .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))) + .Build(); + + // Create the index. + IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + // Create a document that has the term "foo" + DocumentProto base_document = DocumentBuilder() + .SetSchema("Message") + .SetNamespace("namespace") + .AddStringProperty("body", "foo") + .Build(); + + // Insert a lot of documents with the term "foo" + int64_t num_docs = state.range(0); + for (auto s : state) { + for (int64_t i = 0; i < num_docs; ++i) { + DocumentProto document = + DocumentBuilder(base_document).SetUri("uri").Build(); + ASSERT_THAT(icing->Put(document).status(), ProtoIsOk()); + } + } +} +// For other reasons, we hit a limit when inserting the ~350,000th document. So +// cap the limit to 1 << 18. +BENCHMARK(BM_RepeatedPut)->Range(/*start=*/100, /*limit=*/1 << 18); + +// This is different from BM_RepeatedPut since we're just trying to benchmark +// one Put call, not thousands of them at once. +void BM_Put(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. 
+ SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message")) + .Build(); + + // Create the index. + IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + // Create a document + DocumentProto document = DocumentBuilder() + .SetSchema("Message") + .SetNamespace("namespace") + .SetUri("uri") + .Build(); + + for (auto s : state) { + benchmark::DoNotOptimize(icing->Put(document)); + } +} +BENCHMARK(BM_Put); + +void BM_Get(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message")) + .Build(); + + // Create the index. 
+ IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + // Create a document + DocumentProto document = DocumentBuilder() + .SetSchema("Message") + .SetNamespace("namespace") + .SetUri("uri") + .Build(); + + ASSERT_THAT(icing->Put(document).status(), ProtoIsOk()); + for (auto s : state) { + benchmark::DoNotOptimize( + icing->Get("namespace", "uri", GetResultSpecProto::default_instance())); + } +} +BENCHMARK(BM_Get); + +void BM_Delete(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message")) + .Build(); + + // Create the index. 
+ IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + // Create a document + DocumentProto document = DocumentBuilder() + .SetSchema("Message") + .SetNamespace("namespace") + .SetUri("uri") + .Build(); + + ASSERT_THAT(icing->Put(document).status(), ProtoIsOk()); + for (auto s : state) { + state.PauseTiming(); + icing->Put(document); + state.ResumeTiming(); + + benchmark::DoNotOptimize(icing->Delete("namespace", "uri")); + } +} +BENCHMARK(BM_Delete); + +void BM_PutMaxAllowedDocuments(benchmark::State& state) { + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(TermMatchType::PREFIX, + StringIndexingConfig::TokenizerType::PLAIN) + .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))) + .Build(); + + // Create the index. 
+ IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + options.set_index_merge_size(kIcingFullIndexSize); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + + // Create a document that has the term "foo" + DocumentProto base_document = DocumentBuilder() + .SetSchema("Message") + .SetNamespace("namespace") + .AddStringProperty("body", "foo") + .Build(); + + // Insert a lot of documents with the term "foo" + for (auto s : state) { + for (int64_t i = 0; i <= kMaxDocumentId; ++i) { + DocumentProto document = + DocumentBuilder(base_document).SetUri(std::to_string(i)).Build(); + EXPECT_THAT(icing->Put(document).status(), ProtoIsOk()); + } + } + + DocumentProto document = + DocumentBuilder(base_document).SetUri("out_of_space_uri").Build(); + PutResultProto put_result_proto = icing->Put(document); + EXPECT_THAT(put_result_proto.status(), + ProtoStatusIs(StatusProto::OUT_OF_SPACE)); + EXPECT_THAT(put_result_proto.status().message(), + HasSubstr("Exceeded maximum number of documents")); +} +BENCHMARK(BM_PutMaxAllowedDocuments); + } // namespace } // namespace lib diff --git a/icing/icing-search-engine_flush_benchmark.cc b/icing/icing-search-engine_flush_benchmark.cc new file mode 100644 index 0000000..de8f550 --- /dev/null +++ b/icing/icing-search-engine_flush_benchmark.cc @@ -0,0 +1,200 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <unistd.h> + +#include <fstream> +#include <iostream> +#include <memory> +#include <ostream> +#include <random> +#include <sstream> +#include <stdexcept> +#include <string> +#include <string_view> +#include <unordered_set> +#include <vector> + +#include "testing/base/public/benchmark.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/icing-search-engine.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/initialize.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/proto/status.pb.h" +#include "icing/proto/term.pb.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/document-generator.h" +#include "icing/testing/random-string.h" +#include "icing/testing/schema-generator.h" +#include "icing/testing/tmp-directory.h" + +// Run on a Linux workstation: +// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt +// //icing:icing-search-engine_flush_benchmark +// +// $ blaze-bin/icing/icing-search-engine_flush_benchmark +// --benchmarks=all --benchmark_memory_usage +// +// Run on an Android device: +// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" +// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt +// //icing:icing-search-engine_flush_benchmark +// +// $ adb push blaze-bin/icing/icing-search-engine_flush_benchmark +// /data/local/tmp/ +// +// $ adb shell /data/local/tmp/icing-search-engine_flush_benchmark +// --benchmarks=all + +namespace icing { +namespace lib { + +namespace { + +// Assume that there will be roughly 10 packages, each using 3 of its own types. +constexpr int kAvgNumNamespaces = 10; +constexpr int kAvgNumTypes = 3; + +// ASSUME: Types will have at most ten properties. Types will be created with +// [1, 10] properties. 
+constexpr int kMaxNumProperties = 10; + +// Based on logs from Icing GMSCore. +constexpr int kAvgDocumentSize = 300; + +// ASSUME: ~75% of the document's size comes from its content. +constexpr float kContentSizePct = 0.7; + +// Average length of word in English is 4.7 characters. +constexpr int kAvgTokenLen = 5; +// Made up value. This results in a fairly reasonable language - the majority of +// generated words are 3-9 characters, ~3% of words are >=20 chars, and the +// longest ones are 27 chars, (roughly consistent with the longest, +// non-contrived English words +// https://en.wikipedia.org/wiki/Longest_word_in_English) +constexpr int kTokenStdDev = 7; +constexpr int kLanguageSize = 1000; + +// The number of documents to index. +constexpr int kNumDocuments = 1024; + +std::vector<std::string> CreateNamespaces(int num_namespaces) { + std::vector<std::string> namespaces; + while (--num_namespaces >= 0) { + namespaces.push_back("comgooglepackage" + std::to_string(num_namespaces)); + } + return namespaces; +} + +// Creates a vector containing num_words randomly-generated words for use by +// documents. 
+template <typename Rand> +std::vector<std::string> CreateLanguage(int num_words, Rand* r) { + std::vector<std::string> language; + std::normal_distribution<> norm_dist(kAvgTokenLen, kTokenStdDev); + while (--num_words >= 0) { + int word_length = 0; + while (word_length < 1) { + word_length = std::round(norm_dist(*r)); + } + language.push_back(RandomString(kAlNumAlphabet, word_length, r)); + } + return language; +} + +class DestructibleDirectory { + public: + explicit DestructibleDirectory(const Filesystem& filesystem, + const std::string& dir) + : filesystem_(filesystem), dir_(dir) { + filesystem_.CreateDirectoryRecursively(dir_.c_str()); + } + ~DestructibleDirectory() { + filesystem_.DeleteDirectoryRecursively(dir_.c_str()); + } + + private: + Filesystem filesystem_; + std::string dir_; +}; + +void BM_FlushBenchmark(benchmark::State& state) { + PersistType::Code persist_type = + (state.range(0)) ? PersistType::LITE : PersistType::FULL; + int num_documents_per_persist = state.range(1); + + // Initialize the filesystem + std::string test_dir = GetTestTempDir() + "/icing/benchmark/flush"; + Filesystem filesystem; + DestructibleDirectory ddir(filesystem, test_dir); + + // Create the schema. 
+ std::default_random_engine random; + int num_types = kAvgNumNamespaces * kAvgNumTypes; + ExactStringPropertyGenerator property_generator; + RandomSchemaGenerator<std::default_random_engine, + ExactStringPropertyGenerator> + schema_generator(&random, &property_generator); + SchemaProto schema = + schema_generator.GenerateSchema(num_types, kMaxNumProperties); + EvenDistributionTypeSelector type_selector(schema); + + std::vector<std::string> namespaces = CreateNamespaces(kAvgNumNamespaces); + EvenDistributionNamespaceSelector namespace_selector(namespaces); + + std::vector<std::string> language = CreateLanguage(kLanguageSize, &random); + UniformDistributionLanguageTokenGenerator<std::default_random_engine> + token_generator(language, &random); + + DocumentGenerator< + EvenDistributionNamespaceSelector, EvenDistributionTypeSelector, + UniformDistributionLanguageTokenGenerator<std::default_random_engine>> + generator(&namespace_selector, &type_selector, &token_generator, + kAvgDocumentSize * kContentSizePct); + + IcingSearchEngineOptions options; + options.set_base_dir(test_dir); + std::unique_ptr<IcingSearchEngine> icing = + std::make_unique<IcingSearchEngine>(options); + + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk()); + for (auto s : state) { + for (int i = 0; i < kNumDocuments; ++i) { + icing->Put(generator.generateDoc()); + + if (i % num_documents_per_persist == num_documents_per_persist - 1) { + icing->PersistToDisk(persist_type); + } + } + } +} +BENCHMARK(BM_FlushBenchmark) + // First argument: lite_flush, + // Second argument: num_document_per_lite_flush + ->ArgPair(true, 1) + ->ArgPair(false, 1) + ->ArgPair(true, 32) + ->ArgPair(false, 32) + ->ArgPair(true, 1024) + ->ArgPair(false, 1024); + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc index 1f59c6e..2d07e37 100644 --- 
a/icing/icing-search-engine_fuzz_test.cc +++ b/icing/icing-search-engine_fuzz_test.cc @@ -23,6 +23,7 @@ #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" #include "icing/proto/scoring.pb.h" +#include "icing/schema-builder.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -30,27 +31,20 @@ namespace icing { namespace lib { namespace { +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; + +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; + IcingSearchEngineOptions Setup() { IcingSearchEngineOptions icing_options; icing_options.set_base_dir(GetTestTempDir() + "/icing"); return icing_options; } -SchemaProto SetTypes() { - SchemaProto schema; - SchemaTypeConfigProto* type = schema.add_types(); - type->set_schema_type("Message"); - PropertyConfigProto* body = type->add_properties(); - body->set_property_name("body"); - body->set_data_type(PropertyConfigProto::DataType::STRING); - body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - body->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - return schema; -} - DocumentProto MakeDocument(const uint8_t* data, size_t size) { // TODO (sidchhabra): Added more optimized fuzzing techniques. DocumentProto document; @@ -83,7 +77,15 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { // TODO (b/145758378): Deleting directory should not be required. 
filesystem_.DeleteDirectoryRecursively(icing_options.base_dir().c_str()); icing.Initialize(); - SchemaProto schema_proto = SetTypes(); + + SchemaProto schema_proto = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); icing.SetSchema(schema_proto); // Index diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index 8c64614..6ad4703 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -29,19 +29,24 @@ #include "icing/file/mock-filesystem.h" #include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/portable/endian.h" #include "icing/portable/equals-proto.h" +#include "icing/portable/platform.h" #include "icing/proto/document.pb.h" #include "icing/proto/initialize.pb.h" +#include "icing/proto/optimize.pb.h" +#include "icing/proto/persist.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" #include "icing/proto/search.pb.h" #include "icing/proto/status.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" +#include "icing/store/document-log-creator.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/jni-test-helpers.h" -#include "icing/testing/platform.h" #include "icing/testing/random-string.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" @@ -85,13 +90,48 @@ constexpr std::string_view kIpsumText = "vehicula posuere vitae, convallis eu lorem. 
Donec semper augue eu nibh " "placerat semper."; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; + +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE = + StringIndexingConfig_TokenizerType_Code_NONE; + +#ifndef ICING_JNI_TEST +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +#endif // !ICING_JNI_TEST + +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; +constexpr TermMatchType_Code MATCH_NONE = TermMatchType_Code_UNKNOWN; + +PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader( + Filesystem filesystem, const std::string& file_path) { + PortableFileBackedProtoLog<DocumentWrapper>::Header header; + filesystem.PRead(file_path.c_str(), &header, + sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header), + /*offset=*/0); + return header; +} + +void WriteDocumentLogHeader( + Filesystem filesystem, const std::string& file_path, + PortableFileBackedProtoLog<DocumentWrapper>::Header& header) { + filesystem.Write(file_path.c_str(), &header, + sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header)); +} + // For mocking purpose, we allow tests to provide a custom Filesystem. 
class TestIcingSearchEngine : public IcingSearchEngine { public: TestIcingSearchEngine(const IcingSearchEngineOptions& options, std::unique_ptr<const Filesystem> filesystem, std::unique_ptr<const IcingFilesystem> icing_filesystem, - std::unique_ptr<FakeClock> clock, + std::unique_ptr<Clock> clock, std::unique_ptr<JniCache> jni_cache) : IcingSearchEngine(options, std::move(filesystem), std::move(icing_filesystem), std::move(clock), @@ -172,95 +212,61 @@ DocumentProto CreateEmailDocument(const std::string& name_space, } SchemaProto CreateMessageSchema() { - SchemaProto schema; - auto type = schema.add_types(); - type->set_schema_type("Message"); - - auto body = type->add_properties(); - body->set_property_name("body"); - body->set_data_type(PropertyConfigProto::DataType::STRING); - body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - body->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - - return schema; + return SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); } SchemaProto CreateEmailSchema() { - SchemaProto schema; - auto* type = schema.add_types(); - type->set_schema_type("Email"); - - auto* body = type->add_properties(); - body->set_property_name("body"); - body->set_data_type(PropertyConfigProto::DataType::STRING); - body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - body->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - body->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - auto* subj = type->add_properties(); - subj->set_property_name("subject"); - subj->set_data_type(PropertyConfigProto::DataType::STRING); - 
subj->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - subj->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - subj->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - return schema; + return SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); } SchemaProto CreatePersonAndEmailSchema() { - SchemaProto schema; - - auto* person_type = schema.add_types(); - person_type->set_schema_type("Person"); - auto* name = person_type->add_properties(); - name->set_property_name("name"); - name->set_data_type(PropertyConfigProto::DataType::STRING); - name->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - name->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - name->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - auto* address = person_type->add_properties(); - address->set_property_name("emailAddress"); - address->set_data_type(PropertyConfigProto::DataType::STRING); - address->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - address->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - address->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - - auto* type = schema.add_types(); - type->set_schema_type("Email"); - - auto* body = type->add_properties(); - body->set_property_name("body"); - body->set_data_type(PropertyConfigProto::DataType::STRING); - body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - 
body->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - body->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - auto* subj = type->add_properties(); - subj->set_property_name("subject"); - subj->set_data_type(PropertyConfigProto::DataType::STRING); - subj->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - subj->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - subj->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - auto* sender = type->add_properties(); - sender->set_property_name("sender"); - sender->set_schema_type("Person"); - sender->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - sender->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - sender->mutable_document_indexing_config()->set_index_nested_properties(true); - - return schema; + return SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType( + SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument( + "Person", /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); } ScoringSpecProto GetDefaultScoringSpec() { @@ -428,23 +434,23 @@ TEST_F(IcingSearchEngineTest, 
MaxTokenLenReturnsOkAndTruncatesTokens) { SearchResultProto actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // The query token is also truncated to length of 1, so "me"->"m" matches "m" search_spec.set_query("me"); actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // The query token is still truncated to length of 1, so "massage"->"m" // matches "m" search_spec.set_query("massage"); actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, @@ -480,8 +486,8 @@ TEST_F(IcingSearchEngineTest, SearchResultProto actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, FailToCreateDocStore) { @@ -502,6 +508,217 @@ TEST_F(IcingSearchEngineTest, FailToCreateDocStore) { HasSubstr("Could not create directory")); } +TEST_F(IcingSearchEngineTest, InitMarkerFilePreviousFailuresAtThreshold) { + Filesystem filesystem; + DocumentProto email1 = + CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1"); + 
email1.set_creation_timestamp_ms(10000); + DocumentProto email2 = + CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2"); + email2.set_creation_timestamp_ms(10000); + + { + // Create an index with a few documents. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), ProtoIsOk()); + ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(0)); + ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk()); + } + + // Write an init marker file with 5 previously failed attempts. + std::string marker_filepath = GetTestBaseDir() + "/init_marker"; + + { + ScopedFd marker_file_fd(filesystem.OpenForWrite(marker_filepath.c_str())); + int network_init_attempts = GHostToNetworkL(5); + // Write the updated number of attempts before we get started. + ASSERT_TRUE(filesystem.PWrite(marker_file_fd.get(), 0, + &network_init_attempts, + sizeof(network_init_attempts))); + ASSERT_TRUE(filesystem.DataSync(marker_file_fd.get())); + } + + { + // Create the index again and verify that initialization succeeds and no + // data is thrown out. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), ProtoIsOk()); + ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(5)); + EXPECT_THAT( + icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()) + .document(), + EqualsProto(email1)); + EXPECT_THAT( + icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()) + .document(), + EqualsProto(email2)); + } + + // The successful init should have thrown out the marker file. 
+ ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str())); +} + +TEST_F(IcingSearchEngineTest, InitMarkerFilePreviousFailuresBeyondThreshold) { + Filesystem filesystem; + DocumentProto email1 = + CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1"); + DocumentProto email2 = + CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2"); + + { + // Create an index with a few documents. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), ProtoIsOk()); + ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(0)); + ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk()); + } + + // Write an init marker file with 6 previously failed attempts. + std::string marker_filepath = GetTestBaseDir() + "/init_marker"; + + { + ScopedFd marker_file_fd(filesystem.OpenForWrite(marker_filepath.c_str())); + int network_init_attempts = GHostToNetworkL(6); + // Write the updated number of attempts before we get started. + ASSERT_TRUE(filesystem.PWrite(marker_file_fd.get(), 0, + &network_init_attempts, + sizeof(network_init_attempts))); + ASSERT_TRUE(filesystem.DataSync(marker_file_fd.get())); + } + + { + // Create the index again and verify that initialization succeeds and all + // data is thrown out. 
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), + ProtoStatusIs(StatusProto::WARNING_DATA_LOSS)); + ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(6)); + EXPECT_THAT( + icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()) + .status(), + ProtoStatusIs(StatusProto::NOT_FOUND)); + EXPECT_THAT( + icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()) + .status(), + ProtoStatusIs(StatusProto::NOT_FOUND)); + } + + // The successful init should have thrown out the marker file. + ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str())); +} + +TEST_F(IcingSearchEngineTest, SuccessiveInitFailuresIncrementsInitMarker) { + Filesystem filesystem; + DocumentProto email1 = + CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1"); + DocumentProto email2 = + CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2"); + + { + // 1. Create an index with a few documents. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), ProtoIsOk()); + ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(0)); + ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk()); + } + + { + // 2. Create an index that will encounter an IO failure when trying to + // create the document log. 
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions(); + + auto mock_filesystem = std::make_unique<MockFilesystem>(); + std::string document_log_filepath = + icing_options.base_dir() + "/document_dir/document_log_v1"; + auto get_filesize_lambda = [this, + &document_log_filepath](const char* filename) { + if (strncmp(document_log_filepath.c_str(), filename, + document_log_filepath.length()) == 0) { + return Filesystem::kBadFileSize; + } + return this->filesystem()->GetFileSize(filename); + }; + ON_CALL(*mock_filesystem, GetFileSize(A<const char*>())) + .WillByDefault(get_filesize_lambda); + + TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem), + std::make_unique<IcingFilesystem>(), + std::make_unique<FakeClock>(), + GetTestJniCache()); + + // Fail to initialize six times in a row. + InitializeResultProto init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); + ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(0)); + + init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); + ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(1)); + + init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); + ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(2)); + + init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); + ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(3)); + + init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); + ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(4)); + + init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL)); + 
ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(5)); + } + + { + // 3. Create the index again and verify that initialization succeeds and all + // data is thrown out. + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + ASSERT_THAT(init_result.status(), + ProtoStatusIs(StatusProto::WARNING_DATA_LOSS)); + ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(), + Eq(6)); + + EXPECT_THAT( + icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()) + .status(), + ProtoStatusIs(StatusProto::NOT_FOUND)); + EXPECT_THAT( + icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()) + .status(), + ProtoStatusIs(StatusProto::NOT_FOUND)); + } + + // The successful init should have thrown out the marker file. + std::string marker_filepath = GetTestBaseDir() + "/init_marker"; + ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str())); +} + TEST_F(IcingSearchEngineTest, CircularReferenceCreateSectionManagerReturnsInvalidArgument) { // Create a type config with a circular reference. @@ -596,7 +813,7 @@ TEST_F(IcingSearchEngineTest, FailToWriteSchema) { HasSubstr("Unable to open file for write")); } -TEST_F(IcingSearchEngineTest, SetSchemaDelete2) { +TEST_F(IcingSearchEngineTest, SetSchemaIncompatibleFails) { { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -639,15 +856,18 @@ TEST_F(IcingSearchEngineTest, SetSchemaDelete2) { property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - EXPECT_THAT(icing.SetSchema(schema, false).status(), - ProtoStatusIs(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT( + icing.SetSchema(schema, /*ignore_errors_and_delete_documents=*/false) + .status(), + ProtoStatusIs(StatusProto::FAILED_PRECONDITION)); - // 4. Try to delete by email type. 
+ // 4. Try to delete by email type. This should succeed because email wasn't + // deleted in step 3. EXPECT_THAT(icing.DeleteBySchemaType("Email").status(), ProtoIsOk()); } } -TEST_F(IcingSearchEngineTest, SetSchemaDelete) { +TEST_F(IcingSearchEngineTest, SetSchemaIncompatibleForceOverrideSucceeds) { { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -681,7 +901,8 @@ TEST_F(IcingSearchEngineTest, SetSchemaDelete) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - // 3. Set a schema that deletes email. This should fail. + // 3. Set a schema that deletes email with force override. This should + // succeed and delete the email type. SchemaProto schema; SchemaTypeConfigProto* type = schema.add_types(); type->set_schema_type("Message"); @@ -692,7 +913,8 @@ TEST_F(IcingSearchEngineTest, SetSchemaDelete) { EXPECT_THAT(icing.SetSchema(schema, true).status(), ProtoIsOk()); - // 4. Try to delete by email type. + // 4. Try to delete by email type. This should fail because email was + // already deleted. EXPECT_THAT(icing.DeleteBySchemaType("Email").status(), ProtoStatusIs(StatusProto::NOT_FOUND)); } @@ -731,7 +953,13 @@ TEST_F(IcingSearchEngineTest, SetSchemaCompatibleVersionUpdateSucceeds) { property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + SetSchemaResultProto set_schema_result = icing.SetSchema(schema); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(1)); } @@ -749,12 +977,20 @@ TEST_F(IcingSearchEngineTest, SetSchemaCompatibleVersionUpdateSucceeds) { property->set_property_name("title"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + property = type->add_properties(); property->set_property_name("body"); property->set_data_type(PropertyConfigProto::DataType::STRING); property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); // 3. SetSchema should succeed and the version number should be updated. - EXPECT_THAT(icing.SetSchema(schema, true).status(), ProtoIsOk()); + SetSchemaResultProto set_schema_result = icing.SetSchema(schema, true); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_fully_compatible_changed_schema_types() + ->Add("Email"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(2)); } @@ -940,7 +1176,12 @@ TEST_F(IcingSearchEngineTest, } TEST_F(IcingSearchEngineTest, SetSchema) { - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(1000); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); auto message_document = CreateMessageDocument("namespace", "uri"); @@ -969,26 +1210,31 @@ TEST_F(IcingSearchEngineTest, SetSchema) { empty_type->set_schema_type(""); // Make sure we can't set invalid schemas - EXPECT_THAT(icing.SetSchema(invalid_schema).status(), + SetSchemaResultProto set_schema_result = icing.SetSchema(invalid_schema); + EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); + EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); // Can add an document of a set schema - EXPECT_THAT(icing.SetSchema(schema_with_message).status(), ProtoIsOk()); + set_schema_result = icing.SetSchema(schema_with_message); + EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::OK)); + EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk()); // Schema with Email doesn't have Message, so would result incompatible // data - EXPECT_THAT(icing.SetSchema(schema_with_email).status(), + set_schema_result = icing.SetSchema(schema_with_email); + EXPECT_THAT(set_schema_result.status(), 
ProtoStatusIs(StatusProto::FAILED_PRECONDITION)); + EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); // Can expand the set of schema types and add an document of a new // schema type - EXPECT_THAT(icing.SetSchema(SchemaProto(schema_with_email_and_message)) - .status() - .code(), - Eq(StatusProto::OK)); - EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk()); + set_schema_result = icing.SetSchema(schema_with_email_and_message); + EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::OK)); + EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000)); + EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk()); // Can't add an document whose schema isn't set auto photo_document = DocumentBuilder() .SetKey("namespace", "uri") @@ -1001,7 +1247,8 @@ TEST_F(IcingSearchEngineTest, SetSchema) { HasSubstr("'Photo' not found")); } -TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) { +TEST_F(IcingSearchEngineTest, + SetSchemaNewIndexedPropertyTriggersIndexRestorationAndReturnsOk) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); @@ -1010,8 +1257,15 @@ TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) { ->mutable_properties(0) ->clear_string_indexing_config(); - EXPECT_THAT(icing.SetSchema(schema_with_no_indexed_property).status(), - ProtoIsOk()); + SetSchemaResultProto set_schema_result = + icing.SetSchema(schema_with_no_indexed_property); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_new_schema_types()->Add("Message"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + // Nothing will be index and Search() won't return anything. 
EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), ProtoIsOk()); @@ -1026,13 +1280,20 @@ TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) { SearchResultProto actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStats(empty_result)); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); SchemaProto schema_with_indexed_property = CreateMessageSchema(); // Index restoration should be triggered here because new schema requires more // properties to be indexed. - EXPECT_THAT(icing.SetSchema(schema_with_indexed_property).status(), - ProtoIsOk()); + set_schema_result = icing.SetSchema(schema_with_indexed_property); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Message"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); SearchResultProto expected_search_result_proto; expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); @@ -1040,8 +1301,441 @@ TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) { CreateMessageDocument("namespace", "uri"); actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineTest, + SetSchemaChangeNestedPropertiesTriggersIndexRestorationAndReturnsOk) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + SchemaTypeConfigProto person_proto = + SchemaTypeConfigBuilder() + 
.SetType("Person") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto nested_schema = + SchemaBuilder() + .AddType(person_proto) + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument( + "Person", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + expected_set_schema_result.mutable_new_schema_types()->Add("Person"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + DocumentProto document = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .AddDocumentProperty("sender", + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Person") + .AddStringProperty("name", "Bill Lundbergh") + .Build()) + .Build(); + + // "sender.name" should get assigned property id 0 and subject should get + // property id 1. 
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + + // document should match a query for 'Bill' in 'sender.name', but not in + // 'subject' + SearchSpecProto search_spec; + search_spec.set_query("sender.name:Bill"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto result; + result.mutable_status()->set_code(StatusProto::OK); + *result.mutable_results()->Add()->mutable_document() = document; + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(result)); + + SearchResultProto empty_result; + empty_result.mutable_status()->set_code(StatusProto::OK); + search_spec.set_query("subject:Bill"); + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); + + // Now update the schema with index_nested_properties=false. This should + // reassign property ids, lead to an index rebuild and ensure that nothing + // match a query for "Bill". + SchemaProto no_nested_schema = + SchemaBuilder() + .AddType(person_proto) + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument( + "Person", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + set_schema_result = icing.SetSchema(no_nested_schema); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Email"); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // document shouldn't match a query for 'Bill' in either 'sender.name' or + // 'subject' + search_spec.set_query("sender.name:Bill"); + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); + + search_spec.set_query("subject:Bill"); + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); EXPECT_THAT(actual_results, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EqualsSearchResultIgnoreStatsAndScores(empty_result)); +} + +TEST_F(IcingSearchEngineTest, + ForceSetSchemaPropertyDeletionTriggersIndexRestorationAndReturnsOk) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // 'body' should have a property id of 0 and 'subject' should have a property + // id of 1. + SchemaProto email_with_body_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SetSchemaResultProto set_schema_result = + icing.SetSchema(email_with_body_schema); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Create a document with only a subject property. + DocumentProto document = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .Build(); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + + // We should be able to retrieve the document by searching for 'tps' in + // 'subject'. + SearchSpecProto search_spec; + search_spec.set_query("subject:tps"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto result; + result.mutable_status()->set_code(StatusProto::OK); + *result.mutable_results()->Add()->mutable_document() = document; + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(result)); + + // Now update the schema to remove the 'body' field. This is backwards + // incompatible, but document should be preserved because it doesn't contain a + // 'body' field. If the index is correctly rebuilt, then 'subject' will now + // have a property id of 0. If not, then the hits in the index will still have + // have a property id of 1 and therefore it won't be found. 
+ SchemaProto email_no_body_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Email").AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + set_schema_result = icing.SetSchema( + email_no_body_schema, /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email"); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Email"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // We should be able to retrieve the document by searching for 'tps' in + // 'subject'. + search_spec.set_query("subject:tps"); + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(result)); +} + +TEST_F( + IcingSearchEngineTest, + ForceSetSchemaPropertyDeletionAndAdditionTriggersIndexRestorationAndReturnsOk) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // 'body' should have a property id of 0 and 'subject' should have a property + // id of 1. 
+ SchemaProto email_with_body_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SetSchemaResultProto set_schema_result = + icing.SetSchema(email_with_body_schema); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Create a document with only a subject property. + DocumentProto document = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .Build(); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + + // We should be able to retrieve the document by searching for 'tps' in + // 'subject'. + SearchSpecProto search_spec; + search_spec.set_query("subject:tps"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + + SearchResultProto result; + result.mutable_status()->set_code(StatusProto::OK); + *result.mutable_results()->Add()->mutable_document() = document; + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(result)); + + // Now update the schema to remove the 'body' field. This is backwards + // incompatible, but document should be preserved because it doesn't contain a + // 'body' field. 
If the index is correctly rebuilt, then 'subject' and 'to' + // will now have property ids of 0 and 1 respectively. If not, then the hits + // in the index will still have have a property id of 1 and therefore it won't + // be found. + SchemaProto email_no_body_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("to") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + set_schema_result = icing.SetSchema( + email_no_body_schema, /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email"); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Email"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // We should be able to retrieve the document by searching for 'tps' in + // 'subject'. 
+ search_spec.set_query("subject:tps"); + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(result)); +} + +TEST_F(IcingSearchEngineTest, ForceSetSchemaIncompatibleNestedDocsAreDeleted) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + SchemaTypeConfigProto email_schema_type = + SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty( + PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument("Person", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto nested_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("company") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(email_schema_type) + .Build(); + + SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); + SetSchemaResultProto expected_set_schema_result; + expected_set_schema_result.mutable_new_schema_types()->Add("Email"); + expected_set_schema_result.mutable_new_schema_types()->Add("Person"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Create two documents - a person document and an email document - both docs + // should be deleted when we remove the 'company' field from the person type. + DocumentProto person_document = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Person") + .SetCreationTimestampMs(1000) + .AddStringProperty("name", "Bill Lundbergh") + .AddStringProperty("company", "Initech Corp.") + .Build(); + EXPECT_THAT(icing.Put(person_document).status(), ProtoIsOk()); + + DocumentProto email_document = + DocumentBuilder() + .SetKey("namespace1", "uri2") + .SetSchema("Email") + .SetCreationTimestampMs(1000) + .AddStringProperty("subject", + "Did you get the memo about TPS reports?") + .AddDocumentProperty("sender", person_document) + .Build(); + EXPECT_THAT(icing.Put(email_document).status(), ProtoIsOk()); + + // We should be able to retrieve both documents. + GetResultProto get_result = + icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()); + EXPECT_THAT(get_result.status(), ProtoIsOk()); + EXPECT_THAT(get_result.document(), EqualsProto(person_document)); + + get_result = + icing.Get("namespace1", "uri2", GetResultSpecProto::default_instance()); + EXPECT_THAT(get_result.status(), ProtoIsOk()); + EXPECT_THAT(get_result.document(), EqualsProto(email_document)); + + // Now update the schema to remove the 'company' field. This is backwards + // incompatible, *both* documents should be deleted because both fail + // validation (they each contain a 'Person' that has a non-existent property). 
+ nested_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(email_schema_type) + .Build(); + + set_schema_result = icing.SetSchema( + nested_schema, /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); + expected_set_schema_result = SetSchemaResultProto(); + expected_set_schema_result.mutable_incompatible_schema_types()->Add("Person"); + expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email"); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Email"); + expected_set_schema_result.mutable_index_incompatible_changed_schema_types() + ->Add("Person"); + expected_set_schema_result.mutable_status()->set_code(StatusProto::OK); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result)); + + // Both documents should be deleted now. + get_result = + icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()); + EXPECT_THAT(get_result.status(), ProtoStatusIs(StatusProto::NOT_FOUND)); + + get_result = + icing.Get("namespace1", "uri2", GetResultSpecProto::default_instance()); + EXPECT_THAT(get_result.status(), ProtoStatusIs(StatusProto::NOT_FOUND)); } TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) { @@ -1089,6 +1783,10 @@ TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) { property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); // Can't set the schema since it's incompatible + SetSchemaResultProto set_schema_result = + icing.SetSchema(schema_with_required_subject); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); SetSchemaResultProto expected_set_schema_result_proto; expected_set_schema_result_proto.mutable_status()->set_code( StatusProto::FAILED_PRECONDITION); @@ -1096,15 +1794,17 @@ TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) { "Schema is incompatible."); expected_set_schema_result_proto.add_incompatible_schema_types("email"); - EXPECT_THAT(icing.SetSchema(schema_with_required_subject), - EqualsProto(expected_set_schema_result_proto)); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result_proto)); // Force set it + set_schema_result = + icing.SetSchema(schema_with_required_subject, + /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); expected_set_schema_result_proto.mutable_status()->set_code(StatusProto::OK); expected_set_schema_result_proto.mutable_status()->clear_message(); - EXPECT_THAT(icing.SetSchema(schema_with_required_subject, - /*ignore_errors_and_delete_documents=*/true), - EqualsProto(expected_set_schema_result_proto)); + EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result_proto)); GetResultProto expected_get_result_proto; expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); @@ -1161,19 +1861,25 @@ TEST_F(IcingSearchEngineTest, SetSchemaDeletesDocumentsAndReturnsOk) { type->set_schema_type("email"); // Can't set the schema since it's incompatible + SetSchemaResultProto set_schema_result = icing.SetSchema(new_schema); + // Ignore latency numbers. They're covered elsewhere. 
+ set_schema_result.clear_latency_ms(); SetSchemaResultProto expected_result; expected_result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION); expected_result.mutable_status()->set_message("Schema is incompatible."); expected_result.add_deleted_schema_types("message"); - EXPECT_THAT(icing.SetSchema(new_schema), EqualsProto(expected_result)); + EXPECT_THAT(set_schema_result, EqualsProto(expected_result)); // Force set it + set_schema_result = + icing.SetSchema(new_schema, + /*ignore_errors_and_delete_documents=*/true); + // Ignore latency numbers. They're covered elsewhere. + set_schema_result.clear_latency_ms(); expected_result.mutable_status()->set_code(StatusProto::OK); expected_result.mutable_status()->clear_message(); - EXPECT_THAT(icing.SetSchema(new_schema, - /*ignore_errors_and_delete_documents=*/true), - EqualsProto(expected_result)); + EXPECT_THAT(set_schema_result, EqualsProto(expected_result)); // "email" document is still there GetResultProto expected_get_result_proto; @@ -1500,24 +2206,21 @@ TEST_F(IcingSearchEngineTest, SearchReturnsValidResults) { icing.Search(search_spec, GetDefaultScoringSpec(), result_spec); EXPECT_THAT(results.status(), ProtoIsOk()); EXPECT_THAT(results.results(), SizeIs(2)); - EXPECT_THAT(results.results(0).document(), EqualsProto(document_two)); - EXPECT_THAT(GetMatch(results.results(0).document(), - results.results(0).snippet(), "body", - /*snippet_index=*/0), - Eq("message")); - EXPECT_THAT( - GetWindow(results.results(0).document(), results.results(0).snippet(), - "body", /*snippet_index=*/0), - Eq("message body")); + + const DocumentProto& document = results.results(0).document(); + EXPECT_THAT(document, EqualsProto(document_two)); + + const SnippetProto& snippet = results.results(0).snippet(); + EXPECT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + 
EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("message body")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("message")); + EXPECT_THAT(results.results(1).document(), EqualsProto(document_one)); - EXPECT_THAT( - GetMatch(results.results(1).document(), results.results(1).snippet(), - "body", /*snippet_index=*/0), - IsEmpty()); - EXPECT_THAT( - GetWindow(results.results(1).document(), results.results(1).snippet(), - "body", /*snippet_index=*/0), - IsEmpty()); + EXPECT_THAT(results.results(1).snippet().entries(), IsEmpty()); search_spec.set_query("foo"); @@ -1526,8 +2229,79 @@ TEST_F(IcingSearchEngineTest, SearchReturnsValidResults) { SearchResultProto actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineTest, SearchReturnsScoresDocumentScore) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + DocumentProto document_one = CreateMessageDocument("namespace", "uri1"); + document_one.set_score(93); + document_one.set_creation_timestamp_ms(10000); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = CreateMessageDocument("namespace", "uri2"); + document_two.set_score(15); + document_two.set_creation_timestamp_ms(12000); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); + + // Rank by DOCUMENT_SCORE and ensure that the score field is populated with + // document score. 
+ ScoringSpecProto scoring_spec; + scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); + + SearchResultProto results = icing.Search(search_spec, scoring_spec, + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(2)); + + EXPECT_THAT(results.results(0).document(), EqualsProto(document_one)); + EXPECT_THAT(results.results(0).score(), 93); + EXPECT_THAT(results.results(1).document(), EqualsProto(document_two)); + EXPECT_THAT(results.results(1).score(), 15); +} + +TEST_F(IcingSearchEngineTest, SearchReturnsScoresCreationTimestamp) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + DocumentProto document_one = CreateMessageDocument("namespace", "uri1"); + document_one.set_score(93); + document_one.set_creation_timestamp_ms(10000); + ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk()); + + DocumentProto document_two = CreateMessageDocument("namespace", "uri2"); + document_two.set_score(15); + document_two.set_creation_timestamp_ms(12000); + ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk()); + + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); + + // Rank by CREATION_TS and ensure that the score field is populated with + // creation ts. 
+ ScoringSpecProto scoring_spec; + scoring_spec.set_rank_by( + ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP); + + SearchResultProto results = icing.Search(search_spec, scoring_spec, + ResultSpecProto::default_instance()); + EXPECT_THAT(results.status(), ProtoIsOk()); + EXPECT_THAT(results.results(), SizeIs(2)); + + EXPECT_THAT(results.results(0).document(), EqualsProto(document_two)); + EXPECT_THAT(results.results(0).score(), 12000); + EXPECT_THAT(results.results(1).document(), EqualsProto(document_one)); + EXPECT_THAT(results.results(1).score(), 10000); } TEST_F(IcingSearchEngineTest, SearchReturnsOneResult) { @@ -1559,8 +2333,8 @@ TEST_F(IcingSearchEngineTest, SearchReturnsOneResult) { // The token is a random number so we don't verify it. expected_search_result_proto.set_next_page_token( search_result_proto.next_page_token()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchZeroResultLimitReturnsEmptyResults) { @@ -1578,8 +2352,8 @@ TEST_F(IcingSearchEngineTest, SearchZeroResultLimitReturnsEmptyResults) { expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); SearchResultProto actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), result_spec); - EXPECT_THAT(actual_results, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchNegativeResultLimitReturnsInvalidArgument) { @@ -1600,8 +2374,8 @@ TEST_F(IcingSearchEngineTest, SearchNegativeResultLimitReturnsInvalidArgument) { "ResultSpecProto.num_per_page cannot be negative."); SearchResultProto actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), result_spec); - EXPECT_THAT(actual_results, - 
EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchWithPersistenceReturnsValidResults) { @@ -1645,8 +2419,8 @@ TEST_F(IcingSearchEngineTest, SearchWithPersistenceReturnsValidResults) { SearchResultProto actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); search_spec.set_query("foo"); @@ -1654,7 +2428,8 @@ TEST_F(IcingSearchEngineTest, SearchWithPersistenceReturnsValidResults) { empty_result.mutable_status()->set_code(StatusProto::OK); actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStats(empty_result)); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); } } @@ -1675,8 +2450,8 @@ TEST_F(IcingSearchEngineTest, SearchShouldReturnEmpty) { icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchShouldReturnMultiplePages) { @@ -1716,8 +2491,8 @@ TEST_F(IcingSearchEngineTest, SearchShouldReturnMultiplePages) { uint64_t next_page_token = search_result_proto.next_page_token(); // Since the token is a random number, we don't need to verify expected_search_result_proto.set_next_page_token(next_page_token); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, 
EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // Second page, 2 results expected_search_result_proto.clear_results(); @@ -1726,8 +2501,8 @@ TEST_F(IcingSearchEngineTest, SearchShouldReturnMultiplePages) { *expected_search_result_proto.mutable_results()->Add()->mutable_document() = document2; search_result_proto = icing.GetNextPage(next_page_token); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // Third page, 1 result expected_search_result_proto.clear_results(); @@ -1737,14 +2512,14 @@ TEST_F(IcingSearchEngineTest, SearchShouldReturnMultiplePages) { // token. expected_search_result_proto.clear_next_page_token(); search_result_proto = icing.GetNextPage(next_page_token); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // No more results expected_search_result_proto.clear_results(); search_result_proto = icing.GetNextPage(next_page_token); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchWithNoScoringShouldReturnMultiplePages) { @@ -1787,8 +2562,8 @@ TEST_F(IcingSearchEngineTest, SearchWithNoScoringShouldReturnMultiplePages) { uint64_t next_page_token = search_result_proto.next_page_token(); // Since the token is a random number, we don't need to verify expected_search_result_proto.set_next_page_token(next_page_token); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // 
Second page, 2 results expected_search_result_proto.clear_results(); @@ -1797,8 +2572,8 @@ TEST_F(IcingSearchEngineTest, SearchWithNoScoringShouldReturnMultiplePages) { *expected_search_result_proto.mutable_results()->Add()->mutable_document() = document2; search_result_proto = icing.GetNextPage(next_page_token); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // Third page, 1 result expected_search_result_proto.clear_results(); @@ -1808,14 +2583,14 @@ TEST_F(IcingSearchEngineTest, SearchWithNoScoringShouldReturnMultiplePages) { // token. expected_search_result_proto.clear_next_page_token(); search_result_proto = icing.GetNextPage(next_page_token); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // No more results expected_search_result_proto.clear_results(); search_result_proto = icing.GetNextPage(next_page_token); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, ShouldReturnMultiplePagesWithSnippets) { @@ -1852,24 +2627,28 @@ TEST_F(IcingSearchEngineTest, ShouldReturnMultiplePagesWithSnippets) { ASSERT_THAT(search_result.results(), SizeIs(2)); ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken)); - EXPECT_THAT(search_result.results(0).document(), EqualsProto(document5)); - EXPECT_THAT(GetMatch(search_result.results(0).document(), - search_result.results(0).snippet(), "body", - /*snippet_index=*/0), - Eq("message")); - EXPECT_THAT(GetWindow(search_result.results(0).document(), - search_result.results(0).snippet(), "body", - 
/*snippet_index=*/0), - Eq("message body")); - EXPECT_THAT(search_result.results(1).document(), EqualsProto(document4)); - EXPECT_THAT(GetMatch(search_result.results(1).document(), - search_result.results(1).snippet(), "body", - /*snippet_index=*/0), - Eq("message")); - EXPECT_THAT(GetWindow(search_result.results(1).document(), - search_result.results(1).snippet(), "body", - /*snippet_index=*/0), - Eq("message body")); + const DocumentProto& document_result_1 = search_result.results(0).document(); + EXPECT_THAT(document_result_1, EqualsProto(document5)); + const SnippetProto& snippet_result_1 = search_result.results(0).snippet(); + EXPECT_THAT(snippet_result_1.entries(), SizeIs(1)); + EXPECT_THAT(snippet_result_1.entries(0).property_name(), Eq("body")); + std::string_view content = GetString( + &document_result_1, snippet_result_1.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet_result_1.entries(0)), + ElementsAre("message body")); + EXPECT_THAT(GetMatches(content, snippet_result_1.entries(0)), + ElementsAre("message")); + + const DocumentProto& document_result_2 = search_result.results(1).document(); + EXPECT_THAT(document_result_2, EqualsProto(document4)); + const SnippetProto& snippet_result_2 = search_result.results(1).snippet(); + EXPECT_THAT(snippet_result_2.entries(0).property_name(), Eq("body")); + content = GetString(&document_result_2, + snippet_result_2.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet_result_2.entries(0)), + ElementsAre("message body")); + EXPECT_THAT(GetMatches(content, snippet_result_2.entries(0)), + ElementsAre("message")); // Second page, 2 result with 1 snippet search_result = icing.GetNextPage(search_result.next_page_token()); @@ -1877,17 +2656,19 @@ TEST_F(IcingSearchEngineTest, ShouldReturnMultiplePagesWithSnippets) { ASSERT_THAT(search_result.results(), SizeIs(2)); ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken)); - 
EXPECT_THAT(search_result.results(0).document(), EqualsProto(document3)); - EXPECT_THAT(GetMatch(search_result.results(0).document(), - search_result.results(0).snippet(), "body", - /*snippet_index=*/0), - Eq("message")); - EXPECT_THAT(GetWindow(search_result.results(0).document(), - search_result.results(0).snippet(), "body", - /*snippet_index=*/0), - Eq("message body")); + const DocumentProto& document_result_3 = search_result.results(0).document(); + EXPECT_THAT(document_result_3, EqualsProto(document3)); + const SnippetProto& snippet_result_3 = search_result.results(0).snippet(); + EXPECT_THAT(snippet_result_3.entries(0).property_name(), Eq("body")); + content = GetString(&document_result_3, + snippet_result_3.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet_result_3.entries(0)), + ElementsAre("message body")); + EXPECT_THAT(GetMatches(content, snippet_result_3.entries(0)), + ElementsAre("message")); + EXPECT_THAT(search_result.results(1).document(), EqualsProto(document2)); - EXPECT_THAT(search_result.results(1).snippet().entries_size(), Eq(0)); + EXPECT_THAT(search_result.results(1).snippet().entries(), IsEmpty()); // Third page, 1 result with 0 snippets search_result = icing.GetNextPage(search_result.next_page_token()); @@ -1896,7 +2677,7 @@ TEST_F(IcingSearchEngineTest, ShouldReturnMultiplePagesWithSnippets) { ASSERT_THAT(search_result.next_page_token(), Eq(kInvalidNextPageToken)); EXPECT_THAT(search_result.results(0).document(), EqualsProto(document1)); - EXPECT_THAT(search_result.results(0).snippet().entries_size(), Eq(0)); + EXPECT_THAT(search_result.results(0).snippet().entries(), IsEmpty()); } TEST_F(IcingSearchEngineTest, ShouldInvalidateNextPageToken) { @@ -1927,8 +2708,8 @@ TEST_F(IcingSearchEngineTest, ShouldInvalidateNextPageToken) { uint64_t next_page_token = search_result_proto.next_page_token(); // Since the token is a random number, we don't need to verify expected_search_result_proto.set_next_page_token(next_page_token); 
- EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // Now document1 is still to be fetched. // Invalidates token @@ -1938,8 +2719,8 @@ TEST_F(IcingSearchEngineTest, ShouldInvalidateNextPageToken) { expected_search_result_proto.clear_results(); expected_search_result_proto.clear_next_page_token(); search_result_proto = icing.GetNextPage(next_page_token); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, @@ -1971,22 +2752,24 @@ TEST_F(IcingSearchEngineTest, uint64_t next_page_token = search_result_proto.next_page_token(); // Since the token is a random number, we don't need to verify expected_search_result_proto.set_next_page_token(next_page_token); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // Now document1 is still to be fetched. 
OptimizeResultProto optimize_result_proto; optimize_result_proto.mutable_status()->set_code(StatusProto::OK); optimize_result_proto.mutable_status()->set_message(""); - ASSERT_THAT(icing.Optimize(), EqualsProto(optimize_result_proto)); + OptimizeResultProto actual_result = icing.Optimize(); + actual_result.clear_optimize_stats(); + ASSERT_THAT(actual_result, EqualsProto(optimize_result_proto)); // Tries to fetch the second page, no results since all tokens have been // invalidated during Optimize() expected_search_result_proto.clear_results(); expected_search_result_proto.clear_next_page_token(); search_result_proto = icing.GetNextPage(next_page_token); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, OptimizationShouldRemoveDeletedDocs) { @@ -2007,7 +2790,8 @@ TEST_F(IcingSearchEngineTest, OptimizationShouldRemoveDeletedDocs) { // Deletes document1 ASSERT_THAT(icing.Delete("namespace", "uri1").status(), ProtoIsOk()); const std::string document_log_path = - icing_options.base_dir() + "/document_dir/document_log"; + icing_options.base_dir() + "/document_dir/" + + DocumentLogCreator::GetDocumentLogFilename(); int64_t document_log_size_before = filesystem()->GetFileSize(document_log_path.c_str()); ASSERT_THAT(icing.Optimize().status(), ProtoIsOk()); @@ -2063,59 +2847,78 @@ TEST_F(IcingSearchEngineTest, GetOptimizeInfoHasCorrectStats) { .SetTtlMs(500) .Build(); - auto fake_clock = std::make_unique<FakeClock>(); - fake_clock->SetSystemTimeMilliseconds(1000); + { + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetSystemTimeMilliseconds(1000); - TestIcingSearchEngine icing(GetDefaultIcingOptions(), - std::make_unique<Filesystem>(), - std::make_unique<IcingFilesystem>(), - std::move(fake_clock), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), 
ProtoIsOk()); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - // Just initialized, nothing is optimizable yet. - GetOptimizeInfoResultProto optimize_info = icing.GetOptimizeInfo(); - EXPECT_THAT(optimize_info.status(), ProtoIsOk()); - EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); - EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); + // Just initialized, nothing is optimizable yet. + GetOptimizeInfoResultProto optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status(), ProtoIsOk()); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); + EXPECT_THAT(optimize_info.time_since_last_optimize_ms(), Eq(0)); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); - // Only have active documents, nothing is optimizable yet. 
- optimize_info = icing.GetOptimizeInfo(); - EXPECT_THAT(optimize_info.status(), ProtoIsOk()); - EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); - EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); - - // Deletes document1 - ASSERT_THAT(icing.Delete("namespace", "uri1").status(), ProtoIsOk()); - - optimize_info = icing.GetOptimizeInfo(); - EXPECT_THAT(optimize_info.status(), ProtoIsOk()); - EXPECT_THAT(optimize_info.optimizable_docs(), Eq(1)); - EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Gt(0)); - int64_t first_estimated_optimizable_bytes = - optimize_info.estimated_optimizable_bytes(); - - // Add a second document, but it'll be expired since the time (1000) is - // greater than the document's creation timestamp (100) + the document's ttl - // (500) - ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + // Only have active documents, nothing is optimizable yet. + optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status(), ProtoIsOk()); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); + EXPECT_THAT(optimize_info.time_since_last_optimize_ms(), Eq(0)); + + // Deletes document1 + ASSERT_THAT(icing.Delete("namespace", "uri1").status(), ProtoIsOk()); + + optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status(), ProtoIsOk()); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(1)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Gt(0)); + EXPECT_THAT(optimize_info.time_since_last_optimize_ms(), Eq(0)); + int64_t first_estimated_optimizable_bytes = + optimize_info.estimated_optimizable_bytes(); + + // Add a second document, but it'll be expired since the time (1000) is + // greater than the document's creation timestamp (100) + the document's ttl + // (500) + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + + optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status(), ProtoIsOk()); + 
EXPECT_THAT(optimize_info.optimizable_docs(), Eq(2)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), + Gt(first_estimated_optimizable_bytes)); + EXPECT_THAT(optimize_info.time_since_last_optimize_ms(), Eq(0)); + + // Optimize + ASSERT_THAT(icing.Optimize().status(), ProtoIsOk()); + } - optimize_info = icing.GetOptimizeInfo(); - EXPECT_THAT(optimize_info.status(), ProtoIsOk()); - EXPECT_THAT(optimize_info.optimizable_docs(), Eq(2)); - EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), - Gt(first_estimated_optimizable_bytes)); + { + // Recreate with new time + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetSystemTimeMilliseconds(5000); - // Optimize - ASSERT_THAT(icing.Optimize().status(), ProtoIsOk()); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - // Nothing is optimizable now that everything has been optimized away. - optimize_info = icing.GetOptimizeInfo(); - EXPECT_THAT(optimize_info.status(), ProtoIsOk()); - EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); - EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); + // Nothing is optimizable now that everything has been optimized away. 
+ GetOptimizeInfoResultProto optimize_info = icing.GetOptimizeInfo(); + EXPECT_THAT(optimize_info.status(), ProtoIsOk()); + EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0)); + EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0)); + EXPECT_THAT(optimize_info.time_since_last_optimize_ms(), Eq(4000)); + } } TEST_F(IcingSearchEngineTest, GetAndPutShouldWorkAfterOptimization) { @@ -2351,8 +3154,8 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) { DeleteBySchemaTypeResultProto result_proto = icing.DeleteBySchemaType("message"); EXPECT_THAT(result_proto.status(), ProtoIsOk()); - NativeDeleteStats exp_stats; - exp_stats.set_delete_type(NativeDeleteStats::DeleteType::SCHEMA_TYPE); + DeleteStatsProto exp_stats; + exp_stats.set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE); exp_stats.set_latency_ms(7); exp_stats.set_num_documents_deleted(1); EXPECT_THAT(result_proto.delete_stats(), EqualsProto(exp_stats)); @@ -2383,8 +3186,8 @@ TEST_F(IcingSearchEngineTest, DeleteBySchemaType) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, DeleteSchemaTypeByQuery) { @@ -2458,8 +3261,8 @@ TEST_F(IcingSearchEngineTest, DeleteSchemaTypeByQuery) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, DeleteByNamespace) { @@ -2519,8 +3322,8 @@ TEST_F(IcingSearchEngineTest, DeleteByNamespace) { DeleteByNamespaceResultProto result_proto = 
icing.DeleteByNamespace("namespace1"); EXPECT_THAT(result_proto.status(), ProtoIsOk()); - NativeDeleteStats exp_stats; - exp_stats.set_delete_type(NativeDeleteStats::DeleteType::NAMESPACE); + DeleteStatsProto exp_stats; + exp_stats.set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE); exp_stats.set_latency_ms(7); exp_stats.set_num_documents_deleted(2); EXPECT_THAT(result_proto.delete_stats(), EqualsProto(exp_stats)); @@ -2559,8 +3362,8 @@ TEST_F(IcingSearchEngineTest, DeleteByNamespace) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, DeleteNamespaceByQuery) { @@ -2629,8 +3432,8 @@ TEST_F(IcingSearchEngineTest, DeleteNamespaceByQuery) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, DeleteByQuery) { @@ -2679,11 +3482,16 @@ TEST_F(IcingSearchEngineTest, DeleteByQuery) { search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); DeleteByQueryResultProto result_proto = icing.DeleteByQuery(search_spec); EXPECT_THAT(result_proto.status(), ProtoIsOk()); - NativeDeleteStats exp_stats; - exp_stats.set_delete_type(NativeDeleteStats::DeleteType::QUERY); + DeleteByQueryStatsProto exp_stats; exp_stats.set_latency_ms(7); exp_stats.set_num_documents_deleted(1); - EXPECT_THAT(result_proto.delete_stats(), EqualsProto(exp_stats)); + exp_stats.set_query_length(search_spec.query().length()); + exp_stats.set_num_terms(1); + 
exp_stats.set_num_namespaces_filtered(0); + exp_stats.set_num_schema_types_filtered(0); + exp_stats.set_parse_query_latency_ms(7); + exp_stats.set_document_removal_latency_ms(7); + EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats)); expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND); expected_get_result_proto.mutable_status()->set_message( @@ -2711,8 +3519,8 @@ TEST_F(IcingSearchEngineTest, DeleteByQuery) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) { @@ -2784,8 +3592,8 @@ TEST_F(IcingSearchEngineTest, DeleteByQueryNotFound) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SetSchemaShouldWorkAfterOptimization) { @@ -2848,8 +3656,8 @@ TEST_F(IcingSearchEngineTest, SearchShouldWorkAfterOptimization) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } // Destroys IcingSearchEngine to make sure nothing is cached. 
IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); @@ -2857,8 +3665,8 @@ TEST_F(IcingSearchEngineTest, SearchShouldWorkAfterOptimization) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, IcingShouldWorkFineIfOptimizationIsAborted) { @@ -2913,8 +3721,8 @@ TEST_F(IcingSearchEngineTest, IcingShouldWorkFineIfOptimizationIsAborted) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, @@ -2974,8 +3782,8 @@ TEST_F(IcingSearchEngineTest, SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); search_spec.set_query("n"); @@ -2985,8 +3793,8 @@ TEST_F(IcingSearchEngineTest, // Searching new content returns the new document search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, OptimizationShouldRecoverIfDataFilesAreMissing) { @@ -3046,8 +3854,8 @@ 
TEST_F(IcingSearchEngineTest, OptimizationShouldRecoverIfDataFilesAreMissing) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); search_spec.set_query("n"); @@ -3057,8 +3865,8 @@ TEST_F(IcingSearchEngineTest, OptimizationShouldRecoverIfDataFilesAreMissing) { // Searching new content returns the new document search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchIncludesDocumentsBeforeTtl) { @@ -3110,8 +3918,8 @@ TEST_F(IcingSearchEngineTest, SearchIncludesDocumentsBeforeTtl) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchDoesntIncludeDocumentsPastTtl) { @@ -3161,8 +3969,8 @@ TEST_F(IcingSearchEngineTest, SearchDoesntIncludeDocumentsPastTtl) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchWorksAfterSchemaTypesCompatiblyModified) { @@ -3200,8 
+4008,8 @@ TEST_F(IcingSearchEngineTest, SearchWorksAfterSchemaTypesCompatiblyModified) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // With just the schema type filter, we can search for the message search_spec.Clear(); @@ -3212,8 +4020,8 @@ TEST_F(IcingSearchEngineTest, SearchWorksAfterSchemaTypesCompatiblyModified) { search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // Since SchemaTypeIds are assigned based on order in the SchemaProto, this // will force a change in the DocumentStore's cached SchemaTypeIds @@ -3244,8 +4052,8 @@ TEST_F(IcingSearchEngineTest, SearchWorksAfterSchemaTypesCompatiblyModified) { // We can still search for the message document search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, RecoverFromMissingHeaderFile) { @@ -3276,8 +4084,8 @@ TEST_F(IcingSearchEngineTest, RecoverFromMissingHeaderFile) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + 
expected_search_result_proto)); } // This should shut down IcingSearchEngine and persist anything it needs to EXPECT_TRUE(filesystem()->DeleteFile(GetHeaderFilename().c_str())); @@ -3295,127 +4103,8 @@ TEST_F(IcingSearchEngineTest, RecoverFromMissingHeaderFile) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); - - // Checks that Schema is still since it'll be needed to validate the document - EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), - ProtoIsOk()); -} - -TEST_F(IcingSearchEngineTest, RecoverFromInvalidHeaderMagic) { - SearchSpecProto search_spec; - search_spec.set_query("message"); - search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); - - SearchResultProto expected_search_result_proto; - expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - CreateMessageDocument("namespace", "uri"); - - GetResultProto expected_get_result_proto; - expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_result_proto.mutable_document() = - CreateMessageDocument("namespace", "uri"); - - { - // Basic initialization/setup - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); - EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), - ProtoIsOk()); - EXPECT_THAT( - icing.Get("namespace", "uri", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); - SearchResultProto search_result_proto = - icing.Search(search_spec, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - 
EqualsSearchResultIgnoreStats(expected_search_result_proto)); - } // This should shut down IcingSearchEngine and persist anything it needs to - - // Change the header's magic value - int32_t invalid_magic = 1; // Anything that's not the actual kMagic value. - filesystem()->PWrite(GetHeaderFilename().c_str(), - offsetof(IcingSearchEngine::Header, magic), - &invalid_magic, sizeof(invalid_magic)); - - // We should be able to recover from this and access all our previous data - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); - - // Checks that DocumentLog is still ok - EXPECT_THAT( - icing.Get("namespace", "uri", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); - - // Checks that the index is still ok so we can search over it - SearchResultProto search_result_proto = - icing.Search(search_spec, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); - - // Checks that Schema is still since it'll be needed to validate the document - EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), - ProtoIsOk()); -} - -TEST_F(IcingSearchEngineTest, RecoverFromInvalidHeaderChecksum) { - SearchSpecProto search_spec; - search_spec.set_query("message"); - search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); - - SearchResultProto expected_search_result_proto; - expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_search_result_proto.mutable_results()->Add()->mutable_document() = - CreateMessageDocument("namespace", "uri"); - - GetResultProto expected_get_result_proto; - expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_result_proto.mutable_document() = - CreateMessageDocument("namespace", "uri"); - - { - // Basic initialization/setup - IcingSearchEngine 
icing(GetDefaultIcingOptions(), GetTestJniCache()); - EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); - EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), - ProtoIsOk()); - EXPECT_THAT( - icing.Get("namespace", "uri", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); - SearchResultProto search_result_proto = - icing.Search(search_spec, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); - } // This should shut down IcingSearchEngine and persist anything it needs to - - // Change the header's checksum value - uint32_t invalid_checksum = - 1; // Anything that's not the actual checksum value - filesystem()->PWrite(GetHeaderFilename().c_str(), - offsetof(IcingSearchEngine::Header, checksum), - &invalid_checksum, sizeof(invalid_checksum)); - - // We should be able to recover from this and access all our previous data - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); - - // Checks that DocumentLog is still ok - EXPECT_THAT( - icing.Get("namespace", "uri", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); - - // Checks that the index is still ok so we can search over it - SearchResultProto search_result_proto = - icing.Search(search_spec, GetDefaultScoringSpec(), - ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); // Checks that Schema is still since it'll be needed to validate the document EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), @@ -3471,8 +4160,8 @@ TEST_F(IcingSearchEngineTest, 
UnableToRecoverFromCorruptDocumentLog) { EqualsProto(expected_get_result_proto)); } // This should shut down IcingSearchEngine and persist anything it needs to - const std::string document_log_file = - absl_ports::StrCat(GetDocumentDir(), "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename()); const std::string corrupt_data = "1234"; EXPECT_TRUE(filesystem()->Write(document_log_file.c_str(), corrupt_data.data(), corrupt_data.size())); @@ -3493,9 +4182,10 @@ TEST_F(IcingSearchEngineTest, RecoverFromInconsistentSchemaStore) { .SetCreationTimestampMs(kDefaultCreationTimestampMs) .Build(); + IcingSearchEngineOptions options = GetDefaultIcingOptions(); { // Initializes folder and schema - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + IcingSearchEngine icing(options, GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); SchemaProto schema; @@ -3532,8 +4222,8 @@ TEST_F(IcingSearchEngineTest, RecoverFromInconsistentSchemaStore) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } // This should shut down IcingSearchEngine and persist anything it needs to { @@ -3569,6 +4259,13 @@ TEST_F(IcingSearchEngineTest, RecoverFromInconsistentSchemaStore) { property->mutable_string_indexing_config()->set_tokenizer_type( StringIndexingConfig::TokenizerType::PLAIN); + // Write the marker file + std::string marker_filepath = + absl_ports::StrCat(options.base_dir(), "/set_schema_marker"); + ScopedFd sfd(filesystem()->OpenForWrite(marker_filepath.c_str())); + ASSERT_TRUE(sfd.is_valid()); + + // Write the new schema FakeClock fake_clock; ICING_ASSERT_OK_AND_ASSIGN( 
std::unique_ptr<SchemaStore> schema_store, @@ -3615,8 +4312,8 @@ TEST_F(IcingSearchEngineTest, RecoverFromInconsistentSchemaStore) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, RecoverFromInconsistentDocumentStore) { @@ -3684,8 +4381,8 @@ TEST_F(IcingSearchEngineTest, RecoverFromInconsistentDocumentStore) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, RecoverFromInconsistentIndex) { @@ -3708,8 +4405,8 @@ TEST_F(IcingSearchEngineTest, RecoverFromInconsistentIndex) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } // This should shut down IcingSearchEngine and persist anything it needs to // Pretend we lost the entire index @@ -3723,8 +4420,8 @@ TEST_F(IcingSearchEngineTest, RecoverFromInconsistentIndex) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } 
TEST_F(IcingSearchEngineTest, RecoverFromCorruptIndex) { @@ -3747,8 +4444,8 @@ TEST_F(IcingSearchEngineTest, RecoverFromCorruptIndex) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } // This should shut down IcingSearchEngine and persist anything it needs to // Pretend index is corrupted @@ -3764,8 +4461,8 @@ TEST_F(IcingSearchEngineTest, RecoverFromCorruptIndex) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByDocumentScore) { @@ -3825,8 +4522,8 @@ TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByDocumentScore) { scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); SearchResultProto search_result_proto = icing.Search( search_spec, scoring_spec, ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchShouldAllowNoScoring) { @@ -3884,8 +4581,8 @@ TEST_F(IcingSearchEngineTest, SearchShouldAllowNoScoring) { scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::NONE); SearchResultProto search_result_proto = icing.Search( search_spec, scoring_spec, ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - 
EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByCreationTimestamp) { @@ -3940,8 +4637,8 @@ TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByCreationTimestamp) { ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP); SearchResultProto search_result_proto = icing.Search( search_spec, scoring_spec, ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByUsageCount) { @@ -4011,8 +4708,8 @@ TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByUsageCount) { ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT); SearchResultProto search_result_proto = icing.Search( search_spec, scoring_spec, ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, @@ -4069,8 +4766,8 @@ TEST_F(IcingSearchEngineTest, ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT); SearchResultProto search_result_proto = icing.Search( search_spec, scoring_spec, ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByUsageTimestamp) { @@ -4139,8 +4836,8 @@ TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByUsageTimestamp) { ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP); 
SearchResultProto search_result_proto = icing.Search( search_spec, scoring_spec, ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, Bm25fRelevanceScoringOneNamespace) { @@ -4303,24 +5000,21 @@ TEST_F(IcingSearchEngineTest, SearchSpecProto search_spec; search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); - search_spec.set_query("body:coffee OR body:food"); + search_spec.set_query("subject:coffee OR body:food"); ScoringSpecProto scoring_spec = GetDefaultScoringSpec(); scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE); SearchResultProto search_result_proto = icing.Search( search_spec, scoring_spec, ResultSpecProto::default_instance()); - // Result should be in descending score order, section restrict doesn't impact - // the BM25F score. + // Result should be in descending score order EXPECT_THAT(search_result_proto.status(), ProtoIsOk()); - // Both doc5 and doc7 have "coffee" in name and text sections. - // However, doc5 has more matches. + // The term frequencies of "coffee" and "food" are calculated respectively + // from the subject section and the body section. // Documents with "food" are ranked lower as the term "food" is commonly // present in this corpus, and thus, has a lower IDF. 
EXPECT_THAT( GetUrisFromSearchResults(search_result_proto), - ElementsAre("namespace1/uri5", // 'coffee' 2 times in section subject, - // 1 time in section body - "namespace1/uri7", // 'coffee' 2 times in section body + ElementsAre("namespace1/uri5", // 'coffee' 2 times in section subject "namespace1/uri1", // 'food' 2 times in section body "namespace1/uri4", // 'food' 2 times in section body "namespace1/uri2", // 'food' 1 time in section body @@ -4583,8 +5277,8 @@ TEST_F(IcingSearchEngineTest, ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP); SearchResultProto search_result_proto = icing.Search( search_spec, scoring_spec, ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, OlderUsageTimestampShouldNotOverrideNewerOnes) { @@ -4652,8 +5346,8 @@ TEST_F(IcingSearchEngineTest, OlderUsageTimestampShouldNotOverrideNewerOnes) { ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP); SearchResultProto search_result_proto = icing.Search( search_spec, scoring_spec, ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedAscendingly) { @@ -4714,8 +5408,218 @@ TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedAscendingly) { scoring_spec.set_order_by(ScoringSpecProto::Order::ASC); SearchResultProto search_result_proto = icing.Search( search_spec, scoring_spec, ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + 
expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineTest, + SearchResultGroupingDuplicateNamespaceShouldReturnError) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); + EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // Creates 2 documents and ensures the relationship in terms of document + // score is: document1 < document2 + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace1", "uri/1") + .SetSchema("Message") + .AddStringProperty("body", "message1") + .SetScore(1) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace2", "uri/2") + .SetSchema("Message") + .AddStringProperty("body", "message2") + .SetScore(2) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + + // "m" will match all 2 documents + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("m"); + + ScoringSpecProto scoring_spec = GetDefaultScoringSpec(); + scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); + + // Specify "namespace1" twice. This should result in an error. 
+ ResultSpecProto result_spec; + ResultSpecProto::ResultGrouping* result_grouping = + result_spec.add_result_groupings(); + result_grouping->set_max_results(1); + result_grouping->add_namespaces("namespace1"); + result_grouping->add_namespaces("namespace2"); + result_grouping = result_spec.add_result_groupings(); + result_grouping->set_max_results(1); + result_grouping->add_namespaces("namespace1"); + + SearchResultProto search_result_proto = + icing.Search(search_spec, scoring_spec, result_spec); + EXPECT_THAT(search_result_proto.status(), + ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); +} + +TEST_F(IcingSearchEngineTest, + SearchResultGroupingNonPositiveMaxResultsShouldReturnError) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); + EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // Creates 2 documents and ensures the relationship in terms of document + // score is: document1 < document2 + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace1", "uri/1") + .SetSchema("Message") + .AddStringProperty("body", "message1") + .SetScore(1) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace2", "uri/2") + .SetSchema("Message") + .AddStringProperty("body", "message2") + .SetScore(2) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + + // "m" will match all 2 documents + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("m"); + + ScoringSpecProto scoring_spec = GetDefaultScoringSpec(); + scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); + + // Specify zero results. This should result in an error. 
+ ResultSpecProto result_spec; + ResultSpecProto::ResultGrouping* result_grouping = + result_spec.add_result_groupings(); + result_grouping->set_max_results(0); + result_grouping->add_namespaces("namespace1"); + result_grouping->add_namespaces("namespace2"); + + SearchResultProto search_result_proto = + icing.Search(search_spec, scoring_spec, result_spec); + EXPECT_THAT(search_result_proto.status(), + ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); + + // Specify negative results. This should result in an error. + result_spec.mutable_result_groupings(0)->set_max_results(-1); + EXPECT_THAT(search_result_proto.status(), + ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); +} + +TEST_F(IcingSearchEngineTest, SearchResultGroupingMultiNamespaceGrouping) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); + EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // Creates 3 documents and ensures the relationship in terms of document + // score is: document1 < document2 < document3 < document4 < document5 < + // document6 + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace1", "uri/1") + .SetSchema("Message") + .AddStringProperty("body", "message1") + .SetScore(1) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace1", "uri/2") + .SetSchema("Message") + .AddStringProperty("body", "message2") + .SetScore(2) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document3 = + DocumentBuilder() + .SetKey("namespace2", "uri/3") + .SetSchema("Message") + .AddStringProperty("body", "message3") + .SetScore(3) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document4 = + DocumentBuilder() + .SetKey("namespace2", "uri/4") + .SetSchema("Message") + .AddStringProperty("body", "message1") + .SetScore(4) + 
.SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document5 = + DocumentBuilder() + .SetKey("namespace3", "uri/5") + .SetSchema("Message") + .AddStringProperty("body", "message3") + .SetScore(5) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + DocumentProto document6 = + DocumentBuilder() + .SetKey("namespace3", "uri/6") + .SetSchema("Message") + .AddStringProperty("body", "message1") + .SetScore(6) + .SetCreationTimestampMs(kDefaultCreationTimestampMs) + .Build(); + + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document5).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document6).status(), ProtoIsOk()); + + // "m" will match all 6 documents + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("m"); + + ScoringSpecProto scoring_spec = GetDefaultScoringSpec(); + scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); + + ResultSpecProto result_spec; + ResultSpecProto::ResultGrouping* result_grouping = + result_spec.add_result_groupings(); + result_grouping->set_max_results(1); + result_grouping->add_namespaces("namespace1"); + result_grouping = result_spec.add_result_groupings(); + result_grouping->set_max_results(2); + result_grouping->add_namespaces("namespace2"); + result_grouping->add_namespaces("namespace3"); + + SearchResultProto search_result_proto = + icing.Search(search_spec, scoring_spec, result_spec); + + // The last result (document1) in namespace "namespace1" should not be + // included. "namespace2" and "namespace3" are grouped together. So only the + // two highest scored documents between the two (both of which are in + // "namespace3") should be returned. 
+ SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document6; + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document5; + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document2; + + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, @@ -4797,8 +5701,8 @@ TEST_F(IcingSearchEngineTest, SetSchemaCanDetectPreviousSchemaWasLost) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, - EqualsSearchResultIgnoreStats(expected_search_result_proto)); + EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } // This should shut down IcingSearchEngine and persist anything it needs to ASSERT_TRUE(filesystem()->DeleteDirectoryRecursively(GetSchemaDir().c_str())); @@ -4824,35 +5728,234 @@ TEST_F(IcingSearchEngineTest, SetSchemaCanDetectPreviousSchemaWasLost) { SearchResultProto search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(), ResultSpecProto::default_instance()); - EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStats(empty_result)); + EXPECT_THAT(search_result_proto, + EqualsSearchResultIgnoreStatsAndScores(empty_result)); } -TEST_F(IcingSearchEngineTest, PersistToDisk) { - GetResultProto expected_get_result_proto; - expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); - *expected_get_result_proto.mutable_document() = - CreateMessageDocument("namespace", "uri"); - +TEST_F(IcingSearchEngineTest, ImplicitPersistToDiskFullSavesEverything) { + DocumentProto document = CreateMessageDocument("namespace", "uri"); { IcingSearchEngine icing(GetDefaultIcingOptions(), 
GetTestJniCache()); EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), - ProtoIsOk()); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + } // Destructing calls a PersistToDisk(FULL) - // Persisting shouldn't affect anything - EXPECT_THAT(icing.PersistToDisk().status(), ProtoIsOk()); + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - EXPECT_THAT( - icing.Get("namespace", "uri", GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); - } // Destructing persists as well + // There should be no recovery since everything should be saved properly. + InitializeResultProto init_result = icing.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + + // Schema is still intact. + GetSchemaResultProto expected_get_schema_result_proto; + expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema(); + + EXPECT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto)); + + // Documents are still intact. 
+ GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = document; - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); - EXPECT_THAT(icing.Initialize().status(), ProtoIsOk()); EXPECT_THAT( icing.Get("namespace", "uri", GetResultSpecProto::default_instance()), EqualsProto(expected_get_result_proto)); + + // Index is still intact. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); // Content in the Message document. + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document; + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineTest, ExplicitPersistToDiskFullSavesEverything) { + DocumentProto document = CreateMessageDocument("namespace", "uri"); + + // Add schema and documents to our first icing1 instance. + IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache()); + EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk()); + EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk()); + EXPECT_THAT(icing1.PersistToDisk(PersistType::FULL).status(), ProtoIsOk()); + + // Initialize a second icing2 instance which should have it's own memory + // space. If data from icing1 isn't being persisted to the files, then icing2 + // won't be able to see those changes. + IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache()); + + // There should be no recovery since everything should be saved properly. 
+ InitializeResultProto init_result = icing2.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + + // Schema is still intact. + GetSchemaResultProto expected_get_schema_result_proto; + expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema(); + + EXPECT_THAT(icing2.GetSchema(), + EqualsProto(expected_get_schema_result_proto)); + + // Documents are still intact. + GetResultProto expected_get_result_proto; + expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_result_proto.mutable_document() = document; + + EXPECT_THAT( + icing2.Get("namespace", "uri", GetResultSpecProto::default_instance()), + EqualsProto(expected_get_result_proto)); + + // Index is still intact. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); // Content in the Message document. 
+ + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document; + + SearchResultProto actual_results = + icing2.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineTest, NoPersistToDiskLosesAllDocumentsAndIndex) { + IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache()); + EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk()); + EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + DocumentProto document = CreateMessageDocument("namespace", "uri"); + EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk()); + EXPECT_THAT( + icing1.Get("namespace", "uri", GetResultSpecProto::default_instance()) + .document(), + EqualsProto(document)); + + // It's intentional that no PersistToDisk call is made before initializing a + // second instance of icing. + + IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing2.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::PARTIAL_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + + // The document shouldn't be found because we forgot to call + // PersistToDisk(LITE)! 
+ EXPECT_THAT( + icing2.Get("namespace", "uri", GetResultSpecProto::default_instance()) + .status(), + ProtoStatusIs(StatusProto::NOT_FOUND)); + + // Searching also shouldn't get us anything because the index wasn't + // recovered. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); // Content in the Message document. + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + + SearchResultProto actual_results = + icing2.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); +} + +TEST_F(IcingSearchEngineTest, PersistToDiskLiteSavesGroundTruth) { + DocumentProto document = CreateMessageDocument("namespace", "uri"); + + IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache()); + EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk()); + EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk()); + EXPECT_THAT(icing1.PersistToDisk(PersistType::LITE).status(), ProtoIsOk()); + EXPECT_THAT( + icing1.Get("namespace", "uri", GetResultSpecProto::default_instance()) + .document(), + EqualsProto(document)); + + IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing2.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + + // A checksum mismatch gets reported as an IO error. 
The document store and + // index didn't have their derived files included in the checksum previously, + // so reinitializing will trigger a checksum mismatch. + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::IO_ERROR)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::IO_ERROR)); + + // Schema is still intact. + GetSchemaResultProto expected_get_schema_result_proto; + expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema(); + + EXPECT_THAT(icing2.GetSchema(), + EqualsProto(expected_get_schema_result_proto)); + + // The document should be found because we called PersistToDisk(LITE)! + EXPECT_THAT( + icing2.Get("namespace", "uri", GetResultSpecProto::default_instance()) + .document(), + EqualsProto(document)); + + // Recovered index is still intact. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("message"); // Content in the Message document. + + SearchResultProto expected_search_result_proto; + expected_search_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_search_result_proto.mutable_results()->Add()->mutable_document() = + document; + + SearchResultProto actual_results = + icing2.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores( + expected_search_result_proto)); } TEST_F(IcingSearchEngineTest, ResetOk) { @@ -4886,11 +5989,11 @@ TEST_F(IcingSearchEngineTest, ResetOk) { EXPECT_THAT(icing.SetSchema(empty_schema).status(), ProtoIsOk()); } -TEST_F(IcingSearchEngineTest, ResetAbortedError) { +TEST_F(IcingSearchEngineTest, ResetDeleteFailureCausesInternalError) { auto mock_filesystem = std::make_unique<MockFilesystem>(); - // This fails IcingSearchEngine::Reset(). 
But since we didn't actually delete - // anything, we'll be able to consider this just an ABORTED call. + // This fails IcingSearchEngine::Reset() with status code INTERNAL and leaves + // the IcingSearchEngine instance in an uninitialized state. ON_CALL(*mock_filesystem, DeleteDirectoryRecursively(StrEq(GetTestBaseDir().c_str()))) .WillByDefault(Return(false)); @@ -4904,46 +6007,17 @@ TEST_F(IcingSearchEngineTest, ResetAbortedError) { DocumentProto document = CreateMessageDocument("namespace", "uri"); ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); - EXPECT_THAT(icing.Reset().status(), ProtoStatusIs(StatusProto::ABORTED)); + EXPECT_THAT(icing.Reset().status(), ProtoStatusIs(StatusProto::INTERNAL)); - // Everything is still intact. - // Can get old data. GetResultProto expected_get_result_proto; - expected_get_result_proto.mutable_status()->set_code(StatusProto::OK); + expected_get_result_proto.mutable_status()->set_code( + StatusProto::FAILED_PRECONDITION); *expected_get_result_proto.mutable_document() = document; - EXPECT_THAT(icing.Get(document.namespace_(), document.uri(), - GetResultSpecProto::default_instance()), - EqualsProto(expected_get_result_proto)); - - // Can add new data. - EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(), - ProtoIsOk()); -} - -TEST_F(IcingSearchEngineTest, ResetInternalError) { - auto mock_filesystem = std::make_unique<MockFilesystem>(); - - // Let all other calls succeed. 
- EXPECT_CALL(*mock_filesystem, Write(Matcher<const char*>(_), _, _)) - .WillRepeatedly(Return(true)); - - // This prevents IcingSearchEngine from creating a DocumentStore instance on - // reinitialization - const std::string document_log_path = - GetTestBaseDir() + "/document_dir/document_log"; - EXPECT_CALL( - *mock_filesystem, - Write(Matcher<const char*>(StrEq(document_log_path.c_str())), _, _)) - .WillOnce(Return(true)) - .WillOnce(Return(false)); - - TestIcingSearchEngine icing(GetDefaultIcingOptions(), - std::move(mock_filesystem), - std::make_unique<IcingFilesystem>(), - std::make_unique<FakeClock>(), GetTestJniCache()); - ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); - ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); - EXPECT_THAT(icing.Reset().status(), ProtoStatusIs(StatusProto::INTERNAL)); + EXPECT_THAT(icing + .Get(document.namespace_(), document.uri(), + GetResultSpecProto::default_instance()) + .status(), + ProtoStatusIs(StatusProto::FAILED_PRECONDITION)); } TEST_F(IcingSearchEngineTest, SnippetNormalization) { @@ -4985,34 +6059,28 @@ TEST_F(IcingSearchEngineTest, SnippetNormalization) { const DocumentProto& result_document_1 = results.results(0).document(); const SnippetProto& result_snippet_1 = results.results(0).snippet(); EXPECT_THAT(result_document_1, EqualsProto(document_two)); - EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body", - /*snippet_index=*/0), - Eq("mdi")); - EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body", - /*snippet_index=*/0), - Eq("mdi Zürich Team Meeting")); - EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body", - /*snippet_index=*/1), - Eq("Zürich")); - EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body", - /*snippet_index=*/1), - Eq("mdi Zürich Team Meeting")); + EXPECT_THAT(result_snippet_1.entries(), SizeIs(1)); + EXPECT_THAT(result_snippet_1.entries(0).property_name(), Eq("body")); + std::string_view content = GetString( + 
&result_document_1, result_snippet_1.entries(0).property_name()); + EXPECT_THAT( + GetWindows(content, result_snippet_1.entries(0)), + ElementsAre("mdi Zürich Team Meeting", "mdi Zürich Team Meeting")); + EXPECT_THAT(GetMatches(content, result_snippet_1.entries(0)), + ElementsAre("mdi", "Zürich")); const DocumentProto& result_document_2 = results.results(1).document(); const SnippetProto& result_snippet_2 = results.results(1).snippet(); EXPECT_THAT(result_document_2, EqualsProto(document_one)); - EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body", - /*snippet_index=*/0), - Eq("MDI")); - EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body", - /*snippet_index=*/0), - Eq("MDI zurich Team Meeting")); - EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body", - /*snippet_index=*/1), - Eq("zurich")); - EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body", - /*snippet_index=*/1), - Eq("MDI zurich Team Meeting")); + EXPECT_THAT(result_snippet_2.entries(), SizeIs(1)); + EXPECT_THAT(result_snippet_2.entries(0).property_name(), Eq("body")); + content = GetString(&result_document_2, + result_snippet_2.entries(0).property_name()); + EXPECT_THAT( + GetWindows(content, result_snippet_2.entries(0)), + ElementsAre("MDI zurich Team Meeting", "MDI zurich Team Meeting")); + EXPECT_THAT(GetMatches(content, result_snippet_2.entries(0)), + ElementsAre("MDI", "zurich")); } TEST_F(IcingSearchEngineTest, SnippetNormalizationPrefix) { @@ -5054,34 +6122,28 @@ TEST_F(IcingSearchEngineTest, SnippetNormalizationPrefix) { const DocumentProto& result_document_1 = results.results(0).document(); const SnippetProto& result_snippet_1 = results.results(0).snippet(); EXPECT_THAT(result_document_1, EqualsProto(document_two)); - EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body", - /*snippet_index=*/0), - Eq("mdi")); - EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body", - /*snippet_index=*/0), - Eq("mdi Zürich Team Meeting")); - 
EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body", - /*snippet_index=*/1), - Eq("Zürich")); - EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body", - /*snippet_index=*/1), - Eq("mdi Zürich Team Meeting")); + EXPECT_THAT(result_snippet_1.entries(), SizeIs(1)); + EXPECT_THAT(result_snippet_1.entries(0).property_name(), Eq("body")); + std::string_view content = GetString( + &result_document_1, result_snippet_1.entries(0).property_name()); + EXPECT_THAT( + GetWindows(content, result_snippet_1.entries(0)), + ElementsAre("mdi Zürich Team Meeting", "mdi Zürich Team Meeting")); + EXPECT_THAT(GetMatches(content, result_snippet_1.entries(0)), + ElementsAre("mdi", "Zürich")); const DocumentProto& result_document_2 = results.results(1).document(); const SnippetProto& result_snippet_2 = results.results(1).snippet(); EXPECT_THAT(result_document_2, EqualsProto(document_one)); - EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body", - /*snippet_index=*/0), - Eq("MDI")); - EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body", - /*snippet_index=*/0), - Eq("MDI zurich Team Meeting")); - EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body", - /*snippet_index=*/1), - Eq("zurich")); - EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body", - /*snippet_index=*/1), - Eq("MDI zurich Team Meeting")); + EXPECT_THAT(result_snippet_2.entries(), SizeIs(1)); + EXPECT_THAT(result_snippet_2.entries(0).property_name(), Eq("body")); + content = GetString(&result_document_2, + result_snippet_2.entries(0).property_name()); + EXPECT_THAT( + GetWindows(content, result_snippet_2.entries(0)), + ElementsAre("MDI zurich Team Meeting", "MDI zurich Team Meeting")); + EXPECT_THAT(GetMatches(content, result_snippet_2.entries(0)), + ElementsAre("MDI", "zurich")); } TEST_F(IcingSearchEngineTest, SnippetSectionRestrict) { @@ -5112,21 +6174,18 @@ TEST_F(IcingSearchEngineTest, SnippetSectionRestrict) { icing.Search(search_spec, 
GetDefaultScoringSpec(), result_spec); EXPECT_THAT(results.status(), ProtoIsOk()); ASSERT_THAT(results.results(), SizeIs(1)); + const DocumentProto& result_document = results.results(0).document(); const SnippetProto& result_snippet = results.results(0).snippet(); EXPECT_THAT(result_document, EqualsProto(document_one)); - EXPECT_THAT( - GetMatch(result_document, result_snippet, "body", /*snippet_index=*/0), - Eq("zurich")); - EXPECT_THAT( - GetWindow(result_document, result_snippet, "body", /*snippet_index=*/0), - Eq("MDI zurich Team Meeting")); - EXPECT_THAT( - GetMatch(result_document, result_snippet, "subject", /*snippet_index=*/0), - IsEmpty()); - EXPECT_THAT(GetWindow(result_document, result_snippet, "subject", - /*snippet_index=*/0), - IsEmpty()); + EXPECT_THAT(result_snippet.entries(), SizeIs(1)); + EXPECT_THAT(result_snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&result_document, result_snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, result_snippet.entries(0)), + ElementsAre("MDI zurich Team Meeting")); + EXPECT_THAT(GetMatches(content, result_snippet.entries(0)), + ElementsAre("zurich")); } TEST_F(IcingSearchEngineTest, UninitializedInstanceFailsSafely) { @@ -5167,7 +6226,7 @@ TEST_F(IcingSearchEngineTest, UninitializedInstanceFailsSafely) { ProtoStatusIs(StatusProto::FAILED_PRECONDITION)); icing.InvalidateNextPageToken(kSomePageToken); // Verify this doesn't crash. - EXPECT_THAT(icing.PersistToDisk().status(), + EXPECT_THAT(icing.PersistToDisk(PersistType::FULL).status(), ProtoStatusIs(StatusProto::FAILED_PRECONDITION)); EXPECT_THAT(icing.Optimize().status(), ProtoStatusIs(StatusProto::FAILED_PRECONDITION)); @@ -5401,15 +6460,16 @@ TEST_F(IcingSearchEngineTest, RestoreIndexLoseLiteIndex) { // 2. 
Delete the last document from the document log { - const std::string document_log_file = - absl_ports::StrCat(GetDocumentDir(), "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename()); filesystem()->DeleteFile(document_log_file.c_str()); - ICING_ASSERT_OK_AND_ASSIGN(auto create_result, - FileBackedProtoLog<DocumentWrapper>::Create( - filesystem(), document_log_file.c_str(), - FileBackedProtoLog<DocumentWrapper>::Options( - /*compress_in=*/true))); - std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> document_log = + ICING_ASSERT_OK_AND_ASSIGN( + auto create_result, + PortableFileBackedProtoLog<DocumentWrapper>::Create( + filesystem(), document_log_file.c_str(), + PortableFileBackedProtoLog<DocumentWrapper>::Options( + /*compress_in=*/true))); + std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log = std::move(create_result.proto_log); document = DocumentBuilder(document).SetUri("fake_type/0").Build(); @@ -5474,15 +6534,16 @@ TEST_F(IcingSearchEngineTest, RestoreIndexLoseIndex) { // 2. Delete the last two documents from the document log. 
{ - const std::string document_log_file = - absl_ports::StrCat(GetDocumentDir(), "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename()); filesystem()->DeleteFile(document_log_file.c_str()); - ICING_ASSERT_OK_AND_ASSIGN(auto create_result, - FileBackedProtoLog<DocumentWrapper>::Create( - filesystem(), document_log_file.c_str(), - FileBackedProtoLog<DocumentWrapper>::Options( - /*compress_in=*/true))); - std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> document_log = + ICING_ASSERT_OK_AND_ASSIGN( + auto create_result, + PortableFileBackedProtoLog<DocumentWrapper>::Create( + filesystem(), document_log_file.c_str(), + PortableFileBackedProtoLog<DocumentWrapper>::Options( + /*compress_in=*/true))); + std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log = std::move(create_result.proto_log); document = DocumentBuilder(document).SetUri("fake_type/0").Build(); @@ -5514,6 +6575,88 @@ TEST_F(IcingSearchEngineTest, RestoreIndexLoseIndex) { } } +TEST_F(IcingSearchEngineTest, + DocumentWithNoIndexedContentDoesntCauseRestoreIndex) { + // 1. Create an index with a single document in it that has no indexed + // content. + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // Set a schema for a single type that has no indexed properties. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty( + PropertyConfigBuilder() + .SetName("unindexedField") + .SetDataTypeString(MATCH_NONE, TOKENIZER_NONE) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + + // Add a document that contains no indexed content. 
+ DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema("Message") + .AddStringProperty("unindexedField", + "Don't you dare search over this!") + .Build(); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + } + + // 2. Create the index again. This should NOT trigger a recovery of any kind. + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + } +} + +TEST_F(IcingSearchEngineTest, + DocumentWithNoValidIndexedContentDoesntCauseRestoreIndex) { + // 1. Create an index with a single document in it that has no valid indexed + // tokens in its content. + { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // Set a schema for a single type that has no indexed properties. + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // Add a document that contains no valid indexed content - just punctuation. + DocumentProto document = DocumentBuilder() + .SetKey("icing", "fake_type/0") + .SetSchema("Message") + .AddStringProperty("body", "?...!") + .Build(); + EXPECT_THAT(icing.Put(document).status(), ProtoIsOk()); + } + + // 2. Create the index again. This should NOT trigger a recovery of any kind. 
+ { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + } +} + TEST_F(IcingSearchEngineTest, IndexingDocMergeFailureResets) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/0") @@ -5596,8 +6739,7 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogFunctionLatency) { std::move(fake_clock), GetTestJniCache()); InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT(initialize_result_proto.native_initialize_stats().latency_ms(), - Eq(10)); + EXPECT_THAT(initialize_result_proto.initialize_stats().latency_ms(), Eq(10)); } TEST_F(IcingSearchEngineTest, InitializeShouldLogNumberOfDocuments) { @@ -5617,9 +6759,8 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogNumberOfDocuments) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT( - initialize_result_proto.native_initialize_stats().num_documents(), - Eq(0)); + EXPECT_THAT(initialize_result_proto.initialize_stats().num_documents(), + Eq(0)); ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); @@ -5629,9 +6770,8 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogNumberOfDocuments) { IcingSearchEngine 
icing(GetDefaultIcingOptions(), GetTestJniCache()); InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT( - initialize_result_proto.native_initialize_stats().num_documents(), - Eq(1)); + EXPECT_THAT(initialize_result_proto.initialize_stats().num_documents(), + Eq(1)); // Put another document. ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); @@ -5641,9 +6781,8 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogNumberOfDocuments) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT( - initialize_result_proto.native_initialize_stats().num_documents(), - Eq(2)); + EXPECT_THAT(initialize_result_proto.initialize_stats().num_documents(), + Eq(2)); } } @@ -5659,25 +6798,25 @@ TEST_F(IcingSearchEngineTest, std::move(fake_clock), GetTestJniCache()); InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_latency_ms(), Eq(0)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .document_store_data_status(), - Eq(NativeInitializeStats::NO_DATA_LOSS)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .index_restoration_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .index_restoration_latency_ms(), - Eq(0)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - 
.schema_store_recovery_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT( + initialize_result_proto.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_latency_ms(), + Eq(0)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_latency_ms(), Eq(0)); } @@ -5701,8 +6840,8 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCausePartialDataLoss) { // Append a non-checksummed document. This will mess up the checksum of the // proto log, forcing it to rewind and later return a DATA_LOSS error. const std::string serialized_document = document.SerializeAsString(); - const std::string document_log_file = - absl_ports::StrCat(GetDocumentDir(), "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename()); int64_t file_size = filesystem()->GetFileSize(document_log_file.c_str()); filesystem()->PWrite(document_log_file.c_str(), file_size, @@ -5721,25 +6860,25 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCausePartialDataLoss) { std::move(fake_clock), GetTestJniCache()); InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_cause(), - Eq(NativeInitializeStats::DATA_LOSS)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::DATA_LOSS)); + 
EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_latency_ms(), Eq(10)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .document_store_data_status(), - Eq(NativeInitializeStats::PARTIAL_LOSS)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .index_restoration_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT( + initialize_result_proto.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::PARTIAL_LOSS)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), Eq(0)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_latency_ms(), Eq(0)); } @@ -5752,31 +6891,47 @@ TEST_F(IcingSearchEngineTest, .SetSchema("Message") .AddStringProperty("body", "message body") .Build(); + + const std::string document_log_file = absl_ports::StrCat( + GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename()); + int64_t corruptible_offset; + { // Initialize and put a document. IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + + // There's some space at the beginning of the file (e.g. header, kmagic, + // etc) that is necessary to initialize the FileBackedProtoLog. We can't + // corrupt that region, so we need to figure out the offset at which + // documents will be written to - which is the file size after + // initialization. 
+ corruptible_offset = filesystem()->GetFileSize(document_log_file.c_str()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk()); } { - // Modify the document log checksum to trigger a complete document log - // rewind. - const std::string document_log_file = - absl_ports::StrCat(GetDocumentDir(), "/document_log"); - - FileBackedProtoLog<DocumentWrapper>::Header document_log_header; - filesystem()->PRead(document_log_file.c_str(), &document_log_header, - sizeof(FileBackedProtoLog<DocumentWrapper>::Header), - /*offset=*/0); - // Set a garbage checksum. - document_log_header.log_checksum = 10; - document_log_header.header_checksum = - document_log_header.CalculateHeaderChecksum(); - filesystem()->PWrite(document_log_file.c_str(), /*offset=*/0, - &document_log_header, - sizeof(FileBackedProtoLog<DocumentWrapper>::Header)); + // "Corrupt" the content written in the log. Make the corrupt document + // smaller than our original one so we don't accidentally write past our + // file. + DocumentProto document = + DocumentBuilder().SetKey("invalid_namespace", "invalid_uri").Build(); + std::string serialized_document = document.SerializeAsString(); + ASSERT_TRUE(filesystem()->PWrite( + document_log_file.c_str(), corruptible_offset, + serialized_document.data(), serialized_document.size())); + + PortableFileBackedProtoLog<DocumentWrapper>::Header header = + ReadDocumentLogHeader(*filesystem(), document_log_file); + + // Set dirty bit to true to reflect that something changed in the log. 
+ header.SetDirtyFlag(true); + header.SetHeaderChecksum(header.CalculateHeaderChecksum()); + + WriteDocumentLogHeader(*filesystem(), document_log_file, header); } { @@ -5790,27 +6945,27 @@ TEST_F(IcingSearchEngineTest, std::move(fake_clock), GetTestJniCache()); InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_cause(), - Eq(NativeInitializeStats::DATA_LOSS)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_latency_ms(), Eq(10)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .document_store_data_status(), - Eq(NativeInitializeStats::COMPLETE_LOSS)); - // The complete rewind of ground truth causes the mismatch of total - // checksum, so index should be restored. - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .index_restoration_cause(), - Eq(NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT( + initialize_result_proto.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::COMPLETE_LOSS)); + // The complete rewind of ground truth causes us to clear the index, but + // that's not considered a restoration. 
+ EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), - Eq(10)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(0)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_latency_ms(), Eq(0)); } @@ -5848,51 +7003,76 @@ TEST_F(IcingSearchEngineTest, std::move(fake_clock), GetTestJniCache()); InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .index_restoration_cause(), - Eq(NativeInitializeStats::INCONSISTENT_WITH_GROUND_TRUTH)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), Eq(10)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_latency_ms(), Eq(0)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .document_store_data_status(), - Eq(NativeInitializeStats::NO_DATA_LOSS)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT( + 
initialize_result_proto.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_latency_ms(), Eq(0)); } } TEST_F(IcingSearchEngineTest, - InitializeShouldLogRecoveryCauseTotalChecksumMismatch) { + InitializeShouldLogRecoveryCauseSchemaChangesOutofSync) { DocumentProto document = DocumentBuilder() .SetKey("icing", "fake_type/0") .SetSchema("Message") .AddStringProperty("body", "message body") .Build(); + IcingSearchEngineOptions options = GetDefaultIcingOptions(); { // Initialize and put one document. - IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + IcingSearchEngine icing(options, GetTestJniCache()); ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); } { - // Change the header's checksum value to a random value. - uint32_t invalid_checksum = 1; - filesystem()->PWrite(GetHeaderFilename().c_str(), - offsetof(IcingSearchEngine::Header, checksum), - &invalid_checksum, sizeof(invalid_checksum)); + // Simulate a schema change where power is lost after the schema is written. 
+ SchemaProto new_schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType("Message") + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + // Write the marker file + std::string marker_filepath = + absl_ports::StrCat(options.base_dir(), "/set_schema_marker"); + ScopedFd sfd(filesystem()->OpenForWrite(marker_filepath.c_str())); + ASSERT_TRUE(sfd.is_valid()); + + // Write the new schema + FakeClock fake_clock; + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(filesystem(), GetSchemaDir(), &fake_clock)); + ICING_EXPECT_OK(schema_store->SetSchema(new_schema)); } { @@ -5905,25 +7085,58 @@ TEST_F(IcingSearchEngineTest, std::move(fake_clock), GetTestJniCache()); InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .index_restoration_cause(), - Eq(NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), Eq(10)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_cause(), - Eq(NativeInitializeStats::TOTAL_CHECKSUM_MISMATCH)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC)); + EXPECT_THAT(initialize_result_proto.initialize_stats() 
.document_store_recovery_latency_ms(), Eq(10)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .document_store_data_status(), - Eq(NativeInitializeStats::NO_DATA_LOSS)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT( + initialize_result_proto.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .schema_store_recovery_latency_ms(), + Eq(0)); + } + + { + // No recovery should be needed. + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(10); + TestIcingSearchEngine icing(GetDefaultIcingOptions(), + std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), + std::move(fake_clock), GetTestJniCache()); + InitializeResultProto initialize_result_proto = icing.Initialize(); + EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .index_restoration_latency_ms(), + Eq(0)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() + .document_store_recovery_latency_ms(), + Eq(0)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_latency_ms(), Eq(0)); } @@ -5970,25 
+7183,25 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCauseIndexIOError) { InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .index_restoration_cause(), - Eq(NativeInitializeStats::IO_ERROR)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .index_restoration_latency_ms(), - Eq(10)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::IO_ERROR)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_latency_ms(), + Eq(10)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_latency_ms(), Eq(0)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .document_store_data_status(), - Eq(NativeInitializeStats::NO_DATA_LOSS)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .schema_store_recovery_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT( + initialize_result_proto.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_latency_ms(), Eq(0)); } @@ -6036,25 +7249,25 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogRecoveryCauseDocStoreIOError) { InitializeResultProto initialize_result_proto = icing.Initialize(); 
EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_cause(), - Eq(NativeInitializeStats::IO_ERROR)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::IO_ERROR)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_latency_ms(), Eq(10)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .document_store_data_status(), - Eq(NativeInitializeStats::NO_DATA_LOSS)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .index_restoration_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .index_restoration_latency_ms(), - Eq(0)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .schema_store_recovery_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT( + initialize_result_proto.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_latency_ms(), + Eq(0)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_latency_ms(), Eq(0)); } @@ -6083,25 +7296,25 @@ TEST_F(IcingSearchEngineTest, std::move(fake_clock), GetTestJniCache()); InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT(initialize_result_proto.initialize_stats() 
.schema_store_recovery_cause(), - Eq(NativeInitializeStats::IO_ERROR)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::IO_ERROR)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .schema_store_recovery_latency_ms(), Eq(10)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .document_store_recovery_latency_ms(), Eq(0)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .document_store_data_status(), - Eq(NativeInitializeStats::NO_DATA_LOSS)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() - .index_restoration_cause(), - Eq(NativeInitializeStats::NONE)); - EXPECT_THAT(initialize_result_proto.native_initialize_stats() + EXPECT_THAT( + initialize_result_proto.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT( + initialize_result_proto.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(initialize_result_proto.initialize_stats() .index_restoration_latency_ms(), Eq(0)); } @@ -6114,9 +7327,8 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogNumberOfSchemaTypes) { InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); // There should be 0 schema types. - EXPECT_THAT( - initialize_result_proto.native_initialize_stats().num_schema_types(), - Eq(0)); + EXPECT_THAT(initialize_result_proto.initialize_stats().num_schema_types(), + Eq(0)); // Set a schema with one type config. 
ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); @@ -6127,9 +7339,8 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogNumberOfSchemaTypes) { InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); // There should be 1 schema type. - EXPECT_THAT( - initialize_result_proto.native_initialize_stats().num_schema_types(), - Eq(1)); + EXPECT_THAT(initialize_result_proto.initialize_stats().num_schema_types(), + Eq(1)); // Create and set a schema with two type configs: Email and Message. SchemaProto schema = CreateEmailSchema(); @@ -6152,9 +7363,8 @@ TEST_F(IcingSearchEngineTest, InitializeShouldLogNumberOfSchemaTypes) { IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); InitializeResultProto initialize_result_proto = icing.Initialize(); EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk()); - EXPECT_THAT( - initialize_result_proto.native_initialize_stats().num_schema_types(), - Eq(2)); + EXPECT_THAT(initialize_result_proto.initialize_stats().num_schema_types(), + Eq(2)); } } @@ -6176,8 +7386,7 @@ TEST_F(IcingSearchEngineTest, PutDocumentShouldLogFunctionLatency) { PutResultProto put_result_proto = icing.Put(document); EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); - EXPECT_THAT(put_result_proto.native_put_document_stats().latency_ms(), - Eq(10)); + EXPECT_THAT(put_result_proto.put_document_stats().latency_ms(), Eq(10)); } TEST_F(IcingSearchEngineTest, PutDocumentShouldLogDocumentStoreStats) { @@ -6200,11 +7409,9 @@ TEST_F(IcingSearchEngineTest, PutDocumentShouldLogDocumentStoreStats) { PutResultProto put_result_proto = icing.Put(document); EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); - EXPECT_THAT( - put_result_proto.native_put_document_stats().document_store_latency_ms(), - Eq(10)); - size_t document_size = - put_result_proto.native_put_document_stats().document_size(); + 
EXPECT_THAT(put_result_proto.put_document_stats().document_store_latency_ms(), + Eq(10)); + size_t document_size = put_result_proto.put_document_stats().document_size(); EXPECT_THAT(document_size, Ge(document.ByteSizeLong())); EXPECT_THAT(document_size, Le(document.ByteSizeLong() + sizeof(DocumentProto::InternalFields))); @@ -6228,18 +7435,16 @@ TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexingStats) { PutResultProto put_result_proto = icing.Put(document); EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); - EXPECT_THAT(put_result_proto.native_put_document_stats().index_latency_ms(), - Eq(10)); + EXPECT_THAT(put_result_proto.put_document_stats().index_latency_ms(), Eq(10)); // No merge should happen. - EXPECT_THAT( - put_result_proto.native_put_document_stats().index_merge_latency_ms(), - Eq(0)); + EXPECT_THAT(put_result_proto.put_document_stats().index_merge_latency_ms(), + Eq(0)); // Number of tokens should not exceed. - EXPECT_FALSE(put_result_proto.native_put_document_stats() + EXPECT_FALSE(put_result_proto.put_document_stats() .tokenization_stats() .exceeded_max_token_num()); // The input document has 2 tokens. - EXPECT_THAT(put_result_proto.native_put_document_stats() + EXPECT_THAT(put_result_proto.put_document_stats() .tokenization_stats() .num_tokens_indexed(), Eq(2)); @@ -6263,10 +7468,10 @@ TEST_F(IcingSearchEngineTest, PutDocumentShouldLogWhetherNumTokensExceeds) { PutResultProto put_result_proto = icing.Put(document); EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); // Number of tokens(2) exceeds the max allowed value(1). 
- EXPECT_TRUE(put_result_proto.native_put_document_stats() + EXPECT_TRUE(put_result_proto.put_document_stats() .tokenization_stats() .exceeded_max_token_num()); - EXPECT_THAT(put_result_proto.native_put_document_stats() + EXPECT_THAT(put_result_proto.put_document_stats() .tokenization_stats() .num_tokens_indexed(), Eq(1)); @@ -6300,9 +7505,8 @@ TEST_F(IcingSearchEngineTest, PutDocumentShouldLogIndexMergeLatency) { // Putting document2 should trigger an index merge. PutResultProto put_result_proto = icing.Put(document2); EXPECT_THAT(put_result_proto.status(), ProtoIsOk()); - EXPECT_THAT( - put_result_proto.native_put_document_stats().index_merge_latency_ms(), - Eq(10)); + EXPECT_THAT(put_result_proto.put_document_stats().index_merge_latency_ms(), + Eq(10)); } TEST_F(IcingSearchEngineTest, SearchWithProjectionEmptyFieldPath) { @@ -6491,7 +7695,7 @@ TEST_F(IcingSearchEngineTest, SearchWithProjectionMultipleFieldPaths) { EqualsProto(projected_document_one)); } -TEST_F(IcingSearchEngineTest, NativeQueryStatsTest) { +TEST_F(IcingSearchEngineTest, QueryStatsProtoTest) { auto fake_clock = std::make_unique<FakeClock>(); fake_clock->SetTimerElapsedMilliseconds(5); TestIcingSearchEngine icing(GetDefaultIcingOptions(), @@ -6537,7 +7741,8 @@ TEST_F(IcingSearchEngineTest, NativeQueryStatsTest) { ASSERT_THAT(search_result.next_page_token(), Ne(kInvalidNextPageToken)); // Check the stats - NativeQueryStats exp_stats; + QueryStatsProto exp_stats; + exp_stats.set_query_length(7); exp_stats.set_num_terms(1); exp_stats.set_num_namespaces_filtered(1); exp_stats.set_num_schema_types_filtered(1); @@ -6547,7 +7752,7 @@ TEST_F(IcingSearchEngineTest, NativeQueryStatsTest) { exp_stats.set_requested_page_size(2); exp_stats.set_num_results_returned_current_page(2); exp_stats.set_num_documents_scored(5); - exp_stats.set_num_results_snippeted(2); + exp_stats.set_num_results_with_snippets(2); exp_stats.set_latency_ms(5); exp_stats.set_parse_query_latency_ms(5); 
exp_stats.set_scoring_latency_ms(5); @@ -6561,11 +7766,11 @@ TEST_F(IcingSearchEngineTest, NativeQueryStatsTest) { ASSERT_THAT(search_result.results(), SizeIs(2)); ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken)); - exp_stats = NativeQueryStats(); + exp_stats = QueryStatsProto(); exp_stats.set_is_first_page(false); exp_stats.set_requested_page_size(2); exp_stats.set_num_results_returned_current_page(2); - exp_stats.set_num_results_snippeted(1); + exp_stats.set_num_results_with_snippets(1); exp_stats.set_latency_ms(5); exp_stats.set_document_retrieval_latency_ms(5); EXPECT_THAT(search_result.query_stats(), EqualsProto(exp_stats)); @@ -6576,16 +7781,434 @@ TEST_F(IcingSearchEngineTest, NativeQueryStatsTest) { ASSERT_THAT(search_result.results(), SizeIs(1)); ASSERT_THAT(search_result.next_page_token(), Eq(kInvalidNextPageToken)); - exp_stats = NativeQueryStats(); + exp_stats = QueryStatsProto(); exp_stats.set_is_first_page(false); exp_stats.set_requested_page_size(2); exp_stats.set_num_results_returned_current_page(1); - exp_stats.set_num_results_snippeted(0); + exp_stats.set_num_results_with_snippets(0); exp_stats.set_latency_ms(5); exp_stats.set_document_retrieval_latency_ms(5); EXPECT_THAT(search_result.query_stats(), EqualsProto(exp_stats)); } +TEST_F(IcingSearchEngineTest, OptimizeStatsProtoTest) { + auto fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(5); + fake_clock->SetSystemTimeMilliseconds(10000); + auto icing = std::make_unique<TestIcingSearchEngine>( + GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), std::move(fake_clock), + GetTestJniCache()); + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing->SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // Create three documents. 
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); + DocumentProto document2 = CreateMessageDocument("namespace", "uri2"); + document2.set_creation_timestamp_ms(9000); + document2.set_ttl_ms(500); + DocumentProto document3 = CreateMessageDocument("namespace", "uri3"); + ASSERT_THAT(icing->Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing->Put(document2).status(), ProtoIsOk()); + ASSERT_THAT(icing->Put(document3).status(), ProtoIsOk()); + + // Delete the first document. + ASSERT_THAT(icing->Delete(document1.namespace_(), document1.uri()).status(), + ProtoIsOk()); + ASSERT_THAT(icing->PersistToDisk(PersistType::FULL).status(), ProtoIsOk()); + + OptimizeStatsProto expected; + expected.set_latency_ms(5); + expected.set_document_store_optimize_latency_ms(5); + expected.set_index_restoration_latency_ms(5); + expected.set_num_original_documents(3); + expected.set_num_deleted_documents(1); + expected.set_num_expired_documents(1); + + // Run Optimize + OptimizeResultProto result = icing->Optimize(); + // Depending on how many blocks the documents end up spread across, it's + // possible that Optimize can remove documents without shrinking storage. The + // first Optimize call will also write the OptimizeStatusProto for the first + // time which will take up 1 block. So make sure that before_size is no less + // than after_size - 1 block. 
+ uint32_t page_size = getpagesize(); + EXPECT_THAT(result.optimize_stats().storage_size_before(), + Ge(result.optimize_stats().storage_size_after() - page_size)); + result.mutable_optimize_stats()->clear_storage_size_before(); + result.mutable_optimize_stats()->clear_storage_size_after(); + EXPECT_THAT(result.optimize_stats(), EqualsProto(expected)); + + fake_clock = std::make_unique<FakeClock>(); + fake_clock->SetTimerElapsedMilliseconds(5); + fake_clock->SetSystemTimeMilliseconds(20000); + icing = std::make_unique<TestIcingSearchEngine>( + GetDefaultIcingOptions(), std::make_unique<Filesystem>(), + std::make_unique<IcingFilesystem>(), std::move(fake_clock), + GetTestJniCache()); + ASSERT_THAT(icing->Initialize().status(), ProtoIsOk()); + + expected = OptimizeStatsProto(); + expected.set_latency_ms(5); + expected.set_document_store_optimize_latency_ms(5); + expected.set_index_restoration_latency_ms(5); + expected.set_num_original_documents(1); + expected.set_num_deleted_documents(0); + expected.set_num_expired_documents(0); + expected.set_time_since_last_optimize_ms(10000); + + // Run Optimize + result = icing->Optimize(); + EXPECT_THAT(result.optimize_stats().storage_size_before(), + Eq(result.optimize_stats().storage_size_after())); + result.mutable_optimize_stats()->clear_storage_size_before(); + result.mutable_optimize_stats()->clear_storage_size_after(); + EXPECT_THAT(result.optimize_stats(), EqualsProto(expected)); +} + +TEST_F(IcingSearchEngineTest, StorageInfoTest) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // Create three documents. 
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1"); + DocumentProto document2 = CreateMessageDocument("namespace", "uri2"); + DocumentProto document3 = CreateMessageDocument("namespace", "uri3"); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); + + // Ensure that total_storage_size is set. All the other stats are covered by + // the classes that generate them. + StorageInfoResultProto result = icing.GetStorageInfo(); + EXPECT_THAT(result.status(), ProtoIsOk()); + EXPECT_THAT(result.storage_info().total_storage_size(), Ge(0)); +} + +TEST_F(IcingSearchEngineTest, SnippetErrorTest) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("Generic").AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk()); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace", "uri1") + .SetScore(10) + .SetSchema("Generic") + .AddStringProperty("subject", "I like cats", "I like dogs", + "I like birds", "I like fish") + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace", "uri2") + .SetScore(20) + .SetSchema("Generic") + .AddStringProperty("subject", "I like red", "I like green", + "I like blue", "I like yellow") + .Build(); + DocumentProto document3 = + DocumentBuilder() + .SetKey("namespace", "uri3") + .SetScore(5) + .SetSchema("Generic") + .AddStringProperty("subject", "I like cupcakes", "I like donuts", + "I like eclairs", "I like froyo") + .Build(); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + 
ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); + + SearchSpecProto search_spec; + search_spec.add_schema_type_filters("Generic"); + search_spec.set_term_match_type(TermMatchType::EXACT_ONLY); + search_spec.set_query("like"); + ScoringSpecProto scoring_spec; + scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); + ResultSpecProto result_spec; + result_spec.mutable_snippet_spec()->set_num_to_snippet(2); + result_spec.mutable_snippet_spec()->set_num_matches_per_property(3); + result_spec.mutable_snippet_spec()->set_max_window_bytes(4); + SearchResultProto search_results = + icing.Search(search_spec, scoring_spec, result_spec); + + ASSERT_THAT(search_results.results(), SizeIs(3)); + const SearchResultProto::ResultProto* result = &search_results.results(0); + EXPECT_THAT(result->document().uri(), Eq("uri2")); + ASSERT_THAT(result->snippet().entries(), SizeIs(3)); + const SnippetProto::EntryProto* entry = &result->snippet().entries(0); + EXPECT_THAT(entry->property_name(), "subject[0]"); + std::string_view content = GetString(&result->document(), "subject[0]"); + EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like")); + + entry = &result->snippet().entries(1); + EXPECT_THAT(entry->property_name(), "subject[1]"); + content = GetString(&result->document(), "subject[1]"); + EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like")); + + entry = &result->snippet().entries(2); + EXPECT_THAT(entry->property_name(), "subject[2]"); + content = GetString(&result->document(), "subject[2]"); + EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like")); + + result = &search_results.results(1); + EXPECT_THAT(result->document().uri(), Eq("uri1")); + ASSERT_THAT(result->snippet().entries(), SizeIs(3)); + entry = &result->snippet().entries(0); + EXPECT_THAT(entry->property_name(), "subject[0]"); + content = GetString(&result->document(), "subject[0]"); + EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like")); + + entry = 
&result->snippet().entries(1); + ASSERT_THAT(entry->property_name(), "subject[1]"); + content = GetString(&result->document(), "subject[1]"); + EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like")); + + entry = &result->snippet().entries(2); + ASSERT_THAT(entry->property_name(), "subject[2]"); + content = GetString(&result->document(), "subject[2]"); + EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like")); + + result = &search_results.results(2); + ASSERT_THAT(result->document().uri(), Eq("uri3")); + ASSERT_THAT(result->snippet().entries(), IsEmpty()); +} + +TEST_F(IcingSearchEngineTest, CJKSnippetTest) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk()); + + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // UTF8 idx: 0 3 9 15 18 + // UTF16 idx: 0 1 3 5 6 + // Breaks into segments: "我", "每天", "走路", "去", "上班" + constexpr std::string_view kChinese = "我每天走路去上班。"; + DocumentProto document = DocumentBuilder() + .SetKey("namespace", "uri1") + .SetSchema("Message") + .AddStringProperty("body", kChinese) + .Build(); + ASSERT_THAT(icing.Put(document).status(), ProtoIsOk()); + + // Search and request snippet matching but no windowing. 
+ SearchSpecProto search_spec; + search_spec.set_query("走"); + search_spec.set_term_match_type(MATCH_PREFIX); + + ResultSpecProto result_spec; + result_spec.mutable_snippet_spec()->set_num_to_snippet( + std::numeric_limits<int>::max()); + result_spec.mutable_snippet_spec()->set_num_matches_per_property( + std::numeric_limits<int>::max()); + + // Search and make sure that we got a single successful result + SearchResultProto search_results = icing.Search( + search_spec, ScoringSpecProto::default_instance(), result_spec); + ASSERT_THAT(search_results.status(), ProtoIsOk()); + ASSERT_THAT(search_results.results(), SizeIs(1)); + const SearchResultProto::ResultProto* result = &search_results.results(0); + EXPECT_THAT(result->document().uri(), Eq("uri1")); + + // Ensure that one and only one property was matched and it was "body" + ASSERT_THAT(result->snippet().entries(), SizeIs(1)); + const SnippetProto::EntryProto* entry = &result->snippet().entries(0); + EXPECT_THAT(entry->property_name(), Eq("body")); + + // Get the content for "subject" and see what the match is. 
+ std::string_view content = GetString(&result->document(), "body"); + ASSERT_THAT(content, Eq(kChinese)); + + // Ensure that there is one and only one match within "subject" + ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); + const SnippetMatchProto& match_proto = entry->snippet_matches(0); + + EXPECT_THAT(match_proto.exact_match_byte_position(), Eq(9)); + EXPECT_THAT(match_proto.exact_match_byte_length(), Eq(6)); + std::string_view match = + content.substr(match_proto.exact_match_byte_position(), + match_proto.exact_match_byte_length()); + ASSERT_THAT(match, Eq("走路")); + + // Ensure that the utf-16 values are also as expected + EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3)); + EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2)); +} + +#ifndef ICING_JNI_TEST +// We skip this test case when we're running in a jni_test since the data files +// will be stored in the android-instrumented storage location, rather than the +// normal cc_library runfiles directory. To get that storage location, it's +// recommended to use the TestStorage APIs which handles different API +// levels/absolute vs relative/etc differences. Since that's only accessible on +// the java-side, and I haven't figured out a way to pass that directory path to +// this native side yet, we're just going to disable this. The functionality is +// already well-tested across 4 different emulated OS's so we're not losing much +// test coverage here. 
+TEST_F(IcingSearchEngineTest, MigrateToPortableFileBackedProtoLog) { + // Copy the testdata files into our IcingSearchEngine directory + std::string dir_without_portable_log; + if (IsAndroidX86()) { + dir_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_android_x86"); + } else if (IsAndroidArm()) { + dir_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_android_arm"); + } else if (IsIosPlatform()) { + dir_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_ios"); + } else { + dir_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_linux"); + } + + // Create dst directory that we'll initialize the IcingSearchEngine over. + std::string base_dir = GetTestBaseDir() + "_migrate"; + ASSERT_THAT(filesystem()->DeleteDirectoryRecursively(base_dir.c_str()), true); + ASSERT_THAT(filesystem()->CreateDirectoryRecursively(base_dir.c_str()), true); + + ASSERT_TRUE(filesystem()->CopyDirectory(dir_without_portable_log.c_str(), + base_dir.c_str(), + /*recursive=*/true)); + + IcingSearchEngineOptions icing_options; + icing_options.set_base_dir(base_dir); + + IcingSearchEngine icing(icing_options, GetTestJniCache()); + InitializeResultProto init_result = icing.Initialize(); + EXPECT_THAT(init_result.status(), ProtoIsOk()); + EXPECT_THAT(init_result.initialize_stats().document_store_data_status(), + Eq(InitializeStatsProto::NO_DATA_LOSS)); + EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(), + Eq(InitializeStatsProto::NONE)); + EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(), + Eq(InitializeStatsProto::NONE)); + + // Set up schema, this is the one used to validate documents in the testdata + // files. 
Do not change unless you're also updating the testdata files. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // Make sure our schema is still the same as we expect. If not, there's + // definitely no way we're getting the documents back that we expect. + GetSchemaResultProto expected_get_schema_result_proto; + expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK); + *expected_get_schema_result_proto.mutable_schema() = schema; + ASSERT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto)); + + // These are the documents that are stored in the testdata files. Do not + // change unless you're also updating the testdata files. + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "foo") + .AddStringProperty("body", "bar") + .Build(); + + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace1", "uri2") + .SetSchema("email") + .SetCreationTimestampMs(20) + .SetScore(321) + .AddStringProperty("body", "baz bat") + .Build(); + + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace2", "uri1") + .SetSchema("email") + .SetCreationTimestampMs(30) + .SetScore(123) + .AddStringProperty("subject", "phoo") + .Build(); + + // Document 1 and 3 were put normally, and document 2 was deleted in our + // testdata files. 
+ EXPECT_THAT(icing + .Get(document1.namespace_(), document1.uri(), + GetResultSpecProto::default_instance()) + .document(), + EqualsProto(document1)); + EXPECT_THAT(icing + .Get(document2.namespace_(), document2.uri(), + GetResultSpecProto::default_instance()) + .status(), + ProtoStatusIs(StatusProto::NOT_FOUND)); + EXPECT_THAT(icing + .Get(document3.namespace_(), document3.uri(), + GetResultSpecProto::default_instance()) + .document(), + EqualsProto(document3)); + + // Searching for "foo" should get us document1. + SearchSpecProto search_spec; + search_spec.set_term_match_type(TermMatchType::PREFIX); + search_spec.set_query("foo"); + + SearchResultProto expected_document1; + expected_document1.mutable_status()->set_code(StatusProto::OK); + *expected_document1.mutable_results()->Add()->mutable_document() = document1; + + SearchResultProto actual_results = + icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(expected_document1)); + + // Searching for "baz" would've gotten us document2, except it got deleted. + // Make sure that it's cleared from our index too. + search_spec.set_query("baz"); + + SearchResultProto expected_no_documents; + expected_no_documents.mutable_status()->set_code(StatusProto::OK); + + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(expected_no_documents)); + + // Searching for "phoo" should get us document3. 
+ search_spec.set_query("phoo"); + + SearchResultProto expected_document3; + expected_document3.mutable_status()->set_code(StatusProto::OK); + *expected_document3.mutable_results()->Add()->mutable_document() = document3; + + actual_results = icing.Search(search_spec, GetDefaultScoringSpec(), + ResultSpecProto::default_instance()); + EXPECT_THAT(actual_results, + EqualsSearchResultIgnoreStatsAndScores(expected_document3)); +} +#endif // !ICING_JNI_TEST + } // namespace } // namespace lib } // namespace icing diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc index 2a5a0d9..887e6e4 100644 --- a/icing/index/hit/hit.cc +++ b/icing/index/hit/hit.cc @@ -67,9 +67,10 @@ Hit::Hit(SectionId section_id, DocumentId document_id, &temp_value); bit_util::BitfieldSet(section_id, kNumFlags, kSectionIdBits, &temp_value); bit_util::BitfieldSet(term_frequency != kDefaultTermFrequency, - kHasTermFrequency, 1, &temp_value); - bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, 1, &temp_value); - bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection, 1, &temp_value); + kHasTermFrequency, /*len=*/1, &temp_value); + bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, /*len=*/1, &temp_value); + bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection, + /*len=*/1, &temp_value); value_ = temp_value; } diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc index d2f9d41..6d8632f 100644 --- a/icing/index/index-processor.cc +++ b/icing/index/index-processor.cc @@ -55,7 +55,7 @@ IndexProcessor::Create(const Normalizer* normalizer, Index* index, libtextclassifier3::Status IndexProcessor::IndexDocument( const TokenizedDocument& tokenized_document, DocumentId document_id, - NativePutDocumentStats* put_document_stats) { + PutDocumentStatsProto* put_document_stats) { std::unique_ptr<Timer> index_timer = clock_.GetNewTimer(); if (index_->last_added_document_id() != kInvalidDocumentId && @@ -64,6 +64,7 @@ libtextclassifier3::Status IndexProcessor::IndexDocument( 
"DocumentId %d must be greater than last added document_id %d", document_id, index_->last_added_document_id())); } + index_->set_last_added_document_id(document_id); uint32_t num_tokens = 0; libtextclassifier3::Status overall_status; for (const TokenizedSection& section : tokenized_document.sections()) { diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h index 9fc7c46..6b07c98 100644 --- a/icing/index/index-processor.h +++ b/icing/index/index-processor.h @@ -81,7 +81,7 @@ class IndexProcessor { // INTERNAL_ERROR if any other errors occur libtextclassifier3::Status IndexDocument( const TokenizedDocument& tokenized_document, DocumentId document_id, - NativePutDocumentStats* put_document_stats = nullptr); + PutDocumentStatsProto* put_document_stats = nullptr); private: IndexProcessor(const Normalizer* normalizer, Index* index, diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc index e6bb615..8a6a9f5 100644 --- a/icing/index/index-processor_test.cc +++ b/icing/index/index-processor_test.cc @@ -36,9 +36,11 @@ #include "icing/index/term-property-id.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/portable/platform.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/term.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/schema/schema-util.h" #include "icing/schema/section-manager.h" @@ -46,7 +48,6 @@ #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" -#include "icing/testing/platform.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -103,6 +104,22 @@ using ::testing::Eq; using ::testing::IsEmpty; using ::testing::Test; +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + 
PropertyConfigProto_DataType_Code_STRING; +constexpr PropertyConfigProto_DataType_Code TYPE_BYTES = + PropertyConfigProto_DataType_Code_BYTES; + +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; + +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; + class IndexProcessorTest : public Test { protected: void SetUp() override { @@ -131,7 +148,49 @@ class IndexProcessorTest : public Test { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, GetTestTempDir(), &fake_clock_)); - SchemaProto schema = CreateFakeSchema(); + SchemaProto schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType(kFakeType) + .AddProperty( + PropertyConfigBuilder() + .SetName(kExactProperty) + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPrefixedProperty) + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kUnindexedProperty1) + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kUnindexedProperty2) + .SetDataType(TYPE_BYTES) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName(kRepeatedProperty) + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() + .SetName(kSubProperty) + .SetDataTypeDocument( + kNestedType, /*index_nested_properties=*/true) + 
.SetCardinality(CARDINALITY_OPTIONAL))) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kNestedType) + .AddProperty( + PropertyConfigBuilder() + .SetName(kNestedProperty) + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); IndexProcessor::Options processor_options; @@ -162,72 +221,6 @@ class IndexProcessorTest : public Test { std::unique_ptr<Index> index_; std::unique_ptr<SchemaStore> schema_store_; std::unique_ptr<IndexProcessor> index_processor_; - - private: - static void AddStringProperty(std::string_view name, DataType::Code type, - Cardinality::Code cardinality, - TermMatchType::Code term_match_type, - SchemaTypeConfigProto* type_config) { - auto* prop = type_config->add_properties(); - prop->set_property_name(std::string(name)); - prop->set_data_type(type); - prop->set_cardinality(cardinality); - prop->mutable_string_indexing_config()->set_term_match_type( - term_match_type); - prop->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - } - - static void AddNonIndexedProperty(std::string_view name, DataType::Code type, - Cardinality::Code cardinality, - SchemaTypeConfigProto* type_config) { - auto* prop = type_config->add_properties(); - prop->set_property_name(std::string(name)); - prop->set_data_type(type); - prop->set_cardinality(cardinality); - } - - static SchemaProto CreateFakeSchema() { - SchemaProto schema; - - // Add top-level type - auto* type_config = schema.add_types(); - type_config->set_schema_type(std::string(kFakeType)); - - AddStringProperty(std::string(kExactProperty), DataType::STRING, - Cardinality::OPTIONAL, TermMatchType::EXACT_ONLY, - type_config); - - AddStringProperty(std::string(kPrefixedProperty), DataType::STRING, - Cardinality::OPTIONAL, TermMatchType::PREFIX, - type_config); - - AddNonIndexedProperty(std::string(kUnindexedProperty1), DataType::STRING, - 
Cardinality::OPTIONAL, type_config); - - AddNonIndexedProperty(std::string(kUnindexedProperty2), DataType::BYTES, - Cardinality::OPTIONAL, type_config); - - AddStringProperty(std::string(kRepeatedProperty), DataType::STRING, - Cardinality::REPEATED, TermMatchType::PREFIX, - type_config); - - auto* prop = type_config->add_properties(); - prop->set_property_name(std::string(kSubProperty)); - prop->set_data_type(DataType::DOCUMENT); - prop->set_cardinality(Cardinality::OPTIONAL); - prop->set_schema_type(std::string(kNestedType)); - prop->mutable_document_indexing_config()->set_index_nested_properties(true); - - // Add nested type - type_config = schema.add_types(); - type_config->set_schema_type(std::string(kNestedType)); - - AddStringProperty(kNestedProperty, DataType::STRING, Cardinality::OPTIONAL, - TermMatchType::PREFIX, type_config); - - return schema; - } }; std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) { @@ -268,7 +261,23 @@ TEST_F(IndexProcessorTest, NoTermMatchTypeContent) { document)); EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), IsOk()); - EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId)); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); +} + +TEST_F(IndexProcessorTest, NoValidContent) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "fake_type/1") + .SetSchema(std::string(kFakeType)) + .AddStringProperty(std::string(kExactProperty), "?...!") + .Build(); + ICING_ASSERT_OK_AND_ASSIGN( + TokenizedDocument tokenized_document, + TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(), + document)); + EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0), + IsOk()); + EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexProcessorTest, OneDoc) { @@ -434,9 +443,8 @@ TEST_F(IndexProcessorTest, TooManyTokensReturnError) { IndexProcessor::Options::TokenLimitBehavior::kReturnError; 
ICING_ASSERT_OK_AND_ASSIGN( - index_processor_, - IndexProcessor::Create(normalizer_.get(), index_.get(), options, - &fake_clock_)); + index_processor_, IndexProcessor::Create(normalizer_.get(), index_.get(), + options, &fake_clock_)); DocumentProto document = DocumentBuilder() @@ -477,9 +485,8 @@ TEST_F(IndexProcessorTest, TooManyTokensSuppressError) { IndexProcessor::Options::TokenLimitBehavior::kSuppressError; ICING_ASSERT_OK_AND_ASSIGN( - index_processor_, - IndexProcessor::Create(normalizer_.get(), index_.get(), options, - &fake_clock_)); + index_processor_, IndexProcessor::Create(normalizer_.get(), index_.get(), + options, &fake_clock_)); DocumentProto document = DocumentBuilder() @@ -522,9 +529,8 @@ TEST_F(IndexProcessorTest, TooLongTokens) { /*max_term_byte_size=*/4)); ICING_ASSERT_OK_AND_ASSIGN( - index_processor_, - IndexProcessor::Create(normalizer.get(), index_.get(), options, - &fake_clock_)); + index_processor_, IndexProcessor::Create(normalizer.get(), index_.get(), + options, &fake_clock_)); DocumentProto document = DocumentBuilder() @@ -693,8 +699,8 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) { ICING_ASSERT_OK_AND_ASSIGN( index_processor_, - IndexProcessor::Create(normalizer_.get(), index_.get(), - processor_options, &fake_clock_)); + IndexProcessor::Create(normalizer_.get(), index_.get(), processor_options, + &fake_clock_)); DocumentProto document = DocumentBuilder() diff --git a/icing/index/index.cc b/icing/index/index.cc index bd41b51..db59ad2 100644 --- a/icing/index/index.cc +++ b/icing/index/index.cc @@ -164,7 +164,7 @@ libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create( icing_filesystem)); return std::unique_ptr<Index>(new Index(options, std::move(term_id_codec), std::move(lite_index), - std::move(main_index))); + std::move(main_index), filesystem)); } libtextclassifier3::Status Index::TruncateTo(DocumentId document_id) { @@ -277,6 +277,18 @@ Index::FindTermsByPrefix(const std::string& prefix, 
std::move(main_term_metadata_list), num_to_return); } +IndexStorageInfoProto Index::GetStorageInfo() const { + IndexStorageInfoProto storage_info; + int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str()); + if (directory_size != Filesystem::kBadFileSize) { + storage_info.set_index_size(directory_size); + } else { + storage_info.set_index_size(-1); + } + storage_info = lite_index_->GetStorageInfo(std::move(storage_info)); + return main_index_->GetStorageInfo(std::move(storage_info)); +} + libtextclassifier3::Status Index::Editor::BufferTerm(const char* term) { // Step 1: See if this term is already in the lexicon uint32_t tvi; diff --git a/icing/index/index.h b/icing/index/index.h index a4ea719..eab5be8 100644 --- a/icing/index/index.h +++ b/icing/index/index.h @@ -32,6 +32,7 @@ #include "icing/index/term-id-codec.h" #include "icing/index/term-metadata.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" @@ -126,6 +127,16 @@ class Index { return main_index_->last_added_document_id(); } + // Sets last_added_document_id to document_id so long as document_id > + // last_added_document_id() + void set_last_added_document_id(DocumentId document_id) { + DocumentId lite_document_id = lite_index_->last_added_document_id(); + if (lite_document_id == kInvalidDocumentId || + document_id >= lite_document_id) { + lite_index_->set_last_added_document_id(document_id); + } + } + // Returns debug information for the index in out. // verbosity <= 0, simplest debug information - just the lexicons and lite // index. @@ -151,6 +162,12 @@ class Index { return lite_index_size + main_index_size; } + // Calculates the StorageInfo for the Index. + // + // If an IO error occurs while trying to calculate the value for a field, then + // that field will be set to -1. 
+ IndexStorageInfoProto GetStorageInfo() const; + // Create an iterator to iterate through all doc hit infos in the index that // match the term. section_id_mask can be set to ignore hits from sections not // listed in the mask. Eg. section_id_mask = 1U << 3; would only return hits @@ -242,11 +259,12 @@ class Index { private: Index(const Options& options, std::unique_ptr<TermIdCodec> term_id_codec, std::unique_ptr<LiteIndex> lite_index, - std::unique_ptr<MainIndex> main_index) + std::unique_ptr<MainIndex> main_index, const Filesystem* filesystem) : lite_index_(std::move(lite_index)), main_index_(std::move(main_index)), options_(options), - term_id_codec_(std::move(term_id_codec)) {} + term_id_codec_(std::move(term_id_codec)), + filesystem_(filesystem) {} libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix( const std::string& prefix, const std::vector<NamespaceId>& namespace_ids, @@ -256,6 +274,7 @@ class Index { std::unique_ptr<MainIndex> main_index_; const Options options_; std::unique_ptr<TermIdCodec> term_id_codec_; + const Filesystem* filesystem_; }; } // namespace lib diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc index 3479ab1..16593ef 100644 --- a/icing/index/index_test.cc +++ b/icing/index/index_test.cc @@ -31,6 +31,7 @@ #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/legacy/index/icing-filesystem.h" #include "icing/legacy/index/icing-mock-filesystem.h" +#include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" @@ -46,6 +47,7 @@ namespace { using ::testing::ElementsAre; using ::testing::Eq; +using ::testing::Ge; using ::testing::Gt; using ::testing::IsEmpty; using ::testing::IsTrue; @@ -151,8 +153,6 @@ TEST_F(IndexTest, EmptyIndex) { index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(itr->Advance(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - - 
EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId)); } TEST_F(IndexTest, EmptyIndexAfterMerge) { @@ -170,8 +170,6 @@ TEST_F(IndexTest, EmptyIndexAfterMerge) { index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(itr->Advance(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId)); } TEST_F(IndexTest, AdvancePastEnd) { @@ -236,8 +234,6 @@ TEST_F(IndexTest, SingleHitSingleTermIndex) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, SingleHitSingleTermIndexAfterMerge) { @@ -254,8 +250,6 @@ TEST_F(IndexTest, SingleHitSingleTermIndexAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, SingleHitMultiTermIndex) { @@ -271,8 +265,6 @@ TEST_F(IndexTest, SingleHitMultiTermIndex) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, SingleHitMultiTermIndexAfterMerge) { @@ -290,8 +282,6 @@ TEST_F(IndexTest, SingleHitMultiTermIndexAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, NoHitMultiTermIndex) { @@ -306,7 +296,6 @@ TEST_F(IndexTest, NoHitMultiTermIndex) { index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(itr->Advance(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } 
TEST_F(IndexTest, NoHitMultiTermIndexAfterMerge) { @@ -323,7 +312,6 @@ TEST_F(IndexTest, NoHitMultiTermIndexAfterMerge) { index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); EXPECT_THAT(itr->Advance(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, MultiHitMultiTermIndex) { @@ -350,7 +338,6 @@ TEST_F(IndexTest, MultiHitMultiTermIndex) { ElementsAre( EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2)); } TEST_F(IndexTest, MultiHitMultiTermIndexAfterMerge) { @@ -379,7 +366,6 @@ TEST_F(IndexTest, MultiHitMultiTermIndexAfterMerge) { ElementsAre( EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2)); } TEST_F(IndexTest, MultiHitSectionRestrict) { @@ -400,8 +386,6 @@ TEST_F(IndexTest, MultiHitSectionRestrict) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, MultiHitSectionRestrictAfterMerge) { @@ -424,8 +408,6 @@ TEST_F(IndexTest, MultiHitSectionRestrictAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, SingleHitDedupeIndex) { @@ -447,8 +429,6 @@ TEST_F(IndexTest, SingleHitDedupeIndex) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, PrefixHit) { @@ -463,8 
+443,6 @@ TEST_F(IndexTest, PrefixHit) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, PrefixHitAfterMerge) { @@ -481,8 +459,6 @@ TEST_F(IndexTest, PrefixHitAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, MultiPrefixHit) { @@ -504,8 +480,6 @@ TEST_F(IndexTest, MultiPrefixHit) { ElementsAre( EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, MultiPrefixHitAfterMerge) { @@ -529,8 +503,6 @@ TEST_F(IndexTest, MultiPrefixHitAfterMerge) { ElementsAre( EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, NoExactHitInPrefixQuery) { @@ -550,7 +522,6 @@ TEST_F(IndexTest, NoExactHitInPrefixQuery) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId1, std::vector<SectionId>{kSectionId3}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, NoExactHitInPrefixQueryAfterMerge) { @@ -572,7 +543,6 @@ TEST_F(IndexTest, NoExactHitInPrefixQueryAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId1, std::vector<SectionId>{kSectionId3}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, PrefixHitDedupe) { @@ -588,7 +558,6 @@ TEST_F(IndexTest, PrefixHitDedupe) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, 
std::vector<SectionId>{kSectionId2}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, PrefixHitDedupeAfterMerge) { @@ -606,7 +575,6 @@ TEST_F(IndexTest, PrefixHitDedupeAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, PrefixToString) { @@ -703,9 +671,11 @@ TEST_F(IndexTest, FullIndex) { std::default_random_engine random; std::vector<std::string> query_terms; + std::string prefix = "prefix"; for (int i = 0; i < 2600; ++i) { constexpr int kTokenSize = 5; - query_terms.push_back(RandomString(kAlNumAlphabet, kTokenSize, &random)); + query_terms.push_back(prefix + + RandomString(kAlNumAlphabet, kTokenSize, &random)); } DocumentId document_id = 0; @@ -714,7 +684,7 @@ TEST_F(IndexTest, FullIndex) { while (status.ok()) { for (int i = 0; i < 100; ++i) { Index::Editor edit = - index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY, + index_->Edit(document_id, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); size_t idx = uniform(random); status = edit.BufferTerm(query_terms.at(idx).c_str()); @@ -731,11 +701,14 @@ TEST_F(IndexTest, FullIndex) { // Adding more hits should fail. 
Index::Editor edit = - index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY, + index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); - EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); - EXPECT_THAT(edit.BufferTerm("baz"), IsOk()); + std::string term = prefix + "foo"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "bar"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "baz"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); @@ -743,12 +716,17 @@ TEST_F(IndexTest, FullIndex) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, index_->GetIterator(query_terms.at(i).c_str(), kSectionIdMaskAll, - TermMatchType::EXACT_ONLY)); + TermMatchType::PREFIX)); // Each query term should contain at least one hit - there may have been // other hits for this term that were added. 
EXPECT_THAT(itr->Advance(), IsOk()); } - EXPECT_THAT(index_->last_added_document_id(), Eq(document_id - 1)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> last_itr, + index_->GetIterator(prefix.c_str(), kSectionIdMaskAll, + TermMatchType::PREFIX)); + EXPECT_THAT(last_itr->Advance(), IsOk()); + EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id - 1)); } TEST_F(IndexTest, FullIndexMerge) { @@ -759,9 +737,11 @@ TEST_F(IndexTest, FullIndexMerge) { std::default_random_engine random; std::vector<std::string> query_terms; + std::string prefix = "prefix"; for (int i = 0; i < 2600; ++i) { constexpr int kTokenSize = 5; - query_terms.push_back(RandomString(kAlNumAlphabet, kTokenSize, &random)); + query_terms.push_back(prefix + + RandomString(kAlNumAlphabet, kTokenSize, &random)); } DocumentId document_id = 0; @@ -770,7 +750,7 @@ TEST_F(IndexTest, FullIndexMerge) { while (status.ok()) { for (int i = 0; i < 100; ++i) { Index::Editor edit = - index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY, + index_->Edit(document_id, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); size_t idx = uniform(random); status = edit.BufferTerm(query_terms.at(idx).c_str()); @@ -789,30 +769,45 @@ TEST_F(IndexTest, FullIndexMerge) { // Adding more hits should fail. 
Index::Editor edit = - index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY, + index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); - EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); - EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); - EXPECT_THAT(edit.BufferTerm("baz"), IsOk()); + std::string term = prefix + "foo"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "bar"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "baz"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); - EXPECT_THAT(index_->last_added_document_id(), Eq(document_id - 1)); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<DocHitInfoIterator> last_itr, + index_->GetIterator(prefix.c_str(), kSectionIdMaskAll, + TermMatchType::PREFIX)); + EXPECT_THAT(last_itr->Advance(), IsOk()); + EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id - 1)); // After merging with the main index. Adding more hits should succeed now. 
ICING_ASSERT_OK(index_->Merge()); - edit = - index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY, 0); - EXPECT_THAT(edit.BufferTerm("foo"), IsOk()); - EXPECT_THAT(edit.BufferTerm("bar"), IsOk()); - EXPECT_THAT(edit.BufferTerm("baz"), IsOk()); + edit = index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX, 0); + term = prefix + "foo"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "bar"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); + term = prefix + "baz"; + EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<DocHitInfoIterator> itr, - index_->GetIterator("bar", kSectionIdMaskAll, TermMatchType::EXACT_ONLY)); + index_->GetIterator(prefix + "bar", kSectionIdMaskAll, + TermMatchType::EXACT_ONLY)); // We know that "bar" should have at least one hit because we just added it! EXPECT_THAT(itr->Advance(), IsOk()); EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(document_id + 1)); - EXPECT_THAT(index_->last_added_document_id(), Eq(document_id + 1)); + ICING_ASSERT_OK_AND_ASSIGN( + last_itr, index_->GetIterator(prefix.c_str(), kSectionIdMaskAll, + TermMatchType::PREFIX)); + EXPECT_THAT(last_itr->Advance(), IsOk()); + EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id + 1)); } TEST_F(IndexTest, IndexCreateIOFailure) { @@ -881,8 +876,6 @@ TEST_F(IndexTest, IndexPersistence) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, IndexPersistenceAfterMerge) { @@ -910,8 +903,6 @@ TEST_F(IndexTest, IndexPersistenceAfterMerge) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, InvalidHitBufferSize) { @@ 
-1278,8 +1269,6 @@ TEST_F(IndexTest, ExactResultsFromLiteAndMain) { ElementsAre( EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2)); } TEST_F(IndexTest, PrefixResultsFromLiteAndMain) { @@ -1312,8 +1301,6 @@ TEST_F(IndexTest, PrefixResultsFromLiteAndMain) { EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2)); } TEST_F(IndexTest, GetDebugInfo) { @@ -1420,8 +1407,6 @@ TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) { ElementsAre( EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId3}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2)); } TEST_F(IndexTest, BackfillingNewTermsSucceeds) { @@ -1476,8 +1461,6 @@ TEST_F(IndexTest, BackfillingNewTermsSucceeds) { ElementsAre( EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId3)); } TEST_F(IndexTest, TruncateToInvalidDocumentIdHasNoEffect) { @@ -1525,8 +1508,6 @@ TEST_F(IndexTest, TruncateToInvalidDocumentIdHasNoEffect) { ElementsAre( EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) { @@ -1542,6 +1523,7 @@ TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) { TermMatchType::PREFIX, /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); 
EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId0); ICING_EXPECT_OK(index_->TruncateTo(index_->last_added_document_id())); // Clipping to invalid should have no effect. ICING_ASSERT_OK_AND_ASSIGN( @@ -1563,6 +1545,7 @@ TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) { /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId1); // Clipping to invalid should still have no effect even if both indices have // hits. @@ -1574,8 +1557,6 @@ TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) { ElementsAre( EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}), EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1)); } TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) { @@ -1584,6 +1565,7 @@ TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) { TermMatchType::PREFIX, /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId0); ICING_ASSERT_OK(index_->Merge()); @@ -1592,6 +1574,7 @@ TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) { /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId1); EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk()); @@ -1602,8 +1585,6 @@ TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) { EXPECT_THAT(GetHits(std::move(itr)), ElementsAre(EqualsDocHitInfo( kDocumentId0, std::vector<SectionId>{kSectionId2}))); - - EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0)); } TEST_F(IndexTest, TruncateToThrowsOutBothIndices) { @@ -1612,10 +1593,12 @@ TEST_F(IndexTest, TruncateToThrowsOutBothIndices) { TermMatchType::PREFIX, /*namespace_id=*/0); 
ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId0); edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foul"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId1); ICING_ASSERT_OK(index_->Merge()); @@ -1624,6 +1607,7 @@ TEST_F(IndexTest, TruncateToThrowsOutBothIndices) { /*namespace_id=*/0); ASSERT_THAT(edit.BufferTerm("foot"), IsOk()); EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + index_->set_last_added_document_id(kDocumentId2); EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk()); @@ -1632,8 +1616,33 @@ TEST_F(IndexTest, TruncateToThrowsOutBothIndices) { std::unique_ptr<DocHitInfoIterator> itr, index_->GetIterator("f", kSectionIdMaskAll, TermMatchType::PREFIX)); EXPECT_THAT(GetHits(std::move(itr)), IsEmpty()); +} + +TEST_F(IndexTest, IndexStorageInfoProto) { + // Add two documents to the lite index and merge them into main. 
+ { + Index::Editor edit = index_->Edit( + kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0); + ASSERT_THAT(edit.BufferTerm("foo"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::PREFIX, + /*namespace_id=*/0); + ASSERT_THAT(edit.BufferTerm("foul"), IsOk()); + EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk()); + + ICING_ASSERT_OK(index_->Merge()); + } - EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId)); + IndexStorageInfoProto storage_info = index_->GetStorageInfo(); + EXPECT_THAT(storage_info.index_size(), Ge(0)); + EXPECT_THAT(storage_info.lite_index_lexicon_size(), Ge(0)); + EXPECT_THAT(storage_info.lite_index_hit_buffer_size(), Ge(0)); + EXPECT_THAT(storage_info.main_index_lexicon_size(), Ge(0)); + EXPECT_THAT(storage_info.main_index_storage_size(), Ge(0)); + EXPECT_THAT(storage_info.main_index_block_size(), Ge(0)); + // There should be 1 block for the header and 1 block for two posting lists. + EXPECT_THAT(storage_info.num_blocks(), Eq(2)); + EXPECT_THAT(storage_info.min_free_fraction(), Ge(0)); } } // namespace diff --git a/icing/index/iterator/doc-hit-info-iterator-and.cc b/icing/index/iterator/doc-hit-info-iterator-and.cc index 66f87bd..39aa969 100644 --- a/icing/index/iterator/doc-hit-info-iterator-and.cc +++ b/icing/index/iterator/doc-hit-info-iterator-and.cc @@ -162,6 +162,7 @@ libtextclassifier3::Status DocHitInfoIteratorAndNary::Advance() { DocumentId unused; ICING_ASSIGN_OR_RETURN( unused, AdvanceTo(iterator.get(), potential_document_id)); + (void)unused; // Silence unused warning. 
} if (iterator->doc_hit_info().document_id() == potential_document_id) { diff --git a/icing/index/iterator/doc-hit-info-iterator-and.h b/icing/index/iterator/doc-hit-info-iterator-and.h index faca785..8ceff44 100644 --- a/icing/index/iterator/doc-hit-info-iterator-and.h +++ b/icing/index/iterator/doc-hit-info-iterator-and.h @@ -47,13 +47,16 @@ class DocHitInfoIteratorAnd : public DocHitInfoIterator { std::string ToString() const override; void PopulateMatchedTermsStats( - std::vector<TermMatchInfo> *matched_terms_stats) const override { + std::vector<TermMatchInfo> *matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. return; } - short_->PopulateMatchedTermsStats(matched_terms_stats); - long_->PopulateMatchedTermsStats(matched_terms_stats); + short_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); + long_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); } private: @@ -78,13 +81,15 @@ class DocHitInfoIteratorAndNary : public DocHitInfoIterator { std::string ToString() const override; void PopulateMatchedTermsStats( - std::vector<TermMatchInfo> *matched_terms_stats) const override { + std::vector<TermMatchInfo> *matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. 
return; } for (size_t i = 0; i < iterators_.size(); ++i) { - iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats); + iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); } } diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.cc b/icing/index/iterator/doc-hit-info-iterator-filter.cc index c6cb86d..933f9b5 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter.cc +++ b/icing/index/iterator/doc-hit-info-iterator-filter.cc @@ -31,7 +31,6 @@ #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" #include "icing/store/document-store.h" -#include "icing/util/clock.h" namespace icing { namespace lib { @@ -39,12 +38,11 @@ namespace lib { DocHitInfoIteratorFilter::DocHitInfoIteratorFilter( std::unique_ptr<DocHitInfoIterator> delegate, const DocumentStore* document_store, const SchemaStore* schema_store, - const Clock* clock, const Options& options) + const Options& options) : delegate_(std::move(delegate)), document_store_(*document_store), schema_store_(*schema_store), - options_(options), - current_time_milliseconds_(clock->GetSystemTimeMilliseconds()) { + options_(options) { // Precompute all the NamespaceIds for (std::string_view name_space : options_.namespaces) { auto namespace_id_or = document_store_.GetNamespaceId(name_space); @@ -67,61 +65,50 @@ DocHitInfoIteratorFilter::DocHitInfoIteratorFilter( } libtextclassifier3::Status DocHitInfoIteratorFilter::Advance() { - if (!delegate_->Advance().ok()) { - // Didn't find anything on the delegate iterator. - doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; - return absl_ports::ResourceExhaustedError( - "No more DocHitInfos in iterator"); - } - - if (current_time_milliseconds_ < 0) { - // This shouldn't happen, but we add a sanity check here for any unknown - // errors. - return absl_ports::InternalError( - "Couldn't get current time. 
Try again in a bit"); - } - - if (!document_store_.DoesDocumentExist( - delegate_->doc_hit_info().document_id())) { - // Document doesn't exist, keep searching - return Advance(); - } + while (delegate_->Advance().ok()) { + if (!document_store_.DoesDocumentExist( + delegate_->doc_hit_info().document_id())) { + // Document doesn't exist, keep searching. This handles deletions and + // expired documents. + continue; + } - // Try to get the DocumentFilterData - auto document_filter_data_or = document_store_.GetDocumentFilterData( - delegate_->doc_hit_info().document_id()); - if (!document_filter_data_or.ok()) { - // Didn't find the DocumentFilterData in the filter cache. This could be - // because the DocumentId isn't valid or the filter cache is in some invalid - // state. This is bad, but not the query's responsibility to fix, so just - // skip this result for now. - return Advance(); - } - // We should be guaranteed that this exists now. - DocumentFilterData data = std::move(document_filter_data_or).ValueOrDie(); + // Try to get the DocumentFilterData + auto document_filter_data_or = document_store_.GetDocumentFilterData( + delegate_->doc_hit_info().document_id()); + if (!document_filter_data_or.ok()) { + // Didn't find the DocumentFilterData in the filter cache. This could be + // because the DocumentId isn't valid or the filter cache is in some + // invalid state. This is bad, but not the query's responsibility to fix, + // so just skip this result for now. + continue; + } + // We should be guaranteed that this exists now. + DocumentFilterData data = std::move(document_filter_data_or).ValueOrDie(); - if (!options_.namespaces.empty() && - target_namespace_ids_.count(data.namespace_id()) == 0) { - // Doesn't match one of the specified namespaces. Keep searching - return Advance(); - } + if (!options_.namespaces.empty() && + target_namespace_ids_.count(data.namespace_id()) == 0) { + // Doesn't match one of the specified namespaces. 
Keep searching + continue; + } - if (!options_.schema_types.empty() && - target_schema_type_ids_.count(data.schema_type_id()) == 0) { - // Doesn't match one of the specified schema types. Keep searching - return Advance(); - } + if (!options_.schema_types.empty() && + target_schema_type_ids_.count(data.schema_type_id()) == 0) { + // Doesn't match one of the specified schema types. Keep searching + continue; + } - if (current_time_milliseconds_ >= data.expiration_timestamp_ms()) { - // Current time has exceeded the document's expiration time - return Advance(); + // Satisfied all our specified filters + doc_hit_info_ = delegate_->doc_hit_info(); + hit_intersect_section_ids_mask_ = + delegate_->hit_intersect_section_ids_mask(); + return libtextclassifier3::Status::OK; } - // Satisfied all our specified filters - doc_hit_info_ = delegate_->doc_hit_info(); - hit_intersect_section_ids_mask_ = delegate_->hit_intersect_section_ids_mask(); - return libtextclassifier3::Status::OK; + // Didn't find anything on the delegate iterator. 
+ doc_hit_info_ = DocHitInfo(kInvalidDocumentId); + hit_intersect_section_ids_mask_ = kSectionIdMaskNone; + return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator"); } int32_t DocHitInfoIteratorFilter::GetNumBlocksInspected() const { diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h index fb60e38..5051607 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter.h +++ b/icing/index/iterator/doc-hit-info-iterator-filter.h @@ -27,7 +27,6 @@ #include "icing/schema/schema-store.h" #include "icing/store/document-store.h" #include "icing/store/namespace-id.h" -#include "icing/util/clock.h" namespace icing { namespace lib { @@ -57,7 +56,7 @@ class DocHitInfoIteratorFilter : public DocHitInfoIterator { explicit DocHitInfoIteratorFilter( std::unique_ptr<DocHitInfoIterator> delegate, const DocumentStore* document_store, const SchemaStore* schema_store, - const Clock* clock, const Options& options); + const Options& options); libtextclassifier3::Status Advance() override; @@ -68,8 +67,10 @@ class DocHitInfoIteratorFilter : public DocHitInfoIterator { std::string ToString() const override; void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const override { - delegate_->PopulateMatchedTermsStats(matched_terms_stats); + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { + delegate_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); } private: @@ -79,7 +80,6 @@ class DocHitInfoIteratorFilter : public DocHitInfoIterator { const Options options_; std::unordered_set<NamespaceId> target_namespace_ids_; std::unordered_set<SchemaTypeId> target_schema_type_ids_; - const int64_t current_time_milliseconds_; }; } // namespace lib diff --git a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc index 
e0a8cd0..f80d1ea 100644 --- a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc @@ -28,6 +28,7 @@ #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/proto/document.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" @@ -59,10 +60,10 @@ class DocHitInfoIteratorDeletedFilterTest : public ::testing::Test { test_document3_ = DocumentBuilder().SetKey("icing", "email/3").SetSchema("email").Build(); - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); @@ -100,9 +101,9 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, EmptyOriginalIterator) { std::unique_ptr<DocHitInfoIterator> original_iterator_empty = std::make_unique<DocHitInfoIteratorDummy>(); - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator_empty), document_store_.get(), - schema_store_.get(), &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator_empty), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); } @@ -124,9 +125,9 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, DeletedDocumentsAreFiltered) { std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + 
schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1, document_id3)); @@ -150,9 +151,9 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, NonExistingDocumentsAreFiltered) { std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1, document_id2, document_id3)); @@ -163,9 +164,9 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, NegativeDocumentIdIsIgnored) { std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(filtered_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); @@ -177,9 +178,9 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, InvalidDocumentIdIsIgnored) { std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(filtered_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); @@ -194,9 +195,9 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, 
GreaterThanMaxDocumentIdIsIgnored) { std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(filtered_iterator.Advance(), StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED)); @@ -226,10 +227,10 @@ class DocHitInfoIteratorNamespaceFilterTest : public ::testing::Test { .SetSchema("email") .Build(); - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); @@ -270,9 +271,9 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest, EmptyOriginalIterator) { std::make_unique<DocHitInfoIteratorDummy>(); options_.namespaces = std::vector<std::string_view>{}; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator_empty), document_store_.get(), - schema_store_.get(), &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator_empty), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); } @@ -288,9 +289,9 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest, std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); options_.namespaces = std::vector<std::string_view>{"nonexistent_namespace"}; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + 
schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); } @@ -305,9 +306,9 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest, NoNamespacesReturnsAll) { std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); options_.namespaces = std::vector<std::string_view>{}; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1)); } @@ -329,9 +330,9 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest, std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); options_.namespaces = std::vector<std::string_view>{namespace1_}; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1, document_id2)); @@ -355,9 +356,9 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest, FilterForMultipleNamespacesOk) { std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); options_.namespaces = std::vector<std::string_view>{namespace1_, namespace3_}; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1, document_id2, document_id4)); @@ -379,14 +380,12 @@ class DocHitInfoIteratorSchemaTypeFilterTest : public ::testing::Test { document4_schema1_ = DocumentBuilder().SetKey("namespace", 
"4").SetSchema(schema1_).Build(); - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type(schema1_); - type_config = schema.add_types(); - type_config->set_schema_type(schema2_); - type_config = schema.add_types(); - type_config->set_schema_type(schema3_); - + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType(schema1_)) + .AddType(SchemaTypeConfigBuilder().SetType(schema2_)) + .AddType(SchemaTypeConfigBuilder().SetType(schema3_)) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); @@ -427,9 +426,9 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, EmptyOriginalIterator) { std::make_unique<DocHitInfoIteratorDummy>(); options_.schema_types = std::vector<std::string_view>{}; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator_empty), document_store_.get(), - schema_store_.get(), &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator_empty), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); } @@ -446,9 +445,9 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, options_.schema_types = std::vector<std::string_view>{"nonexistent_schema_type"}; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); } @@ -463,9 +462,9 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, NoSchemaTypesReturnsAll) { std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); options_.schema_types = std::vector<std::string_view>{}; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - 
&fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1)); } @@ -484,9 +483,9 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); options_.schema_types = std::vector<std::string_view>{schema1_}; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1)); } @@ -507,9 +506,9 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, FilterForMultipleSchemaTypesOk) { std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); options_.schema_types = std::vector<std::string_view>{schema2_, schema3_}; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id2, document_id3)); @@ -523,10 +522,10 @@ class DocHitInfoIteratorExpirationFilterTest : public ::testing::Test { void SetUp() override { filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type(email_schema_); - + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType(email_schema_)) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); @@ -557,6 +556,16 @@ class DocHitInfoIteratorExpirationFilterTest : public ::testing::Test { }; 
TEST_F(DocHitInfoIteratorExpirationFilterTest, TtlZeroIsntFilteredOut) { + // Arbitrary value + fake_clock_.SetSystemTimeMilliseconds(100); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + // Insert a document DocumentProto document = DocumentBuilder() .SetKey("namespace", "1") @@ -565,23 +574,30 @@ TEST_F(DocHitInfoIteratorExpirationFilterTest, TtlZeroIsntFilteredOut) { .SetTtlMs(0) .Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, - document_store_->Put(document)); + document_store->Put(document)); std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)}; std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - // Arbitrary value - fake_clock_.SetSystemTimeMilliseconds(100); - - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1)); } TEST_F(DocHitInfoIteratorExpirationFilterTest, BeforeTtlNotFilteredOut) { + // Arbitrary value, but must be less than document's creation_timestamp + ttl + fake_clock_.SetSystemTimeMilliseconds(50); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + // Insert a document DocumentProto document = DocumentBuilder() .SetKey("namespace", "1") @@ -590,92 +606,84 @@ TEST_F(DocHitInfoIteratorExpirationFilterTest, BeforeTtlNotFilteredOut) { .SetTtlMs(100) 
.Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, - document_store_->Put(document)); + document_store->Put(document)); std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)}; std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - // Arbitrary value, but must be less than document's creation_timestamp + ttl - fake_clock_.SetSystemTimeMilliseconds(50); - - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1)); } TEST_F(DocHitInfoIteratorExpirationFilterTest, EqualTtlFilteredOut) { + // Current time is exactly the document's creation_timestamp + ttl + fake_clock_.SetSystemTimeMilliseconds(150); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + // Insert a document DocumentProto document = DocumentBuilder() .SetKey("namespace", "1") .SetSchema(email_schema_) - .SetCreationTimestampMs(0) + .SetCreationTimestampMs(50) .SetTtlMs(100) .Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, - document_store_->Put(document)); + document_store->Put(document)); std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)}; std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - // Current time is exactly the document's creation_timestamp + ttl - fake_clock_.SetSystemTimeMilliseconds(100); - - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - 
&fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); } TEST_F(DocHitInfoIteratorExpirationFilterTest, PastTtlFilteredOut) { + // Arbitrary value, but must be greater than the document's + // creation_timestamp + ttl + fake_clock_.SetSystemTimeMilliseconds(151); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + // Insert a document DocumentProto document = DocumentBuilder() .SetKey("namespace", "1") .SetSchema(email_schema_) - .SetCreationTimestampMs(0) + .SetCreationTimestampMs(50) .SetTtlMs(100) .Build(); ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, - document_store_->Put(document)); + document_store->Put(document)); std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)}; std::unique_ptr<DocHitInfoIterator> original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - // Arbitrary value, but must be greater than the document's - // creation_timestamp + ttl - fake_clock_.SetSystemTimeMilliseconds(101); - - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store.get(), + schema_store_.get(), options_); EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); } -TEST_F(DocHitInfoIteratorExpirationFilterTest, - InvalidTimeFiltersReturnsInternalError) { - // Put something in the original iterator so we don't get a ResourceExhausted - // error - std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(/*document_id_in=*/0)}; - std::unique_ptr<DocHitInfoIterator> original_iterator = - 
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); - - // -1 is an invalid timestamp - fake_clock_.SetSystemTimeMilliseconds(-1); - - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options_); - - EXPECT_THAT(filtered_iterator.Advance(), - StatusIs(libtextclassifier3::StatusCode::INTERNAL)); -} - class DocHitInfoIteratorFilterTest : public ::testing::Test { protected: DocHitInfoIteratorFilterTest() : test_dir_(GetTestTempDir() + "/icing") {} @@ -709,16 +717,15 @@ class DocHitInfoIteratorFilterTest : public ::testing::Test { document5_namespace1_schema1_ = DocumentBuilder() .SetKey(namespace1_, "5") .SetSchema(schema1_) - .SetCreationTimestampMs(0) + .SetCreationTimestampMs(1) .SetTtlMs(100) .Build(); - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type(schema1_); - type_config = schema.add_types(); - type_config->set_schema_type(schema2_); - + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType(schema1_)) + .AddType(SchemaTypeConfigBuilder().SetType(schema2_)) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); @@ -756,26 +763,36 @@ class DocHitInfoIteratorFilterTest : public ::testing::Test { }; TEST_F(DocHitInfoIteratorFilterTest, CombineAllFiltersOk) { + // Filters out document5 since it's expired + fake_clock_.SetSystemTimeMilliseconds(199); + + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + ICING_ASSERT_OK_AND_ASSIGN( DocumentId document_id1, - document_store_->Put(document1_namespace1_schema1_)); + document_store->Put(document1_namespace1_schema1_)); ICING_ASSERT_OK_AND_ASSIGN( DocumentId document_id2, - 
document_store_->Put(document2_namespace1_schema1_)); + document_store->Put(document2_namespace1_schema1_)); ICING_ASSERT_OK_AND_ASSIGN( DocumentId document_id3, - document_store_->Put(document3_namespace2_schema1_)); + document_store->Put(document3_namespace2_schema1_)); ICING_ASSERT_OK_AND_ASSIGN( DocumentId document_id4, - document_store_->Put(document4_namespace1_schema2_)); + document_store->Put(document4_namespace1_schema2_)); ICING_ASSERT_OK_AND_ASSIGN( DocumentId document_id5, - document_store_->Put(document5_namespace1_schema1_)); + document_store->Put(document5_namespace1_schema1_)); // Deletes document2, causing it to be filtered out ICING_ASSERT_OK( - document_store_->Delete(document2_namespace1_schema1_.namespace_(), - document2_namespace1_schema1_.uri())); + document_store->Delete(document2_namespace1_schema1_.namespace_(), + document2_namespace1_schema1_.uri())); std::vector<DocHitInfo> doc_hit_infos = { DocHitInfo(document_id1), DocHitInfo(document_id2), @@ -793,13 +810,9 @@ TEST_F(DocHitInfoIteratorFilterTest, CombineAllFiltersOk) { // Filters out document4 by schema type options.schema_types = std::vector<std::string_view>{schema1_}; - // Filters out document5 since it's expired - FakeClock fake_clock; - fake_clock.SetSystemTimeMilliseconds(199); - - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock, options); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store.get(), + schema_store_.get(), options); EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1)); } @@ -830,9 +843,9 @@ TEST_F(DocHitInfoIteratorFilterTest, SectionIdMasksArePopulatedCorrectly) { std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos); DocHitInfoIteratorFilter::Options options; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options); + 
DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options); EXPECT_THAT(GetDocHitInfos(&filtered_iterator), ElementsAre(EqualsDocHitInfo(document_id1, section_ids1), @@ -845,9 +858,9 @@ TEST_F(DocHitInfoIteratorFilterTest, GetNumBlocksInspected) { original_iterator->SetNumBlocksInspected(5); DocHitInfoIteratorFilter::Options options; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options); EXPECT_THAT(filtered_iterator.GetNumBlocksInspected(), Eq(5)); } @@ -857,9 +870,9 @@ TEST_F(DocHitInfoIteratorFilterTest, GetNumLeafAdvanceCalls) { original_iterator->SetNumLeafAdvanceCalls(6); DocHitInfoIteratorFilter::Options options; - DocHitInfoIteratorFilter filtered_iterator( - std::move(original_iterator), document_store_.get(), schema_store_.get(), - &fake_clock_, options); + DocHitInfoIteratorFilter filtered_iterator(std::move(original_iterator), + document_store_.get(), + schema_store_.get(), options); EXPECT_THAT(filtered_iterator.GetNumLeafAdvanceCalls(), Eq(6)); } diff --git a/icing/index/iterator/doc-hit-info-iterator-not.cc b/icing/index/iterator/doc-hit-info-iterator-not.cc index e1ece5c..8fb3659 100644 --- a/icing/index/iterator/doc-hit-info-iterator-not.cc +++ b/icing/index/iterator/doc-hit-info-iterator-not.cc @@ -35,30 +35,29 @@ DocHitInfoIteratorNot::DocHitInfoIteratorNot( DocHitInfoIteratorAllDocumentId(document_id_limit)) {} libtextclassifier3::Status DocHitInfoIteratorNot::Advance() { - if (!all_document_id_iterator_.Advance().ok()) { - doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - return absl_ports::ResourceExhaustedError( - "No more DocHitInfos in iterator"); - } + while (all_document_id_iterator_.Advance().ok()) { + if 
(all_document_id_iterator_.doc_hit_info().document_id() < + to_be_excluded_->doc_hit_info().document_id()) { + // Since DocumentIds are returned from DocHitInfoIterators in decreasing + // order, we have passed the last NOT result if we're smaller than its + // DocumentId. Advance the NOT result if so. + to_be_excluded_->Advance().IgnoreError(); + } - if (all_document_id_iterator_.doc_hit_info().document_id() < - to_be_excluded_->doc_hit_info().document_id()) { - // Since DocumentIds are returned from DocHitInfoIterators in decreasing - // order, we have passed the last NOT result if we're smaller than its - // DocumentId. Advance the NOT result if so. - to_be_excluded_->Advance().IgnoreError(); - } + if (all_document_id_iterator_.doc_hit_info().document_id() == + to_be_excluded_->doc_hit_info().document_id()) { + // This is a NOT result, skip and Advance to the next result. + continue; + } - if (all_document_id_iterator_.doc_hit_info().document_id() == - to_be_excluded_->doc_hit_info().document_id()) { - // This is a NOT result, skip and Advance to the next result. 
- return Advance(); + // No errors, we've found a valid result + doc_hit_info_ = all_document_id_iterator_.doc_hit_info(); + return libtextclassifier3::Status::OK; } - // No errors, we've found a valid result - doc_hit_info_ = all_document_id_iterator_.doc_hit_info(); - - return libtextclassifier3::Status::OK; + // Didn't find a hit, return with error + doc_hit_info_ = DocHitInfo(kInvalidDocumentId); + return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator"); } int32_t DocHitInfoIteratorNot::GetNumBlocksInspected() const { diff --git a/icing/index/iterator/doc-hit-info-iterator-or.h b/icing/index/iterator/doc-hit-info-iterator-or.h index 2f49430..2dae68d 100644 --- a/icing/index/iterator/doc-hit-info-iterator-or.h +++ b/icing/index/iterator/doc-hit-info-iterator-or.h @@ -43,15 +43,18 @@ class DocHitInfoIteratorOr : public DocHitInfoIterator { std::string ToString() const override; void PopulateMatchedTermsStats( - std::vector<TermMatchInfo> *matched_terms_stats) const override { + std::vector<TermMatchInfo> *matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. return; } - current_->PopulateMatchedTermsStats(matched_terms_stats); + current_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); // If equal, then current_ == left_. Combine with results from right_. 
if (left_document_id_ == right_document_id_) { - right_->PopulateMatchedTermsStats(matched_terms_stats); + right_->PopulateMatchedTermsStats(matched_terms_stats, + filtering_section_mask); } } @@ -83,13 +86,15 @@ class DocHitInfoIteratorOrNary : public DocHitInfoIterator { std::string ToString() const override; void PopulateMatchedTermsStats( - std::vector<TermMatchInfo> *matched_terms_stats) const override { + std::vector<TermMatchInfo> *matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. return; } for (size_t i = 0; i < current_iterators_.size(); i++) { - current_iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats); + current_iterators_.at(i)->PopulateMatchedTermsStats( + matched_terms_stats, filtering_section_mask); } } diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc index 8acb91a..034c8cb 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc @@ -45,57 +45,54 @@ DocHitInfoIteratorSectionRestrict::DocHitInfoIteratorSectionRestrict( target_section_(target_section) {} libtextclassifier3::Status DocHitInfoIteratorSectionRestrict::Advance() { - if (!delegate_->Advance().ok()) { - // Didn't find anything on the delegate iterator. 
- doc_hit_info_ = DocHitInfo(kInvalidDocumentId); - hit_intersect_section_ids_mask_ = kSectionIdMaskNone; - return absl_ports::ResourceExhaustedError( - "No more DocHitInfos in iterator"); - } - - DocumentId document_id = delegate_->doc_hit_info().document_id(); + while (delegate_->Advance().ok()) { + DocumentId document_id = delegate_->doc_hit_info().document_id(); - SectionIdMask section_id_mask = - delegate_->doc_hit_info().hit_section_ids_mask(); + SectionIdMask section_id_mask = + delegate_->doc_hit_info().hit_section_ids_mask(); - auto data_or = document_store_.GetDocumentFilterData(document_id); - if (!data_or.ok()) { - // Ran into some error retrieving information on this hit, skip - return Advance(); - } + auto data_or = document_store_.GetDocumentFilterData(document_id); + if (!data_or.ok()) { + // Ran into some error retrieving information on this hit, skip + continue; + } - // Guaranteed that the DocumentFilterData exists at this point - DocumentFilterData data = std::move(data_or).ValueOrDie(); - SchemaTypeId schema_type_id = data.schema_type_id(); - - // A hit can be in multiple sections at once, need to check that at least one - // of the confirmed section ids match the name of the target section - while (section_id_mask != 0) { - // There was a hit in this section id - SectionId section_id = __builtin_ctz(section_id_mask); - - auto section_metadata_or = - schema_store_.GetSectionMetadata(schema_type_id, section_id); - - if (section_metadata_or.ok()) { - const SectionMetadata* section_metadata = - section_metadata_or.ValueOrDie(); - - if (section_metadata->path == target_section_) { - // The hit was in the target section name, return OK/found - doc_hit_info_ = delegate_->doc_hit_info(); - hit_intersect_section_ids_mask_ = - delegate_->hit_intersect_section_ids_mask(); - return libtextclassifier3::Status::OK; + // Guaranteed that the DocumentFilterData exists at this point + DocumentFilterData data = std::move(data_or).ValueOrDie(); + SchemaTypeId 
schema_type_id = data.schema_type_id(); + + // A hit can be in multiple sections at once, need to check that at least + // one of the confirmed section ids match the name of the target section + while (section_id_mask != 0) { + // There was a hit in this section id + SectionId section_id = __builtin_ctz(section_id_mask); + + auto section_metadata_or = + schema_store_.GetSectionMetadata(schema_type_id, section_id); + + if (section_metadata_or.ok()) { + const SectionMetadata* section_metadata = + section_metadata_or.ValueOrDie(); + + if (section_metadata->path == target_section_) { + // The hit was in the target section name, return OK/found + doc_hit_info_ = delegate_->doc_hit_info(); + hit_intersect_section_ids_mask_ = 1u << section_id; + return libtextclassifier3::Status::OK; + } } + + // Mark this section as checked + section_id_mask &= ~(1U << section_id); } - // Mark this section as checked - section_id_mask &= ~(1U << section_id); + // Didn't find a matching section name for this hit. Continue. } - // Didn't find a matching section name for this hit, go to the next hit - return Advance(); + // Didn't find anything on the delegate iterator. + doc_hit_info_ = DocHitInfo(kInvalidDocumentId); + hit_intersect_section_ids_mask_ = kSectionIdMaskNone; + return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator"); } int32_t DocHitInfoIteratorSectionRestrict::GetNumBlocksInspected() const { diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h index ba74384..52b243a 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h @@ -52,13 +52,21 @@ class DocHitInfoIteratorSectionRestrict : public DocHitInfoIterator { std::string ToString() const override; - // NOTE: currently, section restricts does decide which documents to - // return, but doesn't impact the relevance score of a document. 
- // TODO(b/173156803): decide whether we want to filter the matched_terms_stats - // for the restricted sections. + // Note that the DocHitInfoIteratorSectionRestrict is the only iterator that + // should set filtering_section_mask, hence the received + // filtering_section_mask is ignored and the filtering_section_mask passed to + // the delegate will be set to hit_intersect_section_ids_mask_. This will + // allow to filter the matching sections in the delegate. void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const override { - delegate_->PopulateMatchedTermsStats(matched_terms_stats); + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { + if (doc_hit_info_.document_id() == kInvalidDocumentId) { + // Current hit isn't valid, return. + return; + } + delegate_->PopulateMatchedTermsStats( + matched_terms_stats, + /*filtering_section_mask=*/hit_intersect_section_ids_mask_); } private: diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc index 91e0cbe..43a846b 100644 --- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc +++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc @@ -29,6 +29,7 @@ #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/term.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" @@ -43,9 +44,18 @@ namespace lib { namespace { using ::testing::ElementsAre; +using ::testing::ElementsAreArray; using ::testing::Eq; using ::testing::IsEmpty; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; + +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + 
StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; + class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { protected: DocHitInfoIteratorSectionRestrictTest() @@ -56,18 +66,18 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { document_ = DocumentBuilder().SetKey("namespace", "uri").SetSchema("email").Build(); - auto type_config = schema_.add_types(); - type_config->set_schema_type("email"); - - // Add an indexed property so we generate section metadata on it - auto property = type_config->add_properties(); - property->set_property_name(indexed_property_); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - property->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + schema_ = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + // Add an indexed property so we generate section + // metadata on it + .AddProperty( + PropertyConfigBuilder() + .SetName(indexed_property_) + .SetDataTypeString(MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); // First and only indexed property, so it gets the first id of 0 indexed_section_id_ = 0; @@ -101,6 +111,57 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test { FakeClock fake_clock_; }; +TEST_F(DocHitInfoIteratorSectionRestrictTest, + PopulateMatchedTermsStats_IncludesHitWithMatchingSection) { + // Populate the DocumentStore's FilterCache with this document's data + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + document_store_->Put(document_)); + + // Arbitrary section ids for the documents in the DocHitInfoIterators. + // Created to test correct section_id_mask behavior. 
+ SectionIdMask original_section_id_mask = 0b00000101; // hits in sections 0, 2 + + DocHitInfo doc_hit_info1 = DocHitInfo(document_id); + doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1); + doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2); + + // Create a hit that was found in the indexed section + std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1}; + + auto original_iterator = + std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "hi"); + original_iterator->set_hit_intersect_section_ids_mask( + original_section_id_mask); + + // Filtering for the indexed section name (which has a section id of 0) should + // get a result. + DocHitInfoIteratorSectionRestrict section_restrict_iterator( + std::move(original_iterator), document_store_.get(), schema_store_.get(), + /*target_section=*/indexed_property_); + + std::vector<TermMatchInfo> matched_terms_stats; + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); + + ICING_EXPECT_OK(section_restrict_iterator.Advance()); + EXPECT_THAT(section_restrict_iterator.doc_hit_info().document_id(), + Eq(document_id)); + SectionIdMask expected_section_id_mask = 0b00000001; // hits in sections 0 + EXPECT_EQ(section_restrict_iterator.hit_intersect_section_ids_mask(), + expected_section_id_mask); + + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_EQ(matched_terms_stats.at(0).term, "hi"); + std::array<Hit::TermFrequency, kMaxSectionId> expected_term_frequencies{ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + EXPECT_THAT(matched_terms_stats.at(0).term_frequencies, + ElementsAreArray(expected_term_frequencies)); + EXPECT_EQ(matched_terms_stats.at(0).section_ids_mask, + expected_section_id_mask); + + EXPECT_FALSE(section_restrict_iterator.Advance().ok()); +} + TEST_F(DocHitInfoIteratorSectionRestrictTest, EmptyOriginalIterator) { std::unique_ptr<DocHitInfoIterator> 
original_iterator_empty = std::make_unique<DocHitInfoIteratorDummy>(); @@ -110,6 +171,9 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, EmptyOriginalIterator) { schema_store_.get(), /*target_section=*/""); EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty()); + std::vector<TermMatchInfo> matched_terms_stats; + filtered_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); } TEST_F(DocHitInfoIteratorSectionRestrictTest, IncludesHitWithMatchingSection) { @@ -148,6 +212,9 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, NoMatchingDocumentFilterData) { /*target_section=*/""); EXPECT_THAT(GetDocumentIds(§ion_restrict_iterator), IsEmpty()); + std::vector<TermMatchInfo> matched_terms_stats; + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); } TEST_F(DocHitInfoIteratorSectionRestrictTest, @@ -171,6 +238,9 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, "some_section_name"); EXPECT_THAT(GetDocumentIds(§ion_restrict_iterator), IsEmpty()); + std::vector<TermMatchInfo> matched_terms_stats; + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); } TEST_F(DocHitInfoIteratorSectionRestrictTest, @@ -192,6 +262,9 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, indexed_property_); EXPECT_THAT(GetDocumentIds(§ion_restrict_iterator), IsEmpty()); + std::vector<TermMatchInfo> matched_terms_stats; + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); } TEST_F(DocHitInfoIteratorSectionRestrictTest, @@ -216,6 +289,9 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, indexed_property_); EXPECT_THAT(GetDocumentIds(§ion_restrict_iterator), IsEmpty()); + std::vector<TermMatchInfo> matched_terms_stats; + section_restrict_iterator.PopulateMatchedTermsStats(&matched_terms_stats); + EXPECT_THAT(matched_terms_stats, IsEmpty()); } 
TEST_F(DocHitInfoIteratorSectionRestrictTest, GetNumBlocksInspected) { diff --git a/icing/index/iterator/doc-hit-info-iterator-test-util.h b/icing/index/iterator/doc-hit-info-iterator-test-util.h index 913696a..45acc8f 100644 --- a/icing/index/iterator/doc-hit-info-iterator-test-util.h +++ b/icing/index/iterator/doc-hit-info-iterator-test-util.h @@ -56,23 +56,25 @@ class DocHitInfoIteratorDummy : public DocHitInfoIterator { // Imitates behavior of DocHitInfoIteratorTermMain/DocHitInfoIteratorTermLite void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const override { + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. return; } - SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask(); + SectionIdMask section_mask = + doc_hit_info_.hit_section_ids_mask() & filtering_section_mask; + SectionIdMask section_mask_copy = section_mask; std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = { Hit::kNoTermFrequency}; - - while (section_mask) { - SectionId section_id = __builtin_ctz(section_mask); + while (section_mask_copy) { + SectionId section_id = __builtin_ctz(section_mask_copy); section_term_frequencies.at(section_id) = doc_hit_info_.hit_term_frequency(section_id); - section_mask &= ~(1u << section_id); + section_mask_copy &= ~(1u << section_id); } - TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(), - section_term_frequencies); + TermMatchInfo term_stats(term_, section_mask, + std::move(section_term_frequencies)); for (auto& cur_term_stats : *matched_terms_stats) { if (cur_term_stats.term == term_stats.term) { diff --git a/icing/index/iterator/doc-hit-info-iterator.h b/icing/index/iterator/doc-hit-info-iterator.h index 67bd74f..bf90202 100644 --- a/icing/index/iterator/doc-hit-info-iterator.h +++ 
b/icing/index/iterator/doc-hit-info-iterator.h @@ -66,6 +66,8 @@ class DocHitInfoIterator { // Returns: // OK if was able to advance to a new document_id. + // INVALID_ARGUMENT if there are less than 2 iterators for an AND/OR + // iterator // RESOUCE_EXHAUSTED if we've run out of document_ids to iterate over virtual libtextclassifier3::Status Advance() = 0; @@ -94,11 +96,14 @@ class DocHitInfoIterator { // For the last hit docid, retrieves all the matched query terms and other // stats, see TermMatchInfo. + // filtering_section_mask filters the matching sections and should be set only + // by DocHitInfoIteratorSectionRestrict. // If Advance() wasn't called after construction, Advance() returned false or // the concrete HitIterator didn't override this method, the vectors aren't // populated. virtual void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const {} + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const {} protected: DocHitInfo doc_hit_info_; diff --git a/icing/index/iterator/doc-hit-info-iterator_benchmark.cc b/icing/index/iterator/doc-hit-info-iterator_benchmark.cc index 90e4888..f975989 100644 --- a/icing/index/iterator/doc-hit-info-iterator_benchmark.cc +++ b/icing/index/iterator/doc-hit-info-iterator_benchmark.cc @@ -14,15 +14,15 @@ #include <vector> +#include "testing/base/public/benchmark.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/index/iterator/doc-hit-info-iterator-and.h" #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" -#include "testing/base/public/benchmark.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" namespace icing { namespace lib { diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc 
b/icing/index/lite/doc-hit-info-iterator-term-lite.cc index d535d7f..08df4fc 100644 --- a/icing/index/lite/doc-hit-info-iterator-term-lite.cc +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc @@ -45,8 +45,13 @@ libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() { if (cached_hits_idx_ == -1) { libtextclassifier3::Status status = RetrieveMoreHits(); if (!status.ok()) { - ICING_LOG(ERROR) << "Failed to retrieve more hits " - << status.error_message(); + if (!absl_ports::IsNotFound(status)) { + // NOT_FOUND is expected to happen (not every term will be in the main + // index!). Other errors are worth logging. + ICING_LOG(ERROR) + << "Encountered unexpected failure while retrieving hits " + << status.error_message(); + } return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); } diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h index ac5e97f..179fc93 100644 --- a/icing/index/lite/doc-hit-info-iterator-term-lite.h +++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h @@ -50,21 +50,24 @@ class DocHitInfoIteratorTermLite : public DocHitInfoIterator { int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; } void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const override { + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. 
return; } - SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask(); + SectionIdMask section_mask = + doc_hit_info_.hit_section_ids_mask() & filtering_section_mask; + SectionIdMask section_mask_copy = section_mask; std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = { Hit::kNoTermFrequency}; - while (section_mask) { - SectionId section_id = __builtin_ctz(section_mask); + while (section_mask_copy) { + SectionId section_id = __builtin_ctz(section_mask_copy); section_term_frequencies.at(section_id) = doc_hit_info_.hit_term_frequency(section_id); - section_mask &= ~(1u << section_id); + section_mask_copy &= ~(1u << section_id); } - TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(), + TermMatchInfo term_stats(term_, section_mask, std::move(section_term_frequencies)); for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) { @@ -79,6 +82,11 @@ class DocHitInfoIteratorTermLite : public DocHitInfoIterator { protected: // Add DocHitInfos corresponding to term_ to cached_hits_. + // + // Returns: + // - OK, on success + // - NOT_FOUND if no term matching term_ was found in the lexicon. 
+ // - INVALID_ARGUMENT if unable to properly encode the termid virtual libtextclassifier3::Status RetrieveMoreHits() = 0; const std::string term_; diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc index e0379b8..fb23934 100644 --- a/icing/index/lite/lite-index.cc +++ b/icing/index/lite/lite-index.cc @@ -310,8 +310,6 @@ libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) { return absl_ports::ResourceExhaustedError("Hit buffer is full!"); } - header_->set_last_added_docid(hit.document_id()); - TermIdHitPair term_id_hit_pair(term_id, hit); uint32_t cur_size = header_->cur_size(); TermIdHitPair::Value* valp = @@ -394,26 +392,36 @@ void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const { } libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const { - int64_t header_and_hit_buffer_file_size = - filesystem_->GetFileSize(hit_buffer_fd_.get()); - - if (header_and_hit_buffer_file_size == Filesystem::kBadFileSize) { - return absl_ports::InternalError( - "Failed to get element size of the LiteIndex's header and hit buffer"); - } - - int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); - if (lexicon_disk_usage == IcingFilesystem::kBadFileSize) { - return absl_ports::InternalError( - "Failed to get element size of LiteIndex's lexicon"); + IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto()); + if (storage_info.lite_index_hit_buffer_size() == -1 || + storage_info.lite_index_lexicon_size() == -1) { + return absl_ports::AbortedError( + "Failed to get size of LiteIndex's members."); } - // On initialization, we grow the file to a padded size first. 
So this size // won't count towards the size taken up by elements size_t header_padded_size = IcingMMapper::page_aligned_size(header_size()); + return storage_info.lite_index_hit_buffer_size() - header_padded_size + + storage_info.lite_index_lexicon_size(); +} - return header_and_hit_buffer_file_size - header_padded_size + - lexicon_disk_usage; +IndexStorageInfoProto LiteIndex::GetStorageInfo( + IndexStorageInfoProto storage_info) const { + int64_t header_and_hit_buffer_file_size = + filesystem_->GetFileSize(hit_buffer_fd_.get()); + if (header_and_hit_buffer_file_size != Filesystem::kBadFileSize) { + storage_info.set_lite_index_hit_buffer_size( + header_and_hit_buffer_file_size); + } else { + storage_info.set_lite_index_hit_buffer_size(-1); + } + int64_t lexicon_disk_usage = lexicon_.GetElementsSize(); + if (lexicon_disk_usage != Filesystem::kBadFileSize) { + storage_info.set_lite_index_lexicon_size(lexicon_disk_usage); + } else { + storage_info.set_lite_index_lexicon_size(-1); + } + return storage_info; } uint32_t LiteIndex::Seek(uint32_t term_id) { diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h index 7b51aa4..b134aba 100644 --- a/icing/index/lite/lite-index.h +++ b/icing/index/lite/lite-index.h @@ -37,6 +37,7 @@ #include "icing/legacy/index/icing-lite-index-header.h" #include "icing/legacy/index/icing-lite-index-options.h" #include "icing/legacy/index/icing-mmapper.h" +#include "icing/proto/storage.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" @@ -224,6 +225,9 @@ class LiteIndex { DocumentId last_added_document_id() const { return header_->last_added_docid(); } + void set_last_added_document_id(DocumentId document_id) const { + header_->set_last_added_docid(document_id); + } const IcingDynamicTrie& lexicon() const { return lexicon_; } @@ -240,6 +244,14 @@ class LiteIndex { // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; + // 
Takes the provided storage_info, populates the fields related to the lite + // index and returns that storage_info. + // + // If an IO error occurs while trying to calculate the value for a field, then + // that field will be set to -1. + IndexStorageInfoProto GetStorageInfo( + IndexStorageInfoProto storage_info) const; + private: static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions(); diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc index 5553c1e..98bc18e 100644 --- a/icing/index/main/doc-hit-info-iterator-term-main.cc +++ b/icing/index/main/doc-hit-info-iterator-term-main.cc @@ -57,8 +57,9 @@ libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() { if (!absl_ports::IsNotFound(status)) { // NOT_FOUND is expected to happen (not every term will be in the main // index!). Other errors are worth logging. - ICING_LOG(ERROR) << "Failed to retrieve more hits " - << status.error_message(); + ICING_LOG(ERROR) + << "Encountered unexpected failure while retrieving hits " + << status.error_message(); } return absl_ports::ResourceExhaustedError( "No more DocHitInfos in iterator"); diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h index d626d7a..f3cf701 100644 --- a/icing/index/main/doc-hit-info-iterator-term-main.h +++ b/icing/index/main/doc-hit-info-iterator-term-main.h @@ -50,21 +50,24 @@ class DocHitInfoIteratorTermMain : public DocHitInfoIterator { int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; } void PopulateMatchedTermsStats( - std::vector<TermMatchInfo>* matched_terms_stats) const override { + std::vector<TermMatchInfo>* matched_terms_stats, + SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override { if (doc_hit_info_.document_id() == kInvalidDocumentId) { // Current hit isn't valid, return. 
return; } - SectionIdMask section_mask = doc_hit_info_.hit_section_ids_mask(); + SectionIdMask section_mask = + doc_hit_info_.hit_section_ids_mask() & filtering_section_mask; + SectionIdMask section_mask_copy = section_mask; std::array<Hit::TermFrequency, kMaxSectionId> section_term_frequencies = { Hit::kNoTermFrequency}; - while (section_mask) { - SectionId section_id = __builtin_ctz(section_mask); + while (section_mask_copy) { + SectionId section_id = __builtin_ctz(section_mask_copy); section_term_frequencies.at(section_id) = doc_hit_info_.hit_term_frequency(section_id); - section_mask &= ~(1u << section_id); + section_mask_copy &= ~(1u << section_id); } - TermMatchInfo term_stats(term_, doc_hit_info_.hit_section_ids_mask(), + TermMatchInfo term_stats(term_, section_mask, std::move(section_term_frequencies)); for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) { diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc index 636f631..8ae6b27 100644 --- a/icing/index/main/main-index.cc +++ b/icing/index/main/main-index.cc @@ -121,14 +121,34 @@ libtextclassifier3::Status MainIndex::Init( } libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const { + IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto()); + if (storage_info.main_index_storage_size() == -1 || + storage_info.main_index_lexicon_size() == -1) { + return absl_ports::AbortedError( + "Failed to get size of MainIndex's members."); + } + return storage_info.main_index_storage_size() + + storage_info.main_index_lexicon_size(); +} + +IndexStorageInfoProto MainIndex::GetStorageInfo( + IndexStorageInfoProto storage_info) const { int64_t lexicon_elt_size = main_lexicon_->GetElementsSize(); + if (lexicon_elt_size != IcingFilesystem::kBadFileSize) { + storage_info.set_main_index_lexicon_size(lexicon_elt_size); + } else { + storage_info.set_main_index_lexicon_size(-1); + } int64_t index_elt_size = flash_index_storage_->GetElementsSize(); - if 
(lexicon_elt_size == IcingFilesystem::kBadFileSize || - index_elt_size == IcingFilesystem::kBadFileSize) { - return absl_ports::InternalError( - "Failed to get element size of LiteIndex's lexicon"); + if (lexicon_elt_size != IcingFilesystem::kBadFileSize) { + storage_info.set_main_index_storage_size(index_elt_size); + } else { + storage_info.set_main_index_storage_size(-1); } - return lexicon_elt_size + index_elt_size; + storage_info.set_main_index_block_size(flash_index_storage_->block_size()); + storage_info.set_num_blocks(flash_index_storage_->num_blocks()); + storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction()); + return storage_info; } libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>> diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h index 7403b8c..43635ca 100644 --- a/icing/index/main/main-index.h +++ b/icing/index/main/main-index.h @@ -27,6 +27,7 @@ #include "icing/index/term-metadata.h" #include "icing/legacy/index/icing-dynamic-trie.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/proto/storage.pb.h" #include "icing/store/namespace-id.h" #include "icing/util/status-macros.h" @@ -172,6 +173,14 @@ class MainIndex { // - INTERNAL on IO error libtextclassifier3::StatusOr<int64_t> GetElementsSize() const; + // Takes the provided storage_info, populates the fields related to the main + // index and returns that storage_info. + // + // If an IO error occurs while trying to calculate the value for a field, then + // that field will be set to -1. + IndexStorageInfoProto GetStorageInfo( + IndexStorageInfoProto storage_info) const; + // Returns debug information for the main index in out. 
// verbosity <= 0, simplest debug information - just the lexicon // verbosity > 0, more detailed debug information including raw postings diff --git a/icing/index/main/posting-list-free.h b/icing/index/main/posting-list-free.h index 4b27401..4f06057 100644 --- a/icing/index/main/posting-list-free.h +++ b/icing/index/main/posting-list-free.h @@ -115,7 +115,7 @@ class PostingListFree { // bytes which will store the next posting list index, the rest are unused and // can be anything. uint8_t *posting_list_buffer_; - uint32_t size_in_bytes_; + [[maybe_unused]] uint32_t size_in_bytes_; static_assert(sizeof(PostingListIndex) <= posting_list_utils::min_posting_list_size(), diff --git a/icing/jni.lds b/icing/jni.lds new file mode 100644 index 0000000..401682a --- /dev/null +++ b/icing/jni.lds @@ -0,0 +1,10 @@ +VERS_1.0 { + # Export JNI symbols. + global: + Java_*; + JNI_OnLoad; + + # Hide everything else + local: + *; +}; diff --git a/icing/jni/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc index bf709cd..ea2bcf7 100644 --- a/icing/jni/icing-search-engine-jni.cc +++ b/icing/jni/icing-search-engine-jni.cc @@ -27,6 +27,7 @@ #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" #include "icing/proto/search.pb.h" +#include "icing/proto/storage.pb.h" #include "icing/proto/usage.pb.h" #include "icing/util/status-macros.h" @@ -356,12 +357,19 @@ Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByQuery( JNIEXPORT jbyteArray JNICALL Java_com_google_android_icing_IcingSearchEngine_nativePersistToDisk( - JNIEnv* env, jclass clazz, jobject object) { + JNIEnv* env, jclass clazz, jobject object, jint persist_type_code) { icing::lib::IcingSearchEngine* icing = GetIcingSearchEnginePointer(env, object); + if (!icing::lib::PersistType::Code_IsValid(persist_type_code)) { + ICING_LOG(ERROR) << persist_type_code + << " is an invalid value for PersistType::Code"; + return nullptr; + } + icing::lib::PersistType::Code persist_type_code_enum = + 
static_cast<icing::lib::PersistType::Code>(persist_type_code); icing::lib::PersistToDiskResultProto persist_to_disk_result_proto = - icing->PersistToDisk(); + icing->PersistToDisk(persist_type_code_enum); return SerializeProtoToJniByteArray(env, persist_to_disk_result_proto); } @@ -390,6 +398,18 @@ Java_com_google_android_icing_IcingSearchEngine_nativeGetOptimizeInfo( } JNIEXPORT jbyteArray JNICALL +Java_com_google_android_icing_IcingSearchEngine_nativeGetStorageInfo( + JNIEnv* env, jclass clazz, jobject object) { + icing::lib::IcingSearchEngine* icing = + GetIcingSearchEnginePointer(env, object); + + icing::lib::StorageInfoResultProto storage_info_result_proto = + icing->GetStorageInfo(); + + return SerializeProtoToJniByteArray(env, storage_info_result_proto); +} + +JNIEXPORT jbyteArray JNICALL Java_com_google_android_icing_IcingSearchEngine_nativeReset( JNIEnv* env, jclass clazz, jobject object) { icing::lib::IcingSearchEngine* icing = diff --git a/icing/jni/jni-cache.cc b/icing/jni/jni-cache.cc index 58eb8bf..9b75db6 100644 --- a/icing/jni/jni-cache.cc +++ b/icing/jni/jni-cache.cc @@ -14,6 +14,8 @@ #include "icing/jni/jni-cache.h" +#ifdef ICING_REVERSE_JNI_SEGMENTATION + #include "icing/text_classifier/lib3/utils/java/jni-base.h" #include "icing/text_classifier/lib3/utils/java/jni-helper.h" #include "icing/absl_ports/canonical_errors.h" @@ -214,3 +216,5 @@ JniCache::ConvertToJavaString(const char* utf8_text, } // namespace lib } // namespace icing + +#endif // ICING_REVERSE_JNI_SEGMENTATION diff --git a/icing/jni/jni-cache.h b/icing/jni/jni-cache.h index a5f16c7..3faaed6 100644 --- a/icing/jni/jni-cache.h +++ b/icing/jni/jni-cache.h @@ -15,6 +15,16 @@ #ifndef ICING_JNI_JNI_CACHE_H_ #define ICING_JNI_JNI_CACHE_H_ +#ifndef ICING_REVERSE_JNI_SEGMENTATION +namespace icing { +namespace lib { + +class JniCache {}; // Declare an empty class definition for non-Android builds. 
+ +} // namespace lib +} // namespace icing +#else // ICING_REVERSE_JNI_SEGMENTATION + #include <jni.h> #include "icing/text_classifier/lib3/utils/base/statusor.h" @@ -75,4 +85,6 @@ struct JniCache { } // namespace lib } // namespace icing +#endif // !ICING_REVERSE_JNI_SEGMENTATION + #endif // ICING_JNI_JNI_CACHE_H_ diff --git a/icing/performance-configuration.cc b/icing/performance-configuration.cc index aeaa449..4020dd0 100644 --- a/icing/performance-configuration.cc +++ b/icing/performance-configuration.cc @@ -15,6 +15,7 @@ #include "icing/performance-configuration.h" #include "icing/result/result-state.h" +#include "icing/scoring/scored-document-hit.h" namespace icing { namespace lib { @@ -54,38 +55,21 @@ constexpr int kMaxQueryLength = 23000; constexpr int kDefaultNumToScore = 30000; // New Android devices nowadays all allow more than 16 MB memory per app. Using -// that as a guideline, we set 16 MB as the safe memory threshold. +// that as a guideline and being more conservative, we set 4 MB as the safe +// memory threshold. // TODO(b/150029642): Android apps / framework have better understanding of how // much memory is allowed, so it would be better to let clients pass in this // value. -constexpr int kSafeMemoryUsage = 16 * 1024 * 1024; // 16MB +constexpr int kSafeMemoryUsage = 4 * 1024 * 1024; // 4MB -// This number is not determined by benchmarks. We just assume that returning -// the best 1000 scored document hits of a query is enough. To find the best -// 1000 scored document hits from a heap, we need roughly 0.7 ms on a Pixel 3 XL -// according to //icing/scoring:ranker_benchmark. -constexpr int kMaxNumHitsPerQuery = 1000; +// The maximum number of hits that can fit below the kSafeMemoryUsage threshold. +constexpr int kMaxNumTotalHits = kSafeMemoryUsage / sizeof(ScoredDocumentHit); -// A rough estimation of the size of ResultState if it stores the maximum number -// of scored document hits. 
-constexpr int kMaxMemoryPerResult = - sizeof(ResultState) + kMaxNumHitsPerQuery * sizeof(ScoredDocumentHit); - -// To be safer, we assume that all the Results contain the maximum number of -// hits and only use half of the memory allowed. -constexpr int kDefaultNumResultsToCache = - kSafeMemoryUsage / 2 / kMaxMemoryPerResult; - -static_assert( - kDefaultNumResultsToCache > 500, - "Default number of results to cache has changed, please update and make " - "sure it still meets our requirements."); } // namespace PerformanceConfiguration::PerformanceConfiguration() : PerformanceConfiguration(kMaxQueryLength, kDefaultNumToScore, - kMaxNumHitsPerQuery, kDefaultNumResultsToCache) { -} + kMaxNumTotalHits) {} } // namespace lib } // namespace icing diff --git a/icing/performance-configuration.h b/icing/performance-configuration.h index fa4050b..b9282ca 100644 --- a/icing/performance-configuration.h +++ b/icing/performance-configuration.h @@ -24,12 +24,10 @@ struct PerformanceConfiguration { PerformanceConfiguration(); PerformanceConfiguration(int max_query_length_in, int num_to_score_in, - int max_num_hits_per_query_in, - int max_num_cache_results_in) + int max_num_total_hits) : max_query_length(max_query_length_in), num_to_score(num_to_score_in), - max_num_hits_per_query(max_num_hits_per_query_in), - max_num_cache_results(max_num_cache_results_in) {} + max_num_total_hits(max_num_total_hits) {} // Search performance @@ -41,11 +39,9 @@ struct PerformanceConfiguration { // Memory - // Maximum number of ScoredDocumentHits to return per query. - int max_num_hits_per_query; - - // Maximum number of ResultStates to store in ResultStateManager. - int max_num_cache_results; + // Maximum number of ScoredDocumentHits to cache in the ResultStateManager at + // one time. 
+ int max_num_total_hits; }; // TODO(b/149040810): Consider creating a class to manage performance diff --git a/icing/portable/endian.h b/icing/portable/endian.h new file mode 100644 index 0000000..595b956 --- /dev/null +++ b/icing/portable/endian.h @@ -0,0 +1,208 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Utility functions that depend on bytesex. We define versions of htonll and +// ntohll (HostToNetworkLL and NetworkToHostLL in our naming), as well as +// "Google" versions of all the standards: ghtonl, ghtons, and so on +// (GHostToNetworkL, GHostToNetworkS, etc in our naming). These functions do +// exactly the same as their standard variants, but don't require including the +// dangerous netinet/in.h. 
+ +#ifndef ICING_PORTABLE_ENDIAN_H_ +#define ICING_PORTABLE_ENDIAN_H_ + +#include <cstdint> + +// IS_LITTLE_ENDIAN, IS_BIG_ENDIAN +#if defined OS_LINUX || defined OS_ANDROID || defined(__ANDROID__) +// _BIG_ENDIAN +#include <endian.h> + +#elif defined(__APPLE__) + +// BIG_ENDIAN +#include <machine/endian.h> // NOLINT(build/include) + +/* Let's try and follow the Linux convention */ +#define __BYTE_ORDER BYTE_ORDER +#define __LITTLE_ENDIAN LITTLE_ENDIAN +#define __BIG_ENDIAN BIG_ENDIAN + +#endif // operating system + +// defines __BYTE_ORDER for MSVC +#ifdef COMPILER_MSVC +#define __BYTE_ORDER __LITTLE_ENDIAN +#define IS_LITTLE_ENDIAN +#else // COMPILER_MSVC + +// define the macros IS_LITTLE_ENDIAN or IS_BIG_ENDIAN +// using the above endian definitions from endian.h if +// endian.h was included +#ifdef __BYTE_ORDER +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define IS_LITTLE_ENDIAN +#endif // __BYTE_ORDER == __LITTLE_ENDIAN + +#if __BYTE_ORDER == __BIG_ENDIAN +#define IS_BIG_ENDIAN +#endif // __BYTE_ORDER == __BIG_ENDIAN + +#else // __BYTE_ORDER + +#if defined(__LITTLE_ENDIAN__) +#define IS_LITTLE_ENDIAN +#elif defined(__BIG_ENDIAN__) +#define IS_BIG_ENDIAN +#endif // __LITTLE_ENDIAN__ or __BIG_ENDIAN__ + +#endif // __BYTE_ORDER +#endif // COMPILER_MSVC + +// byte swap functions (bswap_16, bswap_32, bswap_64). +// byte swap functions reverse the order of bytes, e.g. 
+// byteswap of 102030405060708 = 807060504030201 +// byteswap of 1020304 = 4030201 + +// The following guarantees declaration of the byte swap functions +#ifdef COMPILER_MSVC +#include <stdlib.h> // NOLINT(build/include) + +#define bswap_16(x) _byteswap_ushort(x) +#define bswap_32(x) _byteswap_ulong(x) +#define bswap_64(x) _byteswap_uint64(x) + +#elif defined(__APPLE__) +// Mac OS X / Darwin features +#include <libkern/OSByteOrder.h> + +#define bswap_16(x) OSSwapInt16(x) +#define bswap_32(x) OSSwapInt32(x) +#define bswap_64(x) OSSwapInt64(x) + +#elif defined(__GLIBC__) || defined(__BIONIC__) || defined(__ASYLO__) +#include <byteswap.h> // IWYU pragma: export + +#else // built-in byteswap functions + +static inline uint16 bswap_16(uint16 x) { +#ifdef __cplusplus + return static_cast<uint16>(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8)); +#else // __cplusplus + return (uint16)(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8)); // NOLINT +#endif // __cplusplus +} +#define bswap_16(x) bswap_16(x) +static inline uint32 bswap_32(uint32 x) { + return (((x & 0xFF) << 24) | ((x & 0xFF00) << 8) | ((x & 0xFF0000) >> 8) | + ((x & 0xFF000000) >> 24)); +} +#define bswap_32(x) bswap_32(x) +static inline uint64 bswap_64(uint64 x) { + return (((x & (uint64_t)0xFF) << 56) | ((x & (uint64_t)0xFF00) << 40) | + ((x & (uint64_t)0xFF0000) << 24) | ((x & (uint64_t)0xFF000000) << 8) | + ((x & (uint64_t)0xFF00000000) >> 8) | + ((x & (uint64_t)0xFF0000000000) >> 24) | + ((x & (uint64_t)0xFF000000000000) >> 40) | + ((x & (uint64_t)0xFF00000000000000) >> 56)); +} +#define bswap_64(x) bswap_64(x) + +#endif // end byteswap functions + +// Use compiler byte-swapping intrinsics if they are available. 32-bit +// and 64-bit versions are available in Clang and GCC as of GCC 4.3.0. +// The 16-bit version is available in Clang and GCC only as of GCC 4.8.0. +// For simplicity, we enable them all only for GCC 4.8.0 or later. 
+#if defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ >= 5)) + +inline uint64_t gbswap_64(uint64_t host_int) { + return __builtin_bswap64(host_int); +} +inline uint32_t gbswap_32(uint32_t host_int) { + return __builtin_bswap32(host_int); +} +inline uint16_t gbswap_16(uint16_t host_int) { + return __builtin_bswap16(host_int); +} + +#else // intrinsics available + +inline uint64 gbswap_64(uint64 host_int) { +#if defined(__GNUC__) && defined(__x86_64__) && \ + !(defined(__APPLE__) && defined(__MACH__)) + // Adapted from /usr/include/byteswap.h. Not available on Mac. + if (__builtin_constant_p(host_int)) { + return __bswap_constant_64(host_int); + } else { + uint64 result; + __asm__("bswap %0" : "=r"(result) : "0"(host_int)); + return result; + } +#elif defined(bswap_64) + return bswap_64(host_int); +#else // bswap_64 + return static_cast<uint64>(bswap_32(static_cast<uint32>(host_int >> 32))) | + (static_cast<uint64>(bswap_32(static_cast<uint32>(host_int))) << 32); +#endif // bswap_64 +} +inline uint32 gbswap_32(uint32 host_int) { return bswap_32(host_int); } +inline uint16 gbswap_16(uint16 host_int) { return bswap_16(host_int); } + +#endif // intrinsics available + +#ifdef IS_LITTLE_ENDIAN + +// Definitions for ntohl etc. that don't require us to include +// netinet/in.h. We wrap gbswap_32 and gbswap_16 in functions rather +// than just #defining them because in debug mode, gcc doesn't +// correctly handle the (rather involved) definitions of bswap_32. +// gcc guarantees that inline functions are as fast as macros, so +// this isn't a performance hit. 
+inline uint16_t GHostToNetworkS(uint16_t x) { return gbswap_16(x); } +inline uint32_t GHostToNetworkL(uint32_t x) { return gbswap_32(x); } +inline uint64_t GHostToNetworkLL(uint64_t x) { return gbswap_64(x); } + +#elif defined IS_BIG_ENDIAN + +// These definitions are simpler on big-endian machines +// These are functions instead of macros to avoid self-assignment warnings +// on calls such as "i = ghtnol(i);". This also provides type checking. +inline uint16 GHostToNetworkS(uint16 x) { return x; } +inline uint32 GHostToNetworkL(uint32 x) { return x; } +inline uint64 GHostToNetworkLL(uint64 x) { return x; } + +#else // bytesex +#error \ + "Unsupported bytesex: Either IS_BIG_ENDIAN or IS_LITTLE_ENDIAN must be defined" // NOLINT +#endif // bytesex + +#ifndef HostToNetworkLL +// With the rise of 64-bit, some systems are beginning to define this. +#define HostToNetworkLL(x) GHostToNetworkLL(x) +#endif // HostToNetworkLL + +// ntoh* and hton* are the same thing for any size and bytesex, +// since the function is an involution, i.e., its own inverse. +inline uint16_t GNetworkToHostS(uint16_t x) { return GHostToNetworkS(x); } +inline uint32_t GNetworkToHostL(uint32_t x) { return GHostToNetworkL(x); } +inline uint64_t GNetworkToHostLL(uint64_t x) { return GHostToNetworkLL(x); } + +#ifndef NetworkToHostLL +#define NetworkToHostLL(x) GHostToNetworkLL(x) +#endif // NetworkToHostLL + +#endif // ICING_PORTABLE_ENDIAN_H_ diff --git a/icing/testing/platform.h b/icing/portable/platform.h index ad612d5..150eede 100644 --- a/icing/testing/platform.h +++ b/icing/portable/platform.h @@ -12,11 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_TESTING_PLATFORM_H_ -#define ICING_TESTING_PLATFORM_H_ +#ifndef ICING_PORTABLE_PLATFORM_H_ +#define ICING_PORTABLE_PLATFORM_H_ -// This file is meant to hold util functions for tests that help the test -// determine which platform-specific configuration it may be running in. namespace icing { namespace lib { @@ -36,11 +34,19 @@ inline bool IsReverseJniTokenization() { return false; } -// Whether the running test is an Android test. -inline bool IsAndroidPlatform() { -#if defined(__ANDROID__) +// Whether we're running on android_x86 +inline bool IsAndroidX86() { +#if defined(__ANDROID__) && defined(__i386__) return true; -#endif // defined(__ANDROID__) +#endif // defined(__ANDROID__) && defined(__i386__) + return false; +} + +// Whether we're running on android_armeabi-v7a +inline bool IsAndroidArm() { +#if defined(__ANDROID__) && defined(__arm__) + return true; +#endif // defined(__ANDROID__) && defined(__arm__) return false; } @@ -52,7 +58,27 @@ inline bool IsIosPlatform() { return false; } +enum Architecture { + UNKNOWN, + BIT_32, + BIT_64, +}; + +// Returns which architecture we're running on. 
+// +// Architecture macros pulled from +// https://developer.android.com/ndk/guides/cpu-features +inline Architecture GetArchitecture() { +#if defined(__arm__) || defined(__i386__) + return BIT_32; +#elif defined(__aarch64__) || defined(__x86_64__) + return BIT_64; +#else + return UNKNOWN; +#endif +} + } // namespace lib } // namespace icing -#endif // ICING_TESTING_PLATFORM_H_ +#endif // ICING_PORTABLE_PLATFORM_H_ diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc index 0732ed0..1f937fd 100644 --- a/icing/query/query-processor.cc +++ b/icing/query/query-processor.cc @@ -46,7 +46,6 @@ #include "icing/tokenization/tokenizer-factory.h" #include "icing/tokenization/tokenizer.h" #include "icing/transform/normalizer.h" -#include "icing/util/clock.h" #include "icing/util/status-macros.h" namespace icing { @@ -105,31 +104,27 @@ QueryProcessor::Create(Index* index, const LanguageSegmenter* language_segmenter, const Normalizer* normalizer, const DocumentStore* document_store, - const SchemaStore* schema_store, const Clock* clock) { + const SchemaStore* schema_store) { ICING_RETURN_ERROR_IF_NULL(index); ICING_RETURN_ERROR_IF_NULL(language_segmenter); ICING_RETURN_ERROR_IF_NULL(normalizer); ICING_RETURN_ERROR_IF_NULL(document_store); ICING_RETURN_ERROR_IF_NULL(schema_store); - ICING_RETURN_ERROR_IF_NULL(clock); - return std::unique_ptr<QueryProcessor>( - new QueryProcessor(index, language_segmenter, normalizer, document_store, - schema_store, clock)); + return std::unique_ptr<QueryProcessor>(new QueryProcessor( + index, language_segmenter, normalizer, document_store, schema_store)); } QueryProcessor::QueryProcessor(Index* index, const LanguageSegmenter* language_segmenter, const Normalizer* normalizer, const DocumentStore* document_store, - const SchemaStore* schema_store, - const Clock* clock) + const SchemaStore* schema_store) : index_(*index), language_segmenter_(*language_segmenter), normalizer_(*normalizer), document_store_(*document_store), - 
schema_store_(*schema_store), - clock_(*clock) {} + schema_store_(*schema_store) {} DocHitInfoIteratorFilter::Options QueryProcessor::getFilterOptions( const SearchSpecProto& search_spec) { @@ -156,7 +151,7 @@ QueryProcessor::ParseSearch(const SearchSpecProto& search_spec) { DocHitInfoIteratorFilter::Options options = getFilterOptions(search_spec); results.root_iterator = std::make_unique<DocHitInfoIteratorFilter>( std::move(results.root_iterator), &document_store_, &schema_store_, - &clock_, options); + options); return results; } @@ -279,7 +274,7 @@ QueryProcessor::ParseRawQuery(const SearchSpecProto& search_spec) { results.query_term_iterators[normalized_text] = std::make_unique<DocHitInfoIteratorFilter>( std::move(term_iterator), &document_store_, &schema_store_, - &clock_, options); + options); results.query_terms[frames.top().section_restrict].insert( std::move(normalized_text)); diff --git a/icing/query/query-processor.h b/icing/query/query-processor.h index 0932ec5..bdf9ef2 100644 --- a/icing/query/query-processor.h +++ b/icing/query/query-processor.h @@ -27,7 +27,6 @@ #include "icing/store/document-store.h" #include "icing/tokenization/language-segmenter.h" #include "icing/transform/normalizer.h" -#include "icing/util/clock.h" namespace icing { namespace lib { @@ -47,7 +46,7 @@ class QueryProcessor { static libtextclassifier3::StatusOr<std::unique_ptr<QueryProcessor>> Create( Index* index, const LanguageSegmenter* language_segmenter, const Normalizer* normalizer, const DocumentStore* document_store, - const SchemaStore* schema_store, const Clock* clock); + const SchemaStore* schema_store); struct QueryResults { std::unique_ptr<DocHitInfoIterator> root_iterator; @@ -77,7 +76,7 @@ class QueryProcessor { const LanguageSegmenter* language_segmenter, const Normalizer* normalizer, const DocumentStore* document_store, - const SchemaStore* schema_store, const Clock* clock); + const SchemaStore* schema_store); // Parse the query into a one DocHitInfoIterator that 
represents the root of a // query tree. @@ -103,7 +102,6 @@ class QueryProcessor { const Normalizer& normalizer_; const DocumentStore& document_store_; const SchemaStore& schema_store_; - const Clock& clock_; }; } // namespace lib diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc index eb8b7a4..bdd40aa 100644 --- a/icing/query/query-processor_benchmark.cc +++ b/icing/query/query-processor_benchmark.cc @@ -147,7 +147,7 @@ void BM_QueryOneTerm(benchmark::State& state) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index.get(), language_segmenter.get(), normalizer.get(), document_store.get(), - schema_store.get(), &clock)); + schema_store.get())); SearchSpecProto search_spec; search_spec.set_query(input_string); @@ -278,7 +278,7 @@ void BM_QueryFiveTerms(benchmark::State& state) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index.get(), language_segmenter.get(), normalizer.get(), document_store.get(), - schema_store.get(), &clock)); + schema_store.get())); const std::string query_string = absl_ports::StrCat( input_string_a, " ", input_string_b, " ", input_string_c, " ", @@ -402,7 +402,7 @@ void BM_QueryDiacriticTerm(benchmark::State& state) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index.get(), language_segmenter.get(), normalizer.get(), document_store.get(), - schema_store.get(), &clock)); + schema_store.get())); SearchSpecProto search_spec; search_spec.set_query(input_string); @@ -522,7 +522,7 @@ void BM_QueryHiragana(benchmark::State& state) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index.get(), language_segmenter.get(), normalizer.get(), document_store.get(), - schema_store.get(), &clock)); + schema_store.get())); SearchSpecProto search_spec; search_spec.set_query(input_string); diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc index 6ec0a2a..daeb479 100644 --- 
a/icing/query/query-processor_test.cc +++ b/icing/query/query-processor_test.cc @@ -29,9 +29,11 @@ #include "icing/index/iterator/doc-hit-info-iterator-test-util.h" #include "icing/index/iterator/doc-hit-info-iterator.h" #include "icing/legacy/index/icing-filesystem.h" +#include "icing/portable/platform.h" #include "icing/proto/schema.pb.h" #include "icing/proto/search.pb.h" #include "icing/proto/term.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" @@ -39,7 +41,6 @@ #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/jni-test-helpers.h" -#include "icing/testing/platform.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -60,30 +61,16 @@ using ::testing::SizeIs; using ::testing::Test; using ::testing::UnorderedElementsAre; -SchemaTypeConfigProto* AddSchemaType(SchemaProto* schema, - std::string schema_type) { - SchemaTypeConfigProto* type_config = schema->add_types(); - type_config->set_schema_type(schema_type); - return type_config; -} +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; -void AddIndexedProperty(SchemaTypeConfigProto* type_config, std::string name) { - PropertyConfigProto* property_config = type_config->add_properties(); - property_config->set_property_name(name); - property_config->set_data_type(PropertyConfigProto::DataType::STRING); - property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property_config->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - property_config->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); -} +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; 
-void AddUnindexedProperty(SchemaTypeConfigProto* type_config, - std::string name) { - PropertyConfigProto* property_config = type_config->add_properties(); - property_config->set_property_name(name); - property_config->set_data_type(PropertyConfigProto::DataType::STRING); -} +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; class QueryProcessorTest : public Test { protected: @@ -159,37 +146,33 @@ TEST_F(QueryProcessorTest, CreationWithNullPointerShouldFail) { EXPECT_THAT( QueryProcessor::Create(/*index=*/nullptr, language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_), + schema_store_.get()), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT( QueryProcessor::Create(index_.get(), /*language_segmenter=*/nullptr, normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_), + schema_store_.get()), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); EXPECT_THAT( QueryProcessor::Create(index_.get(), language_segmenter_.get(), /*normalizer=*/nullptr, document_store_.get(), - schema_store_.get(), &fake_clock_), + schema_store_.get()), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); - EXPECT_THAT( - QueryProcessor::Create(index_.get(), language_segmenter_.get(), - normalizer_.get(), /*document_store=*/nullptr, - schema_store_.get(), &fake_clock_), - StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); - EXPECT_THAT(QueryProcessor::Create(index_.get(), language_segmenter_.get(), - normalizer_.get(), document_store_.get(), - /*schema_store=*/nullptr, &fake_clock_), + EXPECT_THAT(QueryProcessor::Create( + index_.get(), language_segmenter_.get(), normalizer_.get(), + /*document_store=*/nullptr, schema_store_.get()), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); 
EXPECT_THAT(QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), /*clock=*/nullptr), + /*schema_store=*/nullptr), StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } TEST_F(QueryProcessorTest, EmptyGroupMatchAllDocuments) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -221,7 +204,7 @@ TEST_F(QueryProcessorTest, EmptyGroupMatchAllDocuments) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("()"); @@ -238,8 +221,9 @@ TEST_F(QueryProcessorTest, EmptyGroupMatchAllDocuments) { TEST_F(QueryProcessorTest, EmptyQueryMatchAllDocuments) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -271,7 +255,7 @@ TEST_F(QueryProcessorTest, EmptyQueryMatchAllDocuments) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query(""); @@ -288,8 +272,9 @@ TEST_F(QueryProcessorTest, EmptyQueryMatchAllDocuments) { TEST_F(QueryProcessorTest, QueryTermNormalized) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); 
ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -330,7 +315,7 @@ TEST_F(QueryProcessorTest, QueryTermNormalized) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("hElLo WORLD"); @@ -363,8 +348,9 @@ TEST_F(QueryProcessorTest, QueryTermNormalized) { TEST_F(QueryProcessorTest, OneTermPrefixMatch) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -402,7 +388,7 @@ TEST_F(QueryProcessorTest, OneTermPrefixMatch) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("he"); @@ -430,8 +416,9 @@ TEST_F(QueryProcessorTest, OneTermPrefixMatch) { TEST_F(QueryProcessorTest, OneTermExactMatch) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -469,7 +456,7 @@ TEST_F(QueryProcessorTest, OneTermExactMatch) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("hello"); @@ -497,8 +484,9 @@ TEST_F(QueryProcessorTest, OneTermExactMatch) { TEST_F(QueryProcessorTest, AndSameTermExactMatch) { // Create the schema and document store - SchemaProto 
schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -536,7 +524,7 @@ TEST_F(QueryProcessorTest, AndSameTermExactMatch) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("hello hello"); @@ -566,8 +554,9 @@ TEST_F(QueryProcessorTest, AndSameTermExactMatch) { TEST_F(QueryProcessorTest, AndTwoTermExactMatch) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -608,7 +597,7 @@ TEST_F(QueryProcessorTest, AndTwoTermExactMatch) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("hello world"); @@ -640,8 +629,9 @@ TEST_F(QueryProcessorTest, AndTwoTermExactMatch) { TEST_F(QueryProcessorTest, AndSameTermPrefixMatch) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -679,7 +669,7 @@ TEST_F(QueryProcessorTest, AndSameTermPrefixMatch) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("he he"); 
@@ -709,8 +699,9 @@ TEST_F(QueryProcessorTest, AndSameTermPrefixMatch) { TEST_F(QueryProcessorTest, AndTwoTermPrefixMatch) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -751,7 +742,7 @@ TEST_F(QueryProcessorTest, AndTwoTermPrefixMatch) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("he wo"); @@ -784,8 +775,9 @@ TEST_F(QueryProcessorTest, AndTwoTermPrefixMatch) { TEST_F(QueryProcessorTest, AndTwoTermPrefixAndExactMatch) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -826,7 +818,7 @@ TEST_F(QueryProcessorTest, AndTwoTermPrefixAndExactMatch) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("hello wo"); @@ -859,8 +851,9 @@ TEST_F(QueryProcessorTest, AndTwoTermPrefixAndExactMatch) { TEST_F(QueryProcessorTest, OrTwoTermExactMatch) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -906,7 +899,7 @@ TEST_F(QueryProcessorTest, OrTwoTermExactMatch) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), 
language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("hello OR world"); @@ -947,8 +940,9 @@ TEST_F(QueryProcessorTest, OrTwoTermExactMatch) { TEST_F(QueryProcessorTest, OrTwoTermPrefixMatch) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -994,7 +988,7 @@ TEST_F(QueryProcessorTest, OrTwoTermPrefixMatch) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("he OR wo"); @@ -1034,8 +1028,9 @@ TEST_F(QueryProcessorTest, OrTwoTermPrefixMatch) { TEST_F(QueryProcessorTest, OrTwoTermPrefixAndExactMatch) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -1080,7 +1075,7 @@ TEST_F(QueryProcessorTest, OrTwoTermPrefixAndExactMatch) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("hello OR wo"); @@ -1120,8 +1115,9 @@ TEST_F(QueryProcessorTest, OrTwoTermPrefixAndExactMatch) { TEST_F(QueryProcessorTest, CombinedAndOrTerms) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + 
.Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -1179,7 +1175,7 @@ TEST_F(QueryProcessorTest, CombinedAndOrTerms) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); { // OR gets precedence over AND, this is parsed as ((puppy OR kitten) AND @@ -1305,8 +1301,9 @@ TEST_F(QueryProcessorTest, CombinedAndOrTerms) { TEST_F(QueryProcessorTest, OneGroup) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -1356,7 +1353,7 @@ TEST_F(QueryProcessorTest, OneGroup) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); // Without grouping, this would be parsed as ((puppy OR kitten) AND foo) and // no documents would match. But with grouping, Document 1 matches puppy @@ -1380,8 +1377,9 @@ TEST_F(QueryProcessorTest, OneGroup) { TEST_F(QueryProcessorTest, TwoGroups) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -1430,7 +1428,7 @@ TEST_F(QueryProcessorTest, TwoGroups) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); // Without grouping, this would be parsed as (puppy AND (dog OR kitten) AND // cat) and wouldn't match any documents. 
But with grouping, Document 1 @@ -1457,8 +1455,9 @@ TEST_F(QueryProcessorTest, TwoGroups) { TEST_F(QueryProcessorTest, ManyLevelNestedGrouping) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -1508,7 +1507,7 @@ TEST_F(QueryProcessorTest, ManyLevelNestedGrouping) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); // Without grouping, this would be parsed as ((puppy OR kitten) AND foo) and // no documents would match. But with grouping, Document 1 matches puppy @@ -1532,8 +1531,9 @@ TEST_F(QueryProcessorTest, ManyLevelNestedGrouping) { TEST_F(QueryProcessorTest, OneLevelNestedGrouping) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -1583,7 +1583,7 @@ TEST_F(QueryProcessorTest, OneLevelNestedGrouping) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); // Document 1 will match puppy and Document 2 matches (kitten AND (cat)) SearchSpecProto search_spec; @@ -1608,8 +1608,9 @@ TEST_F(QueryProcessorTest, OneLevelNestedGrouping) { TEST_F(QueryProcessorTest, ExcludeTerm) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -1652,7 +1653,7 @@ 
TEST_F(QueryProcessorTest, ExcludeTerm) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("-hello"); @@ -1672,8 +1673,9 @@ TEST_F(QueryProcessorTest, ExcludeTerm) { TEST_F(QueryProcessorTest, ExcludeNonexistentTerm) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -1715,7 +1717,7 @@ TEST_F(QueryProcessorTest, ExcludeNonexistentTerm) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("-foo"); @@ -1734,8 +1736,9 @@ TEST_F(QueryProcessorTest, ExcludeNonexistentTerm) { TEST_F(QueryProcessorTest, ExcludeAnd) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -1785,7 +1788,7 @@ TEST_F(QueryProcessorTest, ExcludeAnd) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); { SearchSpecProto search_spec; @@ -1823,8 +1826,9 @@ TEST_F(QueryProcessorTest, ExcludeAnd) { TEST_F(QueryProcessorTest, ExcludeOr) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + 
.AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -1874,7 +1878,7 @@ TEST_F(QueryProcessorTest, ExcludeOr) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); { SearchSpecProto search_spec; @@ -1918,8 +1922,9 @@ TEST_F(QueryProcessorTest, ExcludeOr) { TEST_F(QueryProcessorTest, DeletedFilter) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -1970,7 +1975,7 @@ TEST_F(QueryProcessorTest, DeletedFilter) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("animal"); @@ -1991,8 +1996,9 @@ TEST_F(QueryProcessorTest, DeletedFilter) { TEST_F(QueryProcessorTest, NamespaceFilter) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -2042,7 +2048,7 @@ TEST_F(QueryProcessorTest, NamespaceFilter) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("animal"); @@ -2064,9 +2070,11 @@ TEST_F(QueryProcessorTest, NamespaceFilter) { TEST_F(QueryProcessorTest, SchemaTypeFilter) { // Create the schema and document store - SchemaProto 
schema; - AddSchemaType(&schema, "email"); - AddSchemaType(&schema, "message"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -2112,7 +2120,7 @@ TEST_F(QueryProcessorTest, SchemaTypeFilter) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("animal"); @@ -2134,11 +2142,15 @@ TEST_F(QueryProcessorTest, SchemaTypeFilter) { TEST_F(QueryProcessorTest, SectionFilterForOneDocument) { // Create the schema and document store - SchemaProto schema; - SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email"); - + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); // First and only indexed property, so it gets a section_id of 0 - AddIndexedProperty(email_type, "subject"); int subject_section_id = 0; ICING_ASSERT_OK_AND_ASSIGN( @@ -2174,7 +2186,7 @@ TEST_F(QueryProcessorTest, SectionFilterForOneDocument) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; // Create a section filter '<section name>:<query term>' @@ -2196,18 +2208,31 @@ TEST_F(QueryProcessorTest, SectionFilterForOneDocument) { TEST_F(QueryProcessorTest, SectionFilterAcrossSchemaTypes) { // Create the schema and document store - SchemaProto schema; - SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email"); - // 
SectionIds are assigned in ascending order per schema type, - // alphabetically. - AddIndexedProperty(email_type, "a"); // Section "a" would get sectionId 0 - AddIndexedProperty(email_type, "foo"); - int email_foo_section_id = 1; + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + // Section "a" would get sectionId 0 + .AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); - SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message"); // SectionIds are assigned in ascending order per schema type, // alphabetically. - AddIndexedProperty(message_type, "foo"); + int email_foo_section_id = 1; int message_foo_section_id = 0; ICING_ASSERT_OK_AND_ASSIGN( @@ -2253,7 +2278,7 @@ TEST_F(QueryProcessorTest, SectionFilterAcrossSchemaTypes) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; // Create a section filter '<section name>:<query term>' @@ -2277,18 +2302,20 @@ TEST_F(QueryProcessorTest, SectionFilterAcrossSchemaTypes) { } TEST_F(QueryProcessorTest, SectionFilterWithinSchemaType) { - // Create the schema and document store - SchemaProto schema; - SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email"); - // SectionIds are assigned in ascending order per schema type, - // alphabetically. 
- AddIndexedProperty(email_type, "foo"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); int email_foo_section_id = 0; - - SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message"); - // SectionIds are assigned in ascending order per schema type, - // alphabetically. - AddIndexedProperty(message_type, "foo"); int message_foo_section_id = 0; ICING_ASSERT_OK_AND_ASSIGN( @@ -2334,7 +2361,7 @@ TEST_F(QueryProcessorTest, SectionFilterWithinSchemaType) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; // Create a section filter '<section name>:<query term>', but only look @@ -2359,17 +2386,20 @@ TEST_F(QueryProcessorTest, SectionFilterWithinSchemaType) { TEST_F(QueryProcessorTest, SectionFilterRespectsDifferentSectionIds) { // Create the schema and document store - SchemaProto schema; - SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email"); - // SectionIds are assigned in ascending order per schema type, - // alphabetically. 
- AddIndexedProperty(email_type, "foo"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty( + PropertyConfigBuilder() + .SetName("bar") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); int email_foo_section_id = 0; - - SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message"); - // SectionIds are assigned in ascending order per schema type, - // alphabetically. - AddIndexedProperty(message_type, "bar"); int message_foo_section_id = 0; ICING_ASSERT_OK_AND_ASSIGN( @@ -2417,7 +2447,7 @@ TEST_F(QueryProcessorTest, SectionFilterRespectsDifferentSectionIds) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; // Create a section filter '<section name>:<query term>', but only look @@ -2441,8 +2471,9 @@ TEST_F(QueryProcessorTest, SectionFilterRespectsDifferentSectionIds) { TEST_F(QueryProcessorTest, NonexistentSectionFilterReturnsEmptyResults) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -2477,7 +2508,7 @@ TEST_F(QueryProcessorTest, NonexistentSectionFilterReturnsEmptyResults) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; // Create a section filter 
'<section name>:<query term>', but only look @@ -2499,9 +2530,17 @@ TEST_F(QueryProcessorTest, NonexistentSectionFilterReturnsEmptyResults) { TEST_F(QueryProcessorTest, UnindexedSectionFilterReturnsEmptyResults) { // Create the schema and document store - SchemaProto schema; - SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email"); - AddUnindexedProperty(email_type, "foo"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + // Add an unindexed property so we generate section + // metadata on it + .AddProperty(PropertyConfigBuilder() + .SetName("foo") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -2536,7 +2575,7 @@ TEST_F(QueryProcessorTest, UnindexedSectionFilterReturnsEmptyResults) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; // Create a section filter '<section name>:<query term>', but only look @@ -2557,17 +2596,20 @@ TEST_F(QueryProcessorTest, UnindexedSectionFilterReturnsEmptyResults) { TEST_F(QueryProcessorTest, SectionFilterTermAndUnrestrictedTerm) { // Create the schema and document store - SchemaProto schema; - SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email"); - // SectionIds are assigned in ascending order per schema type, - // alphabetically. 
- AddIndexedProperty(email_type, "foo"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty( + PropertyConfigBuilder() + .SetName("foo") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); int email_foo_section_id = 0; - - SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message"); - // SectionIds are assigned in ascending order per schema type, - // alphabetically. - AddIndexedProperty(message_type, "foo"); int message_foo_section_id = 0; ICING_ASSERT_OK_AND_ASSIGN( @@ -2615,7 +2657,7 @@ TEST_F(QueryProcessorTest, SectionFilterTermAndUnrestrictedTerm) { std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; // Create a section filter '<section name>:<query term>' @@ -2641,27 +2683,34 @@ TEST_F(QueryProcessorTest, SectionFilterTermAndUnrestrictedTerm) { TEST_F(QueryProcessorTest, DocumentBeforeTtlNotFilteredOut) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); + // Arbitrary value, just has to be less than the document's creation + // timestamp + ttl + FakeClock fake_clock; + fake_clock.SetSystemTimeMilliseconds(50); + ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, store_dir_, 
&fake_clock_, + DocumentStore::Create(&filesystem_, store_dir_, &fake_clock, schema_store_.get())); document_store_ = std::move(create_result.document_store); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, - document_store_->Put(DocumentBuilder() - .SetKey("namespace", "1") - .SetSchema("email") - .SetCreationTimestampMs(0) - .SetTtlMs(100) - .Build())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "1") + .SetSchema("email") + .SetCreationTimestampMs(10) + .SetTtlMs(100) + .Build())); // Populate the index int section_id = 0; @@ -2671,17 +2720,12 @@ TEST_F(QueryProcessorTest, DocumentBeforeTtlNotFilteredOut) { AddTokenToIndex(document_id, section_id, term_match_type, "hello"), IsOk()); - // Arbitrary value, just has to be less than the document's creation - // timestamp + ttl - FakeClock fake_clock; - fake_clock.SetSystemTimeMilliseconds(50); - // Perform query ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock_)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("hello"); @@ -2698,27 +2742,34 @@ TEST_F(QueryProcessorTest, DocumentBeforeTtlNotFilteredOut) { TEST_F(QueryProcessorTest, DocumentPastTtlFilteredOut) { // Create the schema and document store - SchemaProto schema; - AddSchemaType(&schema, "email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); + // Arbitrary value, just has to be greater than the document's creation + // timestamp + ttl + FakeClock fake_clock; + fake_clock.SetSystemTimeMilliseconds(200); + ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult 
create_result, - DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_, + DocumentStore::Create(&filesystem_, store_dir_, &fake_clock, schema_store_.get())); document_store_ = std::move(create_result.document_store); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, - document_store_->Put(DocumentBuilder() - .SetKey("namespace", "1") - .SetSchema("email") - .SetCreationTimestampMs(0) - .SetTtlMs(100) - .Build())); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentId document_id, + document_store_->Put(DocumentBuilder() + .SetKey("namespace", "1") + .SetSchema("email") + .SetCreationTimestampMs(50) + .SetTtlMs(100) + .Build())); // Populate the index int section_id = 0; @@ -2728,17 +2779,12 @@ TEST_F(QueryProcessorTest, DocumentPastTtlFilteredOut) { AddTokenToIndex(document_id, section_id, term_match_type, "hello"), IsOk()); - // Arbitrary value, just has to be greater than the document's creation - // timestamp + ttl - FakeClock fake_clock; - fake_clock.SetSystemTimeMilliseconds(200); - // Perform query ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<QueryProcessor> query_processor, QueryProcessor::Create(index_.get(), language_segmenter_.get(), normalizer_.get(), document_store_.get(), - schema_store_.get(), &fake_clock)); + schema_store_.get())); SearchSpecProto search_spec; search_spec.set_query("hello"); diff --git a/icing/result/result-retriever.cc b/icing/result/result-retriever.cc index 85e78a8..943350c 100644 --- a/icing/result/result-retriever.cc +++ b/icing/result/result-retriever.cc @@ -107,6 +107,7 @@ ResultRetriever::RetrieveResults( // Add the document, itself. 
*result.mutable_document() = std::move(document); + result.set_score(scored_document_hit.score()); search_results.push_back(std::move(result)); } return search_results; diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc index 7cb2d62..1c9684d 100644 --- a/icing/result/result-retriever_test.cc +++ b/icing/result/result-retriever_test.cc @@ -24,17 +24,18 @@ #include "icing/file/mock-filesystem.h" #include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" +#include "icing/portable/platform.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/search.pb.h" #include "icing/proto/term.pb.h" #include "icing/result/projection-tree.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" #include "icing/store/document-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" -#include "icing/testing/platform.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" @@ -54,6 +55,15 @@ using ::testing::IsEmpty; using ::testing::Return; using ::testing::SizeIs; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; + +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; + class ResultRetrieverTest : public testing::Test { protected: ResultRetrieverTest() : test_dir_(GetTestTempDir() + "/icing") { @@ -78,65 +88,47 @@ class ResultRetrieverTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( /*max_term_byte_size=*/10000)); - ASSERT_THAT(schema_store_->SetSchema(CreatePersonAndEmailSchema()), IsOk()); + 
SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("Email") + .AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_PREFIX, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, + TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument( + "Person", /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType( + SchemaTypeConfigBuilder() + .SetType("Person") + .AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("emailAddress") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + ASSERT_THAT(schema_store_->SetSchema(schema), IsOk()); } void TearDown() override { filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); } - SchemaProto CreatePersonAndEmailSchema() { - SchemaProto schema; - - auto* type = schema.add_types(); - type->set_schema_type("Email"); - - auto* subj = type->add_properties(); - subj->set_property_name("name"); - subj->set_data_type(PropertyConfigProto::DataType::STRING); - subj->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - subj->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - subj->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - auto* body = type->add_properties(); - body->set_property_name("body"); - body->set_data_type(PropertyConfigProto::DataType::STRING); - body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - body->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - 
body->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - auto* sender = type->add_properties(); - sender->set_property_name("sender"); - sender->set_schema_type("Person"); - sender->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - sender->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - sender->mutable_document_indexing_config()->set_index_nested_properties( - true); - - auto* person_type = schema.add_types(); - person_type->set_schema_type("Person"); - auto* name = person_type->add_properties(); - name->set_property_name("name"); - name->set_data_type(PropertyConfigProto::DataType::STRING); - name->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - name->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - name->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - auto* address = person_type->add_properties(); - address->set_property_name("emailAddress"); - address->set_data_type(PropertyConfigProto::DataType::STRING); - address->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - address->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - address->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - - return schema; - } - SectionId GetSectionId(const std::string& type, const std::string& property) { auto type_id_or = schema_store_->GetSchemaTypeId(type); if (!type_id_or.ok()) { @@ -236,9 +228,9 @@ TEST_F(ResultRetrieverTest, ShouldRetrieveSimpleResults) { GetSectionId("Email", "body")}; SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids); std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, hit_section_id_mask, /*score=*/0}, - {document_id2, hit_section_id_mask, /*score=*/0}, - {document_id3, hit_section_id_mask, /*score=*/0}}; + {document_id1, hit_section_id_mask, 
/*score=*/19}, + {document_id2, hit_section_id_mask, /*score=*/5}, + {document_id3, hit_section_id_mask, /*score=*/1}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, ResultRetriever::Create(doc_store.get(), schema_store_.get(), @@ -246,10 +238,13 @@ TEST_F(ResultRetrieverTest, ShouldRetrieveSimpleResults) { SearchResultProto::ResultProto result1; *result1.mutable_document() = CreateDocument(/*id=*/1); + result1.set_score(19); SearchResultProto::ResultProto result2; *result2.mutable_document() = CreateDocument(/*id=*/2); + result2.set_score(5); SearchResultProto::ResultProto result3; *result3.mutable_document() = CreateDocument(/*id=*/3); + result3.set_score(1); SnippetContext snippet_context( /*query_terms_in=*/{}, @@ -285,8 +280,8 @@ TEST_F(ResultRetrieverTest, IgnoreErrors) { GetSectionId("Email", "body")}; SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids); std::vector<ScoredDocumentHit> scored_document_hits = { - {document_id1, hit_section_id_mask, /*score=*/0}, - {document_id2, hit_section_id_mask, /*score=*/0}, + {document_id1, hit_section_id_mask, /*score=*/12}, + {document_id2, hit_section_id_mask, /*score=*/4}, {invalid_document_id, hit_section_id_mask, /*score=*/0}}; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<ResultRetriever> result_retriever, @@ -296,8 +291,10 @@ TEST_F(ResultRetrieverTest, IgnoreErrors) { SearchResultProto::ResultProto result1; *result1.mutable_document() = CreateDocument(/*id=*/1); + result1.set_score(12); SearchResultProto::ResultProto result2; *result2.mutable_document() = CreateDocument(/*id=*/2); + result2.set_score(4); SnippetContext snippet_context( /*query_terms_in=*/{}, @@ -495,35 +492,63 @@ TEST_F(ResultRetrieverTest, SimpleSnippeted) { std::vector<SearchResultProto::ResultProto> result, result_retriever->RetrieveResults(page_result_state)); EXPECT_THAT(result, SizeIs(3)); - EXPECT_THAT(result[0].document(), EqualsProto(CreateDocument(/*id=*/1))); - 
EXPECT_THAT(GetWindow(result[0].document(), result[0].snippet(), "name", 0), - Eq("subject foo 1")); - EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "name", 0), - Eq("foo")); - EXPECT_THAT(GetWindow(result[0].document(), result[0].snippet(), "body", 0), - Eq("body bar 1")); - EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "body", 0), - Eq("bar")); - EXPECT_THAT(result[1].document(), EqualsProto(CreateDocument(/*id=*/2))); - EXPECT_THAT(GetWindow(result[1].document(), result[1].snippet(), "name", 0), - Eq("subject foo 2")); - EXPECT_THAT(GetMatch(result[1].document(), result[1].snippet(), "name", 0), - Eq("foo")); - EXPECT_THAT(GetWindow(result[1].document(), result[1].snippet(), "body", 0), - Eq("body bar 2")); - EXPECT_THAT(GetMatch(result[1].document(), result[1].snippet(), "body", 0), - Eq("bar")); - - EXPECT_THAT(result[2].document(), EqualsProto(CreateDocument(/*id=*/3))); - EXPECT_THAT(GetWindow(result[2].document(), result[2].snippet(), "name", 0), - Eq("subject foo 3")); - EXPECT_THAT(GetMatch(result[2].document(), result[2].snippet(), "name", 0), - Eq("foo")); - EXPECT_THAT(GetWindow(result[2].document(), result[2].snippet(), "body", 0), - Eq("body bar 3")); - EXPECT_THAT(GetMatch(result[2].document(), result[2].snippet(), "body", 0), - Eq("bar")); + const DocumentProto& result_document_one = result.at(0).document(); + const SnippetProto& result_snippet_one = result.at(0).snippet(); + EXPECT_THAT(result_document_one, EqualsProto(CreateDocument(/*id=*/1))); + EXPECT_THAT(result_snippet_one.entries(), SizeIs(2)); + EXPECT_THAT(result_snippet_one.entries(0).property_name(), Eq("body")); + std::string_view content = GetString( + &result_document_one, result_snippet_one.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, result_snippet_one.entries(0)), + ElementsAre("body bar 1")); + EXPECT_THAT(GetMatches(content, result_snippet_one.entries(0)), + ElementsAre("bar")); + 
EXPECT_THAT(result_snippet_one.entries(1).property_name(), Eq("name")); + content = GetString(&result_document_one, + result_snippet_one.entries(1).property_name()); + EXPECT_THAT(GetWindows(content, result_snippet_one.entries(1)), + ElementsAre("subject foo 1")); + EXPECT_THAT(GetMatches(content, result_snippet_one.entries(1)), + ElementsAre("foo")); + + const DocumentProto& result_document_two = result.at(1).document(); + const SnippetProto& result_snippet_two = result.at(1).snippet(); + EXPECT_THAT(result_document_two, EqualsProto(CreateDocument(/*id=*/2))); + EXPECT_THAT(result_snippet_two.entries(), SizeIs(2)); + EXPECT_THAT(result_snippet_two.entries(0).property_name(), Eq("body")); + content = GetString(&result_document_two, + result_snippet_two.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, result_snippet_two.entries(0)), + ElementsAre("body bar 2")); + EXPECT_THAT(GetMatches(content, result_snippet_two.entries(0)), + ElementsAre("bar")); + EXPECT_THAT(result_snippet_two.entries(1).property_name(), Eq("name")); + content = GetString(&result_document_two, + result_snippet_two.entries(1).property_name()); + EXPECT_THAT(GetWindows(content, result_snippet_two.entries(1)), + ElementsAre("subject foo 2")); + EXPECT_THAT(GetMatches(content, result_snippet_two.entries(1)), + ElementsAre("foo")); + + const DocumentProto& result_document_three = result.at(2).document(); + const SnippetProto& result_snippet_three = result.at(2).snippet(); + EXPECT_THAT(result_document_three, EqualsProto(CreateDocument(/*id=*/3))); + EXPECT_THAT(result_snippet_three.entries(), SizeIs(2)); + EXPECT_THAT(result_snippet_three.entries(0).property_name(), Eq("body")); + content = GetString(&result_document_three, + result_snippet_three.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, result_snippet_three.entries(0)), + ElementsAre("body bar 3")); + EXPECT_THAT(GetMatches(content, result_snippet_three.entries(0)), + ElementsAre("bar")); + 
EXPECT_THAT(result_snippet_three.entries(1).property_name(), Eq("name")); + content = GetString(&result_document_three, + result_snippet_three.entries(1).property_name()); + EXPECT_THAT(GetWindows(content, result_snippet_three.entries(1)), + ElementsAre("subject foo 3")); + EXPECT_THAT(GetMatches(content, result_snippet_three.entries(1)), + ElementsAre("foo")); } TEST_F(ResultRetrieverTest, OnlyOneDocumentSnippeted) { @@ -568,15 +593,25 @@ TEST_F(ResultRetrieverTest, OnlyOneDocumentSnippeted) { std::vector<SearchResultProto::ResultProto> result, result_retriever->RetrieveResults(page_result_state)); EXPECT_THAT(result, SizeIs(3)); - EXPECT_THAT(result[0].document(), EqualsProto(CreateDocument(/*id=*/1))); - EXPECT_THAT(GetWindow(result[0].document(), result[0].snippet(), "name", 0), - Eq("subject foo 1")); - EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "name", 0), - Eq("foo")); - EXPECT_THAT(GetWindow(result[0].document(), result[0].snippet(), "body", 0), - Eq("body bar 1")); - EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "body", 0), - Eq("bar")); + + const DocumentProto& result_document = result.at(0).document(); + const SnippetProto& result_snippet = result.at(0).snippet(); + EXPECT_THAT(result_document, EqualsProto(CreateDocument(/*id=*/1))); + EXPECT_THAT(result_snippet.entries(), SizeIs(2)); + EXPECT_THAT(result_snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&result_document, result_snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, result_snippet.entries(0)), + ElementsAre("body bar 1")); + EXPECT_THAT(GetMatches(content, result_snippet.entries(0)), + ElementsAre("bar")); + EXPECT_THAT(result_snippet.entries(1).property_name(), Eq("name")); + content = + GetString(&result_document, result_snippet.entries(1).property_name()); + EXPECT_THAT(GetWindows(content, result_snippet.entries(1)), + ElementsAre("subject foo 1")); + EXPECT_THAT(GetMatches(content, 
result_snippet.entries(1)), + ElementsAre("foo")); EXPECT_THAT(result[1].document(), EqualsProto(CreateDocument(/*id=*/2))); EXPECT_THAT(result[1].snippet(), diff --git a/icing/result/result-state-manager.cc b/icing/result/result-state-manager.cc index 0f27d9e..d606e79 100644 --- a/icing/result/result-state-manager.cc +++ b/icing/result/result-state-manager.cc @@ -16,15 +16,17 @@ #include "icing/proto/search.pb.h" #include "icing/util/clock.h" +#include "icing/util/logging.h" #include "icing/util/status-macros.h" namespace icing { namespace lib { -ResultStateManager::ResultStateManager(int max_hits_per_query, - int max_result_states) - : max_hits_per_query_(max_hits_per_query), - max_result_states_(max_result_states), +ResultStateManager::ResultStateManager(int max_total_hits, + const DocumentStore& document_store) + : document_store_(document_store), + max_total_hits_(max_total_hits), + num_total_hits_(0), random_generator_(GetSteadyTimeNanoseconds()) {} libtextclassifier3::StatusOr<PageResultState> @@ -33,16 +35,13 @@ ResultStateManager::RankAndPaginate(ResultState result_state) { return absl_ports::InvalidArgumentError("ResultState has no results"); } - // Truncates scored document hits so that they don't take up too much space. - result_state.TruncateHitsTo(max_hits_per_query_); - // Gets the number before calling GetNextPage() because num_returned() may // change after returning more results. 
int num_previously_returned = result_state.num_returned(); int num_per_page = result_state.num_per_page(); std::vector<ScoredDocumentHit> page_result_document_hits = - result_state.GetNextPage(); + result_state.GetNextPage(document_store_); SnippetContext snippet_context_copy = result_state.snippet_context(); @@ -68,10 +67,12 @@ ResultStateManager::RankAndPaginate(ResultState result_state) { } uint64_t ResultStateManager::Add(ResultState result_state) { - RemoveStatesIfNeeded(); + RemoveStatesIfNeeded(result_state); + result_state.TruncateHitsTo(max_total_hits_); uint64_t new_token = GetUniqueToken(); + num_total_hits_ += result_state.num_remaining(); result_state_map_.emplace(new_token, std::move(result_state)); // Tracks the insertion order token_queue_.push(new_token); @@ -91,7 +92,7 @@ libtextclassifier3::StatusOr<PageResultState> ResultStateManager::GetNextPage( int num_returned = state_iterator->second.num_returned(); int num_per_page = state_iterator->second.num_per_page(); std::vector<ScoredDocumentHit> result_of_page = - state_iterator->second.GetNextPage(); + state_iterator->second.GetNextPage(document_store_); if (result_of_page.empty()) { // This shouldn't happen, all our active states should contain results, but // a sanity check here in case of any data inconsistency. 
@@ -112,6 +113,7 @@ libtextclassifier3::StatusOr<PageResultState> ResultStateManager::GetNextPage( next_page_token = kInvalidNextPageToken; } + num_total_hits_ -= result_of_page.size(); return PageResultState( result_of_page, next_page_token, std::move(snippet_context_copy), std::move(projection_tree_map_copy), num_returned, num_per_page); @@ -129,10 +131,14 @@ void ResultStateManager::InvalidateResultState(uint64_t next_page_token) { void ResultStateManager::InvalidateAllResultStates() { absl_ports::unique_lock l(&mutex_); + InternalInvalidateAllResultStates(); +} +void ResultStateManager::InternalInvalidateAllResultStates() { result_state_map_.clear(); invalidated_token_set_.clear(); - token_queue_ = {}; + token_queue_ = std::queue<uint64_t>(); + num_total_hits_ = 0; } uint64_t ResultStateManager::GetUniqueToken() { @@ -148,12 +154,21 @@ uint64_t ResultStateManager::GetUniqueToken() { return new_token; } -void ResultStateManager::RemoveStatesIfNeeded() { +void ResultStateManager::RemoveStatesIfNeeded(const ResultState& result_state) { if (result_state_map_.empty() || token_queue_.empty()) { return; } - // Removes any tokens that were previously invalidated. + // 1. Check if this new result_state would take up the entire result state + // manager budget. + if (result_state.num_remaining() > max_total_hits_) { + // This single result state will exceed our budget. Drop everything else to + // accomodate it. + InternalInvalidateAllResultStates(); + return; + } + + // 2. Remove any tokens that were previously invalidated. while (!token_queue_.empty() && invalidated_token_set_.find(token_queue_.front()) != invalidated_token_set_.end()) { @@ -161,11 +176,13 @@ void ResultStateManager::RemoveStatesIfNeeded() { token_queue_.pop(); } - // Removes the oldest state - if (result_state_map_.size() >= max_result_states_ && !token_queue_.empty()) { - result_state_map_.erase(token_queue_.front()); + // 3. 
If we're over budget, remove states from oldest to newest until we fit + // into our budget. + while (result_state.num_remaining() + num_total_hits_ > max_total_hits_) { + InternalInvalidateResultState(token_queue_.front()); token_queue_.pop(); } + invalidated_token_set_.clear(); } void ResultStateManager::InternalInvalidateResultState(uint64_t token) { @@ -173,7 +190,10 @@ void ResultStateManager::InternalInvalidateResultState(uint64_t token) { // invalidated_token_set_. The entry in token_queue_ can't be easily removed // right now (may need O(n) time), so we leave it there and later completely // remove the token in RemoveStatesIfNeeded(). - if (result_state_map_.erase(token) > 0) { + auto itr = result_state_map_.find(token); + if (itr != result_state_map_.end()) { + num_total_hits_ -= itr->second.num_remaining(); + result_state_map_.erase(itr); invalidated_token_set_.insert(token); } } diff --git a/icing/result/result-state-manager.h b/icing/result/result-state-manager.h index eaf9eb5..c04217f 100644 --- a/icing/result/result-state-manager.h +++ b/icing/result/result-state-manager.h @@ -37,7 +37,8 @@ inline constexpr uint64_t kInvalidNextPageToken = 0; // Used to store and manage ResultState. class ResultStateManager { public: - explicit ResultStateManager(int max_hits_per_query, int max_result_states); + explicit ResultStateManager(int max_total_hits, + const DocumentStore& document_store); ResultStateManager(const ResultStateManager&) = delete; ResultStateManager& operator=(const ResultStateManager&) = delete; @@ -77,13 +78,17 @@ class ResultStateManager { private: absl_ports::shared_mutex mutex_; - // The maximum number of scored document hits to return for a query. When we - // have more than the maximum number, extra hits will be truncated. - const int max_hits_per_query_; + const DocumentStore& document_store_; - // The maximum number of result states. When we have more than the maximum - // number, the oldest / firstly added result state will be removed. 
- const int max_result_states_; + // The maximum number of scored document hits that all result states may + // have. When a new result state is added such that num_total_hits_ would + // exceed max_total_hits_, the oldest result states are evicted until + // num_total_hits_ is below max_total_hits. + const int max_total_hits_; + + // The number of scored document hits that all result states currently held by + // the result state manager have. + int num_total_hits_; // A hash map of (next-page token -> result state) std::unordered_map<uint64_t, ResultState> result_state_map_ @@ -112,13 +117,21 @@ class ResultStateManager { uint64_t GetUniqueToken() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Helper method to remove old states to make room for incoming states. - void RemoveStatesIfNeeded() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + void RemoveStatesIfNeeded(const ResultState& result_state) + ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Helper method to remove a result state from result_state_map_, the token // will then be temporarily kept in invalidated_token_set_ until it's finally // removed from token_queue_. void InternalInvalidateResultState(uint64_t token) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); + + // Internal method to invalidates all result states / tokens currently in + // ResultStateManager. We need this separate method so that other public + // methods don't need to call InvalidateAllResultStates(). Public methods + // calling each other may cause deadlock issues. 
+ void InternalInvalidateAllResultStates() + ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_); }; } // namespace lib diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc index 6defa6f..32e45aa 100644 --- a/icing/result/result-state-manager_test.cc +++ b/icing/result/result-state-manager_test.cc @@ -14,9 +14,15 @@ #include "icing/result/result-state-manager.h" +#include "gmock/gmock.h" #include "gtest/gtest.h" +#include "icing/file/filesystem.h" #include "icing/portable/equals-proto.h" +#include "icing/schema/schema-store.h" +#include "icing/store/document-store.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" +#include "icing/util/clock.h" namespace icing { namespace lib { @@ -27,10 +33,6 @@ using ::testing::Eq; using ::testing::Gt; using ::testing::IsEmpty; -ScoredDocumentHit CreateScoredDocumentHit(DocumentId document_id) { - return ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1); -} - ScoringSpecProto CreateScoringSpec() { ScoringSpecProto scoring_spec; scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE); @@ -43,24 +45,73 @@ ResultSpecProto CreateResultSpec(int num_per_page) { return result_spec; } -ResultState CreateResultState( - const std::vector<ScoredDocumentHit>& scored_document_hits, - int num_per_page) { - return ResultState(scored_document_hits, /*query_terms=*/{}, - SearchSpecProto::default_instance(), CreateScoringSpec(), - CreateResultSpec(num_per_page)); +ScoredDocumentHit CreateScoredHit(DocumentId document_id) { + return ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1); } -TEST(ResultStateManagerTest, ShouldRankAndPaginateOnePage) { +class ResultStateManagerTest : public testing::Test { + protected: + void SetUp() override { + schema_store_base_dir_ = GetTestTempDir() + "/schema_store"; + filesystem_.CreateDirectoryRecursively(schema_store_base_dir_.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + schema_store_, + 
SchemaStore::Create(&filesystem_, schema_store_base_dir_, &clock_)); + SchemaProto schema; + schema.add_types()->set_schema_type("Document"); + ICING_ASSERT_OK(schema_store_->SetSchema(std::move(schema))); + + doc_store_base_dir_ = GetTestTempDir() + "/document_store"; + filesystem_.CreateDirectoryRecursively(doc_store_base_dir_.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult result, + DocumentStore::Create(&filesystem_, doc_store_base_dir_, &clock_, + schema_store_.get())); + document_store_ = std::move(result.document_store); + } + + void TearDown() override { + filesystem_.DeleteDirectoryRecursively(doc_store_base_dir_.c_str()); + filesystem_.DeleteDirectoryRecursively(schema_store_base_dir_.c_str()); + } + + ResultState CreateResultState( + const std::vector<ScoredDocumentHit>& scored_document_hits, + int num_per_page) { + return ResultState(scored_document_hits, /*query_terms=*/{}, + SearchSpecProto::default_instance(), CreateScoringSpec(), + CreateResultSpec(num_per_page), *document_store_); + } + + ScoredDocumentHit AddScoredDocument(DocumentId document_id) { + DocumentProto document; + document.set_namespace_("namespace"); + document.set_uri(std::to_string(document_id)); + document.set_schema("Document"); + document_store_->Put(std::move(document)); + return ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1); + } + + const DocumentStore& document_store() const { return *document_store_; } + + private: + Filesystem filesystem_; + std::string doc_store_base_dir_; + std::string schema_store_base_dir_; + Clock clock_; + std::unique_ptr<DocumentStore> document_store_; + std::unique_ptr<SchemaStore> schema_store_; +}; + +TEST_F(ResultStateManagerTest, ShouldRankAndPaginateOnePage) { ResultState original_result_state = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/2), - CreateScoredDocumentHit(/*document_id=*/3)}, + 
CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1), + AddScoredDocument(/*document_id=*/2)}, /*num_per_page=*/10); ResultStateManager result_state_manager( - /*max_hits_per_query=*/std::numeric_limits<int>::max(), - /*max_result_states=*/std::numeric_limits<int>::max()); + /*max_total_hits=*/std::numeric_limits<int>::max(), document_store()); ICING_ASSERT_OK_AND_ASSIGN( PageResultState page_result_state, result_state_manager.RankAndPaginate(std::move(original_result_state))); @@ -70,24 +121,22 @@ TEST(ResultStateManagerTest, ShouldRankAndPaginateOnePage) { // Should get the original scored document hits EXPECT_THAT( page_result_state.scored_document_hits, - ElementsAre( - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)), - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2)), - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/1)))); + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/2)), + EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/1)), + EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/0)))); } -TEST(ResultStateManagerTest, ShouldRankAndPaginateMultiplePages) { +TEST_F(ResultStateManagerTest, ShouldRankAndPaginateMultiplePages) { ResultState original_result_state = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/2), - CreateScoredDocumentHit(/*document_id=*/3), - CreateScoredDocumentHit(/*document_id=*/4), - CreateScoredDocumentHit(/*document_id=*/5)}, + CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1), + AddScoredDocument(/*document_id=*/2), + AddScoredDocument(/*document_id=*/3), + AddScoredDocument(/*document_id=*/4)}, /*num_per_page=*/2); ResultStateManager result_state_manager( - /*max_hits_per_query=*/std::numeric_limits<int>::max(), - /*max_result_states=*/std::numeric_limits<int>::max()); + 
/*max_total_hits=*/std::numeric_limits<int>::max(), document_store()); // First page, 2 results ICING_ASSERT_OK_AND_ASSIGN( @@ -95,9 +144,8 @@ TEST(ResultStateManagerTest, ShouldRankAndPaginateMultiplePages) { result_state_manager.RankAndPaginate(std::move(original_result_state))); EXPECT_THAT( page_result_state1.scored_document_hits, - ElementsAre( - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)), - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4)))); + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/4)), + EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/3)))); uint64_t next_page_token = page_result_state1.next_page_token; @@ -106,48 +154,45 @@ TEST(ResultStateManagerTest, ShouldRankAndPaginateMultiplePages) { result_state_manager.GetNextPage(next_page_token)); EXPECT_THAT( page_result_state2.scored_document_hits, - ElementsAre( - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)), - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2)))); + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/2)), + EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/1)))); // Third page, 1 result ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state3, result_state_manager.GetNextPage(next_page_token)); - EXPECT_THAT(page_result_state3.scored_document_hits, - ElementsAre(EqualsScoredDocumentHit( - CreateScoredDocumentHit(/*document_id=*/1)))); + EXPECT_THAT( + page_result_state3.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/0)))); // No results EXPECT_THAT(result_state_manager.GetNextPage(next_page_token), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST(ResultStateManagerTest, EmptyStateShouldReturnError) { +TEST_F(ResultStateManagerTest, EmptyStateShouldReturnError) { ResultState empty_result_state = CreateResultState({}, /*num_per_page=*/1); ResultStateManager result_state_manager( - 
/*max_hits_per_query=*/std::numeric_limits<int>::max(), - /*max_result_states=*/std::numeric_limits<int>::max()); + /*max_total_hits=*/std::numeric_limits<int>::max(), document_store()); EXPECT_THAT( result_state_manager.RankAndPaginate(std::move(empty_result_state)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST(ResultStateManagerTest, ShouldInvalidateOneToken) { +TEST_F(ResultStateManagerTest, ShouldInvalidateOneToken) { ResultState result_state1 = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/2), - CreateScoredDocumentHit(/*document_id=*/3)}, + CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1), + AddScoredDocument(/*document_id=*/2)}, /*num_per_page=*/1); ResultState result_state2 = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/4), - CreateScoredDocumentHit(/*document_id=*/5), - CreateScoredDocumentHit(/*document_id=*/6)}, + CreateResultState({AddScoredDocument(/*document_id=*/3), + AddScoredDocument(/*document_id=*/4), + AddScoredDocument(/*document_id=*/5)}, /*num_per_page=*/1); ResultStateManager result_state_manager( - /*max_hits_per_query=*/std::numeric_limits<int>::max(), - /*max_result_states=*/std::numeric_limits<int>::max()); + /*max_total_hits=*/std::numeric_limits<int>::max(), document_store()); ICING_ASSERT_OK_AND_ASSIGN( PageResultState page_result_state1, result_state_manager.RankAndPaginate(std::move(result_state1))); @@ -167,26 +212,25 @@ TEST(ResultStateManagerTest, ShouldInvalidateOneToken) { ICING_ASSERT_OK_AND_ASSIGN( page_result_state2, result_state_manager.GetNextPage(page_result_state2.next_page_token)); - EXPECT_THAT(page_result_state2.scored_document_hits, - ElementsAre(EqualsScoredDocumentHit( - CreateScoredDocumentHit(/*document_id=*/5)))); + EXPECT_THAT( + page_result_state2.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit(/*document_id=*/4)))); } 
-TEST(ResultStateManagerTest, ShouldInvalidateAllTokens) { +TEST_F(ResultStateManagerTest, ShouldInvalidateAllTokens) { ResultState result_state1 = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/2), - CreateScoredDocumentHit(/*document_id=*/3)}, + CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1), + AddScoredDocument(/*document_id=*/2)}, /*num_per_page=*/1); ResultState result_state2 = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/4), - CreateScoredDocumentHit(/*document_id=*/5), - CreateScoredDocumentHit(/*document_id=*/6)}, + CreateResultState({AddScoredDocument(/*document_id=*/3), + AddScoredDocument(/*document_id=*/4), + AddScoredDocument(/*document_id=*/5)}, /*num_per_page=*/1); ResultStateManager result_state_manager( - /*max_hits_per_query=*/std::numeric_limits<int>::max(), - /*max_result_states=*/std::numeric_limits<int>::max()); + /*max_total_hits=*/std::numeric_limits<int>::max(), document_store()); ICING_ASSERT_OK_AND_ASSIGN( PageResultState page_result_state1, result_state_manager.RankAndPaginate(std::move(result_state1))); @@ -207,23 +251,22 @@ TEST(ResultStateManagerTest, ShouldInvalidateAllTokens) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST(ResultStateManagerTest, ShouldRemoveOldestResultState) { +TEST_F(ResultStateManagerTest, ShouldRemoveOldestResultState) { ResultState result_state1 = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/2)}, + CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1)}, /*num_per_page=*/1); ResultState result_state2 = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/3), - CreateScoredDocumentHit(/*document_id=*/4)}, + CreateResultState({AddScoredDocument(/*document_id=*/2), + AddScoredDocument(/*document_id=*/3)}, /*num_per_page=*/1); ResultState result_state3 = - 
CreateResultState({CreateScoredDocumentHit(/*document_id=*/5), - CreateScoredDocumentHit(/*document_id=*/6)}, + CreateResultState({AddScoredDocument(/*document_id=*/4), + AddScoredDocument(/*document_id=*/5)}, /*num_per_page=*/1); - ResultStateManager result_state_manager( - /*max_hits_per_query=*/std::numeric_limits<int>::max(), - /*max_result_states=*/2); + ResultStateManager result_state_manager(/*max_total_hits=*/2, + document_store()); ICING_ASSERT_OK_AND_ASSIGN( PageResultState page_result_state1, result_state_manager.RankAndPaginate(std::move(result_state1))); @@ -243,39 +286,111 @@ TEST(ResultStateManagerTest, ShouldRemoveOldestResultState) { page_result_state2, result_state_manager.GetNextPage(page_result_state2.next_page_token)); EXPECT_THAT(page_result_state2.scored_document_hits, - ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit( - /*document_id=*/3)))); + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/2)))); ICING_ASSERT_OK_AND_ASSIGN( page_result_state3, result_state_manager.GetNextPage(page_result_state3.next_page_token)); EXPECT_THAT(page_result_state3.scored_document_hits, - ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit( - /*document_id=*/5)))); + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/4)))); } -TEST(ResultStateManagerTest, - PreviouslyInvalidatedResultStateShouldNotBeCounted) { +TEST_F(ResultStateManagerTest, + InvalidatedResultStateShouldDecreaseCurrentHitsCount) { ResultState result_state1 = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/2)}, + CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1)}, /*num_per_page=*/1); ResultState result_state2 = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/3), - CreateScoredDocumentHit(/*document_id=*/4)}, + CreateResultState({AddScoredDocument(/*document_id=*/2), + AddScoredDocument(/*document_id=*/3)}, 
/*num_per_page=*/1); ResultState result_state3 = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/5), - CreateScoredDocumentHit(/*document_id=*/6)}, + CreateResultState({AddScoredDocument(/*document_id=*/4), + AddScoredDocument(/*document_id=*/5)}, /*num_per_page=*/1); + + // Add the first three states. Remember, the first page for each result state + // won't be cached (since it is returned immediately from RankAndPaginate). + // Each result state has a page size of 1 and a result set of 2 hits. So each + // result will take up one hit of our three hit budget. + ResultStateManager result_state_manager(/*max_total_hits=*/3, + document_store()); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state1, + result_state_manager.RankAndPaginate(std::move(result_state1))); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state2, + result_state_manager.RankAndPaginate(std::move(result_state2))); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state3, + result_state_manager.RankAndPaginate(std::move(result_state3))); + + // Invalidates state 2, so that the number of hits current cached should be + // decremented to 2. + result_state_manager.InvalidateResultState( + page_result_state2.next_page_token); + + // If invalidating state 2 correctly decremented the current hit count to 2, + // then adding state 4 should still be within our budget and no other result + // states should be evicted. 
ResultState result_state4 = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/7), - CreateScoredDocumentHit(/*document_id=*/8)}, + CreateResultState({AddScoredDocument(/*document_id=*/6), + AddScoredDocument(/*document_id=*/7)}, /*num_per_page=*/1); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state4, + result_state_manager.RankAndPaginate(std::move(result_state4))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state1, + result_state_manager.GetNextPage(page_result_state1.next_page_token)); + EXPECT_THAT(page_result_state1.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/0)))); + + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state2.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - ResultStateManager result_state_manager( - /*max_hits_per_query=*/std::numeric_limits<int>::max(), - /*max_result_states=*/3); + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state3, + result_state_manager.GetNextPage(page_result_state3.next_page_token)); + EXPECT_THAT(page_result_state3.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/4)))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state4, + result_state_manager.GetNextPage(page_result_state4.next_page_token)); + EXPECT_THAT(page_result_state4.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/6)))); +} + +TEST_F(ResultStateManagerTest, + InvalidatedAllResultStatesShouldResetCurrentHitCount) { + ResultState result_state1 = + CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1)}, + /*num_per_page=*/1); + ResultState result_state2 = + CreateResultState({AddScoredDocument(/*document_id=*/2), + AddScoredDocument(/*document_id=*/3)}, + /*num_per_page=*/1); + ResultState result_state3 = + CreateResultState({AddScoredDocument(/*document_id=*/4), + AddScoredDocument(/*document_id=*/5)}, + 
/*num_per_page=*/1); + + // Add the first three states. Remember, the first page for each result state + // won't be cached (since it is returned immediately from RankAndPaginate). + // Each result state has a page size of 1 and a result set of 2 hits. So each + // result will take up one hit of our three hit budget. + ResultStateManager result_state_manager(/*max_total_hits=*/3, + document_store()); ICING_ASSERT_OK_AND_ASSIGN( PageResultState page_result_state1, result_state_manager.RankAndPaginate(std::move(result_state1))); @@ -286,21 +401,298 @@ TEST(ResultStateManagerTest, PageResultState page_result_state3, result_state_manager.RankAndPaginate(std::move(result_state3))); - // Invalidates state 2, so that the number of valid tokens becomes 2. + // Invalidates all states so that the current hit count will be 0. + result_state_manager.InvalidateAllResultStates(); + + // If invalidating all states correctly reset the current hit count to 0, + // then the entirety of state 4 should still be within our budget and no other + // result states should be evicted. 
+ ResultState result_state4 = + CreateResultState({AddScoredDocument(/*document_id=*/6), + AddScoredDocument(/*document_id=*/7)}, + /*num_per_page=*/1); + ResultState result_state5 = + CreateResultState({AddScoredDocument(/*document_id=*/8), + AddScoredDocument(/*document_id=*/9)}, + /*num_per_page=*/1); + ResultState result_state6 = + CreateResultState({AddScoredDocument(/*document_id=*/10), + AddScoredDocument(/*document_id=*/11)}, + /*num_per_page=*/1); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state4, + result_state_manager.RankAndPaginate(std::move(result_state4))); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state5, + result_state_manager.RankAndPaginate(std::move(result_state5))); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state6, + result_state_manager.RankAndPaginate(std::move(result_state6))); + + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state1.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state2.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state3.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state4, + result_state_manager.GetNextPage(page_result_state4.next_page_token)); + EXPECT_THAT(page_result_state4.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/6)))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state5, + result_state_manager.GetNextPage(page_result_state5.next_page_token)); + EXPECT_THAT(page_result_state5.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/8)))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state6, + result_state_manager.GetNextPage(page_result_state6.next_page_token)); + 
EXPECT_THAT(page_result_state6.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/10)))); +} + +TEST_F( + ResultStateManagerTest, + InvalidatedResultStateShouldDecreaseCurrentHitsCountByExactStateHitCount) { + ResultState result_state1 = + CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1)}, + /*num_per_page=*/1); + ResultState result_state2 = + CreateResultState({AddScoredDocument(/*document_id=*/2), + AddScoredDocument(/*document_id=*/3)}, + /*num_per_page=*/1); + ResultState result_state3 = + CreateResultState({AddScoredDocument(/*document_id=*/4), + AddScoredDocument(/*document_id=*/5)}, + /*num_per_page=*/1); + + // Add the first three states. Remember, the first page for each result state + // won't be cached (since it is returned immediately from RankAndPaginate). + // Each result state has a page size of 1 and a result set of 2 hits. So each + // result will take up one hit of our three hit budget. + ResultStateManager result_state_manager(/*max_total_hits=*/3, + document_store()); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state1, + result_state_manager.RankAndPaginate(std::move(result_state1))); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state2, + result_state_manager.RankAndPaginate(std::move(result_state2))); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state3, + result_state_manager.RankAndPaginate(std::move(result_state3))); + + // Invalidates state 2, so that the number of hits current cached should be + // decremented to 2. result_state_manager.InvalidateResultState( page_result_state2.next_page_token); - // Adding state 4 shouldn't affect rest of the states + // If invalidating state 2 correctly decremented the current hit count to 2, + // then adding state 4 should still be within our budget and no other result + // states should be evicted. 
+ ResultState result_state4 = + CreateResultState({AddScoredDocument(/*document_id=*/6), + AddScoredDocument(/*document_id=*/7)}, + /*num_per_page=*/1); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state4, + result_state_manager.RankAndPaginate(std::move(result_state4))); + + // If invalidating result state 2 correctly decremented the current hit count + // to 2 and adding state 4 correctly incremented it to 3, then adding this + // result state should trigger the eviction of state 1. + ResultState result_state5 = + CreateResultState({AddScoredDocument(/*document_id=*/8), + AddScoredDocument(/*document_id=*/9)}, + /*num_per_page=*/1); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state5, + result_state_manager.RankAndPaginate(std::move(result_state5))); + + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state1.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state2.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state3, + result_state_manager.GetNextPage(page_result_state3.next_page_token)); + EXPECT_THAT(page_result_state3.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/4)))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state4, + result_state_manager.GetNextPage(page_result_state4.next_page_token)); + EXPECT_THAT(page_result_state4.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/6)))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state5, + result_state_manager.GetNextPage(page_result_state5.next_page_token)); + EXPECT_THAT(page_result_state5.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/8)))); +} + +TEST_F(ResultStateManagerTest, GetNextPageShouldDecreaseCurrentHitsCount) { + ResultState result_state1 = + 
CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1)}, + /*num_per_page=*/1); + ResultState result_state2 = + CreateResultState({AddScoredDocument(/*document_id=*/2), + AddScoredDocument(/*document_id=*/3)}, + /*num_per_page=*/1); + ResultState result_state3 = + CreateResultState({AddScoredDocument(/*document_id=*/4), + AddScoredDocument(/*document_id=*/5)}, + /*num_per_page=*/1); + + // Add the first three states. Remember, the first page for each result state + // won't be cached (since it is returned immediately from RankAndPaginate). + // Each result state has a page size of 1 and a result set of 2 hits. So each + // result will take up one hit of our three hit budget. + ResultStateManager result_state_manager(/*max_total_hits=*/3, + document_store()); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state1, + result_state_manager.RankAndPaginate(std::move(result_state1))); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state2, + result_state_manager.RankAndPaginate(std::move(result_state2))); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state3, + result_state_manager.RankAndPaginate(std::move(result_state3))); + + // GetNextPage for result state 1 should return its result and decrement the + // number of cached hits to 2. + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state1, + result_state_manager.GetNextPage(page_result_state1.next_page_token)); + EXPECT_THAT(page_result_state1.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/0)))); + + // If retrieving the next page for result state 1 correctly decremented the + // current hit count to 2, then adding state 4 should still be within our + // budget and no other result states should be evicted. 
+ ResultState result_state4 = + CreateResultState({AddScoredDocument(/*document_id=*/6), + AddScoredDocument(/*document_id=*/7)}, + /*num_per_page=*/1); ICING_ASSERT_OK_AND_ASSIGN( PageResultState page_result_state4, result_state_manager.RankAndPaginate(std::move(result_state4))); + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state1.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state2, + result_state_manager.GetNextPage(page_result_state2.next_page_token)); + EXPECT_THAT(page_result_state2.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/2)))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state3, + result_state_manager.GetNextPage(page_result_state3.next_page_token)); + EXPECT_THAT(page_result_state3.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/4)))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state4, + result_state_manager.GetNextPage(page_result_state4.next_page_token)); + EXPECT_THAT(page_result_state4.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/6)))); +} + +TEST_F(ResultStateManagerTest, + GetNextPageShouldDecreaseCurrentHitsCountByExactlyOnePage) { + ResultState result_state1 = + CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1)}, + /*num_per_page=*/1); + ResultState result_state2 = + CreateResultState({AddScoredDocument(/*document_id=*/2), + AddScoredDocument(/*document_id=*/3)}, + /*num_per_page=*/1); + ResultState result_state3 = + CreateResultState({AddScoredDocument(/*document_id=*/4), + AddScoredDocument(/*document_id=*/5)}, + /*num_per_page=*/1); + + // Add the first three states. Remember, the first page for each result state + // won't be cached (since it is returned immediately from RankAndPaginate). 
+ // Each result state has a page size of 1 and a result set of 2 hits. So each + // result will take up one hit of our three hit budget. + ResultStateManager result_state_manager(/*max_total_hits=*/3, + document_store()); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state1, + result_state_manager.RankAndPaginate(std::move(result_state1))); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state2, + result_state_manager.RankAndPaginate(std::move(result_state2))); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state3, + result_state_manager.RankAndPaginate(std::move(result_state3))); + + // GetNextPage for result state 1 should return its result and decrement the + // number of cached hits to 2. ICING_ASSERT_OK_AND_ASSIGN( page_result_state1, result_state_manager.GetNextPage(page_result_state1.next_page_token)); EXPECT_THAT(page_result_state1.scored_document_hits, - ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit( - /*document_id=*/1)))); + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/0)))); + + // If retrieving the next page for result state 1 correctly decremented the + // current hit count to 2, then adding state 4 should still be within our + // budget and no other result states should be evicted. + ResultState result_state4 = + CreateResultState({AddScoredDocument(/*document_id=*/6), + AddScoredDocument(/*document_id=*/7)}, + /*num_per_page=*/1); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state4, + result_state_manager.RankAndPaginate(std::move(result_state4))); + + // If retrieving the next page for result state 1 correctly decremented the + // current hit count to 2 and adding state 4 correctly incremented it to 3, + // then adding this result state should trigger the eviction of state 2. 
+ ResultState result_state5 = + CreateResultState({AddScoredDocument(/*document_id=*/8), + AddScoredDocument(/*document_id=*/9)}, + /*num_per_page=*/1); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state5, + result_state_manager.RankAndPaginate(std::move(result_state5))); + + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state1.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT( result_state_manager.GetNextPage(page_result_state2.next_page_token), @@ -310,18 +702,150 @@ TEST(ResultStateManagerTest, page_result_state3, result_state_manager.GetNextPage(page_result_state3.next_page_token)); EXPECT_THAT(page_result_state3.scored_document_hits, - ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit( - /*document_id=*/5)))); + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/4)))); ICING_ASSERT_OK_AND_ASSIGN( page_result_state4, result_state_manager.GetNextPage(page_result_state4.next_page_token)); EXPECT_THAT(page_result_state4.scored_document_hits, - ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit( + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/6)))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state5, + result_state_manager.GetNextPage(page_result_state5.next_page_token)); + EXPECT_THAT(page_result_state5.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/8)))); +} + +TEST_F(ResultStateManagerTest, + AddingOverBudgetResultStateShouldEvictAllStates) { + ResultState result_state1 = + CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1), + AddScoredDocument(/*document_id=*/2)}, + /*num_per_page=*/1); + ResultState result_state2 = + CreateResultState({AddScoredDocument(/*document_id=*/3), + AddScoredDocument(/*document_id=*/4)}, + /*num_per_page=*/1); + + // Add the first two states. 
Remember, the first page for each result state + // won't be cached (since it is returned immediately from RankAndPaginate). + // Each result state has a page size of 1. So 3 hits will remain cached. + ResultStateManager result_state_manager(/*max_total_hits=*/4, + document_store()); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state1, + result_state_manager.RankAndPaginate(std::move(result_state1))); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state2, + result_state_manager.RankAndPaginate(std::move(result_state2))); + + // Add a result state that is larger than the entire budget. This should + // result in all previous result states being evicted, the first hit from + // result state 3 being returned and the next four hits being cached (the last + // hit should be dropped because it exceeds the max). + ResultState result_state3 = + CreateResultState({AddScoredDocument(/*document_id=*/5), + AddScoredDocument(/*document_id=*/6), + AddScoredDocument(/*document_id=*/7), + AddScoredDocument(/*document_id=*/8), + AddScoredDocument(/*document_id=*/9), + AddScoredDocument(/*document_id=*/10)}, + /*num_per_page=*/1); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state3, + result_state_manager.RankAndPaginate(std::move(result_state3))); + + // GetNextPage for result state 1 and 2 should return NOT_FOUND. + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state1.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state2.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + // Only the next four results in state 3 should be retrievable. 
+ ICING_ASSERT_OK_AND_ASSIGN( + page_result_state3, + result_state_manager.GetNextPage(page_result_state3.next_page_token)); + EXPECT_THAT(page_result_state3.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/9)))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state3, + result_state_manager.GetNextPage(page_result_state3.next_page_token)); + EXPECT_THAT(page_result_state3.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/8)))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state3, + result_state_manager.GetNextPage(page_result_state3.next_page_token)); + EXPECT_THAT(page_result_state3.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( /*document_id=*/7)))); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state3, + result_state_manager.GetNextPage(page_result_state3.next_page_token)); + EXPECT_THAT(page_result_state3.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/6)))); + + // The final result should have been dropped because it exceeded the budget. + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state3.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); +} + +TEST_F(ResultStateManagerTest, + AddingResultStateShouldEvictOverBudgetResultState) { + ResultStateManager result_state_manager(/*max_total_hits=*/4, + document_store()); + // Add a result state that is larger than the entire budget. 
The entire result + // state will still be cached + ResultState result_state1 = + CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1), + AddScoredDocument(/*document_id=*/2), + AddScoredDocument(/*document_id=*/3), + AddScoredDocument(/*document_id=*/4), + AddScoredDocument(/*document_id=*/5)}, + /*num_per_page=*/1); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state1, + result_state_manager.RankAndPaginate(std::move(result_state1))); + + // Add a result state. Because state2 + state1 is larger than the budget, + // state1 should be evicted. + ResultState result_state2 = + CreateResultState({AddScoredDocument(/*document_id=*/6), + AddScoredDocument(/*document_id=*/7)}, + /*num_per_page=*/1); + ICING_ASSERT_OK_AND_ASSIGN( + PageResultState page_result_state2, + result_state_manager.RankAndPaginate(std::move(result_state2))); + + // state1 should have been evicted and state2 should still be retrievable. + EXPECT_THAT( + result_state_manager.GetNextPage(page_result_state1.next_page_token), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + + ICING_ASSERT_OK_AND_ASSIGN( + page_result_state2, + result_state_manager.GetNextPage(page_result_state2.next_page_token)); + EXPECT_THAT(page_result_state2.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(CreateScoredHit( + /*document_id=*/6)))); } -TEST(ResultStateManagerTest, ShouldGetSnippetContext) { +TEST_F(ResultStateManagerTest, ShouldGetSnippetContext) { ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1); result_spec.mutable_snippet_spec()->set_num_to_snippet(5); result_spec.mutable_snippet_spec()->set_num_matches_per_property(5); @@ -334,13 +858,13 @@ TEST(ResultStateManagerTest, ShouldGetSnippetContext) { query_terms_map.emplace("term1", std::unordered_set<std::string>()); ResultState original_result_state = ResultState( - /*scored_document_hits=*/{CreateScoredDocumentHit(/*document_id=*/1), - 
CreateScoredDocumentHit(/*document_id=*/2)}, - query_terms_map, search_spec, CreateScoringSpec(), result_spec); + /*scored_document_hits=*/{AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1)}, + query_terms_map, search_spec, CreateScoringSpec(), result_spec, + document_store()); ResultStateManager result_state_manager( - /*max_hits_per_query=*/std::numeric_limits<int>::max(), - /*max_result_states=*/std::numeric_limits<int>::max()); + /*max_total_hits=*/std::numeric_limits<int>::max(), document_store()); ICING_ASSERT_OK_AND_ASSIGN( PageResultState page_result_state, result_state_manager.RankAndPaginate(std::move(original_result_state))); @@ -355,7 +879,7 @@ TEST(ResultStateManagerTest, ShouldGetSnippetContext) { EqualsProto(result_spec.snippet_spec())); } -TEST(ResultStateManagerTest, ShouldGetDefaultSnippetContext) { +TEST_F(ResultStateManagerTest, ShouldGetDefaultSnippetContext) { ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1); // 0 indicates no snippeting result_spec.mutable_snippet_spec()->set_num_to_snippet(0); @@ -369,13 +893,13 @@ TEST(ResultStateManagerTest, ShouldGetDefaultSnippetContext) { query_terms_map.emplace("term1", std::unordered_set<std::string>()); ResultState original_result_state = ResultState( - /*scored_document_hits=*/{CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/2)}, - query_terms_map, search_spec, CreateScoringSpec(), result_spec); + /*scored_document_hits=*/{AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1)}, + query_terms_map, search_spec, CreateScoringSpec(), result_spec, + document_store()); ResultStateManager result_state_manager( - /*max_hits_per_query=*/std::numeric_limits<int>::max(), - /*max_result_states=*/std::numeric_limits<int>::max()); + /*max_total_hits=*/std::numeric_limits<int>::max(), document_store()); ICING_ASSERT_OK_AND_ASSIGN( PageResultState page_result_state, 
result_state_manager.RankAndPaginate(std::move(original_result_state))); @@ -390,18 +914,17 @@ TEST(ResultStateManagerTest, ShouldGetDefaultSnippetContext) { Eq(TermMatchType::UNKNOWN)); } -TEST(ResultStateManagerTest, ShouldGetCorrectNumPreviouslyReturned) { +TEST_F(ResultStateManagerTest, ShouldGetCorrectNumPreviouslyReturned) { ResultState original_result_state = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/2), - CreateScoredDocumentHit(/*document_id=*/3), - CreateScoredDocumentHit(/*document_id=*/4), - CreateScoredDocumentHit(/*document_id=*/5)}, + CreateResultState({AddScoredDocument(/*document_id=*/0), + AddScoredDocument(/*document_id=*/1), + AddScoredDocument(/*document_id=*/2), + AddScoredDocument(/*document_id=*/3), + AddScoredDocument(/*document_id=*/4)}, /*num_per_page=*/2); ResultStateManager result_state_manager( - /*max_hits_per_query=*/std::numeric_limits<int>::max(), - /*max_result_states=*/std::numeric_limits<int>::max()); + /*max_total_hits=*/std::numeric_limits<int>::max(), document_store()); // First page, 2 results ICING_ASSERT_OK_AND_ASSIGN( @@ -435,41 +958,48 @@ TEST(ResultStateManagerTest, ShouldGetCorrectNumPreviouslyReturned) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST(ResultStateManagerTest, ShouldStoreMaxNumberOfScoredDocumentHits) { - ResultState original_result_state = - CreateResultState({CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/2), - CreateScoredDocumentHit(/*document_id=*/3), - CreateScoredDocumentHit(/*document_id=*/4), - CreateScoredDocumentHit(/*document_id=*/5)}, - /*num_per_page=*/2); +TEST_F(ResultStateManagerTest, ShouldStoreAllHits) { + ScoredDocumentHit scored_hit_1 = AddScoredDocument(/*document_id=*/0); + ScoredDocumentHit scored_hit_2 = AddScoredDocument(/*document_id=*/1); + ScoredDocumentHit scored_hit_3 = AddScoredDocument(/*document_id=*/2); + ScoredDocumentHit scored_hit_4 = 
AddScoredDocument(/*document_id=*/3); + ScoredDocumentHit scored_hit_5 = AddScoredDocument(/*document_id=*/4); - ResultStateManager result_state_manager( - /*max_hits_per_query=*/3, - /*max_result_states=*/std::numeric_limits<int>::max()); + ResultState original_result_state = CreateResultState( + {scored_hit_1, scored_hit_2, scored_hit_3, scored_hit_4, scored_hit_5}, + /*num_per_page=*/2); + + ResultStateManager result_state_manager(/*max_total_hits=*/4, + document_store()); - // The 5 input scored document hits will be truncated to 3. + // The 5 input scored document hits will not be truncated. The first page of + // two hits will be returned immediately and the other three hits will fit + // within our caching budget. // First page, 2 results ICING_ASSERT_OK_AND_ASSIGN( PageResultState page_result_state1, result_state_manager.RankAndPaginate(std::move(original_result_state))); - EXPECT_THAT( - page_result_state1.scored_document_hits, - ElementsAre( - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)), - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4)))); + EXPECT_THAT(page_result_state1.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(scored_hit_5), + EqualsScoredDocumentHit(scored_hit_4))); uint64_t next_page_token = page_result_state1.next_page_token; - // Second page, 1 results. + // Second page, 2 results. ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state2, result_state_manager.GetNextPage(next_page_token)); EXPECT_THAT(page_result_state2.scored_document_hits, - ElementsAre(EqualsScoredDocumentHit( - CreateScoredDocumentHit(/*document_id=*/3)))); + ElementsAre(EqualsScoredDocumentHit(scored_hit_3), + EqualsScoredDocumentHit(scored_hit_2))); + + // Third page, 1 result. 
+ ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state3, + result_state_manager.GetNextPage(next_page_token)); + EXPECT_THAT(page_result_state3.scored_document_hits, + ElementsAre(EqualsScoredDocumentHit(scored_hit_1))); - // No third page. + // Fourth page, 0 results. EXPECT_THAT(result_state_manager.GetNextPage(next_page_token), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } diff --git a/icing/result/result-state.cc b/icing/result/result-state.cc index 82738a9..fc89185 100644 --- a/icing/result/result-state.cc +++ b/icing/result/result-state.cc @@ -16,6 +16,7 @@ #include "icing/result/projection-tree.h" #include "icing/scoring/ranker.h" +#include "icing/store/namespace-id.h" #include "icing/util/logging.h" namespace icing { @@ -39,7 +40,8 @@ ResultState::ResultState(std::vector<ScoredDocumentHit> scored_document_hits, SectionRestrictQueryTermsMap query_terms, const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec, - const ResultSpecProto& result_spec) + const ResultSpecProto& result_spec, + const DocumentStore& document_store) : scored_document_hits_(std::move(scored_document_hits)), snippet_context_(CreateSnippetContext(std::move(query_terms), search_spec, result_spec)), @@ -52,14 +54,82 @@ ResultState::ResultState(std::vector<ScoredDocumentHit> scored_document_hits, projection_tree_map_.insert( {type_field_mask.schema_type(), ProjectionTree(type_field_mask)}); } + + for (const ResultSpecProto::ResultGrouping& result_grouping : + result_spec.result_groupings()) { + int group_id = group_result_limits_.size(); + group_result_limits_.push_back(result_grouping.max_results()); + for (const std::string& name_space : result_grouping.namespaces()) { + auto namespace_id_or = document_store.GetNamespaceId(name_space); + if (!namespace_id_or.ok()) { + continue; + } + namespace_group_id_map_.insert({namespace_id_or.ValueOrDie(), group_id}); + } + } BuildHeapInPlace(&scored_document_hits_, scored_document_hit_comparator_); } 
-std::vector<ScoredDocumentHit> ResultState::GetNextPage() { - std::vector<ScoredDocumentHit> scored_document_hits = PopTopResultsFromHeap( - &scored_document_hits_, num_per_page_, scored_document_hit_comparator_); - num_returned_ += scored_document_hits.size(); - return scored_document_hits; +class GroupResultLimiter { + public: + GroupResultLimiter( + const std::unordered_map<NamespaceId, int>& namespace_group_id_map, + std::vector<int>& group_result_limits, + const DocumentStore& document_store) + : namespace_group_id_map_(namespace_group_id_map), + group_result_limits_(&group_result_limits), + document_store_(document_store) {} + + // Returns true if the scored_document_hit should be removed. + bool operator()(const ScoredDocumentHit& scored_document_hit) { + auto document_filter_data_or = document_store_.GetDocumentFilterData( + scored_document_hit.document_id()); + if (!document_filter_data_or.ok()) { + return true; + } + NamespaceId namespace_id = + document_filter_data_or.ValueOrDie().namespace_id(); + auto iter = namespace_group_id_map_.find(namespace_id); + if (iter == namespace_group_id_map_.end()) { + return false; + } + int& count = group_result_limits_->at(iter->second); + if (count <= 0) { + return true; + } + --count; + return false; + } + + private: + const std::unordered_map<NamespaceId, int>& namespace_group_id_map_; + std::vector<int>* group_result_limits_; + const DocumentStore& document_store_; +}; + +std::vector<ScoredDocumentHit> ResultState::GetNextPage( + const DocumentStore& document_store) { + int num_requested = num_per_page_; + bool more_results_available = true; + std::vector<ScoredDocumentHit> final_scored_document_hits; + while (more_results_available && num_requested > 0) { + std::vector<ScoredDocumentHit> scored_document_hits = PopTopResultsFromHeap( + &scored_document_hits_, num_requested, scored_document_hit_comparator_); + more_results_available = scored_document_hits.size() == num_requested; + auto itr = std::remove_if( + 
scored_document_hits.begin(), scored_document_hits.end(), + GroupResultLimiter(namespace_group_id_map_, group_result_limits_, + document_store)); + scored_document_hits.erase(itr, scored_document_hits.end()); + final_scored_document_hits.reserve(final_scored_document_hits.size() + + scored_document_hits.size()); + std::move(scored_document_hits.begin(), scored_document_hits.end(), + std::back_inserter(final_scored_document_hits)); + num_requested = num_per_page_ - final_scored_document_hits.size(); + } + + num_returned_ += final_scored_document_hits.size(); + return final_scored_document_hits; } void ResultState::TruncateHitsTo(int new_size) { diff --git a/icing/result/result-state.h b/icing/result/result-state.h index be92b85..303d610 100644 --- a/icing/result/result-state.h +++ b/icing/result/result-state.h @@ -23,6 +23,8 @@ #include "icing/result/projection-tree.h" #include "icing/result/snippet-context.h" #include "icing/scoring/scored-document-hit.h" +#include "icing/store/document-store.h" +#include "icing/store/namespace-id.h" namespace icing { namespace lib { @@ -31,17 +33,19 @@ namespace lib { // same query. Stored in ResultStateManager. class ResultState { public: - explicit ResultState(std::vector<ScoredDocumentHit> scored_document_hits, - SectionRestrictQueryTermsMap query_terms, - const SearchSpecProto& search_spec, - const ScoringSpecProto& scoring_spec, - const ResultSpecProto& result_spec); + ResultState(std::vector<ScoredDocumentHit> scored_document_hits, + SectionRestrictQueryTermsMap query_terms, + const SearchSpecProto& search_spec, + const ScoringSpecProto& scoring_spec, + const ResultSpecProto& result_spec, + const DocumentStore& document_store); // Returns the next page of results. The size of page is passed in from // ResultSpecProto in constructor. Calling this method could increase the // value of num_returned(), so be careful of the order of calling these // methods. 
- std::vector<ScoredDocumentHit> GetNextPage(); + std::vector<ScoredDocumentHit> GetNextPage( + const DocumentStore& document_store); // Truncates the vector of ScoredDocumentHits to the given size. The best // ScoredDocumentHits are kept. @@ -67,6 +71,10 @@ class ResultState { // increased when GetNextPage() is called. int num_returned() const { return num_returned_; } + // The number of results yet to be returned. This number is decreased when + // GetNextPage is called. + int num_remaining() const { return scored_document_hits_.size(); } + private: // The scored document hits. It represents a heap data structure when ranking // is required so that we can get top K hits in O(KlgN) time. If no ranking is @@ -79,6 +87,13 @@ class ResultState { // Information needed for projection. std::unordered_map<std::string, ProjectionTree> projection_tree_map_; + // A map between namespace id and the id of the group that it appears in. + std::unordered_map<NamespaceId, int> namespace_group_id_map_; + + // The count of remaining results to return for a group where group id is the + // index. + std::vector<int> group_result_limits_; + // Number of results to return in each page. 
int num_per_page_; diff --git a/icing/result/result-state_test.cc b/icing/result/result-state_test.cc index 85cb242..f2121a5 100644 --- a/icing/result/result-state_test.cc +++ b/icing/result/result-state_test.cc @@ -15,9 +15,15 @@ #include "icing/result/result-state.h" #include "gtest/gtest.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" #include "icing/portable/equals-proto.h" +#include "icing/schema/schema-store.h" #include "icing/scoring/scored-document-hit.h" +#include "icing/store/document-store.h" #include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" +#include "icing/util/clock.h" namespace icing { namespace lib { @@ -50,42 +56,90 @@ ResultSpecProto CreateResultSpec(int num_per_page) { return result_spec; } +class ResultStateTest : public testing::Test { + protected: + void SetUp() override { + schema_store_base_dir_ = GetTestTempDir() + "/schema_store"; + filesystem_.CreateDirectoryRecursively(schema_store_base_dir_.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + schema_store_, + SchemaStore::Create(&filesystem_, schema_store_base_dir_, &clock_)); + SchemaProto schema; + schema.add_types()->set_schema_type("Document"); + ICING_ASSERT_OK(schema_store_->SetSchema(std::move(schema))); + + doc_store_base_dir_ = GetTestTempDir() + "/document_store"; + filesystem_.CreateDirectoryRecursively(doc_store_base_dir_.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult result, + DocumentStore::Create(&filesystem_, doc_store_base_dir_, &clock_, + schema_store_.get())); + document_store_ = std::move(result.document_store); + } + + void TearDown() override { + filesystem_.DeleteDirectoryRecursively(doc_store_base_dir_.c_str()); + filesystem_.DeleteDirectoryRecursively(schema_store_base_dir_.c_str()); + } + + ScoredDocumentHit AddScoredDocument(DocumentId document_id) { + DocumentProto document; + document.set_namespace_("namespace"); + document.set_uri(std::to_string(document_id)); + 
document.set_schema("Document"); + document_store_->Put(std::move(document)); + return ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1); + } + + DocumentStore& document_store() { return *document_store_; } + + private: + Filesystem filesystem_; + std::string doc_store_base_dir_; + std::string schema_store_base_dir_; + Clock clock_; + std::unique_ptr<DocumentStore> document_store_; + std::unique_ptr<SchemaStore> schema_store_; +}; + // ResultState::ResultState() and ResultState::GetNextPage() are calling // Ranker::BuildHeapInPlace() and Ranker::PopTopResultsFromHeap() directly, so // we don't need to test much on what order is returned as that is tested in // Ranker's tests. Here we just need one sanity test to make sure that the // correct functions are called. -TEST(ResultStateTest, ShouldReturnNextPage) { +TEST_F(ResultStateTest, ShouldReturnNextPage) { + ScoredDocumentHit scored_hit_0 = AddScoredDocument(/*document_id=*/0); + ScoredDocumentHit scored_hit_1 = AddScoredDocument(/*document_id=*/1); + ScoredDocumentHit scored_hit_2 = AddScoredDocument(/*document_id=*/2); + ScoredDocumentHit scored_hit_3 = AddScoredDocument(/*document_id=*/3); + ScoredDocumentHit scored_hit_4 = AddScoredDocument(/*document_id=*/4); std::vector<ScoredDocumentHit> scored_document_hits = { - CreateScoredDocumentHit(/*document_id=*/2), - CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/3), - CreateScoredDocumentHit(/*document_id=*/5), - CreateScoredDocumentHit(/*document_id=*/4)}; + scored_hit_1, scored_hit_0, scored_hit_2, scored_hit_4, scored_hit_3}; ResultState result_state(scored_document_hits, /*query_terms=*/{}, CreateSearchSpec(TermMatchType::EXACT_ONLY), CreateScoringSpec(/*is_descending_order=*/true), - CreateResultSpec(/*num_per_page=*/2)); + CreateResultSpec(/*num_per_page=*/2), + document_store()); EXPECT_THAT( - result_state.GetNextPage(), + result_state.GetNextPage(document_store()), ElementsAre( - 
EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)), - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4)))); + EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4)), + EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)))); EXPECT_THAT( - result_state.GetNextPage(), + result_state.GetNextPage(document_store()), ElementsAre( - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)), - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2)))); + EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2)), + EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/1)))); - EXPECT_THAT(result_state.GetNextPage(), + EXPECT_THAT(result_state.GetNextPage(document_store()), ElementsAre(EqualsScoredDocumentHit( - CreateScoredDocumentHit(/*document_id=*/1)))); + CreateScoredDocumentHit(/*document_id=*/0)))); } -TEST(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) { +TEST_F(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) { ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2); result_spec.mutable_snippet_spec()->set_num_to_snippet(5); result_spec.mutable_snippet_spec()->set_num_matches_per_property(5); @@ -97,7 +151,8 @@ TEST(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) { ResultState result_state( /*scored_document_hits=*/{}, query_terms_map, CreateSearchSpec(TermMatchType::EXACT_ONLY), - CreateScoringSpec(/*is_descending_order=*/true), result_spec); + CreateScoringSpec(/*is_descending_order=*/true), result_spec, + document_store()); const SnippetContext& snippet_context = result_state.snippet_context(); @@ -117,7 +172,7 @@ TEST(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) { EXPECT_THAT(snippet_context2.match_type, Eq(TermMatchType::EXACT_ONLY)); } -TEST(ResultStateTest, NoSnippetingShouldReturnNull) { +TEST_F(ResultStateTest, NoSnippetingShouldReturnNull) { ResultSpecProto result_spec = 
CreateResultSpec(/*num_per_page=*/2); // Setting num_to_snippet to 0 so that snippeting info won't be // stored. @@ -131,7 +186,7 @@ TEST(ResultStateTest, NoSnippetingShouldReturnNull) { ResultState result_state(/*scored_document_hits=*/{}, query_terms_map, CreateSearchSpec(TermMatchType::EXACT_ONLY), CreateScoringSpec(/*is_descending_order=*/true), - result_spec); + result_spec, document_store()); const SnippetContext& snippet_context = result_state.snippet_context(); EXPECT_THAT(snippet_context.query_terms, IsEmpty()); @@ -141,72 +196,375 @@ TEST(ResultStateTest, NoSnippetingShouldReturnNull) { EXPECT_THAT(snippet_context.match_type, TermMatchType::UNKNOWN); } -TEST(ResultStateTest, ShouldTruncateToNewSize) { +TEST_F(ResultStateTest, ShouldTruncateToNewSize) { + ScoredDocumentHit scored_hit_0 = AddScoredDocument(/*document_id=*/0); + ScoredDocumentHit scored_hit_1 = AddScoredDocument(/*document_id=*/1); + ScoredDocumentHit scored_hit_2 = AddScoredDocument(/*document_id=*/2); + ScoredDocumentHit scored_hit_3 = AddScoredDocument(/*document_id=*/3); + ScoredDocumentHit scored_hit_4 = AddScoredDocument(/*document_id=*/4); std::vector<ScoredDocumentHit> scored_document_hits = { - CreateScoredDocumentHit(/*document_id=*/2), - CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/3), - CreateScoredDocumentHit(/*document_id=*/5), - CreateScoredDocumentHit(/*document_id=*/4)}; + scored_hit_1, scored_hit_0, scored_hit_2, scored_hit_4, scored_hit_3}; // Creates a ResultState with 5 ScoredDocumentHits. ResultState result_state(scored_document_hits, /*query_terms=*/{}, CreateSearchSpec(TermMatchType::EXACT_ONLY), CreateScoringSpec(/*is_descending_order=*/true), - CreateResultSpec(/*num_per_page=*/5)); + CreateResultSpec(/*num_per_page=*/5), + document_store()); result_state.TruncateHitsTo(/*new_size=*/3); // The best 3 are left. 
EXPECT_THAT( - result_state.GetNextPage(), + result_state.GetNextPage(document_store()), ElementsAre( - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)), EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4)), - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)))); + EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)), + EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2)))); } -TEST(ResultStateTest, ShouldTruncateToZero) { +TEST_F(ResultStateTest, ShouldTruncateToZero) { + ScoredDocumentHit scored_hit_0 = AddScoredDocument(/*document_id=*/0); + ScoredDocumentHit scored_hit_1 = AddScoredDocument(/*document_id=*/1); + ScoredDocumentHit scored_hit_2 = AddScoredDocument(/*document_id=*/2); + ScoredDocumentHit scored_hit_3 = AddScoredDocument(/*document_id=*/3); + ScoredDocumentHit scored_hit_4 = AddScoredDocument(/*document_id=*/4); std::vector<ScoredDocumentHit> scored_document_hits = { - CreateScoredDocumentHit(/*document_id=*/2), - CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/3), - CreateScoredDocumentHit(/*document_id=*/5), - CreateScoredDocumentHit(/*document_id=*/4)}; + scored_hit_1, scored_hit_0, scored_hit_2, scored_hit_4, scored_hit_3}; // Creates a ResultState with 5 ScoredDocumentHits. 
ResultState result_state(scored_document_hits, /*query_terms=*/{}, CreateSearchSpec(TermMatchType::EXACT_ONLY), CreateScoringSpec(/*is_descending_order=*/true), - CreateResultSpec(/*num_per_page=*/5)); + CreateResultSpec(/*num_per_page=*/5), + document_store()); result_state.TruncateHitsTo(/*new_size=*/0); - EXPECT_THAT(result_state.GetNextPage(), IsEmpty()); + EXPECT_THAT(result_state.GetNextPage(document_store()), IsEmpty()); } -TEST(ResultStateTest, ShouldNotTruncateToNegative) { +TEST_F(ResultStateTest, ShouldNotTruncateToNegative) { + ScoredDocumentHit scored_hit_0 = AddScoredDocument(/*document_id=*/0); + ScoredDocumentHit scored_hit_1 = AddScoredDocument(/*document_id=*/1); + ScoredDocumentHit scored_hit_2 = AddScoredDocument(/*document_id=*/2); + ScoredDocumentHit scored_hit_3 = AddScoredDocument(/*document_id=*/3); + ScoredDocumentHit scored_hit_4 = AddScoredDocument(/*document_id=*/4); std::vector<ScoredDocumentHit> scored_document_hits = { - CreateScoredDocumentHit(/*document_id=*/2), - CreateScoredDocumentHit(/*document_id=*/1), - CreateScoredDocumentHit(/*document_id=*/3), - CreateScoredDocumentHit(/*document_id=*/5), - CreateScoredDocumentHit(/*document_id=*/4)}; + scored_hit_1, scored_hit_0, scored_hit_2, scored_hit_4, scored_hit_3}; // Creates a ResultState with 5 ScoredDocumentHits. ResultState result_state(scored_document_hits, /*query_terms=*/{}, CreateSearchSpec(TermMatchType::EXACT_ONLY), CreateScoringSpec(/*is_descending_order=*/true), - CreateResultSpec(/*num_per_page=*/5)); + CreateResultSpec(/*num_per_page=*/5), + document_store()); result_state.TruncateHitsTo(/*new_size=*/-1); // Results are not affected. 
EXPECT_THAT( - result_state.GetNextPage(), + result_state.GetNextPage(document_store()), ElementsAre( - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)), EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4)), EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)), EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2)), - EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/1)))); + EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/1)), + EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/0)))); +} + +TEST_F(ResultStateTest, ResultGroupingShouldLimitResults) { + // Creates 2 documents and ensures the relationship in terms of document + // score is: document1 < document2 + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace", "uri/1") + .SetSchema("Document") + .SetScore(1) + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace", "uri/2") + .SetSchema("Document") + .SetScore(2) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + document_store().Put(document1)); + ScoredDocumentHit scored_hit_1(document_id1, kSectionIdMaskNone, + document1.score()); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + document_store().Put(document2)); + ScoredDocumentHit scored_hit_2(document_id2, kSectionIdMaskNone, + document2.score()); + std::vector<ScoredDocumentHit> scored_document_hits = {scored_hit_2, + scored_hit_1}; + + // Create a ResultSpec that limits "namespace" to a single result. + ResultSpecProto result_spec; + result_spec.set_num_per_page(5); + ResultSpecProto::ResultGrouping* result_grouping = + result_spec.add_result_groupings(); + result_grouping->set_max_results(1); + result_grouping->add_namespaces("namespace"); + + // Creates a ResultState with 2 ScoredDocumentHits. 
+ ResultState result_state(scored_document_hits, /*query_terms=*/{}, + CreateSearchSpec(TermMatchType::EXACT_ONLY), + CreateScoringSpec(/*is_descending_order=*/true), + result_spec, document_store()); + + // Only the top ranked document in "namespace" (document2), should be + // returned. + EXPECT_THAT(result_state.GetNextPage(document_store()), + ElementsAre(EqualsScoredDocumentHit(scored_hit_2))); +} + +TEST_F(ResultStateTest, ResultGroupingDoesNotLimitOtherNamespaceResults) { + // Creates 4 documents and ensures the relationship in terms of document + // score is: document1 < document2 < document3 < document4 + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri/1") + .SetSchema("Document") + .SetScore(1) + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace1", "uri/2") + .SetSchema("Document") + .SetScore(2) + .Build(); + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace2", "uri/3") + .SetSchema("Document") + .SetScore(3) + .Build(); + DocumentProto document4 = DocumentBuilder() + .SetKey("namespace2", "uri/4") + .SetSchema("Document") + .SetScore(4) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + document_store().Put(document1)); + ScoredDocumentHit scored_hit_1(document_id1, kSectionIdMaskNone, + document1.score()); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + document_store().Put(document2)); + ScoredDocumentHit scored_hit_2(document_id2, kSectionIdMaskNone, + document2.score()); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, + document_store().Put(document3)); + ScoredDocumentHit scored_hit_3(document_id3, kSectionIdMaskNone, + document3.score()); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, + document_store().Put(document4)); + ScoredDocumentHit scored_hit_4(document_id4, kSectionIdMaskNone, + document4.score()); + std::vector<ScoredDocumentHit> scored_document_hits = { + scored_hit_4, scored_hit_3, scored_hit_2, scored_hit_1}; + + // Create a 
ResultSpec that limits "namespace1" to a single result, but + // doesn't limit "namespace2". + ResultSpecProto result_spec; + result_spec.set_num_per_page(5); + ResultSpecProto::ResultGrouping* result_grouping = + result_spec.add_result_groupings(); + result_grouping->set_max_results(1); + result_grouping->add_namespaces("namespace1"); + + // Creates a ResultState with 4 ScoredDocumentHits. + ResultState result_state(scored_document_hits, /*query_terms=*/{}, + CreateSearchSpec(TermMatchType::EXACT_ONLY), + CreateScoringSpec(/*is_descending_order=*/true), + result_spec, document_store()); + + // Only the top ranked document in "namespace" (document2), should be + // returned. + EXPECT_THAT(result_state.GetNextPage(document_store()), + ElementsAre(EqualsScoredDocumentHit(scored_hit_4), + EqualsScoredDocumentHit(scored_hit_3), + EqualsScoredDocumentHit(scored_hit_2))); +} + +TEST_F(ResultStateTest, ResultGroupingNonexistentNamespaceShouldBeIgnored) { + // Creates 2 documents and ensures the relationship in terms of document + // score is: document1 < document2 + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace", "uri/1") + .SetSchema("Document") + .SetScore(1) + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace", "uri/2") + .SetSchema("Document") + .SetScore(2) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + document_store().Put(document1)); + ScoredDocumentHit scored_hit_1(document_id1, kSectionIdMaskNone, + document1.score()); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + document_store().Put(document2)); + ScoredDocumentHit scored_hit_2(document_id2, kSectionIdMaskNone, + document2.score()); + std::vector<ScoredDocumentHit> scored_document_hits = {scored_hit_2, + scored_hit_1}; + + // Create a ResultSpec that limits "namespace"+"nonExistentNamespace" to a + // single result. 
+ ResultSpecProto result_spec; + result_spec.set_num_per_page(5); + ResultSpecProto::ResultGrouping* result_grouping = + result_spec.add_result_groupings(); + result_grouping->set_max_results(1); + result_grouping->add_namespaces("namespace"); + result_grouping->add_namespaces("nonexistentNamespace"); + + // Creates a ResultState with 2 ScoredDocumentHits. + ResultState result_state(scored_document_hits, /*query_terms=*/{}, + CreateSearchSpec(TermMatchType::EXACT_ONLY), + CreateScoringSpec(/*is_descending_order=*/true), + result_spec, document_store()); + + // Only the top ranked document in "namespace" (document2), should be + // returned. The presence of "nonexistentNamespace" in the same result + // grouping should have no effect. + EXPECT_THAT(result_state.GetNextPage(document_store()), + ElementsAre(EqualsScoredDocumentHit(scored_hit_2))); +} + +TEST_F(ResultStateTest, ResultGroupingMultiNamespaceGrouping) { + // Creates 6 documents and ensures the relationship in terms of document + // score is: document1 < document2 < document3 < document4 < document5 < + // document6 + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri/1") + .SetSchema("Document") + .SetScore(1) + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace1", "uri/2") + .SetSchema("Document") + .SetScore(2) + .Build(); + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace2", "uri/3") + .SetSchema("Document") + .SetScore(3) + .Build(); + DocumentProto document4 = DocumentBuilder() + .SetKey("namespace2", "uri/4") + .SetSchema("Document") + .SetScore(4) + .Build(); + DocumentProto document5 = DocumentBuilder() + .SetKey("namespace3", "uri/5") + .SetSchema("Document") + .SetScore(5) + .Build(); + DocumentProto document6 = DocumentBuilder() + .SetKey("namespace3", "uri/6") + .SetSchema("Document") + .SetScore(6) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + document_store().Put(document1)); + ScoredDocumentHit 
scored_hit_1(document_id1, kSectionIdMaskNone, + document1.score()); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + document_store().Put(document2)); + ScoredDocumentHit scored_hit_2(document_id2, kSectionIdMaskNone, + document2.score()); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3, + document_store().Put(document3)); + ScoredDocumentHit scored_hit_3(document_id3, kSectionIdMaskNone, + document3.score()); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4, + document_store().Put(document4)); + ScoredDocumentHit scored_hit_4(document_id4, kSectionIdMaskNone, + document4.score()); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5, + document_store().Put(document5)); + ScoredDocumentHit scored_hit_5(document_id5, kSectionIdMaskNone, + document5.score()); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id6, + document_store().Put(document6)); + ScoredDocumentHit scored_hit_6(document_id6, kSectionIdMaskNone, + document6.score()); + std::vector<ScoredDocumentHit> scored_document_hits = { + scored_hit_6, scored_hit_5, scored_hit_4, + scored_hit_3, scored_hit_2, scored_hit_1}; + + // Create a ResultSpec that limits "namespace1" to a single result and limits + // "namespace2"+"namespace3" to a total of two results. + ResultSpecProto result_spec; + result_spec.set_num_per_page(5); + ResultSpecProto::ResultGrouping* result_grouping = + result_spec.add_result_groupings(); + result_grouping->set_max_results(1); + result_grouping->add_namespaces("namespace1"); + result_grouping = result_spec.add_result_groupings(); + result_grouping->set_max_results(2); + result_grouping->add_namespaces("namespace2"); + result_grouping->add_namespaces("namespace3"); + + // Creates a ResultState with 4 ScoredDocumentHits. 
+ ResultState result_state(scored_document_hits, /*query_terms=*/{}, + CreateSearchSpec(TermMatchType::EXACT_ONLY), + CreateScoringSpec(/*is_descending_order=*/true), + result_spec, document_store()); + + // Only the top-ranked result in "namespace1" (document2) should be returned. + // Only the top-ranked results across "namespace2" and "namespace3" + // (document6, document5) should be returned. + EXPECT_THAT(result_state.GetNextPage(document_store()), + ElementsAre(EqualsScoredDocumentHit(scored_hit_6), + EqualsScoredDocumentHit(scored_hit_5), + EqualsScoredDocumentHit(scored_hit_2))); +} + +TEST_F(ResultStateTest, ResultGroupingOnlyNonexistentNamespaces) { + // Creates 2 documents and ensures the relationship in terms of document + // score is: document1 < document2 + DocumentProto document1 = DocumentBuilder() + .SetKey("namespace", "uri/1") + .SetSchema("Document") + .SetScore(1) + .Build(); + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace", "uri/2") + .SetSchema("Document") + .SetScore(2) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1, + document_store().Put(document1)); + ScoredDocumentHit scored_hit_1(document_id1, kSectionIdMaskNone, + document1.score()); + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2, + document_store().Put(document2)); + ScoredDocumentHit scored_hit_2(document_id2, kSectionIdMaskNone, + document2.score()); + std::vector<ScoredDocumentHit> scored_document_hits = {scored_hit_2, + scored_hit_1}; + + // Create a ResultSpec that limits "nonexistentNamespace" to a single result. + // but doesn't limit "namespace" + ResultSpecProto result_spec; + result_spec.set_num_per_page(5); + ResultSpecProto::ResultGrouping* result_grouping = + result_spec.add_result_groupings(); + result_grouping->set_max_results(1); + result_grouping->add_namespaces("nonexistentNamespace"); + + // Creates a ResultState with 2 ScoredDocumentHits. 
+ ResultState result_state(scored_document_hits, /*query_terms=*/{}, + CreateSearchSpec(TermMatchType::EXACT_ONLY), + CreateScoringSpec(/*is_descending_order=*/true), + result_spec, document_store()); + + // All documents in "namespace" should be returned. The presence of + // "nonexistentNamespace" should have no effect. + EXPECT_THAT(result_state.GetNextPage(document_store()), + ElementsAre(EqualsScoredDocumentHit(scored_hit_2), + EqualsScoredDocumentHit(scored_hit_1))); } } // namespace diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc b/icing/result/snippet-retriever-test-jni-layer.cc index 8392363..707d9ee 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc +++ b/icing/result/snippet-retriever-test-jni-layer.cc @@ -21,12 +21,11 @@ JNIEnv* g_jenv = nullptr; extern "C" JNIEXPORT jboolean JNICALL -Java_icing_tokenization_reverse_1jni_ReverseJniLanguageSegmenterTest_testsMain( - JNIEnv* env, jclass ignored) { +Java_icing_jni_SnippetRetrieverJniTest_testsMain(JNIEnv* env, jclass ignored) { g_jenv = env; std::vector<char*> my_argv; - char arg[] = "reverse-jni-language-segmenter-test-lib"; + char arg[] = "jni-test-lib"; my_argv.push_back(arg); int argc = 1; char** argv = &(my_argv[0]); diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc index d4a5f79..33b343e 100644 --- a/icing/result/snippet-retriever.cc +++ b/icing/result/snippet-retriever.cc @@ -15,6 +15,7 @@ #include "icing/result/snippet-retriever.h" #include <algorithm> +#include <iterator> #include <memory> #include <string> #include <string_view> @@ -25,9 +26,12 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/absl_ports/str_join.h" #include "icing/proto/term.pb.h" #include "icing/query/query-terms.h" #include "icing/schema/schema-store.h" +#include "icing/schema/section-manager.h" #include 
"icing/schema/section.h" #include "icing/store/document-filter-data.h" #include "icing/tokenization/language-segmenter.h" @@ -35,6 +39,7 @@ #include "icing/tokenization/tokenizer-factory.h" #include "icing/tokenization/tokenizer.h" #include "icing/transform/normalizer.h" +#include "icing/util/character-iterator.h" #include "icing/util/i18n-utils.h" #include "icing/util/status-macros.h" @@ -43,10 +48,47 @@ namespace lib { namespace { +const PropertyProto* GetProperty(const DocumentProto& document, + std::string_view property_name) { + for (const PropertyProto& property : document.properties()) { + if (property.name() == property_name) { + return &property; + } + } + return nullptr; +} + +inline std::string AddPropertyToPath(const std::string& current_path, + std::string_view property) { + if (current_path.empty()) { + return std::string(property); + } + return absl_ports::StrCat(current_path, kPropertySeparator, property); +} + +inline std::string AddIndexToPath(int values_size, int index, + const std::string& property_path) { + if (values_size == 1) { + return property_path; + } + return absl_ports::StrCat(property_path, kLBracket, std::to_string(index), + kRBracket); +} + class TokenMatcher { public: virtual ~TokenMatcher() = default; - virtual bool Matches(Token token) const = 0; + + // Returns a CharacterIterator pointing just past the end of the substring in + // token.text that matches a query term. Note that the utf* indices will be + // in relation to token.text's start. + // + // If there is no match, then it will construct a CharacterIterator with all + // of its indices set to -1. + // + // Ex. With an exact matcher, query terms=["foo","bar"] and token.text="bar", + // Matches will return a CharacterIterator(u8:3, u16:3, u32:3). 
+ virtual CharacterIterator Matches(Token token) const = 0; }; class TokenMatcherExact : public TokenMatcher { @@ -59,10 +101,17 @@ class TokenMatcherExact : public TokenMatcher { restricted_query_terms_(restricted_query_terms), normalizer_(normalizer) {} - bool Matches(Token token) const override { + CharacterIterator Matches(Token token) const override { std::string s = normalizer_.NormalizeTerm(token.text); - return (unrestricted_query_terms_.count(s) > 0) || - (restricted_query_terms_.count(s) > 0); + auto itr = unrestricted_query_terms_.find(s); + if (itr == unrestricted_query_terms_.end()) { + itr = restricted_query_terms_.find(s); + } + if (itr != unrestricted_query_terms_.end() && + itr != restricted_query_terms_.end()) { + return normalizer_.CalculateNormalizedMatchLength(token.text, *itr); + } + return CharacterIterator(token.text, -1, -1, -1); } private: @@ -81,22 +130,23 @@ class TokenMatcherPrefix : public TokenMatcher { restricted_query_terms_(restricted_query_terms), normalizer_(normalizer) {} - bool Matches(Token token) const override { + CharacterIterator Matches(Token token) const override { std::string s = normalizer_.NormalizeTerm(token.text); - if (std::any_of(unrestricted_query_terms_.begin(), - unrestricted_query_terms_.end(), - [&s](const std::string& term) { - return term.length() <= s.length() && - s.compare(0, term.length(), term) == 0; - })) { - return true; + for (const std::string& query_term : unrestricted_query_terms_) { + if (query_term.length() <= s.length() && + s.compare(0, query_term.length(), query_term) == 0) { + return normalizer_.CalculateNormalizedMatchLength(token.text, + query_term); + } + } + for (const std::string& query_term : restricted_query_terms_) { + if (query_term.length() <= s.length() && + s.compare(0, query_term.length(), query_term) == 0) { + return normalizer_.CalculateNormalizedMatchLength(token.text, + query_term); + } } - return std::any_of(restricted_query_terms_.begin(), - restricted_query_terms_.end(), 
- [&s](const std::string& term) { - return term.length() <= s.length() && - s.compare(0, term.length(), term) == 0; - }); + return CharacterIterator(token.text, -1, -1, -1); } private: @@ -124,110 +174,170 @@ libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> CreateTokenMatcher( } } -// Returns true if token matches any of the terms in query terms according to -// the provided match type. +// Finds the start position of a valid token that is after +// window_start_min_exclusive_utf32 // // Returns: // the position of the window start if successful // INTERNAL_ERROR - if a tokenizer error is encountered -libtextclassifier3::StatusOr<int> DetermineWindowStart( +libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart( const ResultSpecProto::SnippetSpecProto& snippet_spec, - std::string_view value, int match_mid, Tokenizer::Iterator* iterator) { - int window_start_min = (match_mid - snippet_spec.max_window_bytes() / 2) - 1; - if (window_start_min < 0) { - return 0; - } - if (!iterator->ResetToTokenAfter(window_start_min)) { + std::string_view value, int window_start_min_exclusive_utf32, + Tokenizer::Iterator* iterator) { + if (!iterator->ResetToTokenAfter(window_start_min_exclusive_utf32)) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } - return iterator->GetToken().text.data() - value.data(); + return iterator->CalculateTokenStart(); } // Increments window_end_exclusive so long as the character at the position // of window_end_exclusive is punctuation and does not exceed -// window_end_max_exclusive. -int IncludeTrailingPunctuation(std::string_view value, int window_end_exclusive, - int window_end_max_exclusive) { - while (window_end_exclusive < window_end_max_exclusive) { +// window_end_max_exclusive_utf32. 
+CharacterIterator IncludeTrailingPunctuation( + std::string_view value, CharacterIterator window_end_exclusive, + int window_end_max_exclusive_utf32) { + while (window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) { int char_len = 0; - if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive, &char_len)) { - break; - } - if (window_end_exclusive + char_len > window_end_max_exclusive) { - // This is punctuation, but it goes beyond the window end max. Don't - // include it. + if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive.utf8_index(), + &char_len)) { break; } // Expand window by char_len and check the next character. - window_end_exclusive += char_len; + window_end_exclusive.AdvanceToUtf32(window_end_exclusive.utf32_index() + 1); } return window_end_exclusive; } +// Finds the end position of a valid token that is before the +// window_end_max_exclusive_utf32. +// // Returns: // the position of the window end if successful // INTERNAL_ERROR - if a tokenizer error is encountered -libtextclassifier3::StatusOr<int> DetermineWindowEnd( +libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd( const ResultSpecProto::SnippetSpecProto& snippet_spec, - std::string_view value, int match_mid, Tokenizer::Iterator* iterator) { - int window_end_max_exclusive = - match_mid + snippet_spec.max_window_bytes() / 2; - if (window_end_max_exclusive >= value.length()) { - return value.length(); - } - if (!iterator->ResetToTokenBefore(window_end_max_exclusive)) { + std::string_view value, int window_end_max_exclusive_utf32, + Tokenizer::Iterator* iterator) { + if (!iterator->ResetToTokenBefore(window_end_max_exclusive_utf32)) { return absl_ports::InternalError( "Couldn't reset tokenizer to determine snippet window!"); } - int window_end_exclusive = iterator->GetToken().text.data() - value.data() + - iterator->GetToken().text.length(); - return IncludeTrailingPunctuation(value, window_end_exclusive, - window_end_max_exclusive); + 
ICING_ASSIGN_OR_RETURN(CharacterIterator end_exclusive, + iterator->CalculateTokenEndExclusive()); + return IncludeTrailingPunctuation(value, end_exclusive, + window_end_max_exclusive_utf32); } struct SectionData { std::string_view section_name; std::string_view section_subcontent; - // Identifies which subsection of the section content, section_subcontent has - // come from. - // Ex. "recipient.address" : - // ["foo@google.com", "bar@google.com", "baz@google.com"] - // The subcontent_index of "bar@google.com" is 1. - int subcontent_index; }; +// Creates a snippet match proto for the match pointed to by the iterator and +// char_iterator +// +// Returns: +// the position of the window start if successful +// INTERNAL_ERROR - if a tokenizer error is encountered and iterator is left +// in an invalid state +// ABORTED_ERROR - if an invalid utf-8 sequence is encountered libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch( const ResultSpecProto::SnippetSpecProto& snippet_spec, - const SectionData& value, Tokenizer::Iterator* iterator) { + const SectionData& value, Tokenizer::Iterator* iterator, + const CharacterIterator& char_iterator) { SnippetMatchProto snippet_match; - snippet_match.set_values_index(value.subcontent_index); - - Token match = iterator->GetToken(); - int match_pos = match.text.data() - value.section_subcontent.data(); - int match_mid = match_pos + match.text.length() / 2; - - snippet_match.set_exact_match_position(match_pos); - snippet_match.set_exact_match_bytes(match.text.length()); - - if (snippet_spec.max_window_bytes() > match.text.length()) { + ICING_ASSIGN_OR_RETURN(CharacterIterator start_itr, + iterator->CalculateTokenStart()); + ICING_ASSIGN_OR_RETURN(CharacterIterator end_itr, + iterator->CalculateTokenEndExclusive()); + + // When finding boundaries, we have a few cases: + // + // Case 1: + // If we have an odd length match an odd length window, the window surrounds + // the match perfectly. 
+ // match = "bar" in "foo bar baz" + // window = |---| + // + // Case 2: + // If we have an even length match with an even length window, the window + // surrounds the match perfectly. + // match = "baar" in "foo baar baz" + // window = |----| + // + // Case 3: + // If we have an odd length match with an even length window, we allocate + // that extra window byte to the beginning. + // match = "bar" in "foo bar baz" + // window = |----| + // + // Case 4: + // If we have an even length match with an odd length window, we allocate + // that extra window byte to the end. + // match = "baar" in "foo baar baz" + // window = |-----| + // + // We have do +1/-1 below to get the math to match up. + int match_pos_utf32 = start_itr.utf32_index(); + int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32; + int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2; + int window_start_min_exclusive_utf32 = + (match_mid_utf32 - snippet_spec.max_window_bytes() / 2) - 1; + int window_end_max_exclusive_utf32 = + match_mid_utf32 + (snippet_spec.max_window_bytes() + 1) / 2; + + snippet_match.set_exact_match_byte_position(start_itr.utf8_index()); + snippet_match.set_exact_match_utf16_position(start_itr.utf16_index()); + snippet_match.set_exact_match_byte_length(end_itr.utf8_index() - + start_itr.utf8_index()); + snippet_match.set_exact_match_utf16_length(end_itr.utf16_index() - + start_itr.utf16_index()); + + // Only include windows if it'll at least include the matched text. Otherwise, + // it'll just be an empty string anyways. + if (snippet_spec.max_window_bytes() >= match_len_utf32) { // Find the beginning of the window. ICING_ASSIGN_OR_RETURN( - int window_start, - DetermineWindowStart(snippet_spec, value.section_subcontent, match_mid, - iterator)); - snippet_match.set_window_position(window_start); + CharacterIterator window_start, + DetermineWindowStart(snippet_spec, value.section_subcontent, + window_start_min_exclusive_utf32, iterator)); + + // Check. 
Did we get fewer characters than we requested? If so, then add it + // on to the window_end. + int extra_window_space = + window_start.utf32_index() - 1 - window_start_min_exclusive_utf32; + window_end_max_exclusive_utf32 += extra_window_space; // Find the end of the window. ICING_ASSIGN_OR_RETURN( - int window_end_exclusive, - DetermineWindowEnd(snippet_spec, value.section_subcontent, match_mid, - iterator)); - snippet_match.set_window_bytes(window_end_exclusive - window_start); + CharacterIterator window_end, + DetermineWindowEnd(snippet_spec, value.section_subcontent, + window_end_max_exclusive_utf32, iterator)); + + // Check one more time. Did we get fewer characters than we requested? If + // so, then see if we can push the start back again. + extra_window_space = + window_end_max_exclusive_utf32 - window_end.utf32_index(); + if (extra_window_space > 0) { + window_start_min_exclusive_utf32 = + window_start.utf32_index() - 1 - extra_window_space; + ICING_ASSIGN_OR_RETURN( + window_start, + DetermineWindowStart(snippet_spec, value.section_subcontent, + window_start_min_exclusive_utf32, iterator)); + } + + snippet_match.set_window_byte_position(window_start.utf8_index()); + snippet_match.set_window_utf16_position(window_start.utf16_index()); + snippet_match.set_window_byte_length(window_end.utf8_index() - + window_start.utf8_index()); + snippet_match.set_window_utf16_length(window_end.utf16_index() - + window_start.utf16_index()); // DetermineWindowStart/End may change the position of the iterator. So, // reset the iterator back to the original position. - bool success = (match_pos > 0) ? iterator->ResetToTokenAfter(match_pos - 1) + bool success = (match_pos_utf32 > 0) ? 
iterator->ResetToTokenAfter(match_pos_utf32 - 1) : iterator->ResetToStart(); if (!success) { return absl_ports::InternalError( @@ -243,33 +353,142 @@ struct MatchOptions { int max_matches_remaining; }; -libtextclassifier3::StatusOr<SnippetProto::EntryProto> RetrieveMatches( - const TokenMatcher* matcher, const MatchOptions& match_options, - const SectionData& value, const Tokenizer* tokenizer) { - SnippetProto::EntryProto snippet_entry; - snippet_entry.set_property_name(std::string(value.section_name)); - ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator, - tokenizer->Tokenize(value.section_subcontent)); - while (iterator->Advance()) { - if (snippet_entry.snippet_matches_size() >= - match_options.max_matches_remaining) { - break; +// Retrieves snippets in the string values of current_property. +// Tokenizer is provided to tokenize string content and matcher is provided to +// indicate when a token matches content in the query. +// +// current_property is the property with the string values to snippet. +// property_path is the path in the document to current_property. +// +// MatchOptions holds the snippet spec and number of desired matches remaining. +// Each call to GetEntriesFromProperty will decrement max_matches_remaining +// by the number of entries that it adds to snippet_proto. +// +// The SnippetEntries found for matched content will be added to snippet_proto. +void GetEntriesFromProperty(const PropertyProto* current_property, + const std::string& property_path, + const TokenMatcher* matcher, + const Tokenizer* tokenizer, + MatchOptions* match_options, + SnippetProto* snippet_proto) { + // We're at the end. Let's check our values. 
+ for (int i = 0; i < current_property->string_values_size(); ++i) { + SnippetProto::EntryProto snippet_entry; + snippet_entry.set_property_name(AddIndexToPath( + current_property->string_values_size(), /*index=*/i, property_path)); + std::string_view value = current_property->string_values(i); + std::unique_ptr<Tokenizer::Iterator> iterator = + tokenizer->Tokenize(value).ValueOrDie(); + CharacterIterator char_iterator(value); + while (iterator->Advance()) { + Token token = iterator->GetToken(); + CharacterIterator submatch_end = matcher->Matches(token); + // If the token matched a query term, then submatch_end will point to an + // actual position within token.text. + if (submatch_end.utf8_index() != -1) { + if (!char_iterator.AdvanceToUtf8(token.text.data() - value.data())) { + // We can't get the char_iterator to a valid position, so there's no + // way for us to provide valid utf-16 indices. There's nothing more we + // can do here, so just return whatever we've built up so far. + if (!snippet_entry.snippet_matches().empty()) { + *snippet_proto->add_entries() = std::move(snippet_entry); + } + return; + } + SectionData data = {property_path, value}; + auto match_or = RetrieveMatch(match_options->snippet_spec, data, + iterator.get(), char_iterator); + if (!match_or.ok()) { + if (absl_ports::IsAborted(match_or.status())) { + // Only an aborted. We can't get this match, but we might be able to + // retrieve others. Just continue. + continue; + } else { + // Probably an internal error. The tokenizer iterator is probably in + // an invalid state. There's nothing more we can do here, so just + // return whatever we've built up so far. + if (!snippet_entry.snippet_matches().empty()) { + *snippet_proto->add_entries() = std::move(snippet_entry); + } + return; + } + } + SnippetMatchProto match = std::move(match_or).ValueOrDie(); + // submatch_end refers to a position *within* token.text. 
+ // This, conveniently enough, means that index that submatch_end points + // to is the length of the submatch (because the submatch starts at 0 in + // token.text). + match.set_submatch_byte_length(submatch_end.utf8_index()); + match.set_submatch_utf16_length(submatch_end.utf16_index()); + // Add the values for the submatch. + snippet_entry.mutable_snippet_matches()->Add(std::move(match)); + + if (--match_options->max_matches_remaining <= 0) { + *snippet_proto->add_entries() = std::move(snippet_entry); + return; + } + } } - Token token = iterator->GetToken(); - if (matcher->Matches(token)) { - // If there was an error while retrieving the match, the tokenizer - // iterator is probably in an invalid state. There's nothing we can do - // here, so just return. - ICING_ASSIGN_OR_RETURN( - SnippetMatchProto match, - RetrieveMatch(match_options.snippet_spec, value, iterator.get())); - snippet_entry.mutable_snippet_matches()->Add(std::move(match)); + if (!snippet_entry.snippet_matches().empty()) { + *snippet_proto->add_entries() = std::move(snippet_entry); } } - if (snippet_entry.snippet_matches().empty()) { - return absl_ports::NotFoundError("No matches found in value!"); +} + +// Retrieves snippets in document from content at section_path. +// Tokenizer is provided to tokenize string content and matcher is provided to +// indicate when a token matches content in the query. +// +// section_path_index refers to the current property that is held by document. +// current_path is equivalent to the first section_path_index values in +// section_path, but with value indices present. +// +// For example, suppose that a hit appeared somewhere in the "bcc.emailAddress". +// The arguments for RetrieveSnippetForSection might be +// {section_path=["bcc", "emailAddress"], section_path_index=0, current_path=""} +// on the first call and +// {section_path=["bcc", "emailAddress"], section_path_index=1, +// current_path="bcc[1]"} on the second recursive call. 
+// +// MatchOptions holds the snippet spec and number of desired matches remaining. +// Each call to RetrieveSnippetForSection will decrement max_matches_remaining +// by the number of entries that it adds to snippet_proto. +// +// The SnippetEntries found for matched content will be added to snippet_proto. +void RetrieveSnippetForSection( + const DocumentProto& document, const TokenMatcher* matcher, + const Tokenizer* tokenizer, + const std::vector<std::string_view>& section_path, int section_path_index, + const std::string& current_path, MatchOptions* match_options, + SnippetProto* snippet_proto) { + std::string_view next_property_name = section_path.at(section_path_index); + const PropertyProto* current_property = + GetProperty(document, next_property_name); + if (current_property == nullptr) { + ICING_VLOG(1) << "No property " << next_property_name << " found at path " + << current_path; + return; + } + std::string property_path = + AddPropertyToPath(current_path, next_property_name); + if (section_path_index == section_path.size() - 1) { + // We're at the end. Let's check our values. + GetEntriesFromProperty(current_property, property_path, matcher, tokenizer, + match_options, snippet_proto); + } else { + // Still got more to go. Let's look through our subdocuments. + std::vector<SnippetProto::EntryProto> entries; + for (int i = 0; i < current_property->document_values_size(); ++i) { + std::string new_path = AddIndexToPath( + current_property->document_values_size(), /*index=*/i, property_path); + RetrieveSnippetForSection(current_property->document_values(i), matcher, + tokenizer, section_path, section_path_index + 1, + new_path, match_options, snippet_proto); + if (match_options->max_matches_remaining <= 0) { + break; + } + } } - return snippet_entry; } } // namespace @@ -304,6 +523,10 @@ SnippetProto SnippetRetriever::RetrieveSnippet( // Remove this section from the mask. 
section_id_mask &= ~(1u << section_id); + MatchOptions match_options = {snippet_spec}; + match_options.max_matches_remaining = + snippet_spec.num_matches_per_property(); + // Determine the section name and match type. auto section_metadata_or = schema_store_.GetSectionMetadata(type_id, section_id); @@ -311,7 +534,9 @@ SnippetProto SnippetRetriever::RetrieveSnippet( continue; } const SectionMetadata* metadata = section_metadata_or.ValueOrDie(); - MatchOptions match_options = {snippet_spec}; + std::vector<std::string_view> section_path = + absl_ports::StrSplit(metadata->path, kPropertySeparator); + // Match type must be as restrictive as possible. Prefix matches for a // snippet should only be included if both the query is Prefix and the // section has prefixes enabled. @@ -330,38 +555,18 @@ SnippetProto SnippetRetriever::RetrieveSnippet( if (!matcher_or.ok()) { continue; } - match_options.max_matches_remaining = - snippet_spec.num_matches_per_property(); + std::unique_ptr<TokenMatcher> matcher = std::move(matcher_or).ValueOrDie(); - // Retrieve values and snippet them. - auto values_or = - schema_store_.GetStringSectionContent(document, metadata->path); - if (!values_or.ok()) { - continue; - } auto tokenizer_or = tokenizer_factory::CreateIndexingTokenizer( metadata->tokenizer, &language_segmenter_); if (!tokenizer_or.ok()) { // If we couldn't create the tokenizer properly, just skip this section. continue; } - std::vector<std::string_view> values = values_or.ValueOrDie(); - for (int value_index = 0; value_index < values.size(); ++value_index) { - if (match_options.max_matches_remaining <= 0) { - break; - } - SectionData value = {metadata->path, values.at(value_index), value_index}; - auto entry_or = - RetrieveMatches(matcher_or.ValueOrDie().get(), match_options, value, - tokenizer_or.ValueOrDie().get()); - - // Drop any entries that encountered errors or didn't find any matches. 
- if (entry_or.ok()) { - match_options.max_matches_remaining -= - entry_or.ValueOrDie().snippet_matches_size(); - snippet_proto.mutable_entries()->Add(std::move(entry_or).ValueOrDie()); - } - } + std::unique_ptr<Tokenizer> tokenizer = std::move(tokenizer_or).ValueOrDie(); + RetrieveSnippetForSection( + document, matcher.get(), tokenizer.get(), section_path, + /*section_path_index=*/0, "", &match_options, &snippet_proto); } return snippet_proto; } diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc index ecda400..ad70038 100644 --- a/icing/result/snippet-retriever_test.cc +++ b/icing/result/snippet-retriever_test.cc @@ -24,22 +24,26 @@ #include "icing/file/mock-filesystem.h" #include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" +#include "icing/portable/platform.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/search.pb.h" +#include "icing/proto/term.pb.h" #include "icing/query/query-terms.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/schema/section-manager.h" #include "icing/store/document-id.h" #include "icing/store/key-mapper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" -#include "icing/testing/platform.h" +#include "icing/testing/jni-test-helpers.h" #include "icing/testing/snippet-helpers.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" +#include "icing/transform/map/map-normalizer.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" #include "unicode/uloc.h" @@ -49,10 +53,30 @@ namespace lib { namespace { +using ::testing::ElementsAre; using ::testing::Eq; using ::testing::IsEmpty; using ::testing::SizeIs; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL 
= + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; + +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; + +std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) { + std::vector<std::string_view> paths; + for (const SnippetProto::EntryProto& entry : snippet.entries()) { + paths.push_back(entry.property_name()); + } + return paths; +} + class SnippetRetrieverTest : public testing::Test { protected: void SetUp() override { @@ -66,7 +90,9 @@ class SnippetRetrieverTest : public testing::Test { GetTestFilePath("icing/icu.dat"))); } - language_segmenter_factory::SegmenterOptions options(ULOC_US); + jni_cache_ = GetTestJniCache(); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( language_segmenter_, language_segmenter_factory::Create(std::move(options))); @@ -75,25 +101,22 @@ class SnippetRetrieverTest : public testing::Test { ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - SchemaProto schema; - SchemaTypeConfigProto* type_config = schema.add_types(); - type_config->set_schema_type("email"); - PropertyConfigProto* prop_config = type_config->add_properties(); - prop_config->set_property_name("subject"); - prop_config->set_data_type(PropertyConfigProto::DataType::STRING); - prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - prop_config->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - prop_config->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - prop_config = type_config->add_properties(); - 
prop_config->set_property_name("body"); - prop_config->set_data_type(PropertyConfigProto::DataType::STRING); - prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - prop_config->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - prop_config->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + SchemaProto schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(schema)); ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create( @@ -121,6 +144,7 @@ class SnippetRetrieverTest : public testing::Test { std::unique_ptr<LanguageSegmenter> language_segmenter_; std::unique_ptr<SnippetRetriever> snippet_retriever_; std::unique_ptr<Normalizer> normalizer_; + std::unique_ptr<const JniCache> jni_cache_; ResultSpecProto::SnippetSpecProto snippet_spec_; std::string test_dir_; }; @@ -156,11 +180,65 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) { // "three". 
len=4, orig_window= "thre" snippet_spec_.set_max_window_bytes(4); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0), - Eq("")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("")); +} + +TEST_F(SnippetRetrieverTest, + SnippetingWindowMaxWindowSizeEqualToMatch_OddLengthMatch) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "counting") + .AddStringProperty("body", "one two three four.... five") + .Build(); + + SectionIdMask section_mask = 0b00000011; + SectionRestrictQueryTermsMap query_terms{{"", {"three"}}}; + + // Window starts at the beginning of "three" and at the exact end of + // "three". len=5, orig_window= "three" + snippet_spec_.set_max_window_bytes(5); + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + EXPECT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("three")); +} + +TEST_F(SnippetRetrieverTest, + SnippetingWindowMaxWindowSizeEqualToMatch_EvenLengthMatch) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "counting") + .AddStringProperty("body", "one two three four.... 
five") + .Build(); + + SectionIdMask section_mask = 0b00000011; + SectionRestrictQueryTermsMap query_terms{{"", {"four"}}}; + + // Window starts at the beginning of "four" and at the exact end of + // "four". len=4, orig_window= "four" + snippet_spec_.set_max_window_bytes(4); + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + EXPECT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("four")); } TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) { @@ -175,16 +253,25 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"three"}}}; - // Window starts at the space between "one" and "two". Window ends in the - // middle of "four". - // len=14, orig_window=" two three fou" + // String: "one two three four.... five" + // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 4 8 14 23 27 + // UTF-32 idx: 0 4 8 14 23 27 + // + // The window will be: + // 1. untrimmed, no-shifting window will be (2,17). + // 2. trimmed, no-shifting window [4,13) "two three" + // 3. 
trimmed, shifted window [4,18) "two three four" snippet_spec_.set_max_window_bytes(14); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0), - Eq("two three")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("two three four")); } TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) { @@ -199,15 +286,25 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"three"}}}; - // Window starts in the middle of "one" and ends at the end of "four". - // len=16, orig_window="e two three four" + // String: "one two three four.... five" + // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 4 8 14 23 27 + // UTF-32 idx: 0 4 8 14 23 27 + // + // The window will be: + // 1. untrimmed, no-shifting window will be (1,18). + // 2. trimmed, no-shifting window [4,18) "two three four" + // 3. trimmed, shifted window [4,20) "two three four.." 
snippet_spec_.set_max_window_bytes(16); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0), - Eq("two three four")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("two three four..")); } TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) { @@ -226,15 +323,18 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) { // len=20, orig_window="one two three four.." snippet_spec_.set_max_window_bytes(20); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0), - Eq("one two three four..")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("one two three four..")); } TEST_F(SnippetRetrieverTest, - SnippetingWindowMaxWindowEndsInMiddleOfMultiBytePunctuation) { + SnippetingWindowMaxWindowEndsMultiBytePunctuation) { DocumentProto document = DocumentBuilder() .SetKey("icing", "email/1") @@ -248,18 +348,21 @@ TEST_F(SnippetRetrieverTest, SectionRestrictQueryTermsMap query_terms{{"", {"in"}}}; // Window ends in the middle of all the punctuation and window starts at 0. 
- // len=26, orig_window="pside down in Australia\xC2" + // len=26, orig_window="pside down in Australia¿" snippet_spec_.set_max_window_bytes(24); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0), - Eq("down in Australia")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("down in Australia¿")); } TEST_F(SnippetRetrieverTest, - SnippetingWindowMaxWindowEndsInMultiBytePunctuation) { + SnippetingWindowMaxWindowBeyondMultiBytePunctuation) { DocumentProto document = DocumentBuilder() .SetKey("icing", "email/1") @@ -273,14 +376,17 @@ TEST_F(SnippetRetrieverTest, SectionRestrictQueryTermsMap query_terms{{"", {"in"}}}; // Window ends in the middle of all the punctuation and window starts at 0. 
- // len=26, orig_window="upside down in Australia\xC2\xBF" + // len=26, orig_window="upside down in Australia¿ " snippet_spec_.set_max_window_bytes(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0), - Eq("upside down in Australia¿")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("upside down in Australia¿")); } TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) { @@ -295,15 +401,25 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"three"}}}; - // Window starts before 0. - // len=22, orig_window="one two three four..." + // String: "one two three four.... five" + // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 4 8 14 23 27 + // UTF-32 idx: 0 4 8 14 23 27 + // + // The window will be: + // 1. untrimmed, no-shifting window will be (-2,21). + // 2. trimmed, no-shifting window [0,21) "one two three four..." + // 3. trimmed, shifted window [0,22) "one two three four...." 
snippet_spec_.set_max_window_bytes(22); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0), - Eq("one two three four...")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("one two three four....")); } TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) { @@ -322,11 +438,14 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) { // len=26, orig_window="one two three four.... " snippet_spec_.set_max_window_bytes(26); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0), - Eq("one two three four....")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("one two three four....")); } TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) { @@ -341,15 +460,25 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"three"}}}; - // Window ends in the middle of "five" - // len=32, orig_window="one two three four.... fiv" + // String: "one two three four.... 
five" + // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 4 8 14 23 27 + // UTF-32 idx: 0 4 8 14 23 27 + // + // The window will be: + // 1. untrimmed, no-shifting window will be ((-7,26). + // 2. trimmed, no-shifting window [0,26) "one two three four...." + // 3. trimmed, shifted window [0,27) "one two three four.... five" snippet_spec_.set_max_window_bytes(32); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0), - Eq("one two three four....")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("one two three four.... five")); } TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) { @@ -368,11 +497,14 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) { // len=34, orig_window="one two three four.... five" snippet_spec_.set_max_window_bytes(34); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0), - Eq("one two three four.... five")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("one two three four.... 
five")); } TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) { @@ -391,11 +523,150 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) { // len=36, orig_window="one two three four.... five" snippet_spec_.set_max_window_bytes(36); SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + EXPECT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("one two three four.... five")); +} + +TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "counting") + .AddStringProperty("body", "one two three four.... five six") + .Build(); + + SectionIdMask section_mask = 0b00000011; + SectionRestrictQueryTermsMap query_terms{{"", {"two"}}}; + + // String: "one two three four.... five six" + // ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 4 8 14 23 28 31 + // UTF-32 idx: 0 4 8 14 23 28 31 + // + // Window size will go past the start of the window. + // The window will be: + // 1. untrimmed, no-shifting window will be (-10,19). + // 2. trimmed, no-shifting window [0,19) "one two three four." + // 3. trimmed, shifted window [0,27) "one two three four.... 
five" + snippet_spec_.set_max_window_bytes(28); + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + EXPECT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("one two three four.... five")); +} + +TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "counting") + .AddStringProperty("body", "one two three four.... five six") + .Build(); + + SectionIdMask section_mask = 0b00000011; + SectionRestrictQueryTermsMap query_terms{{"", {"five"}}}; + + // String: "one two three four.... five six" + // ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 4 8 14 23 28 31 + // UTF-32 idx: 0 4 8 14 23 28 31 + // + // Window size will go past the end of the window. + // The window will be: + // 1. untrimmed, no-shifting window will be (10,39). + // 2. trimmed, no-shifting window [14,31) "four.... five six" + // 3. trimmed, shifted window [4,31) "two three four.... five six" + snippet_spec_.set_max_window_bytes(28); + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0), - Eq("one two three four.... five")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("two three four.... 
five six")); +} + +TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "counting") + .AddStringProperty("body", "one two three four....") + .Build(); + + SectionIdMask section_mask = 0b00000011; + SectionRestrictQueryTermsMap query_terms{{"", {"two"}}}; + + // String: "one two three four...." + // ^ ^ ^ ^ ^ + // UTF-8 idx: 0 4 8 14 22 + // UTF-32 idx: 0 4 8 14 22 + // + // Window size will go past the start of the window. + // The window will be: + // 1. untrimmed, no-shifting window will be (-10,19). + // 2. trimmed, no-shifting window [0, 19) "one two three four." + // 3. trimmed, shifted window [0, 22) "one two three four...." + snippet_spec_.set_max_window_bytes(28); + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + EXPECT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("one two three four....")); +} + +TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) { + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "counting") + .AddStringProperty("body", "one two three four....") + .Build(); + + SectionIdMask section_mask = 0b00000011; + SectionRestrictQueryTermsMap query_terms{{"", {"four"}}}; + + // String: "one two three four...." + // ^ ^ ^ ^ ^ + // UTF-8 idx: 0 4 8 14 22 + // UTF-32 idx: 0 4 8 14 22 + // + // Window size will go past the start of the window. + // The window will be: + // 1. untrimmed, no-shifting window will be (1,30). + // 2. trimmed, no-shifting window [4, 22) "two three four...." + // 3. 
trimmed, shifted window [0, 22) "one two three four...." + snippet_spec_.set_max_window_bytes(28); + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + EXPECT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("one two three four....")); } TEST_F(SnippetRetrieverTest, PrefixSnippeting) { @@ -409,14 +680,21 @@ TEST_F(SnippetRetrieverTest, PrefixSnippeting) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"f"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::PREFIX, snippet_spec_, document, - section_mask); + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); // Check the snippets. 'f' should match prefix-enabled property 'subject', but // not exact-only property 'body' EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo")); - EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("subject foo")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo")); + + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f")); + } } TEST_F(SnippetRetrieverTest, ExactSnippeting) { @@ -431,8 +709,7 @@ TEST_F(SnippetRetrieverTest, ExactSnippeting) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"f"}}}; SnippetProto snippet = 
snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), IsEmpty()); @@ -452,13 +729,18 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "subject", 0), IsEmpty()); - EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo")); + } } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) { @@ -471,23 +753,53 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) { "Concerning the subject of foo, we need to begin " "considering our options regarding body bar.") .Build(); + // String: "Concerning the subject of foo, we need to begin considering " + // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48 + // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48 + // + // String ctd: "our options regarding body bar." 
+ // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 60 64 72 82 87 91 + // UTF-32 idx: 60 64 72 82 87 91 SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::PREFIX, snippet_spec_, document, - section_mask); + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), SizeIs(2)); - EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo")); - EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + // The first window will be: + // 1. untrimmed, no-shifting window will be (-6,59). + // 2. trimmed, no-shifting window [0, 59) "Concerning... considering". + // 3. trimmed, shifted window [0, 63) "Concerning... our" + // The second window will be: + // 1. untrimmed, no-shifting window will be (54,91). + // 2. trimmed, no-shifting window [60, 91) "our... bar.". + // 3. trimmed, shifted window [31, 91) "we... bar." 
EXPECT_THAT( - GetWindow(document, snippet, "body", 0), - Eq("Concerning the subject of foo, we need to begin considering")); - EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("foo")); - EXPECT_THAT(GetWindow(document, snippet, "body", 1), - Eq("our options regarding body bar.")); - EXPECT_THAT(GetMatch(document, snippet, "body", 1), Eq("bar")); + GetWindows(content, snippet.entries(0)), + ElementsAre( + "Concerning the subject of foo, we need to begin considering our", + "we need to begin considering our options regarding body bar.")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), + ElementsAre("foo", "bar")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("foo", "bar")); + } + + EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); + content = GetString(&document, snippet.entries(1).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(1)), + ElementsAre("subject foo")); + EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo")); + } } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) { @@ -500,23 +812,47 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) { "Concerning the subject of foo, we need to begin " "considering our options regarding body bar.") .Build(); + // String: "Concerning the subject of foo, we need to begin considering " + // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48 + // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48 + // + // String ctd: "our options regarding body bar." 
+ // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 60 64 72 82 87 91 + // UTF-32 idx: 60 64 72 82 87 91 + // // Section 1 "subject" is not in the section_mask, so no snippet information // from that section should be returned by the SnippetRetriever. SectionIdMask section_mask = 0b00000001; SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::PREFIX, snippet_spec_, document, - section_mask); + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), SizeIs(1)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + // The first window will be: + // 1. untrimmed, no-shifting window will be (-6,59). + // 2. trimmed, no-shifting window [0, 59) "Concerning... considering". + // 3. trimmed, shifted window [0, 63) "Concerning... our" + // The second window will be: + // 1. untrimmed, no-shifting window will be (54,91). + // 2. trimmed, no-shifting window [60, 91) "our... bar.". + // 3. trimmed, shifted window [31, 91) "we... bar." 
EXPECT_THAT( - GetWindow(document, snippet, "body", 0), - Eq("Concerning the subject of foo, we need to begin considering")); - EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("foo")); - EXPECT_THAT(GetWindow(document, snippet, "body", 1), - Eq("our options regarding body bar.")); - EXPECT_THAT(GetMatch(document, snippet, "body", 1), Eq("bar")); + GetWindows(content, snippet.entries(0)), + ElementsAre( + "Concerning the subject of foo, we need to begin considering our", + "we need to begin considering our options regarding body bar.")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), + ElementsAre("foo", "bar")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("foo", "bar")); + } } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) { @@ -529,6 +865,15 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) { "Concerning the subject of foo, we need to begin " "considering our options regarding body bar.") .Build(); + // String: "Concerning the subject of foo, we need to begin considering " + // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48 + // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48 + // + // String ctd: "our options regarding body bar." 
+ // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 60 64 72 82 87 91 + // UTF-32 idx: 60 64 72 82 87 91 SectionIdMask section_mask = 0b00000011; // "subject" should match in both sections, but "foo" is restricted to "body" // so it should only match in the 'body' section and not the 'subject' @@ -536,25 +881,42 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) { SectionRestrictQueryTermsMap query_terms{{"", {"subject"}}, {"body", {"foo"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::PREFIX, snippet_spec_, document, - section_mask); + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), SizeIs(2)); - // 'subject' section should only have the one match for "subject". - EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo")); - EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("subject")); - EXPECT_THAT(GetWindow(document, snippet, "subject", 1), IsEmpty()); - EXPECT_THAT(GetMatch(document, snippet, "subject", 1), IsEmpty()); - - // 'body' section should have matches for "subject" and "foo". - EXPECT_THAT(GetWindow(document, snippet, "body", 0), - Eq("Concerning the subject of foo, we need to begin")); - EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("subject")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + // The first window will be: + // 1. untrimmed, no-shifting window will be (-15,50). + // 2. trimmed, no-shifting window [0, 47) "Concerning... begin". + // 3. trimmed, shifted window [0, 63) "Concerning... our" + // The second window will be: + // 1. untrimmed, no-shifting window will be (-6,59). + // 2. trimmed, no-shifting window [0, 59) "Concerning... considering". + // 3. trimmed, shifted window [0, 63) "Concerning... 
our" EXPECT_THAT( - GetWindow(document, snippet, "body", 1), - Eq("Concerning the subject of foo, we need to begin considering")); - EXPECT_THAT(GetMatch(document, snippet, "body", 1), Eq("foo")); + GetWindows(content, snippet.entries(0)), + ElementsAre( + "Concerning the subject of foo, we need to begin considering our", + "Concerning the subject of foo, we need to begin considering our")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), + ElementsAre("subject", "foo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("subject", "foo")); + } + + EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); + content = GetString(&document, snippet.entries(1).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(1)), + ElementsAre("subject foo")); + EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("subject")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), + ElementsAre("subject")); + } } TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) { @@ -568,24 +930,48 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) { "considering our options regarding body bar.") .Build(); + // String: "Concerning the subject of foo, we need to begin considering " + // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48 + // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48 + // + // String ctd: "our options regarding body bar." 
+ // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 60 64 72 82 87 91 + // UTF-32 idx: 60 64 72 82 87 91 snippet_spec_.set_num_matches_per_property(1); SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::PREFIX, snippet_spec_, document, - section_mask); + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); // Check the snippets EXPECT_THAT(snippet.entries(), SizeIs(2)); - EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo")); - EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + // The window will be: + // 1. untrimmed, no-shifting window will be (-6,59). + // 2. trimmed, no-shifting window [0, 59) "Concerning... considering". + // 3. trimmed, shifted window [0, 63) "Concerning... 
our" EXPECT_THAT( - GetWindow(document, snippet, "body", 0), - Eq("Concerning the subject of foo, we need to begin considering")); - EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("foo")); - EXPECT_THAT(GetWindow(document, snippet, "body", 1), IsEmpty()); - EXPECT_THAT(GetMatch(document, snippet, "body", 1), IsEmpty()); + GetWindows(content, snippet.entries(0)), + ElementsAre( + "Concerning the subject of foo, we need to begin considering our")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo")); + } + + EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject")); + content = GetString(&document, snippet.entries(1).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(1)), + ElementsAre("subject foo")); + EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo")); + } } TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) { @@ -599,12 +985,17 @@ TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"md"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::PREFIX, snippet_spec_, document, - section_mask); + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("MDI team")); - EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("MDI")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), 
ElementsAre("MDI team")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("MDI")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD")); + } } TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) { @@ -619,13 +1010,646 @@ TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) { SectionIdMask section_mask = 0b00000011; SectionRestrictQueryTermsMap query_terms{{"", {"zurich"}}}; SnippetProto snippet = snippet_retriever_->RetrieveSnippet( - query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document, - section_mask); + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); EXPECT_THAT(snippet.entries(), SizeIs(1)); - EXPECT_THAT(GetWindow(document, snippet, "body", 0), - Eq("Some members are in Zürich.")); - EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("Zürich")); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("body")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), + ElementsAre("Some members are in Zürich.")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("Zürich")); + + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("Zürich")); + } +} + +TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("SingleLevelType") + .AddProperty( + PropertyConfigBuilder() + .SetName("X") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() + .SetName("Y") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() + .SetName("Z") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) 
+ .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/true)); + ICING_ASSERT_OK_AND_ASSIGN( + snippet_retriever_, + SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), + normalizer_.get())); + + std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"}; + DocumentProto document; + document.set_schema("SingleLevelType"); + PropertyProto* prop = document.add_properties(); + prop->set_name("X"); + for (const std::string& s : string_values) { + prop->add_string_values(s); + } + prop = document.add_properties(); + prop->set_name("Y"); + for (const std::string& s : string_values) { + prop->add_string_values(s); + } + prop = document.add_properties(); + prop->set_name("Z"); + for (const std::string& s : string_values) { + prop->add_string_values(s); + } + + SectionIdMask section_mask = 0b00000111; + SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}}; + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + EXPECT_THAT(snippet.entries(), SizeIs(6)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("X[1]")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("polo")); + } + + EXPECT_THAT(snippet.entries(1).property_name(), Eq("X[3]")); + content = GetString(&document, snippet.entries(1).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); + EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); + + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { 
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), + ElementsAre("polo")); + } + + EXPECT_THAT(GetPropertyPaths(snippet), + ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]")); +} + +TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("SingleLevelType") + .AddProperty( + PropertyConfigBuilder() + .SetName("X") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() + .SetName("Y") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() + .SetName("Z") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("MultiLevelType") + .AddProperty(PropertyConfigBuilder() + .SetName("A") + .SetDataTypeDocument( + "SingleLevelType", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("B") + .SetDataTypeDocument( + "SingleLevelType", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("C") + .SetDataTypeDocument( + "SingleLevelType", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/true)); + ICING_ASSERT_OK_AND_ASSIGN( + snippet_retriever_, + SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), + normalizer_.get())); + + std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"}; + DocumentProto subdocument; + PropertyProto* prop = subdocument.add_properties(); + prop->set_name("X"); + for (const std::string& s : string_values) { + prop->add_string_values(s); + } + prop = subdocument.add_properties(); + 
prop->set_name("Y"); + for (const std::string& s : string_values) { + prop->add_string_values(s); + } + prop = subdocument.add_properties(); + prop->set_name("Z"); + for (const std::string& s : string_values) { + prop->add_string_values(s); + } + + DocumentProto document; + document.set_schema("MultiLevelType"); + prop = document.add_properties(); + prop->set_name("A"); + *prop->add_document_values() = subdocument; + + prop = document.add_properties(); + prop->set_name("B"); + *prop->add_document_values() = subdocument; + + prop = document.add_properties(); + prop->set_name("C"); + *prop->add_document_values() = subdocument; + + SectionIdMask section_mask = 0b111111111; + SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}}; + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + EXPECT_THAT(snippet.entries(), SizeIs(18)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("A.X[1]")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("polo")); + } + + EXPECT_THAT(snippet.entries(1).property_name(), Eq("A.X[3]")); + content = GetString(&document, snippet.entries(1).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); + EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), + ElementsAre("polo")); + } + + EXPECT_THAT( + GetPropertyPaths(snippet), + ElementsAre("A.X[1]", "A.X[3]", "A.Y[1]", "A.Y[3]", "A.Z[1]", "A.Z[3]", + "B.X[1]", "B.X[3]", "B.Y[1]", "B.Y[3]", 
"B.Z[1]", "B.Z[3]", + "C.X[1]", "C.X[3]", "C.Y[1]", "C.Y[3]", "C.Z[1]", "C.Z[3]")); +} + +TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("SingleLevelType") + .AddProperty( + PropertyConfigBuilder() + .SetName("X") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() + .SetName("Y") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty( + PropertyConfigBuilder() + .SetName("Z") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REPEATED))) + .AddType(SchemaTypeConfigBuilder() + .SetType("MultiLevelType") + .AddProperty(PropertyConfigBuilder() + .SetName("A") + .SetDataTypeDocument( + "SingleLevelType", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("B") + .SetDataTypeDocument( + "SingleLevelType", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("C") + .SetDataTypeDocument( + "SingleLevelType", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/true)); + ICING_ASSERT_OK_AND_ASSIGN( + snippet_retriever_, + SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), + normalizer_.get())); + + std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"}; + DocumentProto subdocument; + PropertyProto* prop = subdocument.add_properties(); + prop->set_name("X"); + for (const std::string& s : string_values) { + prop->add_string_values(s); + } + prop = subdocument.add_properties(); + prop->set_name("Y"); + for (const std::string& s : string_values) { + prop->add_string_values(s); + } + prop = 
subdocument.add_properties(); + prop->set_name("Z"); + for (const std::string& s : string_values) { + prop->add_string_values(s); + } + + DocumentProto document; + document.set_schema("MultiLevelType"); + prop = document.add_properties(); + prop->set_name("A"); + *prop->add_document_values() = subdocument; + *prop->add_document_values() = subdocument; + + prop = document.add_properties(); + prop->set_name("B"); + *prop->add_document_values() = subdocument; + *prop->add_document_values() = subdocument; + + prop = document.add_properties(); + prop->set_name("C"); + *prop->add_document_values() = subdocument; + *prop->add_document_values() = subdocument; + + SectionIdMask section_mask = 0b111111111; + SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}}; + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + EXPECT_THAT(snippet.entries(), SizeIs(36)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X[1]")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("polo")); + } + + EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[0].X[3]")); + content = GetString(&document, snippet.entries(1).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); + EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), + ElementsAre("polo")); + } + + EXPECT_THAT(GetPropertyPaths(snippet), + ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]", + "A[0].Y[1]", 
"A[0].Y[3]", "A[1].Y[1]", "A[1].Y[3]", + "A[0].Z[1]", "A[0].Z[3]", "A[1].Z[1]", "A[1].Z[3]", + "B[0].X[1]", "B[0].X[3]", "B[1].X[1]", "B[1].X[3]", + "B[0].Y[1]", "B[0].Y[3]", "B[1].Y[1]", "B[1].Y[3]", + "B[0].Z[1]", "B[0].Z[3]", "B[1].Z[1]", "B[1].Z[3]", + "C[0].X[1]", "C[0].X[3]", "C[1].X[1]", "C[1].X[3]", + "C[0].Y[1]", "C[0].Y[3]", "C[1].Y[1]", "C[1].Y[3]", + "C[0].Z[1]", "C[0].Z[3]", "C[1].Z[1]", "C[1].Z[3]")); +} + +TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("SingleLevelType") + .AddProperty( + PropertyConfigBuilder() + .SetName("X") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("Y") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("Z") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType("MultiLevelType") + .AddProperty(PropertyConfigBuilder() + .SetName("A") + .SetDataTypeDocument( + "SingleLevelType", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("B") + .SetDataTypeDocument( + "SingleLevelType", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("C") + .SetDataTypeDocument( + "SingleLevelType", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + ICING_ASSERT_OK(schema_store_->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/true)); + ICING_ASSERT_OK_AND_ASSIGN( + snippet_retriever_, + SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), + normalizer_.get())); + + DocumentProto subdocument; + PropertyProto* prop = 
subdocument.add_properties(); + prop->set_name("X"); + prop->add_string_values("polo"); + prop = subdocument.add_properties(); + prop->set_name("Y"); + prop->add_string_values("marco"); + prop = subdocument.add_properties(); + prop->set_name("Z"); + prop->add_string_values("polo"); + + DocumentProto document; + document.set_schema("MultiLevelType"); + prop = document.add_properties(); + prop->set_name("A"); + *prop->add_document_values() = subdocument; + *prop->add_document_values() = subdocument; + + prop = document.add_properties(); + prop->set_name("B"); + *prop->add_document_values() = subdocument; + *prop->add_document_values() = subdocument; + + prop = document.add_properties(); + prop->set_name("C"); + *prop->add_document_values() = subdocument; + *prop->add_document_values() = subdocument; + + SectionIdMask section_mask = 0b111111111; + SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}}; + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_EXACT, snippet_spec_, document, section_mask); + + EXPECT_THAT(snippet.entries(), SizeIs(12)); + EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo")); + EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), + ElementsAre("polo")); + } + + EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[1].X")); + content = GetString(&document, snippet.entries(1).property_name()); + EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo")); + EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), + 
ElementsAre("polo")); + } + + EXPECT_THAT( + GetPropertyPaths(snippet), + ElementsAre("A[0].X", "A[1].X", "A[0].Z", "A[1].Z", "B[0].X", "B[1].X", + "B[0].Z", "B[1].Z", "C[0].X", "C[1].X", "C[0].Z", "C[1].Z")); +} + +TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) { + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // UTF8 idx: 0 3 9 15 18 + // UTF16 idx: 0 1 3 5 6 + // Breaks into segments: "我", "每天", "走路", "去", "上班" + constexpr std::string_view kChinese = "我每天走路去上班。"; + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", kChinese) + .AddStringProperty("body", + "Concerning the subject of foo, we need to begin " + "considering our options regarding body bar.") + .Build(); + + SectionIdMask section_mask = 0b00000011; + SectionRestrictQueryTermsMap query_terms{{"", {"走"}}}; + + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + // Ensure that one and only one property was matched and it was "body" + ASSERT_THAT(snippet.entries(), SizeIs(1)); + const SnippetProto::EntryProto* entry = &snippet.entries(0); + EXPECT_THAT(entry->property_name(), Eq("subject")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + + // Ensure that there is one and only one match within "subject" + ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); + const SnippetMatchProto& match_proto = entry->snippet_matches(0); + + // Ensure that the match is correct. 
+ EXPECT_THAT(GetMatches(content, *entry), ElementsAre("走路")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("走")); + } + + // Ensure that the utf-16 values are also as expected + EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3)); + EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2)); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1)); + } +} + +TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) { + language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE, + jni_cache_.get()); + ICING_ASSERT_OK_AND_ASSIGN( + language_segmenter_, + language_segmenter_factory::Create(std::move(options))); + ICING_ASSERT_OK_AND_ASSIGN( + snippet_retriever_, + SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(), + normalizer_.get())); + + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // UTF8 idx: 0 3 9 15 18 + // UTF16 idx: 0 1 3 5 6 + // UTF32 idx: 0 1 3 5 6 + // Breaks into segments: "我", "每天", "走路", "去", "上班" + constexpr std::string_view kChinese = "我每天走路去上班。"; + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", kChinese) + .AddStringProperty("body", + "Concerning the subject of foo, we need to begin " + "considering our options regarding body bar.") + .Build(); + + SectionIdMask section_mask = 0b00000011; + SectionRestrictQueryTermsMap query_terms{{"", {"走"}}}; + + // The window will be: + // 1. untrimmed, no-shifting window will be (0,7). + // 2. trimmed, no-shifting window [1, 6) "每天走路去". + // 3. 
trimmed, shifted window [0, 6) "我每天走路去" + snippet_spec_.set_max_window_bytes(6); + + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + // Ensure that one and only one property was matched and it was "body" + ASSERT_THAT(snippet.entries(), SizeIs(1)); + const SnippetProto::EntryProto* entry = &snippet.entries(0); + EXPECT_THAT(entry->property_name(), Eq("subject")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + + // Ensure that there is one and only one match within "subject" + ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); + const SnippetMatchProto& match_proto = entry->snippet_matches(0); + + // Ensure that the match is correct. + EXPECT_THAT(GetWindows(content, *entry), ElementsAre("我每天走路去")); + + // Ensure that the utf-16 values are also as expected + EXPECT_THAT(match_proto.window_utf16_position(), Eq(0)); + EXPECT_THAT(match_proto.window_utf16_length(), Eq(6)); +} + +TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) { + // The following string has four-byte UTF-8 characters. Most importantly, it + // is also two code units in UTF-16. 
+ // String: "𐀀𐀁 𐀂𐀃 𐀄" + // ^ ^ ^ + // UTF8 idx: 0 9 18 + // UTF16 idx: 0 5 10 + // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄" + constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄"; + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", kText) + .AddStringProperty("body", + "Concerning the subject of foo, we need to begin " + "considering our options regarding body bar.") + .Build(); + + SectionIdMask section_mask = 0b00000011; + SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}}; + + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + // Ensure that one and only one property was matched and it was "body" + ASSERT_THAT(snippet.entries(), SizeIs(1)); + const SnippetProto::EntryProto* entry = &snippet.entries(0); + EXPECT_THAT(entry->property_name(), Eq("subject")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + + // Ensure that there is one and only one match within "subject" + ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); + const SnippetMatchProto& match_proto = entry->snippet_matches(0); + + // Ensure that the match is correct. + EXPECT_THAT(GetMatches(content, *entry), ElementsAre("𐀂𐀃")); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("𐀂")); + } + + // Ensure that the utf-16 values are also as expected + EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5)); + EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4)); + if (normalizer_factory::GetNormalizerName() == MapNormalizer::kName) { + EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2)); + } +} + +TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) { + // The following string has four-byte UTF-8 characters. Most importantly, it + // is also two code units in UTF-16. 
+ // String: "𐀀𐀁 𐀂𐀃 𐀄" + // ^ ^ ^ + // UTF8 idx: 0 9 18 + // UTF16 idx: 0 5 10 + // UTF32 idx: 0 3 6 + // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄" + constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄"; + DocumentProto document = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", kText) + .AddStringProperty("body", + "Concerning the subject of foo, we need to begin " + "considering our options regarding body bar.") + .Build(); + + SectionIdMask section_mask = 0b00000011; + SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}}; + + // Set a six character window. This will produce a window like this: + // String: "𐀀𐀁 𐀂𐀃 𐀄" + // ^ ^ + // UTF8 idx: 9 22 + // UTF16 idx: 5 12 + // UTF32 idx: 3 7 + snippet_spec_.set_max_window_bytes(6); + + SnippetProto snippet = snippet_retriever_->RetrieveSnippet( + query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask); + + // Ensure that one and only one property was matched and it was "body" + ASSERT_THAT(snippet.entries(), SizeIs(1)); + const SnippetProto::EntryProto* entry = &snippet.entries(0); + EXPECT_THAT(entry->property_name(), Eq("subject")); + std::string_view content = + GetString(&document, snippet.entries(0).property_name()); + + // Ensure that there is one and only one match within "subject" + ASSERT_THAT(entry->snippet_matches(), SizeIs(1)); + const SnippetMatchProto& match_proto = entry->snippet_matches(0); + + // Ensure that the match is correct. 
+ EXPECT_THAT(GetWindows(content, *entry), ElementsAre("𐀂𐀃 𐀄")); + + // Ensure that the utf-16 values are also as expected + EXPECT_THAT(match_proto.window_utf16_position(), Eq(5)); + EXPECT_THAT(match_proto.window_utf16_length(), Eq(7)); } } // namespace diff --git a/icing/schema-builder.h b/icing/schema-builder.h new file mode 100644 index 0000000..59ed7c5 --- /dev/null +++ b/icing/schema-builder.h @@ -0,0 +1,130 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ICING_SCHEMA_BUILDER_H_ +#define ICING_SCHEMA_BUILDER_H_ + +#include <cstdint> +#include <initializer_list> +#include <string> +#include <string_view> +#include <utility> + +#include "icing/proto/schema.pb.h" + +namespace icing { +namespace lib { + +class PropertyConfigBuilder { + public: + PropertyConfigBuilder() = default; + explicit PropertyConfigBuilder(PropertyConfigProto property) + : property_(std::move(property)) {} + + PropertyConfigBuilder& SetName(std::string_view name) { + property_.set_property_name(std::string(name)); + return *this; + } + + PropertyConfigBuilder& SetDataType( + PropertyConfigProto::DataType::Code data_type) { + property_.set_data_type(data_type); + return *this; + } + + PropertyConfigBuilder& SetDataTypeString( + TermMatchType::Code match_type, + StringIndexingConfig::TokenizerType::Code tokenizer) { + property_.set_data_type(PropertyConfigProto::DataType::STRING); + property_.mutable_string_indexing_config()->set_term_match_type(match_type); + property_.mutable_string_indexing_config()->set_tokenizer_type(tokenizer); + return *this; + } + + PropertyConfigBuilder& SetDataTypeDocument(std::string_view schema_type, + bool index_nested_properties) { + property_.set_data_type(PropertyConfigProto::DataType::DOCUMENT); + property_.set_schema_type(std::string(schema_type)); + property_.mutable_document_indexing_config()->set_index_nested_properties( + index_nested_properties); + return *this; + } + + PropertyConfigBuilder& SetCardinality( + PropertyConfigProto::Cardinality::Code cardinality) { + property_.set_cardinality(cardinality); + return *this; + } + + PropertyConfigProto Build() const { return std::move(property_); } + + private: + PropertyConfigProto property_; +}; + +class SchemaTypeConfigBuilder { + public: + SchemaTypeConfigBuilder() = default; + SchemaTypeConfigBuilder(SchemaTypeConfigProto type_config) + : type_config_(std::move(type_config)) {} + + SchemaTypeConfigBuilder& SetType(std::string_view type) { + 
type_config_.set_schema_type(std::string(type)); + return *this; + } + + SchemaTypeConfigBuilder& SetVersion(int version) { + type_config_.set_version(version); + return *this; + } + + SchemaTypeConfigBuilder& AddProperty(PropertyConfigProto property) { + *type_config_.add_properties() = std::move(property); + return *this; + } + SchemaTypeConfigBuilder& AddProperty(PropertyConfigBuilder property_builder) { + *type_config_.add_properties() = property_builder.Build(); + return *this; + } + + SchemaTypeConfigProto Build() { return std::move(type_config_); } + + private: + SchemaTypeConfigProto type_config_; +}; + +class SchemaBuilder { + public: + SchemaBuilder() = default; + SchemaBuilder(SchemaProto schema) : schema_(std::move(schema)) {} + + SchemaBuilder& AddType(SchemaTypeConfigProto type) { + *schema_.add_types() = std::move(type); + return *this; + } + SchemaBuilder& AddType(SchemaTypeConfigBuilder type_builder) { + *schema_.add_types() = type_builder.Build(); + return *this; + } + + SchemaProto Build() { return std::move(schema_); } + + private: + SchemaProto schema_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_SCHEMA_BUILDER_H_ diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc index b43d2a4..3307638 100644 --- a/icing/schema/schema-store.cc +++ b/icing/schema/schema-store.cc @@ -104,7 +104,7 @@ std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged( libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create( const Filesystem* filesystem, const std::string& base_dir, - const Clock* clock, NativeInitializeStats* initialize_stats) { + const Clock* clock, InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(filesystem); ICING_RETURN_ERROR_IF_NULL(clock); @@ -122,7 +122,7 @@ SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir, schema_file_(*filesystem, MakeSchemaFilename(base_dir_)) {} SchemaStore::~SchemaStore() { - if (initialized_) { + if 
(has_schema_successfully_set_) { if (!PersistToDisk().ok()) { ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor"; } @@ -130,7 +130,7 @@ SchemaStore::~SchemaStore() { } libtextclassifier3::Status SchemaStore::Initialize( - NativeInitializeStats* initialize_stats) { + InitializeStatsProto* initialize_stats) { auto schema_proto_or = GetSchema(); if (absl_ports::IsNotFound(schema_proto_or.status())) { // Don't have an existing schema proto, that's fine @@ -139,6 +139,7 @@ libtextclassifier3::Status SchemaStore::Initialize( // Real error when trying to read the existing schema return schema_proto_or.status(); } + has_schema_successfully_set_ = true; if (!InitializeDerivedFiles().ok()) { ICING_VLOG(3) @@ -147,7 +148,7 @@ libtextclassifier3::Status SchemaStore::Initialize( std::unique_ptr<Timer> regenerate_timer = clock_.GetNewTimer(); if (initialize_stats != nullptr) { initialize_stats->set_schema_store_recovery_cause( - NativeInitializeStats::IO_ERROR); + InitializeStatsProto::IO_ERROR); } ICING_RETURN_IF_ERROR(RegenerateDerivedFiles()); if (initialize_stats != nullptr) { @@ -156,7 +157,6 @@ libtextclassifier3::Status SchemaStore::Initialize( } } - initialized_ = true; if (initialize_stats != nullptr) { initialize_stats->set_num_schema_types(type_config_map_.size()); } @@ -253,9 +253,12 @@ libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) { header.magic = SchemaStore::Header::kMagic; header.checksum = checksum.Get(); + ScopedFd scoped_fd( + filesystem_.OpenForWrite(MakeHeaderFilename(base_dir_).c_str())); // This should overwrite the header. 
- if (!filesystem_.Write(MakeHeaderFilename(base_dir_).c_str(), &header, - sizeof(header))) { + if (!scoped_fd.is_valid() || + !filesystem_.Write(scoped_fd.get(), &header, sizeof(header)) || + !filesystem_.DataSync(scoped_fd.get())) { return absl_ports::InternalError(absl_ports::StrCat( "Failed to write SchemaStore header: ", MakeHeaderFilename(base_dir_))); } @@ -285,18 +288,11 @@ libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() { libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const { Crc32 total_checksum; - - auto schema_proto_or = GetSchema(); - if (absl_ports::IsNotFound(schema_proto_or.status())) { + if (!has_schema_successfully_set_) { // Nothing to checksum return total_checksum; - } else if (!schema_proto_or.ok()) { - // Some real error. Pass it up - return schema_proto_or.status(); } - - // Guaranteed to have a schema proto now - const SchemaProto* schema_proto = schema_proto_or.ValueOrDie(); + ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema()); Crc32 schema_checksum; schema_checksum.Append(schema_proto->SerializeAsString()); @@ -326,12 +322,18 @@ SchemaStore::SetSchema(const SchemaProto& new_schema, libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult> SchemaStore::SetSchema(SchemaProto&& new_schema, bool ignore_errors_and_delete_documents) { + ICING_ASSIGN_OR_RETURN(SchemaUtil::DependencyMap new_dependency_map, + SchemaUtil::Validate(new_schema)); + SetSchemaResult result; auto schema_proto_or = GetSchema(); if (absl_ports::IsNotFound(schema_proto_or.status())) { // We don't have a pre-existing schema, so anything is valid. 
result.success = true; + for (const SchemaTypeConfigProto& type_config : new_schema.types()) { + result.schema_types_new_by_name.insert(type_config.schema_type()); + } } else if (!schema_proto_or.ok()) { // Real error return schema_proto_or.status(); @@ -349,10 +351,14 @@ SchemaStore::SetSchema(SchemaProto&& new_schema, // Different schema, track the differences and see if we can still write it SchemaUtil::SchemaDelta schema_delta = - SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema); + SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + new_dependency_map); - // An incompatible index is fine, we can just reindex - result.index_incompatible = schema_delta.index_incompatible; + result.schema_types_new_by_name = std::move(schema_delta.schema_types_new); + result.schema_types_changed_fully_compatible_by_name = + std::move(schema_delta.schema_types_changed_fully_compatible); + result.schema_types_index_incompatible_by_name = + std::move(schema_delta.schema_types_index_incompatible); for (const auto& schema_type : schema_delta.schema_types_deleted) { // We currently don't support deletions, so mark this as not possible. 
@@ -390,6 +396,7 @@ SchemaStore::SetSchema(SchemaProto&& new_schema, // Write the schema (and potentially overwrite a previous schema) ICING_RETURN_IF_ERROR( schema_file_.Write(std::make_unique<SchemaProto>(new_schema))); + has_schema_successfully_set_ = true; ICING_RETURN_IF_ERROR(RegenerateDerivedFiles()); } @@ -399,14 +406,7 @@ SchemaStore::SetSchema(SchemaProto&& new_schema, libtextclassifier3::StatusOr<const SchemaTypeConfigProto*> SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const { - auto schema_proto_or = GetSchema(); - if (absl_ports::IsNotFound(schema_proto_or.status())) { - return absl_ports::FailedPreconditionError("Schema not set yet."); - } else if (!schema_proto_or.ok()) { - // Some other real error, pass it up - return schema_proto_or.status(); - } - + ICING_RETURN_IF_ERROR(CheckSchemaSet()); const auto& type_config_iter = type_config_map_.find(std::string(schema_type)); if (type_config_iter == type_config_map_.end()) { @@ -418,39 +418,42 @@ SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const { libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId( std::string_view schema_type) const { + ICING_RETURN_IF_ERROR(CheckSchemaSet()); return schema_type_mapper_->Get(schema_type); } libtextclassifier3::StatusOr<std::vector<std::string_view>> SchemaStore::GetStringSectionContent(const DocumentProto& document, std::string_view section_path) const { + ICING_RETURN_IF_ERROR(CheckSchemaSet()); return section_manager_->GetStringSectionContent(document, section_path); } libtextclassifier3::StatusOr<std::vector<std::string_view>> SchemaStore::GetStringSectionContent(const DocumentProto& document, SectionId section_id) const { + ICING_RETURN_IF_ERROR(CheckSchemaSet()); return section_manager_->GetStringSectionContent(document, section_id); } libtextclassifier3::StatusOr<const SectionMetadata*> SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id, SectionId section_id) const { + 
ICING_RETURN_IF_ERROR(CheckSchemaSet()); return section_manager_->GetSectionMetadata(schema_type_id, section_id); } libtextclassifier3::StatusOr<std::vector<Section>> SchemaStore::ExtractSections( const DocumentProto& document) const { + ICING_RETURN_IF_ERROR(CheckSchemaSet()); return section_manager_->ExtractSections(document); } libtextclassifier3::Status SchemaStore::PersistToDisk() { - if (schema_type_mapper_ != nullptr) { - // It's possible we haven't had a schema set yet, so SchemaTypeMapper hasn't - // been initialized and is still a nullptr - ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk()); + if (!has_schema_successfully_set_) { + return libtextclassifier3::Status::OK; } - + ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk()); // Write the header ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum()); ICING_RETURN_IF_ERROR(UpdateHeader(checksum)); @@ -458,5 +461,35 @@ libtextclassifier3::Status SchemaStore::PersistToDisk() { return libtextclassifier3::Status::OK; } +SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const { + SchemaStoreStorageInfoProto storage_info; + int64_t directory_size = filesystem_.GetDiskUsage(base_dir_.c_str()); + if (directory_size != Filesystem::kBadFileSize) { + storage_info.set_schema_store_size(directory_size); + } else { + storage_info.set_schema_store_size(-1); + } + ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info); + storage_info.set_num_schema_types(schema->types_size()); + int total_sections = 0; + int num_types_sections_exhausted = 0; + for (const SchemaTypeConfigProto& type : schema->types()) { + auto sections_list_or = + section_manager_->GetMetadataList(type.schema_type()); + if (!sections_list_or.ok()) { + continue; + } + total_sections += sections_list_or.ValueOrDie()->size(); + if (sections_list_or.ValueOrDie()->size() == kMaxSectionId + 1) { + ++num_types_sections_exhausted; + } + } + + storage_info.set_num_total_sections(total_sections); + 
storage_info.set_num_schema_types_sections_exhausted( + num_types_sections_exhausted); + return storage_info; +} + } // namespace lib } // namespace icing diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h index 3854704..b9be6c0 100644 --- a/icing/schema/schema-store.h +++ b/icing/schema/schema-store.h @@ -29,6 +29,7 @@ #include "icing/proto/document.pb.h" #include "icing/proto/logging.pb.h" #include "icing/proto/schema.pb.h" +#include "icing/proto/storage.pb.h" #include "icing/schema/schema-util.h" #include "icing/schema/section-manager.h" #include "icing/schema/section.h" @@ -67,9 +68,6 @@ class SchemaStore { // to file. bool success = false; - // Whether the new schema changes invalidate the index. - bool index_incompatible = false; - // SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if: // 1. Schema types are added in the middle of the SchemaProto // 2. Schema types are removed from the middle of the SchemaProto @@ -99,6 +97,21 @@ class SchemaStore { // SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId // assigned to this SchemaTypeConfigProto in the *old* schema. std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id; + + // Schema types that were added in the new schema. Represented by the + // `schema_type` field in the SchemaTypeConfigProto. + std::unordered_set<std::string> schema_types_new_by_name; + + // Schema types that were changed in a way that was backwards compatible and + // didn't invalidate the index. Represented by the `schema_type` field in + // the SchemaTypeConfigProto. + std::unordered_set<std::string> + schema_types_changed_fully_compatible_by_name; + + // Schema types that were changed in a way that was backwards compatible, + // but invalidated the index. Represented by the `schema_type` field in the + // SchemaTypeConfigProto. 
+ std::unordered_set<std::string> schema_types_index_incompatible_by_name; }; // Factory function to create a SchemaStore which does not take ownership @@ -115,7 +128,7 @@ class SchemaStore { // INTERNAL_ERROR on any IO errors static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create( const Filesystem* filesystem, const std::string& base_dir, - const Clock* clock, NativeInitializeStats* initialize_stats = nullptr); + const Clock* clock, InitializeStatsProto* initialize_stats = nullptr); // Not copyable SchemaStore(const SchemaStore&) = delete; @@ -167,6 +180,7 @@ class SchemaStore { // // Returns: // SchemaTypeId on success + // FAILED_PRECONDITION if schema hasn't been set yet // NOT_FOUND_ERROR if we don't know about the schema type // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId( @@ -176,6 +190,7 @@ class SchemaStore { // // Returns: // A string of content on success + // FAILED_PRECONDITION if schema hasn't been set yet // NOT_FOUND if: // 1. Property is optional and not found in the document // 2. 
section_path is invalid @@ -188,6 +203,7 @@ class SchemaStore { // // Returns: // A string of content on success + // FAILED_PRECONDITION if schema hasn't been set yet // INVALID_ARGUMENT if section id is invalid // NOT_FOUND if type config name of document not found libtextclassifier3::StatusOr<std::vector<std::string_view>> @@ -199,6 +215,7 @@ class SchemaStore { // // Returns: // pointer to SectionMetadata on success + // FAILED_PRECONDITION if schema hasn't been set yet // INVALID_ARGUMENT if schema type id or section is invalid libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata( SchemaTypeId schema_type_id, SectionId section_id) const; @@ -209,6 +226,7 @@ class SchemaStore { // // Returns: // A list of sections on success + // FAILED_PRECONDITION if schema hasn't been set yet // NOT_FOUND if type config name of document not found libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections( const DocumentProto& document) const; @@ -228,6 +246,12 @@ class SchemaStore { // INTERNAL_ERROR on compute error libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const; + // Calculates the StorageInfo for the Schema Store. + // + // If an IO error occurs while trying to calculate the value for a field, then + // that field will be set to -1. + SchemaStoreStorageInfoProto GetStorageInfo() const; + private: // Use SchemaStore::Create instead. explicit SchemaStore(const Filesystem* filesystem, std::string base_dir, @@ -238,8 +262,7 @@ class SchemaStore { // Returns: // OK on success // INTERNAL_ERROR on IO error - libtextclassifier3::Status Initialize( - NativeInitializeStats* initialize_stats); + libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats); // Creates sub-components and verifies the integrity of each sub-component. // @@ -275,16 +298,20 @@ class SchemaStore { // Returns any IO errors. 
libtextclassifier3::Status ResetSchemaTypeMapper(); + libtextclassifier3::Status CheckSchemaSet() const { + return has_schema_successfully_set_ + ? libtextclassifier3::Status::OK + : absl_ports::FailedPreconditionError("Schema not set yet."); + } + const Filesystem& filesystem_; const std::string base_dir_; const Clock& clock_; - // Used internally to indicate whether the class has been initialized. This is - // to guard against cases where the object has been created, but Initialize - // fails in the constructor. If we have successfully exited the constructor, - // then this field can be ignored. Clients of SchemaStore should not need to - // worry about this field. - bool initialized_ = false; + // Used internally to indicate whether the class has been successfully + // initialized with a valid schema. Will be false if Initialize failed or no + // schema has ever been set. + bool has_schema_successfully_set_ = false; // Cached schema FileBackedProto<SchemaProto> schema_file_; diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc index 7df3dd9..be7170f 100644 --- a/icing/schema/schema-store_test.cc +++ b/icing/schema/schema-store_test.cc @@ -25,13 +25,15 @@ #include "icing/portable/equals-proto.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-util.h" #include "icing/schema/section-manager.h" #include "icing/schema/section.h" #include "icing/store/document-filter-data.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/tmp-directory.h" #include "icing/testing/fake-clock.h" +#include "icing/testing/tmp-directory.h" +#include "icing/util/crc32.h" namespace icing { namespace lib { @@ -41,26 +43,39 @@ namespace { using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::ElementsAre; using ::testing::Eq; +using ::testing::Ge; using ::testing::Not; using ::testing::Pointee; +constexpr 
PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; + +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; + +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; +constexpr PropertyConfigProto_DataType_Code TYPE_DOUBLE = + PropertyConfigProto_DataType_Code_DOUBLE; + class SchemaStoreTest : public ::testing::Test { protected: SchemaStoreTest() : test_dir_(GetTestTempDir() + "/icing") { filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); - auto type = schema_.add_types(); - type->set_schema_type("email"); - - // Add an indexed property so we generate section metadata on it - auto property = type->add_properties(); - property->set_property_name("subject"); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - property->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + schema_ = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + // Add an indexed property so we generate section metadata on it + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); } void TearDown() override { @@ -74,8 +89,9 @@ class SchemaStoreTest : public ::testing::Test { }; TEST_F(SchemaStoreTest, CreationWithNullPointerShouldFail) { - EXPECT_THAT(SchemaStore::Create(/*filesystem=*/nullptr, test_dir_, &fake_clock_), - 
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT( + SchemaStore::Create(/*filesystem=*/nullptr, test_dir_, &fake_clock_), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } TEST_F(SchemaStoreTest, CorruptSchemaError) { @@ -87,6 +103,7 @@ TEST_F(SchemaStoreTest, CorruptSchemaError) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -97,9 +114,10 @@ TEST_F(SchemaStoreTest, CorruptSchemaError) { // "Corrupt" the ground truth schema by adding new data to it. This will mess // up the checksum of the schema store - SchemaProto corrupt_schema; - auto type = corrupt_schema.add_types(); - type->set_schema_type("corrupted"); + SchemaProto corrupt_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("corrupted")) + .Build(); const std::string schema_file = absl_ports::StrCat(test_dir_, "/schema.pb"); const std::string serialized_schema = corrupt_schema.SerializeAsString(); @@ -121,6 +139,7 @@ TEST_F(SchemaStoreTest, RecoverCorruptDerivedFileOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -158,6 +177,7 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -190,7 
+210,36 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) { } TEST_F(SchemaStoreTest, CreateNoPreviousSchemaOk) { - EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_), IsOk()); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> store, + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); + + // The apis to retrieve information about the schema should fail gracefully. + EXPECT_THAT(store->GetSchema(), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(store->GetSchemaTypeConfig("foo"), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(store->GetSchemaTypeId("foo"), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(store->GetSectionMetadata(/*schema_type_id=*/0, /*section_id=*/0), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + + // The apis to extract content from a document should fail gracefully. + DocumentProto doc; + PropertyProto* prop = doc.add_properties(); + prop->set_name("name"); + prop->add_string_values("foo bar baz"); + + EXPECT_THAT(store->GetStringSectionContent(doc, /*section_id=*/0), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(store->GetStringSectionContent(doc, "name"), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + EXPECT_THAT(store->ExtractSections(doc), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + + // The apis to persist and checksum data should succeed. 
+ EXPECT_THAT(store->ComputeChecksum(), IsOkAndHolds(Crc32())); + EXPECT_THAT(store->PersistToDisk(), IsOk()); } TEST_F(SchemaStoreTest, CreateWithPreviousSchemaOk) { @@ -200,11 +249,13 @@ TEST_F(SchemaStoreTest, CreateWithPreviousSchemaOk) { SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); schema_store.reset(); - EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_), IsOk()); + EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_), + IsOk()); } TEST_F(SchemaStoreTest, MultipleCreateOk) { @@ -220,6 +271,7 @@ TEST_F(SchemaStoreTest, MultipleCreateOk) { SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); @@ -256,6 +308,7 @@ TEST_F(SchemaStoreTest, SetNewSchemaOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -271,6 +324,7 @@ TEST_F(SchemaStoreTest, SetSameSchemaOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -278,6 +332,8 @@ TEST_F(SchemaStoreTest, SetSameSchemaOk) { EXPECT_THAT(*actual_schema, EqualsProto(schema_)); // And one more for fun + result = SchemaStore::SetSchemaResult(); + result.success = true; 
EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); @@ -292,6 +348,7 @@ TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(schema_.types(0).schema_type()); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -302,6 +359,7 @@ TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) { schema_.clear_types(); // Set the incompatible schema + result = SchemaStore::SetSchemaResult(); result.success = false; result.schema_types_deleted_by_name.emplace("email"); result.schema_types_deleted_by_id.emplace(0); @@ -314,13 +372,14 @@ TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) { std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - SchemaProto schema; - auto type = schema.add_types(); - type->set_schema_type("email"); + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -328,10 +387,14 @@ TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) { EXPECT_THAT(*actual_schema, EqualsProto(schema)); // Add a type, shouldn't affect the index or cached SchemaTypeIds - type = schema.add_types(); - type->set_schema_type("new_type"); + schema = SchemaBuilder(schema) + .AddType(SchemaTypeConfigBuilder().SetType("new_type")) + .Build(); // Set the compatible schema + result = SchemaStore::SetSchemaResult(); + result.success = true; + 
result.schema_types_new_by_name.insert("new_type"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); @@ -343,15 +406,17 @@ TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) { std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - SchemaProto schema; - auto type = schema.add_types(); - type->set_schema_type("email"); - type = schema.add_types(); - type->set_schema_type("message"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert("email"); + result.schema_types_new_by_name.insert("message"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -364,9 +429,9 @@ TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) { schema_store->GetSchemaTypeId("message")); // Remove "email" type, this also changes previous SchemaTypeIds - schema.Clear(); - type = schema.add_types(); - type->set_schema_type("message"); + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); SchemaStore::SetSchemaResult incompatible_result; incompatible_result.success = false; @@ -399,15 +464,17 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) { std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - SchemaProto schema; - auto type = schema.add_types(); - type->set_schema_type("email"); - type = schema.add_types(); - type->set_schema_type("message"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .AddType(SchemaTypeConfigBuilder().SetType("message")) + 
.Build(); // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert("email"); + result.schema_types_new_by_name.insert("message"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -415,14 +482,15 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) { EXPECT_THAT(*actual_schema, EqualsProto(schema)); // Reorder the types - schema.clear_types(); - type = schema.add_types(); - type->set_schema_type("message"); - type = schema.add_types(); - type->set_schema_type("email"); + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); // Since we assign SchemaTypeIds based on order in the SchemaProto, this will // cause SchemaTypeIds to change + result = SchemaStore::SetSchemaResult(); + result.success = true; result.old_schema_type_ids_changed.emplace(0); // Old SchemaTypeId of "email" result.old_schema_type_ids_changed.emplace( 1); // Old SchemaTypeId of "message" @@ -434,24 +502,25 @@ TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) { EXPECT_THAT(*actual_schema, EqualsProto(schema)); } -TEST_F(SchemaStoreTest, SetSchemaThatRequiresReindexingOk) { +TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - SchemaProto schema; - auto type = schema.add_types(); - type->set_schema_type("email"); - - // Add an unindexed property - auto property = type->add_properties(); - property->set_property_name("subject"); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + // 
Add an unindexed property + PropertyConfigBuilder() + .SetName("subject") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -459,40 +528,112 @@ TEST_F(SchemaStoreTest, SetSchemaThatRequiresReindexingOk) { EXPECT_THAT(*actual_schema, EqualsProto(schema)); // Make a previously unindexed property indexed - property = schema.mutable_types(0)->mutable_properties(0); - property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - property->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - - // With a new indexed property, we'll need to reindex - result.index_incompatible = true; + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); // Set the compatible schema + result = SchemaStore::SetSchemaResult(); + result.success = true; + result.schema_types_index_incompatible_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); EXPECT_THAT(*actual_schema, EqualsProto(schema)); } -TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) { +TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - SchemaProto schema; - auto type = schema.add_types(); - type->set_schema_type("email"); + // Make two schemas. 
One that sets index_nested_properties to false and one + // that sets it to true. + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto no_nested_index_schema = + SchemaBuilder() + .AddType(email_type_config) + .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty( + PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument("email", + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + SchemaProto nested_index_schema = + SchemaBuilder() + .AddType(email_type_config) + .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty( + PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument("email", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + // Set schema with index_nested_properties=false to start. + SchemaStore::SetSchemaResult result; + result.success = true; + result.schema_types_new_by_name.insert("email"); + result.schema_types_new_by_name.insert("person"); + EXPECT_THAT(schema_store->SetSchema(no_nested_index_schema), + IsOkAndHolds(EqualsSetSchemaResult(result))); + ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, + schema_store->GetSchema()); + EXPECT_THAT(*actual_schema, EqualsProto(no_nested_index_schema)); + + // Set schema with index_nested_properties=true and confirm that the change to + // 'person' is index incompatible. 
+ result = SchemaStore::SetSchemaResult(); + result.success = true; + result.schema_types_index_incompatible_by_name.insert("person"); + EXPECT_THAT(schema_store->SetSchema(nested_index_schema), + IsOkAndHolds(EqualsSetSchemaResult(result))); + ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); + EXPECT_THAT(*actual_schema, EqualsProto(nested_index_schema)); - // Add a STRING property - auto property = type->add_properties(); - property->set_property_name("subject"); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + // Set schema with index_nested_properties=false and confirm that the change + // to 'person' is index incompatible. + result = SchemaStore::SetSchemaResult(); + result.success = true; + result.schema_types_index_incompatible_by_name.insert("person"); + EXPECT_THAT(schema_store->SetSchema(no_nested_index_schema), + IsOkAndHolds(EqualsSetSchemaResult(result))); + ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema()); + EXPECT_THAT(*actual_schema, EqualsProto(no_nested_index_schema)); +} + +TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); + + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + // Add a STRING property + PropertyConfigBuilder() + .SetName("subject") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert("email"); EXPECT_THAT(schema_store->SetSchema(schema), IsOkAndHolds(EqualsSetSchemaResult(result))); ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema, @@ -503,8 +644,14 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) { 
schema_store->GetSchemaTypeId("email")); // Make a previously STRING property into DOUBLE - property = schema.mutable_types(0)->mutable_properties(0); - property->set_data_type(PropertyConfigProto::DataType::DOUBLE); + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + // Add a STRING property + PropertyConfigBuilder() + .SetName("subject") + .SetDataType(TYPE_DOUBLE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); SchemaStore::SetSchemaResult incompatible_result; incompatible_result.success = false; @@ -549,6 +696,8 @@ TEST_F(SchemaStoreTest, GetSchemaTypeId) { // Set it for the first time SchemaStore::SetSchemaResult result; result.success = true; + result.schema_types_new_by_name.insert(first_type); + result.schema_types_new_by_name.insert(second_type); EXPECT_THAT(schema_store->SetSchema(schema_), IsOkAndHolds(EqualsSetSchemaResult(result))); @@ -570,9 +719,8 @@ TEST_F(SchemaStoreTest, ComputeChecksumSameBetweenCalls) { std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - SchemaProto foo_schema; - auto type_config = foo_schema.add_types(); - type_config->set_schema_type("foo"); + SchemaProto foo_schema = + SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build(); ICING_EXPECT_OK(schema_store->SetSchema(foo_schema)); @@ -587,9 +735,8 @@ TEST_F(SchemaStoreTest, ComputeChecksumSameAcrossInstances) { std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - SchemaProto foo_schema; - auto type_config = foo_schema.add_types(); - type_config->set_schema_type("foo"); + SchemaProto foo_schema = + SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build(); ICING_EXPECT_OK(schema_store->SetSchema(foo_schema)); @@ -608,20 +755,19 @@ TEST_F(SchemaStoreTest, ComputeChecksumChangesOnModification) { std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, test_dir_, 
&fake_clock_)); - SchemaProto foo_schema; - auto type_config = foo_schema.add_types(); - type_config->set_schema_type("foo"); + SchemaProto foo_schema = + SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build(); ICING_EXPECT_OK(schema_store->SetSchema(foo_schema)); ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, schema_store->ComputeChecksum()); // Modifying the SchemaStore changes the checksum - SchemaProto foo_bar_schema; - type_config = foo_bar_schema.add_types(); - type_config->set_schema_type("foo"); - type_config = foo_bar_schema.add_types(); - type_config->set_schema_type("bar"); + SchemaProto foo_bar_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("foo")) + .AddType(SchemaTypeConfigBuilder().SetType("bar")) + .Build(); ICING_EXPECT_OK(schema_store->SetSchema(foo_bar_schema)); @@ -642,9 +788,8 @@ TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) { std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("foo"); + SchemaProto schema = + SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build(); ICING_EXPECT_OK(schema_store->SetSchema(schema)); @@ -656,8 +801,9 @@ TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) { EXPECT_THAT(*actual_schema, EqualsProto(schema)); // Modify the schema so that something different is persisted next time - type_config = schema.add_types(); - type_config->set_schema_type("bar"); + schema = SchemaBuilder(schema) + .AddType(SchemaTypeConfigBuilder().SetType("bar")) + .Build(); ICING_EXPECT_OK(schema_store->SetSchema(schema)); // Should also persist on destruction @@ -670,6 +816,58 @@ TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) { EXPECT_THAT(*actual_schema, EqualsProto(schema)); } +TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) { + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> 
schema_store, + SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_)); + + // Create a schema with two types: one simple type and one type that uses all + // 16 sections. + PropertyConfigProto prop = + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL) + .Build(); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + PropertyConfigBuilder(prop))) + .AddType( + SchemaTypeConfigBuilder() + .SetType("fullSectionsType") + .AddProperty(PropertyConfigBuilder(prop).SetName("prop0")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop1")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop2")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop3")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop4")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop5")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop6")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop7")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop8")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop9")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop10")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop11")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop12")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop13")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop14")) + .AddProperty(PropertyConfigBuilder(prop).SetName("prop15"))) + .Build(); + + SchemaStore::SetSchemaResult result; + result.success = true; + result.schema_types_new_by_name.insert("email"); + result.schema_types_new_by_name.insert("fullSectionsType"); + EXPECT_THAT(schema_store->SetSchema(schema), + IsOkAndHolds(EqualsSetSchemaResult(result))); + + SchemaStoreStorageInfoProto storage_info = schema_store->GetStorageInfo(); + EXPECT_THAT(storage_info.schema_store_size(), Ge(0)); + 
EXPECT_THAT(storage_info.num_schema_types(), Eq(2)); + EXPECT_THAT(storage_info.num_total_sections(), Eq(17)); + EXPECT_THAT(storage_info.num_schema_types_sections_exhausted(), Eq(1)); +} + } // namespace } // namespace lib diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc index 49e7096..22bc3f6 100644 --- a/icing/schema/schema-util.cc +++ b/icing/schema/schema-util.cc @@ -37,6 +37,20 @@ namespace lib { namespace { +bool ArePropertiesEqual(const PropertyConfigProto& old_property, + const PropertyConfigProto& new_property) { + return old_property.property_name() == new_property.property_name() && + old_property.data_type() == new_property.data_type() && + old_property.schema_type() == new_property.schema_type() && + old_property.cardinality() == new_property.cardinality() && + old_property.string_indexing_config().term_match_type() == + new_property.string_indexing_config().term_match_type() && + old_property.string_indexing_config().tokenizer_type() == + new_property.string_indexing_config().tokenizer_type() && + old_property.document_indexing_config().index_nested_properties() == + new_property.document_indexing_config().index_nested_properties(); +} + bool IsCardinalityCompatible(const PropertyConfigProto& old_property, const PropertyConfigProto& new_property) { if (old_property.cardinality() < new_property.cardinality()) { @@ -95,43 +109,175 @@ bool IsTermMatchTypeCompatible(const StringIndexingConfig& old_indexed, } // namespace -libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) { - // Tracks SchemaTypeConfigs that we've validated already. 
- std::unordered_set<std::string_view> known_schema_types; +libtextclassifier3::Status ExpandTranstiveDependencies( + const SchemaUtil::DependencyMap& child_to_direct_parent_map, + std::string_view type, + SchemaUtil::DependencyMap* expanded_child_to_parent_map, + std::unordered_set<std::string_view>* pending_expansions, + std::unordered_set<std::string_view>* orphaned_types) { + auto expanded_itr = expanded_child_to_parent_map->find(type); + if (expanded_itr != expanded_child_to_parent_map->end()) { + // We've already expanded this type. Just return. + return libtextclassifier3::Status::OK; + } + auto itr = child_to_direct_parent_map.find(type); + if (itr == child_to_direct_parent_map.end()) { + // It's an orphan. Just return. + orphaned_types->insert(type); + return libtextclassifier3::Status::OK; + } + pending_expansions->insert(type); + std::unordered_set<std::string_view> expanded_dependencies; + + // Add all of the direct parent dependencies. + expanded_dependencies.reserve(itr->second.size()); + expanded_dependencies.insert(itr->second.begin(), itr->second.end()); + + // Iterate through each direct parent and add their indirect parents. + for (std::string_view dep : itr->second) { + // 1. Check if we're in the middle of expanding this type - IOW there's a + // cycle! + if (pending_expansions->count(dep) > 0) { + return absl_ports::InvalidArgumentError( + absl_ports::StrCat("Infinite loop detected in type configs. '", type, + "' references itself.")); + } - // Tracks SchemaTypeConfigs that have been mentioned (by other - // SchemaTypeConfigs), but we haven't validated yet. - std::unordered_set<std::string_view> unknown_schema_types; + // 2. Expand this type as needed. + ICING_RETURN_IF_ERROR(ExpandTranstiveDependencies( + child_to_direct_parent_map, dep, expanded_child_to_parent_map, + pending_expansions, orphaned_types)); + if (orphaned_types->count(dep) > 0) { + // Dep is an orphan. Just skip to the next dep. 
+ continue; + } - // Tracks PropertyConfigs within a SchemaTypeConfig that we've validated - // already. - std::unordered_set<std::string_view> known_property_names; + // 3. Dep has been fully expanded. Add all of its dependencies to this + // type's dependencies. + auto dep_expanded_itr = expanded_child_to_parent_map->find(dep); + expanded_dependencies.reserve(expanded_dependencies.size() + + dep_expanded_itr->second.size()); + expanded_dependencies.insert(dep_expanded_itr->second.begin(), + dep_expanded_itr->second.end()); + } + expanded_child_to_parent_map->insert( + {type, std::move(expanded_dependencies)}); + pending_expansions->erase(type); + return libtextclassifier3::Status::OK; +} - // Tracks which schemas reference other schemas. This is used to detect - // infinite loops between indexed schema references (e.g. A -> B -> C -> A). - // We could get into an infinite loop while trying to assign section ids. - // - // The key is the "child" schema that is being referenced within another - // schema. - // The value is a set of all the direct/indirect "parent" schemas that - // reference the "child" schema. - // - // For example, if A has a nested document property of type B, then A is the - // "parent" and B is the "child" and so schema_references will contain - // schema_references[B] == {A}. - std::unordered_map<std::string_view, std::unordered_set<std::string_view>> - schema_references; +// Expands the dependencies represented by the child_to_direct_parent_map to +// also include indirect parents. +// +// Ex. Suppose we have a schema with four types A, B, C, D. A has a property of +// type B and B has a property of type C. C and D only have non-document +// properties. +// +// The child to direct parent dependency map for this schema would be: +// C -> B +// B -> A +// +// This function would expand it so that A is also present as an indirect parent +// of C. 
+libtextclassifier3::StatusOr<SchemaUtil::DependencyMap> +ExpandTranstiveDependencies( + const SchemaUtil::DependencyMap& child_to_direct_parent_map) { + SchemaUtil::DependencyMap expanded_child_to_parent_map; + + // Types that we are expanding. + std::unordered_set<std::string_view> pending_expansions; + + // Types that have no parents that depend on them. + std::unordered_set<std::string_view> orphaned_types; + for (const auto& kvp : child_to_direct_parent_map) { + ICING_RETURN_IF_ERROR(ExpandTranstiveDependencies( + child_to_direct_parent_map, kvp.first, &expanded_child_to_parent_map, + &pending_expansions, &orphaned_types)); + } + return expanded_child_to_parent_map; +} +// Builds a transitive child-parent dependency map. 'Orphaned' types (types with +// no parents) will not be present in the map. +// +// Ex. Suppose we have a schema with four types A, B, C, D. A has a property of +// type B and B has a property of type C. C and D only have non-document +// properties. +// +// The transitive child-parent dependency map for this schema would be: +// C -> A, B +// B -> A +// +// A and D would be considered orphaned properties because no type refers to +// them. +// +// RETURNS: +// On success, a transitive child-parent dependency map of all types in the +// schema. +// INVALID_ARGUMENT if the schema contains a cycle or an undefined type. +// ALREADY_EXISTS if a schema type is specified more than once in the schema +libtextclassifier3::StatusOr<SchemaUtil::DependencyMap> +BuildTransitiveDependencyGraph(const SchemaProto& schema) { + // Child to parent map. + SchemaUtil::DependencyMap child_to_direct_parent_map; + + // Add all first-order dependencies. 
+ std::unordered_set<std::string_view> known_types; + std::unordered_set<std::string_view> unknown_types; for (const auto& type_config : schema.types()) { std::string_view schema_type(type_config.schema_type()); - ICING_RETURN_IF_ERROR(ValidateSchemaType(schema_type)); - - // We can't have duplicate schema_types - if (!known_schema_types.insert(schema_type).second) { + if (known_types.count(schema_type) > 0) { return absl_ports::AlreadyExistsError(absl_ports::StrCat( "Field 'schema_type' '", schema_type, "' is already defined")); } - unknown_schema_types.erase(schema_type); + known_types.insert(schema_type); + unknown_types.erase(schema_type); + for (const auto& property_config : type_config.properties()) { + if (property_config.data_type() == + PropertyConfigProto::DataType::DOCUMENT) { + // Need to know what schema_type these Document properties should be + // validated against + std::string_view property_schema_type(property_config.schema_type()); + if (property_schema_type == schema_type) { + return absl_ports::InvalidArgumentError( + absl_ports::StrCat("Infinite loop detected in type configs. '", + schema_type, "' references itself.")); + } + if (known_types.count(property_schema_type) == 0) { + unknown_types.insert(property_schema_type); + } + auto itr = child_to_direct_parent_map.find(property_schema_type); + if (itr == child_to_direct_parent_map.end()) { + child_to_direct_parent_map.insert( + {property_schema_type, std::unordered_set<std::string_view>()}); + itr = child_to_direct_parent_map.find(property_schema_type); + } + itr->second.insert(schema_type); + } + } + } + if (!unknown_types.empty()) { + return absl_ports::InvalidArgumentError(absl_ports::StrCat( + "Undefined 'schema_type's: ", absl_ports::StrJoin(unknown_types, ","))); + } + return ExpandTranstiveDependencies(child_to_direct_parent_map); +} + +libtextclassifier3::StatusOr<SchemaUtil::DependencyMap> SchemaUtil::Validate( + const SchemaProto& schema) { + // 1. Build the dependency map. 
This will detect any cycles, non-existent or + // duplicate types in the schema. + ICING_ASSIGN_OR_RETURN(SchemaUtil::DependencyMap dependency_map, + BuildTransitiveDependencyGraph(schema)); + + // Tracks PropertyConfigs within a SchemaTypeConfig that we've validated + // already. + std::unordered_set<std::string_view> known_property_names; + + // 2. Validate the properties of each type. + for (const auto& type_config : schema.types()) { + std::string_view schema_type(type_config.schema_type()); + ICING_RETURN_IF_ERROR(ValidateSchemaType(schema_type)); // We only care about properties being unique within one type_config known_property_names.clear(); @@ -164,56 +310,6 @@ libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) { "data_types in schema property '", schema_type, ".", property_name, "'")); } - - if (property_schema_type == schema_type) { - // The schema refers to itself. This also causes a infinite loop. - // - // TODO(b/171996137): When clients can opt out of indexing document - // properties, then we don't need to do this if the document property - // isn't indexed. We only care about infinite loops while we're trying - // to assign section ids for indexing. - return absl_ports::InvalidArgumentError( - absl_ports::StrCat("Infinite loop detected in type configs. '", - schema_type, "' references itself.")); - } - - // Need to make sure we eventually see/validate this schema_type - if (known_schema_types.count(property_schema_type) == 0) { - unknown_schema_types.insert(property_schema_type); - } - - // Start tracking the parent schemas that references this nested schema - // for infinite loop detection. - // - // TODO(b/171996137): When clients can opt out of indexing document - // properties, then we don't need to do this if the document property - // isn't indexed. We only care about infinite loops while we're trying - // to assign section ids for indexing. 
- std::unordered_set<std::string_view> parent_schemas; - parent_schemas.insert(schema_type); - - for (const auto& parent : parent_schemas) { - // Check for any indirect parents - auto indirect_parents_iter = schema_references.find(parent); - if (indirect_parents_iter == schema_references.end()) { - continue; - } - - // Our "parent" schema has parents as well. They're our indirect - // parents now. - for (const std::string_view& indirect_parent : - indirect_parents_iter->second) { - if (indirect_parent == property_schema_type) { - // We're our own indirect parent! Infinite loop found. - return absl_ports::InvalidArgumentError(absl_ports::StrCat( - "Infinite loop detected in type configs. '", - property_schema_type, "' references itself.")); - } - parent_schemas.insert(indirect_parent); - } - } - - schema_references.insert({property_schema_type, parent_schemas}); } ICING_RETURN_IF_ERROR(ValidateCardinality(property_config.cardinality(), @@ -227,15 +323,7 @@ libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) { } } - // A Document property claimed to be of a schema_type that we never - // saw/validated - if (!unknown_schema_types.empty()) { - return absl_ports::UnknownError( - absl_ports::StrCat("Undefined 'schema_type's: ", - absl_ports::StrJoin(unknown_schema_types, ","))); - } - - return libtextclassifier3::Status::OK; + return dependency_map; } libtextclassifier3::Status SchemaUtil::ValidateSchemaType( @@ -355,9 +443,9 @@ SchemaUtil::ParsedPropertyConfigs SchemaUtil::ParsePropertyConfigs( } const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( - const SchemaProto& old_schema, const SchemaProto& new_schema) { + const SchemaProto& old_schema, const SchemaProto& new_schema, + const DependencyMap& new_schema_dependency_map) { SchemaDelta schema_delta; - schema_delta.index_incompatible = false; TypeConfigMap new_type_config_map; BuildTypeConfigMap(new_schema, &new_type_config_map); @@ -385,7 +473,29 @@ const SchemaUtil::SchemaDelta 
SchemaUtil::ComputeCompatibilityDelta( // be reindexed. int32_t old_required_properties = 0; int32_t old_indexed_properties = 0; + + // If there is a different number of properties, then there must have been a + // change. + bool has_property_changed = + old_type_config.properties_size() != + new_schema_type_and_config->second.properties_size(); + bool is_incompatible = false; + bool is_index_incompatible = false; for (const auto& old_property_config : old_type_config.properties()) { + if (old_property_config.cardinality() == + PropertyConfigProto::Cardinality::REQUIRED) { + ++old_required_properties; + } + + // A non-default term_match_type indicates that this property is meant to + // be indexed. + bool is_indexed_property = + old_property_config.string_indexing_config().term_match_type() != + TermMatchType::UNKNOWN; + if (is_indexed_property) { + ++old_indexed_properties; + } + auto new_property_name_and_config = new_parsed_property_configs.property_config_map.find( old_property_config.property_name()); @@ -397,39 +507,35 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( "Previously defined property type '", old_type_config.schema_type(), ".", old_property_config.property_name(), "' was not defined in new schema"); - schema_delta.schema_types_incompatible.insert( - old_type_config.schema_type()); + is_incompatible = true; + is_index_incompatible |= is_indexed_property; continue; } const PropertyConfigProto* new_property_config = new_property_name_and_config->second; + if (!has_property_changed && + !ArePropertiesEqual(old_property_config, *new_property_config)) { + // Finally found a property that changed. 
+ has_property_changed = true; + } if (!IsPropertyCompatible(old_property_config, *new_property_config)) { ICING_VLOG(1) << absl_ports::StrCat( "Property '", old_type_config.schema_type(), ".", old_property_config.property_name(), "' is incompatible."); - schema_delta.schema_types_incompatible.insert( - old_type_config.schema_type()); - } - - if (old_property_config.cardinality() == - PropertyConfigProto::Cardinality::REQUIRED) { - ++old_required_properties; - } - - // A non-default term_match_type indicates that this property is meant to - // be indexed. - if (old_property_config.string_indexing_config().term_match_type() != - TermMatchType::UNKNOWN) { - ++old_indexed_properties; + is_incompatible = true; } // Any change in the indexed property requires a reindexing if (!IsTermMatchTypeCompatible( old_property_config.string_indexing_config(), - new_property_config->string_indexing_config())) { - schema_delta.index_incompatible = true; + new_property_config->string_indexing_config()) || + old_property_config.document_indexing_config() + .index_nested_properties() != + new_property_config->document_indexing_config() + .index_nested_properties()) { + is_index_incompatible = true; } } @@ -444,8 +550,7 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( "New schema '", old_type_config.schema_type(), "' has REQUIRED properties that are not " "present in the previously defined schema"); - schema_delta.schema_types_incompatible.insert( - old_type_config.schema_type()); + is_incompatible = true; } // If we've gained any new indexed properties, then the section ids may @@ -457,8 +562,59 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta( "Set of indexed properties in schema type '", old_type_config.schema_type(), "' has changed, required reindexing."); - schema_delta.index_incompatible = true; + is_index_incompatible = true; + } + + if (is_incompatible) { + // If this type is incompatible, then every type that depends on it might + // 
also be incompatible. Use the dependency map to mark those ones as + // incompatible too. + schema_delta.schema_types_incompatible.insert( + old_type_config.schema_type()); + auto parent_types_itr = + new_schema_dependency_map.find(old_type_config.schema_type()); + if (parent_types_itr != new_schema_dependency_map.end()) { + schema_delta.schema_types_incompatible.reserve( + schema_delta.schema_types_incompatible.size() + + parent_types_itr->second.size()); + schema_delta.schema_types_incompatible.insert( + parent_types_itr->second.begin(), parent_types_itr->second.end()); + } + } + + if (is_index_incompatible) { + // If this type is index incompatible, then every type that depends on it + // might also be index incompatible. Use the dependency map to mark those + // ones as index incompatible too. + schema_delta.schema_types_index_incompatible.insert( + old_type_config.schema_type()); + auto parent_types_itr = + new_schema_dependency_map.find(old_type_config.schema_type()); + if (parent_types_itr != new_schema_dependency_map.end()) { + schema_delta.schema_types_index_incompatible.reserve( + schema_delta.schema_types_index_incompatible.size() + + parent_types_itr->second.size()); + schema_delta.schema_types_index_incompatible.insert( + parent_types_itr->second.begin(), parent_types_itr->second.end()); + } } + + if (!is_incompatible && !is_index_incompatible && has_property_changed) { + schema_delta.schema_types_changed_fully_compatible.insert( + old_type_config.schema_type()); + } + + // Lastly, remove this type from the map. We know that this type can't + // come up in future iterations through the old schema types because the old + // type config has unique types. + new_type_config_map.erase(old_type_config.schema_type()); + } + + // Any types that are still present in the new_type_config_map are newly added + // types. 
+ schema_delta.schema_types_new.reserve(new_type_config_map.size()); + for (auto& kvp : new_type_config_map) { + schema_delta.schema_types_new.insert(std::move(kvp.first)); } return schema_delta; diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h index 7b989a8..fa80b15 100644 --- a/icing/schema/schema-util.h +++ b/icing/schema/schema-util.h @@ -22,6 +22,7 @@ #include <unordered_set> #include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/proto/schema.pb.h" namespace icing { @@ -32,13 +33,14 @@ class SchemaUtil { using TypeConfigMap = std::unordered_map<std::string, const SchemaTypeConfigProto>; - struct SchemaDelta { - // Whether an indexing config has changed, requiring the index to be - // regenerated. We don't list out all the types that make the index - // incompatible because our index isn't optimized for that. It's much easier - // to reset the entire index and reindex every document. - bool index_incompatible = false; + // Maps from a child type to the parent types that depend on it. + // Ex. type A has a single property of type B + // The dependency map will be { { "B", { "A" } } } + using DependencyMap = + std::unordered_map<std::string_view, + std::unordered_set<std::string_view>>; + struct SchemaDelta { // Which schema types were present in the old schema, but were deleted from // the new schema. std::unordered_set<std::string> schema_types_deleted; @@ -47,10 +49,28 @@ class SchemaUtil { // could invalidate existing Documents of that schema type. std::unordered_set<std::string> schema_types_incompatible; + // Schema types that were added in the new schema. Represented by the + // `schema_type` field in the SchemaTypeConfigProto. + std::unordered_set<std::string> schema_types_new; + + // Schema types that were changed in a way that was backwards compatible and + // didn't invalidate the index. 
Represented by the `schema_type` field in + // the SchemaTypeConfigProto. + std::unordered_set<std::string> schema_types_changed_fully_compatible; + + // Schema types that were changed in a way that was backwards compatible, + // but invalidated the index. Represented by the `schema_type` field in the + // SchemaTypeConfigProto. + std::unordered_set<std::string> schema_types_index_incompatible; + bool operator==(const SchemaDelta& other) const { - return index_incompatible == other.index_incompatible && - schema_types_deleted == other.schema_types_deleted && - schema_types_incompatible == other.schema_types_incompatible; + return schema_types_deleted == other.schema_types_deleted && + schema_types_incompatible == other.schema_types_incompatible && + schema_types_new == other.schema_types_new && + schema_types_changed_fully_compatible == + other.schema_types_changed_fully_compatible && + schema_types_index_incompatible == + other.schema_types_index_incompatible; } }; @@ -90,10 +110,12 @@ class SchemaUtil { // document properties can be opted out of indexing. // // Returns: + // On success, a dependency map from each child types to all parent types + // that depend on it directly or indirectly. // ALREADY_EXISTS for case 1 and 2 // INVALID_ARGUMENT for 3-13 - // OK otherwise - static libtextclassifier3::Status Validate(const SchemaProto& schema); + static libtextclassifier3::StatusOr<DependencyMap> Validate( + const SchemaProto& schema); // Creates a mapping of schema type -> schema type config proto. The // type_config_map is cleared, and then each schema-type_config_proto pair is @@ -142,7 +164,8 @@ class SchemaUtil { // // Returns a SchemaDelta that captures the aforementioned differences. static const SchemaDelta ComputeCompatibilityDelta( - const SchemaProto& old_schema, const SchemaProto& new_schema); + const SchemaProto& old_schema, const SchemaProto& new_schema, + const DependencyMap& new_schema_dependency_map); // Validates the 'property_name' field. // 1. 
Can't be an empty string diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc index 61a861c..26ef4c7 100644 --- a/icing/schema/schema-util_test.cc +++ b/icing/schema/schema-util_test.cc @@ -17,11 +17,13 @@ #include <cstdint> #include <string> #include <string_view> +#include <unordered_set> #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/proto/schema.pb.h" #include "icing/proto/term.pb.h" +#include "icing/schema-builder.h" #include "icing/testing/common-matchers.h" namespace icing { @@ -33,691 +35,1286 @@ using ::testing::HasSubstr; // Properties/fields in a schema type constexpr char kEmailType[] = "EmailMessage"; +constexpr char kMessageType[] = "Text"; constexpr char kPersonType[] = "Person"; -class SchemaUtilTest : public ::testing::Test { - protected: - SchemaProto schema_proto_; - - static SchemaTypeConfigProto CreateSchemaTypeConfig( - const std::string_view schema_type, - const std::string_view nested_schema_type = "") { - SchemaTypeConfigProto type; - type.set_schema_type(std::string(schema_type)); - - auto string_property = type.add_properties(); - string_property->set_property_name("string"); - string_property->set_data_type(PropertyConfigProto::DataType::STRING); - string_property->set_cardinality( - PropertyConfigProto::Cardinality::REQUIRED); - - auto int_property = type.add_properties(); - int_property->set_property_name("int"); - int_property->set_data_type(PropertyConfigProto::DataType::INT64); - int_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - - auto double_property = type.add_properties(); - double_property->set_property_name("double"); - double_property->set_data_type(PropertyConfigProto::DataType::DOUBLE); - double_property->set_cardinality( - PropertyConfigProto::Cardinality::REPEATED); - - auto bool_property = type.add_properties(); - bool_property->set_property_name("boolean"); - bool_property->set_data_type(PropertyConfigProto::DataType::BOOLEAN); - 
bool_property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); +constexpr PropertyConfigProto_DataType_Code TYPE_DOCUMENT = + PropertyConfigProto_DataType_Code_DOCUMENT; +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; +constexpr PropertyConfigProto_DataType_Code TYPE_INT = + PropertyConfigProto_DataType_Code_INT64; +constexpr PropertyConfigProto_DataType_Code TYPE_DOUBLE = + PropertyConfigProto_DataType_Code_DOUBLE; + +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_UNKNOWN = + PropertyConfigProto_Cardinality_Code_UNKNOWN; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; + +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE = + StringIndexingConfig_TokenizerType_Code_NONE; +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_UNKNOWN = TermMatchType_Code_UNKNOWN; +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; + +TEST(SchemaUtilTest, DependencyGraphAlphabeticalOrder) { + // Create a schema with the following dependencies: + // C + // / \ + // A - B E - F + // \ / + // D + SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("c") + 
.SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("C", /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("d") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("D", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_c = + SchemaTypeConfigBuilder() + .SetType("C") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_d = + SchemaTypeConfigBuilder() + .SetType("D") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_e = + SchemaTypeConfigBuilder() + .SetType("E") + .AddProperty( + PropertyConfigBuilder() + .SetName("f") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("F", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_f = + SchemaTypeConfigBuilder() + .SetType("F") + .AddProperty(PropertyConfigBuilder() + .SetName("text") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + + // Provide these in alphabetical (also parent-child) order: A, B, C, D, E, F + SchemaProto schema = SchemaBuilder() + .AddType(type_a) + .AddType(type_b) + .AddType(type_c) + .AddType(type_d) + .AddType(type_e) + .AddType(type_f) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependencyMap d_map, + SchemaUtil::Validate(schema)); + EXPECT_THAT(d_map, testing::SizeIs(5)); + EXPECT_THAT(d_map["F"], + testing::UnorderedElementsAre("A", "B", "C", "D", "E")); + EXPECT_THAT(d_map["E"], testing::UnorderedElementsAre("A", "B", "C", "D")); + EXPECT_THAT(d_map["D"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["C"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["B"], 
testing::UnorderedElementsAre("A")); +} - auto bytes_property = type.add_properties(); - bytes_property->set_property_name("bytes"); - bytes_property->set_data_type(PropertyConfigProto::DataType::BYTES); - bytes_property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); +TEST(SchemaUtilTest, DependencyGraphReverseAlphabeticalOrder) { + // Create a schema with the following dependencies: + // C + // / \ + // A - B E - F + // \ / + // D + SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("c") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("C", /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("d") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("D", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_c = + SchemaTypeConfigBuilder() + .SetType("C") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_d = + SchemaTypeConfigBuilder() + .SetType("D") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_e = + SchemaTypeConfigBuilder() + .SetType("E") + .AddProperty( + PropertyConfigBuilder() + .SetName("f") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("F", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_f = + SchemaTypeConfigBuilder() + .SetType("F") + .AddProperty(PropertyConfigBuilder() + .SetName("text") + 
.SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + + // Provide these in reverse alphabetical (also child-parent) order: + // F, E, D, C, B, A + SchemaProto schema = SchemaBuilder() + .AddType(type_f) + .AddType(type_e) + .AddType(type_d) + .AddType(type_c) + .AddType(type_b) + .AddType(type_a) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependencyMap d_map, + SchemaUtil::Validate(schema)); + EXPECT_THAT(d_map, testing::SizeIs(5)); + EXPECT_THAT(d_map["F"], + testing::UnorderedElementsAre("A", "B", "C", "D", "E")); + EXPECT_THAT(d_map["E"], testing::UnorderedElementsAre("A", "B", "C", "D")); + EXPECT_THAT(d_map["D"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["C"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["B"], testing::UnorderedElementsAre("A")); +} - if (!nested_schema_type.empty()) { - auto document_property = type.add_properties(); - document_property->set_property_name("document"); - document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - document_property->set_cardinality( - PropertyConfigProto::Cardinality::REPEATED); - document_property->set_schema_type(std::string(nested_schema_type)); - } +TEST(SchemaUtilTest, DependencyGraphMixedOrder) { + // Create a schema with the following dependencies: + // C + // / \ + // A - B E - F + // \ / + // D + SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("c") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("C", /*index_nested_properties=*/true)) + .AddProperty( + PropertyConfigBuilder() + .SetName("d") + .SetCardinality(CARDINALITY_OPTIONAL) + 
.SetDataTypeDocument("D", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_c = + SchemaTypeConfigBuilder() + .SetType("C") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_d = + SchemaTypeConfigBuilder() + .SetType("D") + .AddProperty( + PropertyConfigBuilder() + .SetName("e") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("E", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_e = + SchemaTypeConfigBuilder() + .SetType("E") + .AddProperty( + PropertyConfigBuilder() + .SetName("f") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("F", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_f = + SchemaTypeConfigBuilder() + .SetType("F") + .AddProperty(PropertyConfigBuilder() + .SetName("text") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN)) + .Build(); + + // Provide these in a random order: C, E, F, A, B, D + SchemaProto schema = SchemaBuilder() + .AddType(type_c) + .AddType(type_e) + .AddType(type_f) + .AddType(type_a) + .AddType(type_b) + .AddType(type_d) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependencyMap d_map, + SchemaUtil::Validate(schema)); + EXPECT_THAT(d_map, testing::SizeIs(5)); + EXPECT_THAT(d_map["F"], + testing::UnorderedElementsAre("A", "B", "C", "D", "E")); + EXPECT_THAT(d_map["E"], testing::UnorderedElementsAre("A", "B", "C", "D")); + EXPECT_THAT(d_map["D"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["C"], testing::UnorderedElementsAre("A", "B")); + EXPECT_THAT(d_map["B"], testing::UnorderedElementsAre("A")); +} - return type; - } -}; +TEST(SchemaUtilTest, TopLevelCycle) { + // Create a schema with the following dependencies: + // A - B - B - B - B.... 
+ SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + + SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, + HasSubstr("Infinite loop"))); +} -TEST_F(SchemaUtilTest, EmptySchemaProtoIsValid) { - ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_)); +TEST(SchemaUtilTest, MultiLevelCycle) { + // Create a schema with the following dependencies: + // A - B - C - A - B - C - A ... + SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("c") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("C", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_c = + SchemaTypeConfigBuilder() + .SetType("C") + .AddProperty( + PropertyConfigBuilder() + .SetName("a") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("A", /*index_nested_properties=*/true)) + .Build(); + + SchemaProto schema = + SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, Valid_Nested) { - auto email_type = 
schema_proto_.add_types(); - *email_type = CreateSchemaTypeConfig(kEmailType, kPersonType); +TEST(SchemaUtilTest, NonExistentType) { + // Create a schema with the following dependencies: + // A - B - C - X (does not exist) + SchemaTypeConfigProto type_a = + SchemaTypeConfigBuilder() + .SetType("A") + .AddProperty( + PropertyConfigBuilder() + .SetName("b") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("B", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_b = + SchemaTypeConfigBuilder() + .SetType("B") + .AddProperty( + PropertyConfigBuilder() + .SetName("c") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("C", /*index_nested_properties=*/true)) + .Build(); + SchemaTypeConfigProto type_c = + SchemaTypeConfigBuilder() + .SetType("C") + .AddProperty( + PropertyConfigBuilder() + .SetName("x") + .SetCardinality(CARDINALITY_OPTIONAL) + .SetDataTypeDocument("X", /*index_nested_properties=*/true)) + .Build(); + + SchemaProto schema = + SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build(); + EXPECT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} - auto person_type = schema_proto_.add_types(); - *person_type = CreateSchemaTypeConfig(kPersonType); +TEST(SchemaUtilTest, EmptySchemaProtoIsValid) { + SchemaProto schema; + ICING_ASSERT_OK(SchemaUtil::Validate(schema)); +} - ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_)); +TEST(SchemaUtilTest, Valid_Nested) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("sender") + .SetDataTypeDocument( + kPersonType, + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + 
.AddProperty(PropertyConfigBuilder() + .SetName("name") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + ICING_ASSERT_OK(SchemaUtil::Validate(schema)); } -TEST_F(SchemaUtilTest, ClearedPropertyConfigsIsValid) { +TEST(SchemaUtilTest, ClearedPropertyConfigsIsValid) { // No property fields is technically ok, but probably not realistic. - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - type->clear_properties(); - - ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_)); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType(kEmailType)) + .Build(); + ICING_ASSERT_OK(SchemaUtil::Validate(schema)); } -TEST_F(SchemaUtilTest, ClearedSchemaTypeIsInvalid) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - type->clear_schema_type(); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), +TEST(SchemaUtilTest, ClearedSchemaTypeIsInvalid) { + SchemaProto schema = + SchemaBuilder().AddType(SchemaTypeConfigBuilder()).Build(); + ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, EmptySchemaTypeIsInvalid) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - type->set_schema_type(""); +TEST(SchemaUtilTest, EmptySchemaTypeIsInvalid) { + SchemaProto schema = + SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("")).Build(); - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), + ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, AnySchemaTypeOk) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - type->set_schema_type("abc123!@#$%^&*()_-+=[{]}|\\;:'\",<.>?你好"); +TEST(SchemaUtilTest, AnySchemaTypeOk) { + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType( + 
"abc123!@#$%^&*()_-+=[{]}|\\;:'\",<.>?你好")) + .Build(); - ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_)); + ICING_ASSERT_OK(SchemaUtil::Validate(schema)); } -TEST_F(SchemaUtilTest, ClearedPropertyNameIsInvalid) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->clear_property_name(); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), +TEST(SchemaUtilTest, ClearedPropertyNameIsInvalid) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("foo") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + schema.mutable_types(0)->mutable_properties(0)->clear_property_name(); + ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, EmptyPropertyNameIsInvalid) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name(""); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), +TEST(SchemaUtilTest, EmptyPropertyNameIsInvalid) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, NonAlphanumericPropertyNameIsInvalid) { - auto type = schema_proto_.add_types(); - *type = 
CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("_"); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), +TEST(SchemaUtilTest, NonAlphanumericPropertyNameIsInvalid) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("a_b") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, AlphanumericPropertyNameOk) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("abc123"); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_)); +TEST(SchemaUtilTest, AlphanumericPropertyNameOk) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("abc123") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + ICING_ASSERT_OK(SchemaUtil::Validate(schema)); } -TEST_F(SchemaUtilTest, DuplicatePropertyNameIsInvalid) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto first_property = type->add_properties(); - first_property->set_property_name("DuplicatedProperty"); - first_property->set_data_type(PropertyConfigProto::DataType::STRING); - first_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - auto second_property = type->add_properties(); - 
second_property->set_property_name("DuplicatedProperty"); - second_property->set_data_type(PropertyConfigProto::DataType::STRING); - second_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), +TEST(SchemaUtilTest, DuplicatePropertyNameIsInvalid) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("DuplicatedProperty") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("DuplicatedProperty") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::ALREADY_EXISTS)); } -TEST_F(SchemaUtilTest, ClearedDataTypeIsInvalid) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("NewProperty"); - property->clear_data_type(); - property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), +TEST(SchemaUtilTest, ClearedDataTypeIsInvalid) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("NewProperty") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + schema.mutable_types(0)->mutable_properties(0)->clear_data_type(); + ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, UnknownDataTypeIsInvalid) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("NewProperty"); - property->set_data_type(PropertyConfigProto::DataType::UNKNOWN); - 
property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), +TEST(SchemaUtilTest, UnknownDataTypeIsInvalid) { + SchemaProto schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty( + PropertyConfigBuilder() + .SetName("NewProperty") + .SetDataType(PropertyConfigProto::DataType::UNKNOWN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, ClearedCardinalityIsInvalid) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("NewProperty"); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->clear_cardinality(); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), +TEST(SchemaUtilTest, ClearedCardinalityIsInvalid) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("NewProperty") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + schema.mutable_types(0)->mutable_properties(0)->clear_cardinality(); + ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, UnknownCardinalityIsInvalid) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("NewProperty"); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::UNKNOWN); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), +TEST(SchemaUtilTest, UnknownCardinalityIsInvalid) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + 
.SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("NewProperty") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_UNKNOWN))) + .Build(); + ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, ClearedPropertySchemaTypeIsInvalid) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("NewProperty"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - property->clear_schema_type(); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), +TEST(SchemaUtilTest, ClearedPropertySchemaTypeIsInvalid) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("NewProperty") + .SetDataType(TYPE_DOCUMENT) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, Invalid_EmptyPropertySchemaType) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("NewProperty"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - property->set_schema_type(""); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), +TEST(SchemaUtilTest, Invalid_EmptyPropertySchemaType) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("NewProperty") + .SetDataTypeDocument( + /*schema_type=*/"", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + 
ASSERT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(SchemaUtilTest, NoMatchingSchemaTypeIsInvalid) { - auto type = schema_proto_.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("NewProperty"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - property->set_schema_type("NewSchemaType"); - - ASSERT_THAT(SchemaUtil::Validate(schema_proto_), - StatusIs(libtextclassifier3::StatusCode::UNKNOWN, +TEST(SchemaUtilTest, NoMatchingSchemaTypeIsInvalid) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("NewProperty") + .SetDataTypeDocument( + /*schema_type=*/"NewSchemaType", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + ASSERT_THAT(SchemaUtil::Validate(schema), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, HasSubstr("Undefined 'schema_type'"))); } -TEST_F(SchemaUtilTest, NewOptionalPropertyIsCompatible) { +TEST(SchemaUtilTest, NewOptionalPropertyIsCompatible) { // Configure old schema - SchemaProto old_schema; - auto type = old_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop1") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); // Configure new schema with an optional field, not considered incompatible // since it's fine if old data doesn't have this optional field - SchemaProto new_schema_with_optional; - type = new_schema_with_optional.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - 
property->set_property_name("NewOptional"); - property->set_data_type(PropertyConfigProto::DataType::DOUBLE); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + SchemaProto new_schema_with_optional = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop1") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("NewOptional") + .SetDataType(TYPE_DOUBLE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); SchemaUtil::SchemaDelta schema_delta; - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, - new_schema_with_optional), + schema_delta.schema_types_changed_fully_compatible.insert(kEmailType); + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( + old_schema, new_schema_with_optional, no_dependencies_map), Eq(schema_delta)); } -TEST_F(SchemaUtilTest, NewRequiredPropertyIsIncompatible) { +TEST(SchemaUtilTest, NewRequiredPropertyIsIncompatible) { // Configure old schema - SchemaProto old_schema; - auto type = old_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop1") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); // Configure new schema with a required field, considered incompatible since // old data won't have this required field - SchemaProto new_schema_with_required; - type = new_schema_with_required.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("NewRequired"); - property->set_data_type(PropertyConfigProto::DataType::DOUBLE); - property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + SchemaProto new_schema_with_required = + 
SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop1") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("NewRequired") + .SetDataType(TYPE_DOUBLE) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_incompatible.emplace(kEmailType); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, - new_schema_with_required), + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( + old_schema, new_schema_with_required, no_dependencies_map), Eq(schema_delta)); } -TEST_F(SchemaUtilTest, NewSchemaMissingPropertyIsIncompatible) { +TEST(SchemaUtilTest, NewSchemaMissingPropertyIsIncompatible) { // Configure old schema - SchemaProto old_schema; - auto type = old_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("OldOptional"); - property->set_data_type(PropertyConfigProto::DataType::INT64); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop1") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName("OldOptional") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); // Configure new schema, new schema needs to at least have all the // previously defined properties - SchemaProto new_schema; - type = new_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop1") + .SetDataType(TYPE_STRING) + 
.SetCardinality(CARDINALITY_REQUIRED))) + .Build(); SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_incompatible.emplace(kEmailType); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + no_dependencies_map), Eq(schema_delta)); } -TEST_F(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) { +TEST(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) { // Configure less restrictive schema based on cardinality - SchemaProto less_restrictive_schema; - auto type = less_restrictive_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("Property"); - property->set_data_type(PropertyConfigProto::DataType::INT64); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); + SchemaProto less_restrictive_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); // Configure more restrictive schema based on cardinality - SchemaProto more_restrictive_schema; - type = more_restrictive_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - property = type->add_properties(); - property->set_property_name("Property"); - property->set_data_type(PropertyConfigProto::DataType::INT64); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - - // We can't have a new schema be less restrictive, REQUIRED->OPTIONAL + SchemaProto more_restrictive_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + // We can't have a new schema be more restrictive, 
REPEATED->OPTIONAL SchemaUtil::SchemaDelta incompatible_schema_delta; incompatible_schema_delta.schema_types_incompatible.emplace(kEmailType); + SchemaUtil::DependencyMap no_dependencies_map; EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( /*old_schema=*/less_restrictive_schema, - /*new_schema=*/more_restrictive_schema), + /*new_schema=*/more_restrictive_schema, no_dependencies_map), Eq(incompatible_schema_delta)); - // We can have the new schema be more restrictive, OPTIONAL->REPEATED; + // We can have the new schema be less restrictive, OPTIONAL->REPEATED; SchemaUtil::SchemaDelta compatible_schema_delta; + compatible_schema_delta.schema_types_changed_fully_compatible.insert( + kEmailType); EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( /*old_schema=*/more_restrictive_schema, - /*new_schema=*/less_restrictive_schema), + /*new_schema=*/less_restrictive_schema, no_dependencies_map), Eq(compatible_schema_delta)); } -TEST_F(SchemaUtilTest, DifferentDataTypeIsIncompatible) { +TEST(SchemaUtilTest, DifferentDataTypeIsIncompatible) { // Configure old schema, with an int64_t property - SchemaProto old_schema; - auto type = old_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - auto property = type->add_properties(); - property->set_property_name("Property"); - property->set_data_type(PropertyConfigProto::DataType::INT64); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); // Configure new schema, with a double property - SchemaProto new_schema; - type = new_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - property = type->add_properties(); - property->set_property_name("Property"); - property->set_data_type(PropertyConfigProto::DataType::DOUBLE); - 
property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataType(TYPE_DOUBLE) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_incompatible.emplace(kEmailType); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + no_dependencies_map), Eq(schema_delta)); } -TEST_F(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) { +TEST(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) { // Configure old schema, where Property is supposed to be a Person type - SchemaProto old_schema; - auto type = old_schema.add_types(); - *type = CreateSchemaTypeConfig(kPersonType); - - *type = CreateSchemaTypeConfig(kEmailType); - auto property = type->add_properties(); - property->set_property_name("Property"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - property->set_schema_type(kPersonType); + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_REPEATED))) + .AddType(SchemaTypeConfigBuilder() + .SetType(kMessageType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_REPEATED))) + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeDocument( + kPersonType, + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); // Configure new schema, where Property is 
supposed to be an Email type - SchemaProto new_schema; - type = new_schema.add_types(); - *type = CreateSchemaTypeConfig(kPersonType); - - *type = CreateSchemaTypeConfig(kEmailType); - property = type->add_properties(); - property->set_property_name("Property"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - property->set_schema_type(kEmailType); + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_REPEATED))) + .AddType(SchemaTypeConfigBuilder() + .SetType(kMessageType) + .AddProperty(PropertyConfigBuilder() + .SetName("prop") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_REPEATED))) + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeDocument( + kMessageType, + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_incompatible.emplace(kEmailType); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), - Eq(schema_delta)); + // kEmailType depends on kMessageType + SchemaUtil::DependencyMap dependencies_map = {{kMessageType, {kEmailType}}}; + SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta( + old_schema, new_schema, dependencies_map); + EXPECT_THAT(actual, Eq(schema_delta)); + EXPECT_THAT(actual.schema_types_incompatible, + testing::ElementsAre(kEmailType)); + EXPECT_THAT(actual.schema_types_deleted, testing::IsEmpty()); } -TEST_F(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) { +TEST(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) { // Configure old schema - SchemaProto old_schema; - auto old_type = old_schema.add_types(); - *old_type = 
CreateSchemaTypeConfig(kEmailType, kPersonType); - - auto old_property = old_type->add_properties(); - old_property->set_property_name("Property"); - old_property->set_data_type(PropertyConfigProto::DataType::STRING); - old_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + SchemaProto schema_with_indexed_property = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); // Configure new schema - SchemaProto new_schema; - auto new_type = new_schema.add_types(); - *new_type = CreateSchemaTypeConfig(kEmailType, kPersonType); - - auto new_property = new_type->add_properties(); - new_property->set_property_name("Property"); - new_property->set_data_type(PropertyConfigProto::DataType::STRING); - new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + SchemaProto schema_with_unindexed_property = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(MATCH_UNKNOWN, TOKENIZER_NONE) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); SchemaUtil::SchemaDelta schema_delta; - schema_delta.index_incompatible = true; + schema_delta.schema_types_index_incompatible.insert(kPersonType); // New schema gained a new indexed property. 
- old_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::UNKNOWN); - new_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( + schema_with_indexed_property, schema_with_unindexed_property, + no_dependencies_map), Eq(schema_delta)); // New schema lost an indexed property. - old_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - new_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::UNKNOWN); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta( + schema_with_indexed_property, schema_with_unindexed_property, + no_dependencies_map), Eq(schema_delta)); } -TEST_F(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) { +TEST(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) { // Configure old schema - SchemaProto old_schema; - auto old_type = old_schema.add_types(); - *old_type = CreateSchemaTypeConfig(kEmailType, kPersonType); - - auto old_property = old_type->add_properties(); - old_property->set_property_name("Property"); - old_property->set_data_type(PropertyConfigProto::DataType::STRING); - old_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); // Configure new schema - SchemaProto new_schema; - auto new_type = new_schema.add_types(); - *new_type = CreateSchemaTypeConfig(kEmailType, kPersonType); - - auto new_property = new_type->add_properties(); - 
new_property->set_property_name("Property"); - new_property->set_data_type(PropertyConfigProto::DataType::STRING); - new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - - new_property = new_type->add_properties(); - new_property->set_property_name("NewIndexedProperty"); - new_property->set_data_type(PropertyConfigProto::DataType::STRING); - new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - new_property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("NewIndexedProperty") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); SchemaUtil::SchemaDelta schema_delta; - schema_delta.index_incompatible = true; - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + schema_delta.schema_types_index_incompatible.insert(kPersonType); + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + no_dependencies_map), Eq(schema_delta)); } -TEST_F(SchemaUtilTest, AddingTypeIsCompatible) { +TEST(SchemaUtilTest, AddingTypeIsCompatible) { // Can add a new type, existing data isn't incompatible, since none of them // are of this new schema type - SchemaProto old_schema; - auto type = old_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - - SchemaProto new_schema; - type = new_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - type = new_schema.add_types(); - *type = CreateSchemaTypeConfig(kPersonType); + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + 
.AddProperty( + PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); SchemaUtil::SchemaDelta schema_delta; - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + schema_delta.schema_types_new.insert(kEmailType); + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + no_dependencies_map), Eq(schema_delta)); } -TEST_F(SchemaUtilTest, DeletingTypeIsNoted) { +TEST(SchemaUtilTest, DeletingTypeIsNoted) { // Can't remove an old type, new schema needs to at least have all the // previously defined schema otherwise the Documents of the missing schema // are invalid - SchemaProto old_schema; - auto type = old_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); - type = old_schema.add_types(); - *type = CreateSchemaTypeConfig(kPersonType); - - SchemaProto new_schema; - type = new_schema.add_types(); - *type = CreateSchemaTypeConfig(kEmailType); + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + 
.SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); SchemaUtil::SchemaDelta schema_delta; schema_delta.schema_types_deleted.emplace(kPersonType); - EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema), + SchemaUtil::DependencyMap no_dependencies_map; + EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema, + no_dependencies_map), Eq(schema_delta)); } -TEST_F(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTermMatchType) { - SchemaProto schema; - auto* type = schema.add_types(); - type->set_schema_type("MyType"); +TEST(SchemaUtilTest, DeletingPropertyAndChangingProperty) { + SchemaProto old_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("Property1") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property2") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); + + // Remove Property2 and make Property1 indexed now. Removing Property2 should + // be incompatible. 
+ SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty( + PropertyConfigBuilder() + .SetName("Property1") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_incompatible.emplace(kEmailType); + schema_delta.schema_types_index_incompatible.emplace(kEmailType); + SchemaUtil::DependencyMap no_dependencies_map; + SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta( + old_schema, new_schema, no_dependencies_map); + EXPECT_THAT(actual, Eq(schema_delta)); +} + +TEST(SchemaUtilTest, IndexNestedDocumentsIndexIncompatible) { + // Make two schemas. One that sets index_nested_properties to false and one + // that sets it to true. + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType(kEmailType) + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto no_nested_index_schema = + SchemaBuilder() + .AddType(email_type_config) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty(PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, + /*index_nested_properties=*/false) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + SchemaProto nested_index_schema = + SchemaBuilder() + .AddType(email_type_config) + .AddType(SchemaTypeConfigBuilder() + .SetType(kPersonType) + .AddProperty( + PropertyConfigBuilder() + .SetName("emails") + .SetDataTypeDocument( + kEmailType, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); + + // Going from index_nested_properties=false to index_nested_properties=true + // should make kPersonType index_incompatible. kEmailType should be + // unaffected. 
+ SchemaUtil::SchemaDelta schema_delta; + schema_delta.schema_types_index_incompatible.emplace(kPersonType); + SchemaUtil::DependencyMap dependencies_map = {{kEmailType, {kPersonType}}}; + SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta( + no_nested_index_schema, nested_index_schema, dependencies_map); + EXPECT_THAT(actual, Eq(schema_delta)); + + // Going from index_nested_properties=true to index_nested_properties=false + // should also make kPersonType index_incompatible. kEmailType should be + // unaffected. + actual = SchemaUtil::ComputeCompatibilityDelta( + nested_index_schema, no_nested_index_schema, dependencies_map); + EXPECT_THAT(actual, Eq(schema_delta)); +} - auto* prop = type->add_properties(); - prop->set_property_name("Foo"); - prop->set_data_type(PropertyConfigProto::DataType::STRING); - prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - prop->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); +TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTermMatchType) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataTypeString(MATCH_UNKNOWN, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); // Error if we don't set a term match type EXPECT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); // Passes once we set a term match type - prop->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); } -TEST_F(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) 
{ - SchemaProto schema; - auto* type = schema.add_types(); - type->set_schema_type("MyType"); - - auto* prop = type->add_properties(); - prop->set_property_name("Foo"); - prop->set_data_type(PropertyConfigProto::DataType::STRING); - prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - prop->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); +TEST(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_NONE) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); // Error if we don't set a tokenizer type EXPECT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); // Passes once we set a tokenizer type - prop->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty( + PropertyConfigBuilder() + .SetName("Foo") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); } -TEST_F(SchemaUtilTest, MultipleReferencesToSameNestedSchemaOk) { - SchemaProto schema; - - // Create a parent schema - auto type = schema.add_types(); - type->set_schema_type("ParentSchema"); - - // Create multiple references to the same child schema - auto property = type->add_properties(); - property->set_property_name("ChildProperty1"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_schema_type("ChildSchema"); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - - property = type->add_properties(); - property->set_property_name("ChildProperty2"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - 
property->set_schema_type("ChildSchema"); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - - // Create a child schema - type = schema.add_types(); - type->set_schema_type("ChildSchema"); +TEST(SchemaUtilTest, MultipleReferencesToSameNestedSchemaOk) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("ChildSchema")) + .AddType(SchemaTypeConfigBuilder() + .SetType("ParentSchema") + .AddProperty(PropertyConfigBuilder() + .SetName("ChildProperty1") + .SetDataTypeDocument( + "ChildSchema", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED)) + .AddProperty(PropertyConfigBuilder() + .SetName("ChildProperty2") + .SetDataTypeDocument( + "ChildSchema", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); EXPECT_THAT(SchemaUtil::Validate(schema), IsOk()); } -TEST_F(SchemaUtilTest, InvalidSelfReference) { - SchemaProto schema; - +TEST(SchemaUtilTest, InvalidSelfReference) { // Create a schema with a self-reference cycle in it: OwnSchema -> OwnSchema - auto type = schema.add_types(); - type->set_schema_type("OwnSchema"); - - // Reference a child schema, so far so good - auto property = type->add_properties(); - property->set_property_name("NestedDocument"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_schema_type("OwnSchema"); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("OwnSchema") + .AddProperty(PropertyConfigBuilder() + .SetName("NestedDocument") + .SetDataTypeDocument( + "OwnSchema", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); EXPECT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, HasSubstr("Infinite loop"))); } -TEST_F(SchemaUtilTest, InvalidSelfReferenceEvenWithOtherProperties) { - SchemaProto schema; - 
+TEST(SchemaUtilTest, InvalidSelfReferenceEvenWithOtherProperties) { // Create a schema with a self-reference cycle in it: OwnSchema -> OwnSchema - auto type = schema.add_types(); - type->set_schema_type("OwnSchema"); - - // Reference a child schema, so far so good - auto property = type->add_properties(); - property->set_property_name("NestedDocument"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_schema_type("OwnSchema"); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - - property = type->add_properties(); - property->set_property_name("SomeString"); - property->set_data_type(PropertyConfigProto::DataType::STRING); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::PREFIX); - property->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("OwnSchema") + .AddProperty(PropertyConfigBuilder() + .SetName("NestedDocument") + .SetDataTypeDocument( + "OwnSchema", + /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("SomeString") + .SetDataTypeString(MATCH_PREFIX, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); EXPECT_THAT(SchemaUtil::Validate(schema), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT, HasSubstr("Infinite loop"))); } -TEST_F(SchemaUtilTest, InvalidInfiniteLoopTwoDegrees) { - SchemaProto schema; - +TEST(SchemaUtilTest, InvalidInfiniteLoopTwoDegrees) { // Create a schema for the parent schema - auto type = schema.add_types(); - type->set_schema_type("A"); - - // Reference schema B, so far so good - auto property = type->add_properties(); - property->set_property_name("NestedDocument"); - 
property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_schema_type("B"); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - - // Create the child schema - type = schema.add_types(); - type->set_schema_type("B"); - - // Reference the schema A, causing an infinite loop of references. - property = type->add_properties(); - property->set_property_name("NestedDocument"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_schema_type("A"); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); + SchemaProto schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType("A") + // Reference schema B, so far so good + .AddProperty(PropertyConfigBuilder() + .SetName("NestedDocument") + .SetDataTypeDocument( + "B", /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + // Create the child schema + .AddType( + SchemaTypeConfigBuilder() + .SetType("B") + // Reference the schema A, causing an infinite loop of + // references. 
+ .AddProperty(PropertyConfigBuilder() + .SetName("NestedDocument") + .SetDataTypeDocument( + "A", /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); // Two degrees of referencing: A -> B -> A EXPECT_THAT(SchemaUtil::Validate(schema), @@ -725,41 +1322,40 @@ TEST_F(SchemaUtilTest, InvalidInfiniteLoopTwoDegrees) { HasSubstr("Infinite loop"))); } -TEST_F(SchemaUtilTest, InvalidInfiniteLoopThreeDegrees) { - SchemaProto schema; - - // Create a schema for the parent schema - auto type = schema.add_types(); - type->set_schema_type("A"); - - // Reference schema B , so far so good - auto property = type->add_properties(); - property->set_property_name("NestedDocument"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_schema_type("B"); - property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - - // Create the child schema - type = schema.add_types(); - type->set_schema_type("B"); - - // Reference schema C, so far so good - property = type->add_properties(); - property->set_property_name("NestedDocument"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_schema_type("C"); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - - // Create the child schema - type = schema.add_types(); - type->set_schema_type("C"); - - // Reference schema A, no good - property = type->add_properties(); - property->set_property_name("NestedDocument"); - property->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - property->set_schema_type("A"); - property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); +TEST(SchemaUtilTest, InvalidInfiniteLoopThreeDegrees) { + SchemaProto schema = + SchemaBuilder() + // Create a schema for the parent schema + .AddType( + SchemaTypeConfigBuilder() + .SetType("A") + // Reference schema B, so far so good + .AddProperty(PropertyConfigBuilder() + .SetName("NestedDocument") + .SetDataTypeDocument( + "B", 
/*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_OPTIONAL))) + // Create the child schema + .AddType( + SchemaTypeConfigBuilder() + .SetType("B") + // Reference schema C, so far so good + .AddProperty(PropertyConfigBuilder() + .SetName("NestedDocument") + .SetDataTypeDocument( + "C", /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + // Create the child schema + .AddType( + SchemaTypeConfigBuilder() + .SetType("C") + // Reference schema C, so far so good + .AddProperty(PropertyConfigBuilder() + .SetName("NestedDocument") + .SetDataTypeDocument( + "A", /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); // Three degrees of referencing: A -> B -> C -> A EXPECT_THAT(SchemaUtil::Validate(schema), diff --git a/icing/schema/section-manager.cc b/icing/schema/section-manager.cc index a10e9b9..a0893e6 100644 --- a/icing/schema/section-manager.cc +++ b/icing/schema/section-manager.cc @@ -165,16 +165,6 @@ std::vector<std::string_view> GetStringPropertyContent( return values; } -// Helper function to get metadata list of a type config -libtextclassifier3::StatusOr<std::vector<SectionMetadata>> GetMetadataList( - const KeyMapper<SchemaTypeId>& schema_type_mapper, - const std::vector<std::vector<SectionMetadata>>& section_metadata_cache, - const std::string& type_config_name) { - ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id, - schema_type_mapper.Get(type_config_name)); - return section_metadata_cache.at(schema_type_id); -} - } // namespace SectionManager::SectionManager( @@ -263,18 +253,16 @@ SectionManager::GetStringSectionContent(const DocumentProto& document, "Section id %d is greater than the max value %d", section_id, kMaxSectionId)); } - ICING_ASSIGN_OR_RETURN( - const std::vector<SectionMetadata>& metadata_list, - GetMetadataList(schema_type_mapper_, section_metadata_cache_, - document.schema())); - if (section_id >= metadata_list.size()) { + ICING_ASSIGN_OR_RETURN(const 
std::vector<SectionMetadata>* metadata_list, + GetMetadataList(document.schema())); + if (section_id >= metadata_list->size()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Section with id %d doesn't exist in type config %s", section_id, document.schema().c_str())); } // The index of metadata list is the same as the section id, so we can use // section id as the index. - return GetStringSectionContent(document, metadata_list[section_id].path); + return GetStringSectionContent(document, metadata_list->at(section_id).path); } libtextclassifier3::StatusOr<const SectionMetadata*> @@ -300,12 +288,10 @@ SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id, libtextclassifier3::StatusOr<std::vector<Section>> SectionManager::ExtractSections(const DocumentProto& document) const { - ICING_ASSIGN_OR_RETURN( - const std::vector<SectionMetadata>& metadata_list, - GetMetadataList(schema_type_mapper_, section_metadata_cache_, - document.schema())); + ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list, + GetMetadataList(document.schema())); std::vector<Section> sections; - for (const auto& section_metadata : metadata_list) { + for (const auto& section_metadata : *metadata_list) { auto section_content_or = GetStringSectionContent(document, section_metadata.path); // Adds to result vector if section is found in document @@ -317,5 +303,12 @@ SectionManager::ExtractSections(const DocumentProto& document) const { return sections; } +libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*> +SectionManager::GetMetadataList(const std::string& type_config_name) const { + ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id, + schema_type_mapper_.Get(type_config_name)); + return §ion_metadata_cache_.at(schema_type_id); +} + } // namespace lib } // namespace icing diff --git a/icing/schema/section-manager.h b/icing/schema/section-manager.h index 191a169..51eb133 100644 --- a/icing/schema/section-manager.h +++ 
b/icing/schema/section-manager.h @@ -30,7 +30,9 @@ namespace icing { namespace lib { -inline constexpr char kPropertySeparator[] = "."; +inline constexpr std::string_view kPropertySeparator = "."; +inline constexpr std::string_view kLBracket = "["; +inline constexpr std::string_view kRBracket = "]"; // This class provides section-related operations. It assigns sections according // to type configs and extracts section / sections from documents. @@ -94,6 +96,12 @@ class SectionManager { libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections( const DocumentProto& document) const; + // Returns: + // - On success, the section metadatas for the specified type + // - NOT_FOUND if the type config name is not present in the schema + libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*> + GetMetadataList(const std::string& type_config_name) const; + private: // Use SectionManager::Create() to instantiate explicit SectionManager( diff --git a/icing/schema/section-manager_test.cc b/icing/schema/section-manager_test.cc index 15d9a19..3dcc5a9 100644 --- a/icing/schema/section-manager_test.cc +++ b/icing/schema/section-manager_test.cc @@ -20,7 +20,6 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" -#include "icing/proto/schema.proto.h" #include "icing/proto/schema.pb.h" #include "icing/proto/term.pb.h" #include "icing/schema/schema-util.h" diff --git a/icing/scoring/bm25f-calculator.cc b/icing/scoring/bm25f-calculator.cc index 7495e98..4822d7f 100644 --- a/icing/scoring/bm25f-calculator.cc +++ b/icing/scoring/bm25f-calculator.cc @@ -42,24 +42,25 @@ constexpr float k1_ = 1.2f; constexpr float b_ = 0.7f; // TODO(b/158603900): add tests for Bm25fCalculator -Bm25fCalculator::Bm25fCalculator(const DocumentStore *document_store) +Bm25fCalculator::Bm25fCalculator(const DocumentStore* document_store) : document_store_(document_store) {} // During initialization, Bm25fCalculator iterates through // hit-iterators 
for each query term to pre-compute n(q_i) for each corpus under // consideration. void Bm25fCalculator::PrepareToScore( - std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>> - *query_term_iterators) { + std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>* + query_term_iterators) { Clear(); TermId term_id = 0; - for (auto &iter : *query_term_iterators) { - const std::string &term = iter.first; + for (auto& iter : *query_term_iterators) { + const std::string& term = iter.first; if (term_id_map_.find(term) != term_id_map_.end()) { continue; } term_id_map_[term] = ++term_id; - DocHitInfoIterator *term_it = iter.second.get(); + DocHitInfoIterator* term_it = iter.second.get(); + while (term_it->Advance().ok()) { auto status_or = document_store_->GetDocumentAssociatedScoreData( term_it->doc_hit_info().document_id()); @@ -89,8 +90,8 @@ void Bm25fCalculator::Clear() { // where IDF(q_i) is the Inverse Document Frequency (IDF) weight of the query // term q_i in the corpus with document D, and tf(q_i, D) is the weighted and // normalized term frequency of query term q_i in the document D. 
-float Bm25fCalculator::ComputeScore(const DocHitInfoIterator *query_it, - const DocHitInfo &hit_info, +float Bm25fCalculator::ComputeScore(const DocHitInfoIterator* query_it, + const DocHitInfo& hit_info, double default_score) { auto status_or = document_store_->GetDocumentAssociatedScoreData(hit_info.document_id()); @@ -103,7 +104,7 @@ float Bm25fCalculator::ComputeScore(const DocHitInfoIterator *query_it, query_it->PopulateMatchedTermsStats(&matched_terms_stats); float score = 0; - for (const TermMatchInfo &term_match_info : matched_terms_stats) { + for (const TermMatchInfo& term_match_info : matched_terms_stats) { float idf_weight = GetCorpusIdfWeightForTerm(term_match_info.term, data.corpus_id()); float normalized_tf = @@ -186,8 +187,8 @@ float Bm25fCalculator::GetCorpusAvgDocLength(CorpusId corpus_id) { // |D| is the #tokens in D, avgdl is the average document length in the corpus, // k1 and b are smoothing parameters. float Bm25fCalculator::ComputedNormalizedTermFrequency( - const TermMatchInfo &term_match_info, const DocHitInfo &hit_info, - const DocumentAssociatedScoreData &data) { + const TermMatchInfo& term_match_info, const DocHitInfo& hit_info, + const DocumentAssociatedScoreData& data) { uint32_t dl = data.length_in_tokens(); float avgdl = GetCorpusAvgDocLength(data.corpus_id()); float f_q = @@ -204,7 +205,7 @@ float Bm25fCalculator::ComputedNormalizedTermFrequency( // Note: once we support section weights, we should update this function to // compute the weighted term frequency. 
float Bm25fCalculator::ComputeTermFrequencyForMatchedSections( - CorpusId corpus_id, const TermMatchInfo &term_match_info) const { + CorpusId corpus_id, const TermMatchInfo& term_match_info) const { float sum = 0.0f; SectionIdMask sections = term_match_info.section_ids_mask; while (sections != 0) { diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc index b7e1b92..a4734b4 100644 --- a/icing/scoring/scorer.cc +++ b/icing/scoring/scorer.cc @@ -89,6 +89,7 @@ class RelevanceScoreScorer : public Scorer { if (!query_it) { return default_score_; } + return static_cast<double>( bm25f_calculator_->ComputeScore(query_it, hit_info, default_score_)); } @@ -122,11 +123,11 @@ class UsageScorer : public Scorer { case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT: return usage_scores.usage_type3_count; case ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP: - return usage_scores.usage_type1_last_used_timestamp_s; + return usage_scores.usage_type1_last_used_timestamp_s * 1000.0; case ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP: - return usage_scores.usage_type2_last_used_timestamp_s; + return usage_scores.usage_type2_last_used_timestamp_s * 1000.0; case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP: - return usage_scores.usage_type3_last_used_timestamp_s; + return usage_scores.usage_type3_last_used_timestamp_s * 1000.0; default: // This shouldn't happen if this scorer is used correctly. 
return default_score_; diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc index b114515..8b89514 100644 --- a/icing/scoring/scorer_test.cc +++ b/icing/scoring/scorer_test.cc @@ -25,6 +25,7 @@ #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/store/document-id.h" #include "icing/store/document-store.h" @@ -38,6 +39,12 @@ namespace lib { namespace { using ::testing::Eq; +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; + +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; + class ScorerTest : public testing::Test { protected: ScorerTest() @@ -64,13 +71,14 @@ class ScorerTest : public testing::Test { document_store_ = std::move(create_result.document_store); // Creates a simple email schema - SchemaProto test_email_schema; - auto type_config = test_email_schema.add_types(); - type_config->set_schema_type("email"); - auto subject = type_config->add_properties(); - subject->set_property_name("subject"); - subject->set_data_type(PropertyConfigProto::DataType::STRING); - subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); + SchemaProto test_email_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED))) + .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(test_email_schema)); } @@ -87,6 +95,10 @@ class ScorerTest : public testing::Test { const FakeClock& fake_clock2() { return fake_clock2_; } + void SetFakeClock1Time(int64_t new_time) { + fake_clock1_.SetSystemTimeMilliseconds(new_time); + } + private: const std::string test_dir_; const std::string doc_store_dir_; @@ -115,7 +127,7 @@ TEST_F(ScorerTest, 
CreationWithNullPointerShouldFail) { StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); } -TEST_F(ScorerTest, ShouldGetDefaultScore) { +TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentDoesntExist) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<Scorer> scorer, Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, @@ -127,6 +139,66 @@ TEST_F(ScorerTest, ShouldGetDefaultScore) { EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10)); } +TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsDeleted) { + // Creates a test document with a provided score + DocumentProto test_document = DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .SetScore(42) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + document_store()->Put(test_document)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Scorer> scorer, + Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, + /*default_score=*/10, document_store())); + + DocHitInfo docHitInfo = DocHitInfo(document_id); + + // The document's score is returned + EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(42)); + + // Delete the document and check that the caller-provided default score is + // returned + EXPECT_THAT(document_store()->Delete(document_id), IsOk()); + EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10)); +} + +TEST_F(ScorerTest, ShouldGetDefaultScoreIfDocumentIsExpired) { + // Creates a test document with a provided score + int64_t creation_time = fake_clock1().GetSystemTimeMilliseconds(); + int64_t ttl = 100; + DocumentProto test_document = DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .SetScore(42) + .SetCreationTimestampMs(creation_time) + .SetTtlMs(ttl) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + document_store()->Put(test_document)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Scorer> scorer, + 
Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, + /*default_score=*/10, document_store())); + + DocHitInfo docHitInfo = DocHitInfo(document_id); + + // The document's score is returned since the document hasn't expired yet. + EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(42)); + + // Expire the document and check that the caller-provided default score is + // returned + SetFakeClock1Time(creation_time + ttl + 10); + EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10)); +} + TEST_F(ScorerTest, ShouldGetDefaultDocumentScore) { // Creates a test document with the default document score 0 DocumentProto test_document = @@ -389,7 +461,7 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType1) { /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1); ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1_time1)); - EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(1)); + EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(1000)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0)); @@ -398,7 +470,7 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType1) { /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/5000, UsageReport::USAGE_TYPE1); ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1_time5)); - EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(5)); + EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(5000)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0)); @@ -407,7 +479,7 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType1) { /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/3000, UsageReport::USAGE_TYPE1); ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1_time3)); - EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(5)); + EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(5000)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); 
EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0)); } @@ -450,7 +522,7 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType2) { UsageReport::USAGE_TYPE2); ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type2_time1)); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); - EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(1)); + EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(1000)); EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0)); // Report usage with timestamp = 5000ms, score should be updated. @@ -459,7 +531,7 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType2) { UsageReport::USAGE_TYPE2); ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type2_time5)); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); - EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(5)); + EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(5000)); EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0)); // Report usage with timestamp = 3000ms, score should not be updated. @@ -468,7 +540,7 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType2) { UsageReport::USAGE_TYPE2); ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type2_time3)); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); - EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(5)); + EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(5000)); EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0)); } @@ -511,7 +583,7 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType3) { ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type3_time1)); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); - EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(1)); + EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(1000)); // Report usage with timestamp = 5000ms, score should be updated. 
UsageReport usage_report_type3_time5 = CreateUsageReport( @@ -520,7 +592,7 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType3) { ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type3_time5)); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); - EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(5)); + EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(5000)); // Report usage with timestamp = 3000ms, score should not be updated. UsageReport usage_report_type3_time3 = CreateUsageReport( @@ -529,7 +601,7 @@ TEST_F(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType3) { ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type3_time3)); EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0)); EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0)); - EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(5)); + EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(5000)); } TEST_F(ScorerTest, NoScorerShouldAlwaysReturnDefaultScore) { @@ -557,6 +629,37 @@ TEST_F(ScorerTest, NoScorerShouldAlwaysReturnDefaultScore) { EXPECT_THAT(scorer->GetScore(docHitInfo3), Eq(111)); } +TEST_F(ScorerTest, ShouldScaleUsageTimestampScoreForMaxTimestamp) { + DocumentProto test_document = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .SetCreationTimestampMs(fake_clock1().GetSystemTimeMilliseconds()) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, + document_store()->Put(test_document)); + + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<Scorer> scorer1, + Scorer::Create( + ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP, + /*default_score=*/0, document_store())); + DocHitInfo docHitInfo = DocHitInfo(document_id); + + // Create usage report for the maximum allowable timestamp. 
+ UsageReport usage_report_type1 = CreateUsageReport( + /*name_space=*/"icing", /*uri=*/"email/1", + /*timestamp_ms=*/std::numeric_limits<uint32_t>::max() * 1000.0, + UsageReport::USAGE_TYPE1); + + double max_int_usage_timestamp_score = + std::numeric_limits<uint32_t>::max() * 1000.0; + ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1)); + EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(max_int_usage_timestamp_score)); +} + } // namespace } // namespace lib diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc index 65eecd1..125e2a7 100644 --- a/icing/scoring/scoring-processor_test.cc +++ b/icing/scoring/scoring-processor_test.cc @@ -24,6 +24,7 @@ #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" #include "icing/proto/scoring.pb.h" +#include "icing/schema-builder.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" #include "icing/testing/tmp-directory.h" @@ -36,6 +37,12 @@ using ::testing::ElementsAre; using ::testing::IsEmpty; using ::testing::SizeIs; +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; + +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; + class ScoringProcessorTest : public testing::Test { protected: ScoringProcessorTest() @@ -60,14 +67,14 @@ class ScoringProcessorTest : public testing::Test { document_store_ = std::move(create_result.document_store); // Creates a simple email schema - SchemaProto test_email_schema; - auto type_config = test_email_schema.add_types(); - type_config->set_schema_type("email"); - auto subject = type_config->add_properties(); - subject->set_property_name("subject"); - subject->set_data_type(PropertyConfigProto::DataType::STRING); - subject->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - + SchemaProto test_email_schema = + SchemaBuilder() + 
.AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); ICING_ASSERT_OK(schema_store_->SetSchema(test_email_schema)); } @@ -603,9 +610,9 @@ TEST_F(ScoringProcessorTest, ShouldScoreByUsageTimestamp) { DocHitInfo doc_hit_info2(document_id2); DocHitInfo doc_hit_info3(document_id3); ScoredDocumentHit scored_document_hit1(document_id1, kSectionIdMaskNone, - /*score=*/1); + /*score=*/1000); ScoredDocumentHit scored_document_hit2(document_id2, kSectionIdMaskNone, - /*score=*/5); + /*score=*/5000); ScoredDocumentHit scored_document_hit3(document_id3, kSectionIdMaskNone, /*score=*/0); diff --git a/icing/store/document-log-creator.cc b/icing/store/document-log-creator.cc new file mode 100644 index 0000000..5e0426e --- /dev/null +++ b/icing/store/document-log-creator.cc @@ -0,0 +1,196 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/store/document-log-creator.h" + +#include <memory> +#include <string> +#include <utility> + +#include "icing/text_classifier/lib3/utils/base/logging.h" +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/annotate.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/file/file-backed-proto-log.h" +#include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/proto/document_wrapper.pb.h" +#include "icing/util/logging.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +// Used in DocumentId mapper to mark a document as deleted +constexpr char kDocumentLogFilename[] = "document_log"; + +std::string DocumentLogFilenameV0() { + // Originally only had this one version, no suffix. + return kDocumentLogFilename; +} + +std::string DocumentLogFilenameV1() { + return absl_ports::StrCat(kDocumentLogFilename, "_v1"); +} + +std::string MakeDocumentLogFilenameV0(const std::string& base_dir) { + return absl_ports::StrCat(base_dir, "/", DocumentLogFilenameV0()); +} + +std::string MakeDocumentLogFilenameV1(const std::string& base_dir) { + return absl_ports::StrCat(base_dir, "/", DocumentLogFilenameV1()); +} + +} // namespace + +std::string DocumentLogCreator::GetDocumentLogFilename() { + // This should always return the latest version of the document log in use. + // The current latest version is V1. 
+ return DocumentLogFilenameV1(); +} + +libtextclassifier3::StatusOr<DocumentLogCreator::CreateResult> +DocumentLogCreator::Create(const Filesystem* filesystem, + const std::string& base_dir) { + bool v0_exists = + filesystem->FileExists(MakeDocumentLogFilenameV0(base_dir).c_str()); + bool v1_exists = + filesystem->FileExists(MakeDocumentLogFilenameV1(base_dir).c_str()); + + bool regen_derived_files = false; + if (v0_exists && !v1_exists) { + ICING_RETURN_IF_ERROR(MigrateFromV0ToV1(filesystem, base_dir)); + + // Need to regenerate derived files since documents may be written to a + // different file offset in the log. + regen_derived_files = true; + } else if (!v1_exists) { + // First time initializing a v1 log. There are no existing derived files at + // this point, so we should generate some. "regenerate" here also means + // "generate for the first time", i.e. we shouldn't expect there to be any + // existing derived files. + regen_derived_files = true; + } + + ICING_ASSIGN_OR_RETURN( + PortableFileBackedProtoLog<DocumentWrapper>::CreateResult + log_create_result, + PortableFileBackedProtoLog<DocumentWrapper>::Create( + filesystem, MakeDocumentLogFilenameV1(base_dir), + PortableFileBackedProtoLog<DocumentWrapper>::Options( + /*compress_in=*/true))); + + CreateResult create_result = {std::move(log_create_result), + regen_derived_files}; + return create_result; +} + +libtextclassifier3::Status DocumentLogCreator::MigrateFromV0ToV1( + const Filesystem* filesystem, const std::string& base_dir) { + ICING_VLOG(1) << "Migrating from v0 to v1 document log."; + + // Our v0 proto log was non-portable, create it so we can read protos out from + // it. 
+ auto v0_create_result_or = FileBackedProtoLog<DocumentWrapper>::Create( + filesystem, MakeDocumentLogFilenameV0(base_dir), + FileBackedProtoLog<DocumentWrapper>::Options( + /*compress_in=*/true)); + if (!v0_create_result_or.ok()) { + return absl_ports::Annotate( + v0_create_result_or.status(), + "Failed to initialize v0 document log while migrating."); + return v0_create_result_or.status(); + } + FileBackedProtoLog<DocumentWrapper>::CreateResult v0_create_result = + std::move(v0_create_result_or).ValueOrDie(); + std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> v0_proto_log = + std::move(v0_create_result.proto_log); + + // Create a v1 portable proto log that we will write our protos to. + auto v1_create_result_or = + PortableFileBackedProtoLog<DocumentWrapper>::Create( + filesystem, MakeDocumentLogFilenameV1(base_dir), + PortableFileBackedProtoLog<DocumentWrapper>::Options( + /*compress_in=*/true)); + if (!v1_create_result_or.ok()) { + return absl_ports::Annotate( + v1_create_result_or.status(), + "Failed to initialize v1 document log while migrating."); + } + PortableFileBackedProtoLog<DocumentWrapper>::CreateResult v1_create_result = + std::move(v1_create_result_or).ValueOrDie(); + std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> v1_proto_log = + std::move(v1_create_result.proto_log); + + // Dummy empty document to be used when copying over deleted documents. + DocumentProto empty_document; + + // Start reading out from the old log and putting them in the new log. + auto iterator = v0_proto_log->GetIterator(); + auto iterator_status = iterator.Advance(); + while (iterator_status.ok()) { + libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or = + v0_proto_log->ReadProto(iterator.GetOffset()); + + bool deleted_document = false; + DocumentWrapper document_wrapper; + if (absl_ports::IsNotFound(document_wrapper_or.status())) { + // Proto was erased, we can skip copying this into our new log. 
+ *document_wrapper.mutable_document() = empty_document; + deleted_document = true; + } else if (!document_wrapper_or.ok()) { + // Some real error, pass up + return document_wrapper_or.status(); + } else { + document_wrapper = std::move(document_wrapper_or).ValueOrDie(); + } + + auto offset_or = v1_proto_log->WriteProto(document_wrapper); + if (!offset_or.ok()) { + return absl_ports::Annotate( + offset_or.status(), + "Failed to write proto to v1 document log while migrating."); + } + + // If the original document was deleted, erase the proto we just wrote. + // We do this to maintain the document_ids, i.e. we still want document_id 2 + // to point to a deleted document even though we may not have the document + // contents anymore. DocumentStore guarantees that the document_ids don't + // change unless an Optimize is triggered. + if (deleted_document) { + int64_t offset = offset_or.ValueOrDie(); + auto erased_status = v1_proto_log->EraseProto(offset); + if (!erased_status.ok()) { + return absl_ports::Annotate( + erased_status, + "Failed to erase proto in v1 document log while migrating."); + } + } + + iterator_status = iterator.Advance(); + } + + // Close out our file log pointers. + v0_proto_log.reset(); + v1_proto_log.reset(); + + return libtextclassifier3::Status::OK; +} + +} // namespace lib +} // namespace icing diff --git a/icing/store/document-log-creator.h b/icing/store/document-log-creator.h new file mode 100644 index 0000000..51cf497 --- /dev/null +++ b/icing/store/document-log-creator.h @@ -0,0 +1,77 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_STORE_DOCUMENT_LOG_CREATOR_H_ +#define ICING_STORE_DOCUMENT_LOG_CREATOR_H_ + +#include <string> + +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" +#include "icing/proto/document_wrapper.pb.h" + +namespace icing { +namespace lib { + +// Handles creation of the document log and any underlying migrations that may +// be necessary. +class DocumentLogCreator { + public: + struct CreateResult { + // The create result passed up from the PortableFileBackedProtoLog::Create. + // Contains the document log. + PortableFileBackedProtoLog<DocumentWrapper>::CreateResult log_create_result; + + // Whether the caller needs to also regenerate/generate any derived files + // based off of the initialized document log. + bool regen_derived_files; + }; + + // Creates the document log in the base_dir. Will create one if it doesn't + // already exist. + // + // This also handles any potential migrations from old document log versions. + // At the end of this call, the most up-to-date log will be returned and will + // be usable. + // + // Returns: + // CreateResult on success. + // INTERNAL on any I/O error. + static libtextclassifier3::StatusOr<DocumentLogCreator::CreateResult> Create( + const Filesystem* filesystem, const std::string& base_dir); + + // Returns the filename of the document log, without any directory prefixes. + // Used mainly for testing purposes. 
+ static std::string GetDocumentLogFilename(); + + private: + // Handles migrating a v0 document log (not portable) to a v1 document log + // (portable). This will initialize the log in the beginning, and close it + // when migration is done. Callers will need to reinitialize the log on their + // own. + // + // Returns: + // OK on success. + // INVALID_ARGUMENT if some invalid option was passed to the document log. + // INTERNAL on I/O error. + static libtextclassifier3::Status MigrateFromV0ToV1( + const Filesystem* filesystem, const std::string& base_dir); +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_STORE_DOCUMENT_LOG_CREATOR_H_ diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 72bf736..226a96b 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -19,6 +19,7 @@ #include <memory> #include <string> #include <string_view> +#include <unordered_map> #include <utility> #include <vector> @@ -32,18 +33,22 @@ #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" #include "icing/file/memory-mapped-file.h" +#include "icing/file/portable-file-backed-proto-log.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" #include "icing/proto/logging.pb.h" +#include "icing/proto/storage.pb.h" #include "icing/schema/schema-store.h" #include "icing/store/corpus-associated-scoring-data.h" #include "icing/store/corpus-id.h" #include "icing/store/document-associated-score-data.h" #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" +#include "icing/store/document-log-creator.h" #include "icing/store/key-mapper.h" #include "icing/store/namespace-id.h" +#include "icing/store/usage-store.h" #include "icing/tokenization/language-segmenter.h" #include "icing/util/clock.h" #include "icing/util/crc32.h" @@ -59,7 +64,6 @@ namespace { // Used in DocumentId mapper to mark a document as 
deleted constexpr int64_t kDocDeletedFlag = -1; -constexpr char kDocumentLogFilename[] = "document_log"; constexpr char kDocumentIdMapperFilename[] = "document_id_mapper"; constexpr char kDocumentStoreHeaderFilename[] = "document_store_header"; constexpr char kScoreCacheFilename[] = "score_cache"; @@ -69,7 +73,9 @@ constexpr char kNamespaceMapperFilename[] = "namespace_mapper"; constexpr char kUsageStoreDirectoryName[] = "usage_store"; constexpr char kCorpusIdMapperFilename[] = "corpus_mapper"; -constexpr int32_t kUriMapperMaxSize = 12 * 1024 * 1024; // 12 MiB +// Determined through manual testing to allow for 1 million uris. 1 million +// because we allow up to 1 million DocumentIds. +constexpr int32_t kUriMapperMaxSize = 36 * 1024 * 1024; // 36 MiB // 384 KiB for a KeyMapper would allow each internal array to have a max of // 128 KiB for storage. @@ -82,33 +88,6 @@ DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) { return document_wrapper; } -DocumentWrapper CreateDocumentTombstone(std::string_view document_namespace, - std::string_view document_uri) { - DocumentWrapper document_wrapper; - document_wrapper.set_deleted(true); - DocumentProto* document = document_wrapper.mutable_document(); - document->set_namespace_(std::string(document_namespace)); - document->set_uri(std::string(document_uri)); - return document_wrapper; -} - -DocumentWrapper CreateNamespaceTombstone(std::string_view document_namespace) { - DocumentWrapper document_wrapper; - document_wrapper.set_deleted(true); - DocumentProto* document = document_wrapper.mutable_document(); - document->set_namespace_(std::string(document_namespace)); - return document_wrapper; -} - -DocumentWrapper CreateSchemaTypeTombstone( - std::string_view document_schema_type) { - DocumentWrapper document_wrapper; - document_wrapper.set_deleted(true); - DocumentProto* document = document_wrapper.mutable_document(); - document->set_schema(std::string(document_schema_type)); - return document_wrapper; -} - 
std::string MakeHeaderFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename); } @@ -117,10 +96,6 @@ std::string MakeDocumentIdMapperFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename); } -std::string MakeDocumentLogFilename(const std::string& base_dir) { - return absl_ports::StrCat(base_dir, "/", kDocumentLogFilename); -} - std::string MakeScoreCacheFilename(const std::string& base_dir) { return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename); } @@ -203,20 +178,20 @@ DocumentStore::DocumentStore(const Filesystem* filesystem, libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( const DocumentProto& document, int32_t num_tokens, - NativePutDocumentStats* put_document_stats) { + PutDocumentStatsProto* put_document_stats) { return Put(DocumentProto(document), num_tokens, put_document_stats); } libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put( DocumentProto&& document, int32_t num_tokens, - NativePutDocumentStats* put_document_stats) { + PutDocumentStatsProto* put_document_stats) { document.mutable_internal_fields()->set_length_in_tokens(num_tokens); return InternalPut(document, put_document_stats); } DocumentStore::~DocumentStore() { if (initialized_) { - if (!PersistToDisk().ok()) { + if (!PersistToDisk(PersistType::FULL).ok()) { ICING_LOG(ERROR) << "Error persisting to disk in DocumentStore destructor"; } @@ -226,15 +201,18 @@ DocumentStore::~DocumentStore() { libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( const Filesystem* filesystem, const std::string& base_dir, const Clock* clock, const SchemaStore* schema_store, - NativeInitializeStats* initialize_stats) { + bool force_recovery_and_revalidate_documents, + InitializeStatsProto* initialize_stats) { ICING_RETURN_ERROR_IF_NULL(filesystem); ICING_RETURN_ERROR_IF_NULL(clock); ICING_RETURN_ERROR_IF_NULL(schema_store); auto document_store = 
std::unique_ptr<DocumentStore>( new DocumentStore(filesystem, base_dir, clock, schema_store)); - ICING_ASSIGN_OR_RETURN(DataLoss data_loss, - document_store->Initialize(initialize_stats)); + ICING_ASSIGN_OR_RETURN( + DataLoss data_loss, + document_store->Initialize(force_recovery_and_revalidate_documents, + initialize_stats)); CreateResult create_result; create_result.document_store = std::move(document_store); @@ -243,42 +221,57 @@ libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create( } libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( - NativeInitializeStats* initialize_stats) { - auto create_result_or = FileBackedProtoLog<DocumentWrapper>::Create( - filesystem_, MakeDocumentLogFilename(base_dir_), - FileBackedProtoLog<DocumentWrapper>::Options( - /*compress_in=*/true)); + bool force_recovery_and_revalidate_documents, + InitializeStatsProto* initialize_stats) { + auto create_result_or = DocumentLogCreator::Create(filesystem_, base_dir_); + // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. if (!create_result_or.ok()) { ICING_LOG(ERROR) << create_result_or.status().error_message() - << "\nFailed to initialize DocumentLog"; + << "\nFailed to initialize DocumentLog."; return create_result_or.status(); } - FileBackedProtoLog<DocumentWrapper>::CreateResult create_result = + DocumentLogCreator::CreateResult create_result = std::move(create_result_or).ValueOrDie(); - document_log_ = std::move(create_result.proto_log); - if (create_result.has_data_loss()) { - ICING_LOG(WARNING) - << "Data loss in document log, regenerating derived files."; - if (initialize_stats != nullptr) { + document_log_ = std::move(create_result.log_create_result.proto_log); + + if (create_result.regen_derived_files || + force_recovery_and_revalidate_documents || + create_result.log_create_result.has_data_loss()) { + // We can't rely on any existing derived files. Recreate them from scratch. 
+ // Currently happens if: + // 1) This is a new log and we don't have derived files yet + // 2) Client wanted us to force a regeneration. + // 3) Log has some data loss, can't rely on existing derived data. + if (create_result.log_create_result.has_data_loss() && + initialize_stats != nullptr) { + ICING_LOG(WARNING) + << "Data loss in document log, regenerating derived files."; initialize_stats->set_document_store_recovery_cause( - NativeInitializeStats::DATA_LOSS); + InitializeStatsProto::DATA_LOSS); - if (create_result.data_loss == DataLoss::PARTIAL) { + if (create_result.log_create_result.data_loss == DataLoss::PARTIAL) { // Ground truth is partially lost. initialize_stats->set_document_store_data_status( - NativeInitializeStats::PARTIAL_LOSS); + InitializeStatsProto::PARTIAL_LOSS); } else { // Ground truth is completely lost. initialize_stats->set_document_store_data_status( - NativeInitializeStats::COMPLETE_LOSS); + InitializeStatsProto::COMPLETE_LOSS); } } + std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); - libtextclassifier3::Status status = RegenerateDerivedFiles(); - if (initialize_stats != nullptr) { + libtextclassifier3::Status status = + RegenerateDerivedFiles(force_recovery_and_revalidate_documents); + if (initialize_stats != nullptr && + (force_recovery_and_revalidate_documents || + create_result.log_create_result.has_data_loss())) { + // Only consider it a recovery if the client forced a recovery or there + // was data loss. Otherwise, this could just be the first time we're + // initializing and generating derived files. 
initialize_stats->set_document_store_recovery_latency_ms( document_recovery_timer->GetElapsedMilliseconds()); } @@ -288,17 +281,16 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( return status; } } else { - if (!InitializeDerivedFiles().ok()) { + if (!InitializeExistingDerivedFiles().ok()) { ICING_VLOG(1) << "Couldn't find derived files or failed to initialize them, " "regenerating derived files for DocumentStore."; - if (initialize_stats != nullptr) { - initialize_stats->set_document_store_recovery_cause( - NativeInitializeStats::IO_ERROR); - } std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer(); - libtextclassifier3::Status status = RegenerateDerivedFiles(); - if (initialize_stats != nullptr) { + libtextclassifier3::Status status = RegenerateDerivedFiles( + /*force_recovery_and_revalidate_documents*/ false); + if (initialize_stats != nullptr && num_documents() > 0) { + initialize_stats->set_document_store_recovery_cause( + InitializeStatsProto::IO_ERROR); initialize_stats->set_document_store_recovery_latency_ms( document_recovery_timer->GetElapsedMilliseconds()); } @@ -315,10 +307,10 @@ libtextclassifier3::StatusOr<DataLoss> DocumentStore::Initialize( initialize_stats->set_num_documents(document_id_mapper_->num_elements()); } - return create_result.data_loss; + return create_result.log_create_result.data_loss; } -libtextclassifier3::Status DocumentStore::InitializeDerivedFiles() { +libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() { if (!HeaderExists()) { // Without a header, we don't know if things are consistent between each // other so the caller should just regenerate everything from ground @@ -404,7 +396,8 @@ libtextclassifier3::Status DocumentStore::InitializeDerivedFiles() { return libtextclassifier3::Status::OK; } -libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() { +libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles( + bool revalidate_documents) { 
ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper()); ICING_RETURN_IF_ERROR(ResetDocumentIdMapper()); ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache()); @@ -438,148 +431,80 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() { DocumentWrapper document_wrapper = std::move(document_wrapper_or).ValueOrDie(); - if (document_wrapper.deleted()) { - if (!document_wrapper.document().uri().empty()) { - // Individual document deletion. - auto document_id_or = - GetDocumentId(document_wrapper.document().namespace_(), - document_wrapper.document().uri()); - // Updates document_id mapper with deletion - if (document_id_or.ok()) { - ICING_RETURN_IF_ERROR(document_id_mapper_->Set( - document_id_or.ValueOrDie(), kDocDeletedFlag)); - } else if (!absl_ports::IsNotFound(document_id_or.status())) { - // Real error - return absl_ports::Annotate( - document_id_or.status(), - absl_ports::StrCat("Failed to find document id. namespace: ", - document_wrapper.document().namespace_(), - ", uri: ", document_wrapper.document().uri())); - } - } else if (!document_wrapper.document().namespace_().empty()) { - // Namespace deletion. - ICING_ASSIGN_OR_RETURN( - NamespaceId namespace_id, - namespace_mapper_->Get(document_wrapper.document().namespace_())); - // Tombstone indicates it's a soft delete. - ICING_RETURN_IF_ERROR(BatchDelete(namespace_id, kInvalidSchemaTypeId, - /*soft_delete=*/true)); - } else if (!document_wrapper.document().schema().empty()) { - // SchemaType deletion. - auto schema_type_id_or = schema_store_->GetSchemaTypeId( - document_wrapper.document().schema()); - - if (schema_type_id_or.ok()) { - // Tombstone indicates it's a soft delete. - ICING_RETURN_IF_ERROR(BatchDelete(kInvalidNamespaceId, - schema_type_id_or.ValueOrDie(), - /*soft_delete=*/true)); - } else { - // The deleted schema type doesn't have a SchemaTypeId we can refer - // to in the FilterCache. - // - // TODO(cassiewang): We could avoid reading out all the documents. 
- // When we see a schema type doesn't have a SchemaTypeId, assign the - // unknown schema type a unique, temporary SchemaTypeId and store - // that in the FilterCache. Then, when we see the schema type - // tombstone here, we can look up its temporary SchemaTypeId and - // just iterate through the FilterCache to mark those documents as - // deleted. - int size = document_id_mapper_->num_elements(); - for (DocumentId document_id = 0; document_id < size; document_id++) { - auto document_or = Get(document_id); - if (absl_ports::IsNotFound(document_or.status())) { - // Skip nonexistent documents - continue; - } else if (!document_or.ok()) { - // Real error, pass up - return absl_ports::Annotate( - document_or.status(), - IcingStringUtil::StringPrintf( - "Failed to retrieve Document for DocumentId %d", - document_id)); - } - - // Guaranteed to have a document now. - DocumentProto document = document_or.ValueOrDie(); - - if (document.schema() == document_wrapper.document().schema()) { - ICING_RETURN_IF_ERROR( - document_id_mapper_->Set(document_id, kDocDeletedFlag)); - } - } - } - } else { - return absl_ports::InternalError( - "Encountered an invalid tombstone during recovery!"); + // Revalidate that this document is still compatible if requested. + if (revalidate_documents) { + if (!document_validator_.Validate(document_wrapper.document()).ok()) { + // Document is no longer valid with the current schema. 
Mark as + // deleted + DocumentId new_document_id = document_id_mapper_->num_elements(); + ICING_RETURN_IF_ERROR(document_log_->EraseProto(iterator.GetOffset())); + ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id)); + continue; } + } + // Updates key mapper and document_id mapper with the new document + DocumentId new_document_id = document_id_mapper_->num_elements(); + ICING_RETURN_IF_ERROR(document_key_mapper_->Put( + MakeFingerprint(document_wrapper.document().namespace_(), + document_wrapper.document().uri()), + new_document_id)); + ICING_RETURN_IF_ERROR( + document_id_mapper_->Set(new_document_id, iterator.GetOffset())); + + SchemaTypeId schema_type_id; + auto schema_type_id_or = + schema_store_->GetSchemaTypeId(document_wrapper.document().schema()); + if (absl_ports::IsNotFound(schema_type_id_or.status())) { + // Didn't find a SchemaTypeId. This means that the DocumentStore and + // the SchemaStore are out of sync. But DocumentStore can't do + // anything about it so just ignore this for now. This should be + // detected/handled by the owner of DocumentStore. Set it to some + // arbitrary invalid value for now, it'll get updated to the correct + // ID later. + schema_type_id = -1; + } else if (!schema_type_id_or.ok()) { + // Real error. Pass it up + return schema_type_id_or.status(); } else { - // Updates key mapper and document_id mapper with the new document - DocumentId new_document_id = document_id_mapper_->num_elements(); - ICING_RETURN_IF_ERROR(document_key_mapper_->Put( - MakeFingerprint(document_wrapper.document().namespace_(), - document_wrapper.document().uri()), - new_document_id)); - ICING_RETURN_IF_ERROR( - document_id_mapper_->Set(new_document_id, iterator.GetOffset())); - - SchemaTypeId schema_type_id; - auto schema_type_id_or = - schema_store_->GetSchemaTypeId(document_wrapper.document().schema()); - if (absl_ports::IsNotFound(schema_type_id_or.status())) { - // Didn't find a SchemaTypeId. 
This means that the DocumentStore and - // the SchemaStore are out of sync. But DocumentStore can't do - // anything about it so just ignore this for now. This should be - // detected/handled by the owner of DocumentStore. Set it to some - // arbitrary invalid value for now, it'll get updated to the correct - // ID later. - schema_type_id = -1; - } else if (!schema_type_id_or.ok()) { - // Real error. Pass it up - return schema_type_id_or.status(); - } else { - // We're guaranteed that SchemaTypeId is valid now - schema_type_id = schema_type_id_or.ValueOrDie(); - } + // We're guaranteed that SchemaTypeId is valid now + schema_type_id = schema_type_id_or.ValueOrDie(); + } - ICING_ASSIGN_OR_RETURN( - NamespaceId namespace_id, - namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(), - namespace_mapper_->num_keys())); + ICING_ASSIGN_OR_RETURN( + NamespaceId namespace_id, + namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(), + namespace_mapper_->num_keys())); - // Update corpus maps - std::string corpus = - MakeFingerprint(document_wrapper.document().namespace_(), - document_wrapper.document().schema()); - ICING_ASSIGN_OR_RETURN( - CorpusId corpusId, - corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys())); + // Update corpus maps + std::string corpus = + MakeFingerprint(document_wrapper.document().namespace_(), + document_wrapper.document().schema()); + ICING_ASSIGN_OR_RETURN( + CorpusId corpusId, + corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys())); - ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data, - GetCorpusAssociatedScoreDataToUpdate(corpusId)); - scoring_data.AddDocument( - document_wrapper.document().internal_fields().length_in_tokens()); + ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data, + GetCorpusAssociatedScoreDataToUpdate(corpusId)); + scoring_data.AddDocument( + document_wrapper.document().internal_fields().length_in_tokens()); - ICING_RETURN_IF_ERROR( - 
UpdateCorpusAssociatedScoreCache(corpusId, scoring_data)); - - ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache( - new_document_id, - DocumentAssociatedScoreData( - corpusId, document_wrapper.document().score(), - document_wrapper.document().creation_timestamp_ms(), - document_wrapper.document() - .internal_fields() - .length_in_tokens()))); - - int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs( - document_wrapper.document().creation_timestamp_ms(), - document_wrapper.document().ttl_ms()); - - ICING_RETURN_IF_ERROR(UpdateFilterCache( - new_document_id, DocumentFilterData(namespace_id, schema_type_id, - expiration_timestamp_ms))); - } + ICING_RETURN_IF_ERROR( + UpdateCorpusAssociatedScoreCache(corpusId, scoring_data)); + + ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache( + new_document_id, + DocumentAssociatedScoreData( + corpusId, document_wrapper.document().score(), + document_wrapper.document().creation_timestamp_ms(), + document_wrapper.document().internal_fields().length_in_tokens()))); + + int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs( + document_wrapper.document().creation_timestamp_ms(), + document_wrapper.document().ttl_ms()); + + ICING_RETURN_IF_ERROR(UpdateFilterCache( + new_document_id, DocumentFilterData(namespace_id, schema_type_id, + expiration_timestamp_ms))); iterator_status = iterator.Advance(); } @@ -788,6 +713,11 @@ libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const { } Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie(); + // NOTE: We purposely don't include usage_store checksum here because we can't + // regenerate it from ground truth documents. If it gets corrupted, we'll just + // clear all usage reports, but we shouldn't throw everything else in the + // document store out. 
+ total_checksum.Append(std::to_string(document_log_checksum.Get())); total_checksum.Append(std::to_string(document_key_mapper_checksum.Get())); total_checksum.Append(std::to_string(document_id_mapper_checksum.Get())); @@ -819,8 +749,11 @@ libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) { header.checksum = checksum.Get(); // This should overwrite the header. - if (!filesystem_->Write(MakeHeaderFilename(base_dir_).c_str(), &header, - sizeof(header))) { + ScopedFd sfd( + filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str())); + if (!sfd.is_valid() || + !filesystem_->Write(sfd.get(), &header, sizeof(header)) || + !filesystem_->DataSync(sfd.get())) { return absl_ports::InternalError(absl_ports::StrCat( "Failed to write DocStore header: ", MakeHeaderFilename(base_dir_))); } @@ -828,7 +761,7 @@ libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) { } libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut( - DocumentProto& document, NativePutDocumentStats* put_document_stats) { + DocumentProto& document, PutDocumentStatsProto* put_document_stats) { std::unique_ptr<Timer> put_timer = clock_.GetNewTimer(); ICING_RETURN_IF_ERROR(document_validator_.Validate(document)); @@ -874,6 +807,12 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut( // Creates a new document id, updates key mapper and document_id mapper DocumentId new_document_id = document_id_mapper_->num_elements(); + if (!IsDocumentIdValid(new_document_id)) { + return absl_ports::ResourceExhaustedError( + "Exceeded maximum number of documents. 
Try calling Optimize to reclaim " + "some space."); + } + ICING_RETURN_IF_ERROR(document_key_mapper_->Put( MakeFingerprint(name_space, uri), new_document_id)); ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset)); @@ -909,18 +848,20 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut( expiration_timestamp_ms))); if (old_document_id_or.ok()) { + // The old document exists, copy over the usage scores and delete the old + // document. DocumentId old_document_id = old_document_id_or.ValueOrDie(); - auto offset_or = DoesDocumentExistAndGetFileOffset(old_document_id); - if (offset_or.ok()) { - // The old document exists, copy over the usage scores. - ICING_RETURN_IF_ERROR( - usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id, - /*to_document_id=*/new_document_id)); - - // Hard delete the old document. - ICING_RETURN_IF_ERROR( - HardDelete(old_document_id, offset_or.ValueOrDie())); + ICING_RETURN_IF_ERROR( + usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id, + /*to_document_id=*/new_document_id)); + + // Delete the old document. It's fine if it's not found since it might have + // been deleted previously. + auto delete_status = Delete(old_document_id); + if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) { + // Real error, pass it up. + return delete_status; } } @@ -939,7 +880,7 @@ libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get( // existing Status. 
auto document_id_or = GetDocumentId(name_space, uri); if (absl_ports::IsNotFound(document_id_or.status())) { - ICING_LOG(ERROR) << document_id_or.status().error_message(); + ICING_VLOG(1) << document_id_or.status().error_message(); return libtextclassifier3::Status( document_id_or.status().CanonicalCode(), IcingStringUtil::StringPrintf("Document (%s, %s) not found.", @@ -962,8 +903,16 @@ libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get( libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get( DocumentId document_id, bool clear_internal_fields) const { - ICING_ASSIGN_OR_RETURN(int64_t document_log_offset, - DoesDocumentExistAndGetFileOffset(document_id)); + ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id)); + + auto document_log_offset_or = document_id_mapper_->Get(document_id); + if (!document_log_offset_or.ok()) { + // Since we've just checked that our document_id is valid a few lines + // above, there's no reason this should fail and an error should never + // happen. + return absl_ports::InternalError("Failed to find document offset."); + } + int64_t document_log_offset = *document_log_offset_or.ValueOrDie(); // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN // that can support error logging. 
@@ -1014,7 +963,7 @@ std::vector<std::string> DocumentStore::GetAllNamespaces() const { } const DocumentFilterData* data = status_or_data.ValueOrDie(); - if (DoesDocumentExist(document_id)) { + if (InternalDoesDocumentExist(document_id)) { existing_namespace_ids.insert(data->namespace_id()); } } @@ -1027,45 +976,78 @@ std::vector<std::string> DocumentStore::GetAllNamespaces() const { return existing_namespaces; } -libtextclassifier3::StatusOr<int64_t> -DocumentStore::DoesDocumentExistAndGetFileOffset(DocumentId document_id) const { +bool DocumentStore::DoesDocumentExist(DocumentId document_id) const { if (!IsDocumentIdValid(document_id)) { - return absl_ports::InvalidArgumentError( - IcingStringUtil::StringPrintf("DocumentId %d is invalid", document_id)); + return false; } - auto file_offset_or = document_id_mapper_->Get(document_id); + if (document_id >= document_id_mapper_->num_elements()) { + // Somehow got an validly constructed document_id that the document store + // doesn't know about + return false; + } + + return InternalDoesDocumentExist(document_id); +} - bool deleted = - file_offset_or.ok() && *file_offset_or.ValueOrDie() == kDocDeletedFlag; - if (deleted || absl_ports::IsOutOfRange(file_offset_or.status())) { - // Document has been deleted or doesn't exist - return absl_ports::NotFoundError( - IcingStringUtil::StringPrintf("Document %d not found", document_id)); +libtextclassifier3::Status DocumentStore::DoesDocumentExistWithStatus( + DocumentId document_id) const { + if (!IsDocumentIdValid(document_id)) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Document id '%d' invalid.", document_id)); } - ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data, - filter_cache_->Get(document_id)); - if (clock_.GetSystemTimeMilliseconds() >= - filter_data->expiration_timestamp_ms()) { - // Past the expiration time, so also return NOT FOUND since it *shouldn't* - // exist anymore. 
- return absl_ports::NotFoundError( - IcingStringUtil::StringPrintf("Document %d not found", document_id)); + if (document_id >= document_id_mapper_->num_elements()) { + // Somehow got a validly constructed document_id that the document store + // doesn't know about. + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "Unknown document id '%d'.", document_id)); } - ICING_RETURN_IF_ERROR(file_offset_or.status()); - return *file_offset_or.ValueOrDie(); + if (!InternalDoesDocumentExist(document_id)) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "Document id '%d' doesn't exist", document_id)); + }; + return libtextclassifier3::Status::OK; +} + +bool DocumentStore::InternalDoesDocumentExist(DocumentId document_id) const { + return !IsDeleted(document_id) && !IsExpired(document_id); } -bool DocumentStore::DoesDocumentExist(DocumentId document_id) const { - // If we can successfully get the document log offset, the document exists. - return DoesDocumentExistAndGetFileOffset(document_id).ok(); +bool DocumentStore::IsDeleted(DocumentId document_id) const { + auto file_offset_or = document_id_mapper_->Get(document_id); + if (!file_offset_or.ok()) { + // This would only happen if document_id is out of range of the + // document_id_mapper, meaning we got some invalid document_id. Callers + // should already have checked that their document_id is valid or used + // DoesDocumentExist(WithStatus). Regardless, return true since the + // document doesn't exist. + return true; + } + int64_t file_offset = *file_offset_or.ValueOrDie(); + return file_offset == kDocDeletedFlag; +} + +bool DocumentStore::IsExpired(DocumentId document_id) const { + auto filter_data_or = filter_cache_->Get(document_id); + if (!filter_data_or.ok()) { + // This would only happen if document_id is out of range of the + // filter_cache, meaning we got some invalid document_id. 
Callers should + // already have checked that their document_id is valid or used + // DoesDocumentExist(WithStatus). Regardless, return true since the + // document doesn't exist. + return true; + } + const DocumentFilterData* filter_data = filter_data_or.ValueOrDie(); + + // Check if it's past the expiration time + return clock_.GetSystemTimeMilliseconds() >= + filter_data->expiration_timestamp_ms(); } libtextclassifier3::Status DocumentStore::Delete( - const std::string_view name_space, const std::string_view uri, - bool soft_delete) { + const std::string_view name_space, const std::string_view uri) { // Try to get the DocumentId first auto document_id_or = GetDocumentId(name_space, uri); if (!document_id_or.ok()) { @@ -1074,69 +1056,18 @@ libtextclassifier3::Status DocumentStore::Delete( absl_ports::StrCat("Failed to delete Document. namespace: ", name_space, ", uri: ", uri)); } - - // Check if the DocumentId's Document still exists. - DocumentId document_id = document_id_or.ValueOrDie(); - auto file_offset_or = DoesDocumentExistAndGetFileOffset(document_id); - if (!file_offset_or.ok()) { - return absl_ports::Annotate( - file_offset_or.status(), - absl_ports::StrCat("Failed to delete Document. namespace: ", name_space, - ", uri: ", uri)); - } - - if (soft_delete) { - return SoftDelete(name_space, uri, document_id); - } else { - return HardDelete(document_id, file_offset_or.ValueOrDie()); - } + return Delete(document_id_or.ValueOrDie()); } -libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id, - bool soft_delete) { - // Copy out the document to get namespace and uri. 
- ICING_ASSIGN_OR_RETURN(int64_t document_log_offset, - DoesDocumentExistAndGetFileOffset(document_id)); - - if (soft_delete) { - auto document_wrapper_or = document_log_->ReadProto(document_log_offset); - if (!document_wrapper_or.ok()) { - ICING_LOG(ERROR) << document_wrapper_or.status().error_message() - << "Failed to read from document log"; - return document_wrapper_or.status(); - } - DocumentWrapper document_wrapper = - std::move(document_wrapper_or).ValueOrDie(); +libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id) { + ICING_RETURN_IF_ERROR(DoesDocumentExistWithStatus(document_id)); - return SoftDelete(document_wrapper.document().namespace_(), - document_wrapper.document().uri(), document_id); - } else { - return HardDelete(document_id, document_log_offset); + auto document_log_offset_or = document_id_mapper_->Get(document_id); + if (!document_log_offset_or.ok()) { + return absl_ports::InternalError("Failed to find document offset."); } -} + int64_t document_log_offset = *document_log_offset_or.ValueOrDie(); -// TODO(b/169969469): Consider removing SoftDelete(). -libtextclassifier3::Status DocumentStore::SoftDelete( - std::string_view name_space, std::string_view uri, DocumentId document_id) { - // Update ground truth first. - // Mark the document as deleted by appending a tombstone of it and actually - // remove it from file later in Optimize() - // TODO(b/144458732): Implement a more robust version of - // ICING_RETURN_IF_ERROR that can support error logging. - libtextclassifier3::Status status = - document_log_->WriteProto(CreateDocumentTombstone(name_space, uri)) - .status(); - if (!status.ok()) { - return absl_ports::Annotate( - status, absl_ports::StrCat("Failed to delete Document. 
namespace:", - name_space, ", uri: ", uri)); - } - - return document_id_mapper_->Set(document_id, kDocDeletedFlag); -} - -libtextclassifier3::Status DocumentStore::HardDelete( - DocumentId document_id, int64_t document_log_offset) { // Erases document proto. ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset)); return ClearDerivedData(document_id); @@ -1154,7 +1085,12 @@ libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId( libtextclassifier3::StatusOr<DocumentAssociatedScoreData> DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const { - auto score_data_or = score_cache_->Get(document_id); + if (!DoesDocumentExist(document_id)) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "Can't get usage scores, document id '%d' doesn't exist", document_id)); + } + + auto score_data_or = score_cache_->GetCopy(document_id); if (!score_data_or.ok()) { ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id << " from score_cache_"; @@ -1162,7 +1098,7 @@ DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const { } DocumentAssociatedScoreData document_associated_score_data = - *std::move(score_data_or).ValueOrDie(); + std::move(score_data_or).ValueOrDie(); if (document_associated_score_data.document_score() < 0) { // An negative / invalid score means that the score data has been deleted. 
return absl_ports::NotFoundError("Document score data not found."); @@ -1172,13 +1108,13 @@ DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const { libtextclassifier3::StatusOr<CorpusAssociatedScoreData> DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const { - auto score_data_or = corpus_score_cache_->Get(corpus_id); + auto score_data_or = corpus_score_cache_->GetCopy(corpus_id); if (!score_data_or.ok()) { return score_data_or.status(); } CorpusAssociatedScoreData corpus_associated_score_data = - *std::move(score_data_or).ValueOrDie(); + std::move(score_data_or).ValueOrDie(); return corpus_associated_score_data; } @@ -1200,23 +1136,28 @@ DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const { libtextclassifier3::StatusOr<DocumentFilterData> DocumentStore::GetDocumentFilterData(DocumentId document_id) const { - auto filter_data_or = filter_cache_->Get(document_id); + if (!DoesDocumentExist(document_id)) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "Can't get filter data, document id '%d' doesn't exist", document_id)); + } + + auto filter_data_or = filter_cache_->GetCopy(document_id); if (!filter_data_or.ok()) { ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id << " from filter_cache_"; return filter_data_or.status(); } DocumentFilterData document_filter_data = - *std::move(filter_data_or).ValueOrDie(); - if (document_filter_data.namespace_id() == kInvalidNamespaceId) { - // An invalid namespace id means that the filter data has been deleted. 
- return absl_ports::NotFoundError("Document filter data not found."); - } + std::move(filter_data_or).ValueOrDie(); return document_filter_data; } libtextclassifier3::StatusOr<UsageStore::UsageScores> DocumentStore::GetUsageScores(DocumentId document_id) const { + if (!DoesDocumentExist(document_id)) { + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "Can't get usage scores, document id '%d' doesn't exist", document_id)); + } return usage_store_->GetUsageScores(document_id); } @@ -1225,11 +1166,22 @@ libtextclassifier3::Status DocumentStore::ReportUsage( ICING_ASSIGN_OR_RETURN(DocumentId document_id, GetDocumentId(usage_report.document_namespace(), usage_report.document_uri())); + // We can use the internal version here because we got our document_id from + // our internal data structures. We would have thrown some error if the + // namespace and/or uri were incorrect. + if (!InternalDoesDocumentExist(document_id)) { + // Document was probably deleted or expired. + return absl_ports::NotFoundError(absl_ports::StrCat( + "Couldn't report usage on a nonexistent document: (namespace: '", + usage_report.document_namespace(), "', uri: '", + usage_report.document_uri(), "')")); + } + return usage_store_->AddUsageReport(usage_report, document_id); } DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace( - std::string_view name_space, bool soft_delete) { + std::string_view name_space) { DeleteByGroupResult result; auto namespace_id_or = namespace_mapper_->Get(name_space); if (!namespace_id_or.ok()) { @@ -1239,26 +1191,7 @@ DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace( return result; } NamespaceId namespace_id = namespace_id_or.ValueOrDie(); - - if (soft_delete) { - // To delete an entire namespace, we append a tombstone that only contains - // the deleted bit and the name of the deleted namespace. - // TODO(b/144458732): Implement a more robust version of - // ICING_RETURN_IF_ERROR that can support error logging. 
- libtextclassifier3::Status status = - document_log_->WriteProto(CreateNamespaceTombstone(name_space)) - .status(); - if (!status.ok()) { - ICING_LOG(ERROR) << status.error_message() - << "Failed to delete namespace. namespace = " - << name_space; - result.status = std::move(status); - return result; - } - } - - auto num_deleted_or = - BatchDelete(namespace_id, kInvalidSchemaTypeId, soft_delete); + auto num_deleted_or = BatchDelete(namespace_id, kInvalidSchemaTypeId); if (!num_deleted_or.ok()) { result.status = std::move(num_deleted_or).status(); return result; @@ -1277,7 +1210,7 @@ DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace( } DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType( - std::string_view schema_type, bool soft_delete) { + std::string_view schema_type) { DeleteByGroupResult result; auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type); if (!schema_type_id_or.ok()) { @@ -1288,26 +1221,7 @@ DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType( return result; } SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie(); - - if (soft_delete) { - // To soft-delete an entire schema type, we append a tombstone that only - // contains the deleted bit and the name of the deleted schema type. - // TODO(b/144458732): Implement a more robust version of - // ICING_RETURN_IF_ERROR that can support error logging. - libtextclassifier3::Status status = - document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type)) - .status(); - if (!status.ok()) { - ICING_LOG(ERROR) << status.error_message() - << "Failed to delete schema_type. 
schema_type = " - << schema_type; - result.status = std::move(status); - return result; - } - } - - auto num_deleted_or = - BatchDelete(kInvalidNamespaceId, schema_type_id, soft_delete); + auto num_deleted_or = BatchDelete(kInvalidNamespaceId, schema_type_id); if (!num_deleted_or.ok()) { result.status = std::move(num_deleted_or).status(); return result; @@ -1324,7 +1238,7 @@ DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType( } libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete( - NamespaceId namespace_id, SchemaTypeId schema_type_id, bool soft_delete) { + NamespaceId namespace_id, SchemaTypeId schema_type_id) { // Tracks if there were any existing documents with this namespace that we // will mark as deleted. int num_updated_documents = 0; @@ -1356,37 +1270,27 @@ libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete( continue; } - // The document has the desired namespace and schema type, it either exists - // or has been soft-deleted / expired. - if (soft_delete) { - if (DoesDocumentExist(document_id)) { - ++num_updated_documents; - } - - // docid_mapper_->Set can only fail if document_id is < 0 - // or >= docid_mapper_->num_elements. So the only possible way to get an - // error here would be if filter_cache_->num_elements > - // docid_mapper_->num_elements, which SHOULD NEVER HAPPEN. - ICING_RETURN_IF_ERROR( - document_id_mapper_->Set(document_id, kDocDeletedFlag)); - } else { - // Hard delete. - libtextclassifier3::Status delete_status = - Delete(document_id, /*soft_delete=*/false); - if (absl_ports::IsNotFound(delete_status)) { - continue; - } else if (!delete_status.ok()) { - // Real error, pass up. - return delete_status; - } - ++num_updated_documents; + // The document has the desired namespace and schema type, it either + // exists or has expired. 
+ libtextclassifier3::Status delete_status = Delete(document_id); + if (absl_ports::IsNotFound(delete_status)) { + continue; + } else if (!delete_status.ok()) { + // Real error, pass up. + return delete_status; } + ++num_updated_documents; } return num_updated_documents; } -libtextclassifier3::Status DocumentStore::PersistToDisk() { +libtextclassifier3::Status DocumentStore::PersistToDisk( + PersistType::Code persist_type) { + if (persist_type == PersistType::LITE) { + // only persist the document log. + return document_log_->PersistToDisk(); + } ICING_RETURN_IF_ERROR(document_log_->PersistToDisk()); ICING_RETURN_IF_ERROR(document_key_mapper_->PersistToDisk()); ICING_RETURN_IF_ERROR(document_id_mapper_->PersistToDisk()); @@ -1404,30 +1308,139 @@ libtextclassifier3::Status DocumentStore::PersistToDisk() { return libtextclassifier3::Status::OK; } -libtextclassifier3::StatusOr<int64_t> DocumentStore::GetDiskUsage() const { - ICING_ASSIGN_OR_RETURN(const int64_t document_log_disk_usage, - document_log_->GetDiskUsage()); - ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_disk_usage, - document_key_mapper_->GetDiskUsage()); - ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_disk_usage, - document_id_mapper_->GetDiskUsage()); - ICING_ASSIGN_OR_RETURN(const int64_t score_cache_disk_usage, - score_cache_->GetDiskUsage()); - ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_disk_usage, - filter_cache_->GetDiskUsage()); - ICING_ASSIGN_OR_RETURN(const int64_t namespace_mapper_disk_usage, - namespace_mapper_->GetDiskUsage()); - ICING_ASSIGN_OR_RETURN(const int64_t corpus_mapper_disk_usage, - corpus_mapper_->GetDiskUsage()); - ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_disk_usage, - corpus_score_cache_->GetDiskUsage()); - - int64_t disk_usage = document_log_disk_usage + - document_key_mapper_disk_usage + - document_id_mapper_disk_usage + score_cache_disk_usage + - filter_cache_disk_usage + namespace_mapper_disk_usage + - corpus_mapper_disk_usage + 
corpus_score_cache_disk_usage; - return disk_usage; +int64_t GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t>& value_or, + int64_t default_value) { + return (value_or.ok()) ? value_or.ValueOrDie() : default_value; +} + +DocumentStorageInfoProto DocumentStore::GetMemberStorageInfo() const { + DocumentStorageInfoProto storage_info; + storage_info.set_document_log_size( + GetValueOrDefault(document_log_->GetDiskUsage(), -1)); + storage_info.set_key_mapper_size( + GetValueOrDefault(document_key_mapper_->GetDiskUsage(), -1)); + storage_info.set_document_id_mapper_size( + GetValueOrDefault(document_id_mapper_->GetDiskUsage(), -1)); + storage_info.set_score_cache_size( + GetValueOrDefault(score_cache_->GetDiskUsage(), -1)); + storage_info.set_filter_cache_size( + GetValueOrDefault(filter_cache_->GetDiskUsage(), -1)); + storage_info.set_namespace_id_mapper_size( + GetValueOrDefault(namespace_mapper_->GetDiskUsage(), -1)); + storage_info.set_corpus_mapper_size( + GetValueOrDefault(corpus_mapper_->GetDiskUsage(), -1)); + storage_info.set_corpus_score_cache_size( + GetValueOrDefault(corpus_score_cache_->GetDiskUsage(), -1)); + return storage_info; +} + +DocumentStorageInfoProto DocumentStore::CalculateDocumentStatusCounts( + DocumentStorageInfoProto storage_info) const { + int total_num_alive = 0; + int total_num_expired = 0; + int total_num_deleted = 0; + std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace = + namespace_mapper_->GetValuesToKeys(); + std::unordered_map<std::string, NamespaceStorageInfoProto> + namespace_to_storage_info; + + for (DocumentId document_id = 0; + document_id < document_id_mapper_->num_elements(); ++document_id) { + // Check if it's deleted first. + if (IsDeleted(document_id)) { + // We don't have the namespace id of hard deleted documents anymore, so + // we can't add to our namespace storage info. 
+ ++total_num_deleted; + continue; + } + + // At this point, the document is either alive or expired, we can get + // namespace info for it. + auto filter_data_or = filter_cache_->Get(document_id); + if (!filter_data_or.ok()) { + ICING_VLOG(1) << "Error trying to get filter data for document store " + "storage info counts."; + continue; + } + const DocumentFilterData* filter_data = filter_data_or.ValueOrDie(); + auto itr = namespace_id_to_namespace.find(filter_data->namespace_id()); + if (itr == namespace_id_to_namespace.end()) { + ICING_VLOG(1) << "Error trying to find namespace for document store " + "storage info counts."; + continue; + } + const std::string& name_space = itr->second; + + // Always set the namespace, if the NamespaceStorageInfoProto didn't exist + // before, we'll get back a default instance of it. + NamespaceStorageInfoProto& namespace_storage_info = + namespace_to_storage_info[name_space]; + namespace_storage_info.set_namespace_(name_space); + + // Get usage scores + auto usage_scores_or = usage_store_->GetUsageScores(document_id); + if (!usage_scores_or.ok()) { + ICING_VLOG(1) << "Error trying to get usage scores for document store " + "storage info counts."; + continue; + } + UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie(); + + // Update our stats + if (IsExpired(document_id)) { + ++total_num_expired; + namespace_storage_info.set_num_expired_documents( + namespace_storage_info.num_expired_documents() + 1); + if (usage_scores.usage_type1_count > 0) { + namespace_storage_info.set_num_expired_documents_usage_type1( + namespace_storage_info.num_expired_documents_usage_type1() + 1); + } + if (usage_scores.usage_type2_count > 0) { + namespace_storage_info.set_num_expired_documents_usage_type2( + namespace_storage_info.num_expired_documents_usage_type2() + 1); + } + if (usage_scores.usage_type3_count > 0) { + namespace_storage_info.set_num_expired_documents_usage_type3( + 
namespace_storage_info.num_expired_documents_usage_type3() + 1); + } + } else { + ++total_num_alive; + namespace_storage_info.set_num_alive_documents( + namespace_storage_info.num_alive_documents() + 1); + if (usage_scores.usage_type1_count > 0) { + namespace_storage_info.set_num_alive_documents_usage_type1( + namespace_storage_info.num_alive_documents_usage_type1() + 1); + } + if (usage_scores.usage_type2_count > 0) { + namespace_storage_info.set_num_alive_documents_usage_type2( + namespace_storage_info.num_alive_documents_usage_type2() + 1); + } + if (usage_scores.usage_type3_count > 0) { + namespace_storage_info.set_num_alive_documents_usage_type3( + namespace_storage_info.num_alive_documents_usage_type3() + 1); + } + } + } + + for (auto& itr : namespace_to_storage_info) { + storage_info.mutable_namespace_storage_info()->Add(std::move(itr.second)); + } + storage_info.set_num_alive_documents(total_num_alive); + storage_info.set_num_deleted_documents(total_num_deleted); + storage_info.set_num_expired_documents(total_num_expired); + return storage_info; +} + +DocumentStorageInfoProto DocumentStore::GetStorageInfo() const { + DocumentStorageInfoProto storage_info = GetMemberStorageInfo(); + int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str()); + if (directory_size != Filesystem::kBadFileSize) { + storage_info.set_document_store_size(directory_size); + } else { + storage_info.set_document_store_size(-1); + } + storage_info.set_num_namespaces(namespace_mapper_->num_keys()); + return CalculateDocumentStatusCounts(std::move(storage_info)); } libtextclassifier3::Status DocumentStore::UpdateSchemaStore( @@ -1486,50 +1499,19 @@ libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore( schema_store_ = schema_store; document_validator_.UpdateSchemaStore(schema_store); - // Append a tombstone for each deleted schema type. 
This way, we don't have - // to read out each document, check if the schema type has been deleted, and - // append a tombstone per-document. - for (const auto& schema_type : - set_schema_result.schema_types_deleted_by_name) { - // TODO(b/144458732): Implement a more robust version of - // ICING_RETURN_IF_ERROR that can support error logging. - libtextclassifier3::Status status = - document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type)) - .status(); - if (!status.ok()) { - ICING_LOG(ERROR) << status.error_message() - << "Failed to delete schema_type. schema_type = " - << schema_type; - return status; - } - } - int size = document_id_mapper_->num_elements(); for (DocumentId document_id = 0; document_id < size; document_id++) { - auto exists_or = DoesDocumentExistAndGetFileOffset(document_id); - if (absl_ports::IsNotFound(exists_or.status())) { + if (!InternalDoesDocumentExist(document_id)) { // Skip nonexistent documents continue; - } else if (!exists_or.ok()) { - // Real error, pass up - return absl_ports::Annotate( - exists_or.status(), - IcingStringUtil::StringPrintf("Failed to retrieve DocumentId %d", - document_id)); } // Guaranteed that the document exists now. ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data, filter_cache_->Get(document_id)); - if (set_schema_result.schema_types_deleted_by_id.count( - filter_data->schema_type_id()) != 0) { - // We already created a tombstone for this deleted type. Just update the - // derived files now. - ICING_RETURN_IF_ERROR( - document_id_mapper_->Set(document_id, kDocDeletedFlag)); - continue; - } + bool delete_document = set_schema_result.schema_types_deleted_by_id.count( + filter_data->schema_type_id()) != 0; // Check if we need to update the FilterCache entry for this document. It // may have been assigned a different SchemaTypeId in the new SchemaStore. 
@@ -1553,17 +1535,17 @@ libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore( filter_cache_->mutable_array()[document_id].set_schema_type_id( schema_type_id); } - if (revalidate_document) { - if (!document_validator_.Validate(document).ok()) { - // Document is no longer valid with the new SchemaStore. Mark as - // deleted - auto delete_status = Delete(document.namespace_(), document.uri()); - if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) { - // Real error, pass up - return delete_status; - } - } + delete_document = !document_validator_.Validate(document).ok(); + } + } + + if (delete_document) { + // Document is no longer valid with the new SchemaStore. Mark as deleted + auto delete_status = Delete(document_id); + if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) { + // Real error, pass up + return delete_status; } } } @@ -1577,7 +1559,8 @@ libtextclassifier3::Status DocumentStore::Optimize() { } libtextclassifier3::Status DocumentStore::OptimizeInto( - const std::string& new_directory, const LanguageSegmenter* lang_segmenter) { + const std::string& new_directory, const LanguageSegmenter* lang_segmenter, + OptimizeStatsProto* stats) { // Validates directory if (new_directory == base_dir_) { return absl_ports::InvalidArgumentError( @@ -1592,10 +1575,17 @@ libtextclassifier3::Status DocumentStore::OptimizeInto( // Writes all valid docs into new document store (new directory) int size = document_id_mapper_->num_elements(); + int num_deleted = 0; + int num_expired = 0; + UsageStore::UsageScores default_usage; for (DocumentId document_id = 0; document_id < size; document_id++) { auto document_or = Get(document_id, /*clear_internal_fields=*/false); if (absl_ports::IsNotFound(document_or.status())) { - // Skip nonexistent documents + if (IsDeleted(document_id)) { + ++num_deleted; + } else if (IsExpired(document_id)) { + ++num_expired; + } continue; } else if (!document_or.ok()) { // Real error, pass up @@ -1636,12 
+1626,21 @@ libtextclassifier3::Status DocumentStore::OptimizeInto( // Copy over usage scores. ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores, usage_store_->GetUsageScores(document_id)); - DocumentId new_document_id = new_document_id_or.ValueOrDie(); - ICING_RETURN_IF_ERROR( - new_doc_store->SetUsageScores(new_document_id, usage_scores)); + if (!(usage_scores == default_usage)) { + // If the usage scores for this document are the default (no usage), then + // don't bother setting it. No need to possibly allocate storage if + // there's nothing interesting to store. + DocumentId new_document_id = new_document_id_or.ValueOrDie(); + ICING_RETURN_IF_ERROR( + new_doc_store->SetUsageScores(new_document_id, usage_scores)); + } } - - ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk()); + if (stats != nullptr) { + stats->set_num_original_documents(size); + stats->set_num_deleted_documents(num_deleted); + stats->set_num_expired_documents(num_expired); + } + ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL)); return libtextclassifier3::Status::OK; } @@ -1653,7 +1652,7 @@ DocumentStore::GetOptimizeInfo() const { int32_t num_documents = document_id_mapper_->num_elements(); for (DocumentId document_id = kMinDocumentId; document_id < num_documents; ++document_id) { - if (!DoesDocumentExist(document_id)) { + if (!InternalDoesDocumentExist(document_id)) { ++optimize_info.optimizable_docs; } @@ -1691,10 +1690,10 @@ DocumentStore::GetOptimizeInfo() const { ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size, document_key_mapper_->GetElementsSize()); - // We don't include the namespace_mapper or the corpus_mapper because it's not - // clear if we could recover any space even if Optimize were called. Deleting - // 100s of documents could still leave a few documents of a namespace, and - // then there would be no change. 
+ // We don't include the namespace_mapper or the corpus_mapper because it's + // not clear if we could recover any space even if Optimize were called. + // Deleting 100s of documents could still leave a few documents of a + // namespace, and then there would be no change. int64_t total_size = document_log_file_size + document_key_mapper_size + document_id_mapper_file_size + score_cache_file_size + @@ -1724,8 +1723,8 @@ libtextclassifier3::Status DocumentStore::UpdateFilterCache( libtextclassifier3::Status DocumentStore::ClearDerivedData( DocumentId document_id) { // We intentionally leave the data in key_mapper_ because locating that data - // requires fetching namespace and uri. Leaving data in key_mapper_ should be - // fine because the data is hashed. + // requires fetching namespace and uri. Leaving data in key_mapper_ should + // be fine because the data is hashed. ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag)); diff --git a/icing/store/document-store.h b/icing/store/document-store.h index b2908f0..c85c989 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -26,9 +26,13 @@ #include "icing/file/file-backed-proto-log.h" #include "icing/file/file-backed-vector.h" #include "icing/file/filesystem.h" +#include "icing/file/portable-file-backed-proto-log.h" #include "icing/proto/document.pb.h" #include "icing/proto/document_wrapper.pb.h" #include "icing/proto/logging.pb.h" +#include "icing/proto/optimize.pb.h" +#include "icing/proto/persist.pb.h" +#include "icing/proto/storage.pb.h" #include "icing/schema/schema-store.h" #include "icing/store/corpus-associated-scoring-data.h" #include "icing/store/corpus-id.h" @@ -106,6 +110,11 @@ class DocumentStore { // previously initialized with this directory, it will reload the files saved // by the last instance. // + // force_recovery_and_revalidate_documents=true will pre-emptively throw out + // the derived files and validate each document while recreating them. 
This + // can be used to indicate that the schema (and type ids) may have changed and + // those changes might not have been applied to the document store. + // // If initialize_stats is present, the fields related to DocumentStore will be // populated. // @@ -122,7 +131,8 @@ class DocumentStore { static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create( const Filesystem* filesystem, const std::string& base_dir, const Clock* clock, const SchemaStore* schema_store, - NativeInitializeStats* initialize_stats = nullptr); + bool force_recovery_and_revalidate_documents = false, + InitializeStatsProto* initialize_stats = nullptr); // Returns the maximum DocumentId that the DocumentStore has assigned. If // there has not been any DocumentIds assigned, i.e. the DocumentStore is @@ -146,16 +156,17 @@ class DocumentStore { // // Returns: // A newly generated document id on success + // RESOURCE_EXHAUSED if exceeds maximum number of allowed documents // FAILED_PRECONDITION if schema hasn't been set yet // NOT_FOUND if the schema_type or a property config of the document doesn't // exist in schema // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<DocumentId> Put( const DocumentProto& document, int32_t num_tokens = 0, - NativePutDocumentStats* put_document_stats = nullptr); + PutDocumentStatsProto* put_document_stats = nullptr); libtextclassifier3::StatusOr<DocumentId> Put( DocumentProto&& document, int32_t num_tokens = 0, - NativePutDocumentStats* put_document_stats = nullptr); + PutDocumentStatsProto* put_document_stats = nullptr); // Finds and returns the document identified by the given key (namespace + // uri). If 'clear_internal_fields' is true, document level data that's @@ -189,18 +200,21 @@ class DocumentStore { // Check if a document exists. Existence means it hasn't been deleted and it // hasn't expired yet. 
// + // NOTE: This should be used when callers don't care about error messages, + // expect documents to be deleted/not found, or in frequently called code + // paths that could cause performance issues. A signficant amount of CPU + // cycles can be saved if we don't construct strings and create new Status + // objects on the heap. See b/185822483. + // // Returns: // boolean whether a document exists or not bool DoesDocumentExist(DocumentId document_id) const; // Deletes the document identified by the given namespace and uri. The - // document proto will be marked as deleted if 'soft_delete' is true, - // otherwise the document proto will be erased immediately. + // document proto will be erased immediately. // // NOTE: - // 1. The soft deletion uses less CPU power, it can be applied on - // non-sensitive data. - // 2. Space is not reclaimed for deleted documents until Optimize() is + // Space is not reclaimed for deleted documents until Optimize() is // called. // // Returns: @@ -208,26 +222,21 @@ class DocumentStore { // NOT_FOUND if no document exists with namespace, uri // INTERNAL_ERROR on IO error libtextclassifier3::Status Delete(std::string_view name_space, - std::string_view uri, - bool soft_delete = false); + std::string_view uri); - // Deletes the document identified by the given document_id. The - // document proto will be marked as deleted if 'soft_delete' is true, - // otherwise the document proto will be erased immediately. + // Deletes the document identified by the given document_id. The document + // proto will be erased immediately. // // NOTE: - // 1. If possible, please use the other method Delete(name_space, uri, - // soft_delete) for soft deletes because we need namespace and uri to - // perform soft deletes. - // 2. Space is not reclaimed for deleted documents until Optimize() is + // Space is not reclaimed for deleted documents until Optimize() is // called. // // Returns: // OK on success + // NOT_FOUND if the document doesn't exist (i.e. 
deleted or expired) // INTERNAL_ERROR on IO error // INVALID_ARGUMENT if document_id is invalid. - libtextclassifier3::Status Delete(DocumentId document_id, - bool soft_delete = false); + libtextclassifier3::Status Delete(DocumentId document_id); // Returns the NamespaceId of the string namespace // @@ -250,16 +259,9 @@ class DocumentStore { // Returns the DocumentAssociatedScoreData of the document specified by the // DocumentId. // - // NOTE: This does not check if the document exists and will return the - // DocumentFilterData of the document even if it has been deleted. Users - // should check DoesDocumentExist(document_id) if they only want existing - // documents' DocumentFilterData. - // // Returns: // DocumentAssociatedScoreData on success - // OUT_OF_RANGE if document_id is negative or exceeds previously seen - // DocumentIds - // NOT_FOUND if no score data is found + // NOT_FOUND if the document or the score data is not found libtextclassifier3::StatusOr<DocumentAssociatedScoreData> GetDocumentAssociatedScoreData(DocumentId document_id) const; @@ -279,16 +281,11 @@ class DocumentStore { // Returns the DocumentFilterData of the document specified by the DocumentId. // - // NOTE: This does not check if the document exists and will return the - // DocumentFilterData of the document even if it has been deleted. Users - // should check DoesDocumentExist(document_id) if they only want existing - // documents' DocumentFilterData. - // // Returns: // DocumentFilterData on success // OUT_OF_RANGE if document_id is negative or exceeds previously seen // DocumentIds - // NOT_FOUND if no filter data is found + // NOT_FOUND if the document or the filter data is not found libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData( DocumentId document_id) const; @@ -296,8 +293,8 @@ class DocumentStore { // // Returns: // UsageScores on success + // NOT_FOUND if document_id no longer exists. 
// INVALID_ARGUMENT if document_id is invalid - // INTERNAL_ERROR on I/O errors libtextclassifier3::StatusOr<UsageStore::UsageScores> GetUsageScores( DocumentId document_id) const; @@ -311,56 +308,43 @@ class DocumentStore { libtextclassifier3::Status ReportUsage(const UsageReport& usage_report); // Deletes all documents belonging to the given namespace. The documents will - // be marked as deleted if 'soft_delete' is true, otherwise they will be - // erased immediately. + // be erased immediately. // // NOTE: - // 1. The soft deletion uses less CPU power, it can be applied on - // non-sensitive data. - // 2. Space is not reclaimed for deleted documents until Optimize() is + // Space is not reclaimed for deleted documents until Optimize() is // called. // // Returns: // OK on success // NOT_FOUND if namespace doesn't exist // INTERNAL_ERROR on IO error - DeleteByGroupResult DeleteByNamespace(std::string_view name_space, - bool soft_delete = false); + DeleteByGroupResult DeleteByNamespace(std::string_view name_space); // Deletes all documents belonging to the given schema type. The documents - // will be marked as deleted if 'soft_delete' is true, otherwise they will be - // erased immediately. + // will be erased immediately. // // NOTE: - // 1. The soft deletion uses less CPU power, it can be applied on - // non-sensitive data. - // 2. Space is not reclaimed for deleted documents until Optimize() is + // Space is not reclaimed for deleted documents until Optimize() is // called. // // Returns: // OK on success // NOT_FOUND if schema_type doesn't exist // INTERNAL_ERROR on IO error - DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type, - bool soft_delete = false); + DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type); // Syncs all the data and metadata changes to disk. 
// // Returns: // OK on success // INTERNAL on I/O error - libtextclassifier3::Status PersistToDisk(); + libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type); - // Calculates and returns the disk usage in bytes. Rounds up to the nearest - // block size. - // - // Returns: - // Disk usage on success - // INTERNAL_ERROR on IO error + // Calculates the StorageInfo for the Document Store. // - // TODO(tjbarron): consider returning a struct which has the breakdown of each - // component. - libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + // If an IO error occurs while trying to calculate the value for a field, then + // that field will be set to -1. + DocumentStorageInfoProto GetStorageInfo() const; // Update any derived data off of the SchemaStore with the new SchemaStore. // This may include pointers, SchemaTypeIds, etc. @@ -407,6 +391,8 @@ class DocumentStore { // reassigned so any files / classes that are based on old document ids may be // outdated. // + // stats will be set if non-null. + // // NOTE: The tasks in this method are too expensive to be executed in // real-time. The caller should decide how frequently and when to call this // method based on device usage. @@ -416,8 +402,8 @@ class DocumentStore { // INVALID_ARGUMENT if new_directory is same as current base directory // INTERNAL_ERROR on IO error libtextclassifier3::Status OptimizeInto( - const std::string& new_directory, - const LanguageSegmenter* lang_segmenter); + const std::string& new_directory, const LanguageSegmenter* lang_segmenter, + OptimizeStatsProto* stats = nullptr); // Calculates status for a potential Optimize call. Includes how many docs // there are vs how many would be optimized away. And also includes an @@ -454,7 +440,7 @@ class DocumentStore { // A log used to store all documents, it serves as a ground truth of doc // store. key_mapper_ and document_id_mapper_ can be regenerated from it. 
- std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> document_log_; + std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_; // Key (namespace + uri) to DocumentId mapping std::unique_ptr<KeyMapper<DocumentId>> document_key_mapper_; @@ -508,16 +494,22 @@ class DocumentStore { bool initialized_ = false; libtextclassifier3::StatusOr<DataLoss> Initialize( - NativeInitializeStats* initialize_stats); + bool force_recovery_and_revalidate_documents, + InitializeStatsProto* initialize_stats); // Creates sub-components and verifies the integrity of each sub-component. + // This assumes that the the underlying files already exist, and will return + // an error if it doesn't find what it's expecting. // // Returns an error if subcomponents failed to initialize successfully. // INTERNAL_ERROR on IO error - libtextclassifier3::Status InitializeDerivedFiles(); + libtextclassifier3::Status InitializeExistingDerivedFiles(); // Re-generates all files derived from the ground truth: the document log. // + // revalidate_documents=true will also cause each document to be revalidated + // the schema as it is read out of the document log. + // // NOTE: if this function fails, the only thing we can do is to retry it until // it succeeds or prevent the initialization of a DocumentStore. The // DocumentStore object wouldn't work reliably if this fails. @@ -528,7 +520,7 @@ class DocumentStore { // document_id // mapper. // 3. Create header and store the updated combined checksum - libtextclassifier3::Status RegenerateDerivedFiles(); + libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents); // Resets the unique_ptr to the document_key_mapper, deletes the underlying // file, and re-creates a new instance of the document_key_mapper . @@ -576,8 +568,8 @@ class DocumentStore { // if it doesn't exist. bool HeaderExists(); - // Update and replace the header file. Creates the header file if it doesn't - // exist. 
+ // Update, replace and persist the header file. Creates the header file if it + // doesn't exist. // // Returns: // OK on success @@ -586,14 +578,13 @@ class DocumentStore { libtextclassifier3::StatusOr<DocumentId> InternalPut( DocumentProto& document, - NativePutDocumentStats* put_document_stats = nullptr); + PutDocumentStatsProto* put_document_stats = nullptr); // Helper function to do batch deletes. Documents with the given // "namespace_id" and "schema_type_id" will be deleted. If callers don't need // to specify the namespace or schema type, pass in kInvalidNamespaceId or - // kInvalidSchemaTypeId. The document protos will be marked as deleted if - // 'soft_delete' is true, otherwise the document protos with their derived - // data will be erased / cleared immediately. + // kInvalidSchemaTypeId. The document protos with their derived data will be + // erased / cleared immediately. // // NOTE: Space is not reclaimed in the derived files until Optimize() is // called. @@ -602,28 +593,7 @@ class DocumentStore { // Number of documents that were actually updated to be deleted // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id, - SchemaTypeId schema_type_id, - bool soft_delete); - - // Marks the document identified by the given name_space, uri and document_id - // as deleted, to be removed later during Optimize(). - // - // Returns: - // OK on success - // INTERNAL_ERROR on IO error - libtextclassifier3::Status SoftDelete(std::string_view name_space, - std::string_view uri, - DocumentId document_id); - - // Erases the document at the given document_log_offset from the document_log - // and clears the derived data identified by the given document_id. The space - // will be reclaimed later during Optimize(). 
- // - // Returns: - // OK on success - // INTERNAL_ERROR on IO error - libtextclassifier3::Status HardDelete(DocumentId document_id, - int64_t document_log_offset); + SchemaTypeId schema_type_id); // Helper method to find a DocumentId that is associated with the given // namespace and uri. @@ -654,22 +624,46 @@ class DocumentStore { libtextclassifier3::StatusOr<CorpusAssociatedScoreData> GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const; - // Helper method to validate the document id and return the file offset of the - // associated document in document_log_. - // - // This can be a more informative call than just DoesDocumentExist because it - // can return more status errors on whether the Document actually doesn't - // exist or if there was an internal error while accessing files. + // Check if a document exists. Existence means it hasn't been deleted and it + // hasn't expired yet. // // Returns: - // The file offset on success + // OK if the document exists // INVALID_ARGUMENT if document_id is less than 0 or greater than the // maximum value // NOT_FOUND if the document doesn't exist (i.e. deleted or expired) // INTERNAL_ERROR on IO error - libtextclassifier3::StatusOr<int64_t> DoesDocumentExistAndGetFileOffset( + libtextclassifier3::Status DoesDocumentExistWithStatus( DocumentId document_id) const; + // Check if a document exists. Existence means it hasn't been deleted and it + // hasn't expired yet. + // + // This is for internal-use only because we assume that the document_id is + // already valid. If you're unsure if the document_id is valid, use + // DoesDocumentExist(document_id) instead, which will perform those additional + // checks. + // + // Returns: + // boolean whether a document exists or not + bool InternalDoesDocumentExist(DocumentId document_id) const; + + // Checks if a document has been deleted + // + // This is for internal-use only because we assume that the document_id is + // already valid. 
If you're unsure if the document_id is valid, use + // DoesDocumentExist(document_id) instead, which will perform those additional + // checks. + bool IsDeleted(DocumentId document_id) const; + + // Checks if a document has expired. + // + // This is for internal-use only because we assume that the document_id is + // already valid. If you're unsure if the document_id is valid, use + // DoesDocumentExist(document_id) instead, which will perform those additional + // checks. + bool IsExpired(DocumentId document_id) const; + // Updates the entry in the score cache for document_id. libtextclassifier3::Status UpdateDocumentAssociatedScoreCache( DocumentId document_id, const DocumentAssociatedScoreData& score_data); @@ -688,6 +682,20 @@ class DocumentStore { // Sets usage scores for the given document. libtextclassifier3::Status SetUsageScores( DocumentId document_id, const UsageStore::UsageScores& usage_scores); + + // Returns: + // - on success, a DocumentStorageInfoProto with the fields relating to the + // size of Document Store member variables populated. + // - INTERNAL on failure to get file size + DocumentStorageInfoProto GetMemberStorageInfo() const; + + // Returns: + // - on success, the storage_info that was passed in but with the number of + // alive, deleted and expired documents also set. + // - OUT_OF_RANGE, this should never happen. This could only be returned if + // the document_id_mapper somehow became larger than the filter cache. + DocumentStorageInfoProto CalculateDocumentStatusCounts( + DocumentStorageInfoProto storage_info) const; }; } // namespace lib diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc new file mode 100644 index 0000000..77da928 --- /dev/null +++ b/icing/store/document-store_benchmark.cc @@ -0,0 +1,330 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <unistd.h> + +#include <fstream> +#include <iostream> +#include <memory> +#include <ostream> +#include <random> +#include <sstream> +#include <stdexcept> +#include <string> +#include <string_view> +#include <unordered_set> +#include <vector> + +#include "testing/base/public/benchmark.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/document-builder.h" +#include "icing/file/filesystem.h" +#include "icing/proto/document.pb.h" +#include "icing/proto/persist.pb.h" +#include "icing/proto/schema.pb.h" +#include "icing/schema-builder.h" +#include "icing/schema/schema-store.h" +#include "icing/store/document-store.h" +#include "icing/testing/common-matchers.h" +#include "icing/testing/tmp-directory.h" +#include "icing/util/clock.h" + +// Run on a Linux workstation: +// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt +// //icing/store:document-store_benchmark +// +// $ blaze-bin/icing/store/document-store_benchmark +// --benchmarks=all --benchmark_memory_usage +// +// Run on an Android device: +// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1" +// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt +// //icing/store:document-store_benchmark +// +// $ adb push blaze-bin/icing/store/document-store_benchmark +// /data/local/tmp/ +// +// $ adb shell /data/local/tmp/document-store_benchmark +// --benchmarks=all + +namespace icing { +namespace lib { + +namespace { + +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; + +constexpr 
StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; + +class DestructibleDirectory { + public: + explicit DestructibleDirectory(const Filesystem& filesystem, + const std::string& dir) + : filesystem_(filesystem), dir_(dir) { + filesystem_.CreateDirectoryRecursively(dir_.c_str()); + } + ~DestructibleDirectory() { + filesystem_.DeleteDirectoryRecursively(dir_.c_str()); + } + + private: + Filesystem filesystem_; + std::string dir_; +}; + +DocumentProto CreateDocument(const std::string namespace_, + const std::string uri) { + return DocumentBuilder() + .SetKey(namespace_, uri) + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .AddStringProperty("body", "body bar") + .Build(); +} + +SchemaProto CreateSchema() { + return SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); +} + +std::unique_ptr<SchemaStore> CreateSchemaStore(Filesystem filesystem, + const std::string directory, + const Clock* clock) { + const std::string schema_store_dir = directory + "/schema"; + filesystem.CreateDirectoryRecursively(schema_store_dir.data()); + std::unique_ptr<SchemaStore> schema_store = + SchemaStore::Create(&filesystem, schema_store_dir, clock).ValueOrDie(); + + auto set_schema_status = schema_store->SetSchema(CreateSchema()); + if (!set_schema_status.ok()) { + ICING_LOG(ERROR) << set_schema_status.status().error_message(); + } + + return schema_store; +} + +void BM_DoesDocumentExistBenchmark(benchmark::State& state) { + Filesystem filesystem; + Clock clock; + + std::string directory = GetTestTempDir() 
+ "/icing"; + DestructibleDirectory ddir(filesystem, directory); + + std::string document_store_dir = directory + "/store"; + std::unique_ptr<SchemaStore> schema_store = + CreateSchemaStore(filesystem, directory, &clock); + + filesystem.CreateDirectoryRecursively(document_store_dir.data()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem, document_store_dir, &clock, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + int max_document_id = 300000; + for (int i = 0; i < max_document_id; ++i) { + // Put and delete a lot of documents to fill up our derived files with + // stuff. + ICING_ASSERT_OK(document_store->Put( + CreateDocument("namespace", /*uri=*/std::to_string(i)))); + document_store->Delete("namespace", /*uri=*/std::to_string(i)); + } + + std::default_random_engine random; + std::uniform_int_distribution<> dist(1, max_document_id); + for (auto s : state) { + // Check random document ids to see if they exist. Hopefully to simulate + // page faulting in different sections of our mmapped derived files. 
+ int document_id = dist(random); + benchmark::DoNotOptimize(document_store->DoesDocumentExist(document_id)); + } +} +BENCHMARK(BM_DoesDocumentExistBenchmark); + +void BM_Put(benchmark::State& state) { + Filesystem filesystem; + Clock clock; + + std::string directory = GetTestTempDir() + "/icing"; + DestructibleDirectory ddir(filesystem, directory); + + std::string document_store_dir = directory + "/store"; + std::unique_ptr<SchemaStore> schema_store = + CreateSchemaStore(filesystem, directory, &clock); + + filesystem.CreateDirectoryRecursively(document_store_dir.data()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem, document_store_dir, &clock, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + DocumentProto document = CreateDocument("namespace", "uri"); + + for (auto s : state) { + // It's ok that this is the same document over and over. We'll create a new + // document_id for it and still insert the proto into the underlying log. 
+ benchmark::DoNotOptimize(document_store->Put(document)); + } +} +BENCHMARK(BM_Put); + +void BM_GetSameDocument(benchmark::State& state) { + Filesystem filesystem; + Clock clock; + + std::string directory = GetTestTempDir() + "/icing"; + DestructibleDirectory ddir(filesystem, directory); + + std::string document_store_dir = directory + "/store"; + std::unique_ptr<SchemaStore> schema_store = + CreateSchemaStore(filesystem, directory, &clock); + + filesystem.CreateDirectoryRecursively(document_store_dir.data()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem, document_store_dir, &clock, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + ICING_ASSERT_OK(document_store->Put(CreateDocument("namespace", "uri"))); + + for (auto s : state) { + benchmark::DoNotOptimize(document_store->Get("namespace", "uri")); + } +} +BENCHMARK(BM_GetSameDocument); + +void BM_Delete(benchmark::State& state) { + Filesystem filesystem; + Clock clock; + + std::string directory = GetTestTempDir() + "/icing"; + DestructibleDirectory ddir(filesystem, directory); + + std::string document_store_dir = directory + "/store"; + std::unique_ptr<SchemaStore> schema_store = + CreateSchemaStore(filesystem, directory, &clock); + + filesystem.CreateDirectoryRecursively(document_store_dir.data()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem, document_store_dir, &clock, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + DocumentProto document = CreateDocument("namespace", "uri"); + + for (auto s : state) { + state.PauseTiming(); + ICING_ASSERT_OK(document_store->Put(document)); + state.ResumeTiming(); + + benchmark::DoNotOptimize(document_store->Delete("namespace", "uri")); + } +} +BENCHMARK(BM_Delete); + +void BM_Create(benchmark::State& 
state) { + Filesystem filesystem; + Clock clock; + + std::string directory = GetTestTempDir() + "/icing"; + std::string document_store_dir = directory + "/store"; + + std::unique_ptr<SchemaStore> schema_store = + CreateSchemaStore(filesystem, directory, &clock); + + // Create an initial document store and put some data in. + { + DestructibleDirectory ddir(filesystem, directory); + + filesystem.CreateDirectoryRecursively(document_store_dir.data()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem, document_store_dir, &clock, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + DocumentProto document = CreateDocument("namespace", "uri"); + ICING_ASSERT_OK(document_store->Put(document)); + ICING_ASSERT_OK(document_store->PersistToDisk(PersistType::FULL)); + } + + // Recreating it with some content to checksum over. + DestructibleDirectory ddir(filesystem, directory); + + filesystem.CreateDirectoryRecursively(document_store_dir.data()); + + for (auto s : state) { + benchmark::DoNotOptimize(DocumentStore::Create( + &filesystem, document_store_dir, &clock, schema_store.get())); + } +} +BENCHMARK(BM_Create); + +void BM_ComputeChecksum(benchmark::State& state) { + Filesystem filesystem; + Clock clock; + + std::string directory = GetTestTempDir() + "/icing"; + DestructibleDirectory ddir(filesystem, directory); + + std::string document_store_dir = directory + "/store"; + std::unique_ptr<SchemaStore> schema_store = + CreateSchemaStore(filesystem, directory, &clock); + + filesystem.CreateDirectoryRecursively(document_store_dir.data()); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem, document_store_dir, &clock, + schema_store.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + DocumentProto document = CreateDocument("namespace", 
"uri"); + ICING_ASSERT_OK(document_store->Put(document)); + ICING_ASSERT_OK(document_store->PersistToDisk(PersistType::LITE)); + + for (auto s : state) { + benchmark::DoNotOptimize(document_store->ComputeChecksum()); + } +} +BENCHMARK(BM_ComputeChecksum); + +} // namespace + +} // namespace lib +} // namespace icing diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc index 7754373..a506eea 100644 --- a/icing/store/document-store_test.cc +++ b/icing/store/document-store_test.cc @@ -15,10 +15,12 @@ #include "icing/store/document-store.h" #include <cstdint> +#include <filesystem> #include <limits> #include <memory> #include <string> +#include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" @@ -29,17 +31,20 @@ #include "icing/file/mock-filesystem.h" #include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/portable/equals-proto.h" +#include "icing/portable/platform.h" #include "icing/proto/document.pb.h" #include "icing/proto/schema.pb.h" +#include "icing/proto/storage.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/store/corpus-associated-scoring-data.h" #include "icing/store/corpus-id.h" #include "icing/store/document-filter-data.h" #include "icing/store/document-id.h" +#include "icing/store/document-log-creator.h" #include "icing/store/namespace-id.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" -#include "icing/testing/platform.h" #include "icing/testing/test-data.h" #include "icing/testing/tmp-directory.h" #include "icing/tokenization/language-segmenter-factory.h" @@ -55,6 +60,7 @@ namespace { using ::icing::lib::portable_equals_proto::EqualsProto; using ::testing::_; using ::testing::Eq; +using ::testing::Ge; using ::testing::Gt; using ::testing::HasSubstr; using ::testing::IsEmpty; @@ -64,6 +70,32 @@ using ::testing::Not; using ::testing::Return; 
using ::testing::UnorderedElementsAre; +const NamespaceStorageInfoProto& GetNamespaceStorageInfo( + const DocumentStorageInfoProto& storage_info, + const std::string& name_space) { + for (const NamespaceStorageInfoProto& namespace_storage_info : + storage_info.namespace_storage_info()) { + if (namespace_storage_info.namespace_() == name_space) { + return namespace_storage_info; + } + } + // Didn't find our namespace, fail the test. + EXPECT_TRUE(false) << "Failed to find namespace '" << name_space + << "' in DocumentStorageInfoProto."; + return std::move(NamespaceStorageInfoProto()); +} + +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; + +constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = + StringIndexingConfig_TokenizerType_Code_PLAIN; + +constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; + +constexpr PropertyConfigProto_DataType_Code TYPE_INT = + PropertyConfigProto_DataType_Code_INT64; + UsageReport CreateUsageReport(std::string name_space, std::string uri, int64 timestamp_ms, UsageReport::UsageType usage_type) { @@ -75,6 +107,22 @@ UsageReport CreateUsageReport(std::string name_space, std::string uri, return usage_report; } +PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader( + Filesystem filesystem, const std::string& file_path) { + PortableFileBackedProtoLog<DocumentWrapper>::Header header; + filesystem.PRead(file_path.c_str(), &header, + sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header), + /*offset=*/0); + return header; +} + +void WriteDocumentLogHeader( + Filesystem filesystem, const std::string& file_path, + PortableFileBackedProtoLog<DocumentWrapper>::Header& header) { + filesystem.Write(file_path.c_str(), &header, + sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header)); +} + class DocumentStoreTest : public ::testing::Test { protected: DocumentStoreTest() @@ -124,28 +172,22 @@ class DocumentStoreTest : 
public ::testing::Test { filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()); filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()); - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - - auto subject = type_config->add_properties(); - subject->set_property_name("subject"); - subject->set_data_type(PropertyConfigProto::DataType::STRING); - subject->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - subject->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - subject->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - - auto body = type_config->add_properties(); - body->set_property_name("body"); - body->set_data_type(PropertyConfigProto::DataType::STRING); - body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - body->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - body->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); - + SchemaProto schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); @@ -161,6 +203,19 @@ class DocumentStoreTest : public ::testing::Test { filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); } + void CorruptDocStoreHeaderChecksumFile() { + // Change the DocStore's header combined checksum so that it won't match the + // recalculated checksum on initialization. 
This will force a regeneration + // of derived files from ground truth. + const std::string header_file = + absl_ports::StrCat(document_store_dir_, "/document_store_header"); + DocumentStore::Header header; + header.magic = DocumentStore::Header::kMagic; + header.checksum = 10; // Arbitrary garbage checksum + filesystem_.DeleteFile(header_file.c_str()); + filesystem_.Write(header_file.c_str(), &header, sizeof(header)); + } + const Filesystem filesystem_; const std::string test_dir_; FakeClock fake_clock_; @@ -290,7 +345,7 @@ TEST_F(DocumentStoreTest, PutSameKey) { EXPECT_THAT(doc_store->Put(document3), IsOkAndHolds(Not(document_id1))); } -TEST_F(DocumentStoreTest, IsDocumentExisting) { +TEST_F(DocumentStoreTest, IsDocumentExistingWithoutStatus) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -322,7 +377,7 @@ TEST_F(DocumentStoreTest, IsDocumentExisting) { IsFalse()); } -TEST_F(DocumentStoreTest, GetSoftDeletedDocumentNotFound) { +TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -336,29 +391,7 @@ TEST_F(DocumentStoreTest, GetSoftDeletedDocumentNotFound) { IsOkAndHolds(EqualsProto(test_document1_))); ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), - test_document1_.uri(), - /*soft_delete=*/true)); - EXPECT_THAT( - document_store->Get(test_document1_.namespace_(), test_document1_.uri()), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); -} - -TEST_F(DocumentStoreTest, GetHardDeletedDocumentNotFound) { - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get())); - std::unique_ptr<DocumentStore> document_store = - std::move(create_result.document_store); - - 
ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_))); - EXPECT_THAT( - document_store->Get(test_document1_.namespace_(), test_document1_.uri()), - IsOkAndHolds(EqualsProto(test_document1_))); - - ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), - test_document1_.uri(), - /*soft_delete=*/false)); + test_document1_.uri())); EXPECT_THAT( document_store->Get(test_document1_.namespace_(), test_document1_.uri()), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); @@ -436,16 +469,20 @@ TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) { // Validates that deleting something non-existing won't append anything to // ground truth - int64_t ground_truth_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + int64_t document_log_size_before = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); EXPECT_THAT( document_store->Delete("nonexistent_namespace", "nonexistent_uri"), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - int64_t ground_truth_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after)); + int64_t document_log_size_after = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); + EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } TEST_F(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) { @@ -468,7 +505,7 @@ TEST_F(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, SoftDeleteByNamespaceOk) { +TEST_F(DocumentStoreTest, DeleteByNamespaceOk) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, 
&fake_clock_, @@ -499,7 +536,7 @@ TEST_F(DocumentStoreTest, SoftDeleteByNamespaceOk) { // DELETE namespace.1. document1 and document 4 should be deleted. document2 // and document3 should still be retrievable. DocumentStore::DeleteByGroupResult group_result = - doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/true); + doc_store->DeleteByNamespace("namespace.1"); EXPECT_THAT(group_result.status, IsOk()); EXPECT_THAT(group_result.num_docs_deleted, Eq(2)); EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()), @@ -512,51 +549,7 @@ TEST_F(DocumentStoreTest, SoftDeleteByNamespaceOk) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, HardDeleteByNamespaceOk) { - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get())); - std::unique_ptr<DocumentStore> doc_store = - std::move(create_result.document_store); - - DocumentProto document1 = test_document1_; - document1.set_namespace_("namespace.1"); - document1.set_uri("uri1"); - ICING_ASSERT_OK(doc_store->Put(document1)); - - DocumentProto document2 = test_document1_; - document2.set_namespace_("namespace.2"); - document2.set_uri("uri1"); - ICING_ASSERT_OK(doc_store->Put(document2)); - - DocumentProto document3 = test_document1_; - document3.set_namespace_("namespace.3"); - document3.set_uri("uri1"); - ICING_ASSERT_OK(doc_store->Put(document3)); - - DocumentProto document4 = test_document1_; - document4.set_namespace_("namespace.1"); - document4.set_uri("uri2"); - ICING_ASSERT_OK(doc_store->Put(document4)); - - // DELETE namespace.1. document1 and document 4 should be deleted. document2 - // and document3 should still be retrievable. 
- DocumentStore::DeleteByGroupResult group_result = - doc_store->DeleteByNamespace("namespace.1", /*soft_delete=*/false); - EXPECT_THAT(group_result.status, IsOk()); - EXPECT_THAT(group_result.num_docs_deleted, Eq(2)); - EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()), - IsOkAndHolds(EqualsProto(document2))); - EXPECT_THAT(doc_store->Get(document3.namespace_(), document3.uri()), - IsOkAndHolds(EqualsProto(document3))); - EXPECT_THAT(doc_store->Get(document4.namespace_(), document4.uri()), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); -} - -TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNonexistentNamespaceNotFound) { +TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -566,45 +559,22 @@ TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNonexistentNamespaceNotFound) { // Validates that deleting something non-existing won't append anything to // ground truth - int64_t ground_truth_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + int64_t document_log_size_before = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); - EXPECT_THAT(doc_store - ->DeleteByNamespace("nonexistent_namespace", - /*soft_delete=*/true) - .status, + EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace").status, StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - int64_t ground_truth_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after)); + int64_t document_log_size_after = filesystem_.GetFileSize( + 
absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); + EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } -TEST_F(DocumentStoreTest, HardDeleteByNamespaceNonexistentNamespaceNotFound) { - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get())); - std::unique_ptr<DocumentStore> doc_store = - std::move(create_result.document_store); - - // Validates that deleting something non-existing won't append anything to - // ground truth - int64_t ground_truth_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - - EXPECT_THAT(doc_store - ->DeleteByNamespace("nonexistent_namespace", - /*soft_delete=*/false) - .status, - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - - int64_t ground_truth_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after)); -} - -TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNoExistingDocumentsNotFound) { +TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -619,33 +589,9 @@ TEST_F(DocumentStoreTest, SoftDeleteByNamespaceNoExistingDocumentsNotFound) { // At this point, there are no existing documents with the namespace, even // though Icing's derived files know about this namespace. We should still // return NOT_FOUND since nothing existing has this namespace. 
- EXPECT_THAT(document_store - ->DeleteByNamespace(test_document1_.namespace_(), - /*soft_delete=*/true) - .status, - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); -} - -TEST_F(DocumentStoreTest, HardDeleteByNamespaceNoExistingDocumentsNotFound) { - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get())); - std::unique_ptr<DocumentStore> document_store = - std::move(create_result.document_store); - - ICING_EXPECT_OK(document_store->Put(test_document1_)); - ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), - test_document1_.uri())); - - // At this point, there are no existing documents with the namespace, even - // though Icing's derived files know about this namespace. We should still - // return NOT_FOUND since nothing existing has this namespace. - EXPECT_THAT(document_store - ->DeleteByNamespace(test_document1_.namespace_(), - /*soft_delete=*/false) - .status, - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT( + document_store->DeleteByNamespace(test_document1_.namespace_()).status, + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { @@ -665,7 +611,7 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { document4.set_namespace_("namespace.1"); document4.set_uri("uri2"); - int64_t ground_truth_size_before; + int64_t document_log_size_before; { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -686,21 +632,13 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { EXPECT_THAT(group_result.status, IsOk()); EXPECT_THAT(group_result.num_docs_deleted, Eq(2)); - ground_truth_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + document_log_size_before = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/", + 
DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); } // Destructors should update checksum and persist all data to file. - // Change the DocStore's header combined checksum so that it won't match the - // recalculated checksum on initialization. This will force a regeneration of - // derived files from ground truth. - const std::string header_file = - absl_ports::StrCat(document_store_dir_, "/document_store_header"); - DocumentStore::Header header; - header.magic = DocumentStore::Header::kMagic; - header.checksum = 10; // Arbitrary garbage checksum - filesystem_.DeleteFile(header_file.c_str()); - filesystem_.Write(header_file.c_str(), &header, sizeof(header)); - + CorruptDocStoreHeaderChecksumFile(); // Successfully recover from a corrupt derived file issue. ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -710,9 +648,11 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { std::move(create_result.document_store); // Make sure we didn't add anything to the ground truth after we recovered. 
- int64_t ground_truth_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - EXPECT_EQ(ground_truth_size_before, ground_truth_size_after); + int64_t document_log_size_after = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); + EXPECT_EQ(document_log_size_before, document_log_size_after); EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); @@ -724,101 +664,13 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) { StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeOk) { - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - type_config = schema.add_types(); - type_config->set_schema_type("message"); - type_config = schema.add_types(); - type_config->set_schema_type("person"); - - std::string schema_store_dir = schema_store_dir_ + "_custom"; - filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); - filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); - ICING_ASSERT_OK_AND_ASSIGN( - std::unique_ptr<SchemaStore> schema_store, - SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); - - ICING_ASSERT_OK(schema_store->SetSchema(schema)); - - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store.get())); - std::unique_ptr<DocumentStore> document_store = - std::move(create_result.document_store); - - DocumentProto email_document_1 = DocumentBuilder() - .SetKey("namespace1", "1") - .SetSchema("email") - .SetCreationTimestampMs(1) - .Build(); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_1_document_id, - document_store->Put(email_document_1)); - - DocumentProto email_document_2 = DocumentBuilder() - .SetKey("namespace2", 
"2") - .SetSchema("email") - .SetCreationTimestampMs(1) - .Build(); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_2_document_id, - document_store->Put(email_document_2)); - - DocumentProto message_document = DocumentBuilder() - .SetKey("namespace", "3") - .SetSchema("message") - .SetCreationTimestampMs(1) - .Build(); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id, - document_store->Put(message_document)); - - DocumentProto person_document = DocumentBuilder() - .SetKey("namespace", "4") - .SetSchema("person") - .SetCreationTimestampMs(1) - .Build(); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId person_document_id, - document_store->Put(person_document)); - - // Delete the "email" type and ensure that it works across both - // email_document's namespaces. And that other documents aren't affected. - DocumentStore::DeleteByGroupResult group_result = - document_store->DeleteBySchemaType("email", /*soft_delete=*/true); - EXPECT_THAT(group_result.status, IsOk()); - EXPECT_THAT(group_result.num_docs_deleted, Eq(2)); - EXPECT_THAT(document_store->Get(email_1_document_id), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - EXPECT_THAT(document_store->Get(email_2_document_id), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - EXPECT_THAT(document_store->Get(message_document_id), - IsOkAndHolds(EqualsProto(message_document))); - EXPECT_THAT(document_store->Get(person_document_id), - IsOkAndHolds(EqualsProto(person_document))); - - // Delete the "message" type and check that other documents aren't affected - group_result = - document_store->DeleteBySchemaType("message", /*soft_delete=*/true); - EXPECT_THAT(group_result.status, IsOk()); - EXPECT_THAT(group_result.num_docs_deleted, Eq(1)); - EXPECT_THAT(document_store->Get(email_1_document_id), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - EXPECT_THAT(document_store->Get(email_2_document_id), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - 
EXPECT_THAT(document_store->Get(message_document_id), - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - EXPECT_THAT(document_store->Get(person_document_id), - IsOkAndHolds(EqualsProto(person_document))); -} - -TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeOk) { - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - type_config = schema.add_types(); - type_config->set_schema_type("message"); - type_config = schema.add_types(); - type_config->set_schema_type("person"); +TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) { + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .AddType(SchemaTypeConfigBuilder().SetType("person")) + .Build(); std::string schema_store_dir = schema_store_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); @@ -871,7 +723,7 @@ TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeOk) { // Delete the "email" type and ensure that it works across both // email_document's namespaces. And that other documents aren't affected. 
DocumentStore::DeleteByGroupResult group_result = - document_store->DeleteBySchemaType("email", /*soft_delete=*/true); + document_store->DeleteBySchemaType("email"); EXPECT_THAT(group_result.status, IsOk()); EXPECT_THAT(group_result.num_docs_deleted, Eq(2)); EXPECT_THAT(document_store->Get(email_1_document_id), @@ -884,8 +736,7 @@ TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeOk) { IsOkAndHolds(EqualsProto(person_document))); // Delete the "message" type and check that other documents aren't affected - group_result = - document_store->DeleteBySchemaType("message", /*soft_delete=*/true); + group_result = document_store->DeleteBySchemaType("message"); EXPECT_THAT(group_result.status, IsOk()); EXPECT_THAT(group_result.num_docs_deleted, Eq(1)); EXPECT_THAT(document_store->Get(email_1_document_id), @@ -898,32 +749,7 @@ TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeOk) { IsOkAndHolds(EqualsProto(person_document))); } -TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNonexistentSchemaTypeNotFound) { - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get())); - std::unique_ptr<DocumentStore> document_store = - std::move(create_result.document_store); - - // Validates that deleting something non-existing won't append anything to - // ground truth - int64_t ground_truth_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - - EXPECT_THAT(document_store - ->DeleteBySchemaType("nonexistent_type", - /*soft_delete=*/true) - .status, - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - - int64_t ground_truth_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - - EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after)); -} - -TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNonexistentSchemaTypeNotFound) { +TEST_F(DocumentStoreTest, 
DeleteBySchemaTypeNonexistentSchemaTypeNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -933,41 +759,23 @@ TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNonexistentSchemaTypeNotFound) { // Validates that deleting something non-existing won't append anything to // ground truth - int64_t ground_truth_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + int64_t document_log_size_before = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); - EXPECT_THAT(document_store - ->DeleteBySchemaType("nonexistent_type", - /*soft_delete=*/false) - .status, + EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type").status, StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); - int64_t ground_truth_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - - EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after)); -} - -TEST_F(DocumentStoreTest, SoftDeleteBySchemaTypeNoExistingDocumentsNotFound) { - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get())); - std::unique_ptr<DocumentStore> document_store = - std::move(create_result.document_store); - - ICING_EXPECT_OK(document_store->Put(test_document1_)); - ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), - test_document1_.uri())); + int64_t document_log_size_after = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); - EXPECT_THAT(document_store - ->DeleteBySchemaType(test_document1_.schema(), - /*soft_delete=*/true) - .status, - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + 
EXPECT_THAT(document_log_size_before, Eq(document_log_size_after)); } -TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNoExistingDocumentsNotFound) { +TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -979,19 +787,17 @@ TEST_F(DocumentStoreTest, HardDeleteBySchemaTypeNoExistingDocumentsNotFound) { ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(), test_document1_.uri())); - EXPECT_THAT(document_store - ->DeleteBySchemaType(test_document1_.schema(), - /*soft_delete=*/false) - .status, - StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT( + document_store->DeleteBySchemaType(test_document1_.schema()).status, + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - type_config = schema.add_types(); - type_config->set_schema_type("message"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); std::string schema_store_dir = schema_store_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); @@ -1016,7 +822,7 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { .SetSchema("message") .SetCreationTimestampMs(1) .Build(); - int64_t ground_truth_size_before; + int64_t document_log_size_before; { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -1036,21 +842,13 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { EXPECT_THAT(group_result.status, IsOk()); EXPECT_THAT(group_result.num_docs_deleted, Eq(1)); - ground_truth_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + 
document_log_size_before = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); } // Destructors should update checksum and persist all data to file. - // Change the DocumentStore's header combined checksum so that it won't match - // the recalculated checksum on initialization. This will force a regeneration - // of derived files from ground truth. - const std::string header_file = - absl_ports::StrCat(document_store_dir_, "/document_store_header"); - DocumentStore::Header header; - header.magic = DocumentStore::Header::kMagic; - header.checksum = 10; // Arbitrary garbage checksum - filesystem_.DeleteFile(header_file.c_str()); - filesystem_.Write(header_file.c_str(), &header, sizeof(header)); - + CorruptDocStoreHeaderChecksumFile(); // Successfully recover from a corrupt derived file issue. ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -1060,9 +858,11 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { std::move(create_result.document_store); // Make sure we didn't add anything to the ground truth after we recovered. 
- int64_t ground_truth_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - EXPECT_EQ(ground_truth_size_before, ground_truth_size_after); + int64_t document_log_size_after = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); + EXPECT_EQ(document_log_size_before, document_log_size_after); EXPECT_THAT(document_store->Get(email_document_id), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); @@ -1070,12 +870,25 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) { IsOkAndHolds(EqualsProto(message_document))); } +TEST_F(DocumentStoreTest, PutDeleteThenPut) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + ICING_EXPECT_OK(doc_store->Put(test_document1_)); + ICING_EXPECT_OK( + doc_store->Delete(test_document1_.namespace_(), test_document1_.uri())); + ICING_EXPECT_OK(doc_store->Put(test_document1_)); +} + TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - type_config = schema.add_types(); - type_config->set_schema_type("message"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); std::string schema_store_dir = schema_store_dir_ + "_custom"; filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); @@ -1100,7 +913,7 @@ TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { .SetSchema("message") .SetCreationTimestampMs(1) .Build(); - int64_t ground_truth_size_before; + int64_t document_log_size_before; { ICING_ASSERT_OK_AND_ASSIGN( 
DocumentStore::CreateResult create_result, @@ -1125,25 +938,18 @@ TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { EXPECT_THAT(document_store->Get(message_document_id), IsOkAndHolds(EqualsProto(message_document))); - ground_truth_size_before = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); + document_log_size_before = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); } // Destructors should update checksum and persist all data to file. - // Change the DocumentStore's header combined checksum so that it won't match - // the recalculated checksum on initialization. This will force a regeneration - // of derived files from ground truth. - const std::string header_file = - absl_ports::StrCat(document_store_dir_, "/document_store_header"); - DocumentStore::Header header; - header.magic = DocumentStore::Header::kMagic; - header.checksum = 10; // Arbitrary garbage checksum - filesystem_.DeleteFile(header_file.c_str()); - filesystem_.Write(header_file.c_str(), &header, sizeof(header)); - - SchemaProto new_schema; - type_config = new_schema.add_types(); - type_config->set_schema_type("message"); + CorruptDocStoreHeaderChecksumFile(); + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); ICING_EXPECT_OK(schema_store->SetSchema( new_schema, /*ignore_errors_and_delete_documents=*/true)); @@ -1156,9 +962,11 @@ TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) { std::move(create_result.document_store); // Make sure we didn't add anything to the ground truth after we recovered. 
- int64_t ground_truth_size_after = filesystem_.GetFileSize( - absl_ports::StrCat(document_store_dir_, "/document_log").c_str()); - EXPECT_EQ(ground_truth_size_before, ground_truth_size_after); + int64_t document_log_size_after = filesystem_.GetFileSize( + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str()); + EXPECT_EQ(document_log_size_before, document_log_size_after); EXPECT_THAT(document_store->Get(email_document_id), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); @@ -1202,7 +1010,9 @@ TEST_F(DocumentStoreTest, OptimizeInto) { ICING_ASSERT_OK(doc_store->Put(document2)); ICING_ASSERT_OK(doc_store->Put(document3)); - std::string original_document_log = document_store_dir_ + "/document_log"; + std::string original_document_log = absl_ports::StrCat( + document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename()); + int64_t original_size = filesystem_.GetFileSize(original_document_log.c_str()); @@ -1213,7 +1023,8 @@ TEST_F(DocumentStoreTest, OptimizeInto) { HasSubstr("directory is the same"))); std::string optimized_dir = document_store_dir_ + "_optimize"; - std::string optimized_document_log = optimized_dir + "/document_log"; + std::string optimized_document_log = + optimized_dir + "/" + DocumentLogCreator::GetDocumentLogFilename(); // Validates that the optimized document log has the same size if nothing is // deleted @@ -1301,8 +1112,8 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) { DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); const std::string serialized_document = document.SerializeAsString(); - const std::string document_log_file = - absl_ports::StrCat(document_store_dir_, "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename()); int64_t file_size = filesystem_.GetFileSize(document_log_file.c_str()); filesystem_.PWrite(document_log_file.c_str(), 
file_size, serialized_document.data(), serialized_document.size()); @@ -1467,17 +1278,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) { IsOkAndHolds(EqualsProto(test_document2_))); } - // Change the DocStore's header combined checksum so that it won't match the - // recalculated checksum on initialization. This will force a regeneration of - // derived files from ground truth. - const std::string header_file = - absl_ports::StrCat(document_store_dir_, "/document_store_header"); - DocumentStore::Header header; - header.magic = DocumentStore::Header::kMagic; - header.checksum = 10; // Arbitrary garbage checksum - filesystem_.DeleteFile(header_file.c_str()); - filesystem_.Write(header_file.c_str(), &header, sizeof(header)); - + CorruptDocStoreHeaderChecksumFile(); // Successfully recover from a corrupt derived file issue. ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -1507,7 +1308,7 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) { /*num_docs=*/1, /*sum_length_in_tokens=*/4))); } -TEST_F(DocumentStoreTest, GetDiskUsage) { +TEST_F(DocumentStoreTest, GetStorageInfo) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -1515,8 +1316,8 @@ TEST_F(DocumentStoreTest, GetDiskUsage) { std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - ICING_ASSERT_OK_AND_ASSIGN(int64_t empty_doc_store_size, - doc_store->GetDiskUsage()); + DocumentStorageInfoProto doc_store_storage_info = doc_store->GetStorageInfo(); + int64_t empty_doc_store_size = doc_store_storage_info.document_store_size(); EXPECT_THAT(empty_doc_store_size, Gt(0)); DocumentProto document = DocumentBuilder() @@ -1525,15 +1326,16 @@ TEST_F(DocumentStoreTest, GetDiskUsage) { .AddStringProperty("subject", "foo") .Build(); - // Since our GetDiskUsage can only get sizes in increments of block_size, we + // Since GetStorageInfo can only get sizes in 
increments of block_size, we // need to insert enough documents so the disk usage will increase by at least // 1 block size. The number 100 is a bit arbitrary, gotten from manually // testing. for (int i = 0; i < 100; ++i) { ICING_ASSERT_OK(doc_store->Put(document)); } - EXPECT_THAT(doc_store->GetDiskUsage(), - IsOkAndHolds(Gt(empty_doc_store_size))); + doc_store_storage_info = doc_store->GetStorageInfo(); + EXPECT_THAT(doc_store_storage_info.document_store_size(), + Gt(empty_doc_store_size)); // Bad file system MockFilesystem mock_filesystem; @@ -1546,8 +1348,8 @@ TEST_F(DocumentStoreTest, GetDiskUsage) { std::unique_ptr<DocumentStore> doc_store_with_mock_filesystem = std::move(create_result.document_store); - EXPECT_THAT(doc_store_with_mock_filesystem->GetDiskUsage(), - StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + doc_store_storage_info = doc_store_with_mock_filesystem->GetStorageInfo(); + EXPECT_THAT(doc_store_storage_info.document_store_size(), Eq(-1)); } TEST_F(DocumentStoreTest, MaxDocumentId) { @@ -1838,7 +1640,7 @@ TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataSameCorpus) { /*length_in_tokens=*/7))); } -TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataDifferentCorpus) { +TEST_F(DocumentStoreTest, GetDocumentAssociatedScoreDataDifferentCorpus) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -1882,7 +1684,7 @@ TEST_F(DocumentStoreTest, GetCorpusAssociatedScoreDataDifferentCorpus) { /*length_in_tokens=*/7))); } -TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataOutOfRange) { +TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -1891,10 +1693,10 @@ TEST_F(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataOutOfRange) { 
std::move(create_result.document_store); EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(/*document_id=*/0), - StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearFilterCache) { +TEST_F(DocumentStoreTest, NonexistentDocumentFilterDataNotFound) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -1902,22 +1704,11 @@ TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearFilterCache) { std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, - doc_store->Put(test_document1_)); - - EXPECT_THAT( - doc_store->GetDocumentFilterData(document_id), - IsOkAndHolds(DocumentFilterData( - /*namespace_id=*/0, - /*schema_type_id=*/0, - /*expiration_timestamp_ms=*/document1_expiration_timestamp_))); - - ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true)); - // Associated entry of the deleted document is removed. - EXPECT_THAT(doc_store->GetDocumentFilterData(document_id).status(), IsOk()); + EXPECT_THAT(doc_store->GetDocumentFilterData(/*document_id=*/0), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, HardDeleteClearsFilterCache) { +TEST_F(DocumentStoreTest, DeleteClearsFilterCache) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -1935,36 +1726,13 @@ TEST_F(DocumentStoreTest, HardDeleteClearsFilterCache) { /*schema_type_id=*/0, /*expiration_timestamp_ms=*/document1_expiration_timestamp_))); - ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false)); + ICING_ASSERT_OK(doc_store->Delete("icing", "email/1")); // Associated entry of the deleted document is removed. 
EXPECT_THAT(doc_store->GetDocumentFilterData(document_id), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, SoftDeletionDoesNotClearScoreCache) { - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get())); - std::unique_ptr<DocumentStore> doc_store = - std::move(create_result.document_store); - - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, - doc_store->Put(test_document1_, /*num_tokens=*/4)); - - EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id), - IsOkAndHolds(DocumentAssociatedScoreData( - /*corpus_id=*/0, /*document_score=*/document1_score_, - /*creation_timestamp_ms=*/document1_creation_timestamp_, - /*length_in_tokens=*/4))); - - ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true)); - // Associated entry of the deleted document is removed. - EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id).status(), - IsOk()); -} - -TEST_F(DocumentStoreTest, HardDeleteClearsScoreCache) { +TEST_F(DocumentStoreTest, DeleteClearsScoreCache) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -1982,13 +1750,13 @@ TEST_F(DocumentStoreTest, HardDeleteClearsScoreCache) { /*creation_timestamp_ms=*/document1_creation_timestamp_, /*length_in_tokens=*/4))); - ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false)); + ICING_ASSERT_OK(doc_store->Delete("icing", "email/1")); // Associated entry of the deleted document is removed. 
EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } -TEST_F(DocumentStoreTest, SoftDeleteDoesNotClearUsageScores) { +TEST_F(DocumentStoreTest, DeleteShouldPreventUsageScores) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -2010,15 +1778,21 @@ TEST_F(DocumentStoreTest, SoftDeleteDoesNotClearUsageScores) { ASSERT_THAT(doc_store->GetUsageScores(document_id), IsOkAndHolds(expected_scores)); - // Soft delete the document. - ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/true)); + // Delete the document. + ICING_ASSERT_OK(doc_store->Delete("icing", "email/1")); + + // Can't report or get usage scores on the deleted document + ASSERT_THAT( + doc_store->ReportUsage(usage_report_type1), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND, + HasSubstr("Couldn't report usage on a nonexistent document"))); - // The scores should be the same. 
ASSERT_THAT(doc_store->GetUsageScores(document_id), - IsOkAndHolds(expected_scores)); + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND, + HasSubstr("Can't get usage scores"))); } -TEST_F(DocumentStoreTest, HardDeleteShouldClearUsageScores) { +TEST_F(DocumentStoreTest, ExpirationShouldPreventUsageScores) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -2026,8 +1800,20 @@ TEST_F(DocumentStoreTest, HardDeleteShouldClearUsageScores) { std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); - ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, - doc_store->Put(test_document1_)); + DocumentProto document = DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .AddStringProperty("body", "body bar") + .SetScore(document1_score_) + .SetCreationTimestampMs(10) + .SetTtlMs(100) + .Build(); + + ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(document)); + + // Some arbitrary time before the document's creation time (10) + ttl (100) + fake_clock_.SetSystemTimeMilliseconds(109); // Report usage with type 1. UsageReport usage_report_type1 = CreateUsageReport( @@ -2040,13 +1826,18 @@ TEST_F(DocumentStoreTest, HardDeleteShouldClearUsageScores) { ASSERT_THAT(doc_store->GetUsageScores(document_id), IsOkAndHolds(expected_scores)); - // Hard delete the document. - ICING_ASSERT_OK(doc_store->Delete("icing", "email/1", /*soft_delete=*/false)); + // Some arbitrary time past the document's creation time (10) + ttl (100) + fake_clock_.SetSystemTimeMilliseconds(200); + + // Can't report or get usage scores on the expired document + ASSERT_THAT( + doc_store->ReportUsage(usage_report_type1), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND, + HasSubstr("Couldn't report usage on a nonexistent document"))); - // The scores should be cleared. 
- expected_scores.usage_type1_count = 0; ASSERT_THAT(doc_store->GetUsageScores(document_id), - IsOkAndHolds(expected_scores)); + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND, + HasSubstr("Can't get usage scores"))); } TEST_F(DocumentStoreTest, @@ -2231,7 +2022,7 @@ TEST_F(DocumentStoreTest, ComputeChecksumSameAcrossInstances) { EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(checksum)); } -TEST_F(DocumentStoreTest, ComputeChecksumChangesOnModification) { +TEST_F(DocumentStoreTest, ComputeChecksumChangesOnNewDocument) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, @@ -2247,6 +2038,24 @@ TEST_F(DocumentStoreTest, ComputeChecksumChangesOnModification) { IsOkAndHolds(Not(Eq(checksum)))); } +TEST_F(DocumentStoreTest, ComputeChecksumDoesntChangeOnNewUsage) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + ICING_EXPECT_OK(document_store->Put(test_document1_)); + ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, document_store->ComputeChecksum()); + + UsageReport usage_report = + CreateUsageReport(test_document1_.namespace_(), test_document1_.uri(), + /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1); + ICING_EXPECT_OK(document_store->ReportUsage(usage_report)); + EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(Eq(checksum))); +} + TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) { const std::string schema_store_dir = schema_store_dir_ + "_custom"; @@ -2275,11 +2084,11 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); - SchemaProto schema; - auto 
type_config = schema.add_types(); - type_config->set_schema_type("email"); - type_config = schema.add_types(); - type_config->set_schema_type("message"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); ICING_EXPECT_OK(schema_store->SetSchema(schema)); ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id, @@ -2320,16 +2129,7 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) { message_expiration_timestamp = message_data.expiration_timestamp_ms(); } // Everything destructs and commits changes to file - // Change the DocumentStore's header combined checksum so that it won't match - // the recalculated checksum on initialization. This will force a regeneration - // of derived files from ground truth. - const std::string header_file = - absl_ports::StrCat(document_store_dir_, "/document_store_header"); - DocumentStore::Header header; - header.magic = DocumentStore::Header::kMagic; - header.checksum = 10; // Arbitrary garbage checksum - filesystem_.DeleteFile(header_file.c_str()); - filesystem_.Write(header_file.c_str(), &header, sizeof(header)); + CorruptDocStoreHeaderChecksumFile(); // Change the schema so that we don't know of the Document's type anymore. 
// Since we can't set backwards incompatible changes, we do some file-level @@ -2340,9 +2140,10 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) { ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); + + SchemaProto schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_EXPECT_OK(schema_store->SetSchema(schema)); ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id, @@ -2388,11 +2189,11 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) { filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); // Set a schema - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - type_config = schema.add_types(); - type_config->set_schema_type("message"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, @@ -2440,11 +2241,10 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) { // Rearrange the schema types. Since SchemaTypeId is assigned based on order, // this should change the SchemaTypeIds. 
- schema.clear_types(); - type_config = schema.add_types(); - type_config->set_schema_type("message"); - type_config = schema.add_types(); - type_config->set_schema_type("email"); + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_EXPECT_OK(schema_store->SetSchema(schema)); @@ -2475,18 +2275,14 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) { filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); // Set a schema - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - - auto property_config = type_config->add_properties(); - property_config->set_property_name("subject"); - property_config->set_data_type(PropertyConfigProto::DataType::STRING); - property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property_config->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - property_config->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, @@ -2553,11 +2349,11 @@ TEST_F(DocumentStoreTest, filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); // Set a schema - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - type_config = schema.add_types(); - type_config->set_schema_type("message"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( 
std::unique_ptr<SchemaStore> schema_store, @@ -2597,9 +2393,10 @@ TEST_F(DocumentStoreTest, EXPECT_THAT(document_store->Get(message_document_id), IsOkAndHolds(EqualsProto(message_document))); - SchemaProto new_schema; - type_config = new_schema.add_types(); - type_config->set_schema_type("message"); + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); ICING_EXPECT_OK( schema_store->SetSchema(new_schema, @@ -2622,11 +2419,11 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) { filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); // Set a schema - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - type_config = schema.add_types(); - type_config->set_schema_type("message"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, @@ -2674,11 +2471,10 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) { // Rearrange the schema types. Since SchemaTypeId is assigned based on order, // this should change the SchemaTypeIds. 
- schema.clear_types(); - type_config = schema.add_types(); - type_config->set_schema_type("message"); - type_config = schema.add_types(); - type_config->set_schema_type("email"); + schema = SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .AddType(SchemaTypeConfigBuilder().SetType("email")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN(SchemaStore::SetSchemaResult set_schema_result, schema_store->SetSchema(schema)); @@ -2711,18 +2507,14 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) { filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); // Set a schema - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - - auto property_config = type_config->add_properties(); - property_config->set_property_name("subject"); - property_config->set_data_type(PropertyConfigProto::DataType::STRING); - property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - property_config->mutable_string_indexing_config()->set_term_match_type( - TermMatchType::EXACT_ONLY); - property_config->mutable_string_indexing_config()->set_tokenizer_type( - StringIndexingConfig::TokenizerType::PLAIN); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, @@ -2792,11 +2584,11 @@ TEST_F(DocumentStoreTest, filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); // Set a schema - SchemaProto schema; - auto type_config = schema.add_types(); - type_config->set_schema_type("email"); - type_config = schema.add_types(); - type_config->set_schema_type("message"); + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("email")) + 
.AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<SchemaStore> schema_store, @@ -2836,9 +2628,10 @@ TEST_F(DocumentStoreTest, EXPECT_THAT(document_store->Get(message_document_id), IsOkAndHolds(EqualsProto(message_document))); - SchemaProto new_schema; - type_config = new_schema.add_types(); - type_config->set_schema_type("message"); + SchemaProto new_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder().SetType("message")) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( SchemaStore::SetSchemaResult set_schema_result, @@ -3126,17 +2919,7 @@ TEST_F(DocumentStoreTest, UsageScoresShouldNotBeClearedOnChecksumMismatch) { IsOkAndHolds(expected_scores)); } - // Change the DocStore's header combined checksum so that it won't match the - // recalculated checksum on initialization. This will force a regeneration of - // derived files from ground truth. - const std::string header_file = - absl_ports::StrCat(document_store_dir_, "/document_store_header"); - DocumentStore::Header header; - header.magic = DocumentStore::Header::kMagic; - header.checksum = 10; // Arbitrary garbage checksum - filesystem_.DeleteFile(header_file.c_str()); - filesystem_.Write(header_file.c_str(), &header, sizeof(header)); - + CorruptDocStoreHeaderChecksumFile(); // Successfully recover from a corrupt derived file issue. 
ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -3181,8 +2964,8 @@ TEST_F(DocumentStoreTest, UsageScoresShouldBeAvailableAfterDataLoss) { DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build(); const std::string serialized_document = document.SerializeAsString(); - const std::string document_log_file = - absl_ports::StrCat(document_store_dir_, "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename()); int64_t file_size = filesystem_.GetFileSize(document_log_file.c_str()); filesystem_.PWrite(document_log_file.c_str(), file_size, serialized_document.data(), serialized_document.size()); @@ -3235,45 +3018,6 @@ TEST_F(DocumentStoreTest, UsageScoresShouldBeCopiedOverToUpdatedDocument) { IsOkAndHolds(expected_scores)); } -TEST_F(DocumentStoreTest, - UsageScoresShouldNotBeCopiedOverFromOldSoftDeletedDocs) { - ICING_ASSERT_OK_AND_ASSIGN( - DocumentStore::CreateResult create_result, - DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get())); - std::unique_ptr<DocumentStore> document_store = - std::move(create_result.document_store); - - ICING_ASSERT_OK_AND_ASSIGN( - DocumentId document_id, - document_store->Put(DocumentProto(test_document1_))); - - // Report usage with type 1. - UsageReport usage_report_type1 = CreateUsageReport( - /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0, - UsageReport::USAGE_TYPE1); - ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type1)); - - UsageStore::UsageScores expected_scores; - ++expected_scores.usage_type1_count; - ASSERT_THAT(document_store->GetUsageScores(document_id), - IsOkAndHolds(expected_scores)); - - // Soft delete the doc. - ICING_ASSERT_OK(document_store->Delete(document_id, /*soft_delete=*/true)); - - // Put the same document. 
- ICING_ASSERT_OK_AND_ASSIGN( - DocumentId updated_document_id, - document_store->Put(DocumentProto(test_document1_))); - // We should get a different document id. - ASSERT_THAT(updated_document_id, Not(Eq(document_id))); - - // Usage scores should be cleared. - EXPECT_THAT(document_store->GetUsageScores(updated_document_id), - IsOkAndHolds(UsageStore::UsageScores())); -} - TEST_F(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) { ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, @@ -3344,7 +3088,9 @@ TEST_F(DocumentStoreTest, DetectPartialDataLoss) { const std::string serialized_document = document.SerializeAsString(); const std::string document_log_file = - absl_ports::StrCat(document_store_dir_, "/document_log"); + absl_ports::StrCat(document_store_dir_, "/", + DocumentLogCreator::GetDocumentLogFilename()) + .c_str(); int64_t file_size = filesystem_.GetFileSize(document_log_file.c_str()); filesystem_.PWrite(document_log_file.c_str(), file_size, serialized_document.data(), serialized_document.size()); @@ -3361,8 +3107,8 @@ TEST_F(DocumentStoreTest, DetectPartialDataLoss) { TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { int64_t corruptible_offset; - const std::string document_log_file = - absl_ports::StrCat(document_store_dir_, "/document_log"); + const std::string document_log_file = absl_ports::StrCat( + document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename()); { // Can put and delete fine. ICING_ASSERT_OK_AND_ASSIGN( @@ -3389,8 +3135,30 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { // "Corrupt" the persisted content written in the log. We can't recover if // the persisted data was corrupted. 
std::string corruption = "abc"; - filesystem_.PWrite(document_log_file.c_str(), /*offset=*/corruptible_offset, - corruption.data(), corruption.size()); + filesystem_.PWrite(document_log_file.c_str(), + /*offset=*/corruptible_offset, corruption.data(), + corruption.size()); + + { + // "Corrupt" the content written in the log. Make the corrupt document + // smaller than our original one so we don't accidentally write past our + // file. + DocumentProto document = + DocumentBuilder().SetKey("invalid_namespace", "invalid_uri").Build(); + std::string serialized_document = document.SerializeAsString(); + ASSERT_TRUE(filesystem_.PWrite( + document_log_file.c_str(), corruptible_offset, + serialized_document.data(), serialized_document.size())); + + PortableFileBackedProtoLog<DocumentWrapper>::Header header = + ReadDocumentLogHeader(filesystem_, document_log_file); + + // Set dirty bit to true to reflect that something changed in the log. + header.SetDirtyFlag(true); + header.SetHeaderChecksum(header.CalculateHeaderChecksum()); + + WriteDocumentLogHeader(filesystem_, document_log_file, header); + } // Successfully recover from a data loss issue. ICING_ASSERT_OK_AND_ASSIGN( @@ -3402,54 +3170,699 @@ TEST_F(DocumentStoreTest, DetectCompleteDataLoss) { ASSERT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE)); } +// TODO(b/185845269) Re-enable this test by copying over a full valid set of +// document store files. Right now this test only includes the score_cache and +// the document store header. +// +// This causes a problem now because this cl changes behavior to not consider an +// InitializeExistingDerivedFiles failure to be a recovery if there is nothing +// to recover because the doocument store is empty. 
+#define DISABLE_BACKWARDS_COMPAT_TEST +#ifndef DISABLE_BACKWARDS_COMPAT_TEST TEST_F(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) { - // The directory testdata/v0/document_store contains only the scoring_cache - // and the document_store_header (holding the crc for the scoring_cache). If - // the current code is compatible with the format of the v0 scoring_cache, - // then an empty document store should be initialized, but the non-empty - // scoring_cache should be retained. - // The current document-asscoiated-score-data has a new field with respect to - // the ones stored in testdata/v0, hence the document store's initialization - // requires regenerating its derived files. + // The directory testdata/score_cache_without_length_in_tokens/document_store + // contains only the scoring_cache and the document_store_header (holding the + // crc for the scoring_cache). If the current code is compatible with the + // format of the v0 scoring_cache, then an empty document store should be + // initialized, but the non-empty scoring_cache should be retained. The + // current document-asscoiated-score-data has a new field with respect to the + // ones stored in testdata/score_cache_Without_length_in_tokens, hence the + // document store's initialization requires regenerating its derived files. 
// Create dst directory ASSERT_THAT(filesystem_.CreateDirectory(document_store_dir_.c_str()), true); // Get src files - std::string document_store_v0; + std::string document_store_without_length_in_tokens; if (IsAndroidPlatform() || IsIosPlatform()) { - document_store_v0 = GetTestFilePath( - "icing/testdata/v0/document_store_android_ios_compatible"); + document_store_without_length_in_tokens = GetTestFilePath( + "icing/testdata/score_cache_without_length_in_tokens/" + "document_store_android_ios_compatible"); } else { - document_store_v0 = - GetTestFilePath("icing/testdata/v0/document_store"); + document_store_without_length_in_tokens = GetTestFilePath( + "icing/testdata/score_cache_without_length_in_tokens/" + "document_store"); } std::vector<std::string> document_store_files; Filesystem filesystem; - filesystem.ListDirectory(document_store_v0.c_str(), &document_store_files); + filesystem.ListDirectory(document_store_without_length_in_tokens.c_str(), + &document_store_files); - VLOG(1) << "Copying files " << document_store_v0 << ' ' - << document_store_files.size(); + ICING_LOG(INFO) << "Copying files " << document_store_without_length_in_tokens + << ' ' << document_store_files.size(); for (size_t i = 0; i != document_store_files.size(); i++) { - std::string src = - absl_ports::StrCat(document_store_v0, "/", document_store_files[i]); + std::string src = absl_ports::StrCat( + document_store_without_length_in_tokens, "/", document_store_files[i]); std::string dst = absl_ports::StrCat(document_store_dir_, "/", document_store_files[i]); ASSERT_THAT(filesystem_.CopyFile(src.c_str(), dst.c_str()), true); } - NativeInitializeStats initializeStats; + InitializeStatsProto initialize_stats; ICING_ASSERT_OK_AND_ASSIGN( DocumentStore::CreateResult create_result, DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, - schema_store_.get(), &initializeStats)); + schema_store_.get(), + /*force_recovery_and_revalidate_documents=*/false, + &initialize_stats)); 
std::unique_ptr<DocumentStore> doc_store = std::move(create_result.document_store); // The store_cache trigger regeneration because its element size is // inconsistent: expected 20 (current new size), actual 12 (as per the v0 // score_cache). - EXPECT_TRUE(initializeStats.has_document_store_recovery_cause()); + EXPECT_TRUE(initialize_stats.has_document_store_recovery_cause()); +} +#endif // DISABLE_BACKWARDS_COMPAT_TEST + +TEST_F(DocumentStoreTest, DocumentStoreStorageInfo) { + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store_.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + // Add three documents. + DocumentProto document1 = test_document1_; + document1.set_namespace_("namespace.1"); + document1.set_uri("uri1"); + ICING_ASSERT_OK(doc_store->Put(document1)); + + DocumentProto document2 = test_document1_; + document2.set_namespace_("namespace.1"); + document2.set_uri("uri2"); + document2.set_creation_timestamp_ms(fake_clock_.GetSystemTimeMilliseconds()); + document2.set_ttl_ms(100); + ICING_ASSERT_OK(doc_store->Put(document2)); + + DocumentProto document3 = test_document1_; + document3.set_namespace_("namespace.1"); + document3.set_uri("uri3"); + ICING_ASSERT_OK(doc_store->Put(document3)); + + DocumentProto document4 = test_document1_; + document4.set_namespace_("namespace.2"); + document4.set_uri("uri1"); + ICING_ASSERT_OK(doc_store->Put(document4)); + + // Report usage with type 1 on document1 + UsageReport usage_report_type1 = CreateUsageReport( + /*name_space=*/"namespace.1", /*uri=*/"uri1", /*timestamp_ms=*/1000, + UsageReport::USAGE_TYPE1); + ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type1)); + + // Report usage with type 2 on document2 + UsageReport usage_report_type2 = CreateUsageReport( + /*name_space=*/"namespace.1", /*uri=*/"uri2", /*timestamp_ms=*/1000, + UsageReport::USAGE_TYPE2); + 
ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type2)); + + // Report usage with type 3 on document3 + UsageReport usage_report_type3 = CreateUsageReport( + /*name_space=*/"namespace.1", /*uri=*/"uri3", /*timestamp_ms=*/1000, + UsageReport::USAGE_TYPE3); + ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type3)); + + // Report usage with type 1 on document4 + usage_report_type1 = CreateUsageReport( + /*name_space=*/"namespace.2", /*uri=*/"uri1", /*timestamp_ms=*/1000, + UsageReport::USAGE_TYPE1); + ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type1)); + + // Delete the first doc. + ICING_ASSERT_OK(doc_store->Delete(document1.namespace_(), document1.uri())); + + // Expire the second doc. + fake_clock_.SetSystemTimeMilliseconds(document2.creation_timestamp_ms() + + document2.ttl_ms() + 1); + + // Check high level info + DocumentStorageInfoProto storage_info = doc_store->GetStorageInfo(); + EXPECT_THAT(storage_info.num_alive_documents(), Eq(2)); + EXPECT_THAT(storage_info.num_deleted_documents(), Eq(1)); + EXPECT_THAT(storage_info.num_expired_documents(), Eq(1)); + EXPECT_THAT(storage_info.document_store_size(), Ge(0)); + EXPECT_THAT(storage_info.document_log_size(), Ge(0)); + EXPECT_THAT(storage_info.key_mapper_size(), Ge(0)); + EXPECT_THAT(storage_info.document_id_mapper_size(), Ge(0)); + EXPECT_THAT(storage_info.score_cache_size(), Ge(0)); + EXPECT_THAT(storage_info.filter_cache_size(), Ge(0)); + EXPECT_THAT(storage_info.corpus_mapper_size(), Ge(0)); + EXPECT_THAT(storage_info.corpus_score_cache_size(), Ge(0)); + EXPECT_THAT(storage_info.namespace_id_mapper_size(), Ge(0)); + EXPECT_THAT(storage_info.num_namespaces(), Eq(2)); + + // Check per-namespace info + EXPECT_THAT(storage_info.namespace_storage_info_size(), Eq(2)); + + NamespaceStorageInfoProto namespace_storage_info = + GetNamespaceStorageInfo(storage_info, "namespace.1"); + EXPECT_THAT(namespace_storage_info.num_alive_documents(), Eq(1)); + 
EXPECT_THAT(namespace_storage_info.num_expired_documents(), Eq(1)); + EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type1(), Eq(0)); + EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type2(), Eq(0)); + EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type3(), Eq(1)); + EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type1(), + Eq(0)); + EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type2(), + Eq(1)); + EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type3(), + Eq(0)); + + namespace_storage_info = GetNamespaceStorageInfo(storage_info, "namespace.2"); + EXPECT_THAT(namespace_storage_info.num_alive_documents(), Eq(1)); + EXPECT_THAT(namespace_storage_info.num_expired_documents(), Eq(0)); + EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type1(), Eq(1)); + EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type2(), Eq(0)); + EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type3(), Eq(0)); + EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type1(), + Eq(0)); + EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type2(), + Eq(0)); + EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type3(), + Eq(0)); +} + +TEST_F(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) { + // Start fresh and set the schema with one type. 
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()); + + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build(); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + ASSERT_THAT(schema_store->SetSchema(schema), IsOk()); + // The typeid for "email" should be 0. + ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0)); + + DocumentId docid = kInvalidDocumentId; + { + // Create the document store the first time and add an email document. 
+ ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + DocumentProto doc = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .AddStringProperty("body", "body bar") + .SetScore(document1_score_) + .SetCreationTimestampMs( + document1_creation_timestamp_) // A random timestamp + .SetTtlMs(document1_ttl_) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(doc)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data, + doc_store->GetDocumentFilterData(docid)); + + ASSERT_THAT(filter_data.schema_type_id(), Eq(0)); + } + + // Add another type to the schema before the email type. + schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("alarm") + .AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("time") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(email_type_config) + .Build(); + ASSERT_THAT(schema_store->SetSchema(schema), IsOk()); + // Adding a new type should cause ids to be reassigned. Ids are assigned in + // order of appearance so 'alarm' should be 0 and 'email' should be 1. 
+ ASSERT_THAT(schema_store->GetSchemaTypeId("alarm"), IsOkAndHolds(0)); + ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(1)); + + { + // Create the document store the second time and force recovery + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true)); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + // Ensure that the type id of the email document has been correctly updated. + ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data, + doc_store->GetDocumentFilterData(docid)); + ASSERT_THAT(filter_data.schema_type_id(), Eq(1)); + } +} + +TEST_F(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) { + // Start fresh and set the schema with one type. + filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()); + + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build(); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + ASSERT_THAT(schema_store->SetSchema(schema), IsOk()); + // The typeid for "email" should be 0. 
+ ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0)); + + DocumentId docid = kInvalidDocumentId; + { + // Create the document store the first time and add an email document. + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + DocumentProto doc = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .AddStringProperty("body", "body bar") + .SetScore(document1_score_) + .SetCreationTimestampMs( + document1_creation_timestamp_) // A random timestamp + .SetTtlMs(document1_ttl_) + .Build(); + ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(doc)); + ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data, + doc_store->GetDocumentFilterData(docid)); + + ASSERT_THAT(filter_data.schema_type_id(), Eq(0)); + } + + // Add another type to the schema. + schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("alarm") + .AddProperty( + PropertyConfigBuilder() + .SetName("name") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("time") + .SetDataType(TYPE_INT) + .SetCardinality(CARDINALITY_OPTIONAL))) + .AddType(email_type_config) + .Build(); + ASSERT_THAT(schema_store->SetSchema(schema), IsOk()); + // Adding a new type should cause ids to be reassigned. Ids are assigned in + // order of appearance so 'alarm' should be 0 and 'email' should be 1. + ASSERT_THAT(schema_store->GetSchemaTypeId("alarm"), IsOkAndHolds(0)); + ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(1)); + + { + // Create the document store the second time. Don't force recovery. 
+ ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false)); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + // Check that the type id of the email document has not been updated. + ICING_ASSERT_OK_AND_ASSIGN(DocumentFilterData filter_data, + doc_store->GetDocumentFilterData(docid)); + ASSERT_THAT(filter_data.schema_type_id(), Eq(0)); + } +} + +TEST_F(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) { + // Start fresh and set the schema with one type. + filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()); + + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build(); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + ASSERT_THAT(schema_store->SetSchema(schema), IsOk()); + + DocumentProto docWithBody = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .AddStringProperty("body", "body bar") + .SetScore(document1_score_) + .SetCreationTimestampMs( + document1_creation_timestamp_) // A random timestamp + .SetTtlMs(document1_ttl_) + .Build(); + DocumentProto 
docWithoutBody = + DocumentBuilder() + .SetKey("icing", "email/2") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .SetScore(document1_score_) + .SetCreationTimestampMs( + document1_creation_timestamp_) // A random timestamp + .SetTtlMs(document1_ttl_) + .Build(); + + { + // Create the document store the first time and add two email documents: one + // that has the 'body' section and one that doesn't. + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + DocumentId docid = kInvalidDocumentId; + ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithBody)); + ASSERT_NE(docid, kInvalidDocumentId); + docid = kInvalidDocumentId; + ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithoutBody)); + ASSERT_NE(docid, kInvalidDocumentId); + + ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()), + IsOkAndHolds(EqualsProto(docWithBody))); + ASSERT_THAT( + doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()), + IsOkAndHolds(EqualsProto(docWithoutBody))); + } + + // Delete the 'body' property from the 'email' type, making all pre-existing + // documents with the 'body' property invalid. 
+ email_type_config = + SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + schema = SchemaBuilder().AddType(email_type_config).Build(); + ASSERT_THAT(schema_store->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/true), + IsOk()); + + { + // Create the document store the second time and force recovery + CorruptDocStoreHeaderChecksumFile(); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/true)); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + ASSERT_THAT( + doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()), + IsOkAndHolds(EqualsProto(docWithoutBody))); + } +} + +TEST_F(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) { + // Start fresh and set the schema with one type. 
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(test_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()); + + SchemaTypeConfigProto email_type_config = + SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build(); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_)); + ASSERT_THAT(schema_store->SetSchema(schema), IsOk()); + + DocumentProto docWithBody = + DocumentBuilder() + .SetKey("icing", "email/1") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .AddStringProperty("body", "body bar") + .SetScore(document1_score_) + .SetCreationTimestampMs( + document1_creation_timestamp_) // A random timestamp + .SetTtlMs(document1_ttl_) + .Build(); + DocumentProto docWithoutBody = + DocumentBuilder() + .SetKey("icing", "email/2") + .SetSchema("email") + .AddStringProperty("subject", "subject foo") + .SetScore(document1_score_) + .SetCreationTimestampMs( + document1_creation_timestamp_) // A random timestamp + .SetTtlMs(document1_ttl_) + .Build(); + + { + // Create the document store the first time and add two email documents: one + // that has the 'body' section and one that doesn't. 
+ ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_, + schema_store.get())); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + DocumentId docid = kInvalidDocumentId; + ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithBody)); + ASSERT_NE(docid, kInvalidDocumentId); + docid = kInvalidDocumentId; + ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithoutBody)); + ASSERT_NE(docid, kInvalidDocumentId); + + ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()), + IsOkAndHolds(EqualsProto(docWithBody))); + ASSERT_THAT( + doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()), + IsOkAndHolds(EqualsProto(docWithoutBody))); + } + + // Delete the 'body' property from the 'email' type, making all pre-existing + // documents with the 'body' property invalid. + email_type_config = + SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty(PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .Build(); + schema = SchemaBuilder().AddType(email_type_config).Build(); + ASSERT_THAT(schema_store->SetSchema( + schema, /*ignore_errors_and_delete_documents=*/true), + IsOk()); + + { + // Corrupt the document store header checksum so that we will perform + // recovery, but without revalidation. 
+ CorruptDocStoreHeaderChecksumFile(); + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create( + &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false)); + std::unique_ptr<DocumentStore> doc_store = + std::move(create_result.document_store); + + ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()), + IsOkAndHolds(EqualsProto(docWithBody))); + ASSERT_THAT( + doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()), + IsOkAndHolds(EqualsProto(docWithoutBody))); + } +} + +#ifndef DISABLE_BACKWARDS_COMPAT_TEST +TEST_F(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) { + // Set up schema. + SchemaProto schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType("email") + .AddProperty( + PropertyConfigBuilder() + .SetName("subject") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty( + PropertyConfigBuilder() + .SetName("body") + .SetDataTypeString(MATCH_EXACT, TOKENIZER_PLAIN) + .SetCardinality(CARDINALITY_OPTIONAL))) + .Build(); + + std::string schema_store_dir = schema_store_dir_ + "_migrate"; + filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str()); + filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str()); + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<SchemaStore> schema_store, + SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_)); + + ASSERT_THAT(schema_store->SetSchema(schema), IsOk()); + + // Create dst directory that we'll initialize the DocumentStore over. 
+ std::string document_store_dir = document_store_dir_ + "_migrate"; + ASSERT_THAT( + filesystem_.DeleteDirectoryRecursively(document_store_dir.c_str()), true); + ASSERT_THAT( + filesystem_.CreateDirectoryRecursively(document_store_dir.c_str()), true); + + // Copy the testdata files into our DocumentStore directory + std::string document_store_without_portable_log; + if (IsAndroidX86()) { + document_store_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_android_x86/document_dir"); + } else if (IsAndroidArm()) { + document_store_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_android_arm/document_dir"); + } else if (IsIosPlatform()) { + document_store_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_ios/document_dir"); + } else { + document_store_without_portable_log = GetTestFilePath( + "icing/testdata/not_portable_log/" + "icing_search_engine_linux/document_dir"); + } + + ASSERT_TRUE(filesystem_.CopyDirectory( + document_store_without_portable_log.c_str(), document_store_dir.c_str(), + /*recursive=*/true)); + + // Initialize the DocumentStore over our copied files. + InitializeStatsProto initialize_stats; + ICING_ASSERT_OK_AND_ASSIGN( + DocumentStore::CreateResult create_result, + DocumentStore::Create(&filesystem_, document_store_dir, &fake_clock_, + schema_store.get(), + /*force_recovery_and_revalidate_documents=*/false, + &initialize_stats)); + std::unique_ptr<DocumentStore> document_store = + std::move(create_result.document_store); + + // These are the documents that are stored in the testdata files. Do not + // change unless you're also updating the testdata files. 
+ DocumentProto document1 = DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "foo") + .AddStringProperty("body", "bar") + .Build(); + + DocumentProto document2 = DocumentBuilder() + .SetKey("namespace1", "uri2") + .SetSchema("email") + .SetCreationTimestampMs(20) + .SetScore(321) + .AddStringProperty("body", "baz bat") + .Build(); + + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace2", "uri1") + .SetSchema("email") + .SetCreationTimestampMs(30) + .SetScore(123) + .AddStringProperty("subject", "phoo") + .Build(); + + // Check that we didn't lose anything. A migration also doesn't technically + // count as a recovery. + EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE)); + EXPECT_FALSE(initialize_stats.has_document_store_recovery_cause()); + + // Document 1 and 3 were put normally, and document 2 was deleted in our + // testdata files. + // + // Check by namespace, uri + EXPECT_THAT(document_store->Get(document1.namespace_(), document1.uri()), + IsOkAndHolds(EqualsProto(document1))); + EXPECT_THAT(document_store->Get(document2.namespace_(), document2.uri()), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(document_store->Get(document3.namespace_(), document3.uri()), + IsOkAndHolds(EqualsProto(document3))); + + // Check by document_id + EXPECT_THAT(document_store->Get(/*document_id=*/0), + IsOkAndHolds(EqualsProto(document1))); + EXPECT_THAT(document_store->Get(/*document_id=*/1), + StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); + EXPECT_THAT(document_store->Get(/*document_id=*/2), + IsOkAndHolds(EqualsProto(document3))); } +#endif // DISABLE_BACKWARDS_COMPAT_TEST } // namespace diff --git a/icing/store/usage-store.cc b/icing/store/usage-store.cc index 54896dc..546067d 100644 --- a/icing/store/usage-store.cc +++ b/icing/store/usage-store.cc @@ -74,6 +74,9 @@ libtextclassifier3::Status UsageStore::AddUsageReport(const UsageReport& 
report, "Document id %d is invalid.", document_id)); } + // We don't need a copy here because we'll set the value at the same index. + // This won't unintentionally grow the underlying file since we already have + // enough space for the current index. auto usage_scores_or = usage_score_cache_->Get(document_id); // OutOfRange means that the mapper hasn't seen this document id before, it's @@ -159,7 +162,7 @@ UsageStore::GetUsageScores(DocumentId document_id) { "Document id %d is invalid.", document_id)); } - auto usage_scores_or = usage_score_cache_->Get(document_id); + auto usage_scores_or = usage_score_cache_->GetCopy(document_id); if (absl_ports::IsOutOfRange(usage_scores_or.status())) { // No usage scores found. Return the default scores. return UsageScores(); @@ -168,7 +171,7 @@ UsageStore::GetUsageScores(DocumentId document_id) { return usage_scores_or.status(); } - return *std::move(usage_scores_or).ValueOrDie(); + return std::move(usage_scores_or).ValueOrDie(); } libtextclassifier3::Status UsageStore::SetUsageScores( @@ -193,10 +196,10 @@ libtextclassifier3::Status UsageStore::CloneUsageScores( "to_document_id %d is invalid.", to_document_id)); } - auto usage_scores_or = usage_score_cache_->Get(from_document_id); + auto usage_scores_or = usage_score_cache_->GetCopy(from_document_id); if (usage_scores_or.ok()) { return usage_score_cache_->Set(to_document_id, - *std::move(usage_scores_or).ValueOrDie()); + std::move(usage_scores_or).ValueOrDie()); } else if (absl_ports::IsOutOfRange(usage_scores_or.status())) { // No usage scores found. Set default scores to to_document_id. 
return usage_score_cache_->Set(to_document_id, UsageScores()); @@ -218,6 +221,10 @@ libtextclassifier3::StatusOr<int64_t> UsageStore::GetElementsFileSize() const { return usage_score_cache_->GetElementsFileSize(); } +libtextclassifier3::StatusOr<int64_t> UsageStore::GetDiskUsage() const { + return usage_score_cache_->GetDiskUsage(); +} + libtextclassifier3::Status UsageStore::TruncateTo(DocumentId num_documents) { if (num_documents >= usage_score_cache_->num_elements()) { // No need to truncate diff --git a/icing/store/usage-store.h b/icing/store/usage-store.h index b7de970..fd77df4 100644 --- a/icing/store/usage-store.h +++ b/icing/store/usage-store.h @@ -157,6 +157,14 @@ class UsageStore { // INTERNAL_ERROR on IO error libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const; + // Calculates and returns the disk usage in bytes. Rounds up to the nearest + // block size. + // + // Returns: + // Disk usage on success + // INTERNAL_ERROR on IO error + libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const; + // Resizes the storage so that only the usage scores of and before // last_document_id are stored. // diff --git a/icing/store/usage-store_test.cc b/icing/store/usage-store_test.cc index 220c226..b2dbe4b 100644 --- a/icing/store/usage-store_test.cc +++ b/icing/store/usage-store_test.cc @@ -577,6 +577,41 @@ TEST_F(UsageStoreTest, GetElementsFileSize) { IsOkAndHolds(Gt(empty_file_size))); } +TEST_F(UsageStoreTest, GetDiskUsageEmpty) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + // There's some internal metadata, so our disk usage will round up to 1 block. 
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t empty_disk_usage, + usage_store->GetDiskUsage()); + EXPECT_THAT(empty_disk_usage, Gt(0)); +} + +TEST_F(UsageStoreTest, GetDiskUsageNonEmpty) { + ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store, + UsageStore::Create(&filesystem_, test_dir_)); + + // There's some internal metadata, so our disk usage will round up to 1 block. + ICING_ASSERT_OK_AND_ASSIGN(int64_t empty_disk_usage, + usage_store->GetDiskUsage()); + + // Since our GetDiskUsage can only get sizes in increments of block_size, we + // need to insert enough usage reports so the disk usage will increase by at + // least 1 block size. The number 200 is a bit arbitrary, gotten from manually + // testing. + UsageReport usage_report = CreateUsageReport( + "namespace", "uri", /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1); + for (int i = 0; i < 200; ++i) { + usage_store->AddUsageReport(usage_report, /*document_id=*/i); + } + + // We need to persist since iOS won't see the new disk allocations until after + // everything gets written. 
+ usage_store->PersistToDisk(); + + EXPECT_THAT(usage_store->GetDiskUsage(), IsOkAndHolds(Gt(empty_disk_usage))); +} + } // namespace } // namespace lib diff --git a/icing/testing/common-matchers.h b/icing/testing/common-matchers.h index b7f54ba..f83fe0a 100644 --- a/icing/testing/common-matchers.h +++ b/icing/testing/common-matchers.h @@ -25,7 +25,6 @@ #include "icing/absl_ports/str_join.h" #include "icing/index/hit/doc-hit-info.h" #include "icing/legacy/core/icing-string-util.h" -#include "icing/proto/search.proto.h" #include "icing/proto/search.pb.h" #include "icing/schema/schema-store.h" #include "icing/schema/section.h" @@ -122,7 +121,6 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { const SchemaStore::SetSchemaResult& actual = arg; if (actual.success == expected.success && - actual.index_incompatible == expected.index_incompatible && actual.old_schema_type_ids_changed == expected.old_schema_type_ids_changed && actual.schema_types_deleted_by_name == @@ -132,7 +130,12 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { actual.schema_types_incompatible_by_name == expected.schema_types_incompatible_by_name && actual.schema_types_incompatible_by_id == - expected.schema_types_incompatible_by_id) { + expected.schema_types_incompatible_by_id && + actual.schema_types_new_by_name == expected.schema_types_new_by_name && + actual.schema_types_changed_fully_compatible_by_name == + expected.schema_types_changed_fully_compatible_by_name && + actual.schema_types_index_incompatible_by_name == + expected.schema_types_index_incompatible_by_name) { return true; } @@ -192,37 +195,82 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") { absl_ports::NumberFormatter()), "]"); + // Format schema_types_new_by_name + std::string actual_schema_types_new_by_name = absl_ports::StrCat( + "[", absl_ports::StrJoin(actual.schema_types_new_by_name, ","), "]"); + + std::string expected_schema_types_new_by_name = absl_ports::StrCat( + "[", absl_ports::StrJoin(expected.schema_types_new_by_name, 
","), "]"); + + // Format schema_types_changed_fully_compatible_by_name + std::string actual_schema_types_changed_fully_compatible_by_name = + absl_ports::StrCat( + "[", + absl_ports::StrJoin( + actual.schema_types_changed_fully_compatible_by_name, ","), + "]"); + + std::string expected_schema_types_changed_fully_compatible_by_name = + absl_ports::StrCat( + "[", + absl_ports::StrJoin( + expected.schema_types_changed_fully_compatible_by_name, ","), + "]"); + + // Format schema_types_deleted_by_id + std::string actual_schema_types_index_incompatible_by_name = + absl_ports::StrCat( + "[", + absl_ports::StrJoin(actual.schema_types_index_incompatible_by_name, + ","), + "]"); + + std::string expected_schema_types_index_incompatible_by_name = + absl_ports::StrCat( + "[", + absl_ports::StrJoin(expected.schema_types_index_incompatible_by_name, + ","), + "]"); + *result_listener << IcingStringUtil::StringPrintf( "\nExpected {\n" "\tsuccess=%d,\n" - "\tindex_incompatible=%d,\n" "\told_schema_type_ids_changed=%s,\n" "\tschema_types_deleted_by_name=%s,\n" "\tschema_types_deleted_by_id=%s,\n" "\tschema_types_incompatible_by_name=%s,\n" "\tschema_types_incompatible_by_id=%s\n" + "\tschema_types_new_by_name=%s,\n" + "\tschema_types_index_incompatible_by_name=%s,\n" + "\tschema_types_changed_fully_compatible_by_name=%s\n" "}\n" "Actual {\n" "\tsuccess=%d,\n" - "\tindex_incompatible=%d,\n" "\told_schema_type_ids_changed=%s,\n" "\tschema_types_deleted_by_name=%s,\n" "\tschema_types_deleted_by_id=%s,\n" "\tschema_types_incompatible_by_name=%s,\n" "\tschema_types_incompatible_by_id=%s\n" + "\tschema_types_new_by_name=%s,\n" + "\tschema_types_index_incompatible_by_name=%s,\n" + "\tschema_types_changed_fully_compatible_by_name=%s\n" "}\n", - expected.success, expected.index_incompatible, - expected_old_schema_type_ids_changed.c_str(), + expected.success, expected_old_schema_type_ids_changed.c_str(), expected_schema_types_deleted_by_name.c_str(), 
expected_schema_types_deleted_by_id.c_str(), expected_schema_types_incompatible_by_name.c_str(), - expected_schema_types_incompatible_by_id.c_str(), actual.success, - actual.index_incompatible, actual_old_schema_type_ids_changed.c_str(), + expected_schema_types_incompatible_by_id.c_str(), + expected_schema_types_new_by_name.c_str(), + expected_schema_types_changed_fully_compatible_by_name.c_str(), + expected_schema_types_index_incompatible_by_name.c_str(), actual.success, + actual_old_schema_type_ids_changed.c_str(), actual_schema_types_deleted_by_name.c_str(), actual_schema_types_deleted_by_id.c_str(), actual_schema_types_incompatible_by_name.c_str(), - actual_schema_types_incompatible_by_id.c_str()); - + actual_schema_types_incompatible_by_id.c_str(), + actual_schema_types_new_by_name.c_str(), + actual_schema_types_changed_fully_compatible_by_name.c_str(), + actual_schema_types_index_incompatible_by_name.c_str()); return false; } @@ -267,7 +315,7 @@ std::string StatusCodeToString(libtextclassifier3::StatusCode code) { } } -string ProtoStatusCodeToString(StatusProto::Code code) { +std::string ProtoStatusCodeToString(StatusProto::Code code) { switch (code) { case StatusProto::OK: return "OK"; @@ -376,14 +424,22 @@ MATCHER_P2(ProtoStatusIs, status_code, error_matcher, "") { return ExplainMatchResult(error_matcher, arg.message(), result_listener); } -MATCHER_P(EqualsSearchResultIgnoreStats, expected, "") { +MATCHER_P(EqualsSearchResultIgnoreStatsAndScores, expected, "") { SearchResultProto actual_copy = arg; actual_copy.clear_query_stats(); actual_copy.clear_debug_info(); + for (SearchResultProto::ResultProto& result : + *actual_copy.mutable_results()) { + result.clear_score(); + } SearchResultProto expected_copy = expected; expected_copy.clear_query_stats(); expected_copy.clear_debug_info(); + for (SearchResultProto::ResultProto& result : + *expected_copy.mutable_results()) { + result.clear_score(); + } return ExplainMatchResult(testing::EqualsProto(expected_copy), 
actual_copy, result_listener); } diff --git a/icing/testing/jni-test-helpers.h b/icing/testing/jni-test-helpers.h index adc469a..67a98c3 100644 --- a/icing/testing/jni-test-helpers.h +++ b/icing/testing/jni-test-helpers.h @@ -15,6 +15,8 @@ #ifndef ICING_TESTING_JNI_TEST_HELPERS_H_ #define ICING_TESTING_JNI_TEST_HELPERS_H_ +#include <memory> + #include "icing/jni/jni-cache.h" #ifdef ICING_REVERSE_JNI_SEGMENTATION diff --git a/icing/testing/schema-generator.h b/icing/testing/schema-generator.h index 78430cc..12133f5 100644 --- a/icing/testing/schema-generator.h +++ b/icing/testing/schema-generator.h @@ -18,7 +18,6 @@ #include <random> #include <string> -#include "icing/proto/schema.proto.h" #include "icing/proto/schema.pb.h" namespace icing { diff --git a/icing/testing/snippet-helpers.cc b/icing/testing/snippet-helpers.cc index fde0004..7a71987 100644 --- a/icing/testing/snippet-helpers.cc +++ b/icing/testing/snippet-helpers.cc @@ -17,28 +17,37 @@ #include <algorithm> #include <string_view> +#include "icing/absl_ports/str_join.h" #include "icing/proto/search.pb.h" +#include "icing/schema/section-manager.h" namespace icing { namespace lib { -const SnippetMatchProto* GetSnippetMatch(const SnippetProto& snippet_proto, - const std::string& property_name, - int snippet_index) { - auto iterator = std::find_if( - snippet_proto.entries().begin(), snippet_proto.entries().end(), - [&property_name](const SnippetProto::EntryProto& entry) { - return entry.property_name() == property_name; - }); - if (iterator == snippet_proto.entries().end() || - iterator->snippet_matches_size() <= snippet_index) { - return nullptr; +namespace { + +// Returns the property index and the property name with the index removed. 
+// Examples: +// GetPropertyIndex("foo") will return ["foo", 0] +// GetPropertyIndex("foo[5]") will return ["foo", 5] +std::pair<std::string_view, int> GetPropertyIndex(std::string_view property) { + size_t l_bracket = property.find(kLBracket); + if (l_bracket == std::string_view::npos || l_bracket >= property.length()) { + return {property, 0}; + } + size_t r_bracket = property.find(kRBracket, l_bracket); + if (r_bracket == std::string_view::npos || r_bracket - l_bracket < 2) { + return {property, 0}; } - return &iterator->snippet_matches(snippet_index); + std::string index_string = + std::string(property.substr(l_bracket + 1, r_bracket - l_bracket - 1)); + return {property.substr(0, l_bracket), std::stoi(index_string)}; } +} // namespace + const PropertyProto* GetProperty(const DocumentProto& document, - const std::string& property_name) { + std::string_view property_name) { const PropertyProto* property = nullptr; for (const PropertyProto& prop : document.properties()) { if (prop.name() == property_name) { @@ -48,32 +57,65 @@ const PropertyProto* GetProperty(const DocumentProto& document, return property; } -std::string GetWindow(const DocumentProto& document, - const SnippetProto& snippet_proto, - const std::string& property_name, int snippet_index) { - const SnippetMatchProto* match = - GetSnippetMatch(snippet_proto, property_name, snippet_index); - const PropertyProto* property = GetProperty(document, property_name); - if (match == nullptr || property == nullptr) { - return ""; +std::vector<std::string_view> GetWindows( + std::string_view content, const SnippetProto::EntryProto& snippet_proto) { + std::vector<std::string_view> windows; + for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) { + windows.push_back(content.substr(match.window_byte_position(), + match.window_byte_length())); + } + return windows; +} + +std::vector<std::string_view> GetMatches( + std::string_view content, const SnippetProto::EntryProto& snippet_proto) { + 
std::vector<std::string_view> matches; + for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) { + matches.push_back(content.substr(match.exact_match_byte_position(), + match.exact_match_byte_length())); } - std::string_view value = property->string_values(match->values_index()); - return std::string( - value.substr(match->window_position(), match->window_bytes())); + return matches; } -std::string GetMatch(const DocumentProto& document, - const SnippetProto& snippet_proto, - const std::string& property_name, int snippet_index) { - const SnippetMatchProto* match = - GetSnippetMatch(snippet_proto, property_name, snippet_index); - const PropertyProto* property = GetProperty(document, property_name); - if (match == nullptr || property == nullptr) { - return ""; +std::vector<std::string_view> GetSubMatches( + std::string_view content, const SnippetProto::EntryProto& snippet_proto) { + std::vector<std::string_view> matches; + for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) { + matches.push_back(content.substr(match.exact_match_byte_position(), + match.submatch_byte_length())); + } + return matches; +} + +std::string_view GetString(const DocumentProto* document, + std::string_view property_path) { + std::vector<std::string_view> properties = + absl_ports::StrSplit(property_path, kPropertySeparator); + for (int i = 0; i < properties.size(); ++i) { + std::string_view property = properties.at(i); + int property_index; + std::tie(property, property_index) = GetPropertyIndex(property); + const PropertyProto* prop = GetProperty(*document, property); + if (prop == nullptr) { + // requested property doesn't exist in the document. Return empty string. + return ""; + } + if (i == properties.size() - 1) { + // The last property. Get the string_value + if (prop->string_values_size() - 1 < property_index) { + // The requested string doesn't exist. Return empty string. 
+ return ""; + } + return prop->string_values(property_index); + } else if (prop->document_values_size() - 1 < property_index) { + // The requested subproperty doesn't exist. return an empty string. + return ""; + } else { + // Go to the next subproperty. + document = &prop->document_values(property_index); + } } - std::string_view value = property->string_values(match->values_index()); - return std::string( - value.substr(match->exact_match_position(), match->exact_match_bytes())); + return ""; } } // namespace lib diff --git a/icing/testing/snippet-helpers.h b/icing/testing/snippet-helpers.h index 124e421..73b2ce2 100644 --- a/icing/testing/snippet-helpers.h +++ b/icing/testing/snippet-helpers.h @@ -23,36 +23,36 @@ namespace icing { namespace lib { -// Retrieve pointer to the snippet_index'th SnippetMatchProto within the -// EntryProto identified by property_name within snippet_proto. -// Returns nullptr -// - if there is no EntryProto within snippet_proto corresponding to -// property_name. -// - if there is no SnippetMatchProto at snippet_index within the EntryProto -const SnippetMatchProto* GetSnippetMatch(const SnippetProto& snippet_proto, - const std::string& property_name, - int snippet_index); - // Retrieve pointer to the PropertyProto identified by property_name. // Returns nullptr if no such property exists. +// +// NOTE: This function does not handle nesting or indexes. "foo.bar" will return +// a nullptr even if document contains a property called "foo" that contains a +// subproperty called "bar". const PropertyProto* GetProperty(const DocumentProto& document, const std::string& property_name); -// Retrieves the window defined by the SnippetMatchProto returned by -// GetSnippetMatch(snippet_proto, property_name, snippet_index) for the property -// returned by GetProperty(document, property_name). -// Returns "" if no such property, snippet or window exists. 
-std::string GetWindow(const DocumentProto& document, - const SnippetProto& snippet_proto, - const std::string& property_name, int snippet_index); - -// Retrieves the match defined by the SnippetMatchProto returned by -// GetSnippetMatch(snippet_proto, property_name, snippet_index) for the property -// returned by GetProperty(document, property_name). -// Returns "" if no such property or snippet exists. -std::string GetMatch(const DocumentProto& document, - const SnippetProto& snippet_proto, - const std::string& property_name, int snippet_index); +// Retrieves all windows defined by the snippet_proto for the content. +std::vector<std::string_view> GetWindows( + std::string_view content, const SnippetProto::EntryProto& snippet_proto); + +// Retrieves all matches defined by the snippet_proto for the content. +std::vector<std::string_view> GetMatches( + std::string_view content, const SnippetProto::EntryProto& snippet_proto); + +// Retrieves all submatches defined by the snippet_proto for the content. +std::vector<std::string_view> GetSubMatches( + std::string_view content, const SnippetProto::EntryProto& snippet_proto); + +// Retrieves the string value held in the document corresponding to the +// property_path. +// Example: +// - GetString(doc, "foo") will retrieve the first string value in the +// property "foo" in document or an empty string if it doesn't exist. +// - GetString(doc, "foo[1].bar[2]") will retrieve the third string value in +// the subproperty "bar" of the second document value in the property "foo". 
+std::string_view GetString(const DocumentProto* document, + std::string_view property_path); } // namespace lib } // namespace icing diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc index 74d22cd..cb31441 100644 --- a/icing/tokenization/icu/icu-language-segmenter.cc +++ b/icing/tokenization/icu/icu-language-segmenter.cc @@ -25,6 +25,7 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/legacy/core/icing-string-util.h" +#include "icing/util/character-iterator.h" #include "icing/util/i18n-utils.h" #include "icing/util/status-macros.h" #include "unicode/ubrk.h" @@ -101,59 +102,149 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return text_.substr(term_start_index_, term_length); } - libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( + libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart() + override { + if (!offset_iterator_.MoveToUtf8(term_start_index_)) { + return absl_ports::AbortedError( + "Could not retrieve valid utf8 character!"); + } + return offset_iterator_; + } + + libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive() + override { + if (!offset_iterator_.MoveToUtf8(term_end_index_exclusive_)) { + return absl_ports::AbortedError( + "Could not retrieve valid utf8 character!"); + } + return offset_iterator_; + } + + libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32( int32_t offset) override { - if (offset < 0 || offset >= text_.length()) { + if (offset < 0) { + // Very simple. The first term start after a negative offset is the first + // term. So just reset to start and Advance. + return ResetToStartUtf32(); + } + + // 1. Find the unicode character that contains the byte at offset. + if (!offset_iterator_.MoveToUtf32(offset)) { + // An error occurred. 
Mark as DONE + if (offset_iterator_.utf8_index() != text_.length()) { + // We returned false for some reason other than hitting the end. This is + // a real error. Just return. + MarkAsDone(); + return absl_ports::AbortedError( + "Could not retrieve valid utf8 character!"); + } + } + if (offset_iterator_.utf8_index() == text_.length()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Illegal offset provided! Offset %d is not within bounds of string " - "of length %zu", - offset, text_.length())); + "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within " + "bounds of string of length %zu", + offset_iterator_.utf32_index(), offset_iterator_.utf8_index(), + text_.length())); } - term_start_index_ = ubrk_following(break_iterator_, offset); - if (term_start_index_ == UBRK_DONE) { + + // 2. We've got the unicode character containing byte offset. Now, we need + // to point to the segment that starts after this character. + int following_utf8_index = + ubrk_following(break_iterator_, offset_iterator_.utf8_index()); + if (following_utf8_index == UBRK_DONE) { MarkAsDone(); return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( "No segments begin after provided offset %d.", offset)); } - term_end_index_exclusive_ = ubrk_next(break_iterator_); - if (term_end_index_exclusive_ == UBRK_DONE) { - MarkAsDone(); + term_end_index_exclusive_ = following_utf8_index; + + // 3. The term_end_exclusive_ points to the start of the term that we want + // to return. We need to Advance so that term_start_ will now point to this + // term. 
+ if (!Advance()) { return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( "No segments begin after provided offset %d.", offset)); } - if (!IsValidSegment()) { - if (!Advance()) { - return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( - "No segments begin after provided offset %d.", offset)); - } + if (!offset_iterator_.MoveToUtf8(term_start_index_)) { + return absl_ports::AbortedError( + "Could not retrieve valid utf8 character!"); } - return term_start_index_; + return offset_iterator_.utf32_index(); } - libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( + libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32( int32_t offset) override { - if (offset < 0 || offset >= text_.length()) { + if (offset < 0) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Illegal offset provided! Offset %d is not within bounds of string " "of length %zu", offset, text_.length())); } - ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(offset)); - if (term_end_index_exclusive_ > offset) { - // This term ends after offset. So we need to get the term just before - // this one. - ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(term_start_index_)); + + if (!offset_iterator_.MoveToUtf32(offset)) { + // An error occurred. Mark as DONE + if (offset_iterator_.utf8_index() != text_.length()) { + // We returned false for some reason other than hitting the end. This is + // a real error. Just return. + MarkAsDone(); + return absl_ports::AbortedError( + "Could not retrieve valid utf8 character!"); + } + // If it returned false because we hit the end. Then that's fine. We'll + // just treat it as if the request was for the end. + } + + // 2. We've got the unicode character containing byte offset. Now, we need + // to point to the segment that ends before this character. + int starting_utf8_index = + ubrk_preceding(break_iterator_, offset_iterator_.utf8_index()); + if (starting_utf8_index == UBRK_DONE) { + // Rewind the end indices. 
+ MarkAsDone(); + return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( + "No segments end before provided offset %d.", offset)); } - return term_start_index_; + term_start_index_ = starting_utf8_index; + + // 3. We've correctly set the start index and the iterator currently points + // to that position. Now we need to find the correct end position and + // advance the iterator to that position. + int ending_utf8_index = ubrk_next(break_iterator_); + if (ending_utf8_index == UBRK_DONE) { + // This shouldn't ever happen. + MarkAsDone(); + return absl_ports::AbortedError(IcingStringUtil::StringPrintf( + "No segments end before provided offset %d.", offset)); + } + term_end_index_exclusive_ = ending_utf8_index; + + // 4. The start and end indices point to a segment, but we need to ensure + // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll + // need a segment prior to this one. + CharacterIterator term_start_iterator = offset_iterator_; + if (!term_start_iterator.MoveToUtf8(term_start_index_)) { + return absl_ports::AbortedError( + "Could not retrieve valid utf8 character!"); + } + if (term_end_index_exclusive_ > offset_iterator_.utf8_index() || + !IsValidSegment()) { + return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index()); + } + return term_start_iterator.utf32_index(); } - libtextclassifier3::StatusOr<int32_t> ResetToStart() override { + libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override { term_start_index_ = 0; term_end_index_exclusive_ = 0; if (!Advance()) { - return absl_ports::NotFoundError(""); + return absl_ports::NotFoundError( + "Unable to find any valid terms in text."); + } + if (!offset_iterator_.MoveToUtf8(term_start_index_)) { + return absl_ports::AbortedError( + "Could not retrieve valid utf8 character!"); } - return term_start_index_; + return offset_iterator_.utf32_index(); } private: @@ -163,6 +254,7 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { 
text_(text), locale_(locale), u_text_(UTEXT_INITIALIZER), + offset_iterator_(text), term_start_index_(0), term_end_index_exclusive_(0) {} @@ -232,6 +324,15 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // utext_close() must be called after using. UText u_text_; + // Offset iterator. This iterator is not guaranteed to point to any particular + // character, but is guaranteed to point to a valid UTF character sequence. + // + // This iterator is used to save some amount of linear traversal when seeking + // to a specific UTF-32 offset. Each function that uses it could just create + // a CharacterIterator starting at the beginning of the text and traverse + // forward from there. + CharacterIterator offset_iterator_; + // The start and end indices are used to track the positions of current // term. int term_start_index_; diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc index c0d6d43..01eb7d8 100644 --- a/icing/tokenization/icu/icu-language-segmenter_test.cc +++ b/icing/tokenization/icu/icu-language-segmenter_test.cc @@ -12,24 +12,39 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include <memory> +#include <string_view> + +#include "icing/jni/jni-cache.h" +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" #include "icing/helpers/icu/icu-data-file-helper.h" #include "icing/testing/common-matchers.h" #include "icing/testing/icu-i18n-test-utils.h" +#include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" +#include "icing/util/character-iterator.h" #include "unicode/uloc.h" namespace icing { namespace lib { -namespace { + using ::testing::ElementsAre; using ::testing::Eq; using ::testing::IsEmpty; +namespace { + +language_segmenter_factory::SegmenterOptions GetSegmenterOptions( + const std::string& locale, const JniCache* jni_cache) { + return language_segmenter_factory::SegmenterOptions(locale, jni_cache); +} + // Returns a vector containing all terms retrieved by Advancing on the iterator. std::vector<std::string_view> GetAllTermsAdvance( LanguageSegmenter::Iterator* itr) { @@ -40,70 +55,61 @@ std::vector<std::string_view> GetAllTermsAdvance( return terms; } -// Returns a vector containing all terms retrieved by calling -// ResetToStart/ResetAfter with the current position to simulate Advancing on -// the iterator. -std::vector<std::string_view> GetAllTermsResetAfter( +// Returns a vector containing all terms retrieved by calling ResetAfter with +// the UTF-32 position of the current term start to simulate Advancing on the +// iterator. 
+std::vector<std::string_view> GetAllTermsResetAfterUtf32( LanguageSegmenter::Iterator* itr) { std::vector<std::string_view> terms; - if (!itr->ResetToStart().ok()) { - return terms; - } - terms.push_back(itr->GetTerm()); - const char* text_begin = itr->GetTerm().data(); - // Calling ResetToTermStartingAfter with the current position should get the - // very next term in the sequence. - for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok(); - current_pos = itr->GetTerm().data() - text_begin) { + // Calling ResetToTermStartingAfterUtf32 with -1 should get the first term in + // the sequence. + bool is_ok = itr->ResetToTermStartingAfterUtf32(-1).ok(); + while (is_ok) { terms.push_back(itr->GetTerm()); + // Calling ResetToTermStartingAfterUtf32 with the current position should + // get the very next term in the sequence. + CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie(); + is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok(); } return terms; } // Returns a vector containing all terms retrieved by alternating calls to -// Advance and calls to ResetAfter with the current position to simulate -// Advancing. -std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter( +// Advance and calls to ResetAfter with the UTF-32 position of the current term +// start to simulate Advancing. +std::vector<std::string_view> GetAllTermsAdvanceAndResetAfterUtf32( LanguageSegmenter::Iterator* itr) { - const char* text_begin = itr->GetTerm().data(); std::vector<std::string_view> terms; - - bool is_ok = true; - int current_pos = 0; + bool is_ok = itr->Advance(); while (is_ok) { + terms.push_back(itr->GetTerm()); // Alternate between using Advance and ResetToTermAfter. if (terms.size() % 2 == 0) { is_ok = itr->Advance(); } else { - // Calling ResetToTermStartingAfter with the current position should get - // the very next term in the sequence. 
- current_pos = itr->GetTerm().data() - text_begin; - is_ok = itr->ResetToTermStartingAfter(current_pos).ok(); - } - if (is_ok) { - terms.push_back(itr->GetTerm()); + // Calling ResetToTermStartingAfterUtf32 with the current position should + // get the very next term in the sequence. + CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie(); + is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok(); } } return terms; } // Returns a vector containing all terms retrieved by calling ResetBefore with -// the current position, starting at the end of the text. This vector should be -// in reverse order of GetAllTerms and missing the last term. -std::vector<std::string_view> GetAllTermsResetBefore( +// the UTF-32 position of the current term start, starting at the end of the +// text. This vector should be in reverse order of GetAllTerms and missing the +// last term. +std::vector<std::string_view> GetAllTermsResetBeforeUtf32( LanguageSegmenter::Iterator* itr) { - const char* text_begin = itr->GetTerm().data(); - int last_pos = 0; - while (itr->Advance()) { - last_pos = itr->GetTerm().data() - text_begin; - } std::vector<std::string_view> terms; - // Calling ResetToTermEndingBefore with the current position should get the - // previous term in the sequence. - for (int current_pos = last_pos; - itr->ResetToTermEndingBefore(current_pos).ok(); - current_pos = itr->GetTerm().data() - text_begin) { + bool is_ok = itr->ResetToTermEndingBeforeUtf32(1000).ok(); + while (is_ok) { terms.push_back(itr->GetTerm()); + // Calling ResetToTermEndingBeforeUtf32 with the current position should get + // the previous term in the sequence. 
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie(); + is_ok = itr->ResetToTermEndingBeforeUtf32(char_itr.utf32_index()).ok(); } return terms; } @@ -119,27 +125,34 @@ class IcuLanguageSegmenterAllLocalesTest } static std::string GetLocale() { return GetParam(); } - static language_segmenter_factory::SegmenterOptions GetOptions() { - return language_segmenter_factory::SegmenterOptions(GetLocale()); - } + + std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache(); }; +} // namespace + TEST_P(IcuLanguageSegmenterAllLocalesTest, EmptyText) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty())); } TEST_P(IcuLanguageSegmenterAllLocalesTest, SimpleText) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"), IsOkAndHolds(ElementsAre("Hello", " ", "World"))); } TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // ASCII punctuation marks are kept EXPECT_THAT( language_segmenter->GetAllTerms("Hello, World!!!"), @@ -153,8 +166,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - 
language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // ASCII special characters are kept EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"), IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000"))); @@ -169,8 +184,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Full-width (non-ASCII) punctuation marks and special characters are left // out. EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"), @@ -178,10 +195,12 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); - EXPECT_THAT(language_segmenter->GetAllTerms("U.S. 
Bank"), - IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank"))); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); + EXPECT_THAT(language_segmenter->GetAllTerms("U.S.𡔖 Bank"), + IsOkAndHolds(ElementsAre("U.S", ".", "𡔖", " ", "Bank"))); EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."), IsOkAndHolds(ElementsAre("I.B.M", "."))); EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"), @@ -191,8 +210,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // According to unicode word break rules // WB6(https://unicode.org/reports/tr29/#WB6), // WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some @@ -274,8 +295,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."), IsOkAndHolds(ElementsAre("It's", " ", "ok", "."))); EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."), @@ -295,8 +318,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), 
jni_cache_.get()))); EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"), IsOkAndHolds(ElementsAre("(", "Hello", ")"))); @@ -306,8 +331,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""), IsOkAndHolds(ElementsAre("\"", "Hello", "\""))); @@ -317,8 +344,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Alphanumeric terms are allowed EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"), @@ -326,8 +355,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Alphanumeric terms are allowed EXPECT_THAT( @@ -342,8 +373,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Multiple continuous whitespaces are treated 
as one. const int kNumSeparators = 256; std::string text_with_spaces = @@ -367,8 +400,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't // have whitespaces as word delimiter. @@ -389,15 +424,19 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"), IsOkAndHolds(ElementsAre("āăąḃḅḇčćç"))); } TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Turkish EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"), IsOkAndHolds(ElementsAre("merhaba", " ", "dünya"))); @@ -408,8 +447,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); 
EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"), IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好", "吗", "お", "元気", "です", "か"))); @@ -420,8 +461,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) { } TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Validates that the input strings are not copied const std::string text = "Hello World"; const char* word1_address = text.c_str(); @@ -437,127 +480,141 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) { EXPECT_THAT(word2_address, Eq(word2_result_address)); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToStartWordConnector) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToStartUtf32WordConnector) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "com:google:android is package"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "com:google:android is package" - // ^ ^^ ^^ - // Bytes: 0 18 19 21 22 - auto position_or = itr->ResetToStart(); + // String: "com:google:android is package" + // ^ ^^ ^^ + // UTF-8 idx: 0 18 19 21 22 + // UTF-32 idx: 0 18 19 21 22 + auto position_or = itr->ResetToStartUtf32(); EXPECT_THAT(position_or, IsOk()); ASSERT_THAT(itr->GetTerm(), Eq("com:google:android")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStart) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, 
NewIteratorResetToStartUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 - EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 + EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("How")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorOneAdvanceResetToStart) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, + IteratorOneAdvanceResetToStartUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 ASSERT_TRUE(itr->Advance()); // itr points to 'How' - EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("How")); } TEST_P(IcuLanguageSegmenterAllLocalesTest, - IteratorMultipleAdvancesResetToStart) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); + IteratorMultipleAdvancesResetToStartUtf32) { + 
ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 ASSERT_TRUE(itr->Advance()); ASSERT_TRUE(itr->Advance()); ASSERT_TRUE(itr->Advance()); ASSERT_TRUE(itr->Advance()); // itr points to ' ' - EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("How")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStart) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStartUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 while (itr->Advance()) { // Do nothing. 
} - EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("How")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterWordConnector) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32WordConnector) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "package com:google:android name"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "package com:google:android name" - // ^ ^^ ^^ - // Bytes: 0 7 8 26 27 - auto position_or = itr->ResetToTermStartingAfter(8); + // String: "package com:google:android name" + // ^ ^^ ^^ + // UTF-8 idx: 0 7 8 26 27 + // UTF-32 idx: 0 7 8 26 27 + auto position_or = itr->ResetToTermStartingAfterUtf32(8); EXPECT_THAT(position_or, IsOk()); EXPECT_THAT(position_or.ValueOrDie(), Eq(26)); ASSERT_THAT(itr->GetTerm(), Eq(" ")); - position_or = itr->ResetToTermStartingAfter(7); + position_or = itr->ResetToTermStartingAfterUtf32(7); EXPECT_THAT(position_or, IsOk()); EXPECT_THAT(position_or.ValueOrDie(), Eq(8)); ASSERT_THAT(itr->GetTerm(), Eq("com:google:android")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32OutOfBounds) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are 
you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 - ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 + ASSERT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8))); ASSERT_THAT(itr->GetTerm(), Eq("you")); - EXPECT_THAT(itr->ResetToTermStartingAfter(-1), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(itr->GetTerm(), Eq("you")); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(-1), IsOk()); + EXPECT_THAT(itr->GetTerm(), Eq("How")); - EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(21), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(itr->GetTerm(), Eq("you")); + EXPECT_THAT(itr->GetTerm(), Eq("How")); } // Tests that ResetToTermAfter and Advance produce the same output. With the @@ -566,9 +623,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) { // terms produced by ResetToTermAfter calls with the current position // provided as the argument. 
TEST_P(IcuLanguageSegmenterAllLocalesTest, - MixedLanguagesResetToTermAfterEquivalentToAdvance) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); + MixedLanguagesResetToTermAfterUtf32EquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, @@ -580,16 +638,17 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kText)); std::vector<std::string_view> reset_terms = - GetAllTermsResetAfter(reset_to_term_itr.get()); + GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); } TEST_P(IcuLanguageSegmenterAllLocalesTest, - ThaiResetToTermAfterEquivalentToAdvance) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); + ThaiResetToTermAfterUtf32EquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, @@ -601,16 +660,17 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kThai)); std::vector<std::string_view> reset_terms = - GetAllTermsResetAfter(reset_to_term_itr.get()); + GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); } TEST_P(IcuLanguageSegmenterAllLocalesTest, 
- KoreanResetToTermAfterEquivalentToAdvance) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); + KoreanResetToTermAfterUtf32EquivalentToAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kKorean = "나는 매일 출근합니다."; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, @@ -622,7 +682,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kKorean)); std::vector<std::string_view> reset_terms = - GetAllTermsResetAfter(reset_to_term_itr.get()); + GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); @@ -633,9 +693,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, // should be able to mix ResetToTermAfter(current_position) calls and Advance // calls to mimic calling Advance. 
TEST_P(IcuLanguageSegmenterAllLocalesTest, - MixedLanguagesResetToTermAfterInteroperableWithAdvance) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); + MixedLanguagesResetToTermAfterUtf32InteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, @@ -647,7 +708,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, segmenter->Segment(kText)); std::vector<std::string_view> advance_and_reset_terms = - GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); EXPECT_THAT(advance_and_reset_terms, testing::ElementsAreArray(advance_terms)); @@ -655,9 +716,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, } TEST_P(IcuLanguageSegmenterAllLocalesTest, - ThaiResetToTermAfterInteroperableWithAdvance) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); + ThaiResetToTermAfterUtf32InteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, @@ -669,7 +731,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, segmenter->Segment(kThai)); std::vector<std::string_view> advance_and_reset_terms = - GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); EXPECT_THAT(advance_and_reset_terms, 
testing::ElementsAreArray(advance_terms)); @@ -677,9 +739,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, } TEST_P(IcuLanguageSegmenterAllLocalesTest, - KoreanResetToTermAfterInteroperableWithAdvance) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); + KoreanResetToTermAfterUtf32InteroperableWithAdvance) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kKorean = "나는 매일 출근합니다."; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, @@ -691,211 +754,234 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, segmenter->Segment(kKorean)); std::vector<std::string_view> advance_and_reset_terms = - GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); EXPECT_THAT(advance_and_reset_terms, testing::ElementsAreArray(advance_terms)); EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermAfter) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, + MixedLanguagesResetToTermAfterUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment("How are you你好吗お元気ですか")); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 - EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3))); + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 
8 11 131415 17 19 + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(11))); EXPECT_THAT(itr->GetTerm(), Eq("你好")); - EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8))); EXPECT_THAT(itr->GetTerm(), Eq("you")); - EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(18), IsOkAndHolds(Eq(19))); EXPECT_THAT(itr->GetTerm(), Eq("か")); - EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13))); EXPECT_THAT(itr->GetTerm(), Eq("吗")); - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermStartingAfter(35), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); } TEST_P(IcuLanguageSegmenterAllLocalesTest, - ContinuousWhitespacesResetToTermAfter) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ContinuousWhitespacesResetToTermAfterUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Multiple continuous whitespaces are treated as one. 
constexpr std::string_view kTextWithSpace = "Hello World"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kTextWithSpace)); - // String: "Hello World" - // ^ ^ ^ - // Bytes: 0 5 15 - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5))); + // String: "Hello World" + // ^ ^ ^ + // UTF-8 idx: 0 5 15 + // UTF-32 idx: 0 5 15 + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(15))); EXPECT_THAT(itr->GetTerm(), Eq("World")); - EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(5), IsOkAndHolds(Eq(15))); EXPECT_THAT(itr->GetTerm(), Eq("World")); - EXPECT_THAT(itr->ResetToTermStartingAfter(15), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermStartingAfter(17), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermStartingAfter(19), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfter) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfterUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + 
language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that // don't have whitespaces as word delimiter. Chinese constexpr std::string_view kChinese = "我每天走路去上班。"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kChinese)); - // String: "我每天走路去上班。" - // ^ ^ ^ ^^ - // Bytes: 0 3 9 15 18 - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // UTF-8 idx: 0 3 9 15 18 + // UTF-832 idx: 0 1 3 5 6 + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("每天")); - EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("走路")); - EXPECT_THAT(itr->ResetToTermStartingAfter(19), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfter) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfterUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Japanese constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kJapanese)); - // String: "私は毎日仕事に歩いています。" - // ^ ^ ^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 6 12 18212427 33 - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 6 12 18212427 33 + // UTF-32 idx: 0 1 2 4 6 7 8 9 11 + 
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("は")); - EXPECT_THAT(itr->ResetToTermStartingAfter(33), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4))); EXPECT_THAT(itr->GetTerm(), Eq("仕事")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfter) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kKhmer)); - // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" - // ^ ^ ^ ^ - // Bytes: 0 9 24 45 - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // UTF-8 idx: 0 9 24 45 + // UTF-32 idx: 0 3 8 15 + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ")); - EXPECT_THAT(itr->ResetToTermStartingAfter(47), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(8))); EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfter) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - 
language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfterUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Thai constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kThai)); - // String: "ฉันเดินไปทำงานทุกวัน" - // ^ ^ ^ ^ ^ ^ - // Bytes: 0 9 21 27 42 51 - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 9 21 27 42 51 + // UTF-32 idx: 0 3 7 9 14 17 + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("เดิน")); - EXPECT_THAT(itr->ResetToTermStartingAfter(51), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(7))); EXPECT_THAT(itr->GetTerm(), Eq("ไป")); - EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(14))); EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeWordConnector) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, + ResetToTermBeforeWordConnectorUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "package name com:google:android!"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // 
String: "package name com:google:android!" - // ^ ^^ ^^ ^ - // Bytes: 0 7 8 12 13 31 - auto position_or = itr->ResetToTermEndingBefore(31); + // String: "package name com:google:android!" + // ^ ^^ ^^ ^ + // UTF-8 idx: 0 7 8 12 13 31 + // UTF-32 idx: 0 7 8 12 13 31 + auto position_or = itr->ResetToTermEndingBeforeUtf32(31); EXPECT_THAT(position_or, IsOk()); EXPECT_THAT(position_or.ValueOrDie(), Eq(13)); ASSERT_THAT(itr->GetTerm(), Eq("com:google:android")); - position_or = itr->ResetToTermEndingBefore(21); + position_or = itr->ResetToTermEndingBeforeUtf32(21); EXPECT_THAT(position_or, IsOk()); EXPECT_THAT(position_or.ValueOrDie(), Eq(12)); ASSERT_THAT(itr->GetTerm(), Eq(" ")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBoundsUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 - ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 + ASSERT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4))); ASSERT_THAT(itr->GetTerm(), Eq("are")); - EXPECT_THAT(itr->ResetToTermEndingBefore(-1), + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(-1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); EXPECT_THAT(itr->GetTerm(), Eq("are")); - EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); 
- EXPECT_THAT(itr->GetTerm(), Eq("are")); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(29), IsOk()); + EXPECT_THAT(itr->GetTerm(), Eq("か")); } // Tests that ResetToTermBefore and Advance produce the same output. With the @@ -904,26 +990,22 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) { // terms produced by ResetToTermBefore calls with the current position // provided as the argument (after their order has been reversed). TEST_P(IcuLanguageSegmenterAllLocalesTest, - MixedLanguagesResetToTermBeforeEquivalentToAdvance) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); + MixedLanguagesResetToTermBeforeEquivalentToAdvanceUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, segmenter->Segment(kText)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); - // Can't produce the last term via calls to ResetToTermBefore. So skip - // past that one. 
- auto itr = advance_terms.begin(); - std::advance(itr, advance_terms.size() - 1); - advance_terms.erase(itr); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kText)); std::vector<std::string_view> reset_terms = - GetAllTermsResetBefore(reset_to_term_itr.get()); + GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); @@ -932,26 +1014,22 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, } TEST_P(IcuLanguageSegmenterAllLocalesTest, - ThaiResetToTermBeforeEquivalentToAdvance) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); + ThaiResetToTermBeforeEquivalentToAdvanceUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, segmenter->Segment(kThai)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); - // Can't produce the last term via calls to ResetToTermBefore. So skip - // past that one. 
- auto itr = advance_terms.begin(); - std::advance(itr, advance_terms.size() - 1); - advance_terms.erase(itr); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kThai)); std::vector<std::string_view> reset_terms = - GetAllTermsResetBefore(reset_to_term_itr.get()); + GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); @@ -959,192 +1037,209 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, } TEST_P(IcuLanguageSegmenterAllLocalesTest, - KoreanResetToTermBeforeEquivalentToAdvance) { - ICING_ASSERT_OK_AND_ASSIGN(auto segmenter, - language_segmenter_factory::Create(GetOptions())); + KoreanResetToTermBeforeEquivalentToAdvanceUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto segmenter, language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kKorean = "나는 매일 출근합니다."; ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> advance_itr, segmenter->Segment(kKorean)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); - // Can't produce the last term via calls to ResetToTermBefore. So skip - // past that one. 
- auto itr = advance_terms.begin(); - std::advance(itr, advance_terms.size() - 1); - advance_terms.erase(itr); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kKorean)); std::vector<std::string_view> reset_terms = - GetAllTermsResetBefore(reset_to_term_itr.get()); + GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermBefore) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, + MixedLanguagesResetToTermBeforeUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment("How are you你好吗お元気ですか")); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 - EXPECT_THAT(itr->ResetToTermEndingBefore(2), + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(7))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4))); EXPECT_THAT(itr->GetTerm(), Eq("are")); - EXPECT_THAT(itr->ResetToTermEndingBefore(32), 
IsOkAndHolds(Eq(23))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(18), IsOkAndHolds(Eq(15))); EXPECT_THAT(itr->GetTerm(), Eq("元気")); - EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(12), IsOkAndHolds(Eq(8))); EXPECT_THAT(itr->GetTerm(), Eq("you")); - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(17))); EXPECT_THAT(itr->GetTerm(), Eq("です")); } TEST_P(IcuLanguageSegmenterAllLocalesTest, - ContinuousWhitespacesResetToTermBefore) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ContinuousWhitespacesResetToTermBeforeUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Multiple continuous whitespaces are treated as one. 
constexpr std::string_view kTextWithSpace = "Hello World"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kTextWithSpace)); - // String: "Hello World" - // ^ ^ ^ - // Bytes: 0 5 15 - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + // String: "Hello World" + // ^ ^ ^ + // UTF-8 idx: 0 5 15 + // UTF-32 idx: 0 5 15 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(2), + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("Hello")); - EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("Hello")); - EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(15), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBefore) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBeforeUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + 
GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that // don't have whitespaces as word delimiter. Chinese constexpr std::string_view kChinese = "我每天走路去上班。"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kChinese)); - // String: "我每天走路去上班。" - // ^ ^ ^ ^^ - // Bytes: 0 3 9 15 18 - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // UTF-8 idx: 0 3 9 15 18 + // UTF-32 idx: 0 1 3 5 6 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("我")); - EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq("去")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBefore) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBeforeUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Japanese constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kJapanese)); - // String: "私は毎日仕事に歩いています。" - // ^ ^ ^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 6 12 18212427 33 - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 6 12 18212427 33 + // UTF-32 idx: 0 1 2 4 6 7 8 9 11 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), 
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(9))); EXPECT_THAT(itr->GetTerm(), Eq("てい")); - EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(3), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("は")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBefore) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); +TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBeforeUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kKhmer)); - // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" - // ^ ^ ^ ^ - // Bytes: 0 9 24 45 - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // UTF-8 idx: 0 9 24 45 + // UTF-32 idx: 0 3 8 15 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(16), IsOkAndHolds(Eq(8))); EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); - EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("ញុំ")); } -TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); 
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBeforeUtf32) { + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Thai constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kThai)); - // String: "ฉันเดินไปทำงานทุกวัน" - // ^ ^ ^ ^ ^ ^ - // Bytes: 0 9 21 27 42 51 - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 9 21 27 42 51 + // UTF-32 idx: 0 3 7 9 14 17 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(14))); EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); - EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(4), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("ฉัน")); - EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(7))); EXPECT_THAT(itr->GetTerm(), Eq("ไป")); } TEST_P(IcuLanguageSegmenterAllLocalesTest, QuerySyntax) { - ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter, - language_segmenter_factory::Create(GetOptions())); + ICING_ASSERT_OK_AND_ASSIGN( + auto language_segmenter, + language_segmenter_factory::Create( + GetSegmenterOptions(GetLocale(), jni_cache_.get()))); // Validates that the input strings are not copied ICING_ASSERT_OK_AND_ASSIGN( std::vector<std::string_view> terms, @@ -1174,6 +1269,5 @@ INSTANTIATE_TEST_SUITE_P( "" // Will fall back to ICU default locale )); -} // namespace } // namespace lib } // namespace icing diff --git a/icing/tokenization/language-segmenter-factory.h 
b/icing/tokenization/language-segmenter-factory.h index e60c168..cae3eee 100644 --- a/icing/tokenization/language-segmenter-factory.h +++ b/icing/tokenization/language-segmenter-factory.h @@ -18,11 +18,7 @@ #include <memory> #include <string_view> -#ifdef __ANDROID__ #include "icing/jni/jni-cache.h" -#else // __ANDROID__ -class JniCache; // forward declaration to let non-Android builds work. -#endif // __ANDROID__ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/tokenization/language-segmenter.h" diff --git a/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc b/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc new file mode 100644 index 0000000..3a94af3 --- /dev/null +++ b/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc @@ -0,0 +1,37 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <jni.h> + +#include "gtest/gtest.h" +#include "icing/testing/logging-event-listener.h" + +// Global variable used so that the test implementation can access the JNIEnv. 
+JNIEnv* g_jenv = nullptr; + +extern "C" JNIEXPORT jboolean JNICALL +Java_icing_jni_LanguageSegmenterIteratorJniTest_testsMain(JNIEnv* env, + jclass ignored) { + g_jenv = env; + + std::vector<char*> my_argv; + char arg[] = "jni-test-lib"; + my_argv.push_back(arg); + int argc = 1; + char** argv = &(my_argv[0]); + testing::InitGoogleTest(&argc, argv); + testing::UnitTest::GetInstance()->listeners().Append( + new icing::lib::LoggingEventListener()); + return RUN_ALL_TESTS() == 0; +} diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc index 2b1911e..d293581 100644 --- a/icing/tokenization/language-segmenter-iterator_test.cc +++ b/icing/tokenization/language-segmenter-iterator_test.cc @@ -16,8 +16,9 @@ #include "gtest/gtest.h" #include "icing/absl_ports/str_cat.h" #include "icing/helpers/icu/icu-data-file-helper.h" +#include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/platform.h" +#include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" @@ -43,10 +44,13 @@ class LanguageSegmenterIteratorTest : public testing::Test { GetTestFilePath("icing/icu.dat"))); } } + + std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache(); }; TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); @@ -66,86 +70,91 @@ TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) { } TEST_F(LanguageSegmenterIteratorTest, - ResetToTermStartingAfterWithOffsetInText) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + ResetToTermStartingAfterUtf32WithOffsetInText) { + 
language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); - EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/0), + EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/0), IsOkAndHolds(3)); // The term " " - EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/3), + EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/3), IsOkAndHolds(4)); // The term "bar" - EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/4), + EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/4), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } TEST_F(LanguageSegmenterIteratorTest, - ResetToTermStartingAfterWithNegativeOffsetNotOk) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + ResetToTermStartingAfterUtf32WithNegativeOffsetNotOk) { + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); - EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-1), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/-1), IsOk()); - EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-100), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/-100), IsOk()); - EXPECT_THAT(iterator->ResetToStart(), IsOkAndHolds(0)); + EXPECT_THAT(iterator->ResetToStartUtf32(), IsOkAndHolds(0)); EXPECT_THAT(iterator->GetTerm(), Eq("foo")); } TEST_F(LanguageSegmenterIteratorTest, - ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) { + 
ResetToTermStartingAfterUtf32WithTextLengthOffsetInvalidArgument) { std::string text = "foo bar"; - language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); - EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()), + EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/text.length()), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST_F(LanguageSegmenterIteratorTest, - ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) { + ResetToTermStartingAfterUtf32WithOffsetPastTextLengthInvalidArgument) { std::string text = "foo bar"; - language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); - EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100), + EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/100), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } -TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); +TEST_F(LanguageSegmenterIteratorTest, + ResetToTermEndingBeforeUtf32WithOffsetInText) { + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); - EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/6), + 
EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/6), IsOkAndHolds(3)); // The term " " - EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/3), + EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/3), IsOkAndHolds(0)); // The term "foo" - EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/2), + EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/2), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } TEST_F(LanguageSegmenterIteratorTest, - ResetToTermEndingBeforeWithZeroNotFound) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + ResetToTermEndingBeforeUtf32WithZeroNotFound) { + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); @@ -153,40 +162,43 @@ TEST_F(LanguageSegmenterIteratorTest, language_segmenter->Segment("foo bar")); // Zero is a valid argument, but there aren't any terms that end before it. 
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/0), + EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); } TEST_F(LanguageSegmenterIteratorTest, - ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + ResetToTermEndingBeforeUtf32WithNegativeOffsetInvalidArgument) { + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment("foo bar")); - EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-1), + EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/-1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-100), + EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/-100), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST_F(LanguageSegmenterIteratorTest, - ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) { + ResetToTermEndingBeforeUtf32WithOffsetPastTextEndInvalidArgument) { std::string text = "foo bar"; - language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text)); - EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/text.length()), + IsOk()); - EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length() + 1), - 
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT( + iterator->ResetToTermEndingBeforeUtf32(/*offset=*/text.length() + 1), + IsOk()); } } // namespace diff --git a/icing/tokenization/language-segmenter.h b/icing/tokenization/language-segmenter.h index 7ca31d1..913386a 100644 --- a/icing/tokenization/language-segmenter.h +++ b/icing/tokenization/language-segmenter.h @@ -21,6 +21,8 @@ #include <vector> #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/util/character-iterator.h" namespace icing { namespace lib { @@ -56,51 +58,81 @@ class LanguageSegmenter { // true. virtual std::string_view GetTerm() const = 0; - // Resets the iterator to point to the first term that starts after offset. + // RETURNS: + // On success, a CharacterIterator pointing to the beginning of the + // current term. + // ABORTED if an invalid unicode character is encountered while + // calculating the term start. + virtual libtextclassifier3::StatusOr<CharacterIterator> + CalculateTermStart() { + return absl_ports::UnimplementedError(""); + } + + // RETURNS: + // On success, a CharacterIterator pointing just past the end of the + // current term. + // ABORTED if an invalid unicode character is encountered while + // calculating the term end. + virtual libtextclassifier3::StatusOr<CharacterIterator> + CalculateTermEndExclusive() { + return absl_ports::UnimplementedError(""); + } + + // Resets the iterator to point to the first term that starts after UTF-32 + // offset. // GetTerm will now return that term. For example: // // language_segmenter = language_segmenter_factory::Create(type); // iterator = language_segmenter->Segment("foo bar baz"); - // iterator.ResetToTermStartingAfter(4); + // iterator.ResetToTermStartingAfterUtf32(4); // iterator.GetTerm() // returns "baz"; // // Return types of OK and NOT_FOUND indicate that the function call was // valid and the state of the iterator has changed. 
Return type of - // INVALID_ARGUMENT will leave the iterator unchanged. + // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type + // of ABORTED means that the iterator may be left in an undefined state and + // no longer be usable. // // Returns: - // On success, the starting position of the first term that starts after + // On success, the UTF-32 offset of the first term that starts after // offset. // NOT_FOUND if an error occurred or there are no terms that start after // offset. - // INVALID_ARGUMENT if offset is out of bounds for the provided text. + // INVALID_ARGUMENT if offset is beyond the end of the text. // ABORTED if an invalid unicode character is encountered while // traversing the text. - virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( - int32_t offset) = 0; + virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32( + int32_t offset) { + return absl_ports::UnimplementedError(""); + } - // Resets the iterator to point to the first term that ends before offset. + // Resets the iterator to point to the first term that ends before UTF-32 + // offset. // GetTerm will now return that term. For example: // // language_segmenter = language_segmenter_factory::Create(type); // iterator = language_segmenter->Segment("foo bar baz"); - // iterator.ResetToTermEndingBefore(7); + // iterator.ResetToTermEndingBeforeUtf32(7); // iterator.GetTerm() // returns "bar"; // // Return types of OK and NOT_FOUND indicate that the function call was // valid and the state of the iterator has changed. Return type of - // INVALID_ARGUMENT will leave the iterator unchanged. + // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type + // of ABORTED means that the iterator may be left in an undefined state and + // no longer be usable. 
// // Returns: - // On success, the starting position of the first term that ends before + // On success, the UTF-32 offset of the first term that ends before // offset. // NOT_FOUND if an error occurred or there are no terms that ends before // offset. - // INVALID_ARGUMENT if offset is out of bounds for the provided text. + // INVALID_ARGUMENT if offset is negative // ABORTED if an invalid unicode character is encountered while // traversing the text. - virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( - int32_t offset) = 0; + virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32( + int32_t offset) { + return absl_ports::UnimplementedError(""); + } // Resets the iterator to point to the first term. // GetTerm will now return that term. For example: @@ -108,7 +140,7 @@ class LanguageSegmenter { // language_segmenter = language_segmenter_factory::Create(type); // iterator = language_segmenter->Segment("foo bar baz"); // iterator.Advance(); - // iterator.ResetToStart(); + // iterator.ResetToStartUtf32(); // iterator.GetTerm() // returns "foo"; // // Return types of OK and NOT_FOUND indicate that the function call was @@ -119,7 +151,7 @@ class LanguageSegmenter { // NOT_FOUND if an error occurred or there are no valid terms in the text. // ABORTED if an invalid unicode character is encountered while // traversing the text. - virtual libtextclassifier3::StatusOr<int32_t> ResetToStart() = 0; + virtual libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() = 0; }; // Segments the input text into terms. diff --git a/icing/tokenization/plain-tokenizer-test-jni-layer.cc b/icing/tokenization/plain-tokenizer-test-jni-layer.cc new file mode 100644 index 0000000..efa6427 --- /dev/null +++ b/icing/tokenization/plain-tokenizer-test-jni-layer.cc @@ -0,0 +1,36 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <jni.h> + +#include "gtest/gtest.h" +#include "icing/testing/logging-event-listener.h" + +// Global variable used so that the test implementation can access the JNIEnv. +JNIEnv* g_jenv = nullptr; + +extern "C" JNIEXPORT jboolean JNICALL +Java_icing_jni_PlainTokenizerJniTest_testsMain(JNIEnv* env, jclass ignored) { + g_jenv = env; + + std::vector<char*> my_argv; + char arg[] = "jni-test-lib"; + my_argv.push_back(arg); + int argc = 1; + char** argv = &(my_argv[0]); + testing::InitGoogleTest(&argc, argv); + testing::UnitTest::GetInstance()->listeners().Append( + new icing::lib::LoggingEventListener()); + return RUN_ALL_TESTS() == 0; +} diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc index 6e54af9..13fe550 100644 --- a/icing/tokenization/plain-tokenizer.cc +++ b/icing/tokenization/plain-tokenizer.cc @@ -18,6 +18,7 @@ #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/tokenization/language-segmenter.h" +#include "icing/util/character-iterator.h" #include "icing/util/i18n-utils.h" #include "icing/util/status-macros.h" @@ -70,8 +71,18 @@ class PlainTokenIterator : public Tokenizer::Iterator { return Token(Token::REGULAR, current_term_); } + libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart() + override { + return base_iterator_->CalculateTermStart(); + } + + libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive() + override { + return base_iterator_->CalculateTermEndExclusive(); + } + bool ResetToTokenAfter(int32_t offset) override { - if 
(!base_iterator_->ResetToTermStartingAfter(offset).ok()) { + if (!base_iterator_->ResetToTermStartingAfterUtf32(offset).ok()) { return false; } current_term_ = base_iterator_->GetTerm(); @@ -84,20 +95,20 @@ class PlainTokenIterator : public Tokenizer::Iterator { bool ResetToTokenBefore(int32_t offset) override { ICING_ASSIGN_OR_RETURN( - offset, base_iterator_->ResetToTermEndingBefore(offset), false); + offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false); current_term_ = base_iterator_->GetTerm(); while (!IsValidTerm(current_term_)) { // Haven't found a valid term yet. Retrieve the term prior to this one // from the segmenter. ICING_ASSIGN_OR_RETURN( - offset, base_iterator_->ResetToTermEndingBefore(offset), false); + offset, base_iterator_->ResetToTermEndingBeforeUtf32(offset), false); current_term_ = base_iterator_->GetTerm(); } return true; } bool ResetToStart() override { - if (!base_iterator_->ResetToStart().ok()) { + if (!base_iterator_->ResetToStartUtf32().ok()) { return false; } current_term_ = base_iterator_->GetTerm(); diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc index f578567..7490bfa 100644 --- a/icing/tokenization/plain-tokenizer_test.cc +++ b/icing/tokenization/plain-tokenizer_test.cc @@ -19,9 +19,10 @@ #include "gmock/gmock.h" #include "icing/absl_ports/str_cat.h" #include "icing/helpers/icu/icu-data-file-helper.h" +#include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" #include "icing/testing/icu-i18n-test-utils.h" -#include "icing/testing/platform.h" +#include "icing/testing/jni-test-helpers.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/tokenizer-factory.h" @@ -43,6 +44,8 @@ class PlainTokenizerTest : public ::testing::Test { GetTestFilePath("icing/icu.dat"))); } } + + std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache(); }; TEST_F(PlainTokenizerTest, 
CreationWithNullPointerShouldFail) { @@ -53,7 +56,8 @@ TEST_F(PlainTokenizerTest, CreationWithNullPointerShouldFail) { } TEST_F(PlainTokenizerTest, Simple) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); @@ -87,7 +91,8 @@ TEST_F(PlainTokenizerTest, Simple) { } TEST_F(PlainTokenizerTest, Whitespace) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); @@ -115,7 +120,8 @@ TEST_F(PlainTokenizerTest, Whitespace) { } TEST_F(PlainTokenizerTest, Punctuation) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); @@ -161,7 +167,8 @@ TEST_F(PlainTokenizerTest, Punctuation) { } TEST_F(PlainTokenizerTest, SpecialCharacters) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); @@ -187,7 +194,8 @@ TEST_F(PlainTokenizerTest, CJKT) { // In plain tokenizer, CJKT characters are handled the same way as non-CJKT // characters, just add these tests as sanity checks. 
// Chinese - language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE); + language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); @@ -202,7 +210,8 @@ TEST_F(PlainTokenizerTest, CJKT) { EqualsToken(Token::REGULAR, "去"), EqualsToken(Token::REGULAR, "上班")))); // Japanese - options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE); + options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( language_segmenter, language_segmenter_factory::Create(std::move(options))); @@ -272,7 +281,8 @@ TEST_F(PlainTokenizerTest, CJKT) { } TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); @@ -291,7 +301,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) { } TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); @@ -310,7 +321,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) { } TEST_F(PlainTokenizerTest, ResetToTokenAfter) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); @@ -360,7 +372,8 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) { } TEST_F(PlainTokenizerTest, ResetToTokenBefore) { - 
language_segmenter_factory::SegmenterOptions options(ULOC_US); + language_segmenter_factory::SegmenterOptions options(ULOC_US, + jni_cache_.get()); ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create(std::move(options))); diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc index e1a666b..500efa0 100644 --- a/icing/tokenization/raw-query-tokenizer_test.cc +++ b/icing/tokenization/raw-query-tokenizer_test.cc @@ -17,8 +17,8 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "icing/helpers/icu/icu-data-file-helper.h" +#include "icing/portable/platform.h" #include "icing/testing/common-matchers.h" -#include "icing/testing/platform.h" #include "icing/testing/test-data.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/tokenizer-factory.h" diff --git a/icing/jni/reverse-jni-break-iterator.cc b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc index 1a8a799..6b1cb3a 100644 --- a/icing/jni/reverse-jni-break-iterator.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/jni/reverse-jni-break-iterator.h" +#include "icing/tokenization/reverse_jni/reverse-jni-break-iterator.h" #include <jni.h> #include <math.h> diff --git a/icing/jni/reverse-jni-break-iterator.h b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h index c1f05f4..41b470c 100644 --- a/icing/jni/reverse-jni-break-iterator.h +++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ -#define ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ +#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ +#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ #include <jni.h> @@ -121,4 +121,4 @@ class ReverseJniBreakIterator { } // namespace lib } // namespace icing -#endif // ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ +#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_ diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc new file mode 100644 index 0000000..5f5202c --- /dev/null +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc @@ -0,0 +1,37 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <jni.h> + +#include "gtest/gtest.h" +#include "icing/testing/logging-event-listener.h" + +// Global variable used so that the test implementation can access the JNIEnv. 
+JNIEnv* g_jenv = nullptr; + +extern "C" JNIEXPORT jboolean JNICALL +Java_icing_jni_ReverseJniLanguageSegmenterJniTest_testsMain(JNIEnv* env, + jclass ignored) { + g_jenv = env; + + std::vector<char*> my_argv; + char arg[] = "jni-test-lib"; + my_argv.push_back(arg); + int argc = 1; + char** argv = &(my_argv[0]); + testing::InitGoogleTest(&argc, argv); + testing::UnitTest::GetInstance()->listeners().Append( + new icing::lib::LoggingEventListener()); + return RUN_ALL_TESTS() == 0; +} diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h deleted file mode 100644 index 64b68ec..0000000 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ -#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ - -#include <jni.h> - -#include "icing/jni/jni-cache.h" -#include "gtest/gtest.h" - -extern JNIEnv* g_jenv; - -namespace icing { -namespace lib { - -namespace test_internal { - -class ReverseJniLanguageSegmenterTest - : public testing::TestWithParam<const char*> { - protected: - ReverseJniLanguageSegmenterTest() - : jni_cache_(std::move(JniCache::Create(g_jenv)).ValueOrDie()) {} - - static std::string GetLocale() { return GetParam(); } - - std::unique_ptr<JniCache> jni_cache_; -}; - -} // namespace test_internal - -} // namespace lib -} // namespace icing - -#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_ diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc index bb26364..76219b5 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc @@ -19,11 +19,11 @@ #include <string> #include <string_view> -#include "icing/jni/reverse-jni-break-iterator.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/tokenization/language-segmenter.h" +#include "icing/tokenization/reverse_jni/reverse-jni-break-iterator.h" #include "icing/util/character-iterator.h" #include "icing/util/i18n-utils.h" #include "icing/util/status-macros.h" @@ -44,13 +44,13 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Advances to the next term. Returns false if it has reached the end. 
bool Advance() override { // Prerequisite check - if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) { + if (IsDone()) { return false; } if (term_end_exclusive_.utf16_index() == 0) { int first = break_iterator_->First(); - if (!term_start_.AdvanceToUtf16(first)) { + if (!term_start_.MoveToUtf16(first)) { // First is guaranteed to succeed and return a position within bonds. So // the only possible failure could be an invalid sequence. Mark as DONE // and return. @@ -67,7 +67,7 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { MarkAsDone(); return false; } - if (!term_end_exclusive_.AdvanceToUtf16(next_utf16_index_exclusive)) { + if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) { // next_utf16_index_exclusive is guaranteed to be within bonds thanks to // the check for kDone above. So the only possible failure could be an // invalid sequence. Mark as DONE and return. @@ -87,6 +87,9 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // Returns the current term. It can be called only when Advance() returns // true. std::string_view GetTerm() const override { + if (IsDone()) { + return text_.substr(0, 0); + } int term_length = term_end_exclusive_.utf8_index() - term_start_.utf8_index(); if (term_length > 0 && std::isspace(text_[term_start_.utf8_index()])) { @@ -96,6 +99,16 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { return text_.substr(term_start_.utf8_index(), term_length); } + libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart() + override { + return term_start_; + } + + libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive() + override { + return term_end_exclusive_; + } + // Resets the iterator to point to the first term that starts after offset. // GetTerm will now return that term. 
// @@ -107,15 +120,14 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // INVALID_ARGUMENT if offset is out of bounds for the provided text. // ABORTED if an invalid unicode character is encountered while // traversing the text. - libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( + libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32( int32_t offset) override { - if (offset < 0 || offset >= text_.length()) { - return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Illegal offset provided! Offset %d is not within bounds of string " - "of length %zu", - offset, text_.length())); + if (offset < 0) { + // Very simple. The first term start after a negative offset is the first + // term. So just reset to start. + return ResetToStartUtf32(); } - if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) { + if (IsDone()) { // We're done. Need to start from the beginning if we're going to reset // properly. term_start_ = CharacterIterator(text_); @@ -123,43 +135,48 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { } // 1. Find the unicode character that contains the byte at offset. - CharacterIterator offset_iterator = term_end_exclusive_; - bool success = (offset > offset_iterator.utf8_index()) - ? offset_iterator.AdvanceToUtf8(offset) - : offset_iterator.RewindToUtf8(offset); - if (!success) { - // Offset is guaranteed to be within bounds thanks to the check above. So - // the only possible failure could be an invalid sequence. Mark as DONE - // and return. - MarkAsDone(); - return absl_ports::AbortedError("Encountered invalid UTF sequence!"); + CharacterIterator offset_iterator = (offset < term_start_.utf32_index()) + ? term_start_ + : term_end_exclusive_; + if (!offset_iterator.MoveToUtf32(offset)) { + if (offset_iterator.utf8_index() != text_.length()) { + // We returned false for some reason other than hitting the end. 
This is + // a real error. Just return. + MarkAsDone(); + return absl_ports::AbortedError( + "Could not retrieve valid utf8 character!"); + } + } + // Check to see if offset is past the end of the text. If it is, then + // there's no term starting after it. Return an invalid argument. + if (offset_iterator.utf8_index() == text_.length()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within " + "bounds of string of length %zu", + offset_iterator.utf32_index(), offset_iterator.utf8_index(), + text_.length())); } // 2. We've got the unicode character containing byte offset. Now, we need // to point to the segment that starts after this character. int following_utf16_index = break_iterator_->Following(offset_iterator.utf16_index()); - if (following_utf16_index == ReverseJniBreakIterator::kDone) { + if (following_utf16_index == ReverseJniBreakIterator::kDone || + !offset_iterator.MoveToUtf16(following_utf16_index)) { MarkAsDone(); return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( "No segments begin after provided offset %d.", offset)); } - if (!offset_iterator.AdvanceToUtf16(following_utf16_index)) { - // following_utf16_index is guaranteed to be within bonds thanks to the - // check for kDone above. So the only possible failure could be an invalid - // sequence. Mark as DONE and return. - MarkAsDone(); - return absl_ports::AbortedError("Encountered invalid UTF sequence!"); - } term_end_exclusive_ = offset_iterator; - // 3. The term_end_exclusive_ points to the term that we want to return. We - // need to Advance so that term_start_ will now point to this term. + // 3. The term_end_exclusive_ points to the start of the term that we want + // to return. We need to Advance so that term_start_ will now point to this + // term. 
if (!Advance()) { return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( "No segments begin after provided offset %d.", offset)); } - return term_start_.utf8_index(); + return term_start_.utf32_index(); } // Resets the iterator to point to the first term that ends before offset. @@ -173,52 +190,48 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // INVALID_ARGUMENT if offset is out of bounds for the provided text. // ABORTED if an invalid unicode character is encountered while // traversing the text. - libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( + libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32( int32_t offset) override { - if (offset < 0 || offset >= text_.length()) { + if (offset < 0) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Illegal offset provided! Offset %d is not within bounds of string " "of length %zu", offset, text_.length())); } - if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) { + if (IsDone()) { // We're done. Need to start from the beginning if we're going to reset // properly. term_start_ = CharacterIterator(text_); term_end_exclusive_ = CharacterIterator(text_); } - // 1. Find the unicode character that contains the byte at offset. - CharacterIterator offset_iterator = term_end_exclusive_; - bool success = (offset > offset_iterator.utf8_index()) - ? offset_iterator.AdvanceToUtf8(offset) - : offset_iterator.RewindToUtf8(offset); - if (!success) { - // Offset is guaranteed to be within bounds thanks to the check above. So - // the only possible failure could be an invalid sequence. Mark as DONE - // and return. - MarkAsDone(); - return absl_ports::AbortedError( - "Could not retrieve valid utf8 character!"); + CharacterIterator offset_iterator = (offset < term_start_.utf32_index()) + ? term_start_ + : term_end_exclusive_; + if (!offset_iterator.MoveToUtf32(offset)) { + // An error occurred. 
Mark as DONE + if (offset_iterator.utf8_index() != text_.length()) { + // We returned false for some reason other than hitting the end. This is + // a real error. Just return. + MarkAsDone(); + return absl_ports::AbortedError( + "Could not retrieve valid utf8 character!"); + } + // If it returned false because we hit the end. Then that's fine. We'll + // just treat it as if the request was for the end. } // 2. We've got the unicode character containing byte offset. Now, we need - // to point to the segment that starts before this character. + // to point to the segment that ends before this character. int starting_utf16_index = break_iterator_->Preceding(offset_iterator.utf16_index()); - if (starting_utf16_index == ReverseJniBreakIterator::kDone) { + if (starting_utf16_index == ReverseJniBreakIterator::kDone || + !offset_iterator.MoveToUtf16(starting_utf16_index)) { // Rewind the end indices. MarkAsDone(); return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( "No segments end before provided offset %d.", offset)); } - if (!offset_iterator.RewindToUtf16(starting_utf16_index)) { - // starting_utf16_index is guaranteed to be within bonds thanks to the - // check for kDone above. So the only possible failure could be an invalid - // sequence. Mark as DONE and return. - MarkAsDone(); - return absl_ports::AbortedError("Encountered invalid UTF sequence!"); - } term_start_ = offset_iterator; // 3. We've correctly set the start index and the iterator currently points @@ -226,24 +239,25 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // advance the iterator to that position. int end_utf16_index = break_iterator_->Next(); term_end_exclusive_ = term_start_; - term_end_exclusive_.AdvanceToUtf16(end_utf16_index); + term_end_exclusive_.MoveToUtf16(end_utf16_index); // 4. The start and end indices point to a segment, but we need to ensure // that this segment is 1) valid and 2) ends before offset. 
Otherwise, we'll // need a segment prior to this one. - if (term_end_exclusive_.utf8_index() > offset || !IsValidTerm()) { - return ResetToTermEndingBefore(term_start_.utf8_index()); + if (term_end_exclusive_.utf32_index() > offset || !IsValidTerm()) { + return ResetToTermEndingBeforeUtf32(term_start_.utf32_index()); } - return term_start_.utf8_index(); + return term_start_.utf32_index(); } - libtextclassifier3::StatusOr<int32_t> ResetToStart() override { + libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override { term_start_ = CharacterIterator(text_); term_end_exclusive_ = CharacterIterator(text_); if (!Advance()) { - return absl_ports::NotFoundError(""); + return absl_ports::NotFoundError( + "Unable to find any valid terms in text."); } - return term_start_.utf8_index(); + return term_start_.utf32_index(); } private: @@ -255,11 +269,19 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator { // break_iterator_ may be in any state. void MarkAsDone() { term_start_ = - CharacterIterator(text_, /*utf8_index=*/0, - /*utf16_index=*/ReverseJniBreakIterator::kDone); + CharacterIterator(text_, /*utf8_index=*/ReverseJniBreakIterator::kDone, + /*utf16_index=*/ReverseJniBreakIterator::kDone, + /*utf32_index=*/ReverseJniBreakIterator::kDone); term_end_exclusive_ = - CharacterIterator(text_, /*utf8_index=*/0, - /*utf16_index=*/ReverseJniBreakIterator::kDone); + CharacterIterator(text_, /*utf8_index=*/ReverseJniBreakIterator::kDone, + /*utf16_index=*/ReverseJniBreakIterator::kDone, + /*utf32_index=*/ReverseJniBreakIterator::kDone); + } + bool IsDone() const { + // We could just as easily check the other utf indices or the values in + // term_start_ to check for done. There's no particular reason to choose any + // one since they should all hold kDone. 
+ return term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone; } bool IsValidTerm() const { diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc index 2c268ff..b1a8f72 100644 --- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc +++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc @@ -12,19 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h" +#include <jni.h> #include <memory> #include <string_view> +#include "icing/jni/jni-cache.h" #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "gmock/gmock.h" #include "icing/absl_ports/str_cat.h" #include "icing/testing/common-matchers.h" #include "icing/testing/icu-i18n-test-utils.h" +#include "icing/testing/jni-test-helpers.h" #include "icing/tokenization/language-segmenter-factory.h" #include "icing/tokenization/language-segmenter.h" +#include "icing/util/character-iterator.h" #include "unicode/uloc.h" namespace icing { @@ -54,72 +57,72 @@ std::vector<std::string_view> GetAllTermsAdvance( } // Returns a vector containing all terms retrieved by calling ResetAfter with -// the current position to simulate Advancing on the iterator. -std::vector<std::string_view> GetAllTermsResetAfter( +// the UTF-32 position of the current term start to simulate Advancing on the +// iterator. 
+std::vector<std::string_view> GetAllTermsResetAfterUtf32( LanguageSegmenter::Iterator* itr) { std::vector<std::string_view> terms; - if (!itr->ResetToStart().ok()) { - return terms; - } - terms.push_back(itr->GetTerm()); - const char* text_begin = itr->GetTerm().data(); - // Calling ResetToTermStartingAfter with the current position should get the - // very next term in the sequence. - for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok(); - current_pos = itr->GetTerm().data() - text_begin) { + // Calling ResetToTermStartingAfterUtf32 with -1 should get the first term in + // the sequence. + bool is_ok = itr->ResetToTermStartingAfterUtf32(-1).ok(); + while (is_ok) { terms.push_back(itr->GetTerm()); + // Calling ResetToTermStartingAfterUtf32 with the current position should + // get the very next term in the sequence. + CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie(); + is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok(); } return terms; } // Returns a vector containing all terms retrieved by alternating calls to -// Advance and calls to ResetAfter with the current position to simulate -// Advancing. -std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter( +// Advance and calls to ResetAfter with the UTF-32 position of the current term +// start to simulate Advancing. +std::vector<std::string_view> GetAllTermsAdvanceAndResetAfterUtf32( LanguageSegmenter::Iterator* itr) { - const char* text_begin = itr->GetTerm().data(); std::vector<std::string_view> terms; - - bool is_ok = true; - int current_pos = 0; + bool is_ok = itr->Advance(); while (is_ok) { + terms.push_back(itr->GetTerm()); // Alternate between using Advance and ResetToTermAfter. if (terms.size() % 2 == 0) { is_ok = itr->Advance(); } else { - // Calling ResetToTermStartingAfter with the current position should get - // the very next term in the sequence. 
- current_pos = itr->GetTerm().data() - text_begin; - is_ok = itr->ResetToTermStartingAfter(current_pos).ok(); - } - if (is_ok) { - terms.push_back(itr->GetTerm()); + // Calling ResetToTermStartingAfterUtf32 with the current position should + // get the very next term in the sequence. + CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie(); + is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok(); } } return terms; } // Returns a vector containing all terms retrieved by calling ResetBefore with -// the current position, starting at the end of the text. This vector should be -// in reverse order of GetAllTerms and missing the last term. -std::vector<std::string_view> GetAllTermsResetBefore( +// the UTF-32 position of the current term start, starting at the end of the +// text. This vector should be in reverse order of GetAllTerms and missing the +// last term. +std::vector<std::string_view> GetAllTermsResetBeforeUtf32( LanguageSegmenter::Iterator* itr) { - const char* text_begin = itr->GetTerm().data(); - int last_pos = 0; - while (itr->Advance()) { - last_pos = itr->GetTerm().data() - text_begin; - } std::vector<std::string_view> terms; - // Calling ResetToTermEndingBefore with the current position should get the - // previous term in the sequence. - for (int current_pos = last_pos; - itr->ResetToTermEndingBefore(current_pos).ok(); - current_pos = itr->GetTerm().data() - text_begin) { + bool is_ok = itr->ResetToTermEndingBeforeUtf32(1000).ok(); + while (is_ok) { terms.push_back(itr->GetTerm()); + // Calling ResetToTermEndingBeforeUtf32 with the current position should get + // the previous term in the sequence. 
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie(); + is_ok = itr->ResetToTermEndingBeforeUtf32(char_itr.utf32_index()).ok(); } return terms; } +class ReverseJniLanguageSegmenterTest + : public testing::TestWithParam<const char*> { + protected: + static std::string GetLocale() { return GetParam(); } + + std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache(); +}; + } // namespace TEST_P(ReverseJniLanguageSegmenterTest, EmptyText) { @@ -471,7 +474,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, NotCopyStrings) { EXPECT_THAT(word2_address, Eq(word2_result_address)); } -TEST_P(ReverseJniLanguageSegmenterTest, ResetToStartWordConnector) { +TEST_P(ReverseJniLanguageSegmenterTest, ResetToStartUtf32WordConnector) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -479,15 +482,16 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToStartWordConnector) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "com:google:android is package" - // ^ ^^ ^^ - // Bytes: 0 18 19 21 22 - auto position_or = itr->ResetToStart(); + // String: "com:google:android is package" + // ^ ^^ ^^ + // UTF-8 idx: 0 18 19 21 22 + // UTF-32 idx: 0 18 19 21 22 + auto position_or = itr->ResetToStartUtf32(); EXPECT_THAT(position_or, IsOk()); ASSERT_THAT(itr->GetTerm(), Eq("com:google:android")); } -TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStart) { +TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStartUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -495,14 +499,15 @@ TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStart) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 
4 7 8 11 172023 29 35 - EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 + EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("How")); } -TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStart) { +TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStartUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -510,15 +515,17 @@ TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStart) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 ASSERT_TRUE(itr->Advance()); // itr points to 'How' - EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("How")); } -TEST_P(ReverseJniLanguageSegmenterTest, IteratorMultipleAdvancesResetToStart) { +TEST_P(ReverseJniLanguageSegmenterTest, + IteratorMultipleAdvancesResetToStartUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -526,18 +533,19 @@ TEST_P(ReverseJniLanguageSegmenterTest, IteratorMultipleAdvancesResetToStart) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // 
UTF-32 idx: 0 3 4 7 8 11 131415 17 19 ASSERT_TRUE(itr->Advance()); ASSERT_TRUE(itr->Advance()); ASSERT_TRUE(itr->Advance()); ASSERT_TRUE(itr->Advance()); // itr points to ' ' - EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("How")); } -TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStart) { +TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStartUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -545,17 +553,18 @@ TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStart) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 while (itr->Advance()) { // Do nothing. 
} - EXPECT_THAT(itr->ResetToStart(), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("How")); } -TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterWordConnector) { +TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32WordConnector) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -563,21 +572,22 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterWordConnector) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "package com:google:android name" - // ^ ^^ ^^ - // Bytes: 0 7 8 26 27 - auto position_or = itr->ResetToTermStartingAfter(8); + // String: "package com:google:android name" + // ^ ^^ ^^ + // UTF-8 idx: 0 7 8 26 27 + // UTF-32 idx: 0 7 8 26 27 + auto position_or = itr->ResetToTermStartingAfterUtf32(8); EXPECT_THAT(position_or, IsOk()); EXPECT_THAT(position_or.ValueOrDie(), Eq(26)); ASSERT_THAT(itr->GetTerm(), Eq(" ")); - position_or = itr->ResetToTermStartingAfter(7); + position_or = itr->ResetToTermStartingAfterUtf32(7); EXPECT_THAT(position_or, IsOk()); EXPECT_THAT(position_or.ValueOrDie(), Eq(8)); ASSERT_THAT(itr->GetTerm(), Eq("com:google:android")); } -TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) { +TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32OutOfBounds) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -585,19 +595,19 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 - ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + // String: "How are 
you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 + ASSERT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8))); ASSERT_THAT(itr->GetTerm(), Eq("you")); - EXPECT_THAT(itr->ResetToTermStartingAfter(-1), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(itr->GetTerm(), Eq("you")); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(-1), IsOk()); + EXPECT_THAT(itr->GetTerm(), Eq("How")); - EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(21), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(itr->GetTerm(), Eq("you")); + EXPECT_THAT(itr->GetTerm(), Eq("How")); } // Tests that ResetToTermAfter and Advance produce the same output. With the @@ -606,7 +616,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) { // terms produced by ResetToTermAfter calls with the current position // provided as the argument. 
TEST_P(ReverseJniLanguageSegmenterTest, - MixedLanguagesResetToTermAfterEquivalentToAdvance) { + MixedLanguagesResetToTermAfterUtf32EquivalentToAdvance) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -621,14 +631,14 @@ TEST_P(ReverseJniLanguageSegmenterTest, std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kText)); std::vector<std::string_view> reset_terms = - GetAllTermsResetAfter(reset_to_term_itr.get()); + GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); } TEST_P(ReverseJniLanguageSegmenterTest, - ThaiResetToTermAfterEquivalentToAdvance) { + ThaiResetToTermAfterUtf32EquivalentToAdvance) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -643,14 +653,14 @@ TEST_P(ReverseJniLanguageSegmenterTest, std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kThai)); std::vector<std::string_view> reset_terms = - GetAllTermsResetAfter(reset_to_term_itr.get()); + GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); } TEST_P(ReverseJniLanguageSegmenterTest, - KoreanResetToTermAfterEquivalentToAdvance) { + KoreanResetToTermAfterUtf32EquivalentToAdvance) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -665,7 +675,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kKorean)); std::vector<std::string_view> reset_terms = - GetAllTermsResetAfter(reset_to_term_itr.get()); + 
GetAllTermsResetAfterUtf32(reset_to_term_itr.get()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); @@ -676,7 +686,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, // should be able to mix ResetToTermAfter(current_position) calls and Advance // calls to mimic calling Advance. TEST_P(ReverseJniLanguageSegmenterTest, - MixedLanguagesResetToTermAfterInteroperableWithAdvance) { + MixedLanguagesResetToTermAfterUtf32InteroperableWithAdvance) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -691,7 +701,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, segmenter->Segment(kText)); std::vector<std::string_view> advance_and_reset_terms = - GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); EXPECT_THAT(advance_and_reset_terms, testing::ElementsAreArray(advance_terms)); @@ -699,7 +709,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, } TEST_P(ReverseJniLanguageSegmenterTest, - ThaiResetToTermAfterInteroperableWithAdvance) { + ThaiResetToTermAfterUtf32InteroperableWithAdvance) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -714,7 +724,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, segmenter->Segment(kThai)); std::vector<std::string_view> advance_and_reset_terms = - GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); EXPECT_THAT(advance_and_reset_terms, testing::ElementsAreArray(advance_terms)); @@ -722,7 +732,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, } TEST_P(ReverseJniLanguageSegmenterTest, - 
KoreanResetToTermAfterInteroperableWithAdvance) { + KoreanResetToTermAfterUtf32InteroperableWithAdvance) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -737,14 +747,14 @@ TEST_P(ReverseJniLanguageSegmenterTest, std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr, segmenter->Segment(kKorean)); std::vector<std::string_view> advance_and_reset_terms = - GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get()); + GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get()); EXPECT_THAT(advance_and_reset_terms, testing::ElementsAreArray(advance_terms)); EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm())); } -TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfter) { +TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -753,33 +763,35 @@ TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfter) { std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment("How are you你好吗お元気ですか")); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 - EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3))); + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(11))); EXPECT_THAT(itr->GetTerm(), Eq("你好")); - EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8))); EXPECT_THAT(itr->GetTerm(), Eq("you")); - 
EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(18), IsOkAndHolds(Eq(19))); EXPECT_THAT(itr->GetTerm(), Eq("か")); - EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13))); EXPECT_THAT(itr->GetTerm(), Eq("吗")); - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermStartingAfter(35), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); } -TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespacesResetToTermAfter) { +TEST_P(ReverseJniLanguageSegmenterTest, + ContinuousWhitespacesResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -789,35 +801,36 @@ TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespacesResetToTermAfter) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kTextWithSpace)); - // String: "Hello World" - // ^ ^ ^ - // Bytes: 0 5 15 - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5))); + // String: "Hello World" + // ^ ^ ^ + // UTF-8 idx: 0 5 15 + // UTF-32 idx: 0 5 15 + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(15))); EXPECT_THAT(itr->GetTerm(), Eq("World")); - EXPECT_THAT(itr->ResetToTermStartingAfter(5), 
IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(5), IsOkAndHolds(Eq(15))); EXPECT_THAT(itr->GetTerm(), Eq("World")); - EXPECT_THAT(itr->ResetToTermStartingAfter(15), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermStartingAfter(17), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermStartingAfter(19), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); } -TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfter) { +TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -827,21 +840,22 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfter) { constexpr std::string_view kChinese = "我每天走路去上班。"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kChinese)); - // String: "我每天走路去上班。" - // ^ ^ ^ ^^ - // Bytes: 0 3 9 15 18 - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // UTF-8 idx: 0 3 9 15 18 + // UTF-832 idx: 0 1 3 5 6 + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("每天")); - EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("走路")); - EXPECT_THAT(itr->ResetToTermStartingAfter(19), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); } -TEST_P(ReverseJniLanguageSegmenterTest, 
JapaneseResetToTermAfter) { +TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -850,21 +864,22 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfter) { constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kJapanese)); - // String: "私は毎日仕事に歩いています。" - // ^ ^ ^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 6 12 18212427 33 - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3))); + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 6 12 18212427 33 + // UTF-32 idx: 0 1 2 4 6 7 8 9 11 + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("は")); - EXPECT_THAT(itr->ResetToTermStartingAfter(33), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(11), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4))); EXPECT_THAT(itr->GetTerm(), Eq("仕事")); } -TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfter) { +TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -872,21 +887,22 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfter) { constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kKhmer)); - // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" - // ^ ^ ^ ^ - // Bytes: 0 9 24 45 - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // UTF-8 idx: 0 9 24 45 + // UTF-32 idx: 0 3 8 15 + 
EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ")); - EXPECT_THAT(itr->ResetToTermStartingAfter(47), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(8))); EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); } -TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfter) { +TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfterUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -895,24 +911,25 @@ TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfter) { constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kThai)); - // String: "ฉันเดินไปทำงานทุกวัน" - // ^ ^ ^ ^ ^ ^ - // Bytes: 0 9 21 27 42 51 - EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9))); + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 9 21 27 42 51 + // UTF-32 idx: 0 3 7 9 14 17 + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3))); EXPECT_THAT(itr->GetTerm(), Eq("เดิน")); - EXPECT_THAT(itr->ResetToTermStartingAfter(51), + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(7))); EXPECT_THAT(itr->GetTerm(), Eq("ไป")); - EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(14))); EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); } -TEST_P(ReverseJniLanguageSegmenterTest, 
ResetToTermBeforeWordConnector) { +TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeWordConnectorUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -920,21 +937,22 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeWordConnector) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "package name com:google:android!" - // ^ ^^ ^^ ^ - // Bytes: 0 7 8 12 13 31 - auto position_or = itr->ResetToTermEndingBefore(31); + // String: "package name com:google:android!" + // ^ ^^ ^^ ^ + // UTF-8 idx: 0 7 8 12 13 31 + // UTF-32 idx: 0 7 8 12 13 31 + auto position_or = itr->ResetToTermEndingBeforeUtf32(31); EXPECT_THAT(position_or, IsOk()); EXPECT_THAT(position_or.ValueOrDie(), Eq(13)); ASSERT_THAT(itr->GetTerm(), Eq("com:google:android")); - position_or = itr->ResetToTermEndingBefore(21); + position_or = itr->ResetToTermEndingBeforeUtf32(21); EXPECT_THAT(position_or, IsOk()); EXPECT_THAT(position_or.ValueOrDie(), Eq(12)); ASSERT_THAT(itr->GetTerm(), Eq(" ")); } -TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) { +TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBoundsUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -942,19 +960,19 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) { ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, segmenter->Segment(kText)); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 - ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 + ASSERT_THAT(itr->ResetToTermEndingBeforeUtf32(7), 
IsOkAndHolds(Eq(4))); ASSERT_THAT(itr->GetTerm(), Eq("are")); - EXPECT_THAT(itr->ResetToTermEndingBefore(-1), + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(-1), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); EXPECT_THAT(itr->GetTerm(), Eq("are")); - EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(itr->GetTerm(), Eq("are")); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(29), IsOk()); + EXPECT_THAT(itr->GetTerm(), Eq("か")); } // Tests that ResetToTermBefore and Advance produce the same output. With the @@ -963,7 +981,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) { // terms produced by ResetToTermBefore calls with the current position // provided as the argument (after their order has been reversed). TEST_P(ReverseJniLanguageSegmenterTest, - MixedLanguagesResetToTermBeforeEquivalentToAdvance) { + MixedLanguagesResetToTermBeforeEquivalentToAdvanceUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -973,17 +991,12 @@ TEST_P(ReverseJniLanguageSegmenterTest, segmenter->Segment(kText)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); - // Can't produce the last term via calls to ResetToTermBefore. So skip - // past that one. 
- auto itr = advance_terms.begin(); - std::advance(itr, advance_terms.size() - 1); - advance_terms.erase(itr); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kText)); std::vector<std::string_view> reset_terms = - GetAllTermsResetBefore(reset_to_term_itr.get()); + GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); @@ -992,7 +1005,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, } TEST_P(ReverseJniLanguageSegmenterTest, - ThaiResetToTermBeforeEquivalentToAdvance) { + ThaiResetToTermBeforeEquivalentToAdvanceUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -1002,17 +1015,12 @@ TEST_P(ReverseJniLanguageSegmenterTest, segmenter->Segment(kThai)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); - // Can't produce the last term via calls to ResetToTermBefore. So skip - // past that one. 
- auto itr = advance_terms.begin(); - std::advance(itr, advance_terms.size() - 1); - advance_terms.erase(itr); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kThai)); std::vector<std::string_view> reset_terms = - GetAllTermsResetBefore(reset_to_term_itr.get()); + GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); @@ -1020,7 +1028,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, } TEST_P(ReverseJniLanguageSegmenterTest, - KoreanResetToTermBeforeEquivalentToAdvance) { + KoreanResetToTermBeforeEquivalentToAdvanceUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto segmenter, language_segmenter_factory::Create( GetSegmenterOptions(GetLocale(), jni_cache_.get()))); @@ -1030,24 +1038,19 @@ TEST_P(ReverseJniLanguageSegmenterTest, segmenter->Segment(kKorean)); std::vector<std::string_view> advance_terms = GetAllTermsAdvance(advance_itr.get()); - // Can't produce the last term via calls to ResetToTermBefore. So skip - // past that one. 
- auto itr = advance_terms.begin(); - std::advance(itr, advance_terms.size() - 1); - advance_terms.erase(itr); ICING_ASSERT_OK_AND_ASSIGN( std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr, segmenter->Segment(kKorean)); std::vector<std::string_view> reset_terms = - GetAllTermsResetBefore(reset_to_term_itr.get()); + GetAllTermsResetBeforeUtf32(reset_to_term_itr.get()); std::reverse(reset_terms.begin(), reset_terms.end()); EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms)); EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm())); } -TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBefore) { +TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBeforeUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -1056,35 +1059,36 @@ TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBefore) { std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment("How are you你好吗お元気ですか")); - // String: "How are you你好吗お元気ですか" - // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 4 7 8 11 172023 29 35 - EXPECT_THAT(itr->ResetToTermEndingBefore(2), + // String: "How are you你好吗お元気ですか" + // ^ ^^ ^^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 4 7 8 11 172023 29 35 + // UTF-32 idx: 0 3 4 7 8 11 131415 17 19 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(7))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4))); EXPECT_THAT(itr->GetTerm(), Eq("are")); - EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(18), IsOkAndHolds(Eq(15))); 
EXPECT_THAT(itr->GetTerm(), Eq("元気")); - EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(12), IsOkAndHolds(Eq(8))); EXPECT_THAT(itr->GetTerm(), Eq("you")); - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(17))); EXPECT_THAT(itr->GetTerm(), Eq("です")); } TEST_P(ReverseJniLanguageSegmenterTest, - ContinuousWhitespacesResetToTermBefore) { + ContinuousWhitespacesResetToTermBeforeUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -1094,34 +1098,35 @@ TEST_P(ReverseJniLanguageSegmenterTest, ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kTextWithSpace)); - // String: "Hello World" - // ^ ^ ^ - // Bytes: 0 5 15 - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + // String: "Hello World" + // ^ ^ ^ + // UTF-8 idx: 0 5 15 + // UTF-32 idx: 0 5 15 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(2), + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("Hello")); - EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("Hello")); - EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5))); + 
EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(15), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); - EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq(" ")); } -TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBefore) { +TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBeforeUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -1131,21 +1136,22 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBefore) { constexpr std::string_view kChinese = "我每天走路去上班。"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kChinese)); - // String: "我每天走路去上班。" - // ^ ^ ^ ^^ - // Bytes: 0 3 9 15 18 - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + // String: "我每天走路去上班。" + // ^ ^ ^ ^^ + // UTF-8 idx: 0 3 9 15 18 + // UTF-32 idx: 0 1 3 5 6 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("我")); - EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(5))); EXPECT_THAT(itr->GetTerm(), Eq("去")); } -TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBefore) { +TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBeforeUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -1154,21 +1160,22 @@ TEST_P(ReverseJniLanguageSegmenterTest, 
JapaneseResetToTermBefore) { constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kJapanese)); - // String: "私は毎日仕事に歩いています。" - // ^ ^ ^ ^ ^ ^ ^ ^ ^ - // Bytes: 0 3 6 12 18212427 33 - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + // String: "私は毎日仕事に歩いています。" + // ^ ^ ^ ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 3 6 12 18212427 33 + // UTF-32 idx: 0 1 2 4 6 7 8 9 11 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(9))); EXPECT_THAT(itr->GetTerm(), Eq("てい")); - EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(3), IsOkAndHolds(Eq(1))); EXPECT_THAT(itr->GetTerm(), Eq("は")); } -TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBefore) { +TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBeforeUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -1176,21 +1183,22 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBefore) { constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kKhmer)); - // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" - // ^ ^ ^ ^ - // Bytes: 0 9 24 45 - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។" + // ^ ^ ^ ^ + // UTF-8 idx: 0 9 24 45 + // UTF-32 idx: 0 3 8 15 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(16), IsOkAndHolds(Eq(8))); 
EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ")); - EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("ញុំ")); } -TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) { +TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBeforeUtf32) { ICING_ASSERT_OK_AND_ASSIGN( auto language_segmenter, language_segmenter_factory::Create( @@ -1199,20 +1207,21 @@ TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) { constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน"; ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr, language_segmenter->Segment(kThai)); - // String: "ฉันเดินไปทำงานทุกวัน" - // ^ ^ ^ ^ ^ ^ - // Bytes: 0 9 21 27 42 51 - EXPECT_THAT(itr->ResetToTermEndingBefore(0), + // String: "ฉันเดินไปทำงานทุกวัน" + // ^ ^ ^ ^ ^ ^ + // UTF-8 idx: 0 9 21 27 42 51 + // UTF-32 idx: 0 3 7 9 14 17 + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0), StatusIs(libtextclassifier3::StatusCode::NOT_FOUND)); EXPECT_THAT(itr->GetTerm(), IsEmpty()); - EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(14))); EXPECT_THAT(itr->GetTerm(), Eq("ทุก")); - EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(4), IsOkAndHolds(Eq(0))); EXPECT_THAT(itr->GetTerm(), Eq("ฉัน")); - EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21))); + EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(7))); EXPECT_THAT(itr->GetTerm(), Eq("ไป")); } diff --git a/icing/tokenization/simple/space-language-segmenter-factory.cc b/icing/tokenization/simple/space-language-segmenter-factory.cc deleted file mode 100644 index 856ba0a..0000000 --- a/icing/tokenization/simple/space-language-segmenter-factory.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// 
Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/tokenization/language-segmenter-factory.h" -#include "icing/tokenization/simple/space-language-segmenter.h" -#include "icing/util/logging.h" - -namespace icing { -namespace lib { - -namespace language_segmenter_factory { - -// Creates a language segmenter with the given locale. -// -// Returns: -// A LanguageSegmenter on success -// INVALID_ARGUMENT if locale string is invalid -// -// TODO(b/156383798): Figure out if we want to verify locale strings and notify -// users. Right now illegal locale strings will be ignored by ICU. ICU -// components will be created with its default locale. -libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create( - SegmenterOptions) { - return std::make_unique<SpaceLanguageSegmenter>(); -} - -} // namespace language_segmenter_factory - -} // namespace lib -} // namespace icing diff --git a/icing/tokenization/simple/space-language-segmenter.cc b/icing/tokenization/simple/space-language-segmenter.cc deleted file mode 100644 index 7e301ec..0000000 --- a/icing/tokenization/simple/space-language-segmenter.cc +++ /dev/null @@ -1,205 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "icing/tokenization/simple/space-language-segmenter.h" - -#include <cstdint> -#include <memory> -#include <string> -#include <string_view> -#include <utility> -#include <vector> - -#include "icing/text_classifier/lib3/utils/base/status.h" -#include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/absl_ports/canonical_errors.h" -#include "icing/legacy/core/icing-string-util.h" -#include "icing/util/status-macros.h" - -namespace icing { -namespace lib { - -namespace { -constexpr char kASCIISpace = ' '; -} // namespace - -class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator { - public: - SpaceLanguageSegmenterIterator(std::string_view text) - : text_(text), term_start_index_(0), term_end_index_exclusive_(0) {} - - // Advances to the next term. Returns false if it has reached the end. - bool Advance() override { - if (term_end_index_exclusive_ >= text_.size() || - term_start_index_ >= text_.size()) { - // Reached the end - return false; - } - - // Next term starts where we left off. - term_start_index_ = term_end_index_exclusive_; - - // We know a term is at least one length, so we can +1 first. - term_end_index_exclusive_++; - - // We alternate terms between space and non-space. Figure out what type of - // term we're currently on so we know how to stop. 
- bool is_space = text_[term_start_index_] == kASCIISpace; - - while (term_end_index_exclusive_ < text_.size()) { - bool end_is_space = text_[term_end_index_exclusive_] == kASCIISpace; - if (is_space != end_is_space) { - // We finally see a different type of character, reached the end. - break; - } - // We're still seeing the same types of characters (saw a space and - // still seeing spaces, or saw a non-space and still seeing non-spaces). - // Haven't reached the next term yet, keep advancing. - term_end_index_exclusive_++; - } - - return true; - } - - // Returns the current term. It can be called only when Advance() returns - // true. - std::string_view GetTerm() const override { - if (text_[term_start_index_] == kASCIISpace) { - // Rule: multiple continuous whitespaces are treated as one. - return std::string_view(&text_[term_start_index_], 1); - } - return text_.substr(term_start_index_, - term_end_index_exclusive_ - term_start_index_); - } - - libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter( - int32_t offset) override { - if (offset < 0) { - // Start over from the beginning to find the first term. - term_start_index_ = 0; - term_end_index_exclusive_ = 0; - } else { - // Offset points to a term right now. Advance to get past the current - // term. - term_end_index_exclusive_ = offset; - if (!Advance()) { - return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( - "No term found in '%s' that starts after offset %d", - std::string(text_).c_str(), offset)); - } - } - - // Advance again so we can point to the next term. 
- if (!Advance()) { - return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( - "No term found in '%s' that starts after offset %d", - std::string(text_).c_str(), offset)); - } - - return term_start_index_; - } - - libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore( - int32_t offset) override { - if (offset <= 0 || offset > text_.size()) { - return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( - "No term found in '%s' that ends before offset %d", - std::string(text_).c_str(), offset)); - } - - if (offset == text_.size()) { - // Special-case if the offset is the text length, this is the last term in - // the text, which is also considered to be "ending before" the offset. - term_end_index_exclusive_ = offset; - ICING_ASSIGN_OR_RETURN(term_start_index_, GetTermStartingBefore(offset)); - return term_start_index_; - } - - // Otherwise, this is just the end of the previous term and we still need to - // find the start of the previous term. - ICING_ASSIGN_OR_RETURN(term_end_index_exclusive_, - GetTermStartingBefore(offset)); - - if (term_end_index_exclusive_ == 0) { - // The current term starts at the beginning of the underlying text_. - // There is no term before this. - return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( - "No term found in '%s' that ends before offset %d", - std::string(text_).c_str(), offset)); - } - - // Reset ourselves to find the term before the end. - ICING_ASSIGN_OR_RETURN( - term_start_index_, - GetTermStartingBefore(term_end_index_exclusive_ - 1)); - return term_start_index_; - } - - libtextclassifier3::StatusOr<int32_t> ResetToStart() override { - term_start_index_ = 0; - term_end_index_exclusive_ = 0; - if (!Advance()) { - return absl_ports::NotFoundError(""); - } - return term_start_index_; - } - - private: - // Return the start offset of the term starting right before the given offset. 
- libtextclassifier3::StatusOr<int32_t> GetTermStartingBefore(int32_t offset) { - bool is_space = text_[offset] == kASCIISpace; - - // Special-case that if offset was the text length, then we're already at - // the "end" of our current term. - if (offset == text_.size()) { - is_space = text_[--offset] == kASCIISpace; - } - - // While it's the same type of character (space vs non-space), we're in the - // same term. So keep iterating backwards until we see a change. - while (offset >= 0 && (text_[offset] == kASCIISpace) == is_space) { - --offset; - } - - // +1 is because offset was off-by-one to exit the while-loop. - return ++offset; - } - - // Text to be segmented - std::string_view text_; - - // The start and end indices are used to track the positions of current - // term. - int term_start_index_; - int term_end_index_exclusive_; -}; - -libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> -SpaceLanguageSegmenter::Segment(const std::string_view text) const { - return std::make_unique<SpaceLanguageSegmenterIterator>(text); -} - -libtextclassifier3::StatusOr<std::vector<std::string_view>> -SpaceLanguageSegmenter::GetAllTerms(const std::string_view text) const { - ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator, - Segment(text)); - std::vector<std::string_view> terms; - while (iterator->Advance()) { - terms.push_back(iterator->GetTerm()); - } - return terms; -} - -} // namespace lib -} // namespace icing diff --git a/icing/tokenization/simple/space-language-segmenter.h b/icing/tokenization/simple/space-language-segmenter.h deleted file mode 100644 index de0a6d3..0000000 --- a/icing/tokenization/simple/space-language-segmenter.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_ -#define ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_ - -#include <cstdint> -#include <memory> -#include <string> -#include <string_view> -#include <vector> - -#include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/tokenization/language-segmenter.h" - -namespace icing { -namespace lib { - -// Simple segmenter that splits on spaces, regardless of language. Continuous -// whitespaces will be returned as a single whitespace character. -class SpaceLanguageSegmenter : public LanguageSegmenter { - public: - SpaceLanguageSegmenter() = default; - SpaceLanguageSegmenter(const SpaceLanguageSegmenter&) = delete; - SpaceLanguageSegmenter& operator=(const SpaceLanguageSegmenter&) = delete; - - // Segmentation is based purely on whitespace; does not take into account the - // language of the text. - // - // Returns: - // An iterator of terms on success - libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>> - Segment(std::string_view text) const override; - - // Does not take into account the language of the text. 
- // - // Returns: - // A list of terms on success - // INTERNAL_ERROR if any error occurs - libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms( - std::string_view text) const override; -}; - -} // namespace lib -} // namespace icing - -#endif // ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_ diff --git a/icing/tokenization/simple/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc deleted file mode 100644 index 6c5e3f6..0000000 --- a/icing/tokenization/simple/space-language-segmenter_test.cc +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "icing/absl_ports/str_cat.h" -#include "icing/testing/common-matchers.h" -#include "icing/tokenization/language-segmenter-factory.h" -#include "icing/tokenization/language-segmenter.h" -#include "unicode/uloc.h" - -namespace icing { -namespace lib { -namespace { - -using ::testing::ElementsAre; -using ::testing::Eq; -using ::testing::IsEmpty; - -TEST(SpaceLanguageSegmenterTest, EmptyText) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(std::move(options))); - EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty())); -} - -TEST(SpaceLanguageSegmenterTest, SimpleText) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(std::move(options))); - EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"), - IsOkAndHolds(ElementsAre("Hello", " ", "World"))); -} - -TEST(SpaceLanguageSegmenterTest, Punctuation) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(std::move(options))); - - EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"), - IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!"))); - EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"), - IsOkAndHolds(ElementsAre("Open-source", " ", "project"))); - EXPECT_THAT(language_segmenter->GetAllTerms("100%"), - IsOkAndHolds(ElementsAre("100%"))); - EXPECT_THAT(language_segmenter->GetAllTerms("(A&B)"), - IsOkAndHolds(ElementsAre("(A&B)"))); -} - -TEST(SpaceLanguageSegmenterTest, Alphanumeric) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(std::move(options))); - - // Alphanumeric 
terms are allowed - EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"), - IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a"))); -} - -TEST(SpaceLanguageSegmenterTest, Number) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(std::move(options))); - - // Alphanumeric terms are allowed - EXPECT_THAT( - language_segmenter->GetAllTerms("3.141592653589793238462643383279"), - IsOkAndHolds(ElementsAre("3.141592653589793238462643383279"))); - - EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"), - IsOkAndHolds(ElementsAre("3,456.789"))); - - EXPECT_THAT(language_segmenter->GetAllTerms("-123"), - IsOkAndHolds(ElementsAre("-123"))); -} - -TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(std::move(options))); - - // Multiple continuous whitespaces are treated as one. 
- const int kNumSeparators = 256; - const std::string text_with_spaces = - absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World"); - EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces), - IsOkAndHolds(ElementsAre("Hello", " ", "World"))); -} - -TEST(SpaceLanguageSegmenterTest, NotCopyStrings) { - language_segmenter_factory::SegmenterOptions options(ULOC_US); - ICING_ASSERT_OK_AND_ASSIGN( - auto language_segmenter, - language_segmenter_factory::Create(std::move(options))); - // Validates that the input strings are not copied - const std::string text = "Hello World"; - const char* word1_address = text.c_str(); - const char* word2_address = text.c_str() + 6; - ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms, - language_segmenter->GetAllTerms(text)); - ASSERT_THAT(terms, ElementsAre("Hello", " ", "World")); - const char* word1_result_address = terms.at(0).data(); - const char* word2_result_address = terms.at(2).data(); - - // The underlying char* should be the same - EXPECT_THAT(word1_address, Eq(word1_result_address)); - EXPECT_THAT(word2_address, Eq(word2_result_address)); -} - -} // namespace -} // namespace lib -} // namespace icing diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h index 38c4745..b4f0c6e 100644 --- a/icing/tokenization/tokenizer.h +++ b/icing/tokenization/tokenizer.h @@ -20,7 +20,9 @@ #include <string_view> #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" #include "icing/tokenization/token.h" +#include "icing/util/character-iterator.h" namespace icing { namespace lib { @@ -64,6 +66,18 @@ class Tokenizer { // true, otherwise an invalid token could be returned. 
virtual Token GetToken() const = 0; + virtual libtextclassifier3::StatusOr<CharacterIterator> + CalculateTokenStart() { + return absl_ports::UnimplementedError( + "CalculateTokenStart is not implemented!"); + } + + virtual libtextclassifier3::StatusOr<CharacterIterator> + CalculateTokenEndExclusive() { + return absl_ports::UnimplementedError( + "CalculateTokenEndExclusive is not implemented!"); + } + // Sets the tokenizer to point at the first token that *starts* *after* // offset. Returns false if there are no valid tokens starting after // offset. diff --git a/icing/tools/document-store-dump.cc b/icing/tools/document-store-dump.cc deleted file mode 100644 index 45c9bf5..0000000 --- a/icing/tools/document-store-dump.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "icing/tools/document-store-dump.h" - -#include <cinttypes> - -#include "icing/absl_ports/str_cat.h" -#include "icing/legacy/core/icing-string-util.h" -#include "icing/util/logging.h" - -namespace icing { -namespace lib { -namespace { - -void AppendDocumentProto(DocId document_id, const Document& doc, - std::string* output) { - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf( - "Document {\n document_id: %d\n corpus_id: %d\n uri: " - "'%s'\n score: %d\n created_timestamp_ms: %" PRIu64 "\n", - static_cast<int>(document_id), doc.corpus_id(), - doc.uri().c_str(), static_cast<int>(doc.score()), - static_cast<int64_t>(doc.created_timestamp_ms()))); - for (const auto& section : doc.sections()) { - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf( - " section {\n id: %d\n indexed_length: " - "%d\n content: '%s'\n snippet: '%s'\n", - static_cast<int>(section.id()), - static_cast<int>(section.indexed_length()), - section.content().c_str(), section.snippet().c_str())); - for (int64_t extracted_number : section.extracted_numbers()) { - absl_ports::StrAppend(output, IcingStringUtil::StringPrintf( - " extracted_numbers: %" PRId64 "\n", - extracted_number)); - } - for (const std::string& annotation_token : section.annotation_tokens()) { - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf(" annotation_tokens: '%s'\n", - annotation_token.c_str())); - } - std::string indexed = (section.config().indexed()) ? "true" : "false"; - std::string index_prefixes = - (section.config().index_prefixes()) ? 
"true" : "false"; - absl_ports::StrAppend( - output, - IcingStringUtil::StringPrintf( - " config {\n name: '%s'\n indexed: %s\n " - "tokenizer: %d\n weight: %d\n index_prefixes: %s\n " - "subsection_separator: '%s'\n", - section.config().name().c_str(), indexed.c_str(), - section.config().tokenizer(), - static_cast<int>(section.config().weight()), index_prefixes.c_str(), - section.config().subsection_separator().c_str())); - for (const auto& variant_generator : - section.config().variant_generators()) { - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf( - " variant_generators: %d\n", variant_generator)); - } - absl_ports::StrAppend( - output, - IcingStringUtil::StringPrintf( - " common_term_legacy_hit_score: %d\n " - "rfc822_host_name_term_legacy_hit_score: %d\n " - "semantic_property: '%s'\n universal_section_id: %d\n " - "omnibox_section_type: %d\n st_section_type: %d\n }\n }\n", - section.config().common_term_legacy_hit_score(), - section.config().rfc822_host_name_term_legacy_hit_score(), - section.config().semantic_property().c_str(), - section.config().universal_section_id(), - section.config().omnibox_section_type(), - section.config().st_section_type())); - } - for (const auto& language : doc.languages()) { - std::string used_classifier = - (language.used_classifier()) ? 
"true" : "false"; - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf( - " languages {\n language: %d\n score: %d\n " - "used_classifier: %s\n }\n", - language.language(), static_cast<int>(language.score()), - used_classifier.c_str())); - } - absl_ports::StrAppend( - output, IcingStringUtil::StringPrintf( - " ANNOTATIONS PRINTING NOT IMPLEMENTED YET IN ICING-TOOL\n")); -} - -} // namespace - -std::string GetDocumentStoreDump(const DocumentStore& document_store) { - std::string output; - for (DocId document_id = 0; document_id < document_store.num_documents(); - document_id++) { - Document doc; - if (!document_store.ReadDocument(document_id, &doc)) { - ICING_LOG(FATAL) << "Failed to read document"; - } - - AppendDocumentProto(document_id, doc, &output); - } - return output; -} - -} // namespace lib -} // namespace icing diff --git a/icing/tools/document-store-dump.h b/icing/tools/document-store-dump.h deleted file mode 100644 index 023b301..0000000 --- a/icing/tools/document-store-dump.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef ICING_TOOLS_DOCUMENT_STORE_DUMP_H_ -#define ICING_TOOLS_DOCUMENT_STORE_DUMP_H_ - -#include <string> - -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h" - -namespace icing { -namespace lib { - -// Utility function for dumping the complete document store content. 
-// This provides a human-readable representation of the document store, mainly -// provided for easier understandability for developers. -// The output of this class should only be available on cmdline-tool-level -// (with root access), or unit tests. In other words it should not be possible -// to trigger this on a release key device, for data protection reasons. -std::string GetDocumentStoreDump(const DocumentStore& document_store); - -} // namespace lib -} // namespace icing -#endif // ICING_TOOLS_DOCUMENT_STORE_DUMP_H_ diff --git a/icing/tools/icing-tool.cc b/icing/tools/icing-tool.cc deleted file mode 100644 index 72a11e9..0000000 --- a/icing/tools/icing-tool.cc +++ /dev/null @@ -1,306 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright 2012 Google Inc. All Rights Reserved. -// Author: ulas@google.com (Ulas Kirazci) -// -// A tool to debug the native index. 
- -#include <getopt.h> -#include <unistd.h> - -#include <string> - -#include "java/com/google/android/gmscore/integ/modules/icing/jni/core/string-util.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/doc-property-filter.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/dynamic-trie.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/filesystem.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/mobstore.h" -#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/native-index-impl.h" -#include "icing/absl_ports/str_cat.h" -#include "icing/legacy/core/icing-string-util.h" -#include "icing/tools/document-store-dump.h" -#include "icing/util/logging.h" - -using std::vector; -using ::wireless_android_play_playlog::icing::IndexRestorationStats; - -namespace icing { -namespace lib { - -// 256KB for debugging. -const size_t kMaxDocumentSizeForDebugging = 1u << 18; -// Dump dynamic trie stats and contents. 
-void ProcessDynamicTrie(const char* filename) { - Filesystem filesystem; - DynamicTrie trie(filename, DynamicTrie::RuntimeOptions(), &filesystem); - if (!trie.Init()) { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Opening trie %s failed", - filename); - return; - } - - std::string out; - trie.GetDebugInfo(true, &out); - printf("Stats:\n%s", out.c_str()); - - std::ostringstream contents; - vector<std::string> keys; - trie.DumpTrie(&contents, &keys); - printf("Contents:\n%s", contents.str().c_str()); -} - -NativeIndexImpl* MakeIndex(const char* root_dir) { - NativeConfig native_config; - native_config.set_max_document_size(kMaxDocumentSizeForDebugging); - FlashIndexOptions flash_index_options( - NativeIndexImpl::GetNativeIndexDir(root_dir)); - NativeIndexImpl* ni = - new NativeIndexImpl(root_dir, native_config, flash_index_options); - InitStatus init_status; - if (!ni->Init(&init_status)) { - ICING_LOG(FATAL) << "Failed to initialize legacy native index impl"; - } - - IndexRestorationStats unused; - ni->RestoreIndex(IndexRequestSpec::default_instance(), &unused); - return ni; -} - -void RunQuery(NativeIndexImpl* ni, const std::string& query, int start, - int num_results) { - // Pull out corpusids and uris. 
- QueryRequestSpec spec; - spec.set_no_corpus_filter(true); - spec.set_want_uris(true); - spec.set_scoring_verbosity_level(1); - spec.set_prefix_match(true); - - QueryResponse response; - ni->ExecuteQuery(query, spec, 10000, start, num_results, &response); - - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Query [%s] num results %u", query.c_str(), response.num_results()); - - for (int i = 0, uri_offset = 0; i < response.num_results(); i++) { - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "%d: (cid=%u) uri %.*s", i, response.corpus_ids(i), - response.uri_lengths(i), response.uri_buffer().data() + uri_offset); - uri_offset += response.uri_lengths(i); - } -} - -void RunSuggest(NativeIndexImpl* ni, const std::string& prefix, - int num_results) { - SuggestionResponse results; - ni->Suggest(prefix, num_results, vector<CorpusId>(), &results); - - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Query [%s] num results %zu", prefix.c_str(), - static_cast<size_t>(results.suggestions_size())); - - for (size_t i = 0; i < results.suggestions_size(); i++) { - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Sugg: [%s] display text [%s]", results.suggestions(i).query().c_str(), - results.suggestions(i).display_text().c_str()); - } -} - -int IcingTool(int argc, char** argv) { - auto file_storage = CreatePosixFileStorage(); - enum Options { - OPT_FILENAME, - OPT_OP, - OPT_QUERY, - NUM_OPT, - }; - static const option kOptions[NUM_OPT + 1] = { - {"filename", 1, nullptr, 0}, - {"op", 1, nullptr, 0}, - {"query", 1, nullptr, 0}, - {nullptr, 0, nullptr, 0}, - }; - const char* opt_values[NUM_OPT]; - memset(opt_values, 0, sizeof(opt_values)); - - while (true) { - int opt_idx = -1; - int ret = getopt_long(argc, argv, "", kOptions, &opt_idx); - if (ret != 0) break; - - if (opt_idx >= 0 && opt_idx < NUM_OPT) { - opt_values[opt_idx] = optarg; - } - } - - if (!opt_values[OPT_OP]) { - ICING_LOG(ERROR) << "No op specified"; - return -1; - } - - if (!opt_values[OPT_FILENAME]) { - 
ICING_LOG(ERROR) << "No filename specified"; - return -1; - } - if (!strncmp( - opt_values[OPT_FILENAME], - "/data/data/com.google.android.gms/files/AppDataSearch", - strlen("/data/data/com.google.android.gms/files/AppDataSearch"))) { - ICING_LOG(ERROR) - << "Should not read directly from the file in gmscore - " - "icing-tool also commits writes as side-effects which corrupts " - "the index on concurrent modification"; - return -1; - } - - const char* op = opt_values[OPT_OP]; - DocumentStore::Options options(file_storage.get(), - kMaxDocumentSizeForDebugging); - if (!strcmp(op, "dyntrie")) { - std::string full_file_path = - absl_ports::StrCat(opt_values[OPT_FILENAME], "/idx.lexicon"); - ProcessDynamicTrie(full_file_path.c_str()); - } else if (!strcmp(op, "verify")) { - std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME])); - ni->CheckVerify(); - } else if (!strcmp(op, "query")) { - if (opt_values[OPT_QUERY] == nullptr) { - ICING_LOG(FATAL) << "Opt value is null"; - } - - std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME])); - RunQuery(ni.get(), opt_values[OPT_QUERY], 0, 100); - } else if (!strcmp(op, "suggest")) { - if (opt_values[OPT_QUERY] == nullptr) { - ICING_LOG(FATAL) << "Opt value is null"; - } - - std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME])); - RunSuggest(ni.get(), opt_values[OPT_QUERY], 100); - } else if (!strcmp(op, "dump-all-docs")) { - DocumentStore ds(opt_values[OPT_FILENAME], options); - if (!ds.Init()) { - ICING_LOG(FATAL) << "Legacy document store failed to initialize"; - } - - printf( - "------ Document Store Dump Start ------\n" - "%s\n" - "------ Document Store Dump End ------\n", - GetDocumentStoreDump(ds).c_str()); - } else if (!strcmp(op, "dump-uris")) { - CorpusId corpus_id = kInvalidCorpusId; - if (opt_values[OPT_QUERY]) { - // Query is corpus id. 
- corpus_id = atoi(opt_values[OPT_QUERY]); // NOLINT - } - DocumentStore ds(opt_values[OPT_FILENAME], options); - if (!ds.Init()) { - ICING_LOG(FATAL) << "Legacy document store failed to initialize"; - } - - DocPropertyFilter dpf; - ds.AddDeletedTagFilter(&dpf); - - // Dump with format "<corpusid> <uri> <tagname>*". - int filtered = 0; - vector<std::string> tagnames; - for (DocId document_id = 0; document_id < ds.num_documents(); - document_id++) { - Document doc; - if (!ds.ReadDocument(document_id, &doc)) { - ICING_LOG(FATAL) << "Failed to read document."; - } - - if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) { - filtered++; - continue; - } - if (dpf.Match(0, document_id)) { - filtered++; - continue; - } - - tagnames.clear(); - ds.GetAllSetUserTagNames(document_id, &tagnames); - - printf("%d %s %s\n", doc.corpus_id(), doc.uri().c_str(), - StringUtil::JoinStrings("/", tagnames).c_str()); - } - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Processed %u filtered %d", ds.num_documents(), filtered); - } else if (!strcmp(op, "dump-docs")) { - std::string out_filename = opt_values[OPT_FILENAME]; - out_filename.append("/docs-dump"); - CorpusId corpus_id = kInvalidCorpusId; - if (opt_values[OPT_QUERY]) { - // Query is corpus id. - corpus_id = atoi(opt_values[OPT_QUERY]); // NOLINT - out_filename.push_back('.'); - out_filename.append(opt_values[OPT_QUERY]); - } - DocumentStore ds(opt_values[OPT_FILENAME], options); - if (!ds.Init()) { - ICING_LOG(FATAL) << "Legacy document store failed to initialize"; - } - - DocPropertyFilter dpf; - ds.AddDeletedTagFilter(&dpf); - - // Dump with format (<32-bit length><serialized content>)*. 
- FILE* fp = fopen(out_filename.c_str(), "w"); - int filtered = 0; - for (DocId document_id = 0; document_id < ds.num_documents(); - document_id++) { - Document doc; - if (!ds.ReadDocument(document_id, &doc)) { - ICING_LOG(FATAL) << "Failed to read document."; - } - - if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) { - filtered++; - continue; - } - if (dpf.Match(0, document_id)) { - filtered++; - continue; - } - - std::string serialized = doc.SerializeAsString(); - uint32_t length = serialized.size(); - if (fwrite(&length, 1, sizeof(length), fp) != sizeof(length)) { - ICING_LOG(FATAL) << "Failed to write length information to file"; - } - - if (fwrite(serialized.data(), 1, serialized.size(), fp) != - serialized.size()) { - ICING_LOG(FATAL) << "Failed to write document to file"; - } - } - ICING_VLOG(1) << IcingStringUtil::StringPrintf( - "Processed %u filtered %d", ds.num_documents(), filtered); - fclose(fp); - } else { - ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unknown op %s", op); - return -1; - } - - return 0; -} - -} // namespace lib -} // namespace icing - -int main(int argc, char** argv) { return icing::lib::IcingTool(argc, argv); } diff --git a/icing/transform/icu/icu-normalizer-factory.cc b/icing/transform/icu/icu-normalizer-factory.cc index 493aeb5..9951325 100644 --- a/icing/transform/icu/icu-normalizer-factory.cc +++ b/icing/transform/icu/icu-normalizer-factory.cc @@ -44,6 +44,8 @@ libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( return IcuNormalizer::Create(max_term_byte_size); } +std::string_view GetNormalizerName() { return IcuNormalizer::kName; } + } // namespace normalizer_factory } // namespace lib diff --git a/icing/transform/icu/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h index f20a9fb..4442f3b 100644 --- a/icing/transform/icu/icu-normalizer.h +++ b/icing/transform/icu/icu-normalizer.h @@ -39,6 +39,8 @@ namespace lib { // details. 
class IcuNormalizer : public Normalizer { public: + static constexpr std::string_view kName = "IcuNormalizer"; + // Creates a normalizer with the subcomponents it needs. max_term_byte_size // enforces the max size of text after normalization, text will be truncated // if exceeds the max size. diff --git a/icing/transform/map/map-normalizer-factory.cc b/icing/transform/map/map-normalizer-factory.cc index 3bf84b3..286b8f6 100644 --- a/icing/transform/map/map-normalizer-factory.cc +++ b/icing/transform/map/map-normalizer-factory.cc @@ -42,6 +42,8 @@ libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( return std::make_unique<MapNormalizer>(max_term_byte_size); } +std::string_view GetNormalizerName() { return MapNormalizer::kName; } + } // namespace normalizer_factory } // namespace lib diff --git a/icing/transform/map/map-normalizer.cc b/icing/transform/map/map-normalizer.cc index c888551..4ad5dec 100644 --- a/icing/transform/map/map-normalizer.cc +++ b/icing/transform/map/map-normalizer.cc @@ -23,6 +23,7 @@ #include "icing/absl_ports/str_cat.h" #include "icing/transform/map/normalization-map.h" +#include "icing/util/character-iterator.h" #include "icing/util/i18n-utils.h" #include "icing/util/logging.h" #include "unicode/utypes.h" @@ -30,48 +31,64 @@ namespace icing { namespace lib { +namespace { + +UChar32 NormalizeChar(UChar32 c) { + if (i18n_utils::GetUtf16Length(c) > 1) { + // All the characters we need to normalize can be encoded into a + // single char16_t. If this character needs more than 1 char16_t code + // unit, we can skip normalization and append it directly. + return c; + } + + // The original character can be encoded into a single char16_t. + const std::unordered_map<char16_t, char16_t>& normalization_map = + GetNormalizationMap(); + auto iterator = normalization_map.find(static_cast<char16_t>(c)); + if (iterator == normalization_map.end()) { + // Normalization mapping not found, append the original character. 
+ return c; + } + + // Found a normalization mapping. The normalized character (stored in a + // char16_t) can have 1 or 2 bytes. + if (i18n_utils::IsAscii(iterator->second)) { + // The normalized character has 1 byte. It may be an upper-case char. + // Lower-case it before returning it. + return std::tolower(static_cast<char>(iterator->second)); + } else { + return iterator->second; + } +} + +} // namespace + std::string MapNormalizer::NormalizeTerm(std::string_view term) const { std::string normalized_text; normalized_text.reserve(term.length()); - for (int i = 0; i < term.length(); ++i) { - if (i18n_utils::IsAscii(term[i])) { - // The original character has 1 byte. - normalized_text.push_back(std::tolower(term[i])); - } else if (i18n_utils::IsLeadUtf8Byte(term[i])) { - UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i); + int current_pos = 0; + while (current_pos < term.length()) { + if (i18n_utils::IsAscii(term[current_pos])) { + normalized_text.push_back(std::tolower(term[current_pos])); + ++current_pos; + } else { + UChar32 uchar32 = + i18n_utils::GetUChar32At(term.data(), term.length(), current_pos); if (uchar32 == i18n_utils::kInvalidUChar32) { ICING_LOG(WARNING) << "Unable to get uchar32 from " << term - << " at position" << i; - continue; - } - int utf8_length = i18n_utils::GetUtf8Length(uchar32); - if (i18n_utils::GetUtf16Length(uchar32) > 1) { - // All the characters we need to normalize can be encoded into a - // single char16_t. If this character needs more than 1 char16_t code - // unit, we can skip normalization and append it directly. - absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length)); + << " at position" << current_pos; + ++current_pos; continue; } - // The original character can be encoded into a single char16_t. 
- const std::unordered_map<char16_t, char16_t>& normalization_map = - GetNormalizationMap(); - auto iterator = normalization_map.find(static_cast<char16_t>(uchar32)); - if (iterator != normalization_map.end()) { - // Found a normalization mapping. The normalized character (stored in a - // char16_t) can have 1 or 2 bytes. - if (i18n_utils::IsAscii(iterator->second)) { - // The normalized character has 1 byte. - normalized_text.push_back( - std::tolower(static_cast<char>(iterator->second))); - } else { - // The normalized character has 2 bytes. - i18n_utils::AppendUchar32ToUtf8(&normalized_text, iterator->second); - } + UChar32 normalized_char32 = NormalizeChar(uchar32); + if (i18n_utils::IsAscii(normalized_char32)) { + normalized_text.push_back(normalized_char32); } else { - // Normalization mapping not found, append the original character. - absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length)); + // The normalized character has 2 bytes. + i18n_utils::AppendUchar32ToUtf8(&normalized_text, normalized_char32); } + current_pos += i18n_utils::GetUtf8Length(uchar32); } } @@ -82,5 +99,27 @@ std::string MapNormalizer::NormalizeTerm(std::string_view term) const { return normalized_text; } +CharacterIterator MapNormalizer::CalculateNormalizedMatchLength( + std::string_view term, std::string_view normalized_term) const { + CharacterIterator char_itr(term); + CharacterIterator normalized_char_itr(normalized_term); + while (char_itr.utf8_index() < term.length() && + normalized_char_itr.utf8_index() < normalized_term.length()) { + UChar32 c = char_itr.GetCurrentChar(); + if (i18n_utils::IsAscii(c)) { + c = std::tolower(c); + } else { + c = NormalizeChar(c); + } + UChar32 normalized_c = normalized_char_itr.GetCurrentChar(); + if (c != normalized_c) { + return char_itr; + } + char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1); + normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1); + } + return char_itr; +} + } // namespace lib } // namespace 
icing diff --git a/icing/transform/map/map-normalizer.h b/icing/transform/map/map-normalizer.h index f9c0e42..8fbe83b 100644 --- a/icing/transform/map/map-normalizer.h +++ b/icing/transform/map/map-normalizer.h @@ -19,12 +19,15 @@ #include <string_view> #include "icing/transform/normalizer.h" +#include "icing/util/character-iterator.h" namespace icing { namespace lib { class MapNormalizer : public Normalizer { public: + static constexpr std::string_view kName = "MapNormalizer"; + explicit MapNormalizer(int max_term_byte_size) : max_term_byte_size_(max_term_byte_size){}; @@ -39,6 +42,17 @@ class MapNormalizer : public Normalizer { // Read more mapping details in normalization-map.cc std::string NormalizeTerm(std::string_view term) const override; + // Returns a CharacterIterator pointing to one past the end of the segment of + // term that (once normalized) matches with normalized_term. + // + // Ex. CalculateNormalizedMatchLength("YELLOW", "yell") will return + // CharacterIterator(u8:4, u16:4, u32:4). + // + // Ex. CalculateNormalizedMatchLength("YELLOW", "red") will return + // CharacterIterator(u8:0, u16:0, u32:0). + CharacterIterator CalculateNormalizedMatchLength( + std::string_view term, std::string_view normalized_term) const override; + private: // The maximum term length allowed after normalization. 
int max_term_byte_size_; diff --git a/icing/transform/map/map-normalizer_test.cc b/icing/transform/map/map-normalizer_test.cc index b62ae0e..26fdd4a 100644 --- a/icing/transform/map/map-normalizer_test.cc +++ b/icing/transform/map/map-normalizer_test.cc @@ -23,6 +23,7 @@ #include "icing/testing/icu-i18n-test-utils.h" #include "icing/transform/normalizer-factory.h" #include "icing/transform/normalizer.h" +#include "icing/util/character-iterator.h" namespace icing { namespace lib { @@ -199,6 +200,52 @@ TEST(MapNormalizerTest, Truncate) { } } +TEST(MapNormalizerTest, PrefixMatchLength) { + ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( + /*max_term_byte_size=*/1000)); + + // Upper to lower + std::string term = "MDI"; + CharacterIterator match_end = + normalizer->CalculateNormalizedMatchLength(term, "md"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD")); + + term = "Icing"; + match_end = normalizer->CalculateNormalizedMatchLength(term, "icin"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin")); + + // Full-width + term = "525600"; + match_end = normalizer->CalculateNormalizedMatchLength(term, "525"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525")); + + term = "FULLWIDTH"; + match_end = normalizer->CalculateNormalizedMatchLength(term, "full"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL")); + + // Hiragana to Katakana + term = "あいうえお"; + match_end = normalizer->CalculateNormalizedMatchLength(term, "アイ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい")); + + term = "かきくけこ"; + match_end = normalizer->CalculateNormalizedMatchLength(term, "カ"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か")); + + // Latin accents + term = "Zürich"; + match_end = normalizer->CalculateNormalizedMatchLength(term, "zur"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür")); + + term = "après-midi"; + match_end = normalizer->CalculateNormalizedMatchLength(term, 
"apre"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè")); + + term = "Buenos días"; + match_end = normalizer->CalculateNormalizedMatchLength(term, "buenos di"); + EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí")); +} + } // namespace } // namespace lib diff --git a/icing/transform/normalizer-factory.h b/icing/transform/normalizer-factory.h index f1f3f62..1db9915 100644 --- a/icing/transform/normalizer-factory.h +++ b/icing/transform/normalizer-factory.h @@ -36,6 +36,9 @@ namespace normalizer_factory { libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( int max_term_byte_size); +// Returns the name of the normalizer being used. +std::string_view GetNormalizerName(); + } // namespace normalizer_factory } // namespace lib diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h index 4cbfa63..7305c46 100644 --- a/icing/transform/normalizer.h +++ b/icing/transform/normalizer.h @@ -20,6 +20,7 @@ #include <string_view> #include "icing/text_classifier/lib3/utils/base/statusor.h" +#include "icing/util/character-iterator.h" namespace icing { namespace lib { @@ -39,6 +40,21 @@ class Normalizer { // Normalizes the input term based on rules. See implementation classes for // specific transformation rules. virtual std::string NormalizeTerm(std::string_view term) const = 0; + + // Returns a CharacterIterator pointing to one past the end of the segment of + // term that (once normalized) matches with normalized_term. + // + // Ex. CalculateNormalizedMatchLength("YELLOW", "yell") will return + // CharacterIterator(u8:4, u16:4, u32:4). + // + // Ex. CalculateNormalizedMatchLength("YELLOW", "red") will return + // CharacterIterator(u8:0, u16:0, u32:0). + virtual CharacterIterator CalculateNormalizedMatchLength( + std::string_view term, std::string_view normalized_term) const { + // TODO(b/195720764) Remove this default impl and implement in all + // subclasses. 
+ return CharacterIterator(term, 0, 0, 0); + } }; } // namespace lib diff --git a/icing/transform/simple/none-normalizer-factory.cc b/icing/transform/simple/none-normalizer-factory.cc deleted file mode 100644 index 6b35270..0000000 --- a/icing/transform/simple/none-normalizer-factory.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ -#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ - -#include <memory> -#include <string_view> - -#include "icing/text_classifier/lib3/utils/base/statusor.h" -#include "icing/absl_ports/canonical_errors.h" -#include "icing/transform/normalizer.h" -#include "icing/transform/simple/none-normalizer.h" - -namespace icing { -namespace lib { - -namespace normalizer_factory { - -// Creates a dummy normalizer. The term is not normalized, but -// the text will be truncated to max_term_byte_size if it exceeds the max size. 
-// -// Returns: -// A normalizer on success -// INVALID_ARGUMENT if max_term_byte_size <= 0 -// INTERNAL_ERROR on errors -libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create( - int max_term_byte_size) { - if (max_term_byte_size <= 0) { - return absl_ports::InvalidArgumentError( - "max_term_byte_size must be greater than zero."); - } - - return std::make_unique<NoneNormalizer>(max_term_byte_size); -} - -} // namespace normalizer_factory - -} // namespace lib -} // namespace icing - -#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_ diff --git a/icing/transform/simple/none-normalizer.h b/icing/transform/simple/none-normalizer.h deleted file mode 100644 index 47085e1..0000000 --- a/icing/transform/simple/none-normalizer.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ -#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ - -#include <string> -#include <string_view> - -#include "icing/transform/normalizer.h" - -namespace icing { -namespace lib { - -// This normalizer is not meant for production use. Currently only used to get -// the Icing library to compile in Jetpack. -// -// No normalization is done, but the term is truncated if it exceeds -// max_term_byte_size. 
-class NoneNormalizer : public Normalizer { - public: - explicit NoneNormalizer(int max_term_byte_size) - : max_term_byte_size_(max_term_byte_size){}; - - std::string NormalizeTerm(std::string_view term) const override { - if (term.length() > max_term_byte_size_) { - return std::string(term.substr(0, max_term_byte_size_)); - } - return std::string(term); - } - - private: - // The maximum term length allowed after normalization. - int max_term_byte_size_; -}; - -} // namespace lib -} // namespace icing - -#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_ diff --git a/icing/transform/simple/none-normalizer_test.cc b/icing/transform/simple/none-normalizer_test.cc deleted file mode 100644 index e074828..0000000 --- a/icing/transform/simple/none-normalizer_test.cc +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (C) 2019 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include <memory> - -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include "icing/testing/common-matchers.h" -#include "icing/transform/normalizer-factory.h" -#include "icing/transform/normalizer.h" - -namespace icing { -namespace lib { -namespace { - -using ::testing::Eq; - -TEST(NoneNormalizerTest, Creation) { - EXPECT_THAT(normalizer_factory::Create( - /*max_term_byte_size=*/5), - IsOk()); - EXPECT_THAT(normalizer_factory::Create( - /*max_term_byte_size=*/0), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - EXPECT_THAT(normalizer_factory::Create( - /*max_term_byte_size=*/-1), - StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); -} - -TEST(IcuNormalizerTest, NoNormalizationDone) { - ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/1000)); - EXPECT_THAT(normalizer->NormalizeTerm(""), Eq("")); - EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world")); - - // Capitalization - EXPECT_THAT(normalizer->NormalizeTerm("MDI"), Eq("MDI")); - - // Accents - EXPECT_THAT(normalizer->NormalizeTerm("Zürich"), Eq("Zürich")); - - // Full-width punctuation to ASCII punctuation - EXPECT_THAT(normalizer->NormalizeTerm("。,!?:”"), Eq("。,!?:”")); - - // Half-width katakana - EXPECT_THAT(normalizer->NormalizeTerm("カ"), Eq("カ")); -} - -TEST(NoneNormalizerTest, Truncate) { - ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create( - /*max_term_byte_size=*/5)); - - // Won't be truncated - EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi")); - EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello")); - - // Truncated to length 5. - EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello")); -} - -} // namespace -} // namespace lib -} // namespace icing diff --git a/icing/util/bit-util.h b/icing/util/bit-util.h index e2bb817..7ca20b4 100644 --- a/icing/util/bit-util.h +++ b/icing/util/bit-util.h @@ -24,19 +24,18 @@ namespace bit_util { // Manipulating bit fields. 
// -// x value containing the bit field(s) -// offset offset of bit field in x -// len len of bit field in x +// value value containing the bit field(s) +// lsb_offset offset of bit field in value, starting from the least significant +// bit. for example, the '1' in '0100' has a lsb_offset of 2 +// len len of bit field in value // // REQUIREMENTS // -// - x an unsigned integer <= 64 bits -// - offset + len <= sizeof(x) * 8 +// - value is an unsigned integer <= 64 bits +// - lsb_offset + len <= sizeof(value) * 8 // // There is no error checking so you will get garbage if you don't // ensure the above. -// -// To set a value, use BITFIELD_CLEAR then BITFIELD_OR. // Shifting by more than the word length is undefined (on ARM it has the // intended effect, but on Intel it shifts by % word length), so check the @@ -44,20 +43,65 @@ namespace bit_util { inline uint64_t BitfieldMask(uint32_t len) { return ((len == 0) ? 0U : ((~uint64_t{0}) >> (64 - (len)))); } -inline uint64_t BitfieldGet(uint64_t mask, uint32_t lsb_offset, uint32_t len) { - return ((mask) >> (lsb_offset)) & BitfieldMask(len); + +inline void BitfieldClear(uint32_t lsb_offset, uint32_t len, + uint8_t* value_out) { + *value_out &= ~(BitfieldMask(len) << lsb_offset); +} + +inline void BitfieldClear(uint32_t lsb_offset, uint32_t len, + uint16_t* value_out) { + *value_out &= ~(BitfieldMask(len) << lsb_offset); +} + +inline void BitfieldClear(uint32_t lsb_offset, uint32_t len, + uint32_t* value_out) { + *value_out &= ~(BitfieldMask(len) << lsb_offset); +} + +inline void BitfieldClear(uint32_t lsb_offset, uint32_t len, + uint64_t* value_out) { + *value_out &= ~(BitfieldMask(len) << lsb_offset); +} + +inline uint64_t BitfieldGet(uint64_t value, uint32_t lsb_offset, uint32_t len) { + return ((value) >> (lsb_offset)) & BitfieldMask(len); +} + +inline void BitfieldSet(uint8_t new_value, uint32_t lsb_offset, uint32_t len, + uint8_t* value_out) { + BitfieldClear(lsb_offset, len, value_out); + + // We conservatively mask 
new_value at len so value won't be corrupted if + // new_value >= (1 << len). + *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset); +} + +inline void BitfieldSet(uint16_t new_value, uint32_t lsb_offset, uint32_t len, + uint16_t* value_out) { + BitfieldClear(lsb_offset, len, value_out); + + // We conservatively mask new_value at len so value won't be corrupted if + // new_value >= (1 << len). + *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset); } -inline void BitfieldSet(uint32_t value, uint32_t lsb_offset, uint32_t len, - uint32_t* mask) { - // We conservatively mask val at len so x won't be corrupted if val >= - // 1 << len. - *mask |= (uint64_t{value} & BitfieldMask(len)) << (lsb_offset); + +inline void BitfieldSet(uint32_t new_value, uint32_t lsb_offset, uint32_t len, + uint32_t* value_out) { + BitfieldClear(lsb_offset, len, value_out); + + // We conservatively mask new_value at len so value won't be corrupted if + // new_value >= (1 << len). + *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset); } -inline void BitfieldSet(uint64_t value, uint32_t lsb_offset, uint32_t len, - uint64_t* mask) { - // We conservatively mask val at len so x won't be corrupted if val >= - // 1 << len. - *mask |= (value & BitfieldMask(len)) << (lsb_offset); + +inline void BitfieldSet(uint64_t new_value, uint32_t lsb_offset, uint32_t len, + uint64_t* value_out) { + BitfieldClear(lsb_offset, len, value_out); + + // We conservatively mask new_value at len so value won't be corrupted if + // new_value >= (1 << len). + *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset); } } // namespace bit_util diff --git a/icing/util/bit-util_test.cc b/icing/util/bit-util_test.cc new file mode 100644 index 0000000..3b86a21 --- /dev/null +++ b/icing/util/bit-util_test.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/util/bit-util.h" + +#include <memory> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace icing { +namespace lib { +namespace { + +using ::testing::Eq; + +TEST(BitUtilTest, BitfieldMask) { + // Check that we can handle up to uint8_t's + EXPECT_THAT(bit_util::BitfieldMask(/*len=*/0), Eq(0b0)); + EXPECT_THAT(bit_util::BitfieldMask(/*len=*/1), Eq(0b01)); + + // Check that we can handle up to uint32_t's + EXPECT_THAT(bit_util::BitfieldMask(/*len=*/16), Eq(0b01111111111111111)); + + // Check that we can handle up to uint64_t's + EXPECT_THAT( + bit_util::BitfieldMask(/*len=*/63), + Eq(0b0111111111111111111111111111111111111111111111111111111111111111)); +} + +TEST(BitUtilTest, BitfieldClear) { + // Check that we can handle up to uint8_t's + uint8_t value_8 = 0b0; + bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b0)); + + value_8 = 0b01; + bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b00)); + + value_8 = 0b011; + bit_util::BitfieldClear(/*lsb_offset=*/1, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b001)); + + value_8 = 0b011; + bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/2, &value_8); + EXPECT_THAT(value_8, Eq(0b000)); + + value_8 = 0b0110; + bit_util::BitfieldClear(/*lsb_offset=*/1, /*len=*/2, &value_8); + EXPECT_THAT(value_8, Eq(0b0000)); + + // Check that we can handle up to uint32_t's + uint32_t value_32 = 0b010000000000000000000000; + bit_util::BitfieldClear(/*lsb_offset=*/22, /*len=*/1, &value_32); + 
EXPECT_THAT(value_32, Eq(0b0)); + + // Check that we can handle up to uint64_t's + uint64_t value_64 = 0b0100000000000000000000000000000000000; + bit_util::BitfieldClear(/*lsb_offset=*/35, /*len=*/1, &value_64); + EXPECT_THAT(value_64, Eq(0b0)); +} + +TEST(BitUtilTest, BitfieldGet) { + // Get something in the uint8_t range + EXPECT_THAT(bit_util::BitfieldGet(0b0, /*lsb_offset=*/0, /*len=*/1), Eq(0b0)); + EXPECT_THAT(bit_util::BitfieldGet(0b01, /*lsb_offset=*/0, /*len=*/1), + Eq(0b01)); + EXPECT_THAT(bit_util::BitfieldGet(0b010, /*lsb_offset=*/1, /*len=*/1), + Eq(0b01)); + EXPECT_THAT(bit_util::BitfieldGet(0b001, /*lsb_offset=*/1, /*len=*/1), + Eq(0b0)); + EXPECT_THAT(bit_util::BitfieldGet(0b011, /*lsb_offset=*/0, /*len=*/2), + Eq(0b011)); + EXPECT_THAT(bit_util::BitfieldGet(0b0110, /*lsb_offset=*/1, /*len=*/2), + Eq(0b011)); + EXPECT_THAT(bit_util::BitfieldGet(0b0101, /*lsb_offset=*/0, /*len=*/3), + Eq(0b0101)); + + // Get something in the uint32_t range + EXPECT_THAT( + bit_util::BitfieldGet(0b01000000000000, /*lsb_offset=*/12, /*len=*/1), + Eq(0b01)); + + // Get something in the uint64_t range + EXPECT_THAT(bit_util::BitfieldGet(0b010000000000000000000000000000000000, + /*lsb_offset=*/34, /*len=*/1), + Eq(0b01)); +} + +TEST(BitUtilTest, BitfieldSet) { + // Set something in the uint8_t range + uint8_t value_8 = 0b0; + bit_util::BitfieldSet(0b0, /*lsb_offset=*/0, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b0)); + + value_8 = 0b01; + bit_util::BitfieldSet(0b01, /*lsb_offset=*/0, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b01)); + + value_8 = 0b00; + bit_util::BitfieldSet(0b01, /*lsb_offset=*/0, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b01)); + + value_8 = 0b00; + bit_util::BitfieldSet(0b011, /*lsb_offset=*/0, /*len=*/2, &value_8); + EXPECT_THAT(value_8, Eq(0b011)); + + value_8 = 0b01; + bit_util::BitfieldSet(0b011, /*lsb_offset=*/0, /*len=*/2, &value_8); + EXPECT_THAT(value_8, Eq(0b011)); + + value_8 = 0b01; + bit_util::BitfieldSet(0b01, 
/*lsb_offset=*/1, /*len=*/1, &value_8); + EXPECT_THAT(value_8, Eq(0b011)); + + value_8 = 0b0001; + bit_util::BitfieldSet(0b011, /*lsb_offset=*/1, /*len=*/2, &value_8); + EXPECT_THAT(value_8, Eq(0b0111)); + + // Set something in the uint32_t range + uint32_t value_32 = 0b0; + bit_util::BitfieldSet(0b01, /*lsb_offset=*/16, /*len=*/1, &value_32); + EXPECT_THAT(value_32, Eq(0b010000000000000000)); + + // Set something in the uint64_t range + uint64_t value_64 = 0b0; + bit_util::BitfieldSet(0b01, /*lsb_offset=*/34, /*len=*/1, &value_64); + EXPECT_THAT(value_64, Eq(0b010000000000000000000000000000000000)); +} + +} // namespace +} // namespace lib +} // namespace icing diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc index 3707f95..d483031 100644 --- a/icing/util/character-iterator.cc +++ b/icing/util/character-iterator.cc @@ -14,6 +14,8 @@ #include "icing/util/character-iterator.h" +#include "icing/util/i18n-utils.h" + namespace icing { namespace lib { @@ -30,17 +32,35 @@ int GetUTF8StartPosition(std::string_view text, int current_byte_index) { } // namespace +UChar32 CharacterIterator::GetCurrentChar() { + if (cached_current_char_ == i18n_utils::kInvalidUChar32) { + // Our indices point to the right character, we just need to read that + // character. No need to worry about an error. If GetUChar32At fails, then + // current_char will be i18n_utils::kInvalidUChar32. + cached_current_char_ = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + } + return cached_current_char_; +} + +bool CharacterIterator::MoveToUtf8(int desired_utf8_index) { + return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index) + : RewindToUtf8(desired_utf8_index); +} + bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) { if (desired_utf8_index > text_.length()) { // Enforce the requirement. return false; } // Need to work forwards. 
+ UChar32 uchar32 = cached_current_char_; while (utf8_index_ < desired_utf8_index) { - UChar32 uchar32 = + uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); if (uchar32 == i18n_utils::kInvalidUChar32) { // Unable to retrieve a valid UTF-32 character at the previous position. + cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } int utf8_length = i18n_utils::GetUtf8Length(uchar32); @@ -50,7 +70,10 @@ bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) { } utf8_index_ += utf8_length; utf16_index_ += i18n_utils::GetUtf16Length(uchar32); + ++utf32_index_; } + cached_current_char_ = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); return true; } @@ -60,32 +83,50 @@ bool CharacterIterator::RewindToUtf8(int desired_utf8_index) { return false; } // Need to work backwards. + UChar32 uchar32 = cached_current_char_; while (utf8_index_ > desired_utf8_index) { - --utf8_index_; - utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); - if (utf8_index_ < 0) { + int utf8_index = utf8_index_ - 1; + utf8_index = GetUTF8StartPosition(text_, utf8_index); + if (utf8_index < 0) { // Somehow, there wasn't a single UTF-8 lead byte at // requested_byte_index or an earlier byte. + cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } // We've found the start of a unicode char! - UChar32 uchar32 = - i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); - if (uchar32 == i18n_utils::kInvalidUChar32) { - // Unable to retrieve a valid UTF-32 character at the previous position. + uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index); + int expected_length = utf8_index_ - utf8_index; + if (uchar32 == i18n_utils::kInvalidUChar32 || + expected_length != i18n_utils::GetUtf8Length(uchar32)) { + // Either unable to retrieve a valid UTF-32 character at the previous + // position or we skipped past an invalid sequence while seeking the + // previous start position. 
+ cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } + cached_current_char_ = uchar32; + utf8_index_ = utf8_index; utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); + --utf32_index_; } return true; } +bool CharacterIterator::MoveToUtf16(int desired_utf16_index) { + return (desired_utf16_index > utf16_index_) + ? AdvanceToUtf16(desired_utf16_index) + : RewindToUtf16(desired_utf16_index); +} + bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) { + UChar32 uchar32 = cached_current_char_; while (utf16_index_ < desired_utf16_index) { - UChar32 uchar32 = + uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); if (uchar32 == i18n_utils::kInvalidUChar32) { // Unable to retrieve a valid UTF-32 character at the previous position. + cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } int utf16_length = i18n_utils::GetUtf16Length(uchar32); @@ -96,11 +137,15 @@ bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) { int utf8_length = i18n_utils::GetUtf8Length(uchar32); if (utf8_index_ + utf8_length > text_.length()) { // Enforce the requirement. + cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } utf8_index_ += utf8_length; utf16_index_ += utf16_length; + ++utf32_index_; } + cached_current_char_ = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); return true; } @@ -108,17 +153,98 @@ bool CharacterIterator::RewindToUtf16(int desired_utf16_index) { if (desired_utf16_index < 0) { return false; } + UChar32 uchar32 = cached_current_char_; while (utf16_index_ > desired_utf16_index) { - --utf8_index_; - utf8_index_ = GetUTF8StartPosition(text_, utf8_index_); + int utf8_index = utf8_index_ - 1; + utf8_index = GetUTF8StartPosition(text_, utf8_index); + if (utf8_index < 0) { + // Somehow, there wasn't a single UTF-8 lead byte at + // requested_byte_index or an earlier byte. 
+ cached_current_char_ = i18n_utils::kInvalidUChar32; + return false; + } // We've found the start of a unicode char! - UChar32 uchar32 = + uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index); + int expected_length = utf8_index_ - utf8_index; + if (uchar32 == i18n_utils::kInvalidUChar32 || + expected_length != i18n_utils::GetUtf8Length(uchar32)) { + // Either unable to retrieve a valid UTF-32 character at the previous + // position or we skipped past an invalid sequence while seeking the + // previous start position. + cached_current_char_ = i18n_utils::kInvalidUChar32; + return false; + } + cached_current_char_ = uchar32; + utf8_index_ = utf8_index; + utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); + --utf32_index_; + } + return true; +} + +bool CharacterIterator::MoveToUtf32(int desired_utf32_index) { + return (desired_utf32_index > utf32_index_) + ? AdvanceToUtf32(desired_utf32_index) + : RewindToUtf32(desired_utf32_index); +} + +bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) { + UChar32 uchar32 = cached_current_char_; + while (utf32_index_ < desired_utf32_index) { + uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); if (uchar32 == i18n_utils::kInvalidUChar32) { // Unable to retrieve a valid UTF-32 character at the previous position. + cached_current_char_ = i18n_utils::kInvalidUChar32; + return false; + } + int utf16_length = i18n_utils::GetUtf16Length(uchar32); + int utf8_length = i18n_utils::GetUtf8Length(uchar32); + if (utf8_index_ + utf8_length > text_.length()) { + // Enforce the requirement. 
+ cached_current_char_ = i18n_utils::kInvalidUChar32; + return false; + } + utf8_index_ += utf8_length; + utf16_index_ += utf16_length; + ++utf32_index_; + } + cached_current_char_ = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_); + return true; +} + +bool CharacterIterator::RewindToUtf32(int desired_utf32_index) { + if (desired_utf32_index < 0) { + return false; + } + UChar32 uchar32 = cached_current_char_; + while (utf32_index_ > desired_utf32_index) { + int utf8_index = utf8_index_ - 1; + utf8_index = GetUTF8StartPosition(text_, utf8_index); + if (utf8_index < 0) { + // Somehow, there wasn't a single UTF-8 lead byte at + // requested_byte_index or an earlier byte. + cached_current_char_ = i18n_utils::kInvalidUChar32; + return false; + } + // We've found the start of a unicode char! + uchar32 = + i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index); + int expected_length = utf8_index_ - utf8_index; + if (uchar32 == i18n_utils::kInvalidUChar32 || + expected_length != i18n_utils::GetUtf8Length(uchar32)) { + // Either unable to retrieve a valid UTF-32 character at the previous + // position or we skipped past an invalid sequence while seeking the + // previous start position. 
+ cached_current_char_ = i18n_utils::kInvalidUChar32; return false; } + cached_current_char_ = uchar32; + utf8_index_ = utf8_index; utf16_index_ -= i18n_utils::GetUtf16Length(uchar32); + --utf32_index_; } return true; } diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h index 22de6c5..c7569a7 100644 --- a/icing/util/character-iterator.h +++ b/icing/util/character-iterator.h @@ -15,6 +15,7 @@ #ifndef ICING_UTIL_CHARACTER_ITERATOR_H_ #define ICING_UTIL_CHARACTER_ITERATOR_H_ +#include "icing/legacy/core/icing-string-util.h" #include "icing/util/i18n-utils.h" namespace icing { @@ -23,23 +24,40 @@ namespace lib { class CharacterIterator { public: explicit CharacterIterator(std::string_view text) - : CharacterIterator(text, 0, 0) {} + : CharacterIterator(text, 0, 0, 0) {} - CharacterIterator(std::string_view text, int utf8_index, int utf16_index) - : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {} + CharacterIterator(std::string_view text, int utf8_index, int utf16_index, + int utf32_index) + : text_(text), + cached_current_char_(i18n_utils::kInvalidUChar32), + utf8_index_(utf8_index), + utf16_index_(utf16_index), + utf32_index_(utf32_index) {} - // Moves from current position to the character that includes the specified + // Returns the character that the iterator currently points to. + // i18n_utils::kInvalidUChar32 if unable to read that character. + UChar32 GetCurrentChar(); + + // Moves current position to desired_utf8_index. + // REQUIRES: 0 <= desired_utf8_index <= text_.length() + bool MoveToUtf8(int desired_utf8_index); + + // Advances from current position to the character that includes the specified // UTF-8 index. // REQUIRES: desired_utf8_index <= text_.length() // desired_utf8_index is allowed to point one index past the end, but no // further. 
bool AdvanceToUtf8(int desired_utf8_index); - // Moves from current position to the character that includes the specified + // Rewinds from current position to the character that includes the specified // UTF-8 index. // REQUIRES: 0 <= desired_utf8_index bool RewindToUtf8(int desired_utf8_index); + // Moves current position to desired_utf16_index. + // REQUIRES: 0 <= desired_utf16_index <= text_.utf16_length() + bool MoveToUtf16(int desired_utf16_index); + // Advances current position to desired_utf16_index. // REQUIRES: desired_utf16_index <= text_.utf16_length() // desired_utf16_index is allowed to point one index past the end, but no @@ -50,18 +68,42 @@ class CharacterIterator { // REQUIRES: 0 <= desired_utf16_index bool RewindToUtf16(int desired_utf16_index); + // Moves current position to desired_utf32_index. + // REQUIRES: 0 <= desired_utf32_index <= text_.utf32_length() + bool MoveToUtf32(int desired_utf32_index); + + // Advances current position to desired_utf32_index. + // REQUIRES: desired_utf32_index <= text_.utf32_length() + // desired_utf32_index is allowed to point one index past the end, but no + // further. + bool AdvanceToUtf32(int desired_utf32_index); + + // Rewinds current position to desired_utf32_index. + // REQUIRES: 0 <= desired_utf32_index + bool RewindToUtf32(int desired_utf32_index); + int utf8_index() const { return utf8_index_; } int utf16_index() const { return utf16_index_; } + int utf32_index() const { return utf32_index_; } bool operator==(const CharacterIterator& rhs) const { + // cached_current_char_ is just that: a cached value. As such, it's not + // considered for equality. 
return text_ == rhs.text_ && utf8_index_ == rhs.utf8_index_ && - utf16_index_ == rhs.utf16_index_; + utf16_index_ == rhs.utf16_index_ && utf32_index_ == rhs.utf32_index_; + } + + std::string DebugString() const { + return IcingStringUtil::StringPrintf("(u8:%d,u16:%d,u32:%d)", utf8_index_, + utf16_index_, utf32_index_); } private: std::string_view text_; + UChar32 cached_current_char_; int utf8_index_; int utf16_index_; + int utf32_index_; }; } // namespace lib diff --git a/icing/util/character-iterator_test.cc b/icing/util/character-iterator_test.cc new file mode 100644 index 0000000..445f837 --- /dev/null +++ b/icing/util/character-iterator_test.cc @@ -0,0 +1,235 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/util/character-iterator.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/testing/icu-i18n-test-utils.h" + +namespace icing { +namespace lib { + +using ::testing::Eq; +using ::testing::IsFalse; +using ::testing::IsTrue; + +TEST(CharacterIteratorTest, BasicUtf8) { + constexpr std::string_view kText = "¿Dónde está la biblioteca?"; + CharacterIterator iterator(kText); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿")); + + EXPECT_THAT(iterator.AdvanceToUtf8(4), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2, + /*utf32_index=*/2))); + + EXPECT_THAT(iterator.AdvanceToUtf8(18), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15, + /*utf32_index=*/15))); + + EXPECT_THAT(iterator.AdvanceToUtf8(28), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25, + /*utf32_index=*/25))); + + EXPECT_THAT(iterator.AdvanceToUtf8(29), IsTrue()); + EXPECT_THAT(iterator.GetCurrentChar(), Eq(0)); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26, + /*utf32_index=*/26))); + + EXPECT_THAT(iterator.RewindToUtf8(28), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25, + /*utf32_index=*/25))); + + EXPECT_THAT(iterator.RewindToUtf8(18), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15, + /*utf32_index=*/15))); + + EXPECT_THAT(iterator.RewindToUtf8(4), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), 
Eq("ó")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2, + /*utf32_index=*/2))); + + EXPECT_THAT(iterator.RewindToUtf8(0), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0, + /*utf32_index=*/0))); +} + +TEST(CharacterIteratorTest, BasicUtf16) { + constexpr std::string_view kText = "¿Dónde está la biblioteca?"; + CharacterIterator iterator(kText); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿")); + + EXPECT_THAT(iterator.AdvanceToUtf16(2), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2, + /*utf32_index=*/2))); + + EXPECT_THAT(iterator.AdvanceToUtf16(15), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15, + /*utf32_index=*/15))); + + EXPECT_THAT(iterator.AdvanceToUtf16(25), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25, + /*utf32_index=*/25))); + + EXPECT_THAT(iterator.AdvanceToUtf16(26), IsTrue()); + EXPECT_THAT(iterator.GetCurrentChar(), Eq(0)); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26, + /*utf32_index=*/26))); + + EXPECT_THAT(iterator.RewindToUtf16(25), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25, + /*utf32_index=*/25))); + + EXPECT_THAT(iterator.RewindToUtf16(15), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15, + /*utf32_index=*/15))); + + 
EXPECT_THAT(iterator.RewindToUtf16(2), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2, + /*utf32_index=*/2))); + + EXPECT_THAT(iterator.RewindToUtf8(0), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0, + /*utf32_index=*/0))); +} + +TEST(CharacterIteratorTest, BasicUtf32) { + constexpr std::string_view kText = "¿Dónde está la biblioteca?"; + CharacterIterator iterator(kText); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿")); + + EXPECT_THAT(iterator.AdvanceToUtf32(2), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2, + /*utf32_index=*/2))); + + EXPECT_THAT(iterator.AdvanceToUtf32(15), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15, + /*utf32_index=*/15))); + + EXPECT_THAT(iterator.AdvanceToUtf32(25), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25, + /*utf32_index=*/25))); + + EXPECT_THAT(iterator.AdvanceToUtf32(26), IsTrue()); + EXPECT_THAT(iterator.GetCurrentChar(), Eq(0)); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26, + /*utf32_index=*/26))); + + EXPECT_THAT(iterator.RewindToUtf32(25), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25, + /*utf32_index=*/25))); + + EXPECT_THAT(iterator.RewindToUtf32(15), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b")); + 
EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15, + /*utf32_index=*/15))); + + EXPECT_THAT(iterator.RewindToUtf32(2), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2, + /*utf32_index=*/2))); + + EXPECT_THAT(iterator.RewindToUtf32(0), IsTrue()); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿")); + EXPECT_THAT(iterator, + Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0, + /*utf32_index=*/0))); +} + +TEST(CharacterIteratorTest, InvalidUtf) { + // "\255" is an invalid sequence. + constexpr std::string_view kText = "foo \255 bar"; + CharacterIterator iterator(kText); + + // Try to advance to the 'b' in 'bar'. This will fail and leave us pointed at + // the invalid sequence '\255'. Get CurrentChar() should return an invalid + // character. + EXPECT_THAT(iterator.AdvanceToUtf8(6), IsFalse()); + EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32)); + CharacterIterator exp_iterator(kText, /*utf8_index=*/4, /*utf16_index=*/4, + /*utf32_index=*/4); + EXPECT_THAT(iterator, Eq(exp_iterator)); + + EXPECT_THAT(iterator.AdvanceToUtf16(6), IsFalse()); + EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32)); + EXPECT_THAT(iterator, Eq(exp_iterator)); + + EXPECT_THAT(iterator.AdvanceToUtf32(6), IsFalse()); + EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32)); + EXPECT_THAT(iterator, Eq(exp_iterator)); + + // Create the iterator with it pointing at the 'b' in 'bar'. + iterator = CharacterIterator(kText, /*utf8_index=*/6, /*utf16_index=*/6, + /*utf32_index=*/6); + EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b")); + + // Try to advance to the last 'o' in 'foo'. This will fail and leave us + // pointed at the ' ' before the invalid sequence '\255'. 
+ exp_iterator = CharacterIterator(kText, /*utf8_index=*/5, /*utf16_index=*/5, + /*utf32_index=*/5); + EXPECT_THAT(iterator.RewindToUtf8(2), IsFalse()); + EXPECT_THAT(iterator.GetCurrentChar(), Eq(' ')); + EXPECT_THAT(iterator, Eq(exp_iterator)); + + EXPECT_THAT(iterator.RewindToUtf16(2), IsFalse()); + EXPECT_THAT(iterator.GetCurrentChar(), Eq(' ')); + EXPECT_THAT(iterator, Eq(exp_iterator)); + + EXPECT_THAT(iterator.RewindToUtf32(2), IsFalse()); + EXPECT_THAT(iterator.GetCurrentChar(), Eq(' ')); + EXPECT_THAT(iterator, Eq(exp_iterator)); +} + +} // namespace lib +} // namespace icing diff --git a/icing/util/document-validator_test.cc b/icing/util/document-validator_test.cc index f05e8a6..cb013d7 100644 --- a/icing/util/document-validator_test.cc +++ b/icing/util/document-validator_test.cc @@ -21,6 +21,7 @@ #include "icing/document-builder.h" #include "icing/file/filesystem.h" #include "icing/proto/schema.pb.h" +#include "icing/schema-builder.h" #include "icing/schema/schema-store.h" #include "icing/testing/common-matchers.h" #include "icing/testing/fake-clock.h" @@ -45,17 +46,52 @@ constexpr char kPropertyEmails[] = "emails"; constexpr char kDefaultNamespace[] = "icing"; constexpr char kDefaultString[] = "This is a string."; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_OPTIONAL = + PropertyConfigProto_Cardinality_Code_OPTIONAL; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REQUIRED = + PropertyConfigProto_Cardinality_Code_REQUIRED; +constexpr PropertyConfigProto_Cardinality_Code CARDINALITY_REPEATED = + PropertyConfigProto_Cardinality_Code_REPEATED; + +constexpr PropertyConfigProto_DataType_Code TYPE_STRING = + PropertyConfigProto_DataType_Code_STRING; + class DocumentValidatorTest : public ::testing::Test { protected: DocumentValidatorTest() {} void SetUp() override { - SchemaProto schema; - auto type_config = schema.add_types(); - CreateEmailTypeConfig(type_config); - - type_config = schema.add_types(); - 
CreateConversationTypeConfig(type_config); + SchemaProto schema = + SchemaBuilder() + .AddType( + SchemaTypeConfigBuilder() + .SetType(kTypeEmail) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertySubject) + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyText) + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyRecipients) + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REPEATED))) + .AddType( + SchemaTypeConfigBuilder() + .SetType(kTypeConversation) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyName) + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyEmails) + .SetDataTypeDocument( + kTypeEmail, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); ICING_ASSERT_OK_AND_ASSIGN( schema_store_, @@ -66,25 +102,6 @@ class DocumentValidatorTest : public ::testing::Test { std::make_unique<DocumentValidator>(schema_store_.get()); } - static void CreateEmailTypeConfig(SchemaTypeConfigProto* type_config) { - type_config->set_schema_type(kTypeEmail); - - auto subject = type_config->add_properties(); - subject->set_property_name(kPropertySubject); - subject->set_data_type(PropertyConfigProto::DataType::STRING); - subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - auto text = type_config->add_properties(); - text->set_property_name(kPropertyText); - text->set_data_type(PropertyConfigProto::DataType::STRING); - text->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL); - - auto recipients = type_config->add_properties(); - recipients->set_property_name(kPropertyRecipients); - recipients->set_data_type(PropertyConfigProto::DataType::STRING); - recipients->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - } - static DocumentBuilder 
SimpleEmailBuilder() { return DocumentBuilder() .SetKey(kDefaultNamespace, "email/1") @@ -95,21 +112,6 @@ class DocumentValidatorTest : public ::testing::Test { kDefaultString); } - static void CreateConversationTypeConfig(SchemaTypeConfigProto* type_config) { - type_config->set_schema_type(kTypeConversation); - - auto name = type_config->add_properties(); - name->set_property_name(kPropertyName); - name->set_data_type(PropertyConfigProto::DataType::STRING); - name->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED); - - auto emails = type_config->add_properties(); - emails->set_property_name(kPropertyEmails); - emails->set_data_type(PropertyConfigProto::DataType::DOCUMENT); - emails->set_cardinality(PropertyConfigProto::Cardinality::REPEATED); - emails->set_schema_type(kTypeEmail); - } - static DocumentBuilder SimpleConversationBuilder() { return DocumentBuilder() .SetKey(kDefaultNamespace, "conversation/1") @@ -326,12 +328,26 @@ TEST_F(DocumentValidatorTest, ValidateNestedPropertyInvalid) { } TEST_F(DocumentValidatorTest, HandleTypeConfigMapChangesOk) { - SchemaProto email_schema; - auto type_config = email_schema.add_types(); - CreateEmailTypeConfig(type_config); + SchemaProto email_schema = + SchemaBuilder() + .AddType(SchemaTypeConfigBuilder() + .SetType(kTypeEmail) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertySubject) + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyText) + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_OPTIONAL)) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyRecipients) + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); - // Create a custom directory so we don't collide with the test's preset schema - // in SetUp + // Create a custom directory so we don't collide + // with the test's preset schema in SetUp const std::string custom_schema_dir = GetTestTempDir() + "/custom_schema"; 
filesystem_.DeleteDirectoryRecursively(custom_schema_dir.c_str()); filesystem_.CreateDirectoryRecursively(custom_schema_dir.c_str()); @@ -352,9 +368,21 @@ TEST_F(DocumentValidatorTest, HandleTypeConfigMapChangesOk) { HasSubstr("'Conversation' not found"))); // Add the 'Conversation' type - SchemaProto email_and_conversation_schema = email_schema; - type_config = email_and_conversation_schema.add_types(); - CreateConversationTypeConfig(type_config); + SchemaProto email_and_conversation_schema = + SchemaBuilder(email_schema) + .AddType(SchemaTypeConfigBuilder() + .SetType(kTypeConversation) + .AddProperty(PropertyConfigBuilder() + .SetName(kPropertyName) + .SetDataType(TYPE_STRING) + .SetCardinality(CARDINALITY_REQUIRED)) + .AddProperty( + PropertyConfigBuilder() + .SetName(kPropertyEmails) + .SetDataTypeDocument( + kTypeEmail, /*index_nested_properties=*/true) + .SetCardinality(CARDINALITY_REPEATED))) + .Build(); // DocumentValidator should be able to handle the SchemaStore getting updated // separately diff --git a/icing/util/math-util.h b/icing/util/math-util.h index fc11a09..3f2a69d 100644 --- a/icing/util/math-util.h +++ b/icing/util/math-util.h @@ -37,7 +37,7 @@ inline double SafeDivide(double first, double second) { template <typename IntType> static IntType RoundDownTo(IntType input_value, IntType rounding_value) { static_assert(std::numeric_limits<IntType>::is_integer, - "RoundUpTo() operation type is not integer"); + "RoundDownTo() operation type is not integer"); if (input_value <= 0) { return 0; diff --git a/java/src/com/google/android/icing/BreakIteratorBatcher.java b/java/src/com/google/android/icing/BreakIteratorBatcher.java index 58efbfc..2b87327 100644 --- a/java/src/com/google/android/icing/BreakIteratorBatcher.java +++ b/java/src/com/google/android/icing/BreakIteratorBatcher.java @@ -14,9 +14,6 @@ package com.google.android.icing; -import androidx.annotation.NonNull; -import androidx.annotation.RestrictTo; - import java.text.BreakIterator; import 
java.util.ArrayList; import java.util.List; @@ -38,20 +35,17 @@ import java.util.Locale; * utf16Boundaries = brkItrBatcher.next(5); * assertThat(utf16Boundaries).asList().containsExactly(9); * }</pre> - * - * @hide */ -@RestrictTo(RestrictTo.Scope.LIBRARY_GROUP) public class BreakIteratorBatcher { private final BreakIterator iterator; - public BreakIteratorBatcher(@NonNull Locale locale) { + public BreakIteratorBatcher(Locale locale) { this.iterator = BreakIterator.getWordInstance(locale); } /* Direct calls to BreakIterator */ - public void setText(@NonNull String text) { + public void setText(String text) { iterator.setText(text); } @@ -73,7 +67,6 @@ public class BreakIteratorBatcher { * the end of the text (returns BreakIterator#DONE), then only the results of the previous calls * in that batch will be returned. */ - @NonNull public int[] next(int batchSize) { List<Integer> breakIndices = new ArrayList<>(batchSize); for (int i = 0; i < batchSize; ++i) { diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java index 88d0578..1f5fb51 100644 --- a/java/src/com/google/android/icing/IcingSearchEngine.java +++ b/java/src/com/google/android/icing/IcingSearchEngine.java @@ -31,6 +31,7 @@ import com.google.android.icing.proto.IcingSearchEngineOptions; import com.google.android.icing.proto.InitializeResultProto; import com.google.android.icing.proto.OptimizeResultProto; import com.google.android.icing.proto.PersistToDiskResultProto; +import com.google.android.icing.proto.PersistType; import com.google.android.icing.proto.PutResultProto; import com.google.android.icing.proto.ReportUsageResultProto; import com.google.android.icing.proto.ResetResultProto; @@ -41,6 +42,7 @@ import com.google.android.icing.proto.SearchResultProto; import com.google.android.icing.proto.SearchSpecProto; import com.google.android.icing.proto.SetSchemaResultProto; import com.google.android.icing.proto.StatusProto; +import 
com.google.android.icing.proto.StorageInfoResultProto; import com.google.android.icing.proto.UsageReport; import com.google.protobuf.ExtensionRegistryLite; import com.google.protobuf.InvalidProtocolBufferException; @@ -51,9 +53,11 @@ import java.io.Closeable; * * <p>If this instance has been closed, the instance is no longer usable. * + * <p>Keep this class to be non-Final so that it can be mocked in AppSearch. + * * <p>NOTE: This class is NOT thread-safe. */ -public final class IcingSearchEngine implements Closeable { +public class IcingSearchEngine implements Closeable { private static final String TAG = "IcingSearchEngine"; private static final ExtensionRegistryLite EXTENSION_REGISTRY_LITE = @@ -434,10 +438,10 @@ public final class IcingSearchEngine implements Closeable { } @NonNull - public PersistToDiskResultProto persistToDisk() { + public PersistToDiskResultProto persistToDisk(@NonNull PersistType.Code persistTypeCode) { throwIfClosed(); - byte[] persistToDiskResultBytes = nativePersistToDisk(this); + byte[] persistToDiskResultBytes = nativePersistToDisk(this, persistTypeCode.getNumber()); if (persistToDiskResultBytes == null) { Log.e(TAG, "Received null PersistToDiskResultProto from native."); return PersistToDiskResultProto.newBuilder() @@ -501,6 +505,29 @@ public final class IcingSearchEngine implements Closeable { } @NonNull + public StorageInfoResultProto getStorageInfo() { + throwIfClosed(); + + byte[] storageInfoResultProtoBytes = nativeGetStorageInfo(this); + if (storageInfoResultProtoBytes == null) { + Log.e(TAG, "Received null StorageInfoResultProto from native."); + return StorageInfoResultProto.newBuilder() + .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL)) + .build(); + } + + try { + return StorageInfoResultProto.parseFrom( + storageInfoResultProtoBytes, EXTENSION_REGISTRY_LITE); + } catch (InvalidProtocolBufferException e) { + Log.e(TAG, "Error parsing GetOptimizeInfoResultProto.", e); + return 
StorageInfoResultProto.newBuilder() + .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL)) + .build(); + } + } + + @NonNull public ResetResultProto reset() { throwIfClosed(); @@ -568,11 +595,13 @@ public final class IcingSearchEngine implements Closeable { private static native byte[] nativeDeleteByQuery( IcingSearchEngine instance, byte[] searchSpecBytes); - private static native byte[] nativePersistToDisk(IcingSearchEngine instance); + private static native byte[] nativePersistToDisk(IcingSearchEngine instance, int persistType); private static native byte[] nativeOptimize(IcingSearchEngine instance); private static native byte[] nativeGetOptimizeInfo(IcingSearchEngine instance); + private static native byte[] nativeGetStorageInfo(IcingSearchEngine instance); + private static native byte[] nativeReset(IcingSearchEngine instance); } diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java index 56edaf1..0cee80c 100644 --- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java +++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java @@ -32,6 +32,7 @@ import com.google.android.icing.proto.IcingSearchEngineOptions; import com.google.android.icing.proto.InitializeResultProto; import com.google.android.icing.proto.OptimizeResultProto; import com.google.android.icing.proto.PersistToDiskResultProto; +import com.google.android.icing.proto.PersistType; import com.google.android.icing.proto.PropertyConfigProto; import com.google.android.icing.proto.PropertyProto; import com.google.android.icing.proto.PutResultProto; @@ -44,7 +45,10 @@ import com.google.android.icing.proto.ScoringSpecProto; import com.google.android.icing.proto.SearchResultProto; import com.google.android.icing.proto.SearchSpecProto; import com.google.android.icing.proto.SetSchemaResultProto; +import 
com.google.android.icing.proto.SnippetMatchProto; +import com.google.android.icing.proto.SnippetProto; import com.google.android.icing.proto.StatusProto; +import com.google.android.icing.proto.StorageInfoResultProto; import com.google.android.icing.proto.StringIndexingConfig; import com.google.android.icing.proto.StringIndexingConfig.TokenizerType; import com.google.android.icing.proto.TermMatchType; @@ -394,7 +398,8 @@ public final class IcingSearchEngineTest { public void testPersistToDisk() throws Exception { assertStatusOk(icingSearchEngine.initialize().getStatus()); - PersistToDiskResultProto persistToDiskResultProto = icingSearchEngine.persistToDisk(); + PersistToDiskResultProto persistToDiskResultProto = + icingSearchEngine.persistToDisk(PersistType.Code.LITE); assertStatusOk(persistToDiskResultProto.getStatus()); } @@ -417,6 +422,14 @@ public final class IcingSearchEngineTest { } @Test + public void testGetStorageInfo() throws Exception { + assertStatusOk(icingSearchEngine.initialize().getStatus()); + + StorageInfoResultProto storageInfoResultProto = icingSearchEngine.getStorageInfo(); + assertStatusOk(storageInfoResultProto.getStatus()); + } + + @Test public void testGetAllNamespaces() throws Exception { assertStatusOk(icingSearchEngine.initialize().getStatus()); @@ -475,6 +488,141 @@ public final class IcingSearchEngineTest { assertStatusOk(reportUsageResultProto.getStatus()); } + @Test + public void testCJKTSnippets() throws Exception { + assertStatusOk(icingSearchEngine.initialize().getStatus()); + + SchemaProto schema = SchemaProto.newBuilder().addTypes(createEmailTypeConfig()).build(); + assertStatusOk( + icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus()); + + // String: "天是蓝的" + // ^ ^^ ^ + // UTF16 idx: 0 1 2 3 + // Breaks into segments: "天", "是", "蓝", "的" + // "The sky is blue" + String chinese = "天是蓝的"; + assertThat(chinese.length()).isEqualTo(4); + DocumentProto emailDocument1 = + 
createEmailDocument("namespace", "uri1").toBuilder() + .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(chinese)) + .build(); + assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus()); + + // Search and request snippet matching but no windowing. + SearchSpecProto searchSpec = + SearchSpecProto.newBuilder() + .setQuery("是") + .setTermMatchType(TermMatchType.Code.PREFIX) + .build(); + ResultSpecProto resultSpecProto = + ResultSpecProto.newBuilder() + .setSnippetSpec( + ResultSpecProto.SnippetSpecProto.newBuilder() + .setNumToSnippet(Integer.MAX_VALUE) + .setNumMatchesPerProperty(Integer.MAX_VALUE)) + .build(); + + // Search and make sure that we got a single successful results + SearchResultProto searchResultProto = + icingSearchEngine.search( + searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto); + assertStatusOk(searchResultProto.getStatus()); + assertThat(searchResultProto.getResultsCount()).isEqualTo(1); + + // Ensure that one and only one property was matched and it was "subject" + SnippetProto snippetProto = searchResultProto.getResults(0).getSnippet(); + assertThat(snippetProto.getEntriesList()).hasSize(1); + SnippetProto.EntryProto entryProto = snippetProto.getEntries(0); + assertThat(entryProto.getPropertyName()).isEqualTo("subject"); + + // Get the content for "subject" and see what the match is. 
+ DocumentProto resultDocument = searchResultProto.getResults(0).getDocument(); + assertThat(resultDocument.getPropertiesList()).hasSize(1); + PropertyProto subjectProperty = resultDocument.getProperties(0); + assertThat(subjectProperty.getName()).isEqualTo("subject"); + assertThat(subjectProperty.getStringValuesList()).hasSize(1); + String content = subjectProperty.getStringValues(0); + + // Ensure that there is one and only one match within "subject" + assertThat(entryProto.getSnippetMatchesList()).hasSize(1); + SnippetMatchProto matchProto = entryProto.getSnippetMatches(0); + + int matchStart = matchProto.getExactMatchUtf16Position(); + int matchEnd = matchStart + matchProto.getExactMatchUtf16Length(); + assertThat(matchStart).isEqualTo(1); + assertThat(matchEnd).isEqualTo(2); + String match = content.substring(matchStart, matchEnd); + assertThat(match).isEqualTo("是"); + } + + @Test + public void testUtf16MultiByteSnippets() throws Exception { + assertStatusOk(icingSearchEngine.initialize().getStatus()); + + SchemaProto schema = SchemaProto.newBuilder().addTypes(createEmailTypeConfig()).build(); + assertStatusOk( + icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus()); + + // String: "𐀀𐀁 𐀂𐀃 𐀄" + // ^ ^ ^ + // UTF16 idx: 0 5 10 + // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄" + String text = "𐀀𐀁 𐀂𐀃 𐀄"; + assertThat(text.length()).isEqualTo(12); + DocumentProto emailDocument1 = + createEmailDocument("namespace", "uri1").toBuilder() + .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(text)) + .build(); + assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus()); + + // Search and request snippet matching but no windowing. 
+ SearchSpecProto searchSpec = + SearchSpecProto.newBuilder() + .setQuery("𐀂") + .setTermMatchType(TermMatchType.Code.PREFIX) + .build(); + ResultSpecProto resultSpecProto = + ResultSpecProto.newBuilder() + .setSnippetSpec( + ResultSpecProto.SnippetSpecProto.newBuilder() + .setNumToSnippet(Integer.MAX_VALUE) + .setNumMatchesPerProperty(Integer.MAX_VALUE)) + .build(); + + // Search and make sure that we got a single successful results + SearchResultProto searchResultProto = + icingSearchEngine.search( + searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto); + assertStatusOk(searchResultProto.getStatus()); + assertThat(searchResultProto.getResultsCount()).isEqualTo(1); + + // Ensure that one and only one property was matched and it was "subject" + SnippetProto snippetProto = searchResultProto.getResults(0).getSnippet(); + assertThat(snippetProto.getEntriesList()).hasSize(1); + SnippetProto.EntryProto entryProto = snippetProto.getEntries(0); + assertThat(entryProto.getPropertyName()).isEqualTo("subject"); + + // Get the content for "subject" and see what the match is. 
+ DocumentProto resultDocument = searchResultProto.getResults(0).getDocument(); + assertThat(resultDocument.getPropertiesList()).hasSize(1); + PropertyProto subjectProperty = resultDocument.getProperties(0); + assertThat(subjectProperty.getName()).isEqualTo("subject"); + assertThat(subjectProperty.getStringValuesList()).hasSize(1); + String content = subjectProperty.getStringValues(0); + + // Ensure that there is one and only one match within "subject" + assertThat(entryProto.getSnippetMatchesList()).hasSize(1); + SnippetMatchProto matchProto = entryProto.getSnippetMatches(0); + + int matchStart = matchProto.getExactMatchUtf16Position(); + int matchEnd = matchStart + matchProto.getExactMatchUtf16Length(); + assertThat(matchStart).isEqualTo(5); + assertThat(matchEnd).isEqualTo(9); + String match = content.substring(matchStart, matchEnd); + assertThat(match).isEqualTo("𐀂𐀃"); + } + private static void assertStatusOk(StatusProto status) { assertWithMessage(status.getMessage()).that(status.getCode()).isEqualTo(StatusProto.Code.OK); } diff --git a/proto/icing/proto/document.proto b/proto/icing/proto/document.proto index d55b7e2..2e8321b 100644 --- a/proto/icing/proto/document.proto +++ b/proto/icing/proto/document.proto @@ -110,11 +110,11 @@ message PutResultProto { // go/icing-library-apis. optional StatusProto status = 1; - // Stats of the function call. Inside NativePutDocumentStats, the function + // Stats of the function call. Inside PutDocumentStatsProto, the function // call latency 'latency_ms' will always be populated. The other fields will // be accurate only when the status above is OK. See logging.proto for // details. - optional NativePutDocumentStats native_put_document_stats = 2; + optional PutDocumentStatsProto put_document_stats = 2; } // Result of a call to IcingSearchEngine.Get @@ -167,7 +167,7 @@ message DeleteResultProto { optional StatusProto status = 1; // Stats for delete execution performance. 
- optional NativeDeleteStats delete_stats = 2; + optional DeleteStatsProto delete_stats = 2; } // Result of a call to IcingSearchEngine.DeleteByNamespace @@ -186,7 +186,7 @@ message DeleteByNamespaceResultProto { optional StatusProto status = 1; // Stats for delete execution performance. - optional NativeDeleteStats delete_stats = 2; + optional DeleteStatsProto delete_stats = 2; } // Result of a call to IcingSearchEngine.DeleteBySchemaType @@ -205,11 +205,11 @@ message DeleteBySchemaTypeResultProto { optional StatusProto status = 1; // Stats for delete execution performance. - optional NativeDeleteStats delete_stats = 2; + optional DeleteStatsProto delete_stats = 2; } // Result of a call to IcingSearchEngine.DeleteByQuery -// Next tag: 3 +// Next tag: 4 message DeleteByQueryResultProto { // Status code can be one of: // OK @@ -224,5 +224,7 @@ message DeleteByQueryResultProto { optional StatusProto status = 1; // Stats for delete execution performance. - optional NativeDeleteStats delete_stats = 2; + optional DeleteByQueryStatsProto delete_by_query_stats = 3; + + reserved 2; } diff --git a/proto/icing/proto/document_wrapper.proto b/proto/icing/proto/document_wrapper.proto index e8eb992..929ee33 100644 --- a/proto/icing/proto/document_wrapper.proto +++ b/proto/icing/proto/document_wrapper.proto @@ -20,7 +20,6 @@ import "icing/proto/document.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; - option objc_class_prefix = "ICNG"; // DocumentWrapper as a wrapper of the user-facing DocumentProto is meant to @@ -30,6 +29,5 @@ option objc_class_prefix = "ICNG"; message DocumentWrapper { optional DocumentProto document = 1; - // Indicates if the document is marked as deleted - optional bool deleted = 2; + reserved 2; } diff --git a/proto/icing/proto/initialize.proto b/proto/icing/proto/initialize.proto index ae2944c..ab2556d 100644 --- a/proto/icing/proto/initialize.proto +++ b/proto/icing/proto/initialize.proto @@ -16,12 
+16,11 @@ syntax = "proto2"; package icing.lib; -import "icing/proto/status.proto"; import "icing/proto/logging.proto"; +import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; - option objc_class_prefix = "ICNG"; // Next tag: 5 @@ -89,11 +88,11 @@ message InitializeResultProto { // go/icing-library-apis. optional StatusProto status = 1; - // Stats of the function call. Inside NativeInitializeStats, the function call + // Stats of the function call. Inside InitializeStatsProto, the function call // latency 'latency_ms' will always be populated. The other fields will be // accurate only when the status above is OK or WARNING_DATA_LOSS. See // logging.proto for details. - optional NativeInitializeStats native_initialize_stats = 2; + optional InitializeStatsProto initialize_stats = 2; // TODO(b/147699081): Add a field to indicate lost_schema and lost_documents. // go/icing-library-apis. diff --git a/proto/icing/proto/internal/optimize.proto b/proto/icing/proto/internal/optimize.proto new file mode 100644 index 0000000..4ed3d73 --- /dev/null +++ b/proto/icing/proto/internal/optimize.proto @@ -0,0 +1,29 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto2"; + +package icing.lib; + +option java_package = "com.google.android.icing.internal.proto"; +option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + +// A status that is saved internally in Icing to track information about how +// often Optimize runs. +// Next tag: 2 +message OptimizeStatusProto { + // The Epoch time at which the last successfuly optimize ran. + optional int64 last_successful_optimize_run_time_ms = 1; +} diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto index 09ec756..7abbf4a 100644 --- a/proto/icing/proto/logging.proto +++ b/proto/icing/proto/logging.proto @@ -23,8 +23,8 @@ option java_multiple_files = true; option objc_class_prefix = "ICNG"; // Stats of the top-level function IcingSearchEngine::Initialize(). -// Next tag: 11 -message NativeInitializeStats { +// Next tag: 12 +message InitializeStatsProto { // Overall time used for the function call. optional int32 latency_ms = 1; @@ -40,8 +40,9 @@ message NativeInitializeStats { // Data in index is inconsistent with ground truth. INCONSISTENT_WITH_GROUND_TRUTH = 2; - // Total checksum of all the components does not match. - TOTAL_CHECKSUM_MISMATCH = 3; + // Changes were made to the schema, but possibly not fully applied to the + // document store and the index - requiring a recovery. + SCHEMA_CHANGES_OUT_OF_SYNC = 3; // Random I/O errors. IO_ERROR = 4; @@ -49,13 +50,13 @@ message NativeInitializeStats { // Possible recovery causes for document store: // - DATA_LOSS - // - TOTAL_CHECKSUM_MISMATCH + // - SCHEMA_CHANGES_OUT_OF_SYNC // - IO_ERROR optional RecoveryCause document_store_recovery_cause = 2; // Possible recovery causes for index: // - INCONSISTENT_WITH_GROUND_TRUTH - // - TOTAL_CHECKSUM_MISMATCH + // - SCHEMA_CHANGES_OUT_OF_SYNC // - IO_ERROR optional RecoveryCause index_restoration_cause = 3; @@ -91,11 +92,15 @@ message NativeInitializeStats { // Number of schema types currently in schema store. 
optional int32 num_schema_types = 10; + + // Number of consecutive initialization failures that immediately preceded + // this initialization. + optional int32 num_previous_init_failures = 11; } // Stats of the top-level function IcingSearchEngine::Put(). // Next tag: 7 -message NativePutDocumentStats { +message PutDocumentStatsProto { // Overall time used for the function call. optional int32 latency_ms = 1; @@ -125,8 +130,11 @@ message NativePutDocumentStats { // Stats of the top-level function IcingSearchEngine::Search() and // IcingSearchEngine::GetNextPage(). -// Next tag: 15 -message NativeQueryStats { +// Next tag: 17 +message QueryStatsProto { + // The UTF-8 length of the query string + optional int32 query_length = 16; + // Number of terms in the query string. optional int32 num_terms = 1; @@ -154,7 +162,7 @@ message NativeQueryStats { optional int32 num_documents_scored = 8; // How many of the results in the page returned were snippeted. - optional bool num_results_snippeted = 9; + optional int32 num_results_with_snippets = 15; // Overall time used for the function call. optional int32 latency_ms = 10; @@ -172,13 +180,14 @@ message NativeQueryStats { // Time used to fetch the document protos. Note that it includes the // time to snippet if ‘has_snippets’ is true. optional int32 document_retrieval_latency_ms = 14; + + reserved 9; } // Stats of the top-level functions IcingSearchEngine::Delete, -// IcingSearchEngine::DeleteByNamespace, IcingSearchEngine::DeleteBySchemaType, -// IcingSearchEngine::DeleteByQuery. +// IcingSearchEngine::DeleteByNamespace, IcingSearchEngine::DeleteBySchemaType. // Next tag: 4 -message NativeDeleteStats { +message DeleteStatsProto { // Overall time used for the function call. optional int32 latency_ms = 1; @@ -190,8 +199,10 @@ message NativeDeleteStats { // Delete one document. SINGLE = 1; - // Delete by query. - QUERY = 2; + // Delete by query. This value is deprecated. 
+ // IcingSearchEngine::DeleteByQuery will return a DeleteByQueryStatsProto + // rather than a DeleteStatsProto. + DEPRECATED_QUERY = 2 [deprecated = true]; // Delete by namespace. NAMESPACE = 3; @@ -204,4 +215,33 @@ message NativeDeleteStats { // Number of documents deleted by this call. optional int32 num_documents_deleted = 3; -}
\ No newline at end of file +} + +// Stats of the top-level functions IcingSearchEngine::DeleteByQuery. +// Next tag: 9 +message DeleteByQueryStatsProto { + // Overall time used for the function call. + optional int32 latency_ms = 1; + + // Number of documents deleted by this call. + optional int32 num_documents_deleted = 2; + + // The UTF-8 length of the query string + optional int32 query_length = 3; + + // Number of terms in the query string. + optional int32 num_terms = 4; + + // Number of namespaces filtered. + optional int32 num_namespaces_filtered = 5; + + // Number of schema types filtered. + optional int32 num_schema_types_filtered = 6; + + // Time used to parse the query, including 2 parts: tokenizing and + // transforming tokens into an iterator tree. + optional int32 parse_query_latency_ms = 7; + + // Time used to delete each document. + optional int32 document_removal_latency_ms = 8; +} diff --git a/proto/icing/proto/optimize.proto b/proto/icing/proto/optimize.proto index 1baa64c..42290f3 100644 --- a/proto/icing/proto/optimize.proto +++ b/proto/icing/proto/optimize.proto @@ -23,7 +23,7 @@ option java_multiple_files = true; option objc_class_prefix = "ICNG"; // Result of a call to IcingSearchEngine.Optimize -// Next tag: 2 +// Next tag: 3 message OptimizeResultProto { // Status code can be one of: // OK @@ -35,12 +35,13 @@ message OptimizeResultProto { // See status.proto for more details. optional StatusProto status = 1; + optional OptimizeStatsProto optimize_stats = 2; // TODO(b/147699081): Add a field to indicate lost_schema and lost_documents. // go/icing-library-apis. } // Result of a call to IcingSearchEngine.GetOptimizeInfo -// Next tag: 4 +// Next tag: 5 message GetOptimizeInfoResultProto { // Status code can be one of: // OK @@ -57,4 +58,37 @@ message GetOptimizeInfoResultProto { // Estimated bytes that could be recovered. The exact size per document isn't // tracked, so this is based off an average document size. 
optional int64 estimated_optimizable_bytes = 3; + + // The amount of time since the last optimize ran. + optional int64 time_since_last_optimize_ms = 4; +} + +// Next tag: 10 +message OptimizeStatsProto { + // Overall time used for the function call. + optional int32 latency_ms = 1; + + // Time used to optimize the document store. + optional int32 document_store_optimize_latency_ms = 2; + + // Time used to restore the index. + optional int32 index_restoration_latency_ms = 3; + + // Number of documents before the optimization. + optional int32 num_original_documents = 4; + + // Number of documents deleted. + optional int32 num_deleted_documents = 5; + + // Number of documents expired. + optional int32 num_expired_documents = 6; + + // Size of storage before the optimize. + optional int64 storage_size_before = 7; + + // Size of storage after the optimize. + optional int64 storage_size_after = 8; + + // The amount of time since the last optimize ran. + optional int64 time_since_last_optimize_ms = 9; } diff --git a/proto/icing/proto/persist.proto b/proto/icing/proto/persist.proto index 77cf987..8d6b372 100644 --- a/proto/icing/proto/persist.proto +++ b/proto/icing/proto/persist.proto @@ -22,6 +22,28 @@ option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; option objc_class_prefix = "ICNG"; +// The type of persistence guarantee that PersistToDisk should provide. +// Next tag: 3 +message PersistType { + enum Code { + // Default. Should never be used. + UNKNOWN = 0; + + // Only persist the ground truth. A successful PersistToDisk(LITE) should + // ensure that no data is lost the next time Icing initializes. This + // should be called after each batch of mutations. + LITE = 1; + + // Persists all data in internal Icing components. A successful + // PersistToDisk(FULL) should not only ensure no data loss like + // PersistToDisk(LITE), but also prevent the need to recover internal data + // structures the next time Icing initializes. 
This should be called at + // some point before the app terminates. + FULL = 2; + } + optional Code code = 1; +} + // Result of a call to IcingSearchEngine.Persist // Next tag: 2 message PersistToDiskResultProto { diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto index 4188a8c..c611cbf 100644 --- a/proto/icing/proto/schema.proto +++ b/proto/icing/proto/schema.proto @@ -197,7 +197,7 @@ message SchemaProto { } // Result of a call to IcingSearchEngine.SetSchema -// Next tag: 4 +// Next tag: 8 message SetSchemaResultProto { // Status code can be one of: // OK @@ -221,6 +221,21 @@ message SetSchemaResultProto { // documents that fail validation against the new schema types would also be // deleted. repeated string incompatible_schema_types = 3; + + // Schema types that did not exist in the previous schema and were added with + // the new schema type. + repeated string new_schema_types = 4; + + // Schema types that were changed in a way that was backwards compatible and + // didn't invalidate the index. + repeated string fully_compatible_changed_schema_types = 5; + + // Schema types that were changed in a way that was backwards compatible, but + // invalidated the index. + repeated string index_incompatible_changed_schema_types = 6; + + // Overall time used for the function call. + optional int32 latency_ms = 7; } // Result of a call to IcingSearchEngine.GetSchema diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto index 6c4e3c9..544995e 100644 --- a/proto/icing/proto/search.proto +++ b/proto/icing/proto/search.proto @@ -65,7 +65,7 @@ message SearchSpecProto { // Client-supplied specifications on what to include/how to format the search // results. -// Next tag: 5 +// Next tag: 6 message ResultSpecProto { // The results will be returned in pages, and num_per_page specifies the // number of documents in one page. 
@@ -102,34 +102,95 @@ message ResultSpecProto { // has been specified for a schema type, then *all* properties of that schema // type will be retrieved. repeated TypePropertyMask type_property_masks = 4; + + // Groupings of namespaces whose total returned results should be + // limited together. + // Next tag: 3 + message ResultGrouping { + // The namespaces in this grouping. + repeated string namespaces = 1; + + // The maximum number of results in this grouping that should be returned. + optional int32 max_results = 2; + } + + // How to limit the number of results returned per set of namespaces. If + // results match for a namespace that is not present in any result groupings, + // then those results will be returned without limit. + // + // Non-existent namespaces will be ignored. + // + // Example : Suppose that there are four namespaces each with three results + // matching the query for "foo". Without any result groupings, Icing would + // return the following results: + // ["ns0doc0", "ns0doc1", "ns1doc0", "ns3doc0", "ns0doc2", "ns3doc1", + // "ns2doc1", "ns3doc2", "ns2doc0", "ns1doc1", "ns2doc2", "ns1doc1"]. + // + // and the following result groupings: + // [ { ["namespace0"], 2 }, { ["namespace1", "namespace2"], 2} ] + // + // The following results will be returned: + // ["ns0doc0", "ns0doc1", "ns1doc0", "ns3doc0", "ns3doc1", "ns2doc1", + // "ns3doc2"]. + repeated ResultGrouping result_groupings = 5; } // The representation of a single match within a DocumentProto property. -// Next tag: 6 +// +// Example : A document whose content is "Necesito comprar comida mañana." and a +// query for "mana" with window=15 +// Next tag: 12 message SnippetMatchProto { - // Properties may have multiple values. values_index indicates which of these - // multiple string values the match occurred in. For properties with only one - // value, the values_index will always be 0. - // Ex. 
"Recipients" [ - // { { "Name" : "Daffy Duck" } - // { "EmailAddress" : "daffduck@gmail.com" } }, - // { { "Name" : "Donald Duck" } - // { "EmailAddress" : "donduck@gmail.com" } } - // "Daffy Duck" is the string value with a value_index of 0 for property - // "Recipients.Name". "Donald Duck" is the string value with a value_index of - // 1 for property "Recipients.Name". - optional int32 values_index = 1; - - // The position and length within the matched string at which the exact - // match begins. - optional int32 exact_match_position = 2; - - optional int32 exact_match_bytes = 3; - - // The position and length of the suggested snippet window. - optional int32 window_position = 4; - - optional int32 window_bytes = 5; + // The index of the byte in the string at which the match begins and the + // length in bytes of the match. + // + // For the example above, the values of these fields would be + // exact_match_byte_position=24, exact_match_byte_length=7 "mañana" + optional int32 exact_match_byte_position = 2; + optional int32 exact_match_byte_length = 3; + + // The length in bytes of the subterm that matches the query. The beginning of + // the submatch is the same as exact_match_byte_position. + // + // For the example above, the value of this field would be 5. With + // exact_match_byte_position=24 above, it would produce the substring "maña" + optional int32 submatch_byte_length = 10; + + // The index of the UTF-16 code unit in the string at which the match begins + // and the length in UTF-16 code units of the match. This is for use with + // UTF-16 encoded strings like Java.lang.String. + // + // For the example above, the values of these fields would be + // exact_match_utf16_position=24, exact_match_utf16_length=6 "mañana" + optional int32 exact_match_utf16_position = 6; + optional int32 exact_match_utf16_length = 7; + + // The length in UTF-16 code units of the subterm that matches the query. 
The + // beginning of the submatch is the same as exact_match_utf16_position. This + // is for use with UTF-16 encoded strings like Java.lang.String. + // + // For the example above, the value of this field would be 4. With + // exact_match_utf16_position=24 above, it would produce the substring "maña" + optional int32 submatch_utf16_length = 11; + + // The index of the byte in the string at which the suggested snippet window + // begins and the length in bytes of the window. + // + // For the example above, the values of these fields would be + // window_byte_position=17, window_byte_length=15 "comida mañana." + optional int32 window_byte_position = 4; + optional int32 window_byte_length = 5; + + // The index of the UTF-16 code unit in the string at which the suggested + // snippet window begins and the length in UTF-16 code units of the window. + // This is for use with UTF-16 encoded strings like Java.lang.String. + // + // For the example above, the values of these fields would be + // window_utf16_position=17, window_utf16_length=14 "comida mañana." + optional int32 window_utf16_position = 8; + optional int32 window_utf16_length = 9; + + reserved 1; } // A Proto representing all snippets for a single DocumentProto. @@ -139,9 +200,29 @@ message SnippetProto { // property values in the corresponding DocumentProto. // Next tag: 3 message EntryProto { - // A '.'-delimited sequence of property names indicating which property in - // the DocumentProto these snippets correspond to. - // Example properties: 'body', 'sender.name', 'sender.emailaddress', etc. + // A property path indicating which property in the DocumentProto these + // snippets correspond to. Property paths will contain 1) property names, + // 2) the property separator character '.' used to represent nested property + // and 3) indices surrounded by brackets to represent a specific value in + // that property. 
+ // + // Example properties: + // - 'body' : the first and only string value of a top-level + // property called 'body'. + // - 'sender.name' : the first and only string value of a property + // called 'name' that is a subproperty of a + // property called 'sender'. + // - 'bcc[1].emailaddress': the first and only string value of a property + // called 'emailaddress' that is a subproperty of + // the second document value of a property called + // 'bcc'. + // - 'attachments[0]' : the first (of more than one) string value of a + // property called 'attachments'. + // NOTE: If there is only a single value for a property (like + // 'sender.name'), then no value index will be added to the property path. + // An index of [0] is implied. If there is more than one value for a + // property, then the value index will be added to the property path (like + // 'attachements[0]'). optional string property_name = 1; repeated SnippetMatchProto snippet_matches = 2; @@ -167,7 +248,7 @@ message SearchResultProto { optional StatusProto status = 1; // The Results that matched the query. Empty if there was an error. - // Next tag: 3 + // Next tag: 4 message ResultProto { // Document that matches the SearchSpecProto. optional DocumentProto document = 1; @@ -175,6 +256,10 @@ message SearchResultProto { // Snippeting information for the document if requested in the // ResultSpecProto. A default instance, if not requested. optional SnippetProto snippet = 2; + + // The score that the document was ranked by. The meaning of this score is + // determined by ScoringSpecProto.rank_by. + optional double score = 3; } repeated ResultProto results = 2; @@ -198,7 +283,7 @@ message SearchResultProto { // LINT.ThenChange(//depot/google3/icing/result/result-state-manager.h:kInvalidNextPageToken) // Stats for query execution performance. 
- optional NativeQueryStats query_stats = 5; + optional QueryStatsProto query_stats = 5; } // Next tag: 3 diff --git a/proto/icing/proto/storage.proto b/proto/icing/proto/storage.proto new file mode 100644 index 0000000..39dab6b --- /dev/null +++ b/proto/icing/proto/storage.proto @@ -0,0 +1,187 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package icing.lib; + +import "icing/proto/status.proto"; + +option java_package = "com.google.android.icing.proto"; +option java_multiple_files = true; +option objc_class_prefix = "ICNG"; + +// Next tag: 10 +message NamespaceStorageInfoProto { + // Name of the namespace + optional string namespace = 1; + + // Number of alive documents in this namespace. + optional int32 num_alive_documents = 2; + + // NOTE: We don't have stats on number of deleted documents in a namespace + // since we completely erase all data on a document when it's deleted. And we + // can't figure out which namespace it belonged to. + + // Number of expired documents in this namespace. 
+ optional int32 num_expired_documents = 3; + + // LINT.IfChange(namespace_storage_info_usage_types) + // Number of alive documents that have a UsageReport.usage_type reported + optional int32 num_alive_documents_usage_type1 = 4; + optional int32 num_alive_documents_usage_type2 = 5; + optional int32 num_alive_documents_usage_type3 = 6; + + // Number of expired documents that have a UsageReport.usage_type reported + optional int32 num_expired_documents_usage_type1 = 7; + optional int32 num_expired_documents_usage_type2 = 8; + optional int32 num_expired_documents_usage_type3 = 9; + // LINT.ThenChange() +} + +// Next tag: 15 +message DocumentStorageInfoProto { + // Total number of alive documents. + optional int32 num_alive_documents = 1; + + // Total number of deleted documents. + optional int32 num_deleted_documents = 2; + + // Total number of expired documents. + optional int32 num_expired_documents = 3; + + // Total size of the document store in bytes. Will be set to -1 if an IO error + // is encountered while calculating this field. + optional int64 document_store_size = 4; + + // Total size of the ground truth in bytes. The ground truth may + // include deleted or expired documents. Will be set to -1 if an IO error is + // encountered while calculating this field. + optional int64 document_log_size = 5; + + // Size of the key mapper in bytes. Will be set to -1 if an IO error is + // encountered while calculating this field. + optional int64 key_mapper_size = 6; + + // Size of the document id mapper in bytes. Will be set to -1 if an IO error + // is encountered while calculating this field. + optional int64 document_id_mapper_size = 7; + + // Size of the score cache in bytes. Will be set to -1 if an IO error is + // encountered while calculating this field. + optional int64 score_cache_size = 8; + + // Size of the filter cache in bytes. Will be set to -1 if an IO error is + // encountered while calculating this field. 
+ optional int64 filter_cache_size = 9; + + // Size of the corpus mapper in bytes. Will be set to -1 if an IO error is + // encountered while calculating this field. + optional int64 corpus_mapper_size = 10; + + // Size of the corpus score cache in bytes. Will be set to -1 if an IO error + // is encountered while calculating this field. + optional int64 corpus_score_cache_size = 11; + + // Size of the namespace id mapper in bytes. Will be set to -1 if an IO error + // is encountered while calculating this field. + optional int64 namespace_id_mapper_size = 12; + + // Number of namespaces seen from the current documents. + // + // TODO(cassiewang): This isn't technically needed anymore since clients can + // get this number from namespace_storage_info. Consider removing this. + optional int32 num_namespaces = 13; + + // Storage information of each namespace. + repeated NamespaceStorageInfoProto namespace_storage_info = 14; +} + +// Next tag: 5 +message SchemaStoreStorageInfoProto { + // Size of the schema store in bytes. Will be set to -1 if an IO error is + // encountered while calculating this field. + optional int64 schema_store_size = 1; + + // Total number of schema types. + optional int32 num_schema_types = 2; + + // Total number of all sections across all types + optional int32 num_total_sections = 3; + + // Total number of types at the current section limit. + optional int32 num_schema_types_sections_exhausted = 4; +} + +// Next tag: 9 +message IndexStorageInfoProto { + // Total size of the index in bytes. Will be set to -1 if an IO error is + // encountered while calculating this field. + optional int64 index_size = 1; + + // Size of the lite index lexicon in bytes. Will be set to -1 if an IO error + // is encountered while calculating this field. + optional int64 lite_index_lexicon_size = 2; + + // Size of the lite index hit buffer in bytes. Will be set to -1 if an IO + // error is encountered while calculating this field. 
+ optional int64 lite_index_hit_buffer_size = 3; + + // Size of the main index lexicon in bytes. Will be set to -1 if an IO error + // is encountered while calculating this field. + optional int64 main_index_lexicon_size = 4; + + // Size of the main index storage in bytes. Will be set to -1 if an IO error + // is encountered while calculating this field. + optional int64 main_index_storage_size = 5; + + // Size of one main index block in bytes. + optional int64 main_index_block_size = 6; + + // Number of main index blocks. + optional int32 num_blocks = 7; + + // Percentage of the main index blocks that are free, assuming + // allocated blocks are fully used. + optional float min_free_fraction = 8; +} + +// Next tag: 5 +message StorageInfoProto { + // Total size of Icing’s storage in bytes. Will be set to -1 if an IO error is + // encountered while calculating this field. + optional int64 total_storage_size = 1; + + // Storage information of the document store. + optional DocumentStorageInfoProto document_storage_info = 2; + + // Storage information of the schema store. + optional SchemaStoreStorageInfoProto schema_store_storage_info = 3; + + // Storage information of the index. + optional IndexStorageInfoProto index_storage_info = 4; +} + +// Next tag: 3 +message StorageInfoResultProto { + // Status code can be one of: + // OK + // FAILED_PRECONDITION + // + // See status.proto for more details. + optional StatusProto status = 1; + + // Storage information of Icing. + optional StorageInfoProto storage_info = 2; +} diff --git a/proto/icing/proto/usage.proto b/proto/icing/proto/usage.proto index 7f31a2b..eaa2671 100644 --- a/proto/icing/proto/usage.proto +++ b/proto/icing/proto/usage.proto @@ -20,13 +20,11 @@ import "icing/proto/status.proto"; option java_package = "com.google.android.icing.proto"; option java_multiple_files = true; - option objc_class_prefix = "ICNG"; // Representation of a usage report that is generated from the client and sent // to Icing. 
// Next tag: 5 -// LINT.IfChange message UsageReport { // Namespace of the document. optional string document_namespace = 1; @@ -37,6 +35,7 @@ message UsageReport { // Timestamp in milliseconds of when the usage happens. optional int64 usage_timestamp_ms = 3; + // LINT.IfChange // Next tag: 3 enum UsageType { // A custom usage type that clients can assign a meaning to. UsageReports of @@ -50,9 +49,12 @@ message UsageReport { // Same as above. USAGE_TYPE3 = 2; } + // LINT.ThenChange( + // //depot/google3/icing/store/usage-store.h:UsageScores, + // //depot/google3/icing/proto/\ + // storage.proto:namespace_storage_info_usage_types) optional UsageType usage_type = 4; } -// LINT.ThenChange(//depot/google3/icing/store/usage-store.h:UsageScores) // Result of a call to IcingSearchEngine.ReportUsage // Next tag: 2 @@ -64,4 +66,4 @@ message ReportUsageResultProto { // // See status.proto for more details. optional StatusProto status = 1; -}
\ No newline at end of file +} diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index af8248d..f0c066f 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=351841227) +set(synced_AOSP_CL_number=390638574) |