aboutsummaryrefslogtreecommitdiff
path: root/icing/file/memory-mapped-file.h
blob: 185d940b2459aeee52d3fe861e74b34ade71a8c7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Allows memory-mapping a full file or a specific region within the file.
// It also supports efficiently switching the region being mapped.
//
// Note on Performance:
// It supports different optimized strategies for common patterns on both
// read-only and read-write files. This includes using read-ahead buffers for
// faster reads as well as background-sync vs manual-sync of changes to disk.
// For more details, see comments at MemoryMappedFile::Strategy.
//
// ** Usage 1: pre-mmap large memory and grow the underlying file internally **
//
// // Create MemoryMappedFile instance.
// ICING_ASSIGN_OR_RETURN(
//     MemoryMappedFile mmapped_file,
//     MemoryMappedFile::Create(filesystem, "/file.pb",
//                              READ_WRITE_AUTO_SYNC,
//                              max_file_size,
//                              /*pre_mapping_file_offset=*/0,
//                              /*pre_mapping_mmap_size=*/1024 * 1024));
//
// // Found that we need 4K bytes for the file and mmapped region.
// mmapped_file.GrowAndRemapIfNecessary(
//     /*new_file_offset=*/0, /*new_mmap_size=*/4 * 1024);
// char read_byte = mmapped_file.region()[4000];
// mmapped_file.mutable_region()[4001] = write_byte;
//
// // Optional; immediately writes changes to disk.
// mmapped_file.PersistToDisk();
//
// // Found that we need 2048 * 1024 bytes for the file and mmapped region.
// mmapped_file.GrowAndRemapIfNecessary(
//     /*new_file_offset=*/0, /*new_mmap_size=*/2048 * 1024);
// mmapped_file.mutable_region()[2000 * 1024] = write_byte;
//
// // Optional; the destructor also frees any region that is still mapped.
// mmapped_file.Unmap();
//
// ** Usage 2: load by segments **
//
// ICING_ASSIGN_OR_RETURN(
//     MemoryMappedFile mmapped_file,
//     MemoryMappedFile::Create(filesystem, "/file.pb",
//                              READ_WRITE_AUTO_SYNC,
//                              max_file_size,
//                              /*pre_mapping_file_offset=*/0,
//                              /*pre_mapping_mmap_size=*/16 * 1024));
//
// // load the first 16K.
// mmapped_file.GrowAndRemapIfNecessary(
//     /*new_file_offset=*/0, /*new_mmap_size=*/16 * 1024);
// char read_byte = mmapped_file.region()[100];
// mmapped_file.mutable_region()[10] = write_byte;
//
// // Optional; immediately writes changes to disk.
// mmapped_file.PersistToDisk();
//
// // load the next 16K.
// mmapped_file.GrowAndRemapIfNecessary(
//     /*new_file_offset=*/16 * 1024, /*new_mmap_size=*/16 * 1024);
// mmapped_file.mutable_region()[10] = write_byte;
//
// // Optional; the destructor also frees any region that is still mapped.
// mmapped_file.Unmap();

#ifndef ICING_FILE_MEMORY_MAPPED_FILE_H_
#define ICING_FILE_MEMORY_MAPPED_FILE_H_

#include <unistd.h>

#include <algorithm>
#include <cstdint>
#include <string>
#include <string_view>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/file/filesystem.h"

namespace icing {
namespace lib {

// Memory-maps a file, or a region within a file, and exposes the mapped bytes
// through region()/mutable_region(). Instances are created through the Create()
// factories and are move-only (copying is deleted).
class MemoryMappedFile {
 public:
  // Returns the system page size in bytes as reported by
  // sysconf(_SC_PAGE_SIZE). The value is queried once and cached in a
  // function-local static, so repeated calls do not repeat the system call.
  static int64_t __attribute__((const)) system_page_size() {
    static const int64_t page_size =
        static_cast<int64_t>(sysconf(_SC_PAGE_SIZE));
    return page_size;
  }

  enum Strategy {
    // Memory map a read-only file into a read-only memory region.
    READ_ONLY,

    // Memory map a read-write file into a writable memory region. Any changes
    // made to the region are automatically flushed to the underlying file in
    // the background.
    READ_WRITE_AUTO_SYNC,

    // Memory map a read-write file into a writable memory region. Changes made
    // to this region will never be auto-synced to the underlying file. Unless
    // the caller explicitly calls PersistToDisk(), all changes will be lost
    // when the MemoryMappedFile is destroyed.
    READ_WRITE_MANUAL_SYNC,
  };

  // Absolute max file size, 16 GiB.
  static constexpr int64_t kMaxFileSize = INT64_C(1) << 34;

  // Default max file size, 1 MiB.
  static constexpr int64_t kDefaultMaxFileSize = INT64_C(1) << 20;

  // Creates a new MemoryMappedFile to read/write content to.
  //
  // filesystem    : Object to make system level calls
  // file_path     : Full path of the file that needs to be memory-mapped.
  // mmap_strategy : Strategy/optimizations to access the content.
  // max_file_size : Maximum file size for MemoryMappedFile, default
  //                 kDefaultMaxFileSize.
  //
  // Returns:
  //   A MemoryMappedFile instance on success
  //   OUT_OF_RANGE_ERROR if max_file_size is invalid
  //   INTERNAL_ERROR on I/O error
  static libtextclassifier3::StatusOr<MemoryMappedFile> Create(
      const Filesystem& filesystem, std::string_view file_path,
      Strategy mmap_strategy, int64_t max_file_size = kDefaultMaxFileSize);

  // Creates a new MemoryMappedFile to read/write content to. It remaps when
  // creating the instance, but doesn't check or grow the actual file size, so
  // the caller should call GrowAndRemapIfNecessary before accessing region.
  //
  // filesystem    : Object to make system level calls
  // file_path     : Full path of the file that needs to be memory-mapped.
  // mmap_strategy : Strategy/optimizations to access the content.
  // max_file_size : Maximum file size for MemoryMappedFile.
  // pre_mapping_file_offset : The offset of the file to be memory mapped.
  // pre_mapping_mmap_size   : mmap size for pre-mapping.
  //
  // Returns:
  //   A MemoryMappedFile instance on success
  //   OUT_OF_RANGE_ERROR if max_file_size, pre_mapping_file_offset, or
  //                      pre_mapping_mmap_size is invalid
  //   INTERNAL_ERROR on I/O error
  static libtextclassifier3::StatusOr<MemoryMappedFile> Create(
      const Filesystem& filesystem, std::string_view file_path,
      Strategy mmap_strategy, int64_t max_file_size,
      int64_t pre_mapping_file_offset, int64_t pre_mapping_mmap_size);

  // Delete copy constructor and assignment operator.
  MemoryMappedFile(const MemoryMappedFile& other) = delete;
  MemoryMappedFile& operator=(const MemoryMappedFile& other) = delete;

  // Move constructor and move assignment operator.
  // NOTE(review): neither is declared noexcept; adding it would also require
  // changing the out-of-line definitions, so it is left as is here.
  MemoryMappedFile(MemoryMappedFile&& other);
  MemoryMappedFile& operator=(MemoryMappedFile&& other);

  // Frees any region that is still memory-mapped.
  ~MemoryMappedFile();

  // TODO(b/247671531): migrate all callers to use GrowAndRemapIfNecessary and
  // deprecate this API.
  //
  // Memory-map the newly specified region within the file specified by
  // file_offset and mmap_size. Unmaps any previously mmapped region.
  // It doesn't handle the underlying file growth.
  //
  // Returns any encountered IO error.
  libtextclassifier3::Status Remap(int64_t file_offset, int64_t mmap_size);

  // Attempt to memory-map the newly specified region within the file specified
  // by new_file_offset and new_mmap_size. It handles mmap and file growth
  // intelligently.
  // - Compute least file size needed according to new_file_offset and
  //   new_mmap_size, and compare with the current file size. If requiring file
  //   growth, then grow the underlying file (Write) or return error if
  //   strategy_ is READ_ONLY.
  // - If new_file_offset is different from the current file_offset_ or
  //   new_mmap_size is greater than the current mmap_size_, then memory-map
  //   the newly specified region and unmap any previously mmapped region.
  //
  // This API is useful for file growth since it grows the underlying file
  // internally and handles remapping intelligently. By pre-mmapping a large
  // memory, we only need to grow the underlying file (Write) without remapping
  // in each round of growth, which significantly reduces the cost of system
  // call and memory paging after remap.
  //
  // Returns:
  //   OK on success
  //   OUT_OF_RANGE_ERROR if new_file_offset or new_mmap_size is invalid
  //   Any error from GrowFileSize() and RemapImpl()
  libtextclassifier3::Status GrowAndRemapIfNecessary(int64_t new_file_offset,
                                                     int64_t new_mmap_size);

  // Unmap and free up the region that has currently been memory mapped.
  void Unmap();

  // Explicitly persist any changes made to the currently mapped region to disk.
  //
  // NOTE: This is only valid if a read-write Strategy (READ_WRITE_AUTO_SYNC or
  // READ_WRITE_MANUAL_SYNC) was used.
  //
  // Returns:
  //   OK on success
  //   INTERNAL on I/O error
  //   FAILED_PRECONDITION if Strategy is not implemented
  libtextclassifier3::Status PersistToDisk();

  // Advise the system to help it optimize the memory-mapped region for
  // upcoming read/write operations.
  //
  // NOTE: See linux documentation of madvise() for additional details.
  enum AccessPattern {
    // Future memory access are expected to be in random order. So, readahead
    // will have limited impact on latency.
    ACCESS_RANDOM,

    // Future memory access are expected to be sequential. So, some readahead
    // can greatly improve latency.
    ACCESS_SEQUENTIAL,

    // Future memory access is expected to be high-volume and all over the file.
    // So, preloading the whole region into memory would greatly improve
    // latency.
    ACCESS_ALL,

    // Future memory access is expected to be rare. So, it is best to free up
    // as much of preloaded memory as possible.
    ACCESS_NONE,
  };
  // Applies the advice described by access_pattern (see AccessPattern above).
  // NOTE(review): the returned error conditions are defined by the
  // implementation, which is not visible in this header.
  libtextclassifier3::Status OptimizeFor(AccessPattern access_pattern);

  // The strategy this instance was created with.
  Strategy strategy() const { return strategy_; }

  // The maximum file size this instance was created with.
  int64_t max_file_size() const { return max_file_size_; }

  // Accessors to the memory-mapped region. Returns null if nothing is mapped.
  //
  // The returned pointer is mmap_result_ advanced by alignment_adjustment_,
  // i.e. it points at file_offset_ rather than at the page-aligned start of
  // the mapping. The null result relies on mmap_result_ being nullptr and
  // alignment_adjustment_ being 0 (their default values) whenever nothing is
  // mapped.
  // NOTE(review): confirm that Unmap()/RemapImpl() restore both on failure.
  const char* region() const {
    // static_cast is the appropriate named cast for void* -> char*.
    return static_cast<const char*>(mmap_result_) + alignment_adjustment_;
  }
  char* mutable_region() {
    return static_cast<char*>(mmap_result_) + alignment_adjustment_;
  }

  // Offset within the file at which the currently mapped region starts.
  int64_t file_offset() const { return file_offset_; }

  // TODO(b/247671531): remove this API after migrating all callers to use
  //                    GrowAndRemapIfNecessary.
  int64_t region_size() const { return mmap_size_; }

  // The size that is safe for the client to read/write. This is only valid for
  // callers that use GrowAndRemapIfNecessary.
  //
  // It is the mapped size capped by the number of bytes the (cached) file size
  // provides past file_offset_, and never negative.
  int64_t available_size() const {
    // Pin the template argument so the call does not depend on the exact type
    // that a literal-zero macro expands to on a given platform.
    const int64_t file_bytes_past_offset =
        std::max<int64_t>(0, file_size_ - file_offset_);
    return std::min(mmap_size_, file_bytes_past_offset);
  }

 private:
  explicit MemoryMappedFile(const Filesystem& filesystem,
                            std::string_view file_path, Strategy mmap_strategy,
                            int64_t max_file_size, int64_t file_size);

  // Grow the underlying file to new_file_size.
  // Note: it is possible that Write() (implemented in the file system call
  // library) grows the underlying file partially and returns error due to
  // failures, so the cached file_size_ may contain out-of-date value, but it is
  // still guaranteed that file_size_ is always smaller or equal to the actual
  // file size. In the next round of growing:
  // - If new_file_size is not greater than file_size_, then we're still
  //   confident that the actual file size is large enough and therefore skip
  //   the grow process.
  // - If new_file_size is greater than file_size_, then we will invoke the
  //   system call to sync the actual file size. At this moment, file_size_ is
  //   the actual file size and therefore we can grow the underlying file size
  //   correctly.
  //
  // Returns:
  //   OK on success
  //   FAILED_PRECONDITION_ERROR if requiring file growth and strategy_ is
  //                             READ_ONLY
  //   OUT_OF_RANGE_ERROR if new_file_size exceeds max_file_size_
  //   INTERNAL_ERROR on I/O error
  libtextclassifier3::Status GrowFileSize(int64_t new_file_size);

  // Memory-map the newly specified region within the file specified by
  // new_file_offset and new_mmap_size. Unmaps any previously mmapped region.
  // It doesn't handle the underlying file growth.
  //
  // Returns:
  //   OK on success
  //   OUT_OF_RANGE_ERROR if new_file_offset or new_mmap_size is invalid
  //   INTERNAL_ERROR on I/O error
  libtextclassifier3::Status RemapImpl(int64_t new_file_offset,
                                       int64_t new_mmap_size);

  // Swaps the contents of this with other.
  void Swap(MemoryMappedFile* other);

  // File offset at which the underlying mapping starts: file_offset_ moved
  // back by alignment_adjustment_.
  int64_t adjusted_offset() const {
    return file_offset_ - alignment_adjustment_;
  }

  // Size of the underlying mapping: mmap_size_ plus the alignment_adjustment_
  // bytes that precede file_offset_.
  int64_t adjusted_mmap_size() const {
    return alignment_adjustment_ + mmap_size_;
  }

  // Cached constructor params.
  const Filesystem* filesystem_;
  std::string file_path_;
  Strategy strategy_ = Strategy::READ_WRITE_AUTO_SYNC;

  // Raw file related fields:
  // - max_file_size_
  // - file_size_

  // Max file size for MemoryMappedFile. It should not exceed the absolute max
  // size of memory mapped file (kMaxFileSize). It is only used in
  // GrowAndRemapIfNecessary(), the new API that handles underlying file growth
  // internally and remaps intelligently.
  //
  // Note: max_file_size_ will be specified in runtime and the caller should
  // make sure its value is correct and reasonable.
  int64_t max_file_size_ = 0;

  // Cached file size to avoid calling system call too frequently. It is only
  // used in GrowAndRemapIfNecessary(), the new API that handles underlying file
  // growth internally and remaps intelligently.
  //
  // Note: it is guaranteed that file_size_ is smaller or equal to the actual
  // file size as long as the underlying file hasn't been truncated or deleted
  // externally. See GrowFileSize() for more details.
  int64_t file_size_ = 0;

  // Memory mapped related fields:
  // - mmap_result_
  // - file_offset_
  // - alignment_adjustment_
  // - mmap_size_

  // Raw pointer (or error) returned by calls to mmap().
  void* mmap_result_ = nullptr;

  // Offset within the file at which the current memory-mapped region starts.
  int64_t file_offset_ = 0;

  // Size that is currently memory-mapped.
  // Note that the mmapped size can be larger than the underlying file size. We
  // can reduce remapping by pre-mmapping a large memory and grow the file size
  // later. See GrowAndRemapIfNecessary().
  int64_t mmap_size_ = 0;

  // The difference between file_offset_ and the actual adjusted (aligned)
  // offset.
  // Since mmap requires the offset to be a multiple of system page size, we
  // have to align file_offset_ to the last multiple of system page size.
  int64_t alignment_adjustment_ = 0;

  // E.g. system_page_size = 5, RemapImpl(/*new_file_offset=*/8, mmap_size)
  //
  // File layout:               xxxxx xxxxx xxxxx xxxxx xxxxx xx
  // file_offset_:                       8
  // adjusted_offset():               5
  // region()/mutable_region():          |
  // mmap_result_:                    |
  //
  // alignment_adjustment_: file_offset_ - adjusted_offset()
  // mmap_size_:            mmap_size
  // region_size():         mmap_size_
  // available_size():      std::min(mmap_size_,
  //                                 std::max(0, file_size_ - file_offset_))
  // region_range:          [file_offset_, file_offset_ + mmap_size_)
  // adjusted_mmap_size():  alignment_adjustment_ + mmap_size_
  // adjusted_mmap_range:   [adjusted_offset(), file_offset_ + mmap_size_)
};

}  // namespace lib
}  // namespace icing

#endif  // ICING_FILE_MEMORY_MAPPED_FILE_H_