diff options
author | Joel Galenson <jgalenson@google.com> | 2021-10-05 02:02:26 +0000 |
---|---|---|
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2021-10-05 02:02:26 +0000 |
commit | 83f6d7e5a5c0153fa2c01727461d62ea178b318e (patch) | |
tree | 7be84b30235be3873090ff421bd1268f8a562088 | |
parent | 3591b5228a8e83955adbd3068ba9a129a0a2dd7a (diff) | |
parent | 1e1ba1afe0a4fd1c126425cc82b1e975544551a2 (diff) | |
download | memchr-android-s-v2-preview-1.tar.gz |
Upgrade rust/crates/memchr to 2.4.1 am: 1e1ba1afe0android-s-v2-preview-2android-s-v2-preview-1android-s-v2-beta-2android-s-v2-preview-1
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/memchr/+/1832701
Change-Id: I7423cf5b174e0d3517de35ae008e7de75f11fdd0
-rw-r--r-- | .cargo_vcs_info.json | 2 | ||||
-rw-r--r-- | Android.bp | 2 | ||||
-rw-r--r-- | Cargo.toml | 25 | ||||
-rw-r--r-- | Cargo.toml.orig | 15 | ||||
-rw-r--r-- | METADATA | 8 | ||||
-rw-r--r-- | README.md | 24 | ||||
-rwxr-xr-x | scripts/make-byte-frequency-table | 74 | ||||
-rw-r--r-- | src/cow.rs | 2 | ||||
-rw-r--r-- | src/lib.rs | 2 | ||||
-rw-r--r-- | src/memmem/genericsimd.rs | 6 | ||||
-rw-r--r-- | src/memmem/prefilter/fallback.rs | 2 | ||||
-rw-r--r-- | src/memmem/prefilter/mod.rs | 4 | ||||
-rw-r--r-- | src/tests/memchr/testdata.rs | 2 |
13 files changed, 142 insertions, 26 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index ee47bc5..2507469 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,5 +1,5 @@ { "git": { - "sha1": "0ba6735559ae9c1cb7e6470a97834a95cc06fbbb" + "sha1": "8e1da98fee06d66c13e66c330e3a3dd6ccf0e3a0" } } @@ -42,6 +42,8 @@ rust_library { name: "libmemchr", host_supported: true, crate_name: "memchr", + cargo_env_compat: true, + cargo_pkg_version: "2.4.1", srcs: ["src/lib.rs"], edition: "2018", features: [ @@ -3,26 +3,25 @@ # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies -# to registry (e.g., crates.io) dependencies +# to registry (e.g., crates.io) dependencies. # -# If you believe there's an error in this file please file an -# issue against the rust-lang/cargo repository. If you're -# editing this file be aware that the upstream Cargo.toml -# will likely look very different (and much more reasonable) +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. [package] edition = "2018" name = "memchr" -version = "2.4.0" +version = "2.4.1" authors = ["Andrew Gallant <jamslam@gmail.com>", "bluss"] exclude = ["/bench", "/.github", "/fuzz"] description = "Safe interface to memchr." -homepage = "https://github.com/BurntSushi/rust-memchr" +homepage = "https://github.com/BurntSushi/memchr" documentation = "https://docs.rs/memchr/" readme = "README.md" keywords = ["memchr", "char", "scan", "strchr", "string"] license = "Unlicense/MIT" -repository = "https://github.com/BurntSushi/rust-memchr" +repository = "https://github.com/BurntSushi/memchr" [profile.bench] debug = true @@ -36,6 +35,15 @@ debug = true [lib] name = "memchr" bench = false +[dependencies.compiler_builtins] +version = "0.1.2" +optional = true + +[dependencies.core] +version = "1.0.0" +optional = true +package = "rustc-std-workspace-core" + [dependencies.libc] version = "0.2.18" optional = true @@ -46,5 +54,6 @@ default-features = false [features] default = ["std"] +rustc-dep-of-std = ["core", "compiler_builtins"] std = [] use_std = ["std"] diff --git a/Cargo.toml.orig b/Cargo.toml.orig index 6218b17..2348487 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,11 +1,11 @@ [package] name = "memchr" -version = "2.4.0" #:version +version = "2.4.1" #:version authors = ["Andrew Gallant <jamslam@gmail.com>", "bluss"] description = "Safe interface to memchr." documentation = "https://docs.rs/memchr/" -homepage = "https://github.com/BurntSushi/rust-memchr" -repository = "https://github.com/BurntSushi/rust-memchr" +homepage = "https://github.com/BurntSushi/memchr" +repository = "https://github.com/BurntSushi/memchr" readme = "README.md" keywords = ["memchr", "char", "scan", "strchr", "string"] license = "Unlicense/MIT" @@ -31,9 +31,18 @@ std = [] # then, it is alias for the 'std' feature. use_std = ["std"] +# Internal feature, only used when building as part of libstd, not part of the +# stable interface of this crate. +rustc-dep-of-std = ['core', 'compiler_builtins'] + [dependencies] libc = { version = "0.2.18", default-features = false, optional = true } +# Internal feature, only used when building as part of libstd, not part of the +# stable interface of this crate. +core = { version = '1.0.0', optional = true, package = 'rustc-std-workspace-core' } +compiler_builtins = { version = '0.1.2', optional = true } + [dev-dependencies] quickcheck = { version = "1.0.3", default-features = false } @@ -7,13 +7,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/memchr/memchr-2.4.0.crate" + value: "https://static.crates.io/crates/memchr/memchr-2.4.1.crate" } - version: "2.4.0" + version: "2.4.1" license_type: NOTICE last_upgrade_date { year: 2021 - month: 5 - day: 19 + month: 9 + day: 22 } } @@ -2,7 +2,7 @@ memchr ====== This library provides heavily optimized routines for string search primitives. -[![Build status](https://github.com/BurntSushi/rust-memchr/workflows/ci/badge.svg)](https://github.com/BurntSushi/rust-memchr/actions) +[![Build status](https://github.com/BurntSushi/memchr/workflows/ci/badge.svg)](https://github.com/BurntSushi/memchr/actions) [![](https://meritbadge.herokuapp.com/memchr)](https://crates.io/crates/memchr) Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). @@ -84,4 +84,24 @@ approaches: * A huge suite of benchmarks that are also run as tests. Benchmarks always confirm that the expected result occurs. -Improvements to the testing infrastructue are very welcome. +Improvements to the testing infrastructure are very welcome. + + +### Algorithms used + +At time of writing, this crate's implementation of substring search actually +has a few different algorithms to choose from depending on the situation. + +* For very small haystacks, + [Rabin-Karp](https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm) + is used to reduce latency. Rabin-Karp has very small overhead and can often + complete before other searchers have even been constructed. +* For small needles, a variant of the + ["Generic SIMD"](http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd) + algorithm is used. Instead of using the first and last bytes, a heuristic is + used to select bytes based on a background distribution of byte frequencies. +* In all other cases, + [Two-Way](https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm) + is used. If possible, a prefilter based on the "Generic SIMD" algorithm + linked above is used to find candidates quickly. A dynamic heuristic is used + to detect if the prefilter is ineffective, and if so, disables it. diff --git a/scripts/make-byte-frequency-table b/scripts/make-byte-frequency-table new file mode 100755 index 0000000..37eeca7 --- /dev/null +++ b/scripts/make-byte-frequency-table @@ -0,0 +1,74 @@ +#!/usr/bin/env python + +# This does simple normalized frequency analysis on UTF-8 encoded text. The +# result of the analysis is translated to a ranked list, where every byte is +# assigned a rank. This list is written to src/freqs.rs. +# +# Currently, the frequencies are generated from the following corpuses: +# +# * The CIA world fact book +# * The source code of rustc +# * Septuaginta + +from __future__ import absolute_import, division, print_function + +import argparse +from collections import Counter +import sys + +preamble = ''' +// NOTE: The following code was generated by "scripts/frequencies.py", do not +// edit directly +'''.lstrip() + + +def eprint(*args, **kwargs): + kwargs['file'] = sys.stderr + print(*args, **kwargs) + + +def main(): + p = argparse.ArgumentParser() + p.add_argument('corpus', metavar='FILE', nargs='+') + args = p.parse_args() + + # Get frequency counts of each byte. + freqs = Counter() + for i in range(0, 256): + freqs[i] = 0 + + eprint('reading entire corpus into memory') + corpus = [] + for fpath in args.corpus: + corpus.append(open(fpath, 'rb').read()) + + eprint('computing byte frequencies') + for c in corpus: + for byte in c: + freqs[byte] += 1.0 / float(len(c)) + + eprint('writing Rust code') + # Get the rank of each byte. A lower rank => lower relative frequency. + rank = [0] * 256 + for i, (byte, _) in enumerate(freqs.most_common()): + # print(byte) + rank[byte] = 255 - i + + # Forcefully set the highest rank possible for bytes that start multi-byte + # UTF-8 sequences. The idea here is that a continuation byte will be more + # discerning in a homogenous haystack. + for byte in range(0xC0, 0xFF + 1): + rank[byte] = 255 + + # Now write Rust. + olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = ['] + for byte in range(256): + olines.append(' %3d, // %r' % (rank[byte], chr(byte))) + olines.append('];') + + print(preamble) + print('\n'.join(olines)) + + +if __name__ == '__main__': + main() @@ -5,7 +5,7 @@ use core::ops; /// The purpose of this type is to permit usage of a "borrowed or owned /// byte string" in a way that keeps std/no-std compatibility. That is, in /// no-std mode, this type devolves into a simple &[u8] with no owned variant -/// availble. We can't just use a plain Cow because Cow is not in core. +/// available. We can't just use a plain Cow because Cow is not in core. #[derive(Clone, Debug)] pub struct CowBytes<'a>(Imp<'a>); @@ -160,7 +160,7 @@ standard library exposes a platform independent SIMD API. #![cfg_attr(miri, allow(dead_code, unused_macros))] // Supporting 8-bit (or others) would be fine. If you need it, please submit a -// bug report at https://github.com/BurntSushi/rust-memchr +// bug report at https://github.com/BurntSushi/memchr #[cfg(not(any( target_pointer_width = "16", target_pointer_width = "32", diff --git a/src/memmem/genericsimd.rs b/src/memmem/genericsimd.rs index 2417f91..28bfdab 100644 --- a/src/memmem/genericsimd.rs +++ b/src/memmem/genericsimd.rs @@ -195,7 +195,9 @@ pub(crate) unsafe fn fwd_find<V: Vector>( } /// Search for an occurrence of two rare bytes from the needle in the chunk -/// pointed to by ptr, with the end of the haystack pointed to by end_ptr. +/// pointed to by ptr, with the end of the haystack pointed to by end_ptr. When +/// an occurrence is found, memcmp is run to check if a match occurs at the +/// corresponding position. /// /// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2 /// bytes repeated in each 8-bit lane, respectively. @@ -210,7 +212,7 @@ pub(crate) unsafe fn fwd_find<V: Vector>( /// /// It must be safe to do an unaligned read of size(V) bytes starting at both /// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned -/// loads on ptr up to end_ptr. +/// loads on ptr up to (end_ptr - needle.len()). #[inline(always)] unsafe fn fwd_find_in_chunk<V: Vector>( fwd: &Forward, diff --git a/src/memmem/prefilter/fallback.rs b/src/memmem/prefilter/fallback.rs index 8ae1f4e..ae1bbcc 100644 --- a/src/memmem/prefilter/fallback.rs +++ b/src/memmem/prefilter/fallback.rs @@ -20,7 +20,7 @@ make use of the background frequency distribution of bytes though.) This fallback implementation was originally formulated in regex many moons ago: https://github.com/rust-lang/regex/blob/3db8722d0b204a85380fe2a65e13d7065d7dd968/src/literal/imp.rs#L370-L501 -Prior to that, I'm not aware of anyone using this technique in any prominant +Prior to that, I'm not aware of anyone using this technique in any prominent substring search implementation. Although, I'm sure folks have had this same insight long before me. diff --git a/src/memmem/prefilter/mod.rs b/src/memmem/prefilter/mod.rs index 2481cfe..6461f33 100644 --- a/src/memmem/prefilter/mod.rs +++ b/src/memmem/prefilter/mod.rs @@ -18,7 +18,7 @@ const MAX_FALLBACK_RANK: usize = 250; /// instead of needing to pass around all three as distinct function /// parameters. pub(crate) struct Pre<'a> { - /// State that tracks the effectivess of a prefilter. + /// State that tracks the effectiveness of a prefilter. pub(crate) state: &'a mut PrefilterState, /// The actual prefilter function. pub(crate) prefn: PrefilterFn, @@ -146,7 +146,7 @@ impl core::fmt::Debug for PrefilterFn { /// The use of prefilters in this implementation does use a heuristic to detect /// when a prefilter might not be carrying its weight, and will dynamically /// disable its use. Nevertheless, this configuration option gives callers -/// the ability to disable pefilters if you have knowledge that they won't be +/// the ability to disable prefilters if you have knowledge that they won't be /// useful. #[derive(Clone, Copy, Debug)] #[non_exhaustive] diff --git a/src/tests/memchr/testdata.rs b/src/tests/memchr/testdata.rs index acb1fd8..6dda524 100644 --- a/src/tests/memchr/testdata.rs +++ b/src/tests/memchr/testdata.rs @@ -160,7 +160,7 @@ impl MemchrTest { // // You might think this would cause most needles to not be found, but // we actually expand our tests to include corpus sizes all the way up - // to >500 bytes, so we should exericse most branches. + // to >500 bytes, so we should exercise most branches. for align in 0..130 { let corpus = self.corpus(align); assert_eq!( |