aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoel Galenson <jgalenson@google.com>2021-10-05 02:02:26 +0000
committerAutomerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>2021-10-05 02:02:26 +0000
commit83f6d7e5a5c0153fa2c01727461d62ea178b318e (patch)
tree7be84b30235be3873090ff421bd1268f8a562088
parent3591b5228a8e83955adbd3068ba9a129a0a2dd7a (diff)
parent1e1ba1afe0a4fd1c126425cc82b1e975544551a2 (diff)
downloadmemchr-android-s-v2-preview-1.tar.gz
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/memchr/+/1832701 Change-Id: I7423cf5b174e0d3517de35ae008e7de75f11fdd0
-rw-r--r--.cargo_vcs_info.json2
-rw-r--r--Android.bp2
-rw-r--r--Cargo.toml25
-rw-r--r--Cargo.toml.orig15
-rw-r--r--METADATA8
-rw-r--r--README.md24
-rwxr-xr-xscripts/make-byte-frequency-table74
-rw-r--r--src/cow.rs2
-rw-r--r--src/lib.rs2
-rw-r--r--src/memmem/genericsimd.rs6
-rw-r--r--src/memmem/prefilter/fallback.rs2
-rw-r--r--src/memmem/prefilter/mod.rs4
-rw-r--r--src/tests/memchr/testdata.rs2
13 files changed, 142 insertions, 26 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index ee47bc5..2507469 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,5 @@
{
"git": {
- "sha1": "0ba6735559ae9c1cb7e6470a97834a95cc06fbbb"
+ "sha1": "8e1da98fee06d66c13e66c330e3a3dd6ccf0e3a0"
}
}
diff --git a/Android.bp b/Android.bp
index 1ce90ac..bc7a1c2 100644
--- a/Android.bp
+++ b/Android.bp
@@ -42,6 +42,8 @@ rust_library {
name: "libmemchr",
host_supported: true,
crate_name: "memchr",
+ cargo_env_compat: true,
+ cargo_pkg_version: "2.4.1",
srcs: ["src/lib.rs"],
edition: "2018",
features: [
diff --git a/Cargo.toml b/Cargo.toml
index ed97e9f..e739019 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,26 +3,25 @@
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
#
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "memchr"
-version = "2.4.0"
+version = "2.4.1"
authors = ["Andrew Gallant <jamslam@gmail.com>", "bluss"]
exclude = ["/bench", "/.github", "/fuzz"]
description = "Safe interface to memchr."
-homepage = "https://github.com/BurntSushi/rust-memchr"
+homepage = "https://github.com/BurntSushi/memchr"
documentation = "https://docs.rs/memchr/"
readme = "README.md"
keywords = ["memchr", "char", "scan", "strchr", "string"]
license = "Unlicense/MIT"
-repository = "https://github.com/BurntSushi/rust-memchr"
+repository = "https://github.com/BurntSushi/memchr"
[profile.bench]
debug = true
@@ -36,6 +35,15 @@ debug = true
[lib]
name = "memchr"
bench = false
+[dependencies.compiler_builtins]
+version = "0.1.2"
+optional = true
+
+[dependencies.core]
+version = "1.0.0"
+optional = true
+package = "rustc-std-workspace-core"
+
[dependencies.libc]
version = "0.2.18"
optional = true
@@ -46,5 +54,6 @@ default-features = false
[features]
default = ["std"]
+rustc-dep-of-std = ["core", "compiler_builtins"]
std = []
use_std = ["std"]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 6218b17..2348487 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,11 +1,11 @@
[package]
name = "memchr"
-version = "2.4.0" #:version
+version = "2.4.1" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>", "bluss"]
description = "Safe interface to memchr."
documentation = "https://docs.rs/memchr/"
-homepage = "https://github.com/BurntSushi/rust-memchr"
-repository = "https://github.com/BurntSushi/rust-memchr"
+homepage = "https://github.com/BurntSushi/memchr"
+repository = "https://github.com/BurntSushi/memchr"
readme = "README.md"
keywords = ["memchr", "char", "scan", "strchr", "string"]
license = "Unlicense/MIT"
@@ -31,9 +31,18 @@ std = []
# then, it is alias for the 'std' feature.
use_std = ["std"]
+# Internal feature, only used when building as part of libstd, not part of the
+# stable interface of this crate.
+rustc-dep-of-std = ['core', 'compiler_builtins']
+
[dependencies]
libc = { version = "0.2.18", default-features = false, optional = true }
+# Internal feature, only used when building as part of libstd, not part of the
+# stable interface of this crate.
+core = { version = '1.0.0', optional = true, package = 'rustc-std-workspace-core' }
+compiler_builtins = { version = '0.1.2', optional = true }
+
[dev-dependencies]
quickcheck = { version = "1.0.3", default-features = false }
diff --git a/METADATA b/METADATA
index 0944612..2112cd0 100644
--- a/METADATA
+++ b/METADATA
@@ -7,13 +7,13 @@ third_party {
}
url {
type: ARCHIVE
- value: "https://static.crates.io/crates/memchr/memchr-2.4.0.crate"
+ value: "https://static.crates.io/crates/memchr/memchr-2.4.1.crate"
}
- version: "2.4.0"
+ version: "2.4.1"
license_type: NOTICE
last_upgrade_date {
year: 2021
- month: 5
- day: 19
+ month: 9
+ day: 22
}
}
diff --git a/README.md b/README.md
index 3b92e18..df75816 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@ memchr
======
This library provides heavily optimized routines for string search primitives.
-[![Build status](https://github.com/BurntSushi/rust-memchr/workflows/ci/badge.svg)](https://github.com/BurntSushi/rust-memchr/actions)
+[![Build status](https://github.com/BurntSushi/memchr/workflows/ci/badge.svg)](https://github.com/BurntSushi/memchr/actions)
[![](https://meritbadge.herokuapp.com/memchr)](https://crates.io/crates/memchr)
Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).
@@ -84,4 +84,24 @@ approaches:
* A huge suite of benchmarks that are also run as tests. Benchmarks always
confirm that the expected result occurs.
-Improvements to the testing infrastructue are very welcome.
+Improvements to the testing infrastructure are very welcome.
+
+
+### Algorithms used
+
+At time of writing, this crate's implementation of substring search actually
+has a few different algorithms to choose from depending on the situation.
+
+* For very small haystacks,
+ [Rabin-Karp](https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm)
+ is used to reduce latency. Rabin-Karp has very small overhead and can often
+ complete before other searchers have even been constructed.
+* For small needles, a variant of the
+ ["Generic SIMD"](http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd)
+ algorithm is used. Instead of using the first and last bytes, a heuristic is
+ used to select bytes based on a background distribution of byte frequencies.
+* In all other cases,
+ [Two-Way](https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm)
+ is used. If possible, a prefilter based on the "Generic SIMD" algorithm
+ linked above is used to find candidates quickly. A dynamic heuristic is used
+ to detect if the prefilter is ineffective, and if so, disables it.
diff --git a/scripts/make-byte-frequency-table b/scripts/make-byte-frequency-table
new file mode 100755
index 0000000..37eeca7
--- /dev/null
+++ b/scripts/make-byte-frequency-table
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+# This does simple normalized frequency analysis on UTF-8 encoded text. The
+# result of the analysis is translated to a ranked list, where every byte is
+# assigned a rank. This list is written to src/freqs.rs.
+#
+# Currently, the frequencies are generated from the following corpuses:
+#
+# * The CIA world fact book
+# * The source code of rustc
+# * Septuaginta
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+from collections import Counter
+import sys
+
+preamble = '''
+// NOTE: The following code was generated by "scripts/frequencies.py", do not
+// edit directly
+'''.lstrip()
+
+
+def eprint(*args, **kwargs):
+ kwargs['file'] = sys.stderr
+ print(*args, **kwargs)
+
+
+def main():
+ p = argparse.ArgumentParser()
+ p.add_argument('corpus', metavar='FILE', nargs='+')
+ args = p.parse_args()
+
+ # Get frequency counts of each byte.
+ freqs = Counter()
+ for i in range(0, 256):
+ freqs[i] = 0
+
+ eprint('reading entire corpus into memory')
+ corpus = []
+ for fpath in args.corpus:
+ corpus.append(open(fpath, 'rb').read())
+
+ eprint('computing byte frequencies')
+ for c in corpus:
+ for byte in c:
+ freqs[byte] += 1.0 / float(len(c))
+
+ eprint('writing Rust code')
+ # Get the rank of each byte. A lower rank => lower relative frequency.
+ rank = [0] * 256
+ for i, (byte, _) in enumerate(freqs.most_common()):
+ # print(byte)
+ rank[byte] = 255 - i
+
+ # Forcefully set the highest rank possible for bytes that start multi-byte
+ # UTF-8 sequences. The idea here is that a continuation byte will be more
+ # discerning in a homogenous haystack.
+ for byte in range(0xC0, 0xFF + 1):
+ rank[byte] = 255
+
+ # Now write Rust.
+ olines = ['pub const BYTE_FREQUENCIES: [u8; 256] = [']
+ for byte in range(256):
+ olines.append(' %3d, // %r' % (rank[byte], chr(byte)))
+ olines.append('];')
+
+ print(preamble)
+ print('\n'.join(olines))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/src/cow.rs b/src/cow.rs
index 95a0824..0b7d0da 100644
--- a/src/cow.rs
+++ b/src/cow.rs
@@ -5,7 +5,7 @@ use core::ops;
/// The purpose of this type is to permit usage of a "borrowed or owned
/// byte string" in a way that keeps std/no-std compatibility. That is, in
/// no-std mode, this type devolves into a simple &[u8] with no owned variant
-/// availble. We can't just use a plain Cow because Cow is not in core.
+/// available. We can't just use a plain Cow because Cow is not in core.
#[derive(Clone, Debug)]
pub struct CowBytes<'a>(Imp<'a>);
diff --git a/src/lib.rs b/src/lib.rs
index c3154b6..e0b4ce3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -160,7 +160,7 @@ standard library exposes a platform independent SIMD API.
#![cfg_attr(miri, allow(dead_code, unused_macros))]
// Supporting 8-bit (or others) would be fine. If you need it, please submit a
-// bug report at https://github.com/BurntSushi/rust-memchr
+// bug report at https://github.com/BurntSushi/memchr
#[cfg(not(any(
target_pointer_width = "16",
target_pointer_width = "32",
diff --git a/src/memmem/genericsimd.rs b/src/memmem/genericsimd.rs
index 2417f91..28bfdab 100644
--- a/src/memmem/genericsimd.rs
+++ b/src/memmem/genericsimd.rs
@@ -195,7 +195,9 @@ pub(crate) unsafe fn fwd_find<V: Vector>(
}
/// Search for an occurrence of two rare bytes from the needle in the chunk
-/// pointed to by ptr, with the end of the haystack pointed to by end_ptr.
+/// pointed to by ptr, with the end of the haystack pointed to by end_ptr. When
+/// an occurrence is found, memcmp is run to check if a match occurs at the
+/// corresponding position.
///
/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
/// bytes repeated in each 8-bit lane, respectively.
@@ -210,7 +212,7 @@ pub(crate) unsafe fn fwd_find<V: Vector>(
///
/// It must be safe to do an unaligned read of size(V) bytes starting at both
/// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned
-/// loads on ptr up to end_ptr.
+/// loads on ptr up to (end_ptr - needle.len()).
#[inline(always)]
unsafe fn fwd_find_in_chunk<V: Vector>(
fwd: &Forward,
diff --git a/src/memmem/prefilter/fallback.rs b/src/memmem/prefilter/fallback.rs
index 8ae1f4e..ae1bbcc 100644
--- a/src/memmem/prefilter/fallback.rs
+++ b/src/memmem/prefilter/fallback.rs
@@ -20,7 +20,7 @@ make use of the background frequency distribution of bytes though.)
This fallback implementation was originally formulated in regex many moons ago:
https://github.com/rust-lang/regex/blob/3db8722d0b204a85380fe2a65e13d7065d7dd968/src/literal/imp.rs#L370-L501
-Prior to that, I'm not aware of anyone using this technique in any prominant
+Prior to that, I'm not aware of anyone using this technique in any prominent
substring search implementation. Although, I'm sure folks have had this same
insight long before me.
diff --git a/src/memmem/prefilter/mod.rs b/src/memmem/prefilter/mod.rs
index 2481cfe..6461f33 100644
--- a/src/memmem/prefilter/mod.rs
+++ b/src/memmem/prefilter/mod.rs
@@ -18,7 +18,7 @@ const MAX_FALLBACK_RANK: usize = 250;
/// instead of needing to pass around all three as distinct function
/// parameters.
pub(crate) struct Pre<'a> {
- /// State that tracks the effectivess of a prefilter.
+ /// State that tracks the effectiveness of a prefilter.
pub(crate) state: &'a mut PrefilterState,
/// The actual prefilter function.
pub(crate) prefn: PrefilterFn,
@@ -146,7 +146,7 @@ impl core::fmt::Debug for PrefilterFn {
/// The use of prefilters in this implementation does use a heuristic to detect
/// when a prefilter might not be carrying its weight, and will dynamically
/// disable its use. Nevertheless, this configuration option gives callers
-/// the ability to disable pefilters if you have knowledge that they won't be
+/// the ability to disable prefilters if you have knowledge that they won't be
/// useful.
#[derive(Clone, Copy, Debug)]
#[non_exhaustive]
diff --git a/src/tests/memchr/testdata.rs b/src/tests/memchr/testdata.rs
index acb1fd8..6dda524 100644
--- a/src/tests/memchr/testdata.rs
+++ b/src/tests/memchr/testdata.rs
@@ -160,7 +160,7 @@ impl MemchrTest {
//
// You might think this would cause most needles to not be found, but
// we actually expand our tests to include corpus sizes all the way up
- // to >500 bytes, so we should exericse most branches.
+ // to >500 bytes, so we should exercise most branches.
for align in 0..130 {
let corpus = self.corpus(align);
assert_eq!(