diff options
author | Charisee <chiw@google.com> | 2022-07-22 20:01:06 +0000 |
---|---|---|
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2022-07-22 20:01:06 +0000 |
commit | 5336c3bdcab3e2a70a677d5deca50c3b2899dd3c (patch) | |
tree | 7762bd080c3057ee3849952e47dc5ea5cd762863 | |
parent | 6210f1f71eb4646703a62f35bd7bfc0c17f37208 (diff) | |
parent | df330aa3af20e55463488d359db22b4cf8188b82 (diff) | |
download | os_str_bytes-5336c3bdcab3e2a70a677d5deca50c3b2899dd3c.tar.gz |
Import source for the os_str_bytes crate am: 110dd358c8 am: bb3cea62ce am: df330aa3af
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/os_str_bytes/+/2154886
Change-Id: Ie39343b9b2e65447b189e0daea11d995dc804790
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r-- | COPYRIGHT | 5 | ||||
-rw-r--r-- | Cargo.toml | 33 | ||||
l--------- | LICENSE | 1 | ||||
-rw-r--r-- | LICENSE-APACHE | 201 | ||||
-rw-r--r-- | LICENSE-MIT | 21 | ||||
-rw-r--r-- | METADATA | 13 | ||||
-rw-r--r-- | MODULE_LICENSE_MIT | 0 | ||||
-rw-r--r-- | OWNERS | 1 | ||||
-rw-r--r-- | README.md | 98 | ||||
-rw-r--r-- | rustfmt.toml | 1 | ||||
-rw-r--r-- | src/common/mod.rs | 41 | ||||
-rw-r--r-- | src/common/raw.rs | 38 | ||||
-rw-r--r-- | src/iter.rs | 113 | ||||
-rw-r--r-- | src/lib.rs | 432 | ||||
-rw-r--r-- | src/pattern.rs | 71 | ||||
-rw-r--r-- | src/raw_str.rs | 1156 | ||||
-rw-r--r-- | src/util.rs | 10 | ||||
-rw-r--r-- | src/wasm32/mod.rs | 56 | ||||
-rw-r--r-- | src/wasm32/raw.rs | 39 | ||||
-rw-r--r-- | src/windows/mod.rs | 152 | ||||
-rw-r--r-- | src/windows/raw.rs | 42 | ||||
-rw-r--r-- | src/windows/wtf8/code_points.rs | 117 | ||||
-rw-r--r-- | src/windows/wtf8/convert.rs | 166 | ||||
-rw-r--r-- | src/windows/wtf8/mod.rs | 18 | ||||
-rw-r--r-- | src/windows/wtf8/string.rs | 63 | ||||
-rw-r--r-- | tests/common.rs | 94 | ||||
-rw-r--r-- | tests/debug.rs | 34 | ||||
-rw-r--r-- | tests/edge_cases.rs | 7 | ||||
-rw-r--r-- | tests/index.rs | 86 | ||||
-rw-r--r-- | tests/integration.rs | 75 | ||||
-rw-r--r-- | tests/random.rs | 126 | ||||
-rw-r--r-- | tests/raw.rs | 108 |
32 files changed, 3418 insertions, 0 deletions
diff --git a/COPYRIGHT b/COPYRIGHT new file mode 100644 index 0000000..65dfcfc --- /dev/null +++ b/COPYRIGHT @@ -0,0 +1,5 @@ +Copyright (c) 2019 dylni (https://github.com/dylni) + +Licensed under the Apache License, Version 2.0 <LICENSE-APACHE> or the MIT +license <LICENSE-MIT>, at your option. All files in this project may not be +copied, modified, or distributed except according to those terms. diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..1f7b398 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "os_str_bytes" +version = "6.1.0" +authors = ["dylni"] +edition = "2021" +rust-version = "1.57.0" +description = """ +Utilities for converting between byte sequences and platform-native strings +""" +readme = "README.md" +repository = "https://github.com/dylni/os_str_bytes" +license = "MIT OR Apache-2.0" +keywords = ["bytes", "osstr", "osstring", "path", "windows"] +categories = ["command-line-interface", "development-tools::ffi", "encoding", "os", "rust-patterns"] +exclude = [".*", "/rustfmt.toml", "/tests"] + +[package.metadata.docs.rs] +all-features = true +rustc-args = ["--cfg", "os_str_bytes_docs_rs"] +rustdoc-args = ["--cfg", "os_str_bytes_docs_rs"] + +[dependencies] +memchr = { version = "2.4", optional = true } +print_bytes = { version = "0.6", optional = true } +uniquote = { version = "3.0", optional = true } + +[dev-dependencies] +getrandom = "0.2" + +[features] +default = ["memchr", "raw_os_str"] + +raw_os_str = [] @@ -0,0 +1 @@ +LICENSE-MIT
\ No newline at end of file diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..fd9dc88 --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 dylni (https://github.com/dylni) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/METADATA b/METADATA new file mode 100644 index 0000000..e2c6635 --- /dev/null +++ b/METADATA @@ -0,0 +1,13 @@ +name: "os_str_bytes" +description: + "This crate allows interacting with the data stored by OsStr and OsString, without resorting to panics or corruption for invalid UTF-8. Thus, methods can be used that are already defined on [u8] and Vec<u8>." + +third_party { + url { + type: GIT + value: "https://github.com/dylni/os_str_bytes" + } + version: "6.1.0" + last_upgrade_date { year: 2022 month: 6 day: 29 } + license_type: NOTICE +} diff --git a/MODULE_LICENSE_MIT b/MODULE_LICENSE_MIT new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/MODULE_LICENSE_MIT @@ -0,0 +1 @@ +include platform/prebuilts/rust:master:/OWNERS diff --git a/README.md b/README.md new file mode 100644 index 0000000..d0fe83b --- /dev/null +++ b/README.md @@ -0,0 +1,98 @@ +# OsStr Bytes + +This crate allows interacting with the data stored by [`OsStr`] and +[`OsString`], without resorting to panics or corruption for invalid UTF-8. +Thus, methods can be used that are already defined on [`[u8]`][slice] and +[`Vec<u8>`]. + +Typically, the only way to losslessly construct [`OsStr`] or [`OsString`] from +a byte sequence is to use `OsStr::new(str::from_utf8(bytes)?)`, which requires +the bytes to be valid in UTF-8. However, since this crate makes conversions +directly between the platform encoding and raw bytes, even some strings invalid +in UTF-8 can be converted. 
+ +[![GitHub Build Status](https://github.com/dylni/os_str_bytes/workflows/build/badge.svg?branch=master)](https://github.com/dylni/os_str_bytes/actions?query=branch%3Amaster) + +## Usage + +Add the following lines to your "Cargo.toml" file: + +```toml +[dependencies] +os_str_bytes = "6.1" +``` + +See the [documentation] for available functionality and examples. + +## Rust version support + +The minimum supported Rust toolchain version depends on the platform: + +<table> + <tr> + <th>Target</th> + <th>Target Triple</th> + <th>Minimum Version</th> + </tr> + <tr> + <td>Fortanix</td> + <td><code>*-fortanix-*-sgx</code></td> + <td>nightly (<a href="https://doc.rust-lang.org/unstable-book/library-features/sgx-platform.html"><code>sgx_platform</code></a>)</td> + </tr> + <tr> + <td>HermitCore</td> + <td><code>*-*-hermit</code></td> + <td>1.57.0</td> + </tr> + <tr> + <td>SOLID</td> + <td><code>*-*-solid_asp3(-*)</code></td> + <td>1.57.0</td> + </tr> + <tr> + <td>Unix</td> + <td>Unix</td> + <td>1.57.0</td> + </tr> + <tr> + <td>WASI</td> + <td><code>*-wasi</code></td> + <td>1.57.0</td> + </tr> + <tr> + <td>WebAssembly</td> + <td><code>wasm32-*-unknown</code></td> + <td>1.57.0</td> + </tr> + <tr> + <td>Windows</td> + <td><code>*-*-windows-*</code></td> + <td>1.57.0</td> + </tr> +</table> + +Minor version updates may increase these version requirements. However, the +previous two Rust releases will always be supported. If the minimum Rust +version must not be increased, use a tilde requirement to prevent updating this +crate's minor version: + +```toml +[dependencies] +os_str_bytes = "~6.1" +``` + +## License + +Licensing terms are specified in [COPYRIGHT]. + +Unless you explicitly state otherwise, any contribution submitted for inclusion +in this crate, as defined in [LICENSE-APACHE], shall be licensed according to +[COPYRIGHT], without any additional terms or conditions. 
+ +[COPYRIGHT]: https://github.com/dylni/os_str_bytes/blob/master/COPYRIGHT +[documentation]: https://docs.rs/os_str_bytes +[LICENSE-APACHE]: https://github.com/dylni/os_str_bytes/blob/master/LICENSE-APACHE +[slice]: https://doc.rust-lang.org/std/primitive.slice.html +[`OsStr`]: https://doc.rust-lang.org/std/ffi/struct.OsStr.html +[`OsString`]: https://doc.rust-lang.org/std/ffi/struct.OsString.html +[`Vec<u8>`]: https://doc.rust-lang.org/std/vec/struct.Vec.html diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..a1ffd27 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1 @@ +max_width = 79 diff --git a/src/common/mod.rs b/src/common/mod.rs new file mode 100644 index 0000000..dd49890 --- /dev/null +++ b/src/common/mod.rs @@ -0,0 +1,41 @@ +use std::borrow::Cow; +use std::convert::Infallible; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::result; + +#[cfg(all(target_vendor = "fortanix", target_env = "sgx"))] +use std::os::fortanix_sgx as os; +#[cfg(target_os = "solid_asp3")] +use std::os::solid as os; +#[cfg(any(target_os = "hermit", unix))] +use std::os::unix as os; +#[cfg(target_os = "wasi")] +use std::os::wasi as os; + +use os::ffi::OsStrExt; +use os::ffi::OsStringExt; + +if_raw_str! 
{ + pub(super) mod raw; +} + +pub(super) type EncodingError = Infallible; + +type Result<T> = result::Result<T, EncodingError>; + +pub(super) fn os_str_from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> { + Ok(Cow::Borrowed(OsStrExt::from_bytes(string))) +} + +pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> { + Cow::Borrowed(OsStrExt::as_bytes(os_string)) +} + +pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> { + Ok(OsStringExt::from_vec(string)) +} + +pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> { + OsStringExt::into_vec(os_string) +} diff --git a/src/common/raw.rs b/src/common/raw.rs new file mode 100644 index 0000000..070a62c --- /dev/null +++ b/src/common/raw.rs @@ -0,0 +1,38 @@ +use std::fmt; +use std::fmt::Formatter; + +#[inline(always)] +pub(crate) const fn is_continuation(_: u8) -> bool { + false +} + +#[inline(always)] +pub(crate) fn decode_code_point(_: &[u8]) -> u32 { + unreachable!(); +} + +pub(crate) fn ends_with(string: &[u8], suffix: &[u8]) -> bool { + string.ends_with(suffix) +} + +pub(crate) fn starts_with(string: &[u8], prefix: &[u8]) -> bool { + string.starts_with(prefix) +} + +pub(crate) fn debug(string: &[u8], f: &mut Formatter<'_>) -> fmt::Result { + for byte in string { + write!(f, "\\x{:02X}", byte)?; + } + Ok(()) +} + +#[cfg(feature = "uniquote")] +pub(crate) mod uniquote { + use uniquote::Formatter; + use uniquote::Quote; + use uniquote::Result; + + pub(crate) fn escape(string: &[u8], f: &mut Formatter<'_>) -> Result { + string.escape(f) + } +} diff --git a/src/iter.rs b/src/iter.rs new file mode 100644 index 0000000..5cb7299 --- /dev/null +++ b/src/iter.rs @@ -0,0 +1,113 @@ +//! Iterators provided by this crate. 
+ +#![cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))] + +use std::fmt; +use std::fmt::Debug; +use std::fmt::Formatter; +use std::iter::FusedIterator; +use std::str; + +use super::pattern::Encoded; +use super::Pattern; +use super::RawOsStr; + +// [memchr::memmem::FindIter] is not currently used, since this struct would +// become self-referential. Additionally, that iterator does not implement +// [DoubleEndedIterator], and its implementation would likely require +// significant changes to implement that trait. +/// The iterator returned by [`RawOsStr::split`]. +pub struct Split<'a, P> +where + P: Pattern, +{ + string: Option<&'a RawOsStr>, + pat: P::__Encoded, +} + +impl<'a, P> Split<'a, P> +where + P: Pattern, +{ + pub(super) fn new(string: &'a RawOsStr, pat: P) -> Self { + let pat = pat.__encode(); + assert!( + !pat.__get().is_empty(), + "cannot split using an empty pattern", + ); + Self { + string: Some(string), + pat, + } + } +} + +macro_rules! impl_next { + ( $self:ident , $split_method:ident , $swap_fn:expr ) => {{ + $self + .string? 
+ .$split_method(&$self.pat) + .map(|substrings| { + let (substring, string) = $swap_fn(substrings); + $self.string = Some(string); + substring + }) + .or_else(|| $self.string.take()) + }}; +} + +impl<P> DoubleEndedIterator for Split<'_, P> +where + P: Pattern, +{ + fn next_back(&mut self) -> Option<Self::Item> { + impl_next!(self, rsplit_once_raw, |(prefix, suffix)| (suffix, prefix)) + } +} + +impl<'a, P> Iterator for Split<'a, P> +where + P: Pattern, +{ + type Item = &'a RawOsStr; + + #[inline] + fn last(mut self) -> Option<Self::Item> { + self.next_back() + } + + fn next(&mut self) -> Option<Self::Item> { + impl_next!(self, split_once_raw, |x| x) + } +} + +impl<P> Clone for Split<'_, P> +where + P: Pattern, +{ + #[inline] + fn clone(&self) -> Self { + Self { + string: self.string, + pat: self.pat.clone(), + } + } +} + +impl<P> Debug for Split<'_, P> +where + P: Pattern, +{ + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("Split") + .field("string", &self.string) + .field( + "pat", + &str::from_utf8(self.pat.__get()).expect("invalid pattern"), + ) + .finish() + } +} + +impl<P> FusedIterator for Split<'_, P> where P: Pattern {} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..f73c2d5 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,432 @@ +//! This crate allows interacting with the data stored by [`OsStr`] and +//! [`OsString`], without resorting to panics or corruption for invalid UTF-8. +//! Thus, methods can be used that are already defined on [`[u8]`][slice] and +//! [`Vec<u8>`]. +//! +//! Typically, the only way to losslessly construct [`OsStr`] or [`OsString`] +//! from a byte sequence is to use `OsStr::new(str::from_utf8(bytes)?)`, which +//! requires the bytes to be valid in UTF-8. However, since this crate makes +//! conversions directly between the platform encoding and raw bytes, even some +//! strings invalid in UTF-8 can be converted. +//! +//! 
In most cases, [`RawOsStr`] and [`RawOsString`] should be used. +//! [`OsStrBytes`] and [`OsStringBytes`] provide lower-level APIs that are +//! easier to misuse. +//! +//! # Encoding +//! +//! The encoding of bytes returned or accepted by methods of this crate is +//! intentionally left unspecified. It may vary for different platforms, so +//! defining it would run contrary to the goal of generic string handling. +//! However, the following invariants will always be upheld: +//! +//! - The encoding will be compatible with UTF-8. In particular, splitting an +//! encoded byte sequence by a UTF-8–encoded character always produces other +//! valid byte sequences. They can be re-encoded without error using +//! [`OsStrBytes::from_raw_bytes`] and similar methods. +//! +//! - All characters valid in platform strings are representable. [`OsStr`] and +//! [`OsString`] can always be losslessly reconstructed from extracted bytes. +//! +//! Note that the chosen encoding may not match how Rust stores these strings +//! internally, which is undocumented. For instance, the result of calling +//! [`OsStr::len`] will not necessarily match the number of bytes this crate +//! uses to represent the same string. +//! +//! Additionally, concatenation may yield unexpected results without a UTF-8 +//! separator. If two platform strings need to be concatenated, the only safe +//! way to do so is using [`OsString::push`]. This limitation also makes it +//! undesirable to use the bytes in interchange. +//! +//! Since this encoding can change between versions and platforms, it should +//! not be used for storage. The standard library provides implementations of +//! [`OsStrExt`] and [`OsStringExt`] for various platforms, which should be +//! preferred for that use case. +//! +//! # User Input +//! +//! Traits in this crate should ideally not be used to convert byte sequences +//! that did not originate from [`OsStr`] or a related struct. The encoding +//! 
used by this crate is an implementation detail, so it does not make sense +//! to expose it to users. +//! +//! Crate [bstr] offers some useful alternative methods, such as +//! [`ByteSlice::to_os_str`] and [`ByteVec::into_os_string`], that are meant +//! for user input. But, they reject some byte sequences used to represent +//! valid platform strings, which would be undesirable for reliable path +//! handling. They are best used only when accepting unknown input. +//! +//! This crate is meant to help when you already have an instance of [`OsStr`] +//! and need to modify the data in a lossless way. +//! +//! # Features +//! +//! These features are optional and can be enabled or disabled in a +//! "Cargo.toml" file. +//! +//! ### Default Features +//! +//! - **memchr** - +//! Changes the implementation to use crate [memchr] for better performance. +//! This feature is useless when "raw\_os\_str" is disabled. +//! +//! For more information, see [`RawOsStr`][memchr complexity]. +//! +//! - **raw\_os\_str** - +//! Enables use of [`RawOsStr`] and [`RawOsString`]. +//! +//! ### Optional Features +//! +//! - **print\_bytes** - +//! Provides implementations of [`print_bytes::ToBytes`] for [`RawOsStr`] and +//! [`RawOsString`]. +//! +//! - **uniquote** - +//! Provides implementations of [`uniquote::Quote`] for [`RawOsStr`] and +//! [`RawOsString`]. +//! +//! # Implementation +//! +//! Some methods return [`Cow`] to account for platform differences. However, +//! no guarantee is made that the same variant of that enum will always be +//! returned for the same platform. Whichever can be constructed most +//! efficiently will be returned. +//! +//! All traits are [sealed], meaning that they can only be implemented by this +//! crate. Otherwise, backward compatibility would be more difficult to +//! maintain for new features. +//! +//! # Complexity +//! +//! The time complexities of trait methods will vary based on what +//! functionality is available for the platform. 
At worst, they will all be +//! linear, but some can take constant time. For example, +//! [`OsStringBytes::from_raw_vec`] might be able to reuse the allocation for +//! its argument. +//! +//! # Examples +//! +//! ``` +//! # #[cfg(any())] +//! use std::env; +//! use std::fs; +//! # use std::io; +//! +//! use os_str_bytes::OsStrBytes; +//! +//! # mod env { +//! # use std::env; +//! # use std::ffi::OsString; +//! # +//! # pub fn args_os() -> impl Iterator<Item = OsString> { +//! # let mut file = env::temp_dir(); +//! # file.push("os_str_bytes\u{E9}.txt"); +//! # return vec![OsString::new(), file.into_os_string()].into_iter(); +//! # } +//! # } +//! # +//! for file in env::args_os().skip(1) { +//! if file.to_raw_bytes().first() != Some(&b'-') { +//! let string = "Hello, world!"; +//! fs::write(&file, string)?; +//! assert_eq!(string, fs::read_to_string(file)?); +//! } +//! } +//! # +//! # Ok::<_, io::Error>(()) +//! ``` +//! +//! [bstr]: https://crates.io/crates/bstr +//! [`ByteSlice::to_os_str`]: https://docs.rs/bstr/0.2.12/bstr/trait.ByteSlice.html#method.to_os_str +//! [`ByteVec::into_os_string`]: https://docs.rs/bstr/0.2.12/bstr/trait.ByteVec.html#method.into_os_string +//! [memchr complexity]: RawOsStr#complexity +//! [memchr]: https://crates.io/crates/memchr +//! [`OsStrExt`]: ::std::os::unix::ffi::OsStrExt +//! [`OsStringExt`]: ::std::os::unix::ffi::OsStringExt +//! [sealed]: https://rust-lang.github.io/api-guidelines/future-proofing.html#c-sealed +//! [print\_bytes]: https://crates.io/crates/print_bytes + +// Only require a nightly compiler when building documentation for docs.rs. +// This is a private option that should not be used. +// https://github.com/rust-lang/docs.rs/issues/147#issuecomment-389544407 +// https://github.com/dylni/os_str_bytes/issues/2 +#![cfg_attr(os_str_bytes_docs_rs, feature(doc_cfg))] +// Nightly is also currently required for the SGX platform. 
+#![cfg_attr( + all(target_vendor = "fortanix", target_env = "sgx"), + feature(sgx_platform) +)] +#![warn(unsafe_op_in_unsafe_fn)] +#![warn(unused_results)] + +use std::borrow::Cow; +use std::error::Error; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::fmt; +use std::fmt::Display; +use std::fmt::Formatter; +use std::path::Path; +use std::path::PathBuf; +use std::result; + +macro_rules! if_raw_str { + ( $($item:item)+ ) => { + $( + #[cfg(feature = "raw_os_str")] + $item + )+ + }; +} + +#[cfg_attr( + all(target_arch = "wasm32", target_os = "unknown"), + path = "wasm32/mod.rs" +)] +#[cfg_attr(windows, path = "windows/mod.rs")] +#[cfg_attr( + not(any(all(target_arch = "wasm32", target_os = "unknown"), windows)), + path = "common/mod.rs" +)] +mod imp; + +mod util; + +if_raw_str! { + pub mod iter; + + mod pattern; + pub use pattern::Pattern; + + mod raw_str; + pub use raw_str::RawOsStr; + pub use raw_str::RawOsString; +} + +/// The error that occurs when a byte sequence is not representable in the +/// platform encoding. +/// +/// [`Result::unwrap`] should almost always be called on results containing +/// this error. It should be known whether or not byte sequences are properly +/// encoded for the platform, since [the module-level documentation][encoding] +/// discourages using encoded bytes in interchange. Results are returned +/// primarily to make panicking behavior explicit. +/// +/// On Unix, this error is never returned, but [`OsStrExt`] or [`OsStringExt`] +/// should be used instead if that needs to be guaranteed. 
+/// +/// [encoding]: self#encoding +/// [`OsStrExt`]: ::std::os::unix::ffi::OsStrExt +/// [`OsStringExt`]: ::std::os::unix::ffi::OsStringExt +/// [`Result::unwrap`]: ::std::result::Result::unwrap +#[derive(Debug, Eq, PartialEq)] +pub struct EncodingError(imp::EncodingError); + +impl Display for EncodingError { + #[inline] + fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result { + self.0.fmt(formatter) + } +} + +impl Error for EncodingError {} + +type Result<T> = result::Result<T, EncodingError>; + +/// A platform agnostic variant of [`OsStrExt`]. +/// +/// For more information, see [the module-level documentation][module]. +/// +/// [module]: self +/// [`OsStrExt`]: ::std::os::unix::ffi::OsStrExt +pub trait OsStrBytes: private::Sealed + ToOwned { + /// Converts a byte slice into an equivalent platform-native string. + /// + /// Provided byte strings should always be valid for the [unspecified + /// encoding] used by this crate. + /// + /// # Errors + /// + /// See documentation for [`EncodingError`]. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// use std::ffi::OsStr; + /// # use std::io; + /// + /// use os_str_bytes::OsStrBytes; + /// + /// let os_string = env::current_exe()?; + /// let os_bytes = os_string.to_raw_bytes(); + /// assert_eq!(os_string, OsStr::from_raw_bytes(os_bytes).unwrap()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [unspecified encoding]: self#encoding + fn from_raw_bytes<'a, S>(string: S) -> Result<Cow<'a, Self>> + where + S: Into<Cow<'a, [u8]>>; + + /// Converts a platform-native string into an equivalent byte slice. + /// + /// The returned byte string will use an [unspecified encoding]. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::OsStrBytes; + /// + /// let os_string = env::current_exe()?; + /// println!("{:?}", os_string.to_raw_bytes()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [unspecified encoding]: self#encoding + #[must_use] + fn to_raw_bytes(&self) -> Cow<'_, [u8]>; +} + +impl OsStrBytes for OsStr { + #[inline] + fn from_raw_bytes<'a, S>(string: S) -> Result<Cow<'a, Self>> + where + S: Into<Cow<'a, [u8]>>, + { + match string.into() { + Cow::Borrowed(string) => { + imp::os_str_from_bytes(string).map_err(EncodingError) + } + Cow::Owned(string) => { + OsStringBytes::from_raw_vec(string).map(Cow::Owned) + } + } + } + + #[inline] + fn to_raw_bytes(&self) -> Cow<'_, [u8]> { + imp::os_str_to_bytes(self) + } +} + +impl OsStrBytes for Path { + #[inline] + fn from_raw_bytes<'a, S>(string: S) -> Result<Cow<'a, Self>> + where + S: Into<Cow<'a, [u8]>>, + { + OsStr::from_raw_bytes(string).map(|os_string| match os_string { + Cow::Borrowed(os_string) => Cow::Borrowed(Self::new(os_string)), + Cow::Owned(os_string) => Cow::Owned(os_string.into()), + }) + } + + #[inline] + fn to_raw_bytes(&self) -> Cow<'_, [u8]> { + self.as_os_str().to_raw_bytes() + } +} + +/// A platform agnostic variant of [`OsStringExt`]. +/// +/// For more information, see [the module-level documentation][module]. +/// +/// [module]: self +/// [`OsStringExt`]: ::std::os::unix::ffi::OsStringExt +pub trait OsStringBytes: private::Sealed + Sized { + /// Converts a byte vector into an equivalent platform-native string. + /// + /// Provided byte strings should always be valid for the [unspecified + /// encoding] used by this crate. + /// + /// # Errors + /// + /// See documentation for [`EncodingError`]. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// use std::ffi::OsString; + /// # use std::io; + /// + /// use os_str_bytes::OsStringBytes; + /// + /// let os_string = env::current_exe()?; + /// let os_bytes = os_string.clone().into_raw_vec(); + /// assert_eq!(os_string, OsString::from_raw_vec(os_bytes).unwrap()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [unspecified encoding]: self#encoding + fn from_raw_vec(string: Vec<u8>) -> Result<Self>; + + /// Converts a platform-native string into an equivalent byte vector. + /// + /// The returned byte string will use an [unspecified encoding]. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::OsStringBytes; + /// + /// let os_string = env::current_exe()?; + /// println!("{:?}", os_string.into_raw_vec()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + /// + /// [unspecified encoding]: self#encoding + #[must_use] + fn into_raw_vec(self) -> Vec<u8>; +} + +impl OsStringBytes for OsString { + #[inline] + fn from_raw_vec(string: Vec<u8>) -> Result<Self> { + imp::os_string_from_vec(string).map_err(EncodingError) + } + + #[inline] + fn into_raw_vec(self) -> Vec<u8> { + imp::os_string_into_vec(self) + } +} + +impl OsStringBytes for PathBuf { + #[inline] + fn from_raw_vec(string: Vec<u8>) -> Result<Self> { + OsString::from_raw_vec(string).map(Into::into) + } + + #[inline] + fn into_raw_vec(self) -> Vec<u8> { + self.into_os_string().into_raw_vec() + } +} + +mod private { + use std::ffi::OsStr; + use std::ffi::OsString; + use std::path::Path; + use std::path::PathBuf; + + pub trait Sealed {} + impl Sealed for char {} + impl Sealed for OsStr {} + impl Sealed for OsString {} + impl Sealed for Path {} + impl Sealed for PathBuf {} + impl Sealed for &str {} + impl Sealed for &String {} +} diff --git a/src/pattern.rs b/src/pattern.rs new file mode 100644 index 0000000..267a679 --- /dev/null +++ b/src/pattern.rs @@ -0,0 +1,71 @@ +use 
super::private; + +pub trait Encoded { + fn __get(&self) -> &[u8]; +} + +#[derive(Clone)] +pub struct EncodedChar { + buffer: [u8; 4], + length: usize, +} + +impl Encoded for EncodedChar { + #[inline] + fn __get(&self) -> &[u8] { + &self.buffer[..self.length] + } +} + +impl Encoded for &str { + #[inline] + fn __get(&self) -> &[u8] { + self.as_bytes() + } +} + +/// Allows a type to be used for searching by [`RawOsStr`] and [`RawOsString`]. +/// +/// This trait is very similar to [`str::pattern::Pattern`], but its methods +/// are private and it is implemented for different types. +/// +/// [`RawOsStr`]: super::RawOsStr +/// [`RawOsString`]: super::RawOsString +/// [`str::pattern::Pattern`]: ::std::str::pattern::Pattern +#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))] +pub trait Pattern: private::Sealed { + #[doc(hidden)] + type __Encoded: Clone + Encoded; + + #[doc(hidden)] + fn __encode(self) -> Self::__Encoded; +} + +impl Pattern for char { + type __Encoded = EncodedChar; + + fn __encode(self) -> Self::__Encoded { + let mut encoded = EncodedChar { + buffer: [0; 4], + length: 0, + }; + encoded.length = self.encode_utf8(&mut encoded.buffer).len(); + encoded + } +} + +impl Pattern for &str { + type __Encoded = Self; + + fn __encode(self) -> Self::__Encoded { + self + } +} + +impl<'a> Pattern for &'a String { + type __Encoded = <&'a str as Pattern>::__Encoded; + + fn __encode(self) -> Self::__Encoded { + (**self).__encode() + } +} diff --git a/src/raw_str.rs b/src/raw_str.rs new file mode 100644 index 0000000..ccec858 --- /dev/null +++ b/src/raw_str.rs @@ -0,0 +1,1156 @@ +use std::borrow::Borrow; +use std::borrow::Cow; +use std::borrow::ToOwned; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::fmt; +use std::fmt::Debug; +use std::fmt::Display; +use std::fmt::Formatter; +use std::mem; +use std::ops::Deref; +use std::ops::Index; +use std::ops::Range; +use std::ops::RangeFrom; +use std::ops::RangeFull; +use std::ops::RangeInclusive; +use 
std::ops::RangeTo; +use std::ops::RangeToInclusive; +use std::str; + +#[cfg(feature = "memchr")] +use memchr::memmem::find; +#[cfg(feature = "memchr")] +use memchr::memmem::rfind; + +use super::imp::raw; +use super::iter::Split; +use super::pattern::Encoded as EncodedPattern; +use super::OsStrBytes; +use super::OsStringBytes; +use super::Pattern; + +#[cfg(not(feature = "memchr"))] +fn find(string: &[u8], pat: &[u8]) -> Option<usize> { + for i in 0..=string.len().checked_sub(pat.len())? { + if string[i..].starts_with(pat) { + return Some(i); + } + } + None +} + +#[cfg(not(feature = "memchr"))] +fn rfind(string: &[u8], pat: &[u8]) -> Option<usize> { + for i in (pat.len()..=string.len()).rev() { + if string[..i].ends_with(pat) { + return Some(i - pat.len()); + } + } + None +} + +macro_rules! impl_trim_matches { + ( $self:ident , $pat:expr , $strip_method:ident ) => {{ + let pat = $pat.__encode(); + let pat = pat.__get(); + if pat.is_empty() { + return $self; + } + + let mut string = &$self.0; + while let Some(substring) = string.$strip_method(pat) { + string = substring; + } + Self::from_raw_bytes_unchecked(string) + }}; +} + +macro_rules! impl_split_once_raw { + ( $self:ident , $pat:expr , $find_fn:expr ) => {{ + let pat = $pat.__get(); + + let index = $find_fn(&$self.0, pat)?; + let prefix = &$self.0[..index]; + let suffix = &$self.0[index + pat.len()..]; + Some(( + Self::from_raw_bytes_unchecked(prefix), + Self::from_raw_bytes_unchecked(suffix), + )) + }}; +} + +/// A container for the byte strings converted by [`OsStrBytes`]. +/// +/// This wrapper is intended to prevent violating the invariants of the +/// [unspecified encoding] used by this crate and minimize encoding +/// conversions. +/// +/// Although this type is annotated with `#[repr(transparent)]`, the inner +/// representation is not stable. Transmuting between this type and any other +/// causes immediate undefined behavior. 
+/// +/// # Indices +/// +/// Methods of this struct that accept indices require that the index lie on a +/// UTF-8 boundary. Although it is possible to manipulate platform strings +/// based on other indices, this crate currently does not support them for +/// slicing methods. They would add significant complication to the +/// implementation and are generally not necessary. However, all indices +/// returned by this struct can be used for slicing. +/// +/// On Unix, all indices are permitted, to avoid false positives. However, +/// relying on this implementation detail is discouraged. Platform-specific +/// indices are error-prone. +/// +/// # Complexity +/// +/// All searching methods have worst-case multiplicative time complexity (i.e., +/// `O(self.raw_len() * pat.len())`). Enabling the "memchr" feature allows +/// these methods to instead run in linear time in the worst case (documented +/// for [`memchr::memmem::find`][memchr complexity]). +/// +/// [memchr complexity]: memchr::memmem::find#complexity +/// [unspecified encoding]: super#encoding +#[derive(Eq, Hash, Ord, PartialEq, PartialOrd)] +#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))] +#[repr(transparent)] +pub struct RawOsStr([u8]); + +impl RawOsStr { + fn from_raw_bytes_unchecked(string: &[u8]) -> &Self { + // SAFETY: This struct has a layout that makes this operation safe. + unsafe { mem::transmute(string) } + } + + /// Converts a platform-native string into a representation that can be + /// more easily manipulated. + /// + /// This method performs the necessary conversion immediately, so it can be + /// expensive to call. It is recommended to continue using the returned + /// instance as long as possible (instead of the original [`OsStr`]), to + /// avoid repeated conversions. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsStr; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// println!("{:?}", RawOsStr::new(&os_string)); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn new(string: &OsStr) -> Cow<'_, Self> { + match string.to_raw_bytes() { + Cow::Borrowed(string) => { + Cow::Borrowed(Self::from_raw_bytes_unchecked(string)) + } + Cow::Owned(string) => Cow::Owned(RawOsString(string)), + } + } + + /// Wraps a string, without copying or encoding conversion. + /// + /// This method is much more efficient than [`RawOsStr::new`], since the + /// [encoding] used by this crate is compatible with UTF-8. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let string = "foobar"; + /// let raw = RawOsStr::from_str(string); + /// assert_eq!(string, raw); + /// ``` + /// + /// [encoding]: super#encoding + #[allow(clippy::should_implement_trait)] + #[inline] + #[must_use] + pub fn from_str(string: &str) -> &Self { + Self::from_raw_bytes_unchecked(string.as_bytes()) + } + + /// Returns the byte string stored by this container. + /// + /// The result will match what would be returned by + /// [`OsStrBytes::to_raw_bytes`] for the same string. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::OsStrBytes; + /// use os_str_bytes::RawOsStr; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsStr::new(&os_string); + /// assert_eq!(os_string.to_raw_bytes(), raw.as_raw_bytes()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn as_raw_bytes(&self) -> &[u8] { + &self.0 + } + + /// Equivalent to [`str::contains`]. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert!(raw.contains("oo")); + /// assert!(!raw.contains("of")); + /// ``` + #[inline] + #[must_use] + pub fn contains<P>(&self, pat: P) -> bool + where + P: Pattern, + { + self.find(pat).is_some() + } + + /// Equivalent to [`str::ends_with`]. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert!(raw.ends_with("bar")); + /// assert!(!raw.ends_with("foo")); + /// ``` + #[inline] + #[must_use] + pub fn ends_with<P>(&self, pat: P) -> bool + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + self.0.ends_with(pat) + } + + /// Equivalent to [`str::ends_with`] but accepts this type for the pattern. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert!(raw.ends_with_os(RawOsStr::from_str("bar"))); + /// assert!(!raw.ends_with_os(RawOsStr::from_str("foo"))); + /// ``` + #[inline] + #[must_use] + pub fn ends_with_os(&self, pat: &Self) -> bool { + raw::ends_with(&self.0, &pat.0) + } + + /// Equivalent to [`str::find`]. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!(Some(1), raw.find("o")); + /// assert_eq!(None, raw.find("of")); + /// ``` + #[inline] + #[must_use] + pub fn find<P>(&self, pat: P) -> Option<usize> + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + find(&self.0, pat) + } + + /// Equivalent to [`str::is_empty`]. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// assert!(RawOsStr::from_str("").is_empty()); + /// assert!(!RawOsStr::from_str("foobar").is_empty()); + /// ``` + #[inline] + #[must_use] + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Returns the length of the byte string stored by this container. + /// + /// Only the following assumptions can be made about the result: + /// - The length of any Unicode character is the length of its UTF-8 + /// representation (i.e., [`char::len_utf8`]). + /// - Splitting a string at a UTF-8 boundary will return two strings with + /// lengths that sum to the length of the original string. + /// + /// This method may return a different result than would [`OsStr::len`] + /// when called on the same string, since [`OsStr`] uses an unspecified + /// encoding. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// assert_eq!(6, RawOsStr::from_str("foobar").raw_len()); + /// assert_eq!(0, RawOsStr::from_str("").raw_len()); + /// ``` + #[inline] + #[must_use] + pub fn raw_len(&self) -> usize { + self.0.len() + } + + /// Equivalent to [`str::rfind`]. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!(Some(2), raw.rfind("o")); + /// assert_eq!(None, raw.rfind("of")); + /// ``` + #[inline] + #[must_use] + pub fn rfind<P>(&self, pat: P) -> Option<usize> + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + rfind(&self.0, pat) + } + + pub(super) fn rsplit_once_raw<P>(&self, pat: &P) -> Option<(&Self, &Self)> + where + P: EncodedPattern, + { + impl_split_once_raw!(self, pat, rfind) + } + + /// Equivalent to [`str::rsplit_once`]. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!( + /// Some((RawOsStr::from_str("fo"), RawOsStr::from_str("bar"))), + /// raw.rsplit_once("o"), + /// ); + /// assert_eq!(None, raw.rsplit_once("of")); + /// ``` + #[inline] + #[must_use] + pub fn rsplit_once<P>(&self, pat: P) -> Option<(&Self, &Self)> + where + P: Pattern, + { + self.rsplit_once_raw(&pat.__encode()) + } + + // https://github.com/rust-lang/rust/blob/49c68bd53f90e375bfb3cbba8c1c67a9e0adb9c0/src/libcore/str/mod.rs#L2184-L2221 + #[cold] + #[inline(never)] + #[track_caller] + fn index_boundary_error(&self, index: usize) -> ! { + debug_assert!(raw::is_continuation(self.0[index])); + + let start = self.0[..index] + .iter() + .rposition(|&x| !raw::is_continuation(x)) + .expect("invalid raw bytes"); + let mut end = index + 1; + end += self.0[end..] + .iter() + .position(|&x| !raw::is_continuation(x)) + .unwrap_or_else(|| self.raw_len() - end); + let code_point = raw::decode_code_point(&self.0[start..end]); + panic!( + "byte index {} is not a valid boundary; it is inside U+{:04X} \ + (bytes {}..{})", + index, code_point, start, end, + ); + } + + #[track_caller] + fn check_bound(&self, index: usize) { + if let Some(&byte) = self.0.get(index) { + if raw::is_continuation(byte) { + self.index_boundary_error(index); + } + } + } + + /// Equivalent to [`str::split`], but empty patterns are not accepted. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range or empty. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!(["f", "", "bar"], *raw.split("o").collect::<Vec<_>>()); + /// ``` + #[inline] + #[must_use] + pub fn split<P>(&self, pat: P) -> Split<'_, P> + where + P: Pattern, + { + Split::new(self, pat) + } + + /// Equivalent to [`str::split_at`]. 
+ /// + /// # Panics + /// + /// Panics if the index is not a [valid boundary]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!( + /// ((RawOsStr::from_str("fo"), RawOsStr::from_str("obar"))), + /// raw.split_at(2), + /// ); + /// ``` + /// + /// [valid boundary]: #indices + #[inline] + #[must_use] + pub fn split_at(&self, mid: usize) -> (&Self, &Self) { + self.check_bound(mid); + + let (prefix, suffix) = self.0.split_at(mid); + ( + Self::from_raw_bytes_unchecked(prefix), + Self::from_raw_bytes_unchecked(suffix), + ) + } + + pub(super) fn split_once_raw<P>(&self, pat: &P) -> Option<(&Self, &Self)> + where + P: EncodedPattern, + { + impl_split_once_raw!(self, pat, find) + } + + /// Equivalent to [`str::split_once`]. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert_eq!( + /// Some((RawOsStr::from_str("f"), RawOsStr::from_str("obar"))), + /// raw.split_once("o"), + /// ); + /// assert_eq!(None, raw.split_once("of")); + /// ``` + #[inline] + #[must_use] + pub fn split_once<P>(&self, pat: P) -> Option<(&Self, &Self)> + where + P: Pattern, + { + self.split_once_raw(&pat.__encode()) + } + + /// Equivalent to [`str::starts_with`]. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert!(raw.starts_with("foo")); + /// assert!(!raw.starts_with("bar")); + /// ``` + #[inline] + #[must_use] + pub fn starts_with<P>(&self, pat: P) -> bool + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + self.0.starts_with(pat) + } + + /// Equivalent to [`str::starts_with`] but accepts this type for the + /// pattern. 
+ /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("foobar"); + /// assert!(raw.starts_with_os(RawOsStr::from_str("foo"))); + /// assert!(!raw.starts_with_os(RawOsStr::from_str("bar"))); + /// ``` + #[inline] + #[must_use] + pub fn starts_with_os(&self, pat: &Self) -> bool { + raw::starts_with(&self.0, &pat.0) + } + + /// Equivalent to [`str::strip_prefix`]. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("111foo1bar111"); + /// assert_eq!( + /// Some(RawOsStr::from_str("11foo1bar111")), + /// raw.strip_prefix("1"), + /// ); + /// assert_eq!(None, raw.strip_prefix("o")); + /// ``` + #[inline] + #[must_use] + pub fn strip_prefix<P>(&self, pat: P) -> Option<&Self> + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + self.0.strip_prefix(pat).map(Self::from_raw_bytes_unchecked) + } + + /// Equivalent to [`str::strip_suffix`]. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("111foo1bar111"); + /// assert_eq!( + /// Some(RawOsStr::from_str("111foo1bar11")), + /// raw.strip_suffix("1"), + /// ); + /// assert_eq!(None, raw.strip_suffix("o")); + /// ``` + #[inline] + #[must_use] + pub fn strip_suffix<P>(&self, pat: P) -> Option<&Self> + where + P: Pattern, + { + let pat = pat.__encode(); + let pat = pat.__get(); + + self.0.strip_suffix(pat).map(Self::from_raw_bytes_unchecked) + } + + /// Converts this representation back to a platform-native string. 
+ /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsStr; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsStr::new(&os_string); + /// assert_eq!(os_string, raw.to_os_str()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn to_os_str(&self) -> Cow<'_, OsStr> { + OsStr::from_raw_bytes(&self.0).expect("invalid raw bytes") + } + + /// Equivalent to [`OsStr::to_str`]. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let string = "foobar"; + /// let raw = RawOsStr::from_str(string); + /// assert_eq!(Some(string), raw.to_str()); + /// ``` + #[inline] + #[must_use] + pub fn to_str(&self) -> Option<&str> { + str::from_utf8(&self.0).ok() + } + + /// Converts this string to the best UTF-8 representation possible. + /// + /// Invalid sequences will be replaced with + /// [`char::REPLACEMENT_CHARACTER`]. + /// + /// This method may return a different result than would + /// [`OsStr::to_string_lossy`] when called on the same string, since + /// [`OsStr`] uses an unspecified encoding. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsStr; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsStr::new(&os_string); + /// println!("{}", raw.to_str_lossy()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn to_str_lossy(&self) -> Cow<'_, str> { + String::from_utf8_lossy(&self.0) + } + + /// Equivalent to [`str::trim_end_matches`]. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("111foo1bar111"); + /// assert_eq!("111foo1bar", raw.trim_end_matches("1")); + /// assert_eq!("111foo1bar111", raw.trim_end_matches("o")); + /// ``` + #[must_use] + pub fn trim_end_matches<P>(&self, pat: P) -> &Self + where + P: Pattern, + { + impl_trim_matches!(self, pat, strip_suffix) + } + + /// Equivalent to [`str::trim_start_matches`]. + /// + /// # Panics + /// + /// Panics if the pattern is a byte outside of the ASCII range. + /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsStr; + /// + /// let raw = RawOsStr::from_str("111foo1bar111"); + /// assert_eq!("foo1bar111", raw.trim_start_matches("1")); + /// assert_eq!("111foo1bar111", raw.trim_start_matches("o")); + /// ``` + #[must_use] + pub fn trim_start_matches<P>(&self, pat: P) -> &Self + where + P: Pattern, + { + impl_trim_matches!(self, pat, strip_prefix) + } +} + +impl AsRef<Self> for RawOsStr { + #[inline] + fn as_ref(&self) -> &Self { + self + } +} + +impl AsRef<RawOsStr> for str { + #[inline] + fn as_ref(&self) -> &RawOsStr { + RawOsStr::from_str(self) + } +} + +impl AsRef<RawOsStr> for String { + #[inline] + fn as_ref(&self) -> &RawOsStr { + (**self).as_ref() + } +} + +impl Default for &RawOsStr { + #[inline] + fn default() -> Self { + RawOsStr::from_str("") + } +} + +impl<'a> From<&'a RawOsStr> for Cow<'a, RawOsStr> { + #[inline] + fn from(other: &'a RawOsStr) -> Self { + Cow::Borrowed(other) + } +} + +macro_rules! r#impl { + ( + $index_type:ty + $(, $index_var:ident , $first_bound:expr $(, $second_bound:expr)?)? + ) => { + impl Index<$index_type> for RawOsStr { + type Output = Self; + + #[inline] + fn index(&self, idx: $index_type) -> &Self::Output { + $( + let $index_var = &idx; + self.check_bound($first_bound); + $(self.check_bound($second_bound);)? + )? 
+ + Self::from_raw_bytes_unchecked(&self.0[idx]) + } + } + }; +} +r#impl!(Range<usize>, x, x.start, x.end); +r#impl!(RangeFrom<usize>, x, x.start); +r#impl!(RangeFull); +// [usize::MAX] will always be a valid inclusive end index. +#[rustfmt::skip] +r#impl!(RangeInclusive<usize>, x, *x.start(), x.end().wrapping_add(1)); +r#impl!(RangeTo<usize>, x, x.end); +r#impl!(RangeToInclusive<usize>, x, x.end.wrapping_add(1)); + +impl ToOwned for RawOsStr { + type Owned = RawOsString; + + #[inline] + fn to_owned(&self) -> Self::Owned { + RawOsString(self.0.to_owned()) + } +} + +/// A container for the byte strings converted by [`OsStringBytes`]. +/// +/// For more information, see [`RawOsStr`]. +/// +/// [unspecified encoding]: super#encoding +#[derive(Clone, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] +#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))] +pub struct RawOsString(Vec<u8>); + +impl RawOsString { + /// Converts a platform-native string into a representation that can be + /// more easily manipulated. + /// + /// For more information, see [`RawOsStr::new`]. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsString; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// println!("{:?}", RawOsString::new(os_string)); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn new(string: OsString) -> Self { + Self(string.into_raw_vec()) + } + + /// Wraps a string, without copying or encoding conversion. + /// + /// This method is much more efficient than [`RawOsString::new`], since the + /// [encoding] used by this crate is compatible with UTF-8. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsString; + /// + /// let string = "foobar".to_owned(); + /// let raw = RawOsString::from_string(string.clone()); + /// assert_eq!(string, raw); + /// ``` + /// + /// [encoding]: super#encoding + #[inline] + #[must_use] + pub fn from_string(string: String) -> Self { + Self(string.into_bytes()) + } + + /// Converts this representation back to a platform-native string. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::RawOsString; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsString::new(os_string.clone()); + /// assert_eq!(os_string, raw.into_os_string()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn into_os_string(self) -> OsString { + OsString::from_raw_vec(self.0).expect("invalid raw bytes") + } + + /// Returns the byte string stored by this container. + /// + /// The result will match what would be returned by + /// [`OsStringBytes::into_raw_vec`] for the same string. + /// + /// # Examples + /// + /// ``` + /// use std::env; + /// # use std::io; + /// + /// use os_str_bytes::OsStringBytes; + /// use os_str_bytes::RawOsString; + /// + /// let os_string = env::current_exe()?.into_os_string(); + /// let raw = RawOsString::new(os_string.clone()); + /// assert_eq!(os_string.into_raw_vec(), raw.into_raw_vec()); + /// # + /// # Ok::<_, io::Error>(()) + /// ``` + #[inline] + #[must_use] + pub fn into_raw_vec(self) -> Vec<u8> { + self.0 + } + + /// Equivalent to [`OsString::into_string`]. 
+ /// + /// # Examples + /// + /// ``` + /// use os_str_bytes::RawOsString; + /// + /// let string = "foobar".to_owned(); + /// let raw = RawOsString::from_string(string.clone()); + /// assert_eq!(Ok(string), raw.into_string()); + /// ``` + #[inline] + pub fn into_string(self) -> Result<String, Self> { + String::from_utf8(self.0).map_err(|x| Self(x.into_bytes())) + } +} + +impl AsRef<RawOsStr> for RawOsString { + #[inline] + fn as_ref(&self) -> &RawOsStr { + self + } +} + +impl Borrow<RawOsStr> for RawOsString { + #[inline] + fn borrow(&self) -> &RawOsStr { + self + } +} + +impl Deref for RawOsString { + type Target = RawOsStr; + + #[inline] + fn deref(&self) -> &Self::Target { + RawOsStr::from_raw_bytes_unchecked(&self.0) + } +} + +impl From<String> for RawOsString { + #[inline] + fn from(other: String) -> Self { + Self::from_string(other) + } +} + +impl From<RawOsString> for Cow<'_, RawOsStr> { + #[inline] + fn from(other: RawOsString) -> Self { + Cow::Owned(other) + } +} + +macro_rules! 
r#impl { + ( $index_type:ty ) => { + impl Index<$index_type> for RawOsString { + type Output = RawOsStr; + + #[inline] + fn index(&self, idx: $index_type) -> &Self::Output { + &(**self)[idx] + } + } + }; +} +r#impl!(Range<usize>); +r#impl!(RangeFrom<usize>); +r#impl!(RangeFull); +r#impl!(RangeInclusive<usize>); +r#impl!(RangeTo<usize>); +r#impl!(RangeToInclusive<usize>); + +struct Buffer<'a>(&'a [u8]); + +impl Debug for Buffer<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.write_str("\"")?; + + let mut string = self.0; + let mut invalid_length = 0; + while !string.is_empty() { + let (invalid, substring) = string.split_at(invalid_length); + + let valid = match str::from_utf8(substring) { + Ok(valid) => { + string = &[]; + valid + } + Err(error) => { + let (valid, substring) = + substring.split_at(error.valid_up_to()); + + let invalid_char_length = + error.error_len().unwrap_or_else(|| substring.len()); + if valid.is_empty() { + invalid_length += invalid_char_length; + continue; + } + string = substring; + invalid_length = invalid_char_length; + + // SAFETY: This slice was validated to be UTF-8. + unsafe { str::from_utf8_unchecked(valid) } + } + }; + + raw::debug(invalid, f)?; + Display::fmt(&valid.escape_debug(), f)?; + } + + f.write_str("\"") + } +} + +macro_rules! r#impl { + ( $type:ty ) => { + impl Debug for $type { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_tuple(stringify!($type)) + .field(&Buffer(&self.0)) + .finish() + } + } + }; +} +r#impl!(RawOsStr); +r#impl!(RawOsString); + +macro_rules! 
r#impl { + ( $type:ty , $other_type:ty ) => { + impl PartialEq<$other_type> for $type { + #[inline] + fn eq(&self, other: &$other_type) -> bool { + let raw: &RawOsStr = self; + let other: &RawOsStr = other.as_ref(); + raw == other + } + } + + impl PartialEq<$type> for $other_type { + #[inline] + fn eq(&self, other: &$type) -> bool { + other == self + } + } + }; +} +r#impl!(RawOsStr, RawOsString); +r#impl!(&RawOsStr, RawOsString); +r#impl!(RawOsStr, str); +r#impl!(RawOsStr, String); +r#impl!(&RawOsStr, String); +r#impl!(RawOsString, str); +r#impl!(RawOsString, &str); +r#impl!(RawOsString, String); + +#[cfg(feature = "print_bytes")] +#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "print_bytes")))] +mod print_bytes { + use print_bytes::ByteStr; + use print_bytes::ToBytes; + #[cfg(windows)] + use print_bytes::WideStr; + + #[cfg(windows)] + use crate::imp::raw; + + use super::RawOsStr; + use super::RawOsString; + + impl ToBytes for RawOsStr { + #[inline] + fn to_bytes(&self) -> ByteStr<'_> { + self.0.to_bytes() + } + + #[cfg(windows)] + #[inline] + fn to_wide(&self) -> Option<WideStr> { + Some(WideStr::new(raw::encode_wide_unchecked(&self.0).collect())) + } + } + + impl ToBytes for RawOsString { + #[inline] + fn to_bytes(&self) -> ByteStr<'_> { + (**self).to_bytes() + } + + #[cfg(windows)] + #[inline] + fn to_wide(&self) -> Option<WideStr> { + (**self).to_wide() + } + } +} + +#[cfg(feature = "uniquote")] +#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "uniquote")))] +mod uniquote { + use uniquote::Formatter; + use uniquote::Quote; + use uniquote::Result; + + use crate::imp::raw; + + use super::RawOsStr; + use super::RawOsString; + + impl Quote for RawOsStr { + #[inline] + fn escape(&self, f: &mut Formatter<'_>) -> Result { + raw::uniquote::escape(&self.0, f) + } + } + + impl Quote for RawOsString { + #[inline] + fn escape(&self, f: &mut Formatter<'_>) -> Result { + (**self).escape(f) + } + } +} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 
0000000..bd28b7b --- /dev/null +++ b/src/util.rs @@ -0,0 +1,10 @@ +pub(super) const BYTE_SHIFT: u8 = 6; + +pub(super) const CONT_MASK: u8 = (1 << BYTE_SHIFT) - 1; + +pub(super) const CONT_TAG: u8 = 0b1000_0000; + +#[cfg_attr(not(windows), allow(dead_code))] +pub(super) const fn is_continuation(byte: u8) -> bool { + byte & !CONT_MASK == CONT_TAG +} diff --git a/src/wasm32/mod.rs b/src/wasm32/mod.rs new file mode 100644 index 0000000..f8ae368 --- /dev/null +++ b/src/wasm32/mod.rs @@ -0,0 +1,56 @@ +use std::borrow::Cow; +use std::error::Error; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::fmt; +use std::fmt::Display; +use std::fmt::Formatter; +use std::result; +use std::str; +use std::str::Utf8Error; + +if_raw_str! { + pub(super) mod raw; +} + +#[derive(Debug, Eq, PartialEq)] +pub(super) struct EncodingError(Utf8Error); + +impl Display for EncodingError { + fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result { + write!(formatter, "os_str_bytes: {}", self.0) + } +} + +impl Error for EncodingError {} + +type Result<T> = result::Result<T, EncodingError>; + +macro_rules! 
expect_utf8 { + ( $result:expr ) => { + $result.expect( + "platform string contains invalid UTF-8, which should not be \ + possible", + ) + }; +} + +pub(super) fn os_str_from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> { + str::from_utf8(string) + .map(|x| Cow::Borrowed(OsStr::new(x))) + .map_err(EncodingError) +} + +pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> { + Cow::Borrowed(expect_utf8!(os_string.to_str()).as_bytes()) +} + +pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> { + String::from_utf8(string) + .map(Into::into) + .map_err(|x| EncodingError(x.utf8_error())) +} + +pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> { + expect_utf8!(os_string.into_string()).into_bytes() +} diff --git a/src/wasm32/raw.rs b/src/wasm32/raw.rs new file mode 100644 index 0000000..5645900 --- /dev/null +++ b/src/wasm32/raw.rs @@ -0,0 +1,39 @@ +use std::fmt; +use std::fmt::Formatter; +use std::str; + +pub(crate) use crate::util::is_continuation; + +pub(crate) fn decode_code_point(string: &[u8]) -> u32 { + let string = str::from_utf8(string).expect("invalid string"); + let mut chars = string.chars(); + let ch = chars + .next() + .expect("cannot parse code point from empty string"); + assert_eq!(None, chars.next(), "multiple code points found"); + ch.into() +} + +pub(crate) fn ends_with(string: &[u8], suffix: &[u8]) -> bool { + string.ends_with(suffix) +} + +pub(crate) fn starts_with(string: &[u8], prefix: &[u8]) -> bool { + string.starts_with(prefix) +} + +pub(crate) fn debug(string: &[u8], _: &mut Formatter<'_>) -> fmt::Result { + assert!(string.is_empty()); + Ok(()) +} + +#[cfg(feature = "uniquote")] +pub(crate) mod uniquote { + use uniquote::Formatter; + use uniquote::Quote; + use uniquote::Result; + + pub(crate) fn escape(string: &[u8], f: &mut Formatter<'_>) -> Result { + string.escape(f) + } +} diff --git a/src/windows/mod.rs b/src/windows/mod.rs new file mode 100644 index 0000000..3b6105b --- /dev/null +++ 
b/src/windows/mod.rs @@ -0,0 +1,152 @@ +// These functions are necessarily inefficient, because they must revert +// encoding conversions performed by the standard library. However, there is +// currently no better alternative. + +use std::borrow::Cow; +use std::error::Error; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::fmt; +use std::fmt::Display; +use std::fmt::Formatter; +use std::os::windows::ffi::OsStrExt; +use std::os::windows::ffi::OsStringExt; +use std::result; +use std::str; + +if_raw_str! { + pub(super) mod raw; +} + +mod wtf8; +use wtf8::encode_wide; +use wtf8::DecodeWide; + +#[derive(Debug, Eq, PartialEq)] +pub(super) enum EncodingError { + Byte(u8), + CodePoint(u32), + End(), +} + +impl EncodingError { + fn position(&self) -> Cow<'_, str> { + match self { + Self::Byte(byte) => Cow::Owned(format!("byte b'\\x{:02X}'", byte)), + Self::CodePoint(code_point) => { + Cow::Owned(format!("code point U+{:04X}", code_point)) + } + Self::End() => Cow::Borrowed("end of string"), + } + } +} + +impl Display for EncodingError { + fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result { + write!( + formatter, + "byte sequence is not representable in the platform encoding; \ + error at {}", + self.position(), + ) + } +} + +impl Error for EncodingError {} + +type Result<T> = result::Result<T, EncodingError>; + +fn from_bytes(string: &[u8]) -> Result<OsString> { + let encoder = encode_wide(string); + + // Collecting an iterator into a result ignores the size hint: + // https://github.com/rust-lang/rust/issues/48994 + let mut encoded_string = Vec::with_capacity(encoder.size_hint().0); + for wchar in encoder { + encoded_string.push(wchar?); + } + Ok(OsStringExt::from_wide(&encoded_string)) +} + +fn to_bytes(os_string: &OsStr) -> Vec<u8> { + let encoder = OsStrExt::encode_wide(os_string); + + let mut string = Vec::with_capacity(encoder.size_hint().0); + string.extend(DecodeWide::new(encoder)); + string +} + +pub(super) fn os_str_from_bytes(string: &[u8]) -> 
Result<Cow<'_, OsStr>> { + from_bytes(string).map(Cow::Owned) +} + +pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> { + Cow::Owned(to_bytes(os_string)) +} + +pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> { + from_bytes(&string) +} + +pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> { + to_bytes(&os_string) +} + +#[cfg(test)] +mod tests { + use std::ffi::OsStr; + + use crate::OsStrBytes; + + use super::EncodingError; + + #[test] + fn test_invalid() { + use EncodingError::Byte; + use EncodingError::CodePoint; + use EncodingError::End; + + test_error(Byte(b'\x83'), b"\x0C\x83\xD7\x3E"); + test_error(Byte(b'\x52'), b"\x19\xF7\x52\x84"); + test_error(Byte(b'\xB8'), b"\x70\xB8\x1F\x66"); + test_error(CodePoint(0x34_0388), b"\x70\xFD\x80\x8E\x88"); + test_error(Byte(b'\x80'), b"\x80"); + test_error(Byte(b'\x80'), b"\x80\x80"); + test_error(Byte(b'\x80'), b"\x80\x80\x80"); + test_error(Byte(b'\x81'), b"\x81"); + test_error(Byte(b'\x88'), b"\x88\xB4\xC7\x46"); + test_error(Byte(b'\x97'), b"\x97\xCE\x06"); + test_error(Byte(b'\x00'), b"\xC2\x00"); + test_error(Byte(b'\x7F'), b"\xC2\x7F"); + test_error(Byte(b'\x09'), b"\xCD\x09\x95"); + test_error(Byte(b'\x43'), b"\xCD\x43\x5F\xA0"); + test_error(Byte(b'\x69'), b"\xD7\x69\xB2"); + test_error(CodePoint(0x528), b"\xE0\x94\xA8"); + test_error(CodePoint(0x766), b"\xE0\x9D\xA6\x12\xAE"); + test_error(Byte(b'\xFD'), b"\xE2\xAB\xFD\x51"); + test_error(Byte(b'\xC4'), b"\xE3\xC4"); + test_error(CodePoint(0xDC00), b"\xED\xA0\x80\xED\xB0\x80"); + test_error(End(), b"\xF1"); + test_error(End(), b"\xF1\x80"); + test_error(End(), b"\xF1\x80\x80"); + test_error(Byte(b'\xF1'), b"\xF1\x80\x80\xF1"); + test_error(CodePoint(0x11_09CC), b"\xF4\x90\xA7\x8C"); + test_error(CodePoint(0x15_EC46), b"\xF5\x9E\xB1\x86"); + test_error(End(), b"\xFB"); + test_error(End(), b"\xFB\x80"); + test_error(End(), b"\xFB\x80\x80"); + test_error(CodePoint(0x2C_0000), b"\xFB\x80\x80\x80"); + 
test_error(End(), b"\xFF"); + test_error(End(), b"\xFF\x80"); + test_error(End(), b"\xFF\x80\x80"); + test_error(CodePoint(0x3C_0000), b"\xFF\x80\x80\x80"); + test_error(CodePoint(0x3C_6143), b"\xFF\x86\x85\x83"); + + fn test_error(error: EncodingError, string: &[u8]) { + assert_eq!( + Err(error), + OsStr::from_raw_bytes(string).map_err(|x| x.0), + ); + } + } +} diff --git a/src/windows/raw.rs b/src/windows/raw.rs new file mode 100644 index 0000000..630eb01 --- /dev/null +++ b/src/windows/raw.rs @@ -0,0 +1,42 @@ +use std::fmt; +use std::fmt::Formatter; + +pub(crate) use crate::util::is_continuation; + +use super::wtf8; +pub(crate) use super::wtf8::ends_with; +pub(crate) use super::wtf8::starts_with; +use super::wtf8::CodePoints; + +pub(crate) fn encode_wide_unchecked( + string: &[u8], +) -> impl '_ + Iterator<Item = u16> { + wtf8::encode_wide(string).map(|x| x.expect("invalid string")) +} + +pub(crate) fn decode_code_point(string: &[u8]) -> u32 { + let mut code_points = CodePoints::new(string.iter().copied()); + let code_point = code_points + .next() + .expect("cannot parse code point from empty string") + .expect("invalid string"); + assert_eq!(None, code_points.next(), "multiple code points found"); + code_point +} + +pub(crate) fn debug(string: &[u8], f: &mut Formatter<'_>) -> fmt::Result { + for wchar in encode_wide_unchecked(string) { + write!(f, "\\u{{{:X}}}", wchar)?; + } + Ok(()) +} + +#[cfg(feature = "uniquote")] +pub(crate) mod uniquote { + use uniquote::Formatter; + use uniquote::Result; + + pub(crate) fn escape(string: &[u8], f: &mut Formatter<'_>) -> Result { + f.escape_utf16(super::encode_wide_unchecked(string)) + } +} diff --git a/src/windows/wtf8/code_points.rs b/src/windows/wtf8/code_points.rs new file mode 100644 index 0000000..b265db3 --- /dev/null +++ b/src/windows/wtf8/code_points.rs @@ -0,0 +1,117 @@ +use std::iter::Peekable; +use std::mem; + +use crate::util::is_continuation; +use crate::util::BYTE_SHIFT; +use crate::util::CONT_MASK; + +use 
super::EncodingError; +use super::Result; + +pub(in super::super) struct CodePoints<I> +where + I: Iterator<Item = u8>, +{ + iter: Peekable<I>, + surrogate: bool, +} + +impl<I> CodePoints<I> +where + I: Iterator<Item = u8>, +{ + pub(in super::super) fn new<S>(string: S) -> Self + where + S: IntoIterator<IntoIter = I, Item = I::Item>, + { + Self { + iter: string.into_iter().peekable(), + surrogate: false, + } + } + + fn consume_next(&mut self, code_point: &mut u32) -> Result<()> { + if let Some(&byte) = self.iter.peek() { + if !is_continuation(byte) { + self.surrogate = false; + // Not consuming this byte will be useful if this crate ever + // offers a way to encode lossily. + return Err(EncodingError::Byte(byte)); + } + *code_point = + (*code_point << BYTE_SHIFT) | u32::from(byte & CONT_MASK); + + let removed = self.iter.next(); + debug_assert_eq!(Some(byte), removed); + } else { + return Err(EncodingError::End()); + } + Ok(()) + } + + pub(super) fn inner_size_hint(&self) -> (usize, Option<usize>) { + self.iter.size_hint() + } +} + +impl<I> Iterator for CodePoints<I> +where + I: Iterator<Item = u8>, +{ + type Item = Result<u32>; + + fn next(&mut self) -> Option<Self::Item> { + let byte = self.iter.next()?; + let mut code_point: u32 = byte.into(); + + macro_rules! consume_next { + () => {{ + if let Err(error) = self.consume_next(&mut code_point) { + return Some(Err(error)); + } + }}; + } + + let prev_surrogate = mem::replace(&mut self.surrogate, false); + + let mut invalid = false; + if !byte.is_ascii() { + if byte < 0xC2 { + return Some(Err(EncodingError::Byte(byte))); + } + + if byte < 0xE0 { + code_point &= 0x1F; + } else { + code_point &= 0x0F; + consume_next!(); + + if byte >= 0xF0 { + if code_point.wrapping_sub(0x10) >= 0x100 { + invalid = true; + } + consume_next!(); + + // This condition is optimized to detect surrogate code points. 
+ } else if code_point & 0xFE0 == 0x360 { + if code_point & 0x10 == 0 { + self.surrogate = true; + } else if prev_surrogate { + // Decoding a broken surrogate pair would be lossy. + invalid = true; + } + } + + if code_point < 0x20 { + invalid = true; + } + } + consume_next!(); + } + if invalid { + return Some(Err(EncodingError::CodePoint(code_point))); + } + + Some(Ok(code_point)) + } +} diff --git a/src/windows/wtf8/convert.rs b/src/windows/wtf8/convert.rs new file mode 100644 index 0000000..fcaf562 --- /dev/null +++ b/src/windows/wtf8/convert.rs @@ -0,0 +1,166 @@ +use std::char; +use std::char::DecodeUtf16; +use std::num::NonZeroU16; + +use crate::util::BYTE_SHIFT; +use crate::util::CONT_MASK; +use crate::util::CONT_TAG; + +use super::CodePoints; +use super::Result; + +const MIN_HIGH_SURROGATE: u16 = 0xD800; + +const MIN_LOW_SURROGATE: u16 = 0xDC00; + +const MIN_SURROGATE_CODE: u32 = (u16::MAX as u32) + 1; + +macro_rules! static_assert { + ( $condition:expr ) => { + const _: () = assert!($condition, "static assertion failed"); + }; +} + +pub(in super::super) struct DecodeWide<I> +where + I: Iterator<Item = u16>, +{ + iter: DecodeUtf16<I>, + code_point: u32, + shift: u8, +} + +impl<I> DecodeWide<I> +where + I: Iterator<Item = u16>, +{ + pub(in super::super) fn new<S>(string: S) -> Self + where + S: IntoIterator<IntoIter = I, Item = I::Item>, + { + Self { + iter: char::decode_utf16(string), + code_point: 0, + shift: 0, + } + } +} + +impl<I> Iterator for DecodeWide<I> +where + I: Iterator<Item = u16>, +{ + type Item = u8; + + fn next(&mut self) -> Option<Self::Item> { + if let Some(shift) = self.shift.checked_sub(BYTE_SHIFT) { + self.shift = shift; + return Some( + ((self.code_point >> self.shift) as u8 & CONT_MASK) | CONT_TAG, + ); + } + + self.code_point = self + .iter + .next()? + .map(Into::into) + .unwrap_or_else(|x| x.unpaired_surrogate().into()); + + macro_rules! 
decode {
+            ( $tag:expr ) => {
+                Some((self.code_point >> self.shift) as u8 | $tag)
+            };
+        }
+        macro_rules! try_decode {
+            ( $tag:expr , $upper_bound:expr ) => {
+                if self.code_point < $upper_bound {
+                    return decode!($tag);
+                }
+                self.shift += BYTE_SHIFT;
+            };
+        }
+        try_decode!(0, 0x80);
+        try_decode!(0xC0, 0x800);
+        try_decode!(0xE0, MIN_SURROGATE_CODE);
+        decode!(0xF0)
+    }
+
+    /// Returns bounds on the number of bytes that remain to be yielded.
+    ///
+    /// The bounds combine the hint of the wrapped UTF-16 decoder with the
+    /// continuation bytes still owed for the code point currently being
+    /// emitted.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.iter.size_hint();
+        // `self.shift` is a *bit* offset: `next` consumes `BYTE_SHIFT` bits
+        // of it for every continuation byte it emits. Convert it to a byte
+        // count first; adding the raw bit offset would overstate the lower
+        // bound (e.g. 18 instead of 3) and break the `size_hint` contract.
+        let pending = usize::from(self.shift / BYTE_SHIFT);
+        (
+            // Every remaining code point produces at least one byte.
+            low.saturating_add(pending),
+            // No code point produces more than four bytes.
+            high.and_then(|x| x.checked_mul(4))
+                .and_then(|x| x.checked_add(pending)),
+        )
+    }
+}
+
+/// Iterator adapter that converts WTF-8 bytes into UTF-16 code units,
+/// yielding an error for input that `CodePoints` rejects.
+struct EncodeWide<I>
+where
+    I: Iterator<Item = u8>,
+{
+    iter: CodePoints<I>,
+    // Low half of a surrogate pair, buffered until the next call to `next`.
+    surrogate: Option<NonZeroU16>,
+}
+
+impl<I> EncodeWide<I>
+where
+    I: Iterator<Item = u8>,
+{
+    /// Wraps `string` without consuming any of it yet.
+    pub(in super::super) fn new<S>(string: S) -> Self
+    where
+        S: IntoIterator<IntoIter = I, Item = I::Item>,
+    {
+        Self {
+            iter: CodePoints::new(string),
+            surrogate: None,
+        }
+    }
+}
+
+impl<I> Iterator for EncodeWide<I>
+where
+    I: Iterator<Item = u8>,
+{
+    type Item = Result<u16>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // Finish a surrogate pair started by the previous call.
+        if let Some(surrogate) = self.surrogate.take() {
+            return Some(Ok(surrogate.get()));
+        }
+
+        self.iter.next().map(|code_point| {
+            code_point.map(|code_point| {
+                code_point
+                    .checked_sub(MIN_SURROGATE_CODE)
+                    .map(|offset| {
+                        // Code points above U+FFFF are split into a
+                        // high/low surrogate pair; the low half is
+                        // buffered for the next call.
+                        static_assert!(MIN_LOW_SURROGATE != 0);
+
+                        // SAFETY: The value is OR-ed with
+                        // `MIN_LOW_SURROGATE`, which the static assertion
+                        // above proves to be non-zero, so the result can
+                        // never be zero.
+                        self.surrogate = Some(unsafe {
+                            NonZeroU16::new_unchecked(
+                                (offset & 0x3FF) as u16 | MIN_LOW_SURROGATE,
+                            )
+                        });
+                        (offset >> 10) as u16 | MIN_HIGH_SURROGATE
+                    })
+                    .unwrap_or(code_point as u16)
+            })
+        })
+    }
+
+    /// Returns bounds on the number of items that remain to be yielded,
+    /// derived from the number of input bytes left.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.iter.inner_size_hint();
+        // A buffered low surrogate is one more item that is certain to be
+        // yielded.
+        let additional = self.surrogate.is_some().into();
+        (
+            // Rounded-up third of the remaining bytes.
+            (low.saturating_add(2) / 3).saturating_add(additional),
+            high.and_then(|x| x.checked_add(additional)),
+        )
+    }
+}
+
+pub(in super::super) fn encode_wide(
+    string: &[u8],
+) -> impl '_ + Iterator<Item = 
Result<u16>> { + EncodeWide::new(string.iter().copied()) +} diff --git a/src/windows/wtf8/mod.rs b/src/windows/wtf8/mod.rs new file mode 100644 index 0000000..d8b0dc4 --- /dev/null +++ b/src/windows/wtf8/mod.rs @@ -0,0 +1,18 @@ +// This module implements the WTF-8 encoding specification: +// https://simonsapin.github.io/wtf-8/ + +use super::EncodingError; +use super::Result; + +mod code_points; +pub(super) use code_points::CodePoints; + +mod convert; +pub(super) use convert::encode_wide; +pub(super) use convert::DecodeWide; + +if_raw_str! { + mod string; + pub(crate) use string::ends_with; + pub(crate) use string::starts_with; +} diff --git a/src/windows/wtf8/string.rs b/src/windows/wtf8/string.rs new file mode 100644 index 0000000..10b8faf --- /dev/null +++ b/src/windows/wtf8/string.rs @@ -0,0 +1,63 @@ +use crate::util::is_continuation; + +use super::encode_wide; + +const SURROGATE_LENGTH: usize = 3; + +pub(crate) fn ends_with(string: &[u8], mut suffix: &[u8]) -> bool { + let index = match string.len().checked_sub(suffix.len()) { + Some(index) => index, + None => return false, + }; + if let Some(&byte) = string.get(index) { + if is_continuation(byte) { + let index = index.checked_sub(1).expect("invalid string"); + let mut wide_surrogate = match suffix.get(..SURROGATE_LENGTH) { + Some(surrogate) => encode_wide(surrogate), + None => return false, + }; + let surrogate_wchar = wide_surrogate + .next() + .expect("failed decoding non-empty suffix"); + + if wide_surrogate.next().is_some() + || encode_wide(&string[index..]) + .take_while(Result::is_ok) + .nth(1) + != Some(surrogate_wchar) + { + return false; + } + suffix = &suffix[SURROGATE_LENGTH..]; + } + } + string.ends_with(suffix) +} + +pub(crate) fn starts_with(string: &[u8], mut prefix: &[u8]) -> bool { + if let Some(&byte) = string.get(prefix.len()) { + if is_continuation(byte) { + let index = match prefix.len().checked_sub(SURROGATE_LENGTH) { + Some(index) => index, + None => return false, + }; + let (substring, 
surrogate) = prefix.split_at(index); + let mut wide_surrogate = encode_wide(surrogate); + let surrogate_wchar = wide_surrogate + .next() + .expect("failed decoding non-empty prefix"); + + if surrogate_wchar.is_err() + || wide_surrogate.next().is_some() + || encode_wide(&string[index..]) + .next() + .expect("failed decoding non-empty substring") + != surrogate_wchar + { + return false; + } + prefix = substring; + } + } + string.starts_with(prefix) +} diff --git a/tests/common.rs b/tests/common.rs new file mode 100644 index 0000000..c0909bc --- /dev/null +++ b/tests/common.rs @@ -0,0 +1,94 @@ +#![allow(dead_code)] +#![warn(unsafe_op_in_unsafe_fn)] + +use std::borrow::Cow; +use std::ffi::OsStr; +use std::ffi::OsString; +#[cfg(feature = "raw_os_str")] +use std::mem; +use std::path::Path; +use std::path::PathBuf; +use std::result; + +use os_str_bytes::EncodingError; +use os_str_bytes::OsStrBytes; +use os_str_bytes::OsStringBytes; +#[cfg(feature = "raw_os_str")] +use os_str_bytes::RawOsStr; + +pub(crate) type Result<T> = result::Result<T, EncodingError>; + +pub(crate) const WTF8_STRING: &[u8] = b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9bar"; + +// SAFETY: This string is valid in WTF-8. +#[cfg(all(any(unix, windows), feature = "raw_os_str"))] +pub(crate) const RAW_WTF8_STRING: &RawOsStr = + unsafe { from_raw_bytes_unchecked(WTF8_STRING) }; + +#[cfg(feature = "raw_os_str")] +pub(crate) const unsafe fn from_raw_bytes_unchecked( + string: &[u8], +) -> &RawOsStr { + // SAFETY: This implementation detail can only be assumed by this crate. 
+ unsafe { mem::transmute(string) } +} + +#[track_caller] +fn test_from_bytes<'a, T, U, S>(result: &Result<U>, string: S) +where + S: Into<Cow<'a, [u8]>>, + T: 'a + AsRef<OsStr> + OsStrBytes + ?Sized, + U: AsRef<OsStr>, +{ + assert_eq!( + result.as_ref().map(AsRef::as_ref), + T::from_raw_bytes(string).as_deref().map(AsRef::as_ref), + ); +} + +pub(crate) fn from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> { + let os_string = OsStr::from_raw_bytes(string); + + test_from_bytes::<Path, _, _>(&os_string, string); + + os_string +} + +pub(crate) fn from_vec(string: Vec<u8>) -> Result<OsString> { + let os_string = OsString::from_raw_vec(string.clone()); + test_from_bytes::<OsStr, _, _>(&os_string, string.clone()); + + let path = PathBuf::from_raw_vec(string.clone()); + test_from_bytes::<Path, _, _>(&path, string); + assert_eq!(os_string, path.map(PathBuf::into_os_string)); + + os_string +} + +pub(crate) fn test_bytes(string: &[u8]) -> Result<()> { + let os_string = from_bytes(string)?; + assert_eq!(string.len(), os_string.len()); + assert_eq!(string, &*os_string.to_raw_bytes()); + Ok(()) +} + +pub(crate) fn test_vec(string: &[u8]) -> Result<()> { + let os_string = from_vec(string.to_owned())?; + assert_eq!(string.len(), os_string.len()); + assert_eq!(string, os_string.into_raw_vec()); + Ok(()) +} + +pub(crate) fn test_utf8_bytes(string: &str) { + let os_string = OsStr::new(string); + let string = string.as_bytes(); + assert_eq!(Ok(Cow::Borrowed(os_string)), from_bytes(string)); + assert_eq!(string, &*os_string.to_raw_bytes()); +} + +pub(crate) fn test_utf8_vec(string: &str) { + let os_string = string.to_owned().into(); + let string = string.as_bytes(); + assert_eq!(Ok(&os_string), from_vec(string.to_owned()).as_ref()); + assert_eq!(string, os_string.into_raw_vec()); +} diff --git a/tests/debug.rs b/tests/debug.rs new file mode 100644 index 0000000..c252deb --- /dev/null +++ b/tests/debug.rs @@ -0,0 +1,34 @@ +#![cfg(feature = "raw_os_str")] + +use os_str_bytes::RawOsStr; 
+ +mod common; +use common::RAW_WTF8_STRING; + +fn test(result: &str, string: &RawOsStr) { + assert_eq!(format!("RawOsStr({})", result), format!("{:?}", string)); + assert_eq!( + format!("RawOsString({})", result), + format!("{:?}", string.to_owned()), + ); +} + +#[test] +fn test_debug_empty() { + test("\"\"", RawOsStr::from_str("")); +} + +#[test] +fn test_debug_wft8() { + let wchar = if cfg!(unix) { + "\\xED\\xA0\\xBD" + } else { + "\\u{D83D}" + }; + test(&format!("\"foo{}\u{1F4A9}bar\"", wchar), RAW_WTF8_STRING); +} + +#[test] +fn test_debug_quote() { + test("\"foo\\\"bar\"", RawOsStr::from_str("foo\"bar")); +} diff --git a/tests/edge_cases.rs b/tests/edge_cases.rs new file mode 100644 index 0000000..a0fa529 --- /dev/null +++ b/tests/edge_cases.rs @@ -0,0 +1,7 @@ +mod common; +use common::test_bytes; + +#[test] +fn test_edge_cases() { + assert_eq!(Ok(()), test_bytes(b"\xED\xAB\xBE\xF4\x8D\xBC\x9A")); +} diff --git a/tests/index.rs b/tests/index.rs new file mode 100644 index 0000000..50abd6c --- /dev/null +++ b/tests/index.rs @@ -0,0 +1,86 @@ +#![cfg(feature = "raw_os_str")] + +use std::ops::Index; +use std::panic; +use std::panic::UnwindSafe; + +use os_str_bytes::RawOsStr; + +mod common; +use common::RAW_WTF8_STRING; + +#[test] +fn test_valid_indices() { + test(0); + test(1); + test(2); + test(3); + test(6); + test(10); + test(11); + test(12); + test(13); + + #[track_caller] + fn test(index: usize) { + let _ = RAW_WTF8_STRING.index(index..); + } +} + +macro_rules! 
test { + ( $name:ident , $index:literal , $code_point:expr ) => { + // https://github.com/rust-lang/rust/issues/88430 + #[test] + fn $name() { + let index_fn = || RAW_WTF8_STRING.index($index..); + if cfg!(unix) { + let _ = index_fn(); + return; + } + + let error = panic::catch_unwind(index_fn) + .expect_err("test did not panic as expected"); + let error: &String = + error.downcast_ref().expect("incorrect panic message type"); + assert_eq!( + concat!( + "byte index ", + $index, + " is not a valid boundary; it is inside ", + $code_point + ), + error, + ); + } + }; +} + +test!(test_index_4, 4, "U+D83D (bytes 3..6)"); + +test!(test_index_5, 5, "U+D83D (bytes 3..6)"); + +test!(test_index_7, 7, "U+1F4A9 (bytes 6..10)"); + +test!(test_index_8, 8, "U+1F4A9 (bytes 6..10)"); + +test!(test_index_9, 9, "U+1F4A9 (bytes 6..10)"); + +#[test] +fn test_index_panics() { + let string = RawOsStr::from_str("\u{F6}"); + test(|| string.index(1..2)); + test(|| string.index(0..1)); + test(|| string.index(1..)); + test(|| string.index(0..=0)); + test(|| string.index(..1)); + test(|| string.index(..=0)); + test(|| string.split_at(1)); + + #[track_caller] + fn test<F, R>(f: F) + where + F: FnOnce() -> R + UnwindSafe, + { + assert_eq!(!cfg!(unix), panic::catch_unwind(f).is_err()); + } +} diff --git a/tests/integration.rs b/tests/integration.rs new file mode 100644 index 0000000..0107fe5 --- /dev/null +++ b/tests/integration.rs @@ -0,0 +1,75 @@ +use std::str; + +mod common; +use common::test_bytes; +use common::test_utf8_bytes; +use common::test_utf8_vec; +use common::test_vec; +use common::Result; +use common::WTF8_STRING; + +const INVALID_STRING: &[u8] = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz"; + +const UTF8_STRING: &str = "string"; + +fn test_string_is_invalid_utf8(string: &[u8]) { + assert!(str::from_utf8(string).is_err()); +} + +fn test_invalid_result(result: &Result<()>) { + if cfg!(windows) { + assert!(result.is_err()); + } else { + assert_eq!(&Ok(()), result); + } +} + +#[test] +fn 
test_empty_bytes() { + test_utf8_bytes(""); +} + +#[test] +fn test_empty_vec() { + test_utf8_vec(""); +} + +#[test] +fn test_nonempty_utf8_bytes() { + test_utf8_bytes(UTF8_STRING); +} + +#[test] +fn test_nonempty_utf8_vec() { + test_utf8_vec(UTF8_STRING); +} + +#[test] +fn test_invalid_string_is_invalid_utf8() { + test_string_is_invalid_utf8(INVALID_STRING); +} + +#[test] +fn test_invalid_bytes() { + test_invalid_result(&test_bytes(INVALID_STRING)); +} + +#[test] +fn test_invalid_vec() { + test_invalid_result(&test_vec(INVALID_STRING)); +} + +#[test] +fn test_wtf8_string_is_invalid_utf8() { + test_string_is_invalid_utf8(WTF8_STRING); +} + +#[test] +fn test_wtf8_bytes() { + assert_eq!(Ok(()), test_bytes(WTF8_STRING)); +} + +#[test] +fn test_wtf8_vec() { + assert_eq!(Ok(()), test_vec(WTF8_STRING)); +} diff --git a/tests/random.rs b/tests/random.rs new file mode 100644 index 0000000..ad6e8d2 --- /dev/null +++ b/tests/random.rs @@ -0,0 +1,126 @@ +use std::borrow::Cow; +use std::ffi::OsStr; +use std::ffi::OsString; + +use getrandom::getrandom; + +use os_str_bytes::OsStrBytes; +use os_str_bytes::OsStringBytes; + +mod common; +use common::from_bytes; +use common::from_vec; + +const SMALL_LENGTH: usize = 16; + +const LARGE_LENGTH: usize = 1024; + +const ITERATIONS: usize = 1024; + +fn random_os_string( + buffer_length: usize, +) -> Result<OsString, getrandom::Error> { + let mut buffer = vec![0; buffer_length]; + #[cfg(unix)] + { + use std::os::unix::ffi::OsStringExt; + + getrandom(&mut buffer)?; + Ok(OsStringExt::from_vec(buffer)) + } + #[cfg(windows)] + { + use std::os::windows::ffi::OsStringExt; + use std::slice; + + getrandom(as_mut_bytes(&mut buffer))?; + return Ok(OsStringExt::from_wide(&buffer)); + + fn as_mut_bytes(buffer: &mut [u16]) -> &mut [u8] { + // SAFETY: [u16] can always be transmuted to two [u8] bytes. 
+ unsafe { + slice::from_raw_parts_mut( + buffer.as_mut_ptr() as *mut u8, + buffer.len() * 2, + ) + } + } + } + #[cfg(not(any(unix, windows)))] + Err(getrandom::Error::UNSUPPORTED) +} + +#[test] +fn test_random_bytes() -> Result<(), getrandom::Error> { + let os_string = random_os_string(LARGE_LENGTH)?; + let string = os_string.to_raw_bytes(); + assert_eq!(os_string.len(), string.len()); + assert_eq!(Ok(Cow::Borrowed(&*os_string)), from_bytes(&string)); + Ok(()) +} + +#[test] +fn test_random_vec() -> Result<(), getrandom::Error> { + let os_string = random_os_string(LARGE_LENGTH)?; + let string = os_string.clone().into_raw_vec(); + assert_eq!(os_string.len(), string.len()); + assert_eq!(Ok(os_string), from_vec(string)); + Ok(()) +} + +#[test] +fn test_lossless() -> Result<(), getrandom::Error> { + for _ in 0..ITERATIONS { + let mut string = vec![0; SMALL_LENGTH]; + getrandom(&mut string)?; + if let Ok(os_string) = OsStr::from_raw_bytes(&string) { + let encoded_string = os_string.to_raw_bytes(); + assert_eq!(string, &*encoded_string); + } + } + Ok(()) +} + +#[cfg(feature = "raw_os_str")] +#[test] +fn test_raw() -> Result<(), getrandom::Error> { + use os_str_bytes::RawOsStr; + use os_str_bytes::RawOsString; + + macro_rules! 
test { + ( + $result:expr , + $method:ident (& $string:ident , & $substring:ident ) + ) => { + #[allow(clippy::bool_assert_comparison)] + { + assert_eq!( + $result, + $string.$method(&$substring), + concat!(stringify!($method), "({:?}, {:?})"), + $string, + $substring, + ); + } + }; + } + + for _ in 0..ITERATIONS { + let mut string = random_os_string(SMALL_LENGTH)?; + let prefix = RawOsStr::new(&string).into_owned(); + let suffix = random_os_string(SMALL_LENGTH)?; + string.push(&suffix); + + let string = RawOsString::new(string); + let suffix = RawOsString::new(suffix); + + test!(true, ends_with_os(&string, &suffix)); + test!(true, starts_with_os(&string, &prefix)); + + if prefix != suffix { + test!(false, ends_with_os(&string, &prefix)); + test!(false, starts_with_os(&string, &suffix)); + } + } + Ok(()) +} diff --git a/tests/raw.rs b/tests/raw.rs new file mode 100644 index 0000000..fe29705 --- /dev/null +++ b/tests/raw.rs @@ -0,0 +1,108 @@ +#![cfg(feature = "raw_os_str")] + +use std::ffi::OsStr; + +use os_str_bytes::EncodingError; +use os_str_bytes::OsStrBytes; +use os_str_bytes::RawOsStr; + +mod common; +use common::RAW_WTF8_STRING; + +fn from_raw_bytes(string: &[u8]) -> Result<&RawOsStr, EncodingError> { + // SAFETY: The string is validated before conversion. 
+ OsStr::from_raw_bytes(string) + .map(|_| unsafe { common::from_raw_bytes_unchecked(string) }) +} + +#[test] +fn test_ends_with() { + test(true, b""); + test(true, b"r"); + test(true, b"ar"); + test(true, b"bar"); + if cfg!(not(windows)) { + test(true, b"\xA9bar"); + test(true, b"\x92\xA9bar"); + test(true, b"\x9F\x92\xA9bar"); + } + test(cfg!(windows), b"\xED\xB2\xA9bar"); + test(true, b"\xF0\x9F\x92\xA9bar"); + test(true, b"\xED\xA0\xBD\xF0\x9F\x92\xA9bar"); + test(true, b"o\xED\xA0\xBD\xF0\x9F\x92\xA9bar"); + test(true, b"oo\xED\xA0\xBD\xF0\x9F\x92\xA9bar"); + test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9bar"); + + test(false, b"\xED\xA0\xBDbar"); + test(false, b"\xED\xB2\xA9aar"); + + fn test(result: bool, suffix: &[u8]) { + let suffix = from_raw_bytes(suffix).unwrap(); + assert_eq!(result, RAW_WTF8_STRING.ends_with_os(suffix)); + } +} + +#[test] +fn test_empty_ends_with() { + macro_rules! test { + ( $result:expr , $string:expr , $substring:expr ) => { + #[allow(clippy::bool_assert_comparison)] + { + assert_eq!( + $result, + RawOsStr::from_str($string) + .ends_with_os(RawOsStr::from_str($substring)), + ); + } + }; + } + test!(true, "", ""); + test!(false, "", "r"); + test!(false, "", "ar"); +} + +#[test] +fn test_starts_with() { + test(true, b""); + test(true, b"f"); + test(true, b"fo"); + test(true, b"foo"); + test(true, b"foo\xED\xA0\xBD"); + if cfg!(not(windows)) { + test(true, b"foo\xED\xA0\xBD\xF0"); + test(true, b"foo\xED\xA0\xBD\xF0\x9F"); + test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92"); + } + test(cfg!(windows), b"foo\xED\xA0\xBD\xED\xA0\xBD"); + test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9"); + test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9b"); + test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9ba"); + test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9bar"); + + test(false, b"foo\xED\xB2\xA9"); + test(false, b"fof\xED\xA0\xBD\xED\xA0\xBD"); + + fn test(result: bool, prefix: &[u8]) { + let prefix = from_raw_bytes(prefix).unwrap(); + assert_eq!(result, 
RAW_WTF8_STRING.starts_with_os(prefix)); + } +} + +#[test] +fn test_empty_starts_with() { + macro_rules! test { + ( $result:expr , $string:expr , $substring:expr ) => { + #[allow(clippy::bool_assert_comparison)] + { + assert_eq!( + $result, + RawOsStr::from_str($string) + .starts_with_os(RawOsStr::from_str($substring)), + ); + } + }; + } + test!(true, "", ""); + test!(false, "", "f"); + test!(false, "", "fo"); +} |