about summary refs log tree commit diff
diff options
context:
space:
mode:
author Charisee <chiw@google.com> 2022-07-22 20:01:06 +0000
committer Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> 2022-07-22 20:01:06 +0000
commit 5336c3bdcab3e2a70a677d5deca50c3b2899dd3c (patch)
tree 7762bd080c3057ee3849952e47dc5ea5cd762863
parent 6210f1f71eb4646703a62f35bd7bfc0c17f37208 (diff)
parent df330aa3af20e55463488d359db22b4cf8188b82 (diff)
download os_str_bytes-5336c3bdcab3e2a70a677d5deca50c3b2899dd3c.tar.gz
Import source for the os_str_bytes crate am: 110dd358c8 am: bb3cea62ce am: df330aa3af
Original change: https://android-review.googlesource.com/c/platform/external/rust/crates/os_str_bytes/+/2154886
Change-Id: Ie39343b9b2e65447b189e0daea11d995dc804790
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
-rw-r--r-- COPYRIGHT 5
-rw-r--r-- Cargo.toml 33
l--------- LICENSE 1
-rw-r--r-- LICENSE-APACHE 201
-rw-r--r-- LICENSE-MIT 21
-rw-r--r-- METADATA 13
-rw-r--r-- MODULE_LICENSE_MIT 0
-rw-r--r-- OWNERS 1
-rw-r--r-- README.md 98
-rw-r--r-- rustfmt.toml 1
-rw-r--r-- src/common/mod.rs 41
-rw-r--r-- src/common/raw.rs 38
-rw-r--r-- src/iter.rs 113
-rw-r--r-- src/lib.rs 432
-rw-r--r-- src/pattern.rs 71
-rw-r--r-- src/raw_str.rs 1156
-rw-r--r-- src/util.rs 10
-rw-r--r-- src/wasm32/mod.rs 56
-rw-r--r-- src/wasm32/raw.rs 39
-rw-r--r-- src/windows/mod.rs 152
-rw-r--r-- src/windows/raw.rs 42
-rw-r--r-- src/windows/wtf8/code_points.rs 117
-rw-r--r-- src/windows/wtf8/convert.rs 166
-rw-r--r-- src/windows/wtf8/mod.rs 18
-rw-r--r-- src/windows/wtf8/string.rs 63
-rw-r--r-- tests/common.rs 94
-rw-r--r-- tests/debug.rs 34
-rw-r--r-- tests/edge_cases.rs 7
-rw-r--r-- tests/index.rs 86
-rw-r--r-- tests/integration.rs 75
-rw-r--r-- tests/random.rs 126
-rw-r--r-- tests/raw.rs 108
32 files changed, 3418 insertions, 0 deletions
diff --git a/COPYRIGHT b/COPYRIGHT
new file mode 100644
index 0000000..65dfcfc
--- /dev/null
+++ b/COPYRIGHT
@@ -0,0 +1,5 @@
+Copyright (c) 2019 dylni (https://github.com/dylni)
+
+Licensed under the Apache License, Version 2.0 <LICENSE-APACHE> or the MIT
+license <LICENSE-MIT>, at your option. All files in this project may not be
+copied, modified, or distributed except according to those terms.
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..1f7b398
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,33 @@
+[package]
+name = "os_str_bytes"
+version = "6.1.0"
+authors = ["dylni"]
+edition = "2021"
+rust-version = "1.57.0"
+description = """
+Utilities for converting between byte sequences and platform-native strings
+"""
+readme = "README.md"
+repository = "https://github.com/dylni/os_str_bytes"
+license = "MIT OR Apache-2.0"
+keywords = ["bytes", "osstr", "osstring", "path", "windows"]
+categories = ["command-line-interface", "development-tools::ffi", "encoding", "os", "rust-patterns"]
+exclude = [".*", "/rustfmt.toml", "/tests"]
+
+[package.metadata.docs.rs]
+all-features = true
+rustc-args = ["--cfg", "os_str_bytes_docs_rs"]
+rustdoc-args = ["--cfg", "os_str_bytes_docs_rs"]
+
+[dependencies]
+memchr = { version = "2.4", optional = true }
+print_bytes = { version = "0.6", optional = true }
+uniquote = { version = "3.0", optional = true }
+
+[dev-dependencies]
+getrandom = "0.2"
+
+[features]
+default = ["memchr", "raw_os_str"]
+
+raw_os_str = []
diff --git a/LICENSE b/LICENSE
new file mode 120000
index 0000000..7f9a88e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1 @@
+LICENSE-MIT
\ No newline at end of file
diff --git a/LICENSE-APACHE b/LICENSE-APACHE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/LICENSE-APACHE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 0000000..fd9dc88
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 dylni (https://github.com/dylni)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..e2c6635
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,13 @@
+name: "os_str_bytes"
+description:
+ "This crate allows interacting with the data stored by OsStr and OsString, without resorting to panics or corruption for invalid UTF-8. Thus, methods can be used that are already defined on [u8] and Vec<u8>."
+
+third_party {
+ url {
+ type: GIT
+ value: "https://github.com/dylni/os_str_bytes"
+ }
+ version: "6.1.0"
+ last_upgrade_date { year: 2022 month: 6 day: 29 }
+ license_type: NOTICE
+}
diff --git a/MODULE_LICENSE_MIT b/MODULE_LICENSE_MIT
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_MIT
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..45dc4dd
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1 @@
+include platform/prebuilts/rust:master:/OWNERS
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d0fe83b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,98 @@
+# OsStr Bytes
+
+This crate allows interacting with the data stored by [`OsStr`] and
+[`OsString`], without resorting to panics or corruption for invalid UTF-8.
+Thus, methods can be used that are already defined on [`[u8]`][slice] and
+[`Vec<u8>`].
+
+Typically, the only way to losslessly construct [`OsStr`] or [`OsString`] from
+a byte sequence is to use `OsStr::new(str::from_utf8(bytes)?)`, which requires
+the bytes to be valid in UTF-8. However, since this crate makes conversions
+directly between the platform encoding and raw bytes, even some strings invalid
+in UTF-8 can be converted.
+
+[![GitHub Build Status](https://github.com/dylni/os_str_bytes/workflows/build/badge.svg?branch=master)](https://github.com/dylni/os_str_bytes/actions?query=branch%3Amaster)
+
+## Usage
+
+Add the following lines to your "Cargo.toml" file:
+
+```toml
+[dependencies]
+os_str_bytes = "6.1"
+```
+
+See the [documentation] for available functionality and examples.
+
+## Rust version support
+
+The minimum supported Rust toolchain version depends on the platform:
+
+<table>
+ <tr>
+ <th>Target</th>
+ <th>Target Triple</th>
+ <th>Minimum Version</th>
+ </tr>
+ <tr>
+ <td>Fortanix</td>
+ <td><code>*-fortanix-*-sgx</code></td>
+ <td>nightly (<a href="https://doc.rust-lang.org/unstable-book/library-features/sgx-platform.html"><code>sgx_platform</code></a>)</td>
+ </tr>
+ <tr>
+ <td>HermitCore</td>
+ <td><code>*-*-hermit</code></td>
+ <td>1.57.0</td>
+ </tr>
+ <tr>
+ <td>SOLID</td>
+ <td><code>*-*-solid_asp3(-*)</code></td>
+ <td>1.57.0</td>
+ </tr>
+ <tr>
+ <td>Unix</td>
+ <td>Unix</td>
+ <td>1.57.0</td>
+ </tr>
+ <tr>
+ <td>WASI</td>
+ <td><code>*-wasi</code></td>
+ <td>1.57.0</td>
+ </tr>
+ <tr>
+ <td>WebAssembly</td>
+ <td><code>wasm32-*-unknown</code></td>
+ <td>1.57.0</td>
+ </tr>
+ <tr>
+ <td>Windows</td>
+ <td><code>*-*-windows-*</code></td>
+ <td>1.57.0</td>
+ </tr>
+</table>
+
+Minor version updates may increase these version requirements. However, the
+previous two Rust releases will always be supported. If the minimum Rust
+version must not be increased, use a tilde requirement to prevent updating this
+crate's minor version:
+
+```toml
+[dependencies]
+os_str_bytes = "~6.1"
+```
+
+## License
+
+Licensing terms are specified in [COPYRIGHT].
+
+Unless you explicitly state otherwise, any contribution submitted for inclusion
+in this crate, as defined in [LICENSE-APACHE], shall be licensed according to
+[COPYRIGHT], without any additional terms or conditions.
+
+[COPYRIGHT]: https://github.com/dylni/os_str_bytes/blob/master/COPYRIGHT
+[documentation]: https://docs.rs/os_str_bytes
+[LICENSE-APACHE]: https://github.com/dylni/os_str_bytes/blob/master/LICENSE-APACHE
+[slice]: https://doc.rust-lang.org/std/primitive.slice.html
+[`OsStr`]: https://doc.rust-lang.org/std/ffi/struct.OsStr.html
+[`OsString`]: https://doc.rust-lang.org/std/ffi/struct.OsString.html
+[`Vec<u8>`]: https://doc.rust-lang.org/std/vec/struct.Vec.html
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..a1ffd27
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1 @@
+max_width = 79
diff --git a/src/common/mod.rs b/src/common/mod.rs
new file mode 100644
index 0000000..dd49890
--- /dev/null
+++ b/src/common/mod.rs
@@ -0,0 +1,41 @@
+use std::borrow::Cow;
+use std::convert::Infallible;
+use std::ffi::OsStr;
+use std::ffi::OsString;
+use std::result;
+
+#[cfg(all(target_vendor = "fortanix", target_env = "sgx"))]
+use std::os::fortanix_sgx as os;
+#[cfg(target_os = "solid_asp3")]
+use std::os::solid as os;
+#[cfg(any(target_os = "hermit", unix))]
+use std::os::unix as os;
+#[cfg(target_os = "wasi")]
+use std::os::wasi as os;
+
+use os::ffi::OsStrExt;
+use os::ffi::OsStringExt;
+
+// Declares the `raw` submodule through the `if_raw_str!` macro, which is
+// defined outside this file.
+// NOTE(review): presumably the macro gates its contents on the
+// `raw_os_str` feature - confirm against the macro definition.
+if_raw_str! {
+ pub(super) mod raw;
+}
+
+// The conversions in this module always return `Ok`, so the error type is
+// the uninhabited `Infallible`.
+pub(super) type EncodingError = Infallible;
+
+// Module-local shorthand that fixes the error type to `EncodingError`.
+type Result<T> = result::Result<T, EncodingError>;
+
+// Reinterprets `string` as an `OsStr` with `OsStrExt::from_bytes` and
+// returns it as `Cow::Borrowed`. Always returns `Ok`.
+pub(super) fn os_str_from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> {
+ Ok(Cow::Borrowed(OsStrExt::from_bytes(string)))
+}
+
+// Exposes the bytes of `os_string` through `OsStrExt::as_bytes`, returned
+// as `Cow::Borrowed`.
+pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> {
+ Cow::Borrowed(OsStrExt::as_bytes(os_string))
+}
+
+// Turns the owned byte vector into an `OsString` with
+// `OsStringExt::from_vec`. Always returns `Ok`.
+pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> {
+ Ok(OsStringExt::from_vec(string))
+}
+
+// Consumes `os_string` and returns its bytes with
+// `OsStringExt::into_vec`.
+pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> {
+ OsStringExt::into_vec(os_string)
+}
diff --git a/src/common/raw.rs b/src/common/raw.rs
new file mode 100644
index 0000000..070a62c
--- /dev/null
+++ b/src/common/raw.rs
@@ -0,0 +1,38 @@
+use std::fmt;
+use std::fmt::Formatter;
+
+// In this `common` implementation no byte is ever reported as a
+// continuation byte: the argument is ignored and the result is `false`.
+#[inline(always)]
+pub(crate) const fn is_continuation(_: u8) -> bool {
+ false
+}
+
+// Not expected to be called in this `common` implementation; it panics
+// through `unreachable!` if it ever is.
+// NOTE(review): presumably callers only decode after `is_continuation`
+// returns `true`, which never happens here - confirm against the callers.
+#[inline(always)]
+pub(crate) fn decode_code_point(_: &[u8]) -> u32 {
+ unreachable!();
+}
+
+// Plain byte-slice suffix test; forwards to `<[u8]>::ends_with`.
+pub(crate) fn ends_with(string: &[u8], suffix: &[u8]) -> bool {
+ string.ends_with(suffix)
+}
+
+// Plain byte-slice prefix test; forwards to `<[u8]>::starts_with`.
+pub(crate) fn starts_with(string: &[u8], prefix: &[u8]) -> bool {
+ string.starts_with(prefix)
+}
+
+// Writes every byte of `string` to `f` as a `\xHH` escape, where `HH` is
+// the byte as two uppercase hexadecimal digits. Returns the first
+// formatter error encountered, otherwise `Ok(())`.
+pub(crate) fn debug(string: &[u8], f: &mut Formatter<'_>) -> fmt::Result {
+ for byte in string {
+ write!(f, "\\x{:02X}", byte)?;
+ }
+ Ok(())
+}
+
+// Escaping support, compiled only when the `uniquote` feature is enabled.
+#[cfg(feature = "uniquote")]
+pub(crate) mod uniquote {
+ use uniquote::Formatter;
+ use uniquote::Quote;
+ use uniquote::Result;
+
+ // Forwards to the `escape` method on the byte slice.
+ // NOTE(review): `escape` is presumably provided by the imported
+ // `uniquote::Quote` trait - confirm.
+ pub(crate) fn escape(string: &[u8], f: &mut Formatter<'_>) -> Result {
+ string.escape(f)
+ }
+}
diff --git a/src/iter.rs b/src/iter.rs
new file mode 100644
index 0000000..5cb7299
--- /dev/null
+++ b/src/iter.rs
@@ -0,0 +1,113 @@
+//! Iterators provided by this crate.
+
+#![cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))]
+
+use std::fmt;
+use std::fmt::Debug;
+use std::fmt::Formatter;
+use std::iter::FusedIterator;
+use std::str;
+
+use super::pattern::Encoded;
+use super::Pattern;
+use super::RawOsStr;
+
+// [memchr::memmem::FindIter] is not currently used, since this struct would
+// become self-referential. Additionally, that iterator does not implement
+// [DoubleEndedIterator], and its implementation would likely require
+// significant changes to implement that trait.
+/// The iterator returned by [`RawOsStr::split`].
+pub struct Split<'a, P>
+where
+ P: Pattern,
+{
+ // Portion of the input not yet yielded. It becomes `None` once the
+ // final substring has been returned.
+ string: Option<&'a RawOsStr>,
+ // Encoded form of the pattern, produced by `Pattern::__encode`.
+ pat: P::__Encoded,
+}
+
+impl<'a, P> Split<'a, P>
+where
+ P: Pattern,
+{
+ // Creates an iterator over the substrings of `string` separated by
+ // `pat`. The pattern is encoded once here and reused for every step.
+ //
+ // Panics with "cannot split using an empty pattern" when the encoded
+ // pattern is empty.
+ pub(super) fn new(string: &'a RawOsStr, pat: P) -> Self {
+ let pat = pat.__encode();
+ assert!(
+ !pat.__get().is_empty(),
+ "cannot split using an empty pattern",
+ );
+ Self {
+ string: Some(string),
+ pat,
+ }
+ }
+}
+
+// Shared body of `next` and `next_back`.
+//
+// - `$self`: the `Split` iterator being advanced.
+// - `$split_method`: method called on the remaining string with the
+// encoded pattern; its result is mapped as an `Option` of a pair.
+// - `$swap_fn`: turns that pair into `(substring to yield, remainder)`.
+macro_rules! impl_next {
+ ( $self:ident , $split_method:ident , $swap_fn:expr ) => {{
+ $self
+ // Return `None` from the enclosing method once exhausted.
+ .string?
+ .$split_method(&$self.pat)
+ .map(|substrings| {
+ let (substring, string) = $swap_fn(substrings);
+ // Keep the unconsumed side for the next call.
+ $self.string = Some(string);
+ substring
+ })
+ // No separator left: yield the remaining string as the final
+ // item and leave `None` behind.
+ .or_else(|| $self.string.take())
+ }};
+}
+
+impl<P> DoubleEndedIterator for Split<'_, P>
+where
+ P: Pattern,
+{
+ // Splits with `rsplit_once_raw`. The closure swaps the pair so that
+ // the suffix is yielded and the prefix stays as the remaining string.
+ fn next_back(&mut self) -> Option<Self::Item> {
+ impl_next!(self, rsplit_once_raw, |(prefix, suffix)| (suffix, prefix))
+ }
+}
+
+impl<'a, P> Iterator for Split<'a, P>
+where
+ P: Pattern,
+{
+ type Item = &'a RawOsStr;
+
+ // Obtains the last item with a single `next_back` call rather than
+ // stepping through the iterator from the front.
+ #[inline]
+ fn last(mut self) -> Option<Self::Item> {
+ self.next_back()
+ }
+
+ // Splits with `split_once_raw`. The pair is used as returned: the
+ // first element is yielded and the second becomes the remainder.
+ fn next(&mut self) -> Option<Self::Item> {
+ impl_next!(self, split_once_raw, |x| x)
+ }
+}
+
+// Copies the `string` reference and clones the encoded pattern.
+// NOTE(review): presumably written by hand so that `P` itself does not
+// need to be `Clone` - confirm.
+impl<P> Clone for Split<'_, P>
+where
+ P: Pattern,
+{
+ #[inline]
+ fn clone(&self) -> Self {
+ Self {
+ string: self.string,
+ pat: self.pat.clone(),
+ }
+ }
+}
+
+// Formats as a struct named `Split` with fields `string` and `pat`.
+impl<P> Debug for Split<'_, P>
+where
+ P: Pattern,
+{
+ #[inline]
+ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+ f.debug_struct("Split")
+ .field("string", &self.string)
+ .field(
+ "pat",
+ // The pattern is shown as a `&str`; this panics with
+ // "invalid pattern" if its bytes are not valid UTF-8.
+ &str::from_utf8(self.pat.__get()).expect("invalid pattern"),
+ )
+ .finish()
+ }
+}
+
+// After the final item `string` is `None`, so `next` and `next_back`
+// return `None` on every later call.
+impl<P> FusedIterator for Split<'_, P> where P: Pattern {}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..f73c2d5
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,432 @@
+//! This crate allows interacting with the data stored by [`OsStr`] and
+//! [`OsString`], without resorting to panics or corruption for invalid UTF-8.
+//! Thus, methods can be used that are already defined on [`[u8]`][slice] and
+//! [`Vec<u8>`].
+//!
+//! Typically, the only way to losslessly construct [`OsStr`] or [`OsString`]
+//! from a byte sequence is to use `OsStr::new(str::from_utf8(bytes)?)`, which
+//! requires the bytes to be valid in UTF-8. However, since this crate makes
+//! conversions directly between the platform encoding and raw bytes, even some
+//! strings invalid in UTF-8 can be converted.
+//!
+//! In most cases, [`RawOsStr`] and [`RawOsString`] should be used.
+//! [`OsStrBytes`] and [`OsStringBytes`] provide lower-level APIs that are
+//! easier to misuse.
+//!
+//! # Encoding
+//!
+//! The encoding of bytes returned or accepted by methods of this crate is
+//! intentionally left unspecified. It may vary for different platforms, so
+//! defining it would run contrary to the goal of generic string handling.
+//! However, the following invariants will always be upheld:
+//!
+//! - The encoding will be compatible with UTF-8. In particular, splitting an
+//! encoded byte sequence by a UTF-8–encoded character always produces other
+//! valid byte sequences. They can be re-encoded without error using
+//! [`OsStrBytes::from_raw_bytes`] and similar methods.
+//!
+//! - All characters valid in platform strings are representable. [`OsStr`] and
+//! [`OsString`] can always be losslessly reconstructed from extracted bytes.
+//!
+//! Note that the chosen encoding may not match how Rust stores these strings
+//! internally, which is undocumented. For instance, the result of calling
+//! [`OsStr::len`] will not necessarily match the number of bytes this crate
+//! uses to represent the same string.
+//!
+//! Additionally, concatenation may yield unexpected results without a UTF-8
+//! separator. If two platform strings need to be concatenated, the only safe
+//! way to do so is using [`OsString::push`]. This limitation also makes it
+//! undesirable to use the bytes in interchange.
+//!
+//! Since this encoding can change between versions and platforms, it should
+//! not be used for storage. The standard library provides implementations of
+//! [`OsStrExt`] and [`OsStringExt`] for various platforms, which should be
+//! preferred for that use case.
+//!
+//! # User Input
+//!
+//! Traits in this crate should ideally not be used to convert byte sequences
+//! that did not originate from [`OsStr`] or a related struct. The encoding
+//! used by this crate is an implementation detail, so it does not make sense
+//! to expose it to users.
+//!
+//! Crate [bstr] offers some useful alternative methods, such as
+//! [`ByteSlice::to_os_str`] and [`ByteVec::into_os_string`], that are meant
+//! for user input. But, they reject some byte sequences used to represent
+//! valid platform strings, which would be undesirable for reliable path
+//! handling. They are best used only when accepting unknown input.
+//!
+//! This crate is meant to help when you already have an instance of [`OsStr`]
+//! and need to modify the data in a lossless way.
+//!
+//! # Features
+//!
+//! These features are optional and can be enabled or disabled in a
+//! "Cargo.toml" file.
+//!
+//! ### Default Features
+//!
+//! - **memchr** -
+//! Changes the implementation to use crate [memchr] for better performance.
+//! This feature is useless when "raw\_os\_str" is disabled.
+//!
+//! For more information, see [`RawOsStr`][memchr complexity].
+//!
+//! - **raw\_os\_str** -
+//! Enables use of [`RawOsStr`] and [`RawOsString`].
+//!
+//! ### Optional Features
+//!
+//! - **print\_bytes** -
+//! Provides implementations of [`print_bytes::ToBytes`] for [`RawOsStr`] and
+//! [`RawOsString`].
+//!
+//! - **uniquote** -
+//! Provides implementations of [`uniquote::Quote`] for [`RawOsStr`] and
+//! [`RawOsString`].
+//!
+//! # Implementation
+//!
+//! Some methods return [`Cow`] to account for platform differences. However,
+//! no guarantee is made that the same variant of that enum will always be
+//! returned for the same platform. Whichever can be constructed most
+//! efficiently will be returned.
+//!
+//! All traits are [sealed], meaning that they can only be implemented by this
+//! crate. Otherwise, backward compatibility would be more difficult to
+//! maintain for new features.
+//!
+//! # Complexity
+//!
+//! The time complexities of trait methods will vary based on what
+//! functionality is available for the platform. At worst, they will all be
+//! linear, but some can take constant time. For example,
+//! [`OsStringBytes::from_raw_vec`] might be able to reuse the allocation for
+//! its argument.
+//!
+//! # Examples
+//!
+//! ```
+//! # #[cfg(any())]
+//! use std::env;
+//! use std::fs;
+//! # use std::io;
+//!
+//! use os_str_bytes::OsStrBytes;
+//!
+//! # mod env {
+//! # use std::env;
+//! # use std::ffi::OsString;
+//! #
+//! # pub fn args_os() -> impl Iterator<Item = OsString> {
+//! # let mut file = env::temp_dir();
+//! # file.push("os_str_bytes\u{E9}.txt");
+//! # return vec![OsString::new(), file.into_os_string()].into_iter();
+//! # }
+//! # }
+//! #
+//! for file in env::args_os().skip(1) {
+//! if file.to_raw_bytes().first() != Some(&b'-') {
+//! let string = "Hello, world!";
+//! fs::write(&file, string)?;
+//! assert_eq!(string, fs::read_to_string(file)?);
+//! }
+//! }
+//! #
+//! # Ok::<_, io::Error>(())
+//! ```
+//!
+//! [bstr]: https://crates.io/crates/bstr
+//! [`ByteSlice::to_os_str`]: https://docs.rs/bstr/0.2.12/bstr/trait.ByteSlice.html#method.to_os_str
+//! [`ByteVec::into_os_string`]: https://docs.rs/bstr/0.2.12/bstr/trait.ByteVec.html#method.into_os_string
+//! [memchr complexity]: RawOsStr#complexity
+//! [memchr]: https://crates.io/crates/memchr
+//! [`OsStrExt`]: ::std::os::unix::ffi::OsStrExt
+//! [`OsStringExt`]: ::std::os::unix::ffi::OsStringExt
+//! [sealed]: https://rust-lang.github.io/api-guidelines/future-proofing.html#c-sealed
+//! [print\_bytes]: https://crates.io/crates/print_bytes
+
+// Only require a nightly compiler when building documentation for docs.rs.
+// This is a private option that should not be used.
+// https://github.com/rust-lang/docs.rs/issues/147#issuecomment-389544407
+// https://github.com/dylni/os_str_bytes/issues/2
+#![cfg_attr(os_str_bytes_docs_rs, feature(doc_cfg))]
+// Nightly is also currently required for the SGX platform.
+#![cfg_attr(
+ all(target_vendor = "fortanix", target_env = "sgx"),
+ feature(sgx_platform)
+)]
+#![warn(unsafe_op_in_unsafe_fn)]
+#![warn(unused_results)]
+
+use std::borrow::Cow;
+use std::error::Error;
+use std::ffi::OsStr;
+use std::ffi::OsString;
+use std::fmt;
+use std::fmt::Display;
+use std::fmt::Formatter;
+use std::path::Path;
+use std::path::PathBuf;
+use std::result;
+
+macro_rules! if_raw_str { // gates every wrapped item on the `raw_os_str` feature
+ ( $($item:item)+ ) => {
+ $(
+ #[cfg(feature = "raw_os_str")] // repeated per item, not once for the group
+ $item
+ )+
+ };
+}
+
+#[cfg_attr(
+ all(target_arch = "wasm32", target_os = "unknown"),
+ path = "wasm32/mod.rs"
+)]
+#[cfg_attr(windows, path = "windows/mod.rs")]
+#[cfg_attr(
+ not(any(all(target_arch = "wasm32", target_os = "unknown"), windows)),
+ path = "common/mod.rs"
+)]
+mod imp;
+
+mod util;
+
+if_raw_str! {
+ pub mod iter;
+
+ mod pattern;
+ pub use pattern::Pattern;
+
+ mod raw_str;
+ pub use raw_str::RawOsStr;
+ pub use raw_str::RawOsString;
+}
+
+/// The error that occurs when a byte sequence is not representable in the
+/// platform encoding.
+///
+/// [`Result::unwrap`] should almost always be called on results containing
+/// this error. It should be known whether or not byte sequences are properly
+/// encoded for the platform, since [the module-level documentation][encoding]
+/// discourages using encoded bytes in interchange. Results are returned
+/// primarily to make panicking behavior explicit.
+///
+/// On Unix, this error is never returned, but [`OsStrExt`] or [`OsStringExt`]
+/// should be used instead if that needs to be guaranteed.
+///
+/// [encoding]: self#encoding
+/// [`OsStrExt`]: ::std::os::unix::ffi::OsStrExt
+/// [`OsStringExt`]: ::std::os::unix::ffi::OsStringExt
+/// [`Result::unwrap`]: ::std::result::Result::unwrap
+#[derive(Debug, Eq, PartialEq)]
+pub struct EncodingError(imp::EncodingError);
+
+impl Display for EncodingError {
+ #[inline]
+ fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result {
+ self.0.fmt(formatter)
+ }
+}
+
+impl Error for EncodingError {}
+
+type Result<T> = result::Result<T, EncodingError>;
+
+/// A platform agnostic variant of [`OsStrExt`].
+///
+/// For more information, see [the module-level documentation][module].
+///
+/// [module]: self
+/// [`OsStrExt`]: ::std::os::unix::ffi::OsStrExt
+pub trait OsStrBytes: private::Sealed + ToOwned {
+ /// Converts a byte slice into an equivalent platform-native string.
+ ///
+ /// Provided byte strings should always be valid for the [unspecified
+ /// encoding] used by this crate.
+ ///
+ /// # Errors
+ ///
+ /// See documentation for [`EncodingError`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use std::env;
+ /// use std::ffi::OsStr;
+ /// # use std::io;
+ ///
+ /// use os_str_bytes::OsStrBytes;
+ ///
+ /// let os_string = env::current_exe()?;
+ /// let os_bytes = os_string.to_raw_bytes();
+ /// assert_eq!(os_string, OsStr::from_raw_bytes(os_bytes).unwrap());
+ /// #
+ /// # Ok::<_, io::Error>(())
+ /// ```
+ ///
+ /// [unspecified encoding]: self#encoding
+ fn from_raw_bytes<'a, S>(string: S) -> Result<Cow<'a, Self>>
+ where
+ S: Into<Cow<'a, [u8]>>;
+
+ /// Converts a platform-native string into an equivalent byte slice.
+ ///
+ /// The returned byte string will use an [unspecified encoding].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use std::env;
+ /// # use std::io;
+ ///
+ /// use os_str_bytes::OsStrBytes;
+ ///
+ /// let os_string = env::current_exe()?;
+ /// println!("{:?}", os_string.to_raw_bytes());
+ /// #
+ /// # Ok::<_, io::Error>(())
+ /// ```
+ ///
+ /// [unspecified encoding]: self#encoding
+ #[must_use]
+ fn to_raw_bytes(&self) -> Cow<'_, [u8]>;
+}
+
+impl OsStrBytes for OsStr {
+ #[inline]
+ fn from_raw_bytes<'a, S>(string: S) -> Result<Cow<'a, Self>>
+ where
+ S: Into<Cow<'a, [u8]>>,
+ {
+ match string.into() { // dispatch on whether the caller handed over ownership
+ Cow::Borrowed(string) => {
+ imp::os_str_from_bytes(string).map_err(EncodingError) // wrap the platform error
+ }
+ Cow::Owned(string) => { // owned bytes take the `Vec`-based conversion
+ OsStringBytes::from_raw_vec(string).map(Cow::Owned)
+ }
+ }
+ }
+
+ #[inline]
+ fn to_raw_bytes(&self) -> Cow<'_, [u8]> {
+ imp::os_str_to_bytes(self)
+ }
+}
+
+impl OsStrBytes for Path {
+ #[inline]
+ fn from_raw_bytes<'a, S>(string: S) -> Result<Cow<'a, Self>>
+ where
+ S: Into<Cow<'a, [u8]>>,
+ {
+ OsStr::from_raw_bytes(string).map(|os_string| match os_string {
+ Cow::Borrowed(os_string) => Cow::Borrowed(Self::new(os_string)),
+ Cow::Owned(os_string) => Cow::Owned(os_string.into()),
+ })
+ }
+
+ #[inline]
+ fn to_raw_bytes(&self) -> Cow<'_, [u8]> {
+ self.as_os_str().to_raw_bytes()
+ }
+}
+
+/// A platform agnostic variant of [`OsStringExt`].
+///
+/// For more information, see [the module-level documentation][module].
+///
+/// [module]: self
+/// [`OsStringExt`]: ::std::os::unix::ffi::OsStringExt
+pub trait OsStringBytes: private::Sealed + Sized {
+ /// Converts a byte vector into an equivalent platform-native string.
+ ///
+ /// Provided byte strings should always be valid for the [unspecified
+ /// encoding] used by this crate.
+ ///
+ /// # Errors
+ ///
+ /// See documentation for [`EncodingError`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use std::env;
+ /// use std::ffi::OsString;
+ /// # use std::io;
+ ///
+ /// use os_str_bytes::OsStringBytes;
+ ///
+ /// let os_string = env::current_exe()?;
+ /// let os_bytes = os_string.clone().into_raw_vec();
+ /// assert_eq!(os_string, OsString::from_raw_vec(os_bytes).unwrap());
+ /// #
+ /// # Ok::<_, io::Error>(())
+ /// ```
+ ///
+ /// [unspecified encoding]: self#encoding
+ fn from_raw_vec(string: Vec<u8>) -> Result<Self>;
+
+ /// Converts a platform-native string into an equivalent byte vector.
+ ///
+ /// The returned byte string will use an [unspecified encoding].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use std::env;
+ /// # use std::io;
+ ///
+ /// use os_str_bytes::OsStringBytes;
+ ///
+ /// let os_string = env::current_exe()?;
+ /// println!("{:?}", os_string.into_raw_vec());
+ /// #
+ /// # Ok::<_, io::Error>(())
+ /// ```
+ ///
+ /// [unspecified encoding]: self#encoding
+ #[must_use]
+ fn into_raw_vec(self) -> Vec<u8>;
+}
+
+impl OsStringBytes for OsString {
+ #[inline]
+ fn from_raw_vec(string: Vec<u8>) -> Result<Self> {
+ imp::os_string_from_vec(string).map_err(EncodingError)
+ }
+
+ #[inline]
+ fn into_raw_vec(self) -> Vec<u8> {
+ imp::os_string_into_vec(self)
+ }
+}
+
+impl OsStringBytes for PathBuf {
+ #[inline]
+ fn from_raw_vec(string: Vec<u8>) -> Result<Self> {
+ OsString::from_raw_vec(string).map(Into::into)
+ }
+
+ #[inline]
+ fn into_raw_vec(self) -> Vec<u8> {
+ self.into_os_string().into_raw_vec()
+ }
+}
+
+mod private {
+ use std::ffi::OsStr;
+ use std::ffi::OsString;
+ use std::path::Path;
+ use std::path::PathBuf;
+
+ pub trait Sealed {} // supertrait of the public traits; this module is private to the crate
+ impl Sealed for char {} // implements `Pattern`
+ impl Sealed for OsStr {} // implements `OsStrBytes`
+ impl Sealed for OsString {} // implements `OsStringBytes`
+ impl Sealed for Path {} // implements `OsStrBytes`
+ impl Sealed for PathBuf {} // implements `OsStringBytes`
+ impl Sealed for &str {} // implements `Pattern`
+ impl Sealed for &String {} // implements `Pattern`
+}
diff --git a/src/pattern.rs b/src/pattern.rs
new file mode 100644
index 0000000..267a679
--- /dev/null
+++ b/src/pattern.rs
@@ -0,0 +1,71 @@
+use super::private;
+
+pub trait Encoded {
+ fn __get(&self) -> &[u8]; // the pattern as the bytes to search for
+}
+
+#[derive(Clone)]
+pub struct EncodedChar {
+ buffer: [u8; 4], // large enough for any UTF-8-encoded `char`
+ length: usize, // number of leading bytes of `buffer` actually used
+}
+
+impl Encoded for EncodedChar {
+ #[inline]
+ fn __get(&self) -> &[u8] {
+ &self.buffer[..self.length]
+ }
+}
+
+impl Encoded for &str {
+ #[inline]
+ fn __get(&self) -> &[u8] {
+ self.as_bytes()
+ }
+}
+
+/// Allows a type to be used for searching by [`RawOsStr`] and [`RawOsString`].
+///
+/// This trait is very similar to [`str::pattern::Pattern`], but its methods
+/// are private and it is implemented for different types.
+///
+/// [`RawOsStr`]: super::RawOsStr
+/// [`RawOsString`]: super::RawOsString
+/// [`str::pattern::Pattern`]: ::std::str::pattern::Pattern
+#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))]
+pub trait Pattern: private::Sealed {
+ #[doc(hidden)]
+ type __Encoded: Clone + Encoded;
+
+ #[doc(hidden)]
+ fn __encode(self) -> Self::__Encoded;
+}
+
+impl Pattern for char {
+ type __Encoded = EncodedChar;
+
+ fn __encode(self) -> Self::__Encoded {
+ let mut encoded = EncodedChar {
+ buffer: [0; 4],
+ length: 0, // placeholder; set right after encoding
+ };
+ encoded.length = self.encode_utf8(&mut encoded.buffer).len(); // bytes written
+ encoded
+ }
+}
+
+impl Pattern for &str {
+ type __Encoded = Self;
+
+ fn __encode(self) -> Self::__Encoded {
+ self
+ }
+}
+
+impl<'a> Pattern for &'a String {
+ type __Encoded = <&'a str as Pattern>::__Encoded;
+
+ fn __encode(self) -> Self::__Encoded {
+ (**self).__encode() // reuse the `&str` implementation
+ }
+}
diff --git a/src/raw_str.rs b/src/raw_str.rs
new file mode 100644
index 0000000..ccec858
--- /dev/null
+++ b/src/raw_str.rs
@@ -0,0 +1,1156 @@
+use std::borrow::Borrow;
+use std::borrow::Cow;
+use std::borrow::ToOwned;
+use std::ffi::OsStr;
+use std::ffi::OsString;
+use std::fmt;
+use std::fmt::Debug;
+use std::fmt::Display;
+use std::fmt::Formatter;
+use std::mem;
+use std::ops::Deref;
+use std::ops::Index;
+use std::ops::Range;
+use std::ops::RangeFrom;
+use std::ops::RangeFull;
+use std::ops::RangeInclusive;
+use std::ops::RangeTo;
+use std::ops::RangeToInclusive;
+use std::str;
+
+#[cfg(feature = "memchr")]
+use memchr::memmem::find;
+#[cfg(feature = "memchr")]
+use memchr::memmem::rfind;
+
+use super::imp::raw;
+use super::iter::Split;
+use super::pattern::Encoded as EncodedPattern;
+use super::OsStrBytes;
+use super::OsStringBytes;
+use super::Pattern;
+
+#[cfg(not(feature = "memchr"))] // fallback used when `memchr::memmem::find` is not imported
+fn find(string: &[u8], pat: &[u8]) -> Option<usize> {
+ for i in 0..=string.len().checked_sub(pat.len())? { // `?`: a longer pattern cannot match
+ if string[i..].starts_with(pat) {
+ return Some(i); // leftmost match wins
+ }
+ }
+ None
+}
+
+#[cfg(not(feature = "memchr"))] // fallback used when `memchr::memmem::rfind` is not imported
+fn rfind(string: &[u8], pat: &[u8]) -> Option<usize> {
+ for i in (pat.len()..=string.len()).rev() { // `i` is a candidate end index, tried from the right
+ if string[..i].ends_with(pat) {
+ return Some(i - pat.len()); // report where the match starts
+ }
+ }
+ None
+}
+
+macro_rules! impl_trim_matches {
+ ( $self:ident , $pat:expr , $strip_method:ident ) => {{
+ let pat = $pat.__encode();
+ let pat = pat.__get();
+ if pat.is_empty() { // an empty pattern always strips, so the loop below would never end
+ return $self;
+ }
+
+ let mut string = &$self.0;
+ while let Some(substring) = string.$strip_method(pat) { // strip until it no longer matches
+ string = substring;
+ }
+ Self::from_raw_bytes_unchecked(string)
+ }};
+}
+
+macro_rules! impl_split_once_raw {
+ ( $self:ident , $pat:expr , $find_fn:expr ) => {{
+ let pat = $pat.__get();
+
+ let index = $find_fn(&$self.0, pat)?; // `None` when the pattern does not occur
+ let prefix = &$self.0[..index];
+ let suffix = &$self.0[index + pat.len()..]; // the matched pattern itself is dropped
+ Some((
+ Self::from_raw_bytes_unchecked(prefix),
+ Self::from_raw_bytes_unchecked(suffix),
+ ))
+ }};
+}
+
+/// A container for the byte strings converted by [`OsStrBytes`].
+///
+/// This wrapper is intended to prevent violating the invariants of the
+/// [unspecified encoding] used by this crate and minimize encoding
+/// conversions.
+///
+/// Although this type is annotated with `#[repr(transparent)]`, the inner
+/// representation is not stable. Transmuting between this type and any other
+/// causes immediate undefined behavior.
+///
+/// # Indices
+///
+/// Methods of this struct that accept indices require that the index lie on a
+/// UTF-8 boundary. Although it is possible to manipulate platform strings
+/// based on other indices, this crate currently does not support them for
+/// slicing methods. They would add significant complication to the
+/// implementation and are generally not necessary. However, all indices
+/// returned by this struct can be used for slicing.
+///
+/// On Unix, all indices are permitted, to avoid false positives. However,
+/// relying on this implementation detail is discouraged. Platform-specific
+/// indices are error-prone.
+///
+/// # Complexity
+///
+/// All searching methods have worst-case multiplicative time complexity (i.e.,
+/// `O(self.raw_len() * pat.len())`). Enabling the "memchr" feature allows
+/// these methods to instead run in linear time in the worst case (documented
+/// for [`memchr::memmem::find`][memchr complexity]).
+///
+/// [memchr complexity]: memchr::memmem::find#complexity
+/// [unspecified encoding]: super#encoding
+#[derive(Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))]
+#[repr(transparent)]
+pub struct RawOsStr([u8]);
+
+impl RawOsStr {
+ fn from_raw_bytes_unchecked(string: &[u8]) -> &Self {
+ // SAFETY: `Self` is a `#[repr(transparent)]` wrapper around `[u8]`.
+ unsafe { mem::transmute(string) }
+ }
+
+ /// Converts a platform-native string into a representation that can be
+ /// more easily manipulated.
+ ///
+ /// This method performs the necessary conversion immediately, so it can be
+ /// expensive to call. It is recommended to continue using the returned
+ /// instance as long as possible (instead of the original [`OsStr`]), to
+ /// avoid repeated conversions.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use std::env;
+ /// # use std::io;
+ ///
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let os_string = env::current_exe()?.into_os_string();
+ /// println!("{:?}", RawOsStr::new(&os_string));
+ /// #
+ /// # Ok::<_, io::Error>(())
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn new(string: &OsStr) -> Cow<'_, Self> {
+ match string.to_raw_bytes() {
+ Cow::Borrowed(string) => {
+ Cow::Borrowed(Self::from_raw_bytes_unchecked(string))
+ }
+ Cow::Owned(string) => Cow::Owned(RawOsString(string)),
+ }
+ }
+
+ /// Wraps a string, without copying or encoding conversion.
+ ///
+ /// This method is much more efficient than [`RawOsStr::new`], since the
+ /// [encoding] used by this crate is compatible with UTF-8.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let string = "foobar";
+ /// let raw = RawOsStr::from_str(string);
+ /// assert_eq!(string, raw);
+ /// ```
+ ///
+ /// [encoding]: super#encoding
+ #[allow(clippy::should_implement_trait)]
+ #[inline]
+ #[must_use]
+ pub fn from_str(string: &str) -> &Self {
+ Self::from_raw_bytes_unchecked(string.as_bytes())
+ }
+
+ /// Returns the byte string stored by this container.
+ ///
+ /// The result will match what would be returned by
+ /// [`OsStrBytes::to_raw_bytes`] for the same string.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use std::env;
+ /// # use std::io;
+ ///
+ /// use os_str_bytes::OsStrBytes;
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let os_string = env::current_exe()?.into_os_string();
+ /// let raw = RawOsStr::new(&os_string);
+ /// assert_eq!(os_string.to_raw_bytes(), raw.as_raw_bytes());
+ /// #
+ /// # Ok::<_, io::Error>(())
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn as_raw_bytes(&self) -> &[u8] {
+ &self.0
+ }
+
+ /// Equivalent to [`str::contains`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("foobar");
+ /// assert!(raw.contains("oo"));
+ /// assert!(!raw.contains("of"));
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn contains<P>(&self, pat: P) -> bool
+ where
+ P: Pattern,
+ {
+ self.find(pat).is_some()
+ }
+
+ /// Equivalent to [`str::ends_with`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("foobar");
+ /// assert!(raw.ends_with("bar"));
+ /// assert!(!raw.ends_with("foo"));
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn ends_with<P>(&self, pat: P) -> bool
+ where
+ P: Pattern,
+ {
+ let pat = pat.__encode();
+ let pat = pat.__get();
+
+ self.0.ends_with(pat)
+ }
+
+ /// Equivalent to [`str::ends_with`] but accepts this type for the pattern.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("foobar");
+ /// assert!(raw.ends_with_os(RawOsStr::from_str("bar")));
+ /// assert!(!raw.ends_with_os(RawOsStr::from_str("foo")));
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn ends_with_os(&self, pat: &Self) -> bool {
+ raw::ends_with(&self.0, &pat.0)
+ }
+
+ /// Equivalent to [`str::find`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("foobar");
+ /// assert_eq!(Some(1), raw.find("o"));
+ /// assert_eq!(None, raw.find("of"));
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn find<P>(&self, pat: P) -> Option<usize>
+ where
+ P: Pattern,
+ {
+ let pat = pat.__encode();
+ let pat = pat.__get();
+
+ find(&self.0, pat)
+ }
+
+ /// Equivalent to [`str::is_empty`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// assert!(RawOsStr::from_str("").is_empty());
+ /// assert!(!RawOsStr::from_str("foobar").is_empty());
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn is_empty(&self) -> bool {
+ self.0.is_empty()
+ }
+
+ /// Returns the length of the byte string stored by this container.
+ ///
+ /// Only the following assumptions can be made about the result:
+ /// - The length of any Unicode character is the length of its UTF-8
+ /// representation (i.e., [`char::len_utf8`]).
+ /// - Splitting a string at a UTF-8 boundary will return two strings with
+ /// lengths that sum to the length of the original string.
+ ///
+ /// This method may return a different result than would [`OsStr::len`]
+ /// when called on the same string, since [`OsStr`] uses an unspecified
+ /// encoding.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// assert_eq!(6, RawOsStr::from_str("foobar").raw_len());
+ /// assert_eq!(0, RawOsStr::from_str("").raw_len());
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn raw_len(&self) -> usize {
+ self.0.len()
+ }
+
+ /// Equivalent to [`str::rfind`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("foobar");
+ /// assert_eq!(Some(2), raw.rfind("o"));
+ /// assert_eq!(None, raw.rfind("of"));
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn rfind<P>(&self, pat: P) -> Option<usize>
+ where
+ P: Pattern,
+ {
+ let pat = pat.__encode();
+ let pat = pat.__get();
+
+ rfind(&self.0, pat)
+ }
+
+ pub(super) fn rsplit_once_raw<P>(&self, pat: &P) -> Option<(&Self, &Self)>
+ where
+ P: EncodedPattern,
+ {
+ impl_split_once_raw!(self, pat, rfind)
+ }
+
+ /// Equivalent to [`str::rsplit_once`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("foobar");
+ /// assert_eq!(
+ /// Some((RawOsStr::from_str("fo"), RawOsStr::from_str("bar"))),
+ /// raw.rsplit_once("o"),
+ /// );
+ /// assert_eq!(None, raw.rsplit_once("of"));
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn rsplit_once<P>(&self, pat: P) -> Option<(&Self, &Self)>
+ where
+ P: Pattern,
+ {
+ self.rsplit_once_raw(&pat.__encode())
+ }
+
+ // https://github.com/rust-lang/rust/blob/49c68bd53f90e375bfb3cbba8c1c67a9e0adb9c0/src/libcore/str/mod.rs#L2184-L2221
+ #[cold]
+ #[inline(never)]
+ #[track_caller]
+ fn index_boundary_error(&self, index: usize) -> ! {
+ debug_assert!(raw::is_continuation(self.0[index])); // only reached for a continuation byte
+
+ let start = self.0[..index] // walk back to the nearest non-continuation byte
+ .iter()
+ .rposition(|&x| !raw::is_continuation(x))
+ .expect("invalid raw bytes");
+ let mut end = index + 1; // then forward to the next non-continuation byte
+ end += self.0[end..]
+ .iter()
+ .position(|&x| !raw::is_continuation(x))
+ .unwrap_or_else(|| self.raw_len() - end); // none found: run to the end of the string
+ let code_point = raw::decode_code_point(&self.0[start..end]);
+ panic!(
+ "byte index {} is not a valid boundary; it is inside U+{:04X} \
+ (bytes {}..{})",
+ index, code_point, start, end,
+ );
+ }
+
+ #[track_caller]
+ fn check_bound(&self, index: usize) {
+ if let Some(&byte) = self.0.get(index) { // `index >= len` is not rejected here
+ if raw::is_continuation(byte) { // `index` points into the middle of a code point
+ self.index_boundary_error(index);
+ }
+ }
+ }
+
+ /// Equivalent to [`str::split`], but empty patterns are not accepted.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range or empty.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("foobar");
+ /// assert_eq!(["f", "", "bar"], *raw.split("o").collect::<Vec<_>>());
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn split<P>(&self, pat: P) -> Split<'_, P>
+ where
+ P: Pattern,
+ {
+ Split::new(self, pat)
+ }
+
+ /// Equivalent to [`str::split_at`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the index is not a [valid boundary].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("foobar");
+ /// assert_eq!(
+ /// ((RawOsStr::from_str("fo"), RawOsStr::from_str("obar"))),
+ /// raw.split_at(2),
+ /// );
+ /// ```
+ ///
+ /// [valid boundary]: #indices
+ #[inline]
+ #[must_use]
+ pub fn split_at(&self, mid: usize) -> (&Self, &Self) {
+ self.check_bound(mid);
+
+ let (prefix, suffix) = self.0.split_at(mid);
+ (
+ Self::from_raw_bytes_unchecked(prefix),
+ Self::from_raw_bytes_unchecked(suffix),
+ )
+ }
+
+ pub(super) fn split_once_raw<P>(&self, pat: &P) -> Option<(&Self, &Self)>
+ where
+ P: EncodedPattern,
+ {
+ impl_split_once_raw!(self, pat, find)
+ }
+
+ /// Equivalent to [`str::split_once`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("foobar");
+ /// assert_eq!(
+ /// Some((RawOsStr::from_str("f"), RawOsStr::from_str("obar"))),
+ /// raw.split_once("o"),
+ /// );
+ /// assert_eq!(None, raw.split_once("of"));
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn split_once<P>(&self, pat: P) -> Option<(&Self, &Self)>
+ where
+ P: Pattern,
+ {
+ self.split_once_raw(&pat.__encode())
+ }
+
+ /// Equivalent to [`str::starts_with`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("foobar");
+ /// assert!(raw.starts_with("foo"));
+ /// assert!(!raw.starts_with("bar"));
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn starts_with<P>(&self, pat: P) -> bool
+ where
+ P: Pattern,
+ {
+ let pat = pat.__encode();
+ let pat = pat.__get();
+
+ self.0.starts_with(pat)
+ }
+
+ /// Equivalent to [`str::starts_with`] but accepts this type for the
+ /// pattern.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("foobar");
+ /// assert!(raw.starts_with_os(RawOsStr::from_str("foo")));
+ /// assert!(!raw.starts_with_os(RawOsStr::from_str("bar")));
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn starts_with_os(&self, pat: &Self) -> bool {
+ raw::starts_with(&self.0, &pat.0)
+ }
+
+ /// Equivalent to [`str::strip_prefix`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("111foo1bar111");
+ /// assert_eq!(
+ /// Some(RawOsStr::from_str("11foo1bar111")),
+ /// raw.strip_prefix("1"),
+ /// );
+ /// assert_eq!(None, raw.strip_prefix("o"));
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn strip_prefix<P>(&self, pat: P) -> Option<&Self>
+ where
+ P: Pattern,
+ {
+ let pat = pat.__encode();
+ let pat = pat.__get();
+
+ self.0.strip_prefix(pat).map(Self::from_raw_bytes_unchecked)
+ }
+
+ /// Equivalent to [`str::strip_suffix`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("111foo1bar111");
+ /// assert_eq!(
+ /// Some(RawOsStr::from_str("111foo1bar11")),
+ /// raw.strip_suffix("1"),
+ /// );
+ /// assert_eq!(None, raw.strip_suffix("o"));
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn strip_suffix<P>(&self, pat: P) -> Option<&Self>
+ where
+ P: Pattern,
+ {
+ let pat = pat.__encode();
+ let pat = pat.__get();
+
+ self.0.strip_suffix(pat).map(Self::from_raw_bytes_unchecked)
+ }
+
+ /// Converts this representation back to a platform-native string.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use std::env;
+ /// # use std::io;
+ ///
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let os_string = env::current_exe()?.into_os_string();
+ /// let raw = RawOsStr::new(&os_string);
+ /// assert_eq!(os_string, raw.to_os_str());
+ /// #
+ /// # Ok::<_, io::Error>(())
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn to_os_str(&self) -> Cow<'_, OsStr> {
+ OsStr::from_raw_bytes(&self.0).expect("invalid raw bytes")
+ }
+
+ /// Equivalent to [`OsStr::to_str`].
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let string = "foobar";
+ /// let raw = RawOsStr::from_str(string);
+ /// assert_eq!(Some(string), raw.to_str());
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn to_str(&self) -> Option<&str> {
+ str::from_utf8(&self.0).ok()
+ }
+
+ /// Converts this string to the best UTF-8 representation possible.
+ ///
+ /// Invalid sequences will be replaced with
+ /// [`char::REPLACEMENT_CHARACTER`].
+ ///
+ /// This method may return a different result than would
+ /// [`OsStr::to_string_lossy`] when called on the same string, since
+ /// [`OsStr`] uses an unspecified encoding.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use std::env;
+ /// # use std::io;
+ ///
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let os_string = env::current_exe()?.into_os_string();
+ /// let raw = RawOsStr::new(&os_string);
+ /// println!("{}", raw.to_str_lossy());
+ /// #
+ /// # Ok::<_, io::Error>(())
+ /// ```
+ #[inline]
+ #[must_use]
+ pub fn to_str_lossy(&self) -> Cow<'_, str> {
+ String::from_utf8_lossy(&self.0)
+ }
+
+ /// Equivalent to [`str::trim_end_matches`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("111foo1bar111");
+ /// assert_eq!("111foo1bar", raw.trim_end_matches("1"));
+ /// assert_eq!("111foo1bar111", raw.trim_end_matches("o"));
+ /// ```
+ #[must_use]
+ pub fn trim_end_matches<P>(&self, pat: P) -> &Self
+ where
+ P: Pattern,
+ {
+ impl_trim_matches!(self, pat, strip_suffix) // shared trimming macro, given `strip_suffix` as the stripping method
+ }
+
+ /// Equivalent to [`str::trim_start_matches`].
+ ///
+ /// # Panics
+ ///
+ /// Panics if the pattern is a byte outside of the ASCII range.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use os_str_bytes::RawOsStr;
+ ///
+ /// let raw = RawOsStr::from_str("111foo1bar111");
+ /// assert_eq!("foo1bar111", raw.trim_start_matches("1"));
+ /// assert_eq!("111foo1bar111", raw.trim_start_matches("o"));
+ /// ```
+ #[must_use]
+ pub fn trim_start_matches<P>(&self, pat: P) -> &Self
+ where
+ P: Pattern,
+ {
+ impl_trim_matches!(self, pat, strip_prefix) // shared trimming macro, given `strip_prefix` as the stripping method
+ }
+}
+
+impl AsRef<Self> for RawOsStr {
+ #[inline]
+ fn as_ref(&self) -> &Self {
+ self // identity conversion
+ }
+}
+
+impl AsRef<RawOsStr> for str {
+ #[inline]
+ fn as_ref(&self) -> &RawOsStr {
+ RawOsStr::from_str(self) // wraps the string slice through `RawOsStr::from_str`
+ }
+}
+
+impl AsRef<RawOsStr> for String {
+ #[inline]
+ fn as_ref(&self) -> &RawOsStr {
+ (**self).as_ref() // dereference to `str` and reuse its `AsRef<RawOsStr>` impl
+ }
+}
+
+impl Default for &RawOsStr {
+ #[inline]
+ fn default() -> Self {
+ RawOsStr::from_str("") // the default value is the empty string
+ }
+}
+
+impl<'a> From<&'a RawOsStr> for Cow<'a, RawOsStr> {
+ #[inline]
+ fn from(other: &'a RawOsStr) -> Self {
+ Cow::Borrowed(other) // keeps the borrow; nothing is copied
+ }
+}
+
+macro_rules! r#impl { // generates `Index<$index_type>` for `RawOsStr`
+ (
+ $index_type:ty
+ $(, $index_var:ident , $first_bound:expr $(, $second_bound:expr)?)? // optional: a name for `&idx` plus one or two bounds to check
+ ) => {
+ impl Index<$index_type> for RawOsStr {
+ type Output = Self;
+
+ #[inline]
+ fn index(&self, idx: $index_type) -> &Self::Output {
+ $(
+ let $index_var = &idx;
+ self.check_bound($first_bound); // `check_bound` is defined outside this view; presumably panics on an invalid boundary (confirm)
+ $(self.check_bound($second_bound);)?
+ )?
+
+ Self::from_raw_bytes_unchecked(&self.0[idx]) // slice the bytes, then wrap them again
+ }
+ }
+ };
+}
+r#impl!(Range<usize>, x, x.start, x.end);
+r#impl!(RangeFrom<usize>, x, x.start);
+r#impl!(RangeFull); // no bounds to check
+// [usize::MAX] will always be a valid inclusive end index.
+#[rustfmt::skip]
+r#impl!(RangeInclusive<usize>, x, *x.start(), x.end().wrapping_add(1)); // inclusive end is checked as the exclusive bound `end + 1`
+r#impl!(RangeTo<usize>, x, x.end);
+r#impl!(RangeToInclusive<usize>, x, x.end.wrapping_add(1));
+
+impl ToOwned for RawOsStr {
+    type Owned = RawOsString;
+
+    /// Copies the bytes into a newly allocated [`RawOsString`].
+    #[inline]
+    fn to_owned(&self) -> Self::Owned {
+        let bytes = self.0.to_vec();
+        RawOsString(bytes)
+    }
+}
+
+/// An owned container for the byte strings converted by [`OsStringBytes`].
+///
+/// The bytes use this crate's [unspecified encoding]; for more information, see [`RawOsStr`].
+///
+/// [unspecified encoding]: super#encoding
+#[derive(Clone, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
+#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "raw_os_str")))]
+pub struct RawOsString(Vec<u8>);
+
+impl RawOsString {
+    /// Takes ownership of a platform-native string and converts it into a
+    /// representation that is easier to manipulate.
+    ///
+    /// For more information, see [`RawOsStr::new`].
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::env;
+    /// # use std::io;
+    ///
+    /// use os_str_bytes::RawOsString;
+    ///
+    /// let os_string = env::current_exe()?.into_os_string();
+    /// println!("{:?}", RawOsString::new(os_string));
+    /// #
+    /// # Ok::<_, io::Error>(())
+    /// ```
+    #[inline]
+    #[must_use]
+    pub fn new(string: OsString) -> Self {
+        let bytes = string.into_raw_vec();
+        Self(bytes)
+    }
+
+    /// Wraps a [`String`] directly, with no copy and no encoding
+    /// conversion.
+    ///
+    /// This is much cheaper than [`RawOsString::new`], because the
+    /// [encoding] used by this crate is compatible with UTF-8.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use os_str_bytes::RawOsString;
+    ///
+    /// let string = "foobar".to_owned();
+    /// let raw = RawOsString::from_string(string.clone());
+    /// assert_eq!(string, raw);
+    /// ```
+    ///
+    /// [encoding]: super#encoding
+    #[inline]
+    #[must_use]
+    pub fn from_string(string: String) -> Self {
+        let bytes = string.into_bytes();
+        Self(bytes)
+    }
+
+    /// Turns the stored bytes back into a platform-native string.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::env;
+    /// # use std::io;
+    ///
+    /// use os_str_bytes::RawOsString;
+    ///
+    /// let os_string = env::current_exe()?.into_os_string();
+    /// let raw = RawOsString::new(os_string.clone());
+    /// assert_eq!(os_string, raw.into_os_string());
+    /// #
+    /// # Ok::<_, io::Error>(())
+    /// ```
+    #[inline]
+    #[must_use]
+    pub fn into_os_string(self) -> OsString {
+        let Self(bytes) = self;
+        OsString::from_raw_vec(bytes).expect("invalid raw bytes")
+    }
+
+    /// Unwraps the byte string held by this container.
+    ///
+    /// The bytes equal what [`OsStringBytes::into_raw_vec`] would return
+    /// for the same string.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::env;
+    /// # use std::io;
+    ///
+    /// use os_str_bytes::OsStringBytes;
+    /// use os_str_bytes::RawOsString;
+    ///
+    /// let os_string = env::current_exe()?.into_os_string();
+    /// let raw = RawOsString::new(os_string.clone());
+    /// assert_eq!(os_string.into_raw_vec(), raw.into_raw_vec());
+    /// #
+    /// # Ok::<_, io::Error>(())
+    /// ```
+    #[inline]
+    #[must_use]
+    pub fn into_raw_vec(self) -> Vec<u8> {
+        let Self(bytes) = self;
+        bytes
+    }
+
+    /// Converts into a [`String`] when the bytes are valid UTF-8, like
+    /// [`OsString::into_string`]; otherwise hands the container back.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use os_str_bytes::RawOsString;
+    ///
+    /// let string = "foobar".to_owned();
+    /// let raw = RawOsString::from_string(string.clone());
+    /// assert_eq!(Ok(string), raw.into_string());
+    /// ```
+    #[inline]
+    pub fn into_string(self) -> Result<String, Self> {
+        match String::from_utf8(self.0) {
+            Ok(text) => Ok(text),
+            // Recover the original bytes from the error without copying.
+            Err(error) => Err(Self(error.into_bytes())),
+        }
+    }
+}
+
+impl AsRef<RawOsStr> for RawOsString {
+ #[inline]
+ fn as_ref(&self) -> &RawOsStr {
+ self // relies on deref coercion through the `Deref` impl
+ }
+}
+
+impl Borrow<RawOsStr> for RawOsString {
+ #[inline]
+ fn borrow(&self) -> &RawOsStr {
+ self // relies on deref coercion through the `Deref` impl
+ }
+}
+
+impl Deref for RawOsString {
+ type Target = RawOsStr;
+
+ #[inline]
+ fn deref(&self) -> &Self::Target {
+ RawOsStr::from_raw_bytes_unchecked(&self.0) // views the owned bytes as a borrowed `RawOsStr`
+ }
+}
+
+impl From<String> for RawOsString {
+ #[inline]
+ fn from(other: String) -> Self {
+ Self::from_string(other) // same as calling `from_string` directly
+ }
+}
+
+impl From<RawOsString> for Cow<'_, RawOsStr> {
+ #[inline]
+ fn from(other: RawOsString) -> Self {
+ Cow::Owned(other) // moves the owned string into the `Cow`
+ }
+}
+
+macro_rules! r#impl { // generates `Index<$index_type>` for `RawOsString`
+ ( $index_type:ty ) => {
+ impl Index<$index_type> for RawOsString {
+ type Output = RawOsStr;
+
+ #[inline]
+ fn index(&self, idx: $index_type) -> &Self::Output {
+ &(**self)[idx] // deref to `RawOsStr` and reuse its `Index` impl
+ }
+ }
+ };
+}
+r#impl!(Range<usize>);
+r#impl!(RangeFrom<usize>);
+r#impl!(RangeFull);
+r#impl!(RangeInclusive<usize>);
+r#impl!(RangeTo<usize>);
+r#impl!(RangeToInclusive<usize>);
+
+struct Buffer<'a>(&'a [u8]); // borrowed raw bytes, rendered as a quoted string by the `Debug` impl
+
+impl Debug for Buffer<'_> {
+ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+ f.write_str("\"")?;
+
+ let mut string = self.0; // bytes that have not been written yet
+ let mut invalid_length = 0; // length of the non-UTF-8 run at the front of `string`
+ while !string.is_empty() {
+ let (invalid, substring) = string.split_at(invalid_length);
+
+ let valid = match str::from_utf8(substring) {
+ Ok(valid) => {
+ string = &[]; // the rest is valid UTF-8, so this is the final pass
+ valid
+ }
+ Err(error) => {
+ let (valid, substring) =
+ substring.split_at(error.valid_up_to());
+
+ let invalid_char_length =
+ error.error_len().unwrap_or_else(|| substring.len()); // `None`: input ended mid-sequence, so the whole tail is invalid
+ if valid.is_empty() {
+ invalid_length += invalid_char_length; // nothing valid yet: grow the invalid run and retry
+ continue;
+ }
+ string = substring; // next pass starts at the newly found invalid sequence
+ invalid_length = invalid_char_length;
+
+ // SAFETY: This slice was validated to be UTF-8.
+ unsafe { str::from_utf8_unchecked(valid) }
+ }
+ };
+
+ raw::debug(invalid, f)?; // platform module renders the invalid bytes
+ Display::fmt(&valid.escape_debug(), f)?; // valid text is written with debug escapes
+ }
+
+ f.write_str("\"")
+ }
+}
+
+macro_rules! r#impl { // generates a `Debug` impl that prints `TypeName("...")`
+ ( $type:ty ) => {
+ impl Debug for $type {
+ #[inline]
+ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+ f.debug_tuple(stringify!($type)) // tuple name is the type name as written at the call site
+ .field(&Buffer(&self.0)) // `Buffer` formats the bytes as a quoted, escaped string
+ .finish()
+ }
+ }
+ };
+}
+r#impl!(RawOsStr);
+r#impl!(RawOsString);
+
+macro_rules! r#impl { // generates `PartialEq` between `$type` and `$other_type`, in both operand orders
+ ( $type:ty , $other_type:ty ) => {
+ impl PartialEq<$other_type> for $type {
+ #[inline]
+ fn eq(&self, other: &$other_type) -> bool {
+ let raw: &RawOsStr = self; // coerce the left operand to `&RawOsStr`
+ let other: &RawOsStr = other.as_ref(); // convert the right operand through `AsRef<RawOsStr>`
+ raw == other
+ }
+ }
+
+ impl PartialEq<$type> for $other_type {
+ #[inline]
+ fn eq(&self, other: &$type) -> bool {
+ other == self // delegate to the impl generated above
+ }
+ }
+ };
+}
+r#impl!(RawOsStr, RawOsString);
+r#impl!(&RawOsStr, RawOsString);
+r#impl!(RawOsStr, str);
+r#impl!(RawOsStr, String);
+r#impl!(&RawOsStr, String);
+r#impl!(RawOsString, str);
+r#impl!(RawOsString, &str);
+r#impl!(RawOsString, String);
+
+#[cfg(feature = "print_bytes")]
+#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "print_bytes")))]
+mod print_bytes { // `ToBytes` impls, compiled only with the "print_bytes" feature
+ use print_bytes::ByteStr;
+ use print_bytes::ToBytes;
+ #[cfg(windows)]
+ use print_bytes::WideStr;
+
+ #[cfg(windows)]
+ use crate::imp::raw;
+
+ use super::RawOsStr;
+ use super::RawOsString;
+
+ impl ToBytes for RawOsStr {
+ #[inline]
+ fn to_bytes(&self) -> ByteStr<'_> {
+ self.0.to_bytes() // reuse the impl for the inner byte slice
+ }
+
+ #[cfg(windows)]
+ #[inline]
+ fn to_wide(&self) -> Option<WideStr> {
+ Some(WideStr::new(raw::encode_wide_unchecked(&self.0).collect())) // re-encode the bytes as 16-bit units
+ }
+ }
+
+ impl ToBytes for RawOsString {
+ #[inline]
+ fn to_bytes(&self) -> ByteStr<'_> {
+ (**self).to_bytes() // delegate to the `RawOsStr` impl
+ }
+
+ #[cfg(windows)]
+ #[inline]
+ fn to_wide(&self) -> Option<WideStr> {
+ (**self).to_wide() // delegate to the `RawOsStr` impl
+ }
+ }
+}
+
+#[cfg(feature = "uniquote")]
+#[cfg_attr(os_str_bytes_docs_rs, doc(cfg(feature = "uniquote")))]
+mod uniquote { // `Quote` impls, compiled only with the "uniquote" feature
+ use uniquote::Formatter;
+ use uniquote::Quote;
+ use uniquote::Result;
+
+ use crate::imp::raw;
+
+ use super::RawOsStr;
+ use super::RawOsString;
+
+ impl Quote for RawOsStr {
+ #[inline]
+ fn escape(&self, f: &mut Formatter<'_>) -> Result {
+ raw::uniquote::escape(&self.0, f) // escaping is implemented per platform
+ }
+ }
+
+ impl Quote for RawOsString {
+ #[inline]
+ fn escape(&self, f: &mut Formatter<'_>) -> Result {
+ (**self).escape(f) // delegate to the `RawOsStr` impl
+ }
+ }
+}
diff --git a/src/util.rs b/src/util.rs
new file mode 100644
index 0000000..bd28b7b
--- /dev/null
+++ b/src/util.rs
@@ -0,0 +1,10 @@
+pub(super) const BYTE_SHIFT: u8 = 6; // number of payload bits in one continuation byte
+
+pub(super) const CONT_MASK: u8 = (1 << BYTE_SHIFT) - 1; // 0b0011_1111: selects the payload bits
+
+pub(super) const CONT_TAG: u8 = 0b1000_0000; // tag carried in the two high bits of a continuation byte
+
+// Returns `true` when the two high bits of `byte` equal the continuation tag.
+#[cfg_attr(not(windows), allow(dead_code))]
+pub(super) const fn is_continuation(byte: u8) -> bool {
+    let tag_bits = byte & !CONT_MASK;
+    tag_bits == CONT_TAG
+}
diff --git a/src/wasm32/mod.rs b/src/wasm32/mod.rs
new file mode 100644
index 0000000..f8ae368
--- /dev/null
+++ b/src/wasm32/mod.rs
@@ -0,0 +1,56 @@
+use std::borrow::Cow;
+use std::error::Error;
+use std::ffi::OsStr;
+use std::ffi::OsString;
+use std::fmt;
+use std::fmt::Display;
+use std::fmt::Formatter;
+use std::result;
+use std::str;
+use std::str::Utf8Error;
+
+if_raw_str! {
+ pub(super) mod raw;
+}
+
+#[derive(Debug, Eq, PartialEq)]
+pub(super) struct EncodingError(Utf8Error); // wraps the UTF-8 validation error from the standard library
+
+impl Display for EncodingError {
+ fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result {
+ write!(formatter, "os_str_bytes: {}", self.0) // prefix the inner message with the crate name
+ }
+}
+
+impl Error for EncodingError {}
+
+type Result<T> = result::Result<T, EncodingError>; // module-local shorthand
+
+macro_rules! expect_utf8 { // unwraps `$result` with a fixed panic message about invalid UTF-8
+ ( $result:expr ) => {
+ $result.expect(
+ "platform string contains invalid UTF-8, which should not be \
+ possible",
+ )
+ };
+}
+
+// Validates `string` as UTF-8 and borrows it as an `OsStr` without copying.
+pub(super) fn os_str_from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> {
+    match str::from_utf8(string) {
+        Ok(text) => Ok(Cow::Borrowed(OsStr::new(text))),
+        Err(error) => Err(EncodingError(error)),
+    }
+}
+
+// Borrows the UTF-8 bytes of `os_string`; panics if it is not valid UTF-8.
+pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> {
+    let text = expect_utf8!(os_string.to_str());
+    Cow::Borrowed(text.as_bytes())
+}
+
+// Validates `string` as UTF-8 and converts it into an `OsString`.
+pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> {
+    match String::from_utf8(string) {
+        Ok(text) => Ok(OsString::from(text)),
+        Err(error) => Err(EncodingError(error.utf8_error())),
+    }
+}
+
+// Unwraps the UTF-8 bytes of `os_string`; panics if it is not valid UTF-8.
+pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> {
+    let text = expect_utf8!(os_string.into_string());
+    text.into_bytes()
+}
diff --git a/src/wasm32/raw.rs b/src/wasm32/raw.rs
new file mode 100644
index 0000000..5645900
--- /dev/null
+++ b/src/wasm32/raw.rs
@@ -0,0 +1,39 @@
+use std::fmt;
+use std::fmt::Formatter;
+use std::str;
+
+pub(crate) use crate::util::is_continuation;
+
+// Decodes `string` as exactly one UTF-8 code point.
+//
+// Panics if the bytes are not valid UTF-8, are empty, or hold more than one
+// code point.
+pub(crate) fn decode_code_point(string: &[u8]) -> u32 {
+    let text = str::from_utf8(string).expect("invalid string");
+    let mut code_points = text.chars();
+    let first = code_points
+        .next()
+        .expect("cannot parse code point from empty string");
+    assert_eq!(None, code_points.next(), "multiple code points found");
+    u32::from(first)
+}
+
+// Plain byte-wise suffix test.
+pub(crate) fn ends_with(string: &[u8], suffix: &[u8]) -> bool {
+    string.strip_suffix(suffix).is_some()
+}
+
+// Plain byte-wise prefix test.
+pub(crate) fn starts_with(string: &[u8], prefix: &[u8]) -> bool {
+    string.strip_prefix(prefix).is_some()
+}
+
+pub(crate) fn debug(string: &[u8], _: &mut Formatter<'_>) -> fmt::Result { // formatter is unused: nothing is ever written
+ assert!(string.is_empty()); // callers must pass an empty slice of invalid bytes
+ Ok(())
+}
+
+#[cfg(feature = "uniquote")]
+pub(crate) mod uniquote {
+ use uniquote::Formatter;
+ use uniquote::Quote;
+ use uniquote::Result;
+
+ pub(crate) fn escape(string: &[u8], f: &mut Formatter<'_>) -> Result {
+ string.escape(f) // reuse the `Quote` impl for byte slices
+ }
+}
diff --git a/src/windows/mod.rs b/src/windows/mod.rs
new file mode 100644
index 0000000..3b6105b
--- /dev/null
+++ b/src/windows/mod.rs
@@ -0,0 +1,152 @@
+// These functions are necessarily inefficient, because they must revert
+// encoding conversions performed by the standard library. However, there is
+// currently no better alternative.
+
+use std::borrow::Cow;
+use std::error::Error;
+use std::ffi::OsStr;
+use std::ffi::OsString;
+use std::fmt;
+use std::fmt::Display;
+use std::fmt::Formatter;
+use std::os::windows::ffi::OsStrExt;
+use std::os::windows::ffi::OsStringExt;
+use std::result;
+use std::str;
+
+if_raw_str! {
+ pub(super) mod raw;
+}
+
+mod wtf8;
+use wtf8::encode_wide;
+use wtf8::DecodeWide;
+
+#[derive(Debug, Eq, PartialEq)]
+pub(super) enum EncodingError {
+ Byte(u8), // an unexpected byte was found
+ CodePoint(u32), // a decoded code point was rejected
+ End(), // the input ended in the middle of a sequence
+}
+
+impl EncodingError {
+    // Describes where decoding failed, for use in the `Display` message.
+    fn position(&self) -> Cow<'_, str> {
+        let described = match *self {
+            Self::Byte(byte) => format!("byte b'\\x{:02X}'", byte),
+            Self::CodePoint(code_point) => {
+                format!("code point U+{:04X}", code_point)
+            }
+            // The only variant that needs no allocation.
+            Self::End() => return Cow::Borrowed("end of string"),
+        };
+        Cow::Owned(described)
+    }
+}
+
+impl Display for EncodingError {
+ fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result {
+ write!(
+ formatter,
+ "byte sequence is not representable in the platform encoding; \
+ error at {}",
+ self.position(), // e.g. a byte, a code point, or "end of string"
+ )
+ }
+}
+
+impl Error for EncodingError {}
+
+type Result<T> = result::Result<T, EncodingError>; // module-local shorthand
+
+// Re-encodes `string` as 16-bit units and builds an `OsString` from them,
+// returning the first encoding error encountered.
+fn from_bytes(string: &[u8]) -> Result<OsString> {
+    let wide_units = encode_wide(string);
+
+    // Collecting an iterator into a result ignores the size hint:
+    // https://github.com/rust-lang/rust/issues/48994
+    let mut buffer = Vec::with_capacity(wide_units.size_hint().0);
+    for unit in wide_units {
+        match unit {
+            Ok(unit) => buffer.push(unit),
+            Err(error) => return Err(error),
+        }
+    }
+    Ok(OsStringExt::from_wide(&buffer))
+}
+
+// Converts the 16-bit units of `os_string` into this crate's byte encoding.
+fn to_bytes(os_string: &OsStr) -> Vec<u8> {
+    let wide_units = OsStrExt::encode_wide(os_string);
+    let capacity = wide_units.size_hint().0;
+
+    let mut bytes = Vec::with_capacity(capacity);
+    bytes.extend(DecodeWide::new(wide_units));
+    bytes
+}
+
+// Always returns an owned string, because the bytes must be re-encoded.
+pub(super) fn os_str_from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> {
+    let owned = from_bytes(string)?;
+    Ok(Cow::Owned(owned))
+}
+
+// Always returns owned bytes, because the string must be re-encoded.
+pub(super) fn os_str_to_bytes(os_string: &OsStr) -> Cow<'_, [u8]> {
+    let bytes = to_bytes(os_string);
+    Cow::Owned(bytes)
+}
+
+// Owned-input variant of the conversion; the vector is only read.
+pub(super) fn os_string_from_vec(string: Vec<u8>) -> Result<OsString> {
+    from_bytes(string.as_slice())
+}
+
+// Owned-input variant of the conversion; the string is only read.
+pub(super) fn os_string_into_vec(os_string: OsString) -> Vec<u8> {
+    to_bytes(os_string.as_os_str())
+}
+
+#[cfg(test)]
+mod tests {
+ use std::ffi::OsStr;
+
+ use crate::OsStrBytes;
+
+ use super::EncodingError;
+
+ #[test]
+ fn test_invalid() { // each case pairs the expected error with the bytes that must produce it
+ use EncodingError::Byte;
+ use EncodingError::CodePoint;
+ use EncodingError::End;
+
+ test_error(Byte(b'\x83'), b"\x0C\x83\xD7\x3E");
+ test_error(Byte(b'\x52'), b"\x19\xF7\x52\x84");
+ test_error(Byte(b'\xB8'), b"\x70\xB8\x1F\x66");
+ test_error(CodePoint(0x34_0388), b"\x70\xFD\x80\x8E\x88");
+ test_error(Byte(b'\x80'), b"\x80");
+ test_error(Byte(b'\x80'), b"\x80\x80");
+ test_error(Byte(b'\x80'), b"\x80\x80\x80");
+ test_error(Byte(b'\x81'), b"\x81");
+ test_error(Byte(b'\x88'), b"\x88\xB4\xC7\x46");
+ test_error(Byte(b'\x97'), b"\x97\xCE\x06");
+ test_error(Byte(b'\x00'), b"\xC2\x00");
+ test_error(Byte(b'\x7F'), b"\xC2\x7F");
+ test_error(Byte(b'\x09'), b"\xCD\x09\x95");
+ test_error(Byte(b'\x43'), b"\xCD\x43\x5F\xA0");
+ test_error(Byte(b'\x69'), b"\xD7\x69\xB2");
+ test_error(CodePoint(0x528), b"\xE0\x94\xA8");
+ test_error(CodePoint(0x766), b"\xE0\x9D\xA6\x12\xAE");
+ test_error(Byte(b'\xFD'), b"\xE2\xAB\xFD\x51");
+ test_error(Byte(b'\xC4'), b"\xE3\xC4");
+ test_error(CodePoint(0xDC00), b"\xED\xA0\x80\xED\xB0\x80");
+ test_error(End(), b"\xF1");
+ test_error(End(), b"\xF1\x80");
+ test_error(End(), b"\xF1\x80\x80");
+ test_error(Byte(b'\xF1'), b"\xF1\x80\x80\xF1");
+ test_error(CodePoint(0x11_09CC), b"\xF4\x90\xA7\x8C");
+ test_error(CodePoint(0x15_EC46), b"\xF5\x9E\xB1\x86");
+ test_error(End(), b"\xFB");
+ test_error(End(), b"\xFB\x80");
+ test_error(End(), b"\xFB\x80\x80");
+ test_error(CodePoint(0x2C_0000), b"\xFB\x80\x80\x80");
+ test_error(End(), b"\xFF");
+ test_error(End(), b"\xFF\x80");
+ test_error(End(), b"\xFF\x80\x80");
+ test_error(CodePoint(0x3C_0000), b"\xFF\x80\x80\x80");
+ test_error(CodePoint(0x3C_6143), b"\xFF\x86\x85\x83");
+
+ fn test_error(error: EncodingError, string: &[u8]) { // decodes `string` and compares the resulting error
+ assert_eq!(
+ Err(error),
+ OsStr::from_raw_bytes(string).map_err(|x| x.0), // `.0` unwraps the public error to this module's `EncodingError`
+ );
+ }
+ }
+}
diff --git a/src/windows/raw.rs b/src/windows/raw.rs
new file mode 100644
index 0000000..630eb01
--- /dev/null
+++ b/src/windows/raw.rs
@@ -0,0 +1,42 @@
+use std::fmt;
+use std::fmt::Formatter;
+
+pub(crate) use crate::util::is_continuation;
+
+use super::wtf8;
+pub(crate) use super::wtf8::ends_with;
+pub(crate) use super::wtf8::starts_with;
+use super::wtf8::CodePoints;
+
+pub(crate) fn encode_wide_unchecked(
+ string: &[u8],
+) -> impl '_ + Iterator<Item = u16> {
+ wtf8::encode_wide(string).map(|x| x.expect("invalid string"))
+}
+
+// Decodes `string` as exactly one code point.
+//
+// Panics if the bytes are empty, fail to decode, or hold more than one code
+// point.
+pub(crate) fn decode_code_point(string: &[u8]) -> u32 {
+    let mut decoder = CodePoints::new(string.iter().copied());
+    let first = decoder
+        .next()
+        .expect("cannot parse code point from empty string")
+        .expect("invalid string");
+    assert_eq!(None, decoder.next(), "multiple code points found");
+    first
+}
+
+// Writes each 16-bit unit of `string` as a `\u{...}` escape in uppercase hex.
+pub(crate) fn debug(string: &[u8], f: &mut Formatter<'_>) -> fmt::Result {
+    encode_wide_unchecked(string)
+        .try_for_each(|wchar| write!(f, "\\u{{{:X}}}", wchar))
+}
+
+#[cfg(feature = "uniquote")]
+pub(crate) mod uniquote {
+ use uniquote::Formatter;
+ use uniquote::Result;
+
+ pub(crate) fn escape(string: &[u8], f: &mut Formatter<'_>) -> Result {
+ f.escape_utf16(super::encode_wide_unchecked(string)) // escape the string through its 16-bit encoding
+ }
+}
diff --git a/src/windows/wtf8/code_points.rs b/src/windows/wtf8/code_points.rs
new file mode 100644
index 0000000..b265db3
--- /dev/null
+++ b/src/windows/wtf8/code_points.rs
@@ -0,0 +1,117 @@
+use std::iter::Peekable;
+use std::mem;
+
+use crate::util::is_continuation;
+use crate::util::BYTE_SHIFT;
+use crate::util::CONT_MASK;
+
+use super::EncodingError;
+use super::Result;
+
+pub(in super::super) struct CodePoints<I> // iterator adapter that decodes bytes into code points
+where
+ I: Iterator<Item = u8>,
+{
+ iter: Peekable<I>, // peekable so an unexpected byte can be reported without consuming it
+ surrogate: bool, // set when the previous code point was the first half of a surrogate pair
+}
+
+impl<I> CodePoints<I>
+where
+    I: Iterator<Item = u8>,
+{
+    // Creates a decoder over the bytes of `string`.
+    pub(in super::super) fn new<S>(string: S) -> Self
+    where
+        S: IntoIterator<IntoIter = I, Item = I::Item>,
+    {
+        let iter = string.into_iter().peekable();
+        Self {
+            iter,
+            surrogate: false,
+        }
+    }
+
+    // Appends the payload bits of the next continuation byte to
+    // `code_point`, failing if the input has ended or the next byte is not a
+    // continuation byte.
+    fn consume_next(&mut self, code_point: &mut u32) -> Result<()> {
+        let byte = match self.iter.peek() {
+            Some(&byte) => byte,
+            None => return Err(EncodingError::End()),
+        };
+
+        if !is_continuation(byte) {
+            self.surrogate = false;
+            // Not consuming this byte will be useful if this crate ever
+            // offers a way to encode lossily.
+            return Err(EncodingError::Byte(byte));
+        }
+        *code_point =
+            (*code_point << BYTE_SHIFT) | u32::from(byte & CONT_MASK);
+
+        let removed = self.iter.next();
+        debug_assert_eq!(Some(byte), removed);
+        Ok(())
+    }
+
+    // Exposes the size hint of the underlying byte iterator.
+    pub(super) fn inner_size_hint(&self) -> (usize, Option<usize>) {
+        let bytes = &self.iter;
+        bytes.size_hint()
+    }
+}
+
+impl<I> Iterator for CodePoints<I>
+where
+ I: Iterator<Item = u8>,
+{
+ type Item = Result<u32>;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ let byte = self.iter.next()?; // lead byte; `None` ends the iteration
+ let mut code_point: u32 = byte.into();
+
+ macro_rules! consume_next { // reads one continuation byte, returning early on error
+ () => {{
+ if let Err(error) = self.consume_next(&mut code_point) {
+ return Some(Err(error));
+ }
+ }};
+ }
+
+ let prev_surrogate = mem::replace(&mut self.surrogate, false); // remember and clear the flag left by the previous code point
+
+ let mut invalid = false; // reported only after the whole sequence has been consumed
+ if !byte.is_ascii() {
+ if byte < 0xC2 { // non-ASCII bytes below 0xC2 are rejected as lead bytes
+ return Some(Err(EncodingError::Byte(byte)));
+ }
+
+ if byte < 0xE0 {
+ code_point &= 0x1F; // two-byte sequence: keep the low 5 bits of the lead byte
+ } else {
+ code_point &= 0x0F; // longer sequence: keep the low 4 bits of the lead byte
+ consume_next!();
+
+ if byte >= 0xF0 { // four-byte sequence
+ if code_point.wrapping_sub(0x10) >= 0x100 { // partial value must be in 0x10..0x110, i.e. a final value in 0x10000..0x110000
+ invalid = true;
+ }
+ consume_next!();
+
+ // This condition is optimized to detect surrogate code points.
+ } else if code_point & 0xFE0 == 0x360 {
+ if code_point & 0x10 == 0 { // bit clear: first half of a surrogate pair
+ self.surrogate = true;
+ } else if prev_surrogate { // second half directly after a first half
+ // Decoding a broken surrogate pair would be lossy.
+ invalid = true;
+ }
+ }
+
+ if code_point < 0x20 { // partial value too small for this sequence length
+ invalid = true;
+ }
+ }
+ consume_next!(); // every multi-byte sequence ends with one more continuation byte
+ }
+ if invalid {
+ return Some(Err(EncodingError::CodePoint(code_point)));
+ }
+
+ Some(Ok(code_point))
+ }
+}
diff --git a/src/windows/wtf8/convert.rs b/src/windows/wtf8/convert.rs
new file mode 100644
index 0000000..fcaf562
--- /dev/null
+++ b/src/windows/wtf8/convert.rs
@@ -0,0 +1,166 @@
+use std::char;
+use std::char::DecodeUtf16;
+use std::num::NonZeroU16;
+
+use crate::util::BYTE_SHIFT;
+use crate::util::CONT_MASK;
+use crate::util::CONT_TAG;
+
+use super::CodePoints;
+use super::Result;
+
+const MIN_HIGH_SURROGATE: u16 = 0xD800; // base value OR-ed into the first unit of a surrogate pair
+
+const MIN_LOW_SURROGATE: u16 = 0xDC00; // base value OR-ed into the second unit of a surrogate pair
+
+const MIN_SURROGATE_CODE: u32 = (u16::MAX as u32) + 1; // 0x10000: first code point that needs a surrogate pair
+
+macro_rules! static_assert { // evaluates `$condition` at compile time
+ ( $condition:expr ) => {
+ const _: () = assert!($condition, "static assertion failed");
+ };
+}
+
+pub(in super::super) struct DecodeWide<I> // iterator adapter turning 16-bit units into bytes
+where
+ I: Iterator<Item = u16>,
+{
+ iter: DecodeUtf16<I>,
+ code_point: u32, // code point whose bytes are currently being emitted
+ shift: u8, // bits of `code_point` still to emit, a multiple of `BYTE_SHIFT`
+}
+
+impl<I> DecodeWide<I>
+where
+ I: Iterator<Item = u16>,
+{
+ pub(in super::super) fn new<S>(string: S) -> Self
+ where
+ S: IntoIterator<IntoIter = I, Item = I::Item>,
+ {
+ Self {
+ iter: char::decode_utf16(string),
+ code_point: 0,
+ shift: 0, // no bytes pending at the start
+ }
+ }
+}
+
+impl<I> Iterator for DecodeWide<I>
+where
+    I: Iterator<Item = u16>,
+{
+    type Item = u8;
+
+    // Yields the next output byte: first any continuation bytes still
+    // pending for the current code point, then the lead byte of the next
+    // code point. Unpaired surrogates are encoded like any other code point.
+    fn next(&mut self) -> Option<Self::Item> {
+        // While `shift` holds at least one group of `BYTE_SHIFT` bits, a
+        // continuation byte of the current code point is still pending.
+        if let Some(shift) = self.shift.checked_sub(BYTE_SHIFT) {
+            self.shift = shift;
+            return Some(
+                ((self.code_point >> self.shift) as u8 & CONT_MASK) | CONT_TAG,
+            );
+        }
+
+        self.code_point = self
+            .iter
+            .next()?
+            .map(Into::into)
+            .unwrap_or_else(|x| x.unpaired_surrogate().into());
+
+        // Emits the lead byte: the bits above `shift`, combined with `$tag`.
+        macro_rules! decode {
+            ( $tag:expr ) => {
+                Some((self.code_point >> self.shift) as u8 | $tag)
+            };
+        }
+        // Returns the lead byte if the code point fits below `$upper_bound`;
+        // otherwise schedules one more continuation byte.
+        macro_rules! try_decode {
+            ( $tag:expr , $upper_bound:expr ) => {
+                if self.code_point < $upper_bound {
+                    return decode!($tag);
+                }
+                self.shift += BYTE_SHIFT;
+            };
+        }
+        try_decode!(0, 0x80);
+        try_decode!(0xC0, 0x800);
+        try_decode!(0xE0, MIN_SURROGATE_CODE);
+        decode!(0xF0)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.iter.size_hint();
+        // `shift` counts pending bits, and each pending byte carries
+        // `BYTE_SHIFT` of them. Adding the bit count itself would report a
+        // lower bound larger than the number of bytes actually remaining.
+        let pending = usize::from(self.shift / BYTE_SHIFT);
+        (
+            low.saturating_add(pending),
+            // At most four bytes are produced per remaining code point.
+            high.and_then(|x| x.checked_mul(4))
+                .and_then(|x| x.checked_add(pending)),
+        )
+    }
+}
+
+struct EncodeWide<I> // iterator adapter turning bytes into 16-bit units
+where
+ I: Iterator<Item = u8>,
+{
+ iter: CodePoints<I>,
+ surrogate: Option<NonZeroU16>, // second unit of a surrogate pair, waiting to be yielded
+}
+
+impl<I> EncodeWide<I>
+where
+ I: Iterator<Item = u8>,
+{
+ pub(in super::super) fn new<S>(string: S) -> Self
+ where
+ S: IntoIterator<IntoIter = I, Item = I::Item>,
+ {
+ Self {
+ iter: CodePoints::new(string),
+ surrogate: None, // nothing pending at the start
+ }
+ }
+}
+
+impl<I> Iterator for EncodeWide<I>
+where
+    I: Iterator<Item = u8>,
+{
+    type Item = Result<u16>;
+
+    // Yields the next 16-bit unit, or the decoding error for the next code
+    // point. Code points at or above `MIN_SURROGATE_CODE` are split into two
+    // units; the second is stored and returned by the following call.
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Some(surrogate) = self.surrogate.take() {
+            return Some(Ok(surrogate.get()));
+        }
+
+        self.iter.next().map(|code_point| {
+            code_point.map(|code_point| {
+                code_point
+                    .checked_sub(MIN_SURROGATE_CODE)
+                    .map(|offset| {
+                        static_assert!(MIN_LOW_SURROGATE != 0);
+
+                        // OR-ing with the nonzero `MIN_LOW_SURROGATE` can
+                        // never produce zero, so `new` always returns `Some`
+                        // and no `unsafe` is needed.
+                        self.surrogate = NonZeroU16::new(
+                            (offset & 0x3FF) as u16 | MIN_LOW_SURROGATE,
+                        );
+                        (offset >> 10) as u16 | MIN_HIGH_SURROGATE
+                    })
+                    // Smaller code points fit into a single unit.
+                    .unwrap_or(code_point as u16)
+            })
+        })
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.iter.inner_size_hint();
+        // A stored second surrogate unit adds one more item.
+        let additional = self.surrogate.is_some().into();
+        (
+            // A single unit consumes at most three input bytes.
+            (low.saturating_add(2) / 3).saturating_add(additional),
+            // Every unit consumes at least one input byte.
+            high.and_then(|x| x.checked_add(additional)),
+        )
+    }
+}
+
+pub(in super::super) fn encode_wide(
+ string: &[u8],
+) -> impl '_ + Iterator<Item = Result<u16>> {
+ EncodeWide::new(string.iter().copied())
+}
diff --git a/src/windows/wtf8/mod.rs b/src/windows/wtf8/mod.rs
new file mode 100644
index 0000000..d8b0dc4
--- /dev/null
+++ b/src/windows/wtf8/mod.rs
@@ -0,0 +1,18 @@
+// This module implements the WTF-8 encoding specification:
+// https://simonsapin.github.io/wtf-8/
+
+use super::EncodingError;
+use super::Result;
+
+mod code_points;
+pub(super) use code_points::CodePoints;
+
+mod convert;
+pub(super) use convert::encode_wide;
+pub(super) use convert::DecodeWide;
+
+if_raw_str! {
+ mod string;
+ pub(crate) use string::ends_with;
+ pub(crate) use string::starts_with;
+}
diff --git a/src/windows/wtf8/string.rs b/src/windows/wtf8/string.rs
new file mode 100644
index 0000000..10b8faf
--- /dev/null
+++ b/src/windows/wtf8/string.rs
@@ -0,0 +1,63 @@
+use crate::util::is_continuation;
+
+use super::encode_wide;
+
+const SURROGATE_LENGTH: usize = 3; // number of bytes compared as one encoded surrogate
+
+pub(crate) fn ends_with(string: &[u8], mut suffix: &[u8]) -> bool {
+ let index = match string.len().checked_sub(suffix.len()) { // where the suffix would start in `string`
+ Some(index) => index,
+ None => return false, // suffix is longer than the string
+ };
+ if let Some(&byte) = string.get(index) {
+ if is_continuation(byte) { // the suffix would start in the middle of an encoded sequence
+ let index = index.checked_sub(1).expect("invalid string");
+ let mut wide_surrogate = match suffix.get(..SURROGATE_LENGTH) {
+ Some(surrogate) => encode_wide(surrogate),
+ None => return false, // suffix too short to start with a surrogate
+ };
+ let surrogate_wchar = wide_surrogate
+ .next()
+ .expect("failed decoding non-empty suffix");
+
+ if wide_surrogate.next().is_some() // the suffix head must encode to exactly one unit
+ || encode_wide(&string[index..])
+ .take_while(Result::is_ok)
+ .nth(1) // second unit of the sequence that spans the boundary
+ != Some(surrogate_wchar)
+ {
+ return false;
+ }
+ suffix = &suffix[SURROGATE_LENGTH..]; // surrogate matched; compare the rest byte-wise
+ }
+ }
+ string.ends_with(suffix)
+}
+
+pub(crate) fn starts_with(string: &[u8], mut prefix: &[u8]) -> bool {
+ if let Some(&byte) = string.get(prefix.len()) { // first byte of `string` after the would-be prefix
+ if is_continuation(byte) { // the prefix would end in the middle of an encoded sequence
+ let index = match prefix.len().checked_sub(SURROGATE_LENGTH) {
+ Some(index) => index,
+ None => return false, // prefix too short to end with a surrogate
+ };
+ let (substring, surrogate) = prefix.split_at(index);
+ let mut wide_surrogate = encode_wide(surrogate);
+ let surrogate_wchar = wide_surrogate
+ .next()
+ .expect("failed decoding non-empty prefix");
+
+ if surrogate_wchar.is_err()
+ || wide_surrogate.next().is_some() // the prefix tail must encode to exactly one unit
+ || encode_wide(&string[index..])
+ .next() // first unit of the sequence that spans the boundary
+ .expect("failed decoding non-empty substring")
+ != surrogate_wchar
+ {
+ return false;
+ }
+ prefix = substring; // surrogate matched; compare the rest byte-wise
+ }
+ }
+ string.starts_with(prefix)
+}
diff --git a/tests/common.rs b/tests/common.rs
new file mode 100644
index 0000000..c0909bc
--- /dev/null
+++ b/tests/common.rs
@@ -0,0 +1,94 @@
+#![allow(dead_code)]
+#![warn(unsafe_op_in_unsafe_fn)]
+
+use std::borrow::Cow;
+use std::ffi::OsStr;
+use std::ffi::OsString;
+#[cfg(feature = "raw_os_str")]
+use std::mem;
+use std::path::Path;
+use std::path::PathBuf;
+use std::result;
+
+use os_str_bytes::EncodingError;
+use os_str_bytes::OsStrBytes;
+use os_str_bytes::OsStringBytes;
+#[cfg(feature = "raw_os_str")]
+use os_str_bytes::RawOsStr;
+
+pub(crate) type Result<T> = result::Result<T, EncodingError>;
+
+pub(crate) const WTF8_STRING: &[u8] = b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9bar"; // "foo", a lone surrogate, a four-byte code point, "bar"
+
+// SAFETY: This string is valid in WTF-8.
+#[cfg(all(any(unix, windows), feature = "raw_os_str"))]
+pub(crate) const RAW_WTF8_STRING: &RawOsStr =
+ unsafe { from_raw_bytes_unchecked(WTF8_STRING) };
+
+#[cfg(feature = "raw_os_str")]
+pub(crate) const unsafe fn from_raw_bytes_unchecked( // test-only constructor that skips validation of `string`
+ string: &[u8],
+) -> &RawOsStr {
+ // SAFETY: This implementation detail can only be assumed by this crate.
+ unsafe { mem::transmute(string) }
+}
+
+#[track_caller]
+fn test_from_bytes<'a, T, U, S>(result: &Result<U>, string: S) // asserts that decoding `string` as `T` gives the same outcome as `result`
+where
+ S: Into<Cow<'a, [u8]>>,
+ T: 'a + AsRef<OsStr> + OsStrBytes + ?Sized,
+ U: AsRef<OsStr>,
+{
+ assert_eq!(
+ result.as_ref().map(AsRef::as_ref),
+ T::from_raw_bytes(string).as_deref().map(AsRef::as_ref),
+ );
+}
+
+pub(crate) fn from_bytes(string: &[u8]) -> Result<Cow<'_, OsStr>> {
+ let os_string = OsStr::from_raw_bytes(string);
+
+ test_from_bytes::<Path, _, _>(&os_string, string); // `Path` must decode to the same result
+
+ os_string
+}
+
+pub(crate) fn from_vec(string: Vec<u8>) -> Result<OsString> {
+ let os_string = OsString::from_raw_vec(string.clone());
+ test_from_bytes::<OsStr, _, _>(&os_string, string.clone()); // borrowed decoding must agree with the owned one
+
+ let path = PathBuf::from_raw_vec(string.clone());
+ test_from_bytes::<Path, _, _>(&path, string);
+ assert_eq!(os_string, path.map(PathBuf::into_os_string)); // `PathBuf` must decode to the same result
+
+ os_string
+}
+
+pub(crate) fn test_bytes(string: &[u8]) -> Result<()> { // round-trips `string` through the borrowed conversions
+ let os_string = from_bytes(string)?;
+ assert_eq!(string.len(), os_string.len());
+ assert_eq!(string, &*os_string.to_raw_bytes());
+ Ok(())
+}
+
+pub(crate) fn test_vec(string: &[u8]) -> Result<()> { // round-trips `string` through the owned conversions
+ let os_string = from_vec(string.to_owned())?;
+ assert_eq!(string.len(), os_string.len());
+ assert_eq!(string, os_string.into_raw_vec());
+ Ok(())
+}
+
+pub(crate) fn test_utf8_bytes(string: &str) { // UTF-8 input must decode to the equivalent `OsStr` and encode back unchanged
+ let os_string = OsStr::new(string);
+ let string = string.as_bytes();
+ assert_eq!(Ok(Cow::Borrowed(os_string)), from_bytes(string));
+ assert_eq!(string, &*os_string.to_raw_bytes());
+}
+
+pub(crate) fn test_utf8_vec(string: &str) { // owned counterpart of `test_utf8_bytes`
+ let os_string = string.to_owned().into();
+ let string = string.as_bytes();
+ assert_eq!(Ok(&os_string), from_vec(string.to_owned()).as_ref());
+ assert_eq!(string, os_string.into_raw_vec());
+}
diff --git a/tests/debug.rs b/tests/debug.rs
new file mode 100644
index 0000000..c252deb
--- /dev/null
+++ b/tests/debug.rs
@@ -0,0 +1,34 @@
+#![cfg(feature = "raw_os_str")]
+
+use os_str_bytes::RawOsStr;
+
+mod common;
+use common::RAW_WTF8_STRING;
+
+// Checks the `Debug` output of both the borrowed and the owned type, where
+// `result` is the expected quoted contents.
+fn test(result: &str, string: &RawOsStr) {
+    let borrowed = format!("{:?}", string);
+    assert_eq!(format!("RawOsStr({})", result), borrowed);
+
+    let owned = format!("{:?}", string.to_owned());
+    assert_eq!(format!("RawOsString({})", result), owned);
+}
+
+#[test]
+fn test_debug_empty() {
+ test("\"\"", RawOsStr::from_str("")); // an empty string prints as a pair of quotes
+}
+
+// The lone surrogate is printed as raw bytes on Unix and as a 16-bit escape
+// elsewhere; the rest of the string is printed as text.
+#[test]
+fn test_debug_wtf8() {
+    let wchar = if cfg!(unix) {
+        "\\xED\\xA0\\xBD"
+    } else {
+        "\\u{D83D}"
+    };
+    test(&format!("\"foo{}\u{1F4A9}bar\"", wchar), RAW_WTF8_STRING);
+}
+
+#[test]
+fn test_debug_quote() {
+ test("\"foo\\\"bar\"", RawOsStr::from_str("foo\"bar")); // an embedded quote must be escaped
+}
diff --git a/tests/edge_cases.rs b/tests/edge_cases.rs
new file mode 100644
index 0000000..a0fa529
--- /dev/null
+++ b/tests/edge_cases.rs
@@ -0,0 +1,7 @@
+mod common;
+use common::test_bytes;
+
+#[test]
+fn test_edge_cases() {
+ assert_eq!(Ok(()), test_bytes(b"\xED\xAB\xBE\xF4\x8D\xBC\x9A")); // a three-byte sequence starting with 0xED followed by a four-byte sequence must round-trip
+}
diff --git a/tests/index.rs b/tests/index.rs
new file mode 100644
index 0000000..50abd6c
--- /dev/null
+++ b/tests/index.rs
@@ -0,0 +1,86 @@
+#![cfg(feature = "raw_os_str")]
+
+use std::ops::Index;
+use std::panic;
+use std::panic::UnwindSafe;
+
+use os_str_bytes::RawOsStr;
+
+mod common;
+use common::RAW_WTF8_STRING;
+
+#[test]
+fn test_valid_indices() { // Slicing `RAW_WTF8_STRING` from each listed byte index must not panic.
+ test(0);
+ test(1);
+ test(2);
+ test(3);
+ test(6);
+ test(10);
+ test(11);
+ test(12);
+ test(13);
+
+ #[track_caller]
+ fn test(index: usize) { // The slice itself is discarded; only the absence of a panic matters.
+ let _ = RAW_WTF8_STRING.index(index..);
+ }
+}
+
+macro_rules! test { // Generates one `#[test]` from (test name, byte index, description of the code point containing that index).
+ ( $name:ident , $index:literal , $code_point:expr ) => {
+ // https://github.com/rust-lang/rust/issues/88430
+ #[test]
+ fn $name() {
+ let index_fn = || RAW_WTF8_STRING.index($index..);
+ if cfg!(unix) { // On Unix the slice is only evaluated and the test returns before the panic check.
+ let _ = index_fn();
+ return;
+ }
+
+ let error = panic::catch_unwind(index_fn) // Elsewhere the slice must panic...
+ .expect_err("test did not panic as expected");
+ let error: &String = // ...with a `String` payload...
+ error.downcast_ref().expect("incorrect panic message type");
+ assert_eq!( // ...whose text equals the message assembled below.
+ concat!(
+ "byte index ",
+ $index,
+ " is not a valid boundary; it is inside ",
+ $code_point
+ ),
+ error,
+ );
+ }
+ };
+}
+
+test!(test_index_4, 4, "U+D83D (bytes 3..6)"); // Indices 4 and 5 are expected to be reported as inside U+D83D.
+
+test!(test_index_5, 5, "U+D83D (bytes 3..6)");
+
+test!(test_index_7, 7, "U+1F4A9 (bytes 6..10)"); // Indices 7, 8 and 9 are expected to be reported as inside U+1F4A9.
+
+test!(test_index_8, 8, "U+1F4A9 (bytes 6..10)");
+
+test!(test_index_9, 9, "U+1F4A9 (bytes 6..10)");
+
+#[test]
+fn test_index_panics() { // Every operation below must panic exactly when the target is not Unix.
+ let string = RawOsStr::from_str("\u{F6}"); // U+00F6 is two bytes in UTF-8; every range below has an endpoint at byte 1.
+ test(|| string.index(1..2));
+ test(|| string.index(0..1));
+ test(|| string.index(1..));
+ test(|| string.index(0..=0));
+ test(|| string.index(..1));
+ test(|| string.index(..=0));
+ test(|| string.split_at(1));
+
+ #[track_caller]
+ fn test<F, R>(f: F) // Runs `f` under `catch_unwind` and compares "did it panic" with "is this a non-Unix build".
+ where
+ F: FnOnce() -> R + UnwindSafe,
+ {
+ assert_eq!(!cfg!(unix), panic::catch_unwind(f).is_err());
+ }
+}
diff --git a/tests/integration.rs b/tests/integration.rs
new file mode 100644
index 0000000..0107fe5
--- /dev/null
+++ b/tests/integration.rs
@@ -0,0 +1,75 @@
+use std::str;
+
+mod common;
+use common::test_bytes;
+use common::test_utf8_bytes;
+use common::test_utf8_vec;
+use common::test_vec;
+use common::Result;
+use common::WTF8_STRING;
+
+const INVALID_STRING: &[u8] = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz"; // Each `\xF1` opens a four-byte UTF-8 sequence that is cut short after 0, 1 and 2 continuation bytes.
+
+const UTF8_STRING: &str = "string"; // Plain ASCII sample used by the non-empty UTF-8 tests.
+
+fn test_string_is_invalid_utf8(string: &[u8]) { // Sanity check on a fixture: the bytes must be rejected by `str::from_utf8`.
+ assert!(str::from_utf8(string).is_err());
+}
+
+fn test_invalid_result(result: &Result<()>) { // Platform-dependent expectation for a conversion of `INVALID_STRING`.
+ if cfg!(windows) { // On Windows the conversion is expected to fail.
+ assert!(result.is_err());
+ } else { // On every other target it is expected to succeed.
+ assert_eq!(&Ok(()), result);
+ }
+}
+
+#[test]
+fn test_empty_bytes() { // Borrowed UTF-8 check with the empty string.
+ test_utf8_bytes("");
+}
+
+#[test]
+fn test_empty_vec() { // Owned UTF-8 check with the empty string.
+ test_utf8_vec("");
+}
+
+#[test]
+fn test_nonempty_utf8_bytes() { // Borrowed UTF-8 check with `UTF8_STRING`.
+ test_utf8_bytes(UTF8_STRING);
+}
+
+#[test]
+fn test_nonempty_utf8_vec() { // Owned UTF-8 check with `UTF8_STRING`.
+ test_utf8_vec(UTF8_STRING);
+}
+
+#[test]
+fn test_invalid_string_is_invalid_utf8() { // Guards the fixture: `INVALID_STRING` must not be valid UTF-8.
+ test_string_is_invalid_utf8(INVALID_STRING);
+}
+
+#[test]
+fn test_invalid_bytes() { // Borrowed round trip of `INVALID_STRING`; the expected outcome depends on the platform.
+ test_invalid_result(&test_bytes(INVALID_STRING));
+}
+
+#[test]
+fn test_invalid_vec() { // Owned round trip of `INVALID_STRING`; the expected outcome depends on the platform.
+ test_invalid_result(&test_vec(INVALID_STRING));
+}
+
+#[test]
+fn test_wtf8_string_is_invalid_utf8() { // Guards the fixture: `WTF8_STRING` must not be valid UTF-8.
+ test_string_is_invalid_utf8(WTF8_STRING);
+}
+
+#[test]
+fn test_wtf8_bytes() { // Borrowed round trip of `WTF8_STRING` must succeed on every platform.
+ assert_eq!(Ok(()), test_bytes(WTF8_STRING));
+}
+
+#[test]
+fn test_wtf8_vec() { // Owned round trip of `WTF8_STRING` must succeed on every platform.
+ assert_eq!(Ok(()), test_vec(WTF8_STRING));
+}
diff --git a/tests/random.rs b/tests/random.rs
new file mode 100644
index 0000000..ad6e8d2
--- /dev/null
+++ b/tests/random.rs
@@ -0,0 +1,126 @@
+use std::borrow::Cow;
+use std::ffi::OsStr;
+use std::ffi::OsString;
+
+use getrandom::getrandom;
+
+use os_str_bytes::OsStrBytes;
+use os_str_bytes::OsStringBytes;
+
+mod common;
+use common::from_bytes;
+use common::from_vec;
+
+const SMALL_LENGTH: usize = 16; // Buffer length for the tests that loop `ITERATIONS` times.
+
+const LARGE_LENGTH: usize = 1024; // Buffer length for the single-shot random round-trip tests.
+
+const ITERATIONS: usize = 1024; // Number of random samples drawn by each looping test.
+
+fn random_os_string( // Builds an `OsString` from `buffer_length` random elements: bytes on Unix, `u16` units on Windows.
+ buffer_length: usize,
+) -> Result<OsString, getrandom::Error> {
+ let mut buffer = vec![0; buffer_length]; // Element type is inferred from the platform branch that consumes it.
+ #[cfg(unix)]
+ {
+ use std::os::unix::ffi::OsStringExt;
+
+ getrandom(&mut buffer)?; // Errors from the random source are propagated to the caller.
+ Ok(OsStringExt::from_vec(buffer))
+ }
+ #[cfg(windows)]
+ {
+ use std::os::windows::ffi::OsStringExt;
+ use std::slice;
+
+ getrandom(as_mut_bytes(&mut buffer))?; // Fills the `u16` buffer through a byte view of the same memory.
+ return Ok(OsStringExt::from_wide(&buffer));
+
+ fn as_mut_bytes(buffer: &mut [u16]) -> &mut [u8] { // Reinterprets the slice as bytes, twice as many elements.
+ // SAFETY: [u16] can always be transmuted to two [u8] bytes.
+ unsafe {
+ slice::from_raw_parts_mut(
+ buffer.as_mut_ptr() as *mut u8,
+ buffer.len() * 2,
+ )
+ }
+ }
+ }
+ #[cfg(not(any(unix, windows)))]
+ Err(getrandom::Error::UNSUPPORTED) // Targets that are neither Unix nor Windows get an error instead of a string.
+}
+
+#[test]
+fn test_random_bytes() -> Result<(), getrandom::Error> { // Borrowed round trip of one random `OsString` of `LARGE_LENGTH` elements.
+ let os_string = random_os_string(LARGE_LENGTH)?;
+ let string = os_string.to_raw_bytes();
+ assert_eq!(os_string.len(), string.len()); // The raw form must have the same length as the `OsString` reports.
+ assert_eq!(Ok(Cow::Borrowed(&*os_string)), from_bytes(&string)); // Decoding the raw bytes must give back an equal string.
+ Ok(())
+}
+
+#[test]
+fn test_random_vec() -> Result<(), getrandom::Error> { // Owned round trip of one random `OsString` of `LARGE_LENGTH` elements.
+ let os_string = random_os_string(LARGE_LENGTH)?;
+ let string = os_string.clone().into_raw_vec(); // Cloned because `into_raw_vec` consumes its receiver and `os_string` is still needed.
+ assert_eq!(os_string.len(), string.len());
+ assert_eq!(Ok(os_string), from_vec(string)); // Decoding the raw vector must give back an equal string.
+ Ok(())
+}
+
+#[test]
+fn test_lossless() -> Result<(), getrandom::Error> { // For random byte strings that decode, re-encoding must reproduce the input bytes.
+ for _ in 0..ITERATIONS {
+ let mut string = vec![0; SMALL_LENGTH];
+ getrandom(&mut string)?;
+ if let Ok(os_string) = OsStr::from_raw_bytes(&string) { // Samples rejected by the decoder are skipped, not treated as failures.
+ let encoded_string = os_string.to_raw_bytes();
+ assert_eq!(string, &*encoded_string);
+ }
+ }
+ Ok(())
+}
+
+#[cfg(feature = "raw_os_str")]
+#[test]
+fn test_raw() -> Result<(), getrandom::Error> { // Random prefix/suffix checks for `starts_with_os` and `ends_with_os`.
+ use os_str_bytes::RawOsStr;
+ use os_str_bytes::RawOsString;
+
+ macro_rules! test { // Asserts `$string.$method(&$substring) == $result`, printing the method name and both operands on failure.
+ (
+ $result:expr ,
+ $method:ident (& $string:ident , & $substring:ident )
+ ) => {
+ #[allow(clippy::bool_assert_comparison)]
+ {
+ assert_eq!(
+ $result,
+ $string.$method(&$substring),
+ concat!(stringify!($method), "({:?}, {:?})"),
+ $string,
+ $substring,
+ );
+ }
+ };
+ }
+
+ for _ in 0..ITERATIONS {
+ let mut string = random_os_string(SMALL_LENGTH)?;
+ let prefix = RawOsStr::new(&string).into_owned(); // Snapshot of the first random string before anything is appended.
+ let suffix = random_os_string(SMALL_LENGTH)?;
+ string.push(&suffix); // `string` is now the first random string followed by the second.
+
+ let string = RawOsString::new(string);
+ let suffix = RawOsString::new(suffix);
+
+ test!(true, ends_with_os(&string, &suffix));
+ test!(true, starts_with_os(&string, &prefix));
+
+ if prefix != suffix { // The negative checks are skipped when both halves happen to be equal.
+ test!(false, ends_with_os(&string, &prefix));
+ test!(false, starts_with_os(&string, &suffix));
+ }
+ }
+ Ok(())
+}
diff --git a/tests/raw.rs b/tests/raw.rs
new file mode 100644
index 0000000..fe29705
--- /dev/null
+++ b/tests/raw.rs
@@ -0,0 +1,108 @@
+#![cfg(feature = "raw_os_str")]
+
+use std::ffi::OsStr;
+
+use os_str_bytes::EncodingError;
+use os_str_bytes::OsStrBytes;
+use os_str_bytes::RawOsStr;
+
+mod common;
+use common::RAW_WTF8_STRING;
+
+fn from_raw_bytes(string: &[u8]) -> Result<&RawOsStr, EncodingError> { // Checked wrapper: validates the bytes, then builds the `RawOsStr` with the unchecked helper.
+ // SAFETY: The string is validated before conversion.
+ OsStr::from_raw_bytes(string) // Used only as a validator; its `Ok` value is discarded by the closure below.
+ .map(|_| unsafe { common::from_raw_bytes_unchecked(string) })
+}
+
+#[test]
+fn test_ends_with() { // Checks `ends_with_os` on `RAW_WTF8_STRING` against progressively longer suffixes.
+ test(true, b"");
+ test(true, b"r");
+ test(true, b"ar");
+ test(true, b"bar");
+ if cfg!(not(windows)) { // Suffixes that begin with UTF-8 continuation bytes are only tried off Windows.
+ test(true, b"\xA9bar");
+ test(true, b"\x92\xA9bar");
+ test(true, b"\x9F\x92\xA9bar");
+ }
+ test(cfg!(windows), b"\xED\xB2\xA9bar"); // `\xED\xB2\xA9` is the three-byte form of surrogate U+DCA9; a match is expected only on Windows.
+ test(true, b"\xF0\x9F\x92\xA9bar");
+ test(true, b"\xED\xA0\xBD\xF0\x9F\x92\xA9bar");
+ test(true, b"o\xED\xA0\xBD\xF0\x9F\x92\xA9bar");
+ test(true, b"oo\xED\xA0\xBD\xF0\x9F\x92\xA9bar");
+ test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9bar");
+
+ test(false, b"\xED\xA0\xBDbar"); // Negative cases: these must not be reported as suffixes on any platform.
+ test(false, b"\xED\xB2\xA9aar");
+
+ fn test(result: bool, suffix: &[u8]) { // Panics via `unwrap` if `suffix` fails validation in `from_raw_bytes`.
+ let suffix = from_raw_bytes(suffix).unwrap();
+ assert_eq!(result, RAW_WTF8_STRING.ends_with_os(suffix));
+ }
+}
+
+#[test]
+fn test_empty_ends_with() { // `ends_with_os` on an empty string: true only for the empty suffix.
+ macro_rules! test { // Asserts `from_str($string).ends_with_os(from_str($substring)) == $result`.
+ ( $result:expr , $string:expr , $substring:expr ) => {
+ #[allow(clippy::bool_assert_comparison)]
+ {
+ assert_eq!(
+ $result,
+ RawOsStr::from_str($string)
+ .ends_with_os(RawOsStr::from_str($substring)),
+ );
+ }
+ };
+ }
+ test!(true, "", "");
+ test!(false, "", "r");
+ test!(false, "", "ar");
+}
+
+#[test]
+fn test_starts_with() { // Checks `starts_with_os` on `RAW_WTF8_STRING` against progressively longer prefixes.
+ test(true, b"");
+ test(true, b"f");
+ test(true, b"fo");
+ test(true, b"foo");
+ test(true, b"foo\xED\xA0\xBD");
+ if cfg!(not(windows)) { // Prefixes that stop partway through the four-byte `\xF0\x9F\x92\xA9` sequence are only tried off Windows.
+ test(true, b"foo\xED\xA0\xBD\xF0");
+ test(true, b"foo\xED\xA0\xBD\xF0\x9F");
+ test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92");
+ }
+ test(cfg!(windows), b"foo\xED\xA0\xBD\xED\xA0\xBD"); // `\xED\xA0\xBD` is the three-byte form of surrogate U+D83D; the doubled form is expected to match only on Windows.
+ test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9");
+ test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9b");
+ test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9ba");
+ test(true, b"foo\xED\xA0\xBD\xF0\x9F\x92\xA9bar");
+
+ test(false, b"foo\xED\xB2\xA9"); // Negative cases: these must not be reported as prefixes on any platform.
+ test(false, b"fof\xED\xA0\xBD\xED\xA0\xBD");
+
+ fn test(result: bool, prefix: &[u8]) { // Panics via `unwrap` if `prefix` fails validation in `from_raw_bytes`.
+ let prefix = from_raw_bytes(prefix).unwrap();
+ assert_eq!(result, RAW_WTF8_STRING.starts_with_os(prefix));
+ }
+}
+
+#[test]
+fn test_empty_starts_with() { // `starts_with_os` on an empty string: true only for the empty prefix.
+ macro_rules! test { // Asserts `from_str($string).starts_with_os(from_str($substring)) == $result`.
+ ( $result:expr , $string:expr , $substring:expr ) => {
+ #[allow(clippy::bool_assert_comparison)]
+ {
+ assert_eq!(
+ $result,
+ RawOsStr::from_str($string)
+ .starts_with_os(RawOsStr::from_str($substring)),
+ );
+ }
+ };
+ }
+ test!(true, "", "");
+ test!(false, "", "f");
+ test!(false, "", "fo");
+}