Skip to content

Commit 90f72bf

Browse files
authored
Merge pull request #116 from mxsm/codex/pr-001-utf8-invariants
Feat: Harden string invariants, serde semantics, Cargo features, and release automation
2 parents 21aa0bf + 08189cd commit 90f72bf

9 files changed

Lines changed: 277 additions & 81 deletions

File tree

.github/workflows/release.yml

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
name: Release
2+
3+
on:
4+
push:
5+
tags:
6+
- "v*.*.*"
7+
workflow_dispatch:
8+
inputs:
9+
version:
10+
description: "Version to release, for example 1.1.0"
11+
required: true
12+
type: string
13+
14+
permissions:
15+
contents: write
16+
17+
env:
18+
CARGO_TERM_COLOR: always
19+
RUST_BACKTRACE: full
20+
21+
jobs:
22+
publish:
23+
runs-on: ubuntu-latest
24+
25+
steps:
26+
- name: Checkout code
27+
uses: actions/checkout@v4
28+
with:
29+
fetch-depth: 0
30+
31+
- name: Set up Rust
32+
uses: dtolnay/rust-toolchain@stable
33+
with:
34+
toolchain: stable
35+
36+
- name: Resolve release version
  id: version
  shell: bash
  env:
    # Hand workflow expressions to the script through the environment instead
    # of interpolating them into the script text. `${{ }}` is expanded before
    # bash parses the script, so an inline `inputs.version` (free-form user
    # input) could inject shell commands before the format check below runs.
    EVENT_NAME: ${{ github.event_name }}
    INPUT_VERSION: ${{ inputs.version }}
  run: |
    # Manual runs take the version from the dispatch input; tag pushes derive
    # it from the tag name with the leading "v" stripped.
    if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
      VERSION="$INPUT_VERSION"
    else
      VERSION="${GITHUB_REF_NAME#v}"
    fi

    # Accept only MAJOR.MINOR.PATCH with an optional pre-release/build suffix.
    if [[ ! "$VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+([+-][0-9A-Za-z.-]+)?$ ]]; then
      echo "Invalid release version: $VERSION" >&2
      exit 1
    fi

    # Refuse to release when the requested version differs from the version
    # reported by `cargo metadata` for the first package in the manifest.
    MANIFEST_VERSION="$(cargo metadata --no-deps --format-version 1 | python3 -c 'import json, sys; print(json.load(sys.stdin)["packages"][0]["version"])')"
    if [[ "$MANIFEST_VERSION" != "$VERSION" ]]; then
      echo "Cargo.toml version $MANIFEST_VERSION does not match release version $VERSION" >&2
      exit 1
    fi

    # Publish the validated values as step outputs for the later steps.
    echo "version=$VERSION" >> "$GITHUB_OUTPUT"
    echo "tag=v$VERSION" >> "$GITHUB_OUTPUT"
59+
60+
# Manual (workflow_dispatch) runs only: make sure the release tag exists and
# points at the checked-out commit. An existing tag on a different commit
# aborts the job; a missing tag is created and pushed to origin.
# NOTE(review): this step pushes the tag before the formatting, lint, test and
# packaging steps that follow it, so a failure in any of those would leave a
# pushed tag with no published release — consider moving it to just before the
# GitHub release step; confirm against the intended release process.
- name: Create tag for manual release
61+
if: github.event_name == 'workflow_dispatch'
62+
shell: bash
63+
run: |
64+
TAG="${{ steps.version.outputs.tag }}"
65+
CURRENT_SHA="$(git rev-parse HEAD)"
66+
67+
if git rev-parse "$TAG" >/dev/null 2>&1; then
68+
TAG_SHA="$(git rev-list -n 1 "$TAG")"
69+
if [[ "$TAG_SHA" != "$CURRENT_SHA" ]]; then
70+
echo "Tag $TAG already exists at $TAG_SHA, not current HEAD $CURRENT_SHA" >&2
71+
exit 1
72+
fi
73+
else
74+
git config user.name "github-actions[bot]"
75+
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
76+
git tag "$TAG"
77+
git push origin "$TAG"
78+
fi
79+
80+
- name: Check formatting
81+
run: cargo fmt -- --check
82+
83+
- name: Lint
84+
run: cargo clippy --all-features -- -D warnings
85+
86+
- name: Test all features
87+
run: cargo test --all-features
88+
89+
- name: Test no-default feature matrix
90+
run: cargo test --no-default-features --features serde,bytes,simd
91+
92+
- name: Package crate
93+
run: cargo package
94+
95+
- name: Publish crate to crates.io
96+
run: cargo publish --token "$CARGO_REGISTRY_TOKEN"
97+
env:
98+
CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
99+
100+
- name: Create GitHub release
101+
env:
102+
GH_TOKEN: ${{ github.token }}
103+
TAG: ${{ steps.version.outputs.tag }}
104+
VERSION: ${{ steps.version.outputs.version }}
105+
run: |
106+
gh release create "$TAG" \
107+
"target/package/cheetah-string-$VERSION.crate#cheetah-string-$VERSION.crate" \
108+
--verify-tag \
109+
--title "cheetah-string $TAG" \
110+
--generate-notes

Cargo.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "cheetah-string"
3-
version = "1.0.1"
3+
version = "1.1.0"
44
authors = ["mxsm <mxsm@apache.org>"]
55
edition = "2021"
66
homepage = "https://github.com/mxsm/cheetah-string"
@@ -15,14 +15,14 @@ A lightweight, high-performance string manipulation library optimized for speed-
1515
"""
1616

1717
[dependencies]
18-
bytes = "1.10.0"
18+
bytes = { version = "1.10.0", optional = true, default-features = false }
1919
serde = { version = "1.0", optional = true, default-features = false, features = ["alloc"] }
2020

2121
[features]
2222
default = ["std"]
2323
std = []
24-
serde = ["serde/alloc"]
25-
bytes = []
24+
serde = ["dep:serde", "serde/alloc"]
25+
bytes = ["dep:bytes"]
2626
simd = []
2727

2828
[dev-dependencies]

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,14 @@ Add this to your `Cargo.toml`:
4545

4646
```toml
4747
[dependencies]
48-
cheetah-string = "1.0.0"
48+
cheetah-string = "1.1.0"
4949
```
5050

5151
### Optional Features
5252

5353
```toml
5454
[dependencies]
55-
cheetah-string = { version = "1.0.0", features = ["bytes", "serde", "simd"] }
55+
cheetah-string = { version = "1.1.0", features = ["bytes", "serde", "simd"] }
5656
```
5757

5858
Available features:

benches/comprehensive.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -362,13 +362,13 @@ fn bench_internal_hot_paths(c: &mut Criterion) {
362362
});
363363

364364
let long_bytes = vec![b'a'; 256];
365-
group.bench_function("CheetahString::from(Vec<u8> 256B)", |b| {
366-
b.iter(|| black_box(CheetahString::from(long_bytes.clone())))
365+
group.bench_function("CheetahString::try_from_vec(256B)", |b| {
366+
b.iter(|| black_box(CheetahString::try_from_vec(long_bytes.clone()).unwrap()))
367367
});
368368

369-
group.bench_function("String::from(CheetahString::from(Vec<u8> 256B))", |b| {
369+
group.bench_function("String::from(CheetahString::try_from_vec(256B))", |b| {
370370
b.iter(|| {
371-
let value = CheetahString::from(long_bytes.clone());
371+
let value = CheetahString::try_from_vec(long_bytes.clone()).unwrap();
372372
black_box(String::from(value))
373373
})
374374
});

src/cheetah_string.rs

Lines changed: 113 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -47,19 +47,12 @@ impl<'a> From<&'a str> for CheetahString {
4747
}
4848
}
4949

50-
/// # Safety Warning
51-
///
52-
/// This implementation uses `unsafe` code and may cause undefined behavior
53-
/// if the bytes are not valid UTF-8. Consider using `CheetahString::try_from_bytes()`
54-
/// for safe UTF-8 validation.
55-
///
56-
/// This implementation will be deprecated in a future version.
57-
impl From<&[u8]> for CheetahString {
50+
impl<'a> TryFrom<&'a [u8]> for CheetahString {
51+
type Error = Utf8Error;
52+
5853
#[inline]
59-
fn from(b: &[u8]) -> Self {
60-
// SAFETY: This is unsafe and may cause UB if bytes are not valid UTF-8.
61-
// This will be deprecated in favor of try_from_bytes in the next version.
62-
CheetahString::from_slice(unsafe { str::from_utf8_unchecked(b) })
54+
fn try_from(b: &'a [u8]) -> Result<Self, Self::Error> {
55+
CheetahString::try_from_bytes(b)
6356
}
6457
}
6558

@@ -71,19 +64,12 @@ impl FromStr for CheetahString {
7164
}
7265
}
7366

74-
/// # Safety Warning
75-
///
76-
/// This implementation uses `unsafe` code and may cause undefined behavior
77-
/// if the bytes are not valid UTF-8. Consider using `CheetahString::try_from_vec()`
78-
/// for safe UTF-8 validation.
79-
///
80-
/// This implementation will be deprecated in a future version.
81-
impl From<Vec<u8>> for CheetahString {
67+
impl TryFrom<Vec<u8>> for CheetahString {
68+
type Error = Utf8Error;
69+
8270
#[inline]
83-
fn from(v: Vec<u8>) -> Self {
84-
// SAFETY: This constructor does not validate UTF-8 and may cause UB
85-
// if the bytes are later observed as a string.
86-
CheetahString::from_vec(v)
71+
fn try_from(v: Vec<u8>) -> Result<Self, Self::Error> {
72+
CheetahString::try_from_vec(v)
8773
}
8874
}
8975

@@ -159,10 +145,12 @@ impl<'a> FromIterator<&'a String> for CheetahString {
159145
}
160146

161147
#[cfg(feature = "bytes")]
162-
impl From<bytes::Bytes> for CheetahString {
148+
impl TryFrom<bytes::Bytes> for CheetahString {
149+
type Error = Utf8Error;
150+
163151
#[inline]
164-
fn from(b: bytes::Bytes) -> Self {
165-
CheetahString::from_bytes(b)
152+
fn try_from(b: bytes::Bytes) -> Result<Self, Self::Error> {
153+
CheetahString::try_from_bytes_buf(b)
166154
}
167155
}
168156

@@ -277,8 +265,29 @@ impl CheetahString {
277265
}
278266
}
279267

280-
#[inline]
268+
/// Creates a `CheetahString` from a byte vector, validating it as UTF-8.
///
/// This constructor delegates to `try_from_vec` and unwraps the result, so
/// it keeps the infallible signature while no longer accepting invalid
/// bytes. Prefer `try_from_vec` when the input may be invalid, or
/// `from_utf8_unchecked_vec` when the caller can guarantee validity.
///
/// # Panics
///
/// Panics if `s` is not valid UTF-8.
#[deprecated(
269+
since = "1.1.0",
270+
note = "use try_from_vec for checked construction or from_utf8_unchecked_vec for an explicit unsafe constructor"
271+
)]
281272
pub fn from_vec(s: Vec<u8>) -> Self {
273+
CheetahString::try_from_vec(s).expect(
274+
"CheetahString::from_vec requires valid UTF-8; use try_from_vec for fallible construction",
275+
)
276+
}
277+
278+
/// Creates a `CheetahString` from a byte vector without validating UTF-8.
279+
///
280+
/// # Safety
281+
///
282+
/// The caller must guarantee that `s` contains valid UTF-8 for the entire
283+
/// lifetime of the returned `CheetahString`.
284+
#[inline]
285+
pub unsafe fn from_utf8_unchecked_vec(s: Vec<u8>) -> Self {
286+
CheetahString::from_validated_vec_unchecked(s)
287+
}
288+
289+
#[inline]
290+
fn from_validated_vec_unchecked(s: Vec<u8>) -> Self {
282291
if s.len() <= INLINE_CAPACITY {
283292
let mut data = [0u8; INLINE_CAPACITY];
284293
data[..s.len()].copy_from_slice(&s);
@@ -314,9 +323,8 @@ impl CheetahString {
314323
/// assert!(CheetahString::try_from_vec(invalid).is_err());
315324
/// ```
316325
pub fn try_from_vec(v: Vec<u8>) -> Result<Self, Utf8Error> {
317-
// Validate UTF-8
318326
str::from_utf8(&v)?;
319-
Ok(CheetahString::from_vec(v))
327+
Ok(CheetahString::from_validated_vec_unchecked(v))
320328
}
321329

322330
/// Creates a `CheetahString` from a byte slice with UTF-8 validation.
@@ -342,8 +350,51 @@ impl CheetahString {
342350
Ok(CheetahString::from_slice(s))
343351
}
344352

353+
/// Creates a `CheetahString` from a byte slice without validating UTF-8.
354+
///
355+
/// # Safety
356+
///
357+
/// The caller must guarantee that `b` contains valid UTF-8.
358+
#[inline]
359+
pub unsafe fn from_utf8_unchecked_bytes(b: &[u8]) -> Self {
360+
// SAFETY: The caller guarantees that `b` contains valid UTF-8.
361+
CheetahString::from_slice(unsafe { str::from_utf8_unchecked(b) })
362+
}
363+
364+
/// Creates a `CheetahString` from a shared byte vector with UTF-8 validation.
365+
///
366+
/// # Errors
367+
///
368+
/// Returns an error if the bytes are not valid UTF-8.
369+
#[inline]
370+
pub fn try_from_arc_vec(s: Arc<Vec<u8>>) -> Result<Self, Utf8Error> {
371+
str::from_utf8(s.as_slice())?;
372+
Ok(CheetahString::from_validated_arc_vec_unchecked(s))
373+
}
374+
375+
/// Creates a `CheetahString` from a shared byte vector, validating it as UTF-8.
///
/// This constructor delegates to `try_from_arc_vec` and unwraps the result.
/// Prefer `try_from_arc_vec` when the input may be invalid, or
/// `from_utf8_unchecked_arc_vec` when the caller can guarantee validity.
///
/// # Panics
///
/// Panics if the bytes in `s` are not valid UTF-8.
#[deprecated(
376+
since = "1.1.0",
377+
note = "use try_from_arc_vec for checked construction or from_utf8_unchecked_arc_vec for an explicit unsafe constructor"
378+
)]
345379
#[inline]
346380
pub fn from_arc_vec(s: Arc<Vec<u8>>) -> Self {
381+
CheetahString::try_from_arc_vec(s).expect(
382+
"CheetahString::from_arc_vec requires valid UTF-8; use try_from_arc_vec for fallible construction",
383+
)
384+
}
385+
386+
/// Creates a `CheetahString` from a shared byte vector without validating UTF-8.
387+
///
388+
/// # Safety
389+
///
390+
/// The caller must guarantee that `s` contains valid UTF-8.
391+
#[inline]
392+
pub unsafe fn from_utf8_unchecked_arc_vec(s: Arc<Vec<u8>>) -> Self {
393+
CheetahString::from_validated_arc_vec_unchecked(s)
394+
}
395+
396+
#[inline]
397+
fn from_validated_arc_vec_unchecked(s: Arc<Vec<u8>>) -> Self {
347398
CheetahString {
348399
inner: InnerString::ArcVecString(s),
349400
}
@@ -418,7 +469,37 @@ impl CheetahString {
418469

419470
/// Creates a `CheetahString` from `bytes::Bytes`, validating it as UTF-8.
///
/// Only available with the `bytes` feature. This constructor delegates to
/// `try_from_bytes_buf` and unwraps the result. Prefer `try_from_bytes_buf`
/// when the input may be invalid, or `from_utf8_unchecked_bytes_buf` when
/// the caller can guarantee validity.
///
/// # Panics
///
/// Panics if `b` is not valid UTF-8.
#[inline]
420471
#[cfg(feature = "bytes")]
472+
#[deprecated(
473+
since = "1.1.0",
474+
note = "use try_from_bytes_buf for checked construction or from_utf8_unchecked_bytes_buf for an explicit unsafe constructor"
475+
)]
421476
pub fn from_bytes(b: bytes::Bytes) -> Self {
477+
CheetahString::try_from_bytes_buf(b).expect(
478+
"CheetahString::from_bytes requires valid UTF-8; use try_from_bytes_buf for fallible construction",
479+
)
480+
}
481+
482+
/// Creates a `CheetahString` from `bytes::Bytes` with UTF-8 validation.
///
/// Only available with the `bytes` feature. The buffer is checked with
/// `str::from_utf8` and, when valid, is handed to the internal
/// `from_validated_bytes_unchecked` constructor.
///
/// # Errors
///
/// Returns a `Utf8Error` if the bytes are not valid UTF-8.
#[inline]
483+
#[cfg(feature = "bytes")]
484+
pub fn try_from_bytes_buf(b: bytes::Bytes) -> Result<Self, Utf8Error> {
485+
str::from_utf8(b.as_ref())?;
486+
Ok(CheetahString::from_validated_bytes_unchecked(b))
487+
}
488+
489+
/// Creates a `CheetahString` from `bytes::Bytes` without validating UTF-8.
490+
///
491+
/// # Safety
492+
///
493+
/// The caller must guarantee that `b` contains valid UTF-8.
494+
#[inline]
495+
#[cfg(feature = "bytes")]
496+
pub unsafe fn from_utf8_unchecked_bytes_buf(b: bytes::Bytes) -> Self {
497+
CheetahString::from_validated_bytes_unchecked(b)
498+
}
499+
500+
#[inline]
501+
#[cfg(feature = "bytes")]
502+
fn from_validated_bytes_unchecked(b: bytes::Bytes) -> Self {
422503
CheetahString {
423504
inner: InnerString::Bytes(b),
424505
}
@@ -1455,6 +1536,7 @@ impl<'a> DoubleEndedIterator for SplitWrapper<'a> {
14551536
#[cfg(test)]
14561537
mod tests {
14571538
use super::*;
1539+
use alloc::{format, vec};
14581540

14591541
#[test]
14601542
fn with_capacity_above_inline_uses_heap_storage() {
@@ -1524,7 +1606,7 @@ mod tests {
15241606
#[test]
15251607
fn long_vec_conversion_uses_arc_vec_storage() {
15261608
let value = "a".repeat(INLINE_CAPACITY + 1).into_bytes();
1527-
let s = CheetahString::from(value);
1609+
let s = CheetahString::try_from_vec(value).expect("valid utf-8");
15281610

15291611
match &s.inner {
15301612
InnerString::ArcVecString(inner) => {

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
//! To enable SIMD acceleration:
2222
//! ```toml
2323
//! [dependencies]
24-
//! cheetah-string = { version = "1.0.0", features = ["simd"] }
24+
//! cheetah-string = { version = "1.1.0", features = ["simd"] }
2525
//! ```
2626
//!
2727
//! # Examples

0 commit comments

Comments
 (0)