From a2820465ed36cf3458df21b605ada89a5d269c8e Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Fri, 28 Nov 2025 18:12:22 +0100 Subject: [PATCH 01/14] Checkpoint --- .gitattributes | 1 + Cargo.lock | 182 +++++++++++++- Cargo.toml | 9 + src/detection.rs | 175 ++++++++++++++ src/detection/cache.bin.zstd | 3 + src/detection/cache.rs | 367 ++++++++++++++++++++++++++++ src/detection/detect.rs | 141 +++++++++++ src/detection/inline-cache.rs | 9 + src/detection/license.rs | 443 ++++++++++++++++++++++++++++++++++ src/detection/ngram.rs | 171 +++++++++++++ src/detection/preproc.rs | 440 +++++++++++++++++++++++++++++++++ src/lib.rs | 5 + update/Cargo.toml | 1 + update/src/main.rs | 72 ++++++ 14 files changed, 2017 insertions(+), 2 deletions(-) create mode 100644 src/detection.rs create mode 100644 src/detection/cache.bin.zstd create mode 100644 src/detection/cache.rs create mode 100644 src/detection/detect.rs create mode 100644 src/detection/inline-cache.rs create mode 100644 src/detection/license.rs create mode 100644 src/detection/ngram.rs create mode 100644 src/detection/preproc.rs diff --git a/.gitattributes b/.gitattributes index d773688..782bf9c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,2 @@ src/text/** linguist-vendored +src/detection/cache.bin.zstd filter=lfs diff=lfs merge=lfs -text diff --git a/Cargo.lock b/Cargo.lock index 647bf30..94f0a7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + [[package]] name = "bstr" version = "1.11.0" @@ -13,6 +22,18 @@ dependencies = [ "serde", ] +[[package]] +name = "cc" +version = "1.2.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c481bdbf0ed3b892f6f806287d72acd515b352a4ec27a208489b8c1bc839633a" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + [[package]] name = "console" version = "0.15.8" @@ -25,12 +46,58 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "encode_unicode" version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +[[package]] +name = "find-msvc-tools" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" + +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -49,6 +116,12 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + [[package]] name = "proc-macro2" version = "1.0.89" @@ -67,11 +140,54 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + [[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.8" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "serde" @@ -93,6 +209,12 @@ dependencies = [ "syn", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "similar" version = "2.6.0" @@ -123,8 +245,12 @@ checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" name = "spdx" version = "0.12.0" dependencies = [ + "rayon", + "regex", "similar-asserts", "smallvec", + "unicode-normalization", + "zstd", ] [[package]] @@ -138,12 +264,36 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "unicode-ident" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-segmentation" version = "1.12.0" @@ -222,3 +372,31 @@ name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 
+ +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index 659f306..d3ae4fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,10 +26,19 @@ include = [ [features] # Includes the full canonical text of each license text = [] +# Allows analysis of text to determine if it might be an SPDX license text +detection = ["regex", "unicode-normalization"] +detection-cache = ["detection", "zstd"] +detection-inline-cache = ["detection-cache"] +detection-parallel = ["detection", "rayon"] [dependencies] +rayon = { version = "1.11", optional = true } +regex = { version = "1.12", optional = true } # In most cases expressions are quite small so we can avoid heap allocations smallvec = "1.15" +unicode-normalization = { version = "0.1", optional = true } +zstd = { version = "0.13", optional = true } [dev-dependencies] # Used to print colored diffs in case of test failures diff --git a/src/detection.rs b/src/detection.rs new file mode 100644 index 0000000..fe34e9a --- /dev/null +++ b/src/detection.rs @@ -0,0 +1,175 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! This module is basically an inlining of [askalono](https://github.com/jpeddicord/askalono) +//! +//! 
Askalono is not really maintained and also depends on other unmaintained +//! crates, since this crate is used by both cargo-deny and cargo-about in +//! conjunction with askalono for checking licenses, I'm pulling it directly into +//! this crate just to avoid all of the external dependencies + +use std::collections::HashMap; + +#[cfg(feature = "detection-cache")] +mod cache; +#[cfg(feature = "detection-inline-cache")] +mod inline_cache; +mod detect; +mod license; +pub use license::{LicenseType, TextData}; +mod ngram; +mod preproc; + +pub struct LicenseEntry { + pub original: TextData, + pub aliases: Vec, + pub headers: Vec, + pub alternates: Vec, +} + +impl LicenseEntry { + pub fn new(original: TextData) -> Self { + Self { + original, + aliases: Vec::new(), + alternates: Vec::new(), + headers: Vec::new(), + } + } +} + +/// A representation of a collection of known licenses. +/// +/// This struct is generally what you want to start with if you're looking to +/// match text against a database of licenses. Load a cache from disk using +/// `from_cache`, then use the `analyze` function to determine what a text most +/// closely matches. +#[derive(Default)] +pub struct Store { + pub(crate) licenses: HashMap, +} + +impl Store { + /// Create a new `Store`. + /// + /// More often, you probably want to use `from_cache` instead of creating + /// an empty store. + pub fn new() -> Self { + Self { + licenses: HashMap::new(), + } + } + + /// Get the number of licenses in the store. + /// + /// This only counts licenses by name -- headers, aliases, and alternates + /// aren't included in the count. + #[inline] + pub fn len(&self) -> usize { + self.licenses.len() + } + + /// Check if the store is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.licenses.is_empty() + } + + /// Get all licenses by name via iterator. + #[inline] + pub fn licenses(&self) -> impl Iterator { + self.licenses.keys() + } + + /// Get a license's standard `TextData` by name. 
+ #[inline] + pub fn get_original(&self, name: &str) -> Option<&TextData> { + self.licenses.get(name).map(|le| &le.original) + } + + /// Add a single license to the store. + /// + /// If the license with the given name already existed, it and all of its + /// variants will be replaced. + #[inline] + pub fn add_license(&mut self, name: String, data: TextData) { + let entry = LicenseEntry::new(data); + self.licenses.insert(name, entry); + } + + /// Inserts a full `LicenseEntry` + #[inline] + pub fn insert_entry(&mut self, name: String, entry: LicenseEntry) { + self.licenses.insert(name, entry); + } + + /// Add a variant (a header or alternate formatting) of a given license to + /// the store. + /// + /// The license must already exist. This function cannot be used to replace + /// the original/canonical text of the license. + #[inline] + pub fn add_variant( + &mut self, + name: &str, + variant: LicenseType, + data: TextData, + ) -> Result<(), StoreError> { + let entry = self + .licenses + .get_mut(name) + .ok_or(StoreError::UnknownLicense)?; + + match variant { + LicenseType::Alternate => { + entry.alternates.push(data); + } + LicenseType::Header => { + entry.headers.push(data); + } + LicenseType::Original => { + return Err(StoreError::OriginalInvalidForVariant); + } + } + + Ok(()) + } + + /// Get the list of aliases for a given license. + #[inline] + pub fn aliases(&self, name: &str) -> Option<&Vec> { + self + .licenses + .get(name).map(|le| &le.aliases) + } + + /// Set the list of aliases for a given license. 
+ #[inline] + pub fn set_aliases(&mut self, name: &str, aliases: Vec) -> Result<(), StoreError> { + let entry = self + .licenses + .get_mut(name) + .ok_or(StoreError::UnknownLicense)?; + entry.aliases = aliases; + Ok(()) + } +} + +#[derive(Copy, Clone, PartialEq, Debug)] +pub enum StoreError { + /// The license name was not in the Store + UnknownLicense, + /// Attempted to call `Store::add_variant` with `LicenseType::Original` + OriginalInvalidForVariant, +} + +impl std::fmt::Display for StoreError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::UnknownLicense => f.write_str("specified license did not exist in the store"), + Self::OriginalInvalidForVariant => f.write_str("attempted to add an original license text as a variant"), + } + } +} + +impl std::error::Error for StoreError {} \ No newline at end of file diff --git a/src/detection/cache.bin.zstd b/src/detection/cache.bin.zstd new file mode 100644 index 0000000..7701a05 --- /dev/null +++ b/src/detection/cache.bin.zstd @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:614c37fc12984b7f6a279fda327d69baace94f6cddd9c9ea30d081e95521c3ed +size 2020167 diff --git a/src/detection/cache.rs b/src/detection/cache.rs new file mode 100644 index 0000000..89d3e3a --- /dev/null +++ b/src/detection/cache.rs @@ -0,0 +1,367 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use crate::detection::{Store, LicenseEntry, license::TextData, ngram::NgramSet}; +use std::io; + +const CACHE_VERSION: &str = "spdx-crate-01"; + +#[derive(Debug)] +pub enum CacheError { + Io(io::Error), + InvalidVersion { + actual: String, + expected: &'static str, + }, + Proto(ProtoError), +} + +impl std::fmt::Display for CacheError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Io(io) => write!(f, "{io}"), + Self::Proto(p) => write!(f, "{p}"), + Self::InvalidVersion { actual, expected } => { + write!(f, "expected version {expected}, but got version {actual}") + } + } + } +} + +impl std::error::Error for CacheError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Io(io) => Some(io), + Self::Proto(p) => Some(p), + Self::InvalidVersion { .. } => None, + } + } +} + +impl From for CacheError { + fn from(b: BinErr) -> Self { + match b { + BinErr::Io(i) => Self::Io(i), + BinErr::Proto(p) => Self::Proto(p), + } + } +} + +impl From for CacheError { + fn from(e: io::Error) -> Self { + Self::Io(e) + } +} + +impl Store { + /// Create a store from a cache file. + /// + /// This method is highly useful for quickly loading a cache, as creating + /// one from text data is rather slow. This method can typically load + /// the full SPDX set from disk in 200-300 ms. The cache will be + /// sanity-checked to ensure it was generated with a similar version of + /// askalono. + pub fn from_cache(mut readable: R) -> Result + where + R: io::Read + Sized, + { + let mut header = [0u8; 13]; + readable.read_exact(&mut header)?; + + if header != CACHE_VERSION.as_bytes() { + return Err(CacheError::InvalidVersion { + actual: String::from_utf8_lossy(&header).into_owned(), + expected: CACHE_VERSION + }); + } + + let mut dec = zstd::Decoder::new(readable)?; + Ok(Self::bread(&mut dec)?) + } + + /// Serialize the current store. 
+ pub fn to_cache(&self, mut writable: W) -> Result<(), CacheError> + where + W: io::Write + Sized, + { + writable.write_all(CACHE_VERSION.as_bytes())?; + + let mut enc = zstd::Encoder::new(writable, 21)?; + self.bwrite(&mut enc)?; + enc.finish()?; + + Ok(()) + } +} + +#[derive(Debug)] +pub enum ProtoError { + TooLong(usize), + Utf8(std::string::FromUtf8Error), +} + +impl std::fmt::Display for ProtoError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::TooLong(tl) => write!(f, "{tl:016x} is too large to fit in a u16"), + Self::Utf8(u) => write!(f, "{u}"), + } + } +} + +impl std::error::Error for ProtoError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + if let Self::Utf8(u) = self { + Some(u) + } else { + None + } + } +} + +enum BinErr { + Io(io::Error), + Proto(ProtoError), +} + +impl From for BinErr { + fn from(e: io::Error) -> Self { + Self::Io(e) + } +} + +impl From for BinErr { + fn from(e: ProtoError) -> Self { + Self::Proto(e) + } +} + +#[inline] +fn write_u16(u: usize, w: &mut W) -> Result<(), BinErr> +where W: io::Write + Sized +{ + let u: u16 = u.try_into().map_err(|_e| ProtoError::TooLong(u))?; + w.write_all(&u.to_le_bytes()).map_err(BinErr::Io) +} + +#[inline] +fn read_u16(r: &mut R) -> Result +where R: io::Read + Sized +{ + let mut u = [0u8; 2]; + r.read_exact(&mut u)?; + Ok(u16::from_le_bytes(u) as usize) +} + +#[inline] +fn write_u64(u: usize, w: &mut W) -> Result<(), BinErr> +where W: io::Write + Sized +{ + w.write_all(&(u as u64).to_le_bytes()).map_err(BinErr::Io) +} + +#[inline] +fn read_u64(r: &mut R) -> Result +where R: io::Read + Sized +{ + let mut b = [0u8; 8]; + r.read_exact(&mut b)?; + Ok(u64::from_le_bytes(b) as usize) +} + +impl Bin for String { + fn bwrite(&self, w: &mut W) -> Result<(), BinErr> + where W: io::Write + Sized + { + write_u16(self.len(), w)?; + w.write_all(self.as_bytes()).map_err(BinErr::Io) + } + + fn bread(r: &mut R) -> Result + where R: io::Read + 
Sized + { + let mut len = read_u16(r)?; + let mut pos = 0; + let mut s = vec![0; len]; + + while len > 0 { + let read = r.read(&mut s[pos..])?; + pos += read; + len -= read; + } + + Ok(String::from_utf8(s).map_err(ProtoError::Utf8)?) + } +} + +#[inline] +fn write_vec(v: &[B], w: &mut W) -> Result<(), BinErr> + where W: io::Write + Sized, + B: Bin, +{ + write_u16(v.len(), w)?; + + for b in v { + b.bwrite(w)?; + } + + Ok(()) +} + +#[inline] +fn read_vec(r: &mut R) -> Result, BinErr> + where R: io::Read + Sized, + B: Bin, +{ + let len = read_u16(r)?; + + let mut v = Vec::with_capacity(len); + + for _ in 0..len { + v.push(B::bread(r)?); + } + + Ok(v) +} + +trait Bin: Sized { + fn bwrite(&self, w: &mut W) -> Result<(), BinErr> + where W: io::Write + Sized; + fn bread(r: &mut R) -> Result + where R: io::Read + Sized; +} + +impl Bin for Store { + fn bwrite(&self, w: &mut W) -> Result<(), BinErr> + where W: io::Write + Sized + { + write_u16(self.licenses.len(), w)?; + + for (k, v) in &self.licenses { + k.bwrite(w)?; + v.bwrite(w)?; + } + + Ok(()) + } + + fn bread(r: &mut R) -> Result + where R: io::Read + Sized + { + let map_count = read_u16(r)?; + + let mut licenses = std::collections::HashMap::new(); + + for _ in 0..map_count { + let key = String::bread(r)?; + let value = LicenseEntry::bread(r)?; + + licenses.insert(key, value); + } + + Ok(Self { licenses }) + } +} + +impl Bin for LicenseEntry { + fn bwrite(&self, w: &mut W) -> Result<(), BinErr> + where W: io::Write + Sized + { + self.original.bwrite(w)?; + write_vec(&self.aliases, w)?; + write_vec(&self.headers, w)?; + write_vec(&self.alternates, w)?; + + Ok(()) + } + + fn bread(r: &mut R) -> Result + where R: io::Read + Sized + { + Ok(Self { + original: TextData::bread(r)?, + aliases: read_vec(r)?, + headers: read_vec(r)?, + alternates: read_vec(r)?, + }) + } +} + +impl Bin for TextData { + fn bwrite(&self, w: &mut W) -> Result<(), BinErr> + where W: io::Write + Sized + { + self.match_data.bwrite(w)?; + 
write_u64(self.lines_view.0, w)?; + write_u64(self.lines_view.1, w)?; + write_vec(&self.lines_normalized, w)?; + self.text_processed.bwrite(w)?; + + Ok(()) + } + + fn bread(r: &mut R) -> Result + where R: io::Read + Sized + { + Ok(Self { + match_data: NgramSet::bread(r)?, + lines_view: (read_u64(r)?, read_u64(r)?), + lines_normalized: read_vec(r)?, + text_processed: String::bread(r)?, + }) + } +} + +impl Bin for u32 { + fn bwrite(&self, w: &mut W) -> Result<(), BinErr> + where W: io::Write + Sized + { + w.write_all(&self.to_le_bytes()).map_err(BinErr::Io) + } + + fn bread(r: &mut R) -> Result + where R: io::Read + Sized + { + let mut b = [0;4]; + r.read_exact(&mut b)?; + Ok(u32::from_le_bytes(b)) + } +} + +impl Bin for NgramSet { + fn bwrite(&self, w: &mut W) -> Result<(), BinErr> + where W: io::Write + Sized + { + write_u16(self.map.len(), w)?; + for (k, v) in &self.map { + k.bwrite(w)?; + v.bwrite(w)?; + } + w.write_all(&[self.n])?; + write_u64(self.size, w)?; + + Ok(()) + } + + fn bread(r: &mut R) -> Result + where R: io::Read + Sized + { + let map_len = read_u16(r)?; + let mut map = std::collections::HashMap::new(); + for _ in 0..map_len { + let k = String::bread(r)?; + let v = u32::bread(r)?; + + map.insert(k, v); + } + let mut n = [0; 1]; + r.read_exact(&mut n)?; + let size = read_u64(r)?; + + Ok(Self { + map, + n: n[0], + size, + }) + } +} \ No newline at end of file diff --git a/src/detection/detect.rs b/src/detection/detect.rs new file mode 100644 index 0000000..c934235 --- /dev/null +++ b/src/detection/detect.rs @@ -0,0 +1,141 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::{cmp::Ordering, fmt}; + +use crate::detection::{ + license::LicenseType, + license::TextData, + {LicenseEntry, Store}, +}; + +/// Information about text that was compared against licenses in the store. 
+/// +/// This only contains information about the overall match; to uncover more +/// data you can run methods like `optimize_bounds` on `TextData`. +/// +/// Its lifetime is tied to the lifetime of the `Store` it was generated from. +#[derive(Clone)] +pub struct Match<'a> { + /// Confidence score of the match, ranging from 0 to 1. + pub score: f32, + /// The name of the closest matching license in the `Store`. This will + /// always be something that exists in the store, regardless of the score. + pub name: &'a str, + /// The type of the license that matched. Useful to know if the match was + /// the complete text, a header, or something else. + pub license_type: LicenseType, + /// A reference to the license data that matched inside the `Store`. May be + /// useful for diagnostic purposes or to further optimize the result. + pub data: &'a TextData, +} + +/// A lighter version of Match to be used during analysis. +/// Reduces the need for cloning a bunch of fields. +struct PartialMatch<'a> { + pub name: &'a str, + pub score: f32, + pub license_type: LicenseType, + pub data: &'a TextData, +} + +impl<'a> PartialOrd for PartialMatch<'a> { + fn partial_cmp(&self, other: &PartialMatch<'_>) -> Option { + self.score.partial_cmp(&other.score) + } +} + +impl<'a> PartialEq for PartialMatch<'a> { + fn eq(&self, other: &PartialMatch<'_>) -> bool { + self.score.eq(&other.score) + && self.name == other.name + && self.license_type == other.license_type + } +} + +impl<'a> fmt::Debug for Match<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "Match {{ score: {}, name: {}, license_type: {:?} }}", + self.score, self.name, self.license_type + ) + } +} + +impl Store { + /// Compare the given `TextData` against all licenses in the `Store`. + /// + /// This parallelizes the search as much as it can to find the best match. + /// Once a match is obtained, it can be optimized further; see methods on + /// `TextData` for more information. 
+ pub fn analyze<'a>(&'a self, text: &TextData) -> Match<'a> { + let mut res: Vec>; + + let analyze_fold = + |mut acc: Vec>, (name, data): (&'a String, &'a LicenseEntry)| { + acc.push(PartialMatch { + score: data.original.match_score(text), + name, + license_type: LicenseType::Original, + data: &data.original, + }); + data.alternates.iter().for_each(|alt| { + acc.push(PartialMatch { + score: alt.match_score(text), + name, + license_type: LicenseType::Alternate, + data: alt, + }); + }); + data.headers.iter().for_each(|head| { + acc.push(PartialMatch { + score: head.match_score(text), + name, + license_type: LicenseType::Header, + data: head, + }); + }); + + acc + }; + + // parallel analysis + #[cfg(feature = "detection-parallel")] + { + use rayon::prelude::*; + res = self + .licenses + .par_iter() + .fold(Vec::new, analyze_fold) + .reduce( + Vec::new, + |mut a: Vec>, b: Vec>| { + a.extend(b); + a + }, + ); + res.par_sort_unstable_by(|a, b| b.partial_cmp(a).unwrap()); + } + + // single-threaded analysis + #[cfg(not(feature = "detection-parallel"))] + { + res = self + .licenses + .iter() + // len of licenses isn't strictly correct, but it'll do + .fold(Vec::with_capacity(self.licenses.len()), analyze_fold); + res.sort_unstable_by(|a, b| b.partial_cmp(a).unwrap()); + } + + let m = &res[0]; + + Match { + score: m.score, + name: m.name, + license_type: m.license_type, + data: m.data, + } + } +} diff --git a/src/detection/inline-cache.rs b/src/detection/inline-cache.rs new file mode 100644 index 0000000..1f3e851 --- /dev/null +++ b/src/detection/inline-cache.rs @@ -0,0 +1,9 @@ +const CACHE: &[u8] = include_bytes!("cache.bin.zstd"); + +impl crate::detection::Store { + /// Attempts to load the cached store inlined into this crate's source + #[inline] + pub fn load_inline() -> Result { + Self::from_cache(CACHE) + } +} \ No newline at end of file diff --git a/src/detection/license.rs b/src/detection/license.rs new file mode 100644 index 0000000..4a1ee89 --- /dev/null +++ 
b/src/detection/license.rs @@ -0,0 +1,443 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::{collections::HashMap, fmt}; + +use crate::detection::{ + ngram::NgramSet, + preproc::{apply_aggressive, apply_normalizers}, +}; + +/// The type of a license entry (typically in a `Store`). +#[derive(Clone, Copy, PartialEq, Debug)] +pub enum LicenseType { + /// The canonical text of the license. + Original, + /// A license header. There may be more than one in a `Store`. + Header, + /// An alternate form of a license. This is intended to be used for + /// alternate _formats_ of a license, not for variants where the text has + /// different meaning. Not currently used in askalono's SPDX dataset. + Alternate, +} + +// impl LicenseType { +// #[inline] +// fn do_unpack(s: &str) -> Result { +// let lt = match s { +// "original" => Self::Original, +// "header" => Self::Header, +// "alternate" => Self::Alternate, +// _ => return Err(msgpacker::Error::InvalidEnumVariant), +// }; +// Ok(lt) +// } +// } + +// impl msgpacker::Packable for LicenseType { +// fn pack(&self, buf: &mut T) -> usize +// where +// T: Extend, +// { +// let s = match self { +// Self::Original => "original", +// Self::Header => "header", +// Self::Alternate => "alternate", +// }; + +// s.pack(buf) +// } +// } + +// impl msgpacker::Unpackable for LicenseType { +// type Error = msgpacker::Error; + +// fn unpack(buf: &[u8]) -> Result<(usize, Self), Self::Error> { +// let (read, s) = String::unpack(buf)?; +// let this = Self::do_unpack(&s)?; +// Ok((read, this)) +// } + +// fn unpack_iter(bytes: I) -> Result<(usize, Self), Self::Error> +// where +// I: IntoIterator, +// { +// let (read, s) = String::unpack_iter(bytes)?; +// let this = Self::do_unpack(&s)?; +// Ok((read, this)) +// } +// } + +impl fmt::Display for LicenseType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}", + match *self { + 
LicenseType::Original => "original text", + LicenseType::Header => "license header", + LicenseType::Alternate => "alternate text", + } + ) + } +} + +/// A structure representing compiled text/matching data. +/// +/// This is the key structure used to compare two texts against one another. It +/// handles pre-processing the text to n-grams, scoring, and optimizing the +/// result to try to identify specific details about a match. +/// +/// # Examples +/// +/// Basic scoring of two texts: +/// +/// ``` +/// use askalono::TextData; +/// +/// let license = TextData::from("My First License"); +/// let sample = TextData::from("copyright 20xx me irl\n\n // my first license"); +/// assert_eq!(sample.match_score(&license), 1.0); +/// ``` +/// +/// The above example is a perfect match, as identifiable copyright statements +/// are stripped out during pre-processing. +/// +/// Building on that, `TextData` is able to tell you _where_ in the text a +/// license is located: +/// +/// ``` +/// # use std::error::Error; +/// # use askalono::TextData; +/// # fn main() -> Result<(), Box> { +/// # let license = TextData::from("My First License"); +/// let sample = TextData::from("copyright 20xx me irl\n// My First License\nfn hello() {\n ..."); +/// let (optimized, score) = sample.optimize_bounds(&license); +/// assert_eq!((1, 2), optimized.lines_view()); +/// assert!(score > 0.99f32, "license within text matches"); +/// # Ok(()) +/// # } +/// ``` +#[derive(Clone, Debug)] +pub struct TextData { + pub(crate) match_data: NgramSet, + pub(crate) lines_view: (usize, usize), + pub(crate) lines_normalized: Vec, + pub(crate) text_processed: String, +} + +impl TextData { + /// Create a new `TextData` structure from a string. + /// + /// The given text will be normalized, then smashed down into n-grams for + /// matching. By default, the normalized text is stored inside the + /// structure for future diagnostics. 
This is necessary for optimizing a + /// match and for diffing against other texts. If you don't want this extra + /// data, you can call `without_text` to throw it out. Generally, as a user of + /// this library you want to keep the text data, but askalono will throw it + /// away in its own `Store` as it's not needed. + pub fn new(text: &str) -> Self { + let lines_normalized = apply_normalizers(text); + let normalized_joined = lines_normalized.join("\n"); + let text_processed = apply_aggressive(&normalized_joined); + let match_data = NgramSet::from_str(&text_processed, 2); + + Self { + match_data, + lines_view: (0, lines_normalized.len()), + lines_normalized, + text_processed, + } + } + + /// Consume this `TextData`, returning one without normalized/processed + /// text stored. + /// + /// Unless you know you don't want the text, you probably don't want to use + /// this. Other methods on `TextData` require that text is present. + pub fn without_text(self) -> Self { + Self { + match_data: self.match_data, + lines_view: (0, 0), + lines_normalized: Vec::new(), + text_processed: String::new(), + } + } + + /// Get the bounds of the active line view. + /// + /// This represents the "active" region of lines that matches are generated + /// from. The bounds are a 0-indexed `(start, end)` tuple, with inclusive + /// start and exclusive end indices. See `optimize_bounds`. + /// + /// This is largely for informational purposes; other methods in + /// `TextData`, such as `lines` and `match_score`, will already account for + /// the line range. However, it's useful to call it after running + /// `optimize_bounds` to discover where in the input text the match was located. + pub fn lines_view(&self) -> (usize, usize) { + self.lines_view + } + + /// Clone this `TextData`, creating a copy with the given view. + /// + /// This will re-generate match data for the given view. It's used in + /// `optimize_bounds` to shrink/expand the view of the text to discover + /// bounds. 
+ /// + /// Other methods on `TextData` respect this boundary, so it's not needed + /// outside this struct. Panics if `start..end` is not a valid range into the normalized lines. + pub fn with_view(&self, start: usize, end: usize) -> Self { + let view = &self.lines_normalized[start..end]; + let view_joined = view.join("\n"); + let text_processed = apply_aggressive(&view_joined); + + Self { + match_data: NgramSet::from_str(&text_processed, 2), + lines_view: (start, end), + lines_normalized: self.lines_normalized.clone(), + text_processed, + } + } + + /// "Erase" the current lines in view and restore the view to its original + /// bounds. + /// + /// For example, consider a file with two licenses in it. One was identified + /// (and located) with `optimize_bounds`. Now you want to find the other: + /// white-out the matched lines, and re-run the overall search to find a + /// new high score. + pub fn white_out(&self) -> Self { + // note that we're not using the view here... + let lines = &self.lines_normalized; + + // ...because it's used here to exclude lines + let new_normalized: Vec = lines + .iter() + .enumerate() + .map(|(i, line)| { + if i >= self.lines_view.0 && i < self.lines_view.1 { + "".to_string() + } else { + line.clone() + } + }) + .collect(); + + let text_processed = apply_aggressive(&new_normalized.join("\n")); + Self { + match_data: NgramSet::from_str(&text_processed, 2), + lines_view: (0, new_normalized.len()), + lines_normalized: new_normalized, + text_processed, + } + } + + /// Get a slice of the normalized lines in this `TextData`, limited to the active line view. + pub fn lines(&self) -> &[String] { + &self.lines_normalized[self.lines_view.0..self.lines_view.1] + } + + #[doc(hidden)] + pub fn text_processed(&self) -> &str { + &self.text_processed + } + + /// Compare this `TextData` with another, returning a similarity score. + /// + /// This is what's used during analysis to rank licenses. 
+ pub fn match_score(&self, other: &Self) -> f32 { + self.match_data.dice(&other.match_data) + } + + #[inline] + pub fn ngram_matches(&self, other: &Self) -> bool { + self.match_data.eq(&other.match_data) + } + + /// Attempt to optimize a known match to locate possible line ranges. + /// + /// Returns a new `TextData` struct and a score. The returned struct is a + /// clone of `self`, with its view set to the best match against `other`. + /// + /// This will respect any views set on the `TextData` (an optimized result + /// won't go outside the original view). + /// + /// Note that this won't be 100% optimal if there are blank lines + /// surrounding the actual match, since successive blank lines in a range + /// will likely have the same score. + /// + /// You should check the value of `lines_view` on the returned struct to + /// find the line ranges. + pub fn optimize_bounds(&self, other: &Self) -> (Self, f32) { + let view = self.lines_view; + + // optimize the ending bounds of the text match + let (end_optimized, _) = self.search_optimize( + &|end| self.with_view(view.0, end).match_score(other), + &|end| self.with_view(view.0, end), + ); + let new_end = end_optimized.lines_view.1; + + // then optimize the starting bounds + let (optimized, score) = end_optimized.search_optimize( + &|start| end_optimized.with_view(start, new_end).match_score(other), + &|start| end_optimized.with_view(start, new_end), + ); + (optimized, score) + } + + fn search_optimize( + &self, + score: &dyn Fn(usize) -> f32, + value: &dyn Fn(usize) -> Self, + ) -> (Self, f32) { + // cache score checks, since they're kinda expensive + let mut memo: HashMap = HashMap::new(); + let mut check_score = + |index: usize| -> f32 { *memo.entry(index).or_insert_with(|| score(index)) }; + + fn search(score: &mut dyn FnMut(usize) -> f32, left: usize, right: usize) -> (usize, f32) { + if right - left <= 3 { + // find the index of the highest score in the remaining items + return (left..=right) + .map(|x| (x, 
score(x))) + .fold((0usize, 0f32), |acc, x| if x.1 >= acc.1 { x } else { acc }); + } + + let low = (left * 2 + right) / 3; + let high = (left + right * 2) / 3; + let score_low = score(low); + let score_high = score(high); + + if score_low > score_high { + search(score, left, high - 1) + } else { + search(score, low + 1, right) + } + } + + let optimal = search(&mut check_score, self.lines_view.0, self.lines_view.1); + (value(optimal.0), optimal.1) + } +} + +impl<'a> From<&'a str> for TextData { + fn from(text: &'a str) -> Self { + Self::new(text) + } +} + +impl From for TextData { + fn from(text: String) -> Self { + Self::new(&text) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // psst: + // cargo test -- --nocapture + + #[test] + fn optimize_bounds() { + let license_text = "this is a license text\nor it pretends to be one\nit's just a test"; + let sample_text = "this is a license text\nor it pretends to be one\nit's just a test\nwords\n\nhere is some\ncode\nhello();\n\n//a comment too"; + let license = TextData::from(license_text).without_text(); + let sample = TextData::from(sample_text); + + let (optimized, _) = sample.optimize_bounds(&license); + println!("{:?}", optimized.lines_view); + println!("{:?}", optimized.lines_normalized); + assert_eq!((0, 3), optimized.lines_view); + + // add more to the string, try again (avoid int trunc screwups) + let sample_text = format!("{}\none more line", sample_text); + let sample = TextData::from(sample_text.as_str()); + let (optimized, _) = sample.optimize_bounds(&license); + println!("{:?}", optimized.lines_view); + println!("{:?}", optimized.lines_normalized); + assert_eq!((0, 3), optimized.lines_view); + + // add to the beginning too + let sample_text = format!("some content\nat\n\nthe beginning\n{}", sample_text); + let sample = TextData::from(sample_text.as_str()); + let (optimized, _) = sample.optimize_bounds(&license); + println!("{:?}", optimized.lines_view); + println!("{:?}", optimized.lines_normalized); 
+ // end bounds at 7 and 8 have the same score, since they're empty lines (not + // counted). askalono is not smart enough to trim this as close as it + // can. + assert!( + (4, 7) == optimized.lines_view || (4, 8) == optimized.lines_view, + "bounds are (4, 7) or (4, 8)" + ); + } + + // if a view is set on the text data, optimize_bounds must not find text + // outside of that range + #[test] + fn optimize_doesnt_grow_view() { + let sample_text = "0\n1\n2\naaa aaa\naaa\naaa\naaa\n7\n8"; + let license_text = "aaa aaa aaa aaa aaa"; + let sample = TextData::from(sample_text); + let license = TextData::from(license_text).without_text(); + + // sanity: the optimized bounds should be at (3, 7) + let (optimized, _) = sample.optimize_bounds(&license); + assert_eq!((3, 7), optimized.lines_view); + + // this should still work + let sample = sample.with_view(3, 7); + let (optimized, _) = sample.optimize_bounds(&license); + assert_eq!((3, 7), optimized.lines_view); + + // but if we shrink the view further, it shouldn't be outside that range + let sample = sample.with_view(4, 6); + let (optimized, _) = sample.optimize_bounds(&license); + assert_eq!((4, 6), optimized.lines_view); + + // restoring the view should still be OK too + let sample = sample.with_view(0, 9); + let (optimized, _) = sample.optimize_bounds(&license); + assert_eq!((3, 7), optimized.lines_view); + } + + // ensure we don't choke on small TextData matches + #[test] + fn match_small() { + let a = TextData::from("a b"); + let b = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg"); + + let x = a.match_score(&b); + let y = b.match_score(&a); + + assert_eq!(x, y); + } + + // don't choke on empty TextData either + #[test] + fn match_empty() { + let a = TextData::from(""); + let b = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg"); + + let x = a.match_score(&b); + let y = b.match_score(&a); + + assert_eq!(x, y); + } + + #[test] + fn view_and_white_out() { + let a = TextData::from("aaa\nbbb\nccc\nddd"); 
/// A multiset of word n-grams extracted from space-separated text.
///
/// `map` counts how often each gram occurred, `n` is the number of words per
/// gram and `size` is the total number of grams recorded (duplicates
/// included).
#[derive(Clone, Debug)]
pub struct NgramSet {
    pub(crate) map: HashMap<String, u32>,
    pub(crate) n: u8,
    pub(crate) size: usize,
}

impl NgramSet {
    /// Creates an empty set that will hold grams of `n` words.
    #[inline]
    pub fn new(n: u8) -> Self {
        Self {
            map: HashMap::new(),
            n,
            size: 0,
        }
    }

    /// Creates a set of `n`-word grams from `s`.
    #[inline]
    pub fn from_str(s: &str, n: u8) -> Self {
        let mut set = Self::new(n);
        set.analyze(s);
        set
    }

    /// Splits `s` on single spaces and records every run of `n` consecutive
    /// words as one gram (the words joined by a single space).
    ///
    /// Texts with fewer than `n` words add nothing, and neither does `n == 0`.
    pub fn analyze(&mut self, s: &str) {
        let n = usize::from(self.n);
        if n == 0 {
            // a zero-word gram can never be formed; bail out instead of
            // buffering every word of the text for nothing
            return;
        }

        // sliding window over the last `n` words
        let mut window: VecDeque<&str> = VecDeque::with_capacity(n);
        for word in s.split(' ') {
            window.push_back(word);
            if window.len() == n {
                // all words plus the `n - 1` separating spaces
                let capacity = window.iter().map(|w| w.len()).sum::<usize>() + n - 1;
                let mut gram = String::with_capacity(capacity);

                for (i, w) in window.iter().enumerate() {
                    if i > 0 {
                        gram.push(' ');
                    }

                    gram.push_str(w);
                }

                self.add_gram(gram);
                window.pop_front();
            }
        }
    }

    /// Records one occurrence of `gram`.
    #[inline]
    fn add_gram(&mut self, gram: String) {
        *self.map.entry(gram).or_insert(0) += 1;
        self.size += 1;
    }

    /// Returns how many times `gram` was recorded, `0` if never.
    #[inline]
    pub fn get(&self, gram: &str) -> u32 {
        self.map.get(gram).copied().unwrap_or(0)
    }

    /// Returns the total number of recorded grams, counting duplicates.
    #[inline]
    pub fn len(&self) -> usize {
        self.size
    }

    /// Returns `true` if no gram has been recorded.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.size == 0
    }

    /// Computes the Dice coefficient of the two multisets:
    /// `2 * shared grams / (len(self) + len(other))`.
    ///
    /// Returns `0.0` when the sets use a different `n` or when either is
    /// empty.
    pub fn dice(&self, other: &Self) -> f32 {
        // no sense comparing sets of different sizes
        if other.n != self.n {
            return 0f32;
        }

        // there's obviously no match if either are empty strings;
        // if we don't check here we could end up with NaN below
        // when both are empty
        if self.is_empty() || other.is_empty() {
            return 0f32;
        }

        // choose the smaller map to iterate
        let (x, y) = if self.len() < other.len() {
            (self, other)
        } else {
            (other, self)
        };

        // a gram present in both sets is shared as often as its lower count
        let matches: u32 = x
            .into_iter()
            .map(|(gram, count)| min(*count, y.get(gram)))
            .sum();

        (2.0 * matches as f32) / ((self.len() + other.len()) as f32)
    }
}

impl PartialEq for NgramSet {
    fn eq(&self, other: &Self) -> bool {
        self.n == other.n && self.size == other.size && self.map == other.map
    }
}

impl<'a> IntoIterator for &'a NgramSet {
    type Item = (&'a String, &'a u32);
    type IntoIter = Iter<'a, String, u32>;

    /// Iterates over `(gram, count)` pairs in arbitrary order.
    fn into_iter(self) -> Self::IntoIter {
        self.map.iter()
    }
}
+// SPDX-License-Identifier: Apache-2.0 + +use std::{borrow::Cow, collections::HashMap, sync::LazyLock}; + +use regex::{Regex, Replacer}; +use unicode_normalization::UnicodeNormalization; + +type PreprocFn = dyn Fn(Cow<'_, str>) -> Cow<'_, str>; + +trait CowRegex { + fn replace_all_cow<'a, R: Replacer>(&self, text: Cow<'a, str>, replace: R) -> Cow<'a, str>; +} + +impl CowRegex for Regex { + fn replace_all_cow<'a, R: Replacer>(&self, text: Cow<'a, str>, replace: R) -> Cow<'a, str> { + match text { + Cow::Borrowed(find) => self.replace_all(find, replace), + Cow::Owned(find) => Cow::Owned(self.replace_all(&find, replace).into_owned()), + } + } +} + +/// A list of preprocessors that normalize text without removing anything +/// substantial. These operate on one line at a time. +pub const PREPROC_NORMALIZE: [&PreprocFn; 6] = [ + &normalize_unicode, + &remove_junk, + &blackbox_urls, + &normalize_horizontal_whitespace, + &normalize_punctuation, + &trim, +]; + +/// A list of preprocessors that more aggressively normalize/mangle text +/// to make for friendlier matching. May remove statements and lines, and +/// more heavily normalize punctuation. 
+pub const PREPROC_AGGRESSIVE: [&PreprocFn; 8] = [ + &remove_common_tokens, + &normalize_vertical_whitespace, + &remove_punctuation, + &lowercaseify, + &remove_title_line, + &remove_copyright_statements, + &collapse_whitespace, + &trim, +]; + +pub fn apply_normalizers(text: &str) -> Vec { + let mut lines = Vec::new(); + for line in text.split('\n') { + let mut out = Cow::from(line); + for preproc in &PREPROC_NORMALIZE { + out = preproc(out); + } + lines.push(out.into()); + } + lines +} + +pub fn apply_aggressive(text: &str) -> String { + let mut out = text.into(); + for preproc in &PREPROC_AGGRESSIVE { + out = preproc(out); + } + out.into() +} + +// Line-by-line normalizers + +fn normalize_unicode(input: Cow<'_, str>) -> Cow<'_, str> { + input.nfc().collect::().into() +} + +fn remove_junk(input: Cow<'_, str>) -> Cow<'_, str> { + static RX: LazyLock = LazyLock::new(|| { + Regex::new(r"[^\w\s\pP]+").unwrap() + }); + + RX.replace_all_cow(input, "") +} + +fn blackbox_urls(input: Cow<'_, str>) -> Cow<'_, str> { + static RX: LazyLock = LazyLock::new(|| { + Regex::new(r"https?://\S+").unwrap() + }); + + RX.replace_all_cow(input, "http://blackboxed/url") +} + +fn normalize_horizontal_whitespace(input: Cow<'_, str>) -> Cow<'_, str> { + static RX: LazyLock = LazyLock::new(|| { + Regex::new(r"(?x)[ \t\p{Zs} \\ / \| \x2044 ]+").unwrap() + }); + + RX.replace_all_cow(input, " ") +} + +fn normalize_punctuation(input: Cow<'_, str>) -> Cow<'_, str> { + struct Rx { + quotes: Regex, + dash: Regex, + open: Regex, + close: Regex, + under: Regex, + copy: Regex, + } + static RX: LazyLock = LazyLock::new(|| { + Rx { + quotes: Regex::new(r#"["'\p{Pi}\p{Pf}]+"#).unwrap(), + dash: Regex::new(r"\p{Pd}+").unwrap(), + open:Regex::new(r"\p{Ps}+").unwrap(), + close: Regex::new(r"\p{Pe}+").unwrap(), + under: Regex::new(r"\p{Pc}+").unwrap(), + copy:Regex::new(r"[©Ⓒⓒ]").unwrap(), + } + }); + + let mut out = input; + let rx = &RX; + out = rx.quotes.replace_all_cow(out, "'"); + out = 
/// Cuts `s` down to a prefix of at most `idx` bytes.
///
/// If `idx` doesn't lie on a char boundary, the longest prefix that is not
/// longer than `idx` bytes is returned. If `idx` is at or past the end of the
/// string, the whole string is returned.
fn trim_byte_adjusted(s: &str, idx: usize) -> &str {
    if idx >= s.len() {
        return s;
    }

    // Walk back to the start of the code point that `idx` falls into. A UTF-8
    // code point has at most 3 continuation bytes, so this loops only a few
    // times, and index 0 is always a boundary.
    let mut end = idx;
    while !s.is_char_boundary(end) {
        end -= 1;
    }

    &s[..end]
}

/// Returns the common prefix of the two lines, trimmed of surrounding
/// whitespace.
///
/// The comparison is done on bytes and the result is then shortened to a
/// valid char boundary of `f_line`.
fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str {
    // find the length of common prefix in byte representations of strings
    let prefix_len = f_line
        .as_bytes()
        .iter()
        .zip(s_line.as_bytes())
        .take_while(|&(&f, &s)| f == s)
        .count();

    trim_byte_adjusted(f_line, prefix_len).trim()
}

/// Removes a prefix that is shared by most lines of the text, such as a
/// comment marker in front of every line of a license header.
///
/// The prefix has to be longer than 3 bytes and has to account for at least
/// 80% of the lines, otherwise the input is returned unchanged. When it is
/// removed, every line is also trimmed.
fn remove_common_tokens(input: Cow<'_, str>) -> Cow<'_, str> {
    let mut prefix_counts = HashMap::<&str, u32>::new();

    // pass 1: iterate through consecutive line pairs to record common prefixes
    for (a, b) in input.split('\n').zip(input.split('\n').skip(1)) {
        let common = lcs_substr(a, b);

        // why start at 1, then immediately add 1?
        // lcs_substr compares two lines!
        // this doesn't need to be exact, just consistent.
        if common.len() > 3 {
            *prefix_counts.entry(common).or_insert(1) += 1;
        }
    }

    // Look at the most common observed prefix. Ties are broken in favour of
    // the shorter (then lexicographically smaller) prefix so the result does
    // not depend on the iteration order of the hash map.
    let most_common = match prefix_counts.iter().max_by(|a, b| {
        a.1.cmp(b.1)
            .then_with(|| b.0.len().cmp(&a.0.len()))
            .then_with(|| b.0.cmp(a.0))
    }) {
        Some((prefix, _count)) => *prefix,
        None => return input,
    };

    // reconcile the count with other longer prefixes that may be stored
    let common_count = prefix_counts
        .iter()
        .filter(|(prefix, _)| prefix.starts_with(most_common))
        .map(|(_, count)| *count)
        .sum::<u32>();

    let line_count = input.split('\n').count();

    // the common string must be at least 80% of the text
    let prefix_threshold = (0.8f32 * line_count as f32) as u32;
    if common_count < prefix_threshold {
        return input;
    }

    // pass 2: remove that substring
    let mut rem = String::with_capacity(input.len());
    for line in input.split('\n') {
        rem.push_str(line.strip_prefix(most_common).unwrap_or(line).trim());
        rem.push('\n');
    }

    // pop trailing newline
    rem.pop();
    rem.into()
}
str>) -> Cow<'_, str> { + static RX: LazyLock = LazyLock::new(|| { + Regex::new(r"^.*license( version \S+)?( copyright.*)?\n\n").unwrap() + }); + + RX.replace_all_cow(input, "") +} + +fn remove_copyright_statements(input: Cow<'_, str>) -> Cow<'_, str> { + static RX: LazyLock = LazyLock::new(|| { + Regex::new( + r"(?mx) + ( + # either a new paragraph, or the beginning of the text + empty lines + (\n\n|\A\n*) + # any number of lines starting with 'copyright' followed by a new paragraph + (^\x20*copyright.*?$)+ + \n\n + ) + | + ( + # or the very first line if it has 'copyright' in it + \A.*copyright.*$ + ) + | + ( + # or any lines that really look like a copyright statement + ^copyright (\s+(c|\d+))+ .*?$ + ) + " + ) + .unwrap() + }); + + RX.replace_all_cow(input, "\n\n") +} + +fn collapse_whitespace(input: Cow<'_, str>) -> Cow<'_, str> { + static RX: LazyLock = LazyLock::new(|| { + Regex::new(r"\s+").unwrap() + }); + RX.replace_all_cow(input, " ") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn trim_byte_adjusted_respects_multibyte_characters() { + let input = "RustКраб橙蟹🦀"; + let expected = [ + "", + "R", + "Ru", + "Rus", + "Rust", + "Rust", + "RustК", + "RustК", + "RustКр", + "RustКр", + "RustКра", + "RustКра", + "RustКраб", + "RustКраб", + "RustКраб", + "RustКраб橙", + "RustКраб橙", + "RustКраб橙", + "RustКраб橙蟹", + "RustКраб橙蟹", + "RustКраб橙蟹", + "RustКраб橙蟹", + "RustКраб橙蟹🦀", + ]; + + for (i, &outcome) in expected.iter().enumerate() { + assert_eq!(outcome, trim_byte_adjusted(input, i)) + } + } + + #[test] + fn greatest_substring_removal() { + // the funky string syntax \n\ is to add a newline but skip the + // leading whitespace in the source code + let text = "%%Copyright: Copyright\n\ + %%Copyright: All rights reserved.\n\ + %%Copyright: Redistribution and use in source and binary forms, with or\n\ + %%Copyright: without modification, are permitted provided that the\n\ + %%Copyright: following conditions are met:\n\ + \n\ + abcd"; + + let 
new_text = remove_common_tokens(text.into()); + println!("{}", new_text); + + assert!( + !new_text.contains("%%Copyright"), + "new text shouldn't contain the common substring" + ); + } + + #[test] + fn greatest_substring_removal_keep_inner() { + let text = "this string should still have\n\ + this word -> this <- in it even though\n\ + this is still the most common word"; + let new_text = remove_common_tokens(text.into()); + println!("-- {}", new_text); + // the "this" at the start of the line can be discarded... + assert!(!new_text.contains("\nthis")); + // ...but the "this" in the middle of sentences shouldn't be + assert!(new_text.contains("this")); + + let text = "aaaa bbbb cccc dddd\n\ + eeee ffff aaaa gggg\n\ + hhhh iiii jjjj"; + let new_text = remove_common_tokens(text.into()); + println!("-- {}", new_text); + assert!(new_text.contains("aaaa")); // similar to above test + } + + #[test] + fn greatest_substring_removal_42() { + // https://github.com/jpeddicord/askalono/issues/42 + let text = "AAAAAA line 1\n\ + AAAAAA another line here\n\ + AAAAAA yet another line here\n\ + AAAAAA how long will this go on\n\ + AAAAAA another line here\n\ + AAAAAA more\n\ + AAAAAA one more\n\ + AAAAAA two more\n\ + AAAAAA three more\n\ + AAAAAA four more\n\ + AAAAAA five more\n\ + AAAAAA six more\n\ + \n\ + preserve\n\ + keep"; + let new_text = remove_common_tokens(text.into()); + println!("{}", new_text); + + assert!(new_text.contains("preserve")); + assert!(new_text.contains("keep")); + assert!(!new_text.contains("AAAAAA")); + } + + #[test] + fn normalize_no_line_mangle() { + let text = "some license + + copyright 2012 person + + \tlicense\r + text + + \t + + + + goes + here"; + + let text_lines = text.lines().count(); + + let normalized = apply_normalizers(text); + let normalized_lines = normalized.len(); + + assert_eq!( + text_lines, normalized_lines, + "normalizers shouldnt change line counts" + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index 7403e6a..d8145b0 100644 
--- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,11 @@ pub mod identifiers; /// Contains types for lexing an SPDX license expression pub mod lexer; mod licensee; + +/// Allows analysis of text to determine if it resembles a license +#[cfg(feature = "detection")] +pub mod detection; + /// Auto-generated full canonical text of each license #[cfg(feature = "text")] pub mod text; diff --git a/update/Cargo.toml b/update/Cargo.toml index f5a4c46..ecf9b69 100644 --- a/update/Cargo.toml +++ b/update/Cargo.toml @@ -8,3 +8,4 @@ publish = false [dependencies] anyhow = "1.0" serde_json = "1.0.48" +spdx = { path = "..", features = ["detection-cache"] } diff --git a/update/src/main.rs b/update/src/main.rs index 0ecb246..ccdd43c 100644 --- a/update/src/main.rs +++ b/update/src/main.rs @@ -314,6 +314,75 @@ use crate::{{Exception, License, flags::*}}; write_license_texts(texts, v.into_iter().map(|(name, _, _)| name)) } +fn write_cache() -> anyhow::Result<()> { + let json: Map = serde_json::from_str( + &std::fs::read_to_string(format!("{ROOT}/json/licenses.json")) + .context("unable to open licenses.json")?, + ) + .context("failed to deserialize licenses.json")?; + + let licenses = get(&json, "licenses")?; + let licenses = if let Value::Array(v) = licenses { + v + } else { + bail!("Malformed JSON: {licenses:?}") + }; + + use spdx::detection as sd; + + let mut texts = std::collections::BTreeMap::::new(); + + for lic in licenses.iter() { + let lic = if let Value::Object(ref m) = *lic { + m + } else { + bail!("Malformed JSON: {lic:?}") + }; + + let lic_id = get(lic, "licenseId")?.as_str().context("licenseId was not a string")?; + + let details: Map = serde_json::from_str(&std::fs::read_to_string(format!("{ROOT}/json/details/{lic_id}.json")).with_context(|| format!("failed to read license details for {lic_id}"))?).with_context(|| format!("failed to deserialize details for {lic_id}"))?; + + let text = get(&details, "licenseText")?.as_str().context("licenseText was not a string")?; + + let 
content = sd::TextData::new(text); + + let mut already_existed = false; + for (name, v) in &mut texts { + if !v.original.ngram_matches(&content) { + continue; + } + + v.aliases.push(lic_id.to_owned()); + println!("{lic_id} already stored; added as an alias for {name}"); + already_existed = true; + } + + if already_existed { + continue; + } + + let license = texts + .entry(lic_id.to_owned()) + .or_insert_with(|| sd::LicenseEntry::new(content)); + + if let Some(header_text) = details.get("standardLicenseHeader").and_then(|h| h.as_str()) { + license.headers.push(sd::TextData::new(header_text)); + } + } + + let mut s = sd::Store::new(); + for (key, entry) in texts { + s.insert_entry(key, entry); + } + + let mut f = std::fs::File::create("src/detection/cache.bin.zstd")?; + s.to_cache(&mut f).context("failed to store cache")?; + f.flush().context("failed to flush cache to disk")?; + + Ok(()) +} + fn real_main() -> Result<()> { let mut upstream_tag = None; let mut debug = false; @@ -377,6 +446,8 @@ fn real_main() -> Result<()> { .success() ); + let t = std::thread::spawn(write_cache); + { let mut identifiers = io::BufWriter::new(std::fs::File::create("src/identifiers.rs")?); @@ -448,6 +519,7 @@ fn real_main() -> Result<()> { .write(&readme.as_bytes()[end_index..]) .context("failed to write suffix")?; + t.join().unwrap()?; Ok(()) } From ea23de075040aab961540d39993fa9029ce07f2a Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 12:05:20 +0100 Subject: [PATCH 02/14] Add scanning --- src/detection.rs | 17 +- .../{inline-cache.rs => inline_cache.rs} | 0 src/detection/license.rs | 58 +- src/detection/preproc.rs | 54 +- src/detection/scan.rs | 521 ++++++++++++++++++ 5 files changed, 553 insertions(+), 97 deletions(-) rename src/detection/{inline-cache.rs => inline_cache.rs} (100%) create mode 100644 src/detection/scan.rs diff --git a/src/detection.rs b/src/detection.rs index fe34e9a..e301e13 100644 --- a/src/detection.rs +++ b/src/detection.rs @@ -2,7 +2,7 @@ // 
SPDX-License-Identifier: Apache-2.0 //! This module is basically an inling of [askalono](https://github.com/jpeddicord/askalono) -//! +//! //! Askalono is not really maintained and also depends on other unmaintained //! crates, since this crate is used by both cargo-deny and cargo-about in //! conjunction with askalono for checking licenses, I'm pulling it directly into @@ -12,13 +12,14 @@ use std::collections::HashMap; #[cfg(feature = "detection-cache")] mod cache; +mod detect; #[cfg(feature = "detection-inline-cache")] mod inline_cache; -mod detect; mod license; pub use license::{LicenseType, TextData}; mod ngram; mod preproc; +pub mod scan; pub struct LicenseEntry { pub original: TextData, @@ -119,7 +120,7 @@ impl Store { .licenses .get_mut(name) .ok_or(StoreError::UnknownLicense)?; - + match variant { LicenseType::Alternate => { entry.alternates.push(data); @@ -138,9 +139,7 @@ impl Store { /// Get the list of aliases for a given license. #[inline] pub fn aliases(&self, name: &str) -> Option<&Vec> { - self - .licenses - .get(name).map(|le| &le.aliases) + self.licenses.get(name).map(|le| &le.aliases) } /// Set the list of aliases for a given license. 
@@ -167,9 +166,11 @@ impl std::fmt::Display for StoreError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::UnknownLicense => f.write_str("specified license did not exist in the store"), - Self::OriginalInvalidForVariant => f.write_str("attempted to add an original license text as a variant"), + Self::OriginalInvalidForVariant => { + f.write_str("attempted to add an original license text as a variant") + } } } } -impl std::error::Error for StoreError {} \ No newline at end of file +impl std::error::Error for StoreError {} diff --git a/src/detection/inline-cache.rs b/src/detection/inline_cache.rs similarity index 100% rename from src/detection/inline-cache.rs rename to src/detection/inline_cache.rs diff --git a/src/detection/license.rs b/src/detection/license.rs index 4a1ee89..f3a0ca9 100644 --- a/src/detection/license.rs +++ b/src/detection/license.rs @@ -21,53 +21,6 @@ pub enum LicenseType { Alternate, } -// impl LicenseType { -// #[inline] -// fn do_unpack(s: &str) -> Result { -// let lt = match s { -// "original" => Self::Original, -// "header" => Self::Header, -// "alternate" => Self::Alternate, -// _ => return Err(msgpacker::Error::InvalidEnumVariant), -// }; -// Ok(lt) -// } -// } - -// impl msgpacker::Packable for LicenseType { -// fn pack(&self, buf: &mut T) -> usize -// where -// T: Extend, -// { -// let s = match self { -// Self::Original => "original", -// Self::Header => "header", -// Self::Alternate => "alternate", -// }; - -// s.pack(buf) -// } -// } - -// impl msgpacker::Unpackable for LicenseType { -// type Error = msgpacker::Error; - -// fn unpack(buf: &[u8]) -> Result<(usize, Self), Self::Error> { -// let (read, s) = String::unpack(buf)?; -// let this = Self::do_unpack(&s)?; -// Ok((read, this)) -// } - -// fn unpack_iter(bytes: I) -> Result<(usize, Self), Self::Error> -// where -// I: IntoIterator, -// { -// let (read, s) = String::unpack_iter(bytes)?; -// let this = Self::do_unpack(&s)?; -// Ok((read, 
this)) -// } -// } - impl fmt::Display for LicenseType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( @@ -237,11 +190,6 @@ impl TextData { &self.lines_normalized[self.lines_view.0..self.lines_view.1] } - #[doc(hidden)] - pub fn text_processed(&self) -> &str { - &self.text_processed - } - /// Compare this `TextData` with another, returning a similarity score. /// /// This is what's used during analysis to rank licenses. @@ -431,13 +379,13 @@ mod tests { #[test] fn view_and_white_out() { let a = TextData::from("aaa\nbbb\nccc\nddd"); - assert_eq!(Some("aaa bbb ccc ddd"), a.text_processed()); + assert_eq!("aaa bbb ccc ddd", a.text_processed); let b = a.with_view(1, 3); assert_eq!(2, b.lines().len()); - assert_eq!(Some("bbb ccc"), b.text_processed()); + assert_eq!("bbb ccc", b.text_processed); let c = b.white_out(); - assert_eq!(Some("aaa ddd"), c.text_processed()); + assert_eq!("aaa ddd", c.text_processed); } } diff --git a/src/detection/preproc.rs b/src/detection/preproc.rs index b0819f3..1f5d27d 100644 --- a/src/detection/preproc.rs +++ b/src/detection/preproc.rs @@ -73,25 +73,20 @@ fn normalize_unicode(input: Cow<'_, str>) -> Cow<'_, str> { } fn remove_junk(input: Cow<'_, str>) -> Cow<'_, str> { - static RX: LazyLock = LazyLock::new(|| { - Regex::new(r"[^\w\s\pP]+").unwrap() - }); + static RX: LazyLock = LazyLock::new(|| Regex::new(r"[^\w\s\pP]+").unwrap()); RX.replace_all_cow(input, "") } fn blackbox_urls(input: Cow<'_, str>) -> Cow<'_, str> { - static RX: LazyLock = LazyLock::new(|| { - Regex::new(r"https?://\S+").unwrap() - }); + static RX: LazyLock = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap()); RX.replace_all_cow(input, "http://blackboxed/url") } fn normalize_horizontal_whitespace(input: Cow<'_, str>) -> Cow<'_, str> { - static RX: LazyLock = LazyLock::new(|| { - Regex::new(r"(?x)[ \t\p{Zs} \\ / \| \x2044 ]+").unwrap() - }); + static RX: LazyLock = + LazyLock::new(|| Regex::new(r"(?x)[ \t\p{Zs} \\ / \| \x2044 ]+").unwrap()); 
RX.replace_all_cow(input, " ") } @@ -105,15 +100,13 @@ fn normalize_punctuation(input: Cow<'_, str>) -> Cow<'_, str> { under: Regex, copy: Regex, } - static RX: LazyLock = LazyLock::new(|| { - Rx { - quotes: Regex::new(r#"["'\p{Pi}\p{Pf}]+"#).unwrap(), - dash: Regex::new(r"\p{Pd}+").unwrap(), - open:Regex::new(r"\p{Ps}+").unwrap(), - close: Regex::new(r"\p{Pe}+").unwrap(), - under: Regex::new(r"\p{Pc}+").unwrap(), - copy:Regex::new(r"[©Ⓒⓒ]").unwrap(), - } + static RX: LazyLock = LazyLock::new(|| Rx { + quotes: Regex::new(r#"["'\p{Pi}\p{Pf}]+"#).unwrap(), + dash: Regex::new(r"\p{Pd}+").unwrap(), + open: Regex::new(r"\p{Ps}+").unwrap(), + close: Regex::new(r"\p{Pe}+").unwrap(), + under: Regex::new(r"\p{Pc}+").unwrap(), + copy: Regex::new(r"[©Ⓒⓒ]").unwrap(), }); let mut out = input; @@ -237,11 +230,9 @@ fn normalize_vertical_whitespace(input: Cow<'_, str>) -> Cow<'_, str> { misc: Regex, num: Regex, } - static RX: LazyLock = LazyLock::new(|| { - Rx { - misc: Regex::new(r"[\r\n\v\f]").unwrap(), - num: Regex::new(r"\n{3,}").unwrap(), - } + static RX: LazyLock = LazyLock::new(|| Rx { + misc: Regex::new(r"[\r\n\v\f]").unwrap(), + num: Regex::new(r"\n{3,}").unwrap(), }); let mut out = input; @@ -251,9 +242,7 @@ fn normalize_vertical_whitespace(input: Cow<'_, str>) -> Cow<'_, str> { } fn remove_punctuation(input: Cow<'_, str>) -> Cow<'_, str> { - static RX: LazyLock = LazyLock::new(|| { - Regex::new(r"[^\w\s]+").unwrap() - }); + static RX: LazyLock = LazyLock::new(|| Regex::new(r"[^\w\s]+").unwrap()); RX.replace_all_cow(input, "") } @@ -263,9 +252,8 @@ fn lowercaseify(input: Cow<'_, str>) -> Cow<'_, str> { } fn remove_title_line(input: Cow<'_, str>) -> Cow<'_, str> { - static RX: LazyLock = LazyLock::new(|| { - Regex::new(r"^.*license( version \S+)?( copyright.*)?\n\n").unwrap() - }); + static RX: LazyLock = + LazyLock::new(|| Regex::new(r"^.*license( version \S+)?( copyright.*)?\n\n").unwrap()); RX.replace_all_cow(input, "") } @@ -291,7 +279,7 @@ fn 
remove_copyright_statements(input: Cow<'_, str>) -> Cow<'_, str> { # or any lines that really look like a copyright statement ^copyright (\s+(c|\d+))+ .*?$ ) - " + ", ) .unwrap() }); @@ -300,9 +288,7 @@ fn remove_copyright_statements(input: Cow<'_, str>) -> Cow<'_, str> { } fn collapse_whitespace(input: Cow<'_, str>) -> Cow<'_, str> { - static RX: LazyLock = LazyLock::new(|| { - Regex::new(r"\s+").unwrap() - }); + static RX: LazyLock = LazyLock::new(|| Regex::new(r"\s+").unwrap()); RX.replace_all_cow(input, " ") } @@ -340,7 +326,7 @@ mod tests { ]; for (i, &outcome) in expected.iter().enumerate() { - assert_eq!(outcome, trim_byte_adjusted(input, i)) + assert_eq!(outcome, trim_byte_adjusted(input, i)); } } diff --git a/src/detection/scan.rs b/src/detection/scan.rs new file mode 100644 index 0000000..22ae040 --- /dev/null +++ b/src/detection/scan.rs @@ -0,0 +1,521 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::{borrow::Cow, fmt}; + +use crate::detection::{ + Store, + detect::Match, + license::{LicenseType, TextData}, +}; + +/// A struct describing a license that was identified, as well as its type. +#[derive(Copy, Clone)] +pub struct IdentifiedLicense<'a> { + /// The identifier of the license. + pub name: &'a str, + /// The type of the license that was matched. + pub kind: LicenseType, + /// A reference to the license data inside the store. + pub data: &'a TextData, +} + +impl<'a> fmt::Debug for IdentifiedLicense<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("IdentifiedLicense") + .field("name", &self.name) + .field("kind", &self.kind) + .finish() + } +} + +/// Information about scanned content. +/// +/// Produced by `ScanStrategy.scan`. +#[derive(Debug)] +pub struct ScanResult<'a> { + /// The confidence of the match from 0.0 to 1.0. 
+ pub score: f32, + /// The identified license of the overall text, or None if nothing met the + /// confidence threshold. + pub license: Option>, + /// Any licenses discovered inside the text, if `optimize` was enabled. + pub containing: Vec>, +} + +/// A struct describing a single license identified within a larger text. +#[derive(Debug, Copy, Clone)] +pub struct ContainedResult<'a> { + /// The confidence of the match within the line range from 0.0 to 1.0. + pub score: f32, + /// The license identified in this portion of the text. + pub license: IdentifiedLicense<'a>, + /// A 0-indexed (inclusive, exclusive) range of line numbers identifying + /// where in the overall text a license was identified. + /// + /// See `TextData.lines_view()` for more information. + pub line_range: (usize, usize), +} + +/// A `ScanStrategy` can be used as a high-level wrapped over a `Store`'s +/// analysis logic. +/// +/// A strategy configured here can be run repeatedly to scan a document for +/// multiple licenses, or to automatically optimize to locate texts within a +/// larger text. +/// +/// # Examples +/// +/// ```rust,should_panic +/// # use std::error::Error; +/// use askalono::{Scanner, Store}; +/// +/// # fn main() -> Result<(), Box> { +/// let store = Store::new(); +/// // [...] +/// let strategy = Scanner::new(&store) +/// .confidence_threshold(0.9) +/// .optimize(true); +/// let results = strategy.scan(&"my text to scan".into())?; +/// # Ok(()) +/// # } +/// ``` +pub struct Scanner<'a> { + store: &'a Store, + mode: ScanMode, + confidence_threshold: f32, + shallow_limit: f32, + optimize: bool, + max_passes: u16, +} + +/// Available scanning strategy modes. +pub enum ScanMode { + /// A general-purpose strategy that iteratively locates the + /// highest license match in a file, then the next, and so on until not + /// finding any more strong matches. 
+ Elimination, + /// A strategy intended for use with attribution documents, or + /// text files containing multiple licenses (and not much else). + /// + /// It's more accurate than `Elimination`, but significantly slower. + TopDown { + /// A smaller step size will be more accurate at a significant cost of + /// speed. + /// + /// Defaults to 5. + step_size: usize, + }, +} + +impl ScanMode { + /// Creates a `TopDown` strategy with the default step size + #[inline] + pub fn top_down() -> Self { + Self::TopDown { step_size: 5 } + } +} + +impl<'a> Scanner<'a> { + /// Construct a new scanning strategy tied to the given `Store`. + /// + /// By default, the strategy has conservative defaults and won't perform + /// any deeper investigation into the contents of files. + #[inline] + pub fn new(store: &'a Store) -> Self { + Self::with_scan_mode(store, ScanMode::Elimination) + } + + /// Constructs a scanning strategy with the specified mode + #[inline] + pub fn with_scan_mode(store: &'a Store, mode: ScanMode) -> Self { + Self { + store, + mode, + confidence_threshold: 0.9, + shallow_limit: 0.99, + optimize: false, + max_passes: 10, + } + } +} + +impl Scanner<'_> { + /// Set the confidence threshold for this strategy. + /// + /// The overall license match must meet this number in order to be + /// reported. Additionally, if contained licenses are reported in the scan + /// (when `optimize` is enabled), they'll also need to meet this bar. + /// + /// Set this to 1.0 for only exact matches, and 0.0 to report even the + /// weakest match. + pub fn confidence_threshold(mut self, confidence_threshold: f32) -> Self { + self.confidence_threshold = confidence_threshold; + self + } + + /// Set a fast-exit parameter that allows the strategy to skip the rest of + /// a scan for strong matches. + /// + /// This should be set higher than the confidence threshold; ideally close + /// to 1.0. 
If the overall match score is above this limit, the scanner + /// will return early and not bother performing deeper checks. + /// + /// This is really only useful in conjunction with `optimize`. A value of + /// 0.0 will fast-return on any match meeting the confidence threshold, + /// while a value of 1.0 will only stop on a perfect match. + pub fn shallow_limit(mut self, shallow_limit: f32) -> Self { + self.shallow_limit = shallow_limit; + self + } + + /// Indicate whether a deeper scan should be performed. + /// + /// This is ignored if the shallow limit is met. It's not enabled by + /// default, however, so if you want deeper results you should set + /// `shallow_limit` fairly high and enable this. + pub fn optimize(mut self, optimize: bool) -> Self { + self.optimize = optimize; + self + } + + /// The maximum number of identifications to perform before exiting a scan + /// of a single text. + /// + /// This is largely to prevent misconfigurations and infinite loop + /// scenarios, but if you have a document with a large number of licenses + /// then you may want to tune this to a value above the number of licenses + /// you expect to be identified. + pub fn max_passes(mut self, max_passes: u16) -> Self { + self.max_passes = max_passes; + self + } + + /// Scan the given text content using this strategy's configured + /// preferences. + /// + /// Returns a `ScanResult` containing all discovered information. + #[inline] + pub fn scan(&'_ self, text: &TextData) -> ScanResult<'_> { + match self.mode { + ScanMode::Elimination => self.scan_elimination(text), + ScanMode::TopDown { step_size } => self.scan_topdown(text, step_size), + } + } + + fn scan_elimination(&'_ self, text: &TextData) -> ScanResult<'_> { + let mut analysis = self.store.analyze(text); + let score = analysis.score; + let mut license = None; + let mut containing = Vec::new(); + + // meets confidence threshold? 
record that + if analysis.score > self.confidence_threshold { + license = Some(IdentifiedLicense { + name: analysis.name, + kind: analysis.license_type, + data: analysis.data, + }); + + // above the shallow limit -> exit + if analysis.score > self.shallow_limit { + return ScanResult { + score, + license, + containing, + }; + } + } + + if !self.optimize { + return ScanResult { + score, + license, + containing, + }; + } + + // repeatedly try to dig deeper + // this loop effectively iterates once for each license it finds + let mut current_text: Cow<'_, TextData> = Cow::Borrowed(text); + for _n in 0..self.max_passes { + let (optimized, optimized_score) = current_text.optimize_bounds(analysis.data); + + // stop if we didn't find anything acceptable + if optimized_score < self.confidence_threshold { + break; + } + + // otherwise, save it + containing.push(ContainedResult { + score: optimized_score, + license: IdentifiedLicense { + name: analysis.name, + kind: analysis.license_type, + data: analysis.data, + }, + line_range: optimized.lines_view(), + }); + + // and white-out + reanalyze for next iteration + current_text = Cow::Owned(optimized.white_out()); + analysis = self.store.analyze(¤t_text); + } + + ScanResult { + score, + license, + containing, + } + } + + fn scan_topdown(&'_ self, text: &TextData, step_size: usize) -> ScanResult<'_> { + let (_, text_end) = text.lines_view(); + let mut containing = Vec::new(); + + // find licenses working down thru the text's lines + let mut current_start = 0usize; + while current_start < text_end { + let result = self.topdown_find_contained_license(text, current_start, step_size); + + let contained = match result { + Some(c) => c, + None => break, + }; + + current_start = contained.line_range.1 + 1; + containing.push(contained); + } + + ScanResult { + score: 0.0, + license: None, + containing, + } + } + + fn topdown_find_contained_license( + &'_ self, + text: &TextData, + starting_at: usize, + step_size: usize, + ) -> Option> { + 
let (_, text_end) = text.lines_view(); + let mut found: (usize, usize, Option>) = (0, 0, None); + + // speed: only start tracking once conf is met, and bail out after + let mut hit_threshold = false; + + // move the start of window... + 'start: for start in (starting_at..text_end).step_by(step_size) { + // ...and also the end of window to find high scores. + for end in (start..=text_end).step_by(step_size) { + let view = text.with_view(start, end); + let analysis = self.store.analyze(&view); + + // just getting a feel for the data at this point, not yet + // optimizing the view. + + // entering threshold: save the starting location + if !hit_threshold && analysis.score >= self.confidence_threshold { + hit_threshold = true; + } + + if hit_threshold { + if analysis.score < self.confidence_threshold { + // exiting threshold + break 'start; + } else { + // maintaining threshold (also true for entering) + found = (start, end, Some(analysis)); + } + } + } + } + + // at this point we have a *rough* bounds for a match. 
+ // now we can optimize to find the best one + let matched = found.2?; + let check = matched.data; + let view = text.with_view(found.0, found.1); + let (optimized, optimized_score) = view.optimize_bounds(check); + + if optimized_score < self.confidence_threshold { + return None; + } + + Some(ContainedResult { + score: optimized_score, + license: IdentifiedLicense { + name: matched.name, + kind: matched.license_type, + data: matched.data, + }, + line_range: optimized.lines_view(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn can_construct() { + let store = Store::new(); + Scanner::new(&store); + Scanner::new(&store).confidence_threshold(0.5); + Scanner::new(&store) + .shallow_limit(0.99) + .optimize(true) + .max_passes(100); + } + + #[test] + fn shallow_scan() { + let store = create_dummy_store(); + let test_data = TextData::new("lorem ipsum\naaaaa bbbbb\nccccc\nhello"); + + // the above text should have a result with a confidence minimum of 0.5 + let strategy = Scanner::new(&store) + .confidence_threshold(0.5) + .shallow_limit(0.0); + let result = strategy.scan(&test_data); + assert!( + result.score > 0.5, + "score must meet threshold; was {}", + result.score + ); + assert_eq!( + result.license.expect("result has a license").name, + "license-1" + ); + + // but it won't pass with a threshold of 0.8 + let strategy = Scanner::new(&store) + .confidence_threshold(0.8) + .shallow_limit(0.0); + let result = strategy.scan(&test_data); + assert!(result.license.is_none(), "result license is None"); + } + + #[test] + fn single_optimize() { + let store = create_dummy_store(); + // this TextData matches license-2 with an overall score of ~0.46 and optimized + // score of ~0.57 + let test_data = TextData::new( + "lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout", + ); + + // check that we can spot the gibberish license in the sea of other gibberish + let 
strategy = Scanner::new(&store) + .confidence_threshold(0.5) + .optimize(true) + .shallow_limit(1.0); + let result = strategy.scan(&test_data); + assert!(result.license.is_none(), "result license is None"); + assert_eq!(result.containing.len(), 1); + let contained = &result.containing[0]; + assert_eq!(contained.license.name, "license-2"); + assert!( + contained.score > 0.5, + "contained score is greater than threshold" + ); + } + + #[test] + fn find_multiple_licenses_elimination() { + let store = create_dummy_store(); + // this TextData matches license-2 with an overall score of ~0.46 and optimized + // score of ~0.57 + let test_data = TextData::new( + "lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc", + ); + + // check that we can spot the gibberish license in the sea of other gibberish + let strategy = Scanner::new(&store) + .confidence_threshold(0.5) + .optimize(true) + .shallow_limit(1.0); + let result = strategy.scan(&test_data); + assert!(result.license.is_none(), "result license is None"); + assert_eq!(2, result.containing.len()); + + // inspect the array and ensure we got both licenses + let mut found1 = 0; + let mut found2 = 0; + for contained in &result.containing { + match contained.license.name { + "license-1" => { + assert!(contained.score > 0.5, "license-1 score meets threshold"); + found1 += 1; + } + "license-2" => { + assert!(contained.score > 0.5, "license-2 score meets threshold"); + found2 += 1; + } + _ => { + panic!("somehow got an unknown license name"); + } + } + } + + assert!( + found1 == 1 && found2 == 1, + "found both licenses exactly once" + ); + } + + #[test] + fn find_multiple_licenses_topdown() { + let store = create_dummy_store(); + // this TextData matches license-2 with an overall score of ~0.46 and optimized + // score of ~0.57 + let test_data = TextData::new( + "lorem\nipsum abc def ghi jkl\n1234 5678 
1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc", + ); + + // check that we can spot the gibberish license in the sea of other gibberish + let strategy = Scanner::with_scan_mode(&store, ScanMode::TopDown { step_size: 1 }) + .confidence_threshold(0.5); + let result = strategy.scan(&test_data); + assert!(result.license.is_none(), "result license is None"); + println!("{:?}", result); + assert_eq!(2, result.containing.len()); + + // inspect the array and ensure we got both licenses + let mut found1 = 0; + let mut found2 = 0; + for contained in &result.containing { + match contained.license.name { + "license-1" => { + assert!(contained.score > 0.5, "license-1 score meets threshold"); + found1 += 1; + } + "license-2" => { + assert!(contained.score > 0.5, "license-2 score meets threshold"); + found2 += 1; + } + _ => { + panic!("somehow got an unknown license name"); + } + } + } + + assert!( + found1 == 1 && found2 == 1, + "found both licenses exactly once" + ); + } + + fn create_dummy_store() -> Store { + let mut store = Store::new(); + store.add_license("license-1".into(), "aaaaa\nbbbbb\nccccc".into()); + store.add_license( + "license-2".into(), + "1234 5678 1234\n0000\n1010101010\n\n8888 9999".into(), + ); + store + } +} From 599a7615e4dfa31efbbc5ece9225e8917ad8f7c4 Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 14:39:55 +0100 Subject: [PATCH 03/14] Add inline cache load test --- src/detection.rs | 6 ++++++ src/detection/cache.bin.zstd | 4 ++-- tests/detection.rs | 26 ++++++++++++++++++++++++++ update/src/main.rs | 13 +++++++++++++ 4 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 tests/detection.rs diff --git a/src/detection.rs b/src/detection.rs index e301e13..5ed5c18 100644 --- a/src/detection.rs +++ b/src/detection.rs @@ -104,6 +104,12 @@ impl Store { self.licenses.insert(name, entry); } + /// Gets an iterator over all of the licenses + #[inline] + pub 
fn iter(&self) -> std::collections::hash_map::Iter<'_, String, LicenseEntry> { + self.licenses.iter() + } + /// Add a variant (a header or alternate formatting) of a given license to /// the store. /// diff --git a/src/detection/cache.bin.zstd b/src/detection/cache.bin.zstd index 7701a05..76b2611 100644 --- a/src/detection/cache.bin.zstd +++ b/src/detection/cache.bin.zstd @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:614c37fc12984b7f6a279fda327d69baace94f6cddd9c9ea30d081e95521c3ed -size 2020167 +oid sha256:3443c72f3d1bbd769c78fc1bba82af691eb7e34c6e04adc61a86a2a03ae97805 +size 2018306 diff --git a/tests/detection.rs b/tests/detection.rs new file mode 100644 index 0000000..bd200f4 --- /dev/null +++ b/tests/detection.rs @@ -0,0 +1,26 @@ +#![cfg(feature = "detection")] + +#[cfg(feature = "detection-inline-cache")] +#[test] +fn reads_inline_cache() { + let store = spdx::detection::Store::load_inline().expect("failed to load cache"); + + let mut set = std::collections::BTreeSet::new(); + + for (k, v) in store.iter() { + set.insert(k.as_str()); + + for alias in &v.aliases { + set.insert(alias.as_str()); + } + } + + // We manually add the NOASSERTION "fake" license id since it's not part of + // SPDX, but might be in the future https://github.com/spdx/spdx-spec/issues/50 + // so that should be the only license that isn't present in the store + for lic in spdx::identifiers::LICENSES { + if lic.name != "NOASSERTION" { + assert!(set.contains(lic.name), "failed to find expected license {} in inline cache store", lic.name); + } + } +} \ No newline at end of file diff --git a/update/src/main.rs b/update/src/main.rs index ccdd43c..fe3271a 100644 --- a/update/src/main.rs +++ b/update/src/main.rs @@ -355,6 +355,13 @@ fn write_cache() -> anyhow::Result<()> { v.aliases.push(lic_id.to_owned()); println!("{lic_id} already stored; added as an alias for {name}"); + + if lic_id.starts_with("GFDL-") { + if let Some(id) = lic_id.strip_suffix("-invariants-only") { + 
v.aliases.push(format!("{id}-invariants")); + } + } + already_existed = true; } @@ -369,6 +376,12 @@ fn write_cache() -> anyhow::Result<()> { if let Some(header_text) = details.get("standardLicenseHeader").and_then(|h| h.as_str()) { license.headers.push(sd::TextData::new(header_text)); } + + if lic_id.starts_with("GFDL-") { + if let Some(id) = lic_id.strip_suffix("-invariants-only") { + license.aliases.push(format!("{id}-invariants")); + } + } } let mut s = sd::Store::new(); From cb4124afc66a7b914a2459e400bb987165f711ae Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 14:40:11 +0100 Subject: [PATCH 04/14] Cleanup --- src/detection/cache.rs | 87 +++++++++++++++++++++-------------- src/detection/inline_cache.rs | 2 +- src/detection/license.rs | 7 +-- src/detection/ngram.rs | 6 ++- src/detection/scan.rs | 4 +- tests/detection.rs | 8 +++- 6 files changed, 67 insertions(+), 47 deletions(-) diff --git a/src/detection/cache.rs b/src/detection/cache.rs index 89d3e3a..fb4ac38 100644 --- a/src/detection/cache.rs +++ b/src/detection/cache.rs @@ -1,7 +1,7 @@ // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use crate::detection::{Store, LicenseEntry, license::TextData, ngram::NgramSet}; +use crate::detection::{LicenseEntry, Store, license::TextData, ngram::NgramSet}; use std::io; const CACHE_VERSION: &str = "spdx-crate-01"; @@ -58,9 +58,10 @@ impl Store { /// /// This method is highly useful for quickly loading a cache, as creating /// one from text data is rather slow. This method can typically load - /// the full SPDX set from disk in 200-300 ms. The cache will be - /// sanity-checked to ensure it was generated with a similar version of - /// askalono. + /// the full SPDX set from disk in < 100ms. 
+ /// + /// The cache contains a simple version header that ensure that the cache + /// is loadable pub fn from_cache(mut readable: R) -> Result where R: io::Read + Sized, @@ -71,7 +72,7 @@ impl Store { if header != CACHE_VERSION.as_bytes() { return Err(CacheError::InvalidVersion { actual: String::from_utf8_lossy(&header).into_owned(), - expected: CACHE_VERSION + expected: CACHE_VERSION, }); } @@ -138,7 +139,8 @@ impl From for BinErr { #[inline] fn write_u16(u: usize, w: &mut W) -> Result<(), BinErr> -where W: io::Write + Sized +where + W: io::Write + Sized, { let u: u16 = u.try_into().map_err(|_e| ProtoError::TooLong(u))?; w.write_all(&u.to_le_bytes()).map_err(BinErr::Io) @@ -146,7 +148,8 @@ where W: io::Write + Sized #[inline] fn read_u16(r: &mut R) -> Result -where R: io::Read + Sized +where + R: io::Read + Sized, { let mut u = [0u8; 2]; r.read_exact(&mut u)?; @@ -155,14 +158,16 @@ where R: io::Read + Sized #[inline] fn write_u64(u: usize, w: &mut W) -> Result<(), BinErr> -where W: io::Write + Sized +where + W: io::Write + Sized, { w.write_all(&(u as u64).to_le_bytes()).map_err(BinErr::Io) } #[inline] fn read_u64(r: &mut R) -> Result -where R: io::Read + Sized +where + R: io::Read + Sized, { let mut b = [0u8; 8]; r.read_exact(&mut b)?; @@ -171,14 +176,16 @@ where R: io::Read + Sized impl Bin for String { fn bwrite(&self, w: &mut W) -> Result<(), BinErr> - where W: io::Write + Sized + where + W: io::Write + Sized, { write_u16(self.len(), w)?; w.write_all(self.as_bytes()).map_err(BinErr::Io) } fn bread(r: &mut R) -> Result - where R: io::Read + Sized + where + R: io::Read + Sized, { let mut len = read_u16(r)?; let mut pos = 0; @@ -196,8 +203,9 @@ impl Bin for String { #[inline] fn write_vec(v: &[B], w: &mut W) -> Result<(), BinErr> - where W: io::Write + Sized, - B: Bin, +where + W: io::Write + Sized, + B: Bin, { write_u16(v.len(), w)?; @@ -210,8 +218,9 @@ fn write_vec(v: &[B], w: &mut W) -> Result<(), BinErr> #[inline] fn read_vec(r: &mut R) -> Result, BinErr> - 
where R: io::Read + Sized, - B: Bin, +where + R: io::Read + Sized, + B: Bin, { let len = read_u16(r)?; @@ -226,14 +235,17 @@ fn read_vec(r: &mut R) -> Result, BinErr> trait Bin: Sized { fn bwrite(&self, w: &mut W) -> Result<(), BinErr> - where W: io::Write + Sized; + where + W: io::Write + Sized; fn bread(r: &mut R) -> Result - where R: io::Read + Sized; + where + R: io::Read + Sized; } impl Bin for Store { fn bwrite(&self, w: &mut W) -> Result<(), BinErr> - where W: io::Write + Sized + where + W: io::Write + Sized, { write_u16(self.licenses.len(), w)?; @@ -246,11 +258,12 @@ impl Bin for Store { } fn bread(r: &mut R) -> Result - where R: io::Read + Sized + where + R: io::Read + Sized, { let map_count = read_u16(r)?; - let mut licenses = std::collections::HashMap::new(); + let mut licenses = std::collections::HashMap::new(); for _ in 0..map_count { let key = String::bread(r)?; @@ -265,7 +278,8 @@ impl Bin for Store { impl Bin for LicenseEntry { fn bwrite(&self, w: &mut W) -> Result<(), BinErr> - where W: io::Write + Sized + where + W: io::Write + Sized, { self.original.bwrite(w)?; write_vec(&self.aliases, w)?; @@ -276,7 +290,8 @@ impl Bin for LicenseEntry { } fn bread(r: &mut R) -> Result - where R: io::Read + Sized + where + R: io::Read + Sized, { Ok(Self { original: TextData::bread(r)?, @@ -289,7 +304,8 @@ impl Bin for LicenseEntry { impl Bin for TextData { fn bwrite(&self, w: &mut W) -> Result<(), BinErr> - where W: io::Write + Sized + where + W: io::Write + Sized, { self.match_data.bwrite(w)?; write_u64(self.lines_view.0, w)?; @@ -301,7 +317,8 @@ impl Bin for TextData { } fn bread(r: &mut R) -> Result - where R: io::Read + Sized + where + R: io::Read + Sized, { Ok(Self { match_data: NgramSet::bread(r)?, @@ -314,15 +331,17 @@ impl Bin for TextData { impl Bin for u32 { fn bwrite(&self, w: &mut W) -> Result<(), BinErr> - where W: io::Write + Sized + where + W: io::Write + Sized, { w.write_all(&self.to_le_bytes()).map_err(BinErr::Io) } fn bread(r: &mut R) -> Result 
- where R: io::Read + Sized + where + R: io::Read + Sized, { - let mut b = [0;4]; + let mut b = [0; 4]; r.read_exact(&mut b)?; Ok(u32::from_le_bytes(b)) } @@ -330,7 +349,8 @@ impl Bin for u32 { impl Bin for NgramSet { fn bwrite(&self, w: &mut W) -> Result<(), BinErr> - where W: io::Write + Sized + where + W: io::Write + Sized, { write_u16(self.map.len(), w)?; for (k, v) in &self.map { @@ -344,7 +364,8 @@ impl Bin for NgramSet { } fn bread(r: &mut R) -> Result - where R: io::Read + Sized + where + R: io::Read + Sized, { let map_len = read_u16(r)?; let mut map = std::collections::HashMap::new(); @@ -358,10 +379,6 @@ impl Bin for NgramSet { r.read_exact(&mut n)?; let size = read_u64(r)?; - Ok(Self { - map, - n: n[0], - size, - }) + Ok(Self { map, n: n[0], size }) } -} \ No newline at end of file +} diff --git a/src/detection/inline_cache.rs b/src/detection/inline_cache.rs index 1f3e851..07c3bf4 100644 --- a/src/detection/inline_cache.rs +++ b/src/detection/inline_cache.rs @@ -6,4 +6,4 @@ impl crate::detection::Store { pub fn load_inline() -> Result { Self::from_cache(CACHE) } -} \ No newline at end of file +} diff --git a/src/detection/license.rs b/src/detection/license.rs index f3a0ca9..3f6dc24 100644 --- a/src/detection/license.rs +++ b/src/detection/license.rs @@ -46,7 +46,7 @@ impl fmt::Display for LicenseType { /// Basic scoring of two texts: /// /// ``` -/// use askalono::TextData; +/// use spdx::detection::TextData; /// /// let license = TextData::from("My First License"); /// let sample = TextData::from("copyright 20xx me irl\n\n // my first license"); @@ -61,7 +61,7 @@ impl fmt::Display for LicenseType { /// /// ``` /// # use std::error::Error; -/// # use askalono::TextData; +/// # use spdx::detection::TextData; /// # fn main() -> Result<(), Box> { /// # let license = TextData::from("My First License"); /// let sample = TextData::from("copyright 20xx me irl\n// My First License\nfn hello() {\n ..."); @@ -285,9 +285,6 @@ impl From for TextData { mod tests { 
use super::*; - // psst: - // cargo test -- --nocapture - #[test] fn optimize_bounds() { let license_text = "this is a license text\nor it pretends to be one\nit's just a test"; diff --git a/src/detection/ngram.rs b/src/detection/ngram.rs index 0211e3c..c2c1929 100644 --- a/src/detection/ngram.rs +++ b/src/detection/ngram.rs @@ -3,7 +3,7 @@ use std::{ cmp::min, - collections::{hash_map::Iter, HashMap, VecDeque}, + collections::{HashMap, VecDeque, hash_map::Iter}, }; #[derive(Clone, Debug)] @@ -38,7 +38,9 @@ impl NgramSet { deque.push_back(w); if deque.len() == self.n as usize { let gram = { - let mut g = String::with_capacity(deque.iter().map(|s| s.len()).sum::() + self.n as usize - 1); + let mut g = String::with_capacity( + deque.iter().map(|s| s.len()).sum::() + self.n as usize - 1, + ); for (i, s) in deque.iter().enumerate() { if i > 0 { diff --git a/src/detection/scan.rs b/src/detection/scan.rs index 22ae040..2398271 100644 --- a/src/detection/scan.rs +++ b/src/detection/scan.rs @@ -68,7 +68,7 @@ pub struct ContainedResult<'a> { /// /// ```rust,should_panic /// # use std::error::Error; -/// use askalono::{Scanner, Store}; +/// use spdx::detection::{scan::Scanner, Store}; /// /// # fn main() -> Result<(), Box> { /// let store = Store::new(); @@ -76,7 +76,7 @@ pub struct ContainedResult<'a> { /// let strategy = Scanner::new(&store) /// .confidence_threshold(0.9) /// .optimize(true); -/// let results = strategy.scan(&"my text to scan".into())?; +/// let results = strategy.scan(&"my text to scan".into()); /// # Ok(()) /// # } /// ``` diff --git a/tests/detection.rs b/tests/detection.rs index bd200f4..275ce72 100644 --- a/tests/detection.rs +++ b/tests/detection.rs @@ -20,7 +20,11 @@ fn reads_inline_cache() { // so that should be the only license that isn't present in the store for lic in spdx::identifiers::LICENSES { if lic.name != "NOASSERTION" { - assert!(set.contains(lic.name), "failed to find expected license {} in inline cache store", lic.name); + assert!( + 
set.contains(lic.name), + "failed to find expected license {} in inline cache store", + lic.name + ); } } -} \ No newline at end of file +} From 8ec378881811a5709bc50f765a7cd556baf93f41 Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 15:06:33 +0100 Subject: [PATCH 05/14] Enable LFS checkout --- .github/workflows/ci.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 61b0894..ae09f29 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -12,7 +12,7 @@ name: CI jobs: lint: name: Lint - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable @@ -30,16 +30,18 @@ jobs: deny-check: name: cargo-deny check - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - uses: EmbarkStudios/cargo-deny-action@v2 msrv-check: name: Minimum Stable Rust Version Check - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 + with: + lfs: true - uses: dtolnay/rust-toolchain@1.85.0 - run: cargo fetch - name: cargo check @@ -49,7 +51,7 @@ jobs: name: Test strategy: matrix: - os: [ubuntu-22.04] + os: [ubuntu-24.04] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 From 8bae709cde66c70430b4be2be69f07b92bc0f639 Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 15:15:10 +0100 Subject: [PATCH 06/14] Sigh lfs --- .github/workflows/ci.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index ae09f29..85ce4fb 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -14,7 +14,7 @@ jobs: name: Lint runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable with: components: "rustfmt,clippy" @@ -32,16 +32,17 @@ jobs: name: cargo-deny check runs-on: ubuntu-24.04 steps: - - 
uses: actions/checkout@v4 + - uses: actions/checkout@v6 - uses: EmbarkStudios/cargo-deny-action@v2 msrv-check: name: Minimum Stable Rust Version Check runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: lfs: true + - run: git lfs checkout - uses: dtolnay/rust-toolchain@1.85.0 - run: cargo fetch - name: cargo check @@ -54,7 +55,7 @@ jobs: os: [ubuntu-24.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - run: cargo fetch - name: cargo build From 645909da35ed0cc48c4ac62e1aa54feaa7551ec4 Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 15:20:50 +0100 Subject: [PATCH 07/14] wtf github --- src/detection/inline_cache.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/detection/inline_cache.rs b/src/detection/inline_cache.rs index 07c3bf4..e5e26e2 100644 --- a/src/detection/inline_cache.rs +++ b/src/detection/inline_cache.rs @@ -4,6 +4,7 @@ impl crate::detection::Store { /// Attempts to load the cached store inlined into this crate's source #[inline] pub fn load_inline() -> Result { + panic!("what in the actual fuck {:?}", std::str::from_utf8(CACHE)); Self::from_cache(CACHE) } } From a96b5f16902bf7866c60bcdccbd5b0b4fb0a02c1 Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 15:23:54 +0100 Subject: [PATCH 08/14] Idiot --- .github/workflows/ci.yaml | 6 +++--- src/detection/inline_cache.rs | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 85ce4fb..1052966 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -40,9 +40,6 @@ jobs: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v6 - with: - lfs: true - - run: git lfs checkout - uses: dtolnay/rust-toolchain@1.85.0 - run: cargo fetch - name: cargo check @@ -56,6 +53,9 @@ jobs: runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v6 + with: + lfs: true + - run: 
git lfs checkout - uses: dtolnay/rust-toolchain@stable - run: cargo fetch - name: cargo build diff --git a/src/detection/inline_cache.rs b/src/detection/inline_cache.rs index e5e26e2..07c3bf4 100644 --- a/src/detection/inline_cache.rs +++ b/src/detection/inline_cache.rs @@ -4,7 +4,6 @@ impl crate::detection::Store { /// Attempts to load the cached store inlined into this crate's source #[inline] pub fn load_inline() -> Result { - panic!("what in the actual fuck {:?}", std::str::from_utf8(CACHE)); Self::from_cache(CACHE) } } From 378d1553227bead0c290bb17e44f910e5cc8265e Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 15:26:32 +0100 Subject: [PATCH 09/14] Use single job for CI pass --- .github/workflows/ci.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1052966..6477e8a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -61,3 +61,9 @@ jobs: - name: cargo build run: cargo build --tests --all-features - run: cargo test --all-features + + test_success: + runs-on: ubuntu-24.04 + needs: [lint, test, deny-check, msrv-check] + steps: + - run: echo "All test jobs passed" From 90a1366f59b9653516a6b41fbc53a130be79d4f9 Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 15:54:44 +0100 Subject: [PATCH 10/14] Fixup docs --- .cargo/config.toml | 1 + src/detection.rs | 9 +++++++++ src/detection/license.rs | 1 + src/expression.rs | 12 ++++++++++-- src/lexer.rs | 4 ++++ src/lib.rs | 26 ++++++++++++++++++++++++-- src/licensee.rs | 1 + 7 files changed, 50 insertions(+), 4 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index b263294..ecadf49 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -74,4 +74,5 @@ rustflags = [ "-Wnonstandard_style", "-Wrust_2018_idioms", # END - Embark standard lints v6 for Rust 1.55+ + "-Dmissing_docs", ] \ No newline at end of file diff --git a/src/detection.rs b/src/detection.rs index 5ed5c18..16b3673 
100644 --- a/src/detection.rs +++ b/src/detection.rs @@ -19,16 +19,24 @@ mod license; pub use license::{LicenseType, TextData}; mod ngram; mod preproc; +/// Contains utilities for scanning texts for license information pub mod scan; +/// An entry in a [`Store`] pub struct LicenseEntry { + /// The original license text pub original: TextData, + /// Set of license identifiers that are aliases (i.e. same license text) as + /// this entry pub aliases: Vec, + /// Set of headers that can be used to specify this license applies to a larger file pub headers: Vec, + /// Similar license texts that will also be scored as this license if detected pub alternates: Vec, } impl LicenseEntry { + /// Creates a new [`Self`] with the specified text pub fn new(original: TextData) -> Self { Self { original, @@ -160,6 +168,7 @@ impl Store { } } +/// The errors that can occur when interacting with a [`Store`] #[derive(Copy, Clone, PartialEq, Debug)] pub enum StoreError { /// The license name was not in the Store diff --git a/src/detection/license.rs b/src/detection/license.rs index 3f6dc24..795e9ae 100644 --- a/src/detection/license.rs +++ b/src/detection/license.rs @@ -197,6 +197,7 @@ impl TextData { self.match_data.dice(&other.match_data) } + /// Determines if this [`TextData`] is equal to another #[inline] pub fn ngram_matches(&self, other: &Self) -> bool { self.match_data.eq(&other.match_data) diff --git a/src/expression.rs b/src/expression.rs index a326f20..a48496e 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -8,11 +8,14 @@ pub use minimize::MinimizeError; use smallvec::SmallVec; use std::fmt; -/// A license requirement inside an SPDX license expression, including -/// the span in the expression where it is located +/// A license requirement inside an SPDX license expression +/// +/// Includes the span in the expression where it is located #[derive(Debug, Clone)] pub struct ExpressionReq { + /// The license requirement pub req: LicenseReq, + /// The span in the original
license expression string containing the requirement pub span: std::ops::Range, } @@ -25,13 +28,18 @@ impl PartialEq for ExpressionReq { /// The joining operators supported by SPDX 2.1 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone)] pub enum Operator { + /// Conjunctive `AND|and` operator that combines two valid license expressions And, + /// Disjunctive `OR|or` operator that combines two valid license expressions Or, } +/// An expression node #[derive(Debug, Clone, PartialEq)] pub enum ExprNode { + /// An operator Op(Operator), + /// A requirement Req(ExpressionReq), } diff --git a/src/lexer.rs b/src/lexer.rs index 70bb4d4..f1a2b3a 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -70,14 +70,18 @@ pub enum Token<'a> { Spdx(LicenseId), /// A `LicenseRef-` prefixed id, with an optional `DocumentRef-` LicenseRef { + /// An optional document reference doc_ref: Option<&'a str>, + /// The name of the license reference lic_ref: &'a str, }, /// A recognized SPDX exception id Exception(ExceptionId), /// A `AdditionRef-` prefixed id, with an optional `DocumentRef-` AdditionRef { + /// An optional document reference doc_ref: Option<&'a str>, + /// The name of the addition reference add_ref: &'a str, }, /// A postfix `+` indicating "or later" for a particular SPDX license id diff --git a/src/lib.rs b/src/lib.rs index d8145b0..41a2ed4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,15 @@ +#![cfg_attr(docsrs, feature(doc_cfg))] + +//! A crate for parsing and evaluating [SPDX license expressions](https://spdx.github.io/spdx-spec/v3.0.1/annexes/spdx-license-expressions/) +//! +//! The `detection` feature also allows detecting the SPDX license for a body of text, if any. 
+ + /// Error types pub mod error; pub mod expression; /// Auto-generated lists of license identifiers and exception identifiers +#[allow(missing_docs)] pub mod identifiers; /// Contains types for lexing an SPDX license expression pub mod lexer; @@ -13,6 +21,7 @@ pub mod detection; /// Auto-generated full canonical text of each license #[cfg(feature = "text")] +#[allow(missing_docs)] pub mod text; pub use error::ParseError; @@ -24,7 +33,9 @@ use std::{ fmt, }; +/// Flags that can apply to licenses and/or license exceptions pub mod flags { + /// Inner type of the flags pub type Type = u8; /// Whether the license is listed as free by the [Free Software Foundation](https://www.gnu.org/licenses/license-list.en.html) @@ -324,13 +335,17 @@ impl fmt::Display for LicenseReq { } } +/// SPDX allows the use of `LicenseRef-` to provide +/// arbitrary licenses that aren't a part of the official SPDX license list #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub struct LicenseRef { - /// Purpose: Identify any external SPDX documents referenced within this SPDX document. + /// Identify any external SPDX documents referenced within this SPDX document. + /// /// See the [spec](https://spdx.org/spdx-specification-21-web-version#h.h430e9ypa0j9) for /// more details. pub doc_ref: Option, - /// Purpose: Provide a locally unique identifier to refer to licenses that are not found on the SPDX License List. + /// Provide a locally unique identifier to refer to licenses that are not found on the SPDX License List. + /// /// See the [spec](https://spdx.org/spdx-specification-21-web-version#h.4f1mdlm) for /// more details. 
pub lic_ref: String, @@ -353,11 +368,14 @@ impl fmt::Display for LicenseRef { pub enum LicenseItem { /// A regular SPDX license id Spdx { + /// The license identifier id: LicenseId, /// Indicates the license had a `+`, allowing the licensee to license /// the software under either the specific version, or any later versions or_later: bool, }, + /// SPDX allows the use of `LicenseRef-` to provide + /// arbitrary licenses that aren't a part of the official SPDX license list Other(Box), } @@ -437,6 +455,8 @@ impl fmt::Display for LicenseItem { } } +/// A user supplied `AdditionRef-` to specify additional text to +/// associate with a license that falls outside the SPDX license list #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub struct AdditionRef { /// Purpose: Identify any external SPDX documents referenced within this SPDX document. @@ -466,6 +486,8 @@ impl fmt::Display for AdditionRef { pub enum AdditionItem { /// A regular SPDX license exception id Spdx(ExceptionId), + /// A user supplied `AdditionRef-` to specify additional text to + /// associate with a license that falls outside the SPDX license list Other(Box), } diff --git a/src/licensee.rs b/src/licensee.rs index d7ff5f3..6377d55 100644 --- a/src/licensee.rs +++ b/src/licensee.rs @@ -234,6 +234,7 @@ impl Licensee { req.addition == self.inner.addition } + /// Converts this [`Self`] into a [`LicenseReq`] #[must_use] pub fn into_req(self) -> LicenseReq { self.inner From d0720851a007680c5115e03984c0f585f81844c6 Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 16:07:41 +0100 Subject: [PATCH 11/14] More docs --- .cargo/config.toml | 1 - Cargo.toml | 4 ++++ README.md | 12 ++++++++++++ src/lib.rs | 7 ++----- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index ecadf49..b263294 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -74,5 +74,4 @@ rustflags = [ "-Wnonstandard_style", "-Wrust_2018_idioms", # END - Embark standard
lints v6 for Rust 1.55+ - "-Dmissing_docs", ] \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index d3ae4fc..c491b05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,8 +28,12 @@ include = [ text = [] # Allows analysis of text to determine if it might be an SPDX license text detection = ["regex", "unicode-normalization"] +# Allows de/serialization of a spdx::detection::Store for quicker loading detection-cache = ["detection", "zstd"] +# Inlines a cache into this crate, which contains all of the licenses from the +# SPDX crate that the crate version was packaged with detection-inline-cache = ["detection-cache"] +# Performs license detection in parallel within the same text detection-parallel = ["detection", "rayon"] [dependencies] diff --git a/README.md b/README.md index 3842bed..c6bacad 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,18 @@ +## About + +This crate's main purpose is to parse and evaluate SPDX license expressions. It also optionally provides the ability to scan text data for SPDX license information. Each version of this crate contains a specific version of the official [SPDX license list](https://spdx.org/licenses/) which can be retrieved via the `spdx::identifiers::VERSION` constant. + +## Features + +- `text` - Includes the full canonical text of each license +- `detection` - Allows analysis of text to determine if it might be an SPDX license text, or have an SPDX license header +- `detection-cache` - Allows de/serialization of a `Store` for quicker loading +- `detection-inline-cache` - Inlines a `Store` cache into this crate, which allows easier loading in downstream crates at the cost of increased binary size +- `detection-parallel` - Performs license detection in parallel within the same text + ## Usage ```rust diff --git a/src/lib.rs b/src/lib.rs index 41a2ed4..cee9ed0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,9 +1,6 @@ #![cfg_attr(docsrs, feature(doc_cfg))] - -//! 
A crate for parsing and evaluating [SPDX license expressions](https://spdx.github.io/spdx-spec/v3.0.1/annexes/spdx-license-expressions/) -//! -//! The `detection` feature also allows detecting the SPDX license for a body of text, if any. - +#![deny(missing_docs)] +#![doc = include_str!("../README.md")] /// Error types pub mod error; From 22494ff0465dea43aefc962e9d0d88a924338765 Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 16:09:44 +0100 Subject: [PATCH 12/14] Update CHANGELOG --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 69a578b..d3be890 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - ReleaseDate +### Added +- [PR#84](https://github.com/EmbarkStudios/spdx/pull/84) resolved [#67](https://github.com/EmbarkStudios/spdx/issues/67) by inlining the `askalono` crate to allow detection of license texts or headers from arbitrary text data. There are multiple feature flags associated with this new feature. + ## [0.12.0] - 2025-08-19 ### Added - [PR#81](https://github.com/EmbarkStudios/spdx/pull/81) resolved [#68](https://github.com/EmbarkStudios/spdx/issues/68) by adding support for the ` WITH [%s"DocumentRef-"(idstring)":"]%s"AdditionRef-"(idstring)` syntax. Thanks [@weihanglo](https://github.com/weihanglo)! 
From 87e419bc403f2e70d346d0179a44f649dcd6334f Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 16:10:45 +0100 Subject: [PATCH 13/14] Format --- src/expression.rs | 2 +- src/lib.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/expression.rs b/src/expression.rs index a48496e..8dfd267 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -9,7 +9,7 @@ use smallvec::SmallVec; use std::fmt; /// A license requirement inside an SPDX license expression -/// +/// /// Includes the span in the expression where it is located #[derive(Debug, Clone)] pub struct ExpressionReq { diff --git a/src/lib.rs b/src/lib.rs index cee9ed0..f0e56b4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -337,12 +337,12 @@ impl fmt::Display for LicenseReq { #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] pub struct LicenseRef { /// Identify any external SPDX documents referenced within this SPDX document. - /// + /// /// See the [spec](https://spdx.org/spdx-specification-21-web-version#h.h430e9ypa0j9) for /// more details. pub doc_ref: Option, /// Provide a locally unique identifier to refer to licenses that are not found on the SPDX License List. - /// + /// /// See the [spec](https://spdx.org/spdx-specification-21-web-version#h.4f1mdlm) for /// more details. pub lic_ref: String, From ddb659d4cb383d55308662c97f0626d1ed54c655 Mon Sep 17 00:00:00 2001 From: Jake Shadle Date: Mon, 1 Dec 2025 16:13:46 +0100 Subject: [PATCH 14/14] Fix clippy lint --- README.md | 54 ++++++++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index c6bacad..5ee09c1 100644 --- a/README.md +++ b/README.md @@ -34,34 +34,32 @@ This crate's main purpose is to parse and evaluate SPDX license expressions.
It ```rust use spdx::Expression; -fn main() { - let this_is_fine = Expression::parse("MIT OR Apache-2.0").unwrap(); - - assert!(this_is_fine.evaluate(|req| { - if let spdx::LicenseItem::Spdx { id, .. } = req.license { - // Both MIT and Apache-2.0 are OSI approved, so this expression - // evaluates to true - return id.is_osi_approved(); - } - - false - })); - - assert!(!this_is_fine.evaluate(|req| { - if let spdx::LicenseItem::Spdx { id, .. } = req.license { - // This is saying we don't accept any licenses that are OSI approved - // so the expression will evaluate to false as both sides of the OR - // are now rejected - return !id.is_osi_approved(); - } - - false - })); - - // `NOPE` is not a valid SPDX license identifier, so this expression - // will fail to parse - let _this_is_not = Expression::parse("MIT OR NOPE").unwrap_err(); -} +let this_is_fine = Expression::parse("MIT OR Apache-2.0").unwrap(); + +assert!(this_is_fine.evaluate(|req| { + if let spdx::LicenseItem::Spdx { id, .. } = req.license { + // Both MIT and Apache-2.0 are OSI approved, so this expression + // evaluates to true + return id.is_osi_approved(); + } + + false +})); + +assert!(!this_is_fine.evaluate(|req| { + if let spdx::LicenseItem::Spdx { id, .. } = req.license { + // This is saying we don't accept any licenses that are OSI approved + // so the expression will evaluate to false as both sides of the OR + // are now rejected + return !id.is_osi_approved(); + } + + false +})); + +// `NOPE` is not a valid SPDX license identifier, so this expression +// will fail to parse +let _this_is_not = Expression::parse("MIT OR NOPE").unwrap_err(); ``` ## Updating SPDX list