Skip to content

Commit aa5ded7

Browse files
committed
A-fix-postgrest: URL decode + table validation + pub ParseError + ilike/in/is/like tests
1 parent 5ecc88f commit aa5ded7

1 file changed

Lines changed: 300 additions & 5 deletions

File tree

crates/lance-graph-callcenter/src/postgrest.rs

Lines changed: 300 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
//! META-AGENT (follow-up wiring, do NOT do here):
88
//! - gate behind feature `postgrest`; add `pub mod postgrest;` to lib.rs;
99
//! - add `postgrest = ["dep:serde", "dep:serde_json"]` to Cargo.toml [features].
10+
//! - epiphany E4 outlook (PR #278): add an OPTIONAL feature
11+
//! `datafusion-dispatch = ["dep:datafusion"]` that compiles the stub
12+
//! `parsed_query_to_plan` at the bottom of this file into a real
13+
//! PostgREST → DataFusion → RlsRewriter dispatcher. Keep it optional
14+
//! so the zero-dep posture of the crate is preserved by default.
1015
//!
1116
//! This module is dependency-free: it uses only `std`, and emits JSON via
1217
//! a small hand-rolled writer so it compiles under the crate's default
@@ -188,8 +193,25 @@ pub struct ParseError {
188193
}
189194

190195
impl ParseError {
191-
fn new(msg: impl Into<String>) -> Self {
192-
Self { message: msg.into() }
196+
/// Construct a `ParseError` from any string-ish value.
197+
///
198+
/// Public so downstream crates (e.g. an upcoming `PostgRestDispatcher`
199+
/// that wires PostgREST → DataFusion → RlsRewriter) can surface their
200+
/// own parse-time errors through this same type.
201+
pub fn new(message: impl Into<String>) -> Self {
202+
Self { message: message.into() }
203+
}
204+
}
205+
206+
impl From<String> for ParseError {
207+
fn from(message: String) -> Self {
208+
Self { message }
209+
}
210+
}
211+
212+
impl From<&str> for ParseError {
213+
fn from(message: &str) -> Self {
214+
Self { message: message.to_string() }
193215
}
194216
}
195217

@@ -201,6 +223,51 @@ impl std::fmt::Display for ParseError {
201223

202224
impl std::error::Error for ParseError {}
203225

226+
// ── URL decoding (HIGH) ──────────────────────────────────────────────────────
227+
228+
/// Percent-decode a URL component. Handles `%XX` and `+` → space (form-encoding).
229+
/// Returns None if malformed (incomplete %XX or invalid hex).
230+
fn percent_decode(s: &str) -> Option<String> {
231+
let mut out = Vec::with_capacity(s.len());
232+
let bytes = s.as_bytes();
233+
let mut i = 0;
234+
while i < bytes.len() {
235+
match bytes[i] {
236+
b'+' => {
237+
out.push(b' ');
238+
i += 1;
239+
}
240+
b'%' if i + 2 < bytes.len() => {
241+
let hex = std::str::from_utf8(&bytes[i + 1..i + 3]).ok()?;
242+
let v = u8::from_str_radix(hex, 16).ok()?;
243+
out.push(v);
244+
i += 3;
245+
}
246+
b'%' => return None, // truncated escape
247+
c => {
248+
out.push(c);
249+
i += 1;
250+
}
251+
}
252+
}
253+
String::from_utf8(out).ok()
254+
}
255+
256+
// ── Table-name validation (MEDIUM) ───────────────────────────────────────────
257+
258+
/// Validate a PostgREST table name: ASCII alphanumeric + underscore, not
259+
/// starting with `_` (rejects magic / reserved names), not empty.
260+
///
261+
/// This rejects path-traversal (`..`), dotted paths (`users.json`), and any
262+
/// non-ASCII identifier. PostgREST's real surface is more permissive (quoted
263+
/// identifiers can contain anything), but the SMB+MedCare subset only ever
264+
/// uses simple snake_case names; tighter validation is the safer default.
265+
fn is_valid_table_name(s: &str) -> bool {
266+
!s.is_empty()
267+
&& s.chars().all(|c| c.is_ascii_alphanumeric() || c == '_')
268+
&& !s.starts_with('_')
269+
}
270+
204271
// ── Path parser ──────────────────────────────────────────────────────────────
205272

206273
/// Pure function — no I/O. Parses path query string into [`ParsedQuery`].
@@ -230,6 +297,14 @@ pub fn parse_path(path: &str) -> Result<ParsedQuery, ParseError> {
230297
)));
231298
}
232299

300+
// Reject path-traversal (`..`), dotted paths (`users.json`), magic
301+
// names (`_internal`), and any non-ASCII-identifier table label.
302+
if !is_valid_table_name(table_part) {
303+
return Err(ParseError::new(format!(
304+
"invalid table name: {table_part}"
305+
)));
306+
}
307+
233308
let mut parsed = ParsedQuery {
234309
table: table_part.to_string(),
235310
..Default::default()
@@ -261,8 +336,18 @@ pub fn parse_path(path: &str) -> Result<ParsedQuery, ParseError> {
261336
}
262337

263338
match key {
264-
"select" => parsed.select = Some(value.to_string()),
265-
"order" => parsed.order = Some(value.to_string()),
339+
"select" => {
340+
let decoded = percent_decode(value).ok_or_else(|| {
341+
ParseError::new(format!("malformed urlencoding in select: {value}"))
342+
})?;
343+
parsed.select = Some(decoded);
344+
}
345+
"order" => {
346+
let decoded = percent_decode(value).ok_or_else(|| {
347+
ParseError::new(format!("malformed urlencoding in order: {value}"))
348+
})?;
349+
parsed.order = Some(decoded);
350+
}
266351
"limit" => {
267352
parsed.limit = Some(value.parse::<u64>().map_err(|_| {
268353
ParseError::new(format!("bad limit: {value}"))
@@ -286,10 +371,19 @@ pub fn parse_path(path: &str) -> Result<ParsedQuery, ParseError> {
286371
let op = FilterOp::parse(op_str).ok_or_else(|| {
287372
ParseError::new(format!("unknown filter op: {op_str}"))
288373
})?;
374+
// URL-decode AFTER `op.` split so e.g. `eq.foo%40bar.com`
375+
// yields `foo@bar.com` (and not a confused dot-split on a
376+
// decoded `.`). Op tokens themselves are ASCII and never
377+
// need decoding.
378+
let decoded = percent_decode(val).ok_or_else(|| {
379+
ParseError::new(format!(
380+
"malformed urlencoding in filter value: {val}"
381+
))
382+
})?;
289383
parsed.filters.push(Filter {
290384
column: key.to_string(),
291385
op,
292-
value: val.to_string(),
386+
value: decoded,
293387
});
294388
}
295389
}
@@ -400,6 +494,11 @@ fn push_optional_string(out: &mut String, v: Option<&str>) {
400494
}
401495

402496
/// Encode a Rust `&str` as a JSON string literal (with surrounding quotes).
497+
///
498+
/// Supplementary-plane characters (U+10000 and above) are emitted as a
499+
/// UTF-16 surrogate pair `\uXXXX\uXXXX` per RFC 8259 §7. BMP characters
500+
/// pass through as-is (UTF-8 in the JSON text is fine; we only escape
501+
/// when we have to).
403502
fn json_string(s: &str) -> String {
404503
let mut out = String::with_capacity(s.len() + 2);
405504
out.push('"');
@@ -415,6 +514,15 @@ fn json_string(s: &str) -> String {
415514
c if (c as u32) < 0x20 => {
416515
out.push_str(&format!("\\u{:04x}", c as u32));
417516
}
517+
c if (c as u32) > 0xFFFF => {
518+
// Supplementary plane: encode as UTF-16 surrogate pair.
519+
// high = 0xD800 + ((cp - 0x10000) >> 10)
520+
// low = 0xDC00 + ((cp - 0x10000) & 0x3FF)
521+
let cp = c as u32 - 0x10000;
522+
let high = 0xD800 + (cp >> 10);
523+
let low = 0xDC00 + (cp & 0x3FF);
524+
out.push_str(&format!("\\u{:04x}\\u{:04x}", high, low));
525+
}
418526
c => out.push(c),
419527
}
420528
}
@@ -656,4 +764,191 @@ mod tests {
656764
assert!(s.contains("\"offset\":null"), "{s}");
657765
assert!(s.contains("\"filters\":[]"), "{s}");
658766
}
767+
768+
// ── Filter-op coverage (LOW) — ilike / in / is / like ────────────────
769+
770+
#[test]
771+
fn parse_ilike_filter() {
772+
let q = parse_path("users?email=ilike.*@example.com").expect("parse ok");
773+
assert_eq!(q.table, "users");
774+
assert_eq!(q.filters.len(), 1);
775+
let f = &q.filters[0];
776+
assert_eq!(f.column, "email");
777+
assert_eq!(f.op, FilterOp::ILike);
778+
assert_eq!(f.value, "*@example.com");
779+
}
780+
781+
#[test]
782+
fn parse_in_filter() {
783+
let q = parse_path("users?id=in.(1,2,3)").expect("parse ok");
784+
assert_eq!(q.filters.len(), 1);
785+
let f = &q.filters[0];
786+
assert_eq!(f.column, "id");
787+
assert_eq!(f.op, FilterOp::In);
788+
assert_eq!(f.value, "(1,2,3)");
789+
}
790+
791+
#[test]
792+
fn parse_is_null_filter() {
793+
let q = parse_path("users?status=is.null").expect("parse ok");
794+
let f = &q.filters[0];
795+
assert_eq!(f.column, "status");
796+
assert_eq!(f.op, FilterOp::Is);
797+
assert_eq!(f.value, "null");
798+
}
799+
800+
#[test]
801+
fn parse_like_filter() {
802+
let q = parse_path("users?name=like.J*").expect("parse ok");
803+
let f = &q.filters[0];
804+
assert_eq!(f.column, "name");
805+
assert_eq!(f.op, FilterOp::Like);
806+
assert_eq!(f.value, "J*");
807+
}
808+
809+
// ── URL-decode tests (HIGH) ──────────────────────────────────────────
810+
811+
#[test]
812+
fn parse_url_decode_space() {
813+
let q = parse_path("users?name=eq.John%20Doe").expect("parse ok");
814+
assert_eq!(q.filters[0].value, "John Doe");
815+
}
816+
817+
#[test]
818+
fn parse_url_decode_at_sign() {
819+
let q = parse_path("users?email=eq.foo%40bar.com").expect("parse ok");
820+
assert_eq!(q.filters[0].value, "foo@bar.com");
821+
}
822+
823+
#[test]
824+
fn parse_url_decode_plus_sign() {
825+
// %2B → '+' (literal plus, NOT a space — that's only bare `+`).
826+
let q = parse_path("users?notes=eq.Hello%2BWorld").expect("parse ok");
827+
assert_eq!(q.filters[0].value, "Hello+World");
828+
}
829+
830+
#[test]
831+
fn parse_url_decode_bare_plus_is_space() {
832+
// form-encoding convention: bare `+` decodes to space.
833+
let q = parse_path("users?name=eq.John+Doe").expect("parse ok");
834+
assert_eq!(q.filters[0].value, "John Doe");
835+
}
836+
837+
#[test]
838+
fn parse_url_decode_select_and_order() {
839+
let q = parse_path("users?select=id%2Cname&order=created.desc")
840+
.expect("parse ok");
841+
assert_eq!(q.select.as_deref(), Some("id,name"));
842+
assert_eq!(q.order.as_deref(), Some("created.desc"));
843+
}
844+
845+
#[test]
846+
fn parse_url_decode_truncated_escape_errors() {
847+
let err = parse_path("users?name=eq.foo%2").expect_err("must error");
848+
assert!(err.message.contains("malformed urlencoding"), "{}", err.message);
849+
}
850+
851+
#[test]
852+
fn parse_url_decode_bad_hex_errors() {
853+
let err = parse_path("users?name=eq.foo%ZZbar").expect_err("must error");
854+
assert!(err.message.contains("malformed urlencoding"), "{}", err.message);
855+
}
856+
857+
// ── Table-validation tests (MEDIUM) ──────────────────────────────────
858+
859+
#[test]
860+
fn parse_table_traversal_errors() {
861+
// `..` contains `.`, plus `/` → caught by either nested-path or
862+
// invalid-table-name guard. Either error message is acceptable.
863+
let err = parse_path("../../../etc/passwd").expect_err("must error");
864+
assert!(
865+
err.message.contains("nested path") || err.message.contains("invalid table name"),
866+
"{}",
867+
err.message
868+
);
869+
}
870+
871+
#[test]
872+
fn parse_table_with_period_errors() {
873+
let err = parse_path("users.json?id=eq.1").expect_err("must error");
874+
assert!(err.message.contains("invalid table name"), "{}", err.message);
875+
}
876+
877+
#[test]
878+
fn parse_table_underscore_prefix_errors() {
879+
let err = parse_path("_internal?id=eq.1").expect_err("must error");
880+
assert!(err.message.contains("invalid table name"), "{}", err.message);
881+
}
882+
883+
#[test]
884+
fn parse_table_unicode_errors() {
885+
let err = parse_path("ünits?id=eq.1").expect_err("must error");
886+
assert!(err.message.contains("invalid table name"), "{}", err.message);
887+
}
888+
889+
#[test]
890+
fn parse_table_leading_slash_still_works() {
891+
// Existing behaviour preserved.
892+
let q = parse_path("/users").expect("parse ok");
893+
assert_eq!(q.table, "users");
894+
}
895+
896+
// ── ParseError public surface ────────────────────────────────────────
897+
898+
#[test]
899+
fn parse_error_public_constructors() {
900+
let a = ParseError::new("a");
901+
let b: ParseError = "b".into();
902+
let c: ParseError = String::from("c").into();
903+
assert_eq!(a.message, "a");
904+
assert_eq!(b.message, "b");
905+
assert_eq!(c.message, "c");
906+
// Display impl is reachable.
907+
assert!(format!("{a}").contains("a"));
908+
}
909+
910+
// ── Surrogate-pair JSON escaping (LOW) ───────────────────────────────
911+
912+
#[test]
913+
fn json_string_supplementary_plane_emits_surrogate_pair() {
914+
// U+1F600 GRINNING FACE → 😀
915+
let out = json_string("\u{1F600}");
916+
assert_eq!(out, "\"\\ud83d\\ude00\"", "{out}");
917+
}
918+
919+
#[test]
920+
fn json_string_bmp_passes_through() {
921+
// BMP chars are emitted as UTF-8, not escaped.
922+
let out = json_string("café");
923+
assert_eq!(out, "\"café\"");
924+
}
925+
}
926+
927+
// ── EPIPHANY E4 SEED — PostgREST → DataFusion dispatch stub ──────────────────
928+
//
929+
// Doc-only stub for the upcoming `PostgRestDispatcher` (PR #278 outlook E4)
930+
// that wires PostgREST → DataFusion → RlsRewriter. Compiled only under the
931+
// optional `datafusion-dispatch` feature so the zero-dep default posture of
932+
// the crate is preserved.
933+
//
934+
// META-AGENT (follow-up wiring, do NOT do here):
935+
// - add `datafusion-dispatch = ["dep:datafusion"]` to Cargo.toml [features];
936+
// - add `datafusion = { version = "...", optional = true }` to dependencies;
937+
// - flesh out `parsed_query_to_plan` to translate a `ParsedQuery` into a
938+
// `LogicalPlan` (TableScan → Filter → Projection → Sort → Limit), then
939+
// hand it to `crate::rls::RlsRewriter` for row-level-security overlay.
940+
941+
/// Convert a [`ParsedQuery`] to a DataFusion `LogicalPlan`.
942+
///
943+
/// **NOT IMPLEMENTED** in this PR. Stub interface for the upcoming
944+
/// `PostgRestDispatcher` that wires PostgREST → DataFusion → RlsRewriter.
945+
/// See PR #278 outlook epiphany E4.
946+
#[cfg(feature = "datafusion-dispatch")]
947+
pub fn parsed_query_to_plan(
948+
_query: &ParsedQuery,
949+
_ctx: &datafusion::execution::context::SessionContext,
950+
) -> Result<datafusion::logical_expr::LogicalPlan, ParseError> {
951+
Err(ParseError::new(
952+
"parsed_query_to_plan: not yet implemented (PR #278 outlook E4)",
953+
))
659954
}

0 commit comments

Comments
 (0)