|
38 | 38 |
|
39 | 39 | use chrono::{DateTime, Utc}; |
40 | 40 |
|
41 | | -use crate::openhuman::memory::tree::content_store::paths::{sanitize_filename, SummaryTreeKind}; |
| 41 | +use crate::openhuman::memory::tree::content_store::paths::{ |
| 42 | + sanitize_filename, slugify_source_id, SummaryTreeKind, |
| 43 | +}; |
42 | 44 | use crate::openhuman::memory::tree::types::{Chunk, SourceKind}; |
43 | 45 |
|
| 46 | +/// Build the canonical Obsidian `source/<slug>` tag for a given |
| 47 | +/// `source_id`. Used to seed the `tags:` block on every chunk and |
| 48 | +/// every source-tree summary so the Obsidian graph view can filter by |
| 49 | +/// source. |
| 50 | +/// |
| 51 | +/// Slug rules match `slugify_source_id` (lowercase ASCII, `-` separators, |
| 52 | +/// alphanumerics + `_` preserved) so the tag matches the on-disk |
| 53 | +/// `raw/<slug>/...` directory name byte-for-byte. |
| 54 | +pub fn source_tag(source_id: &str) -> String { |
| 55 | + format!("source/{}", slugify_source_id(source_id)) |
| 56 | +} |
| 57 | + |
| 58 | +/// Prepend the source tag to `tags`, dedup, and return the new list. |
| 59 | +/// Order is preserved otherwise — `source/...` always comes first so |
| 60 | +/// it shows up at the top of the YAML block. |
| 61 | +pub fn with_source_tag(source_id: &str, tags: &[String]) -> Vec<String> { |
| 62 | + let st = source_tag(source_id); |
| 63 | + let mut out = Vec::with_capacity(tags.len() + 1); |
| 64 | + out.push(st.clone()); |
| 65 | + for t in tags { |
| 66 | + if t != &st { |
| 67 | + out.push(t.clone()); |
| 68 | + } |
| 69 | + } |
| 70 | + out |
| 71 | +} |
| 72 | + |
/// Look up a top-level YAML scalar field (e.g. `source_id`, `tree_scope`,
/// `tree_kind`) in a frontmatter string and return its value as an owned
/// `String`.
///
/// Only lines that begin at column 0 with `<key>: ` are considered; lines
/// starting with a space or tab are treated as list items / nested mappings
/// and ignored. The first matching line wins.
///
/// The value is trimmed of surrounding whitespace. If it is wrapped in
/// double quotes, the quotes are removed and the `\"` and `\\` escape
/// sequences are decoded, so the result matches what the original composer
/// passed in. Unquoted values are returned as-is.
///
/// Returns `None` when no top-level line starts with `<key>: `.
pub fn scan_fm_field(fm: &str, key: &str) -> Option<String> {
    fm.lines()
        // Indented lines belong to lists / nested mappings, never to a
        // top-level key.
        .filter(|line| !line.starts_with([' ', '\t']))
        // Match `<key>: ` without allocating a temporary prefix string.
        .find_map(|line| line.strip_prefix(key)?.strip_prefix(": "))
        .map(|rest| {
            let value = rest.trim();
            match value.strip_prefix('"').and_then(|s| s.strip_suffix('"')) {
                // Quoted scalar: undo the quote escape first, then the
                // backslash escape.
                Some(inner) => inner.replace("\\\"", "\"").replace("\\\\", "\\"),
                None => value.to_owned(),
            }
        })
}
| 95 | + |
44 | 96 | /// Compose the full file content (front-matter + body) for `chunk`. |
45 | 97 | /// |
46 | 98 | /// Returns `(full_file_bytes, body_bytes)`. The caller writes `full_file_bytes` |
@@ -78,13 +130,13 @@ fn build_front_matter(chunk: &Chunk) -> Vec<u8> { |
78 | 130 | fm.push_str(&format!("source_ref: {}\n", yaml_scalar(&sr.value))); |
79 | 131 | } |
80 | 132 |
|
81 | | - if meta.tags.is_empty() { |
82 | | - fm.push_str("tags: []\n"); |
83 | | - } else { |
84 | | - fm.push_str("tags:\n"); |
85 | | - for tag in &meta.tags { |
86 | | - fm.push_str(&format!(" - {}\n", yaml_scalar(tag))); |
87 | | - } |
| 133 | + // Always seed the source tag so the Obsidian graph filter can pick |
| 134 | + // up `source/<slug>` for every chunk regardless of what the |
| 135 | + // ingest-side tag list contained. |
| 136 | + let seeded_tags = with_source_tag(&meta.source_id, &meta.tags); |
| 137 | + fm.push_str("tags:\n"); |
| 138 | + for tag in &seeded_tags { |
| 139 | + fm.push_str(&format!(" - {}\n", yaml_scalar(tag))); |
88 | 140 | } |
89 | 141 |
|
90 | 142 | // Email-specific fields: participants list + Obsidian alias. |
@@ -369,7 +421,16 @@ fn build_summary_front_matter(r: &SummaryComposeInput<'_>) -> String { |
369 | 421 | fm.push_str("aliases:\n"); |
370 | 422 | fm.push_str(&format!(" - {}\n", yaml_scalar(&alias))); |
371 | 423 |
|
372 | | - fm.push_str("tags: []\n"); |
| 424 | + // Source-tree summaries get a `source/<slug>` seed tag for graph |
| 425 | + // filtering. Global / topic trees aggregate across sources, so the |
| 426 | + // `source/...` tag has no single value there — leave them untagged |
| 427 | + // at compose time (LLM extraction adds entity tags later). |
| 428 | + if matches!(r.tree_kind, SummaryTreeKind::Source) { |
| 429 | + fm.push_str("tags:\n"); |
| 430 | + fm.push_str(&format!(" - {}\n", yaml_scalar(&source_tag(r.tree_scope)))); |
| 431 | + } else { |
| 432 | + fm.push_str("tags: []\n"); |
| 433 | + } |
373 | 434 | fm.push_str("---\n"); |
374 | 435 | fm |
375 | 436 | } |
@@ -790,7 +851,10 @@ mod tests { |
790 | 851 | fm.contains(" - \"[[child-2]]\""), |
791 | 852 | "must list child ids as Obsidian wikilinks; got:\n{fm}" |
792 | 853 | ); |
793 | | - assert!(fm.contains("tags: []"), "must start with empty tags"); |
| 854 | + assert!( |
| 855 | + fm.contains(" - source/"), |
| 856 | + "source-tree summary must seed source tag; got:\n{fm}" |
| 857 | + ); |
794 | 858 | // aliases must mention the scope |
795 | 859 | assert!(fm.contains("aliases:"), "must have aliases"); |
796 | 860 | assert!( |
|
0 commit comments