Skip to content

Commit fb6f1e3

Browse files
authored
feat(memory): kind-aware raw layout, _source.md mirror, source/<slug> tag (tinyhumansai#1351)
1 parent 836a6d8 commit fb6f1e3

7 files changed

Lines changed: 510 additions & 54 deletions

File tree

src/openhuman/composio/providers/gmail/ingest.rs

Lines changed: 8 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,8 @@ use crate::openhuman::memory::tree::canonicalize::email::{EmailMessage, EmailThr
2525
use crate::openhuman::memory::tree::canonicalize::email_clean::{
2626
extract_email, parse_message_date,
2727
};
28-
use crate::openhuman::memory::tree::content_store::paths::slugify_source_id;
2928
use crate::openhuman::memory::tree::content_store::raw::{
30-
self as raw_store, slug_account_email, RawItem,
29+
self as raw_store, raw_rel_path, slug_account_email, RawItem, RawKind,
3130
};
3231
use crate::openhuman::memory::tree::ingest::{ingest_email, IngestResult};
3332
use crate::openhuman::memory::tree::store::{set_chunk_raw_refs, RawRef};
@@ -222,7 +221,7 @@ fn parse_address_list(v: Option<&Value>) -> Vec<String> {
222221
///
223222
/// In addition to the chunked content_store output, we mirror every
224223
/// admitted message as a verbatim `.md` under
225-
/// `<content_root>/raw/<source_slug>/<created_at_ms>_<message_id>.md`.
224+
/// `<content_root>/raw/<source_slug>/emails/<created_at_ms>_<message_id>.md`.
226225
/// Useful for debugging, Obsidian browsing, and as a stable archive
227226
/// independent of the chunker / summariser.
228227
///
@@ -334,7 +333,6 @@ async fn ingest_per_message(
334333
owner: &str,
335334
page_messages: &[Value],
336335
) -> usize {
337-
let source_slug = slugify_source_id(source_id);
338336
let mut total_chunks = 0usize;
339337
for raw in page_messages {
340338
let id = raw
@@ -349,11 +347,11 @@ async fn ingest_per_message(
349347
continue;
350348
};
351349

352-
let raw_path = format!(
353-
"raw/{}/{}_{}.md",
354-
source_slug,
350+
let raw_path = raw_rel_path(
351+
source_id,
352+
RawKind::Email,
355353
sent_at.timestamp_millis(),
356-
sanitize_uid_for_path(msg_id)
354+
msg_id,
357355
);
358356

359357
let thread_subject = pick_thread_subject(std::slice::from_ref(&message));
@@ -393,27 +391,9 @@ async fn ingest_per_message(
393391
total_chunks
394392
}
395393

396-
/// Same character map the raw-archive writer uses for filenames.
397-
/// Mirrors `raw_store::write_raw_items::sanitize_uid` but local so a
398-
/// future rule change on either side stays decoupled.
399-
fn sanitize_uid_for_path(uid: &str) -> String {
400-
let cleaned: String = uid
401-
.chars()
402-
.map(|c| match c {
403-
'\\' | '/' | ':' | '*' | '?' | '"' | '<' | '>' | '|' | ' ' => '-',
404-
other => other,
405-
})
406-
.collect();
407-
if cleaned.is_empty() {
408-
"unknown".into()
409-
} else {
410-
cleaned
411-
}
412-
}
413-
414394
/// Mirror a page of raw Gmail messages into the on-disk raw archive.
415395
///
416-
/// Files land under `<content_root>/raw/<source_slug>/<ts_ms>_<msg_id>.md`.
396+
/// Files land under `<content_root>/raw/<source_slug>/emails/<ts_ms>_<msg_id>.md`.
417397
/// We write the **backend-produced markdown verbatim** — the
418398
/// `markdown` field on each message is the per-message slice of the
419399
/// response-level `markdownFormatted`, pinned by
@@ -493,6 +473,7 @@ fn write_raw_archive(config: &Config, source_id: &str, page: &[Value]) -> Result
493473
uid: id,
494474
created_at_ms: *ts,
495475
markdown: md.as_str(),
476+
kind: RawKind::Email,
496477
})
497478
.collect();
498479

src/openhuman/memory/tree/content_store/compose.rs

Lines changed: 74 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,61 @@
3838
3939
use chrono::{DateTime, Utc};
4040

41-
use crate::openhuman::memory::tree::content_store::paths::{sanitize_filename, SummaryTreeKind};
41+
use crate::openhuman::memory::tree::content_store::paths::{
42+
sanitize_filename, slugify_source_id, SummaryTreeKind,
43+
};
4244
use crate::openhuman::memory::tree::types::{Chunk, SourceKind};
4345

46+
/// Build the canonical Obsidian `source/<slug>` tag for a given
47+
/// `source_id`. Used to seed the `tags:` block on every chunk and
48+
/// every source-tree summary so the Obsidian graph view can filter by
49+
/// source.
50+
///
51+
/// Slug rules match `slugify_source_id` (lowercase ASCII, `-` separators,
52+
/// alphanumerics + `_` preserved) so the tag matches the on-disk
53+
/// `raw/<slug>/...` directory name byte-for-byte.
54+
pub fn source_tag(source_id: &str) -> String {
55+
format!("source/{}", slugify_source_id(source_id))
56+
}
57+
58+
/// Prepend the source tag to `tags`, dedup, and return the new list.
59+
/// Order is preserved otherwise — `source/...` always comes first so
60+
/// it shows up at the top of the YAML block.
61+
pub fn with_source_tag(source_id: &str, tags: &[String]) -> Vec<String> {
62+
let st = source_tag(source_id);
63+
let mut out = Vec::with_capacity(tags.len() + 1);
64+
out.push(st.clone());
65+
for t in tags {
66+
if t != &st {
67+
out.push(t.clone());
68+
}
69+
}
70+
out
71+
}
72+
73+
/// Parse the value of a top-level YAML scalar field (e.g. `source_id`,
74+
/// `tree_scope`, `tree_kind`) from a frontmatter string. Strips
75+
/// surrounding double-quotes if present so the returned slice matches
76+
/// what the original composer passed in. Returns `None` if the key is
77+
/// not present at the top level of the frontmatter.
78+
pub fn scan_fm_field<'a>(fm: &'a str, key: &str) -> Option<String> {
79+
let prefix = format!("{key}: ");
80+
for raw in fm.lines() {
81+
// Skip indented lines (those are list items / nested mappings).
82+
if raw.starts_with(' ') || raw.starts_with('\t') {
83+
continue;
84+
}
85+
if let Some(rest) = raw.strip_prefix(&prefix) {
86+
let trimmed = rest.trim();
87+
if let Some(inner) = trimmed.strip_prefix('"').and_then(|s| s.strip_suffix('"')) {
88+
return Some(inner.replace("\\\"", "\"").replace("\\\\", "\\"));
89+
}
90+
return Some(trimmed.to_string());
91+
}
92+
}
93+
None
94+
}
95+
4496
/// Compose the full file content (front-matter + body) for `chunk`.
4597
///
4698
/// Returns `(full_file_bytes, body_bytes)`. The caller writes `full_file_bytes`
@@ -78,13 +130,13 @@ fn build_front_matter(chunk: &Chunk) -> Vec<u8> {
78130
fm.push_str(&format!("source_ref: {}\n", yaml_scalar(&sr.value)));
79131
}
80132

81-
if meta.tags.is_empty() {
82-
fm.push_str("tags: []\n");
83-
} else {
84-
fm.push_str("tags:\n");
85-
for tag in &meta.tags {
86-
fm.push_str(&format!(" - {}\n", yaml_scalar(tag)));
87-
}
133+
// Always seed the source tag so the Obsidian graph filter can pick
134+
// up `source/<slug>` for every chunk regardless of what the
135+
// ingest-side tag list contained.
136+
let seeded_tags = with_source_tag(&meta.source_id, &meta.tags);
137+
fm.push_str("tags:\n");
138+
for tag in &seeded_tags {
139+
fm.push_str(&format!(" - {}\n", yaml_scalar(tag)));
88140
}
89141

90142
// Email-specific fields: participants list + Obsidian alias.
@@ -369,7 +421,16 @@ fn build_summary_front_matter(r: &SummaryComposeInput<'_>) -> String {
369421
fm.push_str("aliases:\n");
370422
fm.push_str(&format!(" - {}\n", yaml_scalar(&alias)));
371423

372-
fm.push_str("tags: []\n");
424+
// Source-tree summaries get a `source/<slug>` seed tag for graph
425+
// filtering. Global / topic trees aggregate across sources, so the
426+
// `source/...` tag has no single value there — leave them untagged
427+
// at compose time (LLM extraction adds entity tags later).
428+
if matches!(r.tree_kind, SummaryTreeKind::Source) {
429+
fm.push_str("tags:\n");
430+
fm.push_str(&format!(" - {}\n", yaml_scalar(&source_tag(r.tree_scope))));
431+
} else {
432+
fm.push_str("tags: []\n");
433+
}
373434
fm.push_str("---\n");
374435
fm
375436
}
@@ -790,7 +851,10 @@ mod tests {
790851
fm.contains(" - \"[[child-2]]\""),
791852
"must list child ids as Obsidian wikilinks; got:\n{fm}"
792853
);
793-
assert!(fm.contains("tags: []"), "must start with empty tags");
854+
assert!(
855+
fm.contains(" - source/"),
856+
"source-tree summary must seed source tag; got:\n{fm}"
857+
);
794858
// aliases must mention the scope
795859
assert!(fm.contains("aliases:"), "must have aliases");
796860
assert!(

0 commit comments

Comments
 (0)