-
Notifications
You must be signed in to change notification settings - Fork 187
Expand file tree
/
Copy pathentity_parser.py
More file actions
154 lines (126 loc) · 5.23 KB
/
entity_parser.py
File metadata and controls
154 lines (126 loc) · 5.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""Parser for markdown files into Entity objects.
Uses markdown-it with plugins to parse structured data from markdown content.
"""
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Optional
import dateparser
import frontmatter
import yaml
from loguru import logger
from markdown_it import MarkdownIt
from basic_memory.markdown.plugins import observation_plugin, relation_plugin
from basic_memory.markdown.schemas import (
EntityFrontmatter,
EntityMarkdown,
Observation,
Relation,
)
from basic_memory.utils import parse_tags
# Module-level markdown-it parser with the observation and relation plugins
# registered; parse() below runs every document through this one instance.
md = MarkdownIt().use(observation_plugin).use(relation_plugin)
@dataclass
class EntityContent:
    """Result of parsing a markdown body: the text plus what was extracted from it."""

    # Markdown body exactly as it was handed to the parser.
    content: str
    # Observations found in the body; each instance gets its own empty list by default.
    observations: list[Observation] = field(default_factory=list)
    # Relations found in the body; each instance gets its own empty list by default.
    relations: list[Relation] = field(default_factory=list)
def parse(content: str) -> EntityContent:
    """Extract observations and relations from a markdown body.

    The text is tokenized with the module-level ``md`` parser. Tokens whose
    ``meta`` carries an ``"observation"`` entry or a ``"relations"`` entry
    are validated into ``Observation`` / ``Relation`` models.

    Args:
        content: Markdown body text. A falsy value is not tokenized at all.

    Returns:
        An ``EntityContent`` holding the unchanged ``content`` together with
        the collected observations and relations, in token order.
    """
    found_observations = []
    found_relations = []

    # Skip tokenizing entirely when there is nothing to parse.
    tokens = md.parse(content) if content else []
    for tok in tokens:
        meta = tok.meta
        if not meta:
            continue
        # A single token may carry both keys, so the two checks are independent.
        if "observation" in meta:
            found_observations.append(Observation.model_validate(meta["observation"]))
        if "relations" in meta:
            validated = [Relation.model_validate(raw) for raw in meta["relations"]]
            found_relations.extend(validated)

    return EntityContent(
        content=content,
        observations=found_observations,
        relations=found_relations,
    )
# def parse_tags(tags: Any) -> list[str]:
# """Parse tags into list of strings."""
# if isinstance(tags, (list, tuple)):
# return [str(t).strip() for t in tags if str(t).strip()]
# return [t.strip() for t in tags.split(",") if t.strip()]
class EntityParser:
"""Parser for markdown files into Entity objects."""
def __init__(self, base_path: Path):
"""Initialize parser with base path for relative permalink generation."""
self.base_path = base_path.resolve()
def parse_date(self, value: Any) -> Optional[datetime]:
"""Parse date strings using dateparser for maximum flexibility.
Supports human friendly formats like:
- 2024-01-15
- Jan 15, 2024
- 2024-01-15 10:00 AM
- yesterday
- 2 days ago
"""
if isinstance(value, datetime):
return value
if isinstance(value, str):
parsed = dateparser.parse(value)
if parsed:
return parsed
return None
async def parse_file(self, path: Path | str) -> EntityMarkdown:
"""Parse markdown file into EntityMarkdown."""
# Check if the path is already absolute
if (
isinstance(path, Path)
and path.is_absolute()
or (isinstance(path, str) and Path(path).is_absolute())
):
absolute_path = Path(path)
else:
absolute_path = self.get_file_path(path)
# Parse frontmatter and content using python-frontmatter
file_content = absolute_path.read_text(encoding="utf-8")
return await self.parse_file_content(absolute_path, file_content)
def get_file_path(self, path):
"""Get absolute path for a file using the base path for the project."""
return self.base_path / path
async def parse_file_content(self, absolute_path, file_content):
# Parse frontmatter with proper error handling for malformed YAML (issue #185)
try:
post = frontmatter.loads(file_content)
except yaml.YAMLError as e:
# Log the YAML parsing error with file context
logger.warning(
f"Failed to parse YAML frontmatter in {absolute_path}: {e}. "
f"Treating file as plain markdown without frontmatter."
)
# Create a post with no frontmatter - treat entire content as markdown
post = frontmatter.Post(file_content, metadata={})
# Extract file stat info
file_stats = absolute_path.stat()
metadata = post.metadata
# Ensure required fields have defaults (issue #184)
metadata["title"] = post.metadata.get("title", absolute_path.stem)
# Handle type - use default if missing OR explicitly set to None/null
entity_type = post.metadata.get("type")
metadata["type"] = entity_type if entity_type is not None else "note"
tags = parse_tags(post.metadata.get("tags", [])) # pyright: ignore
if tags:
metadata["tags"] = tags
# frontmatter - use metadata with defaults applied
entity_frontmatter = EntityFrontmatter(
metadata=metadata,
)
entity_content = parse(post.content)
return EntityMarkdown(
frontmatter=entity_frontmatter,
content=post.content,
observations=entity_content.observations,
relations=entity_content.relations,
created=datetime.fromtimestamp(file_stats.st_ctime).astimezone(),
modified=datetime.fromtimestamp(file_stats.st_mtime).astimezone(),
)