basic-memory/src/basic_memory/markdown/markdown_processor.py at 23f7f067a46c93615b3b3a67c90afc6befe06f89 · basicmachines-co/basic-memory · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from pathlib import Path
from typing import Optional
from collections import OrderedDict

import frontmatter
from frontmatter import Post
from loguru import logger

from basic_memory import file_utils
from basic_memory.markdown.entity_parser import EntityParser
from basic_memory.markdown.schemas import EntityMarkdown, Observation, Relation


class DirtyFileError(Exception):
    """Raised when attempting to write to a file that has been modified."""

    pass


class MarkdownProcessor:
    """Process markdown files while preserving content and structure.

    used only for import

    This class handles the file I/O aspects of our markdown processing. It:
    1. Uses EntityParser for reading/parsing files into our schema
    2. Handles writing files with proper frontmatter
    3. Formats structured sections (observations/relations) consistently
    4. Preserves user content exactly as written
    5. Performs atomic writes using temp files

    It does NOT:
    1. Modify the schema directly (that's done by services)
    2. Handle in-place updates (everything is read->modify->write)
    3. Track schema changes (that's done by the database)
    """

    def __init__(self, entity_parser: EntityParser):
        """Initialize processor with base path and parser."""
        self.entity_parser = entity_parser

    async def read_file(self, path: Path) -> EntityMarkdown:
        """Read and parse file into EntityMarkdown schema.

        This is step 1 of our read->modify->write pattern.
        We use EntityParser to handle all the markdown parsing.
        """
        return await self.entity_parser.parse_file(path)

    async def write_file(
        self,
        path: Path,
        markdown: EntityMarkdown,
        expected_checksum: Optional[str] = None,
    ) -> str:
        """Write EntityMarkdown schema back to file.

        This is step 3 of our read->modify->write pattern.
        The entire file is rewritten atomically on each update.

        File Structure:
        ---
        frontmatter fields
        ---
        user content area (preserved exactly)

        ## Observations (if any)
        formatted observations

        ## Relations (if any)
        formatted relations

        Args:
            path: Where to write the file
            markdown: Complete schema to write
            expected_checksum: If provided, verify file hasn't changed

        Returns:
            Checksum of written file

        Raises:
            DirtyFileError: If file has been modified (when expected_checksum provided)
        """
        # Dirty check if needed
        if expected_checksum is not None:
            current_content = path.read_text(encoding="utf-8")
            current_checksum = await file_utils.compute_checksum(current_content)
            if current_checksum != expected_checksum:
                raise DirtyFileError(f"File {path} has been modified")

        # Convert frontmatter to dict
        frontmatter_dict = OrderedDict()
        frontmatter_dict["title"] = markdown.frontmatter.title
        frontmatter_dict["type"] = markdown.frontmatter.type
        frontmatter_dict["permalink"] = markdown.frontmatter.permalink

        metadata = markdown.frontmatter.metadata or {}
        for k, v in metadata.items():
            frontmatter_dict[k] = v

        # Start with user content (or minimal title for new files)
        content = markdown.content or f"# {markdown.frontmatter.title}\n"

        # Add structured sections with proper spacing
        content = content.rstrip()  # Remove trailing whitespace

        # add a blank line if we have semantic content
        if markdown.observations or markdown.relations:
            content += "\n"

        if markdown.observations:
            content += self.format_observations(markdown.observations)
        if markdown.relations:
            content += self.format_relations(markdown.relations)

        # Create Post object for frontmatter
        post = Post(content, **frontmatter_dict)
        final_content = frontmatter.dumps(post, sort_keys=False)

        logger.debug(f"writing file {path} with content:\n{final_content}")

        # Write atomically and return checksum of updated file
        path.parent.mkdir(parents=True, exist_ok=True)
        await file_utils.write_file_atomic(path, final_content)
        return await file_utils.compute_checksum(final_content)

    def format_observations(self, observations: list[Observation]) -> str:
        """Format observations section in standard way.

        Format: - [category] content #tag1 #tag2 (context)
        """
        lines = [f"{obs}" for obs in observations]
        return "\n".join(lines) + "\n"

    def format_relations(self, relations: list[Relation]) -> str:
        """Format relations section in standard way.

        Format: - relation_type [[target]] (context)
        """
        lines = [f"{rel}" for rel in relations]
        return "\n".join(lines) + "\n"