Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions llm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@


class Fragment(str):
"""A string subclass that carries an optional source attribution.

Used to track where prompt/system content originated so that provenance
can be stored alongside the text in the database.
"""

def __new__(cls, content, *args, **kwargs):
# For immutable classes like str, __new__ creates the string object
return super().__new__(cls, content)
Expand All @@ -35,6 +41,10 @@ def id(self):


def mimetype_from_string(content) -> Optional[str]:
"""Detect the MIME type of binary content using magic bytes.

Returns None if the type cannot be determined.
"""
try:
type_ = puremagic.from_string(content, mime=True)
return MIME_TYPE_FIXES.get(type_, type_)
Expand All @@ -43,6 +53,10 @@ def mimetype_from_string(content) -> Optional[str]:


def mimetype_from_path(path) -> Optional[str]:
"""Detect the MIME type of a file using magic bytes.

Returns None if the type cannot be determined.
"""
try:
type_ = puremagic.from_file(path, mime=True)
return MIME_TYPE_FIXES.get(type_, type_)
Expand All @@ -53,6 +67,11 @@ def mimetype_from_path(path) -> Optional[str]:
def dicts_to_table_string(
headings: List[str], dicts: List[Dict[str, str]]
) -> List[str]:
"""Format a list of dicts as a plain-text table, returning one string per row.

Columns are separated by four spaces and each value is left-justified to the
width of the widest entry in that column (including the heading).
"""
max_lengths = [len(h) for h in headings]

# Compute maximum length for each column
Expand Down Expand Up @@ -218,6 +237,11 @@ def extract_fenced_code_block(text: str, last: bool = False) -> Optional[str]:


def make_schema_id(schema: dict) -> Tuple[str, str]:
"""Return a stable (id, compact-JSON) pair for the given JSON schema dict.

The id is a 32-character hex BLAKE2b digest of the compact JSON
representation, suitable for use as a database key.
"""
schema_json = json.dumps(schema, separators=(",", ":"))
schema_id = hashlib.blake2b(schema_json.encode(), digest_size=16).hexdigest()
return schema_id, schema_json
Expand Down Expand Up @@ -272,6 +296,15 @@ def output_rows_as_json(rows, nl=False, compact=False, json_cols=()):


def resolve_schema_input(db, schema_input, load_template):
"""Resolve a schema input string to a JSON schema dict.

Accepts the following formats (tried in order):
- ``t:name`` — look up a stored template and return its schema
- Inline JSON starting with ``{``
- Schema DSL (comma- or space-separated field specs)
- A filesystem path to a JSON file
- A schema ID stored in the database
"""
# schema_input might be JSON or a filepath or an ID or t:name
if not schema_input:
return
Expand Down Expand Up @@ -476,6 +509,12 @@ def truncate_string(


def ensure_fragment(db, content):
"""Insert a fragment into the database if it does not already exist.

Returns the integer primary-key id of the (possibly pre-existing) row.
``content`` may be a plain str or a :class:`Fragment` instance; in the
latter case its ``source`` attribute is persisted alongside the text.
"""
sql = """
insert into fragments (hash, content, datetime_utc, source)
values (:hash, :content, datetime('now'), :source)
Expand All @@ -493,6 +532,12 @@ def ensure_fragment(db, content):


def ensure_tool(db, tool):
"""Insert a tool into the database if it does not already exist.

Returns the integer primary-key id of the (possibly pre-existing) row.
Deduplication is based on a hash of the tool's name, description, and
input schema.
"""
sql = """
insert into tools (hash, name, description, input_schema, plugin)
values (:hash, :name, :description, :input_schema, :plugin)
Expand Down