From 6a8835c0032518ae828825e47981b477e3509839 Mon Sep 17 00:00:00 2001 From: Lukas Schaefer Date: Tue, 12 May 2026 10:58:45 +0200 Subject: [PATCH 1/3] feat: support reformat paragraphs task type Signed-off-by: Lukas Schaefer --- lib/reformat_paragraphs.py | 92 ++++++++++++++++++++++++++++++++++++++ lib/task_processors.py | 2 + 2 files changed, 94 insertions(+) create mode 100644 lib/reformat_paragraphs.py diff --git a/lib/reformat_paragraphs.py b/lib/reformat_paragraphs.py new file mode 100644 index 0000000..db54a39 --- /dev/null +++ b/lib/reformat_paragraphs.py @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors +# SPDX-License-Identifier: AGPL-3.0-or-later + +from typing import Any +from langchain.prompts import PromptTemplate +from langchain.schema.prompt_template import BasePromptTemplate +from langchain_core.messages import SystemMessage, HumanMessage +from langchain_core.runnables import Runnable + + +class ReformatParagraphsProcessor: + """ + Segments text by subject changes; model returns anchor phrases only. + """ + system_prompt = ( + "You segment continuous text by subject changes and output anchor phrases only." + ) + + user_prompt: BasePromptTemplate = PromptTemplate( + input_variables=["text"], + template="""You will receive a continuous block of text without line breaks. Your task is to identify points in the text where the subject or topic changes (e.g., a shift to a new person, place, concept, or thematic focus) and insert a line break at that specific transition. + +Do NOT break lines based on sentence length or grammar unless the subject actually changes. + +Once you have identified these segments, do NOT output the full text. Instead, for each new line created by a subject change, output ONLY the first 3-5 words of that line. These serve as anchors for programmatic retrieval. + +Format your output as a plain list of these anchor words, one per line. Do not include numbers, bullet points, or any additional commentary. + +Example input: "The market for electric vehicles is expanding rapidly. In contrast, traditional motorcycle sales are declining globally. Aside from transportation, the price of copper remains volatile." + +Example output: +The market for electric vehicles +In contrast, traditional motorcycle +Aside from transportation, the price + +--- + +Continuous text to segment: + +{text} +""") + + runnable: Runnable + + def __init__(self, runnable: Runnable): + self.runnable = runnable + + @staticmethod + def _parse_anchors_from_model_output(raw: str) -> list[str]: + if raw == "": + return [] + anchors: list[str] = [] + for line in raw.split("\n"): + line = line.strip() + if line == "": + continue + anchors.append(line) + return anchors + + @staticmethod + def _insert_paragraph_breaks_by_anchors(text: str, anchors: list[str]) -> str: + if len(anchors) < 2: + return text + result = text + search_offset = 0 + delta = 0 + for i in range(1, len(anchors)): + anchor = anchors[i] + pos = text.find(anchor, search_offset) + if pos == -1: + continue + insert_at = pos + delta + replace_from = insert_at + while replace_from > 0 and result[replace_from - 1].isspace(): + replace_from -= 1 + result = result[:replace_from] + "\n\n" + result[insert_at:] + delta += 2 - (insert_at - replace_from) + search_offset = pos + len(anchor) + return result + + def __call__(self, inputs: dict[str, Any]) -> dict[str, Any]: + messages = [ + SystemMessage(content=self.system_prompt), + HumanMessage(content=self.user_prompt.format( + text=inputs['input'] + )) + ] + output = self.runnable.invoke(messages) + raw = output.content + anchors = self._parse_anchors_from_model_output(raw) + reformatted = self._insert_paragraph_breaks_by_anchors(inputs["input"], anchors) + return {"output": reformatted} \ No newline at end of file diff --git a/lib/task_processors.py b/lib/task_processors.py index 045ed0b..697df6b 100644 --- a/lib/task_processors.py +++ b/lib/task_processors.py @@ -24,6 +24,7 @@ from chatwithtools import ChatWithToolsProcessor from topics import TopicsProcessor from summarize import SummarizeProcessor +from reformat_paragraphs import ReformatParagraphsProcessor dir_path = os.path.dirname(os.path.realpath(__file__)) models_folder_path = os.path.join(dir_path , "../models/") @@ -141,5 +142,6 @@ def generate_task_processors_for_model(file_name, task_processors): task_processors[model_name + ":core:text2text:proofread"] = lambda: ProofreadProcessor(generate_chat_chain(file_name)) task_processors[model_name + ":core:text2text:changetone"] = lambda: ChangeToneProcessor(generate_chat_chain(file_name)) task_processors[model_name + ":core:text2text:chatwithtools"] = lambda: ChatWithToolsProcessor(generate_chat_chain(file_name)) + task_processors[model_name + ":core:text2text:reformatparagraphs"] = lambda: ReformatParagraphsProcessor(generate_chat_chain(file_name)) # chains[model_name + ":core:contextwrite"] = lambda: ContextWriteChain(llm_chain=llm_chain()) From a02364386a483cfc1d5ffffd20cd7feee624b54c Mon Sep 17 00:00:00 2001 From: Lukas Schaefer Date: Thu, 18 Jun 2026 13:40:55 -0400 Subject: [PATCH 2/3] Finetune prompt and add gemma Signed-off-by: Lukas Schaefer --- default_config/config.json | 9 +++++++++ lib/main.py | 1 + lib/reformat_paragraphs.py | 10 ++++++---- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/default_config/config.json b/default_config/config.json index 9850a28..9febb69 100644 --- a/default_config/config.json +++ b/default_config/config.json @@ -58,6 +58,15 @@ "temperature": 0.15 } }, + "gemma-4-E4B_q4_0-it": { + "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n", + "loader_config": { + "n_ctx": 24000, + "max_tokens": 8192, + "stop": ["<|eot_id|>"], + "temperature": 0.7 + } + }, "Qwen3.5-9B-Q4_K_M": { "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n", "loader_config": { diff --git a/lib/main.py b/lib/main.py index 652b200..6bfdc8f 100644 --- a/lib/main.py +++ b/lib/main.py @@ -64,6 +64,7 @@ def log(nc, level, content): pass models_to_fetch = { + "https://huggingface.co/google/gemma-4-E4B-it-qat-q4_0-gguf/resolve/main/gemma-4-E4B_q4_0-it.gguf": {"save_path": os.path.join(persistent_storage(), "gemma-4-E4B_q4_0-it.gguf")}, "https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/3885219b6810b007914f3a7950a8d1b469d598a5/Qwen3.5-9B-Q4_K_M.gguf": { "save_path": os.path.join(persistent_storage(), "Qwen3.5-9B-Q4_K_M.gguf") }, "https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/4f0c246f125fc7594238ebe7beb1435a8335f519/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf": { "save_path": os.path.join(persistent_storage(), "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf") }, "https://huggingface.co/unsloth/Olmo-3-7B-Instruct-GGUF/resolve/86844f8ff856ba9cc8a22c7b9edd7cf95129a580/Olmo-3-7B-Instruct-Q4_K_M.gguf": { "save_path": os.path.join(persistent_storage(), "Olmo-3-7B-Instruct-Q4_K_M.gguf") }, diff --git a/lib/reformat_paragraphs.py b/lib/reformat_paragraphs.py index db54a39..cf8f7a2 100644 --- a/lib/reformat_paragraphs.py +++ b/lib/reformat_paragraphs.py @@ -7,22 +7,24 @@ from langchain_core.messages import SystemMessage, HumanMessage from langchain_core.runnables import Runnable +from lib.streaming import StreamContext + class ReformatParagraphsProcessor: """ Segments text by subject changes; model returns anchor phrases only. """ system_prompt = ( - "You segment continuous text by subject changes and output anchor phrases only." + "You output anchors from a continuous text based on subject changes." ) user_prompt: BasePromptTemplate = PromptTemplate( input_variables=["text"], - template="""You will receive a continuous block of text without line breaks. Your task is to identify points in the text where the subject or topic changes (e.g., a shift to a new person, place, concept, or thematic focus) and insert a line break at that specific transition. + template="""You will receive a continuous block of text without line breaks. Your task is to identify points in the text where the subject or topic changes (e.g., a shift to a new person, place, concept, or thematic focus). Do NOT break lines based on sentence length or grammar unless the subject actually changes. -Once you have identified these segments, do NOT output the full text. Instead, for each new line created by a subject change, output ONLY the first 3-5 words of that line. These serve as anchors for programmatic retrieval. +Once you have identified these segments, do NOT output the full text. Instead, for each paragraph created, output ONLY the first 3-5 words verbatim of that paragraph. These serve as anchors for programmatic retrieval. Format your output as a plain list of these anchor words, one per line. Do not include numbers, bullet points, or any additional commentary. @@ -78,7 +80,7 @@ def _insert_paragraph_breaks_by_anchors(text: str, anchors: list[str]) -> str: search_offset = pos + len(anchor) return result - def __call__(self, inputs: dict[str, Any]) -> dict[str, Any]: + def __call__(self, inputs: dict[str, Any], context: StreamContext) -> dict[str, Any]: messages = [ SystemMessage(content=self.system_prompt), HumanMessage(content=self.user_prompt.format( From 9e81bc3f5c1cc51d140bb37aa262410f75e67b66 Mon Sep 17 00:00:00 2001 From: Lukas Schaefer Date: Wed, 24 Jun 2026 09:40:03 -0400 Subject: [PATCH 3/3] Fix ci Signed-off-by: Lukas Schaefer --- lib/reformat_paragraphs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/reformat_paragraphs.py b/lib/reformat_paragraphs.py index cf8f7a2..fc99ecc 100644 --- a/lib/reformat_paragraphs.py +++ b/lib/reformat_paragraphs.py @@ -7,7 +7,7 @@ from langchain_core.messages import SystemMessage, HumanMessage from langchain_core.runnables import Runnable -from lib.streaming import StreamContext +from streaming import StreamContext class ReformatParagraphsProcessor: