From 6a8835c0032518ae828825e47981b477e3509839 Mon Sep 17 00:00:00 2001
From: Lukas Schaefer <lukas@lschaefer.xyz>
Date: Tue, 12 May 2026 10:58:45 +0200
Subject: [PATCH 1/3] feat: support reformat paragraphs task type

Signed-off-by: Lukas Schaefer <lukas@lschaefer.xyz>
---
 lib/reformat_paragraphs.py | 92 ++++++++++++++++++++++++++++++++++++++
 lib/task_processors.py     |  2 +
 2 files changed, 94 insertions(+)
 create mode 100644 lib/reformat_paragraphs.py

diff --git a/lib/reformat_paragraphs.py b/lib/reformat_paragraphs.py
new file mode 100644
index 0000000..db54a39
--- /dev/null
+++ b/lib/reformat_paragraphs.py
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+from typing import Any
+from langchain.prompts import PromptTemplate
+from langchain.schema.prompt_template import BasePromptTemplate
+from langchain_core.messages import SystemMessage, HumanMessage
+from langchain_core.runnables import Runnable
+
+
+class ReformatParagraphsProcessor:
+    """
+    Segments text by subject changes; model returns anchor phrases only.
+    """
+    system_prompt = (
+        "You segment continuous text by subject changes and output anchor phrases only."
+    )
+
+    user_prompt: BasePromptTemplate = PromptTemplate(
+        input_variables=["text"],
+        template="""You will receive a continuous block of text without line breaks. Your task is to identify points in the text where the subject or topic changes (e.g., a shift to a new person, place, concept, or thematic focus) and insert a line break at that specific transition.
+
+Do NOT break lines based on sentence length or grammar unless the subject actually changes.
+
+Once you have identified these segments, do NOT output the full text. Instead, for each new line created by a subject change, output ONLY the first 3-5 words of that line. These serve as anchors for programmatic retrieval.
+
+Format your output as a plain list of these anchor words, one per line. Do not include numbers, bullet points, or any additional commentary.
+
+Example input: "The market for electric vehicles is expanding rapidly. In contrast, traditional motorcycle sales are declining globally. Aside from transportation, the price of copper remains volatile."
+
+Example output:
+The market for electric vehicles
+In contrast, traditional motorcycle
+Aside from transportation, the price
+
+---
+
+Continuous text to segment:
+
+{text}
+""")
+
+    runnable: Runnable
+
+    def __init__(self, runnable: Runnable):
+        self.runnable = runnable
+
+    @staticmethod
+    def _parse_anchors_from_model_output(raw: str) -> list[str]:
+        """
+        Turn the raw model reply into a list of anchor phrases.
+
+        Every line is trimmed of surrounding whitespace and blank lines are
+        dropped, so an empty or whitespace-only reply yields an empty list.
+        """
+        # One anchor per line; split on "\n" only, strip() clears any "\r".
+        trimmed = (candidate.strip() for candidate in raw.split("\n"))
+        return [phrase for phrase in trimmed if phrase != ""]
+
+    @staticmethod
+    def _insert_paragraph_breaks_by_anchors(text: str, anchors: list[str]) -> str:
+        """
+        Put a blank line before each anchor after the first, scanning in order.
+        Anchors not found are skipped; whitespace before a match is replaced by
+        the break. A match at the very start of the text gets no break.
+        """
+        result = text
+        search_offset = 0
+        delta = 0  # result index minus text index for everything not yet scanned
+        for anchor in anchors[1:]:
+            pos = text.find(anchor, search_offset)
+            if pos == -1:
+                continue
+            head = result[:pos + delta].rstrip()
+            if head:  # an empty head would make the output start with blank lines
+                result = head + "\n\n" + result[pos + delta:]
+                delta = len(head) + 2 - pos
+            search_offset = pos + len(anchor)
+        return result
+
+    def __call__(self, inputs: dict[str, Any]) -> dict[str, Any]:
+        messages = [
+            SystemMessage(content=self.system_prompt),
+            HumanMessage(content=self.user_prompt.format(
+                text=inputs["input"],
+            )),
+        ]
+        # The model is asked for anchor phrases only; the paragraph breaks are
+        # spliced into the caller's original text here, not by the model.
+        reply = self.runnable.invoke(messages)
+        anchors = self._parse_anchors_from_model_output(reply.content)
+        return {"output": self._insert_paragraph_breaks_by_anchors(inputs["input"], anchors)}
\ No newline at end of file
diff --git a/lib/task_processors.py b/lib/task_processors.py
index 045ed0b..697df6b 100644
--- a/lib/task_processors.py
+++ b/lib/task_processors.py
@@ -24,6 +24,7 @@
 from chatwithtools import ChatWithToolsProcessor
 from topics import TopicsProcessor
 from summarize import SummarizeProcessor
+from reformat_paragraphs import ReformatParagraphsProcessor
 
 dir_path = os.path.dirname(os.path.realpath(__file__))
 models_folder_path = os.path.join(dir_path , "../models/")
@@ -141,5 +142,6 @@ def generate_task_processors_for_model(file_name, task_processors):
     task_processors[model_name + ":core:text2text:proofread"] = lambda: ProofreadProcessor(generate_chat_chain(file_name))
     task_processors[model_name + ":core:text2text:changetone"] = lambda: ChangeToneProcessor(generate_chat_chain(file_name))
     task_processors[model_name + ":core:text2text:chatwithtools"] = lambda: ChatWithToolsProcessor(generate_chat_chain(file_name))
+    task_processors[model_name + ":core:text2text:reformatparagraphs"] = lambda: ReformatParagraphsProcessor(generate_chat_chain(file_name))
     
     # chains[model_name + ":core:contextwrite"] = lambda: ContextWriteChain(llm_chain=llm_chain())

From a02364386a483cfc1d5ffffd20cd7feee624b54c Mon Sep 17 00:00:00 2001
From: Lukas Schaefer <lukas@lschaefer.xyz>
Date: Thu, 18 Jun 2026 13:40:55 -0400
Subject: [PATCH 2/3] feat: fine-tune reformat prompt, add Gemma model, accept StreamContext

Signed-off-by: Lukas Schaefer <lukas@lschaefer.xyz>
---
 default_config/config.json |  9 +++++++++
 lib/main.py                |  1 +
 lib/reformat_paragraphs.py | 10 ++++++----
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/default_config/config.json b/default_config/config.json
index 9850a28..9febb69 100644
--- a/default_config/config.json
+++ b/default_config/config.json
@@ -58,6 +58,15 @@
             "temperature": 0.15
         }
     },
+    "gemma-4-E4B_q4_0-it": {
+        "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n",
+        "loader_config": {
+            "n_ctx": 24000,
+            "max_tokens": 8192,
+            "stop": ["<|eot_id|>"],
+            "temperature": 0.7
+        }
+    },
     "Qwen3.5-9B-Q4_K_M": {
         "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n",
         "loader_config": {
diff --git a/lib/main.py b/lib/main.py
index 652b200..6bfdc8f 100644
--- a/lib/main.py
+++ b/lib/main.py
@@ -64,6 +64,7 @@ def log(nc, level, content):
         pass
 
 models_to_fetch = {
+    "https://huggingface.co/google/gemma-4-E4B-it-qat-q4_0-gguf/resolve/main/gemma-4-E4B_q4_0-it.gguf": { "save_path": os.path.join(persistent_storage(), "gemma-4-E4B_q4_0-it.gguf") },
     "https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/3885219b6810b007914f3a7950a8d1b469d598a5/Qwen3.5-9B-Q4_K_M.gguf": { "save_path": os.path.join(persistent_storage(), "Qwen3.5-9B-Q4_K_M.gguf") },
     "https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/4f0c246f125fc7594238ebe7beb1435a8335f519/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf": { "save_path": os.path.join(persistent_storage(), "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf") },
     "https://huggingface.co/unsloth/Olmo-3-7B-Instruct-GGUF/resolve/86844f8ff856ba9cc8a22c7b9edd7cf95129a580/Olmo-3-7B-Instruct-Q4_K_M.gguf": { "save_path": os.path.join(persistent_storage(), "Olmo-3-7B-Instruct-Q4_K_M.gguf") },
diff --git a/lib/reformat_paragraphs.py b/lib/reformat_paragraphs.py
index db54a39..cf8f7a2 100644
--- a/lib/reformat_paragraphs.py
+++ b/lib/reformat_paragraphs.py
@@ -7,22 +7,24 @@
 from langchain_core.messages import SystemMessage, HumanMessage
 from langchain_core.runnables import Runnable
 
+from lib.streaming import StreamContext
+
 
 class ReformatParagraphsProcessor:
     """
     Segments text by subject changes; model returns anchor phrases only.
     """
     system_prompt = (
-        "You segment continuous text by subject changes and output anchor phrases only."
+        "You output anchors from a continuous text based on subject changes."
     )
 
     user_prompt: BasePromptTemplate = PromptTemplate(
         input_variables=["text"],
-        template="""You will receive a continuous block of text without line breaks. Your task is to identify points in the text where the subject or topic changes (e.g., a shift to a new person, place, concept, or thematic focus) and insert a line break at that specific transition.
+        template="""You will receive a continuous block of text without line breaks. Your task is to identify points in the text where the subject or topic changes (e.g., a shift to a new person, place, concept, or thematic focus).
 
 Do NOT break lines based on sentence length or grammar unless the subject actually changes.
 
-Once you have identified these segments, do NOT output the full text. Instead, for each new line created by a subject change, output ONLY the first 3-5 words of that line. These serve as anchors for programmatic retrieval.
+Once you have identified these segments, do NOT output the full text. Instead, for each paragraph created, output ONLY the first 3-5 words verbatim of that paragraph. These serve as anchors for programmatic retrieval.
 
 Format your output as a plain list of these anchor words, one per line. Do not include numbers, bullet points, or any additional commentary.
 
@@ -78,7 +80,7 @@ def _insert_paragraph_breaks_by_anchors(text: str, anchors: list[str]) -> str:
             search_offset = pos + len(anchor)
         return result
 
-    def __call__(self, inputs: dict[str, Any]) -> dict[str, Any]:
+    def __call__(self, inputs: dict[str, Any], context: StreamContext) -> dict[str, Any]:
         messages = [
             SystemMessage(content=self.system_prompt),
             HumanMessage(content=self.user_prompt.format(

From 9e81bc3f5c1cc51d140bb37aa262410f75e67b66 Mon Sep 17 00:00:00 2001
From: Lukas Schaefer <lukas@lschaefer.xyz>
Date: Wed, 24 Jun 2026 09:40:03 -0400
Subject: [PATCH 3/3] fix(ci): import StreamContext from top-level streaming module

Signed-off-by: Lukas Schaefer <lukas@lschaefer.xyz>
---
 lib/reformat_paragraphs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/reformat_paragraphs.py b/lib/reformat_paragraphs.py
index cf8f7a2..fc99ecc 100644
--- a/lib/reformat_paragraphs.py
+++ b/lib/reformat_paragraphs.py
@@ -7,7 +7,7 @@
 from langchain_core.messages import SystemMessage, HumanMessage
 from langchain_core.runnables import Runnable
 
-from lib.streaming import StreamContext
+from streaming import StreamContext
 
 
 class ReformatParagraphsProcessor: