Skip to content

Commit 563e771

Browse files
author
marce
committed
refactor(rag): aprimoramento do chunking semantico, sublinear tf e extracao de entidades NER
1 parent 2c4c7a3 commit 563e771

1 file changed

Lines changed: 46 additions & 7 deletions

File tree

skills/maswos-v5-nexus/servers/rag_server.py

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,14 @@ def __init__(self, dim=256):
3131
def tokenize(self, text: str) -> list:
    """Split *text* into lowercase content tokens.

    The text is lowercased, every character that is not an ASCII letter,
    an accented Latin-1 letter, a digit or whitespace is replaced by a
    space, and the result is split on whitespace. Tokens with fewer than
    three characters and Portuguese stopwords are discarded.

    Args:
        text: Raw text to tokenize.

    Returns:
        The surviving tokens, in their original order.
    """
    # The accented ranges (à-ö, ø-ÿ) cover every lowercase Latin-1 letter, so
    # "à", "û" and "ü" stay inside their words, while the math signs "÷" and
    # "×" are treated as punctuation. Uppercase ranges are unnecessary because
    # the text is lowercased before the substitution.
    text = re.sub(r'[^a-zà-öø-ÿ0-9\s]', ' ', text.lower())
    tokens = text.split()
    stopwords = {
        "o", "a", "os", "as", "e", "de", "do", "da", "em", "um", "uma", "para",
        "com", "na", "no", "que", "se", "por", "dos", "das", "nos", "nas", "ao",
        "aos", "como", "mas", "ou", "mais", "foi", "sua", "seu", "suas", "seus",
        "esta", "este", "estas", "estes", "isso", "isto", "aquilo", "uns", "umas",
        "pelo", "pela", "qual", "quais", "quem", "quando", "onde", "porque",
        "sobre", "entre", "também", "são", "ser", "tem", "ter", "está", "estão"
    }
    return [t for t in tokens if len(t) > 2 and t not in stopwords]
3643

3744
def fit(self, documents: list):
@@ -64,7 +71,9 @@ def embed(self, text: str) -> list:
6471
embedding = [0.0] * self.dim
6572
for i, word in enumerate(self.vocabulary):
6673
if word in tf:
67-
embedding[i] = tf[word] * self.idf.get(word, 1.0)
74+
# Sublinear TF (1 + log(tf)) para evitar dominância de palavras repetidas
75+
sublinear_tf = 1 + math.log(tf[word])
76+
embedding[i] = sublinear_tf * self.idf.get(word, 1.0)
6877
norm = math.sqrt(sum(v**2 for v in embedding))
6978
if norm > 0:
7079
embedding = [v / norm for v in embedding]
@@ -82,13 +91,43 @@ def cosine_similarity(self, a: list, b: list) -> float:
8291

8392
# ── Indexação de Documentos Reais (Não Simulado) ──
8493
def chunk_text(text: str, max_words=150) -> list:
    """Split *text* into chunks of at most *max_words* words.

    Paragraphs (separated by one or more blank lines) are packed greedily
    into a chunk until adding the next paragraph would exceed *max_words*.
    A paragraph that is longer than *max_words* on its own is cut into
    consecutive slices of *max_words* words, each emitted as its own chunk.
    Words inside a chunk are joined with single spaces.

    Args:
        text: The document text to split.
        max_words: Maximum number of words per chunk; must be at least 1.

    Returns:
        The chunks in document order; an empty list for blank input.

    Raises:
        ValueError: If *max_words* is smaller than 1.
    """
    # A non-positive limit would make the forced-split range below empty
    # (negative step) or invalid (zero step), silently dropping text or
    # failing with an unrelated message, so reject it up front.
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    chunks = []
    current_chunk = []

    # Primary split on blank lines keeps each paragraph's words together.
    for paragraph in re.split(r'\n\s*\n', text):
        words = paragraph.split()
        if not words:
            continue

        # Flush the pending chunk when this paragraph would overflow it.
        if current_chunk and len(current_chunk) + len(words) > max_words:
            chunks.append(" ".join(current_chunk))
            current_chunk = []

        if len(words) > max_words:
            # Oversized paragraph: forced split. The pending chunk is always
            # empty here because the overflow check above has flushed it.
            chunks.extend(
                " ".join(words[i:i + max_words])
                for i in range(0, len(words), max_words)
            )
        else:
            current_chunk.extend(words)

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
87118

88119
def extract_entities(text: str) -> list:
    """Extract candidate named entities from *text* using capitalization rules.

    Three patterns are combined:

    1. Compound names: two or more consecutive capitalized words
       (e.g. "São Paulo").
    2. Acronyms: two or more ASCII capitals, optionally hyphen-joined
       (e.g. "RAG", "TF-IDF").
    3. Single capitalized words with at least four letters (e.g. "Brasil").

    A capitalized word is one uppercase letter followed only by lowercase
    letters, so tokens with an internal capital are not matched by rules 1
    and 3. Common sentence-initial Portuguese words ("Para", "Quando", ...)
    are dropped when they appear as standalone matches.

    Args:
        text: The text to scan.

    Returns:
        The unique entities, lowercased and sorted alphabetically.
    """
    # Explicit Latin-1 ranges: À-Ö / Ø-Þ (plus Ÿ) are the uppercase accented
    # letters and à-ö / ø-ÿ the lowercase ones. A single span from À to Ÿ
    # would also contain the lowercase letters, making ordinary words such as
    # "água" look capitalized.
    compound = re.findall(
        r'\b(?:[A-ZÀ-ÖØ-ÞŸ][a-zà-öø-ÿ]+(?:\s+[A-ZÀ-ÖØ-ÞŸ][a-zà-öø-ÿ]+)+)\b',
        text,
    )
    acronyms = re.findall(r'\b[A-Z]{2,}(?:-[A-Z]+)*\b', text)
    simple = re.findall(r'\b[A-ZÀ-ÖØ-ÞŸ][a-zà-öø-ÿ]{3,}\b', text)

    # Collapse runs of whitespace (including line breaks) inside compound
    # names so the same entity always has a single spelling.
    entities = {" ".join(name.split()) for name in compound}
    entities.update(acronyms)
    entities.update(simple)

    # Capitalized stopwords that typically open a sentence.
    false_positives = {"Para", "Como", "Este", "Esta", "Quando", "Onde", "Mas", "Por", "Que", "Qual", "Sobre", "Além"}

    # Lowercasing can merge distinct matches, hence the set; sorting makes
    # the returned order deterministic.
    return sorted({e.lower() for e in entities if e not in false_positives})
92131

93132
def index_real_documents():
94133
db = vector_db()

0 commit comments

Comments
 (0)