@@ -31,7 +31,14 @@ def __init__(self, dim=256):
def tokenize(self, text: str) -> list:
    """Split *text* into lowercase content tokens.

    The text is lowercased, every character that is not an ASCII letter,
    a digit, an accented Latin-1 letter or whitespace is replaced by a
    space, and the result is split on whitespace. Tokens of one or two
    characters and Portuguese stopwords are discarded.

    Args:
        text: Raw input text.

    Returns:
        The surviving tokens, in their original order.
    """
    # The accented range is spelled as ß-ö plus ø-ÿ so that it covers every
    # Latin-1 lowercase letter (including "à" and "ü") while leaving out the
    # division sign U+00F7, which sits between "ö" and "ø". Uppercase ranges
    # are unnecessary because the text has already been lowercased.
    cleaned = re.sub(r'[^a-z0-9ß-öø-ÿ\s]', ' ', text.lower())
    # Entries of one or two letters are already removed by the length filter
    # below; they are kept here so the list stays a complete reference.
    stopwords = {
        "o", "a", "os", "as", "e", "de", "do", "da", "em", "um", "uma", "para",
        "com", "na", "no", "que", "se", "por", "dos", "das", "nos", "nas", "ao",
        "aos", "como", "mas", "ou", "mais", "foi", "sua", "seu", "suas", "seus",
        "esta", "este", "estas", "estes", "isso", "isto", "aquilo", "uns", "umas",
        "pelo", "pela", "qual", "quais", "quem", "quando", "onde", "porque",
        "sobre", "entre", "também", "são", "ser", "tem", "ter", "está", "estão",
    }
    return [
        token
        for token in cleaned.split()
        if len(token) > 2 and token not in stopwords
    ]
3643
3744 def fit (self , documents : list ):
@@ -64,7 +71,9 @@ def embed(self, text: str) -> list:
6471 embedding = [0.0 ] * self .dim
6572 for i , word in enumerate (self .vocabulary ):
6673 if word in tf :
67- embedding [i ] = tf [word ] * self .idf .get (word , 1.0 )
74+ # Sublinear TF (1 + log(tf)) para evitar dominância de palavras repetidas
75+ sublinear_tf = 1 + math .log (tf [word ])
76+ embedding [i ] = sublinear_tf * self .idf .get (word , 1.0 )
6877 norm = math .sqrt (sum (v ** 2 for v in embedding ))
6978 if norm > 0 :
7079 embedding = [v / norm for v in embedding ]
@@ -82,13 +91,43 @@ def cosine_similarity(self, a: list, b: list) -> float:
8291
8392# ── Indexação de Documentos Reais (Não Simulado) ──
def chunk_text(text: str, max_words=150) -> list:
    """Split *text* into chunks of at most *max_words* words.

    Paragraphs (runs of text separated by a blank line) are the primary
    unit: consecutive paragraphs are packed into the same chunk while they
    fit, so related sentences stay together. A paragraph that is longer
    than *max_words* on its own is cut into consecutive slices of
    *max_words* words, each emitted as its own chunk.

    Args:
        text: The document text.
        max_words: Upper bound on the number of words per chunk; must be
            at least 1.

    Returns:
        The chunks in document order, each with its words joined by single
        spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If *max_words* is smaller than 1.
    """
    # A negative step would make range() below empty and silently drop the
    # whole document; zero would fail with an unrelated range() error.
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    chunks = []
    pending = []  # words accumulated for the chunk currently being built

    for paragraph in re.split(r'\n\s*\n', text):
        words = paragraph.split()
        if not words:
            continue

        # Close the current chunk when this paragraph would overflow it.
        if pending and len(pending) + len(words) > max_words:
            chunks.append(" ".join(pending))
            pending = []

        if len(words) > max_words:
            # Oversized paragraph: forced split into fixed-size slices.
            chunks.extend(
                " ".join(words[start:start + max_words])
                for start in range(0, len(words), max_words)
            )
        else:
            pending.extend(words)

    if pending:
        chunks.append(" ".join(pending))
    return chunks
87118
def extract_entities(text: str) -> list:
    """Extract candidate named entities from *text* using capitalization.

    Three kinds of candidates are collected:

    1. Compound names: two or more consecutive capitalized words
       (e.g. "OpenCode Ecosystem").
    2. Acronyms: two or more ASCII capitals, optionally hyphenated
       (e.g. "RAG", "TF-IDF").
    3. Single capitalized words of at least four letters, including
       CamelCase names (e.g. "OpenCode").

    Common Portuguese function words that are capitalized only because
    they start a sentence are dropped.

    Args:
        text: The text to scan.

    Returns:
        The distinct entities, lowercased and sorted.
    """
    # Explicit Latin-1 case classes. A single "À-Ÿ" range would also contain
    # every lowercase accented letter, making ordinary words such as "água"
    # look capitalized; U+00D7 and U+00F7 are math signs and are excluded.
    upper = "A-ZÀ-ÖØ-ÞŸ"
    lower = "a-zß-öø-ÿ"
    # Capitalized word: uppercase initial, lowercase second letter, then any
    # mix of cases so CamelCase names match. The lowercase second letter
    # keeps all-caps acronyms out of this pattern.
    head = f"[{upper}][{lower}]"
    tail = f"[{upper}{lower}]"

    compound = re.findall(rf"\b{head}{tail}*(?:\s+{head}{tail}*)+\b", text)
    acronyms = re.findall(r'\b[A-Z]{2,}(?:-[A-Z]+)*\b', text)
    simple = re.findall(rf"\b{head}{tail}{{2,}}\b", text)

    # Capitalized stopwords that typically appear at the start of a sentence.
    false_positives = {
        "Para", "Como", "Este", "Esta", "Quando", "Onde",
        "Mas", "Por", "Que", "Qual", "Sobre", "Além",
    }
    candidates = set(compound + acronyms + simple)
    # Sorted so the result is deterministic across runs.
    return sorted({c.lower() for c in candidates if c not in false_positives})
92131
93132def index_real_documents ():
94133 db = vector_db ()
0 commit comments