Skip to content

Commit 91ee1ed

Browse files
committed
fix knowledge unit extractor
1 parent b858603 commit 91ee1ed

13 files changed

Lines changed: 119 additions & 66 deletions

File tree

KAG_VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.8.0.20250619
1+
0.8.0.20250624.2053

kag/builder/component/extractor/knowledge_unit_extractor.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def named_entity_recognition(self, passage: str):
183183
ner_parse_rst = self._named_entity_recognition_process(passage, ner_result)
184184
if not ner_parse_rst:
185185
raise
186-
return
186+
return ner_parse_rst
187187

188188
@retry(
189189
stop=stop_after_attempt(3),
@@ -241,7 +241,7 @@ async def aknowledge_unit_extra(self, passage: str, entities: List[Dict]):
241241
Returns:
242242
Standardized entity information.
243243
"""
244-
return await self.llm.ainvoke(
244+
return self.llm.invoke(
245245
{"input": passage, "named_entities": entities},
246246
self.kn_prompt,
247247
with_except=False,
@@ -583,15 +583,12 @@ def triple_to_knowledge_unit(triple):
583583
knowledge_unit_nodes.append(
584584
{"name": knowledge_id, "category": "KnowledgeUnit"}
585585
)
586+
core_entities = {}
587+
for item in knowledge_value.get("core_entities", "").split(","):
588+
if not item.strip():
589+
continue
590+
core_entities[item.strip()] = "Others"
586591

587-
if knowledge_value["knowledgetype"] == "triple":
588-
core_entities = {
589-
item.strip(): "Others"
590-
for item in knowledge_value.get("core_entities", "").split(",")
591-
if len(item.strip()) > 1
592-
}
593-
else:
594-
core_entities = knowledge_value.get("core_entities", {})
595592
for core_entity, ent_type in core_entities.items():
596593
if core_entity == "":
597594
continue
@@ -634,8 +631,8 @@ def _invoke(self, input: Input, **kwargs) -> List[Output]:
634631
{k: v for k, v in ent.items() if k in ["name", "category"]}
635632
for ent in entities
636633
]
637-
knowledge_unit_entities = self.aknowledge_unit_extra(passage, filtered_entities)
638-
triples = (self.triples_extraction(passage, filtered_entities),)
634+
knowledge_unit_entities = self.knowledge_unit_extra(passage, filtered_entities)
635+
triples = self.triples_extraction(passage, filtered_entities)
639636

640637
knowledge_unit_nodes = self.assemble_knowledge_unit(
641638
sub_graph, entities, knowledge_unit_entities, triples

kag/builder/prompt/default/knowledge_unit_ner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ def __init__(self, language: str = "", **kwargs):
224224
host_addr=KAG_PROJECT_CONF.host_addr, project_id=KAG_PROJECT_CONF.project_id
225225
).extract_types(KAG_PROJECT_CONF.language)
226226
self.template = Template(self.template).safe_substitute(
227-
schema=json.dumps(self.schema)
227+
schema=json.dumps(self.schema, ensure_ascii=False)
228228
)
229229

230230
@property

kag/common/conf.py

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -237,25 +237,7 @@ def set_task_config(task_with_kb_id, config: KAGConfigMgr):
237237
:param config: Configuration object to store
238238
"""
239239
with KAG_QA_TASK_CONFIG_LOCK:
240-
KAG_QA_TASK_CONFIG[task_with_kb_id] = config
241-
242-
@staticmethod
243-
def set_kb_config(task_id, kb_project_id, kb_config):
244-
"""Setting up kb specific configuration (separate namespace)"""
245-
config_key = f"{task_id}_{kb_project_id}"
246-
with KAG_QA_TASK_CONFIG_LOCK:
247-
KAG_QA_TASK_CONFIG[config_key] = kb_config
248-
249-
@staticmethod
250-
def cleanup_task_config(task_with_kb_id):
251-
"""
252-
Remove the configuration for a specific task.
253-
254-
:param task_with_kb_id: Task ID whose configuration needs to be cleaned up
255-
"""
256-
with KAG_QA_TASK_CONFIG_LOCK:
257-
if task_with_kb_id in KAG_QA_TASK_CONFIG:
258-
del KAG_QA_TASK_CONFIG[task_with_kb_id]
240+
KAG_QA_TASK_CONFIG.put(task_with_kb_id, config)
259241

260242

261243
def init_env(config_file: str = None):
@@ -264,16 +246,15 @@ def init_env(config_file: str = None):
264246
prod = False
265247
if project_id is not None and host_addr is not None and not validate_config_file(config_file):
266248
prod = True
267-
os.environ[KAGConstants.ENV_KAG_PROJECT_ID] = str(KAG_PROJECT_CONF.project_id)
268-
os.environ[KAGConstants.ENV_KAG_PROJECT_HOST_ADDR] = str(KAG_PROJECT_CONF.host_addr)
269249
global KAG_CONFIG
270250
KAG_CONFIG.initialize(prod, config_file)
271-
272251
if prod:
273252
msg = "Done init config from server"
274253
else:
275254
msg = "Done init config from local file"
276255
logger.debug(msg)
256+
os.environ[KAGConstants.ENV_KAG_PROJECT_ID] = str(KAG_PROJECT_CONF.project_id)
257+
os.environ[KAGConstants.ENV_KAG_PROJECT_HOST_ADDR] = str(KAG_PROJECT_CONF.host_addr)
277258
if len(KAG_CONFIG.all_config) > 0:
278259
dump_flag = os.getenv(KAGConstants.ENV_KAG_DEBUG_DUMP_CONFIG)
279260
pprint.pprint(KAG_CONFIG.all_config, indent=2)

kag/common/tools/algorithm_tool/graph_retriever/kg_fr_with_knowledge_unit_retriever.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def __init__(
101101

102102
self.search_api = search_api or SearchApiABC.from_config(
103103
{
104-
"type": "openspg_graph_api",
104+
"type": "openspg_search_api",
105105
"project_id": self.kag_project_config.project_id,
106106
"host_addr": self.kag_project_config.host_addr,
107107
}

kag/common/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,12 @@ def extract_box_answer(text):
483483
else:
484484
return extracted_answers[0]
485485

486+
def remove_boxed(text):
487+
# Match \boxed{content} and capture the content part
488+
pattern = r'\\boxed\{([^}]*)\}'
489+
# Use a regex substitution to keep only the content inside the braces
490+
result = re.sub(pattern, r'\1', text)
491+
return result
486492

487493
def search_plan_extraction(text):
488494
text = text.replace("\n", "")

kag/examples/baike/kag_config.yaml

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,14 @@ kag_builder_pipeline:
3939
chain:
4040
type: unstructured_builder_chain # kag.builder.default_chain.DefaultUnstructuredBuilderChain
4141
extractor:
42-
type: schema_constraint_extractor # kag.builder.component.extractor.schema_constraint_extractor.SchemaConstraintExtractor
42+
type: knowledge_unit_extractor # kag.builder.component.extractor.knowledge_unit_extractor
4343
llm: *openie_llm
4444
ner_prompt:
45-
type: spg_entity # kag.builder.prompt.spg_prompt.SPGEntityPrompt
46-
event_prompt:
47-
type: spg_event # kag.builder.prompt.spg_prompt.SPGEventPrompt
48-
std_prompt:
49-
type: default_std # kag.builder.prompt.default.std.OpenIEEntitystandardizationdPrompt
50-
relation_prompt:
51-
type: spg_relation # kag.builder.prompt.spg_prompt.SPGRelationPrompt
45+
type: knowledge_unit_ner
46+
triple_prompt:
47+
type: knowledge_unit_triple
48+
kn_prompt:
49+
type: knowledge_unit
5250
reader:
5351
type: txt_reader # kag.builder.component.reader.txt_reader.TXTReader
5452
post_processor:
@@ -125,16 +123,6 @@ rc: &rc
125123
vectorize_model: *vectorize_model
126124
top_k: 20
127125

128-
kag_merger:
129-
type: kg_merger
130-
top_k: 20
131-
llm_module: *chat_llm
132-
summary_prompt:
133-
type: default_thought_then_answer
134-
vectorize_model: *vectorize_model
135-
graph_api: *graph_api
136-
search_api: *search_api
137-
138126
kag_hybrid_executor: &kag_hybrid_executor_conf
139127
type: kag_hybrid_retrieval_executor
140128
retrievers:

kag/examples/baike/schema/BaiKe.schema

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,5 @@
11
namespace BaiKe
22

3-
Chunk(文本块): EntityType
4-
properties:
5-
content(内容): Text
6-
index: TextAndVector
7-
83
ArtificialObject(人造物体): EntityType
94
properties:
105
desc(描述): Text
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
namespace BaiKe
2+
3+
Doc(文档): IndexType
4+
properties:
5+
content(内容): Text
6+
index: TextAndVector
7+
8+
Chunk(文本块): IndexType
9+
properties:
10+
content(内容): Text
11+
index: TextAndVector
12+
relations:
13+
sourceChunk(关联文档): Doc
14+
15+
Summary(摘要): IndexType
16+
properties:
17+
title(标题): Text
18+
index: TextAndVector
19+
content(内容): Text
20+
index: TextAndVector
21+
relations:
22+
relateTo(关联文本块): Chunk
23+
childOf(上级摘要): Summary
24+
25+
KnowledgeUnit(知识点): IndexType
26+
properties:
27+
structedContent(结构化文本): Text
28+
index: TextAndVector
29+
ontology(本体): Text
30+
desc(描述): Text
31+
index: TextAndVector
32+
relatedQuery(关联问): AtomicQuery
33+
extendedKnowledge(关联外扩知识点):Text
34+
content(内容): Text
35+
index: TextAndVector
36+
knowledgeType(知识类型): Text
37+
38+
AtomicQuery(原子问): IndexType
39+
properties:
40+
title(标题): Text
41+
index: TextAndVector
42+
relations:
43+
relateTo(关联文本块): Chunk
44+
similar(相似问题): AtomicQuery
45+
relatedTo(相关): KnowledgeUnit
46+
47+
Cell(图表单元格): IndexType
48+
properties:
49+
title(标题): Text
50+
index: TextAndVector
51+
content(内容): Text
52+
index: TextAndVector
53+
54+
Diagram(图表): IndexType
55+
properties:
56+
title(标题): Text
57+
index: TextAndVector
58+
content(内容): Text
59+
index: TextAndVector
60+
relations:
61+
relateTo(关联文本块): Chunk
62+
contain(包含): Cell
63+
64+
Outline(大纲): IndexType
65+
properties:
66+
title(标题): Text
67+
index: TextAndVector
68+
content(内容): Text
69+
index: TextAndVector
70+
relations:
71+
relateTo(关联文本块): Chunk
72+
childOf(上级大纲): Outline

kag/indexer/kag_index_manager.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -588,7 +588,10 @@ def build_extractor_config(
588588
kb_task_project_id = kwargs.get(KAGConstants.KAG_QA_TASK_CONFIG_KEY, None)
589589
return [
590590
{
591-
"type": "schema_free_extractor",
591+
"type": "knowledge_unit_extractor",
592+
"ner_prompt": "knowledge_unit_ner",
593+
"triple_prompt": "knowledge_unit_triple",
594+
"kn_prompt": "knowledge_unit",
592595
"llm": llm_config,
593596
"kag_qa_task_config_key": kb_task_project_id,
594597
}
@@ -645,6 +648,14 @@ def build_retriever_config(
645648
{
646649
"type": "kg_fr_knowledge_unit",
647650
"top_k": 20,
651+
"search_api": {
652+
"type": "openspg_search_api",
653+
"kag_qa_task_config_key": kb_task_project_id,
654+
},
655+
"graph_api": {
656+
"type": "openspg_graph_api",
657+
"kag_qa_task_config_key": kb_task_project_id,
658+
},
648659
"path_select": {
649660
"type": "fuzzy_one_hop_select",
650661
"llm_client": llm_config,
@@ -719,6 +730,7 @@ def build_retriever_config(
719730
"vector_chunk_retriever": {
720731
"type": "vector_chunk_retriever",
721732
"vectorize_model": vectorize_model_config,
733+
"score_threshold": 0.65,
722734
"search_api": {
723735
"type": "openspg_search_api",
724736
"kag_qa_task_config_key": kb_task_project_id,

0 commit comments

Comments
 (0)