diff --git a/kag/common/conf.py b/kag/common/conf.py index 84fdf6eeb..fa4627db2 100644 --- a/kag/common/conf.py +++ b/kag/common/conf.py @@ -148,7 +148,7 @@ def load_config(prod: bool = False, config_file: str = None): if not validate_config_file(config_file): config_file = _closest_cfg() if os.path.exists(config_file) and os.path.isfile(config_file): - logger.info(f"found config file: {config_file}") + logger.debug(f"found config file: {config_file}") with open(config_file, "r") as reader: config = reader.read() config = Template(config).render(**dict(os.environ)) @@ -181,9 +181,9 @@ def init_log_config(self, config): def initialize(self, prod: bool = True, config_file: str = None): config = load_config(prod, config_file) if self._is_initialized: - logger.info("WARN: Reinitialize the KAG configuration.") - logger.info(f"original config: {self.config}\n\n") - logger.info(f"new config: {config}") + logger.debug("WARN: Reinitialize the KAG configuration.") + logger.debug(f"original config: {self.config}\n\n") + logger.debug(f"new config: {config}") self.prod = prod self.config = config global_config = self.config.get(KAGConstants.PROJECT_CONFIG_KEY, {}) @@ -261,6 +261,5 @@ def init_env(config_file: str = None): os.environ[KAGConstants.ENV_KAG_PROJECT_HOST_ADDR] = str(KAG_PROJECT_CONF.host_addr) if len(KAG_CONFIG.all_config) > 0: dump_flag = os.getenv(KAGConstants.ENV_KAG_DEBUG_DUMP_CONFIG) - pprint.pprint(KAG_CONFIG.all_config, indent=2) if dump_flag is not None and dump_flag.strip() == "1": pprint.pprint(KAG_CONFIG.all_config, indent=2) diff --git a/kag/examples/baike/schema/index.schema b/kag/examples/baike/schema/index.schema deleted file mode 100644 index 21f1c86cb..000000000 --- a/kag/examples/baike/schema/index.schema +++ /dev/null @@ -1,72 +0,0 @@ -namespace BaiKe - -Doc(文档): IndexType - properties: - content(内容): Text - index: TextAndVector - -Chunk(文本块): IndexType - properties: - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文档): Doc - -Summary(摘要): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - childOf(上级摘要): Summary - -KnowledgeUnit(知识点): IndexType - properties: - structedContent(结构化文本): Text - index: TextAndVector - ontology(本体): Text - desc(描述): Text - index: TextAndVector - relatedQuery(关联问): AtomicQuery - extendedKnowledge(关联外扩知识点):Text - content(内容): Text - index: TextAndVector - knowledgeType(知识类型): Text - -AtomicQuery(原子问): IndexType - properties: - title(标题): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - similar(相似问题): AtomicQuery - relatedTo(相关): KnowledgeUnit - -Cell(图表单元格): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - -Diagram(图表): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - contain(包含): Cell - -Outline(大纲): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - childOf(上级大纲): Outline \ No newline at end of file diff --git a/kag/examples/baike/solver/run_llm.py b/kag/examples/baike/solver/run_llm.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/kag/indexer/kag_index_manager.py b/kag/indexer/kag_index_manager.py index daae7129a..32afa05c0 100644 --- a/kag/indexer/kag_index_manager.py +++ b/kag/indexer/kag_index_manager.py @@ -91,14 +91,17 @@ def description(self) -> str: @property def schema(self) -> str: return """ +Chunk(文本块): IndexType + properties: + content(内容): Text + index: TextAndVector AtomicQuery(原子问): IndexType properties: title(标题): Text index: TextAndVector relations: sourceChunk(关联文本块): Chunk - similar(相似问题): AtomicQuery - relatedTo(相关): KnowledgeUnit + similar(相似问题): AtomicQuery """ @property @@ -292,6 +295,10 @@ def description(self) -> str: @property def schema(self) -> str: return """ +Chunk(文本块): IndexType + properties: + content(内容): Text + index: TextAndVector Table(表格): IndexType properties: content(内容): Text @@ -374,6 +381,10 @@ def description(self) -> str: @property def schema(self) -> str: return """ +Chunk(文本块): IndexType + properties: + content(内容): Text + index: TextAndVector Summary(文本摘要): IndexType properties: content(内容): Text @@ -453,6 +464,10 @@ def description(self) -> str: @property def schema(self) -> str: return """ +Chunk(文本块): IndexType + properties: + content(内容): Text + index: TextAndVector Outline(标题大纲): IndexType properties: content(内容): Text diff --git a/kag/open_benchmark/2wiki/schema/index.schema b/kag/open_benchmark/2wiki/schema/index.schema deleted file mode 100644 index 2bf2f7394..000000000 --- a/kag/open_benchmark/2wiki/schema/index.schema +++ /dev/null @@ -1,72 +0,0 @@ -namespace TwoWiki - -Doc(文档): IndexType - properties: - content(内容): Text - index: TextAndVector - -Chunk(文本块): IndexType - properties: - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文档): Doc - -Summary(摘要): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - childOf(上级摘要): Summary - -KnowledgeUnit(知识点): IndexType - properties: - structedContent(结构化文本): Text - index: TextAndVector - ontology(本体): Text - desc(描述): Text - index: TextAndVector - relatedQuery(关联问): AtomicQuery - extendedKnowledge(关联外扩知识点):Text - content(内容): Text - index: TextAndVector - knowledgeType(知识类型): Text - -AtomicQuery(原子问): IndexType - properties: - title(标题): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - similar(相似问题): AtomicQuery - relatedTo(相关): KnowledgeUnit - -Cell(图表单元格): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - -Diagram(图表): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - contain(包含): Cell - -Outline(大纲): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - childOf(上级大纲): Outline \ No newline at end of file diff --git a/kag/open_benchmark/hotpotqa/schema/index.schema b/kag/open_benchmark/hotpotqa/schema/index.schema deleted file mode 100644 index b92e43f5d..000000000 --- a/kag/open_benchmark/hotpotqa/schema/index.schema +++ /dev/null @@ -1,72 +0,0 @@ -namespace HotpotQA - -Doc(文档): IndexType - properties: - content(内容): Text - index: TextAndVector - -Chunk(文本块): IndexType - properties: - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文档): Doc - -Summary(摘要): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - childOf(上级摘要): Summary - -KnowledgeUnit(知识点): IndexType - properties: - structedContent(结构化文本): Text - index: TextAndVector - ontology(本体): Text - desc(描述): Text - index: TextAndVector - relatedQuery(关联问): AtomicQuery - extendedKnowledge(关联外扩知识点):Text - content(内容): Text - index: TextAndVector - knowledgeType(知识类型): Text - -AtomicQuery(原子问): IndexType - properties: - title(标题): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - similar(相似问题): AtomicQuery - relatedTo(相关): KnowledgeUnit - -Cell(图表单元格): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - -Diagram(图表): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - contain(包含): Cell - -Outline(大纲): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - childOf(上级大纲): Outline \ No newline at end of file diff --git a/kag/open_benchmark/musique/schema/index.schema b/kag/open_benchmark/musique/schema/index.schema deleted file mode 100644 index 026cdba10..000000000 --- a/kag/open_benchmark/musique/schema/index.schema +++ /dev/null @@ -1,72 +0,0 @@ -namespace MuSiQue - -Doc(文档): IndexType - properties: - content(内容): Text - index: TextAndVector - -Chunk(文本块): IndexType - properties: - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文档): Doc - -Summary(摘要): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - childOf(上级摘要): Summary - -KnowledgeUnit(知识点): IndexType - properties: - structedContent(结构化文本): Text - index: TextAndVector - ontology(本体): Text - desc(描述): Text - index: TextAndVector - relatedQuery(关联问): AtomicQuery - extendedKnowledge(关联外扩知识点):Text - content(内容): Text - index: TextAndVector - knowledgeType(知识类型): Text - -AtomicQuery(原子问): IndexType - properties: - title(标题): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - similar(相似问题): AtomicQuery - relatedTo(相关): KnowledgeUnit - -Cell(图表单元格): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - -Diagram(图表): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - contain(包含): Cell - -Outline(大纲): IndexType - properties: - title(标题): Text - index: TextAndVector - content(内容): Text - index: TextAndVector - relations: - sourceChunk(关联文本块): Chunk - childOf(上级大纲): Outline \ No newline at end of file diff --git a/kag/templates/schema/{{default}}.schema.tmpl b/kag/templates/schema/{{default}}.schema.tmpl index c00593272..4f7b2523a 100644 --- a/kag/templates/schema/{{default}}.schema.tmpl +++ b/kag/templates/schema/{{default}}.schema.tmpl @@ -1,10 +1,5 @@ namespace {{namespace}} -Chunk(文本块): EntityType - properties: - content(内容): Text - index: TextAndVector - ArtificialObject(人造物体): EntityType properties: desc(描述): Text diff --git a/kag/templates/schema/{{medical}}.schema.tmpl b/kag/templates/schema/{{medical}}.schema.tmpl index 52230d791..862b5a3bf 100644 --- a/kag/templates/schema/{{medical}}.schema.tmpl +++ b/kag/templates/schema/{{medical}}.schema.tmpl @@ -1,11 +1,6 @@ namespace {{namespace}} -Chunk(文本块): EntityType - properties: - content(内容): Text - index: TextAndVector - HealthFood(保健食品): EntityType properties: desc(描述): Text diff --git a/knext/command/sub_command/schema.py b/knext/command/sub_command/schema.py index e2596c8d8..535aa9639 100644 --- a/knext/command/sub_command/schema.py +++ b/knext/command/sub_command/schema.py @@ -12,9 +12,9 @@ import os from pathlib import Path -import yaml import click import knext.project +from kag.indexer import KAGIndexManager from knext.schema.marklang.concept_rule_ml import SPGConceptRuleMarkLang from knext.schema.marklang.schema_ml import SPGSchemaMarkLang @@ -30,21 +30,33 @@ def commit_schema(): knext.project.DEFAULT_SCHEMA_DIR, knext.project.DEFAULT_SCHEMA_FILE.replace("$namespace", env.namespace), ) - index_file = os.path.join( - env.project_path, - knext.project.DEFAULT_SCHEMA_DIR, - knext.project.DEFAULT_INDEX_FILE, - ) + if Path(schema_file).exists(): ml = SPGSchemaMarkLang(schema_file, host_addr=env.host_addr, project_id=env.project_id) else: ml = None - if Path(index_file).exists(): - index_ml = SPGSchemaMarkLang(index_file, host_addr=env.host_addr, project_id=env.project_id) - else: - index_ml = None + index_managers = KAGIndexManager.list_available() + index_ml = None + for index_manager_name in index_managers: + config = { + "type": index_manager_name, + "llm_config": None, + "vectorize_model_config": None, + } + + index_mgr = KAGIndexManager.from_config(config) + schema_str = index_mgr.schema + if not schema_str: + continue + schema_str = f"namespace {env.namespace}\n" + schema_str + cur_index_ml = SPGSchemaMarkLang(filename="", script_data_str=schema_str, host_addr=env.host_addr, project_id=env.project_id) + if index_ml is None: + index_ml = cur_index_ml + else: + index_ml.types.update(cur_index_ml.types) + if ml is None and index_ml is None: - click.secho(f"ERROR: File {schema_file} and {index_file} not exists.", fg="bright_red") + click.secho(f"ERROR: File {schema_file} and index not exists.", fg="bright_red") return if ml is None: ml = index_ml diff --git a/knext/project/__init__.py b/knext/project/__init__.py index 308fa6fec..e53999873 100644 --- a/knext/project/__init__.py +++ b/knext/project/__init__.py @@ -12,6 +12,5 @@ DEFAULT_SCHEMA_DIR = "schema" DEFAULT_SCHEMA_FILE = "$namespace.schema" -DEFAULT_INDEX_FILE = "index.schema" DEFAULT_BUILDER_DIR = "builder" DEFAULT_REASONER_DIR = "reasoner" diff --git a/knext/schema/marklang/schema_ml.py b/knext/schema/marklang/schema_ml.py index ab00c1262..054f97ed3 100644 --- a/knext/schema/marklang/schema_ml.py +++ b/knext/schema/marklang/schema_ml.py @@ -150,9 +150,15 @@ class SPGSchemaMarkLang: types = {} defined_types = {} - def __init__(self, filename, with_server=True, host_addr=None, project_id=None): + def __init__(self, filename, with_server=True, host_addr=None, project_id=None, script_data_str=""): self.reset() - self.schema_file = filename + if script_data_str: + self.schema_file_data = script_data_str + elif os.path.exists(filename): + with open(filename, "r", encoding="utf-8") as f: + self.schema_file_data = f.read() + else: + raise Exception("Schema file not exists") self.current_line_num = 0 if with_server: self.schema = SchemaClient(host_addr, project_id) @@ -1039,9 +1045,7 @@ def load_script(self): """ Load and then parse the script file """ - - file = open(self.schema_file, "r", encoding="utf-8") - lines = file.read().splitlines() + lines = self.schema_file_data.splitlines() self.preload_types(lines) for line in lines: self.current_line_num += 1