-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathknowledge_graph.py
More file actions
127 lines (102 loc) · 3.75 KB
/
knowledge_graph.py
File metadata and controls
127 lines (102 loc) · 3.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from flask import Flask, jsonify
import os
import glob
from langchain_neo4j import Neo4jGraph
from langchain_groq import ChatGroq
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
import textwrap
import uuid
app = Flask(__name__)

# Neo4j and Groq configuration.
# Read credentials from the environment instead of hard-coding secrets in
# source; the defaults preserve the previous values when a variable is unset.
NEO4J_URI = os.environ.get("NEO4J_URI", "")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

# Write the values back so downstream libraries that read the environment
# (e.g. langchain's Neo4j integration) see the same configuration.
os.environ["NEO4J_URI"] = NEO4J_URI
os.environ["NEO4J_USERNAME"] = NEO4J_USERNAME
os.environ["NEO4J_PASSWORD"] = NEO4J_PASSWORD

# Initialize the Neo4j connection at import time. On failure, keep a None
# sentinel so request handlers can report the error instead of crashing here.
try:
    graph = Neo4jGraph(
        url=NEO4J_URI,
        username=NEO4J_USERNAME,
        password=NEO4J_PASSWORD
    )
    print("Neo4j graph initialized successfully")
except Exception as e:
    print(f"Failed to initialize Neo4j graph: {str(e)}")
    graph = None

# LLM used by the transformer to extract graph entities/relationships
# from raw text chunks.
llm = ChatGroq(groq_api_key=GROQ_API_KEY, model_name="Gemma2-9b-It")
llm_transformer = LLMGraphTransformer(llm=llm)
def read_text_files_from_dataset(dataset_folder="dataset"):
    """Concatenate every ``*.txt`` file in *dataset_folder* into one string.

    Files are read in sorted filename order so the result is deterministic
    across platforms (``glob.glob`` returns entries in arbitrary filesystem
    order), and joined with a blank line between them.

    Args:
        dataset_folder: Directory to scan for ``.txt`` files.

    Returns:
        The combined file contents, stripped of surrounding whitespace.

    Raises:
        FileNotFoundError: If the folder contains no ``.txt`` files.
    """
    text_files = sorted(glob.glob(os.path.join(dataset_folder, "*.txt")))
    if not text_files:
        raise FileNotFoundError(f"No .txt files found in the {dataset_folder} folder")
    parts = []
    for file_path in text_files:
        with open(file_path, 'r', encoding='utf-8') as file:
            parts.append(file.read())
    # join + strip is equivalent to the trailing "\n\n" accumulation pattern
    # but avoids quadratic string concatenation.
    return "\n\n".join(parts).strip()
# Split text into chunks
def split_text_into_chunks(text, max_chunk_size=1000):
    """Greedily pack blank-line-separated paragraphs into chunks.

    Each paragraph is appended to the current buffer while the running
    length (plus one separator character) stays within *max_chunk_size*;
    otherwise the buffer is flushed and a new one starts. A single
    paragraph longer than the limit still becomes its own oversized chunk.
    """
    pieces = []
    buffer = ""
    for paragraph in text.split('\n\n'):
        if len(buffer) + len(paragraph) + 1 <= max_chunk_size:
            buffer += paragraph + "\n\n"
            continue
        # Flush the full buffer before starting a fresh one.
        if buffer:
            pieces.append(buffer.strip())
        buffer = paragraph + "\n\n"
    if buffer:
        pieces.append(buffer.strip())
    return pieces
# Load the dataset and split it into chunks at import time. On any failure
# (e.g. missing dataset folder) fall back to an empty chunk list so the app
# still starts and request handlers can report the problem.
try:
    text = read_text_files_from_dataset()
    chunks = split_text_into_chunks(text)
    print(f"Loaded {len(chunks)} chunks from dataset")
except Exception as e:
    chunks = []
    print(f"Error reading dataset: {str(e)}")
# Module-level cursor: index of the next chunk /process_chunk will ingest.
current_chunk_index = 0
# Test route to verify app is running
@app.route('/')
def home():
    """Liveness probe: confirm the Flask app is serving requests."""
    banner = "Flask app is running!"
    return banner
@app.route('/process_chunk', methods=['GET'])
def process_chunk():
    """Ingest the next pending text chunk into Neo4j.

    Converts the chunk at the module-level cursor into graph documents via
    the LLM transformer, persists them, and advances the cursor. Returns a
    JSON payload describing either the processed chunk (index, preview,
    node/relationship counts) or the error that prevented processing.
    """
    global current_chunk_index

    # Guard clauses: surface setup problems as JSON errors, never exceptions.
    if not graph:
        return jsonify({"status": "error", "message": "Neo4j graph not initialized"})
    if not chunks:
        return jsonify({"status": "error", "message": "No text files found in dataset folder"})
    if current_chunk_index >= len(chunks):
        return jsonify({"status": "error", "message": "No more chunks to process"})

    chunk_text = chunks[current_chunk_index]
    document = [Document(page_content=chunk_text)]
    try:
        graph_doc = llm_transformer.convert_to_graph_documents(document)
        graph.add_graph_documents(
            graph_documents=graph_doc,
            baseEntityLabel=True,
            include_source=True
        )
        # Advance the cursor only after a successful write, and report the
        # index that was just processed.
        processed_index = current_chunk_index
        current_chunk_index += 1
        preview = chunk_text[:100] + "..." if len(chunk_text) > 100 else chunk_text
        return jsonify({
            "status": "success",
            "chunk_index": processed_index,
            "chunk_content": preview,
            "nodes_added": len(graph_doc[0].nodes) if graph_doc else 0,
            "relationships_added": len(graph_doc[0].relationships) if graph_doc else 0
        })
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)})
# Run the Flask development server when executed directly (not under a
# production WSGI host). debug=True enables the reloader and tracebacks.
if __name__ == '__main__':
    app.run(debug=True, host='localhost', port=5000)