Skip to content

Commit db7720d

Browse files
committed
Refactor logging in getCrawlContent and related functions: remove emojis and adjust formatting for improved readability.
1 parent 901fb45 commit db7720d

1 file changed

Lines changed: 39 additions & 39 deletions

File tree

crawlProcess.py

Lines changed: 39 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ def trackError(component: str, error_type: str, error_message: str, keywordId: s
7676

7777
# Print formatted error
7878
print("\n" + "🔴" * 40)
79-
print(f" ERROR TRACKED:")
79+
print(f" ERROR TRACKED:")
8080
print(f" Component: {component}")
8181
print(f" Type: {error_type}")
8282
print(f" Message: {error_message}")
@@ -195,7 +195,7 @@ def chunkText(text: str, chunk_size: int = MAX_CHUNK_SIZE, overlap: int = CHUNK_
195195
break
196196
start = end - overlap
197197

198-
print(f" Created {len(chunks)} chunks")
198+
print(f" Created {len(chunks)} chunks")
199199
return chunks
200200

201201

@@ -628,18 +628,18 @@ async def getCrawlContent(keywordId:str) -> str:
628628
if 'content' in document and document['content']:
629629
content.append(str(document['content']))
630630

631-
print(f"📊 Found {len(content)} documents in database")
631+
print(f"Found {len(content)} documents in database")
632632

633633
if len(content) > 0:
634634
# Join all content from all documents
635635
joinAllContent = "".join(content)
636636
content_length = len(joinAllContent)
637-
print(f"📝 Total content length: {content_length} characters")
637+
print(f"Total content length: {content_length} characters")
638638
print(f" Preview (first 200 chars): {joinAllContent[:200]}...")
639639

640640
# Check if content needs chunking - if so, process it here
641641
if content_length > MAX_CHUNK_SIZE:
642-
print(f"⚠️ Content exceeds {MAX_CHUNK_SIZE} chars - chunking and processing here")
642+
print(f"Content exceeds {MAX_CHUNK_SIZE} chars - chunking and processing here")
643643

644644
# Create chunks
645645
chunks = chunkText(joinAllContent, MAX_CHUNK_SIZE, CHUNK_OVERLAP)
@@ -648,19 +648,19 @@ async def getCrawlContent(keywordId:str) -> str:
648648
all_partial_kgs = []
649649

650650
for i, chunk in enumerate(chunks):
651-
print(f"\n 📦 Processing chunk {i+1}/{len(chunks)} ({len(chunk)} chars)...")
651+
print(f"\n Processing chunk {i+1}/{len(chunks)} ({len(chunk)} chars)...")
652652

653653
try:
654654
partial_kg = processChunkToKG(chunk, keywordId, i+1, len(chunks))
655655
if partial_kg and (partial_kg.get("nodes") or partial_kg.get("edges")):
656656
all_partial_kgs.append(partial_kg)
657-
print(f"Chunk {i+1}: {len(partial_kg.get('nodes', []))} nodes, {len(partial_kg.get('edges', []))} edges")
657+
print(f"Chunk {i+1}: {len(partial_kg.get('nodes', []))} nodes, {len(partial_kg.get('edges', []))} edges")
658658
else:
659-
print(f" ⚠️ Chunk {i+1}: No KG data extracted")
659+
print(f" Chunk {i+1}: No KG data extracted")
660660

661661
except Exception as e:
662662
error_msg = f"Failed to process chunk {i+1}/{len(chunks)}: {str(e)}"
663-
print(f" {error_msg}")
663+
print(f" {error_msg}")
664664
trackError(
665665
component="getCrawlContent",
666666
error_type="ChunkProcessingError",
@@ -678,17 +678,17 @@ async def getCrawlContent(keywordId:str) -> str:
678678

679679
# Merge all partial KGs and return as JSON string
680680
if all_partial_kgs:
681-
print(f"\n 🔗 Merging {len(all_partial_kgs)} partial knowledge graphs...")
681+
print(f"\n Merging {len(all_partial_kgs)} partial knowledge graphs...")
682682
merged_kg = mergeKGJsons(all_partial_kgs)
683-
print(f" Final merged KG: {len(merged_kg.get('nodes', []))} nodes, {len(merged_kg.get('edges', []))} edges")
683+
print(f" Final merged KG: {len(merged_kg.get('nodes', []))} nodes, {len(merged_kg.get('edges', []))} edges")
684684

685685
# Save to Neo4j immediately after merging
686-
print(f"\n 💾 Saving merged KG to Neo4j...")
686+
print(f"\n Saving merged KG to Neo4j...")
687687
try:
688688
saveKGToNeo4j(keywordId, merged_kg)
689-
print(f" Successfully saved to Neo4j!")
689+
print(f" Successfully saved to Neo4j!")
690690
except Exception as e:
691-
print(f" Failed to save to Neo4j: {str(e)}")
691+
print(f" Failed to save to Neo4j: {str(e)}")
692692
trackError(
693693
component="getCrawlContent->saveKGToNeo4j",
694694
error_type=type(e).__name__,
@@ -706,16 +706,16 @@ async def getCrawlContent(keywordId:str) -> str:
706706
"kg_data": merged_kg
707707
})
708708
else:
709-
print(f" All chunks failed to produce valid KG data")
709+
print(f" All chunks failed to produce valid KG data")
710710
return json.dumps({
711711
"already_processed": True,
712712
"kg_data": {"nodes": [], "edges": [], "error": "All chunks failed"}
713713
})
714714
else:
715-
print(f" Content size OK ({content_length} chars) - returning for normal processing")
715+
print(f" Content size OK ({content_length} chars) - returning for normal processing")
716716
return joinAllContent
717717
else:
718-
print(" No content found in database")
718+
print(" No content found in database")
719719
return ""
720720

721721

@@ -732,22 +732,22 @@ def createKG(content:str , keywordId:str) -> object:
732732
try:
733733
parsed_content = json.loads(content)
734734
if isinstance(parsed_content, dict) and parsed_content.get("already_processed"):
735-
print(" Content was already chunked, processed, and saved by getCrawlContent")
735+
print(" Content was already chunked, processed, and saved by getCrawlContent")
736736
json_out = parsed_content.get("kg_data", {"nodes": [], "edges": []})
737737

738738
if json_out.get("nodes") or json_out.get("edges"):
739-
print(f" KG already saved in Neo4j: {len(json_out.get('nodes', []))} nodes, {len(json_out.get('edges', []))} edges")
739+
print(f" KG already saved in Neo4j: {len(json_out.get('nodes', []))} nodes, {len(json_out.get('edges', []))} edges")
740740
return json_out
741741
else:
742-
print("⚠️ Pre-processed KG is empty")
742+
print(" Pre-processed KG is empty")
743743
return json_out
744744
except (json.JSONDecodeError, TypeError):
745745
# Not pre-processed JSON, continue with normal flow
746746
pass
747747

748748
# Validate content before processing
749749
if not content or len(content.strip()) < 10:
750-
error_msg = f" Content is empty or too short (length: {len(content) if content else 0})"
750+
error_msg = f" Content is empty or too short (length: {len(content) if content else 0})"
751751
print(error_msg)
752752
trackError(
753753
component="createKG",
@@ -764,17 +764,17 @@ def createKG(content:str , keywordId:str) -> object:
764764
}
765765

766766
content_length = len(content)
767-
print(f" Processing content: {content_length} characters")
767+
print(f" Processing content: {content_length} characters")
768768
print(f" First 200 chars: {content[:200]}...")
769769

770770
# Process directly (content is small enough)
771771
print(f" Content size OK - processing without chunking")
772772
try:
773773
json_out = processChunkToKG(content, keywordId, 1, 1)
774-
print(f" KG JSON validated: {len(json_out.get('nodes', []))} nodes, {len(json_out.get('edges', []))} edges")
774+
print(f" KG JSON validated: {len(json_out.get('nodes', []))} nodes, {len(json_out.get('edges', []))} edges")
775775
except Exception as e:
776776
error_msg = f"Failed to process content: {str(e)}"
777-
print(f" {error_msg}")
777+
print(f" {error_msg}")
778778
trackError(
779779
component="createKG",
780780
error_type=type(e).__name__,
@@ -796,9 +796,9 @@ def createKG(content:str , keywordId:str) -> object:
796796
print(f"🔄 Calling saveKGToNeo4j with keywordId={keywordId}")
797797
print(f" KG contains: {len(json_out.get('nodes', []))} nodes, {len(json_out.get('edges', []))} edges")
798798
saveKGToNeo4j(keywordId, json_out)
799-
print(f" saveKGToNeo4j completed without exceptions")
799+
print(f" saveKGToNeo4j completed without exceptions")
800800
except Exception as e:
801-
print(f" Exception caught from saveKGToNeo4j: {type(e).__name__}: {str(e)}")
801+
print(f" Exception caught from saveKGToNeo4j: {type(e).__name__}: {str(e)}")
802802
trackError(
803803
component="createKG->saveKGToNeo4j",
804804
error_type=type(e).__name__,
@@ -821,19 +821,19 @@ def saveKGToNeo4j(keywordId: str, kg_json: dict):
821821

822822
# Validate KG data before saving
823823
if not kg_json or not isinstance(kg_json, dict):
824-
print(" Invalid KG JSON structure")
824+
print(" Invalid KG JSON structure")
825825
return
826826

827827
nodes = kg_json.get("nodes", [])
828828
edges = kg_json.get("edges", [])
829829

830-
print(f"📊 Preparing to merge:")
830+
print(f" Preparing to merge:")
831831
print(f" - {len(nodes)} nodes")
832832
print(f" - {len(edges)} edges")
833833
print(f" - KeywordId: {keywordId}")
834834

835835
if not nodes and not edges:
836-
print("⚠️ No nodes or edges to save")
836+
print(" No nodes or edges to save")
837837
return
838838

839839
with GraphDatabase.driver(URI, auth=AUTH) as driver:
@@ -865,10 +865,10 @@ def saveKGToNeo4j(keywordId: str, kg_json: dict):
865865
nodes_created += 1
866866

867867
except Exception as e:
868-
print(f" ⚠️ Failed to merge node {i+1}: {name} - {str(e)}")
868+
print(f" Failed to merge node {i+1}: {name} - {str(e)}")
869869
continue
870870

871-
print(f" Merged {nodes_created}/{len(nodes)} nodes (created or updated)")
871+
print(f" Merged {nodes_created}/{len(nodes)} nodes (created or updated)")
872872

873873
# MERGE relationships instead of CREATE
874874
edges_created = 0
@@ -881,7 +881,7 @@ def saveKGToNeo4j(keywordId: str, kg_json: dict):
881881
props["to"] = edge.get("to", "")
882882

883883
if not props["from"] or not props["to"]:
884-
print(f" ⚠️ Skipping edge {i+1}: missing from/to nodes")
884+
print(f" Skipping edge {i+1}: missing from/to nodes")
885885
continue
886886

887887
try:
@@ -899,14 +899,14 @@ def saveKGToNeo4j(keywordId: str, kg_json: dict):
899899
edges_created += 1
900900

901901
except Exception as e:
902-
print(f" ⚠️ Failed to merge edge {i+1}: {props['from']} -> {props['to']} - {str(e)}")
902+
print(f" Failed to merge edge {i+1}: {props['from']} -> {props['to']} - {str(e)}")
903903
continue
904904

905-
print(f" Merged {edges_created}/{len(edges)} edges (created or updated)")
906-
print(f" Successfully merged KG to Neo4j!")
905+
print(f" Merged {edges_created}/{len(edges)} edges (created or updated)")
906+
print(f" Successfully merged KG to Neo4j!")
907907

908908
except Exception as e:
909-
print(f" Neo4j error: {e}")
909+
print(f" Neo4j error: {e}")
910910
import traceback
911911
traceback.print_exc()
912912
raise HTTPException(status_code=500, detail=f"Neo4j error: {e}")
@@ -1000,7 +1000,7 @@ async def FullAutoAgent(keywordId):
10001000

10011001
# Log successful execution
10021002
messages = response.get("messages", [])
1003-
print(f"\n Agent completed successfully with {len(messages)} messages")
1003+
print(f"\n Agent completed successfully with {len(messages)} messages")
10041004

10051005
return response
10061006

@@ -1012,7 +1012,7 @@ async def FullAutoAgent(keywordId):
10121012
keywordId=keywordId_str,
10131013
details={"timeout_duration": "unknown"}
10141014
)
1015-
print(f" Agent timeout for keywordId: {keywordId_str}")
1015+
print(f" Agent timeout for keywordId: {keywordId_str}")
10161016
return {
10171017
"status": "failed",
10181018
"reason": "Agent execution timed out",
@@ -1030,7 +1030,7 @@ async def FullAutoAgent(keywordId):
10301030
"traceback": __import__('traceback').format_exc()
10311031
}
10321032
)
1033-
print(f" Error in FullAutoAgent: {e}")
1033+
print(f" Error in FullAutoAgent: {e}")
10341034
import traceback
10351035
traceback.print_exc()
10361036
return {

0 commit comments

Comments
 (0)