88from langchain_core .documents import Document
99from langchain_community .document_loaders import PyPDFLoader
1010from .base_node import BaseNode
11- from ..utils .remover import remover
11+ from ..utils .cleanup_html import cleanup_html
12+ import requests
13+ from bs4 import BeautifulSoup
1214
1315
1416class FetchNode (BaseNode ):
@@ -34,6 +36,7 @@ class FetchNode(BaseNode):
3436 def __init__ (self , input : str , output : List [str ], node_config : Optional [dict ] = None , node_name : str = "Fetch" ):
3537 super ().__init__ (node_name , "node" , input , output , 1 )
3638
39+
3740 self .headless = True if node_config is None else node_config .get (
3841 "headless" , True )
3942 self .verbose = False if node_config is None else node_config .get (
@@ -94,10 +97,22 @@ def execute(self, state):
9497 pass
9598
9699 elif not source .startswith ("http" ):
97- compressed_document = [Document (page_content = remover (source ), metadata = {
100+ compressed_document = [Document (page_content = cleanup_html (source ), metadata = {
98101 "source" : "local_dir"
99102 })]
100103
104+ elif self .useSoup :
105+ response = requests .get (source )
106+ if response .status_code == 200 :
107+ soup = BeautifulSoup (response .text , 'html.parser' )
108+ links = soup .find_all ('a' )
109+ link_urls = []
110+ for link in links :
111+ if 'href' in link .attrs :
112+ link_urls .append (link ['href' ])
113+ compressed_document = [Document (page_content = cleanup_html (soup .prettify (), link_urls ))]
114+ else :
115+ print (f"Failed to retrieve contents from the webpage at url: { source } " )
101116 else :
102117 if self .node_config is not None and self .node_config .get ("endpoint" ) is not None :
103118
@@ -114,7 +129,7 @@ def execute(self, state):
114129
115130 document = loader .load ()
116131 compressed_document = [
117- Document (page_content = remover (str (document [0 ].page_content )))]
132+ Document (page_content = cleanup_html (str (document [0 ].page_content )))]
118133
119134 state .update ({self .output [0 ]: compressed_document })
120135 return state
0 commit comments