Skip to content

Commit d6922ac

Browse files
authored
fix(deps): remove nltk to resolve CVE-2026-54293 (#305)
* fix(deps): remove nltk to resolve CVE-2026-54293

  No patched version of nltk is available for the URL-encoded path traversal vulnerability (CVE-2026-54293). Remove it by:
  - Replacing UnstructuredHTMLLoader (unstructured -> nltk) with BSHTMLLoader (beautifulsoup4) in process_html.py
  - Removing unstructured==0.18.18 and nltk==3.9.4 from pyproject.toml
  - Promoting beautifulsoup4 from dev to main dependencies
  - Deleting the now-unnecessary post_install.py NLTK data downloader
  - Removing the post_install.py step from both Dockerfiles

  Signed-off-by: Jack Luar <jluar@precisioninno.com>

* fix(tests): update mock target from UnstructuredHTMLLoader to BSHTMLLoader

  process_html.py was migrated from UnstructuredHTMLLoader to BSHTMLLoader, but the test mocks were not updated, causing 8 test failures in CI.

  Signed-off-by: Jack Luar <jluar@precisioninno.com>

---------

Signed-off-by: Jack Luar <jluar@precisioninno.com>
1 parent f266ead commit d6922ac

7 files changed

Lines changed: 1964 additions & 2265 deletions

File tree

backend/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ RUN pip install uv
2020
COPY ./pyproject.toml /ORAssistant-backend/pyproject.toml
2121
COPY . .
2222

23-
RUN uv venv .venv && uv sync && uv run /ORAssistant-backend/src/post_install.py
23+
RUN uv venv .venv && uv sync
2424

2525
ARG SKIP_HF_DOWNLOAD=false
2626
RUN if [ "$SKIP_HF_DOWNLOAD" = "false" ]; then \

backend/Dockerfile_slim

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -21,8 +21,7 @@ COPY ./pyproject.toml /ORAssistant-backend/pyproject.toml
2121
COPY . .
2222

2323
RUN uv venv .venv && \
24-
uv sync --dev && \
25-
uv run /ORAssistant-backend/src/post_install.py
24+
uv sync --dev
2625

2726
RUN git clone https://huggingface.co/datasets/The-OpenROAD-Project/ORAssistant_RAG_Dataset && \
2827
mkdir -p data && \

backend/pyproject.toml

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -29,8 +29,7 @@ dependencies = [
2929
"markdown==3.8.2",
3030
"myst-parser==4.0.1",
3131
"nest-asyncio>=1.6.0",
32-
"nltk==3.9.4",
33-
"openai==1.100.2",
32+
"openai==1.100.2",
3433
"protobuf>=5.29.6",
3534
"PyJWT>=2.13.0",
3635
"psycopg2-binary>=2.9.11",
@@ -50,12 +49,11 @@ dependencies = [
5049
"sqlalchemy>=2.0.43",
5150
"starlette>=1.3.1",
5251
"tenacity>=9.0.0",
53-
"unstructured==0.18.18",
52+
"beautifulsoup4>=4.12.3",
5453
]
5554

5655
[dependency-groups]
5756
dev = [
58-
"beautifulsoup4==4.12.3",
5957
"mypy>=1.17.1",
6058
"pre-commit==3.7.1",
6159
"pytest>=9.0.3",

backend/src/post_install.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

backend/src/tools/process_html.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
from typing import Optional
77

88
from langchain_core.documents import Document
9-
from langchain_community.document_loaders import UnstructuredHTMLLoader
9+
from langchain_community.document_loaders import BSHTMLLoader
1010
from langchain_text_splitters import RecursiveCharacterTextSplitter
1111

1212
from .chunk_documents import chunk_documents
@@ -43,7 +43,7 @@ def process_html(
4343

4444
documents = []
4545
for file_path in tqdm(html_files, desc="Loading HTML files"):
46-
content = UnstructuredHTMLLoader(file_path=file_path).load()
46+
content = BSHTMLLoader(file_path=file_path).load()
4747
for doc in content:
4848
doc.metadata["source"] = file_path.split("./")[-1]
4949
documents.extend(content)

backend/tests/test_process_html.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ def test_process_html_nonexistent_folder(self):
2121
assert result == []
2222

2323
@patch("src.tools.process_html.glob.glob")
24-
@patch("src.tools.process_html.UnstructuredHTMLLoader")
24+
@patch("src.tools.process_html.BSHTMLLoader")
2525
@patch(
2626
"builtins.open",
2727
new_callable=mock_open,
@@ -52,7 +52,7 @@ def test_process_html_without_splitting(
5252
assert result[0].metadata["source"] == "test.html"
5353

5454
@patch("src.tools.process_html.glob.glob")
55-
@patch("src.tools.process_html.UnstructuredHTMLLoader")
55+
@patch("src.tools.process_html.BSHTMLLoader")
5656
@patch(
5757
"builtins.open",
5858
new_callable=mock_open,
@@ -95,7 +95,7 @@ def test_process_html_with_splitting(
9595
mock_chunk.assert_called_once_with(500, [mock_doc])
9696

9797
@patch("src.tools.process_html.glob.glob")
98-
@patch("src.tools.process_html.UnstructuredHTMLLoader")
98+
@patch("src.tools.process_html.BSHTMLLoader")
9999
@patch("builtins.open", new_callable=mock_open, read_data="{}")
100100
@patch("src.tools.process_html.os.path.exists")
101101
@patch("src.tools.process_html.os.listdir")
@@ -133,7 +133,7 @@ def test_process_html_split_without_chunk_size_raises_error(self):
133133
mock_open(read_data='{"test.html": "https://example.com"}'),
134134
):
135135
with patch(
136-
"src.tools.process_html.UnstructuredHTMLLoader"
136+
"src.tools.process_html.BSHTMLLoader"
137137
) as mock_loader:
138138
mock_doc = Mock()
139139
mock_doc.metadata = {"source": "test.html"}
@@ -145,7 +145,7 @@ def test_process_html_split_without_chunk_size_raises_error(self):
145145
process_html(temp_dir, split_text=True, chunk_size=None)
146146

147147
@patch("src.tools.process_html.glob.glob")
148-
@patch("src.tools.process_html.UnstructuredHTMLLoader")
148+
@patch("src.tools.process_html.BSHTMLLoader")
149149
@patch(
150150
"builtins.open",
151151
new_callable=mock_open,
@@ -198,7 +198,7 @@ def test_process_html_logs_error_for_empty_folder(self, mock_logging):
198198

199199
@patch("src.tools.process_html.logging")
200200
@patch("src.tools.process_html.glob.glob")
201-
@patch("src.tools.process_html.UnstructuredHTMLLoader")
201+
@patch("src.tools.process_html.BSHTMLLoader")
202202
@patch("builtins.open", new_callable=mock_open, read_data="{}")
203203
@patch("src.tools.process_html.os.path.exists")
204204
@patch("src.tools.process_html.os.listdir")
@@ -225,7 +225,7 @@ def test_process_html_logs_warning_for_missing_source(
225225
def test_process_html_metadata_transformation(self):
226226
"""Test that metadata is properly transformed."""
227227
with patch("src.tools.process_html.glob.glob") as mock_glob:
228-
with patch("src.tools.process_html.UnstructuredHTMLLoader") as mock_loader:
228+
with patch("src.tools.process_html.BSHTMLLoader") as mock_loader:
229229
with patch(
230230
"builtins.open",
231231
mock_open(read_data='{"test.html": "https://example.com"}'),
@@ -281,7 +281,7 @@ def test_process_html_real_file_structure(self):
281281
mock_open(read_data='{"docs/html/test.html": "https://example.com"}'),
282282
):
283283
with patch(
284-
"src.tools.process_html.UnstructuredHTMLLoader"
284+
"src.tools.process_html.BSHTMLLoader"
285285
) as mock_loader:
286286
mock_doc = Mock()
287287
mock_doc.metadata = {"source": "test.html"}

0 commit comments

Comments
 (0)