Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ RUN pip install uv
COPY ./pyproject.toml /ORAssistant-backend/pyproject.toml
COPY . .

RUN uv venv .venv && uv sync && uv run /ORAssistant-backend/src/post_install.py
RUN uv venv .venv && uv sync

ARG SKIP_HF_DOWNLOAD=false
RUN if [ "$SKIP_HF_DOWNLOAD" = "false" ]; then \
Expand Down
3 changes: 1 addition & 2 deletions backend/Dockerfile_slim
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ COPY ./pyproject.toml /ORAssistant-backend/pyproject.toml
COPY . .

RUN uv venv .venv && \
uv sync --dev && \
uv run /ORAssistant-backend/src/post_install.py
uv sync --dev

RUN git clone https://huggingface.co/datasets/The-OpenROAD-Project/ORAssistant_RAG_Dataset && \
mkdir -p data && \
Expand Down
6 changes: 2 additions & 4 deletions backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ dependencies = [
"markdown==3.8.2",
"myst-parser==4.0.1",
"nest-asyncio>=1.6.0",
"nltk==3.9.4",
"openai==1.100.2",
"openai==1.100.2",
"protobuf>=5.29.6",
"PyJWT>=2.13.0",
"psycopg2-binary>=2.9.11",
Expand All @@ -50,12 +49,11 @@ dependencies = [
"sqlalchemy>=2.0.43",
"starlette>=1.3.1",
"tenacity>=9.0.0",
"unstructured==0.18.18",
"beautifulsoup4>=4.12.3",
]

[dependency-groups]
dev = [
"beautifulsoup4==4.12.3",
"mypy>=1.17.1",
"pre-commit==3.7.1",
"pytest>=9.0.3",
Expand Down
6 changes: 0 additions & 6 deletions backend/src/post_install.py

This file was deleted.

4 changes: 2 additions & 2 deletions backend/src/tools/process_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Optional

from langchain_core.documents import Document
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_loaders import BSHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from .chunk_documents import chunk_documents
Expand Down Expand Up @@ -43,7 +43,7 @@ def process_html(

documents = []
for file_path in tqdm(html_files, desc="Loading HTML files"):
content = UnstructuredHTMLLoader(file_path=file_path).load()
content = BSHTMLLoader(file_path=file_path).load()
for doc in content:
doc.metadata["source"] = file_path.split("./")[-1]
documents.extend(content)
Expand Down
16 changes: 8 additions & 8 deletions backend/tests/test_process_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_process_html_nonexistent_folder(self):
assert result == []

@patch("src.tools.process_html.glob.glob")
@patch("src.tools.process_html.UnstructuredHTMLLoader")
@patch("src.tools.process_html.BSHTMLLoader")
@patch(
"builtins.open",
new_callable=mock_open,
Expand Down Expand Up @@ -52,7 +52,7 @@ def test_process_html_without_splitting(
assert result[0].metadata["source"] == "test.html"

@patch("src.tools.process_html.glob.glob")
@patch("src.tools.process_html.UnstructuredHTMLLoader")
@patch("src.tools.process_html.BSHTMLLoader")
@patch(
"builtins.open",
new_callable=mock_open,
Expand Down Expand Up @@ -95,7 +95,7 @@ def test_process_html_with_splitting(
mock_chunk.assert_called_once_with(500, [mock_doc])

@patch("src.tools.process_html.glob.glob")
@patch("src.tools.process_html.UnstructuredHTMLLoader")
@patch("src.tools.process_html.BSHTMLLoader")
@patch("builtins.open", new_callable=mock_open, read_data="{}")
@patch("src.tools.process_html.os.path.exists")
@patch("src.tools.process_html.os.listdir")
Expand Down Expand Up @@ -133,7 +133,7 @@ def test_process_html_split_without_chunk_size_raises_error(self):
mock_open(read_data='{"test.html": "https://example.com"}'),
):
with patch(
"src.tools.process_html.UnstructuredHTMLLoader"
"src.tools.process_html.BSHTMLLoader"
) as mock_loader:
mock_doc = Mock()
mock_doc.metadata = {"source": "test.html"}
Expand All @@ -145,7 +145,7 @@ def test_process_html_split_without_chunk_size_raises_error(self):
process_html(temp_dir, split_text=True, chunk_size=None)

@patch("src.tools.process_html.glob.glob")
@patch("src.tools.process_html.UnstructuredHTMLLoader")
@patch("src.tools.process_html.BSHTMLLoader")
@patch(
"builtins.open",
new_callable=mock_open,
Expand Down Expand Up @@ -198,7 +198,7 @@ def test_process_html_logs_error_for_empty_folder(self, mock_logging):

@patch("src.tools.process_html.logging")
@patch("src.tools.process_html.glob.glob")
@patch("src.tools.process_html.UnstructuredHTMLLoader")
@patch("src.tools.process_html.BSHTMLLoader")
@patch("builtins.open", new_callable=mock_open, read_data="{}")
@patch("src.tools.process_html.os.path.exists")
@patch("src.tools.process_html.os.listdir")
Expand All @@ -225,7 +225,7 @@ def test_process_html_logs_warning_for_missing_source(
def test_process_html_metadata_transformation(self):
"""Test that metadata is properly transformed."""
with patch("src.tools.process_html.glob.glob") as mock_glob:
with patch("src.tools.process_html.UnstructuredHTMLLoader") as mock_loader:
with patch("src.tools.process_html.BSHTMLLoader") as mock_loader:
with patch(
"builtins.open",
mock_open(read_data='{"test.html": "https://example.com"}'),
Expand Down Expand Up @@ -281,7 +281,7 @@ def test_process_html_real_file_structure(self):
mock_open(read_data='{"docs/html/test.html": "https://example.com"}'),
):
with patch(
"src.tools.process_html.UnstructuredHTMLLoader"
"src.tools.process_html.BSHTMLLoader"
) as mock_loader:
mock_doc = Mock()
mock_doc.metadata = {"source": "test.html"}
Expand Down
Loading
Loading