Skip to content

Commit 25a1f35

Browse files
committed
refactor: 移除目录限制,支持灵活文件路径访问
- 删除 DOCUMENT_DIRECTORY 环境变量依赖
- 删除 AppContext dataclass 和 app_lifespan 函数
- 删除 _get_document_path() 安全函数
- read_document() 直接使用 Path(filename) 处理路径
- 删除 test_lifespan.py 测试文件
- 更新 test_tools.py 测试用例
- 更新 README.md 和 README.zh-CN.md 文档

模仿 mcp-document-converter 的实现方式,支持绝对路径和相对路径访问文件。
1 parent ee7c374 commit 25a1f35

6 files changed

Lines changed: 95 additions & 526 deletions

File tree

README.md

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -200,12 +200,6 @@ Read any supported document type.
200200
|-----------|------|----------|-------------|
201201
| filename | string | ✅ | Document file path, supports absolute or relative paths |
202202

203-
## Environment Variables
204-
205-
| Variable | Description | Default |
206-
|----------|-------------|---------|
207-
| `DOCUMENT_DIRECTORY` | Directory where documents are stored | `./documents` |
208-
209203
## Dependencies
210204

211205
### Core Dependencies

README.zh-CN.md

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -200,12 +200,6 @@ if DocumentReaderFactory.is_supported("file.xlsx"):
200200
|------|------|------|------|
201201
| filename | string | ✅ | 文档文件路径,支持绝对路径或相对路径 |
202202

203-
## 环境变量
204-
205-
| 变量名 | 描述 | 默认值 |
206-
|--------|------|--------|
207-
| `DOCUMENT_DIRECTORY` | 存储文档的目录 | `./documents` |
208-
209203
## 依赖
210204

211205
### 核心依赖

mcp_documents_reader.py

Lines changed: 11 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,16 @@
11
import os
22
from abc import ABC, abstractmethod
3-
from collections.abc import AsyncIterator
4-
from contextlib import asynccontextmanager
5-
from dataclasses import dataclass
3+
from pathlib import Path
64

75
from docx import Document as DocxDocument
86
from mcp.server.fastmcp import FastMCP
97
from openpyxl import load_workbook
108
from pypdf import PdfReader as PyPdfReader
119
from typing_extensions import override
1210

13-
# Directory where documents are stored
14-
DOCUMENT_DIRECTORY = os.getenv("DOCUMENT_DIRECTORY", "./documents")
15-
16-
17-
@dataclass
18-
class AppContext:
19-
"""Application context for lifecycle management."""
20-
21-
document_directory: str
22-
23-
24-
# Initialize the MCP server (lifespan added below)
2511
mcp = FastMCP("Document Reader")
2612

2713

28-
@asynccontextmanager
29-
async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]:
30-
"""Manage application lifecycle with type-safe context"""
31-
try:
32-
# Ensure document directory exists
33-
os.makedirs(DOCUMENT_DIRECTORY, exist_ok=True)
34-
yield AppContext(document_directory=DOCUMENT_DIRECTORY)
35-
finally:
36-
# Cleanup (if needed)
37-
pass
38-
39-
40-
# Assign lifespan to server
41-
mcp.lifespan = app_lifespan # type: ignore[reportAttributeAccessIssue]
42-
43-
44-
# ------------------------- Document Reader Architecture -------------------------
45-
46-
4714
class DocumentReader(ABC):
4815
"""Abstract base class for document readers"""
4916

@@ -63,12 +30,10 @@ def read(self, file_path: str) -> str:
6330
doc = DocxDocument(file_path)
6431
text = []
6532

66-
# Extract paragraph text
6733
for paragraph in doc.paragraphs:
6834
if paragraph.text:
6935
text.append(paragraph.text)
7036

71-
# Extract table content
7237
for table in doc.tables:
7338
for row in table.rows:
7439
row_text = []
@@ -96,7 +61,6 @@ def read(self, file_path: str) -> str:
9661
pdf_reader = PyPdfReader(file)
9762
text = []
9863

99-
# Extract text from each page
10064
for page in pdf_reader.pages:
10165
page_text = page.extract_text()
10266
if page_text:
@@ -114,7 +78,6 @@ class TxtReader(DocumentReader):
11478
@override
11579
def read(self, file_path: str) -> str:
11680
"""Read and extract text from TXT file with encoding handling"""
117-
# Supported encodings in priority order
11881
encodings = ["utf-8", "gbk", "gb2312", "latin-1"]
11982

12083
for encoding in encodings:
@@ -140,18 +103,16 @@ def read(self, file_path: str) -> str:
140103
wb = load_workbook(file_path, read_only=True)
141104
text = []
142105

143-
# Extract text from all sheets
144106
for sheet_name in wb.sheetnames:
145107
sheet = wb[sheet_name]
146108
text.append(f"=== Sheet: {sheet_name} ===")
147109

148-
# Extract cell content
149110
for row in sheet.iter_rows(values_only=True):
150111
row_text = [str(cell) if cell is not None else "" for cell in row]
151-
if any(row_text): # Only add non-empty rows
112+
if any(row_text):
152113
text.append("\t".join(row_text))
153114

154-
text.append("") # Add blank line between sheets
115+
text.append("")
155116

156117
extracted_text = "\n".join(text)
157118
wb.close()
@@ -165,7 +126,6 @@ def read(self, file_path: str) -> str:
165126
class DocumentReaderFactory:
166127
"""Factory for creating document readers based on file extension"""
167128

168-
# Mapping of file extensions to reader classes
169129
_readers: dict[str, type[DocumentReader]] = {
170130
".txt": TxtReader,
171131
".docx": DocxReader,
@@ -189,69 +149,31 @@ def is_supported(cls, file_path: str) -> bool:
189149
return ext in cls._readers
190150

191151

192-
# ------------------------- Tool Functions -------------------------
193-
194-
195-
def _get_document_path(ctx: object, filename: str) -> str:
196-
"""获取文档路径,防止路径遍历攻击。
197-
198-
Args:
199-
ctx: FastMCP 上下文对象
200-
filename: 文件名
201-
202-
Returns:
203-
str: 安全的完整文件路径
204-
205-
Raises:
206-
ValueError: 当检测到路径遍历攻击时
207-
"""
208-
doc_dir = getattr(ctx, "document_directory", DOCUMENT_DIRECTORY)
209-
210-
# 使用 basename 防止路径遍历
211-
safe_filename = os.path.basename(filename)
212-
213-
# 构建完整路径
214-
full_path = os.path.join(doc_dir, safe_filename)
215-
216-
# 验证路径在允许的目录内
217-
real_path = os.path.realpath(full_path)
218-
real_doc_dir = os.path.realpath(doc_dir)
219-
220-
if not real_path.startswith(real_doc_dir + os.sep) and real_path != real_doc_dir:
221-
raise ValueError("Access denied: path outside document directory")
222-
223-
return full_path
224-
225-
226152
@mcp.tool()
227-
def read_document(ctx: object, filename: str) -> str:
153+
def read_document(filename: str) -> str:
228154
"""
229155
Reads and extracts text from a specified document file.
230156
Supports multiple document types: TXT, DOCX, PDF, Excel (XLSX, XLS).
231157
232-
:param ctx: FastMCP context
233-
:param filename: Name of the document file to read
158+
:param filename: Path to the document file to read
159+
(supports absolute or relative paths)
234160
:return: Extracted text from the document
235161
"""
236-
try:
237-
doc_path = _get_document_path(ctx, filename)
238-
except ValueError:
239-
return "Error: Invalid file path."
162+
file_path = Path(filename)
240163

241-
if not os.path.exists(doc_path):
164+
if not file_path.exists():
242165
return f"Error: File '{filename}' not found."
243166

244-
if not DocumentReaderFactory.is_supported(doc_path):
167+
if not DocumentReaderFactory.is_supported(str(file_path)):
245168
return f"Error: Unsupported document type for file '{filename}'."
246169

247170
try:
248-
reader = DocumentReaderFactory.get_reader(doc_path)
249-
return reader.read(doc_path)
171+
reader = DocumentReaderFactory.get_reader(str(file_path))
172+
return reader.read(str(file_path))
250173
except Exception as e:
251174
return f"Error reading document: {str(e)}"
252175

253176

254-
# Run the MCP server
255177
def main():
256178
mcp.run()
257179

tests/conftest.py

Lines changed: 0 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
33
本模块提供测试所需的共享配置和 fixture 对象,包括:
44
- 测试文档文件路径
5-
- 模拟的上下文对象
65
- 临时目录管理
76
"""
87

@@ -12,7 +11,6 @@
1211

1312
import pytest
1413

15-
# 项目根目录
1614
PROJECT_ROOT = Path(__file__).parent.parent
1715
FIXTURES_DIR = PROJECT_ROOT / "tests" / "fixtures"
1816

@@ -153,62 +151,3 @@ def temp_document_dir() -> Generator[str, None, None]:
153151
"""
154152
with tempfile.TemporaryDirectory() as tmpdir:
155153
yield tmpdir
156-
157-
158-
@pytest.fixture
159-
def mock_context() -> object:
160-
"""创建模拟的上下文对象。
161-
162-
Returns:
163-
object: 包含 document_directory 属性的模拟上下文对象
164-
"""
165-
166-
class MockContext:
167-
"""模拟的上下文类。"""
168-
169-
def __init__(self) -> None:
170-
"""初始化模拟上下文。"""
171-
self.document_directory = str(FIXTURES_DIR)
172-
173-
return MockContext()
174-
175-
176-
@pytest.fixture
177-
def mock_context_with_temp_dir(temp_document_dir: str) -> object:
178-
"""创建带有临时目录的模拟上下文对象。
179-
180-
Args:
181-
temp_document_dir: 临时目录路径
182-
183-
Returns:
184-
object: 包含临时目录的模拟上下文对象
185-
"""
186-
187-
class MockContext:
188-
"""模拟的上下文类。"""
189-
190-
def __init__(self, doc_dir: str) -> None:
191-
"""初始化模拟上下文。
192-
193-
Args:
194-
doc_dir: 文档目录路径
195-
"""
196-
self.document_directory = doc_dir
197-
198-
return MockContext(temp_document_dir)
199-
200-
201-
@pytest.fixture
202-
def mock_context_no_attr() -> object:
203-
"""创建没有 document_directory 属性的模拟上下文对象。
204-
205-
Returns:
206-
object: 没有 document_directory 属性的模拟上下文对象
207-
"""
208-
209-
class MockContextNoAttr:
210-
"""没有 document_directory 属性的模拟上下文类。"""
211-
212-
pass
213-
214-
return MockContextNoAttr()

0 commit comments

Comments
 (0)