document-analysis-streamlit/main.py at master · CynicalHeart/document-analysis-streamlit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
import streamlit as st
import asyncio
from app.services.ai_processor import process_document, process_document_stream
from app.utils.markitdown_converter import converter as markitdown_converter
from app.utils.document_converter import DocumentConverter
from app.config import settings
from app.services.image_service import ImageService

# Page configuration (has to be the very first Streamlit call).
st.set_page_config(
    page_title="智能文档分析工具",
    page_icon="📄",
    layout="wide",
)

# Page title.
st.title("📄 智能文档分析工具")

# Sidebar configuration.
with st.sidebar:
    st.header("配置")

    # AI model used for the analysis.
    model_name = st.selectbox(
        label="选择AI模型",
        options=["deepseek", "openrouter"],
        help="选择用于分析文档的AI模型",
    )

    # How the analysis result is returned (streamed text or a JSON object).
    response_mode = st.radio(
        label="响应模式",
        options=["流式响应", "JSON标准响应"],
        help="选择结果的返回格式",
    )

    # Converter used when the raw document is turned into Markdown.
    converter_type = st.radio(
        label="选择转换器",
        options=["markitdown", "markdown"],
        help="选择用于转换文档的转换器",
    )

    # AI generation parameters; bounds and defaults come from the settings.
    st.subheader("AI参数配置")
    ai_defaults = settings.DEFAULT_AI_PARAMS

    temperature_bounds = ai_defaults["temperature_range"]
    temperature = st.slider(
        label="Temperature",
        min_value=temperature_bounds[0],
        max_value=temperature_bounds[1],
        value=ai_defaults["temperature"],
        step=0.1,
        help="控制输出的随机性，值越高越有创造性",
    )

    max_tokens_bounds = ai_defaults["max_tokens_range"]
    max_tokens = st.slider(
        label="Max Tokens",
        min_value=max_tokens_bounds[0],
        max_value=max_tokens_bounds[1],
        value=ai_defaults["max_tokens"],
        step=100,
        help="控制生成文本的最大长度",
    )

# File upload widget (a single .docx or .pdf file).
uploaded_file = st.file_uploader(
    "上传需求文档",
    type=["docx", "pdf"],
    help="支持上传Word文档（.docx格式）/ PDF文档（.pdf格式），文件大小不超过10MB",
    accept_multiple_files=False,
)

# Session keys that hold results derived from the previously uploaded file.
_STALE_RESULT_KEYS = ("analysis_result", "has_analysis", "image_links")

# Detect whether the uploaded file changed since the previous run.
if uploaded_file is not None:
    # The name alone is not enough: a different document uploaded under the
    # same name would otherwise keep the old file's results. Comparing the
    # size as well catches that case.
    same_name = (
        "current_file" in st.session_state
        and st.session_state.current_file == uploaded_file.name
    )
    same_size = (
        "current_file_size" in st.session_state
        and st.session_state.current_file_size == uploaded_file.size
    )
    if not (same_name and same_size):
        # Drop every result that belongs to the previous file.
        for stale_key in _STALE_RESULT_KEYS:
            if stale_key in st.session_state:
                del st.session_state[stale_key]
        # Remember the identity of the file now being worked on.
        st.session_state.current_file = uploaded_file.name
        st.session_state.current_file_size = uploaded_file.size

# Analysis, image-extraction and Markdown-conversion actions for the upload.

# Largest upload accepted for processing (10 MB).
MAX_UPLOAD_BYTES = 10 * 1024 * 1024


def _detect_file_type(filename):
    """Return "pdf" for a name ending in .pdf (any letter case), else "docx"."""
    return "pdf" if filename.lower().endswith(".pdf") else "docx"


def _base_name(filename):
    """Return *filename* without its final extension, keeping inner dots."""
    stem, dot, _extension = filename.rpartition(".")
    return stem if dot else filename


def _analysis_to_markdown(result):
    """Render the JSON analysis result as the Markdown report text.

    Tags are converted with ``str`` and a missing or ``None`` tag list is
    treated as empty, so an unexpected tag value cannot break the join.
    """
    tags_line = ", ".join(str(tag) for tag in result.get("tags") or [])
    return (
        "# 文档分析结果\n"
        "\n"
        "## 标签\n"
        f"{tags_line}\n"
        "\n"
        "## 摘要\n"
        f"{result.get('summary', '')}\n"
        "\n"
        "## 详细分析\n"
        f"{result.get('analysis', '')}\n"
    )


if uploaded_file is not None:
    # Enforce the upload size limit before doing any work.
    if uploaded_file.size > MAX_UPLOAD_BYTES:
        st.error("文件大小超过10MB限制，请上传较小的文件。")
    else:
        file_type = _detect_file_type(uploaded_file.name)
        base_name = _base_name(uploaded_file.name)

        # "Start analysis" button.
        if st.button("开始分析", type="primary"):
            # Placeholder that receives the JSON result in JSON mode.
            result_placeholder = st.empty()

            try:
                # Rewind so the document is read from its first byte even if
                # the upload object was already consumed earlier.
                uploaded_file.seek(0)

                if response_mode == "流式响应":
                    # Streaming mode: re-render the accumulated text per chunk.
                    async def process_stream():
                        """Display chunks as they arrive; return the full text."""
                        result_text = ""
                        with st.container():
                            markdown_container = st.empty()
                            async for chunk in process_document_stream(
                                uploaded_file,
                                model_name,
                                file_type,
                                temperature,
                                max_tokens,
                            ):
                                result_text += chunk
                                # NOTE(review): unsafe_allow_html=True makes
                                # Streamlit render raw HTML found in the model
                                # output instead of escaping it; Markdown
                                # renders without it. Kept to preserve current
                                # rendering - confirm it is really needed,
                                # since the text derives from an uploaded
                                # document.
                                markdown_container.markdown(
                                    result_text, unsafe_allow_html=True
                                )
                        return result_text

                    result_text = asyncio.run(process_stream())

                    # Keep the result so the conversion button can offer it.
                    st.session_state.analysis_result = result_text
                    st.session_state.has_analysis = True

                else:
                    # Standard JSON mode: one structured result.
                    result = asyncio.run(
                        process_document(
                            uploaded_file,
                            model_name,
                            file_type,
                            temperature,
                            max_tokens,
                        )
                    )
                    result_placeholder.json(result)

                    # Keep a Markdown rendering of the result for download.
                    st.session_state.analysis_result = _analysis_to_markdown(result)
                    st.session_state.has_analysis = True

            except Exception as e:
                st.error(f"分析过程中出错: {str(e)}")

        # "Extract images" button.
        if st.button("提取图片", type="secondary"):
            try:
                image_service = ImageService()
                try:
                    uploaded_file.seek(0)
                    if file_type == "docx":
                        images = image_service.extract_images_from_docx(uploaded_file)
                    else:
                        images = image_service.extract_images_from_pdf(uploaded_file)

                    if images:
                        # Build a Markdown document with one image link per image.
                        image_markdown = "# 文档图片链接\n\n" + "".join(
                            f"![{image['original_name']}]({image['url']})\n\n"
                            for image in images
                        )

                        # Keep the links in the session state.
                        st.session_state.image_links = image_markdown

                        st.info(f"成功提取并上传 {len(images)} 张图片到图床")

                        # NOTE: this download button is only created during the
                        # run triggered by the click above.
                        st.download_button(
                            label="下载图片链接文件",
                            data=image_markdown,
                            file_name=f"{base_name}_images.md",
                            mime="text/markdown",
                        )
                    else:
                        st.info("文档中没有找到图片")
                finally:
                    # Always remove temporary images, also when extraction
                    # failed part-way through.
                    image_service.clear_temp_images()

            except Exception as e:
                st.error(f"提取图片过程中出错: {str(e)}")

        # "Convert to Markdown" button.
        if st.button("转换为Markdown", type="secondary"):
            try:
                if (
                    "has_analysis" in st.session_state
                    and st.session_state.has_analysis
                ):
                    # An analysis exists for this file: offer it for download.
                    markdown_text = st.session_state.analysis_result
                    file_name = f"{base_name}_analysis.md"
                else:
                    # No analysis yet: convert the original document instead.
                    uploaded_file.seek(0)
                    if converter_type == "markitdown":
                        markdown_text = markitdown_converter.convert_to_markdown(
                            uploaded_file, file_type
                        )
                    else:
                        converter = DocumentConverter()
                        markdown_text = converter.convert_to_markdown(
                            uploaded_file, file_type
                        )
                    file_name = f"{base_name}.md"

                # Offer the Markdown text as a download.
                st.download_button(
                    label="下载Markdown文件",
                    data=markdown_text,
                    file_name=file_name,
                    mime="text/markdown",
                )

                st.info("文档转换成功，请点击下载按钮获取文件。")

            except Exception as e:
                st.error(f"转换过程中出错: {str(e)}")

# Footer: a divider followed by the static help text shown on every run.
USAGE_NOTES = """
### 使用说明
1. 上传Word文档（.docx格式）/ PDF文档（.pdf格式）
2. 在侧边栏选择AI模型和响应模式
3. 点击"开始分析"按钮进行文档分析
4. 点击"提取图片"按钮提取文档中的图片
5. 点击"转换为Markdown"按钮进行格式转换

### 功能说明
- **高频词提取**：自动提取文档中的高频关键词
- **内容摘要**：生成文档的简洁摘要
- **需求理解**：分析文档的主要内容和关键需求
- **格式转换**：将文档转换为Markdown格式
- **图片处理**：提取文档中的图片并转换为在线链接

### 响应模式说明
- **流式响应**：实时显示分析结果，适合查看详细分析过程
- **JSON标准响应**：返回结构化数据，适合程序化处理

### 转换器说明
- **markitdown**：使用markitdown库进行转换，支持更多格式和样式
- **markdown**：使用标准markdown库进行转换，支持图片处理

### AI参数说明
- **Temperature**：控制输出的随机性，值越高越有创造性
- **Max Tokens**：控制生成文本的最大长度
"""

st.markdown("---")
st.markdown(USAGE_NOTES)