Skip to content

Commit c0ee47c

Browse files
add new features (#49)
1 parent 6ae6d4a commit c0ee47c

59 files changed

Lines changed: 16341 additions & 5216 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 44 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@ wheels/
1919
*.egg-info/
2020
.installed.cfg
2121
*.egg
22-
data/*
22+
2323
# Virtual Environment
2424
venv/
2525
env/
2626
ENV/
27-
.venv
27+
.venv/
2828

2929
# IDE
3030
.vscode/
@@ -34,51 +34,69 @@ ENV/
3434
*~
3535
.DS_Store
3636

37+
# Claude Code
38+
.claude/
39+
3740
# API Keys - 중요!
3841
.env
39-
.env.local
40-
.env.development
41-
.env.test
42-
.env.production
43-
42+
.env.*
4443
apis/gemini_keys.yaml
45-
!apis/gemini_keys.yaml.template
44+
apis/*.yaml
45+
!apis/*-example.yaml
46+
!apis/*.template.yaml
4647

4748
# Logs
4849
*.log
50+
logs/
4951

5052
# Jupyter Notebook
51-
.ipynb_checkpoints
53+
.ipynb_checkpoints/
5254

5355
# pytest
5456
.pytest_cache/
5557
.coverage
58+
htmlcov/
5659

5760
# MyPy
5861
.mypy_cache/
5962
.dmypy.json
6063
dmypy.json
6164

62-
# database
65+
# Data - 원본 데이터
66+
data/
67+
68+
# Output - 생성된 결과물
69+
output/
70+
output_*/
71+
I_origin_*/
72+
73+
# Temp - 임시 파일
74+
temp/
75+
76+
# Archives
77+
*.zip
78+
*.tar.gz
79+
*.rar
80+
81+
# Generated JSON (except input templates)
82+
pipeline_output*.json
83+
qa_difficulty_analysis_*.json
84+
qa_for_review_*.json
85+
eval_results_*.json
86+
87+
# Keep input templates
88+
!test_*_input.json
89+
90+
# Database/Token
6391
info/
6492
token.json
65-
*.json
66-
test_input.json
6793

68-
# env
94+
# Docs (if generated)
6995
.bemad/
70-
docs/
71-
pipeline_ui/backend/checkpoints/*
72-
pipeline_ui/backend/output/*
73-
pipeline_ui/backend/uploads/*
7496

75-
# Frontend (Node.js)
76-
pipeline_ui/frontend/node_modules/*
97+
# Pipeline UI
98+
pipeline_ui/backend/checkpoints/
99+
pipeline_ui/backend/output/
100+
pipeline_ui/backend/uploads/
101+
pipeline_ui/frontend/node_modules/
77102
pipeline_ui/frontend/package-lock.json
78-
79-
80-
I_origin_0/*
81-
I_origin_1/*
82-
I_origin_2/*
83-
84-
output/*

capture_html_images.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
"""
2+
Capture HTML files from output_* directories as images using Playwright.
3+
"""
4+
import argparse
5+
import asyncio
6+
from pathlib import Path
7+
from typing import List
8+
9+
from playwright.async_api import async_playwright
10+
11+
12+
async def capture_html_file_async(
    html_path: Path,
    output_path: Path,
    width: int = 800,
) -> None:
    """Render one HTML file in headless Chromium and save it as an image.

    The file is read as UTF-8 before any browser is started, so an unreadable
    input fails fast without launching Chromium.

    Args:
        html_path: HTML document to render.
        output_path: Destination passed to Playwright's ``screenshot``.
        width: Viewport width in pixels; the viewport height is fixed at 600
            and the screenshot is taken with ``full_page=True``.
    """
    markup = html_path.read_text(encoding="utf-8")
    viewport = {"width": width, "height": 600}

    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        try:
            tab = await chromium.new_page(viewport=viewport)
            await tab.set_content(markup)
            await tab.screenshot(path=output_path, full_page=True)
        finally:
            # Shut the browser down even when rendering or capture fails.
            await chromium.close()
28+
29+
30+
async def capture_batch_async(
    html_files: List[Path],
    output_dir: Path,
    width: int = 800,
) -> None:
    """Capture multiple HTML files, reusing a single browser instance.

    Each ``<name>.html`` is rendered to ``output_dir / "<name>.png"``. Files
    whose PNG already exists are skipped. A failure on one file is reported
    and does not stop the remaining files from being processed.

    Args:
        html_files: HTML documents to render.
        output_dir: Directory that receives the PNG files.
        width: Viewport width in pixels; the viewport height is fixed at 600
            and screenshots are taken with ``full_page=True``.
    """
    # Nothing to render: avoid paying for a browser launch.
    if not html_files:
        return

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            for html_path in html_files:
                output_path = output_dir / f"{html_path.stem}.png"
                if output_path.exists():
                    print(f" [SKIP] {output_path.name} already exists")
                    continue

                try:
                    html_content = html_path.read_text(encoding="utf-8")
                    page = await browser.new_page(viewport={"width": width, "height": 600})
                    try:
                        await page.set_content(html_content)
                        await page.screenshot(path=output_path, full_page=True)
                    finally:
                        # Close the page on failure too, so a long batch with
                        # broken inputs does not accumulate open pages.
                        await page.close()
                    print(f" [OK] {html_path.name} -> {output_path.name}")
                except Exception as e:
                    # Deliberate best-effort: report and move on to the next file.
                    print(f" [ERROR] {html_path.name}: {e}")
        finally:
            await browser.close()
56+
57+
58+
def main():
    """Command-line entry point: render HTML files under output directories to PNGs.

    For every selected directory, each ``html/*.html`` file is captured to
    ``images/<stem>.png``. Directories come from ``--output-dirs`` (resolved
    against this script's directory) or, by default, every ``output_*``
    directory next to this script. Existing images are kept unless ``--force``
    is given.
    """
    parser = argparse.ArgumentParser(description="Capture HTML files as images")
    parser.add_argument(
        "--output-dirs",
        nargs="+",
        default=None,
        help="Specific output directories to process (e.g., output_academic output_finance)",
    )
    parser.add_argument(
        "--width",
        type=int,
        default=800,
        help="Viewport width for rendering (default: 800)",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Overwrite existing images",
    )
    args = parser.parse_args()

    base_dir = Path(__file__).parent

    # Find output_* directories
    if args.output_dirs:
        output_dirs = [base_dir / d for d in args.output_dirs]
    else:
        output_dirs = sorted(d for d in base_dir.glob("output_*") if d.is_dir())

    if not output_dirs:
        print("No output_* directories found.")
        return

    print(f"Found {len(output_dirs)} output directories to process")

    for output_dir in output_dirs:
        html_dir = output_dir / "html"
        if not html_dir.exists():
            print(f"\n[SKIP] {output_dir.name}: no html/ subdirectory")
            continue

        # Create images directory
        images_dir = output_dir / "images"
        images_dir.mkdir(exist_ok=True)

        html_files = sorted(html_dir.glob("*.html"))
        if not html_files:
            print(f"\n[SKIP] {output_dir.name}: no HTML files found")
            continue

        if args.force:
            # The batch capture skips any file whose PNG already exists, so
            # --force must remove stale images first or nothing is re-rendered.
            for html_file in html_files:
                stale_image = images_dir / f"{html_file.stem}.png"
                if stale_image.exists():
                    stale_image.unlink()
        else:
            # Filter out already processed files
            html_files = [
                f for f in html_files
                if not (images_dir / f"{f.stem}.png").exists()
            ]

        if not html_files:
            print(f"\n[SKIP] {output_dir.name}: all files already processed")
            continue

        print(f"\n[Processing] {output_dir.name}: {len(html_files)} HTML files")
        asyncio.run(capture_batch_async(html_files, images_dir, args.width))

    print("\nDone!")
124+
125+
126+
if __name__ == "__main__":
127+
main()

eval/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@
2828
evaluate_predictions,
2929
run_evaluation,
3030
)
31+
from .evaluate_vllm import (
32+
EvalConfig,
33+
load_qa_from_pipeline_output,
34+
evaluate_domain,
35+
evaluate_all_domains,
36+
DOMAIN_DIRS,
37+
)
3138

3239
__all__ = [
3340
# Dataset
@@ -50,4 +57,10 @@
5057
# Evaluate
5158
"evaluate_predictions",
5259
"run_evaluation",
60+
# vLLM Evaluate
61+
"EvalConfig",
62+
"load_qa_from_pipeline_output",
63+
"evaluate_domain",
64+
"evaluate_all_domains",
65+
"DOMAIN_DIRS",
5366
]

0 commit comments

Comments
 (0)