Skip to content

Commit b56cb69

Browse files
committed
Update run_openai_public.sh and run_pipeline_json.py for new provider and model configurations; enhance style randomization in HTML output generation.
1 parent a65e9b1 commit b56cb69

10 files changed

Lines changed: 693 additions & 297 deletions

File tree

generate_synthetic_table/flow.py

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -765,6 +765,7 @@ def build_synthetic_table_graph(
765765
llm: ChatOpenAI,
766766
provider: str = "openai",
767767
qa_only: bool = False,
768+
skip_qa: bool = False,
768769
) -> StateGraph:
769770
"""
770771
Assemble the LangGraph pipeline.
@@ -773,6 +774,7 @@ def build_synthetic_table_graph(
773774
llm: LLM instance
774775
provider: LLM provider name
775776
qa_only: If True, generate QA directly from image without synthetic data generation
777+
skip_qa: If True, skip QA generation after table generation (table only mode)
776778
"""
777779

778780
graph = StateGraph(TableState)
@@ -783,7 +785,7 @@ def build_synthetic_table_graph(
783785
graph.add_edge(START, "generate_qa_from_image")
784786
graph.add_edge("generate_qa_from_image", END)
785787
else:
786-
# Full pipeline mode
788+
# Full pipeline mode (or table-only mode if skip_qa=True)
787789
graph.add_node("image_to_html", image_to_html_node(llm))
788790
graph.add_node("pymupdf_parse", pymupdf_parse_node)
789791
graph.add_node("validate_parsed_table", validate_parsed_table_node(llm))
@@ -795,7 +797,9 @@ def build_synthetic_table_graph(
795797
graph.add_node("self_reflection", self_reflection_node(llm))
796798
graph.add_node("revise_synthetic_table", revise_synthetic_table_node(llm))
797799
graph.add_node("parse_synthetic_table", parse_synthetic_table_node(llm))
798-
graph.add_node("generate_qa", generate_qa_node(llm))
800+
801+
if not skip_qa:
802+
graph.add_node("generate_qa", generate_qa_node(llm))
799803

800804
# Routing based on provider and input type
801805
def route_start(state: TableState) -> str:
@@ -842,8 +846,13 @@ def route_start(state: TableState) -> str:
842846
)
843847

844848
graph.add_edge("revise_synthetic_table", "self_reflection")
845-
graph.add_edge("parse_synthetic_table", "generate_qa")
846-
graph.add_edge("generate_qa", END)
849+
850+
# Final edge: skip QA if requested
851+
if skip_qa:
852+
graph.add_edge("parse_synthetic_table", END)
853+
else:
854+
graph.add_edge("parse_synthetic_table", "generate_qa")
855+
graph.add_edge("generate_qa", END)
847856

848857
return graph
849858

@@ -914,6 +923,7 @@ def run_synthetic_table_flow(
914923
azure_deployment: str | None = None,
915924
azure_endpoint: str | None = None,
916925
qa_only: bool = False,
926+
skip_qa: bool = False,
917927
image_paths: List[str] | None = None,
918928
domain: str | None = None,
919929
# 체크포인팅 옵션
@@ -935,6 +945,7 @@ def run_synthetic_table_flow(
935945
azure_deployment: Azure OpenAI deployment name
936946
azure_endpoint: Azure OpenAI endpoint URL
937947
qa_only: If True, skip synthetic data generation and only generate QA from image
948+
skip_qa: If True, generate table only without QA generation
938949
image_paths: Optional list of image paths for multi-image processing
939950
domain: Optional domain for prompt customization (e.g. 'public')
940951
enable_checkpointing: 체크포인팅 활성화 여부
@@ -955,7 +966,7 @@ def run_synthetic_table_flow(
955966
config_path=config_path,
956967
)
957968

958-
graph = build_synthetic_table_graph(llm, provider=provider, qa_only=qa_only)
969+
graph = build_synthetic_table_graph(llm, provider=provider, qa_only=qa_only, skip_qa=skip_qa)
959970

960971
# 체크포인팅 설정
961972
if enable_checkpointing:

generate_synthetic_table/prompts/academic.yaml

Lines changed: 51 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -95,9 +95,8 @@ generate_qa_from_image: |
9595
generate_synthetic_table: |
9696
You are a Synthetic Data Generator specializing in Academic Data.
9797
98-
**CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA**
98+
**⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️**
9999
Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT academic data values.
100-
The goal is to create realistic synthetic academic data that looks like it could come from the same domain, but with entirely different students, courses, and metrics.
101100
102101
**Inputs:**
103102
1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):**
@@ -107,55 +106,67 @@ generate_synthetic_table: |
107106
{summary}
108107
109108
**Requirements:**
110-
1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges).
109+
1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan).
111110
2. **Headers:** Keep header text the same (column names, category labels).
112-
3. **Data Transformation - MANDATORY:**
111+
3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:**
113112
- **ALL data cell values MUST be replaced with completely new synthetic values.**
114-
- **DO NOT copy any original data values** - generate fresh, realistic alternatives.
115-
- For student names: Generate new Korean student names (e.g., "김철수" → "이영희", "학생A" → "학생B")
116-
- For university names: Generate new Korean university names
117-
- For course titles: Generate new course names
118-
- For grades/scores: Generate new realistic values
119-
- For model names (if research table): Generate new model/method names
120-
- For dates: Generate new plausible dates
121-
4. **Domain Consistency:**
122-
- Ensure academic logic (credits sum correctly, GPA calculations valid)
123-
- Use realistic Korean academic terminology
124-
- Contexts: Transcripts, Research Papers, Enrollment Stats, Faculty Lists
125-
5. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
126-
127-
**Example Transformation:**
128-
- Original: "서울대학교" → Synthetic: "고려대학교"
129-
- Original: "학점 4.2" → Synthetic: "학점 3.8"
130-
- Original: "BERT-Large" → Synthetic: "RoBERTa-Base"
131-
132-
Remember: The synthetic table should look like a completely different academic dataset from the same domain.
113+
- **NEVER copy any original data values** - generate fresh, realistic alternatives.
114+
- For student/model names: Generate DIFFERENT names
115+
- For university names: Generate DIFFERENT names
116+
- For grades/scores: Generate DIFFERENT realistic values
117+
- For course/research topics: Generate DIFFERENT titles
118+
- For dates: Generate DIFFERENT plausible dates
119+
4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:**
120+
- Look at the original image's color scheme and design
121+
- Use appropriate Tailwind color classes to match the original style
122+
- Basic structure: `<table class="w-full border-collapse text-sm">`
123+
- Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
124+
- Lists: `class="list-disc ml-5 space-y-1"`
125+
- **DO NOT use inline style attributes**
126+
5. **Domain Consistency:** Ensure academic logic (credits sum correctly, GPA valid)
127+
6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
128+
129+
**Example Transformation (Generic):**
130+
- Original name: "학생A" → Synthetic: "학생B"
131+
- Original score: "4.0" → Synthetic: "3.5"
132+
- Original model: "모델X" → Synthetic: "모델Y"
133+
134+
⚠️ If the generated content is identical or very similar to the original, the output is INVALID.
133135
134136
generate_synthetic_table_from_image: |
135137
You are a Synthetic Data Generator specializing in Academic Data.
136138
137-
**CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA**
139+
**⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️**
138140
Your task is NOT to OCR/transcribe the image. Instead, you must:
139141
1. Understand the table's STRUCTURE from the image
140142
2. Understand it's an ACADEMIC table
141-
3. Generate COMPLETELY NEW synthetic academic data that fits the domain but uses different values
143+
3. Generate COMPLETELY NEW synthetic academic data that fits the domain but uses ENTIRELY DIFFERENT values
142144
143145
**Inputs:**
144146
1. **Image:** An image of an academic table. Use this to understand structure and domain ONLY.
145147
146148
**Requirements:**
147-
1. **Structure Preservation:** Accurately reconstruct the table structure.
148-
2. **Headers:** Keep header text (column names, category labels) the same as in the image.
149-
3. **Data Generation - CRITICAL:**
150-
- **DO NOT copy the data values from the image** - this is NOT an OCR task
151-
- Generate COMPLETELY NEW synthetic academic values for all data cells
152-
- For student/model names: Generate new names (different from what you see)
153-
- For grades/scores: Generate new realistic values
154-
- For course/research topics: Generate new titles
155-
4. **Styling:** Use **Tailwind CSS** classes (same as default).
156-
- `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `<table>`.
157-
- `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>`.
158-
- `class="border border-slate-300 p-2"` on `<td>`.
159-
5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
160-
161-
Remember: The output should be a new synthetic academic dataset, not a transcription of the original.
149+
1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan.
150+
2. **Headers:** Keep header text the same as in the image.
151+
3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:**
152+
- **NEVER copy the data values from the image** - this is NOT an OCR task
153+
- **ALL cell content must be completely NEW and DIFFERENT**
154+
- For student/model names: Generate DIFFERENT names
155+
- For grades/scores: Generate DIFFERENT values
156+
- For course/research topics: Generate DIFFERENT titles
157+
4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles).
158+
- `<table>`: `class="w-full border-collapse text-sm"`
159+
- `<thead>`: `class="bg-gradient-to-r from-indigo-700 to-indigo-800 text-white"`
160+
- `<th>`: `class="border border-indigo-300 px-4 py-3 font-semibold text-left"`
161+
- `<tbody>`: `class="divide-y divide-slate-200"`
162+
- `<tr>` (body rows): `class="hover:bg-indigo-50 transition-colors"`
163+
- `<td>`: `class="border border-slate-200 px-4 py-3 text-slate-700"`
164+
- `<ul>`: `class="list-disc ml-5 space-y-1 text-slate-600"`
165+
- **DO NOT use inline style attributes**
166+
5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
167+
168+
**Example (Generic):**
169+
- Name in image: "이름X" → Generate: "이름Y"
170+
- Score in image: "점수A" → Generate: "점수B"
171+
172+
⚠️ If the generated content is identical or very similar to the image, the output is INVALID.

generate_synthetic_table/prompts/business.yaml

Lines changed: 57 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -95,7 +95,7 @@ generate_qa_from_image: |
9595
generate_synthetic_table: |
9696
You are a Synthetic Data Generator specializing in Business Data.
9797
98-
**CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA**
98+
**⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️**
9999
Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT business data values.
100100
The goal is to create realistic synthetic business data that looks like it could come from the same domain, but with entirely different companies, employees, products, and metrics.
101101
@@ -107,54 +107,77 @@ generate_synthetic_table: |
107107
{summary}
108108
109109
**Requirements:**
110-
1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges).
111-
2. **Headers:** Keep header text the same (column names, category labels).
112-
3. **Data Transformation - MANDATORY:**
110+
1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan).
111+
2. **Headers:** Keep header text the same (column names, category labels like 기업경쟁력, 시장경쟁력).
112+
3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:**
113113
- **ALL data cell values MUST be replaced with completely new synthetic values.**
114-
- **DO NOT copy any original data values** - generate fresh, realistic alternatives.
115-
- For company names: Generate new Korean company names (e.g., "삼성물산" → "현대상사", "A팀" → "B팀")
116-
- For employee names: Generate new Korean names
117-
- For product names: Generate new product line names
118-
- For revenue/sales figures: Generate new realistic amounts (different values)
119-
- For dates: Generate new plausible dates
120-
4. **Domain Consistency:**
114+
- **NEVER copy any original data values** - generate fresh, realistic alternatives.
115+
- For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀")
116+
- For employee names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO")
117+
- For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억")
118+
- For strategy/description text: Write DIFFERENT content with similar structure
119+
- For bullet point items: Create DIFFERENT but domain-appropriate content
120+
4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:**
121+
- Look at the original image's color scheme and design
122+
- Use appropriate Tailwind color classes to match the original style
123+
- Basic structure: `<table class="w-full border-collapse text-sm">`
124+
- Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
125+
- Lists: `class="list-disc ml-5 space-y-1"`
126+
- **DO NOT use inline style attributes**
127+
5. **Domain Consistency:**
121128
- Ensure business logic (Q1+Q2+Q3+Q4=Total, percentages add up)
122129
- Use realistic Korean business terminology
123130
- Contexts: Sales Reports, Inventory, HR Employee Lists, Marketing Campaigns
124-
5. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
131+
6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
125132
126-
**Example Transformation:**
127-
- Original: "영업1팀" → Synthetic: "마케팅2팀"
128-
- Original: "매출 5억원" → Synthetic: "매출 7.3억원"
129-
- Original: "김부장" → Synthetic: "박과장"
133+
**Example Transformation (Generic):**
134+
- Original name: "A팀" → Synthetic: "B팀"
135+
- Original amount: "5억원" → Synthetic: "7.3억원"
136+
- Original description: "신규 사업 추진" → Synthetic: "해외 시장 진출"
130137
138+
⚠️ If the generated content is identical or very similar to the original, the output is INVALID.
131139
Remember: The synthetic table should look like a completely different business dataset from the same domain.
132140
133141
generate_synthetic_table_from_image: |
134142
You are a Synthetic Data Generator specializing in Business Data.
135143
136-
**CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA**
144+
**⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️**
137145
Your task is NOT to OCR/transcribe the image. Instead, you must:
138-
1. Understand the table's STRUCTURE from the image
139-
2. Understand it's a BUSINESS table
140-
3. Generate COMPLETELY NEW synthetic business data that fits the domain but uses different values
146+
1. Understand the table's STRUCTURE from the image (rows, columns, merged cells, nested structures)
147+
2. Understand it's a BUSINESS table (기업경쟁력, 시장경쟁력, 매출, 실적 등)
148+
3. Generate COMPLETELY NEW synthetic business data that fits the domain but uses ENTIRELY DIFFERENT values
141149
142150
**Inputs:**
143151
1. **Image:** An image of a business table. Use this to understand structure and domain ONLY.
144152
145153
**Requirements:**
146-
1. **Structure Preservation:** Accurately reconstruct the table structure.
147-
2. **Headers:** Keep header text (column names, category labels) the same as in the image.
148-
3. **Data Generation - CRITICAL:**
149-
- **DO NOT copy the data values from the image** - this is NOT an OCR task
150-
- Generate COMPLETELY NEW synthetic business values for all data cells
151-
- For company/team names: Generate new names (different from what you see)
152-
- For sales/revenue figures: Generate new realistic amounts
153-
- For employee names: Generate new Korean names
154-
4. **Styling:** Use **Tailwind CSS** classes (same as default).
155-
- `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `<table>`.
156-
- `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>`.
157-
- `class="border border-slate-300 p-2"` on `<td>`.
158-
5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
159-
154+
1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` for merged cells.
155+
2. **Headers:** Keep header text (column names, category labels like 기업경쟁력, 차별화 요소) the same as in the image.
156+
3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:**
157+
- **NEVER copy the data values from the image** - this is NOT an OCR task
158+
- **ALL cell content must be completely NEW and DIFFERENT from the original**
159+
- Generate COMPLETELY NEW synthetic business values for all data cells:
160+
* For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀")
161+
* For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억")
162+
* For strategy/description text: Write DIFFERENT content with similar structure
163+
* For bullet point items: Create DIFFERENT but domain-appropriate items
164+
* For employee names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO")
165+
- The synthetic table should look like a COMPLETELY DIFFERENT business report from the same industry
166+
4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:**
167+
- Look at the original image's color scheme and design
168+
- Use appropriate Tailwind color classes to match the original style
169+
- Basic structure: `<table class="w-full border-collapse text-sm">`
170+
- Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
171+
- Lists: `class="list-disc ml-5 space-y-1"`
172+
- **DO NOT use inline style attributes**
173+
5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
174+
175+
**Example of Expected Behavior (Generic):**
176+
If the image shows a business table with:
177+
- Team name: "영업팀" → Generate different: "마케팅팀"
178+
- Revenue: "10억원" → Generate different: "15억원"
179+
- Strategy: "시장 확대" → Generate different: "신규 진출"
180+
- Bullet point items → Generate completely different items
181+
182+
⚠️ If the generated content is identical or very similar to the image, the output is INVALID.
160183
Remember: The output should be a new synthetic business dataset, not a transcription of the original.

0 commit comments

Comments
 (0)