Skip to content

Commit 6ae6d4a

Browse files
Merge pull request #47 from Pseudo-Lab/feature/urgentpatchforsynthetictable-issue
some fixes
2 parents 3147acf + b56cb69 commit 6ae6d4a

10 files changed

Lines changed: 786 additions & 171 deletions

File tree

generate_synthetic_table/flow.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,7 @@ def build_synthetic_table_graph(
765765
llm: ChatOpenAI,
766766
provider: str = "openai",
767767
qa_only: bool = False,
768+
skip_qa: bool = False,
768769
) -> StateGraph:
769770
"""
770771
Assemble the LangGraph pipeline.
@@ -773,6 +774,7 @@ def build_synthetic_table_graph(
773774
llm: LLM instance
774775
provider: LLM provider name
775776
qa_only: If True, generate QA directly from image without synthetic data generation
777+
skip_qa: If True, skip QA generation after table generation (table-only mode); has no effect when qa_only is True
776778
"""
777779

778780
graph = StateGraph(TableState)
@@ -783,7 +785,7 @@ def build_synthetic_table_graph(
783785
graph.add_edge(START, "generate_qa_from_image")
784786
graph.add_edge("generate_qa_from_image", END)
785787
else:
786-
# Full pipeline mode
788+
# Full pipeline mode (or table-only mode if skip_qa=True)
787789
graph.add_node("image_to_html", image_to_html_node(llm))
788790
graph.add_node("pymupdf_parse", pymupdf_parse_node)
789791
graph.add_node("validate_parsed_table", validate_parsed_table_node(llm))
@@ -795,7 +797,9 @@ def build_synthetic_table_graph(
795797
graph.add_node("self_reflection", self_reflection_node(llm))
796798
graph.add_node("revise_synthetic_table", revise_synthetic_table_node(llm))
797799
graph.add_node("parse_synthetic_table", parse_synthetic_table_node(llm))
798-
graph.add_node("generate_qa", generate_qa_node(llm))
800+
801+
if not skip_qa:
802+
graph.add_node("generate_qa", generate_qa_node(llm))
799803

800804
# Routing based on provider and input type
801805
def route_start(state: TableState) -> str:
@@ -842,8 +846,13 @@ def route_start(state: TableState) -> str:
842846
)
843847

844848
graph.add_edge("revise_synthetic_table", "self_reflection")
845-
graph.add_edge("parse_synthetic_table", "generate_qa")
846-
graph.add_edge("generate_qa", END)
849+
850+
# Final edge: skip QA if requested
851+
if skip_qa:
852+
graph.add_edge("parse_synthetic_table", END)
853+
else:
854+
graph.add_edge("parse_synthetic_table", "generate_qa")
855+
graph.add_edge("generate_qa", END)
847856

848857
return graph
849858

@@ -914,6 +923,7 @@ def run_synthetic_table_flow(
914923
azure_deployment: str | None = None,
915924
azure_endpoint: str | None = None,
916925
qa_only: bool = False,
926+
skip_qa: bool = False,
917927
image_paths: List[str] | None = None,
918928
domain: str | None = None,
919929
# 체크포인팅 옵션
@@ -935,6 +945,7 @@ def run_synthetic_table_flow(
935945
azure_deployment: Azure OpenAI deployment name
936946
azure_endpoint: Azure OpenAI endpoint URL
937947
qa_only: If True, skip synthetic data generation and only generate QA from image
948+
skip_qa: If True, generate only the synthetic table and skip QA generation (ignored when qa_only is True)
938949
image_paths: Optional list of image paths for multi-image processing
939950
domain: Optional domain for prompt customization (e.g. 'public')
940951
enable_checkpointing: 체크포인팅 활성화 여부
@@ -955,7 +966,7 @@ def run_synthetic_table_flow(
955966
config_path=config_path,
956967
)
957968

958-
graph = build_synthetic_table_graph(llm, provider=provider, qa_only=qa_only)
969+
graph = build_synthetic_table_graph(llm, provider=provider, qa_only=qa_only, skip_qa=skip_qa)
959970

960971
# 체크포인팅 설정
961972
if enable_checkpointing:

generate_synthetic_table/prompts/academic.yaml

Lines changed: 62 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -94,37 +94,79 @@ generate_qa_from_image: |
9494
9595
generate_synthetic_table: |
9696
You are a Synthetic Data Generator specializing in Academic Data.
97-
Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic academic data.
97+
98+
**⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️**
99+
Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT academic data values.
98100
99101
**Inputs:**
100-
1. **Original Table Structure:**
102+
1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):**
101103
{html}
102104
103-
2. **Table Summary:**
105+
2. **Table Summary (describes the data patterns to follow):**
104106
{summary}
105107
106108
**Requirements:**
107-
1. **Structure:** Keep the exact same HTML structure.
108-
2. **Data:** Replace ALL cell values with new, synthetic academic data.
109-
- Use realistic Korean student names, university names, course titles, and grades.
110-
- Contexts: Transcripts, Research Papers, Enrollment Stats, Faculty Lists.
111-
- Do NOT use real private data.
112-
3. **Consistency:** Ensure mathematical consistency (e.g., sum of credits, correct GPA calculations if visible).
113-
4. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
109+
1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan).
110+
2. **Headers:** Keep header text the same (column names, category labels).
111+
3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:**
112+
- **ALL data cell values MUST be replaced with completely new synthetic values.**
113+
- **NEVER copy any original data values** - generate fresh, realistic alternatives.
114+
- For student/model names: Generate DIFFERENT names
115+
- For university names: Generate DIFFERENT names
116+
- For grades/scores: Generate DIFFERENT realistic values
117+
- For course/research topics: Generate DIFFERENT titles
118+
- For dates: Generate DIFFERENT plausible dates
119+
4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original table's visual style:**
120+
- Look at the original table's color scheme and design (from the HTML above, or from the source image if one is provided)
121+
- Use appropriate Tailwind color classes to match the original style
122+
- Basic structure: `<table class="w-full border-collapse text-sm">`
123+
- Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
124+
- Lists: `class="list-disc ml-5 space-y-1"`
125+
- **DO NOT use inline style attributes**
126+
5. **Domain Consistency:** Ensure academic logic (credits sum correctly, GPA valid)
127+
6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
128+
129+
**Example Transformation (Generic):**
130+
- Original name: "학생A" → Synthetic: "학생B"
131+
- Original score: "4.0" → Synthetic: "3.5"
132+
- Original model: "모델X" → Synthetic: "모델Y"
133+
134+
⚠️ If the generated content is identical or very similar to the original, the output is INVALID.
114135
115136
generate_synthetic_table_from_image: |
116137
You are a Synthetic Data Generator specializing in Academic Data.
117-
Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic academic data.
138+
139+
**⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️**
140+
Your task is NOT to OCR/transcribe the image. Instead, you must:
141+
1. Understand the table's STRUCTURE from the image
142+
2. Understand it's an ACADEMIC table
143+
3. Generate COMPLETELY NEW synthetic academic data that fits the domain but uses ENTIRELY DIFFERENT values
118144
119145
**Inputs:**
120-
1. **Image:** An image of an academic table.
146+
1. **Image:** An image of an academic table. Use this to understand structure and domain ONLY.
121147
122148
**Requirements:**
123-
1. **Structure Preservation:** Accurately reconstruct the table structure.
124-
2. **Data Generation:** Replace ALL cell values with new, synthetic academic data.
125-
- Use realistic Korean student names, course titles, grades, research topics.
126-
3. **Styling:** Use **Tailwind CSS** classes (same as default).
127-
- `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `<table>`.
128-
- `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>`.
129-
- `class="border border-slate-300 p-2"` on `<td>`.
130-
4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
149+
1. **Structure Preservation:** Accurately reconstruct the table structure, including rowspan/colspan.
150+
2. **Headers:** Keep header text the same as in the image.
151+
3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:**
152+
- **NEVER copy the data values from the image** - this is NOT an OCR task
153+
- **ALL cell content must be completely NEW and DIFFERENT**
154+
- For student/model names: Generate DIFFERENT names
155+
- For grades/scores: Generate DIFFERENT values
156+
- For course/research topics: Generate DIFFERENT titles
157+
4. **Styling:** Use **Tailwind CSS** classes exclusively (NO inline styles).
158+
- `<table>`: `class="w-full border-collapse text-sm"`
159+
- `<thead>`: `class="bg-gradient-to-r from-indigo-700 to-indigo-800 text-white"`
160+
- `<th>`: `class="border border-indigo-300 px-4 py-3 font-semibold text-left"`
161+
- `<tbody>`: `class="divide-y divide-slate-200"`
162+
- `<tr>` (body rows): `class="hover:bg-indigo-50 transition-colors"`
163+
- `<td>`: `class="border border-slate-200 px-4 py-3 text-slate-700"`
164+
- `<ul>`: `class="list-disc ml-5 space-y-1 text-slate-600"`
165+
- **DO NOT use inline style attributes**
166+
5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
167+
168+
**Example (Generic):**
169+
- Name in image: "이름X" → Generate: "이름Y"
170+
- Score in image: "점수A" → Generate: "점수B"
171+
172+
⚠️ If the generated content is identical or very similar to the image, the output is INVALID.

generate_synthetic_table/prompts/business.yaml

Lines changed: 73 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -94,37 +94,90 @@ generate_qa_from_image: |
9494
9595
generate_synthetic_table: |
9696
You are a Synthetic Data Generator specializing in Business Data.
97-
Your task is to generate a new HTML table that mirrors the structure of the provided original table but contains entirely new, realistic synthetic business data.
97+
98+
**⚠️ CRITICAL INSTRUCTION: DO NOT COPY ORIGINAL DATA ⚠️**
99+
Your task is to generate a new HTML table with the SAME STRUCTURE as the original but COMPLETELY DIFFERENT business data values.
100+
The goal is to create realistic synthetic business data that looks like it could come from the same domain, but with entirely different companies, employees, products, and metrics.
98101
99102
**Inputs:**
100-
1. **Original Table Structure:**
103+
1. **Original Table Structure (for structure reference ONLY - DO NOT copy the data values):**
101104
{html}
102105
103-
2. **Table Summary:**
106+
2. **Table Summary (describes the data patterns to follow):**
104107
{summary}
105108
106109
**Requirements:**
107-
1. **Structure:** Keep the exact same HTML structure.
108-
2. **Data:** Replace ALL cell values with new, synthetic business data.
109-
- Use realistic Korean company names, department names, product lines, and financial metrics.
110-
- Contexts: Sales Reports, Inventory, HR Employee Lists, Marketing Campaigns.
111-
- Do NOT use real private data.
112-
3. **Consistency:** Ensure mathematical consistency (e.g., Q1 + Q2 + Q3 + Q4 = Total).
113-
4. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
110+
1. **Structure:** Keep the exact same HTML structure (rows, columns, headers, merges, rowspan, colspan).
111+
2. **Headers:** Keep header text the same (column names, category labels like 기업경쟁력, 시장경쟁력).
112+
3. **⚠️ Data Transformation - ABSOLUTELY MANDATORY ⚠️:**
113+
- **ALL data cell values MUST be replaced with completely new synthetic values.**
114+
- **NEVER copy any original data values** - generate fresh, realistic alternatives.
115+
- For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀")
116+
- For employee names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO")
117+
- For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억")
118+
- For strategy/description text: Write DIFFERENT content with similar structure
119+
- For bullet point items: Create DIFFERENT but domain-appropriate content
120+
4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original table's visual style:**
121+
- Look at the original table's color scheme and design (from the HTML above, or from the source image if one is provided)
122+
- Use appropriate Tailwind color classes to match the original style
123+
- Basic structure: `<table class="w-full border-collapse text-sm">`
124+
- Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
125+
- Lists: `class="list-disc ml-5 space-y-1"`
126+
- **DO NOT use inline style attributes**
127+
5. **Domain Consistency:**
128+
- Ensure business logic (Q1+Q2+Q3+Q4=Total, percentages add up)
129+
- Use realistic Korean business terminology
130+
- Contexts: Sales Reports, Inventory, HR Employee Lists, Marketing Campaigns
131+
6. **Output:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
132+
133+
**Example Transformation (Generic):**
134+
- Original name: "A팀" → Synthetic: "B팀"
135+
- Original amount: "5억원" → Synthetic: "7.3억원"
136+
- Original description: "신규 사업 추진" → Synthetic: "해외 시장 진출"
137+
138+
⚠️ If the generated content is identical or very similar to the original, the output is INVALID.
139+
Remember: The synthetic table should look like a completely different business dataset from the same domain.
114140
115141
generate_synthetic_table_from_image: |
116142
You are a Synthetic Data Generator specializing in Business Data.
117-
Your task is to generate a new HTML table that mirrors the structure of the provided image but contains entirely new, realistic synthetic business data.
143+
144+
**⚠️ CRITICAL INSTRUCTION: DO NOT TRANSCRIBE - GENERATE NEW DATA ⚠️**
145+
Your task is NOT to OCR/transcribe the image. Instead, you must:
146+
1. Understand the table's STRUCTURE from the image (rows, columns, merged cells, nested structures)
147+
2. Understand it's a BUSINESS table (기업경쟁력, 시장경쟁력, 매출, 실적 등)
148+
3. Generate COMPLETELY NEW synthetic business data that fits the domain but uses ENTIRELY DIFFERENT values
118149
119150
**Inputs:**
120-
1. **Image:** An image of a business table.
151+
1. **Image:** An image of a business table. Use this to understand structure and domain ONLY.
121152
122153
**Requirements:**
123-
1. **Structure Preservation:** Accurately reconstruct the table structure.
124-
2. **Data Generation:** Replace ALL cell values with new, synthetic business data.
125-
- Use realistic Korean company names, products, sales figures.
126-
3. **Styling:** Use **Tailwind CSS** classes (same as default).
127-
- `class="border-collapse border border-slate-400 w-full text-sm text-left rtl:text-right text-gray-500"` on `<table>`.
128-
- `class="border border-slate-300 p-2 bg-gray-50 font-semibold"` on `<th>`.
129-
- `class="border border-slate-300 p-2"` on `<td>`.
130-
4. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`.
154+
1. **Structure Preservation:** Accurately reconstruct the table structure, including `rowspan` and `colspan` for merged cells.
155+
2. **Headers:** Keep header text (column names, category labels like 기업경쟁력, 차별화 요소) the same as in the image.
156+
3. **⚠️ Data Generation - ABSOLUTELY CRITICAL ⚠️:**
157+
- **NEVER copy the data values from the image** - this is NOT an OCR task
158+
- **ALL cell content must be completely NEW and DIFFERENT from the original**
159+
- Generate COMPLETELY NEW synthetic business values for all data cells:
160+
* For company/team names: Generate DIFFERENT names (e.g., "A팀" → "B팀")
161+
* For business metrics: Generate DIFFERENT numbers (e.g., "100억" → "150억")
162+
* For strategy/description text: Write DIFFERENT content with similar structure
163+
* For bullet point items: Create DIFFERENT but domain-appropriate items
164+
* For employee names: Generate DIFFERENT Korean names (e.g., "김OO" → "박OO")
165+
- The synthetic table should look like a COMPLETELY DIFFERENT business report from the same industry
166+
4. **Styling:** Use **Tailwind CSS** classes (NO inline styles). **Observe and mimic the original image's visual style:**
167+
- Look at the original image's color scheme and design
168+
- Use appropriate Tailwind color classes to match the original style
169+
- Basic structure: `<table class="w-full border-collapse text-sm">`
170+
- Headers/cells: Include `border`, `px-4 py-3`, appropriate colors
171+
- Lists: `class="list-disc ml-5 space-y-1"`
172+
- **DO NOT use inline style attributes**
173+
5. **Output Format:** Return ONLY the raw HTML string starting with `<table>` and ending with `</table>`. No markdown code blocks.
174+
175+
**Example of Expected Behavior (Generic):**
176+
If the image shows a business table with:
177+
- Team name: "영업팀" → Generate different: "마케팅팀"
178+
- Revenue: "10억원" → Generate different: "15억원"
179+
- Strategy: "시장 확대" → Generate different: "신규 진출"
180+
- Bullet point items → Generate completely different items
181+
182+
⚠️ If the generated content is identical or very similar to the image, the output is INVALID.
183+
Remember: The output should be a new synthetic business dataset, not a transcription of the original.

0 commit comments

Comments
 (0)