# Copyright (c) 2025 Zichen Zhao
# Columbia University School of Social Work
# Licensed under the MIT Academic Research License
# See LICENSE file in the project root for details.
"""
Main execution script for the benchmark pipeline.
"""

from __future__ import annotations

import pandas as pd

from src.commonconst import *
from src.data.data_processing import (
    extract_text_from_docx,
    save_processed_files,
)
from src.utils.evaluation_algo import (
    ensure_output_dirs,
    generate_evaluation_scores,
    generate_not_hate_metric_scores,
    generate_urgency_dimension_scores,
    generate_risk_factor_dimension_scores,
    save_evaluation_to_csv,
)
from src.utils.output_processing import process_all_outputs


def append_component_scores_to_evaluation(
    evaluation_df: pd.DataFrame,
    not_hate_df: pd.DataFrame,
    urgency_df: pd.DataFrame,
    risk_factor_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Appends the three split benchmark scores to the main evaluation_scores.csv table.
    This keeps classifier-based and reference-similarity scores in the same CSV as the
    primary ROUGE/METEOR/negative sentiment/readability metrics.
    """
    merged_df = evaluation_df.copy()
    component_dfs = [not_hate_df, urgency_df, risk_factor_df]
    for component_df in component_dfs:
        clean_component_df = component_df.copy()
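        # Keep only the score columns; the merge below is keyed on "Chatbot" alone,
        # so the per-response text column is not carried into the evaluation CSV.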
        merge_cols = [col for col in clean_component_df.columns if col != "Response"]
        clean_component_df = clean_component_df[merge_cols]
        # Drop any component columns already present so the final CSV has clean names
        # instead of pandas-generated _x/_y suffixes.
        duplicate_cols = [
            col for col in clean_component_df.columns
            if col != "Chatbot" and col in merged_df.columns
        ]
        if duplicate_cols:
            merged_df = merged_df.drop(columns=duplicate_cols)
        merged_df = merged_df.merge(clean_component_df, on="Chatbot", how="left")
    return merged_df


def main():
    ensure_output_dirs()

    # Step 1: load raw docx text
    reference_text = extract_text_from_docx(REFERENCE_DOCX_PATH)
    chatbot_text = extract_text_from_docx(CHATBOT_DOCX_PATH)

    # Step 2: process and save all intermediate files
    save_processed_files(
        chatbot_text=chatbot_text,
        reference_text=reference_text,
        chatbot_output_path=CHATBOT_PROCESSED_CSV_PATH,
        reference_output_path=REFERENCE_PROCESSED_CSV_PATH,
        integrated_output_path=INTEGRATED_OUTPUT_CSV_PATH,
    )

    # Step 3: load integrated responses
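    # The integrated CSV written in Step 2 is the single input for every metric below.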
    integrated_responses = pd.read_csv(INTEGRATED_OUTPUT_CSV_PATH)

    # Step 4: primary continuous metrics
    evaluation_df = generate_evaluation_scores(
        integrated_responses,
        include_overall_average=True,
    )

    # Step 5: split benchmark components
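    # Each component table is keyed by "Chatbot" so it can be merged back into the
    # main evaluation table in Step 6.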
    not_hate_df = generate_not_hate_metric_scores(
        integrated_responses,
        include_overall_average=True,
    )
    urgency_df = generate_urgency_dimension_scores(
        integrated_responses,
        include_overall_average=True,
    )
    risk_factor_df = generate_risk_factor_dimension_scores(
        integrated_responses,
        include_overall_average=True,
    )

    # Step 6: append split component scores back into the main evaluation CSV
    evaluation_df = append_component_scores_to_evaluation(
        evaluation_df=evaluation_df,
        not_hate_df=not_hate_df,
        urgency_df=urgency_df,
        risk_factor_df=risk_factor_df,
    )
    save_evaluation_to_csv(OUTPUT_CSV_PATH, evaluation_df)

    # Step 7: plotting only; Plots/ contains figures, not CSV files.
    process_all_outputs(
        evaluation_df=evaluation_df,
        integrated_responses=integrated_responses,
        not_hate_df=not_hate_df,
        urgency_df=urgency_df,
        risk_factor_df=risk_factor_df,
    )

    print("Benchmark evaluation complete.")
    print(f"Main results saved to: {OUTPUT_CSV_PATH}")
    print(f"Integrated responses saved to: {INTEGRATED_OUTPUT_CSV_PATH}")
    print(f"All plots saved to: {PLOTS_DIR}")


if __name__ == "__main__":
    main()