evaluation_multi.py
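"""Run evaluation for every experiment found under an experiment directory.

The script walks --experiment-dir, treats every directory containing a
result.json as one experiment, scores the trained model in its trial/
subdirectory, and writes an evaluation_result.json next to each result.

Example invocation (all paths are illustrative, not taken from the repository):

    python evaluation_multi.py \
        --experiment-dir ./experiments/semeval_sweep \
        --dataset semeval2010 \
        --test-file ./data/semeval2010/test.txt \
        --official-eval-script ./scripts/semeval2010_task8_scorer.pl
"""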
import argparse
import json
import logging
import os
from functools import partial
from typing import Any, Dict, List

from relex.evaluation import semeval2010_task8_evaluation
from relex.evaluation import tacred_evaluation

# Emit INFO-level messages to the console.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

def _get_parser():
    """Build the command-line argument parser."""
    parser = argparse.ArgumentParser(description="Run evaluation")
    parser.add_argument(
        "--experiment-dir",
        type=str,
        required=True,
        help="directory containing experiment results",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        choices=["semeval2010", "tacred"],
        help="dataset to be evaluated",
    )
    parser.add_argument(
        "--test-file",
        type=str,
        required=True,
        help="file containing the examples for evaluation",
    )
    parser.add_argument(
        "--official-eval-script",
        type=str,
        default=None,
        help="path to the official evaluation script, if required by the dataset",
    )
    parser.add_argument(
        "--batch-size", type=int, default=128, help="batch size to use for predictions"
    )
    parser.add_argument(
        "--cuda-device", type=int, default=-1, help="a cuda device to load the model on"
    )
    return parser

def evaluate_multi(
    experiment_dir: str,
    scorer,
    result_filename: str = "result.json",
    trial_dirname: str = "trial",
) -> List[Dict[str, Any]]:
    """Score every experiment below `experiment_dir` and return a summary.

    An experiment is any directory containing `result_filename`; its trained
    model is expected in the `trial_dirname` subdirectory.
    """
    summary = []
    for dirpath, _, filenames in os.walk(experiment_dir):
        for filename in filenames:
            if filename != result_filename:
                continue
            logger.info(f"Found experiment in {dirpath}")
            trial_dir = os.path.join(dirpath, trial_dirname)
            precision, recall, f1 = scorer(trial_dir)
            eval_results = dict(precision=precision, recall=recall, f1=f1)
            summary.append(dict(experiment=dirpath, results=eval_results))
            eval_result_path = os.path.join(dirpath, "evaluation_result.json")
            logger.info(f"Writing evaluation result to {eval_result_path}")
            with open(eval_result_path, "w") as result_f:
                json.dump(eval_results, result_f, indent=4, sort_keys=True)
    return summary
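
# The `scorer` argument only needs to be a callable mapping a trial directory
# to a (precision, recall, f1) triple. A minimal hypothetical stub for a dry
# run of evaluate_multi, not part of the original module:
#
#     def dummy_scorer(trial_dir: str):
#         return 0.0, 0.0, 0.0
#
#     evaluate_multi("./experiments", dummy_scorer)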

def main():
    parser = _get_parser()
    args = parser.parse_args()

    # Bind everything except the trial directory up front, so evaluate_multi
    # can call the scorer with a single argument.
    if args.dataset == "semeval2010":
        scorer = partial(
            semeval2010_task8_evaluation.evaluate,
            test_file=args.test_file,
            eval_script_file=args.official_eval_script,
            batch_size=args.batch_size,
            cuda_device=args.cuda_device,
        )
    elif args.dataset == "tacred":
        scorer = partial(
            tacred_evaluation.evaluate,
            test_file=args.test_file,
            batch_size=args.batch_size,
            cuda_device=args.cuda_device,
        )
    else:
        # Unreachable with the argparse choices above, but guards against a
        # dataset being added there without being wired up here.
        raise ValueError(f"Unsupported dataset: {args.dataset}")

    summary = evaluate_multi(args.experiment_dir, scorer)
    logger.info(f"Evaluation Summary: {json.dumps(summary, indent=4, sort_keys=True)}")


if __name__ == "__main__":
    main()