|
12 | 12 | from udapi.core.files import Files |
13 | 13 | import logging |
14 | 14 | from bisect import bisect_left |
| 15 | +import networkx as nx |
15 | 16 |
|
16 | 17 | def _m(range_s, range_e, offset): |
17 | 18 | return f"{range_s}-{offset}:{range_e}-{offset}" if offset else f"{range_s}:{range_e}" |
@@ -85,6 +86,25 @@ def process_document(self, document): |
85 | 86 | else: |
86 | 87 | logging.warning(f"Unexpected line in {self.files.filename}:\n{line}") |
87 | 88 |
|
| 89 | + # Some Brat ann files use link-based representation, e.g. |
| 90 | + # R123 Coreference Arg1:T11 Arg2:T13 |
| 91 | + # R124 Coreference Arg1:T12 Arg2:T14 |
| 92 | + # R125 Coreference Arg1:T13 Arg2:T14 |
| 93 | + # This actually means that all four mentions T11, T12, T13 and T14 are in the same cluster (entity). |
| 94 | + # However, at this point clusters = [["T11", "T13"], ["T12", "T14"], ["T13", "T14"]], |
| 95 | + # and we need to convert it to clusters = [["T11", "T12", "T13", "T14"]]. |
| 96 | + # Note that if we created entities for the links in their original order, |
| 97 | + # R123 and R124 would result in creating two entities, and when hitting R125 |
| 98 | + # we would need to merge them, i.e. delete one of them and move its mentions to the other. |
| 99 | + # This is the approach taken by corefud.Link2Cluster, but here it seems easier to find connected components. |
| 100 | + coref_graph = nx.Graph() |
| 101 | + for mention_ids in clusters: |
| 102 | + coref_graph.add_node(mention_ids[0]) |
| 103 | + for mention_id in mention_ids[1:]: |
| 104 | + coref_graph.add_node(mention_id) |
| 105 | + coref_graph.add_edge(mention_id, mention_ids[0]) |
| 106 | + clusters = [list(component) for component in nx.connected_components(coref_graph)] |
| 107 | + |
88 | 108 | # Create entity objects for non-singletons. |
89 | 109 | entity_map = {} |
90 | 110 | for mention_ids in clusters: |
|
0 commit comments