Skip to content

Commit bd2e61f

Browse files
committed
FrLitBank actually uses link-based coreference
so we need to use nx.connected_components(coref_graph), which should not hurt even for Brat ann files using cluster-based coreference (such as MiniCIEP+).
1 parent 585832c commit bd2e61f

File tree

1 file changed

+20
-0
lines changed

1 file changed

+20
-0
lines changed

udapi/block/read/addbratann.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from udapi.core.files import Files
1313
import logging
1414
from bisect import bisect_left
15+
import networkx as nx
1516

1617
def _m(range_s, range_e, offset):
1718
return f"{range_s}-{offset}:{range_e}-{offset}" if offset else f"{range_s}:{range_e}"
@@ -85,6 +86,25 @@ def process_document(self, document):
8586
else:
8687
logging.warning(f"Unexpected line in {self.files.filename}:\n{line}")
8788

89+
# Some Brat ann files use link-based representation, e.g.
90+
# R123 Coreference Arg1:T11 Arg2:T13
91+
# R124 Coreference Arg1:T12 Arg2:T14
92+
# R125 Coreference Arg1:T13 Arg2:T14
93+
# This actually means that all four mentions T11, T12, T13 and T14 are in the same cluster (entity).
94+
# However, clusters = [["T11", "T13"], ["T12", "T14"], ["T13", "T14"]]
95+
# and we need to convert it to clusters = [["T11", "T12", "T13", "T14"]]
96+
# Note that if we created entities for the links in their original order,
97+
# R123 and R124 would result in creating two entities and when hitting R125
98+
# we would need to merge them, i.e. delete one of them and move their mentions to the other.
99+
# This is the solution of corefud.Link2Cluster, but here it seems easier to find connected components.
100+
coref_graph = nx.Graph()
101+
for mention_ids in clusters:
102+
coref_graph.add_node(mention_ids[0])
103+
for mention_id in mention_ids[1:]:
104+
coref_graph.add_node(mention_id)
105+
coref_graph.add_edge(mention_id, mention_ids[0])
106+
clusters = [list(component) for component in nx.connected_components(coref_graph)]
107+
88108
# Create entity objects for non-singletons.
89109
entity_map = {}
90110
for mention_ids in clusters:

0 commit comments

Comments
 (0)