Skip to content

Commit 2afd833

Browse files
Merge pull request #243 from MITLibraries/tco-183
Integrate with external citation detector via Detector::MlCitation class
2 parents d2d6163 + 2fa42d2 commit 2afd833

14 files changed

Lines changed: 489 additions & 3 deletions

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ changes, this is the signal which indicates that terms need to be re-evaluated.
7373

7474
### Optional
7575

76+
`DETECTOR_LAMBDA_CHALLENGE_SECRET`: The secret phrase required by the external citation detector to process any request. If not present, the citation detector will not be consulted.
77+
`DETECTOR_LAMBDA_PATH`: The path specified by the external citation detector for prediction requests. If not present, the citation detector will not be consulted.
78+
`DETECTOR_LAMBDA_URL`: The address of the external citation detector. If not present, the citation detector will not be consulted.
79+
7680
`LIBKEY_KEY`: LibKey API key. Required if `LIBKEY_DOI` or `LIBKEY_PMID` are set.
7781
`LIBKEY_ID`: LibKey Library ID. Required if `LIBKEY_DOI` or `LIBKEY_PMID` are set.
7882
`LIBKEY_DOI`: If set, use LibKey for DOI metadata lookups. If not set, Unpaywall is used.

app/models/detector/citation.rb

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ class Detector
1010
# hallmarks of being a citation.
1111
# Phrases whose score is higher than the REQUIRED_SCORE value can be registered as a Detection.
1212
class Citation
13-
attr_reader :score, :subpatterns, :summary
13+
attr_reader :features, :score, :subpatterns, :summary
1414

1515
# shared singleton methods
1616
extend Detector::BulkChecker
@@ -67,10 +67,13 @@ def detection?
6767
# @return Nothing intentional. Data is written to the `@features`, `@subpatterns`, and `@summary` Hashes, and to
6868
# `@score`, during processing.
6969
def initialize(phrase)
  # Every report starts out as an empty Hash and is filled in by the steps below.
  @summary = {}
  @subpatterns = {}
  @features = {}

  pattern_checker(phrase)
  summarize(phrase)

  # Features are derived while @subpatterns still holds its empty match lists; those empty entries are only
  # discarded afterwards, before the score is calculated.
  extract_features
  @subpatterns.reject! { |_type, matches| matches == [] }

  @score = calculate_score
end
7679

@@ -135,13 +138,25 @@ def commas(phrase)
135138
phrase.count(',')
136139
end
137140

141+
# This converts the already-built @subpatterns and @summary instance variables into the @features instance variable,
142+
# which has a format suitable for sending to our prediction algorithm.
143+
def extract_features
  # Reduce the @subpatterns structure of {no: [], pages: ['194-204']} (a hash of matched substrings, some of them
  # empty) to {no: 0, pages: 1} (a hash of integers, some of them zero). transform_values builds a brand new Hash,
  # so @subpatterns itself is left untouched and no explicit copy is needed.
  match_counts = @subpatterns.transform_values(&:length)

  # Join the counts with the @summary values, so that everything sent to the prediction algorithm is in one place.
  @features = match_counts.merge(@summary)
end
152+
138153
# This builds one of the two main components of the Citation detector - the subpattern report. It uses each of the
139154
# regular expressions in the CITATION_PATTERNS constant, extracting all matches using the scan method.
140155
#
141156
# @return hash
142157
def pattern_checker(phrase)
  CITATION_PATTERNS.each_pair do |type, pattern|
    # Store the scan result for every pattern, including empty results, so that each pattern has an entry in
    # @subpatterns. Each pattern is scanned exactly once.
    @subpatterns[type.to_sym] = scan(pattern, phrase)
  end
end
147162

app/models/detector/ml_citation.rb

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
# frozen_string_literal: true
2+
3+
class Detector
  # Detector::MlCitation asks an external citation detector (a lambda reached over HTTP) whether a phrase is a
  # citation. The detector is optional: it is only consulted when all three DETECTOR_LAMBDA_* environment variables
  # are set.
  class MlCitation
    attr_reader :detections

    # For now the initialize method just needs to consult the external lambda.
    #
    # @param phrase String. Often a `Term.phrase`.
    # @return Nothing intentional. The lambda's Boolean answer is written to `@detections` during processing;
    #   `@detections` stays nil when the environment is not configured or the lambda could not be consulted.
    def initialize(phrase)
      return unless self.class.expected_env?

      response = fetch(phrase)
      @detections = response unless response == 'Error'
    end

    # detection? reports whether the lambda gave a positive answer.
    #
    # @return Boolean. False when the answer was negative or when no answer was received.
    def detection?
      @detections == true
    end

    # expected_env? confirms that all three required environment variables are defined. It is provided for the Term
    # model to check prior to calling because this is still an optional extension to TACOS. If this method returns
    # false, the Term model will fall back to the initial citation detector.
    #
    # A variable that is set to an empty string is treated, and logged, the same way as one that is missing.
    #
    # @return Boolean
    def self.expected_env?
      Rails.logger.error('No lambda URL defined') if lambda_url.blank?

      Rails.logger.error('No lambda path defined') if lambda_path.blank?

      Rails.logger.error('No lambda secret defined') if lambda_secret.blank?

      [lambda_url, lambda_path, lambda_secret].all?(&:present?)
    end

    # The record method runs a supplied term through the detector via its initialize method, which consults the lambda.
    # If a positive result is received, a Detection is registered.
    #
    # @param term [Term]
    # @return nil
    def self.record(term)
      result = new(term.phrase)
      return unless result.detection?

      # Detections are registered to the "MlCitation" detector for now, but may end up replacing the "Citation" detector
      # in a future step.
      Detection.find_or_create_by(
        term:,
        detector: Detector.where(name: 'MlCitation').first,
        detector_version: ENV.fetch('DETECTOR_VERSION', 'unset')
      )

      nil
    end

    # lambda_path reads and returns the value of one environment variable.
    #
    # @note This is a public class method because the entire class ends up getting called in both class and instance
    #   contexts, due to how detectors are built. The ideal state would be a private method, but that would require
    #   changing how the class calls itself via the fetch method.
    #
    # @see Detector::MlCitation.expected_env?
    # @see Detector::MlCitation.fetch
    # @return String or nil
    def self.lambda_path
      ENV.fetch('DETECTOR_LAMBDA_PATH', nil)
    end

    # lambda_secret reads and returns the value of one environment variable.
    #
    # @note This is a public class method because the entire class ends up getting called in both class and instance
    #   contexts, due to how detectors are built. The ideal state would be a private method, but that would require
    #   changing how the class calls itself via the fetch method.
    #
    # @see Detector::MlCitation.expected_env?
    # @see Detector::MlCitation.fetch
    # @return String or nil
    def self.lambda_secret
      ENV.fetch('DETECTOR_LAMBDA_CHALLENGE_SECRET', nil)
    end

    # lambda_url reads and returns the value of one environment variable.
    #
    # @note This is a public class method because the entire class ends up getting called in both class and instance
    #   contexts, due to how detectors are built. The ideal state would be a private method, but that would require
    #   changing how the class calls itself via the fetch method.
    #
    # @see Detector::MlCitation.expected_env?
    # @see Detector::MlCitation.fetch
    # @return String or nil
    def self.lambda_url
      ENV.fetch('DETECTOR_LAMBDA_URL', nil)
    end

    private

    # define_lambda builds the connection to the detector lambda.
    #
    # @return Faraday connection
    def define_lambda
      Faraday.new(
        url: self.class.lambda_url,
        params: {}
      )
    end

    # define_payload defines the Hash that will be sent to the lambda.
    #
    # @param phrase String
    # @return Hash
    def define_payload(phrase)
      {
        action: 'predict',
        features: extract_features(phrase),
        challenge_secret: self.class.lambda_secret
      }
    end

    # extract_features passes the search phrase through the citation detector, and massages the resulting features object
    # to correspond with what the lambda expects: two keys are renamed and the character count is dropped.
    #
    # @param phrase String
    # @return Hash
    def extract_features(phrase)
      features = Detector::Citation.new(phrase).features
      features[:apa] = features.delete :apa_volume_issue
      features[:year] = features.delete :year_parens
      features.delete :characters
      features
    end

    # Fetch handles the communication with the detector lambda: defining the connection, building the payload, and any
    # error handling with the response. Transport failures and unparseable response bodies are logged, reported to
    # Sentry, and turned into the same 'Error' value as a non-200 response, so that an unavailable lambda never
    # raises into the caller.
    #
    # @param phrase String
    # @return Boolean or 'Error'
    def fetch(phrase)
      connection = define_lambda
      payload = define_payload(phrase)

      response = connection.post(self.class.lambda_path, payload.to_json)

      if response.status == 200
        JSON.parse(response.body)['response'] == 'true'
      else
        Rails.logger.error(response.body)
        Sentry.set_extras({ body: response.body })
        Sentry.capture_message('Non-200 response received from detector lambda')

        'Error'
      end
    rescue Faraday::Error, JSON::ParserError => e
      Rails.logger.error("Detector lambda request failed: #{e.class}: #{e.message}")
      Sentry.capture_exception(e)

      'Error'
    end
  end
end

app/models/term.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def cluster
5050
#
5151
# @return nil
5252
def record_detections
53+
Detector::MlCitation.record(self) if Detector::MlCitation.expected_env?
5354
Detector::Citation.record(self)
5455
Detector::StandardIdentifiers.record(self)
5556
Detector::Journal.record(self)

db/seeds.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
Detector.find_or_create_by(name: 'Journal')
4242
Detector.find_or_create_by(name: 'SuggestedResource')
4343
Detector.find_or_create_by(name: 'Citation')
44+
Detector.find_or_create_by(name: 'MlCitation')
4445
Detector.find_or_create_by(name: 'Barcode')
4546
Detector.find_or_create_by(name: 'SuggestedResourcePattern')
4647

@@ -75,6 +76,11 @@
7576
category: Category.find_by(name: 'Informational'),
7677
confidence: 0.7
7778
)
79+
DetectorCategory.find_or_create_by(
80+
detector: Detector.find_by(name: 'MlCitation'),
81+
category: Category.find_by(name: 'Transactional'),
82+
confidence: 0.95
83+
)
7884
DetectorCategory.find_or_create_by(
7985
detector: Detector.find_by(name: 'PMID'),
8086
category: Category.find_by(name: 'Transactional'),

test/fixtures/detector_categories.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,8 @@ eight:
4848
detector: barcode
4949
category: transactional
5050
confidence: 0.95
51+
52+
nine:
53+
detector: mlcitation
54+
category: transactional
55+
confidence: 0.95

test/fixtures/detectors.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ barcode:
1313
citation:
1414
name: 'Citation'
1515

16+
mlcitation:
17+
name: 'MlCitation'
18+
1619
doi:
1720
name: 'DOI'
1821

test/models/detector/citation_test.rb

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44

55
class Detector
66
class CitationTest < ActiveSupport::TestCase
7-
test 'detector::citation exposes three instance variables' do
7+
test 'detector::citation exposes four instance variables' do
88
t = terms('citation')
99
result = Detector::Citation.new(t.phrase)
1010

11+
assert_predicate result.features, :present?
12+
1113
assert_predicate result.score, :present?
1214

1315
assert_predicate result.summary, :present?
@@ -196,6 +198,29 @@ class CitationTest < ActiveSupport::TestCase
196198
assert_operator 0, :<, result.score
197199
end
198200

201+
test 'features instance method is a hash of integers' do
  # A phrase with no citation hallmarks still has to produce a fully numeric feature set.
  features = Detector::Citation.new('simple search phrase').features

  assert_instance_of Hash, features

  assert features.values.all?(&:integer?)
end
208+
209+
test 'features instance method includes all elements of citation detector regardless of search string' do
  # One phrase with no citation hallmarks, and one full citation.
  phrases = [
    'simple',
    'Science Education and Cultural Diversity: Mapping the Field. Studies in Science Education, 24(1), 49–73.'
  ]
  simple_size, complex_size = phrases.map { |phrase| Detector::Citation.new(phrase).features.length }

  assert_equal simple_size, complex_size
end
215+
216+
test 'features instance method should include all elements of citation patterns and summary thresholds' do
  # The feature set should have one entry per citation pattern plus one per summary threshold.
  expected_size = %i[CITATION_PATTERNS SUMMARY_THRESHOLDS].sum { |name| Detector::Citation.const_get(name).length }
  features = Detector::Citation.new('simple').features

  assert_equal expected_size, features.length
end
223+
199224
test 'detection? convenience method returns true for obvious citations' do
200225
result = Detector::Citation.new(terms('citation').phrase)
201226

0 commit comments

Comments
 (0)