Skip to content

Commit 05bdf55

Browse files
author
Evgheni C
committed
Don't store signature in ElasticSearch index
* Sort/cutoff by elasticsearch relevance score instead
1 parent d6fe49c commit 05bdf55

4 files changed

Lines changed: 68 additions & 38 deletions

File tree

image_match/elasticsearch_driver.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def __init__(self, es, index='images', doc_type='image', timeout='10s', size=100
4949
def search_single_record(self, rec, pre_filter=None):
5050
path = rec.pop('path')
5151
signature = rec.pop('signature')
52+
5253
if 'metadata' in rec:
5354
rec.pop('metadata')
5455

@@ -70,27 +71,26 @@ def search_single_record(self, rec, pre_filter=None):
7071
size=self.size,
7172
timeout=self.timeout)['hits']['hits']
7273

73-
sigs = np.array([x['_source']['signature'] for x in res])
74-
75-
if sigs.size == 0:
74+
if len(res) == 0:
7675
return []
7776

78-
dists = normalized_distance(sigs, np.array(signature))
79-
8077
formatted_res = [{'id': x['_id'],
8178
'score': x['_score'],
8279
'metadata': x['_source'].get('metadata'),
8380
'path': x['_source'].get('url', x['_source'].get('path'))}
8481
for x in res]
8582

86-
for i, row in enumerate(formatted_res):
87-
row['dist'] = dists[i]
88-
formatted_res = filter(lambda y: y['dist'] < self.distance_cutoff, formatted_res)
83+
formatted_res = filter(lambda y: y['score'] > self.score_cutoff, formatted_res)
8984

9085
return formatted_res
9186

9287
def insert_single_record(self, rec, refresh_after=False):
9388
rec['timestamp'] = datetime.now()
89+
90+
# Don't store signature in index
91+
if 'signature' in rec:
92+
rec.pop('signature')
93+
9494
self.es.index(index=self.index, doc_type=self.doc_type, body=rec, refresh=refresh_after)
9595

9696
def delete_duplicates(self, path):

image_match/signature_database_base.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def insert_single_record(self, rec):
117117
raise NotImplementedError
118118

119119
def __init__(self, k=16, N=63, n_grid=9,
120-
crop_percentile=(5, 95), distance_cutoff=0.45,
120+
crop_percentile=(5, 95), distance_cutoff=0.45, score_cutoff=9.0,
121121
*signature_args, **signature_kwargs):
122122
"""Set up storage scheme for images
123123
@@ -159,6 +159,8 @@ def __init__(self, k=16, N=63, n_grid=9,
159159
considering how much variance to keep in the image (default (5, 95))
160160
distance_cutoff (Optional [float]): maximum image signature distance to
161161
be considered a match (default 0.45)
162+
score_cutoff (Optional [float]): minimum ElasticSearch relevance score to
163+
be considered a match (default 9.0)
162164
*signature_args: Variable length argument list to pass to ImageSignature
163165
**signature_kwargs: Arbitrary keyword arguments to pass to ImageSignature
164166
@@ -175,14 +177,22 @@ def __init__(self, k=16, N=63, n_grid=9,
175177
self.N = N
176178
self.n_grid = n_grid
177179

178-
# Check float input
180+
# Check float input for distance cutoff
179181
if type(distance_cutoff) is not float:
180182
raise TypeError('distance_cutoff should be a float')
181183
if distance_cutoff < 0.:
182184
raise ValueError('distance_cutoff should be > 0 (got %r)' % distance_cutoff)
183185

184186
self.distance_cutoff = distance_cutoff
185187

188+
# Check float input for elasticsearch score cutoff
189+
if type(score_cutoff) is not float:
190+
raise TypeError('score_cutoff should be a float')
191+
if score_cutoff < 0.:
192+
raise ValueError('score_cutoff should be > 0 (got %r)' % score_cutoff)
193+
194+
self.score_cutoff = score_cutoff
195+
186196
self.crop_percentile = crop_percentile
187197

188198
self.gis = ImageSignature(n=n_grid, crop_percentiles=crop_percentile, *signature_args, **signature_kwargs)
@@ -222,7 +232,7 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte
222232
pre_filter (Optional[dict]): filters list before applying the matching algorithm
223233
(default None)
224234
Returns:
225-
a formatted list of dicts representing unique matches, sorted by dist
235+
a formatted list of dicts representing unique matches, sorted by ascending dist or, when the results come from ElasticSearch, by descending score
226236
227237
For example, if three matches are found:
228238
@@ -238,6 +248,19 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte
238248
'path': u'https://c2.staticflickr.com/8/7158/6814444991_08d82de57e_z.jpg'}
239249
]
240250
251+
Here is an ElasticSearch example:
252+
253+
[
254+
{'score': 4.0,
255+
'id': u'AVM37oZq0osmmAxpPvx7',
256+
'path': u'https://pixabay.com/static/uploads/photo/2012/11/28/08/56/mona-lisa-67506_960_720.jpg'},
257+
{'score': 35.0,
258+
'id': u'AVM37nMg0osmmAxpPvx6',
259+
'path': u'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg/687px-Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg'},
260+
{'score': 10.0,
261+
'id': u'AVM37p530osmmAxpPvx9',
262+
'path': u'https://c2.staticflickr.com/8/7158/6814444991_08d82de57e_z.jpg'}
263+
]
241264
"""
242265
img = self.gis.preprocess_image(path, bytestream)
243266

@@ -277,12 +300,21 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte
277300

278301
ids = set()
279302
unique = []
303+
hasScore = False
280304
for item in result:
305+
if 'score' in item:
306+
hasScore = True
307+
281308
if item['id'] not in ids:
282309
unique.append(item)
283310
ids.add(item['id'])
284311

285-
r = sorted(unique, key=itemgetter('dist'))
312+
# Results from ElasticSearch carry a 'score': sort those by score, highest first; otherwise sort by dist, lowest first
313+
if hasScore:
314+
r = sorted(unique, key=itemgetter('score'), reverse=True)
315+
else:
316+
r = sorted(unique, key=itemgetter('dist'))
317+
286318
return r
287319

288320

tests/test_elasticsearch_driver.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,15 @@
1717
DOC_TYPE = 'image'
1818
MAPPINGS = {
1919
"mappings": {
20-
DOC_TYPE: {
20+
DOC_TYPE: {
2121
"dynamic": True,
22-
"properties": {
23-
"metadata": {
22+
"properties": {
23+
"metadata": {
2424
"type": "object",
2525
"dynamic": True,
26-
"properties": {
26+
"properties": {
2727
"tenant_id": { "type": "keyword" }
28-
}
28+
}
2929
}
3030
}
3131
}
@@ -122,7 +122,6 @@ def test_lookup_from_url(ses):
122122
assert len(r) == 1
123123
assert r[0]['path'] == 'test1.jpg'
124124
assert 'score' in r[0]
125-
assert 'dist' in r[0]
126125
assert 'id' in r[0]
127126

128127

@@ -132,7 +131,6 @@ def test_lookup_from_file(ses):
132131
assert len(r) == 1
133132
assert r[0]['path'] == 'test1.jpg'
134133
assert 'score' in r[0]
135-
assert 'dist' in r[0]
136134
assert 'id' in r[0]
137135

138136
def test_lookup_from_bytestream(ses):
@@ -142,7 +140,6 @@ def test_lookup_from_bytestream(ses):
142140
assert len(r) == 1
143141
assert r[0]['path'] == 'test1.jpg'
144142
assert 'score' in r[0]
145-
assert 'dist' in r[0]
146143
assert 'id' in r[0]
147144

148145
def test_lookup_with_cutoff(ses):
@@ -156,6 +153,7 @@ def check_distance_consistency(ses):
156153
ses.add_image('test1.jpg')
157154
ses.add_image('test2.jpg', refresh_after=True)
158155
r = ses.search_image('test1.jpg')
156+
print(r[0])
159157
assert r[0]['dist'] == 0.0
160158
assert r[-1]['dist'] == 0.42672771706789686
161159

@@ -171,7 +169,6 @@ def test_add_image_with_metadata(ses):
171169
assert r[0]['metadata'] == metadata
172170
assert 'path' in r[0]
173171
assert 'score' in r[0]
174-
assert 'dist' in r[0]
175172
assert 'id' in r[0]
176173

177174

@@ -190,13 +187,13 @@ def test_lookup_with_filter_by_metadata(ses):
190187
assert len(r) == 1
191188
assert r[0]['metadata'] == metadata
192189

193-
r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-2"}})
190+
r = ses.search_image('test2.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-2"}})
194191
assert len(r) == 1
195192
assert r[0]['metadata'] == metadata2
196193

197194
r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-3"}})
198195
assert len(r) == 0
199-
196+
200197

201198
def test_all_orientations(ses):
202199
im = Image.open('test1.jpg')
@@ -206,12 +203,13 @@ def test_all_orientations(ses):
206203
r = ses.search_image('rotated_test1.jpg', all_orientations=True)
207204
assert len(r) == 1
208205
assert r[0]['path'] == 'test1.jpg'
209-
assert r[0]['dist'] < 0.05 # some error from rotation
206+
assert r[0]['score'] > 55 # some error from rotation
210207

211208
with open('rotated_test1.jpg', 'rb') as f:
212209
r = ses.search_image(f.read(), bytestream=True, all_orientations=True)
210+
print(r[0])
213211
assert len(r) == 1
214-
assert r[0]['dist'] < 0.05 # some error from rotation
212+
assert r[0]['score'] > 55 # some error from rotation
215213

216214

217215
def test_duplicate(ses):
@@ -220,8 +218,8 @@ def test_duplicate(ses):
220218
r = ses.search_image('test1.jpg')
221219
assert len(r) == 2
222220
assert r[0]['path'] == 'test1.jpg'
221+
assert r[1]['path'] == 'test1.jpg'
223222
assert 'score' in r[0]
224-
assert 'dist' in r[0]
225223
assert 'id' in r[0]
226224

227225

tests/test_elasticsearch_driver_metadata_as_nested.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,16 @@
1717
DOC_TYPE = 'image'
1818
MAPPINGS = {
1919
"mappings": {
20-
DOC_TYPE: {
20+
DOC_TYPE: {
2121
"dynamic": True,
22-
"properties": {
23-
"metadata": {
22+
"properties": {
23+
"metadata": {
2424
"type": "nested",
2525
"dynamic": True,
26-
"properties": {
26+
"properties": {
2727
"tenant_id": { "type": "keyword" },
2828
"project_id": { "type": "keyword" }
29-
}
29+
}
3030
}
3131
}
3232
}
@@ -101,23 +101,23 @@ def test_lookup_with_filter_by_metadata(ses):
101101
assert len(r) == 2
102102

103103
r = ses.search_image('test1.jpg', pre_filter=_nested_filter('foo', 'project-z'))
104-
assert len(r) == 0
104+
assert len(r) == 0
105105

106106
r = ses.search_image('test1.jpg', pre_filter=_nested_filter('bar', 'project-x'))
107107
assert len(r) == 1
108108

109109
r = ses.search_image('test1.jpg', pre_filter=_nested_filter('bar-2', 'project-x'))
110110
assert len(r) == 0
111-
111+
112112
r = ses.search_image('test1.jpg', pre_filter=_nested_filter('bar', 'project-z'))
113-
assert len(r) == 0
114-
113+
assert len(r) == 0
114+
115115
def _metadata(tenant_id, project_id):
116116
return dict(
117117
tenant_id=tenant_id,
118118
project_id=project_id
119119
)
120-
120+
121121
def _nested_filter(tenant_id, project_id):
122122
return {
123123
"nested" : {
@@ -129,6 +129,6 @@ def _nested_filter(tenant_id, project_id):
129129
{"term": {"metadata.project_id": project_id}}
130130
]
131131
}
132-
}
132+
}
133133
}
134-
}
134+
}

0 commit comments

Comments
 (0)