Skip to content

Commit fa30afa

Browse files
committed
Filter repositories by declared schema using repository properties
This updates the clone and pull tasks to apply only to repositories that declare their conformance with a particular OGM schema version (v1.0 or Aardvark). This is implemented via a repository custom property, which is set at the organization level and can be enabled on a per-repository basis. Unlike repository topics, the property's values come from a controlled vocabulary that is unique to the OGM organization. This functionality replaces the "denylist", a hardcoded list of repositories that shouldn't be harvested (e.g. because they were tools or didn't contain metadata). In this version, providers can opt in to harvesting by adding the relevant schema property. After this change, you no longer need to clone repositories that don't match the schema version you are using, which saves disk space and processing time.
1 parent 3eced7b commit fa30afa

3 files changed

Lines changed: 50 additions & 39 deletions

File tree

.rubocop_todo.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ Metrics/CyclomaticComplexity:
4141
# Offense count: 13
4242
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
4343
Metrics/MethodLength:
44-
Max: 21
44+
Max: 25
4545

4646
# Offense count: 2
4747
# Configuration parameters: AllowedMethods, AllowedPatterns.

lib/geo_combine/harvester.rb

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,6 @@ module GeoCombine
1111
class Harvester
1212
attr_reader :ogm_path, :schema_version
1313

14-
# Non-metadata repositories that shouldn't be harvested
15-
def self.denylist
16-
[
17-
'GeoCombine',
18-
'aardvark',
19-
'metadata-issues',
20-
'ogm_utils-python',
21-
'opengeometadata.github.io',
22-
'opengeometadata-rails',
23-
'gbl-1_to_aardvark'
24-
]
25-
end
26-
2714
# GitHub API endpoint for OpenGeoMetadata repositories
2815
def self.ogm_api_uri
2916
URI('https://api.github.com/orgs/opengeometadata/repos?per_page=1000')
@@ -53,9 +40,16 @@ def docs_to_index
5340

5441
doc = JSON.parse(File.read(path))
5542
[doc].flatten.each do |record|
56-
# skip indexing if this record has a different schema version than what we want
5743
record_schema = record['gbl_mdVersion_s'] || record['geoblacklight_version']
5844
record_id = record['layer_slug_s'] || record['dc_identifier_s']
45+
46+
# skip indexing if no identifiable schema version
47+
unless record_schema
48+
@logger.debug "skipping #{record_id || path}; no schema version declared in record"
49+
next
50+
end
51+
52+
# skip indexing if this record has a different schema version than what we want
5953
if record_schema != @schema_version
6054
@logger.debug "skipping #{record_id}; schema version #{record_schema} doesn't match #{@schema_version}"
6155
next
@@ -87,19 +81,20 @@ def pull_all
8781
end
8882

8983
# Clone a repository via git
90-
# If the repository already exists, skip it.
84+
# Return the name of the repository cloned, or nil if skipped
9185
def clone(repo)
9286
repo_path = File.join(@ogm_path, repo)
9387
repo_info = repository_info(repo)
9488
repo_url = "https://github.com/OpenGeoMetadata/#{repo}.git"
95-
96-
# Skip if exists; warn if archived or empty
97-
if File.directory? repo_path
98-
@logger.warn "skipping clone to #{repo_path}; directory exists"
99-
return nil
89+
repo_schemas = Array(repo_info.dig('custom_properties', 'supported_schemas'))
90+
91+
# Skip if exists, archived, empty, or different schema
92+
return @logger.warn "skipping clone to #{repo_path}; directory exists" if File.directory? repo_path
93+
return @logger.warn "repository is archived: #{repo_url}" if repo_info['archived']
94+
return @logger.warn "repository is empty: #{repo_url}" if repo_info['size'].zero?
95+
unless repo_schemas.include? @schema_version
96+
return @logger.warn "repository #{repo_url} clone to #{repo_path}; repository properties don't include schema version #{@schema_version} (found #{repo_schemas.join(', ')})"
10097
end
101-
@logger.warn "repository is archived: #{repo_url}" if repo_info['archived']
102-
@logger.warn "repository is empty: #{repo_url}" if repo_info['size'].zero?
10398

10499
Git.clone(repo_url, nil, path: ogm_path, depth: 1)
105100
@logger.info "cloned #{repo_url} to #{repo_path}"
@@ -119,10 +114,10 @@ def clone_all
119114
# List of repository names to harvest
120115
def repositories
121116
@repositories ||= JSON.parse(Net::HTTP.get(self.class.ogm_api_uri))
117+
.filter { |repo| Array(repo.dig('custom_properties', 'supported_schemas')).include? @schema_version }
122118
.filter { |repo| repo['size'].positive? }
123119
.reject { |repo| repo['archived'] }
124120
.map { |repo| repo['name'] }
125-
.reject { |name| self.class.denylist.include? name }
126121
end
127122

128123
def repository_info(repo_name)

spec/lib/geo_combine/harvester_spec.rb

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
require 'spec_helper'
66

77
RSpec.describe GeoCombine::Harvester do
8-
subject(:harvester) { described_class.new(ogm_path: 'spec/fixtures/indexing', schema_version: '1.0') }
8+
subject(:harvester) { described_class.new(ogm_path: 'spec/fixtures/indexing', logger: logger) }
99

1010
let(:logger) { instance_double(Logger, warn: nil, info: nil, error: nil, debug: nil) }
1111
let(:repo_name) { 'my-institution' }
@@ -14,11 +14,12 @@
1414
let(:stub_repo) { instance_double(Git::Base) }
1515
let(:stub_gh_api) do
1616
[
17-
{ name: repo_name, size: 100 },
18-
{ name: 'another-institution', size: 100 },
19-
{ name: 'outdated-institution', size: 100, archived: true }, # archived
20-
{ name: 'aardvark', size: 300 }, # on denylist
21-
{ name: 'empty', size: 0 } # no data
17+
{ name: repo_name, size: 100, custom_properties: { supported_schemas: ['Aardvark'] } },
18+
{ name: 'another-institution', size: 100, custom_properties: { supported_schemas: ['Aardvark', '1.0'] } }, # multiple schemas
19+
{ name: 'v1-institution', size: 300, custom_properties: { supported_schemas: ['1.0'] } }, # schema mismatch
20+
{ name: 'outdated-institution', size: 100, custom_properties: { supported_schemas: ['Aardvark'] }, archived: true }, # archived
21+
{ name: 'empty', size: 0, custom_properties: { supported_schemas: ['Aardvark'] } }, # no data
22+
{ name: 'tool', size: 50 } # not a metadata repository
2223
]
2324
end
2425

@@ -42,15 +43,15 @@
4243
describe '#docs_to_index' do
4344
it 'yields each JSON record with its path, skipping layers.JSON' do
4445
expect { |b| harvester.docs_to_index(&b) }.to yield_successive_args(
45-
[JSON.parse(File.read('spec/fixtures/indexing/basic_geoblacklight.json')), 'spec/fixtures/indexing/basic_geoblacklight.json'],
46-
[JSON.parse(File.read('spec/fixtures/indexing/geoblacklight.json')), 'spec/fixtures/indexing/geoblacklight.json']
46+
[JSON.parse(File.read('spec/fixtures/indexing/aardvark.json')), 'spec/fixtures/indexing/aardvark.json']
4747
)
4848
end
4949

50-
it 'skips records with a different schema version' do
51-
harvester = described_class.new(ogm_path: 'spec/fixtures/indexing/', schema_version: 'Aardvark', logger:)
50+
it 'can yield JSON records for a different schema version' do
51+
harvester = described_class.new(ogm_path: 'spec/fixtures/indexing/', schema_version: '1.0', logger:)
5252
expect { |b| harvester.docs_to_index(&b) }.to yield_successive_args(
53-
[JSON.parse(File.read('spec/fixtures/indexing/aardvark.json')), 'spec/fixtures/indexing/aardvark.json']
53+
[JSON.parse(File.read('spec/fixtures/indexing/basic_geoblacklight.json')), 'spec/fixtures/indexing/basic_geoblacklight.json'],
54+
[JSON.parse(File.read('spec/fixtures/indexing/geoblacklight.json')), 'spec/fixtures/indexing/geoblacklight.json']
5455
)
5556
end
5657
end
@@ -79,15 +80,20 @@
7980
expect(harvester.pull_all).to eq(%w[my-institution another-institution])
8081
end
8182

82-
it 'skips repositories in the denylist' do
83+
it 'skips repositories with no schema declared' do
8384
harvester.pull_all
84-
expect(Git).not_to have_received(:open).with('https://github.com/OpenGeoMetadata/aardvark.git')
85+
expect(Git).not_to have_received(:open).with('https://github.com/OpenGeoMetadata/tool.git')
8586
end
8687

8788
it 'skips archived repositories' do
8889
harvester.pull_all
8990
expect(Git).not_to have_received(:open).with('https://github.com/OpenGeoMetadata/outdated-institution.git')
9091
end
92+
93+
it 'skips repositories with no data' do
94+
harvester.pull_all
95+
expect(Git).not_to have_received(:open).with('https://github.com/OpenGeoMetadata/empty.git')
96+
end
9197
end
9298

9399
describe '#clone' do
@@ -115,9 +121,19 @@
115121
expect(Git).to have_received(:clone).exactly(2).times
116122
end
117123

118-
it 'skips repositories in the denylist' do
124+
it 'skips repositories with no schema declared' do
125+
harvester.clone_all
126+
expect(Git).not_to have_received(:clone).with('https://github.com/OpenGeoMetadata/tool.git')
127+
end
128+
129+
it 'skips archived repositories' do
130+
harvester.clone_all
131+
expect(Git).not_to have_received(:clone).with('https://github.com/OpenGeoMetadata/outdated-institution.git')
132+
end
133+
134+
it 'skips repositories with no data' do
119135
harvester.clone_all
120-
expect(Git).not_to have_received(:clone).with('https://github.com/OpenGeoMetadata/aardvark.git')
136+
expect(Git).not_to have_received(:clone).with('https://github.com/OpenGeoMetadata/empty.git')
121137
end
122138

123139
it 'returns the names of repositories cloned' do

0 commit comments

Comments
 (0)