Skip to content

Commit 65f9a60

Browse files
committed
Add JSON ingestion schema extension modules
1 parent 2afdcd4 commit 65f9a60

42 files changed

Lines changed: 1849 additions & 3 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
# Copyright 2024 - 2026 Block, Inc.
2+
#
3+
# Use of this source code is governed by an MIT-style
4+
# license that can be found in the LICENSE file or at
5+
# https://opensource.org/licenses/MIT.
6+
#
7+
# frozen_string_literal: true
8+
9+
require "elastic_graph/constants"
10+
require "elastic_graph/graphql/scalar_coercion_adapters/valid_time_zones"
11+
require "elastic_graph/json_ingestion/schema_definition/factory_extension"
12+
require "elastic_graph/json_ingestion/schema_definition/indexing/field_type/enum_extension"
13+
require "elastic_graph/json_ingestion/schema_definition/indexing/field_type/object_extension"
14+
require "elastic_graph/json_ingestion/schema_definition/indexing/field_type/scalar_extension"
15+
require "elastic_graph/json_ingestion/schema_definition/indexing/field_type/union_extension"
16+
require "elastic_graph/json_ingestion/schema_definition/state_extension"
17+
require "elastic_graph/schema_definition/indexing/field_type/enum"
18+
require "elastic_graph/schema_definition/indexing/field_type/object"
19+
require "elastic_graph/schema_definition/indexing/field_type/scalar"
20+
require "elastic_graph/schema_definition/indexing/field_type/union"
21+
22+
module ElasticGraph
23+
module JSONIngestion
24+
# Namespace for all JSON Schema schema definition support.
25+
#
26+
# {SchemaDefinition::APIExtension} is the primary entry point and should be used as a schema definition extension module.
27+
module SchemaDefinition
28+
# Module designed to be extended onto an {ElasticGraph::SchemaDefinition::API} instance
29+
# to add JSON Schema ingestion serializer capabilities.
30+
module APIExtension
31+
# Default JSON schema options applied to ElasticGraph's built-in scalar types when this extension
32+
# is loaded. Keyed by the un-overridden type name; the lookup at runtime maps each key through
33+
# `type_name_overrides` so renamed built-ins still receive the right options.
34+
BUILT_IN_SCALAR_JSON_SCHEMA_OPTIONS_BY_NAME = {
35+
"Boolean" => {type: "boolean"},
36+
"Float" => {type: "number"},
37+
"ID" => {type: "string"},
38+
"Int" => {type: "integer", minimum: INT_MIN, maximum: INT_MAX},
39+
"String" => {type: "string"},
40+
"Cursor" => {type: "string"},
41+
"Date" => {type: "string", format: "date"},
42+
"DateTime" => {type: "string", format: "date-time"},
43+
"LocalTime" => {type: "string", pattern: VALID_LOCAL_TIME_JSON_SCHEMA_PATTERN},
44+
"TimeZone" => {type: "string", enum: GraphQL::ScalarCoercionAdapters::VALID_TIME_ZONES.to_a.freeze},
45+
"Untyped" => {type: ["array", "boolean", "integer", "number", "object", "string"].freeze},
46+
"JsonSafeLong" => {type: "integer", minimum: JSON_SAFE_LONG_MIN, maximum: JSON_SAFE_LONG_MAX},
47+
"LongString" => {type: "integer", minimum: LONG_STRING_MIN, maximum: LONG_STRING_MAX}
48+
}.freeze
49+
50+
# Wires up the factory extension when this module is extended onto an API instance.
51+
#
52+
# @param api [ElasticGraph::SchemaDefinition::API] the API instance to extend
53+
# @return [void]
54+
# @api private
55+
def self.extended(api)
56+
# Prepend our indexing-field-type extensions onto the core classes so they participate in
57+
# `to_json_schema` / `format_field_json_schema_customizations` / `json_schema_field_metadata_by_field_name`.
58+
# Guarded so re-extending an already-extended API instance is a no-op.
59+
ElasticGraph::SchemaDefinition::Indexing::FieldType::Enum.prepend(Indexing::FieldType::EnumExtension) unless ElasticGraph::SchemaDefinition::Indexing::FieldType::Enum < Indexing::FieldType::EnumExtension
60+
ElasticGraph::SchemaDefinition::Indexing::FieldType::Object.prepend(Indexing::FieldType::ObjectExtension) unless ElasticGraph::SchemaDefinition::Indexing::FieldType::Object < Indexing::FieldType::ObjectExtension
61+
ElasticGraph::SchemaDefinition::Indexing::FieldType::Scalar.prepend(Indexing::FieldType::ScalarExtension) unless ElasticGraph::SchemaDefinition::Indexing::FieldType::Scalar < Indexing::FieldType::ScalarExtension
62+
ElasticGraph::SchemaDefinition::Indexing::FieldType::Union.prepend(Indexing::FieldType::UnionExtension) unless ElasticGraph::SchemaDefinition::Indexing::FieldType::Union < Indexing::FieldType::UnionExtension
63+
64+
state = api.state.extend(StateExtension) # : ElasticGraph::SchemaDefinition::State & StateExtension
65+
state.reserved_type_names << EVENT_ENVELOPE_JSON_SCHEMA_NAME
66+
api.factory.extend FactoryExtension
67+
68+
# Build a lookup from final (post-`type_name_overrides`) names to JSON schema options. We can't
69+
# key directly on `type.name` because users may have overridden the names of built-in scalars
70+
# (e.g. `Cursor` → `PreCursor`); the keys in `BUILT_IN_SCALAR_JSON_SCHEMA_OPTIONS_BY_NAME` are
71+
# always the un-overridden names.
72+
options_by_final_name = BUILT_IN_SCALAR_JSON_SCHEMA_OPTIONS_BY_NAME.to_h do |name, options|
73+
[api.state.type_ref(name).to_final_form.name, options]
74+
end
75+
76+
api.on_built_in_types do |type|
77+
if (options = options_by_final_name[type.name])
78+
scalar_type = type # : ElasticGraph::SchemaDefinition::SchemaElements::ScalarType & SchemaElements::ScalarTypeExtension
79+
scalar_type.json_schema(**options)
80+
elsif type.name == api.state.type_ref("GeoLocation").to_final_form.name
81+
# @type var geo_location_type: ElasticGraph::SchemaDefinition::SchemaElements::TypeWithSubfields & SchemaElements::ObjectInterfaceExtension
82+
geo_location_type = _ = type
83+
names = api.state.schema_elements
84+
85+
# We use `nullable: false` because `GeoLocation` is indexed as a single `geo_point` field,
86+
# and therefore can't support a `latitude` without a `longitude` or vice-versa.
87+
latitude = geo_location_type.graphql_fields_by_name.fetch(names.latitude) # : ElasticGraph::SchemaDefinition::SchemaElements::Field & SchemaElements::FieldExtension
88+
longitude = geo_location_type.graphql_fields_by_name.fetch(names.longitude) # : ElasticGraph::SchemaDefinition::SchemaElements::Field & SchemaElements::FieldExtension
89+
latitude.json_schema minimum: -90, maximum: 90, nullable: false
90+
longitude.json_schema minimum: -180, maximum: 180, nullable: false
91+
end
92+
end
93+
end
94+
95+
# Defines the version number of the current JSON schema. Importantly, every time a change is made that impacts the JSON schema
96+
# artifact, the version number must be incremented to ensure that each different version of the JSON schema is identified by a unique
97+
# version number. The publisher will then include this version number in published events to identify the version of the schema it
98+
# was using. This avoids the need to deploy the publisher and ElasticGraph indexer at the same time to keep them in sync.
99+
#
100+
# @note While this is an important part of how ElasticGraph is designed to support schema evolution, it can be annoying constantly
101+
# have to increment this while rapidly changing the schema during prototyping. You can disable the requirement to increment this
102+
# on every JSON schema change by setting `enforce_json_schema_version` to `false` in your `Rakefile`.
103+
#
104+
# @param version [Integer] current version number of the JSON schema artifact
105+
# @return [void]
106+
# @see Local::RakeTasks#enforce_json_schema_version
107+
def json_schema_version(version)
108+
state = json_ingestion_state
109+
110+
if !version.is_a?(Integer) || version < 1
111+
raise Errors::SchemaError, "`json_schema_version` must be a positive integer. Specified version: #{version}"
112+
end
113+
114+
if state.json_schema_version
115+
raise Errors::SchemaError, "`json_schema_version` can only be set once on a schema. Previously-set version: #{state.json_schema_version}"
116+
end
117+
118+
state.json_schema_version = version
119+
state.json_schema_version_setter_location = caller_locations(1, 1).to_a.first
120+
nil
121+
end
122+
123+
# Defines strictness of the JSON schema validation. By default, the JSON schema will require all fields to be provided by the
124+
# publisher (but they can be nullable) and will ignore extra fields that are not defined in the schema. Use this method to
125+
# configure this behavior.
126+
#
127+
# @param allow_omitted_fields [bool] Whether nullable fields can be omitted from indexing events.
128+
# @param allow_extra_fields [bool] Whether extra fields (e.g. beyond fields defined in the schema) can be included in indexing events.
129+
# @return [void]
130+
#
131+
# @note If you allow both omitted fields and extra fields, ElasticGraph's JSON schema validation will allow (and ignore) misspelled
132+
# field names in indexing events. For example, if the ElasticGraph schema has a nullable field named `parentId` but the publisher
133+
# accidentally provides it as `parent_id`, ElasticGraph would happily ignore the `parent_id` field entirely, because `parentId`
134+
# is allowed to be omitted and `parent_id` would be treated as an extra field. Therefore, we recommend that you only set one of
135+
# these to `true` (or none).
136+
def json_schema_strictness(allow_omitted_fields: false, allow_extra_fields: true)
137+
state = json_ingestion_state
138+
139+
unless [true, false].include?(allow_omitted_fields)
140+
raise Errors::SchemaError, "`allow_omitted_fields` must be true or false"
141+
end
142+
143+
unless [true, false].include?(allow_extra_fields)
144+
raise Errors::SchemaError, "`allow_extra_fields` must be true or false"
145+
end
146+
147+
state.allow_omitted_json_schema_fields = allow_omitted_fields
148+
state.allow_extra_json_schema_fields = allow_extra_fields
149+
nil
150+
end
151+
152+
private
153+
154+
# Returns the API's `state` narrowed to include this gem's `StateExtension`. Centralizes
155+
# the Steep cast that's needed because Steep can't see the `extend(StateExtension)` applied
156+
# at runtime in `extended`.
157+
def json_ingestion_state
158+
state # : ElasticGraph::SchemaDefinition::State & StateExtension
159+
end
160+
end
161+
end
162+
end
163+
end
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# Copyright 2024 - 2026 Block, Inc.
2+
#
3+
# Use of this source code is governed by an MIT-style
4+
# license that can be found in the LICENSE file or at
5+
# https://opensource.org/licenses/MIT.
6+
#
7+
# frozen_string_literal: true
8+
9+
require "elastic_graph/json_ingestion/schema_definition/indexing/index_extension"
10+
require "elastic_graph/json_ingestion/schema_definition/results_extension"
11+
require "elastic_graph/json_ingestion/schema_definition/schema_artifact_manager_extension"
12+
require "elastic_graph/json_ingestion/schema_definition/schema_elements/enum_type_extension"
13+
require "elastic_graph/json_ingestion/schema_definition/schema_elements/field_extension"
14+
require "elastic_graph/json_ingestion/schema_definition/schema_elements/object_interface_extension"
15+
require "elastic_graph/json_ingestion/schema_definition/schema_elements/scalar_type_extension"
16+
require "elastic_graph/json_ingestion/schema_definition/schema_elements/type_reference_extension"
17+
18+
module ElasticGraph
19+
module JSONIngestion
20+
module SchemaDefinition
21+
# Extension module applied to `ElasticGraph::SchemaDefinition::Factory` to wire up
22+
# JSON Schema support on Results and SchemaArtifactManager instances.
23+
#
24+
# @api private
25+
module FactoryExtension
26+
# @private
27+
def new_enum_type(name)
28+
super(name) do |type|
29+
type.extend SchemaElements::EnumTypeExtension
30+
yield type if block_given?
31+
end
32+
end
33+
34+
# @private
35+
def new_field(**kwargs, &block)
36+
super(**kwargs) do |field|
37+
field.extend SchemaElements::FieldExtension
38+
block&.call(field)
39+
end
40+
end
41+
42+
# @private
43+
def new_index(name, settings, type, &block)
44+
super(name, settings, type) do |index|
45+
index.extend Indexing::IndexExtension
46+
index.require_id_in_json_schema
47+
block&.call(index)
48+
end
49+
end
50+
51+
# @private
52+
def new_interface_type(name)
53+
super(name) do |type|
54+
type.extend SchemaElements::ObjectInterfaceExtension
55+
yield type if block_given?
56+
end
57+
end
58+
59+
# @private
60+
def new_object_type(name)
61+
super(name) do |type|
62+
type.extend SchemaElements::ObjectInterfaceExtension
63+
yield type if block_given?
64+
end
65+
end
66+
67+
# @private
68+
def new_scalar_type(name)
69+
super(name) do |type|
70+
type.extend SchemaElements::ScalarTypeExtension
71+
yield type if block_given?
72+
type.validate_json_schema_configuration! unless state.initially_registered_built_in_types.empty?
73+
end
74+
end
75+
76+
# @private
77+
def new_type_reference(name)
78+
super(name).extend(SchemaElements::TypeReferenceExtension)
79+
end
80+
81+
# Creates a new Results instance with JSON Schema extensions.
82+
#
83+
# @return [ElasticGraph::SchemaDefinition::Results] the created results instance
84+
def new_results
85+
super.extend(ResultsExtension)
86+
end
87+
88+
# Creates a new SchemaArtifactManager instance with JSON Schema extensions.
89+
#
90+
# @return [ElasticGraph::SchemaDefinition::SchemaArtifactManager] the created artifact manager
91+
def new_schema_artifact_manager(...)
92+
super.extend(SchemaArtifactManagerExtension)
93+
end
94+
end
95+
end
96+
end
97+
end
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# Copyright 2024 - 2026 Block, Inc.
2+
#
3+
# Use of this source code is governed by an MIT-style
4+
# license that can be found in the LICENSE file or at
5+
# https://opensource.org/licenses/MIT.
6+
#
7+
# frozen_string_literal: true
8+
9+
require "elastic_graph/constants"
10+
require "elastic_graph/json_ingestion/schema_definition/indexing/json_schema_field_metadata"
11+
require "elastic_graph/support/hash_util"
12+
13+
module ElasticGraph
14+
module JSONIngestion
15+
module SchemaDefinition
16+
# Namespace for JSON-schema-aware indexing components.
17+
module Indexing
18+
# Extends indexing fields with JSON schema generation behavior.
19+
#
20+
# @api private
21+
module FieldExtension
22+
# JSON schema overrides that automatically apply to specific mapping types so that the JSON schema
23+
# validation will reject values which cannot be indexed into fields of a specific mapping type.
24+
#
25+
# @see https://www.elastic.co/guide/en/elasticsearch/reference/current/number.html Elasticsearch numeric field type documentation
26+
# @note We don't handle `integer` here because it's the default numeric type (handled by our definition of the `Int` scalar type).
27+
# @note Likewise, we don't handle `long` here because a custom scalar type must be used for that since GraphQL's `Int` type can't handle long values.
28+
JSON_SCHEMA_OVERRIDES_BY_MAPPING_TYPE = {
29+
"byte" => {"minimum" => -(2**7), "maximum" => (2**7) - 1},
30+
"short" => {"minimum" => -(2**15), "maximum" => (2**15) - 1},
31+
"keyword" => {"maxLength" => DEFAULT_MAX_KEYWORD_LENGTH},
32+
"text" => {"maxLength" => DEFAULT_MAX_TEXT_LENGTH}
33+
}
34+
35+
# @return [Hash<Symbol, Object>] user-specified JSON schema customizations for this field
36+
def json_schema_customizations
37+
@json_schema_customizations
38+
end
39+
40+
# @private
41+
def with_json_schema(json_schema_layers:, json_schema_customizations:)
42+
@json_schema_layers = json_schema_layers
43+
@json_schema_customizations = json_schema_customizations
44+
self
45+
end
46+
47+
# Returns the JSON schema definition for this field.
48+
#
49+
# @return [Hash<String, Object>] the JSON schema hash
50+
def json_schema
51+
@json_schema ||=
52+
json_schema_layers
53+
.reverse # resolve layers from innermost to outermost wrappings
54+
.reduce(inner_json_schema) { |acc, layer| process_layer(layer, acc) }
55+
.merge(outer_json_schema_customizations)
56+
.merge({"description" => doc_comment}.compact)
57+
.then { |hash| Support::HashUtil.stringify_keys(hash) }
58+
end
59+
60+
# @return [JSONSchemaFieldMetadata] metadata about this field for inclusion in the JSON schema
61+
def json_schema_metadata
62+
JSONSchemaFieldMetadata.new(type: type.name, name_in_index: name_in_index)
63+
end
64+
65+
def nullable?
66+
json_schema_layers.include?(:nullable)
67+
end
68+
69+
private
70+
71+
def json_schema_layers
72+
@json_schema_layers
73+
end
74+
75+
def inner_json_schema
76+
user_specified_customizations =
77+
if user_specified_json_schema_customizations_go_on_outside?
78+
{} # : ::Hash[::String, untyped]
79+
else
80+
Support::HashUtil.stringify_keys(json_schema_customizations)
81+
end
82+
83+
customizations_from_mapping = JSON_SCHEMA_OVERRIDES_BY_MAPPING_TYPE[mapping["type"]] || {}
84+
customizations = customizations_from_mapping.merge(user_specified_customizations)
85+
# @type var field_type: _JSONFieldType
86+
field_type = _ = indexing_field_type
87+
customizations = field_type.format_field_json_schema_customizations(customizations)
88+
89+
ref = {"$ref" => "#/$defs/#{type.unwrapped_name}"}
90+
return ref if customizations.empty?
91+
92+
# Combine any customizations with the type ref under an "allOf" subschema:
93+
# all of these properties must hold true for the type to be valid.
94+
#
95+
# Note that if we simply combine the customizations with the `$ref`
96+
# at the same level, it will not work, because other subschema
97+
# properties are ignored when they are in the same object as a `$ref`:
98+
# https://github.com/json-schema-org/JSON-Schema-Test-Suite/blob/2.0.0/tests/draft7/ref.json#L165-L168
99+
{"allOf" => [ref, customizations]}
100+
end
101+
102+
def outer_json_schema_customizations
103+
return {} unless user_specified_json_schema_customizations_go_on_outside?
104+
Support::HashUtil.stringify_keys(json_schema_customizations)
105+
end
106+
107+
# Indicates if the user-specified JSON schema customizations should go on the inside
108+
# (where they normally go) or on the outside. They only go on the outside when it's
109+
# an array field, because then they apply to the array itself instead of the items in the
110+
# array.
111+
def user_specified_json_schema_customizations_go_on_outside?
112+
json_schema_layers.include?(:array)
113+
end
114+
115+
def process_layer(layer, schema)
116+
case layer
117+
when :nullable
118+
# Here we use "anyOf" to ensure that JSON can either match the schema OR null.
119+
#
120+
# (Using "oneOf" would mean that if we had a schema that also allowed null,
121+
# null would never be allowed, since "oneOf" must match exactly one subschema).
122+
{
123+
"anyOf" => [
124+
schema,
125+
{"type" => "null"}
126+
]
127+
}
128+
when :array
129+
{"type" => "array", "items" => schema}
130+
end
131+
end
132+
end
133+
end
134+
end
135+
end
136+
end

0 commit comments

Comments
 (0)