diff --git a/ingestion/src/metadata/profiler/metrics/static/column_count.py b/ingestion/src/metadata/profiler/metrics/static/column_count.py index a5e4bc461335..88cc525a7d2f 100644 --- a/ingestion/src/metadata/profiler/metrics/static/column_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/column_count.py @@ -12,6 +12,7 @@ """ Table Column Count Metric definition """ + # pylint: disable=duplicate-code from typing import TYPE_CHECKING, Optional diff --git a/ingestion/src/metadata/profiler/metrics/static/column_names.py b/ingestion/src/metadata/profiler/metrics/static/column_names.py index 00568a68e2dd..6c085e7cbe4f 100644 --- a/ingestion/src/metadata/profiler/metrics/static/column_names.py +++ b/ingestion/src/metadata/profiler/metrics/static/column_names.py @@ -12,6 +12,7 @@ """ Table Column Count Metric definition """ + # pylint: disable=duplicate-code from typing import TYPE_CHECKING, Optional diff --git a/ingestion/src/metadata/profiler/metrics/static/count.py b/ingestion/src/metadata/profiler/metrics/static/count.py index f30c1b170143..8adbe4335ef0 100644 --- a/ingestion/src/metadata/profiler/metrics/static/count.py +++ b/ingestion/src/metadata/profiler/metrics/static/count.py @@ -12,6 +12,7 @@ """ Count Metric definition """ + # pylint: disable=duplicate-code import traceback diff --git a/ingestion/src/metadata/profiler/metrics/static/count_in_set.py b/ingestion/src/metadata/profiler/metrics/static/count_in_set.py index da2955201355..37e927584c95 100644 --- a/ingestion/src/metadata/profiler/metrics/static/count_in_set.py +++ b/ingestion/src/metadata/profiler/metrics/static/count_in_set.py @@ -12,6 +12,7 @@ """ CountInSet Metric definition """ + # pylint: disable=duplicate-code import traceback from typing import TYPE_CHECKING, List, Optional diff --git a/ingestion/src/metadata/profiler/metrics/static/distinct_count.py b/ingestion/src/metadata/profiler/metrics/static/distinct_count.py index b6ae7942b2ca..e05ed6f56ada 100644 --- a/ingestion/src/metadata/profiler/metrics/static/distinct_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/distinct_count.py @@ -12,6 +12,7 @@ """ Distinct Count Metric definition """ + # pylint: disable=duplicate-code import json @@ -25,6 +26,7 @@ from metadata.generated.schema.configuration.profilerConfiguration import MetricType from metadata.profiler.metrics.core import StaticMetric, _label from metadata.profiler.orm.functions.count import CountFn +from metadata.profiler.orm.registry import is_complex_type from metadata.utils.logger import profiler_logger logger = profiler_logger() @@ -52,6 +54,8 @@ def fn(self): """ Distinct Count metric for Sqlalchemy connectors """ + if is_complex_type(self.col.type): + return None return func.count(distinct(CountFn(column(self.col.name, self.col.type)))) def df_fn(self, dfs: Optional["PandasRunner"] = None): diff --git a/ingestion/src/metadata/profiler/metrics/static/ilike_count.py b/ingestion/src/metadata/profiler/metrics/static/ilike_count.py index 9e3237ad33eb..3d9bebc673ff 100644 --- a/ingestion/src/metadata/profiler/metrics/static/ilike_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/ilike_count.py @@ -12,6 +12,7 @@ """ ILIKE Count Metric definition """ + # pylint: disable=duplicate-code from sqlalchemy import case, column @@ -19,6 +20,7 @@ from metadata.generated.schema.configuration.profilerConfiguration import MetricType from metadata.profiler.metrics.core import StaticMetric, _label from metadata.profiler.orm.functions.sum import SumFn +from metadata.profiler.orm.registry import is_complex_type class ILikeCount(StaticMetric): @@ -46,6 +48,8 @@ def metric_type(self): @_label def fn(self): + if is_complex_type(self.col.type): + return None if not hasattr(self, "expression"): raise AttributeError( "ILike Count requires an expression to be set: add_props(expression=...)(Metrics.iLikeCount)" diff --git a/ingestion/src/metadata/profiler/metrics/static/like_count.py b/ingestion/src/metadata/profiler/metrics/static/like_count.py index 87d05ed1119f..b42f28da1852 100644 --- a/ingestion/src/metadata/profiler/metrics/static/like_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/like_count.py @@ -12,6 +12,7 @@ """ Like Count Metric definition """ + # pylint: disable=duplicate-code from sqlalchemy import case, column @@ -19,6 +20,7 @@ from metadata.generated.schema.configuration.profilerConfiguration import MetricType from metadata.profiler.metrics.core import StaticMetric, _label from metadata.profiler.orm.functions.sum import SumFn +from metadata.profiler.orm.registry import is_complex_type class LikeCount(StaticMetric): @@ -46,6 +48,8 @@ def metric_type(self): @_label def fn(self): + if is_complex_type(self.col.type): + return None if not hasattr(self, "expression"): raise AttributeError( "Like Count requires an expression to be set: add_props(expression=...)(Metrics.likeCount)" diff --git a/ingestion/src/metadata/profiler/metrics/static/max.py b/ingestion/src/metadata/profiler/metrics/static/max.py index 235555cb9fcd..52e6fa2d7f46 100644 --- a/ingestion/src/metadata/profiler/metrics/static/max.py +++ b/ingestion/src/metadata/profiler/metrics/static/max.py @@ -12,6 +12,7 @@ """ Max Metric definition """ + from functools import partial from typing import TYPE_CHECKING, Callable, Optional diff --git a/ingestion/src/metadata/profiler/metrics/static/max_length.py b/ingestion/src/metadata/profiler/metrics/static/max_length.py index 7f374d4537ee..44790d907def 100644 --- a/ingestion/src/metadata/profiler/metrics/static/max_length.py +++ b/ingestion/src/metadata/profiler/metrics/static/max_length.py @@ -12,6 +12,7 @@ """ MAX_LENGTH Metric definition """ + # pylint: disable=duplicate-code @@ -23,7 +24,7 @@ from metadata.profiler.metrics.core import StaticMetric, _label from metadata.profiler.metrics.pandas_metric_protocol import PandasComputation from metadata.profiler.orm.functions.length import LenFn -from metadata.profiler.orm.registry import is_concatenable +from metadata.profiler.orm.registry import is_complex_type, is_concatenable from metadata.utils.logger import profiler_logger if TYPE_CHECKING: @@ -100,14 +101,13 @@ def update_accumulator( current_max: Optional[int], df: "pd.DataFrame", column ) -> Optional[int]: """Computes one DataFrame chunk and updates the running maximum""" + # pylint: disable=import-outside-toplevel import pandas as pd - from numpy import vectorize - length_vectorize_func = vectorize(len) chunk_max = None - if is_concatenable(column.type): - max_val = length_vectorize_func(df[column.name].dropna().astype(str)).max() + if is_concatenable(column.type) or is_complex_type(column.type): + max_val = df[column.name].dropna().astype(str).str.len().max() if not pd.isnull(max_val): chunk_max = max_val diff --git a/ingestion/src/metadata/profiler/metrics/static/mean.py b/ingestion/src/metadata/profiler/metrics/static/mean.py index 33393a2d54b3..195ca9605e2f 100644 --- a/ingestion/src/metadata/profiler/metrics/static/mean.py +++ b/ingestion/src/metadata/profiler/metrics/static/mean.py @@ -12,6 +12,7 @@ """ AVG Metric definition """ + from functools import partial from typing import TYPE_CHECKING, Callable, NamedTuple, Optional diff --git a/ingestion/src/metadata/profiler/metrics/static/min.py b/ingestion/src/metadata/profiler/metrics/static/min.py index f831cd212e95..ad69cd4fff81 100644 --- a/ingestion/src/metadata/profiler/metrics/static/min.py +++ b/ingestion/src/metadata/profiler/metrics/static/min.py @@ -12,6 +12,7 @@ """ Min Metric definition """ + from functools import partial from typing import TYPE_CHECKING, Callable, Optional diff --git a/ingestion/src/metadata/profiler/metrics/static/min_length.py b/ingestion/src/metadata/profiler/metrics/static/min_length.py index 5e5888ce71d3..2f76df60eb30 100644 --- a/ingestion/src/metadata/profiler/metrics/static/min_length.py +++ b/ingestion/src/metadata/profiler/metrics/static/min_length.py @@ -12,6 +12,7 @@ """ MIN_LENGTH Metric definition """ + # pylint: disable=duplicate-code @@ -23,7 +24,7 @@ from metadata.profiler.metrics.core import StaticMetric, _label from metadata.profiler.metrics.pandas_metric_protocol import PandasComputation from metadata.profiler.orm.functions.length import LenFn -from metadata.profiler.orm.registry import is_concatenable +from metadata.profiler.orm.registry import is_complex_type, is_concatenable from metadata.utils.logger import profiler_logger if TYPE_CHECKING: @@ -100,14 +101,13 @@ def update_accumulator( current_min: Optional[int], df: "pd.DataFrame", column ) -> Optional[int]: """Computes one DataFrame chunk and updates the running minimum""" + # pylint: disable=import-outside-toplevel import pandas as pd - from numpy import vectorize - length_vectorize_func = vectorize(len) chunk_min = None - if is_concatenable(column.type): - min_val = length_vectorize_func(df[column.name].dropna().astype(str)).min() + if is_concatenable(column.type) or is_complex_type(column.type): + min_val = df[column.name].dropna().astype(str).str.len().min() if not pd.isnull(min_val): chunk_min = min_val diff --git a/ingestion/src/metadata/profiler/metrics/static/not_like_count.py b/ingestion/src/metadata/profiler/metrics/static/not_like_count.py index eadec9cc4274..470f7233eacd 100644 --- a/ingestion/src/metadata/profiler/metrics/static/not_like_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/not_like_count.py @@ -12,6 +12,7 @@ """ Like Count Metric definition """ + # pylint: disable=duplicate-code from sqlalchemy import case, column @@ -19,6 +20,7 @@ from metadata.generated.schema.configuration.profilerConfiguration import MetricType from metadata.profiler.metrics.core import StaticMetric, _label from metadata.profiler.orm.functions.sum import SumFn +from metadata.profiler.orm.registry import is_complex_type class NotLikeCount(StaticMetric): @@ -46,6 +48,8 @@ def metric_type(self): @_label def fn(self): + if is_complex_type(self.col.type): + return None if not hasattr(self, "expression"): raise AttributeError( "Not Like Count requires an expression to be set: add_props(expression=...)(Metrics.notLikeCount)" diff --git a/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py b/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py index 8e4b7e33d9d2..88c4b6a7ba99 100644 --- a/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/not_regexp_match_count.py @@ -12,6 +12,7 @@ """ Regex Count Metric definition """ + # pylint: disable=duplicate-code import traceback @@ -24,7 +25,7 @@ from metadata.profiler.metrics.pandas_metric_protocol import PandasComputation from metadata.profiler.orm.functions.regexp import RegexpMatchFn from metadata.profiler.orm.functions.sum import SumFn -from metadata.profiler.orm.registry import is_concatenable +from metadata.profiler.orm.registry import is_complex_type, is_concatenable from metadata.utils.logger import profiler_logger if TYPE_CHECKING: @@ -64,6 +65,8 @@ def _is_concatenable(self): @_label def fn(self): """sqlalchemy function""" + if is_complex_type(self.col.type): + return None if not hasattr(self, "expression"): raise AttributeError( "Not Regex Count requires an expression to be set: add_props(expression=...)(Metrics.notRegexCount)" diff --git a/ingestion/src/metadata/profiler/metrics/static/null_count.py b/ingestion/src/metadata/profiler/metrics/static/null_count.py index 71f97a60e1e1..4e48c5996a4b 100644 --- a/ingestion/src/metadata/profiler/metrics/static/null_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/null_count.py @@ -12,6 +12,7 @@ """ Null Count Metric definition """ + # pylint: disable=duplicate-code from typing import TYPE_CHECKING, Optional diff --git a/ingestion/src/metadata/profiler/metrics/static/null_missing_count.py b/ingestion/src/metadata/profiler/metrics/static/null_missing_count.py index 8b19638a3dd0..de316b074c82 100644 --- a/ingestion/src/metadata/profiler/metrics/static/null_missing_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/null_missing_count.py @@ -12,6 +12,7 @@ """ Null Count Metric definition """ + # pylint: disable=duplicate-code from typing import TYPE_CHECKING, Optional diff --git a/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py b/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py index e4ab3c54796b..78c715d3aace 100644 --- a/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/regexp_match_count.py @@ -12,6 +12,7 @@ """ Regex Count Metric definition """ + # pylint: disable=duplicate-code import traceback @@ -24,7 +25,7 @@ from metadata.profiler.metrics.pandas_metric_protocol import PandasComputation from metadata.profiler.orm.functions.regexp import RegexpMatchFn from metadata.profiler.orm.functions.sum import SumFn -from metadata.profiler.orm.registry import is_concatenable +from metadata.profiler.orm.registry import is_complex_type, is_concatenable from metadata.utils.logger import profiler_logger if TYPE_CHECKING: @@ -64,6 +65,8 @@ def _is_concatenable(self): @_label def fn(self): """sqlalchemy function""" + if is_complex_type(self.col.type): + return None if not hasattr(self, "expression"): raise AttributeError( "Regex Count requires an expression to be set: add_props(expression=...)(Metrics.regexCount)" diff --git a/ingestion/src/metadata/profiler/metrics/static/row_count.py b/ingestion/src/metadata/profiler/metrics/static/row_count.py index d16311c1a8e5..b028a62e6852 100644 --- a/ingestion/src/metadata/profiler/metrics/static/row_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/row_count.py @@ -12,6 +12,7 @@ """ Table Count Metric definition """ + from typing import TYPE_CHECKING, Callable, Optional from sqlalchemy import func diff --git a/ingestion/src/metadata/profiler/metrics/static/sum.py b/ingestion/src/metadata/profiler/metrics/static/sum.py index c4d98ff7bcfe..9280f6321cc6 100644 --- a/ingestion/src/metadata/profiler/metrics/static/sum.py +++ b/ingestion/src/metadata/profiler/metrics/static/sum.py @@ -12,6 +12,7 @@ """ SUM Metric definition """ + from functools import partial from typing import TYPE_CHECKING, Callable, Optional diff --git a/ingestion/src/metadata/profiler/metrics/static/unique_count.py b/ingestion/src/metadata/profiler/metrics/static/unique_count.py index 1f646145c91d..a2b048887c6c 100644 --- a/ingestion/src/metadata/profiler/metrics/static/unique_count.py +++ b/ingestion/src/metadata/profiler/metrics/static/unique_count.py @@ -12,6 +12,7 @@ """ Unique Count Metric definition """ + import json from collections import Counter from typing import TYPE_CHECKING, Optional @@ -23,7 +24,7 @@ from metadata.profiler.metrics.core import QueryMetric from metadata.profiler.metrics.pandas_metric_protocol import PandasComputation from metadata.profiler.orm.functions.unique_count import _unique_count_query_mapper -from metadata.profiler.orm.registry import NOT_COMPUTE, Dialects +from metadata.profiler.orm.registry import NOT_COMPUTE, Dialects, is_complex_type from metadata.utils.logger import profiler_logger if TYPE_CHECKING: @@ -60,7 +61,9 @@ def query(self, sample: Optional[type], session: Optional[Session] = None): "We are missing the session attribute to compute the UniqueCount." ) - if self.col.type.__class__.__name__ in NOT_COMPUTE: + if self.col.type.__class__.__name__ in NOT_COMPUTE or is_complex_type( + self.col.type + ): return None # Run all queries on top of the sampled data diff --git a/ingestion/src/metadata/profiler/orm/registry.py b/ingestion/src/metadata/profiler/orm/registry.py index c6da40f10e1b..bfa9583d883e 100644 --- a/ingestion/src/metadata/profiler/orm/registry.py +++ b/ingestion/src/metadata/profiler/orm/registry.py @@ -116,23 +116,8 @@ class Dialects(metaclass=EnumAdapter): pass -# Sometimes we want to skip certain types for computing metrics. -# If the type is NULL, then we won't run the metric execution -# in the profiler. -# Note that not mapped types are set to NULL by default. NOT_COMPUTE = { sqlalchemy.types.NullType.__name__, - sqlalchemy.ARRAY.__name__, - sqlalchemy.JSON.__name__, - sqa_types.SQAMap.__name__, - sqa_types.SQAStruct.__name__, - sqa_types.SQASet.__name__, - sqa_types.SQAUnion.__name__, - sqa_types.SQASGeography.__name__, - DataType.GEOMETRY.value, - DataType.ARRAY.value, - DataType.JSON.value, - CustomTypes.ARRAY.value.__name__, CustomTypes.SQADATETIMERANGE.value.__name__, DataType.XML.value, CustomTypes.UNDETERMINED.value.__name__, @@ -154,6 +139,26 @@ class Dialects(metaclass=EnumAdapter): CONCATENABLE_SET = {DataType.STRING.value, DataType.TEXT.value} +COLLECTION_SET = { + sqlalchemy.ARRAY.__name__, + DataType.ARRAY.value, + CustomTypes.ARRAY.value.__name__, +} + +STRUCT_SET = { + sqlalchemy.JSON.__name__, + sqa_types.SQAMap.__name__, + sqa_types.SQAStruct.__name__, + sqa_types.SQASet.__name__, + sqa_types.SQAUnion.__name__, + DataType.JSON.value, +} + +COMPLEX_SET = { + sqa_types.SQASGeography.__name__, + DataType.GEOMETRY.value, +} + # Now, let's define some helper methods to identify # the nature of an SQLAlchemy type @@ -237,3 +242,29 @@ def is_blob(_type) -> bool: DataType.TEXT.value, } return isinstance(_type, (HexByteString, LargeBinary, Text)) + + +def is_collection(_type) -> bool: + """Check if the type is a collection, e.g. Array""" + if isinstance(_type, DataType): + return _type.value in COLLECTION_SET + return _type.__class__.__name__ in COLLECTION_SET + + +def is_struct(_type) -> bool: + """Check if the type is a struct, e.g. JSON, Map, Struct""" + if isinstance(_type, DataType): + return _type.value in STRUCT_SET + return _type.__class__.__name__ in STRUCT_SET + + +def is_complex(_type) -> bool: + """Check if the type is complex, e.g. Geography or Geometry""" + if isinstance(_type, DataType): + return _type.value in COMPLEX_SET + return _type.__class__.__name__ in COMPLEX_SET + + +def is_complex_type(_type) -> bool: + """Helper method to group collections, structs, and complex types""" + return is_collection(_type) or is_struct(_type) or is_complex(_type) diff --git a/ingestion/src/metadata/profiler/processor/core.py b/ingestion/src/metadata/profiler/processor/core.py index 3c96d663431f..ce126f051019 100644 --- a/ingestion/src/metadata/profiler/processor/core.py +++ b/ingestion/src/metadata/profiler/processor/core.py @@ -50,7 +50,7 @@ TMetric, ) from metadata.profiler.metrics.static.row_count import RowCount -from metadata.profiler.orm.registry import NOT_COMPUTE +from metadata.profiler.orm.registry import NOT_COMPUTE, is_complex_type from metadata.profiler.processor.metric_filter import MetricFilter from metadata.utils.logger import profiler_logger @@ -378,6 +378,7 @@ def _prepare_column_metrics(self) -> List: column for column in self.columns if column.type.__class__.__name__ not in NOT_COMPUTE + and not is_complex_type(column.type) ] static_metrics = [ ThreadPoolMetrics( diff --git a/openmetadata-service/src/main/java/org/openmetadata/DefaultOperationalConfigProvider.java b/openmetadata-service/src/main/java/org/openmetadata/DefaultOperationalConfigProvider.java index 34e9d54395ec..b595435ce2a2 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/DefaultOperationalConfigProvider.java +++ b/openmetadata-service/src/main/java/org/openmetadata/DefaultOperationalConfigProvider.java @@ -2,7 +2,6 @@ import com.cronutils.utils.StringUtils; import com.fasterxml.jackson.databind.ObjectMapper; -import io.dropwizard.configuration.EnvironmentVariableSubstitutor; import io.dropwizard.configuration.FileConfigurationSourceProvider; import io.dropwizard.configuration.SubstitutingSourceProvider; import io.dropwizard.configuration.YamlConfigurationFactory; @@ -17,6 +16,7 @@ import org.openmetadata.schema.api.security.OpsConfig; import org.openmetadata.schema.email.SmtpSettings; import org.openmetadata.schema.operations.OperationalConfiguration; +import org.openmetadata.service.util.YamlSafeSubstitutor; @Getter @Setter @@ -65,8 +65,7 @@ public static OperationalConfiguration readOperationsConfig(String configFilePat new YamlConfigurationFactory<>( OperationalConfiguration.class, validator, objectMapper, "dw"); return factory.build( - new SubstitutingSourceProvider( - new FileConfigurationSourceProvider(), new EnvironmentVariableSubstitutor(false)), + new SubstitutingSourceProvider(new FileConfigurationSourceProvider(), new YamlSafeSubstitutor(false)), configFilePath); } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/OpenMetadataApplication.java b/openmetadata-service/src/main/java/org/openmetadata/service/OpenMetadataApplication.java index 4b0eab6398da..8a8b56784db8 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/OpenMetadataApplication.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/OpenMetadataApplication.java @@ -15,7 +15,6 @@ import static org.openmetadata.service.util.jdbi.JdbiUtils.createAndSetupJDBI; -import io.dropwizard.configuration.EnvironmentVariableSubstitutor; import io.dropwizard.configuration.SubstitutingSourceProvider; import io.dropwizard.core.Application; import io.dropwizard.core.server.DefaultServerFactory; @@ -58,6 +57,7 @@ import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; +import org.openmetadata.service.util.YamlSafeSubstitutor; import org.eclipse.jetty.ee10.servlet.FilterHolder; import org.eclipse.jetty.ee10.servlet.ServletHandler; import org.eclipse.jetty.ee10.servlet.ServletHolder; @@ -733,8 +733,7 @@ private boolean isSamlServletRegistered( @Override public void initialize(Bootstrap bootstrap) { bootstrap.setConfigurationSourceProvider( - new SubstitutingSourceProvider( - bootstrap.getConfigurationSourceProvider(), new EnvironmentVariableSubstitutor(false))); + new SubstitutingSourceProvider(bootstrap.getConfigurationSourceProvider(), new YamlSafeSubstitutor(false))); // Register custom filter factories bootstrap diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/ConfigurationReader.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/ConfigurationReader.java index d547ab1af974..4939ed3394e6 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/ConfigurationReader.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/ConfigurationReader.java @@ -3,7 +3,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import io.dropwizard.configuration.ConfigurationException; -import io.dropwizard.configuration.EnvironmentVariableSubstitutor; import io.dropwizard.configuration.ResourceConfigurationSourceProvider; import io.dropwizard.configuration.SubstitutingSourceProvider; import io.dropwizard.configuration.YamlConfigurationFactory; @@ -13,6 +12,7 @@ import org.apache.commons.text.StringSubstitutor; import org.openmetadata.schema.api.configuration.apps.AppPrivateConfig; import org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.util.YamlSafeSubstitutor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -27,11 +27,13 @@ public ConfigurationReader(Map envMap) { // envMap is for custom environment variables (e.g., for testing), defaulting to the system // environment. substitutor = - envMap == null ? new EnvironmentVariableSubstitutor(false) : new StringSubstitutor(envMap); + envMap == null + ? new YamlSafeSubstitutor(false) + : new StringSubstitutor(envMap); } public ConfigurationReader() { - this(System.getenv()); + this(null); } public AppPrivateConfig readConfigFromResource(String appName) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/util/OpenMetadataOperations.java b/openmetadata-service/src/main/java/org/openmetadata/service/util/OpenMetadataOperations.java index 16e750427194..1f1ce1dad061 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/util/OpenMetadataOperations.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/util/OpenMetadataOperations.java @@ -17,7 +17,6 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; -import io.dropwizard.configuration.EnvironmentVariableSubstitutor; import io.dropwizard.configuration.FileConfigurationSourceProvider; import io.dropwizard.configuration.SubstitutingSourceProvider; import io.dropwizard.configuration.YamlConfigurationFactory; @@ -2871,7 +2870,7 @@ public void parseConfig() throws Exception { config = factory.build( new SubstitutingSourceProvider( - new FileConfigurationSourceProvider(), new EnvironmentVariableSubstitutor(false)), + new FileConfigurationSourceProvider(), new YamlSafeSubstitutor(false)), configFilePath); IndexMappingLoader.init(config.getElasticSearchConfiguration()); Fernet.getInstance().setFernetKey(config); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/util/YamlSafeSubstitutor.java b/openmetadata-service/src/main/java/org/openmetadata/service/util/YamlSafeSubstitutor.java new file mode 100644 index 000000000000..891d60682548 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/util/YamlSafeSubstitutor.java @@ -0,0 +1,206 @@ +package org.openmetadata.service.util; + +import io.dropwizard.configuration.EnvironmentVariableSubstitutor; +import org.apache.commons.text.TextStringBuilder; + +/** + * Extension of {@link EnvironmentVariableSubstitutor} that automatically quotes values containing + * YAML special characters. + * + *

Environment variable substitution in Dropwizard happens at the text level before YAML parsing. + * If an environment variable value contains YAML special characters (e.g., {@code >}, {@code |}, + * {@code #}, {@code {}}, {@code []}), the resulting YAML becomes malformed. + * + *

For example, a database password like {@code >8fYhD>dkcxy#HkVi8~ET} set via + * {@code DB_USER_PASSWORD} would produce: + * + *

+ * password: >8fYhD>dkcxy#HkVi8~ET
+ * 
+ * + * This would be interpreted as an empty block scalar by the YAML parser, causing a parsing error. + * This class automatically wraps such values in double quotes: + * + *
+ * password: ">8fYhD>dkcxy#HkVi8~ET"
+ * 
+ * + * Values that look like YAML flow sequences ({@code [...]}) or flow mappings ({@code {...}}) are + * left unquoted to preserve their structural meaning. + * + * @see Issue #21176 + */ +public class YamlSafeSubstitutor extends EnvironmentVariableSubstitutor { + + public YamlSafeSubstitutor(boolean strict) { + super(strict); + } + + @Override + protected String resolveVariable( + String variableName, TextStringBuilder buf, int startPos, int endPos) { + String value = super.resolveVariable(variableName, buf, startPos, endPos); + if (value == null || !needsYamlQuoting(value)) { + return value; + } + + // Don't quote values that are embedded within a larger string (e.g., JDBC URLs like + // jdbc:${DB_SCHEME:-mysql}://${DB_HOST:-localhost}:${DB_PORT:-3306}/...). + // Quoting inline values would insert literal '"' characters into the surrounding string. + if (isInlineSubstitution(buf, startPos, endPos)) { + return value; + } + + return yamlDoubleQuote(value); + } + + /** + * Checks whether the ${...} placeholder is embedded within a larger YAML value rather than being + * the entire value. A substitution is inline if there is non-whitespace content immediately before + * or after the placeholder on the same line. + */ + static boolean isInlineSubstitution(TextStringBuilder buf, int startPos, int endPos) { + if (startPos > 0) { + char before = buf.charAt(startPos - 1); + if (before != ' ' && before != '\t' && before != '\n' && before != '\r') { + return true; + } + } + if (endPos < buf.length()) { + char after = buf.charAt(endPos); + if (after != ' ' && after != '\t' && after != '\n' && after != '\r') { + return true; + } + } + return false; + } + + /** + * Wraps the value in YAML double quotes if it contains special characters that would break YAML + * parsing. Returns the value unchanged if quoting is not needed. + */ + public static String quoteIfNeeded(String value) { + if (value != null && needsYamlQuoting(value)) { + return yamlDoubleQuote(value); + } + return value; + } + + static boolean needsYamlQuoting(String value) { + if (value.isEmpty()) { + return false; + } + + // Don't quote values that look like YAML flow sequences or mappings, + // as quoting would turn them into plain strings + if ((value.startsWith("[") && value.endsWith("]")) + || (value.startsWith("{") && value.endsWith("}"))) { + return false; + } + + // Don't quote values that are already YAML-quoted. Docker Compose defaults using the + // ${VAR:- "value"} pattern produce env vars with optional leading whitespace and surrounding + // quotes (e.g., ' "opensearch"'). Re-quoting these would double-escape them. + String trimmed = value.trim(); + if (trimmed.length() >= 2) { + char qFirst = trimmed.charAt(0); + char qLast = trimmed.charAt(trimmed.length() - 1); + if ((qFirst == '"' && qLast == '"') || (qFirst == '\'' && qLast == '\'')) { + return false; + } + } + + char first = value.charAt(0); + // Characters that are YAML indicators when they appear at the start of a scalar value: + // > block scalar (folded) + // | block scalar (literal) + // * alias + // & anchor + // ! tag + // % directive + // @ reserved + // ` reserved + // ' single quote + // " double quote + // { flow mapping start (not matched by flow-mapping check above, e.g. "{broken") + // [ flow sequence start (not matched by flow-sequence check above, e.g. "[broken") + // ? mapping key indicator + // , flow collection separator + // # comment indicator + switch (first) { + case '>': + case '|': + case '*': + case '&': + case '!': + case '%': + case '@': + case '`': + case '\'': + case '"': + case '{': + case '[': + case '?': + case ',': + case '#': + return true; + default: + break; + } + + // Check for YAML reserved words that would be parsed as non-strings + String lower = value.trim().toLowerCase(); + switch (lower) { + case "true": + case "false": + case "null": + case "yes": + case "no": + case "on": + case "off": + return true; + default: + break; + } + + // "- " at start is a block sequence entry indicator + if (first == '-' && value.length() > 1 && value.charAt(1) == ' ') { + return true; + } + + // " #" anywhere starts a YAML comment (hash preceded by whitespace) + if (value.contains(" #") || value.contains("\t#")) { + return true; + } + + // ": " or ":" at end of value is a mapping indicator + if (value.contains(": ") || value.endsWith(":")) { + return true; + } + + // Leading or trailing whitespace + char last = value.charAt(value.length() - 1); + if (first == ' ' || first == '\t' || last == ' ' || last == '\t') { + return true; + } + + // Newlines would break YAML block structure + if (value.indexOf('\n') >= 0 || value.indexOf('\r') >= 0) { + return true; + } + + return false; + } + + static String yamlDoubleQuote(String value) { + // Escape backslashes first (before introducing new ones), then double quotes and newlines + String escaped = + value + .replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\t", "\\t"); + return "\"" + escaped + "\""; + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/util/YamlSafeSubstitutorTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/util/YamlSafeSubstitutorTest.java new file mode 100644 index 000000000000..d34156ef064d --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/util/YamlSafeSubstitutorTest.java @@ -0,0 +1,100 @@ +package org.openmetadata.service.util; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.apache.commons.text.TextStringBuilder; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; +import org.junit.jupiter.params.provider.ValueSource; + +class YamlSafeSubstitutorTest { + + @ParameterizedTest + @CsvSource( + value = { + "plainvalue, false", + "value_with_underscore, false", + "value-with-hyphen, false", + "12345, false", + "'', false", + "'>value', true", + "'|value', true", + "'*value', true", + "'&value', true", + "'!value', true", + "'%value', true", + "'@value', true", + "'`value', true", + "''value', true", + "'\"value', true", + "'{value', true", + "'[value', true", + "'?value', true", + "',value', true", + "'#value', true", + "'- value', true", + "'-value', false", + "'value #comment', true", + "'value#nocomment', false", + "'key: value', true", + "'key:', true", + "' key', true", + "'key ', true", + "'val\nval', true", + "'val\rval', true", + "'[flow, sequence]', false", + "'{flow: mapping}', false", + "' \"already_quoted\"', false", + "\"'already_quoted'\", false", + "true, true", + "FALSE, true", + "null, true", + "Null, true", + "yes, true", + "no, true", + "on, true", + "off, true", + "maybe, false" + }, + nullValues = {"NULL"}) + void testNeedsYamlQuoting(String value, boolean expected) { + assertEquals(expected, YamlSafeSubstitutor.needsYamlQuoting(value)); + } + + @ParameterizedTest + @CsvSource({ + "'>value', \">value\"", + "'|value', \"|value\"", + "'#value', \"#value\"", + "'key: value', \"key: value\"", + "' val ', \" val \"", + "'val\nval', \"val\\nval\"", + "'val\"quote', \"val\\\"quote\"", + "'val\\back', \"val\\\\back\"" + }) + void testYamlDoubleQuote(String input, String expected) { + assertEquals(expected, YamlSafeSubstitutor.yamlDoubleQuote(input)); + } + + @Test + void testIsInlineSubstitution() { + TextStringBuilder buf = new TextStringBuilder("jdbc:${DB_SCHEME}://localhost"); + // ${DB_SCHEME} is at [5, 17) + assertTrue(YamlSafeSubstitutor.isInlineSubstitution(buf, 5, 17)); + + buf = new TextStringBuilder("${DB_SCHEME}"); + assertFalse(YamlSafeSubstitutor.isInlineSubstitution(buf, 0, 12)); + + buf = new TextStringBuilder(" ${DB_SCHEME} "); + assertFalse(YamlSafeSubstitutor.isInlineSubstitution(buf, 2, 14)); + + buf = new TextStringBuilder("prefix${DB_SCHEME}"); + assertTrue(YamlSafeSubstitutor.isInlineSubstitution(buf, 6, 18)); + + buf = new TextStringBuilder("${DB_SCHEME}suffix"); + assertTrue(YamlSafeSubstitutor.isInlineSubstitution(buf, 0, 12)); + } +} diff --git a/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json b/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json index d9d765bafef8..53d82b1f6dca 100644 --- a/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json +++ b/openmetadata-ui/src/main/resources/ui/src/locale/languages/en-us.json @@ -851,6 +851,7 @@ "filtering-condition": "Filtering Condition", "filters-configuration": "Filters Configuration", "first": "First", + "first-name": "First Name", "first-lowercase": "first", "first-quartile": "First Quartile", "fit-to-screen": "Fit to screen", @@ -1098,6 +1099,7 @@ "large": "Large", "last": "Last", "last-14-days": "Last 14 days", + "last-name": "Last Name", "last-24-hours": "Last 24 hours", "last-30-days": "Last 30 days", "last-7-days": "Last 7 days", @@ -2602,8 +2604,8 @@ "field-timeout-description": "Connection Timeout", "field-use-aws-credentials-description": "Indicates whether to use AWS credentials when connecting to OpenSearch in AWS.", "field-use-ssl-description": "This indicates whether to use SSL when connecting to Elasticsearch. By default, we will ignore SSL settings.", - "field-value-updated-notification": "Your {{fieldName}} changed to {{fieldValue}}", "field-verify-certs-description": "This indicates whether to verify certificates when using SSL connection to Elasticsearch. It's ignored by default and is set to true. Ensure that you send the certificates in the property `CA Certificates`.", + "field-value-updated-notification": "Your {{fieldName}} changed to {{fieldValue}}", "file-format-size": "{{formats}} (max. {{size}})", "file-size-exceeded": "File size exceeds maximum allowed size of {{size}}", "filter-pattern-include-exclude-info": "Explicitly {{activity}} {{filterPattern}} by adding a list of comma-separated regex.", diff --git a/openmetadata-ui/src/main/resources/ui/src/locale/languages/fr-fr.json b/openmetadata-ui/src/main/resources/ui/src/locale/languages/fr-fr.json index 4c69290fb6a5..d8cc8aeab021 100644 --- a/openmetadata-ui/src/main/resources/ui/src/locale/languages/fr-fr.json +++ b/openmetadata-ui/src/main/resources/ui/src/locale/languages/fr-fr.json @@ -851,6 +851,7 @@ "filtering-condition": "Condition de Filtrage", "filters-configuration": "Configuration des filtres", "first": "Premier", + "first-name": "Prénom", "first-lowercase": "premier", "first-quartile": "Premier Quartile", "fit-to-screen": "Plein Écran", @@ -888,6 +889,7 @@ "full-coverage": "Couverture complète", "full-name": "Nom Complet", "full-screen": "Plein Écran", + "last-name": "Nom", "full-screen-view": "Vue Plein Écran", "full-size": "Taille complète", "fullscreen": "Plein écran", diff --git a/openmetadata-ui/src/main/resources/ui/src/pages/SignUp/BasicSignup.component.tsx b/openmetadata-ui/src/main/resources/ui/src/pages/SignUp/BasicSignup.component.tsx index 08b78ecdc94d..acab06def29f 100644 --- a/openmetadata-ui/src/main/resources/ui/src/pages/SignUp/BasicSignup.component.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/pages/SignUp/BasicSignup.component.tsx @@ -105,29 +105,25 @@ const BasicSignUp = () => { validateMessages={VALIDATION_MESSAGES} onFinish={handleSubmit}> diff --git a/scripts/VerifyYamlFix.java b/scripts/VerifyYamlFix.java new file mode 100644 index 000000000000..4043c7fbd059 --- /dev/null +++ b/scripts/VerifyYamlFix.java @@ -0,0 +1,50 @@ +public class VerifyYamlFix { + public static void main(String[] args) { + String[] testCases = { + "plainvalue", "value_with_#_hash", "{}", ">block", "|literal", "key: value", " spaced ", + "true", "FALSE", "null", "Null", "yes", "no", "on", "off", "maybe" + }; + + System.out.println("--- YAML Quoting Verification (Refined) ---"); + for (String test : testCases) { + boolean needsQuoting = needsYamlQuoting(test); + String result = needsQuoting ? yamlDoubleQuote(test) : test; + System.out.printf("In: [%s] -> Needs Quoting: %b -> Result: [%s]%n", test, needsQuoting, result); + } + } + + static boolean needsYamlQuoting(String value) { + if (value.isEmpty()) return false; + + // Flow mappings/sequences + if ((value.startsWith("[") && value.endsWith("]")) || (value.startsWith("{") && value.endsWith("}"))) return false; + + // Already quoted (simplified check for script) + String trimmed = value.trim(); + if (trimmed.length() >= 2) { + char first = trimmed.charAt(0); + char last = trimmed.charAt(trimmed.length() - 1); + if ((first == '"' && last == '"') || (first == '\'' && last == '\'')) return false; + } + + char first = value.charAt(0); + String indicators = ">|*&!%@`'\"{[?,#"; + if (indicators.indexOf(first) != -1) return true; + + // Reserved words + String lower = trimmed.toLowerCase(); + switch (lower) { + case "true": case "false": case "null": case "yes": case "no": case "on": case "off": + return true; + } + + if (value.contains(" #") || value.contains("\t#") || value.contains(": ") || value.endsWith(":") || value.indexOf('\n') >= 0 || value.indexOf('\r') >= 0) return true; + + char last = value.charAt(value.length() - 1); + return (first == ' ' || first == '\t' || last == ' ' || last == '\t'); + } + + static String yamlDoubleQuote(String value) { + return "\"" + value.replace("\\", "\\\\").replace("\"", "\\\"").replace("\n", "\\n").replace("\r", "\\r") + "\""; + } +}