Skip to content

Commit 80635c7

Browse files
fix(ingestion): prevent avro parser from leaking file content in warning logs (#24798)
1 parent: 39e5cc0 | commit: 80635c7

3 files changed

Lines changed: 21 additions & 6 deletions

File tree

ingestion/src/metadata/parsers/avro_parser.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
import traceback
1717
from typing import List, Optional, Tuple, Type, Union
1818

19+
import avro.errors
1920
import avro.schema as avroschema
2021
from avro.schema import ArraySchema, RecordSchema, Schema, UnionSchema
2122
from pydantic import BaseModel
@@ -260,9 +261,11 @@ def parse_avro_schema(
260261
)
261262
]
262263
return models
264+
except avro.errors.SchemaParseException:
265+
logger.warning("Unable to parse the avro schema: SchemaParseException")
263266
except Exception as exc: # pylint: disable=broad-except
264267
logger.debug(traceback.format_exc())
265-
logger.warning(f"Unable to parse the avro schema: {exc}")
268+
logger.warning(f"Unable to parse the avro schema: {type(exc).__name__}")
266269
return None
267270

268271

@@ -303,7 +306,9 @@ def get_avro_fields(
303306
)
304307
else:
305308
field_models.append(parse_single_field(field, cls=cls))
309+
except avro.errors.SchemaParseException:
310+
logger.warning("Unable to parse the avro schema into models: SchemaParseException")
306311
except Exception as exc: # pylint: disable=broad-except
307312
logger.debug(traceback.format_exc())
308-
logger.warning(f"Unable to parse the avro schema into models: {exc}")
313+
logger.warning(f"Unable to parse the avro schema into models: {type(exc).__name__}")
309314
return field_models

ingestion/src/metadata/readers/dataframe/avro.py

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@
1212
"""
1313
Avro DataFrame reader - streams records in batches to avoid OOM
1414
"""
15-
import traceback
1615
from functools import singledispatchmethod
1716
from typing import Iterator, List, Optional
1817

@@ -94,9 +93,8 @@ def _get_avro_columns(file_obj) -> Optional[List[Column]]:
9493
writer_schema = json.dumps(reader.writer_schema)
9594

9695
return parse_avro_schema(schema=writer_schema, cls=Column)
97-
except Exception as warn:
98-
logger.warning(f"Error reading Avro schema: {warn}")
99-
logger.debug(traceback.format_exc())
96+
except Exception:
97+
logger.warning("Error reading Avro schema")
10098
return None
10199

102100
@singledispatchmethod

ingestion/tests/unit/test_avro_parser.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
"""
1313
Avro parser tests
1414
"""
15+
import logging
1516
from unittest import TestCase
1617

1718
from metadata.parsers.avro_parser import parse_avro_schema
@@ -823,3 +824,14 @@ def test_recursive_issue_parsing(self):
823824
.children[0]
824825
.children
825826
)
827+
828+
def test_parse_failure_does_not_leak_schema_content(self):
    """
    Parsing the payload below must return None, and neither its key
    nor its value may appear in any captured log line
    """
    sensitive_schema = '{"secret_key": "super_secret_value_12345"}'

    # Capture records at DEBUG level and above from the ingestion logger
    with self.assertLogs("metadata.Ingestion", level="DEBUG") as captured:
        parsed = parse_avro_schema(sensitive_schema)

    self.assertIsNone(parsed)

    # Check the value first, then the key, for each captured line
    for entry in captured.output:
        for forbidden in ("super_secret_value_12345", "secret_key"):
            self.assertNotIn(forbidden, entry)

0 commit comments

Comments
 (0)