|
15 | 15 | from abc import ABC, abstractmethod |
16 | 16 | from typing import Any, List, Optional, Set, Union |
17 | 17 |
|
| 18 | +from metadata.generated.schema.configuration.profilerConfiguration import ( |
| 19 | + SampleDataIngestionConfig, |
| 20 | +) |
18 | 21 | from metadata.generated.schema.entity.data.database import Database |
19 | 22 | from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema |
20 | 23 | from metadata.generated.schema.entity.data.table import ( |
@@ -240,33 +243,57 @@ def _truncate_cell(value: Any) -> Any: |
240 | 243 | return value |
241 | 244 |
|
242 | 245 | @calculate_execution_time(store=False) |
243 | | - def generate_sample_data(self) -> Optional[TableData]: |
| 246 | + def generate_sample_data( |
| 247 | + self, sample_data_config: Optional[SampleDataIngestionConfig] = None |
| 248 | + ) -> TableData: |
244 | 249 | """Fetch and ingest sample data |
245 | 250 |
|
246 | 251 | Returns: |
247 | 252 | TableData: sample data |
248 | 253 | """ |
249 | | - try: |
250 | | - logger.debug( |
251 | | - f"Fetching sample data for {self.entity.fullyQualifiedName.root}..." |
| 254 | + if sample_data_config is None: |
| 255 | + # If there is no global config, default to both storing and reading sample data |
| 256 | + # to ensure backward compatibility and availability of sample data for downstream steps |
| 257 | + sample_data_config = SampleDataIngestionConfig( |
| 258 | + storeSampleData=True, readSampleData=True |
252 | 259 | ) |
253 | | - table_data = self.fetch_sample_data(self.columns) |
254 | | - # Truncate large cell values to prevent OOM in downstream |
255 | | - # processing (NLP, serialization, etc.) |
256 | | - table_data.rows = [ |
257 | | - [self._truncate_cell(cell) for cell in row] |
258 | | - for row in table_data.rows[ |
259 | | - : min(SAMPLE_DATA_DEFAULT_COUNT, self.sample_limit) |
260 | | - ] |
261 | | - ] |
262 | | - # Only store the data if configured to do so |
263 | | - if self.storage_config: |
264 | | - upload_sample_data( |
265 | | - data=table_data, |
266 | | - entity=self.entity, |
267 | | - sample_storage_config=self.storage_config, |
| 260 | + |
| 261 | + if ( |
| 262 | + not sample_data_config.storeSampleData |
| 263 | + and not sample_data_config.readSampleData |
| 264 | + ): |
| 265 | + logger.info( |
| 266 | + "Both storing and reading of sample data are disabled. Skipping sample data generation." |
| 267 | + ) |
| 268 | + return TableData(rows=[], columns=[]) |
| 269 | + try: |
| 270 | + |
| 271 | + # Storing overrides reading: if we are storing the data, we want to fetch it |
| 272 | + # as well to pass down the pipeline. If we are not storing, but reading is enabled, |
| 273 | + # we still want to fetch the data to pass down the pipeline, but we won't store it. |
| 274 | + if sample_data_config.readSampleData or sample_data_config.storeSampleData: |
| 275 | + logger.debug( |
| 276 | + f"Fetching sample data for {self.entity.fullyQualifiedName.root}..." |
268 | 277 | ) |
269 | | - return table_data |
| 278 | + table_data = self.fetch_sample_data(self.columns) |
| 279 | + # Truncate large cell values to prevent OOM in downstream |
| 280 | + # processing (NLP, serialization, etc.) |
| 281 | + table_data.rows = [ |
| 282 | + [self._truncate_cell(cell) for cell in row] |
| 283 | + for row in table_data.rows[ |
| 284 | + : min(SAMPLE_DATA_DEFAULT_COUNT, self.sample_limit) |
| 285 | + ] |
| 286 | + ] |
| 287 | + # Only store the data if configured to do so |
| 288 | + if self.storage_config and sample_data_config.storeSampleData: |
| 289 | + upload_sample_data( |
| 290 | + data=table_data, |
| 291 | + entity=self.entity, |
| 292 | + sample_storage_config=self.storage_config, |
| 293 | + ) |
| 294 | + return table_data |
| 295 | + |
| 296 | + return TableData(rows=[], columns=[]) |
270 | 297 |
|
271 | 298 | except Exception as err: |
272 | 299 | logger.debug(traceback.format_exc()) |
|
0 commit comments