|
| 1 | +import os |
| 2 | +import tempfile |
| 3 | +from dataclasses import dataclass, field |
| 4 | +from typing import Any, Dict |
| 5 | +from urllib.parse import urlparse |
| 6 | +import pandas as pd |
| 7 | +from pandas import DataFrame |
| 8 | + |
| 9 | +from sagemaker.mlops.feature_store.feature_utils import ( |
| 10 | + start_query_execution, |
| 11 | + get_query_execution, |
| 12 | + wait_for_athena_query, |
| 13 | + download_athena_query_result, |
| 14 | +) |
| 15 | + |
| 16 | +from sagemaker.core.helper.session_helper import Session |
| 17 | + |
@dataclass
class AthenaQuery:
    """Class to manage querying of feature store data with AWS Athena.

    This class instantiates an AthenaQuery object that is used to retrieve data from feature
    store via standard SQL queries.

    Attributes:
        catalog (str): name of the data catalog.
        database (str): name of the database.
        table_name (str): name of the table.
        sagemaker_session (Session): instance of the Session class to perform boto calls.
    """

    catalog: str
    database: str
    table_name: str
    sagemaker_session: Session
    # Bookkeeping for the most recent query started by run(); all three stay None
    # until run() has been called, and are excluded from the generated __init__.
    _current_query_execution_id: str = field(default=None, init=False)
    _result_bucket: str = field(default=None, init=False)
    _result_file_prefix: str = field(default=None, init=False)

    def run(
        self, query_string: str, output_location: str, kms_key: str = None, workgroup: str = None
    ) -> str:
        """Execute a SQL query given a query string, output location and kms key.

        This method executes the SQL query using Athena and outputs the results to
        output_location and returns the execution id of the query.

        Args:
            query_string: SQL query string.
            output_location: S3 URI of the query result.
            kms_key: KMS key id. If set, will be used to encrypt the query result file.
            workgroup (str): The name of the workgroup in which the query is being started.

        Returns:
            Execution id of the query.
        """
        response = start_query_execution(
            session=self.sagemaker_session,
            catalog=self.catalog,
            database=self.database,
            query_string=query_string,
            output_location=output_location,
            kms_key=kms_key,
            workgroup=workgroup,
        )

        self._current_query_execution_id = response["QueryExecutionId"]
        # Remember where the result lands so as_dataframe() can fetch it later:
        # the URI's network location is the bucket, its path (slashes trimmed) the prefix.
        parsed_result = urlparse(output_location, allow_fragments=False)
        self._result_bucket = parsed_result.netloc
        self._result_file_prefix = parsed_result.path.strip("/")
        return self._current_query_execution_id

    def wait(self):
        """Wait for the current query to finish."""
        wait_for_athena_query(self.sagemaker_session, self._current_query_execution_id)

    def get_query_execution(self) -> Dict[str, Any]:
        """Get execution status of the current query.

        Returns:
            Response dict from Athena.
        """
        return get_query_execution(self.sagemaker_session, self._current_query_execution_id)

    def as_dataframe(self, **kwargs) -> DataFrame:
        """Download the result of the current query and load it into a DataFrame.

        The result file is downloaded into a private temporary directory that is removed
        again once the data has been read, so repeated calls do not accumulate files.

        Args:
            **kwargs (object): key arguments used for the method pandas.read_csv to be able to
                have a better tuning on data. For more info read:
                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
                ``delimiter`` and its alias ``sep`` are ignored; the result file is always
                parsed as comma-separated.

        Returns:
            A pandas DataFrame containing the query result.

        Raises:
            RuntimeError: If the current query is still queued/running, or if it finished in
                any state other than ``SUCCEEDED``.
        """
        # Function-scope import: shutil is only needed for the cleanup below.
        import shutil

        state = self.get_query_execution()["QueryExecution"]["Status"]["State"]
        if state != "SUCCEEDED":
            if state in ("QUEUED", "RUNNING"):
                raise RuntimeError(f"Query {self._current_query_execution_id} still executing.")
            raise RuntimeError(f"Query {self._current_query_execution_id} failed.")

        # A fresh private directory avoids a predictable file name in the shared temp
        # directory and gives a single thing to clean up afterwards.
        download_dir = tempfile.mkdtemp()
        try:
            output_file = os.path.join(download_dir, f"{self._current_query_execution_id}.csv")
            download_athena_query_result(
                session=self.sagemaker_session,
                bucket=self._result_bucket,
                prefix=self._result_file_prefix,
                query_execution_id=self._current_query_execution_id,
                filename=output_file,
            )
            # The delimiter is forced to "," below. Drop both spellings of the option so a
            # caller-supplied value cannot clash with it (pandas rejects sep + delimiter).
            kwargs.pop("delimiter", None)
            kwargs.pop("sep", None)
            return pd.read_csv(output_file, delimiter=",", **kwargs)
        finally:
            # Best-effort cleanup: never let a failed delete mask the read result or the
            # original exception.
            shutil.rmtree(download_dir, ignore_errors=True)
| 112 | + |
0 commit comments