TutorTask541: Reviewer Changes

indrayudd · indrayudd · commit 52a73f6febd0 · 2025-06-01T06:14:50.000-04:00
Pre-commit checks:
All checks passed ✅
diff --git a/causal_automl/postprocess_gridstatus_metadata.py b/causal_automl/postprocess_gridstatus_metadata.py
@@ -1,30 +1,36 @@
+#!/usr/bin/env python
 """
+Convert the Gridstatus metadata from a dataset-per-row schema into a
+series-per-row schema and upload the result back into the same S3 bucket.
+
+> python causal_automl/postprocess_gridstatus_metadata.py \
+    --aws_profile ck \
+    --bucket_path s3://causify-data-collaborators/causal_automl/metadata/ \
+    --input_version v1.0 \
+    --output_version v2.0
+
 Import as:
 
 import causal_automl.postprocess_gridstatus_metadata as capogrme
 """
 
+import argparse
 import ast
 import io
 import logging
 import os
 import re
-from typing import Dict, Iterable, List
+from typing import Any, Dict, List
 
 import helpers.hdbg as hdbg
-import helpers.henv as henv
 import helpers.hio as hio
-import helpers.hpandas as hpandas
 import helpers.hs3 as hs3
 import pandas as pd
 
 # Configure logger.
 hdbg.init_logger(verbosity=logging.INFO)
 _LOG = logging.getLogger(__name__)
 
-# Print system signature.
-_LOG.info("%s", henv.get_system_signature()[0])
-
 
 # #############################################################################
 # _GridstatusMetadataWriter
@@ -36,16 +42,23 @@ class _GridstatusMetadataWriter:
     Save Gridstatus metadata and upload to S3.
     """
 
-    def __init__(self, bucket_path: str, aws_profile: str) -> None:
+    def __init__(
+        self,
+        bucket_path: str,
+        aws_profile: str,
+        cache_dir: str = "tmp.download_metadata_cache/",
+    ) -> None:
         """
-        Initialize the writer for saving metadata and facet values to S3.
+        Initialize the writer for saving postprocessed metadata to S3.
 
         :param bucket_path: base S3 path where files will be uploaded
             (e.g., "s3://bucket/dir/")
         :param aws_profile: AWS CLI profile name used for authentication
+        :param cache_dir: local directory where the CSV is saved before upload
         """
         self._bucket_path = bucket_path
         self._aws_profile = aws_profile
+        self.cache_dir = cache_dir
 
     def write_df_to_s3(self, df: pd.DataFrame, file_name: str) -> None:
         """
@@ -54,8 +67,7 @@ def write_df_to_s3(self, df: pd.DataFrame, file_name: str) -> None:
         :param df: data to be saved to S3
         :param file_name: local file name for saving
         """
-        cache_dir = "tmp.download_metadata_cache/"
-        local_file_path = os.path.join(cache_dir, file_name)
+        local_file_path = os.path.join(self.cache_dir, file_name)
         hio.create_dir(os.path.dirname(local_file_path), incremental=True)
         # Save CSV locally.
         df.to_csv(local_file_path, index=False)
@@ -66,18 +78,17 @@ def write_df_to_s3(self, df: pd.DataFrame, file_name: str) -> None:
         _LOG.debug("Uploaded to S3: %s", bucket_file_path)
 
 
-def _load_data(file_path: str) -> pd.DataFrame:
+def _load_data(file_path: str, aws_profile: str) -> pd.DataFrame:
     """
-    Load data from file path to a dataframe.
+    Load data from S3 path to a dataframe.
 
-    :param file_path: path of the data to load from
-    :return: dataframe of the loaded data
+    :param file_path: S3 path of the data to load from
+    :param aws_profile: AWS CLI profile used to access the S3 bucket
+    :return: the loaded metadata as a dataframe
     """
-    file = hs3.from_file(file_path, aws_profile="ck")
+    file = hs3.from_file(file_path, aws_profile=aws_profile)
     df = pd.read_csv(io.StringIO(file))
-    _LOG.info("shape: %s", df.shape)
-    _LOG.info("columns: %s", df.columns)
-    _LOG.info("df: \n %s", hpandas.df_to_str(df, log_level=logging.INFO))
+    _LOG.info("Data Successfully Downloaded.")
     return df
 
 
@@ -91,34 +102,60 @@ def _prettify(col: str) -> str:
     :return: prettified column name
     """
     tokens = re.sub(r"[_\s]+", " ", col).strip().split()
-    return " ".join(t.capitalize() for t in tokens)
+    prettified = " ".join(t.capitalize() for t in tokens)
+    return prettified
 
 
 def _build_series_row(
     base_row: pd.Series,
     col_name: str,
     dataset_id: str,
     dataset_name: str,
-) -> Dict[str, object]:
+) -> Dict[str, Any]:
     """
-    Build new rows with the `id_series` and `num_series` columns.
+    Build a new row with the `id_series` and `name_series` columns.
 
     :param base_row: original row
     :param col_name: column name to prettify
+    :param dataset_id: id of the dataset the series belongs to
+    :param dataset_name: name of the dataset the series belongs to
+    :return: original row as a dict with the two series fields added
     """
-    nice_col_name = _prettify(col_name)
     # Start with the original row.
     new_row: Dict[str, object] = base_row.to_dict()
     # Add the two series identifiers.
     new_row["id_series"] = f"{dataset_id}.{col_name}"
-    new_row["name_series"] = f"{dataset_name} / {nice_col_name}"
+    new_row["name_series"] = f"{dataset_name} / {_prettify(col_name)}"
     return new_row
 
 
-def _explode_dataset_row(row: pd.Series) -> Iterable[Dict[str, object]]:
+def _explode_dataset_row(row: pd.Series) -> List[Dict[str, Any]]:
     """
     Transform a single row into the row-per-series view.
 
+    E.g.,
+    Input row:
+    id                                      name                    ....
+    caiso_as_prices                         CAISO AS Prices         ....
+
+    Output rows:
+    id                                      name                    ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    with two additional columns:
+    id_series                               name_series
+    caiso_as_prices.non_spinning_reserves   CAISO AS Prices / Non Spinning Reserves
+    caiso_as_prices.regulation_down         CAISO AS Prices / Regulation Down
+    caiso_as_prices.regulation_mileage_down CAISO AS Prices / Regulation Mileage Down
+    caiso_as_prices.regulation_mileage_up   CAISO AS Prices / Regulation Mileage Up
+    caiso_as_prices.regulation_up           CAISO AS Prices / Regulation Up
+    caiso_as_prices.spinning_reserves       CAISO AS Prices / Spinning Reserves
+
+
     :param row: row to transform
     :return: the exploded row
     """
@@ -127,47 +164,98 @@ def _explode_dataset_row(row: pd.Series) -> Iterable[Dict[str, object]]:
     # Ignore primary key columns.
     ignore_cols = set(ast.literal_eval(row["primary_key_columns"]))
     # Iterate through all columns and generate the row-per-series view.
+    exploded: List[Dict[str, Any]] = []
     for col_meta in ast.literal_eval(row["all_columns"]):
         col_name: str = col_meta["name"]
         if col_meta.get("is_datetime") or col_name in ignore_cols:
             continue
-        yield _build_series_row(row, col_name, dataset_id, dataset_name)
+        exploded.append(
+            _build_series_row(row, col_name, dataset_id, dataset_name)
+        )
+    return exploded
 
 
 def create_series_metadata(df: pd.DataFrame) -> pd.DataFrame:
     """
     Transform the whole dataset into the row-per-series view.
 
+    E.g.,
+    Input dataset:
+    id                                      name                    ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    ...
+
+    Output dataset:
+    id_series                               name_series
+    caiso_as_prices.non_spinning_reserves   CAISO AS Prices / Non Spinning Reserves
+    caiso_as_prices.regulation_down         CAISO AS Prices / Regulation Down
+    caiso_as_prices.regulation_mileage_down CAISO AS Prices / Regulation Mileage Down
+    caiso_as_prices.regulation_mileage_up   CAISO AS Prices / Regulation Mileage Up
+    caiso_as_prices.regulation_up           CAISO AS Prices / Regulation Up
+    caiso_as_prices.spinning_reserves       CAISO AS Prices / Spinning Reserves
+    ...
+    followed by the original columns:
+    id                                      name                    ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    caiso_as_prices                         CAISO AS Prices         ....
+    ...
+
     :param df: data to transform
     :return: transformed data
     """
-    exploded_rows: List[Dict[str, object]] = [
-        row
-        for _, dataset_row in df.iterrows()
-        for row in _explode_dataset_row(dataset_row)
-    ]
+    exploded_rows: List[Dict[str, Any]] = []
+    for _, dataset_row in df.iterrows():
+        exploded_rows.extend(_explode_dataset_row(dataset_row))
     result = pd.DataFrame(exploded_rows)
     # Arrange according to desired ordering.
     leading = ["id_series", "name_series"]
     remaining = [c for c in result.columns if c not in leading]
-    return result[leading + remaining]
+    transformed_df = result[leading + remaining]
+    return transformed_df
 
 
-# Main flow.
-if __name__ == "__main__":
-    # Configure S3.
-    aws_profile = "ck"
-    bucket_root = hs3.get_s3_bucket_path(aws_profile)
-    bucket_path = "s3://causify-data-collaborators/causal_automl/metadata/"
-    file_name = "gridstatus_metadata_original_v2.0.csv"
-    writer = _GridstatusMetadataWriter(bucket_path, aws_profile)
+def _parse() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--aws_profile", default="ck", help="AWS CLI profile for authentication"
+    )
+    parser.add_argument(
+        "--bucket_path",
+        default="s3://causify-data-collaborators/causal_automl/metadata/",
+        help="Destination S3 directory (trailing slash optional)",
+    )
+    parser.add_argument(
+        "--input_version",
+        default="v1.0",
+        help="Version of the source metadata file",
+    )
+    parser.add_argument(
+        "--output_version", default="v2.0", help="Version tag for the result file"
+    )
+    return parser.parse_args()
+
+
+def _main(args: argparse.Namespace) -> None:
     # Load data.
-    v1_path = (
-        "s3://causify-data-collaborators/causal_automl/metadata/"
-        "gridstatus_metadata_original_v1.0.csv"
+    src_file = (
+        f"{args.bucket_path.rstrip('/')}/gridstatus_metadata_original_"
+        f"{args.input_version}.csv"
     )
-    gs_meta = _load_data(v1_path)
+    gs_meta = _load_data(src_file, args.aws_profile)
     # Transform data to a row-per-series view.
     gs_meta_rps = create_series_metadata(gs_meta)
     # Save transformed dataset to S3.
-    writer.write_df_to_s3(gs_meta_rps, file_name)
+    writer = _GridstatusMetadataWriter(args.bucket_path, args.aws_profile)
+    dst_file = f"gridstatus_metadata_original_{args.output_version}.csv"
+    writer.write_df_to_s3(gs_meta_rps, dst_file)
+
+
+if __name__ == "__main__":
+    _main(_parse())