kaizenflow/im_v2/common/data/client/im_raw_data_client.py at master · Dharmitha7/kaizenflow · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
"""
Thin IM client for loading raw data.

Import as:

import im_v2.common.data.client.im_raw_data_client as imvcdcimrdc
"""

import logging
from typing import List, Optional

import pandas as pd

import core.finance.bid_ask as cfibiask
import data_schema.dataset_schema_utils as dsdascut
import helpers.hdatetime as hdateti
import helpers.hdbg as hdbg
import helpers.hparquet as hparque
import helpers.hs3 as hs3
import helpers.hsql_implementation as hsqlimpl
import im_v2.common.data.transform.transform_utils as imvcdttrut
import im_v2.common.db.db_utils as imvcddbut
import im_v2.common.universe.universe as imvcounun

_LOG = logging.getLogger(__name__)


class RawDataReader:
    """
    Load the raw data from S3 or a DB based on the dataset signature.
    """

    def __init__(self, signature: str, *, stage: str = "prod", **kwargs):
        """
        Constructor.

        :param signature: dataset signature, e.g.
            `bulk.airflow.resampled_1min.pq.bid_ask.spot.v3.crypto_chassis.binance.v1_0_0`
        :param stage: which stage to execute in, determines which DB stage or S3
            bucket is used.
        """
        # Validate signature schema.
        self.dataset_schema = dsdascut.get_dataset_schema()
        self.signature = signature
        self.args = dsdascut.parse_dataset_signature_to_args(
            signature, self.dataset_schema
        )
        self.stage = stage
        self.s3_bucket_name = hs3.get_s3_bucket_from_stage(self.stage, **kwargs)
        self.dataset_epoch_unit = imvcdttrut.get_vendor_epoch_unit(
            self.args["vendor"], self.args["data_type"]
        )
        universe_list = imvcounun.get_vendor_universe(
            self.args["vendor"],
            "download",
            version=self.args["universe"].replace("_", "."),
        )
        self.universe_list = universe_list[self.args["exchange_id"]]
        if self.args["data_format"] == "parquet":
            self.partition_mode = self._get_partition_mode()
        elif self.args["data_format"] == "postgres":
            self._setup_db_table_access()
        else:
            raise ValueError("Invalid data format `%s`", self.args["data_format"])

    def read_data_head(self) -> pd.DataFrame:
        """
        Load a sample of the data.
        """
        if self.args["data_format"] == "parquet":
            data = self.load_parquet_head()
        elif self.args["data_format"] == "postgres":
            self._setup_db_table_access()
            data = self.load_db_table_head()
        else:
            raise ValueError("Invalid data format `%s`", self.args["data_format"])
        return data

    def read_data(
        self,
        start_timestamp: Optional[pd.Timestamp],
        end_timestamp: Optional[pd.Timestamp],
        *,
        currency_pairs: Optional[List[str]] = None,
        bid_ask_levels: Optional[List[int]] = None,
    ) -> pd.DataFrame:
        """
        Load data in a specified time interval.

        :param start_timestamp: start of the filtered interval
        :param end_timestamp: end of the filtered interval
        :param currency_pairs: currency_pairs to select, if None, all
            pairs are loaded
        :param bid_ask_levels: which levels of bid_ask data to load, if
            None, all levels are loaded
        """
        if currency_pairs:
            hdbg.dassert_is_subset(set(currency_pairs), set(self.universe_list))
        if self.args["data_format"] == "parquet":
            # Load the data from Parquet.
            data = self.load_parquet(
                start_timestamp,
                end_timestamp,
                currency_pairs=currency_pairs,
                bid_ask_levels=bid_ask_levels,
            )
        elif self.args["data_format"] == "postgres":
            # Load the data from DB.
            data = self.load_db_table(
                start_timestamp,
                end_timestamp,
                currency_pairs=currency_pairs,
                bid_ask_levels=bid_ask_levels,
            )
        else:
            raise ValueError("Invalid data format `%s`", self.args["data_format"])
        return data

    def load_parquet_head(self) -> pd.DataFrame:
        """
        Load the head of a sample parquet file.

        Currently using `currency_pair=ETH_USDT/year=2022/month=11/data.parquet`
        location for each data type.
        """
        # Build s3 path.
        s3_pq_file_path = self._build_s3_pq_file_path()
        _LOG.info("Loading the data from `%s`", s3_pq_file_path)
        df = hparque.from_parquet(s3_pq_file_path, n_rows=10, aws_profile="ck")
        return df

    def load_db_table_head(self) -> pd.DataFrame:
        """
        Load the head of the DB table corresponding to the dataset signature.
        """
        start_timestamp = end_timestamp = None
        df = imvcddbut.load_db_data(
            self.db_connection,
            self.table_name,
            start_timestamp,
            end_timestamp,
            limit=10,
            exchange_id=self.args["exchange_id"],
        )
        # Remove useless id column.
        df = df.drop("id", axis=1)
        return df

    def load_db_table(
        self,
        start_timestamp: Optional[pd.Timestamp],
        end_timestamp: Optional[pd.Timestamp],
        *,
        deduplicate: bool = False,
        currency_pairs: Optional[List[str]] = None,
        bid_ask_levels: Optional[List[int]] = None,
        bid_ask_format: str = "wide",
        subset: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """
        Load data from a DB table in a specified time interval.

        Refer to `read_data()` for parameter docs.
        """
        hdbg.dassert_in(bid_ask_format, ["wide", "long"])
        if not currency_pairs:
            # Database can store different universes, determine the set based
            # on current signature's universe.
            currency_pairs = self.universe_list
        data = imvcddbut.load_db_data(
            self.db_connection,
            self.table_name,
            start_timestamp,
            end_timestamp,
            currency_pairs=currency_pairs,
            bid_ask_levels=bid_ask_levels,
            exchange_id=self.args["exchange_id"],
        )
        if deduplicate:
            hdbg.dassert_is_not(
                subset,
                None,
                "subset kwarg must be provided when deduplicate=True",
            )
            data = data.drop_duplicates(subset=subset)
        if self.args["data_type"] == "bid_ask" and bid_ask_format == "wide":
            # Set timestamp as index as required by the transform function.
            data = data.set_index("timestamp", drop=True)
            data = cfibiask.transform_bid_ask_long_data_to_wide(data, "timestamp")
        return data

    def load_csv(
        self,
        currency_pair: str,
        start_timestamp: Optional[pd.Timestamp],
        end_timestamp: Optional[pd.Timestamp],
    ) -> pd.DataFrame:
        """
        Load CSV data for a specified time frame.

        :param currency_pair: currency pair to load
        :param start_timestamp: start of the filtered interval
        :param end_timestamp: end of the filtered interval
        :return: data frame
        """
        # Build s3 path.
        s3_csv_file_path = self._build_s3_csv_file_path(currency_pair)
        _LOG.info("Loading the data from `%s`", s3_csv_file_path)
        # Load the data.
        df = pd.read_csv(s3_csv_file_path)
        # Filter the data.
        if start_timestamp:
            # Convert timestamps to unix epoch.
            start_timestamp = hdateti.convert_timestamp_to_unix_epoch(
                start_timestamp, "ms"
            )
            df = df[df["timestamp"] >= start_timestamp]
        if end_timestamp:
            end_timestamp = hdateti.convert_timestamp_to_unix_epoch(
                end_timestamp, "ms"
            )
            df = df[df["timestamp"] <= end_timestamp]
        return df

    def load_parquet(
        self,
        start_timestamp: Optional[pd.Timestamp],
        end_timestamp: Optional[pd.Timestamp],
        *,
        currency_pairs: Optional[List[str]] = None,
        bid_ask_levels: Optional[List[int]] = None,
        columns: Optional[List[int]] = None,
    ) -> pd.DataFrame:
        """
        Load Parquet data for a specified time frame.

        Refer to `read_data()` for parameter docs.
        """
        if (
            start_timestamp is not None
            and end_timestamp is not None
            and self.partition_mode == "by_year_month_day"
        ):
            filters = hparque.build_filter_with_only_equalities(
                start_timestamp, end_timestamp
            )
        else:
            filters = []
        if start_timestamp:
            start_ts = hdateti.convert_timestamp_to_unix_epoch(
                start_timestamp, unit=self.dataset_epoch_unit
            )
            filters.append(("timestamp", ">=", start_ts))
        if end_timestamp:
            end_ts = hdateti.convert_timestamp_to_unix_epoch(
                end_timestamp, unit=self.dataset_epoch_unit
            )
            filters.append(("timestamp", "<=", end_ts))
        if currency_pairs:
            filters.append(("currency_pair", "in", currency_pairs))
        if bid_ask_levels:
            if "resampled" in self.args["action_tag"]:
                columns = [
                    "timestamp",
                    "currency_pair",
                    "exchange_id",
                    "knowledge_timestamp",
                    "year",
                    "month",
                ]
                for i in bid_ask_levels:
                    for s in ["open", "close", "mean"]:
                        bid_ask_cols_level = list(
                            map(
                                lambda x: f"level_{i}.{x}.{s}",
                                imvcdttrut.BID_ASK_COLS,
                            )
                        )
                        columns.extend(bid_ask_cols_level)
                    for s in ["high", "low"]:
                        bid_ask_cols_level = list(
                            map(
                                lambda x: f"level_{i}.{x}.{s}",
                                [
                                    imvcdttrut.BID_ASK_COLS[0],
                                    imvcdttrut.BID_ASK_COLS[2],
                                ],
                            )
                        )
                        columns.extend(bid_ask_cols_level)
                    for s in ["max", "min"]:
                        bid_ask_cols_level = list(
                            map(
                                lambda x: f"level_{i}.{x}.{s}",
                                [
                                    imvcdttrut.BID_ASK_COLS[1],
                                    imvcdttrut.BID_ASK_COLS[3],
                                ],
                            )
                        )
                        columns.extend(bid_ask_cols_level)
            else:
                filters.append(("level", "in", bid_ask_levels))
        s3_path = self._build_s3_pq_file_path()
        data = hparque.from_parquet(
            s3_path, filters=filters, columns=columns, aws_profile="ck"
        )
        return data

    # ///////////////////////////////////////////////////////////////////////////
    # Private methods.
    # ///////////////////////////////////////////////////////////////////////////

    def _setup_db_table_access(self) -> None:
        """
        Set up DB connection and get DB table name based on dataset signature.
        """
        self.db_connection = imvcddbut.DbConnectionManager.get_connection(
            self.stage
        )
        self.table_name = dsdascut.get_im_db_table_name_from_signature(
            self.signature, self.dataset_schema
        )
        # Check if the table name exists.
        db_tables = hsqlimpl.get_table_names(self.db_connection)
        hdbg.dassert_in(
            self.table_name, db_tables, f"`{self.table_name} doesn't exist`"
        )
        _LOG.info("Enabled connection to the `%s` DB table", self.table_name)

    def _build_s3_pq_file_path(self) -> str:
        """
        Get the path to Parquet file on S3.
        """
        s3_bucket = f"s3://{self.s3_bucket_name}"
        s3_path = dsdascut.build_s3_dataset_path_from_args(s3_bucket, self.args)
        return s3_path

    def _build_s3_csv_file_path(self, currency_pair: str) -> str:
        """
        Get the path to CSV file on S3.

        :param currency_pair: currency pair to load
        :return: s3 path
        """
        s3_bucket = f"s3://{self.s3_bucket_name}"
        _LOG.debug("s3_bucket=%s", s3_bucket)
        s3_path = dsdascut.build_s3_dataset_path_from_args(s3_bucket, self.args)
        _LOG.debug("s3_path=%s", s3_path)
        s3_path += f"/{currency_pair}.csv.gz"
        return s3_path

    def _get_partition_mode(self) -> str:
        """
        Get the partition_mode from the directory structure.

        This can help in filtering.
        """
        # TODO(Juraj): heuristically set based on our current datasets,
        # non-heuristic approach we used to apply causes super-slow
        # performance for large datasets, see #CmTask8306.
        if (self.args["data_type"] in ["bid_ask", "trades"]) and self.args[
            "action_tag"
        ] != "resampled_1min":
            partition_mode = "by_year_month_day"
        else:
            partition_mode = "by_year_month"
        return partition_mode


# /////////////////////////////////////////////////////////////////////////////

# TODO(gp): -> raw_data_client_examples.py


def get_bid_ask_realtime_raw_data_reader(
    stage: str, data_vendor: str, universe_version: str, exchange_id: str
) -> RawDataReader:
    """
    Get raw data reader for the real-time bid/ask price data.

    Currently the DB signature is hardcoded.

    :param stage: which stage to execute in, determines which DB stage
        or S3 bucket is used.
    :param data_vendor: provider of the realtime data, e.g. CCXT or
        Binance
    :param universe_version: version of the universe
    :param exchange_id: exchange to get data from
    :return: RawDataReader initialized for the realtime bid/ask data
    """
    universe_version = universe_version.replace(".", "_")
    data_vendor = data_vendor.lower()
    bid_ask_db_signature = f"realtime.airflow.downloaded_200ms.postgres.bid_ask.futures.{universe_version}.{data_vendor}.{exchange_id}.v1_0_0"
    bid_ask_raw_data_reader = RawDataReader(bid_ask_db_signature, stage=stage)
    return bid_ask_raw_data_reader