-
Notifications
You must be signed in to change notification settings - Fork 29.2k
Expand file tree
/
Copy pathflow.py
More file actions
81 lines (70 loc) · 3.5 KB
/
flow.py
File metadata and controls
81 lines (70 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from dataclasses import dataclass
from typing import Callable, Dict, List, Literal, Optional
from pyspark.sql import DataFrame
from pyspark.sql import Column
from pyspark.pipelines.source_code_location import SourceCodeLocation
# Type of a flow's query function: a callable that takes no arguments and returns a DataFrame.
QueryFunction = Callable[[], DataFrame]
@dataclass(frozen=True)
class Flow:
    """A single flow in a pipeline dataflow graph.

    A flow describes how one particular dataset gets updated. The dataclass is frozen, so
    fields cannot be reassigned once an instance has been constructed.

    :param name: Name identifying the flow.
    :param target: Name of the target dataset that this flow writes to.
    :param spark_conf: Spark configuration to set on the flow, given as a dict that maps
        configuration property names to property values.
    :param source_code_location: Where in the source code this flow was created.
    :param func: The function defining the flow. It is called with no arguments and should
        return a DataFrame.
    """

    name: str
    target: str
    spark_conf: Dict[str, str]
    source_code_location: SourceCodeLocation
    func: QueryFunction
@dataclass(frozen=True)
class AutoCdcFlow:
    """An Auto CDC flow in a pipeline dataflow graph.

    Auto CDC flows take Change Data Capture (CDC) events from a source and apply them to a
    target streaming table. The dataclass is frozen, so fields cannot be reassigned once an
    instance has been constructed.

    :param name: Name of the flow, or None. None is stored as-is and means the flow name
        defaults to the target name.
    :param target: Name of the streaming table the flow targets.
    :param source: Name of the CDC source that is streamed from.
    :param keys: The column(s) that uniquely identify a row in both source and target data.
    :param sequence_by: Expression by which the source data is ordered.
    :param apply_as_deletes: Delete condition for the merged operation, or None.
    :param apply_as_truncates: Truncate condition for the merged operation, or None.
    :param column_list: Columns to include in the output table, or None.
    :param except_column_list: Columns to exclude from the output table, or None.
    :param stored_as_scd_type: SCD type for the target table, or None. Only type 1 is
        supported, written either as the int ``1`` or the string ``"1"``.
    :param ignore_null_updates_column_list: Subset of columns for which nulls in updates are
        ignored, or None.
    :param ignore_null_updates_except_column_list: Columns excluded from null-ignore in
        updates, or None.
    :param source_code_location: Where in the source code this flow was created.
    """

    # Identity and endpoints of the flow.
    name: Optional[str]
    target: str
    source: str

    # Row identity and ordering of the incoming change events.
    keys: List[Column]
    sequence_by: Column

    # Optional conditions for the merged operation.
    apply_as_deletes: Optional[Column]
    apply_as_truncates: Optional[Column]

    # Optional shaping of the output table.
    column_list: Optional[List[Column]]
    except_column_list: Optional[List[Column]]
    stored_as_scd_type: Optional[Literal[1, "1"]]

    # Optional control over ignoring nulls in updates.
    ignore_null_updates_column_list: Optional[List[Column]]
    ignore_null_updates_except_column_list: Optional[List[Column]]

    source_code_location: SourceCodeLocation