Skip to content

Commit a9e1e12

Browse files
committed
Add dropbox recipes for CCF water level @up & down
DD-285
1 parent d3b14a7 commit a9e1e12

2 files changed

Lines changed: 156 additions & 0 deletions

File tree

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
2+
3+
dropbox_home: temp/dropbox # This is a placeholder. In practice, this would be set to the actual path of the dropbox home directory on the user's system.
4+
modeling_data: //cnrastore-bdo/Modeling_Data/
5+
target_tz: "Etc/GMT+8" # Odd, but the correct one in POSIX/pandas for PST.
6+
target_tz_label: PST
7+
8+
9+
data:
10+
- name: ccf_ele
11+
skip: False
12+
collect:
13+
file_pattern: "CC_Water_Levels_2024*.csv"
14+
wildcard: time_overlap # time_sharded | time_overlap | data | none.
15+
# If time_sharded, indicates non-overlapping timestamps that use year chunking.
16+
# If time_overlap, indicates wildcards are ordered by date, e.g. 20250531 (lexicographic order must equal chronological order).
17+
# If data, represents different station/variable. Look for metadata in the file and station lists.
18+
# If none, a wildcard is an error. # todo: test
19+
location: "${modeling_data}/hydraulic_structures/incoming" # This could be understood as a default
20+
recursive_search: False
21+
reader: read_ts # Names, pointers to code etc. To be fleshed out
22+
reader_args:
23+
names: ["datetime", "up_ele","down_ele"]
24+
na_values: ["(null)", "null", "NULL", ""]
25+
hint: "resort"
26+
force_regular: False
27+
merge_method: "ts_splice" # We will not call read_ts on the wildcard, but rather glob, read each file, and create a list of dfs
28+
merge_args:
29+
transition: prefer_first
30+
selector: down_ele # Usually a column name. When read_last_resort_csv is used, use null.
31+
transforms:
32+
- name: dst_tz # SCADA is delivered in local time. Often this is not needed for better data sources
33+
args:
34+
src_tz: US/Pacific
35+
target_tz: ${target_tz}
36+
- name: coarsen
37+
args:
38+
grid: 2min
39+
preserve_vals: [0.0]
40+
qwidth: 0.01
41+
hyst: 0.5
42+
heartbeat_freq: 60min
43+
metadata:
44+
station_id: clc
45+
structure_id: ccfb
46+
agency_id: ccfb
47+
processor: dms
48+
agency: dwr
49+
source: dwr
50+
freq: None # None for irregular, "infer" for infer.
51+
param: elevation
52+
unit: ft
53+
subloc: down
54+
time_zone: ${target_tz}
55+
time_zone_label: ${target_tz_label}
56+
latitude: registry_lookup # projected will be inferred. User changes should be made to projected, not latitude/longitude.
57+
longitude: registry_lookup # What happens when this and the registry entry have conflicting metadata for something like latitude?
58+
# seems like this prevails, but we should prevent conflicts.
59+
60+
output:
61+
repo_name: proprietary_formatted # This is a pointer to an entry in dstore_config.yaml
62+
staging: # This is where the read and transformed time series will be staged.
63+
# It will be used to update the repo, but that uses update_repo and may not mean a full replacement
64+
dir: ${modeling_data}/repo/continuous/proprietary/staging
65+
write_args:
66+
float_format: "%0.3f"
67+
chunk_years: False
68+
reconcile:
69+
repo_data_dir: ${modeling_data}/repo/continuous/proprietary/formatted # This is a practice directory so you don't go overwriting a real repo while developing the recipe
70+
# When omitted the target location is the repo_root as configured in dstore_config.yaml
71+
prefer: staged # This is an argument to the update process:
72+
# - staged means "prefer the new stuff we staged"
73+
# - repo means "prefer the stuff in the repo, just top off with any new time stamps"
74+
allow_new_series: true
75+
inspection: # update parameters. These should be included but seldom change
76+
recent_years: 3 # Data up to 3 years old will be checked whenever an update is performed
77+
p3: 0.15 # probability of a spot check for data 3-10 years old; differences will trigger full replacement of all years
78+
p10: 0.05 # probability of a spot check for data 10+ years old; differences will trigger full replacement of all years
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
2+
3+
dropbox_home: temp/dropbox # This is a placeholder. In practice, this would be set to the actual path of the dropbox home directory on the user's system.
4+
modeling_data: //cnrastore-bdo/Modeling_Data/
5+
target_tz: "Etc/GMT+8" # Odd, but the correct one in POSIX/pandas for PST.
6+
target_tz_label: PST
7+
8+
9+
data:
10+
- name: ccf_ele
11+
skip: False
12+
collect:
13+
file_pattern: "CC_Water_Levels_2024*.csv"
14+
wildcard: time_overlap # time_sharded | time_overlap | data | none.
15+
# If time_sharded, indicates non-overlapping timestamps that use year chunking.
16+
# If time_overlap, indicates wildcards are ordered by date, e.g. 20250531 (lexicographic order must equal chronological order).
17+
# If data, represents different station/variable. Look for metadata in the file and station lists.
18+
# If none, a wildcard is an error. # todo: test
19+
location: "${modeling_data}/hydraulic_structures/incoming" # This could be understood as a default
20+
recursive_search: False
21+
reader: read_ts # Names, pointers to code etc. To be fleshed out
22+
reader_args:
23+
names: ["datetime", "up_ele", "down_ele"]
24+
na_values: ["(null)", "null", "NULL", ""]
25+
hint: "resort"
26+
force_regular: False
27+
merge_method: "ts_splice" # We will not call read_ts on the wildcard, but rather glob, read each file, and create a list of dfs
28+
merge_args:
29+
transition: prefer_first
30+
selector: up_ele # Usually a column name. When read_last_resort_csv is used, use null.
31+
transforms:
32+
- name: dst_tz # SCADA is delivered in local time. Often this is not needed for better data sources
33+
args:
34+
src_tz: US/Pacific
35+
target_tz: ${target_tz}
36+
- name: coarsen
37+
args:
38+
grid: 2min
39+
preserve_vals: [0.0]
40+
qwidth: 0.01
41+
hyst: 0.5
42+
heartbeat_freq: 60min
43+
metadata:
44+
station_id: clc
45+
structure_id: ccfb
46+
agency_id: ccfb
47+
processor: dms
48+
agency: dwr
49+
source: dwr
50+
freq: None # None for irregular, "infer" for infer.
51+
param: elevation
52+
unit: ft
53+
subloc: up
54+
time_zone: ${target_tz}
55+
time_zone_label: ${target_tz_label}
56+
latitude: registry_lookup # projected will be inferred. User changes should be made to projected, not latitude/longitude.
57+
longitude: registry_lookup # What happens when this and the registry entry have conflicting metadata for something like latitude?
58+
# seems like this prevails, but we should prevent conflicts.
59+
60+
output:
61+
repo_name: proprietary_formatted # This is a pointer to an entry in dstore_config.yaml
62+
staging: # This is where the read and transformed time series will be staged.
63+
# It will be used to update the repo, but that uses update_repo and may not mean a full replacement
64+
dir: ${modeling_data}/repo/continuous/proprietary/staging
65+
write_args:
66+
float_format: "%0.3f"
67+
chunk_years: False
68+
reconcile:
69+
repo_data_dir: ${modeling_data}/repo/continuous/proprietary/formatted # This is a practice directory so you don't go overwriting a real repo while developing the recipe
70+
# When omitted the target location is the repo_root as configured in dstore_config.yaml
71+
prefer: staged # This is an argument to the update process:
72+
# - staged means "prefer the new stuff we staged"
73+
# - repo means "prefer the stuff in the repo, just top off with any new time stamps"
74+
allow_new_series: true
75+
inspection: # update parameters. These should be included but seldom change
76+
recent_years: 3 # Data up to 3 years old will be checked whenever an update is performed
77+
p3: 0.15 # probability of a spot check for data 3-10 years old; differences will trigger full replacement of all years
78+
p10: 0.05 # probability of a spot check for data 10+ years old; differences will trigger full replacement of all years

0 commit comments

Comments
 (0)