Skip to content

Commit 4712eea

Browse files
Config update (#169)
Simplified config structure and updated end date value. Removed individual metric output files. Added some missing docstrings.
1 parent 605ebc9 commit 4712eea

File tree

6 files changed

+101
-158
lines changed

6 files changed

+101
-158
lines changed

config.yaml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,19 +27,19 @@ ledgers:
2727
- tezos
2828
- zcash
2929

30-
# Execution flags
31-
execution_flags:
32-
force_map: false
30+
# Flag that controls whether to force mapping or use already processed data
31+
# when available.
32+
force_map: false
3333

34-
# Analyze flags
35-
analyze_flags:
36-
clustering: true
34+
# Flag that determines whether to cluster block producers into entities that
35+
# control them.
36+
clustering: true
3737

3838
# The timeframe for which an analysis should be performed.
3939
# Each date is a string of the form YYYY-MM-DD.
4040
timeframe:
4141
start_date: 2018-01-01
42-
end_date: 2025-03-01
42+
end_date: 2026-02-01
4343

4444
# The number of days to use for the estimation window, i.e. how many days of blocks to use for each data point.
4545
# If left empty, then the entire time frame will be used (only valid when combined with empty frequency).

consensus_decentralization/analyze.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,18 +82,15 @@ def analyze(projects, aggregated_data_filename, input_dir, output_dir, populatio
8282
csv_contents[metric_name][row_index + 1].append(result)
8383
aggregate_output[project][date][metric_name] = result
8484

85-
for metric in metric_names:
86-
with open(output_dir / f'{metric}.csv', 'w') as f:
87-
csv_writer = csv.writer(f)
88-
csv_writer.writerows(csv_contents[metric])
89-
9085
aggregate_csv_output = [['ledger', 'date', 'clustering'] + metric_names]
9186
for project, timeframes in aggregate_output.items():
9287
for date, results in timeframes.items():
9388
metric_values = [results[metric] for metric in metric_names]
9489
if any(metric_values):
9590
aggregate_csv_output.append([project, date, clustering_flag] + metric_values)
96-
with open(output_dir / 'output.csv', 'w') as f:
91+
92+
output_filename = hlp.get_output_filename(clustering_flag)
93+
with open(output_dir / output_filename, 'w') as f:
9794
csv_writer = csv.writer(f)
9895
csv_writer.writerows(aggregate_csv_output)
9996

consensus_decentralization/helper.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,7 @@ def get_force_map_flag():
428428
"""
429429
config = get_config_data()
430430
try:
431-
return config['execution_flags']['force_map']
431+
return config['force_map']
432432
except KeyError:
433433
raise ValueError('Flag "force_map" missing from config file')
434434

@@ -441,7 +441,7 @@ def get_clustering_flag():
441441
"""
442442
config = get_config_data()
443443
try:
444-
return config['analyze_flags']['clustering']
444+
return config['clustering']
445445
except KeyError:
446446
raise ValueError('Flag "clustering" missing from config file')
447447

@@ -465,6 +465,15 @@ def get_mapped_data_filename(clustering_flag):
465465
return 'mapped_data_' + ('clustered' if clustering_flag else 'non_clustered') + '.json'
466466

467467

468+
def get_output_filename(clustering_flag):
469+
"""
470+
Retrieves the filename of the output file
471+
:param clustering_flag: boolean that determines whether clustering was performed
472+
:returns: str with the filename of the output file, which depends on whether clustering was performed
473+
"""
474+
return 'output_' + ('clustered' if clustering_flag else 'non_clustered') + '.csv'
475+
476+
468477
def get_input_directories():
469478
"""
470479
Reads the config file and retrieves the directories to look for raw block data

run.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ def process_data(force_map, ledger_dir, ledger, output_dir):
1919
return None
2020

2121

22-
def main(ledgers, timeframe, estimation_window, frequency, population_windows, interim_dir=hlp.INTERIM_DIR,
23-
results_dir=hlp.RESULTS_DIR):
22+
def main(ledgers, timeframe, estimation_window, frequency, population_windows,
23+
force_map, interim_dir=hlp.INTERIM_DIR, results_dir=hlp.RESULTS_DIR):
2424
"""
2525
Executes the entire pipeline (parsing, mapping, analyzing) for some projects and timeframes.
2626
:param ledgers: list of strings that correspond to the ledgers whose data should be analyzed
@@ -30,13 +30,18 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
3030
timeframe will be considered.
3131
:param frequency: int or None. The number of days to consider for the frequency of the analysis (i.e. the number
3232
of days between each data point considered in the analysis). If None, only one data point will be considered,
33-
spanning the entire timeframe (i.e. it needs to be combined with None estimation_window).
34-
:param interim_dir: pathlib.PosixPath object of the directory where the output data will be saved
33+
spanning the entire timeframe (i.e. it needs to be combined with None
34+
estimation_window).
35+
:param population_windows: int. The number of windows to look backwards and forwards to determine the population of
36+
active block producers for a given time period.
37+
:param force_map: bool. If True, then the mapping will be performed,
38+
regardless of whether mapped data for the project already exist.
39+
:param interim_dir: pathlib.PosixPath object of the directory where the
40+
output data will be saved
41+
:param results_dir: pathlib.PosixPath object of the directory where the results will be saved
3542
"""
3643
logging.info(f"The ledgers that will be analyzed are: {','.join(ledgers)}")
3744

38-
force_map = hlp.get_force_map_flag()
39-
4045
for ledger in list(ledgers):
4146
ledger_dir = interim_dir / ledger
4247
ledger_dir.mkdir(parents=True, exist_ok=True) # create ledger output directory if it doesn't already exist
@@ -89,6 +94,7 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
8994

9095
estimation_window, frequency = hlp.get_estimation_window_and_frequency()
9196
population_windows = hlp.get_population_windows()
97+
force_map_flag = hlp.get_force_map_flag()
9298

9399
results_dir = hlp.get_results_dir(estimation_window, frequency, population_windows)
94100
results_dir.mkdir(parents=True, exist_ok=True)
@@ -101,6 +107,7 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
101107
'the first date.')
102108
timeframe = (timeframe_start, timeframe_end)
103109

104-
main(ledgers, timeframe, estimation_window, frequency, population_windows, results_dir=results_dir)
110+
main(ledgers, timeframe, estimation_window, frequency, population_windows,
111+
force_map_flag, results_dir=results_dir)
105112

106113
logging.info('Done. Please check the output directory for results.')

tests/test_analyze.py

Lines changed: 36 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import shutil
22
import pytest
3+
import csv
34
from consensus_decentralization.helper import INTERIM_DIR, get_clustering_flag
45
from consensus_decentralization.analyze import analyze
56

@@ -59,62 +60,49 @@ def test_analyze(setup_and_cleanup):
5960
population_windows=0
6061
)
6162

62-
metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
63-
for metric in metrics:
64-
output_file = test_output_dir / 'metrics' / f'{metric}.csv'
65-
assert output_file.is_file()
63+
output_file = test_output_dir / 'metrics' / 'output_clustered.csv'
64+
assert output_file.is_file()
6665

67-
with open(output_file) as f:
68-
lines = f.readlines()
69-
assert lines[0] == 'timeframe,sample_bitcoin\n'
70-
if metric == 'gini':
71-
assert lines[1] == '2018,0.25\n'
72-
elif metric == 'nakamoto_coefficient':
73-
assert lines[1] == '2018,2\n'
74-
elif metric == 'entropy=1':
75-
assert lines[1] == '2018,1.836591668108979\n'
66+
with open(output_file) as f:
67+
reader = list(csv.reader(f))
68+
header = reader[0]
69+
# find metric column indices
70+
gini_idx = header.index('gini')
71+
nc_idx = header.index('nakamoto_coefficient')
72+
ent_idx = header.index('entropy=1')
7673

77-
analyze(
78-
projects=projects,
79-
aggregated_data_filename='month_from_2018-02-01_to_2018-03-31.csv',
80-
input_dir=test_output_dir,
81-
output_dir=test_output_dir / 'metrics',
82-
population_windows=0
83-
)
84-
85-
metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
86-
for metric in metrics:
87-
output_file = test_output_dir / 'metrics' / f'{metric}.csv'
88-
assert output_file.is_file()
89-
90-
with open(output_file) as f:
91-
lines = f.readlines()
92-
assert lines[0] == 'timeframe,sample_bitcoin\n'
93-
if metric == 'gini':
94-
assert lines[1] == 'Feb-2018,0.16666666666666666\n'
95-
assert lines[2] == 'Mar-2018,0.0\n'
96-
elif metric == 'nakamoto_coefficient':
97-
assert lines[1] == 'Feb-2018,1\n'
98-
assert lines[2] == 'Mar-2018,1\n'
99-
elif metric == 'entropy=1':
100-
assert lines[1] == 'Feb-2018,1.5\n'
101-
assert lines[2] == 'Mar-2018,0.0\n'
74+
# find the row for sample_bitcoin and 2018
75+
data_row = None
76+
for row in reader[1:]:
77+
if row[0] == 'sample_bitcoin' and row[1] == '2018':
78+
data_row = row
79+
break
80+
assert data_row is not None
81+
assert data_row[gini_idx] == '0.25'
82+
assert data_row[nc_idx] == '2'
83+
assert data_row[ent_idx] == '1.836591668108979'
10284

10385
analyze(
10486
projects=projects,
105-
aggregated_data_filename='year_from_2010-01-01_to_2010-12-31.csv',
87+
aggregated_data_filename='month_from_2018-02-01_to_2018-03-31.csv',
10688
input_dir=test_output_dir,
10789
output_dir=test_output_dir / 'metrics',
10890
population_windows=0
10991
)
11092

111-
metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
112-
for metric in metrics:
113-
output_file = test_output_dir / 'metrics' / f'{metric}.csv'
114-
assert output_file.is_file()
93+
output_file = test_output_dir / 'metrics' / 'output_clustered.csv'
94+
assert output_file.is_file()
95+
with open(output_file) as f:
96+
reader = list(csv.reader(f))
97+
header = reader[0]
98+
gini_idx = header.index('gini')
99+
nc_idx = header.index('nakamoto_coefficient')
100+
ent_idx = header.index('entropy=1')
115101

116-
with open(output_file) as f:
117-
lines = f.readlines()
118-
assert len(lines) == 2
119-
assert lines[0] == 'timeframe,sample_bitcoin\n'
120-
assert lines[1] == '2010,\n'
102+
rows_for_project = {row[1]: row for row in reader[1:] if row[0] == 'sample_bitcoin'}
103+
assert rows_for_project['Feb-2018'][gini_idx] == '0.16666666666666666'
104+
assert rows_for_project['Mar-2018'][gini_idx] == '0.0'
105+
assert rows_for_project['Feb-2018'][nc_idx] == '1'
106+
assert rows_for_project['Mar-2018'][nc_idx] == '1'
107+
assert rows_for_project['Feb-2018'][ent_idx] == '1.5'
108+
assert rows_for_project['Mar-2018'][ent_idx] == '0.0'

0 commit comments

Comments
 (0)