Skip to content

Commit 4712eea

Browse files
Config update (#169)
Simplified config structure and updated end date value. Removed individual metric output files. Added some missing docstrings.
1 parent 605ebc9 commit 4712eea

File tree

6 files changed

+101
-158
lines changed

6 files changed

+101
-158
lines changed

config.yaml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,19 +27,19 @@ ledgers:
2727
- tezos
2828
- zcash
2929

30-
# Execution flags
31-
execution_flags:
32-
force_map: false
30+
# Flag that controls whether to force mapping or use already processed data
31+
# when available.
32+
force_map: false
3333

34-
# Analyze flags
35-
analyze_flags:
36-
clustering: true
34+
# Flag that determines whether to cluster block producers into entities that
35+
# control them.
36+
clustering: true
3737

3838
# The timeframe for which an analysis should be performed.
3939
# Each date is a string of the form YYYY-MM-DD.
4040
timeframe:
4141
start_date: 2018-01-01
42-
end_date: 2025-03-01
42+
end_date: 2026-02-01
4343

4444
# The number of days to use for the estimation window, i.e. how many days of blocks to use for each data point.
4545
# If left empty, then the entire time frame will be used (only valid when combined with empty frequency).

consensus_decentralization/analyze.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,18 +82,15 @@ def analyze(projects, aggregated_data_filename, input_dir, output_dir, populatio
8282
csv_contents[metric_name][row_index + 1].append(result)
8383
aggregate_output[project][date][metric_name] = result
8484

85-
for metric in metric_names:
86-
with open(output_dir / f'{metric}.csv', 'w') as f:
87-
csv_writer = csv.writer(f)
88-
csv_writer.writerows(csv_contents[metric])
89-
9085
aggregate_csv_output = [['ledger', 'date', 'clustering'] + metric_names]
9186
for project, timeframes in aggregate_output.items():
9287
for date, results in timeframes.items():
9388
metric_values = [results[metric] for metric in metric_names]
9489
if any(metric_values):
9590
aggregate_csv_output.append([project, date, clustering_flag] + metric_values)
96-
with open(output_dir / 'output.csv', 'w') as f:
91+
92+
output_filename = hlp.get_output_filename(clustering_flag)
93+
with open(output_dir / output_filename, 'w') as f:
9794
csv_writer = csv.writer(f)
9895
csv_writer.writerows(aggregate_csv_output)
9996

consensus_decentralization/helper.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,7 @@ def get_force_map_flag():
428428
"""
429429
config = get_config_data()
430430
try:
431-
return config['execution_flags']['force_map']
431+
return config['force_map']
432432
except KeyError:
433433
raise ValueError('Flag "force_map" missing from config file')
434434

@@ -441,7 +441,7 @@ def get_clustering_flag():
441441
"""
442442
config = get_config_data()
443443
try:
444-
return config['analyze_flags']['clustering']
444+
return config['clustering']
445445
except KeyError:
446446
raise ValueError('Flag "clustering" missing from config file')
447447

@@ -465,6 +465,15 @@ def get_mapped_data_filename(clustering_flag):
465465
return 'mapped_data_' + ('clustered' if clustering_flag else 'non_clustered') + '.json'
466466

467467

468+
def get_output_filename(clustering_flag):
469+
"""
470+
Retrieves the filename of the output file
471+
:param clustering_flag: boolean that determines whether clustering was performed
472+
:returns: str with the filename of the output file, which depends on whether clustering was performed
473+
"""
474+
return 'output_' + ('clustered' if clustering_flag else 'non_clustered') + '.csv'
475+
476+
468477
def get_input_directories():
469478
"""
470479
Reads the config file and retrieves the directories to look for raw block data

run.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ def process_data(force_map, ledger_dir, ledger, output_dir):
1919
return None
2020

2121

22-
def main(ledgers, timeframe, estimation_window, frequency, population_windows, interim_dir=hlp.INTERIM_DIR,
23-
results_dir=hlp.RESULTS_DIR):
22+
def main(ledgers, timeframe, estimation_window, frequency, population_windows,
23+
force_map, interim_dir=hlp.INTERIM_DIR, results_dir=hlp.RESULTS_DIR):
2424
"""
2525
Executes the entire pipeline (parsing, mapping, analyzing) for some projects and timeframes.
2626
:param ledgers: list of strings that correspond to the ledgers whose data should be analyzed
@@ -30,13 +30,18 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
3030
timeframe will be considered.
3131
:param frequency: int or None. The number of days to consider for the frequency of the analysis (i.e. the number
3232
of days between each data point considered in the analysis). If None, only one data point will be considered,
33-
spanning the entire timeframe (i.e. it needs to be combined with None estimation_window).
34-
:param interim_dir: pathlib.PosixPath object of the directory where the output data will be saved
33+
spanning the entire timeframe (i.e. it needs to be combined with None
34+
estimation_window).
35+
:param population_windows: int. The number of windows to look backwards and forwards to determine the population of
36+
active block producers for a given time period.
37+
:param force_map: bool. If True, then the mapping will be performed,
38+
regardless of whether mapped data for the project already exist.
39+
:param interim_dir: pathlib.PosixPath object of the directory where the
40+
output data will be saved
41+
:param results_dir: pathlib.PosixPath object of the directory where the results will be saved
3542
"""
3643
logging.info(f"The ledgers that will be analyzed are: {','.join(ledgers)}")
3744

38-
force_map = hlp.get_force_map_flag()
39-
4045
for ledger in list(ledgers):
4146
ledger_dir = interim_dir / ledger
4247
ledger_dir.mkdir(parents=True, exist_ok=True) # create ledger output directory if it doesn't already exist
@@ -89,6 +94,7 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
8994

9095
estimation_window, frequency = hlp.get_estimation_window_and_frequency()
9196
population_windows = hlp.get_population_windows()
97+
force_map_flag = hlp.get_force_map_flag()
9298

9399
results_dir = hlp.get_results_dir(estimation_window, frequency, population_windows)
94100
results_dir.mkdir(parents=True, exist_ok=True)
@@ -101,6 +107,7 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
101107
'the first date.')
102108
timeframe = (timeframe_start, timeframe_end)
103109

104-
main(ledgers, timeframe, estimation_window, frequency, population_windows, results_dir=results_dir)
110+
main(ledgers, timeframe, estimation_window, frequency, population_windows,
111+
force_map_flag, results_dir=results_dir)
105112

106113
logging.info('Done. Please check the output directory for results.')

tests/test_analyze.py

Lines changed: 36 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import shutil
22
import pytest
3+
import csv
34
from consensus_decentralization.helper import INTERIM_DIR, get_clustering_flag
45
from consensus_decentralization.analyze import analyze
56

@@ -59,62 +60,49 @@ def test_analyze(setup_and_cleanup):
5960
population_windows=0
6061
)
6162

62-
metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
63-
for metric in metrics:
64-
output_file = test_output_dir / 'metrics' / f'{metric}.csv'
65-
assert output_file.is_file()
63+
output_file = test_output_dir / 'metrics' / 'output_clustered.csv'
64+
assert output_file.is_file()
6665

67-
with open(output_file) as f:
68-
lines = f.readlines()
69-
assert lines[0] == 'timeframe,sample_bitcoin\n'
70-
if metric == 'gini':
71-
assert lines[1] == '2018,0.25\n'
72-
elif metric == 'nakamoto_coefficient':
73-
assert lines[1] == '2018,2\n'
74-
elif metric == 'entropy=1':
75-
assert lines[1] == '2018,1.836591668108979\n'
66+
with open(output_file) as f:
67+
reader = list(csv.reader(f))
68+
header = reader[0]
69+
# find metric column indices
70+
gini_idx = header.index('gini')
71+
nc_idx = header.index('nakamoto_coefficient')
72+
ent_idx = header.index('entropy=1')
7673

77-
analyze(
78-
projects=projects,
79-
aggregated_data_filename='month_from_2018-02-01_to_2018-03-31.csv',
80-
input_dir=test_output_dir,
81-
output_dir=test_output_dir / 'metrics',
82-
population_windows=0
83-
)
84-
85-
metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
86-
for metric in metrics:
87-
output_file = test_output_dir / 'metrics' / f'{metric}.csv'
88-
assert output_file.is_file()
89-
90-
with open(output_file) as f:
91-
lines = f.readlines()
92-
assert lines[0] == 'timeframe,sample_bitcoin\n'
93-
if metric == 'gini':
94-
assert lines[1] == 'Feb-2018,0.16666666666666666\n'
95-
assert lines[2] == 'Mar-2018,0.0\n'
96-
elif metric == 'nakamoto_coefficient':
97-
assert lines[1] == 'Feb-2018,1\n'
98-
assert lines[2] == 'Mar-2018,1\n'
99-
elif metric == 'entropy=1':
100-
assert lines[1] == 'Feb-2018,1.5\n'
101-
assert lines[2] == 'Mar-2018,0.0\n'
74+
# find the row for sample_bitcoin and 2018
75+
data_row = None
76+
for row in reader[1:]:
77+
if row[0] == 'sample_bitcoin' and row[1] == '2018':
78+
data_row = row
79+
break
80+
assert data_row is not None
81+
assert data_row[gini_idx] == '0.25'
82+
assert data_row[nc_idx] == '2'
83+
assert data_row[ent_idx] == '1.836591668108979'
10284

10385
analyze(
10486
projects=projects,
105-
aggregated_data_filename='year_from_2010-01-01_to_2010-12-31.csv',
87+
aggregated_data_filename='month_from_2018-02-01_to_2018-03-31.csv',
10688
input_dir=test_output_dir,
10789
output_dir=test_output_dir / 'metrics',
10890
population_windows=0
10991
)
11092

111-
metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
112-
for metric in metrics:
113-
output_file = test_output_dir / 'metrics' / f'{metric}.csv'
114-
assert output_file.is_file()
93+
output_file = test_output_dir / 'metrics' / 'output_clustered.csv'
94+
assert output_file.is_file()
95+
with open(output_file) as f:
96+
reader = list(csv.reader(f))
97+
header = reader[0]
98+
gini_idx = header.index('gini')
99+
nc_idx = header.index('nakamoto_coefficient')
100+
ent_idx = header.index('entropy=1')
115101

116-
with open(output_file) as f:
117-
lines = f.readlines()
118-
assert len(lines) == 2
119-
assert lines[0] == 'timeframe,sample_bitcoin\n'
120-
assert lines[1] == '2010,\n'
102+
rows_for_project = {row[1]: row for row in reader[1:] if row[0] == 'sample_bitcoin'}
103+
assert rows_for_project['Feb-2018'][gini_idx] == '0.16666666666666666'
104+
assert rows_for_project['Mar-2018'][gini_idx] == '0.0'
105+
assert rows_for_project['Feb-2018'][nc_idx] == '1'
106+
assert rows_for_project['Mar-2018'][nc_idx] == '1'
107+
assert rows_for_project['Feb-2018'][ent_idx] == '1.5'
108+
assert rows_for_project['Mar-2018'][ent_idx] == '0.0'

0 commit comments

Comments
 (0)