Improve task failure detection for ingest and retrieve

ayefimov-1 · ayefimov-1 · commit adc0242f6300 · 2026-06-03T10:37:44.000-04:00
task files
diff --git a/roles/telemetry_chargeback/tasks/ingest_loki_data.yml b/roles/telemetry_chargeback/tasks/ingest_loki_data.yml
@@ -1,23 +1,32 @@
 ---
 # Ingest data generated by gen_synth_loki_data.yml to Loki
 
-- name: "TEST Push data to Loki"
-  ansible.builtin.uri:
-    ## loki_push_url will be used in future role developement
-    url: "{{ loki_push_url }}"
-    method: POST
-    src: "{{ cloudkitty_data_file }}"
-    body_format: json
-    client_cert: "{{ cert_dir }}/tls.crt"
-    client_key: "{{ cert_dir }}/tls.key"
-    validate_certs: false
-    status_code: 204
-    return_content: true
-  register: loki_response
-  ignore_errors: true
-  failed_when: loki_response.status != 204
+- name: "Push data to Loki {{ scenario_name }}"
+  block:
+    - name: "POST data to Loki"
+      ansible.builtin.uri:
+        # loki_push_url is configured in setup_loki_env.yml
+        url: "{{ loki_push_url }}"
+        method: POST
+        src: "{{ cloudkitty_data_file }}"
+        body_format: json
+        client_cert: "{{ cert_dir }}/tls.crt"
+        client_key: "{{ cert_dir }}/tls.key"
+        ca_path: "{{ cert_dir }}/ca.crt"
+        validate_certs: false
+        status_code: 204
+        return_content: true
+      register: loki_response
 
-- name: "Ingest Status Message"
-  ansible.builtin.debug:
-    msg: "Ingestion Successful"
-  when: loki_response.status | default(0) == 204
+    - name: "Ingest Status Message"
+      ansible.builtin.debug:
+        msg: "Ingestion Successful for {{ scenario_name }}"
+
+  rescue:
+    - name: "IngestionFailed"
+      ansible.builtin.fail:
+        msg:
+          - "Scenario: {{ scenario_name }}"
+          - "Status: {{ loki_response.status | default('Unknown') }}"
+          - "Body: {{ loki_response.content | default('No Content') }}"
+          - "Message: {{ loki_response.msg | default('Request failed') }}"
diff --git a/roles/telemetry_chargeback/tasks/retrieve_loki_data.yml b/roles/telemetry_chargeback/tasks/retrieve_loki_data.yml
@@ -13,17 +13,19 @@
         client_cert: "{{ cert_dir }}/tls.crt"
         client_key: "{{ cert_dir }}/tls.key"
         ca_path: "{{ cert_dir }}/ca.crt"
+        # TODO: Enable validate_certs in production environments
         validate_certs: false
         return_content: true
         body_format: json
       register: loki_response
-      # Wait condition
+      # Wait for Loki to return all expected log entries
       until:
         - loki_response.status == 200
         - loki_response.json.status == 'success'
         - loki_response.json.data.result | length > 0
+        # Verify Loki returned all expected log entries
         - (loki_response.json.data.result | map(attribute='values') | map('length') | sum) >= (synth_data_rates.data_summary.log_count | int)
-      retries: 25
+      retries: 20
       delay: 60
 
     - name: "Save Loki Data to JSON file"
@@ -47,26 +49,23 @@
           only returned {{ actual_count }}
         success_msg: "Query returned all data entries. Input file had {{ synth_data_rates.data_summary.log_count }} entries and Loki returned {{ actual_count }}"
 
+    # Only runs if retrieval was successful
+    - name: "Generate chargeback stats from Loki-retrieved data file: {{ scenario_name }}"
+      ansible.builtin.command:
+        cmd: >
+          python3 "{{ cloudkitty_summary_script }}"
+          -j "{{ artifacts_dir_zuul }}/{{ scenario_name }}{{ cloudkitty_loki_data_suffix }}"
+          -o "{{ artifacts_dir_zuul }}/{{ scenario_name }}{{ cloudkitty_loki_totals_metrics_suffix }}"
+          --debug "{{ cloudkitty_debug }}"
+          --debug_dir "{{ cloudkitty_debug_dir }}"
+      register: synth_rating_info
+      changed_when: synth_rating_info.rc == 0
+
   rescue:
-    - name: "Debug failure"
-      ansible.builtin.debug:
+    - name: "Report Retrieval Failure"
+      ansible.builtin.fail:
         msg:
+          - "Failed to retrieve Loki data for {{ scenario_name }}"
           - "Status: {{ loki_response.status | default('Unknown') }}"
           - "Body: {{ loki_response.content | default('No Content') }}"
-          - "Msg: {{ loki_response.msg | default('Request failed') }}"
-
-    # Failure
-    - name: "Report Retrieval Failure"
-      ansible.builtin.fail:
-        msg: "Retrieval Failed"
-
-- name: "Generate chargeback stats from Loki-retrieved data file: {{ scenario_name }}"
-  ansible.builtin.command:
-    cmd: >
-      python3 "{{ cloudkitty_summary_script }}"
-      -j "{{ artifacts_dir_zuul }}/{{ scenario_name }}{{ cloudkitty_loki_data_suffix }}"
-      -o "{{ artifacts_dir_zuul }}/{{ scenario_name }}{{ cloudkitty_loki_totals_metrics_suffix }}"
-      --debug "{{ cloudkitty_debug }}"
-      --debug_dir "{{ cloudkitty_debug_dir }}"
-  register: synth_rating_info
-  changed_when: synth_rating_info.rc == 0
+          - "Message: {{ loki_response.msg | default('Request failed') }}"