Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions .github/workflows/custom_docker_builds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,12 @@ jobs:
image-tags: ghcr.io/spack/notary:latest
- docker-image: ./images/python-aws-bash
image-tags: ghcr.io/spack/python-aws-bash:0.0.2
- docker-image: ./images/gitlab-error-processor
image-tags: ghcr.io/spack/gitlab-error-processor:0.0.6
- docker-image: ./images/upload-gitlab-failure-logs
image-tags: ghcr.io/spack/upload-gitlab-failure-logs:0.0.6
- docker-image: ./images/snapshot-release-tags
image-tags: ghcr.io/spack/snapshot-release-tags:0.0.4
- docker-image: ./images/cache-indexer
image-tags: ghcr.io/spack/cache-indexer:0.0.3
- docker-image: ./analytics
image-tags: ghcr.io/spack/django:0.1.3
image-tags: ghcr.io/spack/django:0.1.4
steps:
- name: Checkout
uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
Expand Down
Original file line number Diff line number Diff line change
@@ -1,33 +1,19 @@
import json
import os
import re
from datetime import datetime
import json
from pathlib import Path
from time import sleep
import re
from typing import Any

from celery import shared_task
from django.conf import settings
from django.db import connections
import gitlab
import psycopg2
import sentry_sdk
import opensearch_dsl
import yaml
from kubernetes import client, config
from kubernetes.client.exceptions import ApiException
from kubernetes.client.models.v1_pod import V1Pod
from kubernetes.client.models.v1_pod_status import V1PodStatus
from opensearch_dsl import Date, Document, connections


sentry_sdk.init(
# Sample only 1% of jobs
traces_sample_rate=0.01,
)

config.load_config()
v1_client = client.CoreV1Api()


class JobPayload(Document):
timestamp = Date()
class JobPayload(opensearch_dsl.Document):
timestamp = opensearch_dsl.Date()

class Index:
name = "gitlab-job-failures-*"
Expand All @@ -42,34 +28,9 @@ def save(self, **kwargs):
return super().save(**kwargs)


GITLAB_TOKEN = os.environ["GITLAB_TOKEN"]
GITLAB_POSTGRES_DB = os.environ["GITLAB_POSTGRES_DB"]
GITLAB_POSTGRES_USER = os.environ["GITLAB_POSTGRES_RO_USER"]
GITLAB_POSTGRES_PASSWORD = os.environ["GITLAB_POSTGRES_RO_PASSWORD"]
GITLAB_POSTGRES_HOST = os.environ["GITLAB_POSTGRES_HOST"]

OPENSEARCH_ENDPOINT = os.environ["OPENSEARCH_ENDPOINT"]
OPENSEARCH_USERNAME = os.environ["OPENSEARCH_USERNAME"]
OPENSEARCH_PASSWORD = os.environ["OPENSEARCH_PASSWORD"]


# Instantiate gitlab api wrapper
gl = gitlab.Gitlab("https://gitlab.spack.io", GITLAB_TOKEN, retry_transient_errors=True)

# Instantiate postgres connection
pg_conn = psycopg2.connect(
host=GITLAB_POSTGRES_HOST,
port="5432",
dbname=GITLAB_POSTGRES_DB,
user=GITLAB_POSTGRES_USER,
password=GITLAB_POSTGRES_PASSWORD,
)


def job_retry_data(job_id: str | int, job_name: str) -> tuple[int, bool]:
with pg_conn:
cur = pg_conn.cursor()
cur.execute(
def _job_retry_data(job_id: str | int, job_name: str) -> tuple[int, bool]:
with connections["gitlab"].cursor() as cursor:
cursor.execute(
"""
SELECT attempt_number, COALESCE(retried, FALSE) as retried FROM (
SELECT ROW_NUMBER() OVER (ORDER BY id) as attempt_number, retried, id
Expand All @@ -86,13 +47,13 @@ def job_retry_data(job_id: str | int, job_name: str) -> tuple[int, bool]:
""",
{"job_id": job_id, "job_name": job_name},
)
result = cur.fetchone()
cur.close()
result = cursor.fetchone()
cursor.close()

return result


def assign_error_taxonomy(job_input_data: dict[str, Any], job_trace: str):
def _assign_error_taxonomy(job_input_data: dict[str, Any], job_trace: str):
# Read taxonomy file
with open(Path(__file__).parent / "taxonomy.yaml") as f:
taxonomy = yaml.safe_load(f)["taxonomy"]
Expand Down Expand Up @@ -130,7 +91,7 @@ def assign_error_taxonomy(job_input_data: dict[str, Any], job_trace: str):
return


def collect_pod_status(job_input_data: dict[str, Any], job_trace: str):
def _collect_pod_status(job_input_data: dict[str, Any], job_trace: str):
"""Collect k8s info about this job and store it in the OpenSearch record"""
# Record whether this job was run on a kubernetes pod or via some other
# means (a UO runner, for example)
Expand All @@ -156,37 +117,18 @@ def collect_pod_status(job_input_data: dict[str, Any], job_trace: str):
job_input_data["pod_status"] = None
return

pod_name = runner_name_matches[0]

pod: V1Pod | None = None
while True:
# Try to fetch pod with kube
try:
pod = v1_client.read_namespaced_pod(name=pod_name, namespace="pipeline")
except ApiException:
# If it doesn't work, that means the pod has already been cleaned up.
# In that case, we break out of the loop and return.
break

# Check if the pod is still running. If so, keep re-fetching it until it's complete
status: V1PodStatus = pod.status
if status.phase != "Running":
break

sleep(1)

if pod:
job_input_data["pod_status"] = pod.status.to_dict()
Comment thread
danlamanna marked this conversation as resolved.

@shared_task(name="upload_job_failure_classification", soft_time_limit=60)
def upload_job_failure_classification(job_input_data_json: str) -> None:
gl = gitlab.Gitlab(settings.GITLAB_ENDPOINT, settings.GITLAB_TOKEN, retry_transient_errors=True)

def main():
# Read input data and extract params
job_input_data = json.loads(os.environ["JOB_INPUT_DATA"])
job_input_data = json.loads(job_input_data_json)
job_id = job_input_data["build_id"]
job_name = job_input_data["build_name"]

# Annotate if job has been retried
attempt_number, retried = job_retry_data(job_id=job_id, job_name=job_name)
attempt_number, retried = _job_retry_data(job_id=job_id, job_name=job_name)
job_input_data["attempt_number"] = attempt_number
job_input_data["retried"] = retried

Expand All @@ -204,22 +146,17 @@ def main():
job_trace: str = job.trace().decode() # type: ignore

# Get info about the k8s pod this job ran on
collect_pod_status(job_input_data, job_trace)
_collect_pod_status(job_input_data, job_trace)

# Assign any/all relevant errors
assign_error_taxonomy(job_input_data, job_trace)
_assign_error_taxonomy(job_input_data, job_trace)

# Upload to OpenSearch
connections.create_connection(
hosts=[OPENSEARCH_ENDPOINT],
opensearch_dsl.connections.create_connection(
hosts=[settings.OPENSEARCH_ENDPOINT],
http_auth=(
OPENSEARCH_USERNAME,
OPENSEARCH_PASSWORD,
settings.OPENSEARCH_USERNAME,
settings.OPENSEARCH_PASSWORD,
),
)
doc = JobPayload(**job_input_data)
doc.save()


if __name__ == "__main__":
main()
4 changes: 4 additions & 0 deletions analytics/analytics/core/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from django.views.decorators.http import require_http_methods
import sentry_sdk

from analytics.core.job_failure_classifier import upload_job_failure_classification
from analytics.core.job_log_uploader import upload_job_log
from analytics.job_processor import process_job

Expand All @@ -25,6 +26,9 @@ def webhook_handler(request: HttpRequest) -> HttpResponse:
if job_input_data["build_status"] in ["success", "failed"]:
upload_job_log.delay(request.body)

if job_input_data["build_status"] == "failed":
upload_job_failure_classification.delay(request.body)

if (
re.match(BUILD_STAGE_REGEX, job_input_data["build_stage"])
and job_input_data["build_status"] == "success"
Expand Down
12 changes: 12 additions & 0 deletions analytics/analytics/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,18 @@
"django.middleware.clickjacking.XFrameOptionsMiddleware",
]

# Databases
# Register GitLab's Postgres database under the additional "gitlab"
# connection alias by merging it into the existing DATABASES mapping
# (DATABASES itself is defined outside this excerpt).
# Every field is read with os.environ[...], so a missing GITLAB_DB_*
# variable raises KeyError as soon as this settings module is imported.
DATABASES |= {
"gitlab": {
"ENGINE": "django.db.backends.postgresql",
"USER": os.environ["GITLAB_DB_USER"],
"HOST": os.environ["GITLAB_DB_HOST"],
"NAME": os.environ["GITLAB_DB_NAME"],
"PASSWORD": os.environ["GITLAB_DB_PASS"],
"PORT": os.environ["GITLAB_DB_PORT"],
},
}

# django-extensions
RUNSERVER_PLUS_PRINT_SQL_TRUNCATE = None
SHELL_PLUS_PRINT_SQL = True
Expand Down
5 changes: 5 additions & 0 deletions analytics/dev/.env.docker-compose
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,9 @@ OPENSEARCH_PASSWORD=elastic
SECRET_KEY=deadbeef
GITLAB_ENDPOINT="http://fakeurl"
GITLAB_TOKEN="bar"
GITLAB_DB_USER=gitlab
GITLAB_DB_HOST=gitlab-db
GITLAB_DB_PORT=5432
GITLAB_DB_NAME=gitlabhq_production
GITLAB_DB_PASS=gitlab
PROMETHEUS_URL=http://prometheus:9090
5 changes: 5 additions & 0 deletions analytics/dev/.env.docker-compose-native
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,9 @@ OPENSEARCH_USERNAME=elastic
OPENSEARCH_PASSWORD=elastic
GITLAB_ENDPOINT=http://fakegitlab
GITLAB_TOKEN=glpat-fakegitlab
GITLAB_DB_USER=gitlab
GITLAB_DB_HOST=localhost
GITLAB_DB_PORT=5433
GITLAB_DB_NAME=gitlabhq_production
GITLAB_DB_PASS=gitlab
PROMETHEUS_URL=http://localhost:9090
13 changes: 12 additions & 1 deletion analytics/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,17 @@ services:
ports:
- ${DOCKER_PROMETHEUS_PORT-9090}:9090

gitlab-db:
image: postgres:latest
ports:
- "5433:5432"
volumes:
- gitlab-db:/var/lib/postgresql/data
environment:
POSTGRES_DB: gitlabhq_production
POSTGRES_USER: gitlab
POSTGRES_PASSWORD: gitlab

volumes:
postgres:

gitlab-db:
13 changes: 0 additions & 13 deletions images/gitlab-error-processor/Dockerfile

This file was deleted.

61 changes: 0 additions & 61 deletions images/gitlab-error-processor/app.py

This file was deleted.

Loading