Skip to content

Commit c3bc33d

Browse files
authored
Merge branch 'master' into TutorTask204_Spring2025_RealTime_Bitcoin_Sentiment_Analysis_spaCy_Selenium_v2
2 parents c7c94c4 + 6e8f708 commit c3bc33d

230 files changed

Lines changed: 46214 additions & 19853 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.codecov.yml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Codecov configuration: one coverage flag per test suite, PR comment layout,
# and the project/patch status checks.
flag_management:
  individual_flags:
    - name: fast
      carryforward: true
    - name: slow
      carryforward: true
    - name: superslow
      carryforward: true

comment:
  layout: 'reach, diff, files'
  behavior: default
  require_changes: false
  show_critical_paths: false

github_checks:
  annotations: false

coverage:
  status:
    project:
      default:
        # Use the coverage of the PR's base commit as the target.
        target: auto
        # Tolerate a drop in overall coverage of up to 1% before the check
        # fails.
        threshold: 1%
        # Only these flags contribute to the project-level check.
        flags: [fast, slow, superslow]
        # Run this check only when the target branch is master.
        branches: [master]
    # Enable the patch-level coverage check.
    patch: true
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# Runs the fast and slow test suites inside the dev Docker image, generates a
# coverage report for each, and uploads the reports to Codecov under separate
# flags. Every run/upload step uses `continue-on-error`, so one failing suite
# does not prevent the other from running.
name: Test coverage
on:
  # Run manually.
  # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#workflow_dispatch
  workflow_dispatch: {}
  # Every day at 00:00 UTC.
  # schedule:
  #   - cron: '0 0 * * *'

env:
  CSFY_CI: true

# Set up permissions for OIDC authentication.
permissions:
  # This is required for requesting the OIDC JWT.
  id-token: write
  # This is required for actions/checkout.
  contents: read
  # This is required for pulling the Docker image from GHCR.
  packages: read

jobs:
  run_test_coverage:
    runs-on: ubuntu-latest

    steps:
      # Configure AWS authentication for this workflow.
      # This step assumes an AWS IAM role to grant GH Action temporary
      # credentials necessary to access AWS resources.
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          role-to-assume: ${{ vars.GH_ACTION_AWS_ROLE_ARN }}
          role-session-name: ${{ vars.GH_ACTION_AWS_SESSION_NAME }}
          aws-region: ${{ vars.CSFY_AWS_DEFAULT_REGION }}

      # This is needed to pull the Docker image.
      # The token is handed over through the environment and piped into
      # `--password-stdin` so that it never appears on the command line, where
      # `-p` would expose it in the process list and trigger Docker's
      # insecure-password warning.
      - name: Login to GHCR
        env:
          GHCR_TOKEN: ${{ secrets.GH_ACTION_ACCESS_TOKEN }}
        run: echo "${GHCR_TOKEN}" | docker login ghcr.io -u gpsaggese --password-stdin

      # Make everything accessible by any user to avoid permission errors.
      - name: Cleanup
        run: sudo chmod 777 -R .

      # Check out the code from GitHub so that we can run the action inside
      # the Docker container.
      - name: Checkout code
        uses: actions/checkout@v3
        with:
          submodules: true
          # TODO(Samarth): Do we need to propagate this to other `repos/workflow`
          # make it a default behavior? For certain tests to pass, we need entire
          # commit history of the repo including sub-modules.
          fetch-depth: 0
          token: ${{ secrets.GH_ACTION_ACCESS_TOKEN }}

      # To see the modules in `helpers_root`, PYTHONPATH needs to include
      # `helpers_root` in the same way we do in `setenv.sh`.
      - name: Update PYTHONPATH
        run: echo "PYTHONPATH=.:helpers_root" >> "$GITHUB_ENV"

      # Install packages that are required to run the job via GH.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r .github/gh_requirements.txt

      # Pull the latest Docker image from the GHCR registry instead of ECR for
      # cost saving purposes to run the regressions on.
      - name: Pull image from GHCR
        run: docker pull ghcr.io/${{ github.repository }}:dev

      # This step is used to trigger the fast test coverage generation using
      # the invoke task.
      - name: Run Fast test and generate report
        id: run_fast
        continue-on-error: true
        env:
          GH_ACTION_ACCESS_TOKEN: ${{ secrets.GH_ACTION_ACCESS_TOKEN }}
          CSFY_AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID }}
          CSFY_AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY }}
          CSFY_AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }}
          CSFY_AWS_DEFAULT_REGION: ${{ env.AWS_DEFAULT_REGION }}
          CSFY_ECR_BASE_PATH: ghcr.io/${{ github.repository_owner }}
          CSFY_AWS_S3_BUCKET: ${{ vars.CSFY_AWS_S3_BUCKET }}
        run: invoke run_coverage --suite fast

      - name: Upload Fast Test Coverage to Codecov
        id: upload_fast
        # Only upload if the previous fast test run step succeeded (i.e., a
        # report was generated). A failed step doesn't generate a coverage
        # report, so there's nothing to upload.
        if: steps.run_fast.outcome == 'success'
        continue-on-error: true
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: ./coverage.xml
          # Specify the Codecov flag name associated with this test suite.
          # Required to separate coverage reports by type (e.g., fast, slow,
          # superslow) inside the Codecov UI.
          flags: fast
          name: fast-test-coverage

      - name: Run Slow test and generate report
        id: run_slow
        continue-on-error: true
        env:
          GH_ACTION_ACCESS_TOKEN: ${{ secrets.GH_ACTION_ACCESS_TOKEN }}
          CSFY_AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID }}
          CSFY_AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY }}
          CSFY_AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }}
          CSFY_AWS_DEFAULT_REGION: ${{ env.AWS_DEFAULT_REGION }}
          CSFY_ECR_BASE_PATH: ghcr.io/${{ github.repository_owner }}
          CSFY_AWS_S3_BUCKET: ${{ vars.CSFY_AWS_S3_BUCKET }}
        run: invoke run_coverage --suite slow

      - name: Upload Slow Test Coverage to Codecov
        id: upload_slow
        # Only upload if the previous slow test run step succeeded (i.e., a
        # report was generated).
        if: steps.run_slow.outcome == 'success'
        continue-on-error: true
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: ./coverage.xml
          flags: slow
          name: slow-test-coverage

      # - name: Run Superslow test and generate report
      #   id: run_superslow
      #   env:
      #     GH_ACTION_ACCESS_TOKEN: ${{ secrets.GH_ACTION_ACCESS_TOKEN }}
      #     CSFY_AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID }}
      #     CSFY_AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY }}
      #     CSFY_AWS_SESSION_TOKEN: ${{ env.AWS_SESSION_TOKEN }}
      #     CSFY_AWS_DEFAULT_REGION: ${{ env.AWS_DEFAULT_REGION }}
      #     CSFY_ECR_BASE_PATH: ghcr.io/${{ github.repository_owner }}
      #     CSFY_AWS_S3_BUCKET: ${{ vars.CSFY_AWS_S3_BUCKET }}
      #   run: |
      #     # Determine the day of the week (1 = Monday, 7 = Sunday).
      #     day_of_week=$(date +%u)
      #     # Only run superslow tests on Mondays or if the workflow is manually triggered.
      #     if [ "$day_of_week" = "1" ] || [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
      #       echo "Running superslow tests..."
      #       invoke run_coverage --suite superslow
      #     else
      #       echo "Skipping superslow tests — today is not Monday and this is not a manual trigger"
      #       exit 0
      #     fi

      # - name: Upload Superslow Test Coverage to Codecov
      #   # TODO(Shaunak): Consider removing it when we turn this workflow into a reusable one.
      #   if: steps.run_superslow.outcome == 'success'
      #   uses: codecov/codecov-action@v5
      #   with:
      #     token: ${{ secrets.CODECOV_TOKEN }}
      #     files: ./coverage.xml
      #     flags: superslow
      #     name: superslow-test-coverage

      # # Fail the job in CI if any of the fast/slow run/upload steps above failed.
      # - name: Fail if fast/slow test or upload failed
      #   run: |
      #     failed=""
      #     if [ "${{ steps.run_fast.outcome }}" != "success" ]; then
      #       echo "Fast test run failed"
      #       failed="true"
      #     fi
      #     if [ "${{ steps.upload_fast.outcome }}" != "success" ]; then
      #       echo "Fast test coverage upload failed"
      #       failed="true"
      #     fi
      #     if [ "${{ steps.run_slow.outcome }}" != "success" ]; then
      #       echo "Slow test run failed"
      #       failed="true"
      #     fi
      #     if [ "${{ steps.upload_slow.outcome }}" != "success" ]; then
      #       echo "Slow test coverage upload failed"
      #       failed="true"
      #     fi
      #     if [ "$failed" = "true" ]; then
      #       echo "At least one fast/slow test or upload step failed."
      #       exit 1
      #     fi

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
[submodule "helpers_root"]
22
path = helpers_root
3-
url = git@github.com:kaizen-ai/helpers.git
3+
url = git@github.com:kaizen-ai/helpers.git

DATA605/Spring2025/.DS_Store

-6 KB
Binary file not shown.

DATA605/Spring2025/check_pr.sh

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#!/usr/bin/env bash
#
# check_pr.sh
#   1) Moves to the Git repository root
#   2) Checks for any binary files staged for commit
#   3) Lists the top N largest added/modified/copied files between upstream
#      and your branch
#
# Usage:
#   check_pr.sh [-u|--upstream REF] [-b|--branch REF] [-n|--top N]
#
# Defaults: upstream is `origin/master`, branch is the currently checked-out
# branch, and N is 10. File sizes are measured with `du` on the working tree,
# so paths that do not exist in the current checkout are skipped.

# ─── 1. Jump to repo root ──────────────────────────────────────────
cd "$(git rev-parse --show-toplevel)" || exit 1

# ─── 2. Parse arguments ───────────────────────────────────────────
UPSTREAM="origin/master"
BRANCH=""
TOP=10

# Abort when an option is passed without its value.
#   $1: the option name, $2: the number of arguments still to be parsed.
# Without this guard `shift 2` fails, leaves the arguments untouched, and the
# parsing loop below never terminates.
require_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "Missing value for argument: $1" >&2
    exit 1
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    -u|--upstream)
      require_value "$1" "$#"
      UPSTREAM="$2"; shift 2;;
    -b|--branch)
      require_value "$1" "$#"
      BRANCH="$2"; shift 2;;
    -n|--top)
      require_value "$1" "$#"
      TOP="$2"; shift 2;;
    *)
      echo "Unknown argument: $1" >&2; exit 1;;
  esac
done

# `head -n` below needs a non-negative integer.
if ! [[ "$TOP" =~ ^[0-9]+$ ]]; then
  echo "Invalid value for --top: '${TOP}' (expected a non-negative integer)" >&2
  exit 1
fi

# Default to current branch if none provided
if [[ -z "$BRANCH" ]]; then
  BRANCH="$(git rev-parse --abbrev-ref HEAD)"
fi

echo "Inspecting changes on '${BRANCH}' vs '${UPSTREAM}'"

# ─── 3. Check for staged binary files ────────────────────────────
# `git diff --numstat` reports binary files as "-<TAB>-<TAB><path>"; match the
# literal tabs instead of relying on the GNU-only `\s` escape.
if git diff --cached --numstat | grep -q $'^-\t-\t'; then
  echo "⛔ Binary files detected in staging. Please remove them before committing."
  exit 1
else
  echo "✅ No binary files in staging."
fi

# ─── 4. List top N largest files in the diff ─────────────────────
echo -e "\n🔍 Top ${TOP} largest files in ${UPSTREAM}...${BRANCH}:"
# Paths are read NUL-delimited so that names containing spaces stay intact, and
# `du` is invoked once per existing path so that an empty diff prints nothing
# (GNU `xargs` would otherwise run `du` with no operands and list the whole
# working tree). `du -k` separates size and path with a tab, hence `-F'\t'`.
git diff -z --diff-filter=ACM --name-only "${UPSTREAM}...${BRANCH}" \
  | while IFS= read -r -d '' changed_path; do
      if [[ -e "$changed_path" ]]; then
        du -k -- "$changed_path" 2>/dev/null
      fi
    done \
  | sort -rn \
  | head -n "${TOP}" \
  | awk -F'\t' '{ printf("%.2fM\t%s\n", $1/1024, $2) }'

0 commit comments

Comments
 (0)