# VFBquery/.github/workflows/performance-test.yml at main · VirtualFlyBrain/VFBquery · GitHub

name: Performance Test

# Triggers: pushes and pull requests that target main or dev, a manual
# start from the Actions UI, and a scheduled nightly run.
on:
  push:
    branches:
      - main
      - dev
  pull_request:
    branches:
      - main
      - dev
  # Enables manual triggering.
  workflow_dispatch:
  schedule:
    # Runs daily at 2 AM UTC.
    - cron: '0 2 * * *'

jobs:
  performance:
    name: 'Performance Test'
    runs-on: ubuntu-latest
    # Upper bound for the whole job so a genuinely hung run is still killed.
    # Raised from 60: the performance, legacy-performance and connectivity
    # suites run back-to-back against live VFB infrastructure (Neo4j / SOLR /
    # Owlery), each with one automatic retry on failure, so a cold or loaded
    # run can legitimately take longer than an hour.
    timeout-minutes: 120
    defaults:
      run:
        # Every `run:` script executes under bash with errexit (-e) and
        # pipefail, so a `<test runner> ... | tee <log>` pipeline reports the
        # test runner's exit status instead of tee's (which is always 0).
        shell: 'bash -o pipefail -e {0}'

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        # v5 runs on the Node 20 action runtime; v4 targeted the deprecated
        # Node 16 runtime. The inputs used here are unchanged between the two.
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the string "3.10" rather than the float 3.1.
          python-version: '3.10'

      - name: Install dependencies
        run: |
          # Bring pip itself up to date before resolving anything else.
          python -m pip install --upgrade pip
          python -m pip install --upgrade -r requirements.txt
          # Editable install ensures we test the actual source code in the
          # checkout rather than a copied package.
          python -m pip install -e .
          # pytest-xdist provides the `-n <workers>` parallel option used by
          # the pytest-based test steps below. Installed here rather than in
          # requirements.txt because it is only needed at test time.
          python -m pip install pytest-xdist

      - name: Test Owlery Connectivity
        run: |
          # Diagnostic probes only: every curl call is guarded, so an
          # unreachable Owlery server is reported in the log but never fails
          # this step.
          echo "Testing basic connectivity to Owlery server..."
          # The response body is saved to a file instead of piping curl into
          # `head`. Under the job's `pipefail` shell, `head` closing the pipe
          # after 20 lines makes curl exit non-zero, which printed the failure
          # message below even when the query had succeeded.
          OWLERY_BODY="${RUNNER_TEMP:-/tmp}/owlery_probe.txt"
          rm -f "$OWLERY_BODY"
          PROBE_OK=true
          curl -v --max-time 30 -o "$OWLERY_BODY" "http://owl.virtualflybrain.org/kbs/vfb/subclasses?object=%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FFBbt_00005106%3E&direct=false&includeDeprecated=false&includeEquivalent=true" || PROBE_OK=false
          # Show the first lines of whatever was received (possibly partial).
          # The file does not exist when curl could not connect at all.
          if [[ -f "$OWLERY_BODY" ]]; then
            head -20 "$OWLERY_BODY"
          fi
          if [[ "$PROBE_OK" != true ]]; then
            echo "Simple query failed or timed out"
          fi
          echo ""
          echo "Testing if server responds at all..."
          curl -v --max-time 10 "http://owl.virtualflybrain.org/kbs/vfb" || echo "Server unreachable"

      - name: Run Performance Test
        # Runs under `pytest -n auto` (switched from the single-threaded
        # `python -m unittest`) so the suite is spread across the runner's
        # vCPUs; pytest collects unittest.TestCase subclasses as-is, so the
        # tests needed no refactoring.
        # One automatic retry on failure: a cold first attempt hits Neo4j and
        # SOLR with empty result caches and can breach thresholds purely on
        # first-call latency, whereas the retry runs warm. If the retry fails
        # too, the step fails.
        # IMPORTANT: the retry truncates performance_test_output.log before
        # writing, so the later "Fail job on test failures" step grades the
        # second attempt's output only.
        env:
          # 'true' on pull_request events so a PR check never writes to or
          # purges the shared production cache; 'false' for every other event
          # (pushes, scheduled and manual runs) so those refresh and warm it,
          # e.g. after a minor/major release. See solr_caching_readonly().
          VFBQUERY_CACHE_READONLY: ${{ github.event_name == 'pull_request' && 'true' || 'false' }}
        run: |
          # errexit off: exit codes are collected by hand via PIPESTATUS so a
          # failed first attempt can be retried instead of aborting the script.
          set +e
          echo "=== Performance test attempt 1/2 (parallel) ==="
          pytest -v -s -n auto src/test/test_query_performance.py 2>&1 | tee performance_test_output.log
          attempt1_rc=${PIPESTATUS[0]}
          if [[ $attempt1_rc -ne 0 ]]; then
            echo ""
            echo "=== Attempt 1 failed (exit $attempt1_rc). Retrying once with warm cache. ==="
            # `>` truncates the log; the retry's output is then appended to it.
            echo "=== RETRY ATTEMPT (auto-retry on first failure) ===" > performance_test_output.log
            pytest -v -s -n auto src/test/test_query_performance.py 2>&1 | tee -a performance_test_output.log
            exit ${PIPESTATUS[0]}
          fi
          exit 0

      - name: Run Legacy Performance Test
        # `always()` keeps this step running after a failed earlier step, so
        # the report still gets its data and further regressions are not
        # masked.
        if: always()
        env:
          VFBQUERY_CACHE_ENABLED: 'true'
          # 'true' on pull_request events (never write/purge the shared prod
          # cache); 'false' for every other event so those runs refresh and
          # warm it.
          VFBQUERY_CACHE_READONLY: ${{ github.event_name == 'pull_request' && 'true' || 'false' }}
          MPLBACKEND: 'Agg'
          VISPY_GL_LIB: 'osmesa'
          VISPY_USE_EGL: '0'
        run: |
          # errexit off so the first attempt's exit code can be inspected.
          set +e
          # Each attempt overwrites the step-local legacy_attempt.log (plain
          # `tee`, no -a). Only the final attempt's log is appended to the
          # canonical performance_test_output.log, so that file never carries
          # output from an attempt that was later retried.
          echo "=== Legacy performance test attempt 1/2 ==="
          python -m unittest -v src.test.term_info_queries_test.TermInfoQueriesTest.test_term_info_performance 2>&1 | tee legacy_attempt.log
          first_rc=${PIPESTATUS[0]}
          if [[ $first_rc -ne 0 ]]; then
            echo ""
            echo "=== Legacy attempt 1 failed (exit $first_rc). Retrying once with warm cache. ==="
            python -m unittest -v src.test.term_info_queries_test.TermInfoQueriesTest.test_term_info_performance 2>&1 | tee legacy_attempt.log
            retry_rc=${PIPESTATUS[0]}
            echo "=== LEGACY RETRY ATTEMPT ===" >> performance_test_output.log
            cat legacy_attempt.log >> performance_test_output.log
            exit $retry_rc
          fi
          cat legacy_attempt.log >> performance_test_output.log
          exit 0

      - name: Run Connectivity Tests
        if: always()
        env:
          # Result cache switched off for this step so the connectivity
          # integration tests validate the LIVE query against the database,
          # rather than reading (possibly stale) entries from the shared
          # production cache or writing this run's results back into it.
          # See solr_caching_disabled().
          VFBQUERY_CACHE_ENABLED: 'false'
          MPLBACKEND: 'Agg'
          VISPY_GL_LIB: 'osmesa'
          VISPY_USE_EGL: '0'
        run: |
          # These files are pytest-style (plain classes +
          # @pytest.mark.integration), so they are run with pytest to have the
          # markers honoured and collection work.
          # `-n 8` starts eight pytest-xdist workers. The tests are expected
          # not to share fixtures or in-process state; verify that before
          # adding tests that do.
          # One automatic retry on failure, as in the Run Performance Test
          # step. Each attempt overwrites the step-local
          # connectivity_attempt.log, and only the final (possibly retried)
          # attempt is appended to performance_test_output.log, so the
          # failure-detection step at the end of the workflow grades the last
          # attempt only.
          set +e
          # Single definition of the suite list, shared by both attempts.
          CONNECTIVITY_SUITES=(
            src/test/test_neuron_neuron_connectivity.py
            src/test/test_neuron_region_connectivity.py
            src/test/test_upstream_class_connectivity.py
            src/test/test_downstream_class_connectivity.py
            src/test/test_vfb_connectivity.py
          )
          echo "=== Connectivity test attempt 1/2 (parallel) ==="
          pytest -v -s -n 8 "${CONNECTIVITY_SUITES[@]}" 2>&1 | tee connectivity_attempt.log
          first_rc=${PIPESTATUS[0]}
          if [[ $first_rc -ne 0 ]]; then
            echo ""
            echo "=== Connectivity attempt 1 failed (exit $first_rc). Retrying once with warm cache. ==="
            pytest -v -s -n 8 "${CONNECTIVITY_SUITES[@]}" 2>&1 | tee connectivity_attempt.log
            retry_rc=${PIPESTATUS[0]}
            echo "=== CONNECTIVITY RETRY ATTEMPT ===" >> performance_test_output.log
            cat connectivity_attempt.log >> performance_test_output.log
            exit $retry_rc
          fi
          cat connectivity_attempt.log >> performance_test_output.log
          exit 0

      - name: Create Performance Report
        if: always()  # Always run this step, even if the test fails
        run: |
          # Create performance.md file, starting with its static header and the
          # raw test log. The heredoc delimiter is unquoted, so the shell
          # expands the $(...) substitutions inside it (the current date and
          # the contents of performance_test_output.log) and turns each \`
          # into a literal backtick. The github.* expressions are substituted
          # by GitHub Actions before the script runs. Lines starting with `#`
          # inside the heredoc are Markdown headings, not shell comments.
          cat > performance.md << EOF
          # VFBquery Performance Test Results

          **Test Date:** $(date -u '+%Y-%m-%d %H:%M:%S UTC')
          **Git Commit:** ${{ github.sha }}
          **Branch:** ${{ github.ref_name }}
          **Workflow Run:** [${{ github.run_id }}](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})

          ## Test Overview

          This performance test measures the execution time of all implemented VFB queries organized by functionality:

          ### 1. Term Information Queries

          - **Term Info**: Comprehensive term information retrieval with preview data

          ### 2. Neuron Part & Synaptic Queries

          - **NeuronsPartHere**: Neurons with parts overlapping anatomical regions
          - **NeuronsSynaptic**: Neurons with synapses in a region
          - **NeuronsPresynapticHere**: Neurons with presynaptic terminals in a region
          - **NeuronsPostsynapticHere**: Neurons with postsynaptic terminals in a region

          ### 3. Anatomical Hierarchy Queries

          - **ComponentsOf**: Anatomical components of a structure
          - **PartsOf**: Parts of an anatomical structure
          - **SubclassesOf**: Subclasses of anatomical terms (can be very slow for complex terms)

          ### 4. Tract/Nerve & Lineage Queries

          - **NeuronClassesFasciculatingHere**: Neurons fasciculating with tracts
          - **TractsNervesInnervatingHere**: Tracts/nerves innervating neuropils
          - **LineageClonesIn**: Lineage clones in neuropils (complex OWL reasoning)

          ### 5. Image & Developmental Queries

          - **ImagesNeurons**: Neuron images in anatomical regions
          - **ImagesThatDevelopFrom**: Developmental lineage images
          - **epFrag**: Expression pattern fragments
          - **ListAllAvailableImages**: All available images for a term

          ### 6. Connectivity Queries

          - **NeuronNeuronConnectivity**: Neuron-to-neuron connectivity
          - **NeuronRegionConnectivity**: Neuron-to-region connectivity
          - **NeuronInputsTo**: Individual neuron inputs

          ### 7. Similarity Queries (NBLAST & NeuronBridge)

          - **SimilarMorphologyTo**: NBLAST morphological similarity
          - **SimilarMorphologyToPartOf**: NBLAST to expression patterns (NBLASTexp)
          - **SimilarMorphologyToPartOfexp**: Reverse NBLASTexp
          - **SimilarMorphologyToNB**: NeuronBridge matches
          - **SimilarMorphologyToNBexp**: NeuronBridge for expression patterns

          ### 8. Expression & Transcriptomics Queries

          - **ExpressionOverlapsHere**: Expression patterns overlapping regions
          - **anatScRNAseqQuery**: scRNAseq clusters in anatomy
          - **clusterExpression**: Genes expressed in clusters
          - **expressionCluster**: Clusters expressing genes
          - **scRNAdatasetData**: Cluster data from scRNAseq datasets

          ### 9. Dataset & Template Queries

          - **PaintedDomains**: Template painted anatomy domains
          - **DatasetImages**: Images in datasets
          - **AllAlignedImages**: Images aligned to templates
          - **AlignedDatasets**: Datasets aligned to templates
          - **AllDatasets**: All available datasets

          ### 10. Publication & Transgene Queries

          - **TermsForPub**: Terms referencing publications
          - **TransgeneExpressionHere**: Transgene expression patterns in regions

          ## Performance Thresholds

          - **Fast queries**: < 1 second (SOLR lookups)
          - **Medium queries**: < 3 seconds (Owlery + SOLR)
          - **Slow queries**: < 10 seconds (Neo4j + complex processing)
          - **Very Slow queries**: < 31 seconds (Complex OWL reasoning - over 30 seconds)

          ## Test Results

          \`\`\`
          $(cat performance_test_output.log)
          \`\`\`

          ## Summary

          EOF

          # Check overall test status. Matching "OK" or "ok" would
          # false-positive on per-test "test_foo ... ok" lines emitted by
          # unittest -v even when other tests failed, so the absence of
          # failure markers is used as the truth source (same pattern as the
          # final "Fail job on test failures" step).
          #
          # First decide whether any runner got far enough to print a summary:
          #   unittest: "Ran N tests in Xs"
          #   pytest:   "... N <outcome> in X.XXs" or "no tests ran"
          # The pytest alternative accepts whichever outcome happens to be
          # last on the summary line, so summaries ending in "N warnings in"
          # or "N errors in" are recognised as well as "N passed in".
          SUMMARY_RE='Ran .* test|[0-9]+ (passed|failed|errors?|skipped|deselected|xfailed|xpassed|warnings?) in [0-9]|no tests ran'
          if grep -qE "$SUMMARY_RE" performance_test_output.log; then
            # unittest emits "FAIL:" / "ERROR:"; pytest emits "FAILED " / "ERROR " (no colon).
            # The verdict is computed once and reused for the final "Result"
            # line so the two can never contradict each other.
            HAS_FAILURES=false
            if grep -q "FAIL:\|ERROR:\|FAILED\b\|^ERROR\b" performance_test_output.log; then
              HAS_FAILURES=true
            fi
            if [ "$HAS_FAILURES" = true ]; then
              echo "❌ **Test Status**: Performance tests ran but reported failures" >> performance.md
            else
              echo "✅ **Test Status**: Performance tests completed" >> performance.md
            fi
            echo "" >> performance.md

            # Count successes and failures. The log mixes unittest output
            # (`test_xxx ... ok` / `FAIL:` / `ERROR:`) and pytest output
            # (`... PASSED` / `FAILED` / `ERROR` lines) depending on which
            # step wrote it, so both formats are summed.
            #
            # `grep -c` prints `0` and exits 1 when nothing matches, so
            # `|| true` swallows that exit code (otherwise `set -e` would
            # abort the step). Empty captures default to 0.
            UNITTEST_TESTS=$(grep -cE "^test_" performance_test_output.log 2>/dev/null || true)
            PYTEST_TESTS=$(grep -cE " (PASSED|FAILED|ERROR)( |$)" performance_test_output.log 2>/dev/null || true)
            UNITTEST_FAIL=$(grep -cE "^(FAIL|ERROR):" performance_test_output.log 2>/dev/null || true)
            PYTEST_FAIL=$(grep -cE " (FAILED|ERROR)( |$)" performance_test_output.log 2>/dev/null || true)
            UNITTEST_TESTS=${UNITTEST_TESTS:-0}
            PYTEST_TESTS=${PYTEST_TESTS:-0}
            UNITTEST_FAIL=${UNITTEST_FAIL:-0}
            PYTEST_FAIL=${PYTEST_FAIL:-0}
            TOTAL_TESTS=$((UNITTEST_TESTS + PYTEST_TESTS))
            FAILED_TESTS=$((UNITTEST_FAIL + PYTEST_FAIL))
            ERROR_TESTS=0  # ERROR counts already folded into FAILED above
            PASSED_TESTS=$((TOTAL_TESTS - FAILED_TESTS))

            echo "### Test Statistics" >> performance.md
            echo "" >> performance.md
            echo "- **Total Tests**: ${TOTAL_TESTS}" >> performance.md
            echo "- **Passed**: ${PASSED_TESTS} ✅" >> performance.md
            echo "- **Failed**: ${FAILED_TESTS} ❌" >> performance.md
            echo "- **Errors**: ${ERROR_TESTS} ⚠️" >> performance.md
            echo "" >> performance.md

            # Extract timing information for key queries
            echo "### Query Performance Details" >> performance.md
            echo "" >> performance.md

            # Only emit the table when the log mentions timings at all.
            if grep -q "seconds" performance_test_output.log; then
              echo "| Query | Duration | Status |" >> performance.md
              echo "|-------|----------|--------|" >> performance.md

              # Parse the per-query timing lines ("<QueryName>: <n>s ...").
              # The `|| true` guards against pipefail propagating grep's
              # exit 1 (no matches) into the step. `IFS= read -r` reads each
              # line verbatim, without backslash processing or trimming.
              { grep -E "^(get_term_info|NeuronsPartHere|NeuronsSynaptic|NeuronsPresynapticHere|NeuronsPostsynapticHere|ComponentsOf|PartsOf|SubclassesOf|NeuronClassesFasciculatingHere|TractsNervesInnervatingHere|LineageClonesIn|ListAllAvailableImages|NeuronNeuronConnectivityQuery|NeuronRegionConnectivityQuery|NeuronInputsTo|DownstreamClassConnectivity|UpstreamClassConnectivity|QueryConnectivity):" performance_test_output.log || true; } | while IFS= read -r line; do
                # Query name is everything before the first colon.
                QUERY=${line%%:*}
                DURATION=$(echo "$line" | sed 's/.*: \([0-9.]*\)s.*/\1/')
                # A check mark on the line means the query met its threshold.
                if echo "$line" | grep -q "✅"; then
                  STATUS="✅ Pass"
                else
                  STATUS="❌ Fail"
                fi
                echo "| $QUERY | ${DURATION}s | $STATUS |" >> performance.md
              done
            fi

            echo "" >> performance.md

            # Overall result: success requires both zero counted failures and
            # no failure marker anywhere in the log (markers such as a
            # line-initial pytest "ERROR" are not included in the counts).
            if [ "$HAS_FAILURES" = false ] && [ "$FAILED_TESTS" -eq "0" ] && [ "$ERROR_TESTS" -eq "0" ]; then
              echo "🎉 **Result**: All performance thresholds met!" >> performance.md
            else
              echo "⚠️ **Result**: Some performance thresholds exceeded or tests failed" >> performance.md
              echo "" >> performance.md
              echo "Please review the failed tests above. Common causes:" >> performance.md
              echo "- Network latency to VFB services" >> performance.md
              echo "- SOLR/Neo4j/Owlery server load" >> performance.md
              echo "- First-time cache population (expected to be slower)" >> performance.md
            fi
          else
            # No runner summary found in the log at all.
            echo "❌ **Test Status**: Performance tests failed to run properly" >> performance.md
            echo "" >> performance.md
            echo "Please check the test output above for errors." >> performance.md
          fi

          # Append the report footer: a link to the workflow's run history and
          # a last-updated timestamp.
          {
            echo ""
            echo "---"
            echo ""
            echo "## Historical Performance"
            echo ""
            echo "Track performance trends across commits:"
            echo "- [GitHub Actions History](https://github.com/${{ github.repository }}/actions/workflows/performance-test.yml)"
            echo ""
            echo "---"
            echo "*Last updated: $(date -u '+%Y-%m-%d %H:%M:%S UTC')*"
          } >> performance.md

          # Mirror the finished report into the GitHub step summary so it is
          # visible on the run page.
          {
            echo "## Performance Test Report"
            echo "Performance results have been saved to performance.md"
            echo ""
            cat performance.md
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Commit and Push Performance Report
        # Runs only for refs/heads/main, and even after failed test steps so
        # a failing run still publishes its report.
        if: always() && github.ref == 'refs/heads/main'
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
          git add performance.md
          # Nothing staged means nothing to publish. Stop here rather than
          # pushing an unchanged HEAD, which the remote rejects as a
          # non-fast-forward whenever main has moved on during the job.
          if git diff --staged --quiet; then
            echo "performance.md unchanged; nothing to commit."
            exit 0
          fi
          git commit -m "Update performance test results [skip ci]"
          # The job can run for a long time, so main may have gained commits
          # since checkout. Replay the report commit on top of the latest
          # main before pushing. During a rebase `theirs` is the commit being
          # replayed, so `-X theirs` keeps this run's performance.md if
          # another run updated it meanwhile. `--autostash` shelves any
          # tracked files the test steps may have modified.
          git pull --rebase --autostash -X theirs origin main
          git push origin HEAD:main

      - name: Fail job on test failures
        # Safety net behind the test steps' own exit codes: fails the job if
        # the canonical log contains FAIL/ERROR lines even though no test step
        # exited non-zero (e.g. partial runs, swallowed pipelines). Placed
        # after the report and commit steps so those still happen.
        if: always()
        run: |
          # One pattern for both runners: unittest prints "FAIL:" / "ERROR:",
          # pytest prints "FAILED " / "ERROR " (no colon).
          FAILURE_PATTERN="FAIL:\|ERROR:\|FAILED\b\|^ERROR\b"
          if ! grep -q "$FAILURE_PATTERN" performance_test_output.log; then
            echo "No FAIL/ERROR lines detected."
            exit 0
          fi
          echo "::error::Test run reported FAIL or ERROR lines in performance_test_output.log"
          # List the offending lines in the step log for quick triage.
          grep "$FAILURE_PATTERN" performance_test_output.log
          exit 1