Refactor Python examples workflow: test example 04 with small and large datasets, remove the separate data-download step, and write per-size logs

tae898 · tae898 · commit c16ec7cca684 · 2025-10-26T22:39:42.000+01:00
diff --git a/.github/workflows/test-python-examples.yml b/.github/workflows/test-python-examples.yml
@@ -52,13 +52,6 @@ jobs:
           # Install dependencies needed by examples
           pip install numpy requests
 
-      - name: Download sample data
-        run: |
-          cd bindings/python/examples
-          echo "📥 Downloading sample data for examples..."
-          # Download large dataset for comprehensive testing
-          python download_sample_data.py
-
       - name: Run all examples
         id: run_examples
         env:
@@ -90,29 +83,67 @@ jobs:
 
           # Run each example
           for example in $examples; do
-            total=$((total + 1))
-            echo "----------------------------------------"
-            echo "📝 Running: $example"
-            echo "----------------------------------------"
-
-            # Create a timeout wrapper to prevent hanging
-            if timeout 1800 python "$example" > "${example}.log" 2>&1; then
-              echo "✅ PASSED: $example" | tee -a $results_file
-              passed=$((passed + 1))
+            # For example 04, test both small and large datasets
+            if [ "$example" = "04_csv_import_documents.py" ]; then
+              for size in small large; do
+                total=$((total + 1))
+                example_name="$example (--size $size)"
+                log_file="${example%.py}_${size}.log"
+
+                echo "----------------------------------------"
+                echo "📝 Running: $example_name"
+                echo "----------------------------------------"
+
+                # Create a timeout wrapper to prevent hanging (30 min for small, 60 min for large)
+                if [ "$size" = "small" ]; then
+                  timeout_duration=1800
+                else
+                  timeout_duration=3600
+                fi
+
+                if timeout $timeout_duration python "$example" --size "$size" > "$log_file" 2>&1; then
+                  echo "✅ PASSED: $example_name" | tee -a $results_file
+                  passed=$((passed + 1))
+                else
+                  exit_code=$?
+                  if [ $exit_code -eq 124 ]; then
+                    echo "⏱️  TIMEOUT: $example_name (exceeded $((timeout_duration/60)) minutes)" | tee -a $results_file
+                    failed=$((failed + 1))
+                  else
+                    echo "❌ FAILED: $example_name (exit code: $exit_code)" | tee -a $results_file
+                    failed=$((failed + 1))
+                  fi
+                  # Show last 20 lines of error log
+                  echo "Last 20 lines of output:"
+                  tail -n 20 "$log_file"
+                fi
+                echo ""
+              done
             else
-              exit_code=$?
-              if [ $exit_code -eq 124 ]; then
-                echo "⏱️  TIMEOUT: $example (exceeded 30 minutes)" | tee -a $results_file
-                failed=$((failed + 1))
+              total=$((total + 1))
+              echo "----------------------------------------"
+              echo "📝 Running: $example"
+              echo "----------------------------------------"
+
+              # Create a timeout wrapper to prevent hanging
+              if timeout 1800 python "$example" > "${example}.log" 2>&1; then
+                echo "✅ PASSED: $example" | tee -a $results_file
+                passed=$((passed + 1))
               else
-                echo "❌ FAILED: $example (exit code: $exit_code)" | tee -a $results_file
-                failed=$((failed + 1))
+                exit_code=$?
+                if [ $exit_code -eq 124 ]; then
+                  echo "⏱️  TIMEOUT: $example (exceeded 30 minutes)" | tee -a $results_file
+                  failed=$((failed + 1))
+                else
+                  echo "❌ FAILED: $example (exit code: $exit_code)" | tee -a $results_file
+                  failed=$((failed + 1))
+                fi
+                # Show last 20 lines of error log
+                echo "Last 20 lines of output:"
+                tail -n 20 "${example}.log"
               fi
-              # Show last 20 lines of error log
-              echo "Last 20 lines of output:"
-              tail -n 20 "${example}.log"
+              echo ""
             fi
-            echo ""
           done
 
           # Print summary
@@ -185,12 +216,13 @@ jobs:
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "### Examples Tested" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
-          echo "- 01_simple_document_store.py - Document CRUD operations" >> $GITHUB_STEP_SUMMARY
-          echo "- 02_social_network_graph.py - Graph modeling and traversal" >> $GITHUB_STEP_SUMMARY
-          echo "- 03_vector_search.py - Vector embeddings and similarity search" >> $GITHUB_STEP_SUMMARY
-          echo "- 04_csv_import_documents.py - CSV data import with type inference" >> $GITHUB_STEP_SUMMARY
+          echo "- **01_simple_document_store.py** - Document CRUD operations with comprehensive data types" >> $GITHUB_STEP_SUMMARY
+          echo "- **02_social_network_graph.py** - Graph modeling with vertices, edges, and traversal" >> $GITHUB_STEP_SUMMARY
+          echo "- **03_vector_search.py** - Vector embeddings and semantic similarity search (experimental)" >> $GITHUB_STEP_SUMMARY
+          echo "- **04_csv_import_documents.py** - CSV import with automatic dataset download and type inference" >> $GITHUB_STEP_SUMMARY
+          echo "  - Tested with \`--size small\` (~1 MB, ~100K ratings, 30 min timeout)" >> $GITHUB_STEP_SUMMARY
+          echo "  - Tested with \`--size large\` (~265 MB, ~33M ratings, 60 min timeout)" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
-          echo "_Note: Example 04 requires the MovieLens dataset. If it fails, the dataset may need to be downloaded._" >> $GITHUB_STEP_SUMMARY
 
       - name: Upload example logs
         if: always()