Increase JVM heap size for large CSV imports and enhance error messaging for memory issues

tae898 · tae898 · commit e06b595fe79f · 2025-10-25T23:58:14.000+02:00
diff --git a/.github/workflows/test-python-examples.yml b/.github/workflows/test-python-examples.yml
@@ -61,6 +61,9 @@ jobs:
 
       - name: Run all examples
         id: run_examples
+        env:
+          # Increase JVM heap for large CSV imports (example 04)
+          ARCADEDB_JVM_MAX_HEAP: "8g"
         run: |
           cd bindings/python/examples
 
diff --git a/bindings/python/examples/04_csv_import_documents.py b/bindings/python/examples/04_csv_import_documents.py
@@ -49,12 +49,19 @@
 - arcadedb-embedded (any distribution: headless, minimal, or full)
 - MovieLens dataset (downloaded via download_sample_data.py)
 - JRE 21+
+- Sufficient JVM heap memory (8GB recommended for the large dataset)
 
 Usage:
 1. First download the dataset:
    python download_sample_data.py
-2. Run this example:
-   python 04_csv_import_documents.py
+2. Run this example with sufficient memory:
+   ARCADEDB_JVM_MAX_HEAP="8g" python 04_csv_import_documents.py
+
+Memory Requirements:
+- Small dataset (~100K ratings): default 4GB heap is sufficient
+- Large dataset (~33M ratings): default 4GB heap should work; 8GB recommended
+- Very large datasets (100M+ records): set ARCADEDB_JVM_MAX_HEAP="8g" or higher
+- ARCADEDB_JVM_MAX_HEAP must be set BEFORE running the script (before the JVM starts)
 
 Note: This example creates a database at ./my_test_databases/movielens_db/
       The database files are preserved so you can inspect them after running.
@@ -74,6 +81,18 @@
 print("=" * 70)
 print()
 
+# Check JVM heap configuration for large imports
+jvm_heap = os.environ.get("ARCADEDB_JVM_MAX_HEAP")
+if jvm_heap:
+    print(f"💡 JVM Max Heap: {jvm_heap}")
+else:
+    print("💡 JVM Max Heap: 4g (default)")
+    print("   ℹ️  Using default JVM heap (4g)")
+    print("   💡 For very large datasets, you can increase it:")
+    print('      export ARCADEDB_JVM_MAX_HEAP="8g"  # or run with:')
+    print('      ARCADEDB_JVM_MAX_HEAP="8g" python 04_csv_import_documents.py')
+print()
+
 # -----------------------------------------------------------------------------
 # Step 0: Check Dataset Availability
 # -----------------------------------------------------------------------------
diff --git a/bindings/python/src/arcadedb_embedded/importer.py b/bindings/python/src/arcadedb_embedded/importer.py
@@ -409,6 +409,32 @@ def _import_using_java(
             return final_stats
 
         except Exception as e:
+            error_msg = str(e)
+
+            # Check for common memory-related errors
+            if any(
+                mem_indicator in error_msg.lower()
+                for mem_indicator in [
+                    "java heap space",
+                    "out of memory",
+                    "outofmemoryerror",
+                ]
+            ):
+                current_heap = os.environ.get("ARCADEDB_JVM_MAX_HEAP")
+                if current_heap:
+                    heap_msg = f"Current JVM heap: {current_heap}\n"
+                else:
+                    heap_msg = "Current JVM heap: 4g (default)\n"
+
+                raise ArcadeDBError(
+                    f"Import failed ({format_type} -> {import_type}): Out of memory.\n"
+                    f"{heap_msg}"
+                    f"💡 Try increasing heap size with environment variable:\n"
+                    f'   export ARCADEDB_JVM_MAX_HEAP="8g"\n'
+                    f"   Note: Must be set BEFORE running Python (before JVM starts)\n"
+                    f"Original error: {e}"
+                ) from e
+
             raise ArcadeDBError(
                 f"Import failed ({format_type} -> {import_type}): {e}"
             ) from e
diff --git a/bindings/python/src/arcadedb_embedded/jvm.py b/bindings/python/src/arcadedb_embedded/jvm.py
@@ -38,11 +38,11 @@ def start_jvm():
     classpath = os.pathsep.join(jar_files)
 
     # Allow customization via environment variables
-    max_heap = os.environ.get("ARCADEDB_JVM_MAX_HEAP", "2g")
+    max_heap = os.environ.get("ARCADEDB_JVM_MAX_HEAP", "4g")
 
     # Prepare JVM arguments
     jvm_args = [
-        f"-Xmx{max_heap}",  # Max heap (default 2g, override with env var)
+        f"-Xmx{max_heap}",  # Max heap (default 4g, override with env var)
         "-Djava.awt.headless=true",  # Headless mode for server use
     ]