Add Daytona curator global timeout override and make clone parsing more robust

sjarmak · sjarmak · commit bce4351072b2 · 2026-03-04T15:18:46.000Z
diff --git a/scripts/context_retrieval_agent.py b/scripts/context_retrieval_agent.py
@@ -844,18 +844,74 @@ def load_task_context(task_dir: Path) -> Dict[str, Any]:
 
 
 def _extract_clone_urls(dockerfile_content: str) -> List[Dict[str, str]]:
-    """Extract git clone URLs from a Dockerfile."""
+    """Extract git clone URLs from a Dockerfile.
+
+    Handles common clone flag variants (`--branch`, `--no-tags`, etc.) and
+    clones without an explicit target directory.
+    """
+    def _default_clone_target(url: str) -> str:
+        clean = url.rstrip("/")
+        if clean.endswith(".git"):
+            clean = clean[:-4]
+        # Support scp-like syntax, e.g. git@github.com:org/repo.git
+        clean = clean.split(":")[-1]
+        return clean.split("/")[-1] or "repo"
+
+    option_with_value = {
+        "--branch",
+        "--depth",
+        "--config",
+        "--origin",
+        "--reference",
+        "--reference-if-able",
+        "--separate-git-dir",
+        "--upload-pack",
+        "--shallow-since",
+        "--jobs",
+        "-b",
+        "-c",
+        "-o",
+        "-u",
+    }
+
+    normalized = dockerfile_content.replace("\\\n", " ")
     results = []
-    for match in re.finditer(
-        r"git\s+clone\s+(?:--depth\s+\d+\s+)?(\S+?)(?:\.git)?\s+(\S+)",
-        dockerfile_content,
-    ):
-        url = match.group(1)
-        target = match.group(2)
-        # Extract mirror slug from URL
+    for match in re.finditer(r"git\s+clone\b([^\n]*)", normalized):
+        suffix = match.group(1).strip()
+        suffix = re.split(r"\s*(?:&&|;|\|\|)\s*", suffix, maxsplit=1)[0]
+        try:
+            tokens = shlex.split(f"git clone {suffix}")
+        except ValueError:
+            continue
+
+        # Parse: git clone [options] <repo> [<directory>]
+        positional = []
+        i = 2
+        while i < len(tokens):
+            tok = tokens[i]
+            if tok == "--":
+                i += 1
+                continue
+            if tok in option_with_value:
+                i += 2
+                continue
+            if tok.startswith("-"):
+                i += 1
+                continue
+            positional.append(tok)
+            i += 1
+
+        if not positional:
+            continue
+
+        url = positional[0]
+        target = positional[1] if len(positional) > 1 else _default_clone_target(url)
+
+        # Extract mirror slug from URL when available.
         m = re.search(r"github\.com/(.+?)(?:\.git)?$", url)
         slug = m.group(1) if m else url
         results.append({"url": url, "slug": slug, "target": target})
+
     return results
 
 
diff --git a/scripts/daytona_curator_runner.py b/scripts/daytona_curator_runner.py
@@ -192,10 +192,22 @@ def _extract_repo_info_for_sandbox(ctx: Dict[str, Any]) -> List[Dict[str, str]]:
         slug = entry.get("slug", "")
         target = entry.get("target", "repo")
         if url:
-            # Ensure URL ends with .git for clone
-            clone_url = url if url.endswith(".git") else url + ".git"
-            # Extract dir name from target path
-            name = target.rstrip("/").split("/")[-1] if target else "repo"
+            # Only normalize GitHub URLs to ".git"; other hosts (e.g.
+            # go.googlesource.com) often work best with the original URL.
+            if url.endswith(".git"):
+                clone_url = url
+            elif "github.com/" in url:
+                clone_url = url + ".git"
+            else:
+                clone_url = url
+
+            # Extract dir name from target path. If Dockerfile uses "." as the
+            # target, derive a stable repo name from slug/URL.
+            target_name = target.rstrip("/").split("/")[-1] if target else ""
+            if target_name in {"", "."}:
+                fallback = slug or url
+                target_name = fallback.rstrip("/").split("/")[-1].replace(".git", "")
+            name = target_name or "repo"
             repos.append({
                 "url": clone_url,
                 "commit": "HEAD",  # mirrors are at the right commit
@@ -1134,9 +1146,11 @@ def _run_sdlc_mode(args, creds: Dict[str, Any]) -> int:
     ]
 
     future_timeout = SANDBOX_TIMEOUT_SEC + 300  # clone (300s) + curator (900s)
-    # Global timeout: generous to handle slow clones + curator runs in parallel
-    # Each task runs in its own sandbox, so total wall time ≈ max(individual times)
-    global_timeout = future_timeout + 600  # extra 10 min buffer
+    # Scale global timeout by queued "waves" so large batches don't get cut off
+    # by a fixed wall-clock limit. Allow explicit override for long/retry runs.
+    waves = max(1, (len(tasks) + max(1, args.parallel) - 1) // max(1, args.parallel))
+    computed_global_timeout = (future_timeout * waves) + 600  # extra 10 min buffer
+    global_timeout = args.global_timeout_sec if args.global_timeout_sec > 0 else computed_global_timeout
 
     executor = ThreadPoolExecutor(max_workers=args.parallel)
     futures = {
@@ -1291,7 +1305,9 @@ def _run_contextbench_mode(args, creds: Dict[str, Any]) -> int:
     ]
 
     future_timeout = SANDBOX_TIMEOUT_SEC + 300
-    global_timeout = future_timeout + 600
+    waves = max(1, (len(tasks) + max(1, args.parallel) - 1) // max(1, args.parallel))
+    computed_global_timeout = (future_timeout * waves) + 600
+    global_timeout = args.global_timeout_sec if args.global_timeout_sec > 0 else computed_global_timeout
 
     executor = ThreadPoolExecutor(max_workers=args.parallel)
     futures = {
@@ -1437,6 +1453,15 @@ def main() -> int:
                         choices=("local", "deepsearch", "hybrid"))
     parser.add_argument("--parallel", type=int, default=DEFAULT_PARALLEL,
                         help=f"Concurrent sandboxes (default: {DEFAULT_PARALLEL})")
+    parser.add_argument(
+        "--global-timeout-sec",
+        type=int,
+        default=0,
+        help=(
+            "Override wall-clock timeout for the full batch in seconds. "
+            "Default (0) uses a computed timeout based on task count and parallelism."
+        ),
+    )
     parser.add_argument("--max-cost", type=float, default=0, help="Cost limit in USD")
     parser.add_argument("--max-tasks", type=int, default=0, help="Max tasks to process")
     parser.add_argument("--dry-run", action="store_true")