|
20 | 20 | package org.apache.sysds.test; |
21 | 21 |
|
22 | 22 | import static java.lang.Math.ceil; |
23 | | -import static java.lang.Thread.sleep; |
24 | 23 | import static org.junit.Assert.assertEquals; |
25 | 24 | import static org.junit.Assert.fail; |
26 | 25 |
|
@@ -118,15 +117,10 @@ public abstract class AutomatedTestBase { |
118 | 117 | public static final double GPU_TOLERANCE = 1e-9; |
119 | 118 |
|
120 | 119 | /** |
121 | | - * Default upper bound (ms) passed to federated worker readiness waits. The wait returns as soon |
122 | | - * as the worker's TCP port accepts a connection, so this value only affects the deadline used |
123 | | - * when a worker never becomes ready. {@link FederatedWorkerUtils} clamps caller values below its |
124 | | - * enforced floor up to that floor, so the effective ceiling is at least that floor regardless |
125 | | - * of this constant. |
| 120 | + * Default deadline (ms) for federated worker/monitoring readiness waits and a few legacy |
| 121 | + * {@code sleep()} calls. {@link FederatedWorkerUtils} raises smaller values to its own floor. |
126 | 122 | */ |
127 | 123 | public static final int FED_WORKER_WAIT = 3000; |
128 | | - public static final int FED_MONITOR_WAIT = 10000; |
129 | | - public static final int FED_WORKER_WAIT_S = 50; |
130 | 124 |
|
131 | 125 |
|
132 | 126 | // Timeout after which a test fails. All tests must execute in less than this time. |
@@ -1765,29 +1759,53 @@ private static Process spawnLocalFedWorker(int port, String[] addArgs) { |
1765 | 1759 | } |
1766 | 1760 |
|
1767 | 1761 | /** |
1768 | | - * Start new JVM for a federated monitoring backend at the port. |
| 1762 | + * Start a new JVM for a federated monitoring backend at the port. |
1769 | 1763 | * |
1770 | | - * @param port Port to use for the JVM |
1771 | | - * @return the process associated with the worker. |
| 1764 | + * <p>Returns once the backend's TCP port accepts connections (Netty's bind has completed), or |
| 1765 | + * throws a {@link RuntimeException} if that does not happen within {@link #FED_WORKER_WAIT} ms |
| 1766 | + * (raised to the {@link FederatedWorkerUtils} minimum if that is larger). |
| 1767 | + * |
| 1768 | + * @param port Port to use for the JVM |
| 1769 | + * @param addArgs Extra CLI args to append, or null |
| 1770 | + * @return the process associated with the monitoring backend. |
1772 | 1771 | */ |
1773 | 1772 | protected Process startLocalFedMonitoring(int port, String[] addArgs) { |
1774 | | - Process process = null; |
| 1773 | + return startLocalFedMonitoring(port, addArgs, FED_WORKER_WAIT); |
| 1774 | + } |
| 1775 | + |
| 1776 | + /** |
| 1777 | + * Start a new JVM for a federated monitoring backend at the port. |
| 1778 | + * |
| 1779 | + * <p>Returns once the backend's TCP port accepts connections, or throws a |
| 1780 | + * {@link RuntimeException} after {@code timeoutMs} elapses. The monitoring server opens the |
| 1781 | + * port after Netty's {@code bind().sync()} returns; a successful TCP connect therefore signals |
| 1782 | + * that the HTTP listener is ready to accept requests. |
| 1783 | + * |
| 1784 | + * @param port Port to use for the JVM |
| 1785 | + * @param addArgs Extra CLI args to append, or null |
| 1786 | + * @param timeoutMs Upper bound on the wait, in ms; raised to a minimum value enforced inside |
| 1787 | + * {@link FederatedWorkerUtils}. |
| 1788 | + * @return the process associated with the monitoring backend. |
| 1789 | + */ |
| 1790 | + protected Process startLocalFedMonitoring(int port, String[] addArgs, int timeoutMs) { |
| 1791 | + Process process = spawnLocalFedMonitoring(port, addArgs); |
| 1792 | + FederatedWorkerUtils.waitForWorker(port, timeoutMs, process::isAlive, "monitoring process"); |
| 1793 | + return process; |
| 1794 | + } |
| 1795 | + |
| 1796 | + /** Spawn a federated monitoring backend JVM and return without waiting for the port to bind. */ |
| 1797 | + private static Process spawnLocalFedMonitoring(int port, String[] addArgs) { |
1775 | 1798 | String separator = System.getProperty("file.separator"); |
1776 | 1799 | String classpath = System.getProperty("java.class.path"); |
1777 | 1800 | String path = System.getProperty("java.home") + separator + "bin" + separator + "java"; |
1778 | | - String[] args = ArrayUtils.addAll(new String[]{path, "-cp", classpath, DMLScript.class.getName(), |
1779 | | - "-fedMonitoring", Integer.toString(port)}, addArgs); |
1780 | | - ProcessBuilder processBuilder = new ProcessBuilder(args); |
1781 | | - |
| 1801 | + String[] args = ArrayUtils.addAll(new String[] {path, "-cp", classpath, DMLScript.class.getName(), |
| 1802 | + "-fedMonitoring", Integer.toString(port)}, addArgs); |
1782 | 1803 | try { |
1783 | | - process = processBuilder.start(); |
1784 | | - // Wait till process is started |
1785 | | - sleep(FED_MONITOR_WAIT); |
| 1804 | + return new ProcessBuilder(args).start(); |
1786 | 1805 | } |
1787 | | - catch(IOException | InterruptedException e) { |
1788 | | - throw new RuntimeException(e); |
| 1806 | + catch(IOException e) { |
| 1807 | + throw new RuntimeException("Failed to launch federated monitoring process on port " + port, e); |
1789 | 1808 | } |
1790 | | - return process; |
1791 | 1809 | } |
1792 | 1810 |
|
1793 | 1811 | /** |
|
0 commit comments