29 changes: 27 additions & 2 deletions sdks/python/apache_beam/examples/inference/vllm_text_completion.py
@@ -112,6 +112,20 @@ def parse_known_args(argv):
      required=False,
      default=None,
      help='Chat template to use for chat example.')
+  parser.add_argument(
+      '--vllm_server_kwargs',
+      dest='vllm_server_kwargs',
+      type=str,
+      required=False,
+      default=None,
+      help='VLLM server kwargs in format key1=value1,key2=value2')
+  parser.add_argument(
+      '--use_dynamo',
+      dest='use_dynamo',
+      # argparse's type=bool treats any non-empty string as True; parse
+      # truthy strings explicitly instead.
+      type=lambda s: str(s).lower() in ('true', 't', '1'),
+      required=False,
+      default=False,
+      help='Whether to use NVIDIA Dynamo as the vLLM engine.')
  return parser.parse_known_args(argv)


@@ -132,15 +146,28 @@ def run(
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

-  model_handler = VLLMCompletionsModelHandler(model_name=known_args.model)
+  vllm_server_kwargs = {}
+  if known_args.vllm_server_kwargs:
+    for kv in known_args.vllm_server_kwargs.split(','):
+      # Split only on the first '=' so values may themselves contain '='.
+      k, v = kv.split('=', 1)
+      vllm_server_kwargs[k] = v
+
+  model_handler = VLLMCompletionsModelHandler(
+      model_name=known_args.model,
+      vllm_server_kwargs=vllm_server_kwargs,
+      use_dynamo=known_args.use_dynamo)
  input_examples = COMPLETION_EXAMPLES

  if known_args.chat:
    model_handler = VLLMChatModelHandler(
        model_name=known_args.model,
-        chat_template_path=known_args.chat_template)
+        chat_template_path=known_args.chat_template,
+        vllm_server_kwargs=vllm_server_kwargs,
+        use_dynamo=known_args.use_dynamo)
    input_examples = CHAT_EXAMPLES

  pipeline = test_pipeline
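For reference, here is a minimal sketch of how the new flags compose at the handler level. This is not part of the PR: the model, the prompt, and the server kwargs are illustrative (gpu-memory-utilization and max-model-len are standard vLLM server flags, passed through verbatim).

import apache_beam as beam
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.vllm_inference import VLLMCompletionsModelHandler

# Equivalent of passing:
#   --vllm_server_kwargs 'gpu-memory-utilization=0.9,max-model-len=2048' --use_dynamo=True
raw_kwargs = 'gpu-memory-utilization=0.9,max-model-len=2048'
# Same parse as run() above: comma-separated key=value pairs into a dict.
vllm_server_kwargs = dict(kv.split('=', 1) for kv in raw_kwargs.split(','))

handler = VLLMCompletionsModelHandler(
    model_name='facebook/opt-125m',
    vllm_server_kwargs=vllm_server_kwargs,
    use_dynamo=True)  # requires `pip install ai-dynamo[vllm]` on the workers

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['The capital of France is'])
      | RunInference(handler))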
@@ -34,14 +34,14 @@ RUN python3 --version
RUN apt-get install -y curl
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 && pip install --upgrade pip

-RUN pip install --no-cache-dir -vvv apache-beam[gcp]==2.58.1
-RUN pip install openai vllm
+RUN pip install --no-cache-dir -vvv apache-beam[gcp]==2.71.0
+RUN pip install --no-cache-dir openai vllm ai-dynamo[vllm]

RUN apt install libcairo2-dev pkg-config python3-dev -y
-RUN pip install pycairo
+RUN pip install --no-cache-dir pycairo

# Copy the Apache Beam worker dependencies from the Beam Python 3.12 SDK image.
-COPY --from=apache/beam_python3.12_sdk:2.58.1 /opt/apache/beam /opt/apache/beam
+COPY --from=apache/beam_python3.12_sdk:2.71.0 /opt/apache/beam /opt/apache/beam

# Set the entrypoint to Apache Beam SDK worker launcher.
ENTRYPOINT [ "/opt/apache/beam/boot" ]
38 changes: 32 additions & 6 deletions sdks/python/apache_beam/ml/inference/vllm_inference.py
@@ -109,13 +109,20 @@ def getAsyncVLLMClient(port) -> AsyncOpenAI:


class _VLLMModelServer():
-  def __init__(self, model_name: str, vllm_server_kwargs: dict[str, str]):
+  def __init__(
+      self,
+      model_name: str,
+      vllm_server_kwargs: dict[str, str],
+      vllm_executable: Optional[str] = None):
    self._model_name = model_name
    self._vllm_server_kwargs = vllm_server_kwargs
    self._server_started = False
    self._server_process = None
    self._server_port: int = -1
    self._server_process_lock = threading.RLock()
+    self._vllm_executable = 'vllm.entrypoints.openai.api_server'
+    if vllm_executable is not None:
+      self._vllm_executable = vllm_executable

    self.start_server()

@@ -125,7 +132,7 @@ def start_server(self, retries=3):
      server_cmd = [
          sys.executable,
          '-m',
-          'vllm.entrypoints.openai.api_server',
+          self._vllm_executable,
Contributor Author:
Changing this doesn't work on its own because the dynamo vllm executable doesn't include an api server. As a result, running this produces:

error: unrecognized arguments: --port 48455

So I'll need a different way of doing this.

Contributor Author:
I think I'll need to replicate something like https://github.com/ai-dynamo/dynamo?tab=readme-ov-file#run-dynamo instead
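For reference, a rough sketch of the launch command this change produces; build_server_cmd is a hypothetical stand-in for the assembly in start_server, and the expansion of vllm_server_kwargs into --key value flags is assumed from the surrounding code, which this hunk does not show.

import sys

def build_server_cmd(executable, model_name, port, server_kwargs):
  # Mirrors the server_cmd list above:
  #   python -m <module> --model <name> --port <port> [--key value ...]
  cmd = [sys.executable, '-m', executable, '--model', model_name, '--port', str(port)]
  for k, v in server_kwargs.items():
    cmd.extend([f'--{k}', str(v)])
  return cmd

# Default path: the OpenAI-compatible API server accepts --port.
build_server_cmd('vllm.entrypoints.openai.api_server', 'facebook/opt-125m', 48455, {})
# Dynamo path: swapping only the module fails, because dynamo.vllm rejects --port.
build_server_cmd('dynamo.vllm', 'facebook/opt-125m', 48455, {})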

Contributor Author:
I made this update, and now I'm successfully starting up a model endpoint (HTTP Request: GET http://localhost:52921/v1/models "HTTP/1.1 200 OK"); however, I'm now running into a new problem:

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/__main__.py", line 7, in <module>
    main()
  File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/main.py", line 820, in main
    uvloop.run(worker())
  File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run
    return __asyncio.run(
           ^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
  File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
    return await main
           ^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/main.py", line 67, in worker
    runtime = DistributedRuntime(
              ^^^^^^^^^^^^^^^^^^^
Exception: Failed to connect to NATS: IO error: Connection refused (os error 111). Verify NATS server is running and accessible.

https://console.cloud.google.com/dataflow/jobs/us-central1/2026-02-11_07_08_48-18398043110228237613

I think that this is called out in https://github.com/ai-dynamo/dynamo?tab=readme-ov-file#run-dynamo and I can avoid NATS entirely with --kv-events-config '{"enable_kv_cache_events": false}', but I've had a little trouble getting that right so far
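Assuming the server kwargs are forwarded verbatim as flags, one way to thread that workaround through the new plumbing might be the following (untested sketch):

# Hypothetical: pass the NATS-avoiding config through vllm_server_kwargs. The
# JSON value contains no ',' or '=', so the key=value flag parsing survives it.
vllm_server_kwargs = {'kv-events-config': '{"enable_kv_cache_events": false}'}
# ...which would expand to: --kv-events-config {"enable_kv_cache_events": false}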

Contributor Author:
I solved that piece, but am still running into issues:

{"job":"2026-02-11_07_49_28-348318588552584172", "logger":"/opt/apache/beam-venv/beam-venv-worker-sdk-0-0/lib/python3.12/site-packages/apache_beam/ml/inference/vllm_inference.py:84", "portability_worker_id":"sdk-0-0_sibling_2", "thread":"Thread-91 (log_stdout)", "worker":"beamapp-dannymccormick-02-02110749-wdvx-harness-qpdp"}
thread '<unnamed>' panicked at /opt/dynamo/lib/runtime/src/storage/kv.rs:440:29:
called `Result::unwrap()` on an `Err` value: BuildError(Unable to create lease. Check etcd server status at http://localhost:2379
{"job":"2026-02-11_07_49_28-348318588552584172", "logger":"/opt/apache/beam-venv/beam-venv-worker-sdk-0-0/lib/python3.12/site-packages/apache_beam/ml/inference/vllm_inference.py:84", "portability_worker_id":"sdk-0-0_sibling_2", "thread":"Thread-91 (log_stdout)", "worker":"beamapp-dannymccormick-02-02110749-wdvx-harness-qpdp"}
Caused by:
grpc request error: status: 'The service is currently unavailable', self: "tcp connect error")
�[2m2026-02-11T16:22:03.347934Z�[0m �[31mERROR�[0m �[2mrunners._cancel_all_tasks�[0m�[2m:�[0m unhandled exception during asyncio.run() shutdown
task: <Task finished name='Task-4' coro=<VllmEngineMonitor._check_engine_health() done, defined at /usr/local/lib/python3.12/dist-packages/dynamo/vllm/engine_monitor.py:68> exception=PanicException('called `Result::unwrap()` on an `Err` value: BuildError(Unable to create lease. Check etcd server status at http://localhost:2379\n\nCaused by:\n grpc request error: status: \'The service is currently unavailable\', self: "tcp connect error")')>
Traceback (most recent call last):
File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
return await main
^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/main.py", line 117, in worker
await init(runtime, config)
File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/main.py", line 578, in init
await register_vllm_model(
File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/main.py", line 370, in register_vllm_model
await register_llm(
Exception: unable to extract tokenizer kind from directory /root/.cache/huggingface/hub/models--facebook--opt-125m/snapshots/27dcfa74d334bc871f3234de431e71c6eeba5dd6
{"job":"2026-02-11_07_49_28-348318588552584172", "logger":"/opt/apache/beam-venv/beam-venv-worker-sdk-0-0/lib/python3.12/site-packages/apache_beam/ml/inference/vllm_inference.py:84", "portability_worker_id":"sdk-0-0_sibling_2", "thread":"Thread-91 (log_stdout)", "worker":"beamapp-dannymccormick-02-02110749-wdvx-harness-qpdp"}
During handling of the above exception, another exception occurred:
{"job":"2026-02-11_07_49_28-348318588552584172", "logger":"/opt/apache/beam-venv/beam-venv-worker-sdk-0-0/lib/python3.12/site-packages/apache_beam/ml/inference/vllm_inference.py:84", "portability_worker_id":"sdk-0-0_sibling_2", "thread":"Thread-91 (log_stdout)", "worker":"beamapp-dannymccormick-02-02110749-wdvx-harness-qpdp"}
Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/engine_monitor.py", line 71, in _check_engine_health
await self.engine_client.check_health()
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 734, in check_health
raise self.dead_error
vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.
{"job":"2026-02-11_07_49_28-348318588552584172", "logger":"/opt/apache/beam-venv/beam-venv-worker-sdk-0-0/lib/python3.12/site-packages/apache_beam/ml/inference/vllm_inference.py:84", "portability_worker_id":"sdk-0-0_sibling_2", "thread":"Thread-91 (log_stdout)", "worker":"beamapp-dannymccormick-02-02110749-wdvx-harness-qpdp"}
During handling of the above exception, another exception occurred:
{"job":"2026-02-11_07_49_28-348318588552584172", "logger":"/opt/apache/beam-venv/beam-venv-worker-sdk-0-0/lib/python3.12/site-packages/apache_beam/ml/inference/vllm_inference.py:84", "portability_worker_id":"sdk-0-0_sibling_2", "thread":"Thread-91 (log_stdout)", "worker":"beamapp-dannymccormick-02-02110749-wdvx-harness-qpdp"}
Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/engine_monitor.py", line 78, in _check_engine_health
self.runtime.shutdown()
pyo3_runtime.PanicException: called `Result::unwrap()` on an `Err` value: BuildError(Unable to create lease. Check etcd server status at http://localhost:2379
{"job":"2026-02-11_07_49_28-348318588552584172", "logger":"/opt/apache/beam-venv/beam-venv-worker-sdk-0-0/lib/python3.12/site-packages/apache_beam/ml/inference/vllm_inference.py:84", "portability_worker_id":"sdk-0-0_sibling_2", "thread":"Thread-91 (log_stdout)", "worker":"beamapp-dannymccormick-02-02110749-wdvx-harness-qpdp"}
Caused by:
grpc request error: status: 'The service is currently unavailable', self: "tcp connect error")
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/__main__.py", line 7, in <module>
main()
File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/main.py", line 820, in main
uvloop.run(worker())
File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run
return __asyncio.run(
^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
return await main
^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/main.py", line 117, in worker
await init(runtime, config)
File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/main.py", line 578, in init
await register_vllm_model(
File "/usr/local/lib/python3.12/dist-packages/dynamo/vllm/main.py", line 370, in register_vllm_model
await register_llm(
Exception: unable to extract tokenizer kind from directory /root/.cache/huggingface/hub/models--facebook--opt-125m/snapshots/27dcfa74d334bc871f3234de431e71c6eeba5dd6

Not sure what is going on yet

          '--model',
          self._model_name,
          '--port',
@@ -175,7 +182,8 @@ class VLLMCompletionsModelHandler(ModelHandler[str,
  def __init__(
      self,
      model_name: str,
-      vllm_server_kwargs: Optional[dict[str, str]] = None):
+      vllm_server_kwargs: Optional[dict[str, str]] = None,
+      use_dynamo: bool = False):
    """Implementation of the ModelHandler interface for vLLM using text as
    input.

@@ -194,13 +202,22 @@ def __init__(
        `{'echo': 'true'}` to prepend new messages with the previous message.
        For a list of possible kwargs, see
        https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-completions-api
+      use_dynamo: Whether to use NVIDIA Dynamo as the underlying vLLM engine.
+        Requires installing dynamo in your runtime environment
+        (`pip install ai-dynamo[vllm]`).
    """
    self._model_name = model_name
    self._vllm_server_kwargs: dict[str, str] = vllm_server_kwargs or {}
    self._env_vars = {}
+    self._vllm_executable = None
+    if use_dynamo:
+      self._vllm_executable = 'dynamo.vllm'

  def load_model(self) -> _VLLMModelServer:
-    return _VLLMModelServer(self._model_name, self._vllm_server_kwargs)
+    return _VLLMModelServer(
+        self._model_name,
+        self._vllm_server_kwargs,
+        self._vllm_executable)

  async def _async_run_inference(
      self,
@@ -253,7 +270,8 @@ def __init__(
      self,
      model_name: str,
      chat_template_path: Optional[str] = None,
-      vllm_server_kwargs: Optional[dict[str, str]] = None):
+      vllm_server_kwargs: Optional[dict[str, str]] = None,
+      use_dynamo: bool = False):
""" Implementation of the ModelHandler interface for vLLM using previous
messages as input.

@@ -277,12 +295,17 @@ def __init__(
        `{'echo': 'true'}` to prepend new messages with the previous message.
        For a list of possible kwargs, see
        https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-chat-api
+      use_dynamo: Whether to use NVIDIA Dynamo as the underlying vLLM engine.
+        Requires installing dynamo in your runtime environment
+        (`pip install ai-dynamo[vllm]`).
"""
self._model_name = model_name
self._vllm_server_kwargs: dict[str, str] = vllm_server_kwargs or {}
self._env_vars = {}
self._chat_template_path = chat_template_path
self._chat_file = f'template-{uuid.uuid4().hex}.jinja'
if use_dynamo:
self._vllm_executable = 'dynamo.vllm'

  def load_model(self) -> _VLLMModelServer:
    chat_template_contents = ''
@@ -295,7 +318,10 @@ def load_model(self) -> _VLLMModelServer:
        f.write(chat_template_contents)
      self._vllm_server_kwargs['chat_template'] = local_chat_template_path

-    return _VLLMModelServer(self._model_name, self._vllm_server_kwargs)
+    return _VLLMModelServer(
+        self._model_name,
+        self._vllm_server_kwargs,
+        self._vllm_executable)

  async def _async_run_inference(
      self,
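For completeness, a sketch of the chat-side handler wired up the same way. The template path matches the one used in the Dataflow test below; the model and server kwarg are illustrative.

from apache_beam.ml.inference.vllm_inference import VLLMChatModelHandler

chat_handler = VLLMChatModelHandler(
    model_name='facebook/opt-125m',
    chat_template_path='gs://apache-beam-ml/additional_files/sample_chat_template.jinja',
    vllm_server_kwargs={'gpu-memory-utilization': '0.9'},  # illustrative
    use_dynamo=True)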
@@ -20,3 +20,4 @@ pillow>=8.0.0
transformers>=4.18.0
google-cloud-monitoring>=2.27.0
openai>=1.52.2
+ai-dynamo[vllm]>=0.1.1
2 changes: 1 addition & 1 deletion sdks/python/setup.py
@@ -609,7 +609,7 @@ def get_portability_package_data():
        'xgboost': ['xgboost>=1.6.0,<2.1.3', 'datatable==1.0.0'],
        'tensorflow-hub': ['tensorflow-hub>=0.14.0,<0.16.0'],
        'milvus': milvus_dependency,
-        'vllm': ['openai==1.107.1', 'vllm==0.10.1.1', 'triton==3.3.1']
+        'vllm': ['openai==1.107.1', 'vllm==0.10.1.1', 'triton==3.3.1', 'ai-dynamo[vllm]==0.1.1']
    },
    zip_safe=False,
    # PyPI package information.
4 changes: 4 additions & 0 deletions sdks/python/test-suites/dataflow/common.gradle
@@ -478,6 +478,10 @@ def vllmTests = tasks.create("vllmTests") {
      executable 'sh'
      args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --chat true --chat_template 'gs://apache-beam-ml/additional_files/sample_chat_template.jinja' --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver:5xx'"
    }
+    exec {
+      executable 'sh'
+      args '-c', ". ${envdir}/bin/activate && pip install openai && python -m apache_beam.examples.inference.vllm_text_completion $cmdArgs --use_dynamo=T --experiment='worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver:5xx'"
+    }
  }
}
