@@ -1601,6 +1601,8 @@ def infer_lmm(
16011601 model_id : str ,
16021602 prompt : Optional [str ] = None ,
16031603 model_id_in_path : bool = False ,
1604+ max_new_tokens : Optional [int ] = None ,
1605+ enable_thinking : Optional [bool ] = None ,
16041606 ) -> Union [dict , List [dict ]]:
16051607 """Run inference using a Large Multimodal Model (LMM).
16061608
@@ -1620,6 +1622,10 @@ def infer_lmm(
16201622 model_id_in_path (bool, optional): If True, includes model_id in the URL path
16211623 (e.g., /infer/lmm/florence-2-base) which enables path-based routing.
16221624 If False (default), model_id is only sent in the request body.
1625+ max_new_tokens (Optional[int], optional): Maximum number of tokens to generate.
1626+ If not provided, the server-side model default is used.
1627+ enable_thinking (Optional[bool], optional): Enables reasoning mode for models
1628+ that support it. If not provided, the server-side model default is used.
16231629
16241630 Returns:
16251631 Union[dict, List[dict]]: Inference results containing the model response.
@@ -1632,6 +1638,10 @@ def infer_lmm(
16321638 extra_payload = {"model_id" : model_id }
16331639 if prompt is not None :
16341640 extra_payload ["prompt" ] = prompt
1641+ if max_new_tokens is not None :
1642+ extra_payload ["max_new_tokens" ] = max_new_tokens
1643+ if enable_thinking is not None :
1644+ extra_payload ["enable_thinking" ] = enable_thinking
16351645
16361646 if model_id_in_path :
16371647 endpoint = f"/infer/lmm/{ model_id } "
@@ -1652,6 +1662,8 @@ async def infer_lmm_async(
16521662 model_id : str ,
16531663 prompt : Optional [str ] = None ,
16541664 model_id_in_path : bool = False ,
1665+ max_new_tokens : Optional [int ] = None ,
1666+ enable_thinking : Optional [bool ] = None ,
16551667 ) -> Union [dict , List [dict ]]:
16561668 """Run inference using a Large Multimodal Model (LMM) asynchronously.
16571669
@@ -1666,6 +1678,10 @@ async def infer_lmm_async(
16661678 model_id_in_path (bool, optional): If True, includes model_id in the URL path
16671679 (e.g., /infer/lmm/florence-2-base) which enables path-based routing.
16681680 If False (default), model_id is only sent in the request body.
1681+ max_new_tokens (Optional[int], optional): Maximum number of tokens to generate.
1682+ If not provided, the server-side model default is used.
1683+ enable_thinking (Optional[bool], optional): Enables reasoning mode for models
1684+ that support it. If not provided, the server-side model default is used.
16691685
16701686 Returns:
16711687 Union[dict, List[dict]]: Inference results containing the model response.
@@ -1677,6 +1693,10 @@ async def infer_lmm_async(
16771693 extra_payload = {"model_id" : model_id }
16781694 if prompt is not None :
16791695 extra_payload ["prompt" ] = prompt
1696+ if max_new_tokens is not None :
1697+ extra_payload ["max_new_tokens" ] = max_new_tokens
1698+ if enable_thinking is not None :
1699+ extra_payload ["enable_thinking" ] = enable_thinking
16801700
16811701 if model_id_in_path :
16821702 endpoint = f"/infer/lmm/{ model_id } "
0 commit comments