Skip to content

Commit 549113c

Browse files
Merge pull request #1350 from roboflow/codex/add-workflow-blocks-for-perception-encoder
Add perception encoder workflow blocks
2 parents 4feb97e + fed8301 commit 549113c

9 files changed

Lines changed: 491 additions & 1 deletion

File tree

docs/foundation/perception_encoder.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,15 @@ Run the code to use Perception Encoder on your webcam.
9797

9898
**Note:** The model will take a minute or two to load. You will not see output while the model is loading.
9999

100+
## Using PE in Workflows
101+
102+
Perception Encoder can be used in Roboflow Workflows via the
103+
**Perception Encoder Embedding Model** block. This block lets you generate
104+
embeddings for images or text without writing code.
105+
100106
## API Compatibility
101107

102108
The Perception Encoder model uses the **same API as CLIP**. This means you can use all the same methods and request/response formats as you would with CLIP, including `embed_text`, `embed_image`, and `compare`.
103109

104110
For more details and advanced usage, see the [CLIP documentation](./clip.md).
111+

inference/core/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.50.5"
1+
__version__ = "0.51.0"
22

33

44
if __name__ == "__main__":

inference/core/workflows/core_steps/loader.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,9 @@
203203
from inference.core.workflows.core_steps.models.foundation.openai.v3 import (
204204
OpenAIBlockV3,
205205
)
206+
from inference.core.workflows.core_steps.models.foundation.perception_encoder.v1 import (
207+
PerceptionEncoderModelBlockV1,
208+
)
206209
from inference.core.workflows.core_steps.models.foundation.qwen.v1 import (
207210
Qwen25VLBlockV1,
208211
)
@@ -561,6 +564,7 @@ def load_blocks() -> List[Type[WorkflowBlock]]:
561564
ClipComparisonBlockV1,
562565
ClipComparisonBlockV2,
563566
ClipModelBlockV1,
567+
PerceptionEncoderModelBlockV1,
564568
CogVLMBlockV1,
565569
ColorVisualizationBlockV1,
566570
ConvertGrayscaleBlockV1,

inference/core/workflows/core_steps/models/foundation/perception_encoder/__init__.py

Whitespace-only changes.
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
import hashlib
2+
from typing import List, Literal, Optional, Type, Union
3+
4+
from pydantic import ConfigDict, Field
5+
6+
from inference.core.cache.lru_cache import LRUCache
7+
from inference.core.entities.requests.perception_encoder import (
8+
PerceptionEncoderImageEmbeddingRequest,
9+
PerceptionEncoderTextEmbeddingRequest,
10+
)
11+
from inference.core.env import (
12+
HOSTED_CORE_MODEL_URL,
13+
LOCAL_INFERENCE_API_URL,
14+
WORKFLOWS_REMOTE_API_TARGET,
15+
)
16+
from inference.core.managers.base import ModelManager
17+
from inference.core.workflows.core_steps.common.entities import StepExecutionMode
18+
from inference.core.workflows.core_steps.common.utils import load_core_model
19+
from inference.core.workflows.execution_engine.entities.base import (
20+
OutputDefinition,
21+
WorkflowImageData,
22+
)
23+
from inference.core.workflows.execution_engine.entities.types import (
24+
EMBEDDING_KIND,
25+
IMAGE_KIND,
26+
STRING_KIND,
27+
Selector,
28+
)
29+
from inference.core.workflows.prototypes.block import (
30+
BlockResult,
31+
WorkflowBlock,
32+
WorkflowBlockManifest,
33+
)
34+
from inference_sdk import InferenceHTTPClient
35+
36+
LONG_DESCRIPTION = """
37+
Use the Meta Perception Encoder model to create semantic embeddings of text and images.
38+
39+
This block accepts an image or string and returns an embedding. The embedding can be used to compare
40+
similarity between different images or between images and text.
41+
"""
42+
43+
44+
# Manifest of the Perception Encoder embedding step: declares the step inputs
# (`data`, `version`), its single `embedding` output, and the descriptive / UI
# metadata carried in `json_schema_extra`.
# NOTE(review): documented with `#` comments rather than a class docstring
# because pydantic may copy a model docstring into the generated JSON schema -
# confirm before converting this into a docstring.
class BlockManifest(WorkflowBlockManifest):
    model_config = ConfigDict(
        json_schema_extra={
            "name": "Perception Encoder Embedding Model",
            "version": "v1",
            "short_description": "Generate an embedding of an image or string.",
            "long_description": LONG_DESCRIPTION,
            "license": "MIT",
            "block_type": "model",
            "ui_manifest": {
                "section": "model",
                "icon": "far fa-paperclip",
                "blockPriority": 9.9,
            },
        }
    )
    # Block type identifier; only this exact literal validates.
    type: Literal["roboflow_core/perception_encoder@v1"]
    name: str = Field(description="Unique name of step in workflows")
    # Input to embed: a selector of IMAGE_KIND / STRING_KIND data, or a plain string.
    data: Union[Selector(kind=[IMAGE_KIND, STRING_KIND]), str] = Field(
        title="Data",
        description="The string or image to generate an embedding for.",
        examples=["$inputs.image", "$steps.cropping.crops"],
    )
    # Model variant: one of the three listed literals, or a selector of
    # STRING_KIND data. Falls back to "PE-Core-L14-336" when omitted.
    version: Union[
        Literal[
            "PE-Core-B16-224",
            "PE-Core-L14-336",
            "PE-Core-G14-448",
        ],
        Selector(kind=[STRING_KIND]),
    ] = Field(
        default="PE-Core-L14-336",
        description="Variant of Perception Encoder model",
        examples=["PE-Core-B16-224", "$inputs.variant"],
    )

    @classmethod
    def describe_outputs(cls) -> List[OutputDefinition]:
        """Return the block outputs: a single `embedding` of EMBEDDING_KIND."""
        return [OutputDefinition(name="embedding", kind=[EMBEDDING_KIND])]

    @classmethod
    def get_execution_engine_compatibility(cls) -> Optional[str]:
        """Return the execution-engine version range string declared for this block."""
        return ">=1.3.0,<2.0.0"
87+
88+
89+
# Module-level cache of text embeddings. PerceptionEncoderModelBlockV1.run_locally
# reads and fills it, keyed by the MD5 hex digest of `version + text`; image
# inputs and remote execution do not use it.
text_cache = LRUCache()
90+
91+
92+
class PerceptionEncoderModelBlockV1(WorkflowBlock):
    """Workflow block that embeds an image or a string with Perception Encoder.

    Depending on the configured step execution mode the embedding is computed
    through the local model manager or requested from an inference HTTP API.
    Every execution path returns a dict with a single ``"embedding"`` key.
    """

    def __init__(
        self,
        model_manager: ModelManager,
        api_key: Optional[str],
        step_execution_mode: StepExecutionMode,
    ):
        """Store the collaborators named by ``get_init_parameters``."""
        self._model_manager = model_manager
        self._api_key = api_key
        self._step_execution_mode = step_execution_mode

    @classmethod
    def get_init_parameters(cls) -> List[str]:
        """Return the names of the constructor arguments, in order."""
        return ["model_manager", "api_key", "step_execution_mode"]

    @classmethod
    def get_manifest(cls) -> Type[WorkflowBlockManifest]:
        """Return the manifest class describing this block."""
        return BlockManifest

    def run(
        self,
        data: Union[WorkflowImageData, str],
        version: str,
    ) -> BlockResult:
        """Embed ``data`` with model variant ``version``.

        Dispatches to ``run_locally`` or ``run_remotely`` based on the step
        execution mode.

        Raises:
            ValueError: when the execution mode is neither LOCAL nor REMOTE.
        """
        mode = self._step_execution_mode
        if mode is StepExecutionMode.LOCAL:
            return self.run_locally(data=data, version=version)
        if mode is StepExecutionMode.REMOTE:
            return self.run_remotely(data=data, version=version)
        raise ValueError(
            f"Unknown step execution mode: {self._step_execution_mode}"
        )

    def run_locally(
        self,
        data: Union[WorkflowImageData, str],
        version: str,
    ) -> BlockResult:
        """Compute the embedding through the local model manager.

        String inputs are looked up in (and stored into) the module-level
        ``text_cache``; image inputs are always sent to the model.
        """
        if not isinstance(data, str):
            image_request = PerceptionEncoderImageEmbeddingRequest(
                perception_encoder_version_id=version,
                image=[data.to_inference_format(numpy_preferred=True)],
                api_key=self._api_key,
            )
            return {"embedding": self._embed_locally(image_request)}

        # Text path: the cache key covers both the model variant and the text.
        digest_input = (version + data).encode("utf-8")
        cache_key = hashlib.md5(digest_input).hexdigest()
        cached_embedding = text_cache.get(cache_key)
        if cached_embedding is not None:
            return {"embedding": cached_embedding}

        text_request = PerceptionEncoderTextEmbeddingRequest(
            perception_encoder_version_id=version,
            text=[data],
            api_key=self._api_key,
        )
        embedding = self._embed_locally(text_request)
        text_cache.set(cache_key, embedding)
        return {"embedding": embedding}

    def _embed_locally(self, inference_request):
        """Load the core model for ``inference_request`` and return its first embedding."""
        pe_model_id = load_core_model(
            model_manager=self._model_manager,
            inference_request=inference_request,
            core_model="perception_encoder",
        )
        predictions = self._model_manager.infer_from_request_sync(
            pe_model_id, inference_request
        )
        return predictions.embeddings[0]

    def run_remotely(
        self,
        data: Union[WorkflowImageData, str],
        version: str,
    ) -> BlockResult:
        """Request the embedding from an inference HTTP API.

        The hosted core-model URL is used when ``WORKFLOWS_REMOTE_API_TARGET``
        equals ``"hosted"`` (the client is then switched with
        ``select_api_v0()``); otherwise the local inference API URL is used.
        """
        use_hosted_api = WORKFLOWS_REMOTE_API_TARGET == "hosted"
        client = InferenceHTTPClient(
            api_url=HOSTED_CORE_MODEL_URL if use_hosted_api else LOCAL_INFERENCE_API_URL,
            api_key=self._api_key,
        )
        if use_hosted_api:
            client.select_api_v0()

        if isinstance(data, str):
            response = client.get_perception_encoder_text_embeddings(
                text=data,
                perception_encoder_version=version,
            )
        else:
            response = client.get_perception_encoder_image_embeddings(
                inference_input=data.base64_image,
                perception_encoder_version=version,
            )
        return {"embedding": response["embeddings"][0]}

inference_sdk/http/client.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1389,6 +1389,44 @@ async def clip_compare_async(
13891389
response.raise_for_status()
13901390
return await response.json()
13911391

1392+
@wrap_errors
def get_perception_encoder_image_embeddings(
    self,
    inference_input: Union[ImagesReference, List[ImagesReference]],
    perception_encoder_version: Optional[str] = None,
) -> Union[dict, List[dict]]:
    """Get Perception Encoder embeddings for input image(s).

    Args:
        inference_input: One image reference or a list of them.
        perception_encoder_version: Optional model variant; when given it is
            sent as ``perception_encoder_version_id``, otherwise the field is
            left out of the request.

    Returns:
        The result of posting the image(s) to
        ``/perception_encoder/embed_image``, passed through
        ``unwrap_single_element_list``.
    """
    version_payload = (
        {}
        if perception_encoder_version is None
        else {"perception_encoder_version_id": perception_encoder_version}
    )
    embeddings = self._post_images(
        inference_input=inference_input,
        endpoint="/perception_encoder/embed_image",
        extra_payload=version_payload,
    )
    return unwrap_single_element_list(embeddings)
1408+
1409+
@wrap_errors
def get_perception_encoder_text_embeddings(
    self,
    text: Union[str, List[str]],
    perception_encoder_version: Optional[str] = None,
) -> Union[dict, List[dict]]:
    """Get Perception Encoder embeddings for input text(s).

    Args:
        text: One string or a list of strings to embed.
        perception_encoder_version: Optional model variant; when given it is
            sent as ``perception_encoder_version_id``, otherwise the field is
            left out of the request.

    Returns:
        The JSON body of the ``/perception_encoder/embed_text`` response,
        passed through ``unwrap_single_element_list``.
    """
    request_body = self.__initialise_payload()
    request_body["text"] = text
    if perception_encoder_version is not None:
        request_body["perception_encoder_version_id"] = perception_encoder_version

    endpoint_url = self.__wrap_url_with_api_key(
        f"{self.__api_url}/perception_encoder/embed_text"
    )
    response = requests.post(
        endpoint_url,
        json=request_body,
        headers=DEFAULT_HEADERS,
    )
    # Status errors are raised through the project's helper, not response.raise_for_status().
    api_key_safe_raise_for_status(response=response)
    return unwrap_single_element_list(sequence=response.json())
1429+
13921430
@deprecated(
13931431
reason="Please use run_workflow(...) method. This method will be removed end of Q2 2024"
13941432
)

mkdocs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ nav:
110110
- L2CS-Net (Gaze Detection): foundation/gaze.md
111111
- Moondream2: foundation/moondream2.md
112112
- PaliGemma: foundation/paligemma.md
113+
- Perception Encoder: foundation/perception_encoder.md
113114
- Segment Anything (Segmentation): foundation/sam.md
114115
- Segment Anything 2 (Segmentation): foundation/sam2.md
115116
- SmolVLM2: foundation/smolvlm.md

0 commit comments

Comments
 (0)