Skip to content

Commit 1bf982a

Browse files
authored
Add ObjC/Swift bindings for the ImageProcessor (pytorch#20051)
Differential Revision: D106898406 Pull Request resolved: pytorch#20051
1 parent 0d904b6 commit 1bf982a

8 files changed

Lines changed: 687 additions & 0 deletions

File tree

extension/apple/BUCK

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ non_fbcode_target(_kind = fb_apple_library,
1111
autoglob_mode = "EXPORT_UNLESS_INTERNAL",
1212
extension_api_only = True,
1313
frameworks = [
14+
"CoreVideo",
1415
"Foundation",
1516
],
1617
preprocessor_flags = [
@@ -29,11 +30,13 @@ non_fbcode_target(_kind = fb_apple_library,
2930
visibility = EXECUTORCH_CLIENTS,
3031
deps = select({
3132
"ovr_config//os:macos": [
33+
"//xplat/executorch/extension/image:image_processorAppleMac",
3234
"//xplat/executorch/extension/module:moduleAppleMac",
3335
"//xplat/executorch/extension/tensor:tensorAppleMac",
3436
"//xplat/executorch/runtime/platform:platformAppleMac",
3537
],
3638
"DEFAULT": [
39+
"//xplat/executorch/extension/image:image_processorApple",
3740
"//xplat/executorch/extension/module:moduleApple",
3841
"//xplat/executorch/extension/tensor:tensorApple",
3942
"//xplat/executorch/runtime/platform:platformApple",
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
import CoreVideo
10+
11+
public extension ImageNormalization {
  /// Creates a normalization with a custom scale factor and per-channel RGB
  /// mean and standard deviation.
  ///
  /// Normalization is applied per channel as
  /// `(pixel * scaleFactor - mean[c]) / standardDeviation[c]`.
  ///
  /// - Parameters:
  ///   - scaleFactor: Multiplier applied to each pixel before the mean is
  ///     subtracted.
  ///   - mean: Exactly 3 per-channel means, ordered R, G, B.
  ///   - standardDeviation: Exactly 3 per-channel divisors, ordered R, G, B.
  ///     Every entry must be nonzero.
  /// - Precondition: `mean.count == 3`, `standardDeviation.count == 3`, and no
  ///   `standardDeviation` entry equals zero. A violation traps.
  convenience init(scaleFactor: Float, mean: [Float], standardDeviation: [Float]) {
    precondition(mean.count == 3, "mean must have exactly 3 elements (R, G, B)")
    precondition(
      standardDeviation.count == 3,
      "standardDeviation must have exactly 3 elements (R, G, B)")
    // Each entry is a divisor in the documented formula, so a zero entry
    // violates the contract; reject it here next to the count checks instead
    // of handing it to the underlying initializer.
    precondition(
      standardDeviation.allSatisfy { $0 != 0 },
      "standardDeviation entries must be nonzero")
    // The refined Objective-C initializer takes NSNumber arrays.
    self.init(
      __scaleFactor: scaleFactor,
      mean: mean.map { NSNumber(value: $0) },
      standardDeviation: standardDeviation.map { NSNumber(value: $0) })
  }
}
28+
29+
public extension ImageProcessorConfig {
  /// `gpuMinInputPixels` sentinel that forces the GPU path: a threshold of 0
  /// source pixels (width * height).
  static let alwaysGPU: Int = 0

  /// `gpuMinInputPixels` sentinel that forces the CPU path: a threshold of
  /// `Int.max` source pixels (width * height).
  static let alwaysCPU: Int = .max

  /// Creates an image processor config; only the values that differ from the
  /// defaults need to be passed.
  ///
  /// - Parameters:
  ///   - targetWidth: Width of the output, in pixels.
  ///   - targetHeight: Height of the output, in pixels.
  ///   - resizeMode: Resize mode; defaults to `.stretch`.
  ///   - letterboxAnchor: Letterbox anchor; defaults to `.center`.
  ///   - padValue: Pad value; defaults to `0`.
  ///   - normalization: Pixel normalization; defaults to `.zeroToOne()`.
  ///   - gpuMinInputPixels: Minimum source pixel count at which the GPU path
  ///     may be used; smaller inputs run on the CPU. The parameter is a plain
  ///     `Int`, so spell the sentinels with the type name:
  ///     `ImageProcessorConfig.alwaysGPU` (0) or
  ///     `ImageProcessorConfig.alwaysCPU` to force a path.
  convenience init(
    targetWidth: Int,
    targetHeight: Int,
    resizeMode: ImageResizeMode = .stretch,
    letterboxAnchor: ImageLetterboxAnchor = .center,
    padValue: Float = 0,
    normalization: ImageNormalization = .zeroToOne(),
    gpuMinInputPixels: Int = ImageProcessorConfig.defaultGpuMinInputPixels
  ) {
    // Forward everything unchanged to the refined Objective-C initializer.
    self.init(
      __targetWidth: targetWidth,
      targetHeight: targetHeight,
      resizeMode: resizeMode,
      letterboxAnchor: letterboxAnchor,
      padValue: padValue,
      normalization: normalization,
      gpuMinInputPixels: gpuMinInputPixels
    )
  }
}
59+
60+
public extension ImageProcessor {
  /// Converts a `CVPixelBuffer` into a normalized float tensor.
  ///
  /// The pixel format is detected from the buffer. Supported formats: BGRA,
  /// RGBA, 8-bit NV12, and 10-bit P010.
  ///
  /// The buffer is treated as already upright: orientation correction is not
  /// applied and cannot be derived from a `CVPixelBuffer`, so the caller must
  /// supply an upright buffer.
  ///
  /// - Parameter pixelBuffer: The input pixel buffer.
  /// - Returns: A `Tensor<Float>` with shape
  ///   `[1, 3, target_height, target_width]`.
  /// - Throws: The error reported by the underlying `processPixelBuffer` call.
  func process(_ pixelBuffer: CVPixelBuffer) throws -> Tensor<Float> {
    return try Tensor<Float>(processPixelBuffer(pixelBuffer))
  }

  /// Converts a `CVPixelBuffer` into a caller-provided tensor, reusing its
  /// storage.
  ///
  /// This avoids the per-call allocation of `process(_:)`, which matters for
  /// sustained video. The tensor's storage is overwritten and can be reused
  /// across frames; its contents are valid until the next call that writes
  /// into the same tensor.
  ///
  /// The buffer is treated as already upright; no orientation correction is
  /// applied.
  ///
  /// - Parameters:
  ///   - pixelBuffer: The input pixel buffer.
  ///   - tensor: Destination; must have shape
  ///     `[1, 3, target_height, target_width]`.
  /// - Throws: The error reported by the underlying `processPixelBuffer` call.
  func process(_ pixelBuffer: CVPixelBuffer, into tensor: Tensor<Float>) throws {
    let destination = tensor.anyTensor
    try processPixelBuffer(pixelBuffer, into: destination)
  }

  /// Letterbox padding (per side, in pixels) applied for a source of the given
  /// size, so callers can map the padded output back to the source region.
  ///
  /// - Parameters:
  ///   - inputWidth: The source pixel width.
  ///   - inputHeight: The source pixel height.
  /// - Returns: `x`, the left/right pad, and `y`, the top/bottom pad, of the
  ///   resized content. `(0, 0)` for the stretch resize mode or the top-left
  ///   anchor.
  func computeLetterboxPadding(inputWidth: Int, inputHeight: Int) -> (x: Int, y: Int) {
    let pad = __computeLetterboxPadding(forInputWidth: inputWidth, height: inputHeight)
    return (x: pad.x, y: pad.y)
  }
}

extension/apple/ExecuTorch/Exported/ExecuTorch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#import "ExecuTorchBackendOption.h"
1010
#import "ExecuTorchBackendOptionsMap.h"
1111
#import "ExecuTorchError.h"
12+
#import "ExecuTorchImageProcessor.h"
1213
#import "ExecuTorchLog.h"
1314
#import "ExecuTorchModule.h"
1415
#import "ExecuTorchTensor.h"
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#import <CoreVideo/CoreVideo.h>
10+
#import <Foundation/Foundation.h>
11+
12+
#import "ExecuTorchTensor.h"
13+
14+
NS_ASSUME_NONNULL_BEGIN
15+
16+
/// Resize mode selected through ExecuTorchImageProcessorConfig.resizeMode.
/// Exposed to Swift as `ImageResizeMode`.
typedef NS_ENUM(uint8_t, ExecuTorchImageResizeMode) {
17+
/// Stretch mode; the processor reports {0, 0} letterbox padding for it.
ExecuTorchImageResizeModeStretch,
18+
/// Letterbox mode; placement is chosen by ExecuTorchImageLetterboxAnchor.
/// NOTE(review): presumably preserves the source aspect ratio and fills the
/// remainder with padValue -- confirm against the implementation.
ExecuTorchImageResizeModeLetterbox,
19+
} NS_SWIFT_NAME(ImageResizeMode);
20+
21+
/// Anchor selected through ExecuTorchImageProcessorConfig.letterboxAnchor.
/// Exposed to Swift as `ImageLetterboxAnchor`.
typedef NS_ENUM(uint8_t, ExecuTorchImageLetterboxAnchor) {
22+
/// NOTE(review): presumably centers the resized content so padding appears
/// on both sides -- confirm against the implementation.
ExecuTorchImageLetterboxAnchorCenter,
23+
/// Top-left anchor; the processor reports {0, 0} letterbox padding for it.
ExecuTorchImageLetterboxAnchorTopLeft,
24+
} NS_SWIFT_NAME(ImageLetterboxAnchor);
25+
26+
/// Per-side letterbox padding in pixels: `x` is the left/right pad and `y` the
27+
/// top/bottom pad of the resized content.
/// Returned by -computeLetterboxPaddingForInputWidth:height: and exposed to
/// Swift as `ImageLetterboxPadding`.
28+
typedef struct ExecuTorchImageLetterboxPadding {
29+
/// Left/right pad, in pixels.
NSInteger x;
30+
/// Top/bottom pad, in pixels.
NSInteger y;
31+
} ExecuTorchImageLetterboxPadding NS_SWIFT_NAME(ImageLetterboxPadding);
32+
33+
/// Pixel normalization parameters held by ExecuTorchImageProcessorConfig.
/// Exposed to Swift as `ImageNormalization`; subclassing is disallowed.
/// Instances come from the presets below or from
/// -initWithScaleFactor:mean:standardDeviation:.
NS_SWIFT_NAME(ImageNormalization)
34+
__attribute__((objc_subclassing_restricted))
35+
@interface ExecuTorchImageNormalization : NSObject
36+
37+
/// Preset normalization. NOTE(review): presumably maps 8-bit pixel values to
/// [0, 1] with no mean/standard-deviation adjustment -- confirm against the
/// implementation.
+ (instancetype)zeroToOne;
38+
/// Preset normalization. NOTE(review): presumably uses the conventional
/// ImageNet per-channel mean/standard deviation -- confirm against the
/// implementation.
+ (instancetype)imagenet;
39+
40+
/// Create a normalization with a custom scale factor and per-channel RGB mean
41+
/// and standard deviation. `mean` and `standardDeviation` must each contain
42+
/// exactly 3 elements (R, G, B). Normalization is applied per channel as
43+
/// `(pixel * scaleFactor - mean[c]) / standardDeviation[c]`, so every
44+
/// `standardDeviation` entry must be nonzero.
///
/// Refined for Swift: Swift callers see this initializer only under its
/// double-underscore-prefixed name.
45+
- (instancetype)initWithScaleFactor:(float)scaleFactor
46+
mean:(NSArray<NSNumber *> *)mean
47+
standardDeviation:(NSArray<NSNumber *> *)standardDeviation
48+
NS_REFINED_FOR_SWIFT;
49+
50+
// Parameterless construction is unavailable; use a preset or the initializer
// above.
+ (instancetype)new NS_UNAVAILABLE;
51+
- (instancetype)init NS_UNAVAILABLE;
52+
53+
@end
54+
55+
/// Read-only configuration for an ExecuTorchImageProcessor. Exposed to Swift
/// as `ImageProcessorConfig`; subclassing is disallowed.
NS_SWIFT_NAME(ImageProcessorConfig)
56+
__attribute__((objc_subclassing_restricted))
57+
@interface ExecuTorchImageProcessorConfig : NSObject
58+
59+
/// Output width; the last dimension of the
/// [1, 3, targetHeight, targetWidth] output tensor.
@property(nonatomic, readonly) NSInteger targetWidth;
60+
/// Output height; the third dimension of the
/// [1, 3, targetHeight, targetWidth] output tensor.
@property(nonatomic, readonly) NSInteger targetHeight;
61+
/// Resize mode (stretch or letterbox).
@property(nonatomic, readonly) ExecuTorchImageResizeMode resizeMode;
62+
/// Letterbox anchor (center or top-left).
@property(nonatomic, readonly) ExecuTorchImageLetterboxAnchor letterboxAnchor;
63+
/// Pad value. NOTE(review): presumably the value written into letterbox
/// padding -- confirm whether it is applied before or after normalization.
@property(nonatomic, readonly) float padValue;
64+
/// Pixel normalization parameters.
@property(nonatomic, readonly) ExecuTorchImageNormalization *normalization;
65+
/// Minimum source pixel count (width * height) at which the GPU path may be
66+
/// used; smaller inputs run on the CPU. 0 forces GPU, NSIntegerMax forces CPU.
67+
@property(nonatomic, readonly) NSInteger gpuMinInputPixels;
68+
69+
/// Default value for gpuMinInputPixels (mirrors the C++ config default).
70+
@property(class, nonatomic, readonly) NSInteger defaultGpuMinInputPixels;
71+
72+
/// Create a config with every field given explicitly; each argument
/// corresponds to the read-only property of the same name.
///
/// Refined for Swift: Swift callers see this initializer only under its
/// double-underscore-prefixed name.
- (instancetype)initWithTargetWidth:(NSInteger)targetWidth
73+
targetHeight:(NSInteger)targetHeight
74+
resizeMode:(ExecuTorchImageResizeMode)resizeMode
75+
letterboxAnchor:(ExecuTorchImageLetterboxAnchor)letterboxAnchor
76+
padValue:(float)padValue
77+
normalization:(ExecuTorchImageNormalization *)normalization
78+
gpuMinInputPixels:(NSInteger)gpuMinInputPixels NS_REFINED_FOR_SWIFT;
79+
80+
// Parameterless construction is unavailable; use the initializer above.
+ (instancetype)new NS_UNAVAILABLE;
81+
- (instancetype)init NS_UNAVAILABLE;
82+
83+
@end
84+
85+
/// Converts CVPixelBuffers into normalized float tensors according to an
/// ExecuTorchImageProcessorConfig. Exposed to Swift as `ImageProcessor`;
/// subclassing is disallowed.
///
/// Thread-safety: ExecuTorchImageProcessor is NOT thread-safe per instance.
86+
/// Internal scratch buffers are mutated during processing. Use one instance
87+
/// per concurrent caller. Different instances are safe to use concurrently.
88+
NS_SWIFT_NAME(ImageProcessor)
89+
__attribute__((objc_subclassing_restricted))
90+
@interface ExecuTorchImageProcessor : NSObject
91+
92+
/// The configuration of this processor (read-only).
@property(nonatomic, readonly) ExecuTorchImageProcessorConfig *config;
93+
94+
/// Create a processor for the given configuration.
///
/// @param config The configuration to use.
- (instancetype)initWithConfig:(ExecuTorchImageProcessorConfig *)config;
95+
96+
/// Process a CVPixelBuffer into a normalized float tensor.
97+
///
98+
/// Auto-detects pixel format from the buffer's metadata. Supported
99+
/// formats: BGRA, RGBA, 8-bit NV12, and 10-bit P010 (P010 is narrowed to NV12
100+
/// internally). Other formats return an error.
101+
///
102+
/// The buffer is treated as already upright. Orientation correction is not
103+
/// applied and cannot be derived from a CVPixelBuffer, so the caller is
104+
/// responsible for supplying an upright buffer (e.g. by configuring the
105+
/// capture connection's orientation).
106+
///
107+
/// @param pixelBuffer The input pixel buffer. Declared nullable.
/// NOTE(review): presumably a NULL buffer is reported through `error` --
/// confirm against the implementation.
108+
/// @param error On failure, set to an NSError describing what went wrong.
109+
/// @return An ExecuTorchTensor with shape [1, 3, H, W] (CHW), or nil on failure.
110+
- (nullable ExecuTorchTensor *)processPixelBuffer:(_Nullable CVPixelBufferRef)pixelBuffer
111+
error:(NSError **)error;
112+
113+
/// Process a CVPixelBuffer into a caller-provided tensor, reusing its storage.
114+
///
115+
/// Avoids the per-call output allocation of processPixelBuffer:error:, which
116+
/// matters for sustained video. `tensor` must be a Float tensor shaped
117+
/// [1, 3, targetHeight, targetWidth]; its storage is overwritten and can be
118+
/// reused across frames. The result aliases `tensor`, so the caller must
119+
/// finish using the previous result before the next call.
120+
///
121+
/// @param pixelBuffer The input pixel buffer. Declared nullable.
/// NOTE(review): presumably a NULL buffer is reported through `error` --
/// confirm against the implementation.
122+
/// @param tensor The output tensor to fill.
123+
/// @param error On failure, set to an NSError describing what went wrong.
124+
/// @return YES on success, NO on failure.
125+
- (BOOL)processPixelBuffer:(_Nullable CVPixelBufferRef)pixelBuffer
126+
intoTensor:(ExecuTorchTensor *)tensor
127+
error:(NSError **)error;
128+
129+
/// Letterbox padding (per side, in pixels) the processor applies for a source
130+
/// of the given size: `x` is the left/right pad and `y` the top/bottom pad of
131+
/// the resized content. Returns {0, 0} for the stretch resize mode or the
132+
/// top-left anchor. Lets callers map the padded output back to the source
133+
/// region without replicating the resize geometry.
134+
///
/// Refined for Swift: Swift callers see this method only under its
/// double-underscore-prefixed name.
///
135+
/// @param inputWidth The source pixel width.
136+
/// @param inputHeight The source pixel height.
137+
/// @return The {x, y} padding in pixels.
138+
- (ExecuTorchImageLetterboxPadding)computeLetterboxPaddingForInputWidth:(NSInteger)inputWidth
139+
height:(NSInteger)inputHeight
140+
NS_REFINED_FOR_SWIFT;
141+
142+
// Parameterless construction is unavailable; use -initWithConfig:.
+ (instancetype)new NS_UNAVAILABLE;
143+
- (instancetype)init NS_UNAVAILABLE;
144+
145+
@end
146+
147+
NS_ASSUME_NONNULL_END

0 commit comments

Comments
 (0)