Adds Lens support

jtreminio · jtreminio · commit 62f2dd2e9382 · 2026-05-23T12:53:06.000-05:00
diff --git a/docs/Model Support.md b/docs/Model Support.md
@@ -20,6 +20,7 @@
 [Anima](#anima) | DiT | 2026 | Circlestone Labs | 2B | WTF | Modern, very small, decent for anime |
 [ERNIE](#ernie) | DiT | 2026 | Baidu | 8B | Minimal | Modern, intelligent, good quality, fast |
 [HiDream O1](#hidream-o1) | "Pixel UiT" | 2026 | HiDream | 8B | Minimal | Modern, intelligent, fast, decent quality |
+[Lens](#lens) | MMDiT | 2026 | Microsoft | 3.8B | Minimal | Modern, great quality, lightweight |
 
 Old or bad options also tracked listed via [Obscure Model Support](/docs/Obscure%20Model%20Support.md):
 
@@ -618,6 +619,22 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended
 - **Dev Lora:**
     - A dev lora can be downloaded here [Kijai/hidream-O1-image_comfy](<https://huggingface.co/Kijai/hidream-O1-image_comfy/resolve/main/loras/hidream_o1_dev_lora_rank_64_bf16_pruned_v1.safetensors>). It allows use of the base model with the distilled behavior from the Dev model. 8 steps will generate a coherent image of lower quality, 16 steps seems closer to original quality. Use CFG Scale 1.
 
+# Lens
+
+- Microsoft's [Lens](<https://huggingface.co/microsoft/Lens>) is supported in SwarmUI!
+- It is a 3.8B MMDiT model, with a base model and an official turbo distill designed to run fast.
+    - The "Turbo" model (in fat BF16) can be downloaded here [Comfy-Org/Lens - turbo](<https://huggingface.co/Comfy-Org/Lens/resolve/main/split_files/diffusion_models/lens_turbo_bf16.safetensors?download=true>)
+    - Or the base version (in fat BF16) [Comfy-Org/Lens - base](<https://huggingface.co/Comfy-Org/Lens/resolve/main/split_files/diffusion_models/lens_bf16.safetensors?download=true>)
+    - Save in `diffusion_models`
+- Uses the Flux.2 VAE, which will be downloaded and handled automatically
+- Uses the GPT-OSS 20B text encoder, which will be downloaded and handled automatically
+- **Parameters:**
+    - **Sampler:** Default is fine.
+    - **Scheduler:** Default is fine.
+    - **CFG Scale:** For Turbo, use `1`. For Base, use normal CFG ranges (around `5`).
+    - **Steps:** For Turbo, `4` is recommended, `8` works well. For Base, `20` as normal.
+    - **Resolution:** Side length `1440` is the standard.
+
 # Video Models
 
 - Video models are documented in [Video Model Support](/docs/Video%20Model%20Support.md).
diff --git a/src/BuiltinExtensions/ComfyUIBackend/ComfyUIAPIAbstractBackend.cs b/src/BuiltinExtensions/ComfyUIBackend/ComfyUIAPIAbstractBackend.cs
@@ -1080,6 +1080,7 @@ void copyParam<T>(T2IRegisteredParam<T> param)
             copyParam(T2IParamTypes.QwenModel);
             copyParam(T2IParamTypes.MistralModel);
             copyParam(T2IParamTypes.GemmaModel);
+            copyParam(T2IParamTypes.GptOssModel);
         }
         WorkflowGenerator wg = new() { UserInput = input, ModelFolderFormat = ModelFolderFormat, Features = [.. SupportedFeatures] };
         JObject workflow = wg.Generate();
diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs
@@ -957,7 +957,7 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent
             }
         }
         // TODO: Registry of model default preferences instead of this
-        else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1())
+        else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens())
         {
             defscheduler ??= "simple";
         }
diff --git a/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs b/src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs
@@ -91,6 +91,9 @@ public bool IsKontext()
     /// <summary>Returns true if the current model is HiDream-O1 Image.</summary>
     public bool IsHiDreamO1() => IsModelCompatClass(T2IModelClassSorter.CompatHiDreamO1);
 
+    /// <summary>Returns true if the current model is Lens.</summary>
+    public bool IsLens() => IsModelCompatClass(T2IModelClassSorter.CompatLens);
+
     /// <summary>Returns true if the current model supports Flux Guidance.</summary>
     public bool HasFluxGuidance()
     {
@@ -269,7 +272,7 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n
                 ["width"] = width
             }, id));
         }
-        else if (IsAnyFlux2() || IsErnie())
+        else if (IsAnyFlux2() || IsErnie() || IsLens())
         {
             return resultImage(CreateNode("EmptyFlux2LatentImage", new JObject()
             {
@@ -598,6 +601,11 @@ public string GetMinistral3_3bModel()
             return RequireClipModel("ministral-3-3b.safetensors", "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/text_encoders/ministral-3-3b.safetensors", "49a750a128863854eac7d85e1a277a7b44bf6ec3646405b84686dfeeca3708ca", T2IParamTypes.MistralModel);
         }
 
+        public string GetGptOss_20bModel()
+        {
+            return RequireClipModel("gpt_oss_20b_mxfp4.safetensors", "https://huggingface.co/Comfy-Org/Lens/resolve/main/split_files/text_encoders/gpt_oss_20b_mxfp4.safetensors", "f279cf3e73c494f78e0c5e4d35cf665068ae69672f7066813dbb75c021286856", T2IParamTypes.GptOssModel);
+        }
+
         public string GetClipLModel()
         {
             if (g.UserInput.TryGet(T2IParamTypes.ClipLModel, out T2IModel model))
@@ -899,7 +907,7 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
                     {
                         dtype = "default";
                     }
-                    else if (IsZImage() || IsZetaChroma() || IsAnima()) // Model is small and dense, so trust user preferred download format
+                    else if (IsZImage() || IsZetaChroma() || IsAnima() || IsLens()) // Model is small and dense, so trust user preferred download format
                     {
                         dtype = "default";
                     }
@@ -1057,6 +1065,18 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
             helpers.LoadClip("flux2", helpers.GetMinistral3_3bModel());
             helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, "flux-2", "flux2-vae");
         }
+        else if (IsLens())
+        {
+            helpers.LoadClip("lens", helpers.GetGptOss_20bModel());
+            helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, "flux-2", "flux2-vae");
+            string cfgNormNode = CreateNode("CFGNorm", new JObject()
+            {
+                ["model"] = LoadingModel,
+                ["strength"] = 1.0,
+                ["pre_cfg"] = true
+            });
+            LoadingModel = [cfgNormNode, 0];
+        }
         else if (IsFlux() && (LoadingClip is null || LoadingVAE is null || UserInput.Get(T2IParamTypes.T5XXLModel) is not null || UserInput.Get(T2IParamTypes.ClipLModel) is not null))
         {
             helpers.LoadClip2("flux", helpers.GetT5XXLModel(), helpers.GetClipLModel());
@@ -1323,7 +1343,7 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
         }
         if (UserInput.TryGet(T2IParamTypes.SigmaShift, out double shiftVal, sectionId: sectionId))
         {
-            if (IsFlux() || IsAnyFlux2())
+            if (IsFlux() || IsAnyFlux2() || IsLens())
             {
                 string samplingNode = CreateNode("ModelSamplingFlux", new JObject()
                 {
diff --git a/src/Text2Image/T2IModelClassSorter.cs b/src/Text2Image/T2IModelClassSorter.cs
@@ -71,6 +71,7 @@ public static T2IModelCompatClass
         CompatZetaChroma = RegisterCompat(new() { ID = "zeta-chroma", ShortCode = "ZChr", LorasTargetTextEnc = false }),
         CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false }),
         CompatHiDreamO1 = RegisterCompat(new() { ID = "hidream-o1", ShortCode = "HiDrO1", LorasTargetTextEnc = false }),
+        CompatLens = RegisterCompat(new() { ID = "lens", ShortCode = "Lens", LorasTargetTextEnc = false }),
         // Audio models
         CompatAceStep15 = RegisterCompat(new() { ID = "ace-step-1_5", ShortCode = "Ace15", IsAudioModel = true }),
         // Obscure old random ones
@@ -158,6 +159,7 @@ bool isFluxLora(JObject h)
         bool isFlux2KleinLora(JObject h) => hasLoraKey(h, "double_blocks.4.img_attn.proj") && hasLoraKey(h, "double_blocks.4.txt_mlp.2") && hasLoraKey(h, "single_blocks.18.linear1") && hasLoraKey(h, "single_blocks.19.linear2");
         bool isFlux2Klein9BLora(JObject h) => hasLoraKey(h, "single_blocks.23.linear1");
         bool isFlux2DevLora(JObject h) => hasLoraKey(h, "single_blocks.47.linear2");
+        bool isLens(JObject h) => h.ContainsKey("transformer_blocks.0.attn.norm_added_q.weight") && h.ContainsKey("transformer_blocks.0.img_mlp.w1.weight");
         bool isSD35Lora(JObject h) => h.ContainsKey("transformer.transformer_blocks.0.attn.to_k.lora_A.weight") && h.ContainsKey("transformer.transformer_blocks.37.attn.to_out.0.lora_B.weight");
         bool isMochi(JObject h) => hasKey(h, "blocks.0.attn.k_norm_x.weight");
         bool isMochiVae(JObject h) => h.ContainsKey("encoder.layers.4.layers.1.attn_block.attn.qkv.weight") || h.ContainsKey("layers.4.layers.1.attn_block.attn.qkv.weight") || h.ContainsKey("blocks.2.blocks.3.stack.5.weight") || h.ContainsKey("decoder.blocks.2.blocks.3.stack.5.weight");
@@ -478,6 +480,10 @@ JToken GetEmbeddingKey(JObject h)
         {
             return isFlux2KleinLora(h) && isFlux2Klein9BLora(h) && !isFlux2DevLora(h);
         }});
+        Register(new() { ID = "lens", CompatClass = CompatLens, Name = "Lens", StandardWidth = 1440, StandardHeight = 1440, IsThisModelOfClass = (m, h) =>
+        {
+            return isLens(h);
+        }});
         // ====================== Wan Video ======================
         Register(new() { ID = "wan-2_1-text2video/vae", CompatClass = CompatWan21, Name = "Wan 2.1 VAE", StandardWidth = 640, StandardHeight = 640, IsThisModelOfClass = (m, h) => { return false; }});
         Register(new() { ID = "wan-2_1-text2video-1_3b", CompatClass = CompatWan21_1_3b, Name = "Wan 2.1 Text2Video 1.3B", StandardWidth = 640, StandardHeight = 640, IsThisModelOfClass = (m, h) =>
diff --git a/src/Text2Image/T2IParamTypes.cs b/src/Text2Image/T2IParamTypes.cs
@@ -330,7 +330,7 @@ public static string ApplyStringEdit(string prior, string update)
         FreeUBlock1, FreeUBlock2, FreeUSkip1, FreeUSkip2, GlobalRegionFactor, EndStepsEarly, SamplerSigmaMin, SamplerSigmaMax, SamplerRho, VideoAugmentationLevel, VideoCFG, VideoMinCFG, Video2VideoCreativity, VideoSwapPercent, VideoExtendSwapPercent, IP2PCFG2, RegionalObjectCleanupFactor, SigmaShift, SegmentThresholdMax, SegmentCFGScale, FluxGuidanceScale, Text2AudioDuration;
     public static T2IRegisteredParam<Image> InitImage, MaskImage, VideoEndFrame;
     public static T2IRegisteredParam<AudioFile> VideoAudioInput, VideoAudioReference;
-    public static T2IRegisteredParam<T2IModel> Model, RefinerModel, VAE, RegionalObjectInpaintingModel, SegmentModel, VideoModel, VideoSwapModel, RefinerVAE, ClipLModel, ClipGModel, ClipVisionModel, T5XXLModel, LLaVAModel, LLaMAModel, QwenModel, MistralModel, GemmaModel, VideoExtendModel, VideoExtendSwapModel;
+    public static T2IRegisteredParam<T2IModel> Model, RefinerModel, VAE, RegionalObjectInpaintingModel, SegmentModel, VideoModel, VideoSwapModel, RefinerVAE, ClipLModel, ClipGModel, ClipVisionModel, T5XXLModel, LLaVAModel, LLaMAModel, QwenModel, MistralModel, GemmaModel, GptOssModel, VideoExtendModel, VideoExtendSwapModel;
     public static T2IRegisteredParam<List<string>> Loras, LoraWeights, LoraTencWeights, LoraSectionConfinement;
     public static T2IRegisteredParam<List<Image>> PromptImages;
     public static T2IRegisteredParam<bool> OutputIntermediateImages, DoNotSave, DoNotSaveIntermediates, ControlNetPreviewOnly, RevisionZeroPrompt, RemoveBackground, NoSeedIncrement, NoPreviews, VideoBoomerang, ModelSpecificEnhancements, UseInpaintingEncode, MaskCompositeUnthresholded, SaveSegmentMask, InitImageRecompositeMask, UseReferenceOnly, RefinerDoTiling, AutomaticVAE, ZeroNegative, FluxDisableGuidance, SmartImagePromptResizing, NoLoadModels, NoInternalSpecialHandling, ForwardRawBackendData, ForwardSwarmData,
@@ -715,6 +715,9 @@ static List<string> listVaes(Session s)
         GemmaModel = Register<T2IModel>(new("Gemma Model", "Which Gemma LLM to use as a text encoder, for models that use Gemma (such as Lumina2, LTX2).",
             "", IgnoreIf: "", Group: GroupAdvancedModelAddons, Subtype: "Clip", Permission: Permissions.ModelParams, Toggleable: true, IsAdvanced: true, OrderPriority: 20, ChangeWeight: 7
             ));
+        GptOssModel = Register<T2IModel>(new("GPT-OSS Model", "Which GPT-OSS LLM to use as a text encoder, for Lens-style 'diffusion_models' folder models.",
+            "", IgnoreIf: "", Group: GroupAdvancedModelAddons, Subtype: "Clip", Permission: Permissions.ModelParams, Toggleable: true, IsAdvanced: true, OrderPriority: 20, ChangeWeight: 7
+            ));
         TorchCompile = Register<string>(new("Torch Compile", "Torch.Compile is a way to dynamically accelerate AI models.\nIt wastes a bit of time (around a minute) on the first call compiling a graph of the generation, and then all subsequent generations run faster thanks to the compiled graph.\nTorch.Compile depends on Triton, which is difficult to install on Windows, easier on Linux.",
             "Disabled", IgnoreIf: "Disabled", GetValues: _ => ["Disabled", "inductor", "cudagraphs"], OrderPriority: 40, Group: GroupAdvancedModelAddons
             ));

Original file line number	Diff line number	Diff line change
`@@ -1080,6 +1080,7 @@ void copyParam<T>(T2IRegisteredParam<T> param)`
`1080`	`1080`	`copyParam(T2IParamTypes.QwenModel);`
`1081`	`1081`	`copyParam(T2IParamTypes.MistralModel);`
`1082`	`1082`	`copyParam(T2IParamTypes.GemmaModel);`
	`1083`	`+ copyParam(T2IParamTypes.GptOssModel);`
`1083`	`1084`	`}`
`1084`	`1085`	`WorkflowGenerator wg = new() { UserInput = input, ModelFolderFormat = ModelFolderFormat, Features = [.. SupportedFeatures] };`
`1085`	`1086`	`JObject workflow = wg.Generate();`
Original file line number	Diff line number	Diff line change
`@@ -957,7 +957,7 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent`
`957`	`957`	`}`
`958`	`958`	`}`
`959`	`959`	`// TODO: Registry of model default preferences instead of this`
`960`		`- else if (IsFlux() \|\| IsWanVideo() \|\| IsWanVideo22() \|\| IsOmniGen() \|\| IsQwenImage() \|\| IsZImage() \|\| IsZetaChroma() \|\| IsErnie() \|\| IsHiDreamO1())`
	`960`	`+ else if (IsFlux() \|\| IsWanVideo() \|\| IsWanVideo22() \|\| IsOmniGen() \|\| IsQwenImage() \|\| IsZImage() \|\| IsZetaChroma() \|\| IsErnie() \|\| IsHiDreamO1() \|\| IsLens())`
`961`	`961`	`{`
`962`	`962`	`defscheduler ??= "simple";`
`963`	`963`	`}`
Original file line number	Diff line number	Diff line change
`@@ -91,6 +91,9 @@ public bool IsKontext()`
`91`	`91`	`/// <summary>Returns true if the current model is HiDream-O1 Image.</summary>`
`92`	`92`	`public bool IsHiDreamO1() => IsModelCompatClass(T2IModelClassSorter.CompatHiDreamO1);`
`93`	`93`
	`94`	`+ /// <summary>Returns true if the current model is Lens.</summary>`
	`95`	`+ public bool IsLens() => IsModelCompatClass(T2IModelClassSorter.CompatLens);`
	`96`	`+`
`94`	`97`	`/// <summary>Returns true if the current model supports Flux Guidance.</summary>`
`95`	`98`	`public bool HasFluxGuidance()`
`96`	`99`	`{`
`@@ -269,7 +272,7 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n`
`269`	`272`	`["width"] = width`
`270`	`273`	`}, id));`
`271`	`274`	`}`
`272`		`- else if (IsAnyFlux2() \|\| IsErnie())`
	`275`	`+ else if (IsAnyFlux2() \|\| IsErnie() \|\| IsLens())`
`273`	`276`	`{`
`274`	`277`	`return resultImage(CreateNode("EmptyFlux2LatentImage", new JObject()`
`275`	`278`	`{`
`@@ -598,6 +601,11 @@ public string GetMinistral3_3bModel()`
`598`	`601`	`return RequireClipModel("ministral-3-3b.safetensors", "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/text_encoders/ministral-3-3b.safetensors", "49a750a128863854eac7d85e1a277a7b44bf6ec3646405b84686dfeeca3708ca", T2IParamTypes.MistralModel);`
`599`	`602`	`}`
`600`	`603`
	`604`	`+ public string GetGptOss_20bModel()`
	`605`	`+ {`
	`606`	`+ return RequireClipModel("gpt_oss_20b_mxfp4.safetensors", "https://huggingface.co/Comfy-Org/Lens/resolve/main/split_files/text_encoders/gpt_oss_20b_mxfp4.safetensors", "f279cf3e73c494f78e0c5e4d35cf665068ae69672f7066813dbb75c021286856", T2IParamTypes.GptOssModel);`
	`607`	`+ }`
	`608`	`+`
`601`	`609`	`public string GetClipLModel()`
`602`	`610`	`{`
`603`	`611`	`if (g.UserInput.TryGet(T2IParamTypes.ClipLModel, out T2IModel model))`
`@@ -899,7 +907,7 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)`
`899`	`907`	`{`
`900`	`908`	`dtype = "default";`
`901`	`909`	`}`
`902`		`- else if (IsZImage() \|\| IsZetaChroma() \|\| IsAnima()) // Model is small and dense, so trust user preferred download format`
	`910`	`+ else if (IsZImage() \|\| IsZetaChroma() \|\| IsAnima() \|\| IsLens()) // Model is small and dense, so trust user preferred download format`
`903`	`911`	`{`
`904`	`912`	`dtype = "default";`
`905`	`913`	`}`
`@@ -1057,6 +1065,18 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)`
`1057`	`1065`	`helpers.LoadClip("flux2", helpers.GetMinistral3_3bModel());`
`1058`	`1066`	`helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, "flux-2", "flux2-vae");`
`1059`	`1067`	`}`
	`1068`	`+ else if (IsLens())`
	`1069`	`+ {`
	`1070`	`+ helpers.LoadClip("lens", helpers.GetGptOss_20bModel());`
	`1071`	`+ helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, "flux-2", "flux2-vae");`
	`1072`	`+ string cfgNormNode = CreateNode("CFGNorm", new JObject()`
	`1073`	`+ {`
	`1074`	`+ ["model"] = LoadingModel,`
	`1075`	`+ ["strength"] = 1.0,`
	`1076`	`+ ["pre_cfg"] = true`
	`1077`	`+ });`
	`1078`	`+ LoadingModel = [cfgNormNode, 0];`
	`1079`	`+ }`
`1060`	`1080`	`else if (IsFlux() && (LoadingClip is null \|\| LoadingVAE is null \|\| UserInput.Get(T2IParamTypes.T5XXLModel) is not null \|\| UserInput.Get(T2IParamTypes.ClipLModel) is not null))`
`1061`	`1081`	`{`
`1062`	`1082`	`helpers.LoadClip2("flux", helpers.GetT5XXLModel(), helpers.GetClipLModel());`
`@@ -1323,7 +1343,7 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)`
`1323`	`1343`	`}`
`1324`	`1344`	`if (UserInput.TryGet(T2IParamTypes.SigmaShift, out double shiftVal, sectionId: sectionId))`
`1325`	`1345`	`{`
`1326`		`- if (IsFlux() \|\| IsAnyFlux2())`
	`1346`	`+ if (IsFlux() \|\| IsAnyFlux2() \|\| IsLens())`
`1327`	`1347`	`{`
`1328`	`1348`	`string samplingNode = CreateNode("ModelSamplingFlux", new JObject()`
`1329`	`1349`	`{`