Skip to content

Commit 62f2dd2

Browse files
committed
Adds Lens support
1 parent b0b9610 commit 62f2dd2

6 files changed

Lines changed: 52 additions & 5 deletions

File tree

docs/Model Support.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
[Anima](#anima) | DiT | 2026 | Circlestone Labs | 2B | WTF | Modern, very small, decent for anime |
2121
[ERNIE](#ernie) | DiT | 2026 | Baidu | 8B | Minimal | Modern, intelligent, good quality, fast |
2222
[HiDream O1](#hidream-o1) | "Pixel UiT" | 2026 | HiDream | 8B | Minimal | Modern, intelligent, fast, decent quality |
23+
[Lens](#lens) | MMDiT | 2026 | Microsoft | 3.8B | Minimal | Modern, great quality, lightweight |
2324

2425
Old or bad options are also tracked, listed via [Obscure Model Support](/docs/Obscure%20Model%20Support.md):
2526

@@ -618,6 +619,22 @@ For upscaling with SD3, the `Refiner Do Tiling` parameter is highly recommended
618619
- **Dev Lora:**
619620
- A dev lora can be downloaded here: [Kijai/hidream-O1-image_comfy](<https://huggingface.co/Kijai/hidream-O1-image_comfy/resolve/main/loras/hidream_o1_dev_lora_rank_64_bf16_pruned_v1.safetensors>). It allows use of the base model with the distilled behavior from the Dev model. 8 steps will generate a coherent image of lower quality; 16 steps seems closer to the original quality. Use CFG Scale 1.
620621

622+
# Lens
623+
624+
- Microsoft's [Lens](<https://huggingface.co/microsoft/Lens>) is supported in SwarmUI!
625+
- It is a 3.8B MMDiT model, with a base model and an official turbo distill designed to run fast.
626+
- The "Turbo" model (in fat BF16) can be downloaded here [Comfy-Org/Lens - turbo](<https://huggingface.co/Comfy-Org/Lens/resolve/main/split_files/diffusion_models/lens_turbo_bf16.safetensors?download=true>)
627+
- Or the base version (in fat BF16) [Comfy-Org/Lens - base](<https://huggingface.co/Comfy-Org/Lens/resolve/main/split_files/diffusion_models/lens_bf16.safetensors?download=true>)
628+
- Save in `diffusion_models`
629+
- Uses the Flux.2 VAE, which will be downloaded and handled automatically
630+
- Uses the GPT-OSS 20B text encoder, which will be downloaded and handled automatically
631+
- **Parameters:**
632+
- **Sampler:** Default is fine.
633+
- **Scheduler:** Default is fine.
634+
- **CFG Scale:** For Turbo, use `1`. For Base, use normal CFG ranges (around `5`).
635+
- **Steps:** For Turbo, `4` is recommended, and `8` also works well. For Base, use `20` as normal.
636+
- **Resolution:** Side length `1440` is the standard.
637+
621638
# Video Models
622639

623640
- Video models are documented in [Video Model Support](/docs/Video%20Model%20Support.md).

src/BuiltinExtensions/ComfyUIBackend/ComfyUIAPIAbstractBackend.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1080,6 +1080,7 @@ void copyParam<T>(T2IRegisteredParam<T> param)
10801080
copyParam(T2IParamTypes.QwenModel);
10811081
copyParam(T2IParamTypes.MistralModel);
10821082
copyParam(T2IParamTypes.GemmaModel);
1083+
copyParam(T2IParamTypes.GptOssModel);
10831084
}
10841085
WorkflowGenerator wg = new() { UserInput = input, ModelFolderFormat = ModelFolderFormat, Features = [.. SupportedFeatures] };
10851086
JObject workflow = wg.Generate();

src/BuiltinExtensions/ComfyUIBackend/WorkflowGenerator.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -957,7 +957,7 @@ public string CreateKSampler(JArray model, JArray pos, JArray neg, JArray latent
957957
}
958958
}
959959
// TODO: Registry of model default preferences instead of this
960-
else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1())
960+
else if (IsFlux() || IsWanVideo() || IsWanVideo22() || IsOmniGen() || IsQwenImage() || IsZImage() || IsZetaChroma() || IsErnie() || IsHiDreamO1() || IsLens())
961961
{
962962
defscheduler ??= "simple";
963963
}

src/BuiltinExtensions/ComfyUIBackend/WorkflowGeneratorModelSupport.cs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,9 @@ public bool IsKontext()
9191
/// <summary>Returns true if the current model is HiDream-O1 Image.</summary>
9292
public bool IsHiDreamO1() => IsModelCompatClass(T2IModelClassSorter.CompatHiDreamO1);
9393

94+
/// <summary>Returns true if the current model is Lens.</summary>
95+
public bool IsLens() => IsModelCompatClass(T2IModelClassSorter.CompatLens);
96+
9497
/// <summary>Returns true if the current model supports Flux Guidance.</summary>
9598
public bool HasFluxGuidance()
9699
{
@@ -269,7 +272,7 @@ public WGNodeData EmptyImage(int width, int height, int batchSize, string id = n
269272
["width"] = width
270273
}, id));
271274
}
272-
else if (IsAnyFlux2() || IsErnie())
275+
else if (IsAnyFlux2() || IsErnie() || IsLens())
273276
{
274277
return resultImage(CreateNode("EmptyFlux2LatentImage", new JObject()
275278
{
@@ -598,6 +601,11 @@ public string GetMinistral3_3bModel()
598601
return RequireClipModel("ministral-3-3b.safetensors", "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/text_encoders/ministral-3-3b.safetensors", "49a750a128863854eac7d85e1a277a7b44bf6ec3646405b84686dfeeca3708ca", T2IParamTypes.MistralModel);
599602
}
600603

604+
public string GetGptOss_20bModel()
605+
{
606+
return RequireClipModel("gpt_oss_20b_mxfp4.safetensors", "https://huggingface.co/Comfy-Org/Lens/resolve/main/split_files/text_encoders/gpt_oss_20b_mxfp4.safetensors", "f279cf3e73c494f78e0c5e4d35cf665068ae69672f7066813dbb75c021286856", T2IParamTypes.GptOssModel);
607+
}
608+
601609
public string GetClipLModel()
602610
{
603611
if (g.UserInput.TryGet(T2IParamTypes.ClipLModel, out T2IModel model))
@@ -899,7 +907,7 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
899907
{
900908
dtype = "default";
901909
}
902-
else if (IsZImage() || IsZetaChroma() || IsAnima()) // Model is small and dense, so trust user preferred download format
910+
else if (IsZImage() || IsZetaChroma() || IsAnima() || IsLens()) // Model is small and dense, so trust user preferred download format
903911
{
904912
dtype = "default";
905913
}
@@ -1057,6 +1065,18 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
10571065
helpers.LoadClip("flux2", helpers.GetMinistral3_3bModel());
10581066
helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, "flux-2", "flux2-vae");
10591067
}
1068+
else if (IsLens())
1069+
{
1070+
helpers.LoadClip("lens", helpers.GetGptOss_20bModel());
1071+
helpers.DoVaeLoader(UserInput.SourceSession?.User?.Settings?.VAEs?.DefaultFlux2VAE, "flux-2", "flux2-vae");
1072+
string cfgNormNode = CreateNode("CFGNorm", new JObject()
1073+
{
1074+
["model"] = LoadingModel,
1075+
["strength"] = 1.0,
1076+
["pre_cfg"] = true
1077+
});
1078+
LoadingModel = [cfgNormNode, 0];
1079+
}
10601080
else if (IsFlux() && (LoadingClip is null || LoadingVAE is null || UserInput.Get(T2IParamTypes.T5XXLModel) is not null || UserInput.Get(T2IParamTypes.ClipLModel) is not null))
10611081
{
10621082
helpers.LoadClip2("flux", helpers.GetT5XXLModel(), helpers.GetClipLModel());
@@ -1323,7 +1343,7 @@ public void LoadClip3(string type, string modelA, string modelB, string modelC)
13231343
}
13241344
if (UserInput.TryGet(T2IParamTypes.SigmaShift, out double shiftVal, sectionId: sectionId))
13251345
{
1326-
if (IsFlux() || IsAnyFlux2())
1346+
if (IsFlux() || IsAnyFlux2() || IsLens())
13271347
{
13281348
string samplingNode = CreateNode("ModelSamplingFlux", new JObject()
13291349
{

src/Text2Image/T2IModelClassSorter.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ public static T2IModelCompatClass
7171
CompatZetaChroma = RegisterCompat(new() { ID = "zeta-chroma", ShortCode = "ZChr", LorasTargetTextEnc = false }),
7272
CompatAnima = RegisterCompat(new() { ID = "anima", ShortCode = "Anima", LorasTargetTextEnc = false }),
7373
CompatHiDreamO1 = RegisterCompat(new() { ID = "hidream-o1", ShortCode = "HiDrO1", LorasTargetTextEnc = false }),
74+
CompatLens = RegisterCompat(new() { ID = "lens", ShortCode = "Lens", LorasTargetTextEnc = false }),
7475
// Audio models
7576
CompatAceStep15 = RegisterCompat(new() { ID = "ace-step-1_5", ShortCode = "Ace15", IsAudioModel = true }),
7677
// Obscure old random ones
@@ -158,6 +159,7 @@ bool isFluxLora(JObject h)
158159
bool isFlux2KleinLora(JObject h) => hasLoraKey(h, "double_blocks.4.img_attn.proj") && hasLoraKey(h, "double_blocks.4.txt_mlp.2") && hasLoraKey(h, "single_blocks.18.linear1") && hasLoraKey(h, "single_blocks.19.linear2");
159160
bool isFlux2Klein9BLora(JObject h) => hasLoraKey(h, "single_blocks.23.linear1");
160161
bool isFlux2DevLora(JObject h) => hasLoraKey(h, "single_blocks.47.linear2");
162+
bool isLens(JObject h) => h.ContainsKey("transformer_blocks.0.attn.norm_added_q.weight") && h.ContainsKey("transformer_blocks.0.img_mlp.w1.weight");
161163
bool isSD35Lora(JObject h) => h.ContainsKey("transformer.transformer_blocks.0.attn.to_k.lora_A.weight") && h.ContainsKey("transformer.transformer_blocks.37.attn.to_out.0.lora_B.weight");
162164
bool isMochi(JObject h) => hasKey(h, "blocks.0.attn.k_norm_x.weight");
163165
bool isMochiVae(JObject h) => h.ContainsKey("encoder.layers.4.layers.1.attn_block.attn.qkv.weight") || h.ContainsKey("layers.4.layers.1.attn_block.attn.qkv.weight") || h.ContainsKey("blocks.2.blocks.3.stack.5.weight") || h.ContainsKey("decoder.blocks.2.blocks.3.stack.5.weight");
@@ -478,6 +480,10 @@ JToken GetEmbeddingKey(JObject h)
478480
{
479481
return isFlux2KleinLora(h) && isFlux2Klein9BLora(h) && !isFlux2DevLora(h);
480482
}});
483+
Register(new() { ID = "lens", CompatClass = CompatLens, Name = "Lens", StandardWidth = 1440, StandardHeight = 1440, IsThisModelOfClass = (m, h) =>
484+
{
485+
return isLens(h);
486+
}});
481487
// ====================== Wan Video ======================
482488
Register(new() { ID = "wan-2_1-text2video/vae", CompatClass = CompatWan21, Name = "Wan 2.1 VAE", StandardWidth = 640, StandardHeight = 640, IsThisModelOfClass = (m, h) => { return false; }});
483489
Register(new() { ID = "wan-2_1-text2video-1_3b", CompatClass = CompatWan21_1_3b, Name = "Wan 2.1 Text2Video 1.3B", StandardWidth = 640, StandardHeight = 640, IsThisModelOfClass = (m, h) =>

src/Text2Image/T2IParamTypes.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ public static string ApplyStringEdit(string prior, string update)
330330
FreeUBlock1, FreeUBlock2, FreeUSkip1, FreeUSkip2, GlobalRegionFactor, EndStepsEarly, SamplerSigmaMin, SamplerSigmaMax, SamplerRho, VideoAugmentationLevel, VideoCFG, VideoMinCFG, Video2VideoCreativity, VideoSwapPercent, VideoExtendSwapPercent, IP2PCFG2, RegionalObjectCleanupFactor, SigmaShift, SegmentThresholdMax, SegmentCFGScale, FluxGuidanceScale, Text2AudioDuration;
331331
public static T2IRegisteredParam<Image> InitImage, MaskImage, VideoEndFrame;
332332
public static T2IRegisteredParam<AudioFile> VideoAudioInput, VideoAudioReference;
333-
public static T2IRegisteredParam<T2IModel> Model, RefinerModel, VAE, RegionalObjectInpaintingModel, SegmentModel, VideoModel, VideoSwapModel, RefinerVAE, ClipLModel, ClipGModel, ClipVisionModel, T5XXLModel, LLaVAModel, LLaMAModel, QwenModel, MistralModel, GemmaModel, VideoExtendModel, VideoExtendSwapModel;
333+
public static T2IRegisteredParam<T2IModel> Model, RefinerModel, VAE, RegionalObjectInpaintingModel, SegmentModel, VideoModel, VideoSwapModel, RefinerVAE, ClipLModel, ClipGModel, ClipVisionModel, T5XXLModel, LLaVAModel, LLaMAModel, QwenModel, MistralModel, GemmaModel, GptOssModel, VideoExtendModel, VideoExtendSwapModel;
334334
public static T2IRegisteredParam<List<string>> Loras, LoraWeights, LoraTencWeights, LoraSectionConfinement;
335335
public static T2IRegisteredParam<List<Image>> PromptImages;
336336
public static T2IRegisteredParam<bool> OutputIntermediateImages, DoNotSave, DoNotSaveIntermediates, ControlNetPreviewOnly, RevisionZeroPrompt, RemoveBackground, NoSeedIncrement, NoPreviews, VideoBoomerang, ModelSpecificEnhancements, UseInpaintingEncode, MaskCompositeUnthresholded, SaveSegmentMask, InitImageRecompositeMask, UseReferenceOnly, RefinerDoTiling, AutomaticVAE, ZeroNegative, FluxDisableGuidance, SmartImagePromptResizing, NoLoadModels, NoInternalSpecialHandling, ForwardRawBackendData, ForwardSwarmData,
@@ -715,6 +715,9 @@ static List<string> listVaes(Session s)
715715
GemmaModel = Register<T2IModel>(new("Gemma Model", "Which Gemma LLM to use as a text encoder, for models that use Gemma (such as Lumina2, LTX2).",
716716
"", IgnoreIf: "", Group: GroupAdvancedModelAddons, Subtype: "Clip", Permission: Permissions.ModelParams, Toggleable: true, IsAdvanced: true, OrderPriority: 20, ChangeWeight: 7
717717
));
718+
GptOssModel = Register<T2IModel>(new("GPT-OSS Model", "Which GPT-OSS LLM to use as a text encoder, for Lens-style 'diffusion_models' folder models.",
719+
"", IgnoreIf: "", Group: GroupAdvancedModelAddons, Subtype: "Clip", Permission: Permissions.ModelParams, Toggleable: true, IsAdvanced: true, OrderPriority: 20, ChangeWeight: 7
720+
));
718721
TorchCompile = Register<string>(new("Torch Compile", "Torch.Compile is a way to dynamically accelerate AI models.\nIt wastes a bit of time (around a minute) on the first call compiling a graph of the generation, and then all subsequent generations run faster thanks to the compiled graph.\nTorch.Compile depends on Triton, which is difficult to install on Windows, easier on Linux.",
719722
"Disabled", IgnoreIf: "Disabled", GetValues: _ => ["Disabled", "inductor", "cudagraphs"], OrderPriority: 40, Group: GroupAdvancedModelAddons
720723
));

0 commit comments

Comments
 (0)