Skip to content

Commit 00dfe87

Browse files
committed
Implement BM25
1 parent b37231a commit 00dfe87

13 files changed

Lines changed: 968 additions & 16 deletions

Milvus.Client.Tests/Bm25Tests.cs

Lines changed: 545 additions & 0 deletions
Large diffs are not rendered by default.

Milvus.Client/AnnSearchRequest.cs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,3 +116,42 @@ public SparseVectorAnnSearchRequest(
116116
/// </summary>
117117
public IReadOnlyList<MilvusSparseVector<T>> Vectors { get; }
118118
}
119+
120+
/// <summary>
/// An ANN search request that performs BM25 full-text search from raw text queries.
/// </summary>
/// <remarks>
/// Use this request type in a hybrid search to combine BM25 full-text search with dense vector
/// search. The Milvus server turns each text query into a sparse vector by applying the BM25
/// function declared in the collection schema.
/// </remarks>
public sealed class TextAnnSearchRequest : AnnSearchRequest
{
    /// <summary>
    /// Initializes a BM25 full-text ANN search request.
    /// </summary>
    /// <param name="vectorFieldName">
    /// The sparse vector field to search in, i.e. the output field of the BM25 function.
    /// </param>
    /// <param name="texts">The text queries to search for; must contain at least one entry.</param>
    /// <param name="limit">The maximum number of results to return.</param>
    /// <exception cref="ArgumentException"><paramref name="texts" /> contains no elements.</exception>
    public TextAnnSearchRequest(
        string vectorFieldName,
        IReadOnlyList<string> texts,
        int limit)
        : base(vectorFieldName, SimilarityMetricType.Bm25, limit)
    {
        Verify.NotNull(texts);

        // An empty query list is rejected up front; the metric type is always BM25 for this request.
        Texts = texts.Count != 0
            ? texts
            : throw new ArgumentException("At least one text query must be provided", nameof(texts));
    }

    /// <summary>
    /// The text queries this request searches for.
    /// </summary>
    public IReadOnlyList<string> Texts { get; }
}

Milvus.Client/CollectionSchema.cs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ namespace Milvus.Client;
1010
public sealed class CollectionSchema
1111
{
1212
private readonly List<FieldSchema> _fields = new();
13+
private readonly List<FunctionSchema> _functions = new();
1314

1415
/// <summary>
1516
/// Instantiates a new <see cref="CollectionSchema" />.
@@ -19,7 +20,15 @@ public CollectionSchema()
1920
}
2021

2122
internal CollectionSchema(IReadOnlyList<FieldSchema> fields)
22-
=> _fields.AddRange(fields);
23+
: this(fields, Array.Empty<FunctionSchema>())
24+
{
25+
}
26+
27+
internal CollectionSchema(IReadOnlyList<FieldSchema> fields, IReadOnlyList<FunctionSchema> functions)
28+
{
29+
_fields.AddRange(fields);
30+
_functions.AddRange(functions);
31+
}
2332

2433
/// <summary>
2534
/// The name of the collection.
@@ -36,6 +45,15 @@ internal CollectionSchema(IReadOnlyList<FieldSchema> fields)
3645
/// </summary>
3746
public IList<FieldSchema> Fields => _fields;
3847

48+
/// <summary>
49+
/// The functions defined in the schema for automatic data transformation.
50+
/// </summary>
51+
/// <remarks>
52+
/// Functions enable automatic data transformations during insertion, such as BM25 full-text
53+
/// search indexing where text fields are automatically converted to sparse vectors.
54+
/// </remarks>
55+
public IList<FunctionSchema> Functions => _functions;
56+
3957
/// <summary>
4058
/// Whether to enable dynamic fields for this schema. Defaults to <c>false</c>.
4159
/// </summary>

Milvus.Client/Constants.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,4 +181,14 @@ internal static class Constants
181181
/// to instruct server execute query/search after all DML operations finished.
182182
/// </summary>
183183
internal const long GuaranteeStrongTs = 0L;
184+
185+
/// <summary>
186+
/// Key name in type_params. Indicates whether analyzer is enabled for a varchar field.
187+
/// </summary>
188+
internal const string EnableAnalyzer = "enable_analyzer";
189+
190+
/// <summary>
191+
/// Key name in type_params. Contains analyzer parameters as JSON.
192+
/// </summary>
193+
internal const string AnalyzerParams = "analyzer_params";
184194
}

Milvus.Client/FieldSchema.cs

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,12 @@ public static FieldSchema Create<TData>(
112112
/// <param name="defaultValue">
113113
/// The default value for the field. Available since Milvus v2.5.
114114
/// </param>
115+
/// <param name="enableAnalyzer">
116+
/// Whether to enable the analyzer for this field. Required for BM25 full-text search input fields.
117+
/// </param>
118+
/// <param name="analyzerParams">
119+
/// Optional analyzer parameters as a JSON string. For example: <c>{"type": "english"}</c>.
120+
/// </param>
115121
public static FieldSchema CreateVarchar(
116122
string name,
117123
int maxLength,
@@ -120,10 +126,14 @@ public static FieldSchema CreateVarchar(
120126
bool isPartitionKey = false,
121127
string description = "",
122128
bool nullable = false,
123-
string? defaultValue = null)
129+
string? defaultValue = null,
130+
bool enableAnalyzer = false,
131+
string? analyzerParams = null)
124132
=> new(name, MilvusDataType.VarChar, isPrimaryKey, autoId, isPartitionKey, description, nullable, defaultValue)
125133
{
126-
MaxLength = maxLength
134+
MaxLength = maxLength,
135+
EnableAnalyzer = enableAnalyzer,
136+
AnalyzerParams = analyzerParams
127137
};
128138

129139
/// <summary>
@@ -362,6 +372,28 @@ internal FieldSchema(
362372
/// </summary>
363373
public int? Dimension { get; set; }
364374

375+
/// <summary>
376+
/// Whether to enable the analyzer for this field. Required for BM25 full-text search input fields.
377+
/// </summary>
378+
/// <remarks>
379+
/// When enabled, the field can be used as an input to a BM25 function for full-text search.
380+
/// This property is only applicable for <see cref="MilvusDataType.VarChar" /> fields.
381+
/// </remarks>
382+
public bool EnableAnalyzer { get; set; }
383+
384+
/// <summary>
385+
/// Optional analyzer parameters as a JSON string. For example: <c>{"type": "english"}</c>.
386+
/// </summary>
387+
/// <remarks>
388+
/// This property is only applicable when <see cref="EnableAnalyzer" /> is set to true.
389+
/// </remarks>
390+
public string? AnalyzerParams { get; set; }
391+
392+
/// <summary>
393+
/// Whether this field is the output of a function and should not be provided during insertion.
394+
/// </summary>
395+
public bool IsFunctionOutput { get; internal set; }
396+
365397
/// <summary>
366398
/// The state of the field.
367399
/// </summary>

Milvus.Client/FunctionSchema.cs

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
using System.Diagnostics;
2+
3+
namespace Milvus.Client;
4+
5+
/// <summary>
/// Defines a function within a <see cref="CollectionSchema" /> that automatically transforms data during insertion.
/// </summary>
/// <remarks>
/// Functions enable automatic data transformations such as BM25 full-text search indexing,
/// where text fields are automatically converted to sparse vectors.
/// </remarks>
[DebuggerDisplay("{DebuggerDisplay,nq}")]
public sealed class FunctionSchema
{
    private readonly List<string> _inputFieldNames = new();
    private readonly List<string> _outputFieldNames = new();
    private readonly Dictionary<string, string> _params = new();

    /// <summary>
    /// Creates a new function schema.
    /// </summary>
    /// <param name="name">The name of the function.</param>
    /// <param name="type">The type of function.</param>
    /// <param name="inputFieldNames">The names of the input fields for this function.</param>
    /// <param name="outputFieldNames">The names of the output fields for this function.</param>
    /// <param name="description">An optional description for the function.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="name" />, <paramref name="inputFieldNames" /> or
    /// <paramref name="outputFieldNames" /> is <c>null</c>.
    /// </exception>
    public FunctionSchema(
        string name,
        FunctionType type,
        IEnumerable<string> inputFieldNames,
        IEnumerable<string> outputFieldNames,
        string description = "")
        // A user-created schema has no ID yet (stays 0) and starts with no parameters;
        // validation and field population are shared with the internal constructor.
        : this(0, name, type, inputFieldNames, outputFieldNames, description, Array.Empty<KeyValuePair<string, string>>())
    {
    }

    /// <summary>
    /// Creates a BM25 function schema for full-text search.
    /// </summary>
    /// <param name="name">The name of the function.</param>
    /// <param name="inputFieldName">
    /// The name of the VARCHAR input field. This field must have <see cref="FieldSchema.EnableAnalyzer"/> set to true.
    /// </param>
    /// <param name="outputFieldName">
    /// The name of the SPARSE_FLOAT_VECTOR output field that will be automatically populated.
    /// </param>
    /// <param name="description">An optional description for the function.</param>
    /// <returns>A new BM25 function schema.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="name" />, <paramref name="inputFieldName" /> or
    /// <paramref name="outputFieldName" /> is <c>null</c>.
    /// </exception>
    public static FunctionSchema CreateBm25(
        string name,
        string inputFieldName,
        string outputFieldName,
        string description = "")
    {
        // Reject null field names here: wrapped in an array they would otherwise be stored
        // silently as null list entries and only surface as a failure much later.
        if (inputFieldName is null)
        {
            throw new ArgumentNullException(nameof(inputFieldName));
        }

        if (outputFieldName is null)
        {
            throw new ArgumentNullException(nameof(outputFieldName));
        }

        return new FunctionSchema(
            name,
            FunctionType.Bm25,
            new[] { inputFieldName },
            new[] { outputFieldName },
            description);
    }

    /// <summary>
    /// Creates a fully specified function schema, including its ID and parameters.
    /// </summary>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="name" />, <paramref name="inputFieldNames" />,
    /// <paramref name="outputFieldNames" /> or <paramref name="parameters" /> is <c>null</c>.
    /// </exception>
    internal FunctionSchema(
        long id,
        string name,
        FunctionType type,
        IEnumerable<string> inputFieldNames,
        IEnumerable<string> outputFieldNames,
        string description,
        IEnumerable<KeyValuePair<string, string>> parameters)
    {
        // Validate before touching any state so that the exception names the offending
        // parameter instead of List<T>.AddRange's own "collection" parameter.
        if (name is null)
        {
            throw new ArgumentNullException(nameof(name));
        }

        if (inputFieldNames is null)
        {
            throw new ArgumentNullException(nameof(inputFieldNames));
        }

        if (outputFieldNames is null)
        {
            throw new ArgumentNullException(nameof(outputFieldNames));
        }

        if (parameters is null)
        {
            throw new ArgumentNullException(nameof(parameters));
        }

        Id = id;
        Name = name;
        Type = type;
        _inputFieldNames.AddRange(inputFieldNames);
        _outputFieldNames.AddRange(outputFieldNames);
        Description = description;

        // Indexer assignment (rather than Add) lets a repeated key overwrite the earlier value.
        foreach (KeyValuePair<string, string> param in parameters)
        {
            _params[param.Key] = param.Value;
        }
    }

    /// <summary>
    /// The internal Milvus ID assigned to this function.
    /// </summary>
    public long Id { get; }

    /// <summary>
    /// The name of the function.
    /// </summary>
    public string Name { get; }

    /// <summary>
    /// The type of function.
    /// </summary>
    public FunctionType Type { get; }

    /// <summary>
    /// The names of the input fields for this function.
    /// </summary>
    public IList<string> InputFieldNames => _inputFieldNames;

    /// <summary>
    /// The names of the output fields for this function.
    /// </summary>
    public IList<string> OutputFieldNames => _outputFieldNames;

    /// <summary>
    /// An optional description for the function.
    /// </summary>
    public string Description { get; }

    /// <summary>
    /// Additional parameters for the function.
    /// </summary>
    public IDictionary<string, string> Params => _params;

    private string DebuggerDisplay => $"{Name} ({Type})";
}

Milvus.Client/FunctionType.cs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
namespace Milvus.Client;
2+
3+
/// <summary>
/// Identifies the kind of function attached to a collection schema.
/// </summary>
public enum FunctionType
{
    /// <summary>
    /// The function type is not known.
    /// </summary>
    Unknown = 0,

    /// <summary>
    /// Full-text search function based on BM25; produces sparse vectors from text input automatically.
    /// </summary>
    /// <remarks>
    /// <para>
    /// A BM25 function takes exactly one VARCHAR input field, which must have its analyzer enabled,
    /// and writes to exactly one SPARSE_FLOAT_VECTOR output field.
    /// </para>
    /// <para>
    /// With BM25 in place, the sparse vector field is filled in automatically at insertion time
    /// from the text content of the input field.
    /// </para>
    /// </remarks>
    Bm25 = 1,

    /// <summary>
    /// Function that generates dense vectors from text (text embedding).
    /// </summary>
    TextEmbedding = 2,

    /// <summary>
    /// Function that reorders search results (rerank).
    /// </summary>
    Rerank = 3
}

Milvus.Client/IndexType.cs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -231,9 +231,23 @@ public enum IndexType
231231
/// with low-dimensional non-zero values.
232232
/// </para>
233233
/// <para>
234-
/// Build parameters: <c>drop_ratio_build</c> (the proportion of small vector values excluded during indexing,
235-
/// range [0, 1), default 0).
234+
/// Build parameters (passed via <c>extraParams</c>):
236235
/// </para>
236+
/// <list type="bullet">
237+
/// <item>
238+
/// <c>inverted_index_algo</c>: The query algorithm. Values: <c>"DAAT_MAXSCORE"</c> (default, best for high k
239+
/// or many terms), <c>"DAAT_WAND"</c> (best for small k or short queries), <c>"TAAT_NAIVE"</c> (adapts to
240+
/// collection changes). String values must be quoted in JSON, e.g. <c>"\"DAAT_WAND\""</c>.
241+
/// </item>
242+
/// <item>
243+
/// <c>bm25_k1</c>: Controls term frequency saturation for BM25 scoring. Range [1.2, 2.0].
244+
/// Only applicable when metric type is <see cref="SimilarityMetricType.Bm25" />.
245+
/// </item>
246+
/// <item>
247+
/// <c>bm25_b</c>: Controls document length normalization for BM25 scoring. Range [0, 1], default 0.75.
248+
/// Only applicable when metric type is <see cref="SimilarityMetricType.Bm25" />.
249+
/// </item>
250+
/// </list>
237251
/// <para>
238252
/// Search parameters: <c>drop_ratio_search</c> (the proportion of small vector values excluded during search,
239253
/// range [0, 1), default 0).

Milvus.Client/MilvusClient.Collection.cs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,51 @@ public async Task<MilvusCollection> CreateCollectionAsync(
127127
grpcField.DefaultValue = ConvertToValueField(field.DefaultValue, field.DataType);
128128
}
129129

130+
if (field.EnableAnalyzer)
131+
{
132+
grpcField.TypeParams.Add(new Grpc.KeyValuePair
133+
{
134+
Key = Constants.EnableAnalyzer,
135+
Value = "true"
136+
});
137+
}
138+
139+
if (field.AnalyzerParams is not null)
140+
{
141+
grpcField.TypeParams.Add(new Grpc.KeyValuePair
142+
{
143+
Key = Constants.AnalyzerParams,
144+
Value = field.AnalyzerParams
145+
});
146+
}
147+
130148
grpcCollectionSchema.Fields.Add(grpcField);
131149
}
132150

151+
foreach (FunctionSchema function in schema.Functions)
152+
{
153+
Grpc.FunctionSchema grpcFunction = new()
154+
{
155+
Name = function.Name,
156+
Description = function.Description,
157+
Type = (Grpc.FunctionType)(int)function.Type
158+
};
159+
160+
grpcFunction.InputFieldNames.AddRange(function.InputFieldNames);
161+
grpcFunction.OutputFieldNames.AddRange(function.OutputFieldNames);
162+
163+
foreach (var param in function.Params)
164+
{
165+
grpcFunction.Params.Add(new Grpc.KeyValuePair
166+
{
167+
Key = param.Key,
168+
Value = param.Value
169+
});
170+
}
171+
172+
grpcCollectionSchema.Functions.Add(grpcFunction);
173+
}
174+
133175
#pragma warning disable CS0612 // Schema-level AutoID is obsolete, but still there in pymilvus
134176
grpcCollectionSchema.AutoID = schema.Fields.Any(static p => p.AutoId);
135177
#pragma warning restore CS0612 // Type or member is obsolete

0 commit comments

Comments
 (0)