Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
618 changes: 618 additions & 0 deletions Milvus.Client.Tests/Bm25Tests.cs

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions Milvus.Client/AnnSearchRequest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,42 @@ public SparseVectorAnnSearchRequest(
/// </summary>
public IReadOnlyList<MilvusSparseVector<T>> Vectors { get; }
}

/// <summary>
/// An ANN search request that runs a BM25 full-text search from raw text queries.
/// </summary>
/// <remarks>
/// Intended for hybrid search, where BM25 full-text search is combined with dense vector search.
/// The caller supplies plain text; the Milvus server turns each query into a sparse vector using
/// the BM25 function declared in the collection schema.
/// </remarks>
public sealed class TextAnnSearchRequest : AnnSearchRequest
{
    /// <summary>
    /// Initializes a BM25 full-text ANN search request. The metric type is always
    /// <see cref="SimilarityMetricType.Bm25" />.
    /// </summary>
    /// <param name="vectorFieldName">
    /// Name of the sparse vector field to search, i.e. the output field of the BM25 function.
    /// </param>
    /// <param name="texts">The text queries to run; must be non-null and contain at least one entry.</param>
    /// <param name="limit">Upper bound on the number of results returned.</param>
    /// <exception cref="ArgumentException"><paramref name="texts" /> contains no entries.</exception>
    public TextAnnSearchRequest(
        string vectorFieldName,
        IReadOnlyList<string> texts,
        int limit)
        : base(vectorFieldName, SimilarityMetricType.Bm25, limit)
    {
        Verify.NotNull(texts);

        // The list is stored as given (no copy); an empty list is rejected up front.
        Texts = texts.Count != 0
            ? texts
            : throw new ArgumentException("At least one text query must be provided", nameof(texts));
    }

    /// <summary>
    /// The text queries this request searches for.
    /// </summary>
    public IReadOnlyList<string> Texts { get; }
}
20 changes: 19 additions & 1 deletion Milvus.Client/CollectionSchema.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ namespace Milvus.Client;
public sealed class CollectionSchema
{
private readonly List<FieldSchema> _fields = new();
private readonly List<FunctionSchema> _functions = new();

/// <summary>
/// Instantiates a new <see cref="CollectionSchema" />.
Expand All @@ -19,7 +20,15 @@ public CollectionSchema()
}

internal CollectionSchema(IReadOnlyList<FieldSchema> fields)
=> _fields.AddRange(fields);
: this(fields, Array.Empty<FunctionSchema>())
{
}

/// <summary>
/// Instantiates a new <see cref="CollectionSchema" /> pre-populated with the given fields and functions.
/// </summary>
/// <param name="fields">The field schemas to copy into <see cref="Fields" />.</param>
/// <param name="functions">The function schemas to copy into <see cref="Functions" />.</param>
internal CollectionSchema(IReadOnlyList<FieldSchema> fields, IReadOnlyList<FunctionSchema> functions)
{
_fields.AddRange(fields);
_functions.AddRange(functions);
}

/// <summary>
/// The name of the collection.
Expand All @@ -36,6 +45,15 @@ internal CollectionSchema(IReadOnlyList<FieldSchema> fields)
/// </summary>
public IList<FieldSchema> Fields => _fields;

/// <summary>
/// The functions defined in the schema for automatic data transformation.
/// </summary>
/// <remarks>
/// <para>
/// Functions enable automatic data transformations during insertion, such as BM25 full-text
/// search indexing where text fields are automatically converted to sparse vectors.
/// </para>
/// <para>
/// This property returns the schema's own backing list rather than a copy, so adding or removing
/// entries through it changes the schema directly.
/// </para>
/// </remarks>
public IList<FunctionSchema> Functions => _functions;

/// <summary>
/// Whether to enable dynamic fields for this schema. Defaults to <c>false</c>.
/// </summary>
Expand Down
10 changes: 10 additions & 0 deletions Milvus.Client/Constants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -181,4 +181,14 @@ internal static class Constants
/// to instruct server execute query/search after all DML operations finished.
/// </summary>
internal const long GuaranteeStrongTs = 0L;

/// <summary>
/// Key name in type_params. Indicates whether analyzer is enabled for a varchar field.
/// </summary>
/// <remarks>
/// Written with the value <c>"true"</c> for fields whose <see cref="FieldSchema.EnableAnalyzer" />
/// is set when a collection is created; the key is omitted otherwise.
/// </remarks>
internal const string EnableAnalyzer = "enable_analyzer";

/// <summary>
/// Key name in type_params. Contains analyzer parameters as JSON.
/// </summary>
/// <remarks>
/// The value is the JSON serialization of <see cref="FieldSchema.AnalyzerParams" />; the key is only
/// written when that property is non-null.
/// </remarks>
internal const string AnalyzerParams = "analyzer_params";
}
36 changes: 34 additions & 2 deletions Milvus.Client/FieldSchema.cs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@ public static FieldSchema Create<TData>(
/// <param name="defaultValue">
/// The default value for the field. Available since Milvus v2.5.
/// </param>
/// <param name="enableAnalyzer">
/// Whether to enable the analyzer for this field. Required for BM25 full-text search input fields.
/// </param>
/// <param name="analyzerParams">
/// Optional analyzer parameters. For example: <c>new Dictionary&lt;string, object&gt; { ["type"] = "english" }</c>.
/// </param>
public static FieldSchema CreateVarchar(
string name,
int maxLength,
Expand All @@ -120,10 +126,14 @@ public static FieldSchema CreateVarchar(
bool isPartitionKey = false,
string description = "",
bool nullable = false,
string? defaultValue = null)
string? defaultValue = null,
bool enableAnalyzer = false,
IReadOnlyDictionary<string, object>? analyzerParams = null)
=> new(name, MilvusDataType.VarChar, isPrimaryKey, autoId, isPartitionKey, description, nullable, defaultValue)
{
MaxLength = maxLength
MaxLength = maxLength,
EnableAnalyzer = enableAnalyzer,
AnalyzerParams = analyzerParams
};

/// <summary>
Expand Down Expand Up @@ -362,6 +372,28 @@ internal FieldSchema(
/// </summary>
public int? Dimension { get; set; }

/// <summary>
/// Whether to enable the analyzer for this field. Required for BM25 full-text search input fields.
/// </summary>
/// <remarks>
/// <para>
/// When enabled, the field can be used as an input to a BM25 function for full-text search.
/// This property is only applicable for <see cref="MilvusDataType.VarChar" /> fields.
/// </para>
/// <para>
/// When <c>true</c>, collection creation adds an <c>enable_analyzer</c> type parameter with the value
/// <c>"true"</c> to the field; when <c>false</c> (the default), no such parameter is sent.
/// </para>
/// </remarks>
public bool EnableAnalyzer { get; set; }

/// <summary>
/// Optional analyzer parameters. For example: <c>new Dictionary&lt;string, object&gt; { ["type"] = "english" }</c>.
/// </summary>
/// <remarks>
/// <para>
/// This property is only applicable when <see cref="EnableAnalyzer" /> is set to true.
/// </para>
/// <para>
/// When non-null, the dictionary is serialized to JSON and sent as the <c>analyzer_params</c> type
/// parameter during collection creation. NOTE(review): the client sends it whenever it is non-null and
/// does not itself check <see cref="EnableAnalyzer" />.
/// </para>
/// </remarks>
public IReadOnlyDictionary<string, object>? AnalyzerParams { get; set; }

/// <summary>
/// Whether this field is the output of a function and should not be provided during insertion.
/// </summary>
/// <remarks>
/// The setter is internal, so this value can only be assigned from within the client library.
/// </remarks>
public bool IsFunctionOutput { get; internal set; }

/// <summary>
/// The state of the field.
/// </summary>
Expand Down
117 changes: 117 additions & 0 deletions Milvus.Client/FunctionSchema.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
using System.Diagnostics;

namespace Milvus.Client;

/// <summary>
/// Defines a function within a <see cref="CollectionSchema" /> that automatically transforms data during insertion.
/// </summary>
/// <remarks>
/// Functions enable automatic data transformations such as BM25 full-text search indexing,
/// where text fields are automatically converted to sparse vectors.
/// </remarks>
[DebuggerDisplay("{DebuggerDisplay,nq}")]
public sealed class FunctionSchema
{
    private readonly List<string> _inputFieldNames = new();
    private readonly List<string> _outputFieldNames = new();
    private readonly Dictionary<string, string> _params = new();

    /// <summary>
    /// Creates a new function schema with no additional parameters and an <see cref="Id" /> of 0.
    /// </summary>
    /// <param name="name">The name of the function.</param>
    /// <param name="type">The type of function.</param>
    /// <param name="inputFieldNames">The names of the input fields for this function.</param>
    /// <param name="outputFieldNames">The names of the output fields for this function.</param>
    /// <param name="description">An optional description for the function.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="name" />, <paramref name="inputFieldNames" />, <paramref name="outputFieldNames" />
    /// or <paramref name="description" /> is <c>null</c>.
    /// </exception>
    public FunctionSchema(
        string name,
        FunctionType type,
        IEnumerable<string> inputFieldNames,
        IEnumerable<string> outputFieldNames,
        string description = "")
        : this(0, name, type, inputFieldNames, outputFieldNames, description, Array.Empty<KeyValuePair<string, string>>())
    {
    }

    /// <summary>
    /// Creates a BM25 function schema for full-text search.
    /// </summary>
    /// <param name="name">The name of the function.</param>
    /// <param name="inputFieldName">
    /// The name of the VARCHAR input field. This field must have <see cref="FieldSchema.EnableAnalyzer"/> set to true.
    /// </param>
    /// <param name="outputFieldName">
    /// The name of the SPARSE_FLOAT_VECTOR output field that will be automatically populated.
    /// </param>
    /// <param name="description">An optional description for the function.</param>
    /// <returns>A new BM25 function schema.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="name" />, <paramref name="inputFieldName" />, <paramref name="outputFieldName" />
    /// or <paramref name="description" /> is <c>null</c>.
    /// </exception>
    public static FunctionSchema CreateBm25(
        string name,
        string inputFieldName,
        string outputFieldName,
        string description = "")
    {
        // The field names are wrapped in arrays below, so the constructor's collection checks would
        // not notice a null element; reject it here instead of storing a null field name.
        if (inputFieldName is null)
        {
            throw new ArgumentNullException(nameof(inputFieldName));
        }

        if (outputFieldName is null)
        {
            throw new ArgumentNullException(nameof(outputFieldName));
        }

        return new FunctionSchema(
            name,
            FunctionType.Bm25,
            new[] { inputFieldName },
            new[] { outputFieldName },
            description);
    }

    /// <summary>
    /// Creates a function schema with an explicit ID and parameter set. All other constructors
    /// funnel through this one so that argument validation lives in a single place.
    /// </summary>
    /// <param name="id">The ID of the function.</param>
    /// <param name="name">The name of the function.</param>
    /// <param name="type">The type of function.</param>
    /// <param name="inputFieldNames">The names of the input fields for this function.</param>
    /// <param name="outputFieldNames">The names of the output fields for this function.</param>
    /// <param name="description">The description of the function.</param>
    /// <param name="parameters">
    /// Key/value parameters copied into <see cref="Params" />; if a key occurs more than once, the last value wins.
    /// </param>
    /// <exception cref="ArgumentNullException">Any reference-type argument is <c>null</c>.</exception>
    internal FunctionSchema(
        long id,
        string name,
        FunctionType type,
        IEnumerable<string> inputFieldNames,
        IEnumerable<string> outputFieldNames,
        string description,
        IEnumerable<KeyValuePair<string, string>> parameters)
    {
        // Fail at construction time with the offending parameter name, rather than storing nulls
        // that only surface later (or surfacing as List.AddRange's unrelated "collection" argument).
        if (name is null)
        {
            throw new ArgumentNullException(nameof(name));
        }

        if (inputFieldNames is null)
        {
            throw new ArgumentNullException(nameof(inputFieldNames));
        }

        if (outputFieldNames is null)
        {
            throw new ArgumentNullException(nameof(outputFieldNames));
        }

        if (description is null)
        {
            throw new ArgumentNullException(nameof(description));
        }

        if (parameters is null)
        {
            throw new ArgumentNullException(nameof(parameters));
        }

        Id = id;
        Name = name;
        Type = type;
        Description = description;
        _inputFieldNames.AddRange(inputFieldNames);
        _outputFieldNames.AddRange(outputFieldNames);

        foreach (KeyValuePair<string, string> parameter in parameters)
        {
            _params[parameter.Key] = parameter.Value;
        }
    }

    /// <summary>
    /// The internal Milvus ID assigned to this function. This is 0 for schemas created through the
    /// public constructor or <see cref="CreateBm25" />.
    /// </summary>
    public long Id { get; }

    /// <summary>
    /// The name of the function.
    /// </summary>
    public string Name { get; }

    /// <summary>
    /// The type of function.
    /// </summary>
    public FunctionType Type { get; }

    /// <summary>
    /// The names of the input fields for this function. The returned list is the schema's own
    /// backing list, so changes made through it modify this schema.
    /// </summary>
    public IList<string> InputFieldNames => _inputFieldNames;

    /// <summary>
    /// The names of the output fields for this function. The returned list is the schema's own
    /// backing list, so changes made through it modify this schema.
    /// </summary>
    public IList<string> OutputFieldNames => _outputFieldNames;

    /// <summary>
    /// An optional description for the function.
    /// </summary>
    public string Description { get; }

    /// <summary>
    /// Additional parameters for the function. The returned dictionary is the schema's own
    /// backing dictionary, so entries added through it become part of this schema.
    /// </summary>
    public IDictionary<string, string> Params => _params;

    private string DebuggerDisplay => $"{Name} ({Type})";
}
37 changes: 37 additions & 0 deletions Milvus.Client/FunctionType.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
namespace Milvus.Client;

/// <summary>
/// Identifies the kind of function declared in a collection schema.
/// </summary>
/// <remarks>
/// The numeric values are explicit and must not be renumbered: the client converts a member to its
/// wire representation by casting the underlying integer.
/// </remarks>
public enum FunctionType
{
    /// <summary>
    /// The function type is not known.
    /// </summary>
    Unknown = 0,

    /// <summary>
    /// BM25 full-text search function, which produces sparse vectors from text input automatically.
    /// </summary>
    /// <remarks>
    /// <para>
    /// A BM25 function takes exactly one VARCHAR input field (which must have its analyzer enabled)
    /// and writes to exactly one SPARSE_FLOAT_VECTOR output field.
    /// </para>
    /// <para>
    /// The sparse vector output field is filled in automatically at insertion time from the text
    /// stored in the input field.
    /// </para>
    /// </remarks>
    Bm25 = 1,

    /// <summary>
    /// Text embedding function, which generates dense vectors from text.
    /// </summary>
    TextEmbedding = 2,

    /// <summary>
    /// Rerank function, which reorders search results.
    /// </summary>
    Rerank = 3,
}
18 changes: 16 additions & 2 deletions Milvus.Client/IndexType.cs
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,23 @@ public enum IndexType
/// with low-dimensional non-zero values.
/// </para>
/// <para>
/// Build parameters: <c>drop_ratio_build</c> (the proportion of small vector values excluded during indexing,
/// range [0, 1), default 0).
/// Build parameters (passed via <c>extraParams</c>):
/// </para>
/// <list type="bullet">
/// <item>
/// <c>inverted_index_algo</c>: The query algorithm. Values: <c>"DAAT_MAXSCORE"</c> (default, best for high k
/// or many terms), <c>"DAAT_WAND"</c> (best for small k or short queries), <c>"TAAT_NAIVE"</c> (adapts to
/// collection changes). String values must be quoted in JSON, e.g. <c>"\"DAAT_WAND\""</c>.
/// </item>
/// <item>
/// <c>bm25_k1</c>: Controls term frequency saturation for BM25 scoring. Range [1.2, 2.0].
/// Only applicable when metric type is <see cref="SimilarityMetricType.Bm25" />.
/// </item>
/// <item>
/// <c>bm25_b</c>: Controls document length normalization for BM25 scoring. Range [0, 1], default 0.75.
/// Only applicable when metric type is <see cref="SimilarityMetricType.Bm25" />.
/// </item>
/// </list>
/// <para>
/// Search parameters: <c>drop_ratio_search</c> (the proportion of small vector values excluded during search,
/// range [0, 1), default 0).
Expand Down
43 changes: 43 additions & 0 deletions Milvus.Client/MilvusClient.Collection.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Globalization;
using System.Text.Json;

namespace Milvus.Client;

Expand Down Expand Up @@ -127,9 +128,51 @@ public async Task<MilvusCollection> CreateCollectionAsync(
grpcField.DefaultValue = ConvertToValueField(field.DefaultValue, field.DataType);
}

if (field.EnableAnalyzer)
{
grpcField.TypeParams.Add(new Grpc.KeyValuePair
{
Key = Constants.EnableAnalyzer,
Value = "true"
});
}

if (field.AnalyzerParams is not null)
{
grpcField.TypeParams.Add(new Grpc.KeyValuePair
{
Key = Constants.AnalyzerParams,
Value = JsonSerializer.Serialize(field.AnalyzerParams)
});
}

grpcCollectionSchema.Fields.Add(grpcField);
}

foreach (FunctionSchema function in schema.Functions)
{
Grpc.FunctionSchema grpcFunction = new()
{
Name = function.Name,
Description = function.Description,
Type = (Grpc.FunctionType)(int)function.Type
};

grpcFunction.InputFieldNames.AddRange(function.InputFieldNames);
grpcFunction.OutputFieldNames.AddRange(function.OutputFieldNames);

foreach (var param in function.Params)
{
grpcFunction.Params.Add(new Grpc.KeyValuePair
{
Key = param.Key,
Value = param.Value
});
}

grpcCollectionSchema.Functions.Add(grpcFunction);
}

#pragma warning disable CS0612 // Schema-level AutoID is obsolete, but still there in pymilvus
grpcCollectionSchema.AutoID = schema.Fields.Any(static p => p.AutoId);
#pragma warning restore CS0612 // Type or member is obsolete
Expand Down
Loading