Skip to content

Commit def7a4a

Browse files
stephentoubericstjCopilot
authored
Remove Google.Protobuf dependency from Microsoft.ML.Tokenizers (#7587)
Co-authored-by: Eric StJohn <ericstj@microsoft.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 1a6739e commit def7a4a

File tree

7 files changed

+1384
-4665
lines changed

7 files changed

+1384
-4665
lines changed

NuGet.config

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
</solution>
66
<packageSources>
77
<clear />
8-
<add key="darc-pub-dotnet-maintenance-packages-ab95a1f1" value="https://dnceng.pkgs.visualstudio.com/public/_packaging/darc-pub-dotnet-maintenance-packages-ab95a1f1/nuget/v3/index.json" />
98
<add key="dotnet-public" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-public/nuget/v3/index.json" />
109
<add key="dotnet-tools" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-tools/nuget/v3/index.json" />
1110
<add key="dotnet-libraries" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-libraries/nuget/v3/index.json" />

src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
<IsPackable>true</IsPackable>
77
<PackageDescription>Microsoft.ML.Tokenizers contains the implementation of the tokenization used in the NLP transforms.</PackageDescription>
88
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
9+
<NoWarn>$(NoWarn);MSML_NoInstanceInitializers</NoWarn>
910
</PropertyGroup>
1011

1112
<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
@@ -16,10 +17,6 @@
1617
<Compile Remove="Utils/Helpers.netstandard.cs" />
1718
</ItemGroup>
1819

19-
<ItemGroup>
20-
<PackageReference Include="Google.Protobuf" />
21-
</ItemGroup>
22-
2320
<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
2421
<PackageReference Include="Microsoft.Bcl.HashCode" />
2522
<PackageReference Include="Microsoft.Bcl.Memory" />

src/Microsoft.ML.Tokenizers/Model/BertOptions.cs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

@@ -9,7 +9,6 @@ namespace Microsoft.ML.Tokenizers
99
/// </summary>
1010
public sealed class BertOptions : WordPieceOptions
1111
{
12-
#pragma warning disable MSML_NoInstanceInitializers
1312
/// <summary>
1413
/// Gets or sets a value indicating whether to lower case the input before tokenization.
1514
/// </summary>
@@ -66,7 +65,5 @@ public sealed class BertOptions : WordPieceOptions
6665
/// Gets or sets a value indicating whether to remove non-spacing marks.
6766
/// </summary>
6867
public bool RemoveNonSpacingMarks { get; set; }
69-
70-
#pragma warning restore MSML_NoInstanceInitializers
7168
}
72-
}
69+
}

src/Microsoft.ML.Tokenizers/Model/WordPieceOptions.cs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Licensed to the .NET Foundation under one or more agreements.
1+
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33
// See the LICENSE file in the project root for more information.
44

@@ -11,7 +11,6 @@ namespace Microsoft.ML.Tokenizers
1111
/// </summary>
1212
public class WordPieceOptions
1313
{
14-
#pragma warning disable MSML_NoInstanceInitializers
1514
internal const int DefaultMaxInputCharsPerWord = 100;
1615
internal const string DefaultContinuingSubwordPrefix = "##";
1716

@@ -44,6 +43,5 @@ public class WordPieceOptions
4443
/// Gets or sets the maximum number of characters to consider for a single word.
4544
/// </summary>
4645
public int MaxInputCharsPerWord { get; set; } = DefaultMaxInputCharsPerWord;
47-
#pragma warning restore MSML_NoInstanceInitializers
4846
}
49-
}
47+
}

src/Microsoft.ML.Tokenizers/SentencepieceModel.cs

Lines changed: 416 additions & 4651 deletions
Large diffs are not rendered by default.

test/Microsoft.ML.Tokenizers.Tests/LlamaTests.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -913,5 +913,11 @@ public void TestPhi3TokenizerIdEncoding(string text, string decodedWithNoSpecial
913913
Assert.Equal(textWithSpecialTokens.Length, charactersWritten);
914914
Assert.Equal(textWithSpecialTokens, destinationBuffer.AsSpan(0, charactersWritten).ToString());
915915
}
916+
917+
[Fact]
918+
public void CreateWithNullStreamThrows()
919+
{
920+
Assert.ThrowsAny<ArgumentException>(() => LlamaTokenizer.Create(null!));
921+
}
916922
}
917923
}

0 commit comments

Comments
 (0)