-
Notifications
You must be signed in to change notification settings - Fork 72
Expand file tree
/
Copy pathCosmosDataSinkExtension.cs
More file actions
304 lines (271 loc) · 15.8 KB
/
CosmosDataSinkExtension.cs
File metadata and controls
304 lines (271 loc) · 15.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
using System.ComponentModel.Composition;
using System.Diagnostics;
using System.Dynamic;
using System.Net;
using System.Text;
using Cosmos.DataTransfer.Interfaces;
using Microsoft.Azure.Cosmos;
using Microsoft.Azure.Cosmos.Encryption;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using Newtonsoft.Json;
using Polly;
using Polly.Retry;
namespace Cosmos.DataTransfer.CosmosExtension
{
[Export(typeof(IDataSinkExtension))]
public class CosmosDataSinkExtension : IDataSinkExtensionWithSettings
{
/// <summary>
/// Name identifying this sink extension. <c>WriteAsync</c> also forwards it to
/// <c>CosmosExtensionServices.CreateClient</c> when building the Cosmos client.
/// </summary>
public string DisplayName => "Cosmos-nosql";
/// <summary>
/// Resolves the database and container named in <paramref name="settings"/>, creating them when they do not exist.
/// </summary>
/// <param name="client">The <see cref="CosmosClient"/> used to reach the account.</param>
/// <param name="settings">The <see cref="CosmosSinkSettings"/> describing the database, container, partition key and throughput options.</param>
/// <param name="logger">Receives informational and warning messages produced while provisioning.</param>
/// <param name="cancellationToken">Token forwarded to every Cosmos SDK call made here.</param>
/// <returns>The <see cref="Container"/> that was created or already existed.</returns>
/// <remarks>
/// <list type="bullet">
/// <item>With <c>UseSharedThroughput</c>: reads the database throughput and replaces it when it differs from
/// <c>CreatedContainerMaxThroughput</c>; if the database is missing (404) it is created with autoscale (default 4000)
/// or manual (default 400) throughput depending on <c>UseAutoscaleForDatabase</c>.</item>
/// <item>Without shared throughput: the database is created if missing, with no throughput specified.</item>
/// <item>With <c>RecreateContainer</c>: the container is deleted first on a best-effort basis. A missing container is ignored,
/// other Cosmos failures are logged as warnings, and non-Cosmos exceptions (including cancellation) propagate.</item>
/// <item>Container throughput is omitted for serverless accounts and shared-throughput databases. If the service still rejects
/// the request as unsupported for serverless accounts, creation is retried without throughput and a warning is logged.</item>
/// </list>
/// </remarks>
/// <exception cref="CosmosException">
/// Propagated from the Cosmos SDK for any failure not handled above.
/// </exception>
/// <exception cref="InvalidOperationException">
/// Thrown when <c>UseSharedThroughput</c> is set but reading the existing database's throughput yields no value.
/// </exception>
/// <exception cref="ArgumentNullException">
/// May surface from the Cosmos SDK calls when the database or container name is missing; this method performs no explicit argument checks itself.
/// </exception>
private static async Task<Container> CreateDatabaseAndContainerAsync(
    CosmosClient client,
    CosmosSinkSettings settings,
    ILogger logger,
    CancellationToken cancellationToken)
{
    Database database;
    if (settings.UseSharedThroughput)
    {
        try
        {
            database = client.GetDatabase(settings.Database);
            var throughputResponse = await database.ReadThroughputAsync(cancellationToken);
            if (throughputResponse is null)
            {
                // Previously this surfaced as an opaque "Nullable object must have a value" from .Value below.
                throw new InvalidOperationException(
                    $"Database '{settings.Database}' did not report a database-level throughput, but UseSharedThroughput is enabled.");
            }

            var currentThroughput = throughputResponse.Value;
            var desiredThroughput = settings.CreatedContainerMaxThroughput;
            if (desiredThroughput.HasValue && currentThroughput != desiredThroughput)
            {
                var replacement = settings.UseAutoscaleForDatabase
                    ? ThroughputProperties.CreateAutoscaleThroughput(desiredThroughput.Value)
                    : ThroughputProperties.CreateManualThroughput(desiredThroughput.Value);
                await database.ReplaceThroughputAsync(replacement, cancellationToken: cancellationToken);
            }
        }
        catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.NotFound)
        {
            // The database does not exist yet: create it with the requested (or default) shared throughput.
            var newThroughputProperties = settings.UseAutoscaleForDatabase
                ? ThroughputProperties.CreateAutoscaleThroughput(settings.CreatedContainerMaxThroughput ?? 4000)
                : ThroughputProperties.CreateManualThroughput(settings.CreatedContainerMaxThroughput ?? 400);
            database = await client.CreateDatabaseIfNotExistsAsync(
                settings.Database,
                newThroughputProperties,
                cancellationToken: cancellationToken);
        }
    }
    else
    {
        database = await client.CreateDatabaseIfNotExistsAsync(settings.Database, cancellationToken: cancellationToken);
    }

    if (settings.RecreateContainer)
    {
        try
        {
            await database.GetContainer(settings.Container).DeleteContainerAsync(cancellationToken: cancellationToken);
        }
        catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.NotFound)
        {
            // Nothing to delete; the container is created below.
        }
        catch (CosmosException ex)
        {
            // Deletion stays best-effort, but the failure is no longer silent.
            logger.LogWarning(ex, "Unable to delete container {ContainerName} before recreating it; continuing with the existing container.", settings.Container);
        }
    }

    var containerProperties = new ContainerProperties
    {
        Id = settings.Container,
        PartitionKeyDefinitionVersion = PartitionKeyDefinitionVersion.V2,
    };

    if (settings.PartitionKeyPaths != null)
    {
        logger.LogInformation("Using partition key paths: {PartitionKeyPaths}", string.Join(", ", settings.PartitionKeyPaths));
        containerProperties.PartitionKeyPaths = settings.PartitionKeyPaths;
    }
    else
    {
        containerProperties.PartitionKeyPath = settings.PartitionKeyPath;
    }

    // No dedicated container throughput for serverless accounts or when the database shares its throughput.
    ThroughputProperties? throughputProperties = settings.IsServerlessAccount || settings.UseSharedThroughput
        ? null
        : settings.UseAutoscaleForCreatedContainer
            ? ThroughputProperties.CreateAutoscaleThroughput(settings.CreatedContainerMaxThroughput ?? 4000)
            : ThroughputProperties.CreateManualThroughput(settings.CreatedContainerMaxThroughput ?? 400);

    try
    {
        return await database.CreateContainerIfNotExistsAsync(containerProperties, throughputProperties, cancellationToken: cancellationToken);
    }
    catch (CosmosException ex) when (ex.ResponseBody?.Contains("not supported for serverless accounts", StringComparison.OrdinalIgnoreCase) == true)
    {
        logger.LogWarning("Cosmos Serverless Account does not support throughput options. Creating Container {ContainerName} without those settings.", settings.Container);
        return await database.CreateContainerIfNotExistsAsync(containerProperties, cancellationToken: cancellationToken);
    }
}
/// <summary>
/// Converts every item from <paramref name="dataItems"/> into an <see cref="ExpandoObject"/> and writes it,
/// batch by batch, to the Cosmos DB container described by the sink settings bound from <paramref name="config"/>.
/// </summary>
/// <param name="dataItems">Asynchronous stream of items to write.</param>
/// <param name="config">Configuration bound to <see cref="CosmosSinkSettings"/> and validated before use.</param>
/// <param name="dataSource">Source extension; only its <c>DisplayName</c> is read here and passed to client creation.</param>
/// <param name="logger">Receives progress, warning and summary messages.</param>
/// <param name="cancellationToken">Token observed while enumerating batches and forwarded to Cosmos calls.</param>
/// <exception cref="Exception">
/// Thrown after all batches have been processed when fewer records were written successfully than were submitted.
/// </exception>
public async Task WriteAsync(IAsyncEnumerable<IDataItem> dataItems, IConfiguration config, IDataSourceExtension dataSource, ILogger logger, CancellationToken cancellationToken = default)
{
    const int progressInterval = 500;

    var settings = config.Get<CosmosSinkSettings>();
    settings.Validate();

    // NOTE(review): the client is never disposed in this method. Confirm whether CreateClient hands over
    // ownership (in which case it should be disposed once writing finishes) or returns a shared instance.
    var client = CosmosExtensionServices.CreateClient(settings!, DisplayName, dataSource.DisplayName);

    Container container;
    if (settings!.UseRbacAuth)
    {
        // With RBAC auth the database/container are not created here; only a reference is obtained.
        var cosmosContainer = client.GetContainer(settings.Database, settings.Container);
        container = settings.InitClientEncryption
            ? await cosmosContainer.InitializeEncryptionAsync(cancellationToken)
            : cosmosContainer;
    }
    else
    {
        container = await CreateDatabaseAndContainerAsync(client, settings, logger, cancellationToken);
    }

    await CosmosExtensionServices.VerifyContainerAccess(container, settings.Container, logger, cancellationToken);

    int addedCount = 0;
    int inputCount = 0;
    int nextProgressThreshold = progressInterval;
    var timer = Stopwatch.StartNew();

    double ElapsedSeconds() => timer.ElapsedMilliseconds / 1000.0;

    // Guards the division so that a zero elapsed time does not turn NaN/Infinity into an arbitrary int.
    static int RatePerSecond(int count, double seconds) => seconds > 0 ? (int)(count / seconds) : 0;

    void ReportCount(int added)
    {
        addedCount += added;

        // Counts grow a whole batch at a time, so report whenever an interval boundary is crossed
        // instead of only when the running total lands exactly on a multiple of the interval.
        if (addedCount >= nextProgressThreshold)
        {
            double seconds = ElapsedSeconds();
            logger.LogInformation("{AddedCount} records added after {TotalSeconds}s ({AddRate} records/s)", addedCount, $"{seconds:F2}", $"{RatePerSecond(addedCount, seconds)}");
            nextProgressThreshold = (addedCount / progressInterval + 1) * progressInterval;
        }
    }

    var convertedObjects = dataItems
        .Select(di => di.BuildDynamicObjectTree(requireStringId: true, ignoreNullValues: settings.IgnoreNullValues, preserveMixedCaseIds: settings.PreserveMixedCaseIds, transformations: settings.Transformations))
        .Where(o => o != null)
        .OfType<ExpandoObject>();
    var batches = convertedObjects.Buffer(settings.BatchSize);
    var retry = GetRetryPolicy(settings.MaxRetryCount, settings.InitialRetryDurationMs);

    // Loop-invariant: the single path is preferred, otherwise the first of the configured paths.
    var partitionKeyPath = settings.PartitionKeyPath ?? settings.PartitionKeyPaths?.FirstOrDefault();

    await foreach (var batch in batches.WithCancellation(cancellationToken))
    {
        // All items of a batch are written concurrently; the next batch starts once they have all finished.
        var addTasks = batch.Select(item => AddItemAsync(container, item, partitionKeyPath, settings.WriteMode, retry, logger, cancellationToken)).ToList();
        var results = await Task.WhenAll(addTasks);
        ReportCount(results.Sum(i => i.ItemCount));
        inputCount += results.Length;
    }

    double totalSeconds = ElapsedSeconds();
    if (addedCount != inputCount)
    {
        logger.LogWarning("Added {AddedCount} of {TotalCount} total records in {TotalSeconds}s", addedCount, inputCount, $"{totalSeconds:F2}");
        throw new Exception($"Only {addedCount} of {inputCount} records were added to Cosmos");
    }

    logger.LogInformation("Added {AddedCount} total records in {TotalSeconds}s ({AddRate} records/s)", addedCount, $"{totalSeconds:F2}", $"{RatePerSecond(addedCount, totalSeconds)}");
}
/// <summary>
/// Builds the Polly policy that retries Cosmos calls rejected with HTTP 429 (Too Many Requests).
/// </summary>
/// <param name="maxRetryCount">Maximum number of retries per operation.</param>
/// <param name="initialRetryDuration">
/// Base delay in milliseconds. Half of it is doubled on every attempt (2^attempt * duration/2),
/// plus a random jitter in [0, duration/2).
/// </param>
/// <returns>The configured <see cref="AsyncRetryPolicy"/>.</returns>
private static AsyncRetryPolicy GetRetryPolicy(int maxRetryCount, int initialRetryDuration)
{
    int retryDelayBaseMs = initialRetryDuration / 2;

    // The delay callback can run concurrently for the items of a batch. A private Random instance is not
    // thread-safe, so the shared thread-safe generator is used for the jitter instead.
    return Policy
        .Handle<CosmosException>(c => c.StatusCode == HttpStatusCode.TooManyRequests)
        .WaitAndRetryAsync(
            maxRetryCount,
            retryAttempt => TimeSpan.FromMilliseconds(
                Math.Pow(2, retryAttempt) * retryDelayBaseMs + Random.Shared.Next(0, retryDelayBaseMs)));
}
/// <summary>
/// Writes a single item through <paramref name="retryPolicy"/> and converts the outcome into an <see cref="ItemResult"/>.
/// </summary>
/// <param name="container">Target container.</param>
/// <param name="item">Item to write; its <c>id</c> property is read for logging and for delete operations.</param>
/// <param name="partitionKeyPath">Partition key path handed to <c>PopulateItem</c>.</param>
/// <param name="mode">Write mode to apply.</param>
/// <param name="retryPolicy">Policy wrapping the Cosmos call.</param>
/// <param name="logger">Receives trace and warning messages.</param>
/// <param name="cancellationToken">Token forwarded to the Cosmos call.</param>
/// <returns>
/// The result of the operation. When the operation throws, the exception is logged and a result with
/// <see cref="HttpStatusCode.InternalServerError"/> is returned, so one bad record does not abort its whole batch.
/// Cancellation is not converted and still propagates as an exception.
/// </returns>
private static async Task<ItemResult> AddItemAsync(Container container, ExpandoObject item, string? partitionKeyPath, DataWriteMode mode, AsyncRetryPolicy retryPolicy, ILogger logger, CancellationToken cancellationToken)
{
    string? id = GetPropertyValue(item, "id");
    logger.LogTrace("Adding item {Id}", id);

    ItemResult result;
    try
    {
        result = await retryPolicy.ExecuteAsync(() => PopulateItem(container, item, partitionKeyPath, mode, id, cancellationToken));
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        // The former ContinueWith callback read t.Result before checking t.IsFaulted, which rethrew
        // the failure and made this logging/fallback path unreachable.
        logger.LogWarning(ex, "Error adding record: {ErrorMessage}", ex.Message);
        return new ItemResult(null, mode, HttpStatusCode.InternalServerError);
    }

    if (!result.IsSuccess)
    {
        logger.LogWarning("Error adding record {Id}: {ErrorMessage}", result.Id, result.StatusCode);
    }

    return result;
}
/// <summary>
/// Executes the Cosmos operation selected by <paramref name="mode"/> for one item and reports its status code.
/// </summary>
/// <param name="container">Target container.</param>
/// <param name="item">Item to insert/upsert, or whose partition key value identifies the item to delete.</param>
/// <param name="partitionKeyPath">
/// Path of the partition key property (leading '/' is trimmed). Required for the stream modes and both delete modes.
/// </param>
/// <param name="mode">Write mode to apply.</param>
/// <param name="itemId">Id of the item; used as the delete target and echoed in the result.</param>
/// <param name="cancellationToken">Token forwarded to the Cosmos call.</param>
/// <returns>An <see cref="ItemResult"/> carrying <paramref name="itemId"/>, <paramref name="mode"/> and the response status.</returns>
/// <exception cref="ArgumentNullException">A mode that needs the partition key was used with a null <paramref name="partitionKeyPath"/>.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="mode"/> is not one of the handled values.</exception>
private static async Task<ItemResult> PopulateItem(Container container, ExpandoObject item, string? partitionKeyPath, DataWriteMode mode, string? itemId, CancellationToken cancellationToken)
{
    HttpStatusCode statusCode;
    switch (mode)
    {
        case DataWriteMode.InsertStream:
        {
            var partitionKey = GetPartitionKey(item, partitionKeyPath);
            using var payload = CreateItemStream(item);
            using var response = await container.CreateItemStreamAsync(payload, partitionKey, cancellationToken: cancellationToken);
            statusCode = response.StatusCode;
            break;
        }
        case DataWriteMode.Insert:
        {
            var response = await container.CreateItemAsync(item, cancellationToken: cancellationToken);
            statusCode = response.StatusCode;
            break;
        }
        case DataWriteMode.UpsertStream:
        {
            var partitionKey = GetPartitionKey(item, partitionKeyPath);
            using var payload = CreateItemStream(item);
            using var response = await container.UpsertItemStreamAsync(payload, partitionKey, cancellationToken: cancellationToken);
            statusCode = response.StatusCode;
            break;
        }
        case DataWriteMode.Upsert:
        {
            var response = await container.UpsertItemAsync(item, cancellationToken: cancellationToken);
            statusCode = response.StatusCode;
            break;
        }
        case DataWriteMode.DeleteStream:
        {
            var partitionKey = GetPartitionKey(item, partitionKeyPath);
            using var response = await container.DeleteItemStreamAsync(itemId, partitionKey, cancellationToken: cancellationToken);
            statusCode = response.StatusCode;
            break;
        }
        case DataWriteMode.Delete:
        {
            var partitionKey = GetPartitionKey(item, partitionKeyPath);
            try
            {
                var response = await container.DeleteItemAsync<ExpandoObject>(itemId, partitionKey, cancellationToken: cancellationToken);
                statusCode = response.StatusCode;
            }
            catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.NotFound)
            {
                // The typed API throws on 404 whereas the stream API returns it. ItemResult.IsSuccess treats
                // NotFound as success for both delete modes, so report the status instead of failing.
                statusCode = HttpStatusCode.NotFound;
            }
            break;
        }
        default:
            throw new ArgumentOutOfRangeException(nameof(mode), $"Invalid data write mode specified: {mode}");
    }

    return new ItemResult(itemId, mode, statusCode);
}

/// <summary>
/// Builds the <see cref="PartitionKey"/> for <paramref name="item"/> from the property named by <paramref name="partitionKeyPath"/>.
/// </summary>
private static PartitionKey GetPartitionKey(ExpandoObject item, string? partitionKeyPath)
{
    ArgumentNullException.ThrowIfNull(partitionKeyPath, nameof(partitionKeyPath));
    return new PartitionKey(GetPropertyValue(item, partitionKeyPath.TrimStart('/')));
}
/// <summary>
/// Serializes <paramref name="item"/> to JSON with the settings from <c>RawJsonCosmosSerializer.GetDefaultSettings()</c>
/// and returns the UTF-8 bytes wrapped in a <see cref="MemoryStream"/>.
/// </summary>
private static MemoryStream CreateItemStream(ExpandoObject item)
{
    var serializerSettings = RawJsonCosmosSerializer.GetDefaultSettings();
    string serialized = JsonConvert.SerializeObject(item, serializerSettings);
    byte[] utf8Bytes = Encoding.UTF8.GetBytes(serialized);
    return new MemoryStream(utf8Bytes);
}
/// <summary>
/// Looks up <paramref name="propertyName"/> on <paramref name="item"/> and returns its string form,
/// or <c>null</c> when the stored value is <c>null</c>. The indexer throws when the property is absent.
/// </summary>
private static string? GetPropertyValue(ExpandoObject item, string propertyName)
{
    IDictionary<string, object?> properties = item;
    object? rawValue = properties[propertyName];
    return rawValue?.ToString();
}
/// <summary>
/// Lazily yields a single default-constructed <see cref="CosmosSinkSettings"/> instance.
/// </summary>
/// <returns>A sequence containing one new <see cref="CosmosSinkSettings"/> per enumeration.</returns>
public IEnumerable<IDataExtensionSettings> GetSettings()
{
yield return new CosmosSinkSettings();
}
/// <summary>
/// Outcome of a single write operation against Cosmos DB.
/// </summary>
/// <param name="Id">Id of the affected item, when known.</param>
/// <param name="DataWriteMode">Write mode that was applied.</param>
/// <param name="StatusCode">HTTP status reported for the operation.</param>
public record ItemResult(string? Id, DataWriteMode DataWriteMode, HttpStatusCode StatusCode)
{
    /// <summary>
    /// True for OK/Created in any mode, and for NoContent/NotFound when the mode is Delete or DeleteStream.
    /// </summary>
    public bool IsSuccess => StatusCode switch
    {
        HttpStatusCode.OK or HttpStatusCode.Created => true,
        HttpStatusCode.NoContent or HttpStatusCode.NotFound => DataWriteMode is DataWriteMode.Delete or DataWriteMode.DeleteStream,
        _ => false,
    };

    /// <summary>Number of items this result contributes to the success count: 1 on success, otherwise 0.</summary>
    public int ItemCount => IsSuccess ? 1 : 0;
}
}
}