Skip to content

Commit 2e04999

Browse files
authored
VCF-42: Reduce memory consumption when a single position returns many samples (#867)
* Added external MD5 library * Added a join utility function This function joins a vector of strings using a given delimiter character while optionally omitting empty strings. * Added a SampleHeaders class to abstract the representation of loaded sample headers SampleHeaders is a subclass of the TileDBVCFDataset class since it is intended to be instantiated and loaded exclusively by TileDBVCFDataset. Internally SampleHeaders uses MD5 checksums and maps to only load unique headers and to associate samples with these unique headers, respectively. * Updated relevant code paths to use new SampleHeaders class * Updated VCF store unit tests to use new SampleHeaders class * unique_ptr instances returned by the SampleHeaders class are no longer released This is to prevent memory leaks while making the lifetime of these managed pointers more obvious. * Added a samples_view() method to the SampleHeaders class This allows the names of samples stored in a SampleHeaders instance to be efficiently iterated via a view of the keys of an internal map. * Updated Reader to use samples_view() method of SampleHeaders instance This allows the names of only samples that have headers loaded to be iterated efficiently, both in terms of run-time and memory consumption. * Fixed MD5 stack overflow bug on macOS Specifically, the std::string(const char* s, size_t n) constructor on macOS seems to ignore the n parameter and overflows. This was mitigated by adding a terminating character to s and using the std::string(const char* s) constructor instead.
1 parent f117c5d commit 2e04999

18 files changed

Lines changed: 706 additions & 184 deletions

libtiledbvcf/external/md5/md5.cc

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
#include "md5.h"
2+
#include <cstddef>
3+
4+
namespace md5 {
5+
6+
/*
7+
* Constants defined by the MD5 algorithm
8+
*/
9+
#define A 0x67452301
10+
#define B 0xefcdab89
11+
#define C 0x98badcfe
12+
#define D 0x10325476
13+
14+
/*
 * Left-rotation amounts, indexed by operation number (0-63). Each row below
 * holds the 16 entries consumed by one group of 16 operations in md5_step.
 * The table is read-only lookup data, hence const.
 */
static const uint32_t S[] = {
    7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
    5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20,
    4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
    6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};
19+
20+
/*
 * Additive constants defined by the MD5 algorithm, indexed by operation
 * number (0-63). The table is read-only lookup data, hence const.
 */
static const uint32_t K[] = {
    0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
    0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
    0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
    0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
    0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
    0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
    0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
    0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
    0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
    0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
    0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391};
32+
33+
/*
 * Padding used to make the size (in bits) of the input congruent to 448 mod 512
 *
 * The first byte is 0x80 (a single set bit followed by zeros); the remaining
 * 63 bytes are zero-initialized by the aggregate initializer. 64 bytes is the
 * largest amount of padding md5_finalize ever requests.
 */
static const uint8_t PADDING[64] = {0x80};
43+
44+
/*
 * Bit-manipulation functions defined by the MD5 algorithm
 *
 * Every macro argument is wrapped in parentheses so that passing an
 * expression (e.g. `a | b`) cannot change the meaning of the expansion
 * through operator precedence.
 */
#define F(X, Y, Z) (((X) & (Y)) | (~(X) & (Z)))
#define G(X, Y, Z) (((X) & (Z)) | ((Y) & ~(Z)))
#define H(X, Y, Z) ((X) ^ (Y) ^ (Z))
#define I(X, Y, Z) ((Y) ^ ((X) | ~(Z)))
51+
52+
/*
 * Rotates a 32-bit word left by n bits
 *
 * The rotation count is reduced modulo 32, so n == 0 and n >= 32 are well
 * defined. A plain `x >> (32 - n)` would shift by the full width of the type
 * when n == 0, which is undefined behaviour in C++.
 */
uint32_t rotateLeft(uint32_t x, uint32_t n) {
  n &= 31u;
  return (x << n) | (x >> ((32u - n) & 31u));
}
58+
59+
/*
 * Resets a context so that a new digest can be computed: the byte counter is
 * cleared and the four hash words are loaded with the constants A, B, C, D.
 */
void md5_init(MD5Context* ctx) {
  const uint32_t initial_words[4] = {
      static_cast<uint32_t>(A),
      static_cast<uint32_t>(B),
      static_cast<uint32_t>(C),
      static_cast<uint32_t>(D)};

  // No input has been consumed yet
  ctx->size = static_cast<uint64_t>(0);

  for (unsigned int word = 0; word < 4; ++word) {
    ctx->buffer[word] = initial_words[word];
  }
}
67+
68+
/*
 * Feeds input_len bytes from input_buffer into the context.
 *
 * Bytes are appended to ctx->input starting at the position implied by the
 * running byte count (ctx->size % 64). Each time the 64-byte block fills up
 * it is converted to sixteen 32-bit words and run through md5_step, which
 * updates ctx->buffer. ctx->size is advanced by input_len.
 */
void md5_update(
    MD5Context* ctx, const uint8_t* input_buffer, size_t input_len) {
  uint32_t input[16];
  unsigned int offset = ctx->size % 64;
  ctx->size += (uint64_t)input_len;

  // Copy each byte in input_buffer into the next space in our context input.
  // The index is a size_t so that it covers the full range of input_len; a
  // narrower index would wrap around and never reach a very large input_len.
  for (size_t i = 0; i < input_len; ++i) {
    ctx->input[offset++] = input_buffer[i];

    // If we've filled our context input, copy it into our local array input
    // then reset the offset to 0 and fill in a new buffer.
    // Every time we fill out a chunk, we run it through the algorithm
    // to enable some back and forth between cpu and i/o
    if (offset == 64) {
      for (unsigned int j = 0; j < 16; ++j) {
        // Convert to little-endian
        // The local variable `input` is our 512-bit chunk separated into
        // 32-bit words we can use in calculations
        input[j] = (uint32_t)(ctx->input[(j * 4) + 3]) << 24 |
                   (uint32_t)(ctx->input[(j * 4) + 2]) << 16 |
                   (uint32_t)(ctx->input[(j * 4) + 1]) << 8 |
                   (uint32_t)(ctx->input[(j * 4)]);
      }
      md5_step(ctx->buffer, input);
      offset = 0;
    }
  }
}
97+
98+
/*
 * Completes the hash: pads the pending input, appends the message length in
 * bits, runs the last block through md5_step and stores the 16-byte result in
 * ctx->digest.
 */
void md5_finalize(MD5Context* ctx) {
  // Bytes currently waiting in the partially filled block
  const unsigned int pending = ctx->size % 64;

  // Pad up to byte 56 of this block, or of the next block when byte 56 has
  // already been reached
  unsigned int pad_bytes;
  if (pending < 56) {
    pad_bytes = 56 - pending;
  } else {
    pad_bytes = (56 + 64) - pending;
  }

  // md5_update counts the padding as input, so take it back out of the size
  md5_update(ctx, PADDING, pad_bytes);
  ctx->size -= (uint64_t)pad_bytes;

  // Assemble the final block by hand: 14 little-endian words taken from the
  // context input followed by the low and high halves of the length in bits
  uint32_t block[16];
  for (unsigned int word = 0; word < 14; ++word) {
    const uint8_t* bytes = &ctx->input[word * 4];
    block[word] = (uint32_t)(bytes[3]) << 24 | (uint32_t)(bytes[2]) << 16 |
                  (uint32_t)(bytes[1]) << 8 | (uint32_t)(bytes[0]);
  }
  const uint64_t bit_count = ctx->size * 8;
  block[14] = (uint32_t)(bit_count);
  block[15] = (uint32_t)(bit_count >> 32);

  md5_step(ctx->buffer, block);

  // Write each hash word into the digest, least significant byte first
  for (unsigned int word = 0; word < 4; ++word) {
    const uint32_t value = ctx->buffer[word];
    for (unsigned int byte = 0; byte < 4; ++byte) {
      ctx->digest[(word * 4) + byte] = (uint8_t)(value >> (8 * byte));
    }
  }
}
130+
131+
void md5_step(uint32_t* buffer, uint32_t* input) {
132+
uint32_t AA = buffer[0];
133+
uint32_t BB = buffer[1];
134+
uint32_t CC = buffer[2];
135+
uint32_t DD = buffer[3];
136+
137+
uint32_t E;
138+
139+
unsigned int j;
140+
141+
for (unsigned int i = 0; i < 64; ++i) {
142+
switch (i / 16) {
143+
case 0:
144+
E = F(BB, CC, DD);
145+
j = i;
146+
break;
147+
case 1:
148+
E = G(BB, CC, DD);
149+
j = ((i * 5) + 1) % 16;
150+
break;
151+
case 2:
152+
E = H(BB, CC, DD);
153+
j = ((i * 3) + 5) % 16;
154+
break;
155+
default:
156+
E = I(BB, CC, DD);
157+
j = (i * 7) % 16;
158+
break;
159+
}
160+
161+
uint32_t temp = DD;
162+
DD = CC;
163+
CC = BB;
164+
BB = BB + rotateLeft(AA + E + K[i] + input[j], S[i]);
165+
AA = temp;
166+
}
167+
168+
buffer[0] += AA;
169+
buffer[1] += BB;
170+
buffer[2] += CC;
171+
buffer[3] += DD;
172+
}
173+
174+
void md5_string(const char* input, uint8_t* result) {
175+
MD5Context ctx;
176+
md5_init(&ctx);
177+
md5_update(&ctx, (uint8_t*)input, strlen(input));
178+
md5_finalize(&ctx);
179+
180+
memcpy(result, ctx.digest, 16);
181+
}
182+
183+
void md5_file(FILE* file, uint8_t* result) {
184+
char* input_buffer = (char*)malloc(1024);
185+
size_t input_size = 0;
186+
187+
MD5Context ctx;
188+
md5_init(&ctx);
189+
190+
while ((input_size = fread(input_buffer, 1, 1024, file)) > 0) {
191+
md5_update(&ctx, (uint8_t*)input_buffer, input_size);
192+
}
193+
194+
md5_finalize(&ctx);
195+
196+
free(input_buffer);
197+
198+
memcpy(result, ctx.digest, 16);
199+
}
200+
201+
/*
 * Writes the 16-byte digest in input to output as 32 lowercase hexadecimal
 * characters followed by a terminating '\0'.
 *
 * output must therefore have room for at least 33 bytes.
 */
void md5_to_hex(const uint8_t* input, char* output) {
  static const char hex_digits[] = "0123456789abcdef";

  for (size_t i = 0; i < 16; i++) {
    // High nibble first, then low nibble
    output[2 * i] = hex_digits[input[i] >> 4];
    output[(2 * i) + 1] = hex_digits[input[i] & 0x0F];
  }
  output[32] = '\0';
}

/*
 * Returns the 16-byte digest in input as a 32-character lowercase
 * hexadecimal string.
 */
std::string md5_to_hex(const uint8_t* input) {
  // 32 hex characters plus the terminator written by the overload above
  char output[33];
  md5_to_hex(input, output);
  return std::string(output);
}
214+
215+
} // namespace md5

libtiledbvcf/external/md5/md5.h

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* This file (md5.h) and the corresponding md5.cc file are licensed under the
3+
* Unlicense license and are therefore released into the public domain.
4+
*
5+
* This is an adaptation of the MD5 reference implementation from
6+
* https://github.com/Zunawe/md5-c. It is derived from the RSA Data Security,
7+
* Inc. MD5 Message-Digest Algorithm and modified slightly to be functionally
8+
* identical but condensed into control structures.
9+
*/
10+
11+
#ifndef TILEDB_EXTERNAL_MD5_H
12+
#define TILEDB_EXTERNAL_MD5_H
13+
14+
#include <stdint.h>
15+
#include <stdio.h>
16+
#include <stdlib.h>
17+
#include <string.h>
18+
#include <string>
19+
20+
namespace md5 {
21+
22+
/*
 * State carried between md5_init, md5_update and md5_finalize.
 *
 * Declared as a named struct (rather than a typedef of an anonymous struct)
 * so that the type can be forward-declared.
 */
struct MD5Context {
  uint64_t size;       // Size of input in bytes
  uint32_t buffer[4];  // Current accumulation of hash
  uint8_t input[64];   // Input to be used in the next step
  uint8_t digest[16];  // Result of algorithm
};
28+
29+
/*
30+
* Initialize an md5 context
31+
*/
32+
void md5_init(MD5Context* ctx);
33+
34+
/*
35+
* Add some amount of input to the context
36+
*
37+
* If the input fills out a block of 512 bits, apply the algorithm (md5_step)
38+
* and save the result in the buffer. Also updates the overall size.
39+
*/
40+
void md5_update(MD5Context* ctx, const uint8_t* input, size_t input_len);
41+
42+
/*
43+
* Pad the current input to 448 bits modulo 512, append the size in bits to the
44+
* very end, and save the result of the final iteration into digest.
45+
*/
46+
void md5_finalize(MD5Context* ctx);
47+
48+
/*
49+
* Step on 512 bits of input with the main MD5 algorithm.
50+
*/
51+
void md5_step(uint32_t* buffer, uint32_t* input);
52+
53+
/*
54+
* Runs the algorithm on the provided NUL-terminated string and puts the digest
55+
* into result. result should be able to store 16 bytes.
56+
*/
57+
void md5_string(const char* input, uint8_t* result);
58+
59+
/*
60+
* Functions that run the algorithm on the contents of the provided file and
61+
* put the digest into result. result should be able to store 16 bytes.
62+
*/
63+
void md5_file(FILE* file, uint8_t* result);
64+
65+
/*
66+
* Converts an md5 array into a C string.
67+
*/
68+
void md5_to_hex(const uint8_t* input, char* output);
69+
70+
/*
71+
* Converts an md5 array into a string.
72+
*/
73+
std::string md5_to_hex(const uint8_t* input);
74+
75+
} // namespace md5
76+
77+
#endif

libtiledbvcf/src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ set(TILEDB_VCF_SOURCES
9090

9191
set(TILEDB_VCF_EXTERNAL_SOURCES
9292
${CMAKE_CURRENT_SOURCE_DIR}/../external/base64/base64.cc
93+
${CMAKE_CURRENT_SOURCE_DIR}/../external/md5/md5.cc
9394
)
9495

9596
add_library(TILEDB_VCF_OBJECTS OBJECT

0 commit comments

Comments
 (0)