Skip to content

Commit d93c1ca

Browse files
committed
Move C code for json+struct codec to an example in the docs
1 parent 2f26dc6 commit d93c1ca

6 files changed

Lines changed: 340 additions & 4 deletions

File tree

c/examples/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ TSKIT_SOURCE=../tskit/*.c ../subprojects/kastore/kastore.c
2323
targets = api_structure error_handling \
2424
haploid_wright_fisher streaming \
2525
tree_iteration tree_traversal \
26-
take_ownership
26+
take_ownership \
27+
json_struct_metadata
2728

2829
all: $(targets)
2930

c/examples/json_struct_metadata.c

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <err.h>
4+
#include <string.h>
5+
#include <tskit.h>
6+
7+
// These are properties of the ``json+struct`` codec, documented in tskit.
// The header is 21 bytes: 4 magic bytes (offset 0), 1 version byte (offset 4),
// then two 8-byte little-endian lengths: JSON (offset 5) and binary (offset 13).
8+
#define JSON_STRUCT_HEADER_SIZE 21
9+
10+
// magic bytes expected at the very start of every codec buffer
const uint8_t json_struct_codec_magic[4] = { 'J', 'B', 'L', 'B' };
11+
// the only codec version this example reads and writes
const uint8_t json_struct_codec_version = 1;
12+
13+
// Decode a uint64_t stored in little-endian byte order.
// `p` must point to at least 8 readable bytes; p[0] is the least
// significant byte and p[7] the most significant.
static uint64_t
load_u64_le(const uint8_t *p)
{
    uint64_t result = 0;

    // start at the most significant byte and shift earlier bytes upwards
    for (int i = 7; i >= 0; i--) {
        result = (result << 8) | (uint64_t) p[i];
    }
    return result;
}
27+
28+
// Encode a uint64_t in little-endian byte order.
// `dest` must point to at least 8 writable bytes; dest[0] receives the
// least significant byte of `value` and dest[7] the most significant.
static void
set_u64_le(uint8_t *dest, uint64_t value)
{
    // peel off the low byte each time round, then shift the rest down
    for (int i = 0; i < 8; i++) {
        dest[i] = (uint8_t) (value & 0xFF);
        value >>= 8;
    }
}
41+
42+
// Extract the json and binary payloads from the `json+struct` codec data buffer.
43+
// Note that the output pointers `json` and `binary` reference memory
44+
// inside the `metadata` buffer passed in.
45+
void
46+
json_struct_codec_get_components(uint8_t *metadata, tsk_size_t metadata_length,
47+
uint8_t **json, tsk_size_t *json_length, uint8_t **binary, tsk_size_t *binary_length)
48+
{
49+
// check the structure of the codec header and the sizes it specifies
50+
if (metadata == NULL || json == NULL || json_length == NULL || binary == NULL
51+
|| binary_length == NULL)
52+
errx(EXIT_FAILURE, "bad parameter value.");
53+
if (metadata_length < JSON_STRUCT_HEADER_SIZE)
54+
errx(EXIT_FAILURE, "metadata truncated.");
55+
if (memcmp(metadata, json_struct_codec_magic, sizeof(json_struct_codec_magic)) != 0)
56+
errx(EXIT_FAILURE, "bad magic bytes.");
57+
58+
uint8_t version = metadata[4];
59+
if (version != json_struct_codec_version)
60+
errx(EXIT_FAILURE, "bad version number.");
61+
62+
uint64_t json_length_u64 = load_u64_le(metadata + 5);
63+
uint64_t binary_length_u64 = load_u64_le(metadata + 13);
64+
if (json_length_u64 > UINT64_MAX - (uint64_t) JSON_STRUCT_HEADER_SIZE)
65+
errx(EXIT_FAILURE, "invalid length.");
66+
67+
// determine the number of padding bytes and do more safety checks
68+
uint64_t length = (uint64_t) JSON_STRUCT_HEADER_SIZE + json_length_u64;
69+
uint64_t padding_length = (8 - (length & 0x07)) % 8;
70+
if (padding_length > UINT64_MAX - length)
71+
errx(EXIT_FAILURE, "invalid length.");
72+
73+
length += padding_length;
74+
if (binary_length_u64 > UINT64_MAX - length)
75+
errx(EXIT_FAILURE, "invalid length.");
76+
77+
length += binary_length_u64;
78+
if ((uint64_t) metadata_length != length)
79+
errx(EXIT_FAILURE, "unexpected size.");
80+
81+
uint8_t *padding_start = metadata + JSON_STRUCT_HEADER_SIZE + json_length_u64;
82+
for (uint64_t j = 0; j < padding_length; ++j)
83+
if (*(padding_start + j) != 0)
84+
errx(EXIT_FAILURE, "padding bytes are nonzero.");
85+
86+
// the structure of the codec data seems valid; return components
87+
*json = metadata + JSON_STRUCT_HEADER_SIZE;
88+
*json_length = (tsk_size_t) json_length_u64;
89+
90+
*binary = metadata + JSON_STRUCT_HEADER_SIZE + json_length_u64 + padding_length;
91+
*binary_length = (tsk_size_t) binary_length_u64;
92+
}
93+
94+
// malloc and return a data buffer for the `json+struct` codec
95+
// that contains the given components
96+
void
97+
json_struct_codec_create_buffer(const uint8_t *json, tsk_size_t json_length,
98+
const uint8_t *binary, tsk_size_t binary_length, uint8_t **buffer,
99+
tsk_size_t *buffer_length)
100+
{
101+
// figure out the total length of the codec's data and allocate the buffer for it
102+
tsk_size_t header_length = JSON_STRUCT_HEADER_SIZE;
103+
tsk_size_t padding_length = (8 - ((header_length + json_length) & 0x07)) % 8;
104+
tsk_size_t total_length
105+
= header_length + json_length + padding_length + binary_length;
106+
uint8_t *bytes = malloc(total_length);
107+
if (!bytes)
108+
errx(EXIT_FAILURE, "memory for buffer could not be allocated.");
109+
110+
// then set up the bytes for the codec header
111+
memcpy(bytes, json_struct_codec_magic, 4);
112+
bytes[4] = json_struct_codec_version;
113+
set_u64_le(bytes + 5, (uint64_t) json_length);
114+
set_u64_le(bytes + 13, (uint64_t) binary_length);
115+
116+
// copy in the JSON and binary data, separated by the padding bytes; the goal of the
117+
// padding bytes is to ensure that the binary data is 8-byte-aligned relative to the
118+
// start of the buffer
119+
memcpy(bytes + header_length, json, json_length);
120+
memset(bytes + header_length + json_length, 0, padding_length);
121+
memcpy(bytes + header_length + json_length + padding_length, binary, binary_length);
122+
123+
// return the buffer and its length; the caller takes ownership of the buffer
124+
*buffer = bytes;
125+
*buffer_length = total_length;
126+
}
127+
128+
int
129+
main(int argc, char **argv)
130+
{
131+
// we start with JSON and binary payloads that we encode into a new buffer
132+
// note that the JSON payload does not have to end with a trailing NULL
133+
const char json_payload[] = { '{', '"', 'a', '"', ':', '1', '}' };
134+
const uint8_t binary_payload[] = { 0x01, 0x02, 0x03, 0x04 };
135+
uint8_t *metadata;
136+
tsk_size_t metadata_length;
137+
138+
json_struct_codec_create_buffer((const uint8_t *) json_payload, sizeof(json_payload),
139+
binary_payload, sizeof(binary_payload), &metadata, &metadata_length);
140+
141+
// then we decode that buffer to recover the json and binary data
142+
uint8_t *decoded_json, *decoded_binary;
143+
tsk_size_t decoded_json_length, decoded_binary_length;
144+
145+
json_struct_codec_get_components(metadata, metadata_length, &decoded_json,
146+
&decoded_json_length, &decoded_binary, &decoded_binary_length);
147+
148+
// print the recovered data to demonstrate that the round-trip worked
149+
// note that the JSON data is not NULL-terminated unless you put a NULL there!
150+
printf("JSON: %.*s\n", (int) decoded_json_length, decoded_json);
151+
152+
printf("Binary data:");
153+
for (tsk_size_t j = 0; j < decoded_binary_length; j++)
154+
printf(" %#04x", decoded_binary[j]);
155+
printf("\n");
156+
157+
free(metadata);
158+
return EXIT_SUCCESS;
159+
}

c/meson.build

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,9 @@ if not meson.is_subproject()
125125
executable('multichrom_wright_fisher_singlethreaded',
126126
sources: ['examples/multichrom_wright_fisher_singlethreaded.c'],
127127
link_with: [tskit_lib], dependencies: lib_deps)
128+
executable('json_struct_metadata',
129+
sources: ['examples/json_struct_metadata.c'],
130+
link_with: [tskit_lib], dependencies: lib_deps)
128131

129132
thread_dep = dependency('threads')
130133
executable('multichrom_wright_fisher',

docs/c-api.rst

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,3 +949,60 @@ nodes need to be retained, and use
949949
.. literalinclude:: ../c/examples/multichrom_wright_fisher.c
950950
:language: c
951951

952+
----------------------------
953+
Reading and writing metadata
954+
----------------------------
955+
956+
The C API does not provide any functionality for manipulating
957+
the contents of metadata. For JSON metadata it is easy to
958+
parse metadata using an external JSON library, and for
959+
struct-encoded metadata the values can be directly unpacked.
960+
Examples of both can be found in
961+
`the SLiM code <https://messerlab.github.com/slim/>`_.
962+
963+
The :ref:`"json+struct" <sec_metadata_codecs_jsonstruct>`
964+
metadata codec is a little less straightforward to use,
965+
so we provide here an example of how to write to it
966+
and read from it in C. See :ref:`sec_metadata_codecs_jsonstruct`
967+
for details of how the metadata is encoded.
968+
(In Python, tskit automatically decodes both JSON and binary
969+
metadata and provides it as Python-data-typed metadata,
970+
just as for other codecs.)
971+
972+
The structure of this example is as follows:
973+
974+
1. Values specific to the metadata's header (e.g., the magic bytes ``JBLB``).
975+
2. Functions that encode/decode ``uint64_t``, used to store the lengths
976+
of the two components in the header.
977+
3. A function to "read" the metadata: really, to get pointers to the
978+
json and struct components.
979+
4. A function to "write" the metadata, again just given pointers to
980+
and lengths of the two components.
981+
5. The program itself just round-trips a very simple chunk of metadata,
982+
consisting of the JSON ``{"a":1}`` and some binary ``uint8_t`` bytes (``0x01 0x02 0x03 0x04``).
983+
984+
.. literalinclude:: ../c/examples/json_struct_metadata.c
985+
:language: c
986+
987+
Much of the complexity of the code is careful error checking of the lengths.
988+
989+
Here ``json_struct_codec_get_components`` takes a pointer to binary metadata
990+
and returns pointers *into that same memory*.
991+
A different approach might have copied the two portions of the metadata
992+
into two buffers (to then be decoded, for instance).
993+
However, that would double the memory footprint,
994+
and since this codec is intended for large metadata,
995+
we did not use that approach in this example.
996+
997+
Along the same lines, it is worth noting that this example does make a copy of
998+
the JSON and binary data when writing, in ``json_struct_codec_create_buffer()``,
999+
which doubles the memory footprint at that point, and adds the
1000+
overhead of copying the data. A more efficient approach would be to calculate
1001+
the buffer length needed for the codec’s data, allocate the buffer with that
1002+
length, and then generate the necessary JSON and binary metadata directly into
1003+
that buffer. This would require the metadata-generating code to be more
1004+
closely entwined with the code for handling the json+struct codec header and
1005+
padding bytes, and so we have chosen not to adopt that approach here, for
1006+
pedagogical purposes; but if your use of this codec will involve large
1007+
metadata, such an approach is recommended.
1008+

docs/development.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -832,7 +832,7 @@ To generate and view coverage reports for the C tests locally:
832832
Compile with coverage enabled:
833833
```bash
834834
cd c
835-
meson build -D b_coverage=true
835+
meson setup build -D b_coverage=true
836836
ninja -C build
837837
```
838838

@@ -853,7 +853,7 @@ Lines prefixed with `#####` were never executed, lines with numbers show executi
853853
`lcov` can be used to create browsable HTML coverage reports:
854854
```bash
855855
sudo apt-get install lcov # if needed
856-
lcov --capture --directory build-gcc --output-file coverage.info
856+
lcov --capture --directory build --output-file coverage.info
857857
genhtml coverage.info --output-directory coverage_html
858858
firefox coverage_html/index.html
859859
```

0 commit comments

Comments
 (0)