Skip to content

Commit fe54fb9

Browse files
committed
Support deleted rows in SAS data files
1 parent 52ad5a2 commit fe54fb9

3 files changed

Lines changed: 95 additions & 15 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ test_readstat
3131
*.log
3232
*.trs
3333
test_csv_to_dta*
34+
.vs/
3435
.vscode/
3536
*.swp
3637
dev/

src/sas/readstat_sas.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,18 +95,20 @@ typedef struct sas_text_ref_s {
9595
#define SAS_PAGE_TYPE_AMD 0x0400
9696
#define SAS_PAGE_TYPE_MASK 0x0F00
9797

98-
#define SAS_PAGE_TYPE_META2 0x4000
99-
#define SAS_PAGE_TYPE_COMP 0x9000
98+
#define SAS_PAGE_TYPE_DELETED 0x0080
99+
#define SAS_PAGE_TYPE_META2 0x4000
100+
#define SAS_PAGE_TYPE_COMP 0x9000
100101

101102
#define SAS_SUBHEADER_POINTER_SIZE_32BIT 12
102103
#define SAS_SUBHEADER_POINTER_SIZE_64BIT 24
103104

104105
#define SAS_PAGE_HEADER_SIZE_32BIT 24
105106
#define SAS_PAGE_HEADER_SIZE_64BIT 40
106107

107-
#define SAS_COMPRESSION_NONE 0x00
108-
#define SAS_COMPRESSION_TRUNC 0x01
109-
#define SAS_COMPRESSION_ROW 0x04
108+
#define SAS_COMPRESSION_NONE 0x00
109+
#define SAS_COMPRESSION_TRUNC 0x01
110+
#define SAS_COMPRESSION_ROW 0x04
111+
#define SAS_COMPRESSION_ROW_DELETED 0x05
110112

111113
#define SAS_COMPRESSION_SIGNATURE_RLE "SASYZCRL"
112114
#define SAS_COMPRESSION_SIGNATURE_RDC "SASYZCR2"

src/sas/readstat_sas7bdat_read.c

Lines changed: 87 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,10 @@ typedef struct sas7bdat_ctx_s {
4646
uint32_t row_length;
4747
uint32_t page_row_count;
4848
uint32_t parsed_row_count;
49+
uint32_t parsed_deleted_row_count;
4950
uint32_t column_count;
5051
uint32_t row_limit;
52+
uint32_t deleted_row_limit;
5153
uint32_t row_offset;
5254

5355
uint64_t header_size;
@@ -232,7 +234,7 @@ static readstat_error_t sas7bdat_parse_column_size_subheader(const char *subhead
232234
static readstat_error_t sas7bdat_parse_row_size_subheader(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
233235
readstat_error_t retval = READSTAT_OK;
234236
uint64_t total_row_count;
235-
uint64_t row_length, page_row_count;
237+
uint64_t row_length, deleted_row_limit, page_row_count;
236238

237239
if (len < (ctx->u64 ? 250: 190)) {
238240
retval = READSTAT_ERROR_PARSE;
@@ -242,13 +244,21 @@ static readstat_error_t sas7bdat_parse_row_size_subheader(const char *subheader,
242244
if (ctx->u64) {
243245
row_length = sas_read8(&subheader[40], ctx->bswap);
244246
total_row_count = sas_read8(&subheader[48], ctx->bswap);
247+
deleted_row_limit = sas_read8(&subheader[56], ctx->bswap);
245248
page_row_count = sas_read8(&subheader[120], ctx->bswap);
246249
} else {
247250
row_length = sas_read4(&subheader[20], ctx->bswap);
248251
total_row_count = sas_read4(&subheader[24], ctx->bswap);
252+
deleted_row_limit = sas_read4(&subheader[28], ctx->bswap);
249253
page_row_count = sas_read4(&subheader[60], ctx->bswap);
250254
}
251255

256+
if (deleted_row_limit > total_row_count) {
257+
retval = READSTAT_ERROR_PARSE;
258+
goto cleanup;
259+
}
260+
ctx->deleted_row_limit = deleted_row_limit;
261+
252262
sas_text_ref_t file_label_ref = sas7bdat_parse_text_ref(&subheader[len-130], ctx);
253263
if (file_label_ref.length) {
254264
if ((retval = sas7bdat_copy_text_ref(ctx->file_label, sizeof(ctx->file_label),
@@ -390,6 +400,19 @@ static readstat_error_t sas7bdat_parse_column_format_subheader(const char *subhe
390400
return retval;
391401
}
392402

403+
/*
 * Record that one more row flagged as deleted has been encountered.
 *
 * Both the overall parsed-row counter and the deleted-row counter are
 * advanced, so deleted rows still count toward the total number of rows
 * consumed from the file.
 *
 * Returns READSTAT_ERROR_PARSE (leaving the counters untouched) when the
 * number of deleted rows already seen has reached ctx->deleted_row_limit;
 * READSTAT_OK otherwise.
 */
static readstat_error_t sas7bdat_register_deleted_row(sas7bdat_ctx_t* ctx) {
    const int below_limit = ctx->parsed_deleted_row_count < ctx->deleted_row_limit;

    if (below_limit) {
        ctx->parsed_row_count += 1;
        ctx->parsed_deleted_row_count += 1;
        return READSTAT_OK;
    }

    return READSTAT_ERROR_PARSE;
}
411+
412+
/*
 * Return the number of rows parsed so far, not counting the ones that were
 * registered as deleted (parsed_row_count minus parsed_deleted_row_count).
 */
static uint32_t sas7bdat_get_current_row_id(sas7bdat_ctx_t* ctx) {
    const uint32_t rows_seen = ctx->parsed_row_count;
    const uint32_t rows_deleted = ctx->parsed_deleted_row_count;

    return rows_seen - rows_deleted;
}
415+
393416
static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable,
394417
col_info_t *col_info, const char *col_data, sas7bdat_ctx_t *ctx) {
395418
readstat_error_t retval = READSTAT_OK;
@@ -406,7 +429,7 @@ static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable
406429
if (ctx->handle.error) {
407430
snprintf(ctx->error_buf, sizeof(ctx->error_buf),
408431
"ReadStat: Error converting string (row=%u, col=%u) to specified encoding: %.*s",
409-
ctx->parsed_row_count+1, col_info->index+1, col_info->width, col_data);
432+
sas7bdat_get_current_row_id(ctx)+1, col_info->index+1, col_info->width, col_data);
410433
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
411434
}
412435
goto cleanup;
@@ -438,7 +461,7 @@ static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable
438461
value.v.double_value = dval;
439462
}
440463
}
441-
cb_retval = ctx->handle.value(ctx->parsed_row_count, variable, value, ctx->user_ctx);
464+
cb_retval = ctx->handle.value(sas7bdat_get_current_row_id(ctx), variable, value, ctx->user_ctx);
442465

443466
if (cb_retval != READSTAT_HANDLER_OK)
444467
retval = READSTAT_ERROR_USER_ABORT;
@@ -487,7 +510,14 @@ static readstat_error_t sas7bdat_parse_single_row(const char *data, sas7bdat_ctx
487510
return retval;
488511
}
489512

490-
static readstat_error_t sas7bdat_parse_rows(const char *data, size_t len, sas7bdat_ctx_t *ctx) {
513+
/*
 * Test a single bit of a packed bitmap.
 *
 * Bits are numbered most-significant-first inside each byte: index 0 is
 * bit 0x80 of bitmap[0], index 7 is bit 0x01 of bitmap[0], index 8 is
 * bit 0x80 of bitmap[1], and so on.
 *
 * bitmap  packed bit array; the caller guarantees that byte index / 8 is
 *         readable.
 * index   zero-based bit number.
 *
 * Returns the masked byte (nonzero) when the bit is set and 0 when it is
 * clear. A negative index is rejected and reported as "not set" instead of
 * producing a negative byte offset and an out-of-range shift.
 */
static uint8_t sas7bdat_read_bitmap(const uint8_t* bitmap, int index) {
    if (index < 0) {
        return 0;
    }

    size_t bit = (size_t)index;
    /* Shift an 8-bit pattern right rather than shifting a signed int left. */
    uint8_t mask = (uint8_t)(0x80u >> (bit % 8));

    return (uint8_t)(bitmap[bit / 8] & mask);
}
519+
520+
static readstat_error_t sas7bdat_parse_rows(const char* data, size_t len, const uint8_t* deleted_bitmap, sas7bdat_ctx_t* ctx) {
491521
readstat_error_t retval = READSTAT_OK;
492522
int i;
493523
size_t row_offset=0;
@@ -496,8 +526,13 @@ static readstat_error_t sas7bdat_parse_rows(const char *data, size_t len, sas7bd
496526
retval = READSTAT_ERROR_ROW_WIDTH_MISMATCH;
497527
goto cleanup;
498528
}
499-
if ((retval = sas7bdat_parse_single_row(&data[row_offset], ctx)) != READSTAT_OK)
529+
if (deleted_bitmap != NULL && sas7bdat_read_bitmap(deleted_bitmap, i)) {
530+
if ((retval = sas7bdat_register_deleted_row(ctx)) != READSTAT_OK) {
531+
goto cleanup;
532+
}
533+
} else if ((retval = sas7bdat_parse_single_row(&data[row_offset], ctx)) != READSTAT_OK) {
500534
goto cleanup;
535+
}
501536

502537
row_offset += ctx->row_length;
503538
}
@@ -608,7 +643,7 @@ static readstat_error_t sas7bdat_parse_subheader_rle(const char *subheader, size
608643
if (ctx->handle.error) {
609644
snprintf(ctx->error_buf, sizeof(ctx->error_buf),
610645
"ReadStat: Row #%d decompressed to %ld bytes (expected %d bytes)",
611-
ctx->parsed_row_count, (long)(bytes_decompressed), ctx->row_length);
646+
sas7bdat_get_current_row_id(ctx), (long)(bytes_decompressed), ctx->row_length);
612647
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
613648
}
614649
goto cleanup;
@@ -735,7 +770,7 @@ static readstat_error_t sas7bdat_submit_columns(sas7bdat_ctx_t *ctx, int compres
735770
readstat_error_t retval = READSTAT_OK;
736771
if (ctx->handle.metadata) {
737772
readstat_metadata_t metadata = {
738-
.row_count = ctx->row_limit,
773+
.row_count = ctx->row_limit - ctx->deleted_row_limit,
739774
.var_count = ctx->column_count,
740775
.table_name = ctx->table_name,
741776
.file_label = ctx->file_label,
@@ -895,7 +930,7 @@ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_
895930
goto cleanup;
896931
}
897932
}
898-
} else if (shp_info.compression == SAS_COMPRESSION_ROW) {
933+
} else if (shp_info.compression == SAS_COMPRESSION_ROW || shp_info.compression == SAS_COMPRESSION_ROW_DELETED) {
899934
/* void */
900935
} else {
901936
retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
@@ -911,6 +946,26 @@ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_
911946
return retval;
912947
}
913948

949+
/*
 * Locate the deleted-row bitmap of a data page.
 *
 * The bitmap is taken to start at
 *     data + row_count * ctx->row_length + page_unused_bytes
 * where row_count is the smaller of ctx->page_row_count and ctx->row_limit,
 * and page_unused_bytes is read from the page header (offset 24 as a 64-bit
 * value for u64 files, offset 12 as a 32-bit value otherwise). The bitmap
 * needs one bit per row, rounded up to whole bytes.
 *
 * page                start of the page buffer.
 * data                start of the row data; must point inside the page.
 * page_size           size of the page buffer in bytes.
 * deleted_row_bitmap  out: set to the first byte of the bitmap on success,
 *                     left untouched on failure.
 * ctx                 parser context (u64, bswap, page_row_count, row_limit
 *                     and row_length are read).
 *
 * Returns READSTAT_OK when the whole bitmap lies inside the page, otherwise
 * READSTAT_ERROR_PARSE.
 *
 * All sizes come from the file and are untrusted, so every step is checked
 * against the bytes still available instead of summing first: the sum could
 * wrap around and slip past a single "> page_size" comparison.
 */
static readstat_error_t sas7bdat_parse_deleted_row_bitmap(const char* page, const char* data,
        size_t page_size, const uint8_t** deleted_row_bitmap, sas7bdat_ctx_t* ctx) {
    /* The header field read below must itself be inside the page. */
    if (page_size < (size_t)(ctx->u64 ? 32 : 16)) {
        return READSTAT_ERROR_PARSE;
    }

    uint64_t page_unused_bytes;
    if (ctx->u64) {
        page_unused_bytes = sas_read8(&page[24], ctx->bswap);
    } else {
        page_unused_bytes = sas_read4(&page[12], ctx->bswap);
    }

    uint32_t row_count = ctx->page_row_count < ctx->row_limit ? ctx->page_row_count : ctx->row_limit;

    /* Widen before multiplying: a 32-bit x 32-bit product always fits in 64 bits. */
    uint64_t row_bytes = (uint64_t)row_count * ctx->row_length;
    uint64_t required_bytes = (uint64_t)row_count / 8 + (row_count % 8 == 0 ? 0 : 1);

    if (data < page || (size_t)(data - page) > page_size) {
        return READSTAT_ERROR_PARSE;
    }
    uint64_t available = page_size - (size_t)(data - page);

    if (row_bytes > available) {
        return READSTAT_ERROR_PARSE;
    }
    available -= row_bytes;

    if (page_unused_bytes > available) {
        return READSTAT_ERROR_PARSE;
    }
    available -= page_unused_bytes;

    if (required_bytes > available) {
        return READSTAT_ERROR_PARSE;
    }

    /* Both terms were bounded by the page size above, so the sum fits in size_t. */
    *deleted_row_bitmap = (const uint8_t*)data + (size_t)(row_bytes + page_unused_bytes);
    return READSTAT_OK;
}
968+
914969
static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_size, sas7bdat_ctx_t *ctx) {
915970
uint16_t page_type;
916971

@@ -975,6 +1030,10 @@ static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_
9751030
if ((retval = sas7bdat_parse_subheader_compressed(page + shp_info.offset, shp_info.len, ctx)) != READSTAT_OK) {
9761031
goto cleanup;
9771032
}
1033+
} else if (shp_info.compression == SAS_COMPRESSION_ROW_DELETED) {
1034+
if ((retval = sas7bdat_register_deleted_row(ctx)) != READSTAT_OK) {
1035+
goto cleanup;
1036+
}
9781037
} else {
9791038
retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
9801039
goto cleanup;
@@ -1004,7 +1063,14 @@ static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_
10041063
goto cleanup;
10051064
}
10061065
if (ctx->handle.value) {
1007-
retval = sas7bdat_parse_rows(data, page + page_size - data, ctx);
1066+
const uint8_t* deleted_row_bitmap = NULL;
1067+
if (page_type & SAS_PAGE_TYPE_DELETED) {
1068+
if ((retval = sas7bdat_parse_deleted_row_bitmap(page, data, page_size,
1069+
&deleted_row_bitmap, ctx)) != READSTAT_OK) {
1070+
goto cleanup;
1071+
}
1072+
}
1073+
retval = sas7bdat_parse_rows(data, page + page_size - data, deleted_row_bitmap, ctx);
10081074
}
10091075
}
10101076
cleanup:
@@ -1276,11 +1342,22 @@ readstat_error_t readstat_parse_sas7bdat(readstat_parser_t *parser, const char *
12761342
goto cleanup;
12771343
}
12781344

1345+
if (ctx->handle.value && ctx->parsed_deleted_row_count != ctx->deleted_row_limit) {
1346+
retval = READSTAT_ERROR_ROW_COUNT_MISMATCH;
1347+
if (ctx->handle.error) {
1348+
snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Expected %d deleted rows in file, found %d",
1349+
ctx->deleted_row_limit, ctx->parsed_deleted_row_count);
1350+
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1351+
}
1352+
goto cleanup;
1353+
}
1354+
12791355
if (ctx->handle.value && ctx->parsed_row_count != ctx->row_limit) {
12801356
retval = READSTAT_ERROR_ROW_COUNT_MISMATCH;
12811357
if (ctx->handle.error) {
12821358
snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Expected %d rows in file, found %d",
1283-
ctx->row_limit, ctx->parsed_row_count);
1359+
ctx->row_limit - ctx->deleted_row_limit,
1360+
ctx->parsed_row_count - ctx->parsed_deleted_row_count);
12841361
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
12851362
}
12861363
goto cleanup;

0 commit comments

Comments
 (0)