Skip to content

Commit 2e793b5

Browse files
committed
Support deleted rows in SAS data files
1 parent 4010db5 commit 2e793b5

2 files changed

Lines changed: 94 additions & 15 deletions

File tree

src/sas/readstat_sas.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -111,18 +111,20 @@ typedef enum sas_subheader_type_e {
111111
#define SAS_PAGE_TYPE_AMD 0x0400
112112
#define SAS_PAGE_TYPE_MASK 0x0F00
113113

114-
#define SAS_PAGE_TYPE_META2 0x4000
115-
#define SAS_PAGE_TYPE_COMP 0x9000
114+
#define SAS_PAGE_TYPE_DELETED 0x0080
115+
#define SAS_PAGE_TYPE_META2 0x4000
116+
#define SAS_PAGE_TYPE_COMP 0x9000
116117

117118
#define SAS_SUBHEADER_POINTER_SIZE_32BIT 12
118119
#define SAS_SUBHEADER_POINTER_SIZE_64BIT 24
119120

120121
#define SAS_PAGE_HEADER_SIZE_32BIT 24
121122
#define SAS_PAGE_HEADER_SIZE_64BIT 40
122123

123-
#define SAS_COMPRESSION_NONE 0x00
124-
#define SAS_COMPRESSION_TRUNC 0x01
125-
#define SAS_COMPRESSION_ROW 0x04
124+
#define SAS_COMPRESSION_NONE 0x00
125+
#define SAS_COMPRESSION_TRUNC 0x01
126+
#define SAS_COMPRESSION_ROW 0x04
127+
#define SAS_COMPRESSION_ROW_DELETED 0x05
126128

127129
#define SAS_COMPRESSION_SIGNATURE_RLE "SASYZCRL"
128130
#define SAS_COMPRESSION_SIGNATURE_RDC "SASYZCR2"

src/sas/readstat_sas7bdat_read.c

Lines changed: 87 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,10 @@ typedef struct sas7bdat_ctx_s {
4646
uint32_t row_length;
4747
uint32_t page_row_count;
4848
uint32_t parsed_row_count;
49+
uint32_t parsed_deleted_row_count;
4950
uint32_t column_count;
5051
uint32_t row_limit;
52+
uint32_t deleted_row_limit;
5153
uint32_t row_offset;
5254

5355
uint64_t header_size;
@@ -232,7 +234,7 @@ static readstat_error_t sas7bdat_parse_column_size_subheader(const char *subhead
232234
static readstat_error_t sas7bdat_parse_row_size_subheader(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
233235
readstat_error_t retval = READSTAT_OK;
234236
uint64_t total_row_count;
235-
uint64_t row_length, page_row_count;
237+
uint64_t row_length, deleted_row_limit, page_row_count;
236238

237239
if (len < (ctx->u64 ? 250: 190)) {
238240
retval = READSTAT_ERROR_PARSE;
@@ -242,13 +244,21 @@ static readstat_error_t sas7bdat_parse_row_size_subheader(const char *subheader,
242244
if (ctx->u64) {
243245
row_length = sas_read8(&subheader[40], ctx->bswap);
244246
total_row_count = sas_read8(&subheader[48], ctx->bswap);
247+
deleted_row_limit = sas_read8(&subheader[56], ctx->bswap);
245248
page_row_count = sas_read8(&subheader[120], ctx->bswap);
246249
} else {
247250
row_length = sas_read4(&subheader[20], ctx->bswap);
248251
total_row_count = sas_read4(&subheader[24], ctx->bswap);
252+
deleted_row_limit = sas_read4(&subheader[28], ctx->bswap);
249253
page_row_count = sas_read4(&subheader[60], ctx->bswap);
250254
}
251255

256+
if (deleted_row_limit > total_row_count) {
257+
retval = READSTAT_ERROR_PARSE;
258+
goto cleanup;
259+
}
260+
ctx->deleted_row_limit = deleted_row_limit;
261+
252262
sas_text_ref_t file_label_ref = sas7bdat_parse_text_ref(&subheader[len-130], ctx);
253263
if (file_label_ref.length) {
254264
if ((retval = sas7bdat_copy_text_ref(ctx->file_label, sizeof(ctx->file_label),
@@ -393,6 +403,19 @@ static readstat_error_t sas7bdat_parse_column_format_subheader(const char *subhe
393403
return retval;
394404
}
395405

406+
/*
 * Account for one row that the file marks as deleted.
 *
 * Fails with READSTAT_ERROR_PARSE when the number of deleted rows already
 * counted has reached ctx->deleted_row_limit. Otherwise both the
 * deleted-row counter and the overall parsed-row counter are advanced by
 * one and READSTAT_OK is returned.
 */
static readstat_error_t sas7bdat_register_deleted_row(sas7bdat_ctx_t *ctx) {
    int quota_left = ctx->parsed_deleted_row_count < ctx->deleted_row_limit;

    if (!quota_left) {
        return READSTAT_ERROR_PARSE;
    }

    /* A deleted row still counts as a parsed row; only its values are skipped. */
    ctx->parsed_deleted_row_count += 1;
    ctx->parsed_row_count += 1;

    return READSTAT_OK;
}
414+
415+
/*
 * Return the number of non-deleted rows parsed so far, i.e. the parsed-row
 * counter minus the deleted-row counter.
 */
static uint32_t sas7bdat_get_current_row_id(sas7bdat_ctx_t *ctx) {
    const uint32_t rows_seen = ctx->parsed_row_count;
    const uint32_t rows_skipped = ctx->parsed_deleted_row_count;

    return rows_seen - rows_skipped;
}
418+
396419
static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable,
397420
col_info_t *col_info, const char *col_data, sas7bdat_ctx_t *ctx) {
398421
readstat_error_t retval = READSTAT_OK;
@@ -409,7 +432,7 @@ static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable
409432
if (ctx->handle.error) {
410433
snprintf(ctx->error_buf, sizeof(ctx->error_buf),
411434
"ReadStat: Error converting string (row=%u, col=%u) to specified encoding: %.*s",
412-
ctx->parsed_row_count+1, col_info->index+1, col_info->width, col_data);
435+
sas7bdat_get_current_row_id(ctx)+1, col_info->index+1, col_info->width, col_data);
413436
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
414437
}
415438
goto cleanup;
@@ -441,7 +464,7 @@ static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable
441464
value.v.double_value = dval;
442465
}
443466
}
444-
cb_retval = ctx->handle.value(ctx->parsed_row_count, variable, value, ctx->user_ctx);
467+
cb_retval = ctx->handle.value(sas7bdat_get_current_row_id(ctx), variable, value, ctx->user_ctx);
445468

446469
if (cb_retval != READSTAT_HANDLER_OK)
447470
retval = READSTAT_ERROR_USER_ABORT;
@@ -490,7 +513,14 @@ static readstat_error_t sas7bdat_parse_single_row(const char *data, sas7bdat_ctx
490513
return retval;
491514
}
492515

493-
static readstat_error_t sas7bdat_parse_rows(const char *data, size_t len, sas7bdat_ctx_t *ctx) {
516+
/*
 * Test a single bit of a bitmap stored most-significant-bit first.
 *
 * Bit `index` lives in byte index / 8, at mask 0x80 >> (index % 8).
 *
 * Returns the masked byte (non-zero when the bit is set, 0 when it is
 * clear), not a normalized 0/1 value. A NULL bitmap or a negative index
 * yields 0 instead of reading outside the buffer. The caller is responsible
 * for ensuring the bitmap holds at least index / 8 + 1 bytes.
 */
static uint8_t sas7bdat_read_bitmap(const uint8_t *bitmap, int index) {
    if (bitmap == NULL || index < 0) {
        return 0;
    }

    /* Unsigned arithmetic keeps the division and shift well defined. */
    unsigned int bit = (unsigned int)index;
    uint8_t mask = (uint8_t)(0x80u >> (bit % 8u));

    return (uint8_t)(bitmap[bit / 8u] & mask);
}
522+
523+
static readstat_error_t sas7bdat_parse_rows(const char* data, size_t len, const uint8_t* deleted_bitmap, sas7bdat_ctx_t* ctx) {
494524
readstat_error_t retval = READSTAT_OK;
495525
int i;
496526
size_t row_offset=0;
@@ -499,8 +529,13 @@ static readstat_error_t sas7bdat_parse_rows(const char *data, size_t len, sas7bd
499529
retval = READSTAT_ERROR_ROW_WIDTH_MISMATCH;
500530
goto cleanup;
501531
}
502-
if ((retval = sas7bdat_parse_single_row(&data[row_offset], ctx)) != READSTAT_OK)
532+
if (deleted_bitmap != NULL && sas7bdat_read_bitmap(deleted_bitmap, i)) {
533+
if ((retval = sas7bdat_register_deleted_row(ctx)) != READSTAT_OK) {
534+
goto cleanup;
535+
}
536+
} else if ((retval = sas7bdat_parse_single_row(&data[row_offset], ctx)) != READSTAT_OK) {
503537
goto cleanup;
538+
}
504539

505540
row_offset += ctx->row_length;
506541
}
@@ -611,7 +646,7 @@ static readstat_error_t sas7bdat_parse_subheader_rle(const char *subheader, size
611646
if (ctx->handle.error) {
612647
snprintf(ctx->error_buf, sizeof(ctx->error_buf),
613648
"ReadStat: Row #%d decompressed to %ld bytes (expected %d bytes)",
614-
ctx->parsed_row_count, (long)(bytes_decompressed), ctx->row_length);
649+
sas7bdat_get_current_row_id(ctx), (long)(bytes_decompressed), ctx->row_length);
615650
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
616651
}
617652
goto cleanup;
@@ -739,7 +774,7 @@ static readstat_error_t sas7bdat_submit_columns(sas7bdat_ctx_t *ctx, int compres
739774
readstat_error_t retval = READSTAT_OK;
740775
if (ctx->handle.metadata) {
741776
readstat_metadata_t metadata = {
742-
.row_count = ctx->row_limit,
777+
.row_count = ctx->row_limit - ctx->deleted_row_limit,
743778
.var_count = ctx->column_count,
744779
.table_name = ctx->table_name,
745780
.file_label = ctx->file_label,
@@ -931,7 +966,7 @@ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_
931966
goto cleanup;
932967
}
933968
}
934-
} else if (shp_info.compression == SAS_COMPRESSION_ROW) {
969+
} else if (shp_info.compression == SAS_COMPRESSION_ROW || shp_info.compression == SAS_COMPRESSION_ROW_DELETED) {
935970
/* void */
936971
} else {
937972
retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
@@ -947,6 +982,26 @@ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_
947982
return retval;
948983
}
949984

985+
/*
 * Locate the deleted-row bitmap inside a data page.
 *
 * page                the start of the page buffer
 * data                the start of the row data inside that page
 * page_size           total size of the page buffer in bytes
 * deleted_row_bitmap  out: set to the first byte of the bitmap on success
 * ctx                 parser context (u64, bswap, page_row_count, row_limit
 *                     and row_length are read)
 *
 * The bitmap is taken to start after min(page_row_count, row_limit) rows of
 * row_length bytes plus the byte count read from the page header (offset 24
 * in 64-bit files, offset 12 in 32-bit files), and to hold one bit per row,
 * rounded up to whole bytes.
 *
 * Returns READSTAT_ERROR_PARSE when the bitmap would not lie entirely within
 * the page; *deleted_row_bitmap is left untouched in that case.
 *
 * All sizes come from the file, so every step is checked against the bytes
 * remaining in the page rather than summed first: the previous form could
 * wrap (32-bit row_count * row_length, or a huge header value) and accept
 * an out-of-page pointer.
 */
static readstat_error_t sas7bdat_parse_deleted_row_bitmap(const char *page, const char *data,
        size_t page_size, const uint8_t **deleted_row_bitmap, sas7bdat_ctx_t *ctx) {
    /* The header field itself must be inside the page before it is read. */
    size_t header_field_end = ctx->u64 ? 24 + 8 : 12 + 4;
    if (page_size < header_field_end) {
        return READSTAT_ERROR_PARSE;
    }

    uint64_t page_unused_bytes;
    if (ctx->u64) {
        page_unused_bytes = sas_read8(&page[24], ctx->bswap);
    } else {
        page_unused_bytes = sas_read4(&page[12], ctx->bswap);
    }

    if (data < page || (size_t)(data - page) > page_size) {
        return READSTAT_ERROR_PARSE;
    }
    uint64_t remaining = page_size - (size_t)(data - page);

    uint32_t row_count = ctx->page_row_count < ctx->row_limit ? ctx->page_row_count : ctx->row_limit;

    /* Widen before multiplying: two 32-bit factors always fit in 64 bits. */
    uint64_t row_bytes = (uint64_t)row_count * ctx->row_length;
    uint64_t bitmap_bytes = (uint64_t)row_count / 8 + (row_count % 8 == 0 ? 0 : 1);

    if (row_bytes > remaining) {
        return READSTAT_ERROR_PARSE;
    }
    remaining -= row_bytes;

    if (page_unused_bytes > remaining) {
        return READSTAT_ERROR_PARSE;
    }
    remaining -= page_unused_bytes;

    if (bitmap_bytes > remaining) {
        return READSTAT_ERROR_PARSE;
    }

    /* Both terms were bounded by the page size above, so the sum cannot wrap. */
    *deleted_row_bitmap = (const uint8_t *)data + (size_t)(row_bytes + page_unused_bytes);
    return READSTAT_OK;
}
1004+
9501005
static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_size, sas7bdat_ctx_t *ctx) {
9511006
uint16_t page_type;
9521007

@@ -1007,6 +1062,10 @@ static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_
10071062
if ((retval = sas7bdat_parse_subheader_compressed(page + shp_info.offset, shp_info.len, ctx)) != READSTAT_OK) {
10081063
goto cleanup;
10091064
}
1065+
} else if (shp_info.compression == SAS_COMPRESSION_ROW_DELETED) {
1066+
if ((retval = sas7bdat_register_deleted_row(ctx)) != READSTAT_OK) {
1067+
goto cleanup;
1068+
}
10101069
} else {
10111070
retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
10121071
goto cleanup;
@@ -1036,7 +1095,14 @@ static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_
10361095
goto cleanup;
10371096
}
10381097
if (ctx->handle.value) {
1039-
retval = sas7bdat_parse_rows(data, page + page_size - data, ctx);
1098+
const uint8_t* deleted_row_bitmap = NULL;
1099+
if (page_type & SAS_PAGE_TYPE_DELETED) {
1100+
if ((retval = sas7bdat_parse_deleted_row_bitmap(page, data, page_size,
1101+
&deleted_row_bitmap, ctx)) != READSTAT_OK) {
1102+
goto cleanup;
1103+
}
1104+
}
1105+
retval = sas7bdat_parse_rows(data, page + page_size - data, deleted_row_bitmap, ctx);
10401106
}
10411107
}
10421108
cleanup:
@@ -1308,11 +1374,22 @@ readstat_error_t readstat_parse_sas7bdat(readstat_parser_t *parser, const char *
13081374
goto cleanup;
13091375
}
13101376

1377+
if (ctx->handle.value && ctx->parsed_deleted_row_count != ctx->deleted_row_limit) {
1378+
retval = READSTAT_ERROR_ROW_COUNT_MISMATCH;
1379+
if (ctx->handle.error) {
1380+
snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Expected %d deleted rows in file, found %d",
1381+
ctx->deleted_row_limit, ctx->parsed_deleted_row_count);
1382+
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1383+
}
1384+
goto cleanup;
1385+
}
1386+
13111387
if (ctx->handle.value && ctx->parsed_row_count != ctx->row_limit) {
13121388
retval = READSTAT_ERROR_ROW_COUNT_MISMATCH;
13131389
if (ctx->handle.error) {
13141390
snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Expected %d rows in file, found %d",
1315-
ctx->row_limit, ctx->parsed_row_count);
1391+
ctx->row_limit - ctx->deleted_row_limit,
1392+
ctx->parsed_row_count - ctx->parsed_deleted_row_count);
13161393
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
13171394
}
13181395
goto cleanup;

0 commit comments

Comments
 (0)