Skip to content

Commit 1cbd51e

Browse files
committed
Support deleted rows in SAS data files
1 parent 4010db5 commit 1cbd51e

2 files changed

Lines changed: 98 additions & 15 deletions

File tree

src/sas/readstat_sas.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -111,18 +111,20 @@ typedef enum sas_subheader_type_e {
111111
#define SAS_PAGE_TYPE_AMD 0x0400
112112
#define SAS_PAGE_TYPE_MASK 0x0F00
113113

114-
#define SAS_PAGE_TYPE_META2 0x4000
115-
#define SAS_PAGE_TYPE_COMP 0x9000
114+
#define SAS_PAGE_TYPE_DELETED_ROWS 0x0080
115+
#define SAS_PAGE_TYPE_META2 0x4000
116+
#define SAS_PAGE_TYPE_COMP 0x9000
116117

117118
#define SAS_SUBHEADER_POINTER_SIZE_32BIT 12
118119
#define SAS_SUBHEADER_POINTER_SIZE_64BIT 24
119120

120121
#define SAS_PAGE_HEADER_SIZE_32BIT 24
121122
#define SAS_PAGE_HEADER_SIZE_64BIT 40
122123

123-
#define SAS_COMPRESSION_NONE 0x00
124-
#define SAS_COMPRESSION_TRUNC 0x01
125-
#define SAS_COMPRESSION_ROW 0x04
124+
#define SAS_COMPRESSION_NONE 0x00
125+
#define SAS_COMPRESSION_TRUNC 0x01
126+
#define SAS_COMPRESSION_ROW 0x04
127+
#define SAS_COMPRESSION_DELETED_ROW 0x05
126128

127129
#define SAS_COMPRESSION_SIGNATURE_RLE "SASYZCRL"
128130
#define SAS_COMPRESSION_SIGNATURE_RDC "SASYZCR2"

src/sas/readstat_sas7bdat_read.c

Lines changed: 91 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <string.h>
66
#include <math.h>
77
#include <inttypes.h>
8+
#include <limits.h>
89
#include "readstat_sas.h"
910
#include "readstat_sas_rle.h"
1011
#include "../readstat_iconv.h"
@@ -46,8 +47,10 @@ typedef struct sas7bdat_ctx_s {
4647
uint32_t row_length;
4748
uint32_t page_row_count;
4849
uint32_t parsed_row_count;
50+
uint32_t parsed_deleted_row_count;
4951
uint32_t column_count;
5052
uint32_t row_limit;
53+
uint32_t deleted_row_limit;
5154
uint32_t row_offset;
5255

5356
uint64_t header_size;
@@ -232,7 +235,7 @@ static readstat_error_t sas7bdat_parse_column_size_subheader(const char *subhead
232235
static readstat_error_t sas7bdat_parse_row_size_subheader(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
233236
readstat_error_t retval = READSTAT_OK;
234237
uint64_t total_row_count;
235-
uint64_t row_length, page_row_count;
238+
uint64_t row_length, deleted_row_limit, page_row_count;
236239

237240
if (len < (ctx->u64 ? 250: 190)) {
238241
retval = READSTAT_ERROR_PARSE;
@@ -242,13 +245,21 @@ static readstat_error_t sas7bdat_parse_row_size_subheader(const char *subheader,
242245
if (ctx->u64) {
243246
row_length = sas_read8(&subheader[40], ctx->bswap);
244247
total_row_count = sas_read8(&subheader[48], ctx->bswap);
248+
deleted_row_limit = sas_read8(&subheader[56], ctx->bswap);
245249
page_row_count = sas_read8(&subheader[120], ctx->bswap);
246250
} else {
247251
row_length = sas_read4(&subheader[20], ctx->bswap);
248252
total_row_count = sas_read4(&subheader[24], ctx->bswap);
253+
deleted_row_limit = sas_read4(&subheader[28], ctx->bswap);
249254
page_row_count = sas_read4(&subheader[60], ctx->bswap);
250255
}
251256

257+
if (deleted_row_limit > total_row_count) {
258+
retval = READSTAT_ERROR_PARSE;
259+
goto cleanup;
260+
}
261+
ctx->deleted_row_limit = deleted_row_limit;
262+
252263
sas_text_ref_t file_label_ref = sas7bdat_parse_text_ref(&subheader[len-130], ctx);
253264
if (file_label_ref.length) {
254265
if ((retval = sas7bdat_copy_text_ref(ctx->file_label, sizeof(ctx->file_label),
@@ -393,6 +404,22 @@ static readstat_error_t sas7bdat_parse_column_format_subheader(const char *subhe
393404
return retval;
394405
}
395406

407+
/*
 * Account for one row that the file marks as deleted.
 *
 * Once parsed_row_count has reached row_limit the call is a no-op and
 * succeeds. Otherwise both parsed_row_count and parsed_deleted_row_count are
 * advanced by one, unless deleted_row_limit deleted rows have already been
 * counted, in which case nothing is modified and READSTAT_ERROR_PARSE is
 * returned.
 */
static readstat_error_t sas7bdat_register_deleted_row(sas7bdat_ctx_t *ctx) {
    readstat_error_t status = READSTAT_OK;

    if (ctx->parsed_row_count != ctx->row_limit) {
        if (ctx->parsed_deleted_row_count < ctx->deleted_row_limit) {
            /* A deleted row still counts as a parsed row. */
            ctx->parsed_row_count += 1;
            ctx->parsed_deleted_row_count += 1;
        } else {
            /* More deleted rows encountered than the limit allows. */
            status = READSTAT_ERROR_PARSE;
        }
    }

    return status;
}
418+
419+
/*
 * Return the number of rows parsed so far with the deleted ones excluded,
 * i.e. parsed_row_count minus parsed_deleted_row_count.
 */
static uint32_t sas7bdat_get_current_row_id(sas7bdat_ctx_t *ctx) {
    const uint32_t rows_parsed = ctx->parsed_row_count;
    const uint32_t rows_deleted = ctx->parsed_deleted_row_count;

    return rows_parsed - rows_deleted;
}
422+
396423
static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable,
397424
col_info_t *col_info, const char *col_data, sas7bdat_ctx_t *ctx) {
398425
readstat_error_t retval = READSTAT_OK;
@@ -409,7 +436,7 @@ static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable
409436
if (ctx->handle.error) {
410437
snprintf(ctx->error_buf, sizeof(ctx->error_buf),
411438
"ReadStat: Error converting string (row=%u, col=%u) to specified encoding: %.*s",
412-
ctx->parsed_row_count+1, col_info->index+1, col_info->width, col_data);
439+
sas7bdat_get_current_row_id(ctx)+1, col_info->index+1, col_info->width, col_data);
413440
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
414441
}
415442
goto cleanup;
@@ -441,7 +468,7 @@ static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable
441468
value.v.double_value = dval;
442469
}
443470
}
444-
cb_retval = ctx->handle.value(ctx->parsed_row_count, variable, value, ctx->user_ctx);
471+
cb_retval = ctx->handle.value(sas7bdat_get_current_row_id(ctx), variable, value, ctx->user_ctx);
445472

446473
if (cb_retval != READSTAT_HANDLER_OK)
447474
retval = READSTAT_ERROR_USER_ABORT;
@@ -490,7 +517,15 @@ static readstat_error_t sas7bdat_parse_single_row(const char *data, sas7bdat_ctx
490517
return retval;
491518
}
492519

493-
static readstat_error_t sas7bdat_parse_rows(const char *data, size_t len, sas7bdat_ctx_t *ctx) {
520+
/*
 * Test a single bit of a packed bitmap.
 *
 * Bits are numbered most-significant-bit first within each byte: index 0 is
 * the top bit of bitmap[0], index CHAR_BIT is the top bit of bitmap[1], etc.
 *
 * Returns the masked bit itself (non-zero when set, 0 when clear); the result
 * is NOT normalized to 1, so callers should only test it for truthiness.
 *
 * A NULL bitmap or a negative index yields 0 instead of reading outside the
 * buffer. The caller remains responsible for ensuring the bitmap holds at
 * least index / CHAR_BIT + 1 bytes.
 */
static unsigned char sas7bdat_read_bitmap(const unsigned char *bitmap, int index) {
    if (bitmap == NULL || index < 0) {
        return 0;
    }

    unsigned char current_byte = bitmap[index / CHAR_BIT];
    /* index % CHAR_BIT is in [0, CHAR_BIT) here, so the shift stays inside one byte. */
    unsigned char mask = (unsigned char)(1u << (CHAR_BIT - 1 - index % CHAR_BIT));

    return current_byte & mask;
}
526+
527+
static readstat_error_t sas7bdat_parse_rows(const char *data, size_t len,
528+
const unsigned char *deleted_bitmap, sas7bdat_ctx_t *ctx) {
494529
readstat_error_t retval = READSTAT_OK;
495530
int i;
496531
size_t row_offset=0;
@@ -499,8 +534,13 @@ static readstat_error_t sas7bdat_parse_rows(const char *data, size_t len, sas7bd
499534
retval = READSTAT_ERROR_ROW_WIDTH_MISMATCH;
500535
goto cleanup;
501536
}
502-
if ((retval = sas7bdat_parse_single_row(&data[row_offset], ctx)) != READSTAT_OK)
537+
if (deleted_bitmap != NULL && sas7bdat_read_bitmap(deleted_bitmap, i)) {
538+
if ((retval = sas7bdat_register_deleted_row(ctx)) != READSTAT_OK) {
539+
goto cleanup;
540+
}
541+
} else if ((retval = sas7bdat_parse_single_row(&data[row_offset], ctx)) != READSTAT_OK) {
503542
goto cleanup;
543+
}
504544

505545
row_offset += ctx->row_length;
506546
}
@@ -611,7 +651,7 @@ static readstat_error_t sas7bdat_parse_subheader_rle(const char *subheader, size
611651
if (ctx->handle.error) {
612652
snprintf(ctx->error_buf, sizeof(ctx->error_buf),
613653
"ReadStat: Row #%d decompressed to %ld bytes (expected %d bytes)",
614-
ctx->parsed_row_count, (long)(bytes_decompressed), ctx->row_length);
654+
sas7bdat_get_current_row_id(ctx), (long)(bytes_decompressed), ctx->row_length);
615655
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
616656
}
617657
goto cleanup;
@@ -739,7 +779,7 @@ static readstat_error_t sas7bdat_submit_columns(sas7bdat_ctx_t *ctx, int compres
739779
readstat_error_t retval = READSTAT_OK;
740780
if (ctx->handle.metadata) {
741781
readstat_metadata_t metadata = {
742-
.row_count = ctx->row_limit,
782+
.row_count = ctx->row_limit - ctx->deleted_row_limit,
743783
.var_count = ctx->column_count,
744784
.table_name = ctx->table_name,
745785
.file_label = ctx->file_label,
@@ -931,7 +971,7 @@ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_
931971
goto cleanup;
932972
}
933973
}
934-
} else if (shp_info.compression == SAS_COMPRESSION_ROW) {
974+
} else if (shp_info.compression == SAS_COMPRESSION_ROW || shp_info.compression == SAS_COMPRESSION_DELETED_ROW) {
935975
/* void */
936976
} else {
937977
retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
@@ -947,6 +987,25 @@ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_
947987
return retval;
948988
}
949989

990+
/*
 * Locate the deleted-row bitmap inside a data page.
 *
 * page               start of the page buffer
 * data               start of the row data within that page
 * page_size          size of the page buffer in bytes
 * deleted_row_bitmap out: set to the first byte of the bitmap on success
 * ctx                parser context (u64, bswap, page_row_count, row_limit,
 *                    row_length are read)
 *
 * The bitmap is taken to start after min(page_row_count, row_limit) rows of
 * row_length bytes plus the byte count stored in the page header (offset 24
 * in 64-bit files, offset 12 otherwise), and to hold one bit per row rounded
 * up to whole bytes.
 *
 * Returns READSTAT_OK, or READSTAT_ERROR_PARSE when the bitmap would not lie
 * entirely within the page; *deleted_row_bitmap is left untouched on error.
 *
 * All sizes originate from the file, so every step of the bounds check is
 * performed without arithmetic that can wrap around.
 *
 * NOTE(review): assumes the caller has already verified that the page is
 * large enough to contain the header field read below - confirm.
 */
static readstat_error_t sas7bdat_parse_deleted_row_bitmap(const char *page, const char *data,
        size_t page_size, const unsigned char **deleted_row_bitmap, sas7bdat_ctx_t *ctx) {
    uint64_t page_unused_bytes;
    if (ctx->u64) {
        page_unused_bytes = sas_read8(&page[24], ctx->bswap);
    } else {
        page_unused_bytes = sas_read4(&page[12], ctx->bswap);
    }

    uint32_t row_count = ctx->page_row_count < ctx->row_limit ? ctx->page_row_count : ctx->row_limit;
    /* Widen BEFORE multiplying: a 32x32-bit product always fits in 64 bits. */
    uint64_t rows_bytes = (uint64_t)row_count * ctx->row_length;
    uint64_t required_bytes = row_count / CHAR_BIT + (row_count % CHAR_BIT == 0 ? 0 : 1);

    if (data < page || (uint64_t)(data - page) > page_size) {
        return READSTAT_ERROR_PARSE;
    }

    /* Consume the space after `data` piece by piece so no sum can overflow. */
    uint64_t remaining = (uint64_t)page_size - (uint64_t)(data - page);
    if (rows_bytes > remaining) {
        return READSTAT_ERROR_PARSE;
    }
    remaining -= rows_bytes;

    if (page_unused_bytes > remaining) {
        return READSTAT_ERROR_PARSE;
    }
    remaining -= page_unused_bytes;

    if (required_bytes > remaining) {
        return READSTAT_ERROR_PARSE;
    }

    /* Both terms were shown to fit inside the page, so the offset fits in size_t. */
    *deleted_row_bitmap = (const unsigned char *)data + (size_t)(rows_bytes + page_unused_bytes);
    return READSTAT_OK;
}
1008+
9501009
static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_size, sas7bdat_ctx_t *ctx) {
9511010
uint16_t page_type;
9521011

@@ -1007,6 +1066,10 @@ static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_
10071066
if ((retval = sas7bdat_parse_subheader_compressed(page + shp_info.offset, shp_info.len, ctx)) != READSTAT_OK) {
10081067
goto cleanup;
10091068
}
1069+
} else if (shp_info.compression == SAS_COMPRESSION_DELETED_ROW) {
1070+
if ((retval = sas7bdat_register_deleted_row(ctx)) != READSTAT_OK) {
1071+
goto cleanup;
1072+
}
10101073
} else {
10111074
retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
10121075
goto cleanup;
@@ -1036,7 +1099,14 @@ static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_
10361099
goto cleanup;
10371100
}
10381101
if (ctx->handle.value) {
1039-
retval = sas7bdat_parse_rows(data, page + page_size - data, ctx);
1102+
const unsigned char *deleted_row_bitmap = NULL;
1103+
if (page_type & SAS_PAGE_TYPE_DELETED_ROWS) {
1104+
if ((retval = sas7bdat_parse_deleted_row_bitmap(page, data, page_size,
1105+
&deleted_row_bitmap, ctx)) != READSTAT_OK) {
1106+
goto cleanup;
1107+
}
1108+
}
1109+
retval = sas7bdat_parse_rows(data, page + page_size - data, deleted_row_bitmap, ctx);
10401110
}
10411111
}
10421112
cleanup:
@@ -1308,11 +1378,22 @@ readstat_error_t readstat_parse_sas7bdat(readstat_parser_t *parser, const char *
13081378
goto cleanup;
13091379
}
13101380

1381+
if (ctx->handle.value && ctx->parsed_deleted_row_count != ctx->deleted_row_limit) {
1382+
retval = READSTAT_ERROR_ROW_COUNT_MISMATCH;
1383+
if (ctx->handle.error) {
1384+
snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Expected %d deleted rows in file, found %d",
1385+
ctx->deleted_row_limit, ctx->parsed_deleted_row_count);
1386+
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1387+
}
1388+
goto cleanup;
1389+
}
1390+
13111391
if (ctx->handle.value && ctx->parsed_row_count != ctx->row_limit) {
13121392
retval = READSTAT_ERROR_ROW_COUNT_MISMATCH;
13131393
if (ctx->handle.error) {
13141394
snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Expected %d rows in file, found %d",
1315-
ctx->row_limit, ctx->parsed_row_count);
1395+
ctx->row_limit - ctx->deleted_row_limit,
1396+
ctx->parsed_row_count - ctx->parsed_deleted_row_count);
13161397
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
13171398
}
13181399
goto cleanup;

0 commit comments

Comments
 (0)