Skip to content

Commit 3ebcd73

Browse files
committed
Support moved rows in SAS data files
1 parent 4010db5 commit 3ebcd73

2 files changed

Lines changed: 109 additions & 6 deletions

File tree

src/sas/readstat_sas.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,12 @@ typedef enum sas_subheader_type_e {
120120
#define SAS_PAGE_HEADER_SIZE_32BIT 24
121121
#define SAS_PAGE_HEADER_SIZE_64BIT 40
122122

123-
#define SAS_COMPRESSION_NONE 0x00
124-
#define SAS_COMPRESSION_TRUNC 0x01
125-
#define SAS_COMPRESSION_ROW 0x04
123+
#define SAS_COMPRESSION_NONE 0x00
124+
#define SAS_COMPRESSION_TRUNC 0x01
125+
#define SAS_COMPRESSION_MOVED 0x03
126+
#define SAS_COMPRESSION_ROW 0x04
127+
#define SAS_COMPRESSION_MOVED_ROW 0x06
128+
#define SAS_COMPRESSION_MYSTERY 0x0d
126129

127130
#define SAS_COMPRESSION_SIGNATURE_RLE "SASYZCRL"
128131
#define SAS_COMPRESSION_SIGNATURE_RDC "SASYZCR2"

src/sas/readstat_sas7bdat_read.c

Lines changed: 103 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ typedef struct sas7bdat_ctx_s {
4242
readstat_io_t *io;
4343
int bswap;
4444
int did_submit_columns;
45+
int requires_seek;
4546

4647
uint32_t row_length;
4748
uint32_t page_row_count;
@@ -919,7 +920,7 @@ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_
919920
if ((retval = sas7bdat_parse_subheader_pointer(shp, page + page_size - shp, &shp_info, ctx)) != READSTAT_OK) {
920921
goto cleanup;
921922
}
922-
if (shp_info.len > 0 && shp_info.compression != SAS_COMPRESSION_TRUNC) {
923+
if (shp_info.len > 0 && shp_info.compression != SAS_COMPRESSION_TRUNC && shp_info.compression != SAS_COMPRESSION_MOVED) {
923924
if ((retval = sas7bdat_validate_subheader_pointer(&shp_info, page_size, subheader_count, ctx)) != READSTAT_OK) {
924925
goto cleanup;
925926
}
@@ -931,7 +932,8 @@ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_
931932
goto cleanup;
932933
}
933934
}
934-
} else if (shp_info.compression == SAS_COMPRESSION_ROW) {
935+
} else if (shp_info.compression == SAS_COMPRESSION_ROW || shp_info.compression == SAS_COMPRESSION_MOVED_ROW ||
936+
shp_info.compression == SAS_COMPRESSION_MYSTERY) {
935937
/* void */
936938
} else {
937939
retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
@@ -947,6 +949,83 @@ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_
947949
return retval;
948950
}
949951

952+
/*
 * Parse a single row that a SAS_COMPRESSION_MOVED subheader pointer says
 * lives on another page.
 *
 * page_index      - zero-based index of the page that holds the row.
 * subheader_index - zero-based index of the subheader pointer on that page.
 * ctx             - parser context; supplies the I/O handlers and page geometry.
 *
 * The target page is read into a temporary buffer (freed before returning),
 * the referenced subheader pointer is validated, and its payload is handed to
 * sas7bdat_parse_subheader_compressed().
 *
 * Side effect: the file position is changed, so ctx->requires_seek is set to 1
 * to tell the sequential page loop that it must re-seek before its next read.
 *
 * Returns READSTAT_OK on success, otherwise:
 *   READSTAT_ERROR_PARSE                   - page_index is out of range
 *   READSTAT_ERROR_SEEK                    - the seek handler failed
 *   READSTAT_ERROR_MALLOC                  - the page buffer could not be allocated
 *   READSTAT_ERROR_READ                    - short/failed read, or the target page
 *                                            or subheader index is not usable
 *   READSTAT_ERROR_UNSUPPORTED_COMPRESSION - the target pointer is not
 *                                            SAS_COMPRESSION_MOVED_ROW
 *   or whatever error a called helper reports.
 */
static readstat_error_t sas7bdat_parse_moved_row(uint64_t page_index, uint64_t subheader_index, sas7bdat_ctx_t *ctx) {
    readstat_error_t retval = READSTAT_OK;
    readstat_io_t *io = ctx->io;

    const uint64_t page_size = ctx->page_size;
    char *page = NULL;

    if (page_index >= ctx->page_count) {
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    /* From here on the file position no longer matches the sequential page loop. */
    ctx->requires_seek = 1;
    if (io->seek(ctx->header_size + page_index * page_size, READSTAT_SEEK_SET, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        if (ctx->handle.error) {
            /* Casts make every argument match its PRId64 conversion exactly;
             * page_index < page_count was checked above, so the values fit. */
            snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Failed to seek to position %" PRId64
                    " (= %" PRId64 " + %" PRId64 "*%" PRId64 ")",
                    (int64_t)(ctx->header_size + page_index * page_size), (int64_t)ctx->header_size,
                    (int64_t)page_index, (int64_t)page_size);
            ctx->handle.error(ctx->error_buf, ctx->user_ctx);
        }
        goto cleanup;
    }
    if ((page = readstat_malloc(page_size)) == NULL) {
        retval = READSTAT_ERROR_MALLOC;
        goto cleanup;
    }

    /* NOTE(review): assumes the read handler returns a signed byte count that is
     * negative on error — confirm against the handler typedef. Comparing that
     * value directly with the unsigned page_size would turn a negative return
     * into a huge positive number and let a failed read slip through. */
    int64_t bytes_read = io->read(page, page_size, io->io_ctx);
    if (bytes_read < 0 || (uint64_t)bytes_read < page_size) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    /* Reject pages whose type is SAS_PAGE_TYPE_DATA or that carry the
     * SAS_PAGE_TYPE_COMP flag. */
    uint16_t page_type = sas_read2(&page[ctx->page_header_size - 8], ctx->bswap);
    if ((page_type & SAS_PAGE_TYPE_MASK) == SAS_PAGE_TYPE_DATA || page_type & SAS_PAGE_TYPE_COMP) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }
    uint16_t subheader_count = sas_read2(&page[ctx->page_header_size - 4], ctx->bswap);
    if (subheader_index >= subheader_count) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    /* The subheader pointer table starts right after the page header. */
    uint64_t shp_offset = ctx->page_header_size + subheader_index * ctx->subheader_pointer_size;
    if (shp_offset + ctx->subheader_pointer_size >= page_size) {
        retval = READSTAT_ERROR_READ;
        goto cleanup;
    }

    const char *shp = &page[shp_offset];
    subheader_pointer_t shp_info = { 0 };
    if ((retval = sas7bdat_parse_subheader_pointer(shp, page + page_size - shp, &shp_info, ctx)) != READSTAT_OK) {
        goto cleanup;
    }
    if ((retval = sas7bdat_validate_subheader_pointer(&shp_info, page_size, subheader_count, ctx)) != READSTAT_OK) {
        goto cleanup;
    }
    /* Only a pointer marked as a moved row is an acceptable target. */
    if (shp_info.compression != SAS_COMPRESSION_MOVED_ROW) {
        retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
        goto cleanup;
    }

    if ((retval = sas7bdat_submit_columns_if_needed(ctx, 1)) != READSTAT_OK) {
        goto cleanup;
    }
    if ((retval = sas7bdat_parse_subheader_compressed(page + shp_info.offset, shp_info.len, ctx)) != READSTAT_OK) {
        goto cleanup;
    }

cleanup:
    /* free(NULL) is a no-op, so no guard is needed. */
    free(page);

    return retval;
}
1028+
9501029
static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_size, sas7bdat_ctx_t *ctx) {
9511030
uint16_t page_type;
9521031

@@ -976,7 +1055,13 @@ static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_
9761055
if ((retval = sas7bdat_parse_subheader_pointer(shp, page + page_size - shp, &shp_info, ctx)) != READSTAT_OK) {
9771056
goto cleanup;
9781057
}
979-
if (shp_info.len > 0 && shp_info.compression != SAS_COMPRESSION_TRUNC) {
1058+
if (shp_info.len > 0 && shp_info.compression == SAS_COMPRESSION_MOVED) {
1059+
uint64_t page_index = shp_info.offset - 1;
1060+
uint64_t subheader_index = shp_info.len - 1;
1061+
if ((retval = sas7bdat_parse_moved_row(page_index, subheader_index, ctx)) != READSTAT_OK) {
1062+
goto cleanup;
1063+
}
1064+
} else if (shp_info.len > 0 && shp_info.compression != SAS_COMPRESSION_TRUNC) {
9801065
if ((retval = sas7bdat_validate_subheader_pointer(&shp_info, page_size, subheader_count, ctx)) != READSTAT_OK) {
9811066
goto cleanup;
9821067
}
@@ -1007,6 +1092,8 @@ static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_
10071092
if ((retval = sas7bdat_parse_subheader_compressed(page + shp_info.offset, shp_info.len, ctx)) != READSTAT_OK) {
10081093
goto cleanup;
10091094
}
1095+
} else if (shp_info.compression == SAS_COMPRESSION_MOVED_ROW || shp_info.compression == SAS_COMPRESSION_MYSTERY) {
1096+
/* void */
10101097
} else {
10111098
retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
10121099
goto cleanup;
@@ -1180,6 +1267,19 @@ static readstat_error_t sas7bdat_parse_all_pages_pass2(sas7bdat_ctx_t *ctx) {
11801267
if ((retval = sas7bdat_update_progress(ctx)) != READSTAT_OK) {
11811268
goto cleanup;
11821269
}
1270+
if (ctx->requires_seek) {
1271+
if (io->seek(ctx->header_size + i * ctx->page_size, READSTAT_SEEK_SET, io->io_ctx) == -1) {
1272+
retval = READSTAT_ERROR_SEEK;
1273+
if (ctx->handle.error) {
1274+
snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Failed to seek to position %" PRId64
1275+
" (= %" PRId64 " + %" PRId64 "*%" PRId64 ")",
1276+
ctx->header_size + i * ctx->page_size, ctx->header_size, i, ctx->page_size);
1277+
ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1278+
}
1279+
goto cleanup;
1280+
}
1281+
ctx->requires_seek = 0;
1282+
}
11831283
if (io->read(ctx->page, ctx->page_size, io->io_ctx) < ctx->page_size) {
11841284
retval = READSTAT_ERROR_READ;
11851285
goto cleanup;

0 commit comments

Comments
 (0)