-
Notifications
You must be signed in to change notification settings - Fork 296
Expand file tree
/
Copy pathload_data_parquet.sql
More file actions
332 lines (295 loc) · 13.2 KB
/
load_data_parquet.sql
File metadata and controls
332 lines (295 loc) · 13.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
-- pre
drop database if exists parq;
create database parq;
use parq;
-- read from an already exported file
create table t1(id bigint,name varchar);
load data infile {'filepath'='$resources/load_data/simple.parq', 'format'='parquet'} into table t1;
select * from t1;
create table t2(id bigint not null, name varchar not null, sex bool, f32 float(5,2));
load data infile {'filepath'='$resources/load_data/simple2.parq', 'format'='parquet'} into table t2;
select * from t2;
create table t3(c varchar);
load data infile {'filepath'='$resources/load_data/indexed_str.parq', 'format'='parquet'} into table t3;
select * from t3;
create table t4(id bigint not null, name varchar not null, sex bool, f32 float(5,2));
create stage parqstage URL='file:///$resources/load_data/';
load data infile {'filepath'='stage://parqstage/simple2.parq', 'format'='parquet'} into table t4;
select * from t4;
drop stage parqstage;
create table t5(id bigint, name varchar, int8column tinyint, int16column smallint, binarycolumn binary, varbinarycolumn varbinary(32), blobcolumn blob);
load data infile {'filepath'='$resources/load_data/int8_int16_binary_varbinary_blob.parq', 'format'='parquet'} into table t5;
select * from t5;
-- nested types (List/Struct/Map) to TEXT
create table t6(id int, name varchar(100), scores text, address text, metadata text);
load data infile {'filepath'='$resources/load_data/nested_types.parq', 'format'='parquet'} into table t6;
select * from t6;
-- nested types to JSON
create table t7(id int, name varchar(100), scores json, address json, metadata json);
load data infile {'filepath'='$resources/load_data/nested_types.parq', 'format'='parquet'} into table t7;
select * from t7;
-- nullable columns
drop table if exists pq_nullable_test;
CREATE TABLE pq_nullable_test (
id BIGINT NOT NULL,
col_nullable_with_null DOUBLE,
col_nullable_no_null VARCHAR(10),
col_not_nullable BIGINT NOT NULL,
col_all_null DOUBLE,
PRIMARY KEY (id)
);
load data infile {'filepath'='$resources/parquet/nullable_test.parquet', 'format'='parquet'} into table pq_nullable_test;
SELECT
COUNT(*) as total_rows,
SUM(CASE WHEN col_nullable_with_null IS NULL THEN 1 ELSE 0 END) as null_count_col1,
SUM(CASE WHEN col_all_null IS NULL THEN 1 ELSE 0 END) as null_count_all_null
FROM pq_nullable_test;
SELECT COUNT(*) FROM pq_nullable_test WHERE col_not_nullable IS NULL;
SELECT * FROM pq_nullable_test ORDER BY id;
drop table pq_nullable_test;
-- Map、Struct、List、Decimal、timestamp-no-zone
drop table if exists parquet_complex_types;
CREATE TABLE `parquet_complex_types` (
`test_id` bigint NOT NULL,
`decimal_small` decimal(10,2) NOT NULL DEFAULT 0.0,
`decimal_medium` decimal(18,4) NOT NULL DEFAULT 0.0,
`decimal_large` decimal(38,6) NOT NULL DEFAULT 0.0,
`timestamp_utc` timestamp NOT NULL DEFAULT '1970-01-01 00:00:00',
`timestamp_micros` timestamp(6) NOT NULL DEFAULT '1970-01-01 00:00:00',
`timestamp_millis` timestamp(3) NOT NULL DEFAULT '1970-01-01 00:00:00',
`map_string_int` json DEFAULT NULL,
`map_string_string` json DEFAULT NULL,
`struct_simple` json DEFAULT NULL,
`struct_nested` json DEFAULT NULL,
`list_int` json DEFAULT NULL,
`list_string` json DEFAULT NULL,
`list_struct` json DEFAULT NULL
);
load data infile {'filepath'='$resources/parquet/complex.parquet', 'format'='parquet'} into table parquet_complex_types;
-- @ignore:4,5,6
select * from parquet_complex_types order by test_id limit 3;
SELECT COUNT(*) FROM parquet_complex_types;
drop table parquet_complex_types;
drop table if exists ts_test;
CREATE TABLE ts_test (
id BIGINT,
ts_no_tz_micros DATETIME,
ts_no_tz_millis DATETIME
);
load data infile {'filepath'='$resources/parquet/ts_no_tz_test.parquet', 'format'='parquet'} into table ts_test;
select * from ts_test;
drop table ts_test;
-- compressed file load
drop table if exists none_compression;
CREATE TABLE none_compression (
id BIGINT,
name VARCHAR(100),
value DOUBLE,
status BOOL
);
load data infile {'filepath'='$resources/parquet/test_none.parquet', 'format'='parquet'} into table none_compression;
SELECT 'NONE' as codec, COUNT(*), SUM(value) FROM none_compression;
drop table if exists snappy_compression;
CREATE TABLE snappy_compression (
id BIGINT,
name VARCHAR(100),
value DOUBLE,
status BOOL
);
load data infile {'filepath'='$resources/parquet/test_snappy.parquet', 'format'='parquet'} into table snappy_compression;
SELECT 'NONE' as codec, COUNT(*), SUM(value) FROM snappy_compression;
drop table if exists gzip_compression;
CREATE TABLE gzip_compression (
id BIGINT,
name VARCHAR(100),
value DOUBLE,
status BOOL
);
load data infile {'filepath'='$resources/parquet/test_gzip.parquet', 'format'='parquet'} into table gzip_compression;
SELECT 'NONE' as codec, COUNT(*), SUM(value) FROM gzip_compression;
drop table if exists lz4_compression;
CREATE TABLE lz4_compression (
id BIGINT,
name VARCHAR(100),
value DOUBLE,
status BOOL
);
load data infile {'filepath'='$resources/parquet/test_lz4.parquet', 'format'='parquet'} into table lz4_compression;
SELECT 'NONE' as codec, COUNT(*), SUM(value) FROM lz4_compression;
drop table if exists zstd_compression;
CREATE TABLE zstd_compression (
id BIGINT,
name VARCHAR(100),
value DOUBLE,
status BOOL
);
load data infile {'filepath'='$resources/parquet/test_zstd.parquet', 'format'='parquet'} into table zstd_compression;
SELECT 'NONE' as codec, COUNT(*), SUM(value) FROM zstd_compression;
drop table if exists brotli_compression;
CREATE TABLE brotli_compression (
id BIGINT,
name VARCHAR(100),
value DOUBLE,
status BOOL
);
load data infile {'filepath'='$resources/parquet/test_brotli.parquet', 'format'='parquet'} into table brotli_compression;
select 'NONE' as codec, count(*), sum(value) from brotli_compression;
-- v1
drop table if exists pq_version_compare;
CREATE TABLE `pq_version_compare` (
`id` bigint DEFAULT NULL,
`user_name` varchar(50) DEFAULT NULL,
`login_time` timestamp NULL DEFAULT NULL
);
load data infile {'filepath'='$resources/parquet/test_v1_legacy.parquet', 'format'='parquet'} into table pq_version_compare;
select * from pq_version_compare;
truncate table pq_version_compare;
-- v2
load data infile {'filepath'='$resources/parquet/test_v2_modern.parquet', 'format'='parquet'} into table pq_version_compare;
select * from pq_version_compare;
-- Column statistics (write_statistics=True/False)
drop table if exists pq_stats_check;
CREATE TABLE pq_stats_check (
id BIGINT,
score DOUBLE
);
load data infile {'filepath'='$resources/parquet/no_stats.parquet', 'format'='parquet'} into table pq_stats_check;
select * from pq_stats_check;
select 'no_stats' as source, count(*), MIN(id), MAX(score) from pq_stats_check;
truncate pq_stats_check;
load data infile {'filepath'='$resources/parquet/with_stats.parquet', 'format'='parquet'} into table pq_stats_check;
select * from pq_stats_check;
select 'no_stats' as source, count(*), MIN(id), MAX(score) from pq_stats_check;
-- empty file
-- parquet file column number equal to table column number, and column name is not same
drop table if exists empty_table;
create table empty_table(col1 bigint, col2 varchar);
load data infile {'filepath'='$resources/parquet/empty_test.parquet', 'format'='parquet'} into table empty_table;
select * from empty_table;
-- parquet file column number equal to table column number, and column name is the same
drop table if exists empty_table01;
create table empty_table01(id bigint, name varchar);
load data infile {'filepath'='$resources/parquet/empty_test.parquet', 'format'='parquet'} into table empty_table01;
select * from empty_table;
-- parquet file's column number is not equal to table's column number
drop table if exists empty_table02;
create table empty_table02(col1 int);
load data infile {'filepath'='$resources/parquet/empty_test.parquet', 'format'='parquet'} into table empty_table02;
select * from empty_table02;
-- wide tables
drop table if exists wide_100_columns;
CREATE TABLE wide_100_columns (
col_0 BIGINT, col_1 DOUBLE, col_2 BIGINT, col_3 DOUBLE, col_4 BIGINT,
col_5 DOUBLE, col_6 BIGINT, col_7 DOUBLE, col_8 BIGINT, col_9 DOUBLE,
col_10 BIGINT, col_11 DOUBLE, col_12 BIGINT, col_13 DOUBLE, col_14 BIGINT,
col_15 DOUBLE, col_16 BIGINT, col_17 DOUBLE, col_18 BIGINT, col_19 DOUBLE,
col_20 BIGINT, col_21 DOUBLE, col_22 BIGINT, col_23 DOUBLE, col_24 BIGINT,
col_25 DOUBLE, col_26 BIGINT, col_27 DOUBLE, col_28 BIGINT, col_29 DOUBLE,
col_30 BIGINT, col_31 DOUBLE, col_32 BIGINT, col_33 DOUBLE, col_34 BIGINT,
col_35 DOUBLE, col_36 BIGINT, col_37 DOUBLE, col_38 BIGINT, col_39 DOUBLE,
col_40 BIGINT, col_41 DOUBLE, col_42 BIGINT, col_43 DOUBLE, col_44 BIGINT,
col_45 DOUBLE, col_46 BIGINT, col_47 DOUBLE, col_48 BIGINT, col_49 DOUBLE,
col_50 BIGINT, col_51 DOUBLE, col_52 BIGINT, col_53 DOUBLE, col_54 BIGINT,
col_55 DOUBLE, col_56 BIGINT, col_57 DOUBLE, col_58 BIGINT, col_59 DOUBLE,
col_60 BIGINT, col_61 DOUBLE, col_62 BIGINT, col_63 DOUBLE, col_64 BIGINT,
col_65 DOUBLE, col_66 BIGINT, col_67 DOUBLE, col_68 BIGINT, col_69 DOUBLE,
col_70 BIGINT, col_71 DOUBLE, col_72 BIGINT, col_73 DOUBLE, col_74 BIGINT,
col_75 DOUBLE, col_76 BIGINT, col_77 DOUBLE, col_78 BIGINT, col_79 DOUBLE,
col_80 BIGINT, col_81 DOUBLE, col_82 BIGINT, col_83 DOUBLE, col_84 BIGINT,
col_85 DOUBLE, col_86 BIGINT, col_87 DOUBLE, col_88 BIGINT, col_89 DOUBLE,
col_90 BIGINT, col_91 DOUBLE, col_92 BIGINT, col_93 DOUBLE, col_94 BIGINT,
col_95 DOUBLE, col_96 BIGINT, col_97 DOUBLE, col_98 BIGINT, col_99 DOUBLE
);
load data infile {'filepath'='$resources/parquet/wide_table_100.parquet', 'format'='parquet'} into table wide_100_columns;
select count(*) from wide_100_columns;
select max(col_99) from wide_100_columns;
select col_0, col_50, col_99 from wide_100_columns LIMIT 5;
drop table if exists pq_supported_types;
CREATE TABLE pq_supported_types (
int32_col INT,
int64_col BIGINT,
uint32_col INT UNSIGNED,
float32_col FLOAT,
float64_col DOUBLE,
string_col VARCHAR(255),
bool_col BOOL,
date_col DATE,
time_col TIME,
ts_col TIMESTAMP
);
load data infile {'filepath'='$resources/parquet/supported_types.parquet', 'format'='parquet'} into table pq_supported_types;
select int32_col,int64_col,uint32_col,float32_col,float64_col,string_col,bool_col,date_col,time_col from pq_supported_types;
select count(*) from pq_supported_types;
-- new types
drop table if exists pq_new_types;
create table pq_new_types (
col_int8 TINYINT,
col_int16 SMALLINT,
col_large_string TEXT,
col_ts_no_tz DATETIME,
col_binary BLOB,
col_decimal DECIMAL(18, 3)
);
load data infile {'filepath'='$resources/parquet/new_supported_types.parquet', 'format'='parquet'} into table pq_new_types;
select count(*) from pq_new_types;
select * from pq_new_types;
select length(col_binary) from pq_new_types;
select col_ts_no_tz from pq_new_types;
-- int8->int8,int16->int16,binary,decimal->decimal
drop table if exists subtask_test;
create table subtask_test (
c_int8 TINYINT,
c_int16 SMALLINT,
c_binary BLOB,
c_decimal DECIMAL(12, 4),
c_large_str TEXT
);
load data infile {'filepath'='$resources/parquet/subtask_22691_test.parquet', 'format'='parquet'} into table subtask_test;
select * from subtask_test;
-- decimal
drop table if exists pq_decimal_boundary;
create table pq_decimal_boundary (
dec_p9_s2 DECIMAL(9, 2),
dec_p18_s9 DECIMAL(18, 9),
dec_p38_s0 DECIMAL(38, 0),
dec_p38_s38 DECIMAL(38, 38)
);
load data infile {'filepath'='$resources/parquet/decimal_boundary_test.parquet', 'format'='parquet'} into table pq_decimal_boundary;
select * from pq_decimal_boundary;
select dec_p38_s0 from pq_decimal_boundary WHERE dec_p38_s0 > 0;
select dec_p38_s38 from pq_decimal_boundary WHERE dec_p38_s38 > 0;
select dec_p9_s2 * 1.1 from pq_decimal_boundary LIMIT 1;
-- binary->binary/blob
drop table if exists pq_binary_test;
create table pq_binary_test (
bin_fixed BINARY(10),
bin_var VARBINARY(255),
bin_blob BLOB,
bin_empty VARBINARY(10)
);
load data infile {'filepath'='$resources/parquet/binary_mapping_test.parquet', 'format'='parquet'} into table pq_binary_test;
select
hex(bin_fixed) as fixed_hex,
hex(bin_var) as var_hex,
hex(bin_blob) as blob_hex
from pq_binary_test;
select length(bin_fixed) from pq_binary_test;
select count(*) from pq_binary_test where bin_blob is null;
select length(bin_empty) from pq_binary_test limit 1;
-- string to int
drop table if exists parquet_01;
CREATE TABLE `parquet_01` (
`sepal.length` double DEFAULT NULL,
`sepal.width` double DEFAULT NULL,
`petal.length` double DEFAULT NULL,
`petal.width` double DEFAULT NULL,
`variety` varchar(20) DEFAULT NULL
);
load data infile {'filepath'='$resources/parquet/Iris.parquet', 'format'='parquet'} into table parquet_01;
select count(*) from parquet_01;
drop table if exists parquet_06;
create table parquet_06(PassengerId bigint,Survived bigint, Pclass bigint,Name varchar(50),Sex varchar(50),Age double,SibSp bigint,Parch bigint,Ticket varchar(50),Fare double,Cabin varchar(50),Embarked varchar(50));
load data infile {'filepath'='$resources/parquet/Titanic.parquet','format'='parquet'} into table parquet_06;
select count(*) from parquet_06;
-- post
drop database parq;