diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..036deed2a --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "df-autohint-runner/dsb"] + path = df-autohint-runner/dsb + url = https://github.com/microsoft/dsb.git diff --git a/autohint-implementations/df-execution-impl/src/visitor.rs b/autohint-implementations/df-execution-impl/src/visitor.rs index 8c517239c..f1275815c 100644 --- a/autohint-implementations/df-execution-impl/src/visitor.rs +++ b/autohint-implementations/df-execution-impl/src/visitor.rs @@ -4,10 +4,10 @@ use hint_engine::{JoinAlgorithm, LeafNode, PlanNodeMetadata}; use datafusion::arrow::datatypes::Schema; use datafusion::execution::context::SessionContext; -use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::joins::{HashJoinExec, NestedLoopJoinExec}; use datafusion::physical_plan::test::exec::MockExec; -use datafusion::physical_plan::{ExecutionPlanVisitor, accept}; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_plan::{accept, ExecutionPlanVisitor}; use std::collections::HashSet; use std::sync::Arc; @@ -139,8 +139,15 @@ impl ExecutionPlanVisitor for DFExecutionPlanVisitor { let file_name = plan .as_any() .downcast_ref::() - .unwrap() - .data_source() + .unwrap(); + + // NOTE: Here the data_source is a MemorySourceConfig instead of FileScanConfig + let file_name = file_name.data_source(); + println!("\nfile_name"); + println!("{:?}", file_name); + + // NOTE: This unwrap() panicked + let file_name = file_name .as_any() .downcast_ref::() .unwrap() diff --git a/df-autohint-runner/.gitignore b/df-autohint-runner/.gitignore index 9f4046c32..1207b4541 100644 --- a/df-autohint-runner/.gitignore +++ b/df-autohint-runner/.gitignore @@ -1,8 +1,12 @@ **target/ **data/ +output output/tpch/ output/job/ +output/dsb/ input/job/queries +input/dsb/queries **Cargo.lock +**__pycache__ *.rs.bk diff --git a/df-autohint-runner/README.md b/df-autohint-runner/README.md index 874980aa0..7e1c22810 100644 --- a/df-autohint-runner/README.md +++ b/df-autohint-runner/README.md @@ -17,7 +17,7 @@ export PGPASSWORD= TPC-H tables and queries will be generated under `input/tpch/data` and `input/tpch/queries`. TPC-H tables and schema will be loaded into Postgres `tpch` database. -## SQL to Datafusion to Hint Runner +### SQL to Datafusion to Hint Runner run: ``` ./scripts/tpch-df-logical-plan.sh @@ -28,7 +28,7 @@ This will load the schema and query into datafusion, run the hint engine over th The original SQL with dialect translation and produced SQL with hints will be dumped into `output/tpch/original-queries` and `output/tpch/{logical, logical-optimized, physical-optimized}-queries` directories. -## SQL with Hints Postgres Runner +### SQL with Hints Postgres Runner run: ``` export PGPASSWORD= @@ -53,7 +53,7 @@ export PGPASSWORD= job-fkindex.sh ``` -## SQL to Datafusion to Hint Runner +### SQL to Datafusion to Hint Runner run: ``` ./scripts/job-df-logical-plan.sh @@ -64,7 +64,7 @@ This will load the schema and query into datafusion, run the hint engine over th The original SQL with dialect translation and produced SQL with hints will be dumped into `output/job/original-queries` and `output/job/{logical, logical-optimized, physical-optimized}-queries` directories. -## SQL with Hints Postgres Runner +### SQL with Hints Postgres Runner run: ``` export PGPASSWORD= @@ -74,6 +74,65 @@ This will run explain analyze or explain on the original queries from `output/jo and optimized queries from `output/job/logical-queries`, `output/job/logical-optimized-queries`, and `output/job/physical-optimized-queries` and dump their runtime performance and explain analyze plans or just explain plans into `output/job/postgres-results-{unhinted, logical, logical-optimized, physical-optimized}` respectively. Please notice that any other processes containing name 'postgres', 'pgbench', or 'psql' could be killed during benchmark run, and postgres server will be restarted. + +## DSB Setup +To init submodule `https://github.com/microsoft/dsb`, run +``` +git submodule init +``` + +Then to build the `dsdgen` binary, edit `dsb/code/tools/Makefile.suite` `OS = ...` field if needed, then +``` +cd dsb/code/tools +make +cd - +``` +should build the `dsdgen` binary in the `dsb/code/tools` dir. + +Then run the data generation and load python scripts, first install `psycopg2-binary` +``` +pip3 install psycopg2-binary +``` +Then run +``` +python3 ./scripts/dsb-gen.py +python3 ./scripts/dsb-load.py +``` +This should load dsb data into Postgres `dsb` database. + +To create the reference indexes for DSB, run +``` +python3 ./scripts/dsb-index.py +``` + +To generate the workload queries, adjust the config json `./scripts/dsb_workload_config.json` +and run +``` +./scripts/dsb-workload-gen.sh +``` +This should generate all queries under `input/dsb/queries` + +### SQL to Datafusion to Hint Runner +run: +``` +./scripts/dsb-df-logical-plan.sh +./scripts/dsb-df-logical-optimized-plan.sh +./scripts/dsb-df-execution-plan.sh +``` +This will load the schema and query into datafusion, run the hint engine over the unoptimized logical plan, optimized logical plan, and optimized physical plan respectively, and dump the hinted SQL for postgres. +The original SQL with dialect translation and produced SQL with hints will be dumped into +`output/dsb/original-queries` and `output/dsb/{logical, logical-optimized, physical-optimized}-queries` directories. + +### SQL with Hints Postgres Runner +run: +``` +export PGPASSWORD= +./scripts/dsb-run-all.sh --single-core|--multi-core [--timeout-second 10] [--explain-only] +``` +This will run explain analyze or explain on the original queries from `output/job/original-queries` +and optimized queries from `output/dsb/logical-queries`, `output/dsb/logical-optimized-queries`, and `output/dsb/physical-optimized-queries` and dump their runtime performance and explain analyze plans or just explain plans into `output/dsb/postgres-results-{unhinted, logical, logical-optimized, physical-optimized}` respectively. +Please notice that any other processes containing name 'postgres', 'pgbench', or 'psql' could be killed during benchmark run, and postgres server will be restarted. + --- ## Complete Benchmark Analysis Pipeline (`run_benchmark_analysis.sh`) diff --git a/df-autohint-runner/datafusion-logical-runner/src/lib.rs b/df-autohint-runner/datafusion-logical-runner/src/lib.rs index 62d0d967a..75fa7992e 100644 --- a/df-autohint-runner/datafusion-logical-runner/src/lib.rs +++ b/df-autohint-runner/datafusion-logical-runner/src/lib.rs @@ -17,6 +17,7 @@ use std::{fs, path::Path, sync::Arc}; pub enum Benchmark { TPCH, JOB, + DSB, } fn observer(_plan: &LogicalPlan, _rule: &dyn OptimizerRule) { @@ -122,6 +123,9 @@ pub async fn setup_database( Benchmark::JOB => { run_csv_ddl_file(&ctx, &schema_dir.as_ref().join("df-schema.sql"), data_dir).await?; } + Benchmark::DSB => { + run_csv_ddl_file(&ctx, &schema_dir.as_ref().join("df-schema.sql"), data_dir).await?; + } } Ok(ctx) } diff --git a/df-autohint-runner/datafusion-logical-runner/src/main.rs b/df-autohint-runner/datafusion-logical-runner/src/main.rs index e437c4ed5..c42ab73de 100644 --- a/df-autohint-runner/datafusion-logical-runner/src/main.rs +++ b/df-autohint-runner/datafusion-logical-runner/src/main.rs @@ -4,7 +4,7 @@ use datafusion::sql::sqlparser::dialect::PostgreSqlDialect; use indicatif::{ProgressBar, ProgressStyle}; use std::{fs, path::PathBuf}; -use datafusion_logical_runner::{Benchmark, df_sql_to_logical, is_sql_file, plan, setup_database}; +use datafusion_logical_runner::{df_sql_to_logical, is_sql_file, plan, setup_database, Benchmark}; use df_execution_impl::DFExecutionPlanVisitor; use std::sync::Arc; @@ -13,8 +13,8 @@ use datafusion::physical_plan::displayable; use df_logical_impl::DFLogicalPlanVisitor; use hint_engine::{engine::Engine, engine::HintEngineConfig, hints::PGHintType}; -use tracing::{error, info}; -use tracing_subscriber::{EnvFilter, fmt}; +use tracing::{debug, error, info, warn}; +use tracing_subscriber::{fmt, EnvFilter}; #[derive(Parser, Debug)] #[command(name = "df-autohint-runner", version, about = "Run tools")] @@ -48,6 +48,7 @@ async fn main() -> Result<()> { let benchmark = match args.benchmark.to_lowercase().as_str() { "job" => Benchmark::JOB, "tpch" => Benchmark::TPCH, + "dsb" => Benchmark::DSB, other => panic!("Unknown benchmark: {other}"), }; diff --git a/df-autohint-runner/dsb b/df-autohint-runner/dsb new file mode 160000 index 000000000..ec9a156ce --- /dev/null +++ b/df-autohint-runner/dsb @@ -0,0 +1 @@ +Subproject commit ec9a156ceee923db1114cbe388f6183b53d49787 diff --git a/df-autohint-runner/input/dsb/schema/df-schema.sql b/df-autohint-runner/input/dsb/schema/df-schema.sql new file mode 100644 index 000000000..9131205a7 --- /dev/null +++ b/df-autohint-runner/input/dsb/schema/df-schema.sql @@ -0,0 +1,588 @@ +-- +-- Legal Notice +-- +-- This document and associated source code (the "Work") is a part of a +-- benchmark specification maintained by the TPC. +-- +-- The TPC reserves all right, title, and interest to the Work as provided +-- under U.S. and international laws, including without limitation all patent +-- and trademark rights therein. +-- +-- No Warranty +-- +-- 1.1 TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THE INFORMATION +-- CONTAINED HEREIN IS PROVIDED "AS IS" AND WITH ALL FAULTS, AND THE +-- AUTHORS AND DEVELOPERS OF THE WORK HEREBY DISCLAIM ALL OTHER +-- WARRANTIES AND CONDITIONS, EITHER EXPRESS, IMPLIED OR STATUTORY, +-- INCLUDING, BUT NOT LIMITED TO, ANY (IF ANY) IMPLIED WARRANTIES, +-- DUTIES OR CONDITIONS OF MERCHANTABILITY, OF FITNESS FOR A PARTICULAR +-- PURPOSE, OF ACCURACY OR COMPLETENESS OF RESPONSES, OF RESULTS, OF +-- WORKMANLIKE EFFORT, OF LACK OF VIRUSES, AND OF LACK OF NEGLIGENCE. +-- ALSO, THERE IS NO WARRANTY OR CONDITION OF TITLE, QUIET ENJOYMENT, +-- QUIET POSSESSION, CORRESPONDENCE TO DESCRIPTION OR NON-INFRINGEMENT +-- WITH REGARD TO THE WORK. +-- 1.2 IN NO EVENT WILL ANY AUTHOR OR DEVELOPER OF THE WORK BE LIABLE TO +-- ANY OTHER PARTY FOR ANY DAMAGES, INCLUDING BUT NOT LIMITED TO THE +-- COST OF PROCURING SUBSTITUTE GOODS OR SERVICES, LOST PROFITS, LOSS +-- OF USE, LOSS OF DATA, OR ANY INCIDENTAL, CONSEQUENTIAL, DIRECT, +-- INDIRECT, OR SPECIAL DAMAGES WHETHER UNDER CONTRACT, TORT, WARRANTY, +-- OR OTHERWISE, ARISING IN ANY WAY OUT OF THIS OR ANY OTHER AGREEMENT +-- RELATING TO THE WORK, WHETHER OR NOT SUCH AUTHOR OR DEVELOPER HAD +-- ADVANCE NOTICE OF THE POSSIBILITY OF SUCH DAMAGES. +-- +-- Contributors: +-- Gradient Systems +-- +create table dbgen_version +( + dv_version varchar(16) , + dv_create_date date , + dv_create_time time , + dv_cmdline_args varchar(200) +); + +create table customer_address +( + ca_address_sk integer not null, + ca_address_id char(16) not null, + ca_street_number char(10) , + ca_street_name varchar(60) , + ca_street_type char(15) , + ca_suite_number char(10) , + ca_city varchar(60) , + ca_county varchar(30) , + ca_state char(2) , + ca_zip char(10) , + ca_country varchar(20) , + ca_gmt_offset decimal(5,2) , + ca_location_type char(20) , + primary key (ca_address_sk) +); + +create table customer_demographics +( + cd_demo_sk integer not null, + cd_gender char(1) , + cd_marital_status char(1) , + cd_education_status char(20) , + cd_purchase_estimate integer , + cd_credit_rating char(10) , + cd_dep_count integer , + cd_dep_employed_count integer , + cd_dep_college_count integer , + primary key (cd_demo_sk) +); + +create table date_dim +( + d_date_sk integer not null, + d_date_id char(16) not null, + d_date date , + d_month_seq integer , + d_week_seq integer , + d_quarter_seq integer , + d_year integer , + d_dow integer , + d_moy integer , + d_dom integer , + d_qoy integer , + d_fy_year integer , + d_fy_quarter_seq integer , + d_fy_week_seq integer , + d_day_name char(9) , + d_quarter_name char(6) , + d_holiday char(1) , + d_weekend char(1) , + d_following_holiday char(1) , + d_first_dom integer , + d_last_dom integer , + d_same_day_ly integer , + d_same_day_lq integer , + d_current_day char(1) , + d_current_week char(1) , + d_current_month char(1) , + d_current_quarter char(1) , + d_current_year char(1) , + primary key (d_date_sk) +); + +create table warehouse +( + w_warehouse_sk integer not null, + w_warehouse_id char(16) not null, + w_warehouse_name varchar(20) , + w_warehouse_sq_ft integer , + w_street_number char(10) , + w_street_name varchar(60) , + w_street_type char(15) , + w_suite_number char(10) , + w_city varchar(60) , + w_county varchar(30) , + w_state char(2) , + w_zip char(10) , + w_country varchar(20) , + w_gmt_offset decimal(5,2) , + primary key (w_warehouse_sk) +); + +create table ship_mode +( + sm_ship_mode_sk integer not null, + sm_ship_mode_id char(16) not null, + sm_type char(30) , + sm_code char(10) , + sm_carrier char(20) , + sm_contract char(20) , + primary key (sm_ship_mode_sk) +); + +create table time_dim +( + t_time_sk integer not null, + t_time_id char(16) not null, + t_time integer , + t_hour integer , + t_minute integer , + t_second integer , + t_am_pm char(2) , + t_shift char(20) , + t_sub_shift char(20) , + t_meal_time char(20) , + primary key (t_time_sk) +); + +create table reason +( + r_reason_sk integer not null, + r_reason_id char(16) not null, + r_reason_desc char(100) , + primary key (r_reason_sk) +); + +create table income_band +( + ib_income_band_sk integer not null, + ib_lower_bound integer , + ib_upper_bound integer , + primary key (ib_income_band_sk) +); + +create table item +( + i_item_sk integer not null, + i_item_id char(16) not null, + i_rec_start_date date , + i_rec_end_date date , + i_item_desc varchar(200) , + i_current_price decimal(7,2) , + i_wholesale_cost decimal(7,2) , + i_brand_id integer , + i_brand char(50) , + i_class_id integer , + i_class char(50) , + i_category_id integer , + i_category char(50) , + i_manufact_id integer , + i_manufact char(50) , + i_size char(20) , + i_formulation char(20) , + i_color char(20) , + i_units char(10) , + i_container char(10) , + i_manager_id integer , + i_product_name char(50) , + primary key (i_item_sk) +); + +create table store +( + s_store_sk integer not null, + s_store_id char(16) not null, + s_rec_start_date date , + s_rec_end_date date , + s_closed_date_sk integer , + s_store_name varchar(50) , + s_number_employees integer , + s_floor_space integer , + s_hours char(20) , + s_manager varchar(40) , + s_market_id integer , + s_geography_class varchar(100) , + s_market_desc varchar(100) , + s_market_manager varchar(40) , + s_division_id integer , + s_division_name varchar(50) , + s_company_id integer , + s_company_name varchar(50) , + s_street_number varchar(10) , + s_street_name varchar(60) , + s_street_type char(15) , + s_suite_number char(10) , + s_city varchar(60) , + s_county varchar(30) , + s_state char(2) , + s_zip char(10) , + s_country varchar(20) , + s_gmt_offset decimal(5,2) , + s_tax_precentage decimal(5,2) , + primary key (s_store_sk) +); + +create table call_center +( + cc_call_center_sk integer not null, + cc_call_center_id char(16) not null, + cc_rec_start_date date , + cc_rec_end_date date , + cc_closed_date_sk integer , + cc_open_date_sk integer , + cc_name varchar(50) , + cc_class varchar(50) , + cc_employees integer , + cc_sq_ft integer , + cc_hours char(20) , + cc_manager varchar(40) , + cc_mkt_id integer , + cc_mkt_class char(50) , + cc_mkt_desc varchar(100) , + cc_market_manager varchar(40) , + cc_division integer , + cc_division_name varchar(50) , + cc_company integer , + cc_company_name char(50) , + cc_street_number char(10) , + cc_street_name varchar(60) , + cc_street_type char(15) , + cc_suite_number char(10) , + cc_city varchar(60) , + cc_county varchar(30) , + cc_state char(2) , + cc_zip char(10) , + cc_country varchar(20) , + cc_gmt_offset decimal(5,2) , + cc_tax_percentage decimal(5,2) , + primary key (cc_call_center_sk) +); + +create table customer +( + c_customer_sk integer not null, + c_customer_id char(16) not null, + c_current_cdemo_sk integer , + c_current_hdemo_sk integer , + c_current_addr_sk integer , + c_first_shipto_date_sk integer , + c_first_sales_date_sk integer , + c_salutation char(10) , + c_first_name char(20) , + c_last_name char(30) , + c_preferred_cust_flag char(1) , + c_birth_day integer , + c_birth_month integer , + c_birth_year integer , + c_birth_country varchar(20) , + c_login char(13) , + c_email_address char(50) , + c_last_review_date_sk integer , + primary key (c_customer_sk) +); + +create table web_site +( + web_site_sk integer not null, + web_site_id char(16) not null, + web_rec_start_date date , + web_rec_end_date date , + web_name varchar(50) , + web_open_date_sk integer , + web_close_date_sk integer , + web_class varchar(50) , + web_manager varchar(40) , + web_mkt_id integer , + web_mkt_class varchar(50) , + web_mkt_desc varchar(100) , + web_market_manager varchar(40) , + web_company_id integer , + web_company_name char(50) , + web_street_number char(10) , + web_street_name varchar(60) , + web_street_type char(15) , + web_suite_number char(10) , + web_city varchar(60) , + web_county varchar(30) , + web_state char(2) , + web_zip char(10) , + web_country varchar(20) , + web_gmt_offset decimal(5,2) , + web_tax_percentage decimal(5,2) , + primary key (web_site_sk) +); + +create table store_returns +( + sr_returned_date_sk integer , + sr_return_time_sk integer , + sr_item_sk integer not null, + sr_customer_sk integer , + sr_cdemo_sk integer , + sr_hdemo_sk integer , + sr_addr_sk integer , + sr_store_sk integer , + sr_reason_sk integer , + sr_ticket_number integer not null, + sr_return_quantity integer , + sr_return_amt decimal(7,2) , + sr_return_tax decimal(7,2) , + sr_return_amt_inc_tax decimal(7,2) , + sr_fee decimal(7,2) , + sr_return_ship_cost decimal(7,2) , + sr_refunded_cash decimal(7,2) , + sr_reversed_charge decimal(7,2) , + sr_store_credit decimal(7,2) , + sr_net_loss decimal(7,2) , + primary key (sr_item_sk, sr_ticket_number) +); + +create table household_demographics +( + hd_demo_sk integer not null, + hd_income_band_sk integer , + hd_buy_potential char(15) , + hd_dep_count integer , + hd_vehicle_count integer , + primary key (hd_demo_sk) +); + +create table web_page +( + wp_web_page_sk integer not null, + wp_web_page_id char(16) not null, + wp_rec_start_date date , + wp_rec_end_date date , + wp_creation_date_sk integer , + wp_access_date_sk integer , + wp_autogen_flag char(1) , + wp_customer_sk integer , + wp_url varchar(100) , + wp_type char(50) , + wp_char_count integer , + wp_link_count integer , + wp_image_count integer , + wp_max_ad_count integer , + primary key (wp_web_page_sk) +); + +create table promotion +( + p_promo_sk integer not null, + p_promo_id char(16) not null, + p_start_date_sk integer , + p_end_date_sk integer , + p_item_sk integer , + p_cost decimal(15,2) , + p_response_target integer , + p_promo_name char(50) , + p_channel_dmail char(1) , + p_channel_email char(1) , + p_channel_catalog char(1) , + p_channel_tv char(1) , + p_channel_radio char(1) , + p_channel_press char(1) , + p_channel_event char(1) , + p_channel_demo char(1) , + p_channel_details varchar(100) , + p_purpose char(15) , + p_discount_active char(1) , + primary key (p_promo_sk) +); + +create table catalog_page +( + cp_catalog_page_sk integer not null, + cp_catalog_page_id char(16) not null, + cp_start_date_sk integer , + cp_end_date_sk integer , + cp_department varchar(50) , + cp_catalog_number integer , + cp_catalog_page_number integer , + cp_description varchar(100) , + cp_type varchar(100) , + primary key (cp_catalog_page_sk) +); + +create table inventory +( + inv_date_sk integer not null, + inv_item_sk integer not null, + inv_warehouse_sk integer not null, + inv_quantity_on_hand integer , + primary key (inv_date_sk, inv_item_sk, inv_warehouse_sk) +); + +create table catalog_returns +( + cr_returned_date_sk integer , + cr_returned_time_sk integer , + cr_item_sk integer not null, + cr_refunded_customer_sk integer , + cr_refunded_cdemo_sk integer , + cr_refunded_hdemo_sk integer , + cr_refunded_addr_sk integer , + cr_returning_customer_sk integer , + cr_returning_cdemo_sk integer , + cr_returning_hdemo_sk integer , + cr_returning_addr_sk integer , + cr_call_center_sk integer , + cr_catalog_page_sk integer , + cr_ship_mode_sk integer , + cr_warehouse_sk integer , + cr_reason_sk integer , + cr_order_number integer not null, + cr_return_quantity integer , + cr_return_amount decimal(7,2) , + cr_return_tax decimal(7,2) , + cr_return_amt_inc_tax decimal(7,2) , + cr_fee decimal(7,2) , + cr_return_ship_cost decimal(7,2) , + cr_refunded_cash decimal(7,2) , + cr_reversed_charge decimal(7,2) , + cr_store_credit decimal(7,2) , + cr_net_loss decimal(7,2) , + primary key (cr_item_sk, cr_order_number) +); + +create table web_returns +( + wr_returned_date_sk integer , + wr_returned_time_sk integer , + wr_item_sk integer not null, + wr_refunded_customer_sk integer , + wr_refunded_cdemo_sk integer , + wr_refunded_hdemo_sk integer , + wr_refunded_addr_sk integer , + wr_returning_customer_sk integer , + wr_returning_cdemo_sk integer , + wr_returning_hdemo_sk integer , + wr_returning_addr_sk integer , + wr_web_page_sk integer , + wr_reason_sk integer , + wr_order_number integer not null, + wr_return_quantity integer , + wr_return_amt decimal(7,2) , + wr_return_tax decimal(7,2) , + wr_return_amt_inc_tax decimal(7,2) , + wr_fee decimal(7,2) , + wr_return_ship_cost decimal(7,2) , + wr_refunded_cash decimal(7,2) , + wr_reversed_charge decimal(7,2) , + wr_account_credit decimal(7,2) , + wr_net_loss decimal(7,2) , + primary key (wr_item_sk, wr_order_number) +); + +create table web_sales +( + ws_sold_date_sk integer , + ws_sold_time_sk integer , + ws_ship_date_sk integer , + ws_item_sk integer not null, + ws_bill_customer_sk integer , + ws_bill_cdemo_sk integer , + ws_bill_hdemo_sk integer , + ws_bill_addr_sk integer , + ws_ship_customer_sk integer , + ws_ship_cdemo_sk integer , + ws_ship_hdemo_sk integer , + ws_ship_addr_sk integer , + ws_web_page_sk integer , + ws_web_site_sk integer , + ws_ship_mode_sk integer , + ws_warehouse_sk integer , + ws_promo_sk integer , + ws_order_number integer not null, + ws_quantity integer , + ws_wholesale_cost decimal(7,2) , + ws_list_price decimal(7,2) , + ws_sales_price decimal(7,2) , + ws_ext_discount_amt decimal(7,2) , + ws_ext_sales_price decimal(7,2) , + ws_ext_wholesale_cost decimal(7,2) , + ws_ext_list_price decimal(7,2) , + ws_ext_tax decimal(7,2) , + ws_coupon_amt decimal(7,2) , + ws_ext_ship_cost decimal(7,2) , + ws_net_paid decimal(7,2) , + ws_net_paid_inc_tax decimal(7,2) , + ws_net_paid_inc_ship decimal(7,2) , + ws_net_paid_inc_ship_tax decimal(7,2) , + ws_net_profit decimal(7,2) , + primary key (ws_item_sk, ws_order_number) +); + +create table catalog_sales +( + cs_sold_date_sk integer , + cs_sold_time_sk integer , + cs_ship_date_sk integer , + cs_bill_customer_sk integer , + cs_bill_cdemo_sk integer , + cs_bill_hdemo_sk integer , + cs_bill_addr_sk integer , + cs_ship_customer_sk integer , + cs_ship_cdemo_sk integer , + cs_ship_hdemo_sk integer , + cs_ship_addr_sk integer , + cs_call_center_sk integer , + cs_catalog_page_sk integer , + cs_ship_mode_sk integer , + cs_warehouse_sk integer , + cs_item_sk integer not null, + cs_promo_sk integer , + cs_order_number integer not null, + cs_quantity integer , + cs_wholesale_cost decimal(7,2) , + cs_list_price decimal(7,2) , + cs_sales_price decimal(7,2) , + cs_ext_discount_amt decimal(7,2) , + cs_ext_sales_price decimal(7,2) , + cs_ext_wholesale_cost decimal(7,2) , + cs_ext_list_price decimal(7,2) , + cs_ext_tax decimal(7,2) , + cs_coupon_amt decimal(7,2) , + cs_ext_ship_cost decimal(7,2) , + cs_net_paid decimal(7,2) , + cs_net_paid_inc_tax decimal(7,2) , + cs_net_paid_inc_ship decimal(7,2) , + cs_net_paid_inc_ship_tax decimal(7,2) , + cs_net_profit decimal(7,2) , + primary key (cs_item_sk, cs_order_number) +); + +create table store_sales +( + ss_sold_date_sk integer , + ss_sold_time_sk integer , + ss_item_sk integer not null, + ss_customer_sk integer , + ss_cdemo_sk integer , + ss_hdemo_sk integer , + ss_addr_sk integer , + ss_store_sk integer , + ss_promo_sk integer , + ss_ticket_number integer not null, + ss_quantity integer , + ss_wholesale_cost decimal(7,2) , + ss_list_price decimal(7,2) , + ss_sales_price decimal(7,2) , + ss_ext_discount_amt decimal(7,2) , + ss_ext_sales_price decimal(7,2) , + ss_ext_wholesale_cost decimal(7,2) , + ss_ext_list_price decimal(7,2) , + ss_ext_tax decimal(7,2) , + ss_coupon_amt decimal(7,2) , + ss_net_paid decimal(7,2) , + ss_net_paid_inc_tax decimal(7,2) , + ss_net_profit decimal(7,2) , + primary key (ss_item_sk, ss_ticket_number) +); + diff --git a/df-autohint-runner/scripts/dsb-df-execution-plan.sh b/df-autohint-runner/scripts/dsb-df-execution-plan.sh new file mode 100755 index 000000000..87eeb4fe9 --- /dev/null +++ b/df-autohint-runner/scripts/dsb-df-execution-plan.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +cd datafusion-logical-runner + + +cargo run -- --benchmark dsb --data-dir ../input/dsb/data/ --query-dir ../input/dsb/queries/ --schema-dir ../input/dsb/schema/ --raw-sql-output-dir ../output/dsb/original-queries/ --print-plan --optimized-sql-output-dir ../output/dsb/physical-optimized-queries/ --mode physical-optimized +cd - diff --git a/df-autohint-runner/scripts/dsb-df-logical-optimized-plan.sh b/df-autohint-runner/scripts/dsb-df-logical-optimized-plan.sh new file mode 100755 index 000000000..240f5decf --- /dev/null +++ b/df-autohint-runner/scripts/dsb-df-logical-optimized-plan.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +cd datafusion-logical-runner + + +cargo run -- --benchmark dsb --data-dir ../input/dsb/data/ --query-dir ../input/dsb/queries/ --schema-dir ../input/dsb/schema/ --raw-sql-output-dir ../output/dsb/original-queries/ --print-plan --optimized-sql-output-dir ../output/dsb/logical-optimized-queries/ --mode logical-optimized +cd - diff --git a/df-autohint-runner/scripts/dsb-df-logical-plan.sh b/df-autohint-runner/scripts/dsb-df-logical-plan.sh new file mode 100755 index 000000000..ba7e492cc --- /dev/null +++ b/df-autohint-runner/scripts/dsb-df-logical-plan.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +cd datafusion-logical-runner + +cargo run -- --benchmark dsb --data-dir ../input/dsb/data/ --query-dir ../input/dsb/queries/ --schema-dir ../input/dsb/schema/ --raw-sql-output-dir ../output/dsb/original-queries/ --print-plan --optimized-sql-output-dir ../output/dsb/logical-queries/ --mode logical +cd - diff --git a/df-autohint-runner/scripts/dsb-gen.py b/df-autohint-runner/scripts/dsb-gen.py new file mode 100644 index 000000000..bbeac4027 --- /dev/null +++ b/df-autohint-runner/scripts/dsb-gen.py @@ -0,0 +1,38 @@ +import os +import subprocess +import shutil + +# Path to compiled dsdgen binary (Linux) +# Directory of this script file +base_dir = os.path.dirname(os.path.abspath(__file__)) +bin_dir = '../dsb/code/tools' +bin_dir = os.path.join(base_dir, bin_dir) +bin_dir = os.path.realpath(bin_dir) +bin_path = os.path.join(bin_dir, 'dsdgen') + +# Scale factor (e.g., 10 = 10GB) +scale = 1 + +# Directory where output tables will be generated +file_dir = '../input/dsb/data' +file_dir = os.path.join(base_dir, file_dir) +file_dir = os.path.realpath(file_dir) + +# Clean existing output +if os.path.exists(file_dir): + shutil.rmtree(file_dir) +os.makedirs(file_dir, exist_ok=True) + +# Generate data +cmd = [ + bin_path, + '-scale', str(scale), + '-force', + '-delimiter', '|', + '-dir', file_dir +] + +print("Running:", " ".join(cmd)) +subprocess.run(cmd, cwd=bin_dir, check=True) +print("Data generated in:", file_dir) + diff --git a/df-autohint-runner/scripts/dsb-index.py b/df-autohint-runner/scripts/dsb-index.py new file mode 100644 index 000000000..d27f2d9e2 --- /dev/null +++ b/df-autohint-runner/scripts/dsb-index.py @@ -0,0 +1,25 @@ +import os +import pg_util + +db_name = 'dsb' # database name +base_dir = os.path.dirname(os.path.abspath(__file__)) +sql_path = '../dsb/scripts/dsb_index_pg.sql' +sql_path = os.path.join(base_dir, sql_path) +sql_path = os.path.realpath(sql_path) + +# start database service +pg_util.start_server() + +# postgres credential +user = 'postgres' +password = 'postgres' + +# connect to the database +conn = pg_util.connect(user = user, password = password, db_name = db_name) +cursor = conn.cursor() + +# create indexes +pg_util.execute(cursor, open(sql_path, 'r').read(), verbose = True) + +cursor.close() +conn.close() diff --git a/df-autohint-runner/scripts/dsb-load.py b/df-autohint-runner/scripts/dsb-load.py new file mode 100644 index 000000000..73cd97436 --- /dev/null +++ b/df-autohint-runner/scripts/dsb-load.py @@ -0,0 +1,58 @@ +import os +import sys + +import pg_util + +tables = ['call_center', + 'catalog_page', 'catalog_returns', + 'catalog_sales', + 'customer', 'customer_address', 'customer_demographics', + 'date_dim', 'household_demographics', 'income_band', 'inventory', 'item', 'promotion', 'reason', 'ship_mode', + 'store', 'store_returns', 'store_sales', + 'time_dim', 'warehouse', + 'web_page', 'web_returns', 'web_sales', 'web_site' + ] + +base_dir = os.path.dirname(os.path.abspath(__file__)) +data_path = '../input/dsb/data' +data_path = os.path.join(base_dir, data_path) +data_path = os.path.realpath(data_path) # directory of data files + +db_name = "dsb" +tmp_csv_path = "/tmp/dsb_tmp.csv" # path of tmp csv file for bulk loading + +create_db = True # If create the database +create_table = True # If create the tables + +# start database service +pg_util.start_server() + +# postgres credential +user = 'postgres' +password = 'postgres' + +# create database +if create_db: + master_conn = pg_util.connect(user = user, password = password) + pg_util.execute(master_conn.cursor(), 'create database ' + db_name, verbose = True) + master_conn.close() + +# connect to the database +conn = pg_util.connect(user = user, password = password, db_name = db_name) +cursor = conn.cursor() + +# create tables +if create_table: + sql_path = '../dsb/scripts/create_tables.sql' + sql_path = os.path.join(base_dir, sql_path) + sql_path = os.path.realpath(sql_path) + pg_util.execute(cursor, open(sql_path, 'r').read(), verbose = True) + +# insert tuples into tables +for table in tables: + file_path = os.path.join(data_path, table + '.dat') + pg_util.execute(cursor, 'delete from ' + table + ';', verbose = True) + pg_util.bulk_load_from_csv_file(cursor, file_path, tmp_csv_path, table, delimiter = '|') + +cursor.close() +conn.close() diff --git a/df-autohint-runner/scripts/dsb-run-all.sh b/df-autohint-runner/scripts/dsb-run-all.sh new file mode 100755 index 000000000..417d8c1c4 --- /dev/null +++ b/df-autohint-runner/scripts/dsb-run-all.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Default values +CORE_SETTING="--single-core" +TIMEOUT_SECOND=300 +RUN_SETTING="EXPLAIN ANALYZE" + +# Parse args +while [[ $# -gt 0 ]]; do + case $1 in + --single-core) + CORE_SETTING="--single-core" + shift + ;; + --multi-core) + CORE_SETTING="--multi-core" + shift + ;; + --timeout-second) + if [[ -n "${2:-}" && $2 =~ ^[0-9]+$ ]]; then + TIMEOUT_SECOND="$2" + shift 2 + else + echo "Error: --timeout-second requires an integer argument." + echo "Usage: $0 --single-core|--multi-core [--explain-only] [--timeout-second N]" + exit 1 + fi + ;; + --explain-only) + RUN_SETTING="EXPLAIN" + shift + ;; + *) + echo "Unknown argument: $1" + echo "Usage: $0 --single-core|--multi-core [--explain-only] [--timeout-second N]" + exit 1 + ;; + esac +done + +# Validate core setting +if [[ -z "$CORE_SETTING" ]]; then + echo "Error: Must specify either --single-core or --multi-core" + exit 1 +fi + +echo "Running unhinted version, $CORE_SETTING..." +./scripts/dsb-run.sh "$CORE_SETTING" --timeout-second "$TIMEOUT_SECOND" $([[ "$RUN_SETTING" == "EXPLAIN" ]] && echo "--explain-only") +echo "Done." + +echo "Running hinted version from logical plan, $CORE_SETTING..." +./scripts/dsb-run-hinted.sh --query logical "$CORE_SETTING" --timeout-second "$TIMEOUT_SECOND" $([[ "$RUN_SETTING" == "EXPLAIN" ]] && echo "--explain-only") + +echo "Running hinted version from optimized logical plan, $CORE_SETTING..." +./scripts/dsb-run-hinted.sh --query logical-optimized "$CORE_SETTING" --timeout-second "$TIMEOUT_SECOND" $([[ "$RUN_SETTING" == "EXPLAIN" ]] && echo "--explain-only") + +echo "Running hinted version from optimized physical plan, $CORE_SETTING..." +./scripts/dsb-run-hinted.sh --query physical-optimized "$CORE_SETTING" --timeout-second "$TIMEOUT_SECOND" $([[ "$RUN_SETTING" == "EXPLAIN" ]] && echo "--explain-only") +echo "Done." diff --git a/df-autohint-runner/scripts/dsb-run-hinted.sh b/df-autohint-runner/scripts/dsb-run-hinted.sh new file mode 100755 index 000000000..14d2fe2c9 --- /dev/null +++ b/df-autohint-runner/scripts/dsb-run-hinted.sh @@ -0,0 +1,240 @@ +# ############################################################################## +# +# Pass of the DSB Benchmark over a PostgreSQL instance +# +# ############################################################################## + +#!/bin/bash +ulimit -c unlimited + +RUN_SETTING="EXPLAIN ANALYZE" +CORE_SETTING="" +TIMEOUT_SECOND=30 + +# Parse args +while [[ $# -gt 0 ]]; do + case $1 in + --explain-only) + RUN_SETTING="EXPLAIN" + shift + ;; + --single-core) + CORE_SETTING="single" + shift + ;; + --multi-core) + CORE_SETTING="multi" + shift + ;; + --timeout-second) + if [[ -n "${2:-}" && $2 =~ ^[0-9]+$ ]]; then + TIMEOUT_SECOND="$2" + shift 2 + else + echo "Error: --timeout-second requires an integer argument." + echo "Usage: $0 --query logical|logical-optimized|physical-optimized --single-core|--multi-core [--explain-only] [--timeout-second N]" + exit 1 + fi + ;; + --query) + if [[ -n "${2:-}" ]]; then + case "$2" in + logical|logical-optimized|physical-optimized) + QUERY_MODE="$2" + shift 2 + ;; + *) + echo "Error: --query requires one of: logical, logical-optimized, physical-optimized" + echo "Usage: $0 --query logical|logical-optimized|physical-optimized --single-core|--multi-core [--explain-only] [--timeout-second N]" + exit 1 + ;; + esac + else + echo "Error: --query requires an argument." + echo "Usage: $0 --query logical|logical-optimized|physical-optimized --single-core|--multi-core [--explain-only] [--timeout-second N]" + exit 1 + fi + ;; + *) + shift + ;; + esac +done + +if [[ "$CORE_SETTING" != "single" && "$CORE_SETTING" != "multi" ]]; then + echo "Invalid flag. Use --single-core or --multi-core." + echo "Usage: $0 --query logical|logical-optimized|physical-optimized --single-core|--multi-core [--explain-only] [--timeout-second N]" + exit 1 +fi + +echo "Use $CORE_SETTING-core settings" + +echo "Use query timeout of $TIMEOUT_SECOND seconds" + +# Binaries and data dirs +INSTDIR=`pwd`/tmp_install +QUERY_DIR=output/dsb/$QUERY_MODE-queries +OUTPUT_DIR=output/dsb/postgres-results-$QUERY_MODE +mkdir -p $OUTPUT_DIR + +export PGDATABASE=dsb +export PGPORT=5432 +export PGHOST=localhost +export PGUSER=postgres +export PGPASSWORD=postgres +export PGOPTIONS="-c statement_timeout=$((TIMEOUT_SECOND * 1000))" + +#define environment +export LD_LIBRARY_PATH=$INSTDIR/lib:$LD_LIBRARY_PATH +export PATH=$INSTDIR/bin:$PATH + +# Stop instances and clean logs. +sudo systemctl stop postgresql@17-main + +# Kill all postgres processes +unamestr=`uname` +if [[ "$unamestr" == 'Linux' ]]; then + pkill -U `whoami` -9 -e postgres + pkill -U `whoami` -9 -e pgbench + pkill -U `whoami` -9 -e psql +elif [[ "$OSTYPE" == "darwin"* ]]; then + killall -u `whoami` -vz -9 postgres + killall -u `whoami` -vz -9 pgbench + killall -u `whoami` -vz -9 psql +else + echo "Unintended OS." +fi +sleep 1 + +sudo systemctl start postgresql@17-main +psql -c "DROP EXTENSION IF EXISTS pg_stat_statements" +psql -c "DROP EXTENSION IF EXISTS pg_prewarm" + +# INSTANCE SETTINGS ############################################################ +psql -c "ALTER SYSTEM SET compute_query_id = 'on'" +psql -c "ALTER SYSTEM SET shared_preload_libraries = pg_prewarm,pg_stat_statements,pg_hint_plan" +psql -c "SHOW shared_preload_libraries;" +psql -c "ALTER SYSTEM SET checkpoint_timeout = 86399" +psql -c "ALTER SYSTEM SET fsync = 'off'" + +# Performance & Planning ([un]-comment something before the test, if necessary) +psql -c "ALTER SYSTEM SET from_collapse_limit = 20" +psql -c "ALTER SYSTEM SET join_collapse_limit = 20" + +# These configs will only be effective when multicore setting is used +psql -c "ALTER SYSTEM SET max_worker_processes = 32" +psql -c "ALTER SYSTEM SET parallel_setup_cost = 0.1" +psql -c "ALTER SYSTEM SET parallel_tuple_cost = 0.00001" + +# single / multicore +if [ "$CORE_SETTING" = "single" ]; then + # single core setting: + psql -c "ALTER SYSTEM SET max_parallel_workers_per_gather = 0" +elif [ "$CORE_SETTING" = "single" ]; then + # multi core setting: + psql -c "ALTER SYSTEM SET max_parallel_workers_per_gather = 2" +else + echo "Invalid flag. Use --single-core or --multi-core." + exit 1 +fi + +psql -c "ALTER SYSTEM SET min_parallel_table_scan_size = 0" +psql -c "ALTER SYSTEM SET min_parallel_index_scan_size = 0" +psql -c "ALTER SYSTEM SET max_parallel_workers = 32" +psql -c "ALTER SYSTEM SET effective_cache_size = '32GB'" +psql -c "ALTER SYSTEM SET geqo_threshold=18" +psql -c "ALTER SYSTEM SET shared_buffers='4GB'" +psql -c "ALTER SYSTEM SET work_mem='2GB'" + +# Partitioning +psql -c "ALTER SYSTEM SET enable_partitionwise_join = 'on'" + +# prewarm +psql -c "ALTER SYSTEM SET pg_prewarm.autoprewarm = true" +psql -c "ALTER SYSTEM SET pg_prewarm.autoprewarm_interval = 0" + +#pg_stat_statements +psql -c "ALTER SYSTEM SET pg_stat_statements.max = 50000" +psql -c "ALTER SYSTEM SET pg_stat_statements.track = 'top'" +psql -c "ALTER SYSTEM SET pg_stat_statements.track_utility = 'off'" +psql -c "ALTER SYSTEM SET pg_stat_statements.track_planning = 'off'" +psql -c "ALTER SYSTEM SET pg_stat_statements.save = 'off'" + +# pg_hint_plan +psql -c "ALTER SYSTEM SET pg_hint_plan.enable_hint TO on" +psql -c "ALTER SYSTEM SET pg_hint_plan.enable_hint_table = 'off'" + +# query timeout +psql -c "ALTER SYSTEM SET statement_timeout = 2400000" +# ############################################################################## + +psql -c "SELECT pg_reload_conf();" +sudo systemctl restart postgresql@17-main + +sleep 20 # pg_prewarm should has already done its stuff + +echo "The DSB Benchmark ..." + +psql -c "CREATE EXTENSION pg_stat_statements" +psql -c "SELECT pg_stat_statements_reset()" +psql -c "CREATE EXTENSION pg_prewarm" +psql -c "CREATE EXTENSION pg_hint_plan" +psql -c "SHOW pg_hint_plan.enable_hint_table" +psql -c "SHOW pg_hint_plan.enable_hint" + +# Enable pg_hint_plan debug printing +psql -f scripts/pg-hint-plan-debug.sql + +for i in {1..3} +do + filenum=1 + if [ "$RUN_SETTING" = "EXPLAIN ANALYZE" ]; then + # Write header for runtime output file + echo -e "QueryNumber\tQueryName\tExecutionTime" > $OUTPUT_DIR/dsb_onepass-hinted-$i.dat + # Create postgres explain analyze plan output directory + mkdir -p "$OUTPUT_DIR/explain-analyze-hinted-$i" + # Create pg_hint_plan log output directory + mkdir -p "$OUTPUT_DIR/explain-analyze-hinted-logs-$i" + else + # Create postgres explain plan output directory + mkdir -p "$OUTPUT_DIR/explain-hinted-$i" + # Create pg_hint_plan log output directory + mkdir -p "$OUTPUT_DIR/explain-hinted-logs-$i" + fi + + for file in $QUERY_DIR/*.sql + do + # Disable star expansion + set -f + # Get filename + short_file=$(basename "$file") + test_name=$(basename "$file" .sql) + + if [ "$RUN_SETTING" = "EXPLAIN ANALYZE" ]; then + # Prepare query + echo -n "/* $filenum */ EXPLAIN (ANALYZE, VERBOSE, FORMAT JSON) " > $OUTPUT_DIR/test.sql + cat $file >> $OUTPUT_DIR/test.sql + # Query and save explain analyze output + result=$(psql -f $OUTPUT_DIR/test.sql 2>>$OUTPUT_DIR/explain-analyze-hinted-logs-$i/$test_name-hint.log) + echo -e $result >> $OUTPUT_DIR/explain-analyze-hinted-$i/$test_name.json + # Extract execution time and record + exec_time=$(echo $result | sed -n 's/.*"Execution Time": \([0-9]*\.[0-9]*\).*/\1/p') + echo -e "$filenum\t$short_file\t$exec_time" + echo -e "$filenum\t$short_file\t$exec_time" >> $OUTPUT_DIR/dsb_onepass-hinted-$i.dat + else + # Prepare query + echo -n "/* $filenum */ EXPLAIN (VERBOSE, FORMAT JSON) " > $OUTPUT_DIR/test.sql + cat $file >> $OUTPUT_DIR/test.sql + # Query and save explain output + result=$(psql -f $OUTPUT_DIR/test.sql 2>>$OUTPUT_DIR/explain-hinted-logs-$i/$test_name-hint.log) + echo -e "$filenum\t$short_file" + echo -e $result >> $OUTPUT_DIR/explain-hinted-$i/$test_name.json + fi + + filenum=$((filenum+1)) + set +f + done +done + +# Save buffers usage to do correct pg_prewarm next time +psql -c "SELECT autoprewarm_dump_now();" diff --git a/df-autohint-runner/scripts/dsb-run.sh b/df-autohint-runner/scripts/dsb-run.sh new file mode 100755 index 000000000..f0397d8ba --- /dev/null +++ b/df-autohint-runner/scripts/dsb-run.sh @@ -0,0 +1,215 @@ + +# ############################################################################## +# +# Pass of the DSB Benchmark over a PostgreSQL instance +# +# ############################################################################## + +#!/bin/bash +ulimit -c unlimited + +RUN_SETTING="EXPLAIN ANALYZE" +CORE_SETTING="" +TIMEOUT_SECOND=30 + +# Parse args +while [[ $# -gt 0 ]]; do + case $1 in + --explain-only) + RUN_SETTING="EXPLAIN" + shift + ;; + --single-core) + CORE_SETTING="single" + shift + ;; + --multi-core) + CORE_SETTING="multi" + shift + ;; + --timeout-second) + if [[ -n "${2:-}" && $2 =~ ^[0-9]+$ ]]; then + TIMEOUT_SECOND="$2" + shift 2 + else + echo "Error: --timeout-second requires an integer argument." + echo "Usage: $0 --single-core|--multi-core [--explain-only] [--timeout-second N]" + exit 1 + fi + ;; + *) + shift + ;; + esac +done + +if [[ "$CORE_SETTING" != "single" && "$CORE_SETTING" != "multi" ]]; then + echo "Invalid flag. Use --single-core or --multi-core." + echo "Usage: $0 --single-core|--multi-core [--explain-only] [--timeout-second N]" + exit 1 +fi + +echo "Use $CORE_SETTING-core settings" + +echo "Use query timeout of $TIMEOUT_SECOND seconds" + +# Binaries and data dirs +INSTDIR=`pwd`/tmp_install +QUERY_DIR=output/dsb/original-queries +OUTPUT_DIR=output/dsb/postgres-results-unhinted +mkdir -p $OUTPUT_DIR + +export PGDATABASE=dsb +export PGPORT=5432 +export PGHOST=localhost +export PGUSER=postgres +export PGPASSWORD=postgres +export PGOPTIONS="-c statement_timeout=$((TIMEOUT_SECOND * 1000))" + +#define environment +export LD_LIBRARY_PATH=$INSTDIR/lib:$LD_LIBRARY_PATH +export PATH=$INSTDIR/bin:$PATH + +# Stop instances and clean logs. +sudo systemctl stop postgresql@17-main + +# Kill all postgres processes +unamestr=`uname` +if [[ "$unamestr" == 'Linux' ]]; then + pkill -U `whoami` -9 -e postgres + pkill -U `whoami` -9 -e pgbench + pkill -U `whoami` -9 -e psql +elif [[ "$OSTYPE" == "darwin"* ]]; then + killall -u `whoami` -vz -9 postgres + killall -u `whoami` -vz -9 pgbench + killall -u `whoami` -vz -9 psql +else + echo "Unintended OS." +fi +sleep 1 + +sudo systemctl start postgresql@17-main +psql -c "DROP EXTENSION IF EXISTS pg_stat_statements" +psql -c "DROP EXTENSION IF EXISTS pg_prewarm" + +# INSTANCE SETTINGS ############################################################ +psql -c "ALTER SYSTEM SET compute_query_id = 'on'" +psql -c "ALTER SYSTEM SET shared_preload_libraries = pg_prewarm,pg_stat_statements" +psql -c "SHOW shared_preload_libraries;" +psql -c "ALTER SYSTEM SET checkpoint_timeout = 86399" +psql -c "ALTER SYSTEM SET fsync = 'off'" + +# Performance & Planning ([un]-comment something before the test, if necessary) +psql -c "ALTER SYSTEM SET from_collapse_limit = 20" +psql -c "ALTER SYSTEM SET join_collapse_limit = 20" + +# These configs will only be effective when multicore setting is used +psql -c "ALTER SYSTEM SET max_worker_processes = 32" +psql -c "ALTER SYSTEM SET parallel_setup_cost = 0.1" +psql -c "ALTER SYSTEM SET parallel_tuple_cost = 0.00001" + +# single / multicore +if [ "$CORE_SETTING" = "single" ]; then + # single core setting: + psql -c "ALTER SYSTEM SET max_parallel_workers_per_gather = 0" +elif [ "$CORE_SETTING" = "single" ]; then + # multi core setting: + psql -c "ALTER SYSTEM SET max_parallel_workers_per_gather = 2" +else + echo "Invalid flag. Use --single-core or --multi-core." + exit 1 +fi + +psql -c "ALTER SYSTEM SET min_parallel_table_scan_size = 0" +psql -c "ALTER SYSTEM SET min_parallel_index_scan_size = 0" +psql -c "ALTER SYSTEM SET max_parallel_workers = 32" +psql -c "ALTER SYSTEM SET effective_cache_size = '32GB'" +psql -c "ALTER SYSTEM SET geqo_threshold=18" +psql -c "ALTER SYSTEM SET shared_buffers='4GB'" +psql -c "ALTER SYSTEM SET work_mem='2GB'" + +# Partitioning +psql -c "ALTER SYSTEM SET enable_partitionwise_join = 'on'" + +# prewarm +psql -c "ALTER SYSTEM SET pg_prewarm.autoprewarm = true" +psql -c "ALTER SYSTEM SET pg_prewarm.autoprewarm_interval = 0" + +#pg_stat_statements +psql -c "ALTER SYSTEM SET pg_stat_statements.max = 50000" +psql -c "ALTER SYSTEM SET pg_stat_statements.track = 'top'" +psql -c "ALTER SYSTEM SET pg_stat_statements.track_utility = 'off'" +psql -c "ALTER SYSTEM SET pg_stat_statements.track_planning = 'off'" +psql -c "ALTER SYSTEM SET pg_stat_statements.save = 'off'" + +# pg_hint_plan +psql -c "ALTER SYSTEM SET pg_hint_plan.enable_hint TO off" +psql -c "ALTER SYSTEM SET pg_hint_plan.enable_hint_table = 'off'" + +# query timeout +psql -c "ALTER SYSTEM SET statement_timeout = 2400000" +# ############################################################################## + +psql -c "SELECT pg_reload_conf();" +sudo systemctl restart postgresql@17-main + +sleep 20 # pg_prewarm should has already done its stuff + +echo "The DSB Benchmark ..." + +psql -c "CREATE EXTENSION pg_stat_statements" +psql -c "SELECT pg_stat_statements_reset()" +psql -c "CREATE EXTENSION pg_prewarm" +psql -c "SHOW pg_hint_plan.enable_hint" + + +for i in {1..3} +do + filenum=1 + if [ "$RUN_SETTING" = "EXPLAIN ANALYZE" ]; then + # Write header for runtime output file + echo -e "QueryNumber\tQueryName\tExecutionTime" > $OUTPUT_DIR/dsb_onepass-$i.dat + # Create postgres explain analyze plan output directory + mkdir -p "$OUTPUT_DIR/explain-analyze-$i" + else + # Create postgres explain plan output directory + mkdir -p "$OUTPUT_DIR/explain-$i" + fi + + for file in $QUERY_DIR/*.sql + do + # Disable star expansion + set -f + # Get filename + short_file=$(basename "$file") + test_name=$(basename "$file" .sql) + + if [ "$RUN_SETTING" = "EXPLAIN ANALYZE" ]; then + # Prepare query + echo -n "/* $filenum */ EXPLAIN (ANALYZE, VERBOSE, FORMAT JSON) " > $OUTPUT_DIR/test.sql + cat $file >> $OUTPUT_DIR/test.sql + # Query and save explain analyze output + result=$(psql -f $OUTPUT_DIR/test.sql) + echo -e $result >> $OUTPUT_DIR/explain-analyze-$i/$test_name.json + # Extract execution time and record + exec_time=$(echo $result | sed -n 's/.*"Execution Time": \([0-9]*\.[0-9]*\).*/\1/p') + echo -e "$filenum\t$short_file\t$exec_time" + echo -e "$filenum\t$short_file\t$exec_time" >> $OUTPUT_DIR/dsb_onepass-$i.dat + else + # Prepare query + echo -n "/* $filenum */ EXPLAIN (VERBOSE, FORMAT JSON) " > $OUTPUT_DIR/test.sql + cat $file >> $OUTPUT_DIR/test.sql + # Query and save explain analyze output + result=$(psql -f $OUTPUT_DIR/test.sql) + echo -e "$filenum\t$short_file" + echo -e $result >> $OUTPUT_DIR/explain-$i/$test_name.json + fi + + filenum=$((filenum+1)) + set +f + done +done + + +# Save buffers usage to do correct pg_prewarm next time +psql -c "SELECT autoprewarm_dump_now();" diff --git a/df-autohint-runner/scripts/dsb-workload-gen.py b/df-autohint-runner/scripts/dsb-workload-gen.py new file mode 100644 index 000000000..4af5e61af --- /dev/null +++ b/df-autohint-runner/scripts/dsb-workload-gen.py @@ -0,0 +1,118 @@ +import simplejson as json +import shutil +import os +import subprocess + +# TODO: Modify these paths as needed +# TODO: This script should be executed from dsb/code/tools/ directory +base_dir = os.path.dirname(os.path.abspath(__file__)) +base_dir = os.path.join(base_dir, "../") +base_dir = os.path.realpath(base_dir) # runner root dir + +def load_json(filename): + with open(filename, 'r') as f: + return json.load(f) + +def get_query_template_files(dir, filenames): + dir = os.path.join(base_dir, dir) + if os.path.isfile(dir): + basename = os.path.basename(dir) + if basename.startswith('query') and basename.endswith('.tpl') and (filenames == [] or basename in filenames): + return [dir] + return [] + else: + result = [] + for file in os.listdir(dir): + result.extend(get_query_template_files(os.path.join(dir, file), filenames)) + return result + +def gen_gaussian_dist(exec_dir, options): + exec_path = os.path.join(exec_dir, 'distcomp') + cmd = [exec_path, '-i', 'tpcds.dst', '-o', 'tpcds.idx', '-param_dist', 'normal', '-verbose'] + if 'param_sigma' in options: + cmd.extend(['-param_sigma', str(options['param_sigma'])]) + if 'param_center' in options: + cmd.extend(['-param_center', str(options['param_center'])]) + if 'rngseed' in options and options['rngseed'] is not None: + cmd.extend(['-rngseed', str(options['rngseed'])]) + print(' '.join(cmd)) + subprocess.run(cmd, cwd=exec_dir) + +def generate_queries(exec_dir, output_dir, tmp_dir, template_filename, dialect, options): + print(f"generate queries for template {template_filename} to {output_dir}") + print('workload config:', options) + + os.makedirs(output_dir, exist_ok=True) + query_name = os.path.splitext(os.path.basename(template_filename))[0] + query_dir = os.path.join(output_dir, query_name) + os.makedirs(query_dir, exist_ok=True) + + os.chdir(exec_dir) + + exec_path = os.path.join('./dsqgen') + + # Must use relative path here, otherwise have weird path errors + template_dir = os.path.relpath(os.path.dirname(template_filename), exec_dir) + + cmd = [ + exec_path, + '-output_dir', tmp_dir, + '-streams', str(options['instance_count']), + '-directory', template_dir, + '-template', os.path.basename(template_filename), + '-dialect', dialect + ] + if 'param_dist' in options: + cmd.extend(['-param_dist', options['param_dist']]) + if 'rngseed' in options: + cmd.extend(['-rngseed', str(options['rngseed'])]) + print(' '.join(cmd)) + subprocess.run(cmd, cwd=exec_dir) + + # Rename the query instance files + for i in range(options['instance_count']): + src = os.path.join(tmp_dir, f'query_{i}.sql') + dst = os.path.join(query_dir, f'{query_name}_{i}.sql') + shutil.copy(src, dst) + +def generate_workload(workload_config): + output_dir = workload_config['output_dir'] + output_dir = os.path.join(base_dir, output_dir) + os.makedirs(output_dir, exist_ok=True) + + tmp_dir = os.path.join(output_dir, 'tmp') + os.makedirs(tmp_dir, exist_ok=True) + + exec_dir = workload_config['binary_dir'] + exec_dir = os.path.join(base_dir, exec_dir) + + for workload in workload_config['workload']: + query_template_files = get_query_template_files( + workload_config['query_template_root_dir'], + workload['query_template_names'] + ) + + # recompile distribution files if necessary + if workload.get('param_dist') == 'normal': + gen_gaussian_dist(exec_dir, workload) + + # save the weight file + shutil.copyfile( + os.path.join(exec_dir, 'tpcds.idx'), + os.path.join(output_dir, f"tpcds_{workload['id']}.idx") + ) + + dir = os.path.join(output_dir, workload['id']) + for query_template_file in query_template_files: + generate_queries(exec_dir, dir, tmp_dir, query_template_file, workload_config['dialect'], workload) + + # Remove temp dir + shutil.rmtree(tmp_dir) + +# Config json path — modify as needed +workload_config_file = 'scripts/dsb_workload_config.json' +workload_config_file = os.path.join(base_dir, workload_config_file) +workload_config = load_json(workload_config_file) +generate_workload(workload_config) + + diff --git a/df-autohint-runner/scripts/dsb-workload-gen.sh b/df-autohint-runner/scripts/dsb-workload-gen.sh new file mode 100755 index 000000000..7036c93c8 --- /dev/null +++ b/df-autohint-runner/scripts/dsb-workload-gen.sh @@ -0,0 +1,5 @@ +#!/bin/sh +python3 ./scripts/dsb-workload-gen.py +mv input/dsb/queries/1/*/*.sql input/dsb/queries +rm -r input/dsb/queries/1 +rm input/dsb/queries/tpcds_1.idx diff --git a/df-autohint-runner/scripts/dsb_workload_config.json b/df-autohint-runner/scripts/dsb_workload_config.json new file mode 100644 index 000000000..1f4519b40 --- /dev/null +++ b/df-autohint-runner/scripts/dsb_workload_config.json @@ -0,0 +1,18 @@ +{ + "output_dir": "input/dsb/queries", + "binary_dir": "dsb/code/tools", + "query_template_root_dir": "dsb/query_templates_pg", + "dialect" : "postgres", + "workload": + [ + { + "id" : "1", + "query_template_names" : [], + "instance_count" : 20, + "param_dist" : "normal", + "param_sigma" : 2, + "param_center" : 0, + "rngseed" : 997 + } + ] +} diff --git a/df-autohint-runner/scripts/pg_util.py b/df-autohint-runner/scripts/pg_util.py new file mode 100644 index 000000000..d14db9ce9 --- /dev/null +++ b/df-autohint-runner/scripts/pg_util.py @@ -0,0 +1,83 @@ +import os +import psycopg2 +from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT + +def run_cmd(cmd, verbose = False): + if verbose: + print(cmd) + status = os.popen(cmd).read() + print(status) + +def bulk_load_from_csv_file(cursor, csv_file, tmp_file, table_name, delimiter): + # Split the csv file into batches to load incrementally. + batch_size = 100000 + prev_pos = 0 + count = 0 + with open(csv_file) as fin: + flag = True + while flag: + with open(tmp_file, 'w') as fout: + cur = 0 + for i in range(batch_size): + line = fin.readline() + # Check if we reach EOF + pos = fin.tell() + if pos == prev_pos: + flag = False + break + else: + prev_pos = pos + if line.endswith(delimiter): + line = line[:-1] + else: + line = line[:-2] + '\n' + fout.write(line) + count += 1 + cur += 1 + load_from_csv_file(cursor, tmp_file, table_name, delimiter) + print('.', end = '', flush = True) + print('loaded', count, 'rows from', csv_file, 'to table', table_name) + + +def load_from_csv_file(cursor, csv_file, table_name, delimiter): + sql_cmd = 'copy ' + table_name + " from '" + csv_file + "' with (delimiter '" + delimiter + "', format csv);" + execute(cursor, sql_cmd) + +def execute(cursor, cmd, verbose = False): + try: + if verbose: + print(cmd) + cursor.execute(cmd) + except Exception as e: + print(e) + +def connect(user, password, db_name = None): + try: + if db_name is not None: + conn = psycopg2.connect(user=user, password=password, database=db_name) + else: + conn = psycopg2.connect(user=user, password=password) + conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) + + except Exception as e: + print(e) + + return conn + +def restart_server(pg_service_name = 'postgresql-x64-13'): + stop_server(pg_service_name) + start_server(pg_service_name) + +def stop_server(pg_service_name = 'postgresql-x64-13'): + + cmd = 'net stop ' + pg_service_name + util.run_cmd(cmd) + +def start_server(pg_service_name = 'postgresql-x64-13'): + + cmd = 'net start ' + pg_service_name + run_cmd(cmd) + +if __name__ == '__main__': + pg_service_name = 'postgresql-x64-13' + start_server(pg_service_name)