@@ -9,7 +9,7 @@ ROOT=$(pwd)
# Choose the Doris release tarball: the first command-line argument when one
# is given, otherwise the default build below.
# ${1:-} keeps the test safe if the script is ever run under `set -u`.
if [[ -n "${1:-}" ]]; then
    url="$1"
else
    url='https://apache-doris-releases.oss-accelerate.aliyuncs.com/apache-doris-4.1.0-rc01-bin-x64.tar.gz'
fi
# Download
# The local file name is the last path component of the URL.
file_name="$(basename "${url}")"
@@ -24,21 +24,20 @@ dir_name="${file_name/.tar.gz/}"
2424
# Try to stop Doris and remove it first in case this script is executed
# multiple times. Errors are tolerated here (set +e) because on a first run
# there is nothing to stop or delete.
set +e
"$dir_name"/"$dir_name"/fe/bin/stop_fe.sh
"$dir_name"/"$dir_name"/be/bin/stop_be.sh
# ${dir_name:?} aborts instead of running rm -rf on an empty path.
rm -rf -- "${dir_name:?}"
set -e
3131
# Uncompress
# The archive is unpacked into a directory named after the tarball; DORIS_HOME
# points at the same-named directory one level below it.
mkdir "$dir_name"
tar zxf "$file_name" -C "$dir_name"
DORIS_HOME="$ROOT/$dir_name/$dir_name"
export DORIS_HOME
3737
# Install dependencies
sudo apt-get update -y
sudo apt-get install -y openjdk-17-jdk mysql-client
# Assign first, export second, so a failing dpkg call is not masked by the
# exit status of `export`.
JAVA_HOME="/usr/lib/jvm/java-17-openjdk-$(dpkg --print-architecture)/"
export JAVA_HOME
export PATH="$JAVA_HOME/bin:$PATH"
4443
@@ -89,34 +88,160 @@ sleep 5
# Run the statements in create.sql against the hits database.
mysql -h 127.0.0.1 -P9030 -uroot hits < "$ROOT"/create.sql
9089
# Download data
# The Parquet files are placed in user_files_secure/ under the BE directory,
# which is the path the load statement's file_path refers to.
BE_DATA_DIR="$DORIS_HOME/be/"
mkdir -p "$BE_DATA_DIR/user_files_secure"

# Fetch the 100 partitions (hits_0 .. hits_99) in parallel. xargs substitutes
# {} into the URL itself, so no intermediate `bash -c` is needed.
seq 0 99 | xargs -P100 -I{} wget --continue --progress=dot:giga \
    'https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'
# ./ prefix keeps a file name starting with '-' from being parsed as an option.
mv ./*.parquet "$BE_DATA_DIR/user_files_secure"

# First column of the first row of `show backends`; used as backend_id when
# loading the files.
BE_ID=$(mysql -h127.0.0.1 -P9030 -uroot -N -e 'show backends' | awk '{print $1}' | head -1)
if [[ -z "$BE_ID" ]]; then
    echo "No backend returned by 'show backends'; cannot load data." >&2
    exit 1
fi
98+
# Derive parallel_pipeline_task_num from the CPU count: a quarter of the
# cores, but never less than 1 (integer division yields 0 below 4 cores).
CORES=$(nproc)
PARALLEL_NUM=$((CORES / 4))
if ((PARALLEL_NUM < 1)); then
    echo "Computed parallel_pipeline_task_num ($PARALLEL_NUM) is less than 1 based on $CORES cores; clamping to 1."
    PARALLEL_NUM=1
fi
echo "Setting parallel_pipeline_task_num to $PARALLEL_NUM (cpu cores: $CORES, computed as CORES/4 with min 1)"
97106
# Load data
# The Parquet files are read through the local() table-valued function on the
# backend identified by BE_ID and inserted into hits in a single statement.
# EventDate is converted from a day offset relative to 1970-01-01, and
# EventTime / ClientEventTime / LocalEventTime are passed through
# FROM_UNIXTIME. The session variable is set in the same mysql invocation so
# that it applies to the INSERT.
echo "start loading hits.parquet using TVF, estimated to take about 3 minutes ..."
START=$(date +%s)
mysql -h 127.0.0.1 -P9030 -uroot hits -e "SET parallel_pipeline_task_num = $PARALLEL_NUM;\
INSERT INTO hits SELECT
    CounterID,
    DATE_ADD('1970-01-01', INTERVAL EventDate DAY) AS EventDate,
    UserID,
    FROM_UNIXTIME(EventTime) AS EventTime,
    WatchID,
    JavaEnable,
    Title,
    GoodEvent,
    ClientIP,
    RegionID,
    CounterClass,
    OS,
    UserAgent,
    URL,
    Referer,
    IsRefresh,
    RefererCategoryID,
    RefererRegionID,
    URLCategoryID,
    URLRegionID,
    ResolutionWidth,
    ResolutionHeight,
    ResolutionDepth,
    FlashMajor,
    FlashMinor,
    FlashMinor2,
    NetMajor,
    NetMinor,
    UserAgentMajor,
    UserAgentMinor,
    CookieEnable,
    JavascriptEnable,
    IsMobile,
    MobilePhone,
    MobilePhoneModel,
    Params,
    IPNetworkID,
    TraficSourceID,
    SearchEngineID,
    SearchPhrase,
    AdvEngineID,
    IsArtifical,
    WindowClientWidth,
    WindowClientHeight,
    ClientTimeZone,
    FROM_UNIXTIME(ClientEventTime) AS ClientEventTime,
    SilverlightVersion1,
    SilverlightVersion2,
    SilverlightVersion3,
    SilverlightVersion4,
    PageCharset,
    CodeVersion,
    IsLink,
    IsDownload,
    IsNotBounce,
    FUniqID,
    OriginalURL,
    HID,
    IsOldCounter,
    IsEvent,
    IsParameter,
    DontCountHits,
    WithHash,
    HitColor,
    FROM_UNIXTIME(LocalEventTime) AS LocalEventTime,
    Age,
    Sex,
    Income,
    Interests,
    Robotness,
    RemoteIP,
    WindowName,
    OpenerName,
    HistoryLength,
    BrowserLanguage,
    BrowserCountry,
    SocialNetwork,
    SocialAction,
    HTTPError,
    SendTiming,
    DNSTiming,
    ConnectTiming,
    ResponseStartTiming,
    ResponseEndTiming,
    FetchTiming,
    SocialSourceNetworkID,
    SocialSourcePage,
    ParamPrice,
    ParamOrderID,
    ParamCurrency,
    ParamCurrencyID,
    OpenstatServiceName,
    OpenstatCampaignID,
    OpenstatAdID,
    OpenstatSourceID,
    UTMSource,
    UTMMedium,
    UTMCampaign,
    UTMContent,
    UTMTerm,
    FromTag,
    HasGCLID,
    RefererHash,
    URLHash,
    CLID
FROM local(
    \"file_path\" = \"user_files_secure/hits_*.parquet\",
    \"backend_id\" = \"$BE_ID\",
    \"format\" = \"parquet\"
)
"
END=$(date +%s)
# Whole seconds elapsed; shell arithmetic avoids depending on bc.
LOADTIME=$((END - START))
echo "Load time: $LOADTIME"
echo "$LOADTIME" > loadtime
112226
# Record the on-disk size (bytes) of the BE storage directory in storage_size
# and print it.
du -bs "$DORIS_HOME"/be/storage/ | cut -f1 | tee storage_size
echo "Data size: $(cat storage_size)"

# Turn the SQL cache off globally before the benchmark queries are run.
mysql -h 127.0.0.1 -P9030 -uroot hits -e "set global enable_sql_cache = false"
# Dataset contains 99997497 rows, storage size is about 13319588503 bytes
mysql -h 127.0.0.1 -P9030 -uroot hits -e "SELECT count(*) FROM hits"
234+
# Run queries
# Each line of queries.sql is one query. Before a query's first try the
# filesystem buffers are flushed and the page cache is dropped; the query is
# then executed TRIES times, with all client output appended to log.txt.
TRIES=3
# IFS= keeps leading/trailing whitespace intact; the `|| [[ -n ... ]]` clause
# also processes a final line that lacks a trailing newline.
while IFS= read -r query || [[ -n "$query" ]]; do
    sync
    echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null

    for ((try = 1; try <= TRIES; try++)); do
        # stdin is detached so mysql cannot read from the loop's input
        # (queries.sql).
        mysql -vvv -h 127.0.0.1 -P9030 -uroot hits -e "${query}" < /dev/null 2>&1 | tee -a log.txt
    done
done < queries.sql
120245
121246cat log.txt |
122247 grep -P ' rows? in set|Empty set|^ERROR' |
0 commit comments