Skip to content

Commit 4897962

Browse files
Merge branch 'InfiniTensor:master' into master
2 parents 3939679 + b1e4b03 commit 4897962

6 files changed

Lines changed: 579 additions & 349 deletions

File tree

scripts/compare_loss.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import sys
1010
from pathlib import Path
1111
from argparse import ArgumentParser
12+
from compare_utils import collect_log_files, exit_if_duplicate_logs
1213

1314
def get_dtype_from_filename(filename):
1415
"""Determine dtype from filename. Returns 'bfloat16' or 'fp32'."""
@@ -62,8 +63,10 @@ def main():
6263
args.threshold_fp32 = args.threshold
6364
args.threshold_bf16 = args.threshold
6465

65-
files1 = {f.name: f for f in args.dir1.glob('*.log') if not f.name.startswith('build')}
66-
files2 = {f.name: f for f in args.dir2.glob('*.log') if not f.name.startswith('build')}
66+
files1, duplicates1 = collect_log_files(args.dir1)
67+
files2, duplicates2 = collect_log_files(args.dir2)
68+
exit_if_duplicate_logs(args.dir1, duplicates1)
69+
exit_if_duplicate_logs(args.dir2, duplicates2)
6770

6871
only_in_1 = set(files1.keys()) - set(files2.keys())
6972
only_in_2 = set(files2.keys()) - set(files1.keys())

scripts/compare_tps.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import sys
1010
from pathlib import Path
1111
from argparse import ArgumentParser
12+
from compare_utils import collect_log_files, exit_if_duplicate_logs
1213

1314
def parse_log(file_path):
1415
"""Extract step -> tok/s mapping from log file."""
@@ -55,8 +56,10 @@ def main():
5556
parser.add_argument('--verbose', action='store_true', help='Print detailed output for all files, including passed ones')
5657
args = parser.parse_args()
5758

58-
files1 = {f.name: f for f in args.dir1.glob('*.log') if not f.name.startswith('build')}
59-
files2 = {f.name: f for f in args.dir2.glob('*.log') if not f.name.startswith('build')}
59+
files1, duplicates1 = collect_log_files(args.dir1)
60+
files2, duplicates2 = collect_log_files(args.dir2)
61+
exit_if_duplicate_logs(args.dir1, duplicates1)
62+
exit_if_duplicate_logs(args.dir2, duplicates2)
6063

6164
only_in_1 = set(files1.keys()) - set(files2.keys())
6265
only_in_2 = set(files2.keys()) - set(files1.keys())

scripts/compare_utils.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from pathlib import Path
2+
import sys
3+
4+
5+
def collect_log_files(base_dir: Path):
    """Collect comparable training logs under *base_dir*, keyed by basename.

    The directory is searched recursively for ``*.log`` files. Entries whose
    name starts with ``build`` or ends with ``_profile.log`` are ignored, as
    are non-file entries (for example a directory that happens to be named
    ``something.log``).

    Args:
        base_dir: Root directory to search.

    Returns:
        A ``(files, duplicates)`` tuple. ``files`` maps each basename to the
        first path found with that name. ``duplicates`` maps every basename
        that occurs more than once to the list of all paths sharing it, the
        path stored in ``files`` first.
    """
    files = {}
    duplicates = {}

    # rglob yields entries in filesystem order, which differs between
    # machines and runs. Sorting makes both the path kept in `files` and the
    # order of the duplicate report reproducible.
    for path in sorted(base_dir.rglob("*.log")):
        if not path.is_file():
            continue

        name = path.name
        if name.startswith("build") or name.endswith("_profile.log"):
            continue

        if name in files:
            # Seed the list with the first occurrence so the report names
            # every conflicting path, not only the later ones.
            duplicates.setdefault(name, [files[name]]).append(path)
            continue
        files[name] = path

    return files, duplicates
21+
22+
23+
def exit_if_duplicate_logs(base_dir: Path, duplicates):
    """Report duplicate log basenames and terminate the process.

    Does nothing when *duplicates* is empty. Otherwise prints one header
    line followed by one line per duplicated basename (sorted by name),
    listing the conflicting paths relative to *base_dir*, and then exits
    with status 1.

    Args:
        base_dir: Directory the duplicate paths were collected from.
        duplicates: Mapping of basename to the list of paths sharing it.

    Raises:
        SystemExit: With code 1 when *duplicates* is non-empty.
    """
    if duplicates:
        print(f"Found duplicate log basenames in {base_dir.resolve()}, cannot compare safely:")
        # Keys are unique, so ordering by key matches ordering the items.
        for name in sorted(duplicates):
            relative_paths = [str(p.relative_to(base_dir)) for p in duplicates[name]]
            print(f" {name}: {', '.join(relative_paths)}")
        sys.exit(1)

scripts/run_models_and_profile.bash

Lines changed: 126 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,56 @@
33
set -e
44
set -o pipefail
55

6-
CONFIG_FILE="${1:-test_config.json}"
6+
# Print the command-line help text to stdout.
usage() {
    printf '%s\n' \
        'Usage: run_models_and_profile.bash [--test-config path] [--only-run tag1,tag2]' \
        '' \
        'Options:' \
        '--test-config PATH Path to test config JSON. Default: test_config.json.' \
        '--only-run TAGS Only run the specified tag groups, separated by commas.' \
        '-h, --help Show this help message.'
}
16+
17+
# Defaults, overridden by the command-line options parsed below.
CONFIG_FILE="test_config.json"
ONLY_RUN_TAGS=""

# Parse command-line options. Both "--opt value" and "--opt=value" forms are
# accepted. Diagnostics (and the usage text shown alongside them) are written
# to stderr so they never mix with captured stdout; an explicit --help request
# still prints to stdout and exits successfully.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --test-config)
            if [[ $# -lt 2 ]]; then
                echo "Error: --test-config requires a file path." >&2
                exit 1
            fi
            CONFIG_FILE="$2"
            shift 2
            ;;
        --test-config=*)
            # Strip everything up to and including the first '='.
            CONFIG_FILE="${1#*=}"
            shift
            ;;
        --only-run)
            if [[ $# -lt 2 ]]; then
                echo "Error: --only-run requires a comma-separated tag list." >&2
                exit 1
            fi
            ONLY_RUN_TAGS="$2"
            shift 2
            ;;
        --only-run=*)
            ONLY_RUN_TAGS="${1#*=}"
            shift
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        -*)
            echo "Error: Unknown option: $1" >&2
            usage >&2
            exit 1
            ;;
        *)
            echo "Error: Unknown positional argument: $1" >&2
            usage >&2
            exit 1
            ;;
    esac
done
756

857
# Dependencies check
958
if ! command -v jq >/dev/null 2>&1; then
@@ -33,6 +82,28 @@ done < <(jq -r '.variables | to_entries[] | "\(.key)=\(.value)"' "$CONFIG_FILE")
3382

3483
# Global variable to save the last cmake command
3584
LAST_CMAKE_CMD=""
85+
declare -A SELECTED_TAGS=()
86+
87+
# Print $1 with leading and trailing whitespace removed (no trailing newline).
# Interior whitespace is left untouched.
normalize_tag() {
    local value="$1"

    # Longest prefix consisting solely of whitespace.
    local leading="${value%%[![:space:]]*}"
    value="${value#"$leading"}"

    # Longest suffix consisting solely of whitespace.
    local trailing="${value##*[![:space:]]}"
    value="${value%"$trailing"}"

    printf '%s' "$value"
}
93+
94+
# Fill SELECTED_TAGS from the comma-separated --only-run value. Each entry is
# trimmed with normalize_tag and blank entries are dropped. If the option was
# given but yields no usable tag, abort.
if [[ -n "$ONLY_RUN_TAGS" ]]; then
    IFS=',' read -r -a requested_tags <<< "$ONLY_RUN_TAGS"

    for raw_tag in "${requested_tags[@]}"; do
        tag="$(normalize_tag "$raw_tag")"
        if [[ -n "$tag" ]]; then
            SELECTED_TAGS["$tag"]=1
        fi
    done

    if (( ${#SELECTED_TAGS[@]} == 0 )); then
        echo "Error: --only-run did not contain any valid tags."
        exit 1
    fi
fi
36107

37108
# Clean the build directory
38109
clean_build_dir() {
@@ -46,9 +117,12 @@ run_and_log() {
46117
local cmd="$1"
47118
local log_name="$2"
48119
local is_profile="$3"
120+
local tag="${4:-basic}"
49121
local timestamp
50122
timestamp=$(date '+%Y-%m-%d %H:%M:%S')
51-
local log_path="$(realpath "${LOG_DIR}/${log_name}.log")"
123+
local tag_log_dir="${LOG_DIR}/${tag}"
124+
mkdir -p "$tag_log_dir"
125+
local log_path="$(realpath "${tag_log_dir}/${log_name}.log")"
52126

53127
echo -e "\033[1;32m============================================================\033[0m"
54128
echo -e "\033[1;36m[$timestamp] [Running] ${log_name}\033[0m"
@@ -99,22 +173,25 @@ run_and_log() {
99173

100174
# If profiling is enabled, move profiling files to the target directory
101175
if [[ "$is_profile" == "yes" ]]; then
102-
move_profile_logs "$log_name"
176+
move_profile_logs "$log_name" "$tag"
103177
fi
104178
}
105179

106180

107181
# Move profiling output logs
108182
move_profile_logs() {
109183
local prefix="$1"
184+
local tag="${2:-basic}"
185+
local tag_profile_dir="${PROFILE_LOG_DIR}/${tag}"
186+
mkdir -p "$tag_profile_dir"
110187

111188
# Move *.report.rankN files
112189
for report_file in "${BUILD_DIR}"/*.report.rank*; do
113190
if [[ -f "$report_file" ]]; then
114191
local base_name
115192
base_name=$(basename "$report_file")
116-
mv "$report_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}"
117-
echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}"
193+
mv "$report_file" "${tag_profile_dir}/${prefix}_${base_name}"
194+
echo "Moved $base_name to ${tag_profile_dir}/${prefix}_${base_name}"
118195
fi
119196
done
120197

@@ -123,25 +200,39 @@ move_profile_logs() {
123200
if [[ -f "$record_file" ]]; then
124201
local base_name
125202
base_name=$(basename "$record_file")
126-
mv "$record_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}"
127-
echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}"
203+
mv "$record_file" "${tag_profile_dir}/${prefix}_${base_name}"
204+
echo "Moved $base_name to ${tag_profile_dir}/${prefix}_${base_name}"
128205
fi
129206
done
130207
}
131208

132-
# Build "--key value" arg string from tests[i].args (shell-escaped)
209+
# Build "--key value" arg string from test_groups[gi].tests[ti].args (shell-escaped)
133210
args_string_for_test() {
134-
local idx="$1"
135-
jq -r --argjson i "$idx" '
136-
.tests[$i].args
211+
local group_idx="$1"
212+
local test_idx="$2"
213+
jq -r --argjson g "$group_idx" --argjson t "$test_idx" '
214+
.test_groups[$g].tests[$t].args
137215
| to_entries[]
138216
| "--\(.key) \(.value|tostring)"
139217
' "$CONFIG_FILE" | paste -sd' ' -
140218
}
141219

142220
# Run tests
143221
num_builds=$(jq '.builds | length' "$CONFIG_FILE")
144-
num_tests=$(jq '.tests | length' "$CONFIG_FILE")
222+
num_groups=$(jq '.test_groups | length' "$CONFIG_FILE")
223+
224+
# Count the test groups that will run, and remember which requested tags
# actually matched a group so that a typo in --only-run is reported instead of
# being silently ignored.
selected_group_count=0
declare -A matched_tags=()
for ((gi=0; gi<num_groups; ++gi)); do
    group_tag=$(jq -r ".test_groups[$gi].tag" "$CONFIG_FILE")
    if [[ ${#SELECTED_TAGS[@]} -eq 0 ]]; then
        # No filter given: every group is selected.
        selected_group_count=$((selected_group_count + 1))
    elif [[ -n "$group_tag" && -n "${SELECTED_TAGS[$group_tag]:-}" ]]; then
        # The non-empty check avoids a "bad array subscript" error when a
        # group carries an empty tag.
        selected_group_count=$((selected_group_count + 1))
        matched_tags["$group_tag"]=1
    fi
done

# Warn about requested tags that match no group in the config.
for requested_tag in "${!SELECTED_TAGS[@]}"; do
    if [[ -z "${matched_tags[$requested_tag]:-}" ]]; then
        echo "Warning: --only-run tag '${requested_tag}' does not match any test group." >&2
    fi
done

if [[ "$selected_group_count" -eq 0 ]]; then
    echo "Error: No matching test groups found for --only-run=${ONLY_RUN_TAGS}" >&2
    exit 1
fi
145236

146237
for ((id=0; id<num_builds; ++id)); do
147238
build_id=$(jq -r ".builds[$id].id" "$CONFIG_FILE")
@@ -152,7 +243,7 @@ for ((id=0; id<num_builds; ++id)); do
152243

153244
# always clean before another build
154245
clean_build_dir
155-
run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no"
246+
run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no" "build"
156247

157248
# profile flag for runs
158249
profile_flag="no"
@@ -162,17 +253,27 @@ for ((id=0; id<num_builds; ++id)); do
162253
log_suffix="_profile"
163254
fi
164255

165-
for ((ti=0; ti<num_tests; ++ti)); do
166-
test_id=$(jq -r ".tests[$ti].id" "$CONFIG_FILE")
167-
arg_str="$(args_string_for_test "$ti")"
256+
for ((gi=0; gi<num_groups; ++gi)); do
257+
group_tag=$(jq -r ".test_groups[$gi].tag" "$CONFIG_FILE")
258+
if [[ ${#SELECTED_TAGS[@]} -gt 0 && -z "${SELECTED_TAGS[$group_tag]}" ]]; then
259+
continue
260+
fi
261+
262+
num_tests=$(jq ".test_groups[$gi].tests | length" "$CONFIG_FILE")
263+
echo -e "\033[1;36m[TEST GROUP] tag=${group_tag}, cases=${num_tests}\033[0m"
168264

169-
# gpt2
170-
gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
171-
run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag"
265+
for ((ti=0; ti<num_tests; ++ti)); do
266+
test_id=$(jq -r ".test_groups[$gi].tests[$ti].id" "$CONFIG_FILE")
267+
arg_str="$(args_string_for_test "$gi" "$ti")"
172268

173-
# llama3
174-
llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
175-
run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag"
269+
# gpt2
270+
gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
271+
run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
272+
273+
# llama3
274+
llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
275+
run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
276+
done
176277
done
177278
done
178279

@@ -202,3 +303,6 @@ else
202303
echo -e "\033[1;33m or export COMPARE_LOG_DIR=/path/to/baseline_logs before running.\033[0m"
203304
echo -e "\033[1;33m============================================================\033[0m"
204305
fi
306+
307+
echo -e "\n\033[1;36m[END OF TEST] Cleaning build directory after all tests\033[0m"
308+
clean_build_dir

0 commit comments

Comments
 (0)