Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions cmake/test_macros.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ include(GoogleTest)
# Features:
# 1. Create executable target
# 2. Configure compile options, link libraries, and include paths
# 3. Use gtest_discover_tests to auto-discover test cases
# 4. Set test labels
# 3. Use gtest_discover_tests to auto-discover CPU test cases
# 4. Register CUDA tests at binary granularity with CTest GPU resources
# 5. Set test labels
#
# Arguments:
# SOURCES: Source file list (required)
Expand Down Expand Up @@ -73,7 +74,7 @@ macro(infini_train_add_test)
# 5. Link project library (reuses framework linking strategy)
link_infini_train_exe(${ARG_TEST_NAME})

# 6. Auto-discover gtest cases and register as ctest tests
# 6. Register tests
set(labels "cpu")
if(ARG_LABELS)
set(labels "${ARG_LABELS}")
Expand All @@ -84,16 +85,30 @@ macro(infini_train_add_test)
set(test_timeout ${ARG_TEST_TIMEOUT})
endif()

if(ARG_TEST_FILTER)
list(FIND labels cuda _has_cuda_label)
if(NOT _has_cuda_label EQUAL -1)
set(_cuda_test_args)
if(ARG_TEST_FILTER)
list(APPEND _cuda_test_args --gtest_filter=${ARG_TEST_FILTER})
endif()

add_test(
NAME ${ARG_TEST_NAME}
COMMAND $<TARGET_FILE:${ARG_TEST_NAME}> ${_cuda_test_args}
)
set_tests_properties(${ARG_TEST_NAME}
PROPERTIES
LABELS "${labels}"
TIMEOUT ${test_timeout}
)
elseif(ARG_TEST_FILTER)
gtest_discover_tests(${ARG_TEST_NAME}
EXTRA_ARGS --gtest_output=xml:%T.xml
TEST_FILTER "${ARG_TEST_FILTER}"
DISCOVERY_TIMEOUT 10
PROPERTIES LABELS "${labels}" TIMEOUT ${test_timeout}
)
else()
gtest_discover_tests(${ARG_TEST_NAME}
EXTRA_ARGS --gtest_output=xml:%T.xml
PROPERTIES LABELS "${labels}" TIMEOUT ${test_timeout}
)
endif()
Expand Down
4 changes: 3 additions & 1 deletion scripts/compare_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ def collect_log_files(base_dir: Path):
duplicates = {}

for path in base_dir.rglob("*.log"):
if path.name.startswith("build") or path.name.endswith("_profile.log"):
if not path.name.startswith(("gpt2_", "llama3_")):
continue
if path.name.endswith("_profile.log"):
continue

key = path.name
Expand Down
71 changes: 69 additions & 2 deletions scripts/run_models_and_profile.bash
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ LOG_DIR="$(read_var LOG_DIR)"; : "${LOG_DIR:=logs}"
PROFILE_LOG_DIR="$(read_var PROFILE_LOG_DIR)"; : "${PROFILE_LOG_DIR:=./profile_logs}"
COMPARE_LOG_DIR="$(read_var COMPARE_LOG_DIR)"; : "${COMPARE_LOG_DIR:=}"
RUN_CTEST="$(read_var RUN_CTEST)"; : "${RUN_CTEST:=true}"
CTEST_CMD="$(read_var CTEST_CMD)"; : "${CTEST_CMD:=ctest --output-on-failure -LE cuda -j$(nproc) && ctest --output-on-failure -L cuda -j1}"

mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR"

Expand Down Expand Up @@ -114,6 +113,74 @@ clean_build_dir() {
rm -rf "${BUILD_DIR:?}/"*
}

# Run the CTest suite in two phases: all tests NOT labelled "cuda" in
# parallel, then every test labelled "cuda" spread across the available GPUs.
#
# GPU selection, in priority order:
#   1. CTEST_CUDA_GPUS  comma-separated device list taken from the environment
#   2. nvidia-smi       every GPU index it reports, when the tool is on PATH
#   3. fallback         device 0
#
# One background worker is started per GPU. Worker i runs CUDA tests
# i, i+W, i+2W, ... (W = number of GPUs), one at a time, with
# CUDA_VISIBLE_DEVICES set to the worker's GPU.
#
# Returns:
#   the exit status of the non-CUDA ctest run if it fails (the CUDA phase is
#   then skipped), 1 if any CUDA test fails, 0 otherwise.
run_ctest() {
    local gpu_list=()
    local cuda_tests=()

    if [[ -n "${CTEST_CUDA_GPUS:-}" ]]; then
        IFS=',' read -r -a gpu_list <<< "$CTEST_CUDA_GPUS"
    elif command -v nvidia-smi >/dev/null 2>&1; then
        # "|| true" keeps a failing nvidia-smi from producing an error status;
        # an empty list is handled by the fallback below.
        mapfile -t gpu_list < <(nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null || true)
    fi

    if [[ ${#gpu_list[@]} -eq 0 ]]; then
        gpu_list=(0)
    fi

    # Strip whitespace from every entry and drop the ones that end up empty
    # (e.g. "0, 1," in CTEST_CUDA_GPUS).
    local filtered_gpu_list=()
    local gpu
    for gpu in "${gpu_list[@]}"; do
        gpu="${gpu//[[:space:]]/}"
        [[ -z "$gpu" ]] && continue
        filtered_gpu_list+=("$gpu")
    done

    if [[ ${#filtered_gpu_list[@]} -eq 0 ]]; then
        filtered_gpu_list=(0)
    fi

    # Phase 1: non-CUDA tests. Propagate a failure instead of discarding it,
    # so that a passing CUDA phase cannot mask failing CPU tests.
    ctest --output-on-failure -LE cuda -j"$(nproc)" || return "$?"

    # Phase 2: collect the names of the CUDA-labelled tests from the
    # "Test #N: <name>" lines of the ctest listing.
    mapfile -t cuda_tests < <(ctest -N -L cuda | sed -n 's/^ *Test *#[0-9][0-9]*: //p')
    if [[ ${#cuda_tests[@]} -eq 0 ]]; then
        return 0
    fi

    local worker_count="${#filtered_gpu_list[@]}"
    local pids=()
    local worker_idx
    for ((worker_idx = 0; worker_idx < worker_count; worker_idx++)); do
        (
            # Runs in a background subshell; its exit status reports whether
            # any test assigned to this worker failed.
            local worker_failed=0
            local test_idx="$worker_idx"
            local test_name
            local assigned_gpu="${filtered_gpu_list[$worker_idx]}"

            # Stride through the test list so each test is run by exactly one worker.
            while ((test_idx < ${#cuda_tests[@]})); do
                test_name="${cuda_tests[$test_idx]}"
                echo "[CUDA GPU ${assigned_gpu}] ${test_name}"
                if ! CUDA_VISIBLE_DEVICES="$assigned_gpu" ctest --output-on-failure -R "^${test_name}$" -j1; then
                    worker_failed=1
                fi
                test_idx=$((test_idx + worker_count))
            done

            exit "$worker_failed"
        ) &
        pids+=("$!")
    done

    # Wait for every worker so that all failures are observed, not just the first.
    local failed=0
    local pid
    for pid in "${pids[@]}"; do
        if ! wait "$pid"; then
            failed=1
        fi
    done

    return "$failed"
}

# Run a command and log output
run_and_log() {
local cmd="$1"
Expand Down Expand Up @@ -247,7 +314,7 @@ for ((id=0; id<num_builds; ++id)); do
clean_build_dir
run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no" "build"
if [[ "$RUN_CTEST" == "true" && "$build_profile" != "true" ]]; then
run_and_log "$CTEST_CMD" "ctest_${build_id}" "no" "ctest"
run_and_log "run_ctest" "ctest_${build_id}" "no" "ctest"
fi

# profile flag for runs
Expand Down
3 changes: 1 addition & 2 deletions scripts/test_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
"PROFILE_LOG_DIR": "./profile_logs",
"LOG_DIR": "./logs",
"COMPARE_LOG_DIR": "",
"RUN_CTEST": "true",
"CTEST_CMD": "ctest --output-on-failure -LE cuda -j$(nproc) && ctest --output-on-failure -L cuda -j1"
"RUN_CTEST": "true"
},
"builds": [
{
Expand Down
Loading