blob: 6375418dc896cacf3feb3670438205068b9aab85 [file] [log] [blame]
#!/bin/bash
# Runs a perf instance against a pre-configured spanner database,
# rebuilding and restarting on code changes.
# Prerequisites:
# 1. gcloud auth application-default login
# 2. bazelisk in PATH
# Example Command:
# ./run_perfserver.sh --instance chrome [--proxy]
if ! command -v bazelisk > /dev/null 2>&1;
then
echo "Error: bazelisk command not found. Please ensure it's in your PATH."
exit 1
fi
# --- Dependency Checks ---
MISSING_DEPS=false
for cmd in docker nc fuser; do
if ! command -v "$cmd" > /dev/null 2>&1;
then
echo "Error: Required command '$cmd' not found. Please ensure it's in your PATH."
MISSING_DEPS=true
fi
done
if [ "$MISSING_DEPS" = true ]; then
exit 1
fi
# --- ADC Check ---
ADC_FILE="$HOME/.config/gcloud/application_default_credentials.json"
if [ ! -f "$ADC_FILE" ]; then
echo "Error: Application Default Credentials file not found at: $ADC_FILE"
echo "Please run: gcloud auth application-default login"
exit 1
fi
# --- Docker Permission Check ---
DOCKER_CMD="docker"
if ! docker ps > /dev/null 2>&1; then
if sudo docker ps > /dev/null 2>&1; then
DOCKER_CMD="sudo docker"
echo "Docker requires sudo. Using 'sudo docker'."
fi
fi
# --- Configuration presets ---
# Common constants to reduce redundancy and line length
CORP="skia-infra-corp"
PUB="skia-infra-public"
SPAN_CORP="tfgen-spanid-20241205020733610"
SPAN_PUB="tfgen-spanid-20241204191122045"
GSRC="chromium.googlesource.com"
ASRC="android.googlesource.com"
TSRC="turquoise-internal.googlesource.com"
SSRC="skia.googlesource.com"
declare -A PROJECTS SPANNER_INSTANCES DATABASES CONFIGS DOMAINS REPOS
# Configuration table
while read -r instance project spanner db config_name domain repo; do
[[ -z "$instance" || "$instance" =~ ^# ]] && continue
PROJECTS[$instance]="$project"
SPANNER_INSTANCES[$instance]="$spanner"
DATABASES[$instance]="$db"
CONFIGS[$instance]="perf/configs/spanner/$config_name"
DOMAINS[$instance]="$domain"
REPOS[$instance]="$repo"
done <<EOF
chrome $CORP $SPAN_CORP chrome_int chrome-internal.json $GSRC chromium/src
v8 $CORP $SPAN_CORP v8_int v8-internal.json $GSRC v8/v8
fuchsia $CORP $SPAN_CORP fuchsia_int fuchsia-internal.json $TSRC integration
germanium $CORP $SPAN_CORP germanium_evals germanium-internal.json $GSRC chromium/src
android $PUB $SPAN_PUB androidx android2.json $ASRC platform/frameworks/support
angle $PUB $SPAN_PUB angle angle.json $GSRC angle/angle
flutter $PUB $SPAN_PUB flutter_flutter flutter-flutter.json $GSRC flutter/flutter
widevine $CORP $SPAN_CORP widevine_cdm widevine-cdm.json $GSRC chromium/src
webrtc $PUB $SPAN_PUB webrtc_pub webrtc-public.json $GSRC chromium/src
skia $PUB $SPAN_PUB skia skia-public.json $SSRC skia
v8_pub $PUB $SPAN_PUB v8 v8-public.json $GSRC v8/v8
EOF
VALID_INSTANCES="${!PROJECTS[@]}"
usage() {
echo "Usage: $0 --instance <instance_type> [--proxy]"
echo " Valid instance types are: ${VALID_INSTANCES// / | }"
echo " --proxy: Build and run the auth-proxy in the background."
exit 1
}
INSTANCE=""
RUN_PROXY=false
# Parse arguments
while [[ "$#" -gt 0 ]]; do
case $1 in
--instance)
INSTANCE="$2"
shift
;;
--proxy)
RUN_PROXY=true
;;
--help|-h)
usage
;;
*)
echo "Unknown parameter passed: $1"
usage
;;
esac
shift
done
if [[ -z "$INSTANCE" ]]; then
echo "Error: --instance flag is required."
usage
fi
if [[ ! -v PROJECTS["$INSTANCE"] ]]; then
echo "Error: Invalid instance type: $INSTANCE"
usage
fi
PROJECT=${PROJECTS[$INSTANCE]}
SPANNER_INSTANCE=${SPANNER_INSTANCES[$INSTANCE]}
DATABASE=${DATABASES[$INSTANCE]}
CONFIG=${CONFIGS[$INSTANCE]}
DOMAIN=${DOMAINS[$INSTANCE]}
REPO=${REPOS[$INSTANCE]}
export GOOGLE_CLOUD_PROJECT="$PROJECT"
# Validate config file existence
if [ ! -f "$CONFIG" ]; then
echo "Error: Configuration file not found: $CONFIG"
exit 1
fi
# Create a temporary config file that disables notifications to avoid
# Secret Manager permission issues when running locally.
TEMP_CONFIG="/tmp/perf_config_${INSTANCE}_$$.json"
sed -e 's/"notifications": ".*"/"notifications": "none"/' \
-e 's/"notification_type": ".*"/"notification_type": "none"/' \
-e 's/"type": "redis"/"type": "local"/' \
"$CONFIG" > "$TEMP_CONFIG"
echo "Created temporary config file: $TEMP_CONFIG (notifications disabled, local cache)"
# --- Dynamic Port Allocation ---
echo "Finding free port slot..."
OFFSET=0
STEP=10
while true; do
PERF_PORT=$((8002 + OFFSET))
PERF_PROM_PORT=$((20001 + OFFSET))
PG_PORT=$((5432 + OFFSET))
PROXY_PORT=$((8003 + OFFSET))
PROXY_PROM_PORT=$((20003 + OFFSET))
# Check if main ports are in use
if nc -z 127.0.0.1 $PERF_PORT >/dev/null 2>&1 || \
nc -z 127.0.0.1 $PERF_PROM_PORT >/dev/null 2>&1 || \
nc -z 127.0.0.1 $PG_PORT >/dev/null 2>&1; then
OFFSET=$((OFFSET + STEP))
continue
fi
# If proxy is requested, check those ports too
if [ "$RUN_PROXY" = true ]; then
if nc -z 127.0.0.1 $PROXY_PORT >/dev/null 2>&1 || \
nc -z 127.0.0.1 $PROXY_PROM_PORT >/dev/null 2>&1; then
OFFSET=$((OFFSET + STEP))
continue
fi
fi
# All required ports for this offset are free
break
done
echo "Selected ports (Offset: $OFFSET):"
echo " Perfserver: $PERF_PORT (metrics: $PERF_PROM_PORT)"
echo " Postgres: $PG_PORT"
if [ "$RUN_PROXY" = true ]; then
echo " Auth-Proxy: $PROXY_PORT (metrics: $PROXY_PROM_PORT)"
fi
# Use a hash of the current directory to allow running the same instance from different checkouts
DIR_HASH=$(pwd | md5sum 2>/dev/null | awk '{print $1}')
if [ -z "$DIR_HASH" ]; then
# Fallback if md5sum is missing
DIR_HASH=$(pwd | sed 's/[^a-zA-Z0-9]/_/g')
fi
LOCKFILE="/tmp/run_perfserver_${INSTANCE}_${DIR_HASH}.lock"
MARKER_FILE="/tmp/perf_server_marker_${INSTANCE}_${DIR_HASH}"
if [ -e "$LOCKFILE" ]; then
LOCKED_PID=$(cat "$LOCKFILE")
if kill -0 "$LOCKED_PID" > /dev/null 2>&1; then
echo "Error: Lock file exists: $LOCKFILE"
echo "Another instance of the script for '$INSTANCE' in this directory is running"
echo "(PID: $LOCKED_PID)."
rm -f "$TEMP_CONFIG"
exit 1
else
echo "Found stale lock file (PID: $LOCKED_PID). Removing it."
rm -f "$LOCKFILE"
fi
fi
echo $$ > "$LOCKFILE"
touch "$MARKER_FILE"
echo "Using the following params: -p=$PROJECT -i=$SPANNER_INSTANCE -d=$DATABASE \
-config=$TEMP_CONFIG -domain=$DOMAIN -repo=$REPO"
# Include hash in container name to avoid collisions between different directories
# running same instance
CONTAINER_NAME="perf-pgadapter-${INSTANCE}-${DIR_HASH}"
SERVER_PID=0
AUTH_PROXY_PID=0
RETRY_INTERVAL=60
LAST_BUILD_ATTEMPT=0
stop_server() {
if [ "$SERVER_PID" -ne 0 ]; then
echo "Stopping old perfserver (PID: $SERVER_PID)..."
kill "$SERVER_PID" > /dev/null 2>&1
# Give it a moment to shut down
sleep 0.5
wait "$SERVER_PID" > /dev/null 2>&1
SERVER_PID=0
fi
}
build_server() {
echo "Building perfserver..."
LAST_BUILD_ATTEMPT=$(date +%s)
bazelisk build //perf/... --config=mayberemote -c dbg
}
start_server() {
echo "Starting new perfserver in background..."
LOG_FILE="/tmp/perfserver_${INSTANCE}_$(date +%Y%m%d_%H%M%S).log"
echo "Log file: $LOG_FILE"
_bazel_bin/perf/go/perfserver/perfserver_/perfserver frontend \
--dev_mode \
--localToProd \
--do_clustering=false \
--port=:$PERF_PORT \
--prom_port=:$PERF_PROM_PORT \
--config_filename="$TEMP_CONFIG" \
--display_group_by=false \
--disable_metrics_update=true \
--resources_dir=_bazel_bin/perf/pages/development/ \
--connection_string="postgresql://root@127.0.0.1:$PG_PORT/${DATABASE}?sslmode=disable" \
--commit_range_url="https://${DOMAIN}/${REPO}/+log/{begin}..{end}" > "$LOG_FILE" 2>&1 &
SERVER_PID=$!
echo "Perfserver process started (PID: $SERVER_PID)"
echo "Waiting for perfserver to be ready on port $PERF_PORT..."
MAX_TRIES=150
COUNT=0
while ! nc -z 127.0.0.1 $PERF_PORT > /dev/null 2>&1; do
if ! kill -0 $SERVER_PID > /dev/null 2>&1; then
echo "Perfserver process (PID: $SERVER_PID) died unexpectedly."
echo -e "\n--- Last 100 lines of perfserver log ---"
tail -n 100 "$LOG_FILE"
echo -e "\n--- Last 50 lines of pgadapter log ---"
$DOCKER_CMD logs "$CONTAINER_NAME" 2>&1 | tail -n 50
if $DOCKER_CMD logs "$CONTAINER_NAME" 2>&1 | \
grep -q "PERMISSION_DENIED: Cloud Monitoring API has not been used in project"; then
echo -e "\n*** DETECTED WRONG PROJECT CONFIGURATION ***"
echo "Your credentials are likely configured with an incorrect quota project."
echo "Please run the following command to fix it:"
echo " gcloud auth application-default set-quota-project $PROJECT"
echo "**********************************************"
fi
echo "---------------------------------"
SERVER_PID=0
return 1
fi
sleep 2
COUNT=$((COUNT + 1))
if [ $COUNT -ge $MAX_TRIES ]; then
echo "Perfserver failed to become ready after $(($MAX_TRIES * 2)) seconds."
echo -e "\n--- Last 100 lines of perfserver log ---"
tail -n 100 "$LOG_FILE"
kill "$SERVER_PID" > /dev/null 2>&1
SERVER_PID=0
return 1
fi
echo -n "."
done
HOSTNAME=$(hostname -f)
echo -e "\nPerfserver is up and running at http://${HOSTNAME}:$PERF_PORT"
if [ "$RUN_PROXY" = true ]; then
echo "Auth proxy is up and running at http://${HOSTNAME}:$PROXY_PORT"
fi
return 0
}
attempt_build_and_start() {
if build_server; then
if start_server; then
return 0
else
echo "Start failed. Will retry in $RETRY_INTERVAL seconds or on file change."
return 1
fi
else
echo "Build failed. Will retry in $RETRY_INTERVAL seconds or on file change."
return 1
fi
}
# Cleanup function to stop docker container and background processes
cleanup() {
echo -e "\nShutting down..."
# Disable further traps to prevent re-entry
trap - INT TERM
stop_server
if [ "$AUTH_PROXY_PID" -ne 0 ]; then
echo "Stopping auth-proxy (PID: $AUTH_PROXY_PID)..."
kill "$AUTH_PROXY_PID" > /dev/null 2>&1
wait "$AUTH_PROXY_PID" > /dev/null 2>&1
fi
# Failsafe: ensure ports are cleared
fuser -k $PERF_PORT/tcp >/dev/null 2>&1
fuser -k $PERF_PROM_PORT/tcp >/dev/null 2>&1
fuser -k $PG_PORT/tcp >/dev/null 2>&1
if [ "$RUN_PROXY" = true ]; then
fuser -k $PROXY_PORT/tcp >/dev/null 2>&1
fuser -k $PROXY_PROM_PORT/tcp >/dev/null 2>&1
fi
if [ -n "$CONTAINER_NAME" ]; then
echo "Stopping pgadapter container..."
$DOCKER_CMD rm -f "$CONTAINER_NAME" > /dev/null 2>&1
fi
rm -f "$LOCKFILE"
rm -f "$MARKER_FILE"
if [ -f "$TEMP_CONFIG" ]; then
rm -f "$TEMP_CONFIG"
fi
echo "Exited"
exit 0
}
# Trap exit signals to run cleanup
trap cleanup INT TERM
# First delete any existing docker container for this instance to start clean.
echo "Removing any existing docker container for this instance..."
$DOCKER_CMD rm -f "$CONTAINER_NAME" > /dev/null 2>&1
# Now let's run pgadapter connected to the supplied spanner database.
echo "Starting pgadapter on port $PG_PORT..."
$DOCKER_CMD run -d --rm --name "$CONTAINER_NAME" -p 127.0.0.1:$PG_PORT:5432 \
-e GOOGLE_CLOUD_PROJECT="$PROJECT" \
-e GOOGLE_CLOUD_QUOTA_PROJECT="$PROJECT" \
-v "$ADC_FILE":/acct_credentials.json \
gcr.io/cloud-spanner-pg-adapter/pgadapter:latest \
-p "$PROJECT" -i "$SPANNER_INSTANCE" -d "$DATABASE" \
-c /acct_credentials.json -x > /dev/null
echo "pgadapter container started: $CONTAINER_NAME"
echo "Waiting for pgadapter to be ready on port $PG_PORT..."
MAX_PG_TRIES=30
PG_COUNT=0
while ! nc -z 127.0.0.1 $PG_PORT > /dev/null 2>&1; do
sleep 1
PG_COUNT=$((PG_COUNT + 1))
if [ $PG_COUNT -ge $MAX_PG_TRIES ]; then
echo -e "\nError: pgadapter failed to become ready after $MAX_PG_TRIES seconds."
if ! $DOCKER_CMD ps -q -f name="$CONTAINER_NAME" | grep -q .; then
echo "pgadapter container died. Docker logs:"
$DOCKER_CMD logs "$CONTAINER_NAME"
fi
exit 1
fi
echo -n "."
done
echo -e "\npgadapter is ready! Waiting 5s for full initialization..."
sleep 5
PROJECT_ROOT=$(readlink -f "$(dirname "$0")/..")
cd "$PROJECT_ROOT" || exit 1
echo "Current directory: $(pwd)"
if [ "$RUN_PROXY" = true ]; then
echo "Building and starting auth proxy..."
# Using bazelisk directly to avoid hardcoded ports in Makefile
bazelisk run //kube/cmd/auth-proxy -- \
--prom-port=:$PROXY_PROM_PORT \
--role=editor=google.com \
--authtype=mocked \
--mock_user=$(whoami)@google.com \
--port=:$PROXY_PORT \
--target_port=http://127.0.0.1:$PERF_PORT \
--local > /tmp/auth_proxy_${INSTANCE}_$$.log 2>&1 &
AUTH_PROXY_PID=$!
echo "Auth proxy PID: $AUTH_PROXY_PID"
sleep 2 # Give proxy time to start
fi
# Initial build and run
attempt_build_and_start
echo "Watching for file changes in ./perf ..."
touch "$MARKER_FILE"
while true; do
sleep 2
CURRENT_TS=$(date +%s)
# Watch for changes in Go files, BUILD files, and a few other common types
CHANGED_FILES=$(find ./perf -type f \
\( -name '*.go' -o -name 'BUILD.bazel' -o -name '*.html' -o -name '*.ts' \
-o -name '*.scss' -o -name '*.json' -o -name '*.css' -o -name '*.png' -o -name '*.svg' \) \
-newer "$MARKER_FILE")
if [ -n "$CHANGED_FILES" ]; then
echo "Changes detected in:"
echo "$CHANGED_FILES"
# Update marker file immediately to capture any changes that happen during rebuild
touch "$MARKER_FILE"
stop_server
attempt_build_and_start
continue
fi
# If server is NOT running, check if we should retry
if [ "$SERVER_PID" -eq 0 ]; then
TIME_SINCE_LAST=$(($CURRENT_TS - $LAST_BUILD_ATTEMPT))
if [ "$TIME_SINCE_LAST" -ge "$RETRY_INTERVAL" ]; then
echo "Retrying build/start after $RETRY_INTERVAL seconds..."
attempt_build_and_start
# Reset marker file on retry to avoid immediate re-trigger if nothing changed
touch "$MARKER_FILE"
fi
else
# Check if server died silently
if ! kill -0 "$SERVER_PID" > /dev/null 2>&1; then
echo "Perfserver process (PID: $SERVER_PID) disappeared. Scheduling retry."
SERVER_PID=0
# Will be retried on next loop iteration if enough time passed
fi
fi
done