blob: d04a256a7911563cffc0a91ebbeb0d6fddafa8c2 [file] [log] [blame]
package machine
import (
"context"
"strings"
"time"
"go.skia.org/infra/go/now"
"go.skia.org/infra/task_scheduler/go/types"
)
// SwarmingDimensions is for de/serializing swarming dimensions:
//
// https://chromium.googlesource.com/infra/luci/luci-py.git/+doc/master/appengine/swarming/doc/Magic-Values.md#bot-dimensions
type SwarmingDimensions map[string][]string
// Copy returns a deep copy of the dimensions map.
func (s SwarmingDimensions) Copy() SwarmingDimensions {
n := make(SwarmingDimensions, len(s))
for k, v := range s {
copyValues := make([]string, len(v))
copy(copyValues, v)
n[k] = copyValues
}
return n
}
// GetDimensionValueOrEmptyString returns the last string in the value slice at
// the given key, or the empty string if the key doesn't exist.
func (s SwarmingDimensions) GetDimensionValueOrEmptyString(key string) string {
if values, ok := s[key]; ok && len(values) > 0 {
return values[len(values)-1]
}
return ""
}
// AsMetricsTags returns a map that is suitable to pass as tags for a metric.
//
// If there are multiple values for a key only the most specific value us used.
func (s SwarmingDimensions) AsMetricsTags() map[string]string {
// Note that all metrics with the same name must have the exact same set of
// tag keys so we can't just stuff all the Dimensions into the tags.
return map[string]string{
DimID: s.GetDimensionValueOrEmptyString(DimID),
DimOS: s.GetDimensionValueOrEmptyString(DimOS),
DimDeviceType: s.GetDimensionValueOrEmptyString(DimDeviceType),
}
}
// Well-known swarming dimensions:
const (
DimID = "id"
DimOS = "os"
DimQuarantined = "quarantined"
DimDeviceType = "device_type"
DimAndroidDevices = "android_devices"
DimChromeOSChannel = "chromeos_channel"
DimChromeOSMilestone = "chromeos_milestone"
DimChromeOSReleaseVersion = "release_version"
DimCores = "cores"
DimCPU = "cpu"
DimGPU = "gpu"
DimGCE = "gce"
DimMachineType = "machine_type"
DimDockerInstalled = "docker_installed"
DimTestMachineMonitorVersion = "tmm_version"
DimTaskType = "task_type"
DimPool = "pool"
BadBatteryLevel = -99
)
// TaskRequestor is the system that is allowed to schedule jobs on a machine.
// This is the value type for the Dimension keyed by DimTaskType.
type TaskRequestor string
const (
// Swarming is allowed to schedule tasks.
Swarming TaskRequestor = "swarming"
// SkTask means Skia Task Scheduler is allowed to schedule tasks.
SkTask TaskRequestor = "sktask"
)
var AllTaskRequestorStates = []TaskRequestor{Swarming, SkTask}
// Pool names.
const (
PoolSkia = "Skia"
PoolSkiaInternal = "SkiaInternal"
)
// AllValidPools contains all valid pool names.
var AllValidPools = []string{PoolSkia, PoolSkiaInternal}
// PowerCycleState is the state of powercycling for a single machine.
type PowerCycleState string
const (
// NotAvailable means powercycling is not available for this machine. This
// is the default.
NotAvailable PowerCycleState = "not_available"
// Available means powercycling is available for this machine.
Available PowerCycleState = "available"
// InError means that powercycle should be available, but an error has
// occurred on powercycle_server, likely it failed to connect to the power
// cycle device, aka the POE switch, or the PD.
InError PowerCycleState = "in_error"
)
var AllPowerCycleStates = []PowerCycleState{NotAvailable, Available, InError}
// AttachedDevice is what kind of mobile device, if any, we expect to find attached to the test machine.
type AttachedDevice string
const (
// AttachedDeviceNone means no device is attached. Used for all non-mobile
// test machines, like Windows boxes, as well as Raspberry Pis that have no
// device attached yet.
AttachedDeviceNone AttachedDevice = "nodevice"
// AttachedDeviceAdb means an Android device, or anything else that
// understands adb.
AttachedDeviceAdb AttachedDevice = "adb"
// AttachedDeviceIOS means an iOS device, or anything else that we talk to
// using idevice* commands.
AttachedDeviceIOS AttachedDevice = "ios"
// AttachedDeviceSSH means a ChromeOS device, or any other device we
// interact with via SSH.
AttachedDeviceSSH AttachedDevice = "ssh"
)
var AllAttachedDevices = []AttachedDevice{AttachedDeviceNone, AttachedDeviceAdb, AttachedDeviceIOS, AttachedDeviceSSH}
// Annotation represents a timestamped message.
type Annotation struct {
Message string
User string
Timestamp time.Time
}
// Description is the current state of a single machine.
type Description struct {
// MaintenanceMode is non-empy if someone manually puts the machine into
// this mode. The value will be the user's email address and the date of the
// change.
MaintenanceMode string `sql:"maintenance_mode STRING NOT NULL DEFAULT ''"`
// IsQuarantined is true if the machine has failed too many tasks and should
// stop running tasks pending user intervention. Recipes/Task Drivers can
// write a $HOME/${SWARMING_BOT_ID}.quarantined file to move a machine into
// quarantined mode.
IsQuarantined bool `sql:"is_quarantined BOOL NOT NULL DEFAULT FALSE"`
// Recovering is a non-empty string if test_machine_monitor detects the
// device is too hot, or low on charge. The value is a description of what
// is recovering.
Recovering string `sql:"recovering STRING NOT NULL DEFAULT ''"`
// AttachedDevice defines the kind of device attached to this test machine,
// if any.
AttachedDevice AttachedDevice `sql:"attached_device STRING NOT NULL DEFAULT 'nodevice'"`
// Annotation is used to record the most recent non-user change to Description.
// For example, if the device battery is too low, this will be set automatically.
// This will be in addition to the normal auditlog of user actions:
// https://pkg.go.dev/go.skia.org/infra/go/auditlog?tab=doc
Annotation Annotation `sql:"annotation JSONB NOT NULL"`
// Note is a user authored message on the state of a machine.
Note Annotation `sql:"note JSONB NOT NULL"`
// Version of test_machine_monitor being run.
Version string `sql:"version STRING NOT NULL DEFAULT ''"`
// PowerCycle is true if the machine needs to be power-cycled.
PowerCycle bool `sql:"powercycle BOOL NOT NULL DEFAULT FALSE"`
// PowerCycleState is the state of power cycling availability for this
// machine.
PowerCycleState PowerCycleState `sql:"powercycle_state STRING NOT NULL DEFAULT 'not_available'"`
LastUpdated time.Time `sql:"last_updated TIMESTAMPTZ NOT NULL"`
Battery int `sql:"battery INT NOT NULL DEFAULT 0"` // Charge as an integer percent, e.g. 50% = 50.
Temperature map[string]float64 `sql:"temperatures JSONB NOT NULL"` // In Celsius.
RunningSwarmingTask bool `sql:"running_swarmingTask BOOL NOT NULL DEFAULT FALSE"`
LaunchedSwarming bool `sql:"launched_swarming BOOL NOT NULL DEFAULT FALSE"` // True if test_machine_monitor launched Swarming.
RecoveryStart time.Time `sql:"recovery_start TIMESTAMPTZ NOT NULL"` // When did the machine start being in recovery mode.
// DeviceUptime is how long the attached device has been up. It is measured in seconds.
DeviceUptime int32 `sql:"device_uptime INT4 DEFAULT 0"`
// SSHUserIP, for example, "root@skia-sparky360-03" indicates we should connect to the
// given ChromeOS device at that username and ip/hostname.
SSHUserIP string `sql:"ssh_user_ip STRING NOT NULL DEFAULT ''"`
// SuppliedDimensions are dimensions that we, the humans, supply because they are difficult
// for the automated system to gather. These are used only for ChromeOS devices, which don't
// readily report their CPU and GPU.
SuppliedDimensions SwarmingDimensions `sql:"supplied_dimensions JSONB NOT NULL"`
// Dimensions describe what hardware/software this machine has and informs what tasks
// it can run.
Dimensions SwarmingDimensions `sql:"dimensions JSONB NOT NULL"`
// TaskRequest, if present, will be the trigger that launches a task.
//
// To kill a running TaskRequest, just modify the Description to delete the TaskRequest, i.e. TaskRequest = null.
TaskRequest *types.TaskRequest `sql:"task_request JSONB" json:",omitempty"`
// TaskStarted records when a task was started. This value is set on
// machineserver during Update.
TaskStarted time.Time `sql:"task_started TIMESTAMPTZ NOT NULL DEFAULT (0)::TIMESTAMPTZ"`
// Create a computed column with the machine id to use as the primary key.
machineIDComputed struct{} `sql:"machine_id STRING PRIMARY KEY AS (dimensions->'id'->>0) STORED"`
// Create a computed column that records if the task_request column is NULL,
// so we can quickly query for machines that aren't running tasks.
runningTask struct{} `sql:"running_task bool AS (task_request IS NOT NULL) STORED"`
// Create generalized inverted index (GIN) for Dimensions.
dimensionsIndex struct{} `sql:"INVERTED INDEX dimensions_gin (dimensions)"`
// Create an index for the powercycle column.
powerCycleIndex struct{} `sql:"INDEX by_powercycle (powercycle)"`
// Create an index for the running_task column.
taskRequestIndex struct{} `sql:"INDEX by_running_task (running_task)"`
}
// IsRecovering returns true if the machine is recoving, i.e. has a non-empty Recovering message.
func (d Description) IsRecovering() bool {
return d.Recovering != ""
}
// InMaintenanceMode returns true if the machine is in maintenance mode, i.e. has a non-empty MaintenanceMode message.
func (d Description) InMaintenanceMode() bool {
return d.MaintenanceMode != ""
}
// DestFromDescription returns a slice of interface containing pointers to every public member
// of Description. This is useful in code that stores the Description in an SQL database.
//
// Make sure this always stays in the same order as the fields appear in the struct.
func DestFromDescription(d *Description) []interface{} {
return []interface{}{
&d.MaintenanceMode,
&d.IsQuarantined,
&d.Recovering,
&d.AttachedDevice,
&d.Annotation,
&d.Note,
&d.Version,
&d.PowerCycle,
&d.PowerCycleState,
&d.LastUpdated,
&d.Battery,
&d.Temperature,
&d.RunningSwarmingTask,
&d.LaunchedSwarming,
&d.RecoveryStart,
&d.DeviceUptime,
&d.SSHUserIP,
&d.SuppliedDimensions,
&d.Dimensions,
&d.TaskRequest,
&d.TaskStarted,
}
}
// SetSwarmingQuarantinedMessage sets the Swarming Dimensions to reflect the
// full quarantined state. Returns true if the machine is quarantined.
func SetSwarmingQuarantinedMessage(d *Description) bool {
parts := []string{}
if d.InMaintenanceMode() {
parts = append(parts, "Maintenance: "+d.MaintenanceMode)
}
if d.IsQuarantined {
parts = append(parts, "Forced Quarantine")
}
if d.IsRecovering() {
parts = append(parts, "Recovering: "+d.Recovering)
}
msg := strings.Join(parts, ", ")
delete(d.Dimensions, DimQuarantined)
if msg != "" {
d.Dimensions[DimQuarantined] = []string{msg}
return true
}
return false
}
// NewDescription returns a new Description instance. It describes an available machine with no
// known dimensions.
func NewDescription(ctx context.Context) Description {
return Description{
AttachedDevice: AttachedDeviceNone,
Dimensions: SwarmingDimensions{},
LastUpdated: now.Now(ctx),
}
}
// Copy returns a deep copy of Description.
func (d Description) Copy() Description {
ret := d
ret.Dimensions = d.Dimensions.Copy()
ret.SuppliedDimensions = d.SuppliedDimensions.Copy()
ret.Temperature = map[string]float64{}
for k, v := range d.Temperature {
ret.Temperature[k] = v
}
if d.TaskRequest != nil {
tr := *d.TaskRequest
ret.TaskRequest = &tr
}
return ret
}
// EventType is the type of update we got from the machine.
type EventType string
const (
// EventTypeRawState means the raw state from test_machine_monitor has been
// updated.
EventTypeRawState EventType = "raw_state"
)
// Android contains the raw results from interrogating an Android device.
type Android struct {
GetProp string `json:"getprop"`
DumpsysBattery string `json:"dumpsys_battery"`
DumpsysThermalService string `json:"dumpsys_thermal_service"`
// A positive Uptime indicates there is an Android device attached to the host.
Uptime time.Duration `json:"uptime"`
}
// IsPopulated returns whether the Android subevent record has been filled out, indicating that an
// Android device is attached.
func (a *Android) IsPopulated() bool {
return a.Uptime > 0
}
// Host is information about the host machine.
type Host struct {
// Name is the machine id, from SWARMING_BOT_ID environment variable or hostname().
Name string `json:"name"`
// Version of test_machine_monitor being run.
Version string `json:"version"`
// StartTime is when the test_machine_monitor started running.
StartTime time.Time `json:"start_time"`
}
// ChromeOS encapsulates the information reported by a ChromeOS machine.
type ChromeOS struct {
Channel string `json:"channel"`
Milestone string `json:"milestone"`
ReleaseVersion string `json:"release_version"`
// A positive Uptime indicates there is an ChromeOS device attached to the host.
Uptime time.Duration `json:"uptime"`
}
// IsPopulated returns whether the ChromeOS subevent record has been filled out, implying that the
// machine from which the event originated drives tests on a ChromeOS device.
func (c *ChromeOS) IsPopulated() bool {
return c.Uptime > 0
}
type IOS struct {
OSVersion string `json:"version"` // e.g. "13.3.1". "" if it couldn't be detected.
DeviceType string `json:"device_type"` // e.g. "iPhone10,1"
Battery int `json:"battery"` // as integer percent, or BadBatteryLevel
}
// IsPopulated returns whether the IOS subevent record has been filled out, implying an attached iOS
// device.
func (i *IOS) IsPopulated() bool {
return i.DeviceType != ""
}
// Standalone represents the Swarming-style dimensions of a test machine that runs tests on itself,
// not on some attached device.
//
// We may merge this into Host later, once we get consistent about dimensions referring to either
// the host or the attached device. Right now, they're a mix. Having Standalone makes it more
// obvious, in the processor flow control, which dimension values take precedence.
type Standalone struct {
// Number of CPU cores:
Cores int `json:"cores"`
// Model of CPU, e.g. "arm64-64-Apple_M1" or "x86-64", in various precisions, e.g. "x86",
// "x86-64", "x86-64-i7-9750H":
CPUs []string `json:"cpus"`
// Model of GPU, e.g. "1002:6821-4.0.20-3.2.8" or "8086:591e", in various precisions:
GPUs []string `json:"gpus"`
// OS version in various previsions, e.g. ["Mac-10", "Mac-10.15", "Mac-10.15.7"]:
OSVersions []string `json:"os_versions"`
// IsGCEMachine is true if the machine is running on GCE.
IsGCEMachine bool `json:"gce"`
// GCEMachineType is the type of GCE machine, e.g. "n1-highmem-2".
GCEMachineType string `json:"machine_type"`
// IsDockerInstalled indicates where the "docker" binary is available on the machine.
IsDockerInstalled bool `json:"docker_installed"`
}
// IsPopulated returns whether the Standalone subevent record is filled out, which is the case iff a
// host is explicitly marked as having no device in the machineserver UI. IsPopulated does not
// return true when a device merely falls off a host accidentally.
func (s *Standalone) IsPopulated() bool {
return s.Cores > 0
}
// Event is the information a machine should send via Source when its local state has changed.
type Event struct {
EventType EventType `json:"type"`
Android Android `json:"android"`
ChromeOS ChromeOS `json:"chromeos"`
IOS IOS `json:"ios"`
Standalone Standalone `json:"standalone"`
Host Host `json:"host"`
RunningSwarmingTask bool `json:"running_swarming_task"`
// ForcedQuarantine is true if a TaskDriver or Recipe wrote a file $HOME/${MACHINE_ID}.force_quarantine.
ForcedQuarantine bool `json:"forced_quarantine"`
// LaunchedSwarming is true if test_machine_monitor launched Swarming.
LaunchedSwarming bool `json:"launched_swarming"`
}
// NewEvent returns a new Event instance.
func NewEvent() Event {
return Event{
EventType: EventTypeRawState,
}
}
// TaskResult is a task result.
type TaskResult struct {
// TaskResult from Task Scheduler stored as JSONB so that it may change without affecting
// machineserver code.
TaskResult types.TaskResult `sql:"result JSONB NOT NULL"`
// ID should match the TaskResult's ID.
ID string `sql:"id STRING NOT NULL PRIMARY KEY"`
MachineID string `sql:"machine_id STRING NOT NULL"`
Finished time.Time `sql:"finished TIMESTAMPTZ NOT NULL"`
Status types.TaskStatus `sql:"status STRING NOT NULL DEFAULT ''"`
machineIndex struct{} `sql:"INDEX by_machine_id (machine_id)"`
statusIndex struct{} `sql:"INDEX by_status (status)"`
}