Remove old jumphost prometheus installation.
This renames the role
install_prometheus_and_alert_to_pubsub
to
install_alert_to_pubsub
and drops all the Prometheus-related files and tasks, which
are now covered by the
install_managed_prometheus
role.
Bug: skia:13542
Change-Id: I5dad5f0541243dd4013d0f3d5245e4e0ecb172e2
Reviewed-on: https://skia-review.googlesource.com/c/buildbot/+/561418
Reviewed-by: Ravi Mistry <rmistry@google.com>
Commit-Queue: Joe Gregorio <jcgregorio@google.com>
diff --git a/skolo/ansible/switchboard/jumphosts.yml b/skolo/ansible/switchboard/jumphosts.yml
index 6b29173..4891701 100644
--- a/skolo/ansible/switchboard/jumphosts.yml
+++ b/skolo/ansible/switchboard/jumphosts.yml
@@ -22,7 +22,7 @@
- role: install_router_backup
router_backup_ansible_version:
'{{ router_backup_ansible_version_override }}'
- - role: install_prometheus_and_alert_to_pubsub
+ - role: install_alert_to_pubsub
alert_to_pubsub_ansible_version:
'{{ alert_to_pubsub_ansible_version_override }}'
- role: install_managed_prometheus
diff --git a/skolo/ansible/switchboard/prom.yml b/skolo/ansible/switchboard/prom.yml
index 4c87a23..25f77af 100644
--- a/skolo/ansible/switchboard/prom.yml
+++ b/skolo/ansible/switchboard/prom.yml
@@ -1,6 +1,6 @@
-# Configures metrics and alerting on each jumphost.
+# Configures alert-to-pubsub on each jumphost.
#
-# A slimmed down playbook that just deals with Prometheus and alert_to_pubsub.
+# See the install_managed_prometheus role for how Prometheus is installed.
#
# The primary jumphost playbook is jumphosts.yaml.
- hosts: jumphosts
@@ -10,6 +10,6 @@
alert_to_pubsub_ansible_version_override: ''
roles:
- - role: install_prometheus_and_alert_to_pubsub
+ - role: install_alert_to_pubsub
alert_to_pubsub_ansible_version:
'{{ alert_to_pubsub_ansible_version_override }}'
diff --git a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/README.md b/skolo/ansible/switchboard/roles/install_alert_to_pubsub/README.md
similarity index 84%
rename from skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/README.md
rename to skolo/ansible/switchboard/roles/install_alert_to_pubsub/README.md
index af2fbdd..72e85e5 100644
--- a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/README.md
+++ b/skolo/ansible/switchboard/roles/install_alert_to_pubsub/README.md
@@ -1,11 +1,10 @@
# Role Name
-`install_prometheus_and_alert_to_pubsub`
+`install_alert_to_pubsub`
## Description
-Builds alert-to-pubsub and deploys it along with Prometheus to each of the
-jumphosts.
+Builds alert-to-pubsub and deploys it to each of the jumphosts.
## Arguments
@@ -23,7 +22,7 @@
- hosts: jumphosts
roles:
- - install_prometheus_and_alert_to_pubsub
+ - install_alert_to_pubsub
## Pushing a test/debug binary:
diff --git a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/meta/argument_specs.yml b/skolo/ansible/switchboard/roles/install_alert_to_pubsub/meta/argument_specs.yml
similarity index 71%
rename from skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/meta/argument_specs.yml
rename to skolo/ansible/switchboard/roles/install_alert_to_pubsub/meta/argument_specs.yml
index ca36af9..32b6949 100644
--- a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/meta/argument_specs.yml
+++ b/skolo/ansible/switchboard/roles/install_alert_to_pubsub/meta/argument_specs.yml
@@ -1,12 +1,12 @@
argument_specs:
main:
short_description:
- The main entry point for the install_prometheus_and_alert_to_pubsub role.
+ The main entry point for the install_alert_to_pubsub role.
options:
alert_to_pubsub_ansible_version:
- type: "str"
+ type: 'str'
required: false
- default: ""
+ default: ''
description:
'The version of alert_to_pubsub_ansible to deploy, e.g.
2021-09-17T22:47:32Z-jcgregorio-77033a142afc2ebf670339c958cbb4eb738c88ee-dirty'
diff --git a/skolo/ansible/switchboard/roles/install_alert_to_pubsub/tasks/main.yml b/skolo/ansible/switchboard/roles/install_alert_to_pubsub/tasks/main.yml
new file mode 100644
index 0000000..ee1fe2c
--- /dev/null
+++ b/skolo/ansible/switchboard/roles/install_alert_to_pubsub/tasks/main.yml
@@ -0,0 +1,35 @@
+- name: Load alert_to_pubsub_ansible executables.
+ import_role:
+ name: get_ansible_binaries
+ vars:
+ get_ansible_binaries_application: alert_to_pubsub_ansible
+ get_ansible_binaries_version: '{{ alert_to_pubsub_ansible_version }}'
+
+- name: Copy over alert_to_pubsub service file.
+ become: true
+ template:
+ src: templates/alert_to_pubsub.service
+ dest: /etc/systemd/system/alert_to_pubsub.service
+ owner: root
+ group: root
+ mode: '0644'
+
+- name: Copy over alert_to_pubsub_ansible executable.
+ become: true
+ copy:
+ src:
+ "{{ get_ansible_binaries_directory.path }}/build/{{
+ ansible_facts['system'] }}/{{ ansible_facts['architecture']
+ }}/alert_to_pubsub_ansible"
+ dest: /usr/local/bin/alert_to_pubsub_ansible
+ owner: root
+ group: root
+ mode: '0755'
+
+- name: restart_alert_to_pubsub
+ become: true
+ systemd:
+ enabled: true
+ state: restarted
+ name: alert_to_pubsub
+ daemon_reload: true
diff --git a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/templates/alert_to_pubsub.service b/skolo/ansible/switchboard/roles/install_alert_to_pubsub/templates/alert_to_pubsub.service
similarity index 100%
rename from skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/templates/alert_to_pubsub.service
rename to skolo/ansible/switchboard/roles/install_alert_to_pubsub/templates/alert_to_pubsub.service
diff --git a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/files/alerts.yml b/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/files/alerts.yml
deleted file mode 100644
index 685d59c..0000000
--- a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/files/alerts.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-groups:
- - name: general
- rules:
- - alert: JobDown
- expr: up == 0
- for: 1m
- labels:
- category: infra
- severity: critical
- annotations:
- abbr:
- '{{ $labels.job }} - {{ $labels.hostname }} - {{
- $externalLabels.cluster }}'
- description:
- '{{ $labels.job }} on {{ $labels.hostname }} in {{
- $externalLabels.cluster }} has been down for more than 5 minutes.'
-
- - alert: TooManyGoRoutines
- expr: go_goroutines > 3000
- for: 2m
- labels:
- category: infra
- severity: warning
- annotations:
- abbr: '{{ $labels.job }} - {{ $externalLabels.cluster }}'
- description:
- 'Too many Go routines in {{ $labels.hostname }} for app {{
- $labels.job }}.'
-
- - alert: TooManyOpenFDs
- expr: process_open_fds > 5000
- labels:
- category: infra
- severity: warning
- annotations:
- abbr: '{{ $labels.job }} - {{ $externalLabels.cluster }}'
- description:
- 'Too many open file handles on {{ $labels.hostname }} for app {{
- $labels.job }}.'
diff --git a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/tasks/main.yml b/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/tasks/main.yml
deleted file mode 100644
index 9227617..0000000
--- a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/tasks/main.yml
+++ /dev/null
@@ -1,136 +0,0 @@
-- name: Check if Prometheus binary needs to be downloaded.
- stat:
- path:
- "{{ all.repo_root }}/skolo/build/{{ ansible_facts['system'] }}/{{
- ansible_facts['architecture'] }}/prometheus-{{ all.prometheus.version
- }}.linux-amd64/prometheus"
- register: prometheus_binary
-
-- name: Download Prometheus binary.
- delegate_to: 127.0.0.1
- throttle: 1
- make:
- chdir: '{{ all.repo_root }}/skolo/'
- target:
- "download_prometheus_{{ ansible_facts['architecture'] }}_{{
- ansible_facts['system'] }}"
- params:
- PROMETHEUS_VERSION: '{{ all.prometheus.version }}'
- when: not prometheus_binary.stat.exists
-
-- name: Copy over service file.
- become: true
- template:
- src: templates/prometheus.service
- dest: /etc/systemd/system/prometheus.service
- owner: root
- group: root
- mode: '0644'
-
-- name: Copy over prometheus executable.
- become: true
- copy:
- src:
- "{{ all.repo_root }}/skolo/build/{{ ansible_facts['system'] }}/{{
- ansible_facts['architecture'] }}/prometheus-{{ all.prometheus.version
- }}.linux-amd64/prometheus"
- dest: /usr/local/bin/prometheus
- owner: root
- group: root
- mode: '0755'
-
-- name: Copy over promtool executable.
- become: true
- copy:
- src:
- "{{ all.repo_root }}/skolo/build/{{ ansible_facts['system'] }}/{{
- ansible_facts['architecture'] }}/prometheus-{{ all.prometheus.version
- }}.linux-amd64/promtool"
- dest: /usr/local/bin/promtool
- owner: root
- group: root
- mode: '0755'
-
-- name: Ensure /etc/prometheus directory exists.
- become: true
- file:
- path: /etc/prometheus
- state: directory
- owner: chrome-bot
- group: chrome-bot
- mode: '0755'
-
-- name: Copy over prometheus config file.
- template:
- src: templates/prometheus.yml
- dest: /etc/prometheus/prometheus.yml
- owner: chrome-bot
- group: chrome-bot
- mode: '0644'
-
-- name: Copy over alerts file.
- copy:
- src: files/alerts.yml
- dest: /etc/prometheus/alerts.yml
- owner: chrome-bot
- group: chrome-bot
- mode: '0644'
-
-- name: Ensure tsdb directory exists.
- become: true
- file:
- path: /var/lib/prometheus/tsdb
- state: directory
- owner: chrome-bot
- group: chrome-bot
- mode: '0755'
-
-- name: Load alert_to_pubsub_ansible executables.
- import_role:
- name: get_ansible_binaries
- vars:
- get_ansible_binaries_application: alert_to_pubsub_ansible
- get_ansible_binaries_version: '{{ alert_to_pubsub_ansible_version }}'
-
-- name: Copy over alert_to_pubsub service file.
- become: true
- template:
- src: templates/alert_to_pubsub.service
- dest: /etc/systemd/system/alert_to_pubsub.service
- owner: root
- group: root
- mode: '0644'
-
-- name: Copy over alert_to_pubsub_ansible executable.
- become: true
- copy:
- src:
- "{{ get_ansible_binaries_directory.path }}/build/{{
- ansible_facts['system'] }}/{{ ansible_facts['architecture']
- }}/alert_to_pubsub_ansible"
- dest: /usr/local/bin/alert_to_pubsub_ansible
- owner: root
- group: root
- mode: '0755'
-
-- name: restart_prometheus
- become: true
- systemd:
- enabled: true
- state: restarted
- name: prometheus
- daemon_reload: true
-
-- name: restart_alert_to_pubsub
- become: true
- systemd:
- enabled: true
- state: restarted
- name: alert_to_pubsub
- daemon_reload: true
-
-- name: validate_prometheus_config
- command: /usr/local/bin/promtool check config /etc/prometheus/prometheus.yml
-
-- name: validate_prometheus_rules
- command: /usr/local/bin/promtool check rules /etc/prometheus/alerts.yml
diff --git a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/templates/prometheus.service b/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/templates/prometheus.service
deleted file mode 100644
index daa06f7..0000000
--- a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/templates/prometheus.service
+++ /dev/null
@@ -1,12 +0,0 @@
-[Service]
-User=chrome-bot
-Group=chrome-bot
-ExecStart=/usr/local/bin/prometheus \
- --config.file=/etc/prometheus/prometheus.yml \
- --storage.tsdb.path=/var/lib/prometheus/data \
- --storage.tsdb.retention.time=60d \
- --web.listen-address=:{{ all.prometheus.web_server_port }}
-ExecReload=/bin/kill -HUP $MAINPID
-
-[Install]
-WantedBy=multi-user.target
\ No newline at end of file
diff --git a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/templates/prometheus.yml b/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/templates/prometheus.yml
deleted file mode 100644
index 9edadd1..0000000
--- a/skolo/ansible/switchboard/roles/install_prometheus_and_alert_to_pubsub/templates/prometheus.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-global:
- scrape_interval: 15s
- evaluation_interval: 15s
-
- external_labels:
- cluster: skolo-{{ inventory_hostname }}
-
-alerting:
- alertmanagers:
- - api_version: v1
- static_configs:
- - targets:
- - localhost:{{ all.prometheus.alert_to_pubsub_port }}
-
-rule_files:
- - "alerts.yml"
-
-scrape_configs:
- - job_name: 'prometheus'
- static_configs:
- - targets: ['localhost:8000']
- labels:
- job: 'prometheus'
- hostname: 'jumphost'
- - targets: ['localhost:{{ all.prometheus.monitoring.ports.metadata_server_ansible }}']
- labels:
- job: 'metadata_server_ansible'
- hostname: 'jumphost'
- - targets: ['localhost:{{ all.prometheus.monitoring.ports.router_backup_ansible }}']
- labels:
- job: 'router_backup_ansible'
- hostname: 'jumphost'
- - targets: ['localhost:{{ all.prometheus.monitoring.ports.powercycle_server_ansible }}']
- labels:
- job: 'powercycle_server_ansible'
- hostname: 'jumphost'
- - targets: ['localhost:{{ all.prometheus.monitoring.ports.alert_to_pubsub_ansible }}']
- labels:
- job: 'alert_to_pubsub_ansible'
- hostname: 'jumphost'
- - job_name: 'test_machines'
- static_configs:
-{% for hostname in hostvars['rack4']['groups'][inventory_hostname + '_machines'] %}
- - targets: ['{{ hostname }}:{{ all.prometheus.monitoring.ports.test_machine_monitor }}']
- labels:
- job: 'test_machine_monitor'
- hostname: '{{ hostname }}'
-{% endfor %}