#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Python utility to compare two CSV files and output HTML results."""
import csv
import datetime
import optparse
import os
import re
import sys
import tempfile
# Add the django settings file to DJANGO_SETTINGS_MODULE.
import django
os.environ['DJANGO_SETTINGS_MODULE'] = 'csv-django-settings'
django.setup()
from django.template import loader
GS_HTML_BROWSER_LINK = (
'https://console.cloud.google.com/storage/browser/cluster-telemetry')
GS_HTML_DIRECT_LINK = (
'https://console.cloud.google.com/m/cloudstorage/b/cluster-telemetry/o')
def _GetPercentageDiff(value1, value2):
"""Returns the percentage difference between the specified values."""
difference = value2 - value1
avg = (value2 + value1)/2
return 0 if avg == 0 else difference/avg * 100
def _GetPercentageChange(value1, value2):
"""Returns the percentage change between the specified values."""
difference = value2 - value1
return 0 if value1 == 0 else difference/value1 * 100
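# A minimal illustration of _GetPercentageDiff and _GetPercentageChange above
# (hypothetical numbers, not executed by this script):
#
#   _GetPercentageDiff(10, 20)    # -> ~66.67 (difference 10 over average 15)
#   _GetPercentageChange(10, 20)  # -> 100.0  (difference 10 over value1 10)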
class PageValues(object):
"""Container class to hold the values of a page name."""
def __init__(self, page_name, value1, value2, perc_diff, perc_change,
pageset_link, archive_link, traceUrls1, traceUrls2):
self.page_name = page_name
self.value1 = value1
self.value2 = value2
self.perc_diff = perc_diff
self.perc_change = perc_change
self.pageset_link = pageset_link
self.archive_link = archive_link
self.traceUrls1 = traceUrls1.split(',') if traceUrls1 else []
self.traceUrls2 = traceUrls2.split(',') if traceUrls2 else []
class FieldNameValues(object):
"""Container class to hold the values of a field name."""
def __init__(self, value1, value2, perc_diff, total_webpages_reported):
self.value1 = value1
self.value2 = value2
self.perc_diff = perc_diff
self.total_webpages_reported = total_webpages_reported
class CsvComparer(object):
"""Class that compares two telemetry CSV files and outputs HTML results."""
def __init__(self, csv_file1, csv_file2, output_html_dir, requester_email,
chromium_patch_link, skia_patch_link,
variance_threshold, absolute_url, min_pages_in_each_field,
discard_outliers, raw_csv_nopatch, raw_csv_withpatch,
num_repeated, target_platform, crashed_instances,
missing_devices, browser_args_nopatch, browser_args_withpatch,
pageset_type, chromium_hash, skia_hash, missing_output_workers,
logs_link_prefix, description, total_archives):
"""Constructs a CsvComparer instance."""
self._csv_file1 = csv_file1
self._csv_file2 = csv_file2
self._output_html_dir = output_html_dir
self._requester_email = requester_email
self._chromium_patch_link = chromium_patch_link
self._skia_patch_link = skia_patch_link
self._variance_threshold = float(variance_threshold)
self._absolute_url = absolute_url
    self._min_pages_in_each_field = int(min_pages_in_each_field)
self._discard_outliers = float(discard_outliers)
self._raw_csv_nopatch = raw_csv_nopatch
self._raw_csv_withpatch = raw_csv_withpatch
self._num_repeated = num_repeated
self._target_platform = target_platform
self._crashed_instances = crashed_instances
self._missing_devices = missing_devices
self._browser_args_nopatch = browser_args_nopatch
self._browser_args_withpatch = browser_args_withpatch
self._pageset_type = pageset_type
self._chromium_hash = chromium_hash
self._skia_hash = skia_hash
self._missing_output_workers = missing_output_workers
self._logs_link_prefix = logs_link_prefix
self._description = description
self._total_archives = total_archives
def _IsPercDiffSameOrAboveThreshold(self, perc_diff):
"""Compares the specified diff to the variance threshold.
Returns True if the difference is at or above the variance threshold.
"""
return abs(perc_diff) >= self._variance_threshold
def _GetSortedCSV(self, unsorted_csv_reader):
"""Sorts the specified CSV by page_name into a new CSV file."""
result = sorted(unsorted_csv_reader, key=lambda d: d['page_name'])
fd, sorted_csv_file = tempfile.mkstemp()
# Close the fd.
os.close(fd)
with open(sorted_csv_file, 'w') as f:
writer = csv.DictWriter(f, unsorted_csv_reader.fieldnames)
writer.writeheader()
writer.writerows(result)
return sorted_csv_file
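  # The sorted temp files produced by _GetSortedCSV let Compare() below walk
  # both CSVs in lockstep by page_name; Compare() deletes the temp files once
  # the walk is done.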
def Compare(self):
"""Method that does the CSV comparision."""
# Do one pass of all the page_names in the 1st CSV and store them.
# The purpose of this is that when we walk through the 2nd CSV we will know
# Whether the same page exists in the 1st CSV (the pages are ordered the
# same way in both files but some could be missing from each file).
csv1_page_names = {}
with open(self._csv_file1, 'r') as f1:
csv1_reader = csv.DictReader(f1)
for row in csv1_reader:
csv1_page_names[row['page_name']] = 1
# Sort both CSVs.
with open(self._csv_file1, 'r') as f1, open(self._csv_file2, 'r') as f2:
sorted_csv1_filepath = self._GetSortedCSV(csv.DictReader(f1))
sorted_csv2_filepath = self._GetSortedCSV(csv.DictReader(f2))
with open(sorted_csv1_filepath, 'r') as sorted_csv1, \
open(sorted_csv2_filepath, 'r') as sorted_csv2:
csv1_reader = csv.DictReader(sorted_csv1)
csv2_reader = csv.DictReader(sorted_csv2)
# Dictionary that holds the fieldname to the ongoing total on both CSVs.
fieldnames_to_totals = {}
# Map of a fieldname to list of tuples containing (page_name,
# csv_value1, csv_value2, percentage_difference).
fieldnames_to_page_values = {}
      # Map of a fieldname to the list of discarded page values.
fieldnames_to_discards = {}
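      # For example (hypothetical field name): fieldnames_to_totals could map
      # 'mean_frame_time' to the running totals of that column across both
      # CSVs, while fieldnames_to_page_values maps it to the PageValues of the
      # webpages whose percentage difference crossed the variance threshold.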
# Now walk through both CSV files with a pointer at each one and collect
# the value totals.
for csv2_row in csv2_reader:
        # Make sure the CSV2 page_name exists in CSV1, else skip it (move the
        # CSV2 pointer down).
page_name2 = csv2_row['page_name']
if page_name2 not in csv1_page_names:
continue
# Reach the right page_name in CSV1 (move CSV1 pointer down).
try:
          csv1_row = next(csv1_reader)
while csv1_row['page_name'] != page_name2:
csv1_row = next(csv1_reader)
except StopIteration:
# Reached the end of CSV1, break out of the row loop.
break
# Store values for all fieldnames (except page_name).
for fieldname in csv2_reader.fieldnames:
if fieldname != 'page_name' and fieldname in csv1_row:
if csv1_row[fieldname] == '' or csv2_row[fieldname] == '':
# TODO(rmistry): Check with tonyg about what the algorithm
# should be doing when one CSV has an empty value and the other
# does not.
continue
try:
if csv1_row[fieldname] == '-':
csv1_value = 0
else:
csv1_value = float(csv1_row.get(fieldname))
if csv2_row[fieldname] == '-':
csv2_value = 0
else:
csv2_value = float(csv2_row.get(fieldname))
except ValueError:
# We expected only floats, cannot compare strings. Skip field.
continue
# Update the total in the dict.
fieldname_values = fieldnames_to_totals.get(
fieldname, FieldNameValues(0, 0, 0, 0))
fieldname_values.value1 += csv1_value
fieldname_values.value2 += csv2_value
fieldnames_to_totals[fieldname] = fieldname_values
perc_diff = _GetPercentageDiff(csv1_value, csv2_value)
if self._IsPercDiffSameOrAboveThreshold(perc_diff):
rank = 1
worker_num = 1
m = re.match(r".* \(#([0-9]+)\)", page_name2)
if m and m.group(1):
rank = int(m.group(1))
while rank > worker_num * 100:
worker_num += 1
pageset_link = (
'%s/swarming/page_sets/%s/%s/%s.py' % (
GS_HTML_DIRECT_LINK, self._pageset_type, rank, rank))
archive_link = (
'%s/swarming/webpage_archives/%s/%s' % (
GS_HTML_BROWSER_LINK, self._pageset_type, rank))
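              # Illustrative example (hypothetical page name): for
              # 'http://www.example.com (#123)' the regex extracts rank=123,
              # so pageset_link ends with
              # 'swarming/page_sets/<pageset_type>/123/123.py' and
              # archive_link with 'swarming/webpage_archives/<pageset_type>/123'.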
# Add this page only if its diff is above the threshold.
              page_value_list = fieldnames_to_page_values.get(fieldname, [])
pv = PageValues(
page_name2, csv1_value, csv2_value, perc_diff,
_GetPercentageChange(csv1_value, csv2_value), pageset_link,
archive_link, csv1_row.get('traceUrls'),
csv2_row.get('traceUrls'))
              page_value_list.append(pv)
              fieldnames_to_page_values[fieldname] = page_value_list
os.remove(sorted_csv1_filepath)
os.remove(sorted_csv2_filepath)
    # Calculate and add the percentage differences for each fieldname.
    # The fieldnames_to_totals dict is modified in the loop below to remove
    # entries which are below the threshold.
for fieldname in list(fieldnames_to_totals):
fieldname_values = fieldnames_to_totals[fieldname]
if fieldname not in fieldnames_to_page_values:
del fieldnames_to_totals[fieldname]
continue
page_values = fieldnames_to_page_values[fieldname]
# Sort page values by the percentage difference.
page_values.sort(key=lambda page_value: page_value.perc_diff,
reverse=True)
      if self._discard_outliers:
        # Lose the top X% and the bottom X%.
        outliers_num = int(len(page_values) * self._discard_outliers / 100)
        # Guard against outliers_num == 0; a slice like page_values[-0:]
        # would otherwise discard every page.
        if outliers_num:
          top_outliers = page_values[0:outliers_num]
          bottom_outliers = page_values[-outliers_num:]
          # Discard top and bottom outliers.
          fieldnames_to_page_values[fieldname] = (
              page_values[outliers_num:-outliers_num])
          # Remove discarded values from the running totals.
          for discarded_page in top_outliers + bottom_outliers:
            fieldname_values.value1 -= discarded_page.value1
            fieldname_values.value2 -= discarded_page.value2
          fieldnames_to_discards[fieldname] = top_outliers + bottom_outliers
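      # Illustrative arithmetic (hypothetical numbers): with
      # discard_outliers=10 and 25 reported pages, outliers_num is
      # int(25 * 10 / 100) = 2, so the two highest and two lowest perc_diff
      # pages are dropped from the totals above.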
perc_diff = _GetPercentageDiff(fieldname_values.value1,
fieldname_values.value2)
if self._IsPercDiffSameOrAboveThreshold(perc_diff):
if (len(fieldnames_to_page_values[fieldname]) <
self._min_pages_in_each_field):
          # This field does not have enough webpages; delete it from both maps.
print('Removing because not enough webpages: %s' % fieldname)
print(len(fieldnames_to_page_values[fieldname]))
del fieldnames_to_page_values[fieldname]
del fieldnames_to_totals[fieldname]
continue
fieldname_values.perc_diff = perc_diff
fieldname_values.perc_change = _GetPercentageChange(
fieldname_values.value1, fieldname_values.value2)
else:
        # Remove fieldnames whose total percentage difference is below the
        # variance threshold.
print('Removing because below the variance threshold: %s' % fieldname)
del fieldnames_to_totals[fieldname]
    # Delete keys in fieldnames_to_page_values that are not in
    # fieldnames_to_totals; only the fields present in both maps are
    # displayed.
fieldnames_to_page_values = dict(
(k,v) for k,v in fieldnames_to_page_values.items()
if k in fieldnames_to_totals)
# Both maps should end up with the same number of keys.
assert set(fieldnames_to_page_values.keys()) == set(
fieldnames_to_totals.keys())
# Set the number of reporting webpages in fieldnames_to_totals.
for fieldname, values in fieldnames_to_page_values.items():
fieldnames_to_totals[fieldname].total_webpages_reported = len(values)
# Done processing. Output the HTML.
self.OutputToHTML(fieldnames_to_totals, fieldnames_to_page_values,
fieldnames_to_discards, self._output_html_dir)
def OutputToHTML(self, fieldnames_to_totals, fieldnames_to_page_values,
fieldnames_to_discards, html_dir):
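    """Renders the comparison results into HTML files in the output dir."""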
# Calculate the current UTC time.
html_report_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')
# Output the main totals HTML page.
    sorted_fieldnames_totals_items = sorted(
        fieldnames_to_totals.items(), key=lambda item: item[1].perc_diff,
        reverse=True)
missing_output_workers_list = []
if self._missing_output_workers:
missing_output_workers_list = self._missing_output_workers.split(' ')
rendered = loader.render_to_string(
'csv_totals.html',
{'sorted_fieldnames_totals_items': sorted_fieldnames_totals_items,
'requester_email': self._requester_email,
'chromium_patch_link': self._chromium_patch_link,
'skia_patch_link': self._skia_patch_link,
'raw_csv_nopatch': self._raw_csv_nopatch,
'raw_csv_withpatch': self._raw_csv_withpatch,
'threshold': self._variance_threshold,
'discard_outliers': self._discard_outliers,
'min_webpages': self._min_pages_in_each_field,
'num_repeated': self._num_repeated,
'target_platform': self._target_platform,
'crashed_instances': self._crashed_instances,
'missing_devices': self._missing_devices,
'browser_args_nopatch': self._browser_args_nopatch,
'browser_args_withpatch': self._browser_args_withpatch,
'absolute_url': self._absolute_url,
'pageset_type': self._pageset_type,
'html_report_date': html_report_date,
'chromium_hash': self._chromium_hash,
'skia_hash': self._skia_hash,
'missing_output_workers': missing_output_workers_list,
'logs_link_prefix': self._logs_link_prefix,
'description': self._description,
})
index_html_path = os.path.join(self._output_html_dir, 'index.html')
with open(index_html_path, 'w') as index_html:
index_html.write(rendered)
# Output the different per-fieldname HTML pages.
fieldname_count = 0
# pylint: disable=W0612
for fieldname, unused_values in sorted_fieldnames_totals_items:
fieldname_count += 1
page_values = fieldnames_to_page_values[fieldname]
rendered = loader.render_to_string(
'csv_per_page.html',
{'fieldname': fieldname,
'page_values': page_values,
'discard_outliers': self._discard_outliers,
'discarded_webpages': fieldnames_to_discards.get(fieldname, []),
'total_archives': self._total_archives,
'absolute_url': self._absolute_url})
with open(
os.path.join(self._output_html_dir,
'fieldname%s.html' % fieldname_count), 'w') as fieldname_html:
fieldname_html.write(rendered)
if __name__ == '__main__':
option_parser = optparse.OptionParser()
option_parser.add_option(
'', '--csv_file1',
help='The absolute path to the first CSV file.')
option_parser.add_option(
'', '--csv_file2',
help='The absolute path to the second CSV file.')
option_parser.add_option(
'', '--output_html_dir',
      help='The absolute path of the HTML dir that will contain the results of'
           ' the comparison CSV.')
option_parser.add_option(
'', '--requester_email',
help='Email address of the user who kicked off the run.')
option_parser.add_option(
'', '--chromium_patch_link',
help='Link to the Chromium patch used for this run.')
option_parser.add_option(
'', '--skia_patch_link',
help='Link to the Skia patch used for this run.')
option_parser.add_option(
'', '--variance_threshold',
help='The allowable variance in percentage between total values for each '
'field for the two CSVs.')
option_parser.add_option(
'', '--absolute_url',
help='Servers like Google Storage require an absolute url for links '
'within the HTML output files.',
default='')
option_parser.add_option(
'', '--min_pages_in_each_field',
      help='The minimum number of pages that must report a fieldname. If a '
           'fieldname has fewer pages than this then it is not reported as a '
           'failure even if the percentage difference is more than the '
           'variance threshold.',
      default=0)
option_parser.add_option(
'', '--discard_outliers',
      help='Determines the percentage of outliers that will be discarded from '
           'the top and bottom values. E.g.: if this value is 10% and the '
           'number of webpages in a field is 10 then the 1st and 10th '
           'webpages are discarded.',
      default=10)
option_parser.add_option(
'', '--num_repeated',
help='The number of times each pageset was run.')
option_parser.add_option(
'', '--raw_csv_nopatch',
help='Link to the raw CSV output of the nopatch run.')
option_parser.add_option(
'', '--raw_csv_withpatch',
help='Link to the raw CSV output of the withpatch run.')
option_parser.add_option(
'', '--crashed_instances',
help='Text that lists any crashed instances.')
option_parser.add_option(
'', '--missing_devices',
help='Text that lists all instances with missing Android devices.')
option_parser.add_option(
'', '--target_platform',
help='The platform telemetry benchmarks/measurements were run on.')
option_parser.add_option(
'', '--browser_args_nopatch',
help='The browser args that were used for the nopatch run.')
option_parser.add_option(
'', '--browser_args_withpatch',
help='The browser args that were used for the withpatch run.')
option_parser.add_option(
'', '--pageset_type',
help='The page set type this run was done on.')
option_parser.add_option(
'', '--chromium_hash',
help='The chromium git hash that was used for this run.')
option_parser.add_option(
'', '--skia_hash',
help='The skia git hash that was used for this run.',
default='')
option_parser.add_option(
'', '--missing_output_workers',
help='Workers which had no output for this run.')
option_parser.add_option(
'', '--logs_link_prefix',
help='Prefix link to the logs of the workers.')
option_parser.add_option(
'', '--description',
help='The description of the run as entered by the requester.')
option_parser.add_option(
'', '--total_archives',
help='Number of archives that were used to get these results.')
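  # Example invocation (all paths, links and values below are hypothetical):
  #
  #   python <path to this script> \
  #       --csv_file1=/tmp/nopatch.csv --csv_file2=/tmp/withpatch.csv \
  #       --output_html_dir=/tmp/html --requester_email=user@example.com \
  #       --chromium_patch_link=<link> --skia_patch_link=<link> \
  #       --variance_threshold=5 --raw_csv_nopatch=<link> \
  #       --raw_csv_withpatch=<link> --num_repeated=3 \
  #       --target_platform=Linux --pageset_type=10k \
  #       --chromium_hash=<hash> --description='test run'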
options, unused_args = option_parser.parse_args()
if not (options.csv_file1 and options.csv_file2 and options.output_html_dir
and options.variance_threshold and options.requester_email
and options.chromium_patch_link
and options.skia_patch_link and options.raw_csv_nopatch
and options.raw_csv_withpatch and options.num_repeated
and options.target_platform and options.pageset_type
and options.chromium_hash and options.description):
option_parser.error('Must specify csv_file1, csv_file2, output_html_dir, '
'variance_threshold, requester_email, '
'chromium_patch_link, '
'skia_patch_link, raw_csv_nopatch, description, '
'raw_csv_withpatch, num_repeated, pageset_type, '
'chromium_hash and target_platform')
sys.exit(CsvComparer(
options.csv_file1, options.csv_file2, options.output_html_dir,
options.requester_email, options.chromium_patch_link,
options.skia_patch_link,
options.variance_threshold, options.absolute_url,
options.min_pages_in_each_field, options.discard_outliers,
options.raw_csv_nopatch, options.raw_csv_withpatch,
options.num_repeated, options.target_platform,
options.crashed_instances, options.missing_devices,
options.browser_args_nopatch, options.browser_args_withpatch,
options.pageset_type, options.chromium_hash, options.skia_hash,
options.missing_output_workers, options.logs_link_prefix,
options.description, options.total_archives).Compare())