# blob: 9e0019ceee9ef2420db794f042a726baec8c0efb [file] [log] [blame]
#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Python utility to compare two CSV files and output HTML results."""
import csv
import datetime
import optparse
import os
import re
import sys
import tempfile
# Add the django settings file to DJANGO_SETTINGS_MODULE.
import django
os.environ['DJANGO_SETTINGS_MODULE'] = 'csv-django-settings'
from django.template import loader
def _GetPercentageDiff(value1, value2):
    """Returns the percentage difference between the specified values.

    The difference (value2 - value1) is expressed as a percentage of the mean
    of the two values.

    Args:
      value1: The first (baseline) number.
      value2: The second number.

    Returns:
      The percentage difference, or 0 when the mean of the two values is 0.
    """
    # Coerce to float so that integer inputs are not truncated by Python 2's
    # integer division.
    first = float(value1)
    second = float(value2)
    difference = second - first
    avg = (second + first) / 2
    return 0 if avg == 0 else difference / avg * 100
def _GetPercentageChange(value1, value2):
    """Returns the percentage change between the specified values.

    The change (value2 - value1) is expressed as a percentage of value1.

    Args:
      value1: The original number.
      value2: The new number.

    Returns:
      The percentage change, or 0 when value1 is 0 (the change is undefined).
    """
    if value1 == 0:
        return 0
    # Coerce to float so that integer inputs are not truncated by Python 2's
    # integer division.
    base = float(value1)
    return (float(value2) - base) / base * 100
class PageValues(object):
    """Container class to hold the values of a page name.

    All constructor arguments are stored unchanged under the same attribute
    names, except the two trace URL arguments, which are split on commas into
    lists.
    """

    def __init__(self, page_name, value1, value2, perc_diff, perc_change,
                 pageset_link, archive_link, traceUrls1, traceUrls2):
        """Constructs a PageValues instance.

        Args:
          page_name: Name of the page the values belong to.
          value1: Value of the field for this page in the first CSV.
          value2: Value of the field for this page in the second CSV.
          perc_diff: Percentage difference between value1 and value2.
          perc_change: Percentage change from value1 to value2.
          pageset_link: Link to the page set of this page.
          archive_link: Link to the webpage archive of this page.
          traceUrls1: Comma separated string of trace URLs, or a falsy value.
          traceUrls2: Comma separated string of trace URLs, or a falsy value.
        """
        self.page_name = page_name
        self.value1 = value1
        self.value2 = value2
        self.perc_diff = perc_diff
        self.perc_change = perc_change
        self.pageset_link = pageset_link
        self.archive_link = archive_link
        # A falsy value (None or '') yields an empty list instead of [''].
        self.traceUrls1 = traceUrls1.split(',') if traceUrls1 else []
        self.traceUrls2 = traceUrls2.split(',') if traceUrls2 else []
class FieldNameValues(object):
    """Container class to hold the values of a field name."""

    def __init__(self, value1, value2, perc_diff, total_webpages_reported):
        """Constructs a FieldNameValues instance.

        Args:
          value1: Total of the field's values in the first CSV.
          value2: Total of the field's values in the second CSV.
          perc_diff: Percentage difference between the two totals.
          total_webpages_reported: Number of webpages reported for the field.
        """
        self.value1 = value1
        self.value2 = value2
        self.perc_diff = perc_diff
        # CsvComparer.Compare assigns perc_change after construction; give it a
        # default here so the attribute exists on every instance.
        self.perc_change = 0
        self.total_webpages_reported = total_webpages_reported
class CsvComparer(object):
    """Class that compares two telemetry CSV files and outputs HTML results."""

    def __init__(self, csv_file1, csv_file2, output_html_dir, requester_email,
                 chromium_patch_link, skia_patch_link,
                 variance_threshold, absolute_url, min_pages_in_each_field,
                 discard_outliers, raw_csv_nopatch, raw_csv_withpatch,
                 num_repeated, target_platform, crashed_instances,
                 missing_devices, browser_args_nopatch, browser_args_withpatch,
                 pageset_type, chromium_hash, skia_hash, missing_output_slaves,
                 logs_link_prefix, description, total_archives):
        """Constructs a CsvComparer instance.

        Every argument is stored on the instance under a leading-underscore
        attribute of the same name. Only the two numeric settings below are
        converted.

        Args:
          csv_file1: Path to the first CSV file.
          csv_file2: Path to the second CSV file.
          output_html_dir: Directory that will contain the HTML results.
          requester_email: Email address of the user who kicked off the run.
          chromium_patch_link: Link to the Chromium patch used for this run.
          skia_patch_link: Link to the Skia patch used for this run.
          variance_threshold: Allowable variance in percentage; converted to
            float.
          absolute_url: Absolute url used for links within the HTML output.
          min_pages_in_each_field: Min number of pages that must have a
            fieldname for it to be reported.
          discard_outliers: Percentage of outliers to discard from the top and
            bottom values; converted to float, unset (None or '') means 0.
          raw_csv_nopatch: Link to the raw CSV output of the nopatch run.
          raw_csv_withpatch: Link to the raw CSV output of the withpatch run.
          num_repeated: Number of times each pageset was run.
          target_platform: Platform the benchmarks were run on.
          crashed_instances: Text that lists any crashed instances.
          missing_devices: Text that lists instances with missing devices.
          browser_args_nopatch: Browser args used for the nopatch run.
          browser_args_withpatch: Browser args used for the withpatch run.
          pageset_type: The page set type this run was done on.
          chromium_hash: The chromium git hash used for this run.
          skia_hash: The skia git hash used for this run.
          missing_output_slaves: Space separated slaves which had no output.
          logs_link_prefix: Prefix link to the logs of the slaves.
          description: Description of the run as entered by the requester.
          total_archives: Number of archives used to get these results.
        """
        self._csv_file1 = csv_file1
        self._csv_file2 = csv_file2
        self._output_html_dir = output_html_dir
        self._requester_email = requester_email
        self._chromium_patch_link = chromium_patch_link
        self._skia_patch_link = skia_patch_link
        self._variance_threshold = float(variance_threshold)
        self._absolute_url = absolute_url
        self._min_pages_in_each_field = min_pages_in_each_field
        # discard_outliers is optional on the command line; float(None) would
        # raise TypeError, so treat an unset value as "discard nothing".
        self._discard_outliers = float(discard_outliers) if discard_outliers else 0.0
        self._raw_csv_nopatch = raw_csv_nopatch
        self._raw_csv_withpatch = raw_csv_withpatch
        self._num_repeated = num_repeated
        self._target_platform = target_platform
        self._crashed_instances = crashed_instances
        self._missing_devices = missing_devices
        self._browser_args_nopatch = browser_args_nopatch
        self._browser_args_withpatch = browser_args_withpatch
        self._pageset_type = pageset_type
        self._chromium_hash = chromium_hash
        self._skia_hash = skia_hash
        self._missing_output_slaves = missing_output_slaves
        self._logs_link_prefix = logs_link_prefix
        self._description = description
        self._total_archives = total_archives
def _IsPercDiffSameOrAboveThreshold(self, perc_diff):
    """Compares the specified diff to the variance threshold.

    Args:
      perc_diff: A percentage difference; its sign is ignored.

    Returns:
      True if the absolute difference is at or above the variance threshold
      stored in self._variance_threshold, False otherwise.
    """
    return abs(perc_diff) >= self._variance_threshold
def _GetSortedCSV(self, unsorted_csv_reader):
    """Sorts the specified CSV by page_name into a new CSV file.

    Args:
      unsorted_csv_reader: A csv.DictReader whose rows have a 'page_name'
        column.

    Returns:
      Path of a newly created temporary CSV file that holds the header and
      all rows of the reader ordered by page_name. The caller owns the file.
    """
    # Reading fieldnames first consumes the header row of the reader.
    fieldnames = unsorted_csv_reader.fieldnames
    sorted_rows = sorted(unsorted_csv_reader, key=lambda d: d['page_name'])
    fd, sorted_csv_file = tempfile.mkstemp()
    # Write through the descriptor returned by mkstemp so that neither it nor
    # a second file handle is leaked.
    with os.fdopen(fd, 'w') as sorted_csv:
        writer = csv.DictWriter(sorted_csv, fieldnames)
        writer.writeheader()
        writer.writerows(sorted_rows)
    return sorted_csv_file
def Compare(self):
"""Method that does the CSV comparision."""
# NOTE(review): this copy of the method has lost its indentation and several
# statements/expressions are cut off (each spot is flagged below). The notes
# describe only what the visible lines show; confirm against the upstream
# file before relying on them.
#
# Visible flow:
#   1. Collect every page_name that appears in CSV1.
#   2. Sort both CSVs by page_name into temp files and re-open them.
#   3. Walk CSV2 row by row, advance CSV1 to the same page_name, and for each
#      shared field add both values to the running totals and record the
#      pages whose percentage difference is at or above the threshold.
#   4. Per field: optionally discard outliers, recompute the percentage
#      difference of the totals and prune fields.
#   5. Pass the remaining data to OutputToHTML.
#
# The code uses Python 2 only constructs (dict.has_key, dict.iteritems and the
# print statement).
# Do one pass of all the page_names in the 1st CSV and store them.
# The purpose of this is that when we walk through the 2nd CSV we will know
# whether the same page exists in the 1st CSV (the pages are ordered the
# same way in both files but some could be missing from each file).
# Used as a set: only the keys matter, every value is 1.
csv1_page_names = {}
# NOTE(review): no close() call is visible for any file opened in this method.
csv1_reader = csv.DictReader(open(self._csv_file1, 'r'))
for row in csv1_reader:
csv1_page_names[row['page_name']] = 1
# Sort both CSVs.
unsorted_csv1_reader = csv.DictReader(open(self._csv_file1, 'r'))
sorted_csv1_filepath = self._GetSortedCSV(unsorted_csv1_reader)
sorted_csv1 = open(sorted_csv1_filepath, 'r')
csv1_reader = csv.DictReader(sorted_csv1)
unsorted_csv2_reader = csv.DictReader(open(self._csv_file2, 'r'))
sorted_csv2_filepath = self._GetSortedCSV(unsorted_csv2_reader)
sorted_csv2 = open(sorted_csv2_filepath, 'r')
csv2_reader = csv.DictReader(sorted_csv2)
# Dictionary that holds the fieldname to the ongoing total on both CSVs.
fieldnames_to_totals = {}
# Map of a fieldname to list of tuples containing (page_name, csv_value1,
# csv_value2, percentage_difference).
# (The visible code appends PageValues instances to these lists.)
fieldnames_to_page_values = {}
# Map of a fieldname to the discarded page value.
fieldnames_to_discards = {}
# Now walk through both CSV files with a pointer at each one and collect
# the value totals.
for csv2_row in csv2_reader:
# Make sure the CSV2 page_name exists in CSV1 else skip it (move CSV2
# pointer down).
page_name2 = csv2_row['page_name']
# NOTE(review): the body of this `if` is missing; per the comment above it
# presumably skips the row (e.g. `continue`).
if not csv1_page_names.has_key(page_name2):
# Reach the right page_name in CSV1 (move CSV1 pointer down).
# NOTE(review): the right-hand sides of the two assignments below and the
# `try:` that pairs with the `except` are cut off; presumably they fetch the
# next row from csv1_reader.
csv1_row =
while csv1_row['page_name'] != page_name2:
csv1_row =
except StopIteration:
# Reached the end of CSV1, break out of the row loop.
# NOTE(review): the handler's statement (presumably `break`) is missing.
# Store values for all fieldnames (except page_name).
for fieldname in csv2_reader.fieldnames:
if fieldname != 'page_name' and csv1_row.has_key(fieldname):
if csv1_row[fieldname] == '' or csv2_row[fieldname] == '':
# TODO(rmistry): Check with tonyg about what the algorithm should
# be doing when one CSV has an empty value and the other does not.
# NOTE(review): the body of this `if` is missing; presumably the field is
# skipped.
# A value of '-' is counted as 0.
# NOTE(review): the `try:` and `else:` lines around these assignments are
# missing; as shown, the float() conversion would always overwrite the 0.
if csv1_row[fieldname] == '-':
csv1_value = 0
csv1_value = float(csv1_row.get(fieldname))
if csv2_row[fieldname] == '-':
csv2_value = 0
csv2_value = float(csv2_row.get(fieldname))
except ValueError:
# We expected only floats, cannot compare strings. Skip field.
# NOTE(review): the handler's statement (presumably `continue`) is missing.
# Update the total in the dict.
# A fresh FieldNameValues(0, 0, 0, 0) is used the first time a field is seen.
fieldname_values = fieldnames_to_totals.get(
fieldname, FieldNameValues(0, 0, 0, 0))
fieldname_values.value1 += csv1_value
fieldname_values.value2 += csv2_value
fieldnames_to_totals[fieldname] = fieldname_values
perc_diff = _GetPercentageDiff(csv1_value, csv2_value)
if self._IsPercDiffSameOrAboveThreshold(perc_diff):
# rank defaults to 1; the regex captures N from a page name that contains
# " (#N)".
# NOTE(review): the rest of the `if m and` condition and the int() argument
# are cut off. slave_num ends up as the smallest integer for which
# rank <= slave_num * 100, but it is not read again in the visible code.
rank = 1
slave_num = 1
m = re.match(r".* \(#([0-9]+)\)", page_name2)
if m and
rank = int(
while rank > slave_num * 100:
slave_num += 1
# NOTE(review): the first format string has three %s but four arguments are
# supplied, so part of the string seems to be cut off. GS_HTML_DIRECT_LINK
# and GS_HTML_BROWSER_LINK are not defined in the visible part of the file.
pageset_link = (
'%s/swarming/page_sets/%s/%s/' % (
GS_HTML_DIRECT_LINK, self._pageset_type, rank, rank))
archive_link = (
'%s/swarming/webpage_archives/%s/%s' % (
GS_HTML_BROWSER_LINK, self._pageset_type, rank))
# Add this page only if its diff is above the threshold.
l = fieldnames_to_page_values.get(fieldname, [])
# NOTE(review): the last two PageValues arguments (traceUrls1, traceUrls2)
# and the closing parentheses are missing.
l.append(PageValues(page_name2, csv1_value, csv2_value, perc_diff,
_GetPercentageChange(csv1_value, csv2_value),
pageset_link, archive_link,
fieldnames_to_page_values[fieldname] = l
# Calculate and add the percentage differences for each fieldname.
# The fieldnames_to_totals dict is modified in the below loop to remove
# entries which are below the threshold .
# In Python 2 items() returns a list copy, so deleting keys from the dict
# inside the loop does not disturb the iteration.
for fieldname, fieldname_values in fieldnames_to_totals.items():
# Fields without any recorded page are dropped from the totals.
# NOTE(review): a `continue` presumably followed the `del`; it is missing.
if not fieldnames_to_page_values.has_key(fieldname):
del fieldnames_to_totals[fieldname]
page_values = fieldnames_to_page_values[fieldname]
# Sort page values by the percentage difference.
# NOTE(review): the remaining sort() arguments are cut off.
page_values.sort(key=lambda page_value: page_value.perc_diff,
if self._discard_outliers:
# Lose the top X% and the bottom X%
# outliers_num is the number of pages taken from each end of the list.
outliers_num = int(len(page_values) * self._discard_outliers/100)
top_outliers = page_values[0:outliers_num]
# NOTE(review): when outliers_num is 0, page_values[-0:] is the whole list,
# so every page would be treated as a bottom outlier and subtracted from the
# totals below; verify how upstream handles this.
bottom_outliers = page_values[-outliers_num:]
# Discard top and bottom outliers.
# NOTE(review): the expression assigned here is missing.
fieldnames_to_page_values[fieldname] = (
# Remove discarded values from the running totals.
for discarded_page in top_outliers + bottom_outliers:
fieldname_values.value1 -= discarded_page.value1
fieldname_values.value2 -= discarded_page.value2
fieldnames_to_discards[fieldname] = top_outliers + bottom_outliers
# NOTE(review): the second argument is cut off (presumably
# fieldname_values.value2).
perc_diff = _GetPercentageDiff(fieldname_values.value1,
if self._IsPercDiffSameOrAboveThreshold(perc_diff):
# NOTE(review): the right-hand operand is cut off (presumably
# self._min_pages_in_each_field).
if (len(fieldnames_to_page_values[fieldname]) <
# This field does not have enough webpages, delete it from both maps.
print 'Removing because not enough webpages: %s' % fieldname
print len(fieldnames_to_page_values[fieldname])
del fieldnames_to_page_values[fieldname]
del fieldnames_to_totals[fieldname]
# NOTE(review): a `continue` presumably followed the deletes; otherwise the
# assignments below would run for a field that was just removed.
fieldname_values.perc_diff = perc_diff
fieldname_values.perc_change = _GetPercentageChange(
fieldname_values.value1, fieldname_values.value2)
# NOTE(review): the `else:` this branch belongs to is missing. The two lines
# below remove a field from the totals when it is below the threshold.
# Only store fieldnames that are below the variance threshold.
print 'Removing because below the variance threshold: %s' % fieldname
del fieldnames_to_totals[fieldname]
# Delete keys in fieldnames_to_page_values that are not in
# fieldnames_to_totals because those are the only ones we want to
# display.
fieldnames_to_page_values = dict(
(k,v) for k,v in fieldnames_to_page_values.iteritems()
if k in fieldnames_to_totals)
# Both maps should end up with the same number of keys.
# NOTE(review): the second operand of the comparison is cut off.
assert set(fieldnames_to_page_values.keys()) == set(
# Set the number of reporting webpages in fieldnames_to_totals.
for fieldname, values in fieldnames_to_page_values.iteritems():
fieldnames_to_totals[fieldname].total_webpages_reported = len(values)
# Done processing. Output the HTML.
self.OutputToHTML(fieldnames_to_totals, fieldnames_to_page_values,
fieldnames_to_discards, self._output_html_dir)
def OutputToHTML(self, fieldnames_to_totals, fieldnames_to_page_values,
fieldnames_to_discards, html_dir):
# Renders the comparison results with Django's template loader: one totals
# page (index.html) plus one page per field name (fieldname<N>.html), both
# placed under self._output_html_dir.
#
# Parameters as used by the visible code:
#   fieldnames_to_totals: dict of fieldname -> object with a perc_diff
#     attribute; its items are sorted by perc_diff for the totals page.
#   fieldnames_to_page_values: dict of fieldname -> page values shown on the
#     per-field page.
#   fieldnames_to_discards: dict of fieldname -> discarded pages; a missing
#     key is rendered as an empty list.
#   html_dir: not referenced in the visible code, which reads
#     self._output_html_dir instead.
#
# NOTE(review): indentation is lost in this copy and several call arguments
# are cut off (flagged below); confirm against the upstream file.
# Calculate the current UTC time.
html_report_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')
# Output the main totals HTML page.
# NOTE(review): the remaining sorted() arguments are cut off. The lambda
# parameter name shadows the builtin `tuple`.
sorted_fieldnames_totals_items = sorted(
fieldnames_to_totals.items(), key=lambda tuple: tuple[1].perc_diff,
# The missing slaves string is split on single spaces into a list.
missing_output_slaves_list = []
if self._missing_output_slaves:
missing_output_slaves_list = self._missing_output_slaves.split(' ')
# NOTE(review): the template-name argument that precedes the context dict
# appears to be missing, and the dict/call is not closed after 'description'.
rendered = loader.render_to_string(
{'sorted_fieldnames_totals_items': sorted_fieldnames_totals_items,
'requester_email': self._requester_email,
'chromium_patch_link': self._chromium_patch_link,
'skia_patch_link': self._skia_patch_link,
'raw_csv_nopatch': self._raw_csv_nopatch,
'raw_csv_withpatch': self._raw_csv_withpatch,
'threshold': self._variance_threshold,
'discard_outliers': self._discard_outliers,
'min_webpages': self._min_pages_in_each_field,
'num_repeated': self._num_repeated,
'target_platform': self._target_platform,
'crashed_instances': self._crashed_instances,
'missing_devices': self._missing_devices,
'browser_args_nopatch': self._browser_args_nopatch,
'browser_args_withpatch': self._browser_args_withpatch,
'absolute_url': self._absolute_url,
'pageset_type': self._pageset_type,
'html_report_date': html_report_date,
'chromium_hash': self._chromium_hash,
'skia_hash': self._skia_hash,
'missing_output_slaves': missing_output_slaves_list,
'logs_link_prefix': self._logs_link_prefix,
'description': self._description,
# NOTE(review): no write() or close() on index_html is visible; presumably
# `rendered` is written to it.
index_html = open(os.path.join(self._output_html_dir, 'index.html'), 'w')
# Output the different per-fieldname HTML pages.
# fieldname_count numbers the per-field pages starting at 1, in the sorted
# order of the totals.
fieldname_count = 0
# pylint: disable=W0612
for fieldname, unused_values in sorted_fieldnames_totals_items:
fieldname_count += 1
page_values = fieldnames_to_page_values[fieldname]
# NOTE(review): the template-name argument appears to be missing here too.
rendered = loader.render_to_string(
{'fieldname': fieldname,
'page_values': page_values,
'discard_outliers': self._discard_outliers,
'discarded_webpages': fieldnames_to_discards.get(fieldname, []),
'total_archives': self._total_archives,
'absolute_url': self._absolute_url})
# NOTE(review): the start of the path expression (presumably an
# os.path.join on the output dir) is cut off, and no write()/close() is
# visible.
fieldname_html = open(
'fieldname%s.html' % fieldname_count), 'w')
# Command line entry point: parses the options below, checks that the
# mandatory ones were supplied and runs CsvComparer(...).Compare().
#
# NOTE(review): in this copy the `option_parser.add_option(` opener of every
# option group is missing, and so is the opener of the final CsvComparer call
# (only its argument list and closing parentheses remain). Confirm against the
# upstream file.
if '__main__' == __name__:
option_parser = optparse.OptionParser()
'', '--csv_file1',
help='The absolute path to the first CSV file.')
'', '--csv_file2',
help='The absolute path to the second CSV file.')
'', '--output_html_dir',
help='The absolute path of the HTML dir that will contain the results of'
' the comparision CSV.')
'', '--requester_email',
help='Email address of the user who kicked off the run.')
'', '--chromium_patch_link',
help='Link to the Chromium patch used for this run.')
'', '--skia_patch_link',
help='Link to the Skia patch used for this run.')
'', '--variance_threshold',
help='The allowable variance in percentage between total values for each '
'field for the two CSVs.')
# NOTE(review): the help text of the next three options ends with a comma, so
# further keyword arguments (not visible here) presumably followed.
'', '--absolute_url',
help='Servers like Google Storage require an absolute url for links '
'within the HTML output files.',
# NOTE(review): the adjacent string literals in the next two help texts are
# joined without a space ("fieldnamehas", "evenif", "discardedfrom",
# "thenumber", "10thwebpages").
'', '--min_pages_in_each_field',
help='The min number of pages that must have a fieldname. If a fieldname'
'has less pages than this then it is not reported as a failure even'
'if the percentage difference is more than the variance threshold.',
'', '--discard_outliers',
help='Determines the percentage of the outliers that will be discarded'
'from the top and bottom values. Eg: If this value is 10% and the'
'number of webpages in a field are 10 then the 1st and 10th'
'webpages are discarded.',
'', '--num_repeated',
help='The number of times each pageset was run.')
'', '--raw_csv_nopatch',
help='Link to the raw CSV output of the nopatch run.')
'', '--raw_csv_withpatch',
help='Link to the raw CSV output of the withpatch run.')
'', '--crashed_instances',
help='Text that lists any crashed instances.')
'', '--missing_devices',
help='Text that lists all instances with missing Android devices.')
'', '--target_platform',
help='The platform telemetry benchmarks/measurements were run on.')
'', '--browser_args_nopatch',
help='The browser args that were used for the nopatch run.')
'', '--browser_args_withpatch',
help='The browser args that were used for the withpatch run.')
'', '--pageset_type',
help='The page set type this run was done on.')
'', '--chromium_hash',
help='The chromium git hash that was used for this run.')
'', '--skia_hash',
help='The skia git hash that was used for this run.')
'', '--missing_output_slaves',
help='Slaves which had no output for this run.')
'', '--logs_link_prefix',
help='Prefix link to the logs of the slaves.')
'', '--description',
help='The description of the run as entered by the requester.')
'', '--total_archives',
help='Number of archives that were used to get these results.')
options, unused_args = option_parser.parse_args()
# These options are mandatory; option_parser.error() is called when any of
# them is missing or empty.
if not (options.csv_file1 and options.csv_file2 and options.output_html_dir
and options.variance_threshold and options.requester_email
and options.chromium_patch_link
and options.skia_patch_link and options.raw_csv_nopatch
and options.raw_csv_withpatch and options.num_repeated
and options.target_platform and options.pageset_type
and options.chromium_hash and options.skia_hash
and options.description):
option_parser.error('Must specify csv_file1, csv_file2, output_html_dir, '
'variance_threshold, requester_email, '
'chromium_patch_link, '
'skia_patch_link, raw_csv_nopatch, description, '
'raw_csv_withpatch, num_repeated, pageset_type, '
'chromium_hash, skia_hash and target_platform')
# NOTE(review): the arguments are positional and must follow the parameter
# order of CsvComparer.__init__. options.skia_patch_link, which __init__
# expects between chromium_patch_link and variance_threshold, is not in the
# visible list.
options.csv_file1, options.csv_file2, options.output_html_dir,
options.requester_email, options.chromium_patch_link,
options.variance_threshold, options.absolute_url,
options.min_pages_in_each_field, options.discard_outliers,
options.raw_csv_nopatch, options.raw_csv_withpatch,
options.num_repeated, options.target_platform,
options.crashed_instances, options.missing_devices,
options.browser_args_nopatch, options.browser_args_withpatch,
options.pageset_type, options.chromium_hash, options.skia_hash,
options.missing_output_slaves, options.logs_link_prefix,
options.description, options.total_archives).Compare())