#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Python utility to compare two CSV files and output HTML results."""
import csv
import datetime
import math
import optparse
import os
import random
import re
import sys
import tempfile
# Add the django settings file to DJANGO_SETTINGS_MODULE.
import django
os.environ['DJANGO_SETTINGS_MODULE'] = 'csv-django-settings'
django.setup()
from django.template import loader
GS_HTML_BROWSER_LINK = (
'https://console.cloud.google.com/storage/browser/cluster-telemetry')
GS_HTML_DIRECT_LINK = (
'https://console.cloud.google.com/m/cloudstorage/b/cluster-telemetry/o')
# go/rasta-confidence-intervals#background mentions that 50 or 100 buckets
# would have been preferable but that storage constraints made this
# impossible. We do not have such constraints here and so we use 100 buckets.
# If the data has fewer than NUM_CI_BUCKETS items then we do not calculate
# the CI.
NUM_CI_BUCKETS = 100
# Hardcoding the tail ppf below for 100 buckets so that we do not have to
# install scipy. It is the result of scipy.stats.t.ppf(0.975, 99).
# 95% confidence interval -> right tail is at 97.5% = 0.975.
TAIL_PPF = 1.9842169515086827
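# If scipy is available, the constant can be recomputed as a quick check
# (not required by this script):
#   from scipy.stats import t
#   print(t.ppf(0.975, NUM_CI_BUCKETS - 1))  # -> 1.9842169515086827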
def _GetPercentageDiff(value1, value2):
  """Returns the percentage difference between the specified values.

  The difference is normalized by the mean of the two values, so the result
  is symmetric in its arguments (up to sign). Returns 0 if the mean is 0.
  """
  difference = value2 - value1
  avg = (value2 + value1) / 2
  return 0 if avg == 0 else difference / avg * 100
def _GetPercentageChange(value1, value2):
  """Returns the percentage change from value1 to value2.

  The change is normalized by the baseline value1. Returns 0 if value1 is 0.
  """
  difference = value2 - value1
  return 0 if value1 == 0 else difference / value1 * 100
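# Example: for value1=100.0 and value2=110.0, _GetPercentageChange returns
# 10.0 (10/100, relative to the baseline), while _GetPercentageDiff returns
# ~9.52 (10/105, relative to the mean of the two values).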
class PageValues(object):
"""Container class to hold the values of a page name."""
def __init__(self, page_name, value1, value2, perc_diff, perc_change,
pageset_link, archive_link, traceUrls1, traceUrls2):
self.page_name = page_name
self.value1 = value1
self.value2 = value2
self.perc_diff = perc_diff
self.perc_change = perc_change
self.pageset_link = pageset_link
self.archive_link = archive_link
self.traceUrls1 = traceUrls1.split(',') if traceUrls1 else []
self.traceUrls2 = traceUrls2.split(',') if traceUrls2 else []
class FieldNameValues(object):
  """Container class to hold the values of a field name.

  Note: perc_change, lower_ci, upper_ci, ci_exists and includes_zero are
  attached to instances later by CsvComparer.Compare().
  """
def __init__(self, value1, value2, perc_diff, total_webpages_reported):
self.value1 = value1
self.value2 = value2
self.perc_diff = perc_diff
self.total_webpages_reported = total_webpages_reported
class CsvComparer(object):
"""Class that compares two telemetry CSV files and outputs HTML results."""
def __init__(self, csv_file1, csv_file2, output_html_dir, requester_email,
chromium_patch_link, skia_patch_link,
variance_threshold, absolute_url, min_pages_in_each_field,
discard_outliers, raw_csv_nopatch, raw_csv_withpatch,
num_repeated, target_platform, crashed_instances,
missing_devices, browser_args_nopatch, browser_args_withpatch,
pageset_type, chromium_hash, skia_hash, missing_output_workers,
logs_link_prefix, description, total_archives):
"""Constructs a CsvComparer instance."""
self._csv_file1 = csv_file1
self._csv_file2 = csv_file2
self._output_html_dir = output_html_dir
self._requester_email = requester_email
self._chromium_patch_link = chromium_patch_link
self._skia_patch_link = skia_patch_link
self._variance_threshold = float(variance_threshold)
self._absolute_url = absolute_url
self._min_pages_in_each_field = min_pages_in_each_field
self._discard_outliers = float(discard_outliers)
self._raw_csv_nopatch = raw_csv_nopatch
self._raw_csv_withpatch = raw_csv_withpatch
self._num_repeated = num_repeated
self._target_platform = target_platform
self._crashed_instances = crashed_instances
self._missing_devices = missing_devices
self._browser_args_nopatch = browser_args_nopatch
self._browser_args_withpatch = browser_args_withpatch
self._pageset_type = pageset_type
self._chromium_hash = chromium_hash
self._skia_hash = skia_hash
self._missing_output_workers = missing_output_workers
self._logs_link_prefix = logs_link_prefix
self._description = description
self._total_archives = total_archives
def _IsPercDiffSameOrAboveThreshold(self, perc_diff):
"""Compares the specified diff to the variance threshold.
Returns True if the difference is at or above the variance threshold.
"""
return abs(perc_diff) >= self._variance_threshold
def _GetSortedCSV(self, unsorted_csv_reader):
"""Sorts the specified CSV by page_name into a new CSV file."""
result = sorted(unsorted_csv_reader, key=lambda d: d['page_name'])
fd, sorted_csv_file = tempfile.mkstemp()
# Close the fd.
os.close(fd)
with open(sorted_csv_file, 'w') as f:
writer = csv.DictWriter(f, unsorted_csv_reader.fieldnames)
writer.writeheader()
writer.writerows(result)
return sorted_csv_file
def Compare(self):
"""Method that does the CSV comparision."""
# Do one pass of all the page_names in the 1st CSV and store them.
# The purpose of this is that when we walk through the 2nd CSV we will know
# Whether the same page exists in the 1st CSV (the pages are ordered the
# same way in both files but some could be missing from each file).
    csv1_page_names = set()
    with open(self._csv_file1, 'r') as f1:
      csv1_reader = csv.DictReader(f1)
      for row in csv1_reader:
        csv1_page_names.add(row['page_name'])
# Sort both CSVs.
with open(self._csv_file1, 'r') as f1, open(self._csv_file2, 'r') as f2:
sorted_csv1_filepath = self._GetSortedCSV(csv.DictReader(f1))
sorted_csv2_filepath = self._GetSortedCSV(csv.DictReader(f2))
with open(sorted_csv1_filepath, 'r') as sorted_csv1, \
open(sorted_csv2_filepath, 'r') as sorted_csv2:
csv1_reader = csv.DictReader(sorted_csv1)
csv2_reader = csv.DictReader(sorted_csv2)
# Dictionary that holds the fieldname to the ongoing total on both CSVs.
fieldnames_to_totals = {}
# Map of a fieldname to list of tuples containing (page_name,
# csv_value1, csv_value2, percentage_difference).
fieldnames_to_page_values = {}
# Map of a fieldname to the discarded page value.
fieldnames_to_discards = {}
# Now walk through both CSV files with a pointer at each one and collect
# the value totals.
for csv2_row in csv2_reader:
        # Make sure the CSV2 page_name exists in CSV1, else skip it (move the
        # CSV2 pointer down).
page_name2 = csv2_row['page_name']
if page_name2 not in csv1_page_names:
continue
# Reach the right page_name in CSV1 (move CSV1 pointer down).
try:
          csv1_row = next(csv1_reader)
while csv1_row['page_name'] != page_name2:
csv1_row = next(csv1_reader)
except StopIteration:
# Reached the end of CSV1, break out of the row loop.
break
# Store values for all fieldnames (except page_name).
for fieldname in csv2_reader.fieldnames:
if fieldname != 'page_name' and fieldname in csv1_row:
if csv1_row[fieldname] == '' or csv2_row[fieldname] == '':
# TODO(rmistry): Check with tonyg about what the algorithm
# should be doing when one CSV has an empty value and the other
# does not.
continue
try:
if csv1_row[fieldname] == '-':
csv1_value = 0
else:
csv1_value = float(csv1_row.get(fieldname))
if csv2_row[fieldname] == '-':
csv2_value = 0
else:
csv2_value = float(csv2_row.get(fieldname))
except ValueError:
# We expected only floats, cannot compare strings. Skip field.
continue
# Update the total in the dict.
fieldname_values = fieldnames_to_totals.get(
fieldname, FieldNameValues(0, 0, 0, 0))
fieldname_values.value1 += csv1_value
fieldname_values.value2 += csv2_value
fieldnames_to_totals[fieldname] = fieldname_values
perc_diff = _GetPercentageDiff(csv1_value, csv2_value)
if self._IsPercDiffSameOrAboveThreshold(perc_diff):
rank = 1
worker_num = 1
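              # Extract the page rank from page names of the form
              # 'name (#rank)'; worker_num becomes the smallest N with
              # rank <= N * 100 (presumably the worker that processed a
              # block of 100 ranks).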
m = re.match(r".* \(#([0-9]+)\)", page_name2)
if m and m.group(1):
rank = int(m.group(1))
while rank > worker_num * 100:
worker_num += 1
pageset_link = (
'%s/swarming/page_sets/%s/%s/%s.py' % (
GS_HTML_DIRECT_LINK, self._pageset_type, rank, rank))
archive_link = (
'%s/swarming/webpage_archives/%s/%s' % (
GS_HTML_BROWSER_LINK, self._pageset_type, rank))
# Add this page only if its diff is above the threshold.
              page_value_list = fieldnames_to_page_values.get(fieldname, [])
              pv = PageValues(
                  page_name2, csv1_value, csv2_value, perc_diff,
                  _GetPercentageChange(csv1_value, csv2_value), pageset_link,
                  archive_link, csv1_row.get('traceUrls'),
                  csv2_row.get('traceUrls'))
              page_value_list.append(pv)
              fieldnames_to_page_values[fieldname] = page_value_list
os.remove(sorted_csv1_filepath)
os.remove(sorted_csv2_filepath)
    # Calculate and add the percentage differences for each fieldname.
    # The fieldnames_to_totals dict is modified in the loop below to remove
    # entries which are below the threshold.
for fieldname in list(fieldnames_to_totals):
fieldname_values = fieldnames_to_totals[fieldname]
if fieldname not in fieldnames_to_page_values:
del fieldnames_to_totals[fieldname]
continue
page_values = fieldnames_to_page_values[fieldname]
# Sort page values by the percentage difference.
page_values.sort(key=lambda page_value: page_value.perc_diff,
reverse=True)
      if self._discard_outliers:
        # Lose the top X% and the bottom X%.
        outliers_num = int(len(page_values) * self._discard_outliers / 100)
        # Guard against outliers_num == 0: page_values[0:-0] would otherwise
        # discard every page instead of none.
        if outliers_num:
          top_outliers = page_values[0:outliers_num]
          bottom_outliers = page_values[-outliers_num:]
          # Discard top and bottom outliers.
          fieldnames_to_page_values[fieldname] = (
              page_values[outliers_num:-outliers_num])
          # Remove discarded values from the running totals.
          for discarded_page in top_outliers + bottom_outliers:
            fieldname_values.value1 -= discarded_page.value1
            fieldname_values.value2 -= discarded_page.value2
          fieldnames_to_discards[fieldname] = top_outliers + bottom_outliers
perc_diff = _GetPercentageDiff(fieldname_values.value1,
fieldname_values.value2)
if self._IsPercDiffSameOrAboveThreshold(perc_diff):
if (len(fieldnames_to_page_values[fieldname]) <
self._min_pages_in_each_field):
# This field does not have enough webpages, delete it from both maps.
print('Removing because not enough webpages: %s' % fieldname)
print(len(fieldnames_to_page_values[fieldname]))
del fieldnames_to_page_values[fieldname]
del fieldnames_to_totals[fieldname]
continue
fieldname_values.perc_diff = perc_diff
fieldname_values.perc_change = _GetPercentageChange(
fieldname_values.value1, fieldname_values.value2)
      else:
        # Remove fieldnames whose total diff is below the variance threshold;
        # only fieldnames at or above it are kept and displayed.
        print('Removing because below the variance threshold: %s' % fieldname)
        del fieldnames_to_totals[fieldname]
# Delete keys in fieldnames_to_page_values that are not in
# fieldnames_to_totals because those are the only ones we want to
# display.
fieldnames_to_page_values = dict(
(k,v) for k,v in fieldnames_to_page_values.items()
if k in fieldnames_to_totals)
# Both maps should end up with the same number of keys.
assert set(fieldnames_to_page_values.keys()) == set(
fieldnames_to_totals.keys())
# Set the number of reporting webpages in fieldnames_to_totals.
for fieldname, values in fieldnames_to_page_values.items():
fieldnames_to_totals[fieldname].total_webpages_reported = len(values)
# Set the 95% confidence interval.
    for fieldname, page_values in fieldnames_to_page_values.items():
      field_totals = fieldnames_to_totals[fieldname]
      if field_totals.total_webpages_reported >= NUM_CI_BUCKETS:
        lower_ci, upper_ci = find95ConfidenceInterval(
            page_values, field_totals)
fieldnames_to_totals[fieldname].lower_ci = lower_ci
fieldnames_to_totals[fieldname].upper_ci = upper_ci
fieldnames_to_totals[fieldname].ci_exists = True
        # If the interval includes 0 then set the special includes_zero flag.
        # We do this so that intervals that do not include 0 can be displayed
        # with emphasis, because they likely represent a real change.
if lower_ci <= 0 and upper_ci >= 0:
fieldnames_to_totals[fieldname].includes_zero = True
# Done processing. Output the HTML.
self.OutputToHTML(fieldnames_to_totals, fieldnames_to_page_values,
fieldnames_to_discards, self._output_html_dir)
  def OutputToHTML(self, fieldnames_to_totals, fieldnames_to_page_values,
                   fieldnames_to_discards, html_dir):
    """Renders the totals page and the per-fieldname HTML pages."""
    # Calculate the current UTC time.
html_report_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')
# Output the main totals HTML page.
    sorted_fieldnames_totals_items = sorted(
        fieldnames_to_totals.items(), key=lambda item: item[1].perc_diff,
        reverse=True)
missing_output_workers_list = []
if self._missing_output_workers:
missing_output_workers_list = self._missing_output_workers.split(' ')
rendered = loader.render_to_string(
'csv_totals.html',
{'sorted_fieldnames_totals_items': sorted_fieldnames_totals_items,
'requester_email': self._requester_email,
'chromium_patch_link': self._chromium_patch_link,
'skia_patch_link': self._skia_patch_link,
'raw_csv_nopatch': self._raw_csv_nopatch,
'raw_csv_withpatch': self._raw_csv_withpatch,
'threshold': self._variance_threshold,
'discard_outliers': self._discard_outliers,
'min_webpages': self._min_pages_in_each_field,
'num_repeated': self._num_repeated,
'target_platform': self._target_platform,
'crashed_instances': self._crashed_instances,
'missing_devices': self._missing_devices,
'browser_args_nopatch': self._browser_args_nopatch,
'browser_args_withpatch': self._browser_args_withpatch,
'absolute_url': self._absolute_url,
'pageset_type': self._pageset_type,
'html_report_date': html_report_date,
'chromium_hash': self._chromium_hash,
'skia_hash': self._skia_hash,
'missing_output_workers': missing_output_workers_list,
'logs_link_prefix': self._logs_link_prefix,
'description': self._description,
})
index_html_path = os.path.join(self._output_html_dir, 'index.html')
with open(index_html_path, 'w') as index_html:
index_html.write(rendered)
# Output the different per-fieldname HTML pages.
fieldname_count = 0
# pylint: disable=W0612
for fieldname, unused_values in sorted_fieldnames_totals_items:
fieldname_count += 1
page_values = fieldnames_to_page_values[fieldname]
rendered = loader.render_to_string(
'csv_per_page.html',
{'fieldname': fieldname,
'page_values': page_values,
'discard_outliers': self._discard_outliers,
'discarded_webpages': fieldnames_to_discards.get(fieldname, []),
'total_archives': self._total_archives,
'absolute_url': self._absolute_url})
with open(
os.path.join(self._output_html_dir,
'fieldname%s.html' % fieldname_count), 'w') as fieldname_html:
fieldname_html.write(rendered)
def find95ConfidenceInterval(field_values, field_totals):
  """Finds the confidence interval of %-changes with a 95% confidence level.

  Implements the jackknife cookie-buckets method described in:
  * go/yaqs/3837187828798717952
  * go/rasta-confidence-intervals
  * https://en.wikipedia.org/wiki/Jackknife_resampling

  Args:
    field_values: List of PageValues objects.
    field_totals: A FieldNameValues object holding the running totals.

  Returns:
    A tuple containing the (lower_bound, upper_bound) of the CI.
  """
  # Step 1: Find the overall totals and the statistic we plan to use as the
  # point estimate (the percentage change).
total_no_patch = field_totals.value1
total_with_patch = field_totals.value2
point_estimate = field_totals.perc_change
  # Step 2: Randomly assign each row to one of NUM_CI_BUCKETS buckets. Each
  # bucket starts as an empty list and rows are appended as they are assigned.
  buckets = [[] for _ in range(NUM_CI_BUCKETS)]
for fv in field_values:
buckets[random.randrange(NUM_CI_BUCKETS)].append(fv)
  # Step 3: For each bucket, calculate the overall percentage-change statistic
  # with that bucket's data removed.
all_but_one_estimates = []
for b in buckets:
bucket_value1 = 0
bucket_value2 = 0
for pv in b:
bucket_value1 += pv.value1
bucket_value2 += pv.value2
perc_change = _GetPercentageChange(total_no_patch-bucket_value1,
total_with_patch-bucket_value2)
all_but_one_estimates.append(perc_change)
  # We should have gathered and processed exactly NUM_CI_BUCKETS estimates.
  assert len(all_but_one_estimates) == NUM_CI_BUCKETS
  # Step 4: Do the jackknife calculation.
# D_j on go/rasta-confidence-intervals
d_js = [(NUM_CI_BUCKETS * point_estimate) - ((NUM_CI_BUCKETS - 1) * e)
for e in all_but_one_estimates]
# D with a line over it on go/rasta-confidence-intervals
d_overline = sum(d_js) / len(d_js)
# S_D^2 on go/rasta-confidence-intervals
s_d2 = (sum(((d_j - d_overline)**2) for d_j in d_js) /
float(NUM_CI_BUCKETS - 1))
standard_error = math.sqrt(s_d2 / float(NUM_CI_BUCKETS))
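  # The result is a standard jackknife t-interval:
  # point_estimate +/- t_(0.975, NUM_CI_BUCKETS - 1) * standard_error.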
center = point_estimate
halfwidth = TAIL_PPF * standard_error
return center - halfwidth, center + halfwidth
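# A minimal sanity-check sketch (hypothetical, not exercised by this script):
# if both runs report identical values, the point estimate and every
# leave-one-bucket-out estimate are 0, so the interval collapses to (0, 0):
#
#   fvs = [PageValues('p (#%d)' % i, 1.0, 1.0, 0, 0, '', '', None, None)
#          for i in range(NUM_CI_BUCKETS)]
#   totals = FieldNameValues(float(NUM_CI_BUCKETS), float(NUM_CI_BUCKETS), 0,
#                            NUM_CI_BUCKETS)
#   totals.perc_change = 0  # perc_change is set by Compare(), not __init__.
#   assert find95ConfidenceInterval(fvs, totals) == (0.0, 0.0)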
if '__main__' == __name__:
option_parser = optparse.OptionParser()
option_parser.add_option(
'', '--csv_file1',
help='The absolute path to the first CSV file.')
option_parser.add_option(
'', '--csv_file2',
help='The absolute path to the second CSV file.')
option_parser.add_option(
'', '--output_html_dir',
      help='The absolute path of the HTML dir that will contain the results '
           'of the comparison CSV.')
option_parser.add_option(
'', '--requester_email',
help='Email address of the user who kicked off the run.')
option_parser.add_option(
'', '--chromium_patch_link',
help='Link to the Chromium patch used for this run.')
option_parser.add_option(
'', '--skia_patch_link',
help='Link to the Skia patch used for this run.')
option_parser.add_option(
'', '--variance_threshold',
help='The allowable variance in percentage between total values for each '
'field for the two CSVs.')
option_parser.add_option(
'', '--absolute_url',
help='Servers like Google Storage require an absolute url for links '
'within the HTML output files.',
default='')
option_parser.add_option(
      '', '--min_pages_in_each_field',
      help='The min number of pages that must have a fieldname. If a '
           'fieldname has fewer pages than this then it is not reported as a '
           'failure even if the percentage difference is more than the '
           'variance threshold.',
      default=0)
option_parser.add_option(
      '', '--discard_outliers',
      help='Determines the percentage of the outliers that will be discarded '
           'from the top and bottom values. Eg: If this value is 10% and the '
           'number of webpages in a field is 10 then the 1st and 10th '
           'webpages are discarded.',
      default=10)
option_parser.add_option(
'', '--num_repeated',
help='The number of times each pageset was run.')
option_parser.add_option(
'', '--raw_csv_nopatch',
help='Link to the raw CSV output of the nopatch run.')
option_parser.add_option(
'', '--raw_csv_withpatch',
help='Link to the raw CSV output of the withpatch run.')
option_parser.add_option(
'', '--crashed_instances',
help='Text that lists any crashed instances.')
option_parser.add_option(
'', '--missing_devices',
help='Text that lists all instances with missing Android devices.')
option_parser.add_option(
'', '--target_platform',
help='The platform telemetry benchmarks/measurements were run on.')
option_parser.add_option(
'', '--browser_args_nopatch',
help='The browser args that were used for the nopatch run.')
option_parser.add_option(
'', '--browser_args_withpatch',
help='The browser args that were used for the withpatch run.')
option_parser.add_option(
'', '--pageset_type',
help='The page set type this run was done on.')
option_parser.add_option(
'', '--chromium_hash',
help='The chromium git hash that was used for this run.')
option_parser.add_option(
'', '--skia_hash',
help='The skia git hash that was used for this run.',
default='')
option_parser.add_option(
'', '--missing_output_workers',
help='Workers which had no output for this run.')
option_parser.add_option(
'', '--logs_link_prefix',
help='Prefix link to the logs of the workers.')
option_parser.add_option(
'', '--description',
help='The description of the run as entered by the requester.')
option_parser.add_option(
'', '--total_archives',
help='Number of archives that were used to get these results.')
options, unused_args = option_parser.parse_args()
if not (options.csv_file1 and options.csv_file2 and options.output_html_dir
and options.variance_threshold and options.requester_email
and options.chromium_patch_link
and options.skia_patch_link and options.raw_csv_nopatch
and options.raw_csv_withpatch and options.num_repeated
and options.target_platform and options.pageset_type
and options.chromium_hash and options.description):
    option_parser.error(
        'Must specify csv_file1, csv_file2, output_html_dir, '
        'variance_threshold, requester_email, chromium_patch_link, '
        'skia_patch_link, raw_csv_nopatch, raw_csv_withpatch, num_repeated, '
        'target_platform, pageset_type, chromium_hash and description')
sys.exit(CsvComparer(
options.csv_file1, options.csv_file2, options.output_html_dir,
options.requester_email, options.chromium_patch_link,
options.skia_patch_link,
options.variance_threshold, options.absolute_url,
options.min_pages_in_each_field, options.discard_outliers,
options.raw_csv_nopatch, options.raw_csv_withpatch,
options.num_repeated, options.target_platform,
options.crashed_instances, options.missing_devices,
options.browser_args_nopatch, options.browser_args_withpatch,
options.pageset_type, options.chromium_hash, options.skia_hash,
options.missing_output_workers, options.logs_link_prefix,
options.description, options.total_archives).Compare())