| #!/usr/bin/env python |
| # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Python utility to compare two CSV files and output HTML results.""" |
| |
| |
| import csv |
| import datetime |
| import optparse |
| import os |
| import re |
| import sys |
| import tempfile |
| |
# Point DJANGO_SETTINGS_MODULE at the local settings module before calling
# django.setup(), which must happen before importing the template loader.
| import django |
| os.environ['DJANGO_SETTINGS_MODULE'] = 'csv-django-settings' |
| django.setup() |
| from django.template import loader |
| |
| GS_HTML_BROWSER_LINK = ( |
| 'https://console.cloud.google.com/storage/browser/cluster-telemetry') |
| GS_HTML_DIRECT_LINK = ( |
| 'https://console.cloud.google.com/m/cloudstorage/b/cluster-telemetry/o') |
| |
| |
| def _GetPercentageDiff(value1, value2): |
| """Returns the percentage difference between the specified values.""" |
| difference = value2 - value1 |
  avg = (value2 + value1) / 2.0  # Use 2.0 so integer inputs do not truncate.
  return 0 if avg == 0 else difference / avg * 100
| |
| |
| def _GetPercentageChange(value1, value2): |
| """Returns the percentage change between the specified values.""" |
| difference = value2 - value1 |
  return 0 if value1 == 0 else difference / float(value1) * 100
| |
| |
| class PageValues(object): |
| """Container class to hold the values of a page name.""" |
| def __init__(self, page_name, value1, value2, perc_diff, perc_change, |
| pageset_link, archive_link, traceUrls1, traceUrls2): |
| self.page_name = page_name |
| self.value1 = value1 |
| self.value2 = value2 |
| self.perc_diff = perc_diff |
| self.perc_change = perc_change |
| self.pageset_link = pageset_link |
| self.archive_link = archive_link |
| self.traceUrls1 = traceUrls1.split(',') if traceUrls1 else [] |
| self.traceUrls2 = traceUrls2.split(',') if traceUrls2 else [] |
| |
| |
| class FieldNameValues(object): |
| """Container class to hold the values of a field name.""" |
| def __init__(self, value1, value2, perc_diff, total_webpages_reported): |
| self.value1 = value1 |
| self.value2 = value2 |
    self.perc_diff = perc_diff
    # perc_change is filled in later by CsvComparer.Compare for fields whose
    # diff is at or above the variance threshold.
    self.perc_change = 0
    self.total_webpages_reported = total_webpages_reported
| |
| |
| class CsvComparer(object): |
| """Class that compares two telemetry CSV files and outputs HTML results.""" |
| |
| def __init__(self, csv_file1, csv_file2, output_html_dir, requester_email, |
| chromium_patch_link, skia_patch_link, |
| variance_threshold, absolute_url, min_pages_in_each_field, |
| discard_outliers, raw_csv_nopatch, raw_csv_withpatch, |
| num_repeated, target_platform, crashed_instances, |
| missing_devices, browser_args_nopatch, browser_args_withpatch, |
| pageset_type, chromium_hash, skia_hash, missing_output_slaves, |
| logs_link_prefix, description, total_archives): |
| """Constructs a CsvComparer instance.""" |
| self._csv_file1 = csv_file1 |
| self._csv_file2 = csv_file2 |
| self._output_html_dir = output_html_dir |
| self._requester_email = requester_email |
| self._chromium_patch_link = chromium_patch_link |
| self._skia_patch_link = skia_patch_link |
| self._variance_threshold = float(variance_threshold) |
| self._absolute_url = absolute_url |
    # optparse passes option values through as strings; coerce to int so the
    # page-count comparison below behaves correctly.
    self._min_pages_in_each_field = int(min_pages_in_each_field)
| self._discard_outliers = float(discard_outliers) |
| self._raw_csv_nopatch = raw_csv_nopatch |
| self._raw_csv_withpatch = raw_csv_withpatch |
| self._num_repeated = num_repeated |
| self._target_platform = target_platform |
| self._crashed_instances = crashed_instances |
| self._missing_devices = missing_devices |
| self._browser_args_nopatch = browser_args_nopatch |
| self._browser_args_withpatch = browser_args_withpatch |
| self._pageset_type = pageset_type |
| self._chromium_hash = chromium_hash |
| self._skia_hash = skia_hash |
| self._missing_output_slaves = missing_output_slaves |
| self._logs_link_prefix = logs_link_prefix |
| self._description = description |
| self._total_archives = total_archives |
| |
| def _IsPercDiffSameOrAboveThreshold(self, perc_diff): |
| """Compares the specified diff to the variance threshold. |
| |
| Returns True if the difference is at or above the variance threshold. |
| """ |
| return abs(perc_diff) >= self._variance_threshold |
| |
| def _GetSortedCSV(self, unsorted_csv_reader): |
| """Sorts the specified CSV by page_name into a new CSV file.""" |
| result = sorted(unsorted_csv_reader, key=lambda d: d['page_name']) |
| fd, sorted_csv_file = tempfile.mkstemp() |
| # Close the fd. |
| os.close(fd) |
| with open(sorted_csv_file, 'wb') as f: |
| writer = csv.DictWriter(f, unsorted_csv_reader.fieldnames) |
| writer.writeheader() |
| writer.writerows(result) |
| return sorted_csv_file |
| |
| def Compare(self): |
| """Method that does the CSV comparision.""" |
| |
    # Do one pass of all the page_names in the 1st CSV and store them, so that
    # when we walk through the 2nd CSV we know whether the same page exists in
    # the 1st CSV (the pages are ordered the same way in both files but some
    # could be missing from each file).
    csv1_page_names = set()
    with open(self._csv_file1, 'rb') as f1:
      csv1_reader = csv.DictReader(f1)
      for row in csv1_reader:
        csv1_page_names.add(row['page_name'])
| |
| # Sort both CSVs. |
| with open(self._csv_file1, 'rb') as f1, open(self._csv_file2, 'rb') as f2: |
| sorted_csv1_filepath = self._GetSortedCSV(csv.DictReader(f1)) |
| sorted_csv2_filepath = self._GetSortedCSV(csv.DictReader(f2)) |
| with open(sorted_csv1_filepath, 'rb') as sorted_csv1, \ |
| open(sorted_csv2_filepath, 'rb') as sorted_csv2: |
| csv1_reader = csv.DictReader(sorted_csv1) |
| csv2_reader = csv.DictReader(sorted_csv2) |
| |
      # Map of a fieldname to a FieldNameValues object holding the running
      # totals from both CSVs.
      fieldnames_to_totals = {}
      # Map of a fieldname to a list of PageValues objects, one per page whose
      # diff is at or above the threshold.
      fieldnames_to_page_values = {}
      # Map of a fieldname to the list of discarded outlier PageValues.
      fieldnames_to_discards = {}
| |
| # Now walk through both CSV files with a pointer at each one and collect |
| # the value totals. |
| for csv2_row in csv2_reader: |
        # Make sure the CSV2 page_name exists in CSV1, else skip it (move the
        # CSV2 pointer down).
        page_name2 = csv2_row['page_name']
        if page_name2 not in csv1_page_names:
          continue
| # Reach the right page_name in CSV1 (move CSV1 pointer down). |
| try: |
          csv1_row = next(csv1_reader)
          while csv1_row['page_name'] != page_name2:
            csv1_row = next(csv1_reader)
| except StopIteration: |
| # Reached the end of CSV1, break out of the row loop. |
| break |
| |
| # Store values for all fieldnames (except page_name). |
| for fieldname in csv2_reader.fieldnames: |
          if fieldname != 'page_name' and fieldname in csv1_row:
| if csv1_row[fieldname] == '' or csv2_row[fieldname] == '': |
| # TODO(rmistry): Check with tonyg about what the algorithm |
| # should be doing when one CSV has an empty value and the other |
| # does not. |
| continue |
| try: |
| if csv1_row[fieldname] == '-': |
| csv1_value = 0 |
| else: |
| csv1_value = float(csv1_row.get(fieldname)) |
| if csv2_row[fieldname] == '-': |
| csv2_value = 0 |
| else: |
| csv2_value = float(csv2_row.get(fieldname)) |
| except ValueError: |
| # We expected only floats, cannot compare strings. Skip field. |
| continue |
| |
| # Update the total in the dict. |
| fieldname_values = fieldnames_to_totals.get( |
| fieldname, FieldNameValues(0, 0, 0, 0)) |
| fieldname_values.value1 += csv1_value |
| fieldname_values.value2 += csv2_value |
| fieldnames_to_totals[fieldname] = fieldname_values |
| |
| perc_diff = _GetPercentageDiff(csv1_value, csv2_value) |
| if self._IsPercDiffSameOrAboveThreshold(perc_diff): |
| rank = 1 |
| slave_num = 1 |
| m = re.match(r".* \(#([0-9]+)\)", page_name2) |
| if m and m.group(1): |
| rank = int(m.group(1)) |
| while rank > slave_num * 100: |
| slave_num += 1 |
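              # Pages are bucketed 100 per slave, so e.g. a page ranked 250
              # maps to slave 3; slave_num is computed here but not used in
              # the links below.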
| pageset_link = ( |
| '%s/swarming/page_sets/%s/%s/%s.py' % ( |
| GS_HTML_DIRECT_LINK, self._pageset_type, rank, rank)) |
| archive_link = ( |
| '%s/swarming/webpage_archives/%s/%s' % ( |
| GS_HTML_BROWSER_LINK, self._pageset_type, rank)) |
              # Add this page only if its diff is at or above the threshold.
              page_value_list = fieldnames_to_page_values.get(fieldname, [])
              pv = PageValues(
                  page_name2, csv1_value, csv2_value, perc_diff,
                  _GetPercentageChange(csv1_value, csv2_value), pageset_link,
                  archive_link, csv1_row.get('traceUrls'),
                  csv2_row.get('traceUrls'))
              page_value_list.append(pv)
              fieldnames_to_page_values[fieldname] = page_value_list
| os.remove(sorted_csv1_filepath) |
| os.remove(sorted_csv2_filepath) |
| |
| # Calculate and add the percentage differences for each fieldname. |
    # The fieldnames_to_totals dict is modified in the loop below to remove
    # entries which are below the threshold.
| for fieldname, fieldname_values in fieldnames_to_totals.items(): |
      if fieldname not in fieldnames_to_page_values:
| del fieldnames_to_totals[fieldname] |
| continue |
| |
| page_values = fieldnames_to_page_values[fieldname] |
| # Sort page values by the percentage difference. |
| page_values.sort(key=lambda page_value: page_value.perc_diff, |
| reverse=True) |
| |
      if self._discard_outliers:
        # Lose the top X% and the bottom X%.
        outliers_num = int(len(page_values) * self._discard_outliers / 100)
        # Guard against outliers_num == 0: the slices below would otherwise
        # discard every page (page_values[0:0] is empty).
        if outliers_num:
          top_outliers = page_values[0:outliers_num]
          bottom_outliers = page_values[-outliers_num:]
          # Discard top and bottom outliers.
          fieldnames_to_page_values[fieldname] = (
              page_values[outliers_num:-outliers_num])
          # Remove discarded values from the running totals.
          for discarded_page in top_outliers + bottom_outliers:
            fieldname_values.value1 -= discarded_page.value1
            fieldname_values.value2 -= discarded_page.value2
          fieldnames_to_discards[fieldname] = top_outliers + bottom_outliers
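      # E.g. with discard_outliers=10.0 and 20 reported pages, outliers_num is
      # int(20 * 10 / 100) = 2, so the 2 pages with the highest perc_diff and
      # the 2 with the lowest are dropped.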
| |
| perc_diff = _GetPercentageDiff(fieldname_values.value1, |
| fieldname_values.value2) |
| if self._IsPercDiffSameOrAboveThreshold(perc_diff): |
        if (len(fieldnames_to_page_values[fieldname]) <
            self._min_pages_in_each_field):
          # This field does not have enough webpages; delete it from both
          # maps.
          print 'Removing %s: %d webpages reported, minimum is %d' % (
              fieldname, len(fieldnames_to_page_values[fieldname]),
              self._min_pages_in_each_field)
          del fieldnames_to_page_values[fieldname]
          del fieldnames_to_totals[fieldname]
          continue
| fieldname_values.perc_diff = perc_diff |
| fieldname_values.perc_change = _GetPercentageChange( |
| fieldname_values.value1, fieldname_values.value2) |
      else:
        # Discard fieldnames whose total percentage diff is below the
        # variance threshold.
        print 'Removing because below the variance threshold: %s' % fieldname
        del fieldnames_to_totals[fieldname]
| |
| # Delete keys in fieldnames_to_page_values that are not in |
| # fieldnames_to_totals because those are the only ones we want to |
| # display. |
    fieldnames_to_page_values = dict(
        (k, v) for k, v in fieldnames_to_page_values.iteritems()
        if k in fieldnames_to_totals)
| |
| # Both maps should end up with the same number of keys. |
| assert set(fieldnames_to_page_values.keys()) == set( |
| fieldnames_to_totals.keys()) |
| |
| # Set the number of reporting webpages in fieldnames_to_totals. |
| for fieldname, values in fieldnames_to_page_values.iteritems(): |
| fieldnames_to_totals[fieldname].total_webpages_reported = len(values) |
| |
| # Done processing. Output the HTML. |
| self.OutputToHTML(fieldnames_to_totals, fieldnames_to_page_values, |
| fieldnames_to_discards, self._output_html_dir) |
| |
| def OutputToHTML(self, fieldnames_to_totals, fieldnames_to_page_values, |
                   fieldnames_to_discards, html_dir):
    """Renders the comparison results as HTML pages under html_dir."""
| # Calculate the current UTC time. |
    html_report_date = datetime.datetime.utcnow().strftime(
        '%Y-%m-%d %H:%M UTC')
| # Output the main totals HTML page. |
    sorted_fieldnames_totals_items = sorted(
        fieldnames_to_totals.items(), key=lambda item: item[1].perc_diff,
        reverse=True)
| missing_output_slaves_list = [] |
| if self._missing_output_slaves: |
| missing_output_slaves_list = self._missing_output_slaves.split(' ') |
| rendered = loader.render_to_string( |
| 'csv_totals.html', |
| {'sorted_fieldnames_totals_items': sorted_fieldnames_totals_items, |
| 'requester_email': self._requester_email, |
| 'chromium_patch_link': self._chromium_patch_link, |
| 'skia_patch_link': self._skia_patch_link, |
| 'raw_csv_nopatch': self._raw_csv_nopatch, |
| 'raw_csv_withpatch': self._raw_csv_withpatch, |
| 'threshold': self._variance_threshold, |
| 'discard_outliers': self._discard_outliers, |
| 'min_webpages': self._min_pages_in_each_field, |
| 'num_repeated': self._num_repeated, |
| 'target_platform': self._target_platform, |
| 'crashed_instances': self._crashed_instances, |
| 'missing_devices': self._missing_devices, |
| 'browser_args_nopatch': self._browser_args_nopatch, |
| 'browser_args_withpatch': self._browser_args_withpatch, |
| 'absolute_url': self._absolute_url, |
| 'pageset_type': self._pageset_type, |
| 'html_report_date': html_report_date, |
| 'chromium_hash': self._chromium_hash, |
| 'skia_hash': self._skia_hash, |
| 'missing_output_slaves': missing_output_slaves_list, |
| 'logs_link_prefix': self._logs_link_prefix, |
| 'description': self._description, |
| }) |
    index_html_path = os.path.join(html_dir, 'index.html')
| with open(index_html_path, 'wb') as index_html: |
| index_html.write(rendered) |
| |
| # Output the different per-fieldname HTML pages. |
| fieldname_count = 0 |
| # pylint: disable=W0612 |
| for fieldname, unused_values in sorted_fieldnames_totals_items: |
| fieldname_count += 1 |
| page_values = fieldnames_to_page_values[fieldname] |
| rendered = loader.render_to_string( |
| 'csv_per_page.html', |
| {'fieldname': fieldname, |
| 'page_values': page_values, |
| 'discard_outliers': self._discard_outliers, |
| 'discarded_webpages': fieldnames_to_discards.get(fieldname, []), |
| 'total_archives': self._total_archives, |
| 'absolute_url': self._absolute_url}) |
      fieldname_html_path = os.path.join(
          html_dir, 'fieldname%s.html' % fieldname_count)
      with open(fieldname_html_path, 'wb') as fieldname_html:
        fieldname_html.write(rendered)
| |
| |
if __name__ == '__main__':
| option_parser = optparse.OptionParser() |
| option_parser.add_option( |
| '', '--csv_file1', |
| help='The absolute path to the first CSV file.') |
| option_parser.add_option( |
| '', '--csv_file2', |
| help='The absolute path to the second CSV file.') |
| option_parser.add_option( |
| '', '--output_html_dir', |
      help='The absolute path of the HTML dir that will contain the results '
           'of the comparison CSV.')
| option_parser.add_option( |
| '', '--requester_email', |
| help='Email address of the user who kicked off the run.') |
| option_parser.add_option( |
| '', '--chromium_patch_link', |
| help='Link to the Chromium patch used for this run.') |
| option_parser.add_option( |
| '', '--skia_patch_link', |
| help='Link to the Skia patch used for this run.') |
| option_parser.add_option( |
| '', '--variance_threshold', |
| help='The allowable variance in percentage between total values for each ' |
| 'field for the two CSVs.') |
| option_parser.add_option( |
| '', '--absolute_url', |
| help='Servers like Google Storage require an absolute url for links ' |
| 'within the HTML output files.', |
| default='') |
| option_parser.add_option( |
| '', '--min_pages_in_each_field', |
      help='The minimum number of pages that must report a fieldname. If a '
           'fieldname has fewer pages than this, it is not reported as a '
           'failure even if the percentage difference exceeds the variance '
           'threshold.',
| default=0) |
| option_parser.add_option( |
| '', '--discard_outliers', |
      help='Determines the percentage of outliers that will be discarded '
           'from the top and bottom values. E.g.: if this value is 10% and '
           'the number of webpages in a field is 10, then the 1st and 10th '
           'webpages are discarded.',
| default=10) |
| option_parser.add_option( |
| '', '--num_repeated', |
| help='The number of times each pageset was run.') |
| option_parser.add_option( |
| '', '--raw_csv_nopatch', |
| help='Link to the raw CSV output of the nopatch run.') |
| option_parser.add_option( |
| '', '--raw_csv_withpatch', |
| help='Link to the raw CSV output of the withpatch run.') |
| option_parser.add_option( |
| '', '--crashed_instances', |
| help='Text that lists any crashed instances.') |
| option_parser.add_option( |
| '', '--missing_devices', |
| help='Text that lists all instances with missing Android devices.') |
| option_parser.add_option( |
| '', '--target_platform', |
| help='The platform telemetry benchmarks/measurements were run on.') |
| option_parser.add_option( |
| '', '--browser_args_nopatch', |
| help='The browser args that were used for the nopatch run.') |
| option_parser.add_option( |
| '', '--browser_args_withpatch', |
| help='The browser args that were used for the withpatch run.') |
| option_parser.add_option( |
| '', '--pageset_type', |
| help='The page set type this run was done on.') |
| option_parser.add_option( |
| '', '--chromium_hash', |
| help='The chromium git hash that was used for this run.') |
| option_parser.add_option( |
| '', '--skia_hash', |
| help='The skia git hash that was used for this run.', |
| default='') |
| option_parser.add_option( |
| '', '--missing_output_slaves', |
| help='Slaves which had no output for this run.') |
| option_parser.add_option( |
| '', '--logs_link_prefix', |
| help='Prefix link to the logs of the slaves.') |
| option_parser.add_option( |
| '', '--description', |
| help='The description of the run as entered by the requester.') |
| option_parser.add_option( |
| '', '--total_archives', |
| help='Number of archives that were used to get these results.') |
| |
| options, unused_args = option_parser.parse_args() |
| if not (options.csv_file1 and options.csv_file2 and options.output_html_dir |
| and options.variance_threshold and options.requester_email |
| and options.chromium_patch_link |
| and options.skia_patch_link and options.raw_csv_nopatch |
| and options.raw_csv_withpatch and options.num_repeated |
| and options.target_platform and options.pageset_type |
| and options.chromium_hash and options.description): |
    option_parser.error('Must specify csv_file1, csv_file2, output_html_dir, '
                        'variance_threshold, requester_email, '
                        'chromium_patch_link, skia_patch_link, '
                        'raw_csv_nopatch, raw_csv_withpatch, num_repeated, '
                        'target_platform, pageset_type, chromium_hash and '
                        'description')
| |
| sys.exit(CsvComparer( |
| options.csv_file1, options.csv_file2, options.output_html_dir, |
| options.requester_email, options.chromium_patch_link, |
| options.skia_patch_link, |
| options.variance_threshold, options.absolute_url, |
| options.min_pages_in_each_field, options.discard_outliers, |
| options.raw_csv_nopatch, options.raw_csv_withpatch, |
| options.num_repeated, options.target_platform, |
| options.crashed_instances, options.missing_devices, |
| options.browser_args_nopatch, options.browser_args_withpatch, |
| options.pageset_type, options.chromium_hash, options.skia_hash, |
| options.missing_output_slaves, options.logs_link_prefix, |
| options.description, options.total_archives).Compare()) |
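
# Example invocation (illustrative only; the script filename and all paths,
# links and values below are hypothetical):
#   python csv_comparer.py \
#     --csv_file1=/tmp/nopatch.csv \
#     --csv_file2=/tmp/withpatch.csv \
#     --output_html_dir=/tmp/html \
#     --requester_email=user@example.com \
#     --chromium_patch_link=https://example.com/chromium.patch \
#     --skia_patch_link=https://example.com/skia.patch \
#     --variance_threshold=5.0 \
#     --raw_csv_nopatch=https://example.com/nopatch.csv \
#     --raw_csv_withpatch=https://example.com/withpatch.csv \
#     --num_repeated=3 \
#     --target_platform=Linux \
#     --pageset_type=10k \
#     --chromium_hash=c0ffee \
#     --description='Smoke test'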
| |