| #!/usr/bin/env python |
| # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Python utility to compare two CSV files and output HTML results.""" |
| |
| |
| import csv |
| import optparse |
| import os |
| import sys |
| |
# Add the django settings file to DJANGO_SETTINGS_MODULE before importing the
# template loader.
os.environ['DJANGO_SETTINGS_MODULE'] = 'csv-django-settings'

from django.template import loader
| |
| |
| def _GetPercentageDiff(value1, value2): |
| """Returns the percentage difference between the specified values.""" |
| difference = value2 - value1 |
| avg = (value2 + value1)/2 |
| return 0 if avg == 0 else difference/avg * 100 |
| |
| |
| def _GetPercentageChange(value1, value2): |
| """Returns the percentage change between the specified values.""" |
| difference = value2 - value1 |
| return 0 if value1 == 0 else difference/value1 * 100 |
| |
| |
| class PageValues(object): |
| """Container class to hold the values of a page name.""" |
| def __init__(self, page_name, value1, value2, perc_diff, perc_change): |
| self.page_name = page_name |
| self.value1 = value1 |
| self.value2 = value2 |
| self.perc_diff = perc_diff |
| self.perc_change = perc_change |
| |
| |
| class FieldNameValues(object): |
| """Container class to hold the values of a field name.""" |
| def __init__(self, value1, value2, perc_diff): |
| self.value1 = value1 |
| self.value2 = value2 |
| self.perc_diff = perc_diff |
| |
| |
| class CsvComparer(object): |
| """Class that compares two telemetry CSV files and outputs HTML results.""" |
| |
| def __init__(self, csv_file1, csv_file2, output_html_dir, requester_email, |
| chromium_patch_link, blink_patch_link, skia_patch_link, |
| variance_threshold, absolute_url, min_pages_in_each_field, |
| discard_outliers, raw_csv_nopatch, raw_csv_withpatch): |
| """Constructs a CsvComparer instance.""" |
| self._csv_file1 = csv_file1 |
| self._csv_file2 = csv_file2 |
| self._output_html_dir = output_html_dir |
| self._requester_email = requester_email |
| self._chromium_patch_link = chromium_patch_link |
| self._blink_patch_link = blink_patch_link |
| self._skia_patch_link = skia_patch_link |
| self._variance_threshold = float(variance_threshold) |
| self._absolute_url = absolute_url |
    self._min_pages_in_each_field = int(min_pages_in_each_field)
| self._discard_outliers = float(discard_outliers) |
| self._raw_csv_nopatch = raw_csv_nopatch |
| self._raw_csv_withpatch = raw_csv_withpatch |
| |
| def _IsPercDiffAboveThreshold(self, perc_diff): |
| """Compares the specified diff to the variance threshold. |
| |
| Returns True if the difference is above the variance threshold. |
| """ |
| return abs(perc_diff) > self._variance_threshold |
| |
| def Compare(self): |
| """Method that does the CSV comparision.""" |
| |
    # Do one pass of all the page_names in the 1st CSV and store them.
    # The purpose of this is that when we walk through the 2nd CSV we will
    # know whether the same page exists in the 1st CSV (the pages are ordered
    # the same way in both files but some could be missing from each file).
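    # Both CSVs are expected to have a 'page_name' column plus numeric value
    # columns, e.g. (column names here are illustrative only):
    #   page_name,load_time,dom_nodes
    #   http://www.example.com,123.4,1500
    # A value of '' is skipped and '-' is treated as 0 (see below).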
| csv1_page_names = {} |
| csv1_reader = csv.DictReader(open(self._csv_file1, 'r')) |
| for row in csv1_reader: |
| csv1_page_names[row['page_name']] = 1 |
| |
| csv1_reader = csv.DictReader(open(self._csv_file1, 'r')) |
| csv2_reader = csv.DictReader(open(self._csv_file2, 'r')) |
| |
| # Dictionary that holds the fieldname to the ongoing total on both CSVs. |
| fieldnames_to_totals = {} |
    # Map of a fieldname to a list of PageValues objects, one per page whose
    # percentage difference is above the variance threshold.
| fieldnames_to_page_values = {} |
| |
| # Now walk through both CSV files with a pointer at each one and collect |
| # the value totals. |
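    # Eg (illustrative): if CSV1 has pages [A, B, D] and CSV2 has [A, C, D],
    # then C is skipped (absent from CSV1) and B is implicitly dropped when
    # the CSV1 pointer advances past it to reach D.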
| for csv2_row in csv2_reader: |
      # Make sure the CSV2 page_name exists in CSV1, else skip it (move the
      # CSV2 pointer down).
| page_name2 = csv2_row['page_name'] |
      if page_name2 not in csv1_page_names:
| continue |
| # Reach the right page_name in CSV1 (move CSV1 pointer down). |
| try: |
| csv1_row = csv1_reader.next() |
| while csv1_row['page_name'] != page_name2: |
| csv1_row = csv1_reader.next() |
| except StopIteration: |
| # Reached the end of CSV1, break out of the row loop. |
| break |
| |
| # Store values for all fieldnames (except page_name). |
| for fieldname in csv2_reader.fieldnames: |
        if fieldname != 'page_name' and fieldname in csv1_row:
| if csv1_row[fieldname] == '' or csv2_row[fieldname] == '': |
| # TODO(rmistry): Check with tonyg about what the algorithm should |
| # be doing when one CSV has an empty value and the other does not. |
| continue |
| try: |
| if csv1_row[fieldname] == '-': |
| csv1_value = 0 |
| else: |
| csv1_value = float(csv1_row.get(fieldname)) |
| if csv2_row[fieldname] == '-': |
| csv2_value = 0 |
| else: |
| csv2_value = float(csv2_row.get(fieldname)) |
| except ValueError: |
| # We expected only floats, cannot compare strings. Skip this field. |
| continue |
| if csv1_value < 0 or csv2_value < 0: |
| # If either value is less than 0 then something went wrong, skip it. |
| continue |
| |
| # Update the total in the dict. |
| fieldname_values = fieldnames_to_totals.get( |
| fieldname, FieldNameValues(0, 0, 0)) |
| fieldname_values.value1 += csv1_value |
| fieldname_values.value2 += csv2_value |
| fieldnames_to_totals[fieldname] = fieldname_values |
| |
| perc_diff = _GetPercentageDiff(csv1_value, csv2_value) |
| if self._IsPercDiffAboveThreshold(perc_diff): |
| # Add this page only if its diff is above the threshold. |
| l = fieldnames_to_page_values.get(fieldname, []) |
| l.append(PageValues(page_name2, csv1_value, csv2_value, perc_diff, |
| _GetPercentageChange(csv1_value, csv2_value))) |
| fieldnames_to_page_values[fieldname] = l |
| |
| # Calculate and add the percentage differences for each fieldname. |
| # The fieldnames_to_totals dict is modified in the below loop to remove |
    # entries which are below the threshold.
| for fieldname, fieldname_values in fieldnames_to_totals.items(): |
      if fieldname not in fieldnames_to_page_values:
| del fieldnames_to_totals[fieldname] |
| continue |
| |
| page_values = fieldnames_to_page_values[fieldname] |
| # Sort page values by the percentage difference. |
| page_values.sort(key=lambda page_value: page_value.perc_diff, |
| reverse=True) |
| |
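      # Worked example (illustrative): with 40 above-threshold pages and
      # --discard_outliers=10, outliers_num below is int(40 * 10 / 100) = 4,
      # so the 4 highest and 4 lowest percentage diffs are dropped and their
      # values subtracted from the running totals.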
      if self._discard_outliers:
        # Lose the top X% and the bottom X%.
        outliers_num = int(len(page_values) * self._discard_outliers/100)
        # Only trim when the percentage rounds to at least one page, otherwise
        # the slices below would discard every page.
        if outliers_num:
          top_outliers = page_values[0:outliers_num]
          bottom_outliers = page_values[-outliers_num:]
          # Discard top and bottom outliers.
          fieldnames_to_page_values[fieldname] = (
              page_values[outliers_num:-outliers_num])
          # Remove discarded values from the running totals.
          for discarded_page in top_outliers + bottom_outliers:
            fieldname_values.value1 -= discarded_page.value1
            fieldname_values.value2 -= discarded_page.value2
| |
| perc_diff = _GetPercentageDiff(fieldname_values.value1, |
| fieldname_values.value2) |
| if self._IsPercDiffAboveThreshold(perc_diff): |
| if (len(fieldnames_to_page_values[fieldname]) < |
| self._min_pages_in_each_field): |
| # This field does not have enough webpages, delete it from both maps. |
| print 'Removing because not enough webpages: %s' % fieldname |
| print len(fieldnames_to_page_values[fieldname]) |
| del fieldnames_to_page_values[fieldname] |
| del fieldnames_to_totals[fieldname] |
| continue |
| fieldname_values.perc_diff = perc_diff |
| fieldname_values.perc_change = _GetPercentageChange( |
| fieldname_values.value1, fieldname_values.value2) |
| else: |
        # Remove fieldnames whose total diff is below the variance threshold.
| print 'Removing because below the variance threshold: %s' % fieldname |
| del fieldnames_to_totals[fieldname] |
| |
| # Delete keys in fieldnames_to_page_values that are not in |
| # fieldnames_to_totals because those are the only ones we want to |
| # display. |
| fieldnames_to_page_values = dict( |
| (k,v) for k,v in fieldnames_to_page_values.iteritems() |
| if k in fieldnames_to_totals) |
| |
| # Both maps should end up with the same number of keys. |
| assert set(fieldnames_to_page_values.keys()) == set( |
| fieldnames_to_totals.keys()) |
| |
| # Done processing. Output the HTML. |
| self.OutputToHTML(fieldnames_to_totals, fieldnames_to_page_values, |
| self._output_html_dir) |
| |
  def OutputToHTML(self, fieldnames_to_totals, fieldnames_to_page_values,
                   html_dir):
    """Renders the comparison results as HTML pages in the specified dir."""
| # Output the main totals HTML page. |
    sorted_fieldnames_totals_items = sorted(
        fieldnames_to_totals.items(), key=lambda item: item[1].perc_diff,
        reverse=True)
| rendered = loader.render_to_string( |
| 'csv_totals.html', |
| {'sorted_fieldnames_totals_items': sorted_fieldnames_totals_items, |
| 'requester_email': self._requester_email, |
| 'chromium_patch_link': self._chromium_patch_link, |
| 'blink_patch_link': self._blink_patch_link, |
| 'skia_patch_link': self._skia_patch_link, |
| 'raw_csv_nopatch': self._raw_csv_nopatch, |
| 'raw_csv_withpatch': self._raw_csv_withpatch, |
| 'threshold': self._variance_threshold, |
| 'discard_outliers': self._discard_outliers, |
| 'min_webpages': self._min_pages_in_each_field, |
| 'absolute_url': self._absolute_url}) |
    index_html = open(os.path.join(html_dir, 'index.html'), 'w')
    index_html.write(rendered)
    index_html.close()
| |
| # Output the different per-fieldname HTML pages. |
| fieldname_count = 0 |
| # pylint: disable=W0612 |
| for fieldname, unused_values in sorted_fieldnames_totals_items: |
| fieldname_count += 1 |
| page_values = fieldnames_to_page_values[fieldname] |
| rendered = loader.render_to_string( |
| 'csv_per_page.html', |
| {'fieldname': fieldname, |
| 'page_values': page_values, |
| 'absolute_url': self._absolute_url}) |
      fieldname_html = open(
          os.path.join(html_dir,
                       'fieldname%s.html' % fieldname_count), 'w')
      fieldname_html.write(rendered)
      fieldname_html.close()
| |
| |
| if '__main__' == __name__: |
| option_parser = optparse.OptionParser() |
| option_parser.add_option( |
| '', '--csv_file1', |
| help='The absolute path to the first CSV file.') |
| option_parser.add_option( |
| '', '--csv_file2', |
| help='The absolute path to the second CSV file.') |
| option_parser.add_option( |
| '', '--output_html_dir', |
      help='The absolute path of the HTML dir that will contain the results '
           'of the comparison CSV.')
| option_parser.add_option( |
| '', '--requester_email', |
| help='Email address of the user who kicked off the run.') |
| option_parser.add_option( |
| '', '--chromium_patch_link', |
| help='Link to the Chromium patch used for this run.') |
| option_parser.add_option( |
| '', '--blink_patch_link', |
| help='Link to the Blink patch used for this run.') |
| option_parser.add_option( |
| '', '--skia_patch_link', |
| help='Link to the Skia patch used for this run.') |
| option_parser.add_option( |
| '', '--variance_threshold', |
| help='The allowable variance in percentage between total values for each ' |
| 'field for the two CSVs.') |
| option_parser.add_option( |
| '', '--absolute_url', |
| help='Servers like Google Storage require an absolute url for links ' |
| 'within the HTML output files.', |
| default='') |
| option_parser.add_option( |
| '', '--min_pages_in_each_field', |
      help='The min number of pages that must have a fieldname. If a fieldname '
           'has fewer pages than this then it is not reported as a failure even '
           'if the percentage difference is more than the variance threshold.',
| default=10) |
| option_parser.add_option( |
| '', '--discard_outliers', |
      help='Determines the percentage of the outliers that will be discarded '
           'from the top and bottom values. Eg: If this value is 10% and the '
           'number of webpages in a field is 10 then the 1st and 10th '
           'webpages are discarded.',
| default=10) |
| option_parser.add_option( |
| '', '--raw_csv_nopatch', |
| help='Link to the raw CSV output of the nopatch run.') |
| option_parser.add_option( |
| '', '--raw_csv_withpatch', |
| help='Link to the raw CSV output of the withpatch run.') |
| |
| options, unused_args = option_parser.parse_args() |
| if not (options.csv_file1 and options.csv_file2 and options.output_html_dir |
| and options.variance_threshold and options.requester_email |
| and options.chromium_patch_link and options.blink_patch_link |
| and options.skia_patch_link and options.raw_csv_nopatch |
| and options.raw_csv_withpatch): |
| option_parser.error('Must specify csv_file1, csv_file2, output_html_dir, ' |
| 'variance_threshold, requester_email, ' |
| 'chromium_patch_link, blink_patch_link, ' |
| 'skia_patch_link, raw_csv_nopatch and ' |
| 'raw_csv_withpatch') |
| |
| sys.exit(CsvComparer( |
| options.csv_file1, options.csv_file2, options.output_html_dir, |
| options.requester_email, options.chromium_patch_link, |
| options.blink_patch_link, options.skia_patch_link, |
| options.variance_threshold, options.absolute_url, |
| options.min_pages_in_each_field, options.discard_outliers, |
| options.raw_csv_nopatch, options.raw_csv_withpatch).Compare()) |
| |