#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Python utility to compare two CSV files and output HTML results."""
import csv
import datetime
import math
import optparse
import os
import random
import re
import sys
import tempfile
# Add the django settings file to DJANGO_SETTINGS_MODULE.
import django
os.environ['DJANGO_SETTINGS_MODULE'] = 'csv-django-settings'
django.setup()
from django.template import loader
GS_HTML_BROWSER_LINK = (
'https://console.cloud.google.com/storage/browser/cluster-telemetry')
GS_HTML_DIRECT_LINK = (
'https://console.cloud.google.com/m/cloudstorage/b/cluster-telemetry/o')
# go/rasta-confidence-intervals#background mentions that 50 or 100 buckets
# would have been preferable but that storage constraints made this
# impossible. We do not have such constraints here and so we use 100 buckets.
# If the data has fewer than NUM_CI_BUCKETS items then we do not calculate
# the CI.
NUM_CI_BUCKETS = 100
# Hardcoding the tail ppf below for 100 buckets so that we do not have to
# install scipy. It is the result of scipy.stats.t.ppf(0.975, 99).
# 95% confidence interval -> right tail is at 97.5% = 0.975.
TAIL_PPF = 1.9842169515086827
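# If scipy is available, the constant can be recomputed as a quick check
# (not required by this script):
#   from scipy.stats import t
#   print(t.ppf(0.975, NUM_CI_BUCKETS - 1))  # -> 1.9842169515086827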
def _GetPercentageDiff(value1, value2):
  """Returns the percentage difference between the specified values.

  The difference is normalized by the mean of the two values, so the result
  is symmetric in its arguments (up to sign). Returns 0 if the mean is 0.
  """
  difference = value2 - value1
  avg = (value2 + value1) / 2
  return 0 if avg == 0 else difference / avg * 100
def _GetPercentageChange(value1, value2):
  """Returns the percentage change from value1 to value2.

  The change is normalized by the baseline value1. Returns 0 if value1 is 0.
  """
  difference = value2 - value1
  return 0 if value1 == 0 else difference / value1 * 100
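# Example: for value1=100.0 and value2=110.0, _GetPercentageChange returns
# 10.0 (10/100, relative to the baseline), while _GetPercentageDiff returns
# ~9.52 (10/105, relative to the mean of the two values).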
class PageValues(object):
"""Container class to hold the values of a page name."""
def __init__(self, page_name, value1, value2, perc_diff, perc_change,
pageset_link, archive_link, traceUrls1, traceUrls2):
self.page_name = page_name
self.value1 = value1
self.value2 = value2
self.perc_diff = perc_diff
self.perc_change = perc_change
self.pageset_link = pageset_link
self.archive_link = archive_link
self.traceUrls1 = traceUrls1.split(',') if traceUrls1 else []
self.traceUrls2 = traceUrls2.split(',') if traceUrls2 else []
class FieldNameValues(object):
  """Container class to hold the values of a field name.

  Note: perc_change, lower_ci, upper_ci, ci_exists and includes_zero are
  attached to instances later by CsvComparer.Compare().
  """
def __init__(self, value1, value2, perc_diff, total_webpages_reported):
self.value1 = value1
self.value2 = value2
self.perc_diff = perc_diff
self.total_webpages_reported = total_webpages_reported
class CsvComparer(object):
"""Class that compares two telemetry CSV files and outputs HTML results."""
def __init__(self, csv_file1, csv_file2, output_html_dir, requester_email,
chromium_patch_link, skia_patch_link,
variance_threshold, absolute_url, min_pages_in_each_field,
discard_outliers, raw_csv_nopatch, raw_csv_withpatch,
num_repeated, target_platform, crashed_instances,
missing_devices, browser_args_nopatch, browser_args_withpatch,
pageset_type, chromium_hash, skia_hash, missing_output_workers,
logs_link_prefix, description, total_archives):
"""Constructs a CsvComparer instance."""
self._csv_file1 = csv_file1
self._csv_file2 = csv_file2
self._output_html_dir = output_html_dir
self._requester_email = requester_email
self._chromium_patch_link = chromium_patch_link
self._skia_patch_link = skia_patch_link
self._variance_threshold = float(variance_threshold)
self._absolute_url = absolute_url
self._min_pages_in_each_field = min_pages_in_each_field
self._discard_outliers = float(discard_outliers)
self._raw_csv_nopatch = raw_csv_nopatch
self._raw_csv_withpatch = raw_csv_withpatch
self._num_repeated = num_repeated
self._target_platform = target_platform
self._crashed_instances = crashed_instances
self._missing_devices = missing_devices
self._browser_args_nopatch = browser_args_nopatch
self._browser_args_withpatch = browser_args_withpatch
self._pageset_type = pageset_type
self._chromium_hash = chromium_hash
self._skia_hash = skia_hash
self._missing_output_workers = missing_output_workers
self._logs_link_prefix = logs_link_prefix
self._description = description
self._total_archives = total_archives
def _IsPercDiffSameOrAboveThreshold(self, perc_diff):
"""Compares the specified diff to the variance threshold.
Returns True if the difference is at or above the variance threshold.
"""
return abs(perc_diff) >= self._variance_threshold
def _GetSortedCSV(self, unsorted_csv_reader):
"""Sorts the specified CSV by page_name into a new CSV file."""
result = sorted(unsorted_csv_reader, key=lambda d: d['page_name'])
fd, sorted_csv_file = tempfile.mkstemp()
# Close the fd.
os.close(fd)
with open(sorted_csv_file, 'w') as f:
writer = csv.DictWriter(f, unsorted_csv_reader.fieldnames)
writer.writeheader()
writer.writerows(result)
return sorted_csv_file
def Compare(self):
"""Method that does the CSV comparision."""
# Do one pass of all the page_names in the 1st CSV and store them.
# The purpose of this is that when we walk through the 2nd CSV we will know
# Whether the same page exists in the 1st CSV (the pages are ordered the
# same way in both files but some could be missing from each file).
    csv1_page_names = set()
    with open(self._csv_file1, 'r') as f1:
      csv1_reader = csv.DictReader(f1)
      for row in csv1_reader:
        csv1_page_names.add(row['page_name'])
# Sort both CSVs.
with open(self._csv_file1, 'r') as f1, open(self._csv_file2, 'r') as f2:
sorted_csv1_filepath = self._GetSortedCSV(csv.DictReader(f1))
sorted_csv2_filepath = self._GetSortedCSV(csv.DictReader(f2))
with open(sorted_csv1_filepath, 'r') as sorted_csv1, \
open(sorted_csv2_filepath, 'r') as sorted_csv2:
csv1_reader = csv.DictReader(sorted_csv1)
csv2_reader = csv.DictReader(sorted_csv2)
# Dictionary that holds the fieldname to the ongoing total on both CSVs.
fieldnames_to_totals = {}
# Map of a fieldname to list of tuples containing (page_name,
# csv_value1, csv_value2, percentage_difference).
fieldnames_to_page_values = {}
# Map of a fieldname to the discarded page value.
fieldnames_to_discards = {}
# Now walk through both CSV files with a pointer at each one and collect
# the value totals.
for csv2_row in csv2_reader:
        # Make sure the CSV2 page_name exists in CSV1, else skip it (move the
        # CSV2 pointer down).
page_name2 = csv2_row['page_name']
if page_name2 not in csv1_page_names:
continue
# Reach the right page_name in CSV1 (move CSV1 pointer down).
try:
          csv1_row = next(csv1_reader)
while csv1_row['page_name'] != page_name2:
csv1_row = next(csv1_reader)
except StopIteration:
# Reached the end of CSV1, break out of the row loop.
break
# Store values for all fieldnames (except page_name).
for fieldname in csv2_reader.fieldnames:
if fieldname != 'page_name' and fieldname in csv1_row:
if csv1_row[fieldname] == '' or csv2_row[fieldname] == '':
# TODO(rmistry): Check with tonyg about what the algorithm
# should be doing when one CSV has an empty value and the other
# does not.
continue
try:
if csv1_row[fieldname] == '-':
csv1_value = 0
else:
csv1_value = float(csv1_row.get(fieldname))
if csv2_row[fieldname] == '-':
csv2_value = 0
else:
csv2_value = float(csv2_row.get(fieldname))
except ValueError:
# We expected only floats, cannot compare strings. Skip field.
continue
# Update the total in the dict.
fieldname_values = fieldnames_to_totals.get(
fieldname, FieldNameValues(0, 0, 0, 0))
fieldname_values.value1 += csv1_value
fieldname_values.value2 += csv2_value
fieldnames_to_totals[fieldname] = fieldname_values
perc_diff = _GetPercentageDiff(csv1_value, csv2_value)
if self._IsPercDiffSameOrAboveThreshold(perc_diff):
rank = 1
worker_num = 1
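              # Extract the page rank from page names of the form
              # 'name (#rank)'; worker_num becomes the smallest N with
              # rank <= N * 100 (presumably the worker that processed a
              # block of 100 ranks).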
m = re.match(r".* \(#([0-9]+)\)", page_name2)
if m and m.group(1):
rank = int(m.group(1))
while rank > worker_num * 100:
worker_num += 1
pageset_link = (
'%s/swarming/page_sets/%s/%s/%s.py' % (
GS_HTML_DIRECT_LINK, self._pageset_type, rank, rank))
archive_link = (
'%s/swarming/webpage_archives/%s/%s' % (
GS_HTML_BROWSER_LINK, self._pageset_type, rank))
# Add this page only if its diff is above the threshold.
              page_value_list = fieldnames_to_page_values.get(fieldname, [])
              pv = PageValues(
                  page_name2, csv1_value, csv2_value, perc_diff,
                  _GetPercentageChange(csv1_value, csv2_value), pageset_link,
                  archive_link, csv1_row.get('traceUrls'),
                  csv2_row.get('traceUrls'))
              page_value_list.append(pv)
              fieldnames_to_page_values[fieldname] = page_value_list
os.remove(sorted_csv1_filepath)
os.remove(sorted_csv2_filepath)
    # Calculate and add the percentage differences for each fieldname.
    # The fieldnames_to_totals dict is modified in the loop below to remove
    # entries which are below the threshold.
for fieldname in list(fieldnames_to_totals):
fieldname_values = fieldnames_to_totals[fieldname]
if fieldname not in fieldnames_to_page_values:
del fieldnames_to_totals[fieldname]
continue
page_values = fieldnames_to_page_values[fieldname]
# Sort page values by the percentage difference.
page_values.sort(key=lambda page_value: page_value.perc_diff,
reverse=True)
      if self._discard_outliers:
        # Lose the top X% and the bottom X%.
        outliers_num = int(len(page_values) * self._discard_outliers / 100)
        # Guard against outliers_num == 0: page_values[0:-0] would otherwise
        # discard every page instead of none.
        if outliers_num:
          top_outliers = page_values[0:outliers_num]
          bottom_outliers = page_values[-outliers_num:]
          # Discard top and bottom outliers.
          fieldnames_to_page_values[fieldname] = (
              page_values[outliers_num:-outliers_num])
          # Remove discarded values from the running totals.
          for discarded_page in top_outliers + bottom_outliers:
            fieldname_values.value1 -= discarded_page.value1
            fieldname_values.value2 -= discarded_page.value2
          fieldnames_to_discards[fieldname] = top_outliers + bottom_outliers
perc_diff = _GetPercentageDiff(fieldname_values.value1,
fieldname_values.value2)
if self._IsPercDiffSameOrAboveThreshold(perc_diff):
if (len(fieldnames_to_page_values[fieldname]) <
self._min_pages_in_each_field):
# This field does not have enough webpages, delete it from both maps.
print('Removing because not enough webpages: %s' % fieldname)
print(len(fieldnames_to_page_values[fieldname]))
del fieldnames_to_page_values[fieldname]
del fieldnames_to_totals[fieldname]
continue
fieldname_values.perc_diff = perc_diff
fieldname_values.perc_change = _GetPercentageChange(
fieldname_values.value1, fieldname_values.value2)
      else:
        # Remove fieldnames whose total diff is below the variance threshold;
        # only fieldnames at or above it are kept and displayed.
        print('Removing because below the variance threshold: %s' % fieldname)
        del fieldnames_to_totals[fieldname]
# Delete keys in fieldnames_to_page_values that are not in
# fieldnames_to_totals because those are the only ones we want to
# display.
fieldnames_to_page_values = dict(
(k,v) for k,v in fieldnames_to_page_values.items()
if k in fieldnames_to_totals)
# Both maps should end up with the same number of keys.
assert set(fieldnames_to_page_values.keys()) == set(
fieldnames_to_totals.keys())
# Set the number of reporting webpages in fieldnames_to_totals.
for fieldname, values in fieldnames_to_page_values.items():
fieldnames_to_totals[fieldname].total_webpages_reported = len(values)
# Set the 95% confidence interval.
    for fieldname, page_values in fieldnames_to_page_values.items():
      field_totals = fieldnames_to_totals[fieldname]
      if field_totals.total_webpages_reported >= NUM_CI_BUCKETS:
        lower_ci, upper_ci = find95ConfidenceInterval(
            page_values, field_totals)
fieldnames_to_totals[fieldname].lower_ci = lower_ci
fieldnames_to_totals[fieldname].upper_ci = upper_ci
fieldnames_to_totals[fieldname].ci_exists = True
        # If the interval includes 0 then set the special includes_zero flag.
        # We do this so that intervals that do not include 0 can be displayed
        # with emphasis, because they likely represent a real change.
if lower_ci <= 0 and upper_ci >= 0:
fieldnames_to_totals[fieldname].includes_zero = True
# Done processing. Output the HTML.
self.OutputToHTML(fieldnames_to_totals, fieldnames_to_page_values,
fieldnames_to_discards, self._output_html_dir)
  def OutputToHTML(self, fieldnames_to_totals, fieldnames_to_page_values,
                   fieldnames_to_discards, html_dir):
    """Renders the totals page and the per-fieldname HTML pages."""
    # Calculate the current UTC time.
html_report_date = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')
# Output the main totals HTML page.
    sorted_fieldnames_totals_items = sorted(
        fieldnames_to_totals.items(), key=lambda item: item[1].perc_diff,
        reverse=True)
missing_output_workers_list = []
if self._missing_output_workers:
missing_output_workers_list = self._missing_output_workers.split(' ')
rendered = loader.render_to_string(
'csv_totals.html',
{'sorted_fieldnames_totals_items': sorted_fieldnames_totals_items,
'requester_email': self._requester_email,
'chromium_patch_link': self._chromium_patch_link,
'skia_patch_link': self._skia_patch_link,
'raw_csv_nopatch': self._raw_csv_nopatch,
'raw_csv_withpatch': self._raw_csv_withpatch,
'threshold': self._variance_threshold,
'discard_outliers': self._discard_outliers,
'min_webpages': self._min_pages_in_each_field,
'num_repeated': self._num_repeated,
'target_platform': self._target_platform,
'crashed_instances': self._crashed_instances,
'missing_devices': self._missing_devices,
'browser_args_nopatch': self._browser_args_nopatch,
'browser_args_withpatch': self._browser_args_withpatch,
'absolute_url': self._absolute_url,
'pageset_type': self._pageset_type,
'html_report_date': html_report_date,
'chromium_hash': self._chromium_hash,
'skia_hash': self._skia_hash,
'missing_output_workers': missing_output_workers_list,
'logs_link_prefix': self._logs_link_prefix,
'description': self._description,
})
index_html_path = os.path.join(self._output_html_dir, 'index.html')
with open(index_html_path, 'w') as index_html:
index_html.write(rendered)
# Output the different per-fieldname HTML pages.
fieldname_count = 0
# pylint: disable=W0612
for fieldname, unused_values in sorted_fieldnames_totals_items:
fieldname_count += 1
page_values = fieldnames_to_page_values[fieldname]
rendered = loader.render_to_string(
'csv_per_page.html',
{'fieldname': fieldname,
'page_values': page_values,
'discard_outliers': self._discard_outliers,
'discarded_webpages': fieldnames_to_discards.get(fieldname, []),
'total_archives': self._total_archives,
'absolute_url': self._absolute_url})
with open(
os.path.join(self._output_html_dir,
'fieldname%s.html' % fieldname_count), 'w') as fieldname_html:
fieldname_html.write(rendered)
def find95ConfidenceInterval(field_values, field_totals):
  """Finds the confidence interval of %-changes with a 95% confidence level.

  Implements the jackknife cookie-buckets method described in:
  * go/yaqs/3837187828798717952
  * go/rasta-confidence-intervals
  * https://en.wikipedia.org/wiki/Jackknife_resampling

  Args:
    field_values: List of PageValues objects.
    field_totals: A FieldNameValues object holding the running totals.

  Returns:
    A tuple containing the (lower_bound, upper_bound) of the CI.
  """
  # Step 1: Find the overall totals and the statistic we plan to use as the
  # point estimate (the percentage change).
total_no_patch = field_totals.value1
total_with_patch = field_totals.value2
point_estimate = field_totals.perc_change
  # Step 2: Randomly assign each row to one of NUM_CI_BUCKETS buckets. Each
  # bucket starts as an empty list and rows are appended as they are assigned.
  buckets = [[] for _ in range(NUM_CI_BUCKETS)]
for fv in field_values:
buckets[random.randrange(NUM_CI_BUCKETS)].append(fv)
  # Step 3: For each bucket, calculate the overall percentage-change statistic
  # with that bucket's data removed.
all_but_one_estimates = []
for b in buckets:
bucket_value1 = 0
bucket_value2 = 0
for pv in b:
bucket_value1 += pv.value1
bucket_value2 += pv.value2
perc_change = _GetPercentageChange(total_no_patch-bucket_value1,
total_with_patch-bucket_value2)
all_but_one_estimates.append(perc_change)
  # We should have gathered and processed exactly NUM_CI_BUCKETS estimates.
  assert len(all_but_one_estimates) == NUM_CI_BUCKETS
  # Step 4: Do the jackknife calculation.
# D_j on go/rasta-confidence-intervals
d_js = [(NUM_CI_BUCKETS * point_estimate) - ((NUM_CI_BUCKETS - 1) * e)
for e in all_but_one_estimates]
# D with a line over it on go/rasta-confidence-intervals
d_overline = sum(d_js) / len(d_js)
# S_D^2 on go/rasta-confidence-intervals
s_d2 = (sum(((d_j - d_overline)**2) for d_j in d_js) /
float(NUM_CI_BUCKETS - 1))
standard_error = math.sqrt(s_d2 / float(NUM_CI_BUCKETS))
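  # The result is a standard jackknife t-interval:
  # point_estimate +/- t_(0.975, NUM_CI_BUCKETS - 1) * standard_error.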
center = point_estimate
halfwidth = TAIL_PPF * standard_error
return center - halfwidth, center + halfwidth
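# A minimal sanity-check sketch (hypothetical, not exercised by this script):
# if both runs report identical values, the point estimate and every
# leave-one-bucket-out estimate are 0, so the interval collapses to (0, 0):
#
#   fvs = [PageValues('p (#%d)' % i, 1.0, 1.0, 0, 0, '', '', None, None)
#          for i in range(NUM_CI_BUCKETS)]
#   totals = FieldNameValues(float(NUM_CI_BUCKETS), float(NUM_CI_BUCKETS), 0,
#                            NUM_CI_BUCKETS)
#   totals.perc_change = 0  # perc_change is set by Compare(), not __init__.
#   assert find95ConfidenceInterval(fvs, totals) == (0.0, 0.0)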
if '__main__' == __name__:
option_parser = optparse.OptionParser()
option_parser.add_option(
'', '--csv_file1',
help='The absolute path to the first CSV file.')
option_parser.add_option(
'', '--csv_file2',
help='The absolute path to the second CSV file.')
option_parser.add_option(
'', '--output_html_dir',
      help='The absolute path of the HTML dir that will contain the results '
           'of the comparison CSV.')
option_parser.add_option(
'', '--requester_email',
help='Email address of the user who kicked off the run.')
option_parser.add_option(
'', '--chromium_patch_link',
help='Link to the Chromium patch used for this run.')
option_parser.add_option(
'', '--skia_patch_link',
help='Link to the Skia patch used for this run.')
option_parser.add_option(
'', '--variance_threshold',
help='The allowable variance in percentage between total values for each '
'field for the two CSVs.')
option_parser.add_option(
'', '--absolute_url',
help='Servers like Google Storage require an absolute url for links '
'within the HTML output files.',
default='')
option_parser.add_option(
      '', '--min_pages_in_each_field',
      help='The min number of pages that must have a fieldname. If a '
           'fieldname has fewer pages than this then it is not reported as a '
           'failure even if the percentage difference is more than the '
           'variance threshold.',
      default=0)
option_parser.add_option(
      '', '--discard_outliers',
      help='Determines the percentage of the outliers that will be discarded '
           'from the top and bottom values. Eg: If this value is 10% and the '
           'number of webpages in a field is 10 then the 1st and 10th '
           'webpages are discarded.',
      default=10)
option_parser.add_option(
'', '--num_repeated',
help='The number of times each pageset was run.')
option_parser.add_option(
'', '--raw_csv_nopatch',
help='Link to the raw CSV output of the nopatch run.')
option_parser.add_option(
'', '--raw_csv_withpatch',
help='Link to the raw CSV output of the withpatch run.')
option_parser.add_option(
'', '--crashed_instances',
help='Text that lists any crashed instances.')
option_parser.add_option(
'', '--missing_devices',
help='Text that lists all instances with missing Android devices.')
option_parser.add_option(
'', '--target_platform',
help='The platform telemetry benchmarks/measurements were run on.')
option_parser.add_option(
'', '--browser_args_nopatch',
help='The browser args that were used for the nopatch run.')
option_parser.add_option(
'', '--browser_args_withpatch',
help='The browser args that were used for the withpatch run.')
option_parser.add_option(
'', '--pageset_type',
help='The page set type this run was done on.')
option_parser.add_option(
'', '--chromium_hash',
help='The chromium git hash that was used for this run.')
option_parser.add_option(
'', '--skia_hash',
help='The skia git hash that was used for this run.',
default='')
option_parser.add_option(
'', '--missing_output_workers',
help='Workers which had no output for this run.')
option_parser.add_option(
'', '--logs_link_prefix',
help='Prefix link to the logs of the workers.')
option_parser.add_option(
'', '--description',
help='The description of the run as entered by the requester.')
option_parser.add_option(
'', '--total_archives',
help='Number of archives that were used to get these results.')
options, unused_args = option_parser.parse_args()
if not (options.csv_file1 and options.csv_file2 and options.output_html_dir
and options.variance_threshold and options.requester_email
and options.chromium_patch_link
and options.skia_patch_link and options.raw_csv_nopatch
and options.raw_csv_withpatch and options.num_repeated
and options.target_platform and options.pageset_type
and options.chromium_hash and options.description):
    option_parser.error(
        'Must specify csv_file1, csv_file2, output_html_dir, '
        'variance_threshold, requester_email, chromium_patch_link, '
        'skia_patch_link, raw_csv_nopatch, raw_csv_withpatch, num_repeated, '
        'target_platform, pageset_type, chromium_hash and description')
sys.exit(CsvComparer(
options.csv_file1, options.csv_file2, options.output_html_dir,
options.requester_email, options.chromium_patch_link,
options.skia_patch_link,
options.variance_threshold, options.absolute_url,
options.min_pages_in_each_field, options.discard_outliers,
options.raw_csv_nopatch, options.raw_csv_withpatch,
options.num_repeated, options.target_platform,
options.crashed_instances, options.missing_devices,
options.browser_args_nopatch, options.browser_args_withpatch,
options.pageset_type, options.chromium_hash, options.skia_hash,
options.missing_output_workers, options.logs_link_prefix,
options.description, options.total_archives).Compare())