| #!/usr/bin/env python |
| # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Python utility to compare two CSV files and output HTML results.""" |
| |
| |
| import csv |
| import datetime |
| import optparse |
| import os |
| import re |
| import sys |
| import tempfile |
| |
# Point DJANGO_SETTINGS_MODULE at the local settings module before calling
# django.setup(), which must happen before importing the template loader.
| import django |
| os.environ['DJANGO_SETTINGS_MODULE'] = 'csv-django-settings' |
| django.setup() |
| from django.template import loader |
| |
| GS_HTML_BROWSER_LINK = ( |
| 'https://console.cloud.google.com/storage/browser/cluster-telemetry') |
| GS_HTML_DIRECT_LINK = ( |
| 'https://console.cloud.google.com/m/cloudstorage/b/cluster-telemetry/o') |
| |
| |
| def _GetPercentageDiff(value1, value2): |
| """Returns the percentage difference between the specified values.""" |
| difference = value2 - value1 |
  avg = (value2 + value1) / 2.0  # Use 2.0 so integer inputs do not truncate.
  return 0 if avg == 0 else difference / avg * 100
| |
| |
| def _GetPercentageChange(value1, value2): |
| """Returns the percentage change between the specified values.""" |
| difference = value2 - value1 |
  return 0 if value1 == 0 else difference / float(value1) * 100
| |
| |
| class PageValues(object): |
| """Container class to hold the values of a page name.""" |
| def __init__(self, page_name, value1, value2, perc_diff, perc_change, |
| pageset_link, archive_link, traceUrls1, traceUrls2): |
| self.page_name = page_name |
| self.value1 = value1 |
| self.value2 = value2 |
| self.perc_diff = perc_diff |
| self.perc_change = perc_change |
| self.pageset_link = pageset_link |
| self.archive_link = archive_link |
| self.traceUrls1 = traceUrls1.split(',') if traceUrls1 else [] |
| self.traceUrls2 = traceUrls2.split(',') if traceUrls2 else [] |
| |
| |
| class FieldNameValues(object): |
| """Container class to hold the values of a field name.""" |
| def __init__(self, value1, value2, perc_diff, total_webpages_reported): |
| self.value1 = value1 |
| self.value2 = value2 |
    self.perc_diff = perc_diff
    # perc_change is filled in later by CsvComparer.Compare for fields whose
    # diff is at or above the variance threshold.
    self.perc_change = 0
    self.total_webpages_reported = total_webpages_reported
| |
| |
| class CsvComparer(object): |
| """Class that compares two telemetry CSV files and outputs HTML results.""" |
| |
| def __init__(self, csv_file1, csv_file2, output_html_dir, requester_email, |
| chromium_patch_link, skia_patch_link, |
| variance_threshold, absolute_url, min_pages_in_each_field, |
| discard_outliers, raw_csv_nopatch, raw_csv_withpatch, |
| num_repeated, target_platform, crashed_instances, |
| missing_devices, browser_args_nopatch, browser_args_withpatch, |
| pageset_type, chromium_hash, skia_hash, missing_output_slaves, |
| logs_link_prefix, description, total_archives): |
| """Constructs a CsvComparer instance.""" |
| self._csv_file1 = csv_file1 |
| self._csv_file2 = csv_file2 |
| self._output_html_dir = output_html_dir |
| self._requester_email = requester_email |
| self._chromium_patch_link = chromium_patch_link |
| self._skia_patch_link = skia_patch_link |
| self._variance_threshold = float(variance_threshold) |
| self._absolute_url = absolute_url |
    # optparse passes option values through as strings; coerce to int so the
    # page-count comparison below behaves correctly.
    self._min_pages_in_each_field = int(min_pages_in_each_field)
| self._discard_outliers = float(discard_outliers) |
| self._raw_csv_nopatch = raw_csv_nopatch |
| self._raw_csv_withpatch = raw_csv_withpatch |
| self._num_repeated = num_repeated |
| self._target_platform = target_platform |
| self._crashed_instances = crashed_instances |
| self._missing_devices = missing_devices |
| self._browser_args_nopatch = browser_args_nopatch |
| self._browser_args_withpatch = browser_args_withpatch |
| self._pageset_type = pageset_type |
| self._chromium_hash = chromium_hash |
| self._skia_hash = skia_hash |
| self._missing_output_slaves = missing_output_slaves |
| self._logs_link_prefix = logs_link_prefix |
| self._description = description |
| self._total_archives = total_archives |
| |
| def _IsPercDiffSameOrAboveThreshold(self, perc_diff): |
| """Compares the specified diff to the variance threshold. |
| |
| Returns True if the difference is at or above the variance threshold. |
| """ |
| return abs(perc_diff) >= self._variance_threshold |
| |
| def _GetSortedCSV(self, unsorted_csv_reader): |
| """Sorts the specified CSV by page_name into a new CSV file.""" |
| result = sorted(unsorted_csv_reader, key=lambda d: d['page_name']) |
| fd, sorted_csv_file = tempfile.mkstemp() |
| # Close the fd. |
| os.close(fd) |
| with open(sorted_csv_file, 'wb') as f: |
| writer = csv.DictWriter(f, unsorted_csv_reader.fieldnames) |
| writer.writeheader() |
| writer.writerows(result) |
| return sorted_csv_file |
| |
| def Compare(self): |
| """Method that does the CSV comparision.""" |
| |
    # Do one pass of all the page_names in the 1st CSV and store them, so that
    # when we walk through the 2nd CSV we know whether the same page exists in
    # the 1st CSV (the pages are ordered the same way in both files but some
    # could be missing from each file).
    csv1_page_names = set()
    with open(self._csv_file1, 'rb') as f1:
      csv1_reader = csv.DictReader(f1)
      for row in csv1_reader:
        csv1_page_names.add(row['page_name'])
| |
| # Sort both CSVs. |
| with open(self._csv_file1, 'rb') as f1, open(self._csv_file2, 'rb') as f2: |
| sorted_csv1_filepath = self._GetSortedCSV(csv.DictReader(f1)) |
| sorted_csv2_filepath = self._GetSortedCSV(csv.DictReader(f2)) |
| with open(sorted_csv1_filepath, 'rb') as sorted_csv1, \ |
| open(sorted_csv2_filepath, 'rb') as sorted_csv2: |
| csv1_reader = csv.DictReader(sorted_csv1) |
| csv2_reader = csv.DictReader(sorted_csv2) |
| |
      # Map of a fieldname to a FieldNameValues object holding the running
      # totals from both CSVs.
      fieldnames_to_totals = {}
      # Map of a fieldname to a list of PageValues objects, one per page whose
      # diff is at or above the threshold.
      fieldnames_to_page_values = {}
      # Map of a fieldname to the list of discarded outlier PageValues.
      fieldnames_to_discards = {}
| |
| # Now walk through both CSV files with a pointer at each one and collect |
| # the value totals. |
| for csv2_row in csv2_reader: |
        # Make sure the CSV2 page_name exists in CSV1, else skip it (move the
        # CSV2 pointer down).
        page_name2 = csv2_row['page_name']
        if page_name2 not in csv1_page_names:
          continue
| # Reach the right page_name in CSV1 (move CSV1 pointer down). |
| try: |
          csv1_row = next(csv1_reader)
          while csv1_row['page_name'] != page_name2:
            csv1_row = next(csv1_reader)
| except StopIteration: |
| # Reached the end of CSV1, break out of the row loop. |
| break |
| |
| # Store values for all fieldnames (except page_name). |
| for fieldname in csv2_reader.fieldnames: |
          if fieldname != 'page_name' and fieldname in csv1_row:
| if csv1_row[fieldname] == '' or csv2_row[fieldname] == '': |
| # TODO(rmistry): Check with tonyg about what the algorithm |
| # should be doing when one CSV has an empty value and the other |
| # does not. |
| continue |
| try: |
| if csv1_row[fieldname] == '-': |
| csv1_value = 0 |
| else: |
| csv1_value = float(csv1_row.get(fieldname)) |
| if csv2_row[fieldname] == '-': |
| csv2_value = 0 |
| else: |
| csv2_value = float(csv2_row.get(fieldname)) |
| except ValueError: |
| # We expected only floats, cannot compare strings. Skip field. |
| continue |
| |
| # Update the total in the dict. |
| fieldname_values = fieldnames_to_totals.get( |
| fieldname, FieldNameValues(0, 0, 0, 0)) |
| fieldname_values.value1 += csv1_value |
| fieldname_values.value2 += csv2_value |
| fieldnames_to_totals[fieldname] = fieldname_values |
| |
| perc_diff = _GetPercentageDiff(csv1_value, csv2_value) |
| if self._IsPercDiffSameOrAboveThreshold(perc_diff): |
| rank = 1 |
| slave_num = 1 |
| m = re.match(r".* \(#([0-9]+)\)", page_name2) |
| if m and m.group(1): |
| rank = int(m.group(1)) |
| while rank > slave_num * 100: |
| slave_num += 1 |
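              # Pages are bucketed 100 per slave, so e.g. a page ranked 250
              # maps to slave 3; slave_num is computed here but not used in
              # the links below.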
| pageset_link = ( |
| '%s/swarming/page_sets/%s/%s/%s.py' % ( |
| GS_HTML_DIRECT_LINK, self._pageset_type, rank, rank)) |
| archive_link = ( |
| '%s/swarming/webpage_archives/%s/%s' % ( |
| GS_HTML_BROWSER_LINK, self._pageset_type, rank)) |
              # Add this page only if its diff is at or above the threshold.
              page_value_list = fieldnames_to_page_values.get(fieldname, [])
              pv = PageValues(
                  page_name2, csv1_value, csv2_value, perc_diff,
                  _GetPercentageChange(csv1_value, csv2_value), pageset_link,
                  archive_link, csv1_row.get('traceUrls'),
                  csv2_row.get('traceUrls'))
              page_value_list.append(pv)
              fieldnames_to_page_values[fieldname] = page_value_list
| os.remove(sorted_csv1_filepath) |
| os.remove(sorted_csv2_filepath) |
| |
| # Calculate and add the percentage differences for each fieldname. |
    # The fieldnames_to_totals dict is modified in the loop below to remove
    # entries which are below the threshold.
| for fieldname, fieldname_values in fieldnames_to_totals.items(): |
      if fieldname not in fieldnames_to_page_values:
| del fieldnames_to_totals[fieldname] |
| continue |
| |
| page_values = fieldnames_to_page_values[fieldname] |
| # Sort page values by the percentage difference. |
| page_values.sort(key=lambda page_value: page_value.perc_diff, |
| reverse=True) |
| |
      if self._discard_outliers:
        # Lose the top X% and the bottom X%.
        outliers_num = int(len(page_values) * self._discard_outliers / 100)
        # Guard against outliers_num == 0: the slices below would otherwise
        # discard every page (page_values[0:0] is empty).
        if outliers_num:
          top_outliers = page_values[0:outliers_num]
          bottom_outliers = page_values[-outliers_num:]
          # Discard top and bottom outliers.
          fieldnames_to_page_values[fieldname] = (
              page_values[outliers_num:-outliers_num])
          # Remove discarded values from the running totals.
          for discarded_page in top_outliers + bottom_outliers:
            fieldname_values.value1 -= discarded_page.value1
            fieldname_values.value2 -= discarded_page.value2
          fieldnames_to_discards[fieldname] = top_outliers + bottom_outliers
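      # E.g. with discard_outliers=10.0 and 20 reported pages, outliers_num is
      # int(20 * 10 / 100) = 2, so the 2 pages with the highest perc_diff and
      # the 2 with the lowest are dropped.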
| |
| perc_diff = _GetPercentageDiff(fieldname_values.value1, |
| fieldname_values.value2) |
| if self._IsPercDiffSameOrAboveThreshold(perc_diff): |
        if (len(fieldnames_to_page_values[fieldname]) <
            self._min_pages_in_each_field):
          # This field does not have enough webpages; delete it from both
          # maps.
          print 'Removing %s: %d webpages reported, minimum is %d' % (
              fieldname, len(fieldnames_to_page_values[fieldname]),
              self._min_pages_in_each_field)
          del fieldnames_to_page_values[fieldname]
          del fieldnames_to_totals[fieldname]
          continue
| fieldname_values.perc_diff = perc_diff |
| fieldname_values.perc_change = _GetPercentageChange( |
| fieldname_values.value1, fieldname_values.value2) |
      else:
        # Discard fieldnames whose total percentage diff is below the
        # variance threshold.
        print 'Removing because below the variance threshold: %s' % fieldname
        del fieldnames_to_totals[fieldname]
| |
| # Delete keys in fieldnames_to_page_values that are not in |
| # fieldnames_to_totals because those are the only ones we want to |
| # display. |
    fieldnames_to_page_values = dict(
        (k, v) for k, v in fieldnames_to_page_values.iteritems()
        if k in fieldnames_to_totals)
| |
| # Both maps should end up with the same number of keys. |
| assert set(fieldnames_to_page_values.keys()) == set( |
| fieldnames_to_totals.keys()) |
| |
| # Set the number of reporting webpages in fieldnames_to_totals. |
| for fieldname, values in fieldnames_to_page_values.iteritems(): |
| fieldnames_to_totals[fieldname].total_webpages_reported = len(values) |
| |
| # Done processing. Output the HTML. |
| self.OutputToHTML(fieldnames_to_totals, fieldnames_to_page_values, |
| fieldnames_to_discards, self._output_html_dir) |
| |
| def OutputToHTML(self, fieldnames_to_totals, fieldnames_to_page_values, |
                   fieldnames_to_discards, html_dir):
    """Renders the comparison results as HTML pages under html_dir."""
| # Calculate the current UTC time. |
    html_report_date = datetime.datetime.utcnow().strftime(
        '%Y-%m-%d %H:%M UTC')
| # Output the main totals HTML page. |
    sorted_fieldnames_totals_items = sorted(
        fieldnames_to_totals.items(), key=lambda item: item[1].perc_diff,
        reverse=True)
| missing_output_slaves_list = [] |
| if self._missing_output_slaves: |
| missing_output_slaves_list = self._missing_output_slaves.split(' ') |
| rendered = loader.render_to_string( |
| 'csv_totals.html', |
| {'sorted_fieldnames_totals_items': sorted_fieldnames_totals_items, |
| 'requester_email': self._requester_email, |
| 'chromium_patch_link': self._chromium_patch_link, |
| 'skia_patch_link': self._skia_patch_link, |
| 'raw_csv_nopatch': self._raw_csv_nopatch, |
| 'raw_csv_withpatch': self._raw_csv_withpatch, |
| 'threshold': self._variance_threshold, |
| 'discard_outliers': self._discard_outliers, |
| 'min_webpages': self._min_pages_in_each_field, |
| 'num_repeated': self._num_repeated, |
| 'target_platform': self._target_platform, |
| 'crashed_instances': self._crashed_instances, |
| 'missing_devices': self._missing_devices, |
| 'browser_args_nopatch': self._browser_args_nopatch, |
| 'browser_args_withpatch': self._browser_args_withpatch, |
| 'absolute_url': self._absolute_url, |
| 'pageset_type': self._pageset_type, |
| 'html_report_date': html_report_date, |
| 'chromium_hash': self._chromium_hash, |
| 'skia_hash': self._skia_hash, |
| 'missing_output_slaves': missing_output_slaves_list, |
| 'logs_link_prefix': self._logs_link_prefix, |
| 'description': self._description, |
| }) |
    index_html_path = os.path.join(html_dir, 'index.html')
| with open(index_html_path, 'wb') as index_html: |
| index_html.write(rendered) |
| |
| # Output the different per-fieldname HTML pages. |
| fieldname_count = 0 |
| # pylint: disable=W0612 |
| for fieldname, unused_values in sorted_fieldnames_totals_items: |
| fieldname_count += 1 |
| page_values = fieldnames_to_page_values[fieldname] |
| rendered = loader.render_to_string( |
| 'csv_per_page.html', |
| {'fieldname': fieldname, |
| 'page_values': page_values, |
| 'discard_outliers': self._discard_outliers, |
| 'discarded_webpages': fieldnames_to_discards.get(fieldname, []), |
| 'total_archives': self._total_archives, |
| 'absolute_url': self._absolute_url}) |
      fieldname_html_path = os.path.join(
          html_dir, 'fieldname%s.html' % fieldname_count)
      with open(fieldname_html_path, 'wb') as fieldname_html:
        fieldname_html.write(rendered)
| |
| |
if __name__ == '__main__':
| option_parser = optparse.OptionParser() |
| option_parser.add_option( |
| '', '--csv_file1', |
| help='The absolute path to the first CSV file.') |
| option_parser.add_option( |
| '', '--csv_file2', |
| help='The absolute path to the second CSV file.') |
| option_parser.add_option( |
| '', '--output_html_dir', |
      help='The absolute path of the HTML dir that will contain the results '
           'of the comparison CSV.')
| option_parser.add_option( |
| '', '--requester_email', |
| help='Email address of the user who kicked off the run.') |
| option_parser.add_option( |
| '', '--chromium_patch_link', |
| help='Link to the Chromium patch used for this run.') |
| option_parser.add_option( |
| '', '--skia_patch_link', |
| help='Link to the Skia patch used for this run.') |
| option_parser.add_option( |
| '', '--variance_threshold', |
| help='The allowable variance in percentage between total values for each ' |
| 'field for the two CSVs.') |
| option_parser.add_option( |
| '', '--absolute_url', |
| help='Servers like Google Storage require an absolute url for links ' |
| 'within the HTML output files.', |
| default='') |
| option_parser.add_option( |
| '', '--min_pages_in_each_field', |
      help='The minimum number of pages that must report a fieldname. If a '
           'fieldname has fewer pages than this, it is not reported as a '
           'failure even if the percentage difference exceeds the variance '
           'threshold.',
| default=0) |
| option_parser.add_option( |
| '', '--discard_outliers', |
      help='Determines the percentage of outliers that will be discarded '
           'from the top and bottom values. E.g.: if this value is 10% and '
           'the number of webpages in a field is 10, then the 1st and 10th '
           'webpages are discarded.',
| default=10) |
| option_parser.add_option( |
| '', '--num_repeated', |
| help='The number of times each pageset was run.') |
| option_parser.add_option( |
| '', '--raw_csv_nopatch', |
| help='Link to the raw CSV output of the nopatch run.') |
| option_parser.add_option( |
| '', '--raw_csv_withpatch', |
| help='Link to the raw CSV output of the withpatch run.') |
| option_parser.add_option( |
| '', '--crashed_instances', |
| help='Text that lists any crashed instances.') |
| option_parser.add_option( |
| '', '--missing_devices', |
| help='Text that lists all instances with missing Android devices.') |
| option_parser.add_option( |
| '', '--target_platform', |
| help='The platform telemetry benchmarks/measurements were run on.') |
| option_parser.add_option( |
| '', '--browser_args_nopatch', |
| help='The browser args that were used for the nopatch run.') |
| option_parser.add_option( |
| '', '--browser_args_withpatch', |
| help='The browser args that were used for the withpatch run.') |
| option_parser.add_option( |
| '', '--pageset_type', |
| help='The page set type this run was done on.') |
| option_parser.add_option( |
| '', '--chromium_hash', |
| help='The chromium git hash that was used for this run.') |
| option_parser.add_option( |
| '', '--skia_hash', |
| help='The skia git hash that was used for this run.', |
| default='') |
| option_parser.add_option( |
| '', '--missing_output_slaves', |
| help='Slaves which had no output for this run.') |
| option_parser.add_option( |
| '', '--logs_link_prefix', |
| help='Prefix link to the logs of the slaves.') |
| option_parser.add_option( |
| '', '--description', |
| help='The description of the run as entered by the requester.') |
| option_parser.add_option( |
| '', '--total_archives', |
| help='Number of archives that were used to get these results.') |
| |
| options, unused_args = option_parser.parse_args() |
| if not (options.csv_file1 and options.csv_file2 and options.output_html_dir |
| and options.variance_threshold and options.requester_email |
| and options.chromium_patch_link |
| and options.skia_patch_link and options.raw_csv_nopatch |
| and options.raw_csv_withpatch and options.num_repeated |
| and options.target_platform and options.pageset_type |
| and options.chromium_hash and options.description): |
    option_parser.error('Must specify csv_file1, csv_file2, output_html_dir, '
                        'variance_threshold, requester_email, '
                        'chromium_patch_link, skia_patch_link, '
                        'raw_csv_nopatch, raw_csv_withpatch, num_repeated, '
                        'target_platform, pageset_type, chromium_hash and '
                        'description')
| |
| sys.exit(CsvComparer( |
| options.csv_file1, options.csv_file2, options.output_html_dir, |
| options.requester_email, options.chromium_patch_link, |
| options.skia_patch_link, |
| options.variance_threshold, options.absolute_url, |
| options.min_pages_in_each_field, options.discard_outliers, |
| options.raw_csv_nopatch, options.raw_csv_withpatch, |
| options.num_repeated, options.target_platform, |
| options.crashed_instances, options.missing_devices, |
| options.browser_args_nopatch, options.browser_args_withpatch, |
| options.pageset_type, options.chromium_hash, options.skia_hash, |
| options.missing_output_slaves, options.logs_link_prefix, |
| options.description, options.total_archives).Compare()) |
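
# Example invocation (illustrative only; the script filename and all paths,
# links and values below are hypothetical):
#   python csv_comparer.py \
#     --csv_file1=/tmp/nopatch.csv \
#     --csv_file2=/tmp/withpatch.csv \
#     --output_html_dir=/tmp/html \
#     --requester_email=user@example.com \
#     --chromium_patch_link=https://example.com/chromium.patch \
#     --skia_patch_link=https://example.com/skia.patch \
#     --variance_threshold=5.0 \
#     --raw_csv_nopatch=https://example.com/nopatch.csv \
#     --raw_csv_withpatch=https://example.com/withpatch.csv \
#     --num_repeated=3 \
#     --target_platform=Linux \
#     --pageset_type=10k \
#     --chromium_hash=c0ffee \
#     --description='Smoke test'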
| |