#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Tool for seeing the real world impact of a patch.
#
# Layout Tests can tell you whether something has changed, but this can help
# you determine whether a subtle/controversial change is beneficial or not.
#
# It dumps the rendering of a large number of sites, both with and without a
# patch being evaluated, then sorts them by greatest difference in rendering,
# such that a human reviewer can quickly review the most impacted sites,
# rather than having to manually try sites to see if anything changes.
#
# In the future it might be possible to extend this to other kinds of
# differences, e.g. page load times.
#
# pylint: disable=C0301
# The original file is from http://src.chromium.org/viewvc/chrome/trunk/src/tools/real_world_impact/real_world_impact.py
# It was written by johnme@ and modified by pdr@.
# rmistry@ has renamed the file and modified it to run on the 100 Cluster
# Telemetry slaves (http://skia-tree-status.appspot.com/skia-telemetry/chromium_try).
import argparse
from argparse import RawTextHelpFormatter
import datetime
import errno
from distutils.spawn import find_executable
from operator import itemgetter
import multiprocessing
import os
import posixpath
import re
import subprocess
import sys
import textwrap
import time
from urlparse import urlparse
import webbrowser
action = None
allow_js = False
additional_content_shell_flags = ''
output_dir = ''
image_diff = ''
content_shell = ''
urls = []
print_lock = multiprocessing.Lock()
def MakeDirsIfNotExist(directory):
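  """Creates |directory|, including any missing parent directories, and
  silently succeeds if it already exists."""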
  try:
    os.makedirs(directory)
  except OSError as e:
    if e.errno != errno.EEXIST:
      raise
def SetupPaths():
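  """Creates the output directory; returns True so main() can chain it with
  the other setup checks."""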
  MakeDirsIfNotExist(output_dir)
  return True
def CheckPrerequisites():
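  """Checks that wget, image_diff and content_shell are available, printing
  instructions and returning False if any of them is missing."""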
  if not find_executable('wget'):
    print 'wget not found! Install wget and re-run this.'
    return False
  if not os.path.exists(image_diff):
    print 'image_diff not found (%s)!' % image_diff
    print 'Build the image_diff target and re-run this.'
    return False
  if not os.path.exists(content_shell):
    print 'Content shell not found (%s)!' % content_shell
    print 'Build Release/content_shell and re-run this.'
    return False
  return True
def PickSampleUrls(start_number, end_number, csv_path):
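  """Fills the global |urls| with the sample ranked start_number..end_number
  from the Alexa CSV, reusing a previously saved sample for the same range if
  one exists and skipping anything listed in bad_urls.txt."""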
  global urls
  data_dir = os.path.join(output_dir, 'data')
  MakeDirsIfNotExist(data_dir)

  bad_urls_path = os.path.join(data_dir, 'bad_urls.txt')
  if os.path.exists(bad_urls_path):
    with open(bad_urls_path) as f:
      bad_urls = set(f.read().splitlines())
  else:
    bad_urls = set()

  # See if we've already selected the same sample previously (this way, if you
  # call this script with arguments
  # '--start_number=1 --end_number=10 --action=before' then
  # '--start_number=1 --end_number=10 --action=after', we'll use the same
  # sample, as expected!).
  urls_path = os.path.join(data_dir, '%d-%d_urls.txt' % (start_number,
                                                         end_number))
  if not os.path.exists(urls_path):
    if action == 'compare':
      print ('Error: you must run "--action=before" and "--action=after" '
             'before running "--action=compare"')
      return False
    print 'Picking %d-%d from the Alexa list...' % (start_number, end_number)
    urls = []
    current_rank = 0
    with open(csv_path) as f:
      for entry in f:
        current_rank += 1
        if current_rank < start_number:
          continue
        elif current_rank > end_number:
          break
        hostname = entry.strip().split(',')[1]
        if '/' not in hostname:  # Skip Alexa 1,000,000 entries that have paths.
          url = 'http://%s/' % hostname
          if url not in bad_urls:
            urls.append(url)
    # Don't write these to disk yet; we'll do that in SaveWorkingUrls below
    # once we have tried to download them and seen which ones fail.
  else:
    with open(urls_path) as f:
      urls = [u for u in f.read().splitlines() if u not in bad_urls]
  return True
def SaveWorkingUrls(start_number, end_number):
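  """Saves the current |urls| (those that downloaded successfully) to disk,
  unless a list for this range has already been saved."""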
  # TODO(johnme): Update the list if a url that used to work goes offline.
  urls_path = os.path.join(output_dir, 'data', '%d-%d_urls.txt' % (start_number,
                                                                   end_number))
  if not os.path.exists(urls_path):
    with open(urls_path, 'w') as f:
      f.writelines(u + '\n' for u in urls)
def PrintElapsedTime(elapsed, detail=''):
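  """Prints |elapsed| seconds as minutes and seconds, e.g. 'Took 1m23.4s'."""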
  elapsed = round(elapsed * 10) / 10.0
  m = elapsed / 60
  s = elapsed % 60
  print 'Took %dm%.1fs' % (m, s), detail
def DownloadStaticCopyTask(url):
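  """Downloads a static snapshot of |url| (the page plus its subresources)
  using wget; returns True iff a renderable index.html was produced."""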
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, 'data', url_parts.hostname)
  # Use wget for now, as it does a reasonable job of spidering page
  # dependencies (e.g. CSS, JS, images).
  success = True
  try:
    subprocess.check_call(['timeout', '60',
                           'wget',
                           '--execute', 'robots=off',
                           ('--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS '
                            'X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C'
                            'hrome/32.0.1700.14 Safari/537.36'),
                           '--page-requisites',
                           '--span-hosts',
                           '--adjust-extension',
                           '--convert-links',
                           '--directory-prefix=' + host_dir,
                           '--force-directories',
                           '--default-page=index.html',
                           '--no-check-certificate',
                           '--timeout=5',  # 5s timeout
                           '--tries=2',
                           '--quiet',
                           url])
  except KeyboardInterrupt:
    success = False
  except subprocess.CalledProcessError:
    # Ignore these for now, as some sites have issues with their subresources
    # yet still produce a renderable index.html.
    pass  # success = False

  if success:
    download_path = os.path.join(host_dir, url_parts.hostname, 'index.html')
    if not os.path.exists(download_path):
      success = False
    else:
      with print_lock:
        print 'Downloaded:', url
  if not success:
    with print_lock:
      print 'Failed to download:', url
    return False
  return True
def DownloadStaticCopies(start_number, end_number):
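  """Downloads static copies of any URLs not yet on disk, in parallel, records
  persistent failures in bad_urls.txt, and saves the working URL list."""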
  global urls
  new_urls = []
  for url in urls:
    url_parts = urlparse(url)
    host_dir = os.path.join(output_dir, 'data', url_parts.hostname)
    download_path = os.path.join(host_dir, url_parts.hostname, 'index.html')
    if not os.path.exists(download_path):
      new_urls.append(url)

  if new_urls:
    print 'Downloading static copies of %d sites...' % len(new_urls)
    start_time = time.time()
    results = multiprocessing.Pool(20).map(DownloadStaticCopyTask, new_urls)
    failed_urls = [new_urls[i] for i, ret in enumerate(results) if not ret]
    if failed_urls:
      bad_urls_path = os.path.join(output_dir, 'data', 'bad_urls.txt')
      with open(bad_urls_path, 'a') as f:
        f.writelines(u + '\n' for u in failed_urls)
      failed_urls_set = set(failed_urls)
      urls = [u for u in urls if u not in failed_urls_set]
    PrintElapsedTime(time.time() - start_time)

  SaveWorkingUrls(start_number, end_number)
def RunDrtTask(url):
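  """Renders one downloaded page in content shell (with scripts, frames and
  embedded media stripped out unless --allow_js is set) and writes the PNG
  screenshot to the 'before'/'after' directory; returns (elapsed_seconds, url)
  on success, or False on failure."""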
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, 'data', url_parts.hostname)
  html_path = os.path.join(host_dir, url_parts.hostname, 'index.html')

  if not allow_js:
    nojs_path = os.path.join(host_dir, url_parts.hostname, 'index-nojs.html')
    if not os.path.exists(nojs_path):
      with open(html_path) as f:
        html = f.read()
      if not html:
        return False
      # These aren't intended to be XSS safe :)
      block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
                    r'\b.*?<\s*\/\s*\1\s*>')
      block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)'
      html = re.sub(block_tags, '', html, flags=re.I|re.S)
      html = re.sub(block_attrs, '', html, flags=re.I)
      with open(nojs_path, 'w') as f:
        f.write(html)
    html_path = nojs_path

  start_time = time.time()
  with open(os.devnull, 'w') as fnull:
    dump_tree_cmd = [content_shell,
                     '--dump-render-tree',
                     additional_content_shell_flags,
                     # The escaped single quote is not a typo, it's a separator!
                     html_path + "\\'--pixel-test"
                    ]
    p = subprocess.Popen(' '.join(dump_tree_cmd),
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=fnull)
    result = p.stdout.read()

  PNG_START = b'\x89\x50\x4E\x47\x0D\x0A\x1A\x0A'
  PNG_END = b'\x49\x45\x4E\x44\xAE\x42\x60\x82'
  try:
    start = result.index(PNG_START)
    end = result.rindex(PNG_END) + 8
  except ValueError:
    return False

  png_path = os.path.join(output_dir, action, url_parts.hostname + '.png')
  MakeDirsIfNotExist(os.path.dirname(png_path))
  with open(png_path, 'wb') as f:
    f.write(result[start:end])
  elapsed_time = (time.time() - start_time, url)
  return elapsed_time
def RunDrt():
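  """Screenshots all sampled pages in parallel and reports the elapsed time,
  including the slowest page."""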
  print 'Taking screenshots of %d pages...' % len(urls)
  start_time = time.time()
  results = multiprocessing.Pool().map(RunDrtTask, urls, 1)
  max_time, url = max(t for t in results if t)
  elapsed_detail = '(slowest: %.2fs on %s)' % (max_time, url)
  PrintElapsedTime(time.time() - start_time, elapsed_detail)
def CompareResultsTask(url):
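  """Runs image_diff on the before/after screenshots of |url|; returns a
  (difference_score, url, image_path) tuple, where a higher score means a
  bigger rendering change."""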
  url_parts = urlparse(url)
  before_path = os.path.join(output_dir, 'before', url_parts.hostname + '.png')
  after_path = os.path.join(output_dir, 'after', url_parts.hostname + '.png')
  diff_path = os.path.join(output_dir, 'diff', url_parts.hostname + '.png')
  MakeDirsIfNotExist(os.path.join(output_dir, 'diff'))

  red_path = ('data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA'
              'ABAAEAAAICRAEAOw==')

  before_exists = os.path.exists(before_path)
  after_exists = os.path.exists(after_path)
  if not before_exists and not after_exists:
    # TODO(johnme): Make this more informative.
    return (-100, url, red_path)
  if before_exists != after_exists:
    # TODO(johnme): Make this more informative.
    return (200, url, red_path)

  # Get percentage difference.
  p = subprocess.Popen([image_diff, '--histogram',
                        before_path, after_path],
                       shell=False,
                       stdout=subprocess.PIPE)
  output, _ = p.communicate()
  if p.returncode == 0:
    return (0, url, before_path)
  diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
                        r'exact diff: (\d+\.\d{2})% (?:passed|failed)', output)
  if not diff_match:
    raise Exception('image_diff output format changed')
  histogram_diff = float(diff_match.group(1))
  exact_diff = float(diff_match.group(2))
  combined_diff = max(histogram_diff + exact_diff / 8, 0.001)

  # Produce diff PNG.
  subprocess.call([image_diff, '--diff', before_path, after_path, diff_path])
  return (combined_diff, url, diff_path)
def CompareResults(start_number, end_number, gs_url_prefix):
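  """Diffs every page's before/after screenshots, then writes and opens an
  HTML report listing the pages by decreasing difference."""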
  print 'Running image_diff on %d pages...' % len(urls)
  start_time = time.time()
  results = multiprocessing.Pool().map(CompareResultsTask, urls)
  results.sort(key=itemgetter(0), reverse=True)
  PrintElapsedTime(time.time() - start_time)

  now = datetime.datetime.today().strftime('%a %Y-%m-%d %H:%M')
  html_start = textwrap.dedent("""\
      <!DOCTYPE html>
      <html>
      <head>
      <title>Real World Impact report %s</title>
      <script>
        var togglingImg = null;
        var toggleTimer = null;
        var before = true;

        function toggle() {
          var newFolder = before ? "\/before" : "\/after";
          togglingImg.src = togglingImg.src.replace(/\/before|\/after|\/diff/, newFolder);
          before = !before;
          toggleTimer = setTimeout(toggle, 300);
        }

        function startToggle(img) {
          before = true;
          togglingImg = img;
          if (!img.origSrc)
            img.origSrc = img.src;
          toggle();
        }

        function stopToggle(img) {
          clearTimeout(toggleTimer);
          img.src = img.origSrc;
        }

        document.onkeydown = function(e) {
          e = e || window.event;
          var keyCode = e.keyCode || e.which;
          var newFolder;
          switch (keyCode) {
            case 49: // '1'
              newFolder = "\/before"; break;
            case 50: // '2'
              newFolder = "\/after"; break;
            case 51: // '3'
              newFolder = "\/diff"; break;
            default:
              return;
          }
          var imgs = document.getElementsByTagName("img");
          for (var i = 0; i < imgs.length; i++) {
            imgs[i].src = imgs[i].src.replace(/\/before|\/after|\/diff/, newFolder);
          }
        };
      </script>
      <style>
        h1 {
          font-family: sans-serif;
        }
        h2 {
          font-family: monospace;
          white-space: pre;
        }
        .nsfw-spacer {
          height: 50vh;
        }
        .nsfw-warning {
          background: yellow;
          border: 10px solid red;
        }
        .info {
          font-size: 1.2em;
          font-style: italic;
        }
        body:not(.details-supported) details {
          display: none;
        }
      </style>
      </head>
      <body>
      <script>
        if ('open' in document.createElement('details'))
          document.body.className = "details-supported";
      </script>
      <!--<div class="nsfw-spacer"></div>-->
      <p class="nsfw-warning">Warning: sites below are taken from the Alexa
      top %d-%d and may be NSFW.</p>
      <!--<div class="nsfw-spacer"></div>-->
      <h1>Real World Impact report %s</h1>
      <p class="info">Press 1, 2 and 3 to switch between before, after and diff
      screenshots respectively; or hover over the images to rapidly alternate
      between before and after.</p>
      """ % (now, start_number, end_number, now))
  html_same_row = """\
    <h2>No difference on <a href="%s">%s</a>.</h2>
"""
  html_diff_row = """\
    <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
    <img src="%s" width="800" height="600"
        onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
"""
  html_end = textwrap.dedent("""\
      </body>
      </html>""")

  html_path = os.path.join(output_dir, 'diff.html')
  with open(html_path, 'w') as f:
    f.write(html_start)
    for (diff_float, url, diff_path) in results:
      diff_path = os.path.relpath(diff_path, output_dir)
      if diff_float == 0:
        f.write(html_same_row % (url, url))
      else:
        f.write(html_diff_row % (
            diff_float, url, url, posixpath.join(gs_url_prefix, diff_path)))
    f.write(html_end)

  webbrowser.open_new_tab('file://' + html_path)
def main(argv):
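  """Parses the command line and runs the download/before/after/compare
  action."""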
  global action, allow_js, output_dir, additional_content_shell_flags, \
      image_diff, content_shell

  parser = argparse.ArgumentParser(
      formatter_class=RawTextHelpFormatter,
      description='Compare the real world impact of a content shell change.',
      epilog=textwrap.dedent("""\
          Example usage:
          1. Build content_shell in out/Release without any changes.
          2. Run: %s --action=before --start_number=1 --end_number=10
          3. Either:
             a. Apply your controversial patch and rebuild content_shell.
             b. Pass --additional_flags="--enable_your_flag" in step 4.
          4. Run: %s --action=after --start_number=1 --end_number=10
          5. Run: %s --action=compare --start_number=1 --end_number=10
          """ % (argv[0], argv[0], argv[0])))
  parser.add_argument('--allow_js', help='Do not disable JavaScript',
                      action='store_true')
  parser.add_argument('--additional_flags',
                      help='Additional flags to pass to content shell')
  parser.add_argument('--action',
                      help=textwrap.dedent("""\
                          Action to perform.
                          download - Just download the sites.
                          before - Run content shell and record 'before' result.
                          after - Run content shell and record 'after' result.
                          compare - Compare before and after results.
                          """),
                      choices=['download', 'before', 'after', 'compare'],
                      required=True)
  parser.add_argument('--start_number',
                      help='Specifies which website rank (in Alexa\'s list) to '
                           'start with',
                      type=int, required=True)
  parser.add_argument('--end_number',
                      help='Specifies which website rank (in Alexa\'s list) to '
                           'end with',
                      type=int, required=True)
  parser.add_argument('--output_dir',
                      help='Directory where output files will be stored',
                      required=True)
  parser.add_argument('--csv_path',
                      help='Path to the Alexa top 1M webpages CSV',
                      required=True)
  parser.add_argument('--chromium_out_dir',
                      help='Path to the Chromium build\'s out directory.',
                      required=True)
  parser.add_argument('--gs_url_prefix',
                      help='The GS prefix to use which points to img files.',
                      required=True)
  args = parser.parse_args()

  action = args.action
  output_dir = os.path.join(args.output_dir, 'real_world_impact')
  gs_url_prefix = args.gs_url_prefix
  chromium_out_dir = args.chromium_out_dir
  image_diff = os.path.join(chromium_out_dir, 'image_diff')
  content_shell = os.path.join(chromium_out_dir, 'content_shell')
  csv_path = args.csv_path
  start_number = args.start_number
  end_number = args.end_number
  if args.allow_js:
    allow_js = args.allow_js
  if args.additional_flags:
    additional_content_shell_flags = args.additional_flags

  if not SetupPaths() or not CheckPrerequisites() or not PickSampleUrls(
      start_number, end_number, csv_path):
    return 1

  if action == 'compare':
    CompareResults(start_number, end_number, gs_url_prefix)
  else:
    DownloadStaticCopies(start_number, end_number)
    if action != 'download':
      RunDrt()
  return 0
if __name__ == '__main__':
  sys.exit(main(sys.argv))