#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# Tool for seeing the real world impact of a patch.
#
# Layout Tests can tell you whether something has changed, but this can help
# you determine whether a subtle/controversial change is beneficial or not.
#
# It dumps the rendering of a large number of sites, both with and without a
# patch being evaluated, then sorts them by greatest difference in rendering,
# such that a human reviewer can quickly review the most impacted sites,
# rather than having to manually try sites to see if anything changes.
#
# In the future it might be possible to extend this to other kinds of
# differences, e.g. page load times.
#
# pylint: disable=C0301
# The original file is from http://src.chromium.org/viewvc/chrome/trunk/src/tools/real_world_impact/real_world_impact.py
# It was written by johnme@ and modified by pdr@.
# rmistry@ has renamed the file and modified it to run on the 100 Cluster
# Telemetry slaves (http://skia-tree-status.appspot.com/skia-telemetry/chromium_try).
import argparse
from argparse import RawTextHelpFormatter
import datetime
import errno
from distutils.spawn import find_executable
from operator import itemgetter
import multiprocessing
import os
import posixpath
import re
import subprocess
import sys
import textwrap
import time
from urlparse import urlparse
import webbrowser
action = None
allow_js = False
additional_content_shell_flags = ''
output_dir = ''
image_diff = ''
content_shell = ''
urls = []
print_lock = multiprocessing.Lock()
def MakeDirsIfNotExist(directory):
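  """Creates |directory|, including any missing parent directories, and
  silently succeeds if it already exists."""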
  try:
    os.makedirs(directory)
  except OSError as e:
    if e.errno != errno.EEXIST:
      raise
def SetupPaths():
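  """Creates the output directory; returns True so main() can chain it with
  the other setup checks."""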
  MakeDirsIfNotExist(output_dir)
  return True
def CheckPrerequisites():
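  """Checks that wget, image_diff and content_shell are available, printing
  instructions and returning False if any of them is missing."""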
  if not find_executable('wget'):
    print 'wget not found! Install wget and re-run this.'
    return False
  if not os.path.exists(image_diff):
    print 'image_diff not found (%s)!' % image_diff
    print 'Build the image_diff target and re-run this.'
    return False
  if not os.path.exists(content_shell):
    print 'Content shell not found (%s)!' % content_shell
    print 'Build Release/content_shell and re-run this.'
    return False
  return True
def PickSampleUrls(start_number, end_number, csv_path):
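  """Fills the global |urls| with the sample ranked start_number..end_number
  from the Alexa CSV, reusing a previously saved sample for the same range if
  one exists and skipping anything listed in bad_urls.txt."""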
  global urls
  data_dir = os.path.join(output_dir, 'data')
  MakeDirsIfNotExist(data_dir)

  bad_urls_path = os.path.join(data_dir, 'bad_urls.txt')
  if os.path.exists(bad_urls_path):
    with open(bad_urls_path) as f:
      bad_urls = set(f.read().splitlines())
  else:
    bad_urls = set()

  # See if we've already selected the same sample previously (this way, if you
  # call this script with arguments
  # '--start_number=1 --end_number=10 --action=before' then
  # '--start_number=1 --end_number=10 --action=after', we'll use the same
  # sample, as expected!).
  urls_path = os.path.join(data_dir, '%d-%d_urls.txt' % (start_number,
                                                         end_number))
  if not os.path.exists(urls_path):
    if action == 'compare':
      print ('Error: you must run "--action=before" and "--action=after" '
             'before running "--action=compare"')
      return False
    print 'Picking %d-%d from the Alexa list...' % (start_number, end_number)
    urls = []
    current_rank = 0
    with open(csv_path) as f:
      for entry in f:
        current_rank += 1
        if current_rank < start_number:
          continue
        elif current_rank > end_number:
          break
        hostname = entry.strip().split(',')[1]
        if '/' not in hostname:  # Skip Alexa 1,000,000 entries that have paths.
          url = 'http://%s/' % hostname
          if url not in bad_urls:
            urls.append(url)
    # Don't write these to disk yet; we'll do that in SaveWorkingUrls below
    # once we have tried to download them and seen which ones fail.
  else:
    with open(urls_path) as f:
      urls = [u for u in f.read().splitlines() if u not in bad_urls]
  return True
def SaveWorkingUrls(start_number, end_number):
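  """Saves the current |urls| (those that downloaded successfully) to disk,
  unless a list for this range has already been saved."""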
  # TODO(johnme): Update the list if a url that used to work goes offline.
  urls_path = os.path.join(output_dir, 'data', '%d-%d_urls.txt' % (start_number,
                                                                   end_number))
  if not os.path.exists(urls_path):
    with open(urls_path, 'w') as f:
      f.writelines(u + '\n' for u in urls)
def PrintElapsedTime(elapsed, detail=''):
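  """Prints |elapsed| seconds as minutes and seconds, e.g. 'Took 1m23.4s'."""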
  elapsed = round(elapsed * 10) / 10.0
  m = elapsed / 60
  s = elapsed % 60
  print 'Took %dm%.1fs' % (m, s), detail
def DownloadStaticCopyTask(url):
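  """Downloads a static snapshot of |url| (the page plus its subresources)
  using wget; returns True iff a renderable index.html was produced."""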
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, 'data', url_parts.hostname)
  # Use wget for now, as it does a reasonable job of spidering page
  # dependencies (e.g. CSS, JS, images).
  success = True
  try:
    subprocess.check_call(['timeout', '60',
                           'wget',
                           '--execute', 'robots=off',
                           ('--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS '
                            'X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C'
                            'hrome/32.0.1700.14 Safari/537.36'),
                           '--page-requisites',
                           '--span-hosts',
                           '--adjust-extension',
                           '--convert-links',
                           '--directory-prefix=' + host_dir,
                           '--force-directories',
                           '--default-page=index.html',
                           '--no-check-certificate',
                           '--timeout=5',  # 5s timeout
                           '--tries=2',
                           '--quiet',
                           url])
  except KeyboardInterrupt:
    success = False
  except subprocess.CalledProcessError:
    # Ignore these for now, as some sites have issues with their subresources
    # yet still produce a renderable index.html.
    pass  # success = False

  if success:
    download_path = os.path.join(host_dir, url_parts.hostname, 'index.html')
    if not os.path.exists(download_path):
      success = False
    else:
      with print_lock:
        print 'Downloaded:', url
  if not success:
    with print_lock:
      print 'Failed to download:', url
    return False
  return True
def DownloadStaticCopies(start_number, end_number):
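  """Downloads static copies of any URLs not yet on disk, in parallel, records
  persistent failures in bad_urls.txt, and saves the working URL list."""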
  global urls
  new_urls = []
  for url in urls:
    url_parts = urlparse(url)
    host_dir = os.path.join(output_dir, 'data', url_parts.hostname)
    download_path = os.path.join(host_dir, url_parts.hostname, 'index.html')
    if not os.path.exists(download_path):
      new_urls.append(url)

  if new_urls:
    print 'Downloading static copies of %d sites...' % len(new_urls)
    start_time = time.time()
    results = multiprocessing.Pool(20).map(DownloadStaticCopyTask, new_urls)
    failed_urls = [new_urls[i] for i, ret in enumerate(results) if not ret]
    if failed_urls:
      bad_urls_path = os.path.join(output_dir, 'data', 'bad_urls.txt')
      with open(bad_urls_path, 'a') as f:
        f.writelines(u + '\n' for u in failed_urls)
      failed_urls_set = set(failed_urls)
      urls = [u for u in urls if u not in failed_urls_set]
    PrintElapsedTime(time.time() - start_time)

  SaveWorkingUrls(start_number, end_number)
def RunDrtTask(url):
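  """Renders one downloaded page in content shell (with scripts, frames and
  embedded media stripped out unless --allow_js is set) and writes the PNG
  screenshot to the 'before'/'after' directory; returns (elapsed_seconds, url)
  on success, or False on failure."""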
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, 'data', url_parts.hostname)
  html_path = os.path.join(host_dir, url_parts.hostname, 'index.html')

  if not allow_js:
    nojs_path = os.path.join(host_dir, url_parts.hostname, 'index-nojs.html')
    if not os.path.exists(nojs_path):
      with open(html_path) as f:
        html = f.read()
      if not html:
        return False
      # These aren't intended to be XSS safe :)
      block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
                    r'\b.*?<\s*\/\s*\1\s*>')
      block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)'
      html = re.sub(block_tags, '', html, flags=re.I|re.S)
      html = re.sub(block_attrs, '', html, flags=re.I)
      with open(nojs_path, 'w') as f:
        f.write(html)
    html_path = nojs_path

  start_time = time.time()
  with open(os.devnull, 'w') as fnull:
    dump_tree_cmd = [content_shell,
                     '--dump-render-tree',
                     additional_content_shell_flags,
                     # The escaped single quote is not a typo, it's a separator!
                     html_path + "\\'--pixel-test"
                    ]
    p = subprocess.Popen(' '.join(dump_tree_cmd),
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=fnull)
    result = p.stdout.read()

  PNG_START = b'\x89\x50\x4E\x47\x0D\x0A\x1A\x0A'
  PNG_END = b'\x49\x45\x4E\x44\xAE\x42\x60\x82'
  try:
    start = result.index(PNG_START)
    end = result.rindex(PNG_END) + 8
  except ValueError:
    return False

  png_path = os.path.join(output_dir, action, url_parts.hostname + '.png')
  MakeDirsIfNotExist(os.path.dirname(png_path))
  with open(png_path, 'wb') as f:
    f.write(result[start:end])
  elapsed_time = (time.time() - start_time, url)
  return elapsed_time
def RunDrt():
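  """Screenshots all sampled pages in parallel and reports the elapsed time,
  including the slowest page."""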
  print 'Taking screenshots of %d pages...' % len(urls)
  start_time = time.time()
  results = multiprocessing.Pool().map(RunDrtTask, urls, 1)
  max_time, url = max(t for t in results if t)
  elapsed_detail = '(slowest: %.2fs on %s)' % (max_time, url)
  PrintElapsedTime(time.time() - start_time, elapsed_detail)
def CompareResultsTask(url):
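  """Runs image_diff on the before/after screenshots of |url|; returns a
  (difference_score, url, image_path) tuple, where a higher score means a
  bigger rendering change."""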
  url_parts = urlparse(url)
  before_path = os.path.join(output_dir, 'before', url_parts.hostname + '.png')
  after_path = os.path.join(output_dir, 'after', url_parts.hostname + '.png')
  diff_path = os.path.join(output_dir, 'diff', url_parts.hostname + '.png')
  MakeDirsIfNotExist(os.path.join(output_dir, 'diff'))

  red_path = ('data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA'
              'ABAAEAAAICRAEAOw==')

  before_exists = os.path.exists(before_path)
  after_exists = os.path.exists(after_path)
  if not before_exists and not after_exists:
    # TODO(johnme): Make this more informative.
    return (-100, url, red_path)
  if before_exists != after_exists:
    # TODO(johnme): Make this more informative.
    return (200, url, red_path)

  # Get percentage difference.
  p = subprocess.Popen([image_diff, '--histogram',
                        before_path, after_path],
                       shell=False,
                       stdout=subprocess.PIPE)
  output, _ = p.communicate()
  if p.returncode == 0:
    return (0, url, before_path)
  diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
                        r'exact diff: (\d+\.\d{2})% (?:passed|failed)', output)
  if not diff_match:
    raise Exception('image_diff output format changed')
  histogram_diff = float(diff_match.group(1))
  exact_diff = float(diff_match.group(2))
  combined_diff = max(histogram_diff + exact_diff / 8, 0.001)

  # Produce diff PNG.
  subprocess.call([image_diff, '--diff', before_path, after_path, diff_path])
  return (combined_diff, url, diff_path)
def CompareResults(start_number, end_number, gs_url_prefix):
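  """Diffs every page's before/after screenshots, then writes and opens an
  HTML report listing the pages by decreasing difference."""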
  print 'Running image_diff on %d pages...' % len(urls)
  start_time = time.time()
  results = multiprocessing.Pool().map(CompareResultsTask, urls)
  results.sort(key=itemgetter(0), reverse=True)
  PrintElapsedTime(time.time() - start_time)

  now = datetime.datetime.today().strftime('%a %Y-%m-%d %H:%M')
  html_start = textwrap.dedent("""\
      <!DOCTYPE html>
      <html>
      <head>
      <title>Real World Impact report %s</title>
      <script>
        var togglingImg = null;
        var toggleTimer = null;
        var before = true;

        function toggle() {
          var newFolder = before ? "\/before" : "\/after";
          togglingImg.src = togglingImg.src.replace(/\/before|\/after|\/diff/, newFolder);
          before = !before;
          toggleTimer = setTimeout(toggle, 300);
        }

        function startToggle(img) {
          before = true;
          togglingImg = img;
          if (!img.origSrc)
            img.origSrc = img.src;
          toggle();
        }

        function stopToggle(img) {
          clearTimeout(toggleTimer);
          img.src = img.origSrc;
        }

        document.onkeydown = function(e) {
          e = e || window.event;
          var keyCode = e.keyCode || e.which;
          var newFolder;
          switch (keyCode) {
            case 49: // '1'
              newFolder = "\/before"; break;
            case 50: // '2'
              newFolder = "\/after"; break;
            case 51: // '3'
              newFolder = "\/diff"; break;
            default:
              return;
          }
          var imgs = document.getElementsByTagName("img");
          for (var i = 0; i < imgs.length; i++) {
            imgs[i].src = imgs[i].src.replace(/\/before|\/after|\/diff/, newFolder);
          }
        };
      </script>
      <style>
        h1 {
          font-family: sans-serif;
        }
        h2 {
          font-family: monospace;
          white-space: pre;
        }
        .nsfw-spacer {
          height: 50vh;
        }
        .nsfw-warning {
          background: yellow;
          border: 10px solid red;
        }
        .info {
          font-size: 1.2em;
          font-style: italic;
        }
        body:not(.details-supported) details {
          display: none;
        }
      </style>
      </head>
      <body>
      <script>
        if ('open' in document.createElement('details'))
          document.body.className = "details-supported";
      </script>
      <!--<div class="nsfw-spacer"></div>-->
      <p class="nsfw-warning">Warning: sites below are taken from the Alexa
      top %d-%d and may be NSFW.</p>
      <!--<div class="nsfw-spacer"></div>-->
      <h1>Real World Impact report %s</h1>
      <p class="info">Press 1, 2 and 3 to switch between before, after and diff
      screenshots respectively; or hover over the images to rapidly alternate
      between before and after.</p>
      """ % (now, start_number, end_number, now))
  html_same_row = """\
    <h2>No difference on <a href="%s">%s</a>.</h2>
"""
  html_diff_row = """\
    <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
    <img src="%s" width="800" height="600"
        onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
"""
  html_end = textwrap.dedent("""\
      </body>
      </html>""")

  html_path = os.path.join(output_dir, 'diff.html')
  with open(html_path, 'w') as f:
    f.write(html_start)
    for (diff_float, url, diff_path) in results:
      diff_path = os.path.relpath(diff_path, output_dir)
      if diff_float == 0:
        f.write(html_same_row % (url, url))
      else:
        f.write(html_diff_row % (
            diff_float, url, url, posixpath.join(gs_url_prefix, diff_path)))
    f.write(html_end)

  webbrowser.open_new_tab('file://' + html_path)
def main(argv):
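  """Parses the command line and runs the download/before/after/compare
  action."""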
  global action, allow_js, output_dir, additional_content_shell_flags, \
      image_diff, content_shell

  parser = argparse.ArgumentParser(
      formatter_class=RawTextHelpFormatter,
      description='Compare the real world impact of a content shell change.',
      epilog=textwrap.dedent("""\
          Example usage:
          1. Build content_shell in out/Release without any changes.
          2. Run: %s --action=before --start_number=1 --end_number=10
          3. Either:
             a. Apply your controversial patch and rebuild content_shell.
             b. Pass --additional_flags="--enable_your_flag" in step 4.
          4. Run: %s --action=after --start_number=1 --end_number=10
          5. Run: %s --action=compare --start_number=1 --end_number=10
          """ % (argv[0], argv[0], argv[0])))
  parser.add_argument('--allow_js', help='Do not disable JavaScript',
                      action='store_true')
  parser.add_argument('--additional_flags',
                      help='Additional flags to pass to content shell')
  parser.add_argument('--action',
                      help=textwrap.dedent("""\
                          Action to perform.
                          download - Just download the sites.
                          before - Run content shell and record 'before' result.
                          after - Run content shell and record 'after' result.
                          compare - Compare before and after results.
                          """),
                      choices=['download', 'before', 'after', 'compare'],
                      required=True)
  parser.add_argument('--start_number',
                      help='Specifies which website rank (in Alexa\'s list) to '
                           'start with',
                      type=int, required=True)
  parser.add_argument('--end_number',
                      help='Specifies which website rank (in Alexa\'s list) to '
                           'end with',
                      type=int, required=True)
  parser.add_argument('--output_dir',
                      help='Directory where output files will be stored',
                      required=True)
  parser.add_argument('--csv_path',
                      help='Path to the Alexa top 1M webpages CSV',
                      required=True)
  parser.add_argument('--chromium_out_dir',
                      help='Path to the Chromium build\'s out directory.',
                      required=True)
  parser.add_argument('--gs_url_prefix',
                      help='The GS prefix to use which points to img files.',
                      required=True)
  args = parser.parse_args()

  action = args.action
  output_dir = os.path.join(args.output_dir, 'real_world_impact')
  gs_url_prefix = args.gs_url_prefix
  chromium_out_dir = args.chromium_out_dir
  image_diff = os.path.join(chromium_out_dir, 'image_diff')
  content_shell = os.path.join(chromium_out_dir, 'content_shell')
  csv_path = args.csv_path
  start_number = args.start_number
  end_number = args.end_number
  if args.allow_js:
    allow_js = args.allow_js
  if args.additional_flags:
    additional_content_shell_flags = args.additional_flags

  if not SetupPaths() or not CheckPrerequisites() or not PickSampleUrls(
      start_number, end_number, csv_path):
    return 1

  if action == 'compare':
    CompareResults(start_number, end_number, gs_url_prefix)
  else:
    DownloadStaticCopies(start_number, end_number)
    if action != 'download':
      RunDrt()
  return 0
if __name__ == '__main__':
  sys.exit(main(sys.argv))