#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Downloads CSV of top 1M webpages and creates a JSON telemetry page_set.
This module performs the following steps:
* Downloads a ZIP from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
* Unpacks it and reads its contents in memory.
* Writes out a JSON page_set from the CSV file covering the specified range of
  webpages.
Note: Blacklisted webpages will not be added to the output JSON page_set. If
you request 100 webpages and 5 of them are blacklisted, then the page_set will
only contain 95 webpages.
Sample Usage:
  python create_page_set.py -s 1 -e 10000
Running the above command will create a single page set containing the top
10000 webpages.
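For reference, the generated page_set has roughly the following shape (the
field values below are illustrative, not literal output):
  {
    "_comment": "Generated on <timestamp> by <user> using create_page_set.py",
    "description": "Top 1-10000 Alexa global.",
    "archive_data_file": "/home/default/storage/webpages_archive/All/alexa1-10000.json",
    "pages": [
      {
        "url": "http://www.example.com",
        "why": "#1 in Alexa global.",
        "navigate_steps": [
          {"action": "navigate"},
          {"action": "wait", "seconds": 5}
        ]
      }
    ],
    "smoothness": {"action": "scroll"},
    "user_agent_type": "desktop"
  }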
The output page sets are intended to be used by the webpages_playback.py
script.
Sample usage of the webpages_playback.py script with the output page sets:
  python webpages_playback.py --record=True \
      --page_sets=../../tools/page_sets/*.json \
      --do_not_upload_to_gs=True --output_dir=/network/accessible/mount/point/
"""
__author__ = 'Ravi Mistry'
import getpass
import json
import optparse
import os
import urllib
import zipfile
from datetime import datetime
from StringIO import StringIO
TOP1M_CSV_FILE_NAME = 'top-1m.csv'
TOP1M_CSV_ZIP_LOCATION = (
'http://s3.amazonaws.com/alexa-static/%s.zip' % TOP1M_CSV_FILE_NAME)
ALEXA_PREFIX = 'alexa'
if __name__ == '__main__':
option_parser = optparse.OptionParser()
option_parser.add_option(
'-s', '--start_number',
      help='Specifies where to start when adding the top webpages to the '
           'page_set.',
default='1')
option_parser.add_option(
'-e', '--end_number',
      help='Specifies where to end when adding the top webpages to the '
           'page_set.',
default='10000')
option_parser.add_option(
'-b', '--blacklist',
      help='Location of a blacklist file which specifies which webpages '
           'should not be converted into page_sets.',
default='')
option_parser.add_option(
'-c', '--csv_file',
      help='Location of a filtered Alexa top 1M CSV file. Each row should '
           'have 3 entries: the 1st is the rank, the 2nd is the domain name '
           'and the 3rd is the fully qualified URL. If the 3rd entry is '
           'missing then a page_set for that URL will not be generated. If '
           'csv_file is not specified then this script downloads the list '
           'from the internet.',
default=None)
option_parser.add_option(
'-p', '--pagesets_type',
      help='The type of pagesets to create from the 1M list. E.g. All, '
           'Filtered, 100k, 10k, Deeplinks.',
default='All')
options, unused_args = option_parser.parse_args()
# Validate arguments.
if int(options.start_number) <= 0:
raise Exception('The -s/--start_number must be greater than 0')
if int(options.start_number) > int(options.end_number):
raise Exception('The -s/--start_number must be less than or equal to '
'-e/--end_number')
if options.csv_file:
csv_contents = open(options.csv_file).readlines()
else:
    # Download the zip file into memory and extract its contents.
usock = urllib.urlopen(TOP1M_CSV_ZIP_LOCATION)
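    # zipfile needs a seekable file-like object, so buffer the whole HTTP
    # response in a StringIO before opening it as an archive.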
myzipfile = zipfile.ZipFile(StringIO(usock.read()))
csv_contents = myzipfile.open(TOP1M_CSV_FILE_NAME).readlines()
# Validate options.end_number.
if int(options.end_number) > len(csv_contents):
raise Exception('Please specify -e/--end_number less than or equal to %s' %
len(csv_contents))
# Populate the JSON dictionary.
pages = []
json_dict = {
'_comment': 'Generated on %s by %s using create_page_set.py' % (
datetime.now(), getpass.getuser()),
'description': 'Top %s-%s Alexa global.' % (options.start_number,
options.end_number),
'archive_data_file': os.path.join(
'/', 'home', 'default', 'storage', 'webpages_archive',
options.pagesets_type,
'alexa%s-%s.json' % (options.start_number, options.end_number)),
'pages': pages,
'smoothness': { 'action': 'scroll'},
'user_agent_type': 'desktop',
}
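  # 'archive_data_file' names where the recorded archive for these pages is
  # expected to live; 'smoothness' and 'user_agent_type' tell telemetry how to
  # exercise them (a reading of the field names, not verified here).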
blacklisted_webpages = (open(options.blacklist).readlines()
if options.blacklist else [])
for index in xrange(int(options.start_number) - 1, int(options.end_number)):
line = csv_contents[index]
if options.csv_file:
try:
(unused_number, website, qualified_website) = line.strip().split(',')
except ValueError:
print '%s is not mapped to a qualified website.' % (
line.strip().split(',')[1])
continue
else:
(unused_number, website) = line.strip().split(',')
# Qualified website was not provided in the CSV, construct it.
qualified_website = 'http://www.%s' % website
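    # Build a filename of the form alexa<rank>_<domain>_desktop, replacing
    # '.' and '/' in the domain with '-'.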
website_filename = '%s%s_%s_desktop' % (
ALEXA_PREFIX, index + 1, website.replace('.', '-').replace('/', '-'))
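    # A blacklist entry skips a page if it appears anywhere in the generated
    # filename (substring match, not an exact match).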
skip_webpage = False
for blacklisted_webpage in blacklisted_webpages:
if blacklisted_webpage.rstrip() in website_filename:
skip_webpage = True
break
if skip_webpage:
print 'Skipping %s because it is in the provided blacklist file!' % (
website_filename)
continue
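    # Each page is navigated to and then given 5 seconds to settle before any
    # further action (such as the scroll configured above) runs.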
pages.append({
'url': qualified_website,
'why': '#%s in Alexa global.' % (index + 1),
'navigate_steps': [
{'action': 'navigate'},
{'action': 'wait', 'seconds': 5}
]
})
# Output the JSON dictionary to a file.
  try:
    with open(os.path.join('page_sets', 'alexa%s-%s.json' % (
                  options.start_number, options.end_number)),
              'w') as outfile:
      json.dump(json_dict, outfile, indent=4)
  except Exception as e:
    print 'Failed to write the output page_set because of Exception: %s' % e