blob: adba0fb31881fbe4f9efc58f05a46a033b1c33b9 [file] [log] [blame]
#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Creates a Python telemetry page_set from the specified webpages CSV.
This module performs the following steps:
* Reads the CSV given via -c/--csv_file or, if none is given, downloads a ZIP
from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip, unpacks it and reads
its contents into memory.
* Writes out a single Python page set file, page_sets/alexa<start>-<end>.py,
listing the webpages in rows <start> through <end> of the CSV.
Sample Usage:
python create_page_set.py -s 1 -e 10000
Running the above command will create one page set, page_sets/alexa1-10000.py,
containing the first 10000 webpages of the CSV.
"""
__author__ = 'Ravi Mistry'
import optparse
import os
import urllib
import zipfile
from StringIO import StringIO
# Prefix string for Alexa-derived names.
# NOTE(review): not referenced anywhere else in the visible part of this file.
ALEXA_PREFIX = 'alexa'
# Name of the CSV member that is read out of the downloaded ZIP archive.
TOP1M_CSV_FILE_NAME = 'top-1m.csv'
# URL of the ZIP archive; the archive is named after the CSV file it contains.
TOP1M_CSV_ZIP_LOCATION = 'http://s3.amazonaws.com/alexa-static/%s.zip' % (
    TOP1M_CSV_FILE_NAME,)
def _read_csv_lines(csv_file):
    """Returns the rows of the top-sites CSV as a list of lines.

    Args:
        csv_file: Path of a local CSV file. If it is None or empty the ZIP at
            TOP1M_CSV_ZIP_LOCATION is downloaded and the member named
            TOP1M_CSV_FILE_NAME is read from it instead.

    Returns:
        The list returned by readlines() on the CSV.
    """
    if csv_file:
        # Use a context manager so the handle is closed deterministically.
        with open(csv_file) as csv_handle:
            return csv_handle.readlines()

    # Download the zip file in memory and extract its contents.
    usock = urllib.urlopen(TOP1M_CSV_ZIP_LOCATION)
    try:
        zip_bytes = usock.read()
    finally:
        usock.close()
    myzipfile = zipfile.ZipFile(StringIO(zip_bytes))
    try:
        return myzipfile.open(TOP1M_CSV_FILE_NAME).readlines()
    finally:
        myzipfile.close()


def _get_qualified_websites(csv_contents, start_number, end_number):
    """Returns the URLs for rows start_number..end_number (1-based, inclusive).

    The second comma-separated field of each row is used. A value that already
    starts with http:// or https:// is kept verbatim; anything else is turned
    into 'http://www.<value>'.

    NOTE(review): the --csv_file help text describes a third column holding
    the fully qualified URL, but only the second column is read here, exactly
    as in the previous version of this script. Confirm which is intended.

    Args:
        csv_contents: List of CSV lines.
        start_number: 1-based index of the first row to use.
        end_number: 1-based index of the last row to use.

    Returns:
        A list of URL strings, one per selected row.

    Raises:
        Exception: If a selected row has fewer than two fields.
    """
    websites = []
    selected_rows = csv_contents[start_number - 1:end_number]
    for row_number, line in enumerate(selected_rows, start_number):
        fields = line.strip().split(',')
        if len(fields) < 2:
            # Report the offending row instead of failing with an IndexError.
            raise Exception(
                'Row %d of the CSV has no website column: %r' %
                (row_number, line))
        website = fields[1]
        if website.startswith(('https://', 'http://')):
            websites.append(website)
        else:
            websites.append('http://www.%s' % website)
    return websites


def _build_page_set_content(useragent_type, archive_data_file, websites):
    """Returns the Python source text of the generated page set module.

    Args:
        useragent_type: Value written as user_agent_type in the page set.
        archive_data_file: Value written as archive_data_file in the page set.
        websites: List of URLs written as urls_list in the page set.

    Returns:
        The generated module source as a string (it starts with a newline).
    """
    # The string values are interpolated with %r so that quotes or backslashes
    # in them are escaped and the generated file always stays valid Python.
    return """
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# pylint: disable=W0401,W0614
from telemetry.page import page as page_module
from telemetry.page import page_set as page_set_module
from page_sets import repaint_helpers
class TypicalAlexaPage(page_module.Page):
    def __init__(self, url, page_set):
        super(TypicalAlexaPage, self).__init__(url=url, page_set=page_set)
        self.user_agent_type = %r
        self.archive_data_file = %r
    def RunPageInteractions(self, action_runner):
        repaint_helpers.Repaint(action_runner)
class TypicalAlexaPageSet(page_set_module.PageSet):
    def __init__(self):
        super(TypicalAlexaPageSet, self).__init__(
            user_agent_type=%r,
            archive_data_file=%r)
        urls_list = %s
        for url in urls_list:
            self.AddPage(TypicalAlexaPage(url, self))
""" % (useragent_type, archive_data_file, useragent_type,
       archive_data_file, str(websites))


def main(argv=None):
    """Parses the command line and writes one page set file.

    The output is written to page_sets/alexa<start>-<end>.py relative to the
    current working directory; the page_sets directory must already exist.

    Args:
        argv: Optional list of command-line arguments. When None, optparse
            falls back to sys.argv[1:].

    Raises:
        Exception: If the start/end numbers are inconsistent, the end number
            exceeds the number of CSV rows, or a selected row is malformed.
        ValueError: If the start or end number is not an integer.
    """
    option_parser = optparse.OptionParser()
    option_parser.add_option(
        '-s', '--start_number',
        help='Specifies where to start with when adding the top webpages to the '
             'page_set.',
        default='1')
    option_parser.add_option(
        '-e', '--end_number',
        help='Specifies where to end with when adding the top webpages to the '
             'page_set',
        default='10000')
    option_parser.add_option(
        '-c', '--csv_file',
        help='Location of a filtered alexa top 1M CSV file. Each row should '
             'have 3 entries, 1st will be rank, 2nd will be domain name and '
             'third will be the fully qualified url. If the third section is '
             'missing then a page_set for the URL will not be generated. If '
             'csv_file is not specified then this script downloads it from the '
             'internet.',
        default=None)
    option_parser.add_option(
        '-p', '--pagesets_type',
        help='The type of pagesets to create from the 1M list. Eg: All, '
             '100k, 10k, IndexSample10k, Mobile10k',
        default='All')
    option_parser.add_option(
        '-u', '--useragent_type',
        help='The type of user agent to use in the pagesets. Eg: desktop, '
             'mobile, tablet',
        default='desktop')
    options, unused_args = option_parser.parse_args(argv)

    # Validate arguments; each number is converted exactly once.
    start_number = int(options.start_number)
    if start_number <= 0:
        raise Exception('The -s/--start_number must be greater than 0')
    end_number = int(options.end_number)
    if start_number > end_number:
        raise Exception('The -s/--start_number must be less than or equal to '
                        '-e/--end_number')

    csv_contents = _read_csv_lines(options.csv_file)

    # Validate options.end_number.
    if end_number > len(csv_contents):
        raise Exception(
            'Please specify -e/--end_number less than or equal to %s' %
            len(csv_contents))

    websites = _get_qualified_websites(csv_contents, start_number, end_number)

    # File names keep the option strings exactly as typed by the user.
    archive_data_file = os.path.join(
        '/', 'b', 'storage', 'webpages_archive',
        options.pagesets_type,
        'alexa%s-%s.json' % (options.start_number, options.end_number))
    page_set_content = _build_page_set_content(
        options.useragent_type, archive_data_file, websites)

    # Output the pageset to a file.
    output_path = os.path.join(
        'page_sets',
        'alexa%s-%s.py' % (options.start_number, options.end_number))
    with open(output_path, 'w') as outfile:
        outfile.write(page_set_content)


if __name__ == '__main__':
    main()