| #!/usr/bin/env python |
| # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| """Creates a Python telemetry page_set from the specified webpages CSV. |
| |
This module performs the following steps:
* Downloads a ZIP from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
  (skipped if a local CSV is supplied via -c/--csv_file).
* Unpacks it and reads its contents in memory.
* Writes out a Python page_set containing the specified range of webpages.
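
The CSV contains one '<rank>,<website>' row per webpage, for example
(illustrative rows, not real rankings):
  1,google.com
  2,facebook.com
  3,youtube.com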
| |
| Sample Usage: |
| python create_page_set.py -s 1 -e 10000 |
| |
Running the above command creates a single page_set file containing the top
10000 webpages.
| """ |
| |
| __author__ = 'Ravi Mistry' |
| |
| import optparse |
| import os |
| import urllib |
| import zipfile |
| |
| from StringIO import StringIO |
| |
| |
| TOP1M_CSV_FILE_NAME = 'top-1m.csv' |
| TOP1M_CSV_ZIP_LOCATION = ( |
| 'http://s3.amazonaws.com/alexa-static/%s.zip' % TOP1M_CSV_FILE_NAME) |
| ALEXA_PREFIX = 'alexa' |
| |
| |
if __name__ == '__main__':
| option_parser = optparse.OptionParser() |
| option_parser.add_option( |
| '-s', '--start_number', |
      help='The rank of the first webpage to add to the page_set '
           '(1-based, inclusive).',
| default='1') |
| option_parser.add_option( |
| '-e', '--end_number', |
      help='The rank of the last webpage to add to the page_set '
           '(inclusive).',
| default='10000') |
| option_parser.add_option( |
| '-c', '--csv_file', |
      help='Location of a filtered Alexa top-1M CSV file. Each row should '
           'have at least two comma-separated entries: the rank followed '
           'by the domain name or a fully qualified URL. Websites without '
           'a scheme are prefixed with http://www. If csv_file is not '
           'specified then this script downloads the CSV from the '
           'internet.',
| default=None) |
| option_parser.add_option( |
| '-p', '--pagesets_type', |
      help='The type of pagesets to create from the 1M list. E.g. All, '
           '100k, 10k, IndexSample10k, Mobile10k.',
| default='All') |
| option_parser.add_option( |
| '-u', '--useragent_type', |
      help='The type of user agent to use in the pagesets. E.g. desktop, '
           'mobile, tablet.',
| default='desktop') |
| options, unused_args = option_parser.parse_args() |
| |
| # Validate arguments. |
| if int(options.start_number) <= 0: |
| raise Exception('The -s/--start_number must be greater than 0') |
| if int(options.start_number) > int(options.end_number): |
| raise Exception('The -s/--start_number must be less than or equal to ' |
| '-e/--end_number') |
| |
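  # Obtain the CSV rows, either from a user-supplied file or by downloading
  # the Alexa top-1M list.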
| if options.csv_file: |
    with open(options.csv_file) as local_csv:
      csv_contents = local_csv.readlines()
| else: |
    # Download the zip file in memory and extract its contents.
| usock = urllib.urlopen(TOP1M_CSV_ZIP_LOCATION) |
| myzipfile = zipfile.ZipFile(StringIO(usock.read())) |
| csv_contents = myzipfile.open(TOP1M_CSV_FILE_NAME).readlines() |
| |
| # Validate options.end_number. |
| if int(options.end_number) > len(csv_contents): |
| raise Exception('Please specify -e/--end_number less than or equal to %s' % |
| len(csv_contents)) |
| |
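  # Build the list of fully qualified URLs. The website is taken from the
  # second CSV column; bare domains are prefixed with 'http://www.'.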
| websites = [] |
| for index in xrange(int(options.start_number) - 1, int(options.end_number)): |
| line = csv_contents[index] |
| website = line.strip().split(',')[1] |
| if website.startswith('https://') or website.startswith('http://'): |
| qualified_website = website |
| else: |
| qualified_website = 'http://www.%s' % website |
| websites.append(qualified_website) |
| |
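  # Hard-coded location on the build machine of the JSON archive_data_file
  # used by the generated page set.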
  archive_data_file = os.path.join(
      '/', 'b', 'storage', 'webpages_archive',
      options.pagesets_type,
      '%s%s-%s.json' % (ALEXA_PREFIX, options.start_number,
                        options.end_number))
| |
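  # Template for the generated Telemetry page_set module. The five %s
  # placeholders are filled below with the user agent type and the archive
  # data file (each used twice), followed by the list of URLs.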
| page_set_content = """ |
| # Copyright 2014 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| # pylint: disable=W0401,W0614 |
| |
| from telemetry.page import page as page_module |
| from telemetry.page import page_set as page_set_module |
| from page_sets import repaint_helpers |
| |
| |
| class TypicalAlexaPage(page_module.Page): |
| |
| def __init__(self, url, page_set): |
| super(TypicalAlexaPage, self).__init__(url=url, page_set=page_set) |
| self.user_agent_type = '%s' |
| self.archive_data_file = '%s' |
| |
| def RunPageInteractions(self, action_runner): |
| repaint_helpers.Repaint(action_runner) |
| |
| |
| class TypicalAlexaPageSet(page_set_module.PageSet): |
| |
| def __init__(self): |
| super(TypicalAlexaPageSet, self).__init__( |
| user_agent_type='%s', |
| archive_data_file='%s') |
| |
| urls_list = %s |
| |
| for url in urls_list: |
| self.AddPage(TypicalAlexaPage(url, self)) |
| """ % (options.useragent_type, archive_data_file, options.useragent_type, |
| archive_data_file, str(websites)) |
| |
| # Output the pageset to a file. |
  with open(os.path.join('page_sets', '%s%s-%s.py' % (
      ALEXA_PREFIX, options.start_number, options.end_number)),
| 'w') as outfile: |
| outfile.write(page_set_content) |
| |