#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

|  | """Creates a Python telemetry page_set from the specified webpages CSV. | 
|  |  | 
|  | This module does the following steps: | 
|  | * Downloads a ZIP from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip | 
|  | * Unpacks it and reads its contents in memory. | 
|  | * Writes out multiple Python page sets from the CSV file for the specified | 
|  | number of webpages. | 
|  |  | 
|  | Sample Usage: | 
|  | python create_page_set.py -s 1 -e 10000 | 
|  |  | 
|  | Running the above command will create 10000 different page sets. | 
|  | """ | 

__author__ = 'Ravi Mistry'

import optparse
import os
import urllib
import zipfile

from StringIO import StringIO


TOP1M_CSV_FILE_NAME = 'top-1m.csv'
TOP1M_CSV_ZIP_LOCATION = (
    'http://s3.amazonaws.com/alexa-static/%s.zip' % TOP1M_CSV_FILE_NAME)
ALEXA_PREFIX = 'alexa'

if __name__ == '__main__':
  option_parser = optparse.OptionParser()
  option_parser.add_option(
      '-s', '--start_number',
      help='Specifies the rank of the first webpage to add to the page_set.',
      default='1')
  option_parser.add_option(
      '-e', '--end_number',
      help='Specifies the rank of the last webpage to add to the page_set.',
      default='10000')
  option_parser.add_option(
      '-c', '--csv_file',
      help='Location of a filtered alexa top 1M CSV file. Each row should '
           'have at least 2 entries: the 1st is the rank and the 2nd is the '
           'domain name or a fully qualified URL. If csv_file is not '
           'specified then this script downloads it from the internet.',
      default=None)
  option_parser.add_option(
      '-p', '--pagesets_type',
      help='The type of pagesets to create from the 1M list. Eg: All, '
           '100k, 10k, IndexSample10k, Mobile10k',
      default='All')
  option_parser.add_option(
      '-u', '--useragent_type',
      help='The type of user agent to use in the pagesets. Eg: desktop, '
           'mobile, tablet',
      default='desktop')
  options, unused_args = option_parser.parse_args()

  # Validate arguments.
  if int(options.start_number) <= 0:
    raise Exception('The -s/--start_number must be greater than 0')
  if int(options.start_number) > int(options.end_number):
    raise Exception('The -s/--start_number must be less than or equal to '
                    '-e/--end_number')

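  # Read the webpages CSV, either from a local file (-c) or by downloading
  # the Alexa top-1M ZIP and extracting it in memory.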
  if options.csv_file:
    csv_contents = open(options.csv_file).readlines()
  else:
    # Download the zip file in memory and extract its contents.
    usock = urllib.urlopen(TOP1M_CSV_ZIP_LOCATION)
    myzipfile = zipfile.ZipFile(StringIO(usock.read()))
    csv_contents = myzipfile.open(TOP1M_CSV_FILE_NAME).readlines()

  # Validate options.end_number.
  if int(options.end_number) > len(csv_contents):
    raise Exception('Please specify -e/--end_number less than or equal to %s' %
                    len(csv_contents))

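  # Build the list of fully qualified URLs for the requested rank range. Each
  # CSV row looks like "1,google.com" (rank,domain); bare domains are prefixed
  # with 'http://www.' while entries that already start with http:// or
  # https:// are used as-is.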
  websites = []
  for index in xrange(int(options.start_number) - 1, int(options.end_number)):
    line = csv_contents[index]
    website = line.strip().split(',')[1]
    if website.startswith('https://') or website.startswith('http://'):
      qualified_website = website
    else:
      qualified_website = 'http://www.%s' % website
    websites.append(qualified_website)

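  # Path of the webpage replay archive the generated page set will reference,
  # e.g. /b/storage/webpages_archive/All/alexa1-10000.json with the default
  # options.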
  archive_data_file = os.path.join(
      '/', 'b', 'storage', 'webpages_archive',
      options.pagesets_type,
      'alexa%s-%s.json' % (options.start_number, options.end_number))

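  # Template of the generated Telemetry page_set module. The placeholders are
  # filled in below with the user agent type, the archive path and the list
  # of URLs collected above.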
  page_set_content = """
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
# pylint: disable=W0401,W0614

from telemetry.page import page as page_module
from telemetry.page import page_set as page_set_module
from page_sets import repaint_helpers


class TypicalAlexaPage(page_module.Page):

  def __init__(self, url, page_set):
    super(TypicalAlexaPage, self).__init__(url=url, page_set=page_set)
    self.user_agent_type = '%s'
    self.archive_data_file = '%s'

  def RunPageInteractions(self, action_runner):
    repaint_helpers.Repaint(action_runner)


class TypicalAlexaPageSet(page_set_module.PageSet):

  def __init__(self):
    super(TypicalAlexaPageSet, self).__init__(
        user_agent_type='%s',
        archive_data_file='%s')

    urls_list = %s

    for url in urls_list:
      self.AddPage(TypicalAlexaPage(url, self))
""" % (options.useragent_type, archive_data_file, options.useragent_type,
       archive_data_file, str(websites))

  # Output the pageset to a file.
  with open(os.path.join('page_sets', 'alexa%s-%s.py' % (
      options.start_number, options.end_number)),
            'w') as outfile:
    outfile.write(page_set_content)