#!/usr/bin/env python
# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Downloads CSV of top 1M webpages and creates a JSON telemetry page_set.
This module performs the following steps:
* Downloads a ZIP from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
* Unpacks it and reads its contents in memory.
* Writes out a JSON page_set from the CSV file covering the specified range of
  webpages.
Note: Blacklisted webpages will not be added to the output JSON page_set. If
you request 100 webpages and 5 of them are blacklisted, then the page_set will
only contain 95 webpages.
Sample Usage:
  python create_page_set.py -s 1 -e 10000
Running the above command will create a single page set containing the top
10000 webpages.
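For reference, the generated page_set has roughly the following shape (the
field values below are illustrative, not literal output):
  {
    "_comment": "Generated on <timestamp> by <user> using create_page_set.py",
    "description": "Top 1-10000 Alexa global.",
    "archive_data_file": "/home/default/storage/webpages_archive/All/alexa1-10000.json",
    "pages": [
      {
        "url": "http://www.example.com",
        "why": "#1 in Alexa global.",
        "navigate_steps": [
          {"action": "navigate"},
          {"action": "wait", "seconds": 5}
        ]
      }
    ],
    "smoothness": {"action": "scroll"},
    "user_agent_type": "desktop"
  }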
The output page sets are intended to be used by the webpages_playback.py
script.
Sample usage of the webpages_playback.py script with the output page sets:
  python webpages_playback.py --record=True \
      --page_sets=../../tools/page_sets/*.json \
      --do_not_upload_to_gs=True --output_dir=/network/accessible/mount/point/
"""
__author__ = 'Ravi Mistry'
import getpass
import json
import optparse
import os
import urllib
import zipfile
from datetime import datetime
from StringIO import StringIO
TOP1M_CSV_FILE_NAME = 'top-1m.csv'
TOP1M_CSV_ZIP_LOCATION = (
'http://s3.amazonaws.com/alexa-static/%s.zip' % TOP1M_CSV_FILE_NAME)
ALEXA_PREFIX = 'alexa'
if __name__ == '__main__':
option_parser = optparse.OptionParser()
option_parser.add_option(
'-s', '--start_number',
      help='Specifies where to start when adding the top webpages to the '
           'page_set.',
default='1')
option_parser.add_option(
'-e', '--end_number',
      help='Specifies where to end when adding the top webpages to the '
           'page_set.',
default='10000')
option_parser.add_option(
'-b', '--blacklist',
      help='Location of a blacklist file which specifies which webpages '
           'should not be converted into page_sets.',
default='')
option_parser.add_option(
'-c', '--csv_file',
      help='Location of a filtered Alexa top 1M CSV file. Each row should '
           'have 3 entries: the 1st is the rank, the 2nd is the domain name '
           'and the 3rd is the fully qualified URL. If the 3rd entry is '
           'missing then a page_set for that URL will not be generated. If '
           'csv_file is not specified then this script downloads the list '
           'from the internet.',
default=None)
option_parser.add_option(
'-p', '--pagesets_type',
      help='The type of pagesets to create from the 1M list. E.g. All, '
           'Filtered, 100k, 10k, Deeplinks.',
default='All')
options, unused_args = option_parser.parse_args()
# Validate arguments.
if int(options.start_number) <= 0:
raise Exception('The -s/--start_number must be greater than 0')
if int(options.start_number) > int(options.end_number):
raise Exception('The -s/--start_number must be less than or equal to '
'-e/--end_number')
if options.csv_file:
csv_contents = open(options.csv_file).readlines()
else:
    # Download the zip file into memory and extract its contents.
usock = urllib.urlopen(TOP1M_CSV_ZIP_LOCATION)
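    # zipfile needs a seekable file-like object, so buffer the whole HTTP
    # response in a StringIO before opening it as an archive.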
myzipfile = zipfile.ZipFile(StringIO(usock.read()))
csv_contents = myzipfile.open(TOP1M_CSV_FILE_NAME).readlines()
# Validate options.end_number.
if int(options.end_number) > len(csv_contents):
raise Exception('Please specify -e/--end_number less than or equal to %s' %
len(csv_contents))
# Populate the JSON dictionary.
pages = []
json_dict = {
'_comment': 'Generated on %s by %s using create_page_set.py' % (
datetime.now(), getpass.getuser()),
'description': 'Top %s-%s Alexa global.' % (options.start_number,
options.end_number),
'archive_data_file': os.path.join(
'/', 'home', 'default', 'storage', 'webpages_archive',
options.pagesets_type,
'alexa%s-%s.json' % (options.start_number, options.end_number)),
'pages': pages,
'smoothness': { 'action': 'scroll'},
'user_agent_type': 'desktop',
}
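  # 'archive_data_file' names where the recorded archive for these pages is
  # expected to live; 'smoothness' and 'user_agent_type' tell telemetry how to
  # exercise them (a reading of the field names, not verified here).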
blacklisted_webpages = (open(options.blacklist).readlines()
if options.blacklist else [])
for index in xrange(int(options.start_number) - 1, int(options.end_number)):
line = csv_contents[index]
if options.csv_file:
try:
(unused_number, website, qualified_website) = line.strip().split(',')
except ValueError:
print '%s is not mapped to a qualified website.' % (
line.strip().split(',')[1])
continue
else:
(unused_number, website) = line.strip().split(',')
# Qualified website was not provided in the CSV, construct it.
qualified_website = 'http://www.%s' % website
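    # Build a filename of the form alexa<rank>_<domain>_desktop, replacing
    # '.' and '/' in the domain with '-'.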
website_filename = '%s%s_%s_desktop' % (
ALEXA_PREFIX, index + 1, website.replace('.', '-').replace('/', '-'))
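    # A blacklist entry skips a page if it appears anywhere in the generated
    # filename (substring match, not an exact match).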
skip_webpage = False
for blacklisted_webpage in blacklisted_webpages:
if blacklisted_webpage.rstrip() in website_filename:
skip_webpage = True
break
if skip_webpage:
print 'Skipping %s because it is in the provided blacklist file!' % (
website_filename)
continue
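    # Each page is navigated to and then given 5 seconds to settle before any
    # further action (such as the scroll configured above) runs.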
pages.append({
'url': qualified_website,
'why': '#%s in Alexa global.' % (index + 1),
'navigate_steps': [
{'action': 'navigate'},
{'action': 'wait', 'seconds': 5}
]
})
# Output the JSON dictionary to a file.
  try:
    with open(os.path.join('page_sets', 'alexa%s-%s.json' % (
                  options.start_number, options.end_number)),
              'w') as outfile:
      json.dump(json_dict, outfile, indent=4)
  except Exception as e:
    print 'Failed to write the output page_set because of Exception: %s' % e