| # |
| # sources.py |
| # |
| # Convert source code comments to multi-line blocks (library file). |
| # |
| # Copyright 2002-2018 by |
| # David Turner. |
| # |
| # This file is part of the FreeType project, and may only be used, |
| # modified, and distributed under the terms of the FreeType project |
| # license, LICENSE.TXT. By continuing to use, modify, or distribute |
| # this file you indicate that you have read the license and |
| # understand and accept it fully. |
| |
| """Utility for parsing source files. |
| |
| This library file contains definitions of classes needed to decompose C |
| source code files into a series of multi-line 'blocks'. There are two |
| kinds of blocks. |
| |
| * Normal blocks, which contain source code or ordinary comments. |
| |
| * Documentation blocks, which have restricted formatting, and whose text |
| always start with a documentation markup tag like `<Function>', |
| `<Type>', etc. |
| |
| The routines to process the content of documentation blocks are contained |
| in file `content.py'; the classes and methods found here only deal with |
| text parsing and basic documentation block extraction. |
| """ |
| |
| from __future__ import print_function |
| |
| import fileinput |
| import logging |
| import re |
| |
| log = logging.getLogger( __name__ ) |
| |
| ################################################################ |
| ## |
| ## SOURCE BLOCK FORMAT CLASS |
| ## |
| ## A simple class containing compiled regular expressions to detect |
| ## potential documentation format block comments within C source code. |
| ## |
| ## The `column' pattern must contain a group to `unbox' the content of |
| ## documentation comment blocks. |
| ## |
| ## Later on, paragraphs are converted to long lines, which simplifies the |
| ## regular expressions that act upon the text. |
| ## |
| class SourceBlockFormat( object ): |
| |
| def __init__( self, iden, start, column, end ): |
| """Create a block pattern, used to recognize special documentation |
| blocks.""" |
| self.id = iden |
| self.start = re.compile( start, re.VERBOSE ) |
| self.column = re.compile( column, re.VERBOSE ) |
| self.end = re.compile( end, re.VERBOSE ) |
| |
| |
| # |
| # Format 1 documentation comment blocks. |
| # |
| # /************************************/ (at least 2 asterisks) |
| # /* */ |
| # /* */ |
| # /* */ |
| # /************************************/ (at least 2 asterisks) |
| # |
| start = r''' |
| \s* # any number of whitespace |
| /\*{2,}/ # followed by '/' and at least two asterisks then '/' |
| \s*$ # probably followed by whitespace |
| ''' |
| |
| column = r''' |
| \s* # any number of whitespace |
| /\*{1} # followed by '/' and precisely one asterisk |
| ([^*].*) # followed by anything (group 1) |
| \*{1}/ # followed by one asterisk and a '/' |
| \s*$ # probably followed by whitespace |
| ''' |
| |
| re_source_block_format1 = SourceBlockFormat( 1, start, column, start ) |
| |
| |
| # |
| # Format 2 documentation comment blocks. |
| # |
| # /************************************ (at least 2 asterisks) |
| # * |
| # * (1 asterisk) |
| # * |
| # */ (1 or more asterisks) |
| # |
| start = r''' |
| \s* # any number of whitespace |
| /\*{2,} # followed by '/' and at least two asterisks |
| \s*$ # probably followed by whitespace |
| ''' |
| |
| column = r''' |
| \s* # any number of whitespace |
| \*{1}(?![*/]) # followed by precisely one asterisk not followed by `/' |
| (.*) # then anything (group1) |
| ''' |
| |
| end = r''' |
| \s* # any number of whitespace |
| \*+/ # followed by at least one asterisk, then '/' |
| ''' |
| |
| re_source_block_format2 = SourceBlockFormat( 2, start, column, end ) |
| |
| |
| # |
| # The list of supported documentation block formats. We could add new ones |
| # quite easily. |
| # |
| re_source_block_formats = [re_source_block_format1, re_source_block_format2] |
| |
| |
| # |
| # The following regular expressions correspond to markup tags within the |
| # documentation comment blocks. They are equivalent despite their different |
| # syntax. |
| # |
| # A markup tag consists of letters or character `-', to be found in group 1. |
| # |
| # Notice that a markup tag _must_ begin a new paragraph. |
| # |
| re_markup_tag1 = re.compile( r'''\s*<((?:\w|-)*)>''' ) # <xxxx> format |
| re_markup_tag2 = re.compile( r'''\s*@((?:\w|-)*):''' ) # @xxxx: format |
| |
| # |
| # The list of supported markup tags. We could add new ones quite easily. |
| # |
| re_markup_tags = [re_markup_tag1, re_markup_tag2] |
| |
| |
| # |
| # A regular expression to detect a cross reference, after markup tags have |
| # been stripped off. |
| # |
| # Two syntax forms are supported: |
| # |
| # @<name> |
| # @<name>[<id>] |
| # |
| # where both `<name>' and `<id>' consist of alphanumeric characters, `_', |
| # and `-'. Use `<id>' if there are multiple, valid `<name>' entries. |
| # |
| # Example: @foo[bar] |
| # |
| re_crossref = re.compile( r""" |
| @ |
| (?P<name>(?:\w|-)+ |
| (?:\[(?:\w|-)+\])?) |
| (?P<rest>.*) |
| """, re.VERBOSE ) |
| |
| # |
| # Two regular expressions to detect italic and bold markup, respectively. |
| # Group 1 is the markup, group 2 the rest of the line. |
| # |
| # Note that the markup is limited to words consisting of letters, digits, |
| # the characters `_' and `-', or an apostrophe (but not as the first |
| # character). |
| # |
| re_italic = re.compile( r"_((?:\w|-)(?:\w|'|-)*)_(.*)" ) # _italic_ |
| re_bold = re.compile( r"\*((?:\w|-)(?:\w|'|-)*)\*(.*)" ) # *bold* |
| |
| # |
| # This regular expression code to identify an URL has been taken from |
| # |
| # https://mail.python.org/pipermail/tutor/2002-September/017228.html |
| # |
| # (with slight modifications). |
| # |
| urls = r'(?:https?|telnet|gopher|file|wais|ftp)' |
| ltrs = r'\w' |
| gunk = r'/#~:.?+=&%@!\-' |
| punc = r'.:?\-' |
| any_sym = "%(ltrs)s%(gunk)s%(punc)s" % { 'ltrs' : ltrs, |
| 'gunk' : gunk, |
| 'punc' : punc } |
| url = r""" |
| ( |
| \b(?![<]) # start at word boundary ignore if < found |
| %(urls)s : # need resource and a colon |
| [%(any)s] +? # followed by one or more of any valid |
| # character, but be conservative and |
| # take only what you need to... |
| (?= # [look-ahead non-consumptive assertion] |
| [%(punc)s]* # either 0 or more punctuation |
| (?: # [non-grouping parentheses] |
| [^%(any)s] | $ # followed by a non-url char |
| # or end of the string |
| ) |
| ) |
| ) |
| """ % {'urls' : urls, |
| 'any' : any_sym, |
| 'punc' : punc } |
| |
| re_url = re.compile( url, re.VERBOSE | re.MULTILINE ) |
| |
| # |
| # A regular expression that stops collection of comments for the current |
| # block. |
| # |
| re_source_sep = re.compile( r'\s*/\*\s*\*/' ) # /* */ |
| |
| # |
| # A regular expression to find possible C identifiers while outputting |
| # source code verbatim, covering things like `*foo' or `(bar'. Group 1 is |
| # the prefix, group 2 the identifier -- since we scan lines from left to |
| # right, sequentially splitting the source code into prefix and identifier |
| # is fully sufficient for our purposes. |
| # |
| re_source_crossref = re.compile( r'(\W*)(\w*)' ) |
| |
| # |
| # A regular expression that matches a list of reserved C source keywords. |
| # |
| re_source_keywords = re.compile( '''\\b ( typedef | |
| struct | |
| enum | |
| union | |
| const | |
| char | |
| int | |
| short | |
| long | |
| void | |
| signed | |
| unsigned | |
| include | |
| define | |
| undef | |
| if | |
| ifdef | |
| ifndef | |
| else | |
| endif ) \\b''', re.VERBOSE ) |
| |
| |
| ################################################################ |
| ## |
| ## SOURCE BLOCK CLASS |
| ## |
| ## There are two important fields in a `SourceBlock' object. |
| ## |
| ## self.lines |
| ## A list of text lines for the corresponding block. |
| ## |
| ## self.content |
| ## For documentation comment blocks only, this is the block content |
| ## that has been `unboxed' from its decoration. This is `None' for all |
| ## other blocks (i.e., sources or ordinary comments with no starting |
| ## markup tag) |
| ## |
| class SourceBlock( object ): |
| |
| def __init__( self, processor, filename, lineno, lines ): |
| self.processor = processor |
| self.filename = filename |
| self.lineno = lineno |
| self.lines = lines[:] |
| self.format = processor.format |
| self.content = [] |
| |
| if self.format == None: |
| return |
| |
| # extract comment lines |
| lines = [] |
| |
| for line0 in self.lines: |
| m = self.format.column.match( line0 ) |
| if m: |
| lines.append( m.group( 1 ) ) |
| |
| # now, look for a markup tag |
| for l in lines: |
| l = l.strip() |
| if len( l ) > 0: |
| for tag in re_markup_tags: |
| if tag.match( l ): |
| self.content = lines |
| return |
| |
| def location( self ): |
| return "(" + self.filename + ":" + repr( self.lineno ) + ")" |
| |
| # debugging only -- not used in normal operations |
| def dump( self ): |
| if self.content: |
| print( "{{{content start---" ) |
| for l in self.content: |
| print( l ) |
| print( "---content end}}}" ) |
| return |
| |
| for line in self.lines: |
| print( line ) |
| |
| |
| ################################################################ |
| ## |
| ## SOURCE PROCESSOR CLASS |
| ## |
| ## The `SourceProcessor' is in charge of reading a C source file and |
| ## decomposing it into a series of different `SourceBlock' objects. |
| ## |
| ## A SourceBlock object consists of the following data. |
| ## |
| ## - A documentation comment block using one of the layouts above. Its |
| ## exact format will be discussed later. |
| ## |
| ## - Normal sources lines, including comments. |
| ## |
| ## |
| class SourceProcessor( object ): |
| |
| def __init__( self ): |
| """Initialize a source processor.""" |
| self.blocks = [] |
| self.filename = None |
| self.format = None |
| self.lines = [] |
| |
| def reset( self ): |
| """Reset a block processor and clean up all its blocks.""" |
| self.blocks = [] |
| self.format = None |
| |
| def parse_file( self, filename ): |
| """Parse a C source file and add its blocks to the processor's |
| list.""" |
| self.reset() |
| |
| self.filename = filename |
| log.debug( "Parsing file %s.", filename ) |
| |
| fileinput.close() |
| self.format = None |
| self.lineno = 0 |
| self.lines = [] |
| |
| for line in fileinput.input( filename ): |
| # strip trailing newlines, important on Windows machines! |
| if line[-1] == '\012': |
| line = line[0:-1] |
| |
| if self.format == None: |
| self.process_normal_line( line ) |
| else: |
| if self.format.end.match( line ): |
| # A normal block end. Add it to `lines' and create a |
| # new block |
| self.lines.append( line ) |
| self.add_block_lines() |
| elif self.format.column.match( line ): |
| # A normal column line. Add it to `lines'. |
| self.lines.append( line ) |
| else: |
| # An unexpected block end. Create a new block, but |
| # don't process the line. |
| self.add_block_lines() |
| |
| # we need to process the line again |
| self.process_normal_line( line ) |
| |
| # record the last lines |
| self.add_block_lines() |
| |
| def process_normal_line( self, line ): |
| """Process a normal line and check whether it is the start of a new |
| block.""" |
| for f in re_source_block_formats: |
| if f.start.match( line ): |
| self.add_block_lines() |
| self.format = f |
| self.lineno = fileinput.filelineno() |
| |
| self.lines.append( line ) |
| |
| def add_block_lines( self ): |
| """Add the current accumulated lines and create a new block.""" |
| if self.lines != []: |
| block = SourceBlock( self, |
| self.filename, |
| self.lineno, |
| self.lines ) |
| |
| self.blocks.append( block ) |
| self.format = None |
| self.lines = [] |
| |
| # debugging only, not used in normal operations |
| def dump( self ): |
| """Print all blocks in a processor.""" |
| for b in self.blocks: |
| b.dump() |
| |
| # eof |