Merge pull request #282 from cwoffenden/faster-amalgamate

Replace the slow single-file shell script with a faster Python equivalent
diff --git a/contrib/single_file_transcoder/README.md b/contrib/single_file_transcoder/README.md
index d37788e..80c7fe1 100644
--- a/contrib/single_file_transcoder/README.md
+++ b/contrib/single_file_transcoder/README.md
@@ -1,30 +1,30 @@
 # Single File Basis Universal Transcoder
 
-The script `combine.sh` creates an _amalgamated_ C++ source file that can be used with or without `basisu_transcoder.h`. This _isn't_ a header-only file but it does offer a similar level of simplicity when integrating into a project.
+The script `combine.py` creates an _amalgamated_ C++ source file that can be used with or without `basisu_transcoder.h`. This _isn't_ a header-only file but it does offer a similar level of simplicity when integrating into a project.
 
 Create `basisu_transcoder.cpp` from the transcoder sources using:
 ```
 cd basis_universal/contrib/single_file_transcoder
 
-./combine.sh -r ../../transcoder -o basisu_transcoder.cpp basisu_transcoder-in.cpp
+python3 combine.py -r ../../transcoder -o basisu_transcoder.cpp basisu_transcoder-in.cpp
 ```
 Then add the resulting file to your project (see the [example files](examples)).
 
-If certain features will _never__ be enabled, e.g. `BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY`, then `combine.sh` can be told to exclude files with the `-x` option:
+If certain features will _never__ be enabled, e.g. `BASISD_SUPPORT_BC7_MODE6_OPAQUE_ONLY`, then `combine.py` can be told to exclude files with the `-x` option:
 ```
-./combine.sh -r ../../transcoder -x basisu_transcoder_tables_bc7_m6.inc -o basisu_transcoder.cpp basisu_transcoder-in.cpp
+python3 combine.py -r ../../transcoder -x basisu_transcoder_tables_bc7_m6.inc -o basisu_transcoder.cpp basisu_transcoder-in.cpp
 ```
 Excluding the BC7 mode 6 support reduces the generated source by 1.2MB, which is the choice taken in `basisu_transcoder-in.cpp` and used in the examples, with `create_transcoder.sh` running the above script, creating the final `basisu_transcoder.cpp`.
 
 The combiner script can also generate separate amalgamated header and source files, using the `-k` option to keep the specified inline directive, and `-p` to keep the `#pragma once` directives in the header:
 ```
-./combine.sh -r ../../transcoder -o basisu_transcoder.h -p ../../transcoder/basisu_transcoder.h
+python3 combine.py -r ../../transcoder -o basisu_transcoder.h -p ../../transcoder/basisu_transcoder.h
 
-./combine.sh -r ../../transcoder -x basisu_transcoder_tables_bc7_m6.inc -k basisu_transcoder.h -o basisu_transcoder.cpp basisu_transcoder-in.cpp 
+python3 combine.py -r ../../transcoder -x basisu_transcoder_tables_bc7_m6.inc -k basisu_transcoder.h -o basisu_transcoder.cpp basisu_transcoder-in.cpp 
 
 ```
 
-Note: the amalgamation script will run on pretty much anything but is _extremely_ slow on Windows with the `bash` included with Git.
+Note: the amalgamation script was tested on Windows and Mac, requiring Python 3.8, with a fallback implementation as a shell script that will run on pretty much anything.
 
 Why?
 ----
diff --git a/contrib/single_file_transcoder/basisu_transcoder-in.cpp b/contrib/single_file_transcoder/basisu_transcoder-in.cpp
index 830cd35..27bc278 100644
--- a/contrib/single_file_transcoder/basisu_transcoder-in.cpp
+++ b/contrib/single_file_transcoder/basisu_transcoder-in.cpp
@@ -1,7 +1,7 @@
 /**
  * Basis Universal single file library. Generated using:
  * \code
- *	./combine.sh -r ../../transcoder -o basisu_transcoder.cpp basisu_transcoder-in.cpp
+ *	./combine.py -r ../../transcoder -o basisu_transcoder.cpp basisu_transcoder-in.cpp
  * \endcode
  */
 
diff --git a/contrib/single_file_transcoder/combine.py b/contrib/single_file_transcoder/combine.py
new file mode 100755
index 0000000..3d1018d
--- /dev/null
+++ b/contrib/single_file_transcoder/combine.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python3
+
+# Tool to bundle multiple C/C++ source files, inlining any includes.
+# 
+# Note: there are two types of exclusion options: the '-x' flag, which besides
+# excluding a file also adds an #error directive in place of the #include, and
+# the '-k' flag, which keeps the #include and doesn't inline the file. The
+# intended use cases are: '-x' for files that would normally be #if'd out, so
+# features that 100% won't be used in the amalgamated file, for which every
+# occurrence adds the error, and '-k' for headers that we wish to manually
+# include, such as a project's public API, for which occurrences after the first
+# are removed.
+# 
+# Todo: the error handling could be better, which currently throws and halts
+# (which is functional just not very friendly).
+# 
+# Author: Carl Woffenden, Numfum GmbH (this script is released under a CC0 license/Public Domain)
+
+import argparse, re, sys
+
+from pathlib import Path
+from typing import Any, List, Optional, Pattern, Set, TextIO
+
+# Set of file roots when searching (equivalent to -I paths for the compiler).
+roots: Set[Path] = set()
+
+# Set of (canonical) file Path objects to exclude from inlining (and not only
+# exclude but to add a compiler error directive when they're encountered).
+excludes: Set[Path] = set()
+
+# Set of (canonical) file Path objects to keep as include directives.
+keeps: Set[Path] = set()
+
+# Whether to keep the #pragma once directives (unlikely, since this will result
+# in a warning, but the option is there).
+keep_pragma: bool = False
+
+# Destination file object (or stdout if no output file was supplied).
+destn:TextIO = sys.stdout
+
+# Set of file Path objects previously inlined (and to ignore if reencountering).
+found: Set[Path] = set()
+
+# Compiled regex Patern to handle the following type of file includes:
+# 
+#   #include "file"
+#     #include "file"
+#   #  include "file"
+#   #include   "file"
+#   #include "file" // comment
+#   #include "file" // comment with quote "
+# 
+# And all combinations of, as well as ignoring the following:
+# 
+#   #include <file>
+#   //#include "file"
+#   /*#include "file"*/
+# 
+# We don't try to catch errors since the compiler will do this (and the code is
+# expected to be valid before processing) and we don't care what follows the
+# file (whether it's a valid comment or not, since anything after the quoted
+# string is ignored)
+# 
+include_regex: Pattern = re.compile(r'^\s*#\s*include\s*"(.+?)"')
+
+# Simple tests to prove include_regex's cases.
+# 
+def test_match_include() -> bool:
+    if (include_regex.match('#include "file"')   and
+        include_regex.match('  #include "file"') and
+        include_regex.match('#  include "file"') and
+        include_regex.match('#include   "file"') and
+        include_regex.match('#include "file" // comment')):
+            if (not include_regex.match('#include <file>')   and
+                not include_regex.match('//#include "file"') and
+                not include_regex.match('/*#include "file"*/')):
+                    found = include_regex.match('#include "file" // "')
+                    if (found and found.group(1) == 'file'):
+                        print('#include match valid')
+                        return True
+    return False
+
+# Compiled regex Patern to handle "#pragma once" in various formats:
+# 
+#   #pragma once
+#     #pragma once
+#   #  pragma once
+#   #pragma   once
+#   #pragma once // comment
+# 
+# Ignoring commented versions, same as include_regex.
+# 
+pragma_regex: Pattern = re.compile(r'^\s*#\s*pragma\s*once\s*')
+
+# Simple tests to prove pragma_regex's cases.
+# 
+def text_match_pragma() -> bool:
+    if (pragma_regex.match('#pragma once')   and
+        pragma_regex.match('  #pragma once') and
+        pragma_regex.match('#  pragma once') and
+        pragma_regex.match('#pragma   once') and
+        pragma_regex.match('#pragma once // comment')):
+            if (not pragma_regex.match('//#pragma once') and
+                not pragma_regex.match('/*#pragma once*/')):
+                    print('#pragma once match valid')
+                    return True
+    return False
+
+# Finds 'file'. First the list of 'root' paths are searched, followed by the
+# the currently processing file's 'parent' path, returning a valid Path in
+# canonical form. If no match is found None is returned.
+# 
+def resolve_include(file: str, parent: Optional[Path] = None) -> Optional[Path]:
+    for root in roots:
+        found = root.joinpath(file).resolve()
+        if (found.is_file()):
+            return found
+    if (parent):
+        found = parent.joinpath(file).resolve();
+    else:
+        found = Path(file)
+    if (found.is_file()):
+        return found
+    return None
+
+# Helper to resolve lists of files. 'file_list' is passed in from the arguments
+# and each entry resolved to its canonical path (like any include entry, either
+# from the list of root paths or the owning file's 'parent', which in this case
+# is case is the input file). The results are stored in 'resolved'.
+# 
+def resolve_excluded_files(file_list: Optional[List[str]], resolved: Set[Path], parent: Optional[Path] = None) -> None:
+    if (file_list):
+        for filename in file_list:
+            found = resolve_include(filename, parent)
+            if (found):
+                resolved.add(found)
+            else:
+                error_line(f'Warning: excluded file not found: {filename}')
+
+# Writes 'line' to the open 'destn' (or stdout).
+# 
+def write_line(line: str) -> None:
+    print(line, file=destn)
+
+# Logs 'line' to stderr. This is also used for general notifications that we
+# don't want to go to stdout (so the source can be piped).
+# 
+def error_line(line: Any) -> None:
+    print(line, file=sys.stderr)
+
+# Inline the contents of 'file' (with any of its includes also inlined, etc.).
+# 
+# Note: text encoding errors are ignored and replaced with ? when reading the
+# input files. This isn't ideal, but it's more than likely in the comments than
+# code and a) the text editor has probably also failed to read the same content,
+# and b) the compiler probably did too.
+# 
+def add_file(file: Path, file_name: str = None) -> None:
+    if (file.is_file()):
+        if (not file_name):
+            file_name = file.name
+        error_line(f'Processing: {file_name}')
+        with file.open('r', errors='replace') as opened:
+            for line in opened:
+                line = line.rstrip('\n')
+                match_include = include_regex.match(line);
+                if (match_include):
+                    # We have a quoted include directive so grab the file
+                    inc_name = match_include.group(1)
+                    resolved = resolve_include(inc_name, file.parent)
+                    if (resolved):
+                        if (resolved in excludes):
+                            # The file was excluded so error if the compiler uses it
+                            write_line(f'#error Using excluded file: {inc_name}')
+                            error_line(f'Excluding: {inc_name}')
+                        else:
+                            if (resolved not in found):
+                                # The file was not previously encountered
+                                found.add(resolved)
+                                if (resolved in keeps):
+                                    # But the include was flagged to keep as included
+                                    write_line(f'/**** *NOT* inlining {inc_name} ****/')
+                                    write_line(line)
+                                    error_line(f'Not inlining: {inc_name}')
+                                else:
+                                    # The file was neither excluded nor seen before so inline it
+                                    write_line(f'/**** start inlining {inc_name} ****/')
+                                    add_file(resolved, inc_name)
+                                    write_line(f'/**** ended inlining {inc_name} ****/')
+                            else:
+                                write_line(f'/**** skipping file: {inc_name} ****/')
+                    else:
+                        # The include file didn't resolve to a file
+                        write_line(f'#error Unable to find: {inc_name}')
+                        error_line(f'Error: Unable to find: {inc_name}')
+                else:
+                    # Skip any 'pragma once' directives, otherwise write the source line
+                    if (keep_pragma or not pragma_regex.match(line)):
+                        write_line(line)
+    else:
+        error_line(f'Error: Invalid file: {file}')
+
+# Start here
+parser = argparse.ArgumentParser(description='Amalgamate Tool', epilog=f'example: {sys.argv[0]} -r ../my/path -r ../other/path -o out.c in.c')
+parser.add_argument('-r', '--root', action='append', type=Path, help='file root search path')
+parser.add_argument('-x', '--exclude',  action='append', help='file to completely exclude from inlining')
+parser.add_argument('-k', '--keep', action='append', help='file to exclude from inlining but keep the include directive')
+parser.add_argument('-p', '--pragma', action='store_true', default=False, help='keep any "#pragma once" directives (removed by default)')
+parser.add_argument('-o', '--output', type=argparse.FileType('w'), help='output file (otherwise stdout)')
+parser.add_argument('input', type=Path, help='input file')
+args = parser.parse_args()
+
+# Fail early on an invalid input (and store it so we don't recurse)
+args.input = args.input.resolve(strict=True)
+found.add(args.input)
+
+# Resolve all of the root paths upfront (we'll halt here on invalid roots)
+if (args.root):
+    for path in args.root:
+        roots.add(path.resolve(strict=True))
+
+# The remaining params: so resolve the excluded files and #pragma once directive
+resolve_excluded_files(args.exclude, excludes, args.input.parent)
+resolve_excluded_files(args.keep,    keeps,    args.input.parent)
+keep_pragma = args.pragma;
+
+# Then recursively process the input file
+try:
+    if (args.output):
+        destn = args.output
+    add_file(args.input)
+finally:
+    if (not destn):
+        destn.close()
diff --git a/contrib/single_file_transcoder/create_transcoder.sh b/contrib/single_file_transcoder/create_transcoder.sh
index ee9a298..6016549 100755
--- a/contrib/single_file_transcoder/create_transcoder.sh
+++ b/contrib/single_file_transcoder/create_transcoder.sh
@@ -4,7 +4,12 @@
 OUT_FILE="tempbin"
 
 echo "Amalgamating files... this can take a while"
-./combine.sh -r ../../transcoder -o basisu_transcoder.cpp basisu_transcoder-in.cpp
+# Use the faster Python script if we have 3.8 or higher
+if python3 -c 'import sys; assert sys.version_info >= (3,8)' 2>/dev/null; then
+  ./combine.py -r ../../transcoder -o basisu_transcoder.cpp basisu_transcoder-in.cpp
+else
+  ./combine.sh -r ../../transcoder -o basisu_transcoder.cpp basisu_transcoder-in.cpp
+fi
 # Did combining work?
 if [ $? -ne 0 ]; then
   echo "Combine script: FAILED"