Add id descriptor feature to SPIR-V Id descriptors are computed as a recursive hash of all instructions used to define an id. Descriptors are invarint of actual id values and the similar code in different files would produce the same descriptors. Multiple ids can have the same descriptor. For example %1 = OpConstant %u32 1 %2 = OpConstant %u32 1 would produce two ids with the same descriptor. But %3 = OpConstant %s32 1 %4 = OpConstant %u32 2 would have descriptors different from %1 and %2. Descriptors will be used as handles of move-to-front sequences in SPIR-V compression.

commit: 78cf86150e5a716b149046dcbab555ff7684a0a4 [log] [tgz]
author: Andrey Tuganov <andreyt@google.com> Wed Aug 02 16:47:25 2017 -0400
committer: David Neto <dneto@google.com> Thu Aug 10 18:44:52 2017 -0400
tree: 23da135946c3df25d506f417c7c96deb900216b9
parent: b0310a4156345aa84d758284b8e43e55f31d8bd2 [diff]
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 6ea6ad2..44bb973 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt

@@ -207,6 +207,7 @@
   ${CMAKE_CURRENT_SOURCE_DIR}/enum_string_mapping.h
   ${CMAKE_CURRENT_SOURCE_DIR}/ext_inst.h
   ${CMAKE_CURRENT_SOURCE_DIR}/extensions.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/id_descriptor.h
   ${CMAKE_CURRENT_SOURCE_DIR}/instruction.h
   ${CMAKE_CURRENT_SOURCE_DIR}/macro.h
   ${CMAKE_CURRENT_SOURCE_DIR}/name_mapper.h
@@ -234,6 +235,7 @@
   ${CMAKE_CURRENT_SOURCE_DIR}/enum_string_mapping.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ext_inst.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/extensions.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/id_descriptor.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/libspirv.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/message.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/name_mapper.cpp

diff --git a/source/id_descriptor.cpp b/source/id_descriptor.cpp
new file mode 100644
index 0000000..d2c6220
--- /dev/null
+++ b/source/id_descriptor.cpp

@@ -0,0 +1,78 @@
+// Copyright (c) 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "id_descriptor.h"
+
+#include <cassert>
+#include <iostream>
+
+#include "opcode.h"
+#include "operand.h"
+
+namespace libspirv {
+
+namespace {
+
+// Hashes an array of words. Order of words is important.
+uint32_t HashU32Array(const std::vector<uint32_t>& words) {
+  // The hash function is a sum of hashes of each word seeded by word index.
+  // Knuth's multiplicative hash is used to hash the words.
+  const uint32_t kKnuthMulHash = 2654435761;
+  uint32_t val = 0;
+  for (uint32_t i = 0; i < words.size(); ++i) {
+    val += (words[i] + i + 123) * kKnuthMulHash;
+  }
+  return val;
+}
+
+}  // namespace
+
+uint32_t IdDescriptorCollection::ProcessInstruction(
+    const spv_parsed_instruction_t& inst) {
+  if (!inst.result_id)
+    return 0;
+
+  assert(words_.empty());
+  words_.push_back(inst.words[0]);
+
+  for (size_t operand_index = 0; operand_index < inst.num_operands;
+       ++operand_index) {
+    const auto &operand = inst.operands[operand_index];
+    if (spvIsIdType(operand.type)) {
+      const uint32_t id = inst.words[operand.offset];
+      const auto it = id_to_descriptor_.find(id);
+      // Forward declared ids are not hashed.
+      if (it != id_to_descriptor_.end()) {
+        words_.push_back(it->second);
+      }
+    } else {
+      for (size_t operand_word_index = 0;
+           operand_word_index < operand.num_words; ++operand_word_index) {
+        words_.push_back(inst.words[operand.offset + operand_word_index]);
+      }
+    }
+  }
+
+  const uint32_t descriptor = HashU32Array(words_);
+  assert(descriptor);
+
+  words_.clear();
+
+  const auto result = id_to_descriptor_.emplace(inst.result_id, descriptor);
+  assert(result.second);
+  (void)result;
+  return descriptor;
+}
+
+}  // namespace libspirv

diff --git a/source/id_descriptor.h b/source/id_descriptor.h
new file mode 100644
index 0000000..b4947a7
--- /dev/null
+++ b/source/id_descriptor.h

@@ -0,0 +1,59 @@
+// Copyright (c) 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LIBSPIRV_ID_DESCRIPTOR_H_
+#define LIBSPIRV_ID_DESCRIPTOR_H_
+
+#include <unordered_map>
+#include <vector>
+
+#include "spirv-tools/libspirv.hpp"
+
+namespace libspirv {
+
+// Computes and stores id descriptors.
+//
+// Descriptors are computed as hash of all words in the instruction where ids
+// were substituted with previously computed descriptors.
+class IdDescriptorCollection {
+ public:
+  IdDescriptorCollection() {
+    words_.reserve(16);
+  }
+
+  // Computes descriptor for the result id of the given instruction and
+  // registers it in id_to_descriptor_. Returns the computed descriptor.
+  // This function needs to be sequentially called for every instruction in the
+  // module.
+  uint32_t ProcessInstruction(const spv_parsed_instruction_t& inst);
+
+  // Returns a previously computed descriptor id.
+  uint32_t GetDescriptor(uint32_t id) const {
+    const auto it = id_to_descriptor_.find(id);
+    if (it == id_to_descriptor_.end())
+      return 0;
+    return it->second;
+  }
+
+ private:
+  std::unordered_map<uint32_t, uint32_t> id_to_descriptor_;
+
+  // Scratch buffer used for hashing. Class member to optimize on allocation.
+  std::vector<uint32_t> words_;
+};
+
+}  // namespace libspirv
+
+#endif  // LIBSPIRV_ID_DESCRIPTOR_H_
+

diff --git a/source/spirv_stats.cpp b/source/spirv_stats.cpp
index 0c628d5..34cab6d 100644
--- a/source/spirv_stats.cpp
+++ b/source/spirv_stats.cpp

@@ -26,6 +26,7 @@
 #include "enum_string_mapping.h"
 #include "extensions.h"
 #include "instruction.h"
+#include "id_descriptor.h"
 #include "opcode.h"
 #include "operand.h"
 #include "spirv-tools/libspirv.h"
@@ -35,6 +36,7 @@
 #include "val/instruction.h"
 #include "val/validation_state.h"
 
+using libspirv::IdDescriptorCollection;
 using libspirv::Instruction;
 using libspirv::SpirvStats;
 using libspirv::ValidationState_t;
@@ -77,10 +79,41 @@
     ProcessEnums();
     ProcessLiteralStrings();
     ProcessNonIdWords();
+    ProcessIdDescriptors();
 
     return SPV_SUCCESS;
   }
 
+  // Collects statistics of descriptors generated by IdDescriptorCollection.
+  void ProcessIdDescriptors() {
+    const Instruction& inst = GetCurrentInstruction();
+    const uint32_t new_descriptor =
+        id_descriptors_.ProcessInstruction(inst.c_inst());
+
+    if (new_descriptor) {
+      std::stringstream ss;
+      ss << spvOpcodeString(inst.opcode());
+      for (size_t i = 1; i < inst.words().size(); ++i) {
+        ss << " " << inst.word(i);
+      }
+      stats_->id_descriptor_labels.emplace(new_descriptor, ss.str());
+    }
+
+    uint32_t index = 0;
+    for (const auto& operand : inst.operands()) {
+      if (spvIsIdType(operand.type)) {
+        const uint32_t descriptor =
+            id_descriptors_.GetDescriptor(inst.word(operand.offset));
+        if (descriptor) {
+          ++stats_->id_descriptor_hist[descriptor];
+          ++stats_->operand_slot_id_descriptor_hist[
+              std::pair<uint32_t, uint32_t>(inst.opcode(), index)][descriptor];
+        }
+      }
+      ++index;
+    }
+  }
+
   // Collects statistics of enum words for operands of specific types.
   void ProcessEnums() {
     const Instruction& inst = GetCurrentInstruction();
@@ -135,7 +168,7 @@
     uint32_t index = 0;
     for (const auto& operand : inst.operands()) {
       if (operand.num_words == 1 && !spvIsIdType(operand.type)) {
-          ++stats_->non_id_words_hist[std::pair<uint32_t, uint32_t>(
+          ++stats_->operand_slot_non_id_words_hist[std::pair<uint32_t, uint32_t>(
               inst.opcode(), index)][inst.word(operand.offset)];
       }
       ++index;
@@ -240,6 +273,7 @@
   SpirvStats* stats_;
   spv_validator_options_t validator_options_;
   std::unique_ptr<ValidationState_t> vstate_;
+  IdDescriptorCollection id_descriptors_;
 };
 
 spv_result_t ProcessHeader(

diff --git a/source/spirv_stats.h b/source/spirv_stats.h
index 4b16951..541dd03 100644
--- a/source/spirv_stats.h
+++ b/source/spirv_stats.h

@@ -77,7 +77,19 @@
   // This is a generalization of enum_hist, also includes literal integers and
   // masks.
   std::map<std::pair<uint32_t, uint32_t>,
-      std::map<uint32_t, uint32_t>> non_id_words_hist;
+      std::map<uint32_t, uint32_t>> operand_slot_non_id_words_hist;
+
+  // Historgam of descriptors generated by IdDescriptorCollection.
+  // Descriptor -> count.
+  std::unordered_map<uint32_t, uint32_t> id_descriptor_hist;
+
+  // Debut labels for id descriptors, descriptor -> label.
+  std::unordered_map<uint32_t, std::string> id_descriptor_labels;
+
+  // Historgam of descriptors generated by IdDescriptorCollection for every
+  // operand slot. pair<opcode, operand index> -> descriptor -> count.
+  std::map<std::pair<uint32_t, uint32_t>,
+      std::map<uint32_t, uint32_t>> operand_slot_id_descriptor_hist;
 
   // Histogram of literal strings, sharded by opcodes, opcode -> string -> count.
   // This is suboptimal if an opcode has multiple literal string operands,

diff --git a/test/stats/stats_aggregate_test.cpp b/test/stats/stats_aggregate_test.cpp
index 43026e8..6ea3892 100644
--- a/test/stats/stats_aggregate_test.cpp
+++ b/test/stats/stats_aggregate_test.cpp

@@ -433,4 +433,55 @@
   EXPECT_EQ(1u, stats.s64_constant_hist.at(-64));
 }
 
+TEST(AggregateStats, IdDescriptor) {
+  const std::string code1 = R"(
+OpCapability Addresses
+OpCapability Kernel
+OpCapability GenericPointer
+OpCapability Linkage
+OpMemoryModel Physical32 OpenCL
+%u32 = OpTypeInt 32 0
+%f32 = OpTypeFloat 32
+%1 = OpConstant %f32 1
+%2 = OpConstant %f32 1
+%3 = OpConstant %u32 32
+)";
+
+  const std::string code2 = R"(
+OpCapability Shader
+OpCapability Linkage
+OpMemoryModel Logical GLSL450
+%f32 = OpTypeFloat 32
+%u32 = OpTypeInt 32 0
+%1 = OpConstant %f32 1
+%2 = OpConstant %f32 3
+%3 = OpConstant %u32 32
+)";
+
+  const uint32_t kF32 = 1951208733;
+  const uint32_t kU32 = 2430404313;
+  const uint32_t kF32_1 = 296981500;
+  const uint32_t kF32_3 = 1450415100;
+  const uint32_t kU32_32 = 827246872;
+
+  SpirvStats stats;
+
+  CompileAndAggregateStats(code1, &stats);
+
+  {
+    const std::unordered_map<uint32_t, uint32_t> expected = {
+      {kF32, 3}, {kU32, 2}, {kF32_1, 2}, {kU32_32, 1}
+    };
+    EXPECT_EQ(expected, stats.id_descriptor_hist);
+  }
+
+  CompileAndAggregateStats(code2, &stats);
+  {
+    const std::unordered_map<uint32_t, uint32_t> expected = {
+      {kF32, 6}, {kU32, 4}, {kF32_1, 3}, {kF32_3, 1}, {kU32_32, 2}
+    };
+    EXPECT_EQ(expected, stats.id_descriptor_hist);
+  }
+}
+
 }  // namespace

diff --git a/tools/stats/stats.cpp b/tools/stats/stats.cpp
index 6e6878d..3eea982 100644
--- a/tools/stats/stats.cpp
+++ b/tools/stats/stats.cpp

@@ -73,6 +73,11 @@
                    Output generated C++ code for Huffman codecs for
                    single-word non-id slots.
                    This flag disables non-C++ output.
+
+  --codegen_id_descriptor_huffman_codecs
+                   Output generated C++ code for Huffman codecs for
+                   common id descriptors.
+                   This flag disables non-C++ output.
 )",
       argv0, argv0, argv0);
 }
@@ -113,6 +118,7 @@
   bool codegen_opcode_and_num_operands_markov_huffman_codecs = false;
   bool codegen_literal_string_huffman_codecs = false;
   bool codegen_non_id_word_huffman_codecs = false;
+  bool codegen_id_descriptor_huffman_codecs = false;
 
   std::vector<const char*> paths;
   const char* output_path = nullptr;
@@ -128,27 +134,31 @@
         codegen_opcode_hist = true;
         export_text = false;
       } else if (0 == strcmp(cur_arg,
-			     "--codegen_opcode_and_num_operands_hist")) {
+                             "--codegen_opcode_and_num_operands_hist")) {
         codegen_opcode_and_num_operands_hist = true;
         export_text = false;
       } else if (strcmp(
-	  "--codegen_opcode_and_num_operands_markov_huffman_codecs",
-	  cur_arg) == 0) {
+          "--codegen_opcode_and_num_operands_markov_huffman_codecs",
+          cur_arg) == 0) {
         codegen_opcode_and_num_operands_markov_huffman_codecs = true;
         export_text = false;
       } else if (0 == strcmp(cur_arg,
-			     "--codegen_literal_string_huffman_codecs")) {
+                             "--codegen_literal_string_huffman_codecs")) {
         codegen_literal_string_huffman_codecs = true;
         export_text = false;
       } else if (0 == strcmp(cur_arg,
-			     "--codegen_non_id_word_huffman_codecs")) {
+                             "--codegen_non_id_word_huffman_codecs")) {
         codegen_non_id_word_huffman_codecs = true;
         export_text = false;
+      } else if (0 == strcmp(cur_arg,
+                             "--codegen_id_descriptor_huffman_codecs")) {
+        codegen_id_descriptor_huffman_codecs = true;
+        export_text = false;
       } else if (0 == strcmp(cur_arg, "--verbose") ||
-		 0 == strcmp(cur_arg, "-v")) {
+                 0 == strcmp(cur_arg, "-v")) {
         verbose = true;
       } else if (0 == strcmp(cur_arg, "--output") ||
-		 0 == strcmp(cur_arg, "-o")) {
+                 0 == strcmp(cur_arg, "-o")) {
         expect_output_path = true;
       } else {
         PrintUsage(argv[0]);
@@ -157,10 +167,10 @@
       }
     } else {
       if (expect_output_path) {
-	output_path = cur_arg;
-	expect_output_path = false;
+        output_path = cur_arg;
+        expect_output_path = false;
       } else {
-	paths.push_back(cur_arg);
+        paths.push_back(cur_arg);
       }
     }
   }
@@ -255,5 +265,10 @@
     analyzer.WriteCodegenNonIdWordHuffmanCodecs(out);
   }
 
+  if (codegen_id_descriptor_huffman_codecs) {
+    out << std::endl;
+    analyzer.WriteCodegenIdDescriptorHuffmanCodecs(out);
+  }
+
   return 0;
 }

diff --git a/tools/stats/stats_analyzer.cpp b/tools/stats/stats_analyzer.cpp
index e196e49..3f2e7ec 100644
--- a/tools/stats/stats_analyzer.cpp
+++ b/tools/stats/stats_analyzer.cpp

@@ -726,7 +726,7 @@
       << "  std::map<std::pair<uint32_t, uint32_t>, "
       << "std::unique_ptr<HuffmanCodec<uint64_t>>> codecs;\n";
 
-  for (const auto& kv : stats_.non_id_words_hist) {
+  for (const auto& kv : stats_.operand_slot_non_id_words_hist) {
     const auto& opcode_and_index = kv.first;
     const uint32_t opcode = opcode_and_index.first;
     const uint32_t index = opcode_and_index.second;
@@ -764,3 +764,50 @@
 
   out << "  return codecs;\n}\n";
 }
+
+void StatsAnalyzer::WriteCodegenIdDescriptorHuffmanCodecs(
+    std::ostream& out) {
+  out << "std::map<std::pair<uint32_t, uint32_t>, "
+      << "std::unique_ptr<HuffmanCodec<uint64_t>>>\n"
+      << "GetIdDescriptorHuffmanCodecs() {\n"
+      << "  std::map<std::pair<uint32_t, uint32_t>, "
+      << "std::unique_ptr<HuffmanCodec<uint64_t>>> codecs;\n";
+
+  for (const auto& kv : stats_.operand_slot_id_descriptor_hist) {
+    const auto& opcode_and_index = kv.first;
+    const uint32_t opcode = opcode_and_index.first;
+    const uint32_t index = opcode_and_index.second;
+
+    const double kOpcodeFrequentEnoughToAnalyze = 0.001;
+    if (opcode_freq_[opcode] < kOpcodeFrequentEnoughToAnalyze) continue;
+
+    const std::map<uint32_t, uint32_t>& hist = kv.second;
+
+    uint32_t total = 0;
+    for (const auto& pair : hist) {
+      total += pair.second;
+    }
+
+    out << "  {\n";
+    out << "    std::unique_ptr<HuffmanCodec<uint64_t>> "
+        << "codec(new HuffmanCodec<uint64_t>({\n";
+    for (const auto& pair : hist) {
+      const uint32_t descriptor = pair.first;
+      const uint32_t count = pair.second;
+      const double freq = double(count) / double(total);
+      const double kDescriptorFrequentEnoughToAnalyze = 0.005;
+      if (freq < kDescriptorFrequentEnoughToAnalyze) continue;
+      out << "      { " << descriptor << ", " << count << " },\n";
+    }
+
+    out << "      { kMarkvNoneOfTheAbove, " << 1 + int(total * 0.05) << " },\n";
+
+    out << "    }));\n" << std::endl;
+    out << "    codecs.emplace(std::pair<uint32_t, uint32_t>(SpvOp"
+        << spvOpcodeString(SpvOp(opcode))
+        << ", " << index << "), std::move(codec));\n";
+    out << "  }\n\n";
+  }
+
+  out << "  return codecs;\n}\n";
+}

diff --git a/tools/stats/stats_analyzer.h b/tools/stats/stats_analyzer.h
index 54e2c3d..6cff20a 100644
--- a/tools/stats/stats_analyzer.h
+++ b/tools/stats/stats_analyzer.h

@@ -60,6 +60,11 @@
   // specific operand slot (opcode and operand number).
   void WriteCodegenNonIdWordHuffmanCodecs(std::ostream& out);
 
+  // Writes C++ code containing a function returning a map of Huffman codecs
+  // for common id descriptors. Each Huffman codec is created for a
+  // specific operand slot (opcode and operand number).
+  void WriteCodegenIdDescriptorHuffmanCodecs(std::ostream& out);
+
  private:
   const libspirv::SpirvStats& stats_;
commit	78cf86150e5a716b149046dcbab555ff7684a0a4	[log] [tgz]
author	Andrey Tuganov <andreyt@google.com>	Wed Aug 02 16:47:25 2017 -0400
committer	David Neto <dneto@google.com>	Thu Aug 10 18:44:52 2017 -0400
tree	23da135946c3df25d506f417c7c96deb900216b9
parent	b0310a4156345aa84d758284b8e43e55f31d8bd2 [diff]