Merged
20 changes: 15 additions & 5 deletions lib/classifier-reborn/lsi.rb
@@ -15,6 +15,7 @@

require_relative 'lsi/word_list'
require_relative 'lsi/content_node'
require_relative 'lsi/cached_content_node'
require_relative 'lsi/summarizer'

module ClassifierReborn
@@ -24,25 +25,30 @@ module ClassifierReborn
# please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
class LSI

attr_reader :word_list
attr_reader :word_list, :cache_node_vectors
attr_accessor :auto_rebuild

# Create a fresh index.
# If you want to call #build_index manually, use
# ClassifierReborn::LSI.new :auto_rebuild => false
# If you want to use ContentNodes with cached vector transpositions, use
# lsi = ClassifierReborn::LSI.new :cache_node_vectors => true
#
def initialize(options = {})
@auto_rebuild = options[:auto_rebuild] != false
@word_list, @items = WordList.new, {}
@version, @built_at_version = 0, -1
@language = options[:language] || 'en'
if @cache_node_vectors = options[:cache_node_vectors]
extend CachedContentNode::InstanceMethods
end
end

# Returns true if the index needs to be rebuilt. The index needs
# to be built after all information is added, but before you start
# using it for search, classification and cluster detection.
def needs_rebuild?
(@items.keys.size > 1) && (@version != @built_at_version)
(@items.size > 1) && (@version != @built_at_version)
end

# Adds an item to the index. item is assumed to be a string, but
@@ -60,7 +66,11 @@ def needs_rebuild?
#
def add_item( item, *categories, &block )
clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
@items[item] = ContentNode.new(clean_word_hash, *categories)
@items[item] = if @cache_node_vectors
CachedContentNode.new(clean_word_hash, *categories)
else
ContentNode.new(clean_word_hash, *categories)
end
Member:

what do you think about a new_content_node() method that abstracts this creation?

Contributor Author:

There is already a #node_for_content that serves up either an indexed ContentNode or creates a new one (used for searching/classification; it's transient). Creating CachedContentNodes for transient operations is just overhead; only items in the index need to be CachedContentNodes. I could abstract it, but since it's not reused it doesn't seem worthwhile to me.

Member:

Ok!
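For context, the abstraction discussed above would look something like the following; `new_content_node` is hypothetical and was not added in this PR:

```ruby
# Hypothetical helper (not in this PR): choose the node class for indexed items.
def new_content_node(clean_word_hash, *categories)
  node_class = @cache_node_vectors ? CachedContentNode : ContentNode
  node_class.new(clean_word_hash, *categories)
end

# add_item would then reduce to:
# @items[item] = new_content_node(clean_word_hash, *categories)
```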

@version += 1
build_index if @auto_rebuild
end
@@ -173,9 +183,9 @@ def proximity_array_for_content( doc, &block )
result =
@items.keys.collect do |item|
if $GSL
val = content_node.search_vector * @items[item].search_vector.col
val = content_node.search_vector * @items[item].transposed_search_vector
else
val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
end
[item, val]
end
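Taken together with the constructor options documented above, a minimal usage sketch (the example strings and categories are invented here, not taken from the PR):

```ruby
require 'classifier-reborn'

# Defer index building and cache each node's transposed search vector.
lsi = ClassifierReborn::LSI.new(auto_rebuild: false, cache_node_vectors: true)

lsi.add_item "This text deals with dogs. Dogs.", "Dog"
lsi.add_item "This text involves birds. Birds.", "Bird"

lsi.build_index                     # needed because auto_rebuild is false
puts lsi.classify("I love my dog")  # => "Dog"
```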
48 changes: 48 additions & 0 deletions lib/classifier-reborn/lsi/cached_content_node.rb
@@ -0,0 +1,48 @@
# Author:: Kelley Reynolds (mailto:[email protected])
# Copyright:: Copyright (c) 2015 Kelley Reynolds
# License:: LGPL

module ClassifierReborn

# Subclass of ContentNode which caches the search_vector transpositions.
# It's great because it's much faster for large indexes, but at the cost of more RAM. Additionally,
# if you Marshal your classifier and want to keep the size down, you'll need to manually
# clear the cache before you dump.
class CachedContentNode < ContentNode
module InstanceMethods
# Go through each item in this index and clear the cache
def clear_cache!
@items.each_value(&:clear_cache!)
end
end

def initialize( word_hash, *categories )
clear_cache!
super
end

def clear_cache!
@transposed_search_vector = nil
end

# Cache the transposed vector, it gets used a lot
def transposed_search_vector
@transposed_search_vector ||= super
end

# Clear the cache before we continue on
def raw_vector_with( word_list )
clear_cache!
super
end

# We don't want the cached_data here
def marshal_dump
[@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash]
end

def marshal_load(array)
@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash = array
end
end
end
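As the class comment notes, the cache should be cleared before marshaling; a minimal sketch of that workflow (the strings are placeholders, and the per-node cache only gets populated on the GSL code path):

```ruby
lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
lsi.add_item "Dogs chase the mail carrier.", "Dog"
lsi.add_item "Birds fly south for the winter.", "Bird"

lsi.classify "something about dogs"  # under GSL this fills each node's cache

lsi.clear_cache!                     # drop cached transpositions to keep the dump small
data = Marshal.dump(lsi)
lsi_restored = Marshal.load(data)
```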
36 changes: 26 additions & 10 deletions lib/classifier-reborn/lsi/content_node.rb
@@ -26,6 +26,11 @@ def search_vector
@lsi_vector || @raw_vector
end

# Method to access the transposed search vector
def transposed_search_vector
search_vector.col
end

# Use this to fetch the appropriate search vector in normalized form.
def search_norm
@lsi_norm || @raw_norm
@@ -47,7 +52,7 @@ def raw_vector_with( word_list )
# Perform the scaling transform and force floating point arithmetic
if $GSL
sum = 0.0
vec.collect{|v| sum += v}
vec.each {|v| sum += v }
total_words = sum
Member:

could we use reduce here?

Contributor Author:

It was slower than each; I benchmarked every change pretty thoroughly, but you are welcome to verify.

Member:

I believe you. 👍
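For anyone who wants to re-run the comparison, a rough stand-alone benchmark along these lines (a plain Array standing in for the GSL vector, sizes invented; results will vary by machine and vector size):

```ruby
require 'benchmark'

vec = Array.new(50_000) { rand }

Benchmark.bm(8) do |bm|
  # Accumulate into a local with each, as the PR does on the GSL branch.
  bm.report('each')   { 1_000.times { sum = 0.0; vec.each { |v| sum += v }; sum } }
  # The reduce alternative the review suggested.
  bm.report('reduce') { 1_000.times { vec.reduce(0.0, :+) } }
end
```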

else
total_words = vec.reduce(0, :+).to_f
@@ -56,7 +61,7 @@ def raw_vector_with( word_list )
total_unique_words = 0

if $GSL
vec.each { |word| total_unique_words += 1 if word != 0 }
vec.each { |word| total_unique_words += 1 if word != 0.0 }
else
total_unique_words = vec.count{ |word| word != 0 }
end
@@ -65,20 +70,31 @@ def raw_vector_with( word_list )
# then one word in it.
if total_words > 1.0 && total_unique_words > 1
weighted_total = 0.0
# Cache calculations, this takes too long on large indexes
cached_calcs = Hash.new { |hash, term|
hash[term] = (( term / total_words ) * Math.log( term / total_words ))
}

vec.each do |term|
if ( term > 0 )
weighted_total += (( term / total_words ) * Math.log( term / total_words ))
end
weighted_total += cached_calcs[term] if term > 0.0
end
vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }

# Cache calculations, this takes too long on large indexes
cached_calcs = Hash.new do |hash, val|
hash[val] = Math.log( val + 1 ) / -weighted_total
end

vec.collect! { |val|
cached_calcs[val]
}
end

if $GSL
@raw_norm = vec.normalize
@raw_vector = vec
else
@raw_norm = Vector[*vec].normalize
@raw_vector = Vector[*vec]
end
end

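The weighting change above relies on Hash.new with a default block as a memo table, so Math.log runs once per distinct term count rather than once per term. A stand-alone sketch of the pattern (numbers invented):

```ruby
total_words = 250.0

# The block runs only on a cache miss; the result is stored for later lookups.
cached_calcs = Hash.new do |hash, term|
  hash[term] = (term / total_words) * Math.log(term / total_words)
end

counts = [3.0, 1.0, 3.0, 7.0, 1.0, 3.0]
weighted_total = counts.sum { |term| term > 0.0 ? cached_calcs[term] : 0.0 }
puts weighted_total
```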
34 changes: 34 additions & 0 deletions test/lsi/lsi_test.rb
@@ -131,6 +131,40 @@ def test_serialize_safe
assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
end

def test_uncached_content_node_option
lsi = ClassifierReborn::LSI.new
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
lsi.instance_variable_get(:@items).values.each { |node|
assert node.instance_of?(ContentNode)
}
end

def test_cached_content_node_option
lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
lsi.instance_variable_get(:@items).values.each { |node|
assert node.instance_of?(CachedContentNode)
}
end

def test_clears_cached_content_node_cache
if $GSL
lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
lsi.add_item @str1, "Dog"
lsi.add_item @str2, "Dog"
lsi.add_item @str3, "Cat"
lsi.add_item @str4, "Cat"
lsi.add_item @str5, "Bird"

assert_equal "Dog", lsi.classify( "something about dogs, but not an exact dog string" )

first_content_node = lsi.instance_variable_get(:@items).values.first
refute_nil first_content_node.instance_variable_get(:@transposed_search_vector)
lsi.clear_cache!
assert_nil first_content_node.instance_variable_get(:@transposed_search_vector)
end
end

def test_keyword_search
lsi = ClassifierReborn::LSI.new
lsi.add_item @str1, "Dog"