Merged
20 changes: 15 additions & 5 deletions lib/classifier-reborn/lsi.rb
@@ -15,6 +15,7 @@

require_relative 'lsi/word_list'
require_relative 'lsi/content_node'
require_relative 'lsi/cached_content_node'
require_relative 'lsi/summarizer'

module ClassifierReborn
@@ -24,25 +25,30 @@ module ClassifierReborn
# please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
class LSI

attr_reader :word_list
attr_reader :word_list, :cache_node_vectors
attr_accessor :auto_rebuild

# Create a fresh index.
# If you want to call #build_index manually, use
# ClassifierReborn::LSI.new :auto_rebuild => false
# If you want to use ContentNodes with cached vector transpositions, use
# lsi = ClassifierReborn::LSI.new :cache_node_vectors => true
#
def initialize(options = {})
@auto_rebuild = options[:auto_rebuild] != false
@word_list, @items = WordList.new, {}
@version, @built_at_version = 0, -1
@language = options[:language] || 'en'
if @cache_node_vectors = options[:cache_node_vectors]
extend CachedContentNode::InstanceMethods
end
end

# Returns true if the index needs to be rebuilt. The index needs
# to be built after all information is added, but before you start
# using it for search, classification and cluster detection.
def needs_rebuild?
(@items.keys.size > 1) && (@version != @built_at_version)
(@items.size > 1) && (@version != @built_at_version)
end

# Adds an item to the index. item is assumed to be a string, but
@@ -60,7 +66,11 @@ def needs_rebuild?
#
def add_item( item, *categories, &block )
clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
@items[item] = ContentNode.new(clean_word_hash, *categories)
@items[item] = if @cache_node_vectors
CachedContentNode.new(clean_word_hash, *categories)
else
ContentNode.new(clean_word_hash, *categories)
end
Member:

what do you think about a new_content_node() method that abstracts this creation?

Contributor Author:

There is already a #node_for_content that serves up either an indexed ContentNode or creates a new one (used for searching/classification; it's transient). Creating CachedContentNodes for transient operations is just overhead; only items in the index need to be CachedContentNodes. I could abstract it, but since it's not reused it doesn't seem worthwhile to me.

Member:

Ok!
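For context, the abstraction discussed above would look something like the following; `new_content_node` is hypothetical and was not added in this PR:

```ruby
# Hypothetical helper (not in this PR): choose the node class for indexed items.
def new_content_node(clean_word_hash, *categories)
  node_class = @cache_node_vectors ? CachedContentNode : ContentNode
  node_class.new(clean_word_hash, *categories)
end

# add_item would then reduce to:
# @items[item] = new_content_node(clean_word_hash, *categories)
```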

@version += 1
build_index if @auto_rebuild
end
@@ -173,9 +183,9 @@ def proximity_array_for_content( doc, &block )
result =
@items.keys.collect do |item|
if $GSL
val = content_node.search_vector * @items[item].search_vector.col
val = content_node.search_vector * @items[item].transposed_search_vector
else
val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
end
[item, val]
end
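Taken together with the constructor options documented above, a minimal usage sketch (the example strings and categories are invented here, not taken from the PR):

```ruby
require 'classifier-reborn'

# Defer index building and cache each node's transposed search vector.
lsi = ClassifierReborn::LSI.new(auto_rebuild: false, cache_node_vectors: true)

lsi.add_item "This text deals with dogs. Dogs.", "Dog"
lsi.add_item "This text involves birds. Birds.", "Bird"

lsi.build_index                     # needed because auto_rebuild is false
puts lsi.classify("I love my dog")  # => "Dog"
```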
48 changes: 48 additions & 0 deletions lib/classifier-reborn/lsi/cached_content_node.rb
@@ -0,0 +1,48 @@
# Author:: Kelley Reynolds (mailto:[email protected])
# Copyright:: Copyright (c) 2015 Kelley Reynolds
# License:: LGPL

module ClassifierReborn

# Subclass of ContentNode which caches the search_vector transpositions.
# It's great because it's much faster for large indexes, but at the cost of more RAM. Additionally,
# if you Marshal your classifier and want to keep the size down, you'll need to manually
# clear the cache before you dump.
class CachedContentNode < ContentNode
module InstanceMethods
# Go through each item in this index and clear the cache
def clear_cache!
@items.each_value(&:clear_cache!)
end
end

def initialize( word_hash, *categories )
clear_cache!
super
end

def clear_cache!
@transposed_search_vector = nil
end

# Cache the transposed vector, it gets used a lot
def transposed_search_vector
@transposed_search_vector ||= super
end

# Clear the cache before we continue on
def raw_vector_with( word_list )
clear_cache!
super
end

# We don't want the cached_data here
def marshal_dump
[@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash]
end

def marshal_load(array)
@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash = array
end
end
end
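As the class comment notes, the cache should be cleared before marshaling; a minimal sketch of that workflow (the strings are placeholders, and the per-node cache only gets populated on the GSL code path):

```ruby
lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
lsi.add_item "Dogs chase the mail carrier.", "Dog"
lsi.add_item "Birds fly south for the winter.", "Bird"

lsi.classify "something about dogs"  # under GSL this fills each node's cache

lsi.clear_cache!                     # drop cached transpositions to keep the dump small
data = Marshal.dump(lsi)
lsi_restored = Marshal.load(data)
```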
36 changes: 26 additions & 10 deletions lib/classifier-reborn/lsi/content_node.rb
@@ -26,6 +26,11 @@ def search_vector
@lsi_vector || @raw_vector
end

# Method to access the transposed search vector
def transposed_search_vector
search_vector.col
end

# Use this to fetch the appropriate search vector in normalized form.
def search_norm
@lsi_norm || @raw_norm
@@ -47,7 +52,7 @@ def raw_vector_with( word_list )
# Perform the scaling transform and force floating point arithmetic
if $GSL
sum = 0.0
vec.collect{|v| sum += v}
vec.each {|v| sum += v }
total_words = sum
Member:

could we use reduce here?

Contributor Author:

It was slower than each; I benchmarked every change pretty thoroughly, but you are welcome to verify.

Member:

I believe you. 👍
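For anyone who wants to re-run the comparison, a rough stand-alone benchmark along these lines (a plain Array standing in for the GSL vector, sizes invented; results will vary by machine and vector size):

```ruby
require 'benchmark'

vec = Array.new(50_000) { rand }

Benchmark.bm(8) do |bm|
  # Accumulate into a local with each, as the PR does on the GSL branch.
  bm.report('each')   { 1_000.times { sum = 0.0; vec.each { |v| sum += v }; sum } }
  # The reduce alternative the review suggested.
  bm.report('reduce') { 1_000.times { vec.reduce(0.0, :+) } }
end
```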

else
total_words = vec.reduce(0, :+).to_f
@@ -56,7 +61,7 @@ def raw_vector_with( word_list )
total_unique_words = 0

if $GSL
vec.each { |word| total_unique_words += 1 if word != 0 }
vec.each { |word| total_unique_words += 1 if word != 0.0 }
else
total_unique_words = vec.count{ |word| word != 0 }
end
@@ -65,20 +70,31 @@ def raw_vector_with( word_list )
# then one word in it.
if total_words > 1.0 && total_unique_words > 1
weighted_total = 0.0
# Cache calculations, this takes too long on large indexes
cached_calcs = Hash.new { |hash, term|
hash[term] = (( term / total_words ) * Math.log( term / total_words ))
}

vec.each do |term|
if ( term > 0 )
weighted_total += (( term / total_words ) * Math.log( term / total_words ))
end
weighted_total += cached_calcs[term] if term > 0.0
end
vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }

# Cache calculations, this takes too long on large indexes
cached_calcs = Hash.new do |hash, val|
hash[val] = Math.log( val + 1 ) / -weighted_total
end

vec.collect! { |val|
cached_calcs[val]
}
end

if $GSL
@raw_norm = vec.normalize
@raw_vector = vec
else
@raw_norm = Vector[*vec].normalize
@raw_vector = Vector[*vec]
end
end

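The weighting change above relies on Hash.new with a default block as a memo table, so Math.log runs once per distinct term count rather than once per term. A stand-alone sketch of the pattern (numbers invented):

```ruby
total_words = 250.0

# The block runs only on a cache miss; the result is stored for later lookups.
cached_calcs = Hash.new do |hash, term|
  hash[term] = (term / total_words) * Math.log(term / total_words)
end

counts = [3.0, 1.0, 3.0, 7.0, 1.0, 3.0]
weighted_total = counts.sum { |term| term > 0.0 ? cached_calcs[term] : 0.0 }
puts weighted_total
```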
34 changes: 34 additions & 0 deletions test/lsi/lsi_test.rb
@@ -131,6 +131,40 @@ def test_serialize_safe
assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
end

def test_uncached_content_node_option
lsi = ClassifierReborn::LSI.new
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
lsi.instance_variable_get(:@items).values.each { |node|
assert node.instance_of?(ContentNode)
}
end

def test_cached_content_node_option
lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
[@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
lsi.instance_variable_get(:@items).values.each { |node|
assert node.instance_of?(CachedContentNode)
}
end

def test_clears_cached_content_node_cache
if $GSL
lsi = ClassifierReborn::LSI.new(cache_node_vectors: true)
lsi.add_item @str1, "Dog"
lsi.add_item @str2, "Dog"
lsi.add_item @str3, "Cat"
lsi.add_item @str4, "Cat"
lsi.add_item @str5, "Bird"

assert_equal "Dog", lsi.classify( "something about dogs, but not an exact dog string" )

first_content_node = lsi.instance_variable_get(:@items).values.first
refute_nil first_content_node.instance_variable_get(:@transposed_search_vector)
lsi.clear_cache!
assert_nil first_content_node.instance_variable_get(:@transposed_search_vector)
end
end

def test_keyword_search
lsi = ClassifierReborn::LSI.new
lsi.add_item @str1, "Dog"