diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 033395f..53f0e18 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,17 +14,17 @@ on: jobs: ci: - name: "Run Tests (Ruby ${{ matrix.ruby_version }}, GSL: ${{ matrix.gsl }})" + name: "Run Tests (Ruby ${{ matrix.ruby_version }}, Linalg: ${{ matrix.linalg_gem }})" runs-on: "ubuntu-latest" env: # See https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby#matrix-of-gemfiles BUNDLE_GEMFILE: ${{ matrix.gemfile }} - LOAD_GSL: ${{ matrix.gsl }} + LINALG_GEM: ${{ matrix.linalg_gem }} strategy: fail-fast: false matrix: ruby_version: ["2.7", "3.0", "3.1", "jruby-9.3.4.0"] - gsl: [true, false] + linalg_gem: ["none", "gsl", "numo"] # We use `include` to assign the correct Gemfile for each ruby_version include: - ruby_version: "2.7" @@ -39,17 +39,23 @@ jobs: # Ruby 3.0 does not work with the latest released gsl gem # https://github.com/SciRuby/rb-gsl/issues/67 - ruby_version: "3.0" - gsl: true + linalg_gem: "gsl" # Ruby 3.1 does not work with the latest released gsl gem # https://github.com/SciRuby/rb-gsl/issues/67 - ruby_version: "3.1" - gsl: true + linalg_gem: "gsl" # jruby-9.3.4.0 doesn't easily build the gsl gem on a GitHub worker. Skipping for now. - ruby_version: "jruby-9.3.4.0" - gsl: true + linalg_gem: "gsl" + # jruby-9.3.4.0 doesn't easily build the numo gems on a GitHub worker. Skipping for now. 
+ - ruby_version: "jruby-9.3.4.0" + linalg_gem: "numo" steps: - name: Checkout Repository uses: actions/checkout@v3 + - name: Install Lapack + if: ${{ matrix.linalg_gem == 'numo' }} + run: sudo apt-get install -y liblapacke-dev libopenblas-dev - name: "Set up ${{ matrix.label }}" uses: ruby/setup-ruby@v1 with: diff --git a/.rubocop.yml b/.rubocop.yml index d96a1e1..ea8dbc6 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -1,7 +1,7 @@ inherit_from: .rubocop_todo.yml Style/GlobalVars: - AllowedVariables: [$GSL] + AllowedVariables: [$SVD] Naming/MethodName: Exclude: diff --git a/Gemfile b/Gemfile index 7609f70..9acb68c 100644 --- a/Gemfile +++ b/Gemfile @@ -4,4 +4,9 @@ source 'https://rubygems.org' gemspec name: 'classifier-reborn' # For testing with GSL support & bundle exec -gem 'gsl' if ENV['LOAD_GSL'] == 'true' +gem 'gsl' if ENV['LINALG_GEM'] == 'gsl' + +if ENV['LINALG_GEM'] == 'numo' + gem 'numo-narray' + gem 'numo-linalg' +end diff --git a/docs/index.md b/docs/index.md index c5f1638..e1f12a5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -60,12 +60,34 @@ The only runtime dependency of this gem is Roman Shterenzon's `fast-stemmer` gem gem install fast-stemmer ``` -To speed up `LSI` classification by at least 10x consider installing following libraries. +In addition, it is **recommended** to install either Numo or GSL to speed up LSI classification by at least 10x. -* [GSL - GNU Scientific Library](http://www.gnu.org/software/gsl) -* [Ruby/GSL Gem](https://rubygems.org/gems/gsl) +Note that LSI will work without these libraries, but as soon as they are installed, classifier will make use of them. No configuration changes are needed; we like to keep things ridiculously easy for you. + +### Install Numo Gems + +[Numo](https://ruby-numo.github.io/narray/) is a set of Numerical Module gems for Ruby that provide a Ruby interface to [LAPACK](http://www.netlib.org/lapack/). 
If classifier detects that the required Numo gems are installed, it will make use of them to perform LSI faster. + +* Install [LAPACKE](https://www.netlib.org/lapack/lapacke.html) + * Ubuntu: `apt-get install liblapacke-dev` + * macOS: (Help wanted to verify installation steps) https://stackoverflow.com/questions/38114201/installing-lapack-and-blas-libraries-for-c-on-mac-os +* Install [OpenBLAS](https://www.openblas.net/) + * Ubuntu: `apt-get install libopenblas-dev` + * macOS: (Help wanted to verify installation steps) https://stackoverflow.com/questions/38114201/installing-lapack-and-blas-libraries-for-c-on-mac-os +* Install the [Numo::NArray](https://ruby-numo.github.io/narray/) and [Numo::Linalg](https://ruby-numo.github.io/linalg/) gems + * `gem install numo-narray numo-linalg` + +### Install GSL Gem + +**Note:** The `gsl` gem is currently incompatible with Ruby 3. With Ruby 3, use Numo instead. + +The [GNU Scientific Library (GSL)](http://www.gnu.org/software/gsl) is an alternative to Numo/LAPACK that can be used to improve LSI performance. (Install one or the other; you do not need both.) + +* Install the [GNU Scientific Library](http://www.gnu.org/software/gsl) + * Ubuntu: `apt-get install libgsl-dev` +* Install the [Ruby/GSL Gem](https://rubygems.org/gems/gsl) (or add it to your Gemfile) + * `gem install gsl` -Note that `LSI` will work without these libraries, but as soon as they are installed, classifier will make use of them. No configuration changes are needed, we like to keep things ridiculously easy for you. ## Further Readings diff --git a/lib/classifier-reborn/lsi.rb b/lib/classifier-reborn/lsi.rb index 378f1da..ba5e39d 100644 --- a/lib/classifier-reborn/lsi.rb +++ b/lib/classifier-reborn/lsi.rb @@ -4,16 +4,28 @@ # Copyright:: Copyright (c) 2005 David Fayram II # License:: LGPL +# Try to load Numo first - it's the most current and the most well-supported. +# Fall back to GSL. +# Fall back to native vector. 
begin raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true` + raise LoadError if ENV['GSL'] == 'true' # to test with gsl, try `rake test GSL=true` - require 'gsl' # requires https://github.com/SciRuby/rb-gsl - require_relative 'extensions/vector_serialize' - $GSL = true + require 'numo/narray' # https://ruby-numo.github.io/narray/ + require 'numo/linalg' # https://ruby-numo.github.io/linalg/ + $SVD = :numo rescue LoadError - $GSL = false - require_relative 'extensions/vector' - require_relative 'extensions/zero_vector' + begin + raise LoadError if ENV['NATIVE_VECTOR'] == 'true' # to test the native vector class, try `rake test NATIVE_VECTOR=true` + + require 'gsl' # requires https://github.com/SciRuby/rb-gsl + require_relative 'extensions/vector_serialize' + $SVD = :gsl + rescue LoadError + $SVD = :ruby + require_relative 'extensions/vector' + require_relative 'extensions/zero_vector' + end end require_relative 'lsi/word_list' @@ -140,7 +152,15 @@ def build_index(cutoff = 0.75) doc_list = @items.values tda = doc_list.collect { |node| node.raw_vector_with(@word_list) } - if $GSL + if $SVD == :numo + tdm = Numo::NArray.asarray(tda.map(&:to_a)).transpose + ntdm = numo_build_reduced_matrix(tdm, cutoff) + + ntdm.each_over_axis(1).with_index do |col_vec, i| + doc_list[i].lsi_vector = col_vec + doc_list[i].lsi_norm = col_vec / Numo::Linalg.norm(col_vec) + end + elsif $SVD == :gsl tdm = GSL::Matrix.alloc(*tda).trans ntdm = build_reduced_matrix(tdm, cutoff) @@ -201,7 +221,9 @@ def proximity_array_for_content(doc, &block) content_node = node_for_content(doc, &block) result = @items.keys.collect do |item| - val = if $GSL + val = if $SVD == :numo + content_node.search_vector.dot(@items[item].transposed_search_vector) + elsif $SVD == :gsl content_node.search_vector * @items[item].transposed_search_vector else (Matrix[content_node.search_vector] * @items[item].search_vector)[0] @@ -220,7 +242,8 @@ def 
proximity_norms_for_content(doc, &block) return [] if needs_rebuild? content_node = node_for_content(doc, &block) - if $GSL && content_node.raw_norm.isnan?.all? + if ($SVD == :gsl && content_node.raw_norm.isnan?.all?) || + ($SVD == :numo && content_node.raw_norm.isnan.all?) puts "There are no documents that are similar to #{doc}" else content_node_norms(content_node) @@ -230,7 +253,9 @@ def proximity_norms_for_content(doc, &block) def content_node_norms(content_node) result = @items.keys.collect do |item| - val = if $GSL + val = if $SVD == :numo + content_node.search_norm.dot(@items[item].search_norm) + elsif $SVD == :gsl content_node.search_norm * @items[item].search_norm.col else (Matrix[content_node.search_norm] * @items[item].search_norm)[0] @@ -332,7 +357,20 @@ def build_reduced_matrix(matrix, cutoff = 0.75) s[ord] = 0.0 if s[ord] < s_cutoff end # Reconstruct the term document matrix, only with reduced rank - u * ($GSL ? GSL::Matrix : ::Matrix).diag(s) * v.trans + u * ($SVD == :gsl ? GSL::Matrix : ::Matrix).diag(s) * v.trans + end + + def numo_build_reduced_matrix(matrix, cutoff = 0.75) + s, u, vt = Numo::Linalg.svd(matrix, driver: 'svd', job: 'S') + + # TODO: Better than 75% term (as above) + s_cutoff = s.sort.reverse[(s.size * cutoff).round - 1] + s.size.times do |ord| + s[ord] = 0.0 if s[ord] < s_cutoff + end + + # Reconstruct the term document matrix, only with reduced rank + u.dot(::Numo::DFloat.eye(s.size) * s).dot(vt) end def node_for_content(item, &block) diff --git a/lib/classifier-reborn/lsi/content_node.rb b/lib/classifier-reborn/lsi/content_node.rb index 91872b8..786dd72 100644 --- a/lib/classifier-reborn/lsi/content_node.rb +++ b/lib/classifier-reborn/lsi/content_node.rb @@ -29,7 +29,11 @@ def search_vector # Method to access the transposed search vector def transposed_search_vector - search_vector.col + if $SVD == :numo + search_vector + else + search_vector.col + end end # Use this to fetch the appropriate search vector in normalized form. 
@@ -40,7 +44,9 @@ def search_norm # Creates the raw vector out of word_hash using word_list as the # key for mapping the vector space. def raw_vector_with(word_list) - vec = if $GSL + vec = if $SVD == :numo + Numo::DFloat.zeros(word_list.size) + elsif $SVD == :gsl GSL::Vector.alloc(word_list.size) else Array.new(word_list.size, 0) @@ -51,7 +57,9 @@ def raw_vector_with(word_list) end # Perform the scaling transform and force floating point arithmetic - if $GSL + if $SVD == :numo + total_words = vec.sum.to_f + elsif $SVD == :gsl sum = 0.0 vec.each { |v| sum += v } total_words = sum @@ -61,7 +69,7 @@ def raw_vector_with(word_list) total_unique_words = 0 - if $GSL + if [:numo, :gsl].include?($SVD) vec.each { |word| total_unique_words += 1 if word != 0.0 } else total_unique_words = vec.count { |word| word != 0 } @@ -85,12 +93,15 @@ def raw_vector_with(word_list) hash[val] = Math.log(val + 1) / -weighted_total end - vec.collect! do |val| + vec = vec.map do |val| cached_calcs[val] end end - if $GSL + if $SVD == :numo + @raw_norm = vec / Numo::Linalg.norm(vec) + @raw_vector = vec + elsif $SVD == :gsl @raw_norm = vec.normalize @raw_vector = vec else diff --git a/test/extensions/matrix_test.rb b/test/extensions/matrix_test.rb index e142c35..587e221 100644 --- a/test/extensions/matrix_test.rb +++ b/test/extensions/matrix_test.rb @@ -2,7 +2,7 @@ class MatrixTest < Minitest::Test def test_zero_division - skip "extensions/vector is only used by non-GSL implementation" if $GSL + skip "extensions/vector is only used by the native Ruby implementation" if $SVD != :ruby matrix = Matrix[[1, 0], [0, 1]] matrix.SV_decomp diff --git a/test/extensions/zero_vector_test.rb b/test/extensions/zero_vector_test.rb index 0ce53ad..95b5d4c 100644 --- a/test/extensions/zero_vector_test.rb +++ b/test/extensions/zero_vector_test.rb @@ -2,7 +2,7 @@ class ZeroVectorTest < Minitest::Test def test_zero? 
- skip "extensions/zero_vector is only used by non-GSL implementation" if $GSL + skip "extensions/zero_vector is only used by the native Ruby implementation" if $SVD != :ruby vec0 = Vector[] vec1 = Vector[0] diff --git a/test/lsi/lsi_test.rb b/test/lsi/lsi_test.rb index 63ca5e0..4a8179a 100644 --- a/test/lsi/lsi_test.rb +++ b/test/lsi/lsi_test.rb @@ -163,7 +163,7 @@ def test_cached_content_node_option end def test_clears_cached_content_node_cache - skip "transposed_search_vector is only used by GSL implementation" unless $GSL + skip "transposed_search_vector is only used by the GSL and Numo implementations" if $SVD == :ruby lsi = ClassifierReborn::LSI.new(cache_node_vectors: true) lsi.add_item @str1, 'Dog' @@ -191,8 +191,8 @@ def test_keyword_search assert_equal %i[dog text deal], lsi.highest_ranked_stems(@str1) end - def test_invalid_searching_when_using_gsl - skip "Only GSL currently raises invalid search error" unless $GSL + def test_invalid_searching_with_linalg_lib + skip "Only the GSL and Numo implementations currently raise invalid search error" if $SVD == :ruby lsi = ClassifierReborn::LSI.new lsi.add_item @str1, 'Dog'