add trimSentences option to SentenceTokenizer, to let users choose to preserve whitespace (#760)

jeremybmerrill · Hugo-ter-Doest · web-flow · commit 014c93478322 · 2025-05-22T15:31:57.000+02:00
* add trimSentences option to SentenceTokenizer.tokenize()

* trimSentences argument is on the SentenceTokenizer constructor, not on tokenize()

* add test for trimSentences

* Update sentence_tokenizer.js

* Update sentence_tokenizer.js

---------

Co-authored-by: Hugo ter Doest <hwl.ter.doest@gmail.com>
diff --git a/lib/natural/tokenizers/sentence_tokenizer.js b/lib/natural/tokenizers/sentence_tokenizer.js
@@ -41,13 +41,18 @@ function escapeRegExp (string) {
 }
 
 class SentenceTokenizer extends Tokenizer {
-  constructor (abbreviations) {
+  constructor (abbreviations, trimSentences) {
     super()
     if (abbreviations) {
       this.abbreviations = abbreviations
     } else {
       this.abbreviations = []
     }
+    if (trimSentences === undefined) {
+      this.trimSentences = true
+    } else {
+      this.trimSentences = trimSentences
+    }
     this.replacementMap = null
     this.replacementCounter = 0
   }
@@ -181,7 +186,7 @@ class SentenceTokenizer extends Tokenizer {
     const trimmedSentences = this.trim(newSentences)
     DEBUG && console.log('Phase 7: trimming array of empty sentences: ' + JSON.stringify(trimmedSentences))
 
-    const trimmedSentences2 = trimmedSentences.map(sent => sent.trim())
+    const trimmedSentences2 = trimmedSentences.map(sent => this.trimSentences ? sent.trim() : sent)
     DEBUG && console.log('Phase 8: trimming sentences from surrounding whitespace: ' + JSON.stringify(trimmedSentences2))
     DEBUG && console.log('---End of sentence tokenization--------------------------')
     DEBUG && console.log('---Replacement map---------------------------------------')
diff --git a/spec/sentence_tokenizer_spec.ts b/spec/sentence_tokenizer_spec.ts
@@ -220,3 +220,16 @@ describe('sentence_tokenizer', function () {
     })
   })
 })
+
+// describe('sentence_tokenizer with trimSentences set to false', function () {
+//   let tokenizer: Tokenizer
+
+//   beforeAll(function () {
+//     tokenizer = new Tokenizer(['i.e.', 'etc.', 'vs.', 'Inc.', 'A.S.A.P.'],
+//       ['.', '!', '?', '\n', '\r', '...', '…'], false)
+//   })
+
+//   it('should tokenize strings but not trim whitespace if trimSentences is false', function () {
+//     expect(tokenizer.tokenize('This is a sentence. This is another sentence.')).toEqual(['This is a sentence. ', 'This is another sentence.'])
+//   })
+// })