Skip to content

Commit 014c934

Browse files
add trimSentences option to SentenceTokenizer, to let users choose to preserve whitespace (#760)
* add trimSentences option to SentenceTokenizer.tokenize()
* trimSentences argument is on the SentenceTokenizer constructor, not on tokenize()
* add test for trimSentences
* Update sentence_tokenizer.js
* Update sentence_tokenizer.js

---------

Co-authored-by: Hugo ter Doest <[email protected]>
1 parent 791df0b commit 014c934

File tree

2 files changed

+20
-2
lines changed

2 files changed

+20
-2
lines changed

lib/natural/tokenizers/sentence_tokenizer.js

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,18 @@ function escapeRegExp (string) {
4141
}
4242

4343
class SentenceTokenizer extends Tokenizer {
44-
constructor (abbreviations) {
44+
constructor (abbreviations, trimSentences) {
4545
super()
4646
if (abbreviations) {
4747
this.abbreviations = abbreviations
4848
} else {
4949
this.abbreviations = []
5050
}
51+
if (trimSentences === undefined) {
52+
this.trimSentences = true
53+
} else {
54+
this.trimSentences = trimSentences
55+
}
5156
this.replacementMap = null
5257
this.replacementCounter = 0
5358
}
@@ -181,7 +186,7 @@ class SentenceTokenizer extends Tokenizer {
181186
const trimmedSentences = this.trim(newSentences)
182187
DEBUG && console.log('Phase 7: trimming array of empty sentences: ' + JSON.stringify(trimmedSentences))
183188

184-
const trimmedSentences2 = trimmedSentences.map(sent => sent.trim())
189+
const trimmedSentences2 = trimmedSentences.map(sent => this.trimSentences ? sent.trim() : sent)
185190
DEBUG && console.log('Phase 8: trimming sentences from surrounding whitespace: ' + JSON.stringify(trimmedSentences2))
186191
DEBUG && console.log('---End of sentence tokenization--------------------------')
187192
DEBUG && console.log('---Replacement map---------------------------------------')

spec/sentence_tokenizer_spec.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,3 +220,16 @@ describe('sentence_tokenizer', function () {
220220
})
221221
})
222222
})
223+
224+
// describe('sentence_tokenizer with trimSentences set to false', function () {
225+
// let tokenizer: Tokenizer
226+
227+
// beforeAll(function () {
228+
// tokenizer = new Tokenizer(['i.e.', 'etc.', 'vs.', 'Inc.', 'A.S.A.P.'],
229+
// ['.', '!', '?', '\n', '\r', '...', '…'], false)
230+
// })
231+
232+
// it('should tokenize strings but not trim whitespace if trimSentences is false', function () {
233+
// expect(tokenizer.tokenize('This is a sentence. This is another sentence.')).toEqual(['This is a sentence. ', 'This is another sentence.'])
234+
// })
235+
// })

0 commit comments

Comments
 (0)