13
13
@component
14
14
class DocumentSplitter :
15
15
"""
16
- Splits a list of text documents into a list of text documents with shorter texts .
16
+ Splits long documents into smaller chunks.
17
17
18
- Splitting documents with long texts is a common preprocessing step during indexing.
19
- This allows Embedders to create significant semantic representations
20
- and avoids exceeding the maximum context length of language models.
18
+ This is a common preprocessing step during indexing.
19
+ It helps Embedders create meaningful semantic representations
20
+ and prevents exceeding language model context limits.
21
+
22
+ ### Usage example
21
23
22
- Usage example:
23
24
```python
24
25
from haystack import Document
25
26
from haystack.components.preprocessors import DocumentSplitter
@@ -39,14 +40,15 @@ def __init__(
39
40
split_threshold : int = 0 ,
40
41
):
41
42
"""
42
- Initialize the DocumentSplitter.
43
+ Initialize DocumentSplitter.
43
44
44
- :param split_by: The unit by which the document should be split. Choose from "word" for splitting by " ",
45
- "sentence" for splitting by ".", "page" for splitting by "\\ f" or "passage" for splitting by "\\ n\\ n".
45
+ :param split_by: The unit for splitting your documents. Choose from `word` for splitting by spaces (" "),
46
+ `sentence` for splitting by periods ("."), `page` for splitting by form feed ("\\f"),
47
+ or `passage` for splitting by double line breaks ("\\n\\n").
46
48
:param split_length: The maximum number of units in each split.
47
- :param split_overlap: The number of units that each split should overlap .
48
- :param split_threshold: The minimum number of units that the split should have . If the split has fewer units
49
- than the threshold, it will be attached to the previous split.
49
+ :param split_overlap: The number of overlapping units for each split.
50
+ :param split_threshold: The minimum number of units per split. If a split has fewer units
51
+ than the threshold, it's attached to the previous split.
50
52
"""
51
53
52
54
self .split_by = split_by
@@ -71,10 +73,10 @@ def run(self, documents: List[Document]):
71
73
:param documents: The documents to split.
72
74
73
75
:returns: A dictionary with the following key:
74
- - `documents`: List of documents with the split texts. A metadata field "source_id" is added to each
75
- document to keep track of the original document that was split. Another metadata field "page_number"
76
- is added to each number to keep track of the page it belonged to in the original document. Other metadata
77
- are copied from the original document.
76
+ - `documents`: List of documents with the split texts. Each document includes:
77
+ - A metadata field `source_id` to track the original document.
78
+ - A metadata field `page_number` to track the original page number.
79
+ - All other metadata copied from the original document.
78
80
79
81
:raises TypeError: if the input is not a list of Documents.
80
82
:raises ValueError: if the content of a document is None.
0 commit comments