|
5 | 5 | from pathlib import Path
|
6 | 6 |
|
7 | 7 | import pytest
|
| 8 | +from unittest.mock import patch |
8 | 9 |
|
9 | 10 | from haystack.components.converters import HTMLToDocument
|
10 | 11 | from haystack.dataclasses import ByteStream
|
@@ -161,21 +162,23 @@ def test_serde(self):
|
161 | 162 | assert new_converter.extraction_kwargs == converter.extraction_kwargs
|
162 | 163 |
|
163 | 164 | def test_run_difficult_html(self, test_files_path):
|
164 |
| - # boilerpy3's DefaultExtractor fails to extract text from this HTML file |
165 |
| - |
166 | 165 | converter = HTMLToDocument()
|
167 | 166 | result = converter.run(sources=[Path(test_files_path / "html" / "paul_graham_superlinear.html")])
|
168 | 167 |
|
169 | 168 | assert len(result["documents"]) == 1
|
170 | 169 | assert "Superlinear" in result["documents"][0].content
|
171 | 170 |
|
172 |
| - def test_run_with_extraction_kwargs(self, test_files_path): |
| 171 | + @patch("haystack.components.converters.html.extract") |
| 172 | + def test_run_with_extraction_kwargs(self, mock_extract, test_files_path): |
173 | 173 | sources = [test_files_path / "html" / "what_is_haystack.html"]
|
174 | 174 |
|
175 | 175 | converter = HTMLToDocument()
|
176 |
| - precise_converter = HTMLToDocument(extraction_kwargs={"favor_precision": True}) |
| 176 | + converter.run(sources=sources) |
| 177 | + assert mock_extract.call_count == 1 |
| 178 | + assert "favor_precision" not in mock_extract.call_args[1] |
177 | 179 |
|
178 |
| - doc = converter.run(sources=sources)["documents"][0] |
179 |
| - precise_doc = precise_converter.run(sources=sources)["documents"][0] |
180 |
| - |
181 |
| - assert len(doc.content) > len(precise_doc.content) |
| 180 | + precise_converter = HTMLToDocument(extraction_kwargs={"favor_precision": True}) |
| 181 | + mock_extract.reset_mock() |
| 182 | + precise_converter.run(sources=sources) |
| 183 | + assert mock_extract.call_count == 1 |
| 184 | + assert mock_extract.call_args[1]["favor_precision"] is True |
0 commit comments