diff --git a/CHANGELOG.md b/CHANGELOG.md index 4797b727..b8564b2b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## 0.3.11-dev1 + +### Enhancements + +* **fix for URL style character in fsspec url** + + ## 0.3.11-dev0 ### Enhancements diff --git a/test/unit/v2/connectors/test_fsspec_paths.py b/test/unit/v2/connectors/test_fsspec_paths.py new file mode 100644 index 00000000..abba02ed --- /dev/null +++ b/test/unit/v2/connectors/test_fsspec_paths.py @@ -0,0 +1,93 @@ +from unittest.mock import MagicMock, patch +from urllib.parse import unquote + +import pytest + +from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import ( + FsspecAccessConfig, + FsspecConnectionConfig, + FsspecIndexer, + FsspecIndexerConfig, + Secret, +) + + +@pytest.mark.parametrize( + ("remote_url", "expected_path"), + [ + ("dropbox://da ta", "da ta"), + ("dropbox://da%20ta", "da ta"), + ], +) +def test_fsspec_indexer_path_decoding(remote_url, expected_path): + index_config = FsspecIndexerConfig(remote_url=remote_url) + connection_config = FsspecConnectionConfig(access_config=Secret(FsspecAccessConfig())) + + # After initialization, ensure path_without_protocol matches expected result + assert ( + index_config.path_without_protocol == expected_path + ), f"Expected {expected_path}, got {index_config.path_without_protocol}" + + indexer = FsspecIndexer(connection_config=connection_config, index_config=index_config) + + # Now index_config should have our expected path + full_path = expected_path + assert ( + indexer.index_config.path_without_protocol == full_path + ), f"Expected path to be {full_path}, got {indexer.index_config.path_without_protocol}" + + # Mock fsspec filesystem class to verify it's called with the correct path + with patch("fsspec.get_filesystem_class") as mock_fs_class: + mock_fs_instance = MagicMock() + mock_fs_class.return_value = lambda **kwargs: mock_fs_instance + + # Mock fs.ls to return a dummy file + mock_fs_instance.ls.return_value = [ + {"name": full_path + "/file.txt", "type": "file", "size": 123} + ] + + files = indexer.get_file_data() + + # Verify that fs.ls was called with the correct decoded path + mock_fs_instance.ls.assert_called_once_with(full_path, detail=True) + + # Assert that we got the expected file and it has the correct name + assert len(files) == 1 + assert ( + files[0]["name"] == full_path + "/file.txt" + ), "File name did not match expected output" + + +@pytest.mark.parametrize( + "remote_url", + [ + "dropbox://da ta", + "dropbox://da%20ta", + ], +) +def test_fsspec_indexer_precheck(remote_url): + # This test ensures that precheck doesn't raise errors and calls head appropriately + index_config = FsspecIndexerConfig(remote_url=remote_url) + connection_config = FsspecConnectionConfig(access_config=Secret(FsspecAccessConfig())) + indexer = FsspecIndexer(connection_config=connection_config, index_config=index_config) + + with patch("fsspec.get_filesystem_class") as mock_fs_class: + mock_fs_instance = MagicMock() + mock_fs_class.return_value = lambda **kwargs: mock_fs_instance + + mock_fs_instance.ls.return_value = [ + { + "name": "/" + unquote(remote_url.split("://", 1)[1]) + "/file.txt", + "type": "file", + "size": 123, + } + ] + + # head should be called on that file + mock_fs_instance.head.return_value = {"Content-Length": "123"} + + # If precheck does not raise SourceConnectionError, we consider it passed + indexer.precheck() + + # Check that fs.head was called with the correct file path + mock_fs_instance.head.assert_called_once() diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index 312f3ed3..99e11453 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.3.11-dev0" # pragma: no cover +__version__ = "0.3.11-dev1" # pragma: no cover diff --git a/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py b/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py index ffe7364e..2cd95fd4 100644 --- a/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +++ b/unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py @@ -8,6 +8,7 @@ from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar +from urllib.parse import unquote from uuid import NAMESPACE_DNS, uuid5 from pydantic import BaseModel, Field, Secret @@ -61,6 +62,7 @@ class FileConfig(BaseModel): def __init__(self, **data): protocol, path_without_protocol = data["remote_url"].split("://") + path_without_protocol = unquote(path_without_protocol) data["protocol"] = protocol data["path_without_protocol"] = path_without_protocol super().__init__(**data) @@ -107,8 +109,18 @@ def precheck(self) -> None: **self.connection_config.get_access_config(), ) files = fs.ls(path=self.index_config.path_without_protocol, detail=True) + if files is None: + logger.error( + f"[{CONNECTOR_TYPE}]fs.ls returned None for path: \ + {self.index_config.path_without_protocol}" + ) + raise SourceConnectionError("No files returned. Check if path is correct.") valid_files = [x.get("name") for x in files if x.get("type") == "file"] if not valid_files: + logger.warning( + f"[{CONNECTOR_TYPE}]There was no files found at path: \ + {self.index_config.path_without_protocol}" + ) return file_to_sample = valid_files[0] logger.debug(f"attempting to make HEAD request for file: {file_to_sample}")