Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: handling path containing url unicode chars #312

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## 0.3.11-dev1

### Enhancements

* **fix for URL style character in fsspec url**


## 0.3.11-dev0

### Enhancements
Expand Down
93 changes: 93 additions & 0 deletions test/unit/v2/connectors/test_fsspec_paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from unittest.mock import MagicMock, patch
from urllib.parse import unquote

import pytest

from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
FsspecAccessConfig,
FsspecConnectionConfig,
FsspecIndexer,
FsspecIndexerConfig,
Secret,
)


@pytest.mark.parametrize(
("remote_url", "expected_path"),
[
("dropbox://da ta", "da ta"),
("dropbox://da%20ta", "da ta"),
],
)
Comment on lines +15 to +21
Copy link
Contributor

@mpolomdeepsense mpolomdeepsense Dec 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if my dropbox folder is named "a%20b" (literal name, not encoded)? Will it return a%20b or a b?

def test_fsspec_indexer_path_decoding(remote_url, expected_path):
index_config = FsspecIndexerConfig(remote_url=remote_url)
connection_config = FsspecConnectionConfig(access_config=Secret(FsspecAccessConfig()))

# After initialization, ensure path_without_protocol matches expected result
assert (
index_config.path_without_protocol == expected_path
), f"Expected {expected_path}, got {index_config.path_without_protocol}"

indexer = FsspecIndexer(connection_config=connection_config, index_config=index_config)

# Now index_config should have our expected path
full_path = expected_path
assert (
indexer.index_config.path_without_protocol == full_path
), f"Expected path to be {full_path}, got {indexer.index_config.path_without_protocol}"

# Mock fsspec filesystem class to verify it's called with the correct path
with patch("fsspec.get_filesystem_class") as mock_fs_class:
mock_fs_instance = MagicMock()
mock_fs_class.return_value = lambda **kwargs: mock_fs_instance

# Mock fs.ls to return a dummy file
mock_fs_instance.ls.return_value = [
{"name": full_path + "/file.txt", "type": "file", "size": 123}
]

files = indexer.get_file_data()

# Verify that fs.ls was called with the correct decoded path
mock_fs_instance.ls.assert_called_once_with(full_path, detail=True)

# Assert that we got the expected file and it has the correct name
assert len(files) == 1
assert (
files[0]["name"] == full_path + "/file.txt"
), "File name did not match expected output"


@pytest.mark.parametrize(
"remote_url",
[
"dropbox://da ta",
"dropbox://da%20ta",
],
)
def test_fsspec_indexer_precheck(remote_url):
# This test ensures that precheck doesn't raise errors and calls head appropriately
index_config = FsspecIndexerConfig(remote_url=remote_url)
connection_config = FsspecConnectionConfig(access_config=Secret(FsspecAccessConfig()))
indexer = FsspecIndexer(connection_config=connection_config, index_config=index_config)

with patch("fsspec.get_filesystem_class") as mock_fs_class:
mock_fs_instance = MagicMock()
mock_fs_class.return_value = lambda **kwargs: mock_fs_instance

mock_fs_instance.ls.return_value = [
{
"name": "/" + unquote(remote_url.split("://", 1)[1]) + "/file.txt",
"type": "file",
"size": 123,
}
]

# head should be called on that file
mock_fs_instance.head.return_value = {"Content-Length": "123"}

# If precheck does not raise SourceConnectionError, we consider it passed
indexer.precheck()

# Check that fs.head was called with the correct file path
mock_fs_instance.head.assert_called_once()
2 changes: 1 addition & 1 deletion unstructured_ingest/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.11-dev0" # pragma: no cover
__version__ = "0.3.11-dev1" # pragma: no cover
12 changes: 12 additions & 0 deletions unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
from urllib.parse import unquote
from uuid import NAMESPACE_DNS, uuid5

from pydantic import BaseModel, Field, Secret
Expand Down Expand Up @@ -61,6 +62,7 @@ class FileConfig(BaseModel):

def __init__(self, **data):
protocol, path_without_protocol = data["remote_url"].split("://")
path_without_protocol = unquote(path_without_protocol)
data["protocol"] = protocol
data["path_without_protocol"] = path_without_protocol
super().__init__(**data)
Expand Down Expand Up @@ -107,8 +109,18 @@ def precheck(self) -> None:
**self.connection_config.get_access_config(),
)
files = fs.ls(path=self.index_config.path_without_protocol, detail=True)
if files is None:
logger.error(
f"[{CONNECTOR_TYPE}]fs.ls returned None for path: \
{self.index_config.path_without_protocol}"
)
raise SourceConnectionError("No files returned. Check if path is correct.")
valid_files = [x.get("name") for x in files if x.get("type") == "file"]
if not valid_files:
logger.warning(
f"[{CONNECTOR_TYPE}]There was no files found at path: \
{self.index_config.path_without_protocol}"
)
return
file_to_sample = valid_files[0]
logger.debug(f"attempting to make HEAD request for file: {file_to_sample}")
Expand Down
Loading