diff --git a/api/factories/file_factory.py b/api/factories/file_factory.py index 2316e45179..737a79f2b0 100644 --- a/api/factories/file_factory.py +++ b/api/factories/file_factory.py @@ -1,5 +1,6 @@ import mimetypes import os +import re import urllib.parse import uuid from collections.abc import Callable, Mapping, Sequence @@ -268,15 +269,47 @@ def _build_from_remote_url( def _extract_filename(url_path: str, content_disposition: str | None) -> str | None: - filename = None + filename: str | None = None # Try to extract from Content-Disposition header first if content_disposition: - _, params = parse_options_header(content_disposition) - # RFC 5987 https://datatracker.ietf.org/doc/html/rfc5987: filename* takes precedence over filename - filename = params.get("filename*") or params.get("filename") + # Manually extract filename* parameter since parse_options_header doesn't support it + filename_star_match = re.search(r"filename\*=([^;]+)", content_disposition) + if filename_star_match: + raw_star = filename_star_match.group(1).strip() + # Remove trailing quotes if present + raw_star = raw_star.removesuffix('"') + # format: charset'lang'value + try: + parts = raw_star.split("'", 2) + charset = (parts[0] or "utf-8").lower() if len(parts) >= 1 else "utf-8" + value = parts[2] if len(parts) == 3 else parts[-1] + filename = urllib.parse.unquote(value, encoding=charset, errors="replace") + except Exception: + # Fallback: try to extract value after the last single quote + if "''" in raw_star: + filename = urllib.parse.unquote(raw_star.split("''")[-1]) + else: + filename = urllib.parse.unquote(raw_star) + + if not filename: + # Fallback to regular filename parameter + _, params = parse_options_header(content_disposition) + raw = params.get("filename") + if raw: + # Strip surrounding quotes and percent-decode if present + if len(raw) >= 2 and raw[0] == raw[-1] == '"': + raw = raw[1:-1] + filename = urllib.parse.unquote(raw) # Fallback to URL path if no filename from header if not filename: - filename = os.path.basename(url_path) + candidate = os.path.basename(url_path) + filename = urllib.parse.unquote(candidate) if candidate else None + # Defense-in-depth: ensure basename only + if filename: + filename = os.path.basename(filename) + # Return None if filename is empty or only whitespace + if not filename or not filename.strip(): + filename = None return filename or None diff --git a/api/tests/unit_tests/factories/test_file_factory.py b/api/tests/unit_tests/factories/test_file_factory.py index 777fe5a6e7..e5f45044fa 100644 --- a/api/tests/unit_tests/factories/test_file_factory.py +++ b/api/tests/unit_tests/factories/test_file_factory.py @@ -2,7 +2,7 @@ import re import pytest -from factories.file_factory import _get_remote_file_info +from factories.file_factory import _extract_filename, _get_remote_file_info class _FakeResponse: @@ -113,3 +113,120 @@ class TestGetRemoteFileInfo: # Should generate a random hex filename with .bin extension assert re.match(r"^[0-9a-f]{32}\.bin$", filename) is not None assert mime_type == "application/octet-stream" + + +class TestExtractFilename: + """Tests for _extract_filename function focusing on RFC5987 parsing and security.""" + + def test_no_content_disposition_uses_url_basename(self): + """Test that URL basename is used when no Content-Disposition header.""" + result = _extract_filename("http://example.com/path/file.txt", None) + assert result == "file.txt" + + def test_no_content_disposition_with_percent_encoded_url(self): + """Test that percent-encoded URL basename is decoded.""" + result = _extract_filename("http://example.com/path/file%20name.txt", None) + assert result == "file name.txt" + + def test_no_content_disposition_empty_url_path(self): + """Test that empty URL path returns None.""" + result = _extract_filename("http://example.com/", None) + assert result is None + + def test_simple_filename_header(self): + """Test basic filename extraction from Content-Disposition.""" + result = _extract_filename("http://example.com/", 'attachment; filename="test.txt"') + assert result == "test.txt" + + def test_quoted_filename_with_spaces(self): + """Test filename with spaces in quotes.""" + result = _extract_filename("http://example.com/", 'attachment; filename="my file.txt"') + assert result == "my file.txt" + + def test_unquoted_filename(self): + """Test unquoted filename.""" + result = _extract_filename("http://example.com/", "attachment; filename=test.txt") + assert result == "test.txt" + + def test_percent_encoded_filename(self): + """Test percent-encoded filename.""" + result = _extract_filename("http://example.com/", 'attachment; filename="file%20name.txt"') + assert result == "file name.txt" + + def test_rfc5987_filename_star_utf8(self): + """Test RFC5987 filename* with UTF-8 encoding.""" + result = _extract_filename("http://example.com/", "attachment; filename*=UTF-8''file%20name.txt") + assert result == "file name.txt" + + def test_rfc5987_filename_star_chinese(self): + """Test RFC5987 filename* with Chinese characters.""" + result = _extract_filename( + "http://example.com/", "attachment; filename*=UTF-8''%E6%B5%8B%E8%AF%95%E6%96%87%E4%BB%B6.txt" + ) + assert result == "测试文件.txt" + + def test_rfc5987_filename_star_with_language(self): + """Test RFC5987 filename* with language tag.""" + result = _extract_filename("http://example.com/", "attachment; filename*=UTF-8'en'file%20name.txt") + assert result == "file name.txt" + + def test_rfc5987_filename_star_fallback_charset(self): + """Test RFC5987 filename* with fallback charset.""" + result = _extract_filename("http://example.com/", "attachment; filename*=''file%20name.txt") + assert result == "file name.txt" + + def test_rfc5987_filename_star_malformed_fallback(self): + """Test RFC5987 filename* with malformed format falls back to simple unquote.""" + result = _extract_filename("http://example.com/", "attachment; filename*=malformed%20filename.txt") + assert result == "malformed filename.txt" + + def test_filename_star_takes_precedence_over_filename(self): + """Test that filename* takes precedence over filename.""" + test_string = 'attachment; filename="old.txt"; filename*=UTF-8\'\'new.txt"' + result = _extract_filename("http://example.com/", test_string) + assert result == "new.txt" + + def test_path_injection_protection(self): + """Test that path injection attempts are blocked by os.path.basename.""" + result = _extract_filename("http://example.com/", 'attachment; filename="../../../etc/passwd"') + assert result == "passwd" + + def test_path_injection_protection_rfc5987(self): + """Test that path injection attempts in RFC5987 are blocked.""" + result = _extract_filename("http://example.com/", "attachment; filename*=UTF-8''..%2F..%2F..%2Fetc%2Fpasswd") + assert result == "passwd" + + def test_empty_filename_returns_none(self): + """Test that empty filename returns None.""" + result = _extract_filename("http://example.com/", 'attachment; filename=""') + assert result is None + + def test_whitespace_only_filename_returns_none(self): + """Test that whitespace-only filename returns None.""" + result = _extract_filename("http://example.com/", 'attachment; filename=" "') + assert result is None + + def test_complex_rfc5987_encoding(self): + """Test complex RFC5987 encoding with special characters.""" + result = _extract_filename( + "http://example.com/", + "attachment; filename*=UTF-8''%E4%B8%AD%E6%96%87%E6%96%87%E4%BB%B6%20%28%E5%89%AF%E6%9C%AC%29.pdf", + ) + assert result == "中文文件 (副本).pdf" + + def test_iso8859_1_encoding(self): + """Test ISO-8859-1 encoding in RFC5987.""" + result = _extract_filename("http://example.com/", "attachment; filename*=ISO-8859-1''file%20name.txt") + assert result == "file name.txt" + + def test_encoding_error_fallback(self): + """Test that encoding errors fall back to safe ASCII filename.""" + result = _extract_filename("http://example.com/", "attachment; filename*=INVALID-CHARSET''file%20name.txt") + assert result == "file name.txt" + + def test_mixed_quotes_and_encoding(self): + """Test filename with mixed quotes and percent encoding.""" + result = _extract_filename( + "http://example.com/", 'attachment; filename="file%20with%20quotes%20%26%20encoding.txt"' + ) + assert result == "file with quotes & encoding.txt"