From f8cc3a0eb6a0d65dac023164410c9b50b41ebf18 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Tue, 2 Jun 2026 15:42:29 +0200 Subject: [PATCH 1/5] Fix cluster functions with hive partitioning --- .../StorageObjectStorageCluster.cpp | 14 ++++++-- tests/integration/test_s3_cluster/test.py | 34 +++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index e5131d06ae2e..a70462c9744e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -612,6 +612,16 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten ClusterPtr cluster, StorageMetadataPtr storage_metadata_snapshot) const { + // Virtual columns can contain hive columns, so we remove these hive coulmns to avoid duplicates. + // In non-cluster case these columns are filtered in DB::prepareReadingFromFormat function. 
+ auto virtual_columns = getVirtualsList(); + NamesAndTypesList hive_partition_filtered; + for (const auto & hive_name_and_type : hive_partition_columns_to_read_from_file_path) + { + if (!virtual_columns.contains(hive_name_and_type.name)) + hive_partition_filtered.emplace_back(hive_name_and_type); + } + auto iterator = StorageObjectStorageSource::createFileIterator( configuration, configuration->getQuerySettings(local_context), @@ -621,8 +631,8 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten local_context, predicate, filter, - getVirtualsList(), - hive_partition_columns_to_read_from_file_path, + virtual_columns, + hive_partition_filtered, nullptr, local_context->getFileProgressCallback(), /*ignore_archive_globs=*/false, diff --git a/tests/integration/test_s3_cluster/test.py b/tests/integration/test_s3_cluster/test.py index b990c0709f08..930713ac168d 100644 --- a/tests/integration/test_s3_cluster/test.py +++ b/tests/integration/test_s3_cluster/test.py @@ -1490,3 +1490,37 @@ def test_object_storage_remote_initiator_without_cluster_function(started_cluste assert users[1:] == ["s0_0_0\tdefault", "s0_0_1\tfoo", "s0_1_0\tfoo"] + + +def test_hive_partitioning(started_cluster): + node = started_cluster.instances["s0_0_0"] + + for i in range(1, 5): + node.query( + f""" + INSERT INTO FUNCTION s3('http://minio1:9001/root/hive/date=2000-01-0{i}/data.csv', + 'minio','{minio_secret_key}','CSVWithNames','d UInt64') + SELECT number FROM numbers(10) + SETTINGS s3_truncate_on_insert=1 + """) + + # Direct query + result = node.query( + f""" + SELECT count() FROM s3('http://minio1:9001/root/hive/date=*/data.csv', + 'minio','{minio_secret_key}','CSVWithNames','d UInt64') + WHERE date='2000-01-02' + SETTINGS use_hive_partitioning=1 + """ + ) + assert result.strip() == "10" + + result = node.query( + f""" + SELECT count() FROM s3Cluster('cluster_simple', 'http://minio1:9001/root/hive/date=*/data.csv', + 'minio','{minio_secret_key}','CSVWithNames','d 
UInt64') + WHERE date='2000-01-02' + SETTINGS use_hive_partitioning=1 + """ + ) + assert result.strip() == "10" From 776d0aecf5e2d2107fcc50b3263aeaa36e332890 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Tue, 2 Jun 2026 18:29:02 +0200 Subject: [PATCH 2/5] Fix for url function --- tests/integration/test_s3_cluster/test.py | 2 +- tests/integration/test_storage_url/test.py | 36 ++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_s3_cluster/test.py b/tests/integration/test_s3_cluster/test.py index 930713ac168d..9bf7604d3ff4 100644 --- a/tests/integration/test_s3_cluster/test.py +++ b/tests/integration/test_s3_cluster/test.py @@ -1492,7 +1492,7 @@ def test_object_storage_remote_initiator_without_cluster_function(started_cluste "s0_1_0\tfoo"] -def test_hive_partitioning(started_cluster): +def test_hive_partitioning_with_where_condition(started_cluster): node = started_cluster.instances["s0_0_0"] for i in range(1, 5): diff --git a/tests/integration/test_storage_url/test.py b/tests/integration/test_storage_url/test.py index 8c7bc908f932..6e6eb5b03433 100644 --- a/tests/integration/test_storage_url/test.py +++ b/tests/integration/test_storage_url/test.py @@ -42,6 +42,42 @@ def test_partition_by(): assert result.strip() == "1\t2\t3" +def test_hive_partitioning_with_where_condition(): + """ + Same hive + virtual column overlap as s3Cluster/fileCluster: urlCluster must not pass + duplicate hive partition names into createPathAndFileFilterDAG. + """ + test_id = uuid.uuid4().hex[:8] + base_url = f"http://nginx:80/hive_url_cluster_{test_id}" + + node1.query( + f""" + INSERT INTO FUNCTION url(url_file, url='{base_url}/date=2000-01-01/data.csv', format='CSVWithNames', structure='d UInt64') + SELECT number FROM numbers(10) + """ + ) + + # 'url' table function does not work with globs, so we have to test hive partitioning with a single file. 
+ result = node1.query( + f""" + SELECT count() FROM url('{base_url}/date=2000-01-01/data.csv', 'CSVWithNames', 'd UInt64') + WHERE date='2000-01-01' + SETTINGS use_hive_partitioning=1 + """ + ) + assert result.strip() == "10" + + result = node1.query( + f""" + SELECT count() FROM urlCluster( + 'test_cluster_two_shards', '{base_url}/date=2000-01-01/data.csv', 'CSVWithNames', 'd UInt64') + WHERE date='2000-01-01' + SETTINGS use_hive_partitioning=1 + """ + ) + assert result.strip() == "10" + + def test_url_cluster(): result = node1.query( f"select * from urlCluster('test_cluster_two_shards', 'http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')" From 7de10eb74295e98b635591f91f7e7de1e23f2f5e Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Tue, 2 Jun 2026 19:25:43 +0200 Subject: [PATCH 3/5] Fix for file function --- src/Storages/StorageFileCluster.cpp | 25 ++++++-- src/Storages/StorageURLCluster.cpp | 14 ++++- tests/integration/test_file_cluster/test.py | 63 +++++++++++++++++++++ tests/integration/test_s3_cluster/test.py | 7 ++- 4 files changed, 100 insertions(+), 9 deletions(-) diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp index 08485417dc50..c3e484bf6323 100644 --- a/src/Storages/StorageFileCluster.cpp +++ b/src/Storages/StorageFileCluster.cpp @@ -68,16 +68,23 @@ StorageFileCluster::StorageFileCluster( auto & storage_columns = storage_metadata.columns; + const auto sample_path = paths.empty() ? "" : paths.front(); + /// Not grabbing the file_columns because it is not necessary to do it here. std::tie(hive_partition_columns_to_read_from_file_path, std::ignore) = HivePartitioningUtils::setupHivePartitioningForFileURLLikeStorage( storage_columns, - paths.empty() ? 
"" : paths.front(), + sample_path, columns_.empty(), std::nullopt, context); storage_metadata.setConstraints(constraints_); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage( + storage_metadata.columns, + context, + std::nullopt, + PartitionStrategyFactory::StrategyType::NONE, + sample_path)); setInMemoryMetadata(storage_metadata); } @@ -134,12 +141,22 @@ class FileTaskIterator : public TaskIterator RemoteQueryExecutor::Extension StorageFileCluster::getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) const { + // Virtual columns can contain hive columns, so we remove these hive coulmns to avoid duplicates. + // In non-cluster case these columns are filtered in DB::prepareReadingFromFormat function. + auto virtual_columns = getVirtualsList(); + NamesAndTypesList hive_partition_filtered; + for (const auto & hive_name_and_type : hive_partition_columns_to_read_from_file_path) + { + if (!virtual_columns.contains(hive_name_and_type.name)) + hive_partition_filtered.emplace_back(hive_name_and_type); + } + auto callback = std::make_shared( paths, std::nullopt, predicate, - getVirtualsList(), - hive_partition_columns_to_read_from_file_path, + virtual_columns, + hive_partition_filtered, context ); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index 0d3723f4e1fc..33aaacd4db37 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -162,12 +162,22 @@ class UrlTaskIterator : public TaskIterator RemoteQueryExecutor::Extension StorageURLCluster::getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) const { + // Virtual 
columns can contain hive columns, so we remove these hive coulmns to avoid duplicates. + // In non-cluster case these columns are filtered in DB::prepareReadingFromFormat function. + auto virtual_columns = getVirtualsList(); + NamesAndTypesList hive_partition_filtered; + for (const auto & hive_name_and_type : hive_partition_columns_to_read_from_file_path) + { + if (!virtual_columns.contains(hive_name_and_type.name)) + hive_partition_filtered.emplace_back(hive_name_and_type); + } + auto callback = std::make_shared( uri, context->getSettingsRef()[Setting::glob_expansion_max_elements], predicate, - getVirtualsList(), - hive_partition_columns_to_read_from_file_path, + virtual_columns, + hive_partition_filtered, context ); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; diff --git a/tests/integration/test_file_cluster/test.py b/tests/integration/test_file_cluster/test.py index 04ed4c51cee3..4fed435fca8d 100644 --- a/tests/integration/test_file_cluster/test.py +++ b/tests/integration/test_file_cluster/test.py @@ -1,6 +1,7 @@ import csv import logging import time +import uuid import pytest @@ -211,3 +212,65 @@ def test_format_detection(started_cluster): "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, 's String, i UInt32', auto) ORDER BY (i, s)" ) assert result == expected_result + + +def test_hive_partitioning_with_where_condition(started_cluster): + """ + Hive partition columns are also exposed as virtual columns. fileCluster passes both + lists to createPathAndFileFilterDAG without deduplication (unlike s3Cluster), which + duplicates block columns and triggers SIZES_OF_COLUMNS_DOESNT_MATCH when filtering. 
+ """ + test_id = uuid.uuid4().hex[:8] + hive_glob = f"hive_file_cluster_{test_id}/date=*/data.csv" + + for node_name in ("s0_0_0", "s0_0_1", "s0_1_0"): + node = started_cluster.instances[node_name] + for i in range(1, 5): + node.query( + f""" + INSERT INTO TABLE FUNCTION file( + 'hive_file_cluster_{test_id}/date=2000-01-0{i}/data.csv', 'CSVWithNames', 'd UInt64') + SELECT number FROM numbers(10) + SETTINGS engine_file_truncate_on_insert=1 + """ + ) + + node = started_cluster.instances["s0_0_0"] + + result = node.query( + f""" + SELECT count() FROM file('{hive_glob}', 'CSVWithNames', 'd UInt64') + WHERE date='2000-01-02' + SETTINGS use_hive_partitioning=1 + """ + ) + assert result.strip() == "10" + + result = node.query( + f""" + SELECT date, d FROM file('{hive_glob}', 'CSVWithNames', 'd UInt64') + WHERE date='2000-01-02' + LIMIT 1 + SETTINGS use_hive_partitioning=1 + """ + ) + assert "2000-01-02" in result + + result = node.query( + f""" + SELECT count() FROM fileCluster('my_cluster', '{hive_glob}', 'CSVWithNames', 'd UInt64') + WHERE date='2000-01-02' + SETTINGS use_hive_partitioning=1 + """ + ) + assert result.strip() == "10" + + result = node.query( + f""" + SELECT date, d FROM fileCluster('my_cluster', '{hive_glob}', 'CSVWithNames', 'd UInt64') + WHERE date='2000-01-02' + LIMIT 1 + SETTINGS use_hive_partitioning=1 + """ + ) + assert "2000-01-02" in result diff --git a/tests/integration/test_s3_cluster/test.py b/tests/integration/test_s3_cluster/test.py index 9bf7604d3ff4..c53f215a8520 100644 --- a/tests/integration/test_s3_cluster/test.py +++ b/tests/integration/test_s3_cluster/test.py @@ -1494,11 +1494,12 @@ def test_object_storage_remote_initiator_without_cluster_function(started_cluste def test_hive_partitioning_with_where_condition(started_cluster): node = started_cluster.instances["s0_0_0"] + test_id = uuid.uuid4().hex[:8] for i in range(1, 5): node.query( f""" - INSERT INTO FUNCTION s3('http://minio1:9001/root/hive/date=2000-01-0{i}/data.csv', + INSERT 
INTO FUNCTION s3('http://minio1:9001/root/hive/{test_id}/date=2000-01-0{i}/data.csv', 'minio','{minio_secret_key}','CSVWithNames','d UInt64') SELECT number FROM numbers(10) SETTINGS s3_truncate_on_insert=1 @@ -1507,7 +1508,7 @@ def test_hive_partitioning_with_where_condition(started_cluster): # Direct query result = node.query( f""" - SELECT count() FROM s3('http://minio1:9001/root/hive/date=*/data.csv', + SELECT count() FROM s3('http://minio1:9001/root/hive/{test_id}/date=*/data.csv', 'minio','{minio_secret_key}','CSVWithNames','d UInt64') WHERE date='2000-01-02' SETTINGS use_hive_partitioning=1 @@ -1517,7 +1518,7 @@ def test_hive_partitioning_with_where_condition(started_cluster): result = node.query( f""" - SELECT count() FROM s3Cluster('cluster_simple', 'http://minio1:9001/root/hive/date=*/data.csv', + SELECT count() FROM s3Cluster('cluster_simple', 'http://minio1:9001/root/hive/{test_id}/date=*/data.csv', 'minio','{minio_secret_key}','CSVWithNames','d UInt64') WHERE date='2000-01-02' SETTINGS use_hive_partitioning=1 From 9850be48c3c98318268d41ccf25653108e7c8358 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Tue, 2 Jun 2026 19:44:50 +0200 Subject: [PATCH 4/5] Remove code copy-paste --- src/Storages/IStorageCluster.cpp | 14 ++++++++++++++ src/Storages/IStorageCluster.h | 4 ++++ .../ObjectStorage/StorageObjectStorageCluster.cpp | 14 ++------------ .../ObjectStorage/StorageObjectStorageCluster.h | 1 - src/Storages/StorageFileCluster.cpp | 14 ++------------ src/Storages/StorageFileCluster.h | 1 - src/Storages/StorageURLCluster.cpp | 14 ++------------ src/Storages/StorageURLCluster.h | 1 - 8 files changed, 24 insertions(+), 39 deletions(-) diff --git a/src/Storages/IStorageCluster.cpp b/src/Storages/IStorageCluster.cpp index 962b123234bb..46bcdc223186 100644 --- a/src/Storages/IStorageCluster.cpp +++ b/src/Storages/IStorageCluster.cpp @@ -650,6 +650,20 @@ QueryProcessingStage::Enum IStorageCluster::getQueryProcessingStage( return 
QueryProcessingStage::Enum::FetchColumns; } +NamesAndTypesList IStorageCluster::getHivePartitionColumnsWithoutVirtuals() const +{ + // Virtual columns can contain hive columns, so we remove these hive columns to avoid duplicates. + // In non-cluster case these columns are filtered in DB::prepareReadingFromFormat function. + auto virtual_columns = getVirtualsList(); + NamesAndTypesList hive_partition_filtered; + for (const auto & hive_name_and_type : hive_partition_columns_to_read_from_file_path) + { + if (!virtual_columns.contains(hive_name_and_type.name)) + hive_partition_filtered.emplace_back(hive_name_and_type); + } + return hive_partition_filtered; +} + ContextPtr ReadFromCluster::updateSettings(const Settings & settings) { Settings new_settings{settings}; diff --git a/src/Storages/IStorageCluster.h b/src/Storages/IStorageCluster.h index 2c0c6d3c6029..4eb64b9b830b 100644 --- a/src/Storages/IStorageCluster.h +++ b/src/Storages/IStorageCluster.h @@ -105,6 +105,10 @@ class IStorageCluster : public IStorage throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method writeFallBackToPure is not supported by storage {}", getName()); } + NamesAndTypesList getHivePartitionColumnsWithoutVirtuals() const; + + NamesAndTypesList hive_partition_columns_to_read_from_file_path; + private: static ClusterPtr getClusterImpl(ContextPtr context, const String & cluster_name_, size_t max_hosts = 0); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index a70462c9744e..39ad348c7f8b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -612,16 +612,6 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten ClusterPtr cluster, StorageMetadataPtr storage_metadata_snapshot) const { - // Virtual columns can contain hive columns, so we remove these hive coulmns to avoid duplicates. 
- // In non-cluster case these columns are filtered in DB::prepareReadingFromFormat function. - auto virtual_columns = getVirtualsList(); - NamesAndTypesList hive_partition_filtered; - for (const auto & hive_name_and_type : hive_partition_columns_to_read_from_file_path) - { - if (!virtual_columns.contains(hive_name_and_type.name)) - hive_partition_filtered.emplace_back(hive_name_and_type); - } - auto iterator = StorageObjectStorageSource::createFileIterator( configuration, configuration->getQuerySettings(local_context), @@ -631,8 +621,8 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten local_context, predicate, filter, - virtual_columns, - hive_partition_filtered, + getVirtualsList(), + getHivePartitionColumnsWithoutVirtuals(), nullptr, local_context->getFileProgressCallback(), /*ignore_archive_globs=*/false, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index 52c7d5951855..bcaf293ad4e7 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -214,7 +214,6 @@ class StorageObjectStorageCluster : public IStorageCluster const String engine_name; StorageObjectStorageConfigurationPtr configuration; const ObjectStoragePtr object_storage; - NamesAndTypesList hive_partition_columns_to_read_from_file_path; bool cluster_name_in_settings; /// non-clustered storage to fall back on pure realisation if needed diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp index c3e484bf6323..1aefb32a9883 100644 --- a/src/Storages/StorageFileCluster.cpp +++ b/src/Storages/StorageFileCluster.cpp @@ -141,22 +141,12 @@ class FileTaskIterator : public TaskIterator RemoteQueryExecutor::Extension StorageFileCluster::getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) 
const { - // Virtual columns can contain hive columns, so we remove these hive coulmns to avoid duplicates. - // In non-cluster case these columns are filtered in DB::prepareReadingFromFormat function. - auto virtual_columns = getVirtualsList(); - NamesAndTypesList hive_partition_filtered; - for (const auto & hive_name_and_type : hive_partition_columns_to_read_from_file_path) - { - if (!virtual_columns.contains(hive_name_and_type.name)) - hive_partition_filtered.emplace_back(hive_name_and_type); - } - auto callback = std::make_shared( paths, std::nullopt, predicate, - virtual_columns, - hive_partition_filtered, + getVirtualsList(), + getHivePartitionColumnsWithoutVirtuals(), context ); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; diff --git a/src/Storages/StorageFileCluster.h b/src/Storages/StorageFileCluster.h index eb2a70f60b89..dc4c49573623 100644 --- a/src/Storages/StorageFileCluster.h +++ b/src/Storages/StorageFileCluster.h @@ -45,7 +45,6 @@ class StorageFileCluster : public IStorageCluster Strings paths; String filename; String format_name; - NamesAndTypesList hive_partition_columns_to_read_from_file_path; }; } diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index 33aaacd4db37..26e80d0f8981 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -162,22 +162,12 @@ class UrlTaskIterator : public TaskIterator RemoteQueryExecutor::Extension StorageURLCluster::getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr, StorageMetadataPtr) const { - // Virtual columns can contain hive columns, so we remove these hive coulmns to avoid duplicates. - // In non-cluster case these columns are filtered in DB::prepareReadingFromFormat function. 
- auto virtual_columns = getVirtualsList(); - NamesAndTypesList hive_partition_filtered; - for (const auto & hive_name_and_type : hive_partition_columns_to_read_from_file_path) - { - if (!virtual_columns.contains(hive_name_and_type.name)) - hive_partition_filtered.emplace_back(hive_name_and_type); - } - auto callback = std::make_shared( uri, context->getSettingsRef()[Setting::glob_expansion_max_elements], predicate, - virtual_columns, - hive_partition_filtered, + getVirtualsList(), + getHivePartitionColumnsWithoutVirtuals(), context ); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; diff --git a/src/Storages/StorageURLCluster.h b/src/Storages/StorageURLCluster.h index e8ea21ffd306..e644608e20b1 100644 --- a/src/Storages/StorageURLCluster.h +++ b/src/Storages/StorageURLCluster.h @@ -47,7 +47,6 @@ class StorageURLCluster : public IStorageCluster String uri; String format_name; - NamesAndTypesList hive_partition_columns_to_read_from_file_path; }; From 9d5e1d15b7e88e2017a778370f5c8239f843f0e1 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Tue, 2 Jun 2026 20:49:10 +0200 Subject: [PATCH 5/5] Remove confused AI comments --- tests/integration/test_file_cluster/test.py | 5 ----- tests/integration/test_storage_url/test.py | 4 ---- 2 files changed, 9 deletions(-) diff --git a/tests/integration/test_file_cluster/test.py b/tests/integration/test_file_cluster/test.py index 4fed435fca8d..4e97bb0beb43 100644 --- a/tests/integration/test_file_cluster/test.py +++ b/tests/integration/test_file_cluster/test.py @@ -215,11 +215,6 @@ def test_format_detection(started_cluster): def test_hive_partitioning_with_where_condition(started_cluster): - """ - Hive partition columns are also exposed as virtual columns. fileCluster passes both - lists to createPathAndFileFilterDAG without deduplication (unlike s3Cluster), which - duplicates block columns and triggers SIZES_OF_COLUMNS_DOESNT_MATCH when filtering. 
- """ test_id = uuid.uuid4().hex[:8] hive_glob = f"hive_file_cluster_{test_id}/date=*/data.csv" diff --git a/tests/integration/test_storage_url/test.py b/tests/integration/test_storage_url/test.py index 6e6eb5b03433..d93d28f30702 100644 --- a/tests/integration/test_storage_url/test.py +++ b/tests/integration/test_storage_url/test.py @@ -43,10 +43,6 @@ def test_partition_by(): def test_hive_partitioning_with_where_condition(): - """ - Same hive + virtual column overlap as s3Cluster/fileCluster: urlCluster must not pass - duplicate hive partition names into createPathAndFileFilterDAG. - """ test_id = uuid.uuid4().hex[:8] base_url = f"http://nginx:80/hive_url_cluster_{test_id}"