-
Notifications
You must be signed in to change notification settings - Fork 414
3353 normalize start method spawn seems to ignore environment variables #3463
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: devel
Are you sure you want to change the base?
Changes from 4 commits
bd07832
85aa0f5
76fbb49
3ef46d9
7a974f6
dc7429d
1178a4a
30955fb
5751ed6
04df4fb
a5410d5
55cbf17
04ca239
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,143 @@ | ||
| """Test that ConfigSectionContext is properly restored in spawned worker processes.""" | ||
|
|
||
| import os | ||
| from typing import Tuple, ClassVar | ||
|
|
||
| import pytest | ||
|
|
||
| from dlt.common.configuration.container import Container | ||
| from dlt.common.configuration.specs import PluggableRunContext, ConfigSectionContext | ||
| from dlt.common.configuration.specs.base_configuration import BaseConfiguration, configspec | ||
| from dlt.common.configuration.resolve import resolve_configuration | ||
| from dlt.common.runners.configuration import PoolRunnerConfiguration | ||
| from dlt.common.runners.pool_runner import create_pool | ||
|
|
||
|
|
||
@configspec
class SectionedTestConfig(BaseConfiguration):
    """Configuration spec that is resolved under a dedicated config section.

    Used by the spawn-worker tests to verify that section-aware config
    resolution works inside spawned processes.
    """

    # Value the tests resolve from environment variables.
    test_value: str = "default"

    # Section under which ``test_value`` is looked up during resolution.
    __section__: ClassVar[str] = "test_section"
|
|
||
|
|
||
def _worker_resolve_config() -> Tuple[str, Tuple[str, ...]]:
    """Resolve the sectioned test config inside a worker process.

    Returns:
        Tuple of (resolved_value, sections_from_context).
    """
    # Grab the context from the container first, resolve, then read sections
    # at return time — same order as before, so any in-place change made by
    # resolution would still be observed.
    ctx = Container()[ConfigSectionContext]
    resolved = resolve_configuration(SectionedTestConfig())
    return resolved.test_value, ctx.sections
|
|
||
|
|
||
def test_config_section_context_restored_in_spawn_worker() -> None:
    """Test that ConfigSectionContext is properly restored when using spawn method.

    This test verifies that ConfigSectionContext is correctly serialized and restored
    in worker processes, allowing config resolution to use the correct sections.
    """
    # Set up environment variables: the section-specific value must win over
    # the non-sectioned fallback.
    os.environ["MY_SECTION__TEST_SECTION__TEST_VALUE"] = "sectioned_value"
    os.environ["TEST_SECTION__TEST_VALUE"] = "non_sectioned_value"  # Should not be used

    # Set up ConfigSectionContext in the main process and store it in the container
    section_context = ConfigSectionContext(
        pipeline_name=None,
        sections=("my_section",),
    )
    container = Container()
    container[ConfigSectionContext] = section_context

    # Create process pool with spawn method and multiple workers.
    # Using multiple workers ensures we're actually testing cross-process behavior.
    config = PoolRunnerConfiguration(
        pool_type="process",
        workers=4,
        start_method="spawn",
    )

    with create_pool(config) as pool:
        # Submit multiple tasks to ensure we're using worker processes
        futures = [pool.submit(_worker_resolve_config) for _ in range(4)]
        results = [f.result() for f in futures]

    # BUG FIX: the original only inspected results[0] although the comment
    # claimed all workers were checked — a single worker spawned without the
    # restored context would have gone unnoticed. Verify every worker.
    for result_value, result_sections in results:
        # ConfigSectionContext must be restored with the original sections
        assert result_sections == ("my_section",), (
            f"Expected sections ('my_section',) but got {result_sections}. "
            "ConfigSectionContext was not properly restored in worker process."
        )
        # Config resolution must have used the restored sections
        assert result_value == "sectioned_value", (
            f"Expected 'sectioned_value' but got '{result_value}'. "
            "Config resolution did not use the restored ConfigSectionContext sections."
        )
|
|
||
|
|
||
def test_config_section_context_with_pipeline_name() -> None:
    """Spawned workers must also restore the pipeline name of the section context.

    With a pipeline name present, config resolution should prefer the
    pipeline-scoped environment variable over the plain sectioned one.
    """
    pipeline_name = "test_pipeline"
    os.environ[f"{pipeline_name.upper()}__MY_SECTION__TEST_SECTION__TEST_VALUE"] = (
        "pipeline_sectioned_value"
    )
    os.environ["MY_SECTION__TEST_SECTION__TEST_VALUE"] = "sectioned_value"

    # Install the context (with pipeline name) in the main-process container
    Container()[ConfigSectionContext] = ConfigSectionContext(
        pipeline_name=pipeline_name,
        sections=("my_section",),
    )

    pool_config = PoolRunnerConfiguration(
        pool_type="process",
        workers=4,
        start_method="spawn",
    )
    with create_pool(pool_config) as pool:
        pending = [pool.submit(_worker_resolve_config) for _ in range(4)]
        outcomes = [fut.result() for fut in pending]

    for value, sections in outcomes:
        # Every worker observed the same restored context...
        assert sections == ("my_section",)
        # ...and resolution preferred the pipeline-specific value.
        assert value == "pipeline_sectioned_value"
|
|
||
|
|
||
def test_config_section_context_empty_sections() -> None:
    """An empty ``sections`` tuple must survive the round trip to spawned workers."""
    os.environ["TEST_SECTION__TEST_VALUE"] = "non_sectioned_value"

    # Inject a context that carries no sections at all
    Container()[ConfigSectionContext] = ConfigSectionContext(
        pipeline_name=None,
        sections=(),
    )

    pool_config = PoolRunnerConfiguration(
        pool_type="process",
        workers=4,
        start_method="spawn",
    )
    with create_pool(pool_config) as pool:
        pending = [pool.submit(_worker_resolve_config) for _ in range(4)]
        outcomes = [fut.result() for fut in pending]

    # Every worker must see the empty sections and fall back to the
    # non-sectioned environment value
    for value, sections in outcomes:
        assert sections == (), "Empty sections should be preserved"
        assert value == "non_sectioned_value", "Should use non-sectioned value"
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -568,6 +568,60 @@ def test_pipeline_data_writer_compression( | |
| assert_table_column(p, "data", data, info=info) | ||
|
|
||
|
|
||
@pytest.mark.parametrize(
    "destination_config",
    destinations_configs(
        default_sql_configs=True,
        all_buckets_filesystem_configs=True,
    ),
    ids=lambda x: x.name,
)
def test_normalize_compression_with_spawn_workers(
    destination_config: DestinationTestConfiguration,
) -> None:
    """Disabling compression should work with multiple workers and spawn method,
    because ConfigSectionContext is restored in worker processes.
    """
    worker_count = 4
    # Disable compression via the normalize section and force spawned workers
    os.environ["NORMALIZE__DATA_WRITER__DISABLE_COMPRESSION"] = "true"
    os.environ["NORMALIZE__WORKERS"] = str(worker_count)
    os.environ["NORMALIZE__START_METHOD"] = "spawn"

    data = ["a", "b", "c", "d", "e"]
    pipeline = destination_config.setup_pipeline(
        "compression_spawn_test", dataset_name="compression_spawn_test_" + uniq_id()
    )
    pipeline.extract(
        dlt.resource(data, name="data"),
        table_format=destination_config.table_format,
        loader_file_format=destination_config.file_format,
    )

    # Normalize with multiple workers and spawn method
    pipeline.normalize(workers=worker_count)

    # Every new job file in every normalized package must be plain (not gzip)
    load_storage = pipeline._get_load_storage()
    package_ids = load_storage.list_normalized_packages()
    assert len(package_ids) > 0, "Should have at least one normalized package"

    for load_id in package_ids:
        new_jobs = load_storage.normalized_packages.list_new_jobs(load_id)
        assert len(new_jobs) > 0, f"Should have at least one job file in package {load_id}"
        for job_name in new_jobs:
            full_path = load_storage.normalized_packages.storage.make_full_path(job_name)
            # If compression is disabled the file is not gzipped, so gzip
            # must refuse to read it
            with pytest.raises(gzip.BadGzipFile):
                with gzip.open(full_path, "rb") as fh:
                    fh.read()

    info = pipeline.load()
    assert_table_column(pipeline, "data", data, info=info)
|
|
||
|
|
||
| @pytest.mark.essential | ||
| @pytest.mark.parametrize( | ||
| "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we want to keep this test? I wanted something closer to the actual change, whereas the other test is more of a regression test that pins down the exact issue.