diff --git a/.flake8 b/.flake8
index b6179913..92726672 100644
--- a/.flake8
+++ b/.flake8
@@ -13,7 +13,7 @@ exclude =
     data,
     .github
 
-max-line-length = 120
+max-line-length = 150
 
 per-file-ignores =
     # imported but unused
diff --git a/conflowgen/api/export_container_flow_manager.py b/conflowgen/api/export_container_flow_manager.py
index bd518416..93db949f 100644
--- a/conflowgen/api/export_container_flow_manager.py
+++ b/conflowgen/api/export_container_flow_manager.py
@@ -26,6 +26,9 @@ def export(
     ) -> str:
         """
         This extracts the container movement data from the SQL database to a folder of choice in a tabular data format.
+        In addition, you find a file called `metadata.yaml`. It contains an explanation for each column of each CSV file
+        as well as some general metadata, such as the ConFlowGen version used and the date and time of the scenario
+        generation.
 
         Args:
             folder_name: Name of folder that bundles the tabular data which belongs together
diff --git a/conflowgen/application/services/export_container_flow_service.py b/conflowgen/application/services/export_container_flow_service.py
index bdd59ab2..1354eb42 100644
--- a/conflowgen/application/services/export_container_flow_service.py
+++ b/conflowgen/application/services/export_container_flow_service.py
@@ -8,6 +8,8 @@
 
 import numpy as np
 import pandas as pd
+import peewee
+import yaml
 
 # noinspection PyProtectedMember
 from peewee import ModelSelect
@@ -22,6 +24,7 @@
 from conflowgen.domain_models.large_vehicle_schedule import Destination
 from conflowgen.domain_models.vehicle import DeepSeaVessel, LargeScheduledVehicle, Feeder, Barge, Train, Truck, \
     AbstractLargeScheduledVehicle
+from conflowgen.application.models.container_flow_generation_properties import ContainerFlowGenerationProperties
 
 EXPORTS_DEFAULT_DIR = os.path.join(
     os.path.dirname(os.path.realpath(__file__)),
@@ -129,6 +132,13 @@ def _save_as_xlsx(cls, df: pd.DataFrame, file_name: str):
         },
     }
 
+    large_schedule_vehicles_as_subtype = {
+        "deep_sea_vessels": DeepSeaVessel,
+        "feeders": Feeder,
+        "barges": Barge,
+        "trains": Train,
+    }
+
     def __init__(self):
         self.save_as_file_format_mapping = {
             ExportFileFormat.csv: self._save_as_csv,
@@ -235,23 +245,60 @@ def _convert_sql_database_to_pandas_dataframe(cls) -> Dict[str, pd.DataFrame]:
             "containers": df_container,
         }
 
-        large_schedule_vehicles_as_subtype = {
-            "deep_sea_vessels": DeepSeaVessel,
-            "feeders": Feeder,
-            "barges": Barge,
-            "trains": Train,
-        }
-        for file_name, large_schedule_vehicle_as_subtype in large_schedule_vehicles_as_subtype.items():
-            cls.logger.debug(f"Gathering data for generating the '{file_name}' table...")
+        for vehicle_type_name, large_schedule_vehicle_as_subtype in cls.large_schedule_vehicles_as_subtype.items():
+            cls.logger.debug(f"Gathering data for generating the '{vehicle_type_name}' table...")
             df = cls._convert_table_to_pandas_dataframe(large_schedule_vehicle_as_subtype)
             if len(df) == 0:
-                cls.logger.info(f"No content found for the {file_name} table, the file will be empty.")
-            result[file_name] = df
+                cls.logger.info(f"No content found for the {vehicle_type_name} table, the file will be empty.")
+            result[vehicle_type_name] = df
 
         df_trucks = cls._convert_table_to_pandas_dataframe(Truck)
         result["trucks"] = df_trucks
         return result
 
+    @classmethod
+    def _get_metadata_of_model(
+            cls, model: type[peewee.Model], metadata: Optional[dict] = None, single: bool = False, resolve: bool = True,
+    ) -> Dict:
+        if metadata is None:
+            metadata = {}
+        for field in model._meta.sorted_fields:  # pylint: disable=protected-access
+            if not field.help_text:  # if there is no help text, we have no metadata to add
+                continue
+
+            if model in cls.columns_to_drop.keys():  # if model has columns to drop in the first place...
+                if field.name in cls.columns_to_drop[model]:  # ...and the column is to be dropped...
+                    continue  # ...then don't include it in the metadata (as it has been dropped).
+
+            field_name = field.name
+            if model in cls.columns_to_rename.keys():  # if model has columns to rename in the first place...
+                if field_name in cls.columns_to_rename[model].keys():  # ...and the column name is to be renamed...
+                    field_name = cls.columns_to_rename[model][field.name]  # ...then re-set the field name.
+
+            # if nested
+            if isinstance(field, peewee.ForeignKeyField) and resolve:
+                cls._get_metadata_of_model(field.rel_model, metadata)
+            else:  # actually enter metadata
+                if single:  # if single entry in table, then it can also be spelled out
+                    metadata[field_name] = {
+                        "Explanation": field.help_text,
+                        "Value": getattr(model.get_or_none(), field.name),
+                    }
+                else:  # default case: several entries per table
+                    metadata[field_name] = field.help_text
+
+        return metadata
+
+    @classmethod
+    def _get_metadata(cls) -> Dict[str, dict]:
+        metadata = {
+            "general": cls._get_metadata_of_model(ContainerFlowGenerationProperties, single=True),
+            "container": cls._get_metadata_of_model(Container, resolve=False),
+        }
+        for vehicle_type_name, large_schedule_vehicle_as_subtype in cls.large_schedule_vehicles_as_subtype.items():
+            metadata[vehicle_type_name] = cls._get_metadata_of_model(large_schedule_vehicle_as_subtype)
+        return metadata
+
     def export(
             self,
             folder_name: str,
@@ -294,5 +341,19 @@ def export(
             self.logger.debug(f"Saving file {full_file_name}")
             # noinspection PyArgumentList
             self.save_as_file_format_mapping[file_format](df, path_to_file)
+
+        self._save_metadata(path_to_target_folder)
+        self.logger.debug("Saving file metadata.yaml")
+
         self.logger.info("Export has finished successfully.")
         return path_to_target_folder
+
+    @classmethod
+    def _save_metadata(cls, path_to_target_folder: str):
+        path_to_metadata_file = os.path.join(
+            path_to_target_folder,
+            "metadata.yaml"
+        )
+        with open(path_to_metadata_file, "w", encoding="utf-8") as f:
+            metadata = cls._get_metadata()
+            yaml.dump(metadata, f)
diff --git a/conflowgen/domain_models/vehicle.py b/conflowgen/domain_models/vehicle.py
index c1a42f56..8165f5c5 100644
--- a/conflowgen/domain_models/vehicle.py
+++ b/conflowgen/domain_models/vehicle.py
@@ -57,6 +57,8 @@ class LargeScheduledVehicle(BaseModel):
     vehicle_name = CharField(
         null=False,
         default=lambda: "no-name-" + str(uuid.uuid4()),
+        help_text="The name of the vehicle. This might help the user of the data to track each vehicle, so it is "
+                  "preferably unique."
     )
     capacity_in_teu = IntegerField(
         null=False,
diff --git a/conflowgen/tests/application/services/test_export_container_flow_service.py b/conflowgen/tests/application/services/test_export_container_flow_service.py
index 4958aa86..63b7cd3f 100644
--- a/conflowgen/tests/application/services/test_export_container_flow_service.py
+++ b/conflowgen/tests/application/services/test_export_container_flow_service.py
@@ -6,8 +6,10 @@
 import numpy as np
 import pandas as pd
 from peewee import IntegerField, Model, SqliteDatabase
+import yaml
 
 from conflowgen.application.data_types.export_file_format import ExportFileFormat
+from conflowgen.application.models.container_flow_generation_properties import ContainerFlowGenerationProperties
 from conflowgen.application.services.export_container_flow_service import (
     CastingException,
     ExportContainerFlowService,
@@ -33,8 +35,6 @@
     TruckArrivalInformationForPickup,
 )
 
-# pylint: disable=protected-access, unused-argument, redundant-unittest-assert
-
 
 class DummyModel:
     """Dummy model that mocks a Peewee ORM model."""
@@ -74,6 +74,7 @@ def setUpClass(cls):
             Feeder,
             Barge,
             DeepSeaVessel,
+            ContainerFlowGenerationProperties,
         ]  # type: ignore[attr-defined]
 
         cls._orig_model_dbs = {m: getattr(m, "_meta").database for m in cls._all_models}
@@ -111,24 +112,53 @@ def test_save_as_csv_xls_xlsx(self):
         df = mock.Mock()
 
         # CSV
-        ExportContainerFlowService._save_as_csv(df, "file.csv")
+        ExportContainerFlowService._save_as_csv(df, "file.csv")  # pylint: disable=protected-access
         df.to_csv.assert_called_once_with("file.csv")
         with self.assertRaises(AssertionError):
-            ExportContainerFlowService._save_as_csv(df, "bad.txt")
+            ExportContainerFlowService._save_as_csv(df, "bad.txt")  # pylint: disable=protected-access
 
         # XLS
-        ExportContainerFlowService._save_as_xls(df, "file.xls")
+        ExportContainerFlowService._save_as_xls(df, "file.xls")  # pylint: disable=protected-access
         df.to_excel.assert_called_with("file.xls")
         with self.assertRaises(AssertionError):
-            ExportContainerFlowService._save_as_xls(df, "wrong.xlsx")
+            ExportContainerFlowService._save_as_xls(df, "wrong.xlsx")  # pylint: disable=protected-access
 
         # XLSX
-        ExportContainerFlowService._save_as_xlsx(df, "file.xlsx")
+        ExportContainerFlowService._save_as_xlsx(df, "file.xlsx")  # pylint: disable=protected-access
         df.to_excel.assert_called_with("file.xlsx")
         with self.assertRaises(AssertionError):
-            ExportContainerFlowService._save_as_xlsx(df, "wrong.xls")
+            ExportContainerFlowService._save_as_xlsx(df, "wrong.xls")  # pylint: disable=protected-access
+
+    def test_get_metadata(self):
+        container_metadata = ExportContainerFlowService._get_metadata_of_model(Container)  # pylint: disable=protected-access
+        self.assertIn("storage_requirement", container_metadata.keys())
+
+    def test_get_metadata_single(self):
+        cfgp = ContainerFlowGenerationProperties()
+        start_date = datetime.date(2025, 12, 8)
+        cfgp.start_date = start_date
+        cfgp.save()
+        container_flow_generation_properties_metadata = ExportContainerFlowService._get_metadata_of_model(  # pylint: disable=protected-access
+            ContainerFlowGenerationProperties, single=True
+        )
+        self.assertIn("start_date", container_flow_generation_properties_metadata.keys())
+        self.assertIn("Explanation", container_flow_generation_properties_metadata["start_date"].keys())
+        self.assertIn("Value", container_flow_generation_properties_metadata["start_date"].keys())
+        self.assertEqual(container_flow_generation_properties_metadata["start_date"]["Value"], start_date)
+        self.assertEqual(
+            container_flow_generation_properties_metadata["start_date"]["Explanation"],
+            "The first day of the generated container flow"
+        )
 
-    # Conversion helpers
+    def test_save_metadata(self):
+        cfgp = ContainerFlowGenerationProperties()
+        start_date = datetime.date(2025, 12, 8)
+        cfgp.start_date = start_date
+        cfgp.save()
+        with (mock.patch.object(yaml, "dump"),
+              mock.patch("builtins.open") as mock_file):
+            ExportContainerFlowService._save_metadata("my/funny/path/")  # pylint: disable=protected-access
+            mock_file.assert_called_once_with("my/funny/path/metadata.yaml", "w", encoding='utf-8')
 
     def test_convert_table_to_pandas_dataframe_exceptions(self):
         """
@@ -144,7 +174,7 @@ def test_convert_table_to_pandas_dataframe_exceptions(self):
 
         with mock.patch.object(pd.DataFrame, "drop", return_value=pd.DataFrame(fake_rows)):
             with self.assertRaises(RuntimeError):
-                ExportContainerFlowService._convert_table_to_pandas_dataframe(fake_select)
+                ExportContainerFlowService._convert_table_to_pandas_dataframe(fake_select)  # pylint: disable=protected-access
 
         fake_rows = [{"id": 1, "f": np.float64(2.0)}]
         fake_select.dicts.return_value = fake_rows
@@ -169,7 +199,7 @@
 
         with mock.patch.object(pd, "DataFrame", return_value=fake_df):
             with self.assertRaises(CastingException):
-                ExportContainerFlowService._convert_table_to_pandas_dataframe(fake_select)
+                ExportContainerFlowService._convert_table_to_pandas_dataframe(fake_select)  # pylint: disable=protected-access
 
     def test_convert_sql_database_to_pandas_dataframe(self):
         """Covers lines 234–253."""
@@ -188,7 +218,7 @@ def side_effect(model, resolved_column=None):
             ),
             mock.patch.object(ExportContainerFlowService, "logger") as log,
         ):
-            result = ExportContainerFlowService._convert_sql_database_to_pandas_dataframe()
+            result = ExportContainerFlowService._convert_sql_database_to_pandas_dataframe()  # pylint: disable=protected-access
 
         self.assertIn("containers", result)
         self.assertIn("trucks", result)
@@ -200,7 +230,7 @@ def side_effect(model, resolved_column=None):
 
     def test_export_creates_folder_and_saves_csv(self):
         """Covers 264 and 267–268."""
-        svc = ExportContainerFlowService()
+        ecfs = ExportContainerFlowService()
         fake_dfs = {
             "containers": pd.DataFrame([{"id": 1}]).set_index("id"),
             "trucks": pd.DataFrame([{"id": 2}]).set_index("id"),
@@ -216,8 +246,13 @@
                 return_value=fake_dfs,
             ),
             mock.patch.object(pd.DataFrame, "to_csv") as to_csv,
+            mock.patch.object(
+                ExportContainerFlowService,
+                "_save_metadata",
+                return_value=None,
+            )
         ):
-            out = svc.export("run1", None, ExportFileFormat.csv, overwrite=False)
+            out = ecfs.export("run1", None, ExportFileFormat.csv, overwrite=False)
 
         makedirs.assert_called_once_with(EXPORTS_DEFAULT_DIR, exist_ok=True)
         mkdir.assert_called_once()
@@ -226,7 +261,7 @@
 
     def test_export_existing_folder_overwrite_behavior(self):
         """Covers lines 278 and 280 for overwrite True/False."""
-        svc = ExportContainerFlowService()
+        ecfs = ExportContainerFlowService()
         fake_dfs = {"containers": pd.DataFrame([{"id": 1}]).set_index("id")}
 
         with (
@@ -238,11 +273,16 @@
             ),
             mock.patch.object(pd.DataFrame, "to_csv") as to_csv,
             mock.patch.object(ExportContainerFlowService, "logger"),
+            mock.patch.object(
+                ExportContainerFlowService,
+                "_save_metadata",
+                return_value=None,
+            )
         ):
             with self.assertRaises(ExportOnlyAllowedToNotExistingFolderException):
svc.export("exists", "X", ExportFileFormat.csv, overwrite=False) + ecfs.export("exists", "X", ExportFileFormat.csv, overwrite=False) - out = svc.export("exists", "X", ExportFileFormat.csv, overwrite=True) + out = ecfs.export("exists", "X", ExportFileFormat.csv, overwrite=True) to_csv.assert_called_once() self.assertTrue(out.endswith(os.path.join("X", "exists"))) @@ -271,7 +311,7 @@ def test_convert_table_to_pandas_dataframe_resolved_column(self): return_value=True, ), ): - ExportContainerFlowService._convert_table_to_pandas_dataframe( + ExportContainerFlowService._convert_table_to_pandas_dataframe( # pylint: disable=protected-access fake_select, resolved_column="col_x" ) @@ -307,7 +347,7 @@ class Meta: svc.foreign_keys_to_resolve = {Child: {"parent_id": Parent}} with mock.patch.object(svc, "debug_once") as dbg: - ExportContainerFlowService._convert_table_to_pandas_dataframe(Child) + ExportContainerFlowService._convert_table_to_pandas_dataframe(Child) # pylint: disable=protected-access dbg.assert_called_with(mock.ANY) except TypeError: @@ -318,7 +358,7 @@ class Meta: db.close() def test_none_foreign_key(self): - """Light weight integration test hitting line 165.""" + """Lightweight integration test hitting line 165.""" db = SqliteDatabase(":memory:") try: database_proxy.initialize(db) @@ -350,7 +390,7 @@ def test_none_foreign_key(self): destination=None, ) - ExportContainerFlowService._convert_table_to_pandas_dataframe(Container) + ExportContainerFlowService._convert_table_to_pandas_dataframe(Container) # pylint: disable=protected-access except TypeError: pass finally: @@ -377,7 +417,7 @@ def raise_keyerror(*_, **__): } with self.assertRaises(RuntimeError): - ExportContainerFlowService._convert_table_to_pandas_dataframe(Container) + ExportContainerFlowService._convert_table_to_pandas_dataframe(Container) # pylint: disable=protected-access finally: pd.DataFrame.drop = original_drop ExportContainerFlowService.columns_to_drop = original_columns @@ -396,7 +436,7 @@ def test_columns_to_drop(self): return_value=df_mock, ), ): - out = svc._convert_table_to_pandas_dataframe(DummyModel) + out = svc._convert_table_to_pandas_dataframe(DummyModel) # pylint: disable=protected-access self.assertIsInstance(out, pd.DataFrame) except TypeError: pass @@ -422,6 +462,6 @@ def rename_spy(*args, **kwargs): return_value=df_mock, ), ): - svc._convert_table_to_pandas_dataframe(DummyModel) + svc._convert_table_to_pandas_dataframe(DummyModel) # pylint: disable=protected-access self.assertTrue(rename_called["hit"]) diff --git a/docs/notebooks/first_steps.ipynb b/docs/notebooks/first_steps.ipynb index bcc31d36..daccdadc 100644 --- a/docs/notebooks/first_steps.ipynb +++ b/docs/notebooks/first_steps.ipynb @@ -442,7 +442,11 @@ "id": "0949540f-f1c4-4a8e-a86c-21fb1dddcd16", "metadata": {}, "source": [ - "Corresponding CSV files exist for the other vehicles as well." + "Corresponding CSV files exist for the other vehicles as well.\n", + "\n", + "In the same folder, you also find a file called `metadata.yaml`.\n", + "It contains an explanation for each column of each CSV file.\n", + "In addition, it includes some general metadata, such as the ConFlowGen version used and the date and time of the scenario generation." 
    ]
   }
  ],
diff --git a/pyproject.toml b/pyproject.toml
index 6cc83ee3..4a7908f5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,8 +20,9 @@ dependencies = [
 
     # data export
     'numpy',  # used in combination with pandas for column types
-    'pandas >=1',  # CSV/Excel import and export
+    'pandas >=1',  # CSV/Excel export
     'openpyxl',  # optional dependency of pandas that is compulsory for xlsx export
+    'PyYAML',  # export of metadata
 
     # internal data keeping
     'peewee >=3',  # ORM mapper
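
For illustration, the `metadata.yaml` written by `_save_metadata` could look roughly like the sketch below (only a subset of the sections is shown, in the alphabetical key order that `yaml.dump` produces by default). This is an assumed example, not captured output: the `start_date` entry and the `vehicle_name` help text are grounded in the diff above (the latter reaches the `deep_sea_vessels` section through the foreign-key resolution in `_get_metadata_of_model`), while the `storage_requirement` help text is a hypothetical placeholder for whatever `help_text` is defined on the Container model.

    container:
      storage_requirement: <help text defined on the Container model>
    deep_sea_vessels:
      vehicle_name: The name of the vehicle. This might help the user of the data to
        track each vehicle, so it is preferably unique.
    general:
      start_date:
        Explanation: The first day of the generated container flow
        Value: 2025-12-08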