From ef936f2b8933718eb838e10abcf7519ff7c8b633 Mon Sep 17 00:00:00 2001
From: Deniz Tepe
Date: Fri, 19 Aug 2022 09:25:51 +0200
Subject: [PATCH 01/13] Validate parameter data and differentiate between api
 and bulk data #308

---
 open_mastr/utils/helpers.py | 42 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py
index 3df3da43..6a4add91 100644
--- a/open_mastr/utils/helpers.py
+++ b/open_mastr/utils/helpers.py
@@ -75,7 +75,7 @@ def validate_parameter_format_for_mastr_init(engine) -> None:
 
 def validate_parameter_format_for_download_method(
     method,
-    technology,
+    data,
     bulk_date_string,
     bulk_cleansing,
     api_processes,
@@ -87,7 +87,7 @@ def validate_parameter_format_for_download_method(
 ) -> None:
 
     validate_parameter_method(method)
-    validate_parameter_technology(technology)
+    validate_parameter_data(method, data)
     validate_parameter_bulk_date_string(bulk_date_string)
     validate_parameter_bulk_cleansing(bulk_cleansing)
     validate_parameter_api_processes(api_processes)
@@ -202,13 +202,13 @@ def validate_parameter_api_processes(api_processes) -> None:
         )
 
 
-def validate_parameter_technology(technology) -> None:
-    if not isinstance(technology, (str, list)) and technology is not None:
-        raise ValueError("parameter technology has to be a string, list, or None")
-    if isinstance(technology, str):
-        technology = [technology]
-    if isinstance(technology, list):
-        bulk_technologies = [
+def validate_parameter_data(method, data) -> None:
+    if not isinstance(data, (str, list)) and data is not None:
+        raise ValueError("parameter data has to be a string, list, or None")
+    if isinstance(data, str):
+        data = [data]
+    if isinstance(data, list):
+        bulk_data = [
             "wind",
             "solar",
             "biomass",
@@ -225,14 +225,32 @@ def validate_parameter_technology(technology) -> None:
             "balancing_area",
             "permit",
         ]
-        for value in technology:
-            if value not in bulk_technologies:
+        api_data = [
+            "wind",
+            "solar",
+            "biomass",
+            "hydro",
+            "gsgk",
+            "combustion",
+            "nuclear",
+            "storage",
+            "location",
+            "permit",
+        ]
+        for value in data:
+            if method == "bulk" and value not in bulk_data:
                 raise ValueError(
-                    'Allowed values for parameter technology are "wind", "solar",'
+                    'Allowed values for parameter data with bulk method are "wind", "solar",'
                     '"biomass", "hydro", "gsgk", "combustion", "nuclear", "gas", '
                     '"storage", "electricity_consumer", "location", "market", '
                     '"grid", "balancing_area" or "permit"'
                 )
+            if method == "API" and value not in api_data:
+                raise ValueError(
+                    'Allowed values for parameter data with API method are "wind", "solar", '
+                    '"biomass", "hydro", "gsgk", "combustion", "nuclear", '
+                    '"storage", "location" or "permit"'
+                )
 
 
 def raise_warning_for_invalid_parameter_combinations(

From d2f3ac0dddaee8ceabc82ba78815241a6f99e7c1 Mon Sep 17 00:00:00 2001
From: Deniz Tepe
Date: Fri, 19 Aug 2022 11:37:58 +0200
Subject: [PATCH 02/13] Rename technology to data #308

The parameter 'technology' is replaced with 'data' wherever it covers
more than technologies alone. This requires a thorough scan of the
scripts for consistent refactoring; the documentation must be adapted
as well.
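For illustration, the effect of the rename at the public interface
(a minimal sketch, assuming the package-level import; main.py in this
series shows the full call):

    from open_mastr import Mastr

    db = Mastr()
    # before this series:
    # db.download(method="bulk", technology=["wind", "solar"])
    # after the rename:
    db.download(method="bulk", data=["wind", "solar"])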
--- main.py | 8 +- open_mastr/mastr.py | 54 +++--- open_mastr/soap_api/download.py | 166 ++++++++---------- open_mastr/soap_api/mirror.py | 100 +++++------ open_mastr/utils/helpers.py | 24 +-- .../xml_download/utils_write_to_database.py | 34 ++-- tests/soap_api/test_download.py | 2 +- tests/soap_api/test_mastr_mirror.py | 10 +- tests/test_helpers.py | 16 +- .../test_utils_write_to_database.py | 16 +- 10 files changed, 205 insertions(+), 225 deletions(-) diff --git a/main.py b/main.py index ea8a4eeb..64ab40f3 100644 --- a/main.py +++ b/main.py @@ -17,7 +17,7 @@ # bulk download bulk_date_string = "today" bulk_cleansing = True -technology_bulk = [ +data_bulk = [ "biomass", "combustion", "gsgk", @@ -43,7 +43,7 @@ api_limit = 10 api_processes = None -technology_api = [ +data_api = [ "biomass", "combustion", "gsgk", @@ -71,7 +71,7 @@ # bulk download db.download( method="bulk", - technology=technology_bulk, + data=data_bulk, bulk_date_string="today", bulk_cleansing=True, ) @@ -83,7 +83,7 @@ api_chunksize=api_chunksize, api_limit=api_limit, api_processes=api_processes, - technology=technology_api, + data=data_api, api_data_types=api_data_types, api_location_types=api_location_types, ) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 7c3a3a36..60388da2 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -11,14 +11,18 @@ from open_mastr.soap_api.mirror import MaStRMirror from open_mastr.utils.helpers import ( - technology_input_harmonisation, + data_input_harmonisation, print_api_settings, validate_api_credentials, ) -from open_mastr.utils.config import create_data_dir, get_data_version_dir +from open_mastr.utils.config import ( + create_data_dir, + get_data_version_dir, + get_project_home_dir, +) +import open_mastr.utils.orm as orm from open_mastr.utils.data_io import cleaned_data - # import initialize_database dependencies from open_mastr.utils.helpers import ( create_database_engine, @@ -26,8 +30,6 @@ validate_parameter_format_for_mastr_init, parse_date_string, ) -import open_mastr.utils.orm as orm -from open_mastr.utils.config import get_project_home_dir class Mastr: @@ -65,7 +67,7 @@ def __init__(self, engine="sqlite") -> None: def download( self, method="bulk", - technology=None, + data=None, bulk_date_string="today", bulk_cleansing=True, api_processes=None, @@ -86,12 +88,12 @@ def download( zipped bulk download or via the MaStR API. The latter requires an account from marktstammdatenregister.de, (see :ref:`Configuration `). Default to 'bulk'. - technology: str or list or None, optional - Determines which technologies are written to the database. If None, all technologies are + data: str or list or None, optional + Determines which types of data are written to the database. If None, all data is used. If it is a list, possible entries are "wind", "solar", "biomass", "hydro", "gsgk", "combustion", "nuclear", "gas", "storage", "electricity_consumer", "location", "market", - "grid", "balancing_area" or "permit". If only one technology is of interest, this can be - given as a string. Default to None, where all technologies are included. + "grid", "balancing_area" or "permit". If only one data is of interest, this can be + given as a string. Default to None, where all data is included. bulk_date_string: str, optional Either "today" if the newest data dump should be downloaded from the MaStR website. 
If an already downloaded dump should be used, state the date of the download in the format @@ -146,7 +148,7 @@ def download( validate_parameter_format_for_download_method( method=method, - technology=technology, + data=data, bulk_date_string=bulk_date_string, bulk_cleansing=bulk_cleansing, api_processes=api_processes, @@ -172,17 +174,17 @@ def download( write_mastr_xml_to_database( engine=self.engine, zipped_xml_file_path=zipped_xml_file_path, - technology=technology, + data=data, bulk_cleansing=bulk_cleansing, bulk_download_date=bulk_download_date, ) if method == "API": validate_api_credentials() - if isinstance(technology, str): - technology = [technology] - elif technology is None: - technology = [ + if isinstance(data, str): + data = [data] + elif data is None: + data = [ "wind", "biomass", "combustion", @@ -192,12 +194,8 @@ def download( "storage", "solar", ] - ( - harm_log, - api_data_types, - api_location_types, - ) = technology_input_harmonisation( - technology=technology, + (harm_log, api_data_types, api_location_types,) = data_input_harmonisation( + data=data, api_data_types=api_data_types, api_location_types=api_location_types, ) @@ -212,7 +210,7 @@ def download( print_api_settings( harmonisation_log=harm_log, - technology=technology, + data=data, api_date=api_date, api_data_types=api_data_types, api_chunksize=api_chunksize, @@ -227,11 +225,11 @@ def download( restore_dump=None, ) # Download basic unit data - mastr_mirror.backfill_basic(technology, limit=api_limit, date=api_date) + mastr_mirror.backfill_basic(data, limit=api_limit, date=api_date) # Download additional unit data - for tech in technology: - # mastr_mirror.create_additional_data_requests(tech) + for tech in data: + # mastr_mirror.create_additional_data_requests(data) for data_type in api_data_types: mastr_mirror.retrieve_additional_data( tech, data_type, chunksize=api_chunksize, limit=api_limit @@ -267,7 +265,7 @@ def to_csv( chunksize: int Defines the chunksize of the tables export. Default value is 500.000. limit: None or int - Limits the number of exported technology and location units. + Limits the number of exported data and location units. """ create_data_dir() @@ -336,7 +334,7 @@ def to_csv( if technologies_to_export: # fill basic unit table, after downloading with method = 'bulk' to use API export functions api_export.reverse_fill_basic_units(technology=technologies_to_export) - # export to csv per technology + # export to csv per data api_export.to_csv( technology=technologies_to_export, statistic_flag=None, diff --git a/open_mastr/soap_api/download.py b/open_mastr/soap_api/download.py index 557eb0b4..d0b9ef82 100644 --- a/open_mastr/soap_api/download.py +++ b/open_mastr/soap_api/download.py @@ -427,14 +427,14 @@ def to_csv(df: pd.DataFrame, technology: str, chunk_number: int) -> None: log.info(f"Appended {len(df)} rows to {csv_file.split('/')[-1:]}.") -def _missed_units_to_file(technology, data_type, missed_units): +def _missed_units_to_file(data, data_type, missed_units): """ Write IDs of missed units to file Parameters ---------- - technology : str - Technology, see :meth:`MaStRDownload.download_power_plants` + data : str + Data, see :meth:`MaStRDownload.download_power_plants` data_type : str Which type of additional data. 
Options: 'extended', 'eeg', 'kwk', 'permit' missed_units : list @@ -444,7 +444,7 @@ def _missed_units_to_file(technology, data_type, missed_units): data_path = get_data_version_dir() filenames = get_filenames() missed_units_file = os.path.join( - data_path, filenames["raw"][technology][f"{data_type}_fail"] + data_path, filenames["raw"][data][f"{data_type}_fail"] ) with open(missed_units_file, "w") as f: @@ -600,9 +600,9 @@ def __init__(self, parallel_processes=None): self._mastr_api._user = cred.check_and_set_mastr_user() self._mastr_api._key = cred.check_and_set_mastr_token(self._mastr_api._user) - def download_power_plants(self, technology, limit=None): + def download_power_plants(self, data, limit=None): """ - Download power plant unit data for one technology. + Download power plant unit data for one data type. Based on list with basic information about each unit, subsequently additional data is retrieved: @@ -618,7 +618,7 @@ def download_power_plants(self, technology, limit=None): Parameters ---------- - technology : str + data : str Retrieve unit data for one power system unit. Power plants are grouped by following technologies: @@ -650,39 +650,31 @@ def download_power_plants(self, technology, limit=None): # This was introduced later, after creation of this method units = [ unit - for sublist in self.basic_unit_data(technology=technology, limit=limit) + for sublist in self.basic_unit_data(data=data, limit=limit) for unit in sublist ] # Prepare list of unit ID for different additional data (extended, eeg, kwk, permit) - mastr_ids = self._create_ID_list( - units, "unit_data", "EinheitMastrNummer", technology - ) - eeg_ids = self._create_ID_list(units, "eeg_data", "EegMastrNummer", technology) - kwk_ids = self._create_ID_list(units, "kwk_data", "KwkMastrNummer", technology) - permit_ids = self._create_ID_list( - units, "permit_data", "GenMastrNummer", technology - ) + mastr_ids = self._create_ID_list(units, "unit_data", "EinheitMastrNummer", data) + eeg_ids = self._create_ID_list(units, "eeg_data", "EegMastrNummer", data) + kwk_ids = self._create_ID_list(units, "kwk_data", "KwkMastrNummer", data) + permit_ids = self._create_ID_list(units, "permit_data", "GenMastrNummer", data) # Download additional data for unit extended_data, extended_missed = self.additional_data( - technology, mastr_ids, "extended_unit_data" + data, mastr_ids, "extended_unit_data" ) if eeg_ids: - eeg_data, eeg_missed = self.additional_data( - technology, eeg_ids, "eeg_unit_data" - ) + eeg_data, eeg_missed = self.additional_data(data, eeg_ids, "eeg_unit_data") else: eeg_data = eeg_missed = [] if kwk_ids: - kwk_data, kwk_missed = self.additional_data( - technology, kwk_ids, "kwk_unit_data" - ) + kwk_data, kwk_missed = self.additional_data(data, kwk_ids, "kwk_unit_data") else: kwk_data = kwk_missed = [] if permit_ids: permit_data, permit_missed = self.additional_data( - technology, permit_ids, "permit_unit_data" + data, permit_ids, "permit_unit_data" ) else: permit_data = permit_missed = [] @@ -693,28 +685,28 @@ def download_power_plants(self, technology, limit=None): extended_data_retry, extended_missed_retry, ) = self._retry_missed_additional_data( - technology, [_[0] for _ in extended_missed], "extended_unit_data" + data, [_[0] for _ in extended_missed], "extended_unit_data" ) extended_data.extend(extended_data_retry) - _missed_units_to_file(technology, "extended", extended_missed_retry) + _missed_units_to_file(data, "extended", extended_missed_retry) if eeg_missed: eeg_data_retry, eeg_missed_retry = 
self._retry_missed_additional_data( - technology, [_[0] for _ in eeg_missed], "eeg_unit_data" + data, [_[0] for _ in eeg_missed], "eeg_unit_data" ) eeg_data.extend(eeg_data_retry) - _missed_units_to_file(technology, "eeg", eeg_missed_retry) + _missed_units_to_file(data, "eeg", eeg_missed_retry) if kwk_missed: kwk_data_retry, kwk_missed_retry = self._retry_missed_additional_data( - technology, [_[0] for _ in kwk_missed], "kwk_unit_data" + data, [_[0] for _ in kwk_missed], "kwk_unit_data" ) kwk_data.extend(kwk_data_retry) - _missed_units_to_file(technology, "kwk", kwk_missed_retry) + _missed_units_to_file(data, "kwk", kwk_missed_retry) if permit_missed: permit_data_retry, permit_missed_retry = self._retry_missed_additional_data( - technology, [_[0] for _ in permit_missed], "permit_unit_data" + data, [_[0] for _ in permit_missed], "permit_unit_data" ) permit_data.extend(permit_data_retry) - _missed_units_to_file(technology, "permit", permit_missed_retry) + _missed_units_to_file(data, "permit", permit_missed_retry) # Flatten data extended_data = flatten_dict(extended_data, serialize_with_json=True) @@ -743,30 +735,28 @@ def download_power_plants(self, technology, limit=None): # Remove duplicates joined_data.drop_duplicates(inplace=True) - to_csv(joined_data, technology) + to_csv(joined_data, data) return joined_data - def _create_ID_list(self, units, data_descriptor, key, technology): + def _create_ID_list(self, units, data_descriptor, key, data): """Extracts a list of MaStR numbers (eeg, kwk, or permit Mastr Nr) from the given units.""" return ( [basic[key] for basic in units if basic[key]] - if data_descriptor in self._unit_data_specs[technology] + if data_descriptor in self._unit_data_specs[data] else [] ) - def basic_unit_data( - self, technology=None, limit=2000, date_from=None, max_retries=3 - ): + def basic_unit_data(self, data=None, limit=2000, date_from=None, max_retries=3): """ - Download basic unit information for one technology. + Download basic unit information for one data type. Retrieves basic information about units. The number of unit in bound to `limit`. Parameters ---------- - technology : str, optional + data : str, optional Technology data is requested for. See :meth:`MaStRDownload.download_power_plants` for options. Data is retrieved using :meth:`MaStRAPI.GetGefilterteListeStromErzeuger`. 
@@ -802,18 +792,16 @@ def basic_unit_data( for x in chunks_start ] - # Deal with or w/o technology being specified + # Deal with or w/o data type being specified energietraeger = ( - self._unit_data_specs[technology]["energietraeger"] - if technology - else [None] + self._unit_data_specs[data]["energietraeger"] if data else [None] ) - # In case multiple energy carriers (energietraeger) exist for one technology, + # In case multiple energy carriers (energietraeger) exist for one data type, # loop over these and join data to one list for et in energietraeger: log.info( - f"Get list of units with basic information for technology {technology} ({et})" + f"Get list of units with basic information for data type {data} ({et})" ) yield from basic_data_download( self._mastr_api, @@ -823,7 +811,7 @@ def basic_unit_data( limits, date_from, max_retries, - technology, + data, et=et, ) if et is None else basic_data_download( self._mastr_api, @@ -833,11 +821,11 @@ def basic_unit_data( limits, date_from, max_retries, - technology, + data, et=et, ) - def additional_data(self, technology, unit_ids, data_fcn, timeout=10): + def additional_data(self, data, unit_ids, data_fcn, timeout=10): """ Retrieve addtional informations about units. @@ -847,8 +835,8 @@ def additional_data(self, technology, unit_ids, data_fcn, timeout=10): Parameters ---------- - technology : str - Technology, see :meth:`MaStRDownload.download_power_plants` + data : str + data, see :meth:`MaStRDownload.download_power_plants` unit_ids : list Unit identifier for additional data data_fcn : str @@ -857,10 +845,10 @@ def additional_data(self, technology, unit_ids, data_fcn, timeout=10): * "extended_unit_data" (:meth:`~.extended_unit_data`): Extended information (i.e. technical, location) - about a unit. The exact set of information depends on the technology. + about a unit. The exact set of information depends on the data type. * "eeg_unit_data" (:meth:`~.eeg_unit_data`): Unit Information from EEG unit registry. The exact - set of information depends on the technology. + set of information depends on the data. * "kwk_unit_data" (:meth:`~.kwk_unit_data`): Unit information from KWK unit registry. * "permit_unit_data" (:meth:`~.permit_unit_data`): Information about the permit process of a unit. @@ -880,18 +868,18 @@ def additional_data(self, technology, unit_ids, data_fcn, timeout=10): [tuple("SME930865355925", "Reason for failing dowload"), ...] 
) """ - # Prepare a list of unit IDs packed as tuple associated with technology - prepared_args = list(product(unit_ids, [technology])) + # Prepare a list of unit IDs packed as tuple associated with data + prepared_args = list(product(unit_ids, [data])) # Prepare results lists if self.parallel_processes: data, data_missed = self._retrieve_data_in_parallel_process( - prepared_args, data_fcn, technology, timeout + prepared_args, data_fcn, data, timeout ) else: data, data_missed = self._retrieve_data_in_single_process( - prepared_args, data_fcn, technology + prepared_args, data_fcn, data ) # Remove Nones and empty dicts @@ -909,20 +897,20 @@ def additional_data(self, technology, unit_ids, data_fcn, timeout=10): return data, data_missed - def _retrieve_data_in_single_process(self, prepared_args, data_fcn, technology): + def _retrieve_data_in_single_process(self, prepared_args, data_fcn, data): data = [] data_missed = [] for unit_specs in tqdm( prepared_args, total=len(prepared_args), - desc=f"Downloading {data_fcn} ({technology})", + desc=f"Downloading {data_fcn} ({data})", unit="unit", ): data_tmp, data_missed_tmp = self.__getattribute__(data_fcn)(unit_specs) if not data_tmp: log.debug( f"Download for additional data for " - f"{data_missed_tmp[0]} ({technology}) failed. " + f"{data_missed_tmp[0]} ({data}) failed. " f"Traceback of caught error:\n{data_missed_tmp[1]}" ) data.append(data_tmp) @@ -931,7 +919,7 @@ def _retrieve_data_in_single_process(self, prepared_args, data_fcn, technology): return data, data_missed def _retrieve_data_in_parallel_process( - self, prepared_args, data_fcn, technology, timeout + self, prepared_args, data_fcn, data, timeout ): data = [] data_missed = [] @@ -941,7 +929,7 @@ def _retrieve_data_in_parallel_process( with tqdm( total=len(prepared_args), - desc=f"Downloading {data_fcn} ({technology})", + desc=f"Downloading {data_fcn} ({data})", unit="unit", ) as pbar: unit_result = pool.imap_unordered( @@ -955,7 +943,7 @@ def _retrieve_data_in_parallel_process( if not data_tmp: log.debug( f"Download for additional data for " - f"{data_missed_tmp[0]} ({technology}) failed. " + f"{data_missed_tmp[0]} ({data}) failed. " f"Traceback of caught error:\n{data_missed_tmp[1]}" ) data.append(data_tmp) @@ -966,7 +954,7 @@ def _retrieve_data_in_parallel_process( break except multiprocessing.TimeoutError: # If retrieval time exceeds timeout of next(), pass on - log.debug(f"Data request for 1 {technology} unit timed out") + log.debug(f"Data request for 1 {data} unit timed out") return data, data_missed def extended_unit_data(self, unit_specs): @@ -978,7 +966,7 @@ def extended_unit_data(self, unit_specs): Parameters ---------- unit_specs : tuple - *EinheitMastrNummer* and technology as tuple that for example looks like + *EinheitMastrNummer* and data type as tuple that for example looks like .. code-block:: python @@ -997,10 +985,10 @@ def extended_unit_data(self, unit_specs): tuple("SME930865355925", "Reason for failing dowload") """ - mastr_id, technology = unit_specs + mastr_id, data = unit_specs try: unit_data = self._mastr_api.__getattribute__( - self._unit_data_specs[technology]["unit_data"] + self._unit_data_specs[data]["unit_data"] )(einheitMastrNummer=mastr_id) unit_missed = None except ( @@ -1027,7 +1015,7 @@ def eeg_unit_data(self, unit_specs): Parameters ---------- unit_specs : tuple - *EegMastrnummer* and technology as tuple that for example looks like + *EegMastrnummer* and data type as tuple that for example looks like .. 
code-block:: python @@ -1045,10 +1033,10 @@ def eeg_unit_data(self, unit_specs): tuple("EEG961554380393", "Reason for failing dowload") """ - eeg_id, technology = unit_specs + eeg_id, data = unit_specs try: eeg_data = self._mastr_api.__getattribute__( - self._unit_data_specs[technology]["eeg_data"] + self._unit_data_specs[data]["eeg_data"] )(eegMastrNummer=eeg_id) eeg_missed = None except ( @@ -1075,7 +1063,7 @@ def kwk_unit_data(self, unit_specs): Parameters ---------- unit_specs : tuple - *KwkMastrnummer* and technology as tuple that for example looks like + *KwkMastrnummer* and data type as tuple that for example looks like .. code-block:: python @@ -1094,10 +1082,10 @@ def kwk_unit_data(self, unit_specs): tuple("KWK910493229164", "Reason for failing dowload") """ - kwk_id, technology = unit_specs + kwk_id, data = unit_specs try: kwk_data = self._mastr_api.__getattribute__( - self._unit_data_specs[technology]["kwk_data"] + self._unit_data_specs[data]["kwk_data"] )(kwkMastrNummer=kwk_id) kwk_missed = None except ( @@ -1121,7 +1109,7 @@ def permit_unit_data(self, unit_specs): Parameters ---------- unit_specs : tuple - *GenMastrnummer* and technology as tuple that for example looks like + *GenMastrnummer* and data type as tuple that for example looks like .. code-block:: python @@ -1140,10 +1128,10 @@ def permit_unit_data(self, unit_specs): tuple("GEN952474728808", "Reason for failing dowload") """ - permit_id, technology = unit_specs + permit_id, data = unit_specs try: permit_data = self._mastr_api.__getattribute__( - self._unit_data_specs[technology]["permit_data"] + self._unit_data_specs[data]["permit_data"] )(genMastrNummer=permit_id) permit_missed = None except ( @@ -1209,9 +1197,7 @@ def location_data(self, specs): return data, missed - def _retry_missed_additional_data( - self, technology, missed_ids, data_fcn, retries=3 - ): + def _retry_missed_additional_data(self, data, missed_ids, data_fcn, retries=3): """ Retry to download extended data that was missed earlier. @@ -1219,8 +1205,8 @@ def _retry_missed_additional_data( Parameters ---------- - technology : str - Technology, see :meth:`MaStRDownload.download_power_plants` + data : str + data, see :meth:`MaStRDownload.download_power_plants` missed_ids : list Unit identifiers for additional data data_fcn : str @@ -1236,7 +1222,7 @@ def _retry_missed_additional_data( log.info( f"Retrying to download additional data for {len(missed_ids)} " - f"{technology} units with {retries} retries" + f"{data} units with {retries} retries" ) data = [] @@ -1244,7 +1230,7 @@ def _retry_missed_additional_data( missed_ids_remaining = missed_ids for _ in range(1, retries + 1): data_tmp, missed_ids_tmp = self.additional_data( - technology, missed_ids_remaining, data_fcn + data, missed_ids_remaining, data_fcn ) if data_tmp: data.extend(data_tmp) @@ -1336,7 +1322,7 @@ def basic_data_download( limits, date_from, max_retries, - technology=None, + data=None, et=None, ): """ @@ -1367,11 +1353,11 @@ def basic_data_download( Date for querying only newer data than this date max_retries: int Number of maximum retries for each chunk - technology: str, optional + data: str, optional Choose a subset from available technologies. Only relevant if category="Einheiten". Defaults to all technologies. et: str - Energietraeger of a technology. Some technologies are subdivided into a list of + Energietraeger of a data type. Some technologies are subdivided into a list of energietraeger. Only relevant if category="Einheiten". Defaults to None. 
Yields @@ -1382,8 +1368,8 @@ def basic_data_download( # Construct description string description = f"Get basic {category} data information" - if technology: - description += f" for technology {technology}" + if data: + description += f" for data {data}" if et: description += f" ({et})" @@ -1392,7 +1378,7 @@ def basic_data_download( # Iterate over chunks and download data # Results are first collected per 'et' (units_tech) for properly # displaying download progress. - # Later, all units of a single technology are collected in 'units' + # Later, all units of a single data are collected in 'units' for chunk_start, limit_iter in zip(chunks_start, limits): # Use a retry loop to retry on connection errors for try_number in range(max_retries + 1): @@ -1440,7 +1426,7 @@ def basic_data_download( if response["Ergebniscode"] == "OkWeitereDatenVorhanden": continue - # Update progress bar and move on with next et or technology + # Update progress bar and move on with next et or data type pbar.total = pbar.n pbar.refresh() pbar.close() diff --git a/open_mastr/soap_api/mirror.py b/open_mastr/soap_api/mirror.py index 218624c6..49c9a2b0 100644 --- a/open_mastr/soap_api/mirror.py +++ b/open_mastr/soap_api/mirror.py @@ -67,7 +67,7 @@ class MaStRMirror: mastr_mirror.retrieve_additional_data("solar", ["unit_data"]) - The data can be joined to one table for each technology and exported to + The data can be joined to one table for each data type and exported to CSV files using :meth:`~.to_csv`. Also consider to use :meth:`~.dump` and :meth:`~.restore` for specific purposes. @@ -145,7 +145,7 @@ def __init__( }, } - # Map technology and MaStR unit type + # Map data and MaStR unit type # Map technologies on ORMs self.unit_type_map = { "Windeinheit": "wind", @@ -167,7 +167,7 @@ def __init__( } self.unit_type_map_reversed = {v: k for k, v in self.unit_type_map.items()} - def backfill_basic(self, technology=None, date=None, limit=10**8) -> None: + def backfill_basic(self, data=None, date=None, limit=10 ** 8) -> None: """Backfill basic unit data. Fill database table 'basic_units' with data. It allows specification @@ -178,10 +178,10 @@ def backfill_basic(self, technology=None, date=None, limit=10**8) -> None: Parameters ---------- - technology: str or list - Specify technologies for which data should be backfilled. + data: str or list + Specify data types for which data should be backfilled. - * 'solar' (`str`): Backfill data for a single technology. + * 'solar' (`str`): Backfill data for a single data type. * ['solar', 'wind'] (`list`): Backfill data for multiple technologies given in a list. * `None`: Backfill data for all technologies @@ -192,13 +192,13 @@ def backfill_basic(self, technology=None, date=None, limit=10**8) -> None: Only data with modification time stamp greater that `date` is retrieved. - * `datetime.datetime(2020, 11, 27)`: Retrieve data which is is newer + * `datetime.datetime(2020, 11, 27)`: Retrieve data which is newer than this time stamp * 'latest': Retrieve data which is newer than the newest data already in the table. - It is aware of a different 'latest date' for each technology. + It is aware of a different 'latest date' for each data. Hence, it works in combination with - `technology=None` and `technology=["wind", "solar"]` for example. + `data=None` and `data=["wind", "solar"]` for example. .. 
warning:: @@ -214,20 +214,20 @@ def backfill_basic(self, technology=None, date=None, limit=10**8) -> None: """ # Create list of technologies to backfill - if isinstance(technology, str): - technology_list = [technology] - elif technology is None: - technology_list = [None] - elif isinstance(technology, list): - technology_list = technology + if isinstance(data, str): + data_list = [data] + elif data is None: + data_list = [None] + elif isinstance(data, list): + data_list = data - dates = self._get_list_of_dates(date, technology_list) + dates = self._get_list_of_dates(date, data_list) - for tech, date in zip(technology_list, dates): - self._write_basic_data_for_one_technology_to_db(tech, date, limit) + for data, date in zip(data_list, dates): + self._write_basic_data_for_one_data_type_to_db(data, date, limit) def backfill_locations_basic( - self, limit=10**7, date=None, delete_additional_data_requests=True + self, limit=10 ** 7, date=None, delete_additional_data_requests=True ): """ Backfill basic location data. @@ -246,7 +246,7 @@ def backfill_locations_basic( Only data with modification time stamp greater that `date` is retrieved. - * `datetime.datetime(2020, 11, 27)`: Retrieve data which is is newer than + * `datetime.datetime(2020, 11, 27)`: Retrieve data which is newer than this time stamp * 'latest': Retrieve data which is newer than the newest data already in the table. .. warning:: @@ -330,7 +330,7 @@ def backfill_locations_basic( ) def retrieve_additional_data( - self, technology, data_type, limit=10**8, chunksize=1000 + self, data, data_type, limit=10 ** 8, chunksize=1000 ): """ Retrieve additional unit data @@ -342,7 +342,7 @@ def retrieve_additional_data( Parameters ---------- - technology: `str` + data: `str` See list of available technologies in :meth:`open_mastr.soap_api.download.py.MaStRDownload.download_power_plants`. data_type: `str` @@ -379,7 +379,7 @@ def retrieve_additional_data( table_identifier="additional_data", session=session, data_request_type=data_type, - technology=technology, + data=data, chunksize=chunksize, ) @@ -389,7 +389,7 @@ def retrieve_additional_data( # Retrieve data unit_data, missed_units = self.mastr_dl.additional_data( - technology, requested_ids, download_functions[data_type] + data, requested_ids, download_functions[data_type] ) unit_data = flatten_dict(unit_data) @@ -398,7 +398,7 @@ def retrieve_additional_data( # Prepare data and add to database table for unit_dat in unit_data: unit = self._preprocess_additional_data_entry( - unit_dat, technology, data_type + unit_dat, data, data_type ) session.merge(unit) number_units_merged += 1 @@ -422,7 +422,7 @@ def retrieve_additional_data( break def retrieve_additional_location_data( - self, location_type, limit=10**8, chunksize=1000 + self, location_type, limit=10 ** 8, chunksize=1000 ): """ Retrieve extended location data @@ -462,7 +462,7 @@ def retrieve_additional_location_data( table_identifier="additional_location_data", session=session, data_request_type=location_type, - technology=None, + data=None, chunksize=chunksize, ) @@ -529,7 +529,7 @@ def retrieve_additional_location_data( def create_additional_data_requests( self, - technology, + data, data_types=["unit_data", "eeg_data", "kwk_data", "permit_data"], delete_existing=True, ): @@ -542,12 +542,12 @@ def create_additional_data_requests( Parameters ---------- - technology: str - Specify technology additional data should be requested for. + data: str + Specify data type, additional data should be requested for. 
data_types: list Select type of additional data that is to be requested. Defaults to all data that is available for a - technology. + data type. delete_existing: bool Toggle deletion of already existing requests for additional data. Defaults to True. @@ -558,25 +558,25 @@ def create_additional_data_requests( with session_scope(engine=self._engine) as session: # Check which additional data is missing for data_type in data_types: - if data_type_available := self.orm_map[technology].get(data_type, None): + if data_type_available := self.orm_map[data].get(data_type, None): log.info( - f"Create requests for additional data of type {data_type} for {technology}" + f"Create requests for additional data of type {data_type} for {data}" ) - # Get ORM for additional data by technology and data_type + # Get ORM for additional data by data and data_type additional_data_orm = getattr(orm, data_type_available) - # Delete prior additional data requests for this technology and data_type + # Delete prior additional data requests for this data and data_type if delete_existing: session.query(orm.AdditionalDataRequested).filter( - orm.AdditionalDataRequested.technology == technology, + orm.AdditionalDataRequested.technology == data, orm.AdditionalDataRequested.data_type == data_type, ).delete() session.commit() # Query database for missing additional data units_for_request = self._get_units_for_request( - data_type, session, additional_data_orm, technology + data_type, session, additional_data_orm, data ) # Prepare data for additional data request @@ -611,7 +611,7 @@ def create_additional_data_requests( session.bulk_insert_mappings(orm.AdditionalDataRequested, data_requests) def _add_data_source_and_download_date(self, entry: dict) -> dict: - "Adds DatenQuelle = 'APT' and DatumDownload = date.today" + """Adds DatenQuelle = 'APT' and DatumDownload = date.today""" entry["DatenQuelle"] = "API" entry["DatumDownload"] = date.today() return entry @@ -742,11 +742,11 @@ def _create_inserted_and_updated_list( session.commit() return insert + updated - def _write_basic_data_for_one_technology_to_db(self, tech, date, limit) -> None: - log.info(f"Backfill data for technology {tech}") + def _write_basic_data_for_one_data_type_to_db(self, data, date, limit) -> None: + log.info(f"Backfill data for data type {data}") # Catch weird MaStR SOAP response - basic_units = self.mastr_dl.basic_unit_data(tech, limit, date_from=date) + basic_units = self.mastr_dl.basic_unit_data(data, limit, date_from=date) with session_scope(engine=self._engine) as session: log.info( @@ -817,7 +817,7 @@ def _get_list_of_dates(self, date, technology_list) -> list: for tech in technology_list: if tech: # In case technologies are specified, latest data date - # gets queried per technology + # gets queried per data with session_scope(engine=self._engine) as session: newest_date = ( session.query(orm.BasicUnit.DatumLetzteAktualisierung) @@ -829,7 +829,7 @@ def _get_list_of_dates(self, date, technology_list) -> list: .first() ) else: - # If technologies aren't defined ([None]) latest date per technology + # If technologies aren't defined ([None]) latest date per data # is queried in query # This also leads that the remainder of the loop body is skipped with session_scope(engine=self._engine) as session: @@ -841,7 +841,7 @@ def _get_list_of_dates(self, date, technology_list) -> list: ).group_by(orm.BasicUnit.Einheittyp) dates = [s[1] for s in subquery] technology_list = [self.unit_type_map[s[0]] for s in subquery] - # Break the for loop over technology here, 
because we + # Break the for loop over data here, because we # write technology_list and dates at once break @@ -937,7 +937,7 @@ def _preprocess_additional_data_entry(self, unit_dat, technology, data_type): return getattr(orm, self.orm_map[technology][data_type])(**unit_dat) def _get_additional_data_requests_from_db( - self, table_identifier, session, data_request_type, technology, chunksize + self, table_identifier, session, data_request_type, data, chunksize ): """Retrieves the data that is requested from the database table AdditionalDataRequested.""" if table_identifier == "additional_data": @@ -946,7 +946,7 @@ def _get_additional_data_requests_from_db( .filter( and_( orm.AdditionalDataRequested.data_type == data_request_type, - orm.AdditionalDataRequested.technology == technology, + orm.AdditionalDataRequested.technology == data, ) ) .limit(chunksize) @@ -1141,7 +1141,7 @@ def to_csv( if isinstance(technology, str): technology = [technology] elif not isinstance(technology, (list, None)): - raise TypeError("Parameter technology must be of type `str` or `list`") + raise TypeError("Parameter data must be of type `str` or `list`") renaming = column_renaming() @@ -1199,7 +1199,7 @@ def to_csv( ) query = Query(subtables, session=session) - # Define joins based on available tables for tech and user input + # Define joins based on available tables for data and user input if unit_data_orm and "unit_data" in additional_data: query = query.join( unit_data_orm, @@ -1415,9 +1415,9 @@ def list_of_dicts_to_columns(row) -> pd.Series: Parameters ---------- row: list of dict - Usually apllied using apply on a column of a pandas DataFrame, + Usually applied using apply on a column of a pandas DataFrame, hence, a Series. This column of the - DataFrame should comprise of a single-level dict with an + DataFrame should consist of a single-level dict with an arbitrary number of columns. Each key is transformed into a new column, while data from each dict inside the list is concatenated by key. 
Such diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index 6a4add91..bce2a886 100644 --- a/open_mastr/utils/helpers.py +++ b/open_mastr/utils/helpers.py @@ -305,18 +305,18 @@ def session_scope(engine): session.close() -def technology_input_harmonisation(technology, api_data_types, api_location_types): +def data_input_harmonisation(data, api_data_types, api_location_types): harmonisation_log = [] - if "permit" in technology: - technology.remove("permit") + if "permit" in data: + data.remove("permit") api_data_types.append( "permit_data" ) if "permit_data" not in api_data_types else api_data_types harmonisation_log.append("permit") - if "location" in technology: - technology.remove("location") + if "location" in data: + data.remove("location") api_location_types = [ "location_elec_generation", "location_elec_consumption", @@ -324,14 +324,14 @@ def technology_input_harmonisation(technology, api_data_types, api_location_type "location_gas_consumption", ] harmonisation_log.append("location") - # return changed api_location_types only if "location" in technology, else None + # return changed api_location_types only if "location" in data, else None return harmonisation_log, api_data_types, api_location_types def print_api_settings( harmonisation_log, - technology, + data, api_date, api_data_types, api_chunksize, @@ -342,14 +342,14 @@ def print_api_settings( print( f"Downloading with soap_API.\n\n -- API settings -- \nunits after date: " - f"{api_date}\nunit download limit per technology: " + f"{api_date}\nunit download limit per data: " f"{api_limit}\nparallel_processes: {api_processes}\nchunksize: " - f"{api_chunksize}\ntechnology_api: {technology}" + f"{api_chunksize}\ndata_api: {data}" ) if "permit" in harmonisation_log: print( f"data_types: {api_data_types}" "\033[31m", - f"Attention, 'permit_data' was automatically set in api_data_types, as you defined 'permit' in parameter technology_api.", + "Attention, 'permit_data' was automatically set in api_data_types, as you defined 'permit' in parameter data_api.", "\033[m", ) @@ -360,10 +360,10 @@ def print_api_settings( print( "location_types:", "\033[31m", - f"Attention, 'location' is in parameter technology_api. location_types are set to", + f"Attention, 'location' is in parameter data. location_types are set to", "\033[m", f"{api_location_types}" - "\n If you want to change location_types, please remove 'location' from technology_api and specify api_location_types." + "\n If you want to change location_types, please remove 'location' from data_api and specify api_location_types." 
"\n ------------------ \n", ) diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index 897c5549..900d384d 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -15,12 +15,12 @@ def write_mastr_xml_to_database( engine: sqlalchemy.engine.Engine, zipped_xml_file_path: str, - technology: list, + data: list, bulk_cleansing: bool, bulk_download_date: str, ) -> None: """Write the Mastr in xml format into a database defined by the engine parameter.""" - include_tables = technology_to_include_tables(technology) + include_tables = data_to_include_tables(data) with ZipFile(zipped_xml_file_path, "r") as f: files_list = f.namelist() @@ -367,18 +367,18 @@ def handle_xml_syntax_error(data: bytes, err: Error) -> pd.DataFrame: return df -def technology_to_include_tables( - technology, +def data_to_include_tables( + data, ) -> list: """ - Check the user input 'technology' and convert it to the list 'include_tables' which contains + Check the user input 'data' and convert it to the list 'include_tables' which contains file names from zipped bulk download. Parameters ---------- - technology: None, str, list - The user input for technology selection + data: None, str, list + The user input for data selection * `None`: All technologies (default) - * `str`: One technology + * `str`: One data * `list`: List of technologies Returns ------- @@ -389,24 +389,24 @@ def technology_to_include_tables( """ all_technologies = orm.bulk_technologies tables_map = orm.bulk_include_tables_map - # Convert technology input into a standard list + # Convert data input into a standard list chosen_technologies = [] - if technology is None: + if data is None: # All technologies are to be chosen chosen_technologies = all_technologies - elif isinstance(technology, str): - # Only one technology is chosen - chosen_technologies = [technology] - elif isinstance(technology, list): + elif isinstance(data, str): + # Only one data is chosen + chosen_technologies = [data] + elif isinstance(data, list): # list of technologies is given - chosen_technologies = technology + chosen_technologies = data # Check if given technologies match with the valid options from 'orm.bulk_technologies' for tech in chosen_technologies: if tech not in all_technologies: raise ValueError( - f"The input technology = {technology} does not match with the " - f"possible technology options. Only following technology options are available " + f"The input data = {data} does not match with the " + f"possible data options. 
Only following data options are available " f"bulk_technologies = {all_technologies}" ) diff --git a/tests/soap_api/test_download.py b/tests/soap_api/test_download.py index 09e47c3e..e4dc4c0d 100644 --- a/tests/soap_api/test_download.py +++ b/tests/soap_api/test_download.py @@ -64,7 +64,7 @@ def test_soap_wrapper_power_plant_list(mastr_api): def test_basic_unit_data(mastr_download): data = [ unit - for sublist in mastr_download.basic_unit_data(technology="nuclear", limit=1) + for sublist in mastr_download.basic_unit_data(data="nuclear", limit=1) for unit in sublist ] diff --git a/tests/soap_api/test_mastr_mirror.py b/tests/soap_api/test_mastr_mirror.py index e35ceacf..a23abb6b 100644 --- a/tests/soap_api/test_mastr_mirror.py +++ b/tests/soap_api/test_mastr_mirror.py @@ -47,7 +47,7 @@ def engine(): @pytest.mark.dependency(name="backfill_basic") def test_backfill_basic(mastr_mirror, engine): - mastr_mirror.backfill_basic(technology=TECHNOLOGIES, date=DATE, limit=LIMIT) + mastr_mirror.backfill_basic(data=TECHNOLOGIES, date=DATE, limit=LIMIT) # The table basic_units should have at least as much rows as TECHNOLOGIES were queried with session_scope(engine=engine) as session: @@ -60,21 +60,21 @@ def test_retrieve_additional_data(mastr_mirror): for tech in TECHNOLOGIES: for data_type in DATA_TYPES: mastr_mirror.retrieve_additional_data( - technology=tech, data_type=data_type, limit=10 * LIMIT + data=tech, data_type=data_type, limit=10 * LIMIT ) # This comparison currently fails because of # https://github.com/OpenEnergyPlatform/open-MaStR/issues/154 # with session_scope() as session: - # for tech in TECHNOLOGIES: - # mapper = getattr(orm, mastr_mirror.orm_map[tech]["unit_data"]) + # for data in TECHNOLOGIES: + # mapper = getattr(orm, mastr_mirror.orm_map[data]["unit_data"]) # response = session.query(mapper).count() # assert response >= LIMIT @pytest.mark.dependency(depends=["retrieve_additional_data"], name="update_latest") def test_update_latest(mastr_mirror, engine): - mastr_mirror.backfill_basic(technology=TECHNOLOGIES, date="latest", limit=LIMIT) + mastr_mirror.backfill_basic(data=TECHNOLOGIES, date="latest", limit=LIMIT) # Test if latest date is newer that initially requested data in backfill_basic with session_scope(engine=engine) as session: diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 9b9b3b4e..a3fa7138 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -18,7 +18,7 @@ def db(): def parameter_dict_working(): parameter_dict = { "method": ["API", "bulk"], - "technology": [ + "data": [ "wind", "solar", "biomass", @@ -68,7 +68,7 @@ def parameter_dict_working(): def parameter_dict_not_working(): parameter_dict = { "method": [5, "BULK", "api"], - "technology": [ + "data": [ "wint", "Solar", "biomasse", @@ -97,7 +97,7 @@ def test_Mastr_validate_working_parameter(parameter_dict_working): parameter_dict[key] = value ( method, - technology, + data, bulk_date_string, bulk_cleansing, api_processes, @@ -111,7 +111,7 @@ def test_Mastr_validate_working_parameter(parameter_dict_working): assert ( validate_parameter_format_for_download_method( method, - technology, + data, bulk_date_string, bulk_cleansing, api_processes, @@ -140,7 +140,7 @@ def test_Mastr_validate_not_working_parameter( parameter_dict[key] = value ( method, - technology, + data, bulk_date_string, bulk_cleansing, api_processes, @@ -153,7 +153,7 @@ def test_Mastr_validate_not_working_parameter( with pytest.raises(ValueError): validate_parameter_format_for_download_method( method, - technology, + data, 
            bulk_date_string,
            bulk_cleansing,
            api_processes,
@@ -167,7 +167,7 @@ def test_Mastr_validate_not_working_parameter(
 
 def get_parameters_from_parameter_dict(parameter_dict):
     method = parameter_dict["method"]
-    technology = parameter_dict["technology"]
+    data = parameter_dict["data"]
     bulk_date_string = parameter_dict["bulk_date_string"]
     bulk_cleansing = parameter_dict["bulk_cleansing"]
     api_processes = parameter_dict["api_processes"]
@@ -178,7 +178,7 @@ def get_parameters_from_parameter_dict(parameter_dict):
     api_location_types = parameter_dict["api_location_types"]
     return (
         method,
-        technology,
+        data,
         bulk_date_string,
         bulk_cleansing,
         api_processes,
diff --git a/tests/xml_download/test_utils_write_to_database.py b/tests/xml_download/test_utils_write_to_database.py
index 33547605..8d3bd9be 100644
--- a/tests/xml_download/test_utils_write_to_database.py
+++ b/tests/xml_download/test_utils_write_to_database.py
@@ -11,7 +11,7 @@
     add_table_to_database,
     add_zero_as_first_character_for_too_short_string,
     correct_ordering_of_filelist,
-    technology_to_include_tables,
+    data_to_include_tables,
 )
 import os
 from os.path import expanduser
@@ -238,7 +238,7 @@ def test_cast_date_columns_to_datetime():
     )
 
 
-def test_technology_to_include_tables():
+def test_data_to_include_tables():
     # Prepare
     include_tables_list = [
         "anlageneegwind",
@@ -249,11 +249,7 @@ def test_technology_to_include_tables():
     include_tables_str = ["einheitenstromverbraucher"]
 
     # Assert
-    assert include_tables_list == technology_to_include_tables(
-        technology=["wind", "hydro"]
-    )
-    assert include_tables_str == technology_to_include_tables(
-        technology="electricity_consumer"
-    )
-    assert "anlageneegwind" in technology_to_include_tables(technology=None)
-    assert 28 == len(technology_to_include_tables(technology=None))
\ No newline at end of file
+    assert include_tables_list == data_to_include_tables(data=["wind", "hydro"])
+    assert include_tables_str == data_to_include_tables(data="electricity_consumer")
+    assert "anlageneegwind" in data_to_include_tables(data=None)
+    assert 28 == len(data_to_include_tables(data=None))

From 9e9778cd9fb9617780ae99d7a1d03a6c119ad35b Mon Sep 17 00:00:00 2001
From: Deniz Tepe
Date: Fri, 19 Aug 2022 11:54:23 +0200
Subject: [PATCH 03/13] Shorten the test to_csv

It takes too long when testing against a filled open-mastr database.
---
 tests/soap_api/test_mastr_mirror.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/soap_api/test_mastr_mirror.py b/tests/soap_api/test_mastr_mirror.py
index a23abb6b..ac70465e 100644
--- a/tests/soap_api/test_mastr_mirror.py
+++ b/tests/soap_api/test_mastr_mirror.py
@@ -107,6 +107,7 @@ def test_to_csv(mastr_mirror, engine):
     for tech in TECHNOLOGIES:
         mastr_mirror.to_csv(
             technology=tech,
+            limit=100,
             additional_data=DATA_TYPES,
             statistic_flag=None,
             chunksize=1,
@@ -120,8 +121,9 @@ def test_to_csv(mastr_mirror, engine):
         units = session.query(orm.BasicUnit.EinheitMastrNummer).filter(
             orm.BasicUnit.Einheittyp == mastr_mirror.unit_type_map_reversed[tech]
         )
-        for unit in units:
-            assert unit.EinheitMastrNummer in df.index
+        list_MastrNummer = [unit.EinheitMastrNummer for unit in units]
+        for idx in df.index:
+            assert idx in list_MastrNummer
 
 
 @pytest.mark.dependency(name="backfill_locations_basic")

From 13e768a3b3edeedde4178bf94b857ccd05b8298b Mon Sep 17 00:00:00 2001
From: Deniz Tepe
Date: Mon, 22 Aug 2022 09:02:03 +0200
Subject: [PATCH 04/13] Speed up search using sets instead of lists

* Increase chunksize
* https://stackoverflow.com/questions/5993621/fastest-way-to-search-a-list-in-python
---
 tests/soap_api/test_mastr_mirror.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/soap_api/test_mastr_mirror.py b/tests/soap_api/test_mastr_mirror.py
index ac70465e..4d033290 100644
--- a/tests/soap_api/test_mastr_mirror.py
+++ b/tests/soap_api/test_mastr_mirror.py
@@ -110,7 +110,7 @@ def test_to_csv(mastr_mirror, engine):
             limit=100,
             additional_data=DATA_TYPES,
             statistic_flag=None,
-            chunksize=1,
+            chunksize=10,
         )
         # Test if all EinheitMastrNummer in basic_units are included in CSV file
         csv_path = join(
@@ -121,7 +121,7 @@ def test_to_csv(mastr_mirror, engine):
         units = session.query(orm.BasicUnit.EinheitMastrNummer).filter(
             orm.BasicUnit.Einheittyp == mastr_mirror.unit_type_map_reversed[tech]
         )
-        list_MastrNummer = [unit.EinheitMastrNummer for unit in units]
+        list_MastrNummer = {unit.EinheitMastrNummer for unit in units}
         for idx in df.index:
             assert idx in list_MastrNummer

From f53db8864facfa9b6ab52420d373b0adb92a09d8 Mon Sep 17 00:00:00 2001
From: Deniz Tepe
Date: Mon, 22 Aug 2022 09:53:00 +0200
Subject: [PATCH 05/13] Validate that list parameters are not empty lists #308

[None] and [] are no longer valid.
---
 open_mastr/utils/helpers.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py
index bce2a886..ec4643c6 100644
--- a/open_mastr/utils/helpers.py
+++ b/open_mastr/utils/helpers.py
@@ -120,13 +120,14 @@ def validate_parameter_api_location_types(api_location_types) -> None:
         raise ValueError("parameter api_location_types has to be a list or 'None'.")
 
     if isinstance(api_location_types, list):
+        if not api_location_types:  # api_location_types == []
+            raise ValueError("parameter api_location_types cannot be an empty list!")
        for value in api_location_types:
             if value not in [
                 "location_elec_generation",
                 "location_elec_consumption",
                 "location_gas_generation",
                 "location_gas_consumption",
-                None,
             ]:
                 raise ValueError(
                     'list entries of api_data_types have to be "location_elec_generation",'
@@ -140,13 +141,14 @@ def validate_parameter_api_data_types(api_data_types) -> None:
         raise ValueError("parameter api_data_types has to be a list or 'None'.")
 
     if isinstance(api_data_types, list):
+        if not api_data_types:  # api_data_types == []
+            raise ValueError("parameter api_data_types cannot be an empty list!")
         for value in api_data_types:
             if value not in [
                 "unit_data",
                 "eeg_data",
                 "kwk_data",
                 "permit_data",
-                None,
             ]:
                 raise ValueError(
                     'list entries of api_data_types have to be "unit_data", '
@@ -237,6 +239,8 @@ def validate_parameter_data(method, data) -> None:
             "location",
             "permit",
         ]
+        if not data:  # data == []
+            raise ValueError("parameter data cannot be an empty list!")
         for value in data:
             if method == "bulk" and value not in bulk_data:
                 raise ValueError(

From 9fe4e88ec9a62cf255f61dde476e99cc6935b0c3 Mon Sep 17 00:00:00 2001
From: Deniz Tepe
Date: Mon, 22 Aug 2022 12:15:32 +0200
Subject: [PATCH 06/13] Transform and harmonize data input #308

* api_data_types and api_location_types now default to all possible
  selections.
* The transform function returns the lists required by the lower-level
  functions, so these functions only have to handle lists and no longer
  need to parse the input themselves.
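A minimal sketch of the intended behaviour (it mirrors the unit test
added in this series; the argument values are illustrative):

    from open_mastr.utils.helpers import transform_data_parameter

    data, api_data_types, api_location_types, harm_log = transform_data_parameter(
        method="API",
        data=["wind", "location"],
        api_data_types=["eeg_data"],
        api_location_types=None,
    )
    # data == ["wind"]  ("location" is harmonised away)
    # api_data_types == ["eeg_data"]
    # api_location_types == ["location_elec_generation", "location_elec_consumption",
    #                        "location_gas_generation", "location_gas_consumption"]
    # harm_log == ["location"]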
--- open_mastr/mastr.py | 30 ++++--------- open_mastr/utils/helpers.py | 90 +++++++++++++++++++++++++++---------- 2 files changed, 75 insertions(+), 45 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 60388da2..7360da87 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -11,9 +11,9 @@ from open_mastr.soap_api.mirror import MaStRMirror from open_mastr.utils.helpers import ( - data_input_harmonisation, print_api_settings, validate_api_credentials, + transform_data_parameter, ) from open_mastr.utils.config import ( create_data_dir, @@ -139,11 +139,11 @@ def download( Defaults to 1000. api_data_types: list or None, optional Select type of additional data that should be retrieved. Choose from - "unit_data", "eeg_data", "kwk_data", "permit_data". + "unit_data", "eeg_data", "kwk_data", "permit_data". Defaults to all. api_location_types: list or None, optional Select type of location that should be retrieved. Choose from "location_elec_generation", "location_elec_consumption", "location_gas_generation", - "location_gas_consumption". + "location_gas_consumption". Defaults to all. """ validate_parameter_format_for_download_method( @@ -158,6 +158,12 @@ def download( api_data_types=api_data_types, api_location_types=api_location_types, ) + ( + data, + api_data_types, + api_location_types, + harm_log, + ) = transform_data_parameter(method, data, api_data_types, api_location_types) if method == "bulk": @@ -181,24 +187,6 @@ def download( if method == "API": validate_api_credentials() - if isinstance(data, str): - data = [data] - elif data is None: - data = [ - "wind", - "biomass", - "combustion", - "gsgk", - "hydro", - "nuclear", - "storage", - "solar", - ] - (harm_log, api_data_types, api_location_types,) = data_input_harmonisation( - data=data, - api_data_types=api_data_types, - api_location_types=api_location_types, - ) # Set api_processes to None in order to avoid the malfunctioning usage if api_processes: diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index ec4643c6..407de245 100644 --- a/open_mastr/utils/helpers.py +++ b/open_mastr/utils/helpers.py @@ -294,6 +294,72 @@ def raise_warning_for_invalid_parameter_combinations( ) +def transform_data_parameter(method, data, api_data_types, api_location_types): + # initialize full lists TODO decide for the best location to centralize these lists + bulk_data = [ + "wind", + "solar", + "biomass", + "hydro", + "gsgk", + "combustion", + "nuclear", + "gas", + "storage", + "electricity_consumer", + "location", + "market", + "grid", + "balancing_area", + "permit", + ] + api_data = [ + "wind", + "solar", + "biomass", + "hydro", + "gsgk", + "combustion", + "nuclear", + "storage", + "location", + "permit", + ] + all_api_data_types = ["unit_data", "eeg_data", "kwk_data", "permit_data"] + all_api_location_types = [ + "location_elec_generation", + "location_elec_consumption", + "location_gas_generation", + "location_gas_consumption", + ] + + # parse parameters as list + if isinstance(data, str): + data = [data] + elif data is None: + data = bulk_data if method == "bulk" else api_data + if api_data_types is None: + api_data_types = all_api_data_types + if api_location_types is None: + api_location_types = all_api_location_types + + # data input harmonisation + harmonisation_log = [] + if "permit" in data: + data.remove("permit") + api_data_types.append( + "permit_data" + ) if "permit_data" not in api_data_types else api_data_types + harmonisation_log.append("permit") + + if "location" in data: + 
data.remove("location") + api_location_types = all_api_location_types + harmonisation_log.append("location") + + return data, api_data_types, api_location_types, harmonisation_log + + @contextmanager def session_scope(engine): """Provide a transactional scope around a series of operations.""" @@ -309,30 +375,6 @@ def session_scope(engine): session.close() -def data_input_harmonisation(data, api_data_types, api_location_types): - harmonisation_log = [] - - if "permit" in data: - data.remove("permit") - api_data_types.append( - "permit_data" - ) if "permit_data" not in api_data_types else api_data_types - harmonisation_log.append("permit") - - if "location" in data: - data.remove("location") - api_location_types = [ - "location_elec_generation", - "location_elec_consumption", - "location_gas_generation", - "location_gas_consumption", - ] - harmonisation_log.append("location") - # return changed api_location_types only if "location" in data, else None - - return harmonisation_log, api_data_types, api_location_types - - def print_api_settings( harmonisation_log, data, From 2d188bb8ce1c8785e4f11bf680369f97726b15f0 Mon Sep 17 00:00:00 2001 From: Deniz Tepe Date: Mon, 22 Aug 2022 12:46:15 +0200 Subject: [PATCH 07/13] Add test #308 --- tests/test_helpers.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index a3fa7138..62bea919 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -2,6 +2,7 @@ validate_parameter_format_for_download_method, validate_parameter_format_for_mastr_init, validate_api_credentials, + transform_data_parameter, ) import pytest import sys @@ -68,20 +69,15 @@ def parameter_dict_working(): def parameter_dict_not_working(): parameter_dict = { "method": [5, "BULK", "api"], - "data": [ - "wint", - "Solar", - "biomasse", - 5, - ], + "data": ["wint", "Solar", "biomasse", 5, []], "bulk_date_string": [124, "heute", 123], "bulk_cleansing": ["cleansing", 4, None], "api_processes": ["20", "None"], "api_limit": ["15", "None"], "api_date": ["None", "20220202"], "api_chunksize": ["20"], - "api_data_types": ["unite_data", 5], - "api_location_types": ["locatione_elec_generation", 5], + "api_data_types": ["unite_data", 5, []], + "api_location_types": ["locatione_elec_generation", 5, []], } return parameter_dict @@ -202,5 +198,24 @@ def test_validate_parameter_format_for_mastr_init(db): validate_parameter_format_for_mastr_init(engine) +def test_transform_data_parameter(parameter_dict_working): + (data, api_data_types, api_location_types, harm_log,) = transform_data_parameter( + method="API", + data=["wind", "location"], + api_data_types=["eeg_data"], + api_location_types=None, + ) + + assert data == ["wind"] + assert api_data_types == ["eeg_data"] + assert api_location_types == [ + "location_elec_generation", + "location_elec_consumption", + "location_gas_generation", + "location_gas_consumption", + ] # TODO centralize + assert harm_log == ["location"] + + def test_validate_api_credentials(): validate_api_credentials() From 99f4fda033771f81c7c6334003d7016a542bdeaf Mon Sep 17 00:00:00 2001 From: Deniz Tepe Date: Mon, 22 Aug 2022 16:45:01 +0200 Subject: [PATCH 08/13] Remove redundant parsing of data parameter #308 * Black applied --- open_mastr/mastr.py | 32 ++++++++------- open_mastr/soap_api/mirror.py | 31 +++------------ open_mastr/utils/helpers.py | 4 ++ open_mastr/utils/orm.py | 20 ---------- .../xml_download/utils_write_to_database.py | 39 +++---------------- 
tests/soap_api/test_mastr_mirror.py | 2 +- .../test_utils_write_to_database.py | 4 +- 7 files changed, 34 insertions(+), 98 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 7360da87..db8a9d0c 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -14,6 +14,7 @@ print_api_settings, validate_api_credentials, transform_data_parameter, + validate_parameter_data, ) from open_mastr.utils.config import ( create_data_dir, @@ -291,24 +292,25 @@ def to_csv( "location_gas_consumption", ] + # Validate and parse tables parameter TODO parameter renaming + validate_parameter_data(method="bulk", data=tables) + ( + data, + api_data_types, + api_location_types, + harm_log, + ) = transform_data_parameter( + method="bulk", data=tables, api_data_types=None, api_location_types=None + ) + # Determine tables to export technologies_to_export = [] additional_tables_to_export = [] - if isinstance(tables, str): - # str to list - tables = [tables] - if tables is None: - technologies_to_export = all_technologies - additional_tables_to_export = all_additional_tables - print(f"Tables: {technologies_to_export}, {additional_tables_to_export}") - elif isinstance(tables, list): - for table in tables: - if table in all_technologies: - technologies_to_export.append(table) - elif table in all_additional_tables: - additional_tables_to_export.append(table) - else: - raise ValueError("Tables parameter has an invalid string!") + for table in data: + if table in all_technologies: + technologies_to_export.append(table) + elif table in all_additional_tables: + additional_tables_to_export.append(table) if technologies_to_export: print(f"\nTechnology tables: {technologies_to_export}") diff --git a/open_mastr/soap_api/mirror.py b/open_mastr/soap_api/mirror.py index 49c9a2b0..462c5262 100644 --- a/open_mastr/soap_api/mirror.py +++ b/open_mastr/soap_api/mirror.py @@ -178,15 +178,12 @@ def backfill_basic(self, data=None, date=None, limit=10 ** 8) -> None: Parameters ---------- - data: str or list + data: list Specify data types for which data should be backfilled. - * 'solar' (`str`): Backfill data for a single data type. + * ['solar']: Backfill data for a single data type. * ['solar', 'wind'] (`list`): Backfill data for multiple technologies given in a list. - * `None`: Backfill data for all technologies - Defaults to `None` which is passed to - :meth:`open_mastr.soap_api.download.MaStRDownload.basic_unit_data`. date: None, :class:`datetime.datetime`, str Specify backfill date from which on data is retrieved @@ -213,18 +210,10 @@ def backfill_basic(self, data=None, date=None, limit=10 ** 8) -> None: all available data is queried. Use with care! 
""" - # Create list of technologies to backfill - if isinstance(data, str): - data_list = [data] - elif data is None: - data_list = [None] - elif isinstance(data, list): - data_list = data + dates = self._get_list_of_dates(date, data) - dates = self._get_list_of_dates(date, data_list) - - for data, date in zip(data_list, dates): - self._write_basic_data_for_one_data_type_to_db(data, date, limit) + for data_type, date in zip(data, dates): + self._write_basic_data_for_one_data_type_to_db(data_type, date, limit) def backfill_locations_basic( self, limit=10 ** 7, date=None, delete_additional_data_requests=True @@ -329,9 +318,7 @@ def backfill_locations_basic( orm.AdditionalLocationsRequested, new_requests ) - def retrieve_additional_data( - self, data, data_type, limit=10 ** 8, chunksize=1000 - ): + def retrieve_additional_data(self, data, data_type, limit=10 ** 8, chunksize=1000): """ Retrieve additional unit data @@ -1137,12 +1124,6 @@ def to_csv( create_data_dir() - # Make sure input in either str or list - if isinstance(technology, str): - technology = [technology] - elif not isinstance(technology, (list, None)): - raise TypeError("Parameter data must be of type `str` or `list`") - renaming = column_renaming() with session_scope(engine=self._engine) as session: diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index 407de245..143c4cb8 100644 --- a/open_mastr/utils/helpers.py +++ b/open_mastr/utils/helpers.py @@ -295,6 +295,10 @@ def raise_warning_for_invalid_parameter_combinations( def transform_data_parameter(method, data, api_data_types, api_location_types): + """ + Parse input parameters related to data as lists. Harmonize variables for later use. + Data output depends on the possible data types of chosen method. + """ # initialize full lists TODO decide for the best location to centralize these lists bulk_data = [ "wind", diff --git a/open_mastr/utils/orm.py b/open_mastr/utils/orm.py index d3c5e2c2..b126bfff 100644 --- a/open_mastr/utils/orm.py +++ b/open_mastr/utils/orm.py @@ -999,26 +999,6 @@ class GridConnections(ParentAllTables, Base): }, } -# List of technologies which can be called by mastr.download() -# as well as by MastrMirror.basic_backfill() -bulk_technologies = [ - "wind", - "solar", - "biomass", - "hydro", - "gsgk", - "combustion", - "nuclear", - "gas", - "storage", - "electricity_consumer", - "location", - "market", - "grid", - "balancing_area", - "permit", -] - # Map bulk technologies to bulk download tables bulk_include_tables_map = { "wind": ["anlageneegwind", "einheitenwind"], diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index 900d384d..b629166b 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -368,18 +368,15 @@ def handle_xml_syntax_error(data: bytes, err: Error) -> pd.DataFrame: def data_to_include_tables( - data, + data: list, ) -> list: """ - Check the user input 'data' and convert it to the list 'include_tables' which contains + Convert user input 'data' to the list 'include_tables' which contains file names from zipped bulk download. 
     Parameters
     ----------
-    data: None, str, list
+    data: list
         The user input for data selection
-        * `None`: All technologies (default)
-        * `str`: One data
-        * `list`: List of technologies
     Returns
     -------
     list
         List of file names from the zipped bulk download
     """
-    all_technologies = orm.bulk_technologies
     tables_map = orm.bulk_include_tables_map
-    # Convert data input into a standard list
-    chosen_technologies = []
-    if data is None:
-        # All technologies are to be chosen
-        chosen_technologies = all_technologies
-    elif isinstance(data, str):
-        # Only one data is chosen
-        chosen_technologies = [data]
-    elif isinstance(data, list):
-        # list of technologies is given
-        chosen_technologies = data
-
-    # Check if given technologies match with the valid options from 'orm.bulk_technologies'
-    for tech in chosen_technologies:
-        if tech not in all_technologies:
-            raise ValueError(
-                f"The input data = {data} does not match with the "
-                f"possible data options. Only following data options are available "
-                f"bulk_technologies = {all_technologies}"
-            )
-
-    # Map technologies to include tables
-    include_tables = []
-    for tech in chosen_technologies:
-        # Append table names to the include_tables list respectively
-        include_tables += tables_map[tech]
-
+    # Map data selection to include tables
+    include_tables = [table for tech in data for table in tables_map[tech]]
     return include_tables
diff --git a/tests/soap_api/test_mastr_mirror.py b/tests/soap_api/test_mastr_mirror.py
index 4d033290..dd33d44e 100644
--- a/tests/soap_api/test_mastr_mirror.py
+++ b/tests/soap_api/test_mastr_mirror.py
@@ -106,7 +106,7 @@ def test_to_csv(mastr_mirror, engine):
     with session_scope(engine=engine) as session:
         for tech in TECHNOLOGIES:
             mastr_mirror.to_csv(
-                technology=tech,
+                technology=[tech],
                 limit=100,
                 additional_data=DATA_TYPES,
                 statistic_flag=None,
diff --git a/tests/xml_download/test_utils_write_to_database.py b/tests/xml_download/test_utils_write_to_database.py
index 8d3bd9be..2100566a 100644
--- a/tests/xml_download/test_utils_write_to_database.py
+++ b/tests/xml_download/test_utils_write_to_database.py
@@ -250,6 +250,4 @@ def test_data_to_include_tables():
     # Assert
     assert include_tables_list == data_to_include_tables(data=["wind", "hydro"])
-    assert include_tables_str == data_to_include_tables(data="electricity_consumer")
-    assert "anlageneegwind" in data_to_include_tables(data=None)
-    assert 28 == len(data_to_include_tables(data=None))
+    assert include_tables_str == data_to_include_tables(data=["electricity_consumer"])
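The refactor above collapses the old validate-then-map loop in data_to_include_tables into a single comprehension over orm.bulk_include_tables_map, now that validate_parameter_data and transform_data_parameter guarantee a clean list input. A minimal sketch of the resulting behaviour (the map is an abbreviated stand-in: the "wind" entry is taken from orm.py, while the "hydro" table names are assumed for illustration):

    # Abbreviated stand-in for orm.bulk_include_tables_map
    bulk_include_tables_map = {
        "wind": ["anlageneegwind", "einheitenwind"],  # entry as defined in orm.py
        "hydro": ["anlageneegwasser", "einheitenwasser"],  # assumed for illustration
    }

    def data_to_include_tables(data: list) -> list:
        # Flatten the per-category table lists in the order given by data
        return [table for tech in data for table in bulk_include_tables_map[tech]]

    assert data_to_include_tables(["wind", "hydro"]) == [
        "anlageneegwind",
        "einheitenwind",
        "anlageneegwasser",
        "einheitenwasser",
    ]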
From 110ed241c63e532d643cc2cae14aa38bc614e061 Mon Sep 17 00:00:00 2001
From: Deniz Tepe
Date: Wed, 24 Aug 2022 10:32:56 +0200
Subject: [PATCH 09/13] Adapt documentation for data parameter #308

* Technology is renamed to data where necessary
* In docstring, a table for possible data values is added
---
 docs/advanced.rst        |  2 +-
 docs/getting_started.rst |  4 ++--
 open_mastr/mastr.py      | 29 +++++++++++++++++++++++++----
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/docs/advanced.rst b/docs/advanced.rst
index a9346097..6a3cfdf5 100644
--- a/docs/advanced.rst
+++ b/docs/advanced.rst
@@ -165,7 +165,7 @@
 This data is updated on a daily basis.
 In the following, the process that is started when calling the download function with the parameter method="bulk" is described. First, the zipped files are downloaded and saved in `$HOME/.open-MaStR/data/xml_download`. The zipped folder contains many xml files, which represent the different tables from the MaStR. Those tables are then parsed to a sqlite database. If only some specific
-technologies are of interest, they can be specified with the parameter `technology`. Every table that is selected in `technology` will be deleted, if existent,
+data are of interest, they can be specified with the parameter `data`. Every table that is selected in `data` will be deleted, if existent,
 and then filled with data from the xml files.
 
 In the last step, a basic data cleansing is performed. Many entries in the MaStR from the bulk download are replaced by numbers.
diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index fbb5e1c0..722d71df 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -48,9 +48,9 @@ additional parameters can be set to define in detail which data should be obtain
    * - argument
      - options for specification
      - explanation
-   * - technology
+   * - data
      - ["wind","biomass","combustion","gsgk","hydro","nuclear","storage","solar"]
-     - Select technologies to download.
+     - Select data to download.
    * - api_data_types
      - ["unit_data","eeg_data","kwk_data","permit_data"]
      - Select the type of data to download.
diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py
index db8a9d0c..ddd628c7 100644
--- a/open_mastr/mastr.py
+++ b/open_mastr/mastr.py
@@ -91,10 +91,31 @@ def download(
             (see :ref:`Configuration <Configuration>`). Default to 'bulk'.
         data: str or list or None, optional
             Determines which types of data are written to the database. If None, all data is
-            used. If it is a list, possible entries are "wind", "solar", "biomass", "hydro", "gsgk",
-            "combustion", "nuclear", "gas", "storage", "electricity_consumer", "location", "market",
-            "grid", "balancing_area" or "permit". If only one data is of interest, this can be
-            given as a string. Default to None, where all data is included.
+            used. If it is a list, possible entries are listed in the table below with respect to the download method.
+            Categories missing for one of the methods are still under development. If only one data type is of
+            interest, it can be given as a string. Defaults to None, where all data is included.
+
+            .. csv-table:: Values for data parameter
+               :header-rows: 1
+               :widths: 5 5 5
+
+               "Data", "Bulk", "API"
+               "wind", "Yes", "Yes"
+               "solar", "Yes", "Yes"
+               "biomass", "Yes", "Yes"
+               "hydro", "Yes", "Yes"
+               "gsgk", "Yes", "Yes"
+               "combustion", "Yes", "Yes"
+               "nuclear", "Yes", "Yes"
+               "gas", "Yes", "No"
+               "storage", "Yes", "Yes"
+               "electricity_consumer", "Yes", "No"
+               "location", "Yes", "Yes"
+               "market", "Yes", "No"
+               "grid", "Yes", "No"
+               "balancing_area", "Yes", "No"
+               "permit", "Yes", "Yes"
+
         bulk_date_string: str, optional
             Either "today" if the newest data dump should be downloaded from the MaStR website.
             If an already downloaded dump should be used, state the date of the download in the format
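In practice, the documented values for data translate into calls like the following sketch (the import path is assumed from the package layout used in main.py; the API branch additionally requires MaStR credentials to run):

    from open_mastr import Mastr

    db = Mastr()

    # bulk accepts the full list, including the bulk-only categories
    db.download(method="bulk", data=["wind", "solar", "market", "grid"])

    # the API method accepts only the subset marked "Yes" in the API column;
    # a bulk-only value such as "gas" would raise a ValueError in
    # validate_parameter_data here
    db.download(method="API", data=["wind", "permit"])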
From f1579bb7726184af377a2b7c01b117536c2c7036 Mon Sep 17 00:00:00 2001
From: Deniz Tepe
Date: Wed, 24 Aug 2022 10:49:04 +0200
Subject: [PATCH 10/13] Edit Changelog #308

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f0db8518..95c1bab5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,8 +22,10 @@ Here is a template for new release sections:
 ### Added
 - Read version number for CI from setup.py [#333](https://github.com/rl-institut/super-repo/pull/333)
+
 ### Changed
-- [#](https://github.com/rl-institut/super-repo/pull/)
+- The `technology` parameter is renamed to `data` for better comprehension [#337](https://github.com/OpenEnergyPlatform/open-MaStR/pull/337)
+
 ### Removed
 - [#](https://github.com/rl-institut/super-repo/pull/)

From a1fd835674e716e33a9ad74e4dceae826544ba16 Mon Sep 17 00:00:00 2001
From: chrwm
Date: Fri, 26 Aug 2022 12:11:38 +0100
Subject: [PATCH 11/13] Fix soap_API tqdm message #308

---
 open_mastr/soap_api/download.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/open_mastr/soap_api/download.py b/open_mastr/soap_api/download.py
index d0b9ef82..215ec53f 100644
--- a/open_mastr/soap_api/download.py
+++ b/open_mastr/soap_api/download.py
@@ -898,8 +898,8 @@ def additional_data(self, data, unit_ids, data_fcn, timeout=10):
         return data, data_missed
 
     def _retrieve_data_in_single_process(self, prepared_args, data_fcn, data):
-        data = []
-        data_missed = []
+        data_list = []
+        data_missed_list = []
         for unit_specs in tqdm(
             prepared_args,
             total=len(prepared_args),
@@ -913,16 +913,16 @@
                     f"{data_missed_tmp[0]} ({data}) failed. "
                     f"Traceback of caught error:\n{data_missed_tmp[1]}"
                 )
-            data.append(data_tmp)
-            data_missed.append(data_missed_tmp)
+            data_list.append(data_tmp)
+            data_missed_list.append(data_missed_tmp)
 
-        return data, data_missed
+        return data_list, data_missed_list
 
     def _retrieve_data_in_parallel_process(
         self, prepared_args, data_fcn, data, timeout
     ):
-        data = []
-        data_missed = []
+        data_list = []
+        data_missed_list = []
         with multiprocessing.Pool(
             processes=self.parallel_processes, maxtasksperchild=1
         ) as pool:
@@ -946,8 +946,8 @@
                                 f"{data_missed_tmp[0]} ({data}) failed. "
                                 f"Traceback of caught error:\n{data_missed_tmp[1]}"
                             )
-                        data.append(data_tmp)
-                        data_missed.append(data_missed_tmp)
+                        data_list.append(data_tmp)
+                        data_missed_list.append(data_missed_tmp)
                         pbar.update()
                     except StopIteration:
                         # Multiprocessing returns StopIteration when results list gets empty
                         pass
                     except multiprocessing.TimeoutError:
                         # If retrieval time exceeds timeout of next(), pass on
                         log.debug(f"Data request for 1 {data} unit timed out")
-        return data, data_missed
+        return data_list, data_missed_list
 
     def extended_unit_data(self, unit_specs):
         """
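The renames above fix a shadowing bug: data enters both functions as a human-readable label used in the tqdm and log messages, but was immediately rebound to the result accumulator, so the messages printed the growing list instead of the label. A stripped-down illustration with generic names (not the library's API):

    def retrieve(data):  # data == "wind", a label used in messages
        data = []  # bug: the label is shadowed by the accumulator
        for unit in ("a", "b"):
            data.append(unit)
            print(f"Data request for 1 {data} unit timed out")  # prints the list

    def retrieve_fixed(data):
        data_list = []  # separate accumulator keeps the label intact
        for unit in ("a", "b"):
            data_list.append(unit)
            print(f"Data request for 1 {data} unit timed out")  # prints "wind"
        return data_list

    retrieve("wind")        # Data request for 1 ['a'] unit timed out ...
    retrieve_fixed("wind")  # Data request for 1 wind unit timed out ...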
" f"Traceback of caught error:\n{data_missed_tmp[1]}" ) - data.append(data_tmp) - data_missed.append(data_missed_tmp) + data_list.append(data_tmp) + data_missed_list.append(data_missed_tmp) pbar.update() except StopIteration: # Multiprocessing returns StropIteration when results list gets empty @@ -955,7 +955,7 @@ def _retrieve_data_in_parallel_process( except multiprocessing.TimeoutError: # If retrieval time exceeds timeout of next(), pass on log.debug(f"Data request for 1 {data} unit timed out") - return data, data_missed + return data_list, data_missed_list def extended_unit_data(self, unit_specs): """ From d28e61451c7450cf8734795aab2b0fbb9bea9858 Mon Sep 17 00:00:00 2001 From: Florian Kotthoff <74312290+FlorianK13@users.noreply.github.com> Date: Mon, 29 Aug 2022 13:09:40 +0200 Subject: [PATCH 12/13] Update tests/soap_api/test_mastr_mirror.py Co-authored-by: chrwm <54852694+chrwm@users.noreply.github.com> --- tests/soap_api/test_mastr_mirror.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/soap_api/test_mastr_mirror.py b/tests/soap_api/test_mastr_mirror.py index dd33d44e..c4367dc4 100644 --- a/tests/soap_api/test_mastr_mirror.py +++ b/tests/soap_api/test_mastr_mirror.py @@ -121,7 +121,7 @@ def test_to_csv(mastr_mirror, engine): units = session.query(orm.BasicUnit.EinheitMastrNummer).filter( orm.BasicUnit.Einheittyp == mastr_mirror.unit_type_map_reversed[tech] ) - list_MastrNummer = {unit.EinheitMastrNummer for unit in units} + set_MastrNummer = {unit.EinheitMastrNummer for unit in units} for idx in df.index: assert idx in list_MastrNummer From 4b78b4353787e8a463863cdb4a65fdebc54d749e Mon Sep 17 00:00:00 2001 From: Florian Kotthoff <74312290+FlorianK13@users.noreply.github.com> Date: Mon, 29 Aug 2022 13:09:46 +0200 Subject: [PATCH 13/13] Update tests/soap_api/test_mastr_mirror.py Co-authored-by: chrwm <54852694+chrwm@users.noreply.github.com> --- tests/soap_api/test_mastr_mirror.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/soap_api/test_mastr_mirror.py b/tests/soap_api/test_mastr_mirror.py index c4367dc4..feabc3f9 100644 --- a/tests/soap_api/test_mastr_mirror.py +++ b/tests/soap_api/test_mastr_mirror.py @@ -123,7 +123,7 @@ def test_to_csv(mastr_mirror, engine): ) set_MastrNummer = {unit.EinheitMastrNummer for unit in units} for idx in df.index: - assert idx in list_MastrNummer + assert idx in set_MastrNummer @pytest.mark.dependency(name="backfill_locations_basic")