diff --git a/cubedash/summary/_stores.py b/cubedash/summary/_stores.py index cc5040943..fc9590efb 100644 --- a/cubedash/summary/_stores.py +++ b/cubedash/summary/_stores.py @@ -526,7 +526,7 @@ def refresh_product_extent( product, kind="derived", sample_percentage=sample_percentage ) fixed_metadata = self._find_product_fixed_metadata( - product, sample_percentage=sample_percentage + product, sample_datasets_size=dataset_sample_size ) new_summary = ProductSummary( @@ -613,7 +613,7 @@ def refresh_stats(self, concurrently=False): refresh_supporting_views(self._engine, concurrently=concurrently) def _find_product_fixed_metadata( - self, product: DatasetType, sample_percentage=0.05 + self, product: DatasetType, sample_datasets_size=1000, ) -> Dict[str, any]: """ Find metadata fields that have an identical value in every dataset of the product. @@ -622,11 +622,6 @@ def _find_product_fixed_metadata( feel free to sample 100%!) """ - if not 0.0 < sample_percentage <= 100.0: - raise ValueError( - f"Sample percentage out of range 0>s>=100. Got {sample_percentage!r}" - ) - # Get a single dataset, then we'll compare the rest against its values. first_dataset_fields = self.index.datasets.search_eager( product=product.name, limit=1 @@ -648,18 +643,6 @@ def _find_product_fixed_metadata( if field.type_name in simple_field_types and name in first_dataset_fields ] - if sample_percentage < 100: - dataset_table = ODC_DATASET.tablesample( - func.system(float(sample_percentage)), name="sampled_dataset" - ) - # Replace the table with our sampled one. - for _, field in candidate_fields: - if field.alchemy_column.table == ODC_DATASET: - field.alchemy_column = dataset_table.c[field.alchemy_column.name] - - else: - dataset_table = ODC_DATASET - # Give a friendlier error message when a product doesn't match the dataset. for name, field in candidate_fields: sample_value = first_dataset_fields[name] @@ -673,11 +656,25 @@ def _find_product_fixed_metadata( f"claimed to be type {expected_types}, but dataset has value {sample_value!r}" ) + dataset_samples = self._engine.execute( + select( + [ + ODC_DATASET.c.id + ] + ) + .select_from(ODC_DATASET) + .where(ODC_DATASET.c.dataset_type_ref == product.id) + .where(ODC_DATASET.c.archived.is_(None)) + .limit(sample_datasets_size) + .order_by(func.random()) + ).fetchall() + _LOG.info( "product.fixed_metadata_search", product=product.name, - sample_percentage=round(sample_percentage, 2), + sampled_dataset_count=sample_datasets_size, ) + result = self._engine.execute( select( [ @@ -689,9 +686,8 @@ def _find_product_fixed_metadata( for field_name, field in candidate_fields ] ) - .select_from(dataset_table) - .where(dataset_table.c.dataset_type_ref == product.id) - .where(dataset_table.c.archived.is_(None)) + .select_from(ODC_DATASET) + .where(ODC_DATASET.c.id.in_([r for r, in dataset_samples])) ).fetchall() assert len(result) == 1 @@ -703,7 +699,7 @@ def _find_product_fixed_metadata( _LOG.info( "product.fixed_metadata_search.done", product=product.name, - sample_percentage=round(sample_percentage, 2), + sampled_dataset_count=sample_datasets_size, searched_field_count=len(result[0]), found_field_count=len(fixed_fields), ) diff --git a/cubedash/templates/product.html b/cubedash/templates/product.html index 1f8500fd0..18db5fbd8 100644 --- a/cubedash/templates/product.html +++ b/cubedash/templates/product.html @@ -147,6 +147,7 @@

Metadata (Common fields

{% if (product.fields | all_values_none) and (product_summary.fixed_metadata == {}) %} No common values {% else %} @@ -161,7 +162,6 @@

Metadata (= 5 + + # There are 4 interim and 16 final maturity level datasets + # at 20% (4 datasets), there is a large chance, maturity level + # will be in the dictionary + fixed_fields = summary_store._find_product_fixed_metadata( + summary_store.index.products.get_by_name("ga_ls8c_ard_3"), + sample_datasets_size=4, + ) + + assert len(fixed_fields) >= 5 + + # There are 4 interim and 16 final maturity level datasets + # at 5% (1 datasets), there is a large chance, maturity level + # will be in the dictionary + fixed_fields = summary_store._find_product_fixed_metadata( + summary_store.index.products.get_by_name("ga_ls8c_ard_3"), + sample_datasets_size=1, + ) + + assert len(fixed_fields) >= 5 diff --git a/integration_tests/test_summarise_data.py b/integration_tests/test_summarise_data.py index 3bfb70ddf..f424b69ed 100644 --- a/integration_tests/test_summarise_data.py +++ b/integration_tests/test_summarise_data.py @@ -335,7 +335,7 @@ def test_sampled_product_fixed_fields(summary_store: SummaryStore): # Tiled product, sampled fixed_fields = summary_store._find_product_fixed_metadata( summary_store.index.products.get_by_name("ls8_nbar_albers"), - sample_percentage=50, + sample_datasets_size=5, ) # Ingested products carry little of the original metadata... assert fixed_fields == {