diff --git a/cubedash/summary/_stores.py b/cubedash/summary/_stores.py index cc5040943..fc9590efb 100644 --- a/cubedash/summary/_stores.py +++ b/cubedash/summary/_stores.py @@ -526,7 +526,7 @@ def refresh_product_extent( product, kind="derived", sample_percentage=sample_percentage ) fixed_metadata = self._find_product_fixed_metadata( - product, sample_percentage=sample_percentage + product, sample_datasets_size=dataset_sample_size ) new_summary = ProductSummary( @@ -613,7 +613,7 @@ def refresh_stats(self, concurrently=False): refresh_supporting_views(self._engine, concurrently=concurrently) def _find_product_fixed_metadata( - self, product: DatasetType, sample_percentage=0.05 + self, product: DatasetType, sample_datasets_size=1000, ) -> Dict[str, any]: """ Find metadata fields that have an identical value in every dataset of the product. @@ -622,11 +622,6 @@ def _find_product_fixed_metadata( feel free to sample 100%!) """ - if not 0.0 < sample_percentage <= 100.0: - raise ValueError( - f"Sample percentage out of range 0>s>=100. Got {sample_percentage!r}" - ) - # Get a single dataset, then we'll compare the rest against its values. first_dataset_fields = self.index.datasets.search_eager( product=product.name, limit=1 @@ -648,18 +643,6 @@ def _find_product_fixed_metadata( if field.type_name in simple_field_types and name in first_dataset_fields ] - if sample_percentage < 100: - dataset_table = ODC_DATASET.tablesample( - func.system(float(sample_percentage)), name="sampled_dataset" - ) - # Replace the table with our sampled one. - for _, field in candidate_fields: - if field.alchemy_column.table == ODC_DATASET: - field.alchemy_column = dataset_table.c[field.alchemy_column.name] - - else: - dataset_table = ODC_DATASET - # Give a friendlier error message when a product doesn't match the dataset. for name, field in candidate_fields: sample_value = first_dataset_fields[name] @@ -673,11 +656,25 @@ def _find_product_fixed_metadata( f"claimed to be type {expected_types}, but dataset has value {sample_value!r}" ) + dataset_samples = self._engine.execute( + select( + [ + ODC_DATASET.c.id + ] + ) + .select_from(ODC_DATASET) + .where(ODC_DATASET.c.dataset_type_ref == product.id) + .where(ODC_DATASET.c.archived.is_(None)) + .limit(sample_datasets_size) + .order_by(func.random()) + ).fetchall() + _LOG.info( "product.fixed_metadata_search", product=product.name, - sample_percentage=round(sample_percentage, 2), + sampled_dataset_count=sample_datasets_size, ) + result = self._engine.execute( select( [ @@ -689,9 +686,8 @@ def _find_product_fixed_metadata( for field_name, field in candidate_fields ] ) - .select_from(dataset_table) - .where(dataset_table.c.dataset_type_ref == product.id) - .where(dataset_table.c.archived.is_(None)) + .select_from(ODC_DATASET) + .where(ODC_DATASET.c.id.in_([r for r, in dataset_samples])) ).fetchall() assert len(result) == 1 @@ -703,7 +699,7 @@ def _find_product_fixed_metadata( _LOG.info( "product.fixed_metadata_search.done", product=product.name, - sample_percentage=round(sample_percentage, 2), + sampled_dataset_count=sample_datasets_size, searched_field_count=len(result[0]), found_field_count=len(fixed_fields), ) diff --git a/cubedash/templates/product.html b/cubedash/templates/product.html index 1f8500fd0..18db5fbd8 100644 --- a/cubedash/templates/product.html +++ b/cubedash/templates/product.html @@ -147,6 +147,7 @@