Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

setup test case for dataset common fields #432

Merged
merged 6 commits into from
Sep 19, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 20 additions & 24 deletions cubedash/summary/_stores.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ def refresh_product_extent(
product, kind="derived", sample_percentage=sample_percentage
)
fixed_metadata = self._find_product_fixed_metadata(
product, sample_percentage=sample_percentage
product, sample_datasets_size=dataset_sample_size
)

new_summary = ProductSummary(
Expand Down Expand Up @@ -613,7 +613,7 @@ def refresh_stats(self, concurrently=False):
refresh_supporting_views(self._engine, concurrently=concurrently)

def _find_product_fixed_metadata(
self, product: DatasetType, sample_percentage=0.05
self, product: DatasetType, sample_datasets_size=1000,
) -> Dict[str, any]:
"""
Find metadata fields that have an identical value in every dataset of the product.
Expand All @@ -622,11 +622,6 @@ def _find_product_fixed_metadata(
feel free to sample 100%!)

"""
if not 0.0 < sample_percentage <= 100.0:
raise ValueError(
f"Sample percentage out of range 0>s>=100. Got {sample_percentage!r}"
)

# Get a single dataset, then we'll compare the rest against its values.
first_dataset_fields = self.index.datasets.search_eager(
product=product.name, limit=1
Expand All @@ -648,18 +643,6 @@ def _find_product_fixed_metadata(
if field.type_name in simple_field_types and name in first_dataset_fields
]

if sample_percentage < 100:
dataset_table = ODC_DATASET.tablesample(
func.system(float(sample_percentage)), name="sampled_dataset"
)
# Replace the table with our sampled one.
for _, field in candidate_fields:
if field.alchemy_column.table == ODC_DATASET:
field.alchemy_column = dataset_table.c[field.alchemy_column.name]

else:
dataset_table = ODC_DATASET

# Give a friendlier error message when a product doesn't match the dataset.
for name, field in candidate_fields:
sample_value = first_dataset_fields[name]
Expand All @@ -673,11 +656,25 @@ def _find_product_fixed_metadata(
f"claimed to be type {expected_types}, but dataset has value {sample_value!r}"
)

dataset_samples = self._engine.execute(
select(
[
ODC_DATASET.c.id
]
)
.select_from(ODC_DATASET)
.where(ODC_DATASET.c.dataset_type_ref == product.id)
.where(ODC_DATASET.c.archived.is_(None))
.limit(sample_datasets_size)
.order_by(func.random())
).fetchall()

_LOG.info(
"product.fixed_metadata_search",
product=product.name,
sample_percentage=round(sample_percentage, 2),
sampled_dataset_count=sample_datasets_size,
)

result = self._engine.execute(
select(
[
Expand All @@ -689,9 +686,8 @@ def _find_product_fixed_metadata(
for field_name, field in candidate_fields
]
)
.select_from(dataset_table)
.where(dataset_table.c.dataset_type_ref == product.id)
.where(dataset_table.c.archived.is_(None))
.select_from(ODC_DATASET)
.where(ODC_DATASET.c.id.in_([r for r, in dataset_samples]))
).fetchall()
assert len(result) == 1

Expand All @@ -703,7 +699,7 @@ def _find_product_fixed_metadata(
_LOG.info(
"product.fixed_metadata_search.done",
product=product.name,
sample_percentage=round(sample_percentage, 2),
sampled_dataset_count=sample_datasets_size,
searched_field_count=len(result[0]),
found_field_count=len(fixed_fields),
)
Expand Down
2 changes: 1 addition & 1 deletion cubedash/templates/product.html
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ <h4>Metadata (<a href="{{ url_for('product.metadata_type_page', name=product.met
>{{ product.metadata_type.name -}}</a>)
</h4>

<h4 title="randomly sampled 1,000 datasets">Common fields <i class="fa fa-info-circle"></i></h4>
{% if (product.fields | all_values_none) and (product_summary.fixed_metadata == {}) %}
<em>No common values</em>
{% else %}
Expand All @@ -161,7 +162,6 @@ <h4>Metadata (<a href="{{ url_for('product.metadata_type_page', name=product.met
</em>
{% endif %}
{% endif %}

{% if product_summary.derived_products %}
<h4>Derived</h4>
<ul>
Expand Down
Loading