Implement data validation step with basic check for duplicate rows #1088

Merged: 3 commits, Mar 12, 2025

1 change: 1 addition & 0 deletions src/TulipaEnergyModel.jl
@@ -33,6 +33,7 @@ include("structures.jl")
# Data
include("input-schemas.jl")
include("io.jl")
include("data-validation.jl")
include("data-preparation.jl")

# Data massage and model preparation
80 changes: 80 additions & 0 deletions src/data-validation.jl
@@ -0,0 +1,80 @@
export DataValidationException

"""
DataValidationException

Exception related to data validation of the Tulipa Energy Model input data.
"""
mutable struct DataValidationException <: Exception
error_messages::Vector{String}
end

function Base.showerror(io::IO, ex::DataValidationException)
println(io, "DataValidationException: The following issues were found in the data:")
for error_message in ex.error_messages
println(io, "- " * error_message)
end
end

"""
validate_data!(connection)

Raises an error if the data is not valid.
"""
function validate_data!(connection)
error_messages = String[]
@timeit to "no duplicate rows" append!(error_messages, _validate_no_duplicate_rows!(connection))

if length(error_messages) > 0
throw(DataValidationException(error_messages))
end

return
end

function _validate_no_duplicate_rows!(connection)
    # It should be possible to add a primary key to the tables below to avoid this validation.
    # However, it is not clear where to add it, nor how to ensure it was added.
    duplicates = String[]
    for (table, primary_keys) in (
        ("asset", (:asset,)),
        ("asset_both", (:asset, :milestone_year, :commission_year)),
        ("asset_commission", (:asset, :commission_year)),
        ("asset_milestone", (:asset, :milestone_year)),
        ("assets_profiles", (:asset, :commission_year, :profile_type)),
        ("assets_rep_periods_partitions", (:asset, :year, :rep_period)),
        ("assets_timeframe_partitions", (:asset, :year)),
        ("assets_timeframe_profiles", (:asset, :commission_year, :profile_type)),
        ("flow", (:from_asset, :to_asset)),
        ("flow_both", (:from_asset, :to_asset, :milestone_year, :commission_year)),
        ("flow_commission", (:from_asset, :to_asset, :commission_year)),
        ("flow_milestone", (:from_asset, :to_asset, :milestone_year)),
        ("flows_profiles", (:from_asset, :to_asset, :year, :profile_type)),
        ("flows_rep_periods_partitions", (:from_asset, :to_asset, :year, :rep_period)),
        ("group_asset", (:name, :milestone_year)),
        ("profiles_rep_periods", (:profile_name, :year, :rep_period, :timestep)),
        ("profiles_timeframe", (:profile_name, :year, :period)),
        ("rep_periods_data", (:year, :rep_period)),
        ("rep_periods_mapping", (:year, :period, :rep_period)),
        ("timeframe_data", (:year, :period)),
        ("year_data", (:year,)),
    )
        append!(duplicates, _validate_no_duplicate_rows!(connection, table, primary_keys))
    end

    return duplicates
end

function _validate_no_duplicate_rows!(connection, table, primary_keys)
    keys_as_string = join(primary_keys, ", ")
    duplicates = String[]
    for row in DuckDB.query(
        connection,
        "SELECT $keys_as_string, COUNT(*) FROM $table GROUP BY $keys_as_string HAVING COUNT(*) > 1",
    )
        values = join(["$k=$(row[k])" for k in primary_keys], ", ")
        push!(duplicates, "Table $table has duplicate entries for ($values)")
    end

    return duplicates
end
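
To illustrate the check in isolation, here is a minimal sketch of the query that `_validate_no_duplicate_rows!` issues per table, run against a hypothetical `flow` table with a duplicated row (the table contents are made up for this example, not part of the PR):

```julia
using DataFrames, DuckDB

connection = DBInterface.connect(DuckDB.DB)
bad_flow = DataFrame(:from_asset => ["wind", "wind"], :to_asset => ["demand", "demand"])
DuckDB.register_data_frame(connection, bad_flow, "flow")

# For ("flow", (:from_asset, :to_asset)) the generated SQL is:
#   SELECT from_asset, to_asset, COUNT(*) FROM flow
#   GROUP BY from_asset, to_asset HAVING COUNT(*) > 1
for row in DuckDB.query(
    connection,
    "SELECT from_asset, to_asset, COUNT(*) FROM flow GROUP BY from_asset, to_asset HAVING COUNT(*) > 1",
)
    println("Table flow has duplicate entries for (from_asset=$(row[:from_asset]), to_asset=$(row[:to_asset]))")
end
# Prints: Table flow has duplicate entries for (from_asset=wind, to_asset=demand)
```
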
7 changes: 6 additions & 1 deletion src/io.jl
@@ -5,7 +5,7 @@ export create_internal_tables!, export_solution_to_csv_files

Creates internal tables.
"""
-function create_internal_tables!(connection)
+function create_internal_tables!(connection; skip_validation = false)

    # Create tables that are allowed to be missing
    tables_allowed_to_be_missing = [
@@ -20,6 +20,11 @@ function create_internal_tables!(connection)
        _create_empty_unless_exists(connection, table)
    end

+    if !skip_validation
+        # Data validation - ensure that the data is correct before proceeding
+        @timeit to "validate data" validate_data!(connection)
+    end
+
    @timeit to "create_unrolled_partition_tables" create_unrolled_partition_tables!(connection)
    @timeit to "create_merged_tables" create_merged_tables!(connection)
    @timeit to "create_lowest_resolution_table" create_lowest_resolution_table!(connection)
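
A brief usage sketch of the new keyword (the table-loading step is elided; whether to skip validation should be a deliberate choice):

```julia
using TulipaEnergyModel, DuckDB

connection = DBInterface.connect(DuckDB.DB)
# ... load the input tables into `connection` here ...

# Default behavior: the data is validated before the internal tables are built
create_internal_tables!(connection)

# Opt out, e.g., when the same data has already been validated
create_internal_tables!(connection; skip_validation = true)
```
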
40 changes: 40 additions & 0 deletions test/test-data-validation.jl
@@ -0,0 +1,40 @@
const TEM = TulipaEnergyModel

@testset "Test DataValidationException print" begin
    # Mostly to appease codecov
    error_msg = "DataValidationException: The following issues were found in the data:\n- example"
    @test_throws error_msg throw(TEM.DataValidationException(["example"]))
end

@testset "Test duplicate rows" begin
    @testset "Using fake data" begin
        bad_data = DataFrame(
            :asset => ["ccgt", "demand", "wind", "ccgt", "demand"],
            :year => [2030, 2030, 2030, 2050, 2050],
            :value => [5.0, 10.0, 15.0, 7.0, 12.0],
        )
        connection = DBInterface.connect(DuckDB.DB)
        DuckDB.register_data_frame(connection, bad_data, "bad_data")
        @test TEM._validate_no_duplicate_rows!(connection, "bad_data", [:asset, :year]) == []
        @test TEM._validate_no_duplicate_rows!(connection, "bad_data", [:asset]) == [
            "Table bad_data has duplicate entries for (asset=ccgt)",
            "Table bad_data has duplicate entries for (asset=demand)",
        ]
    end

    @testset "Duplicating rows of Tiny data" begin
        connection = DBInterface.connect(DuckDB.DB)
        _read_csv_folder(connection, joinpath(@__DIR__, "inputs", "Tiny"))
        # Duplicating rows in these specific tables
        for table in ("asset", "asset_both", "flow_both")
            DuckDB.query(connection, "INSERT INTO $table (FROM $table ORDER BY RANDOM() LIMIT 1)")
        end
        @test_throws TEM.DataValidationException TEM.create_internal_tables!(connection)
        error_messages = TEM._validate_no_duplicate_rows!(connection)
        @test length(error_messages) == 3
        # These tests assume an order in the validation of the tables
        @test occursin("Table asset has duplicate entries", error_messages[1])
        @test occursin("Table asset_both has duplicate entries", error_messages[2])
        @test occursin("Table flow_both has duplicate entries", error_messages[3])
    end
end
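
For downstream callers, a hedged sketch of catching the exception and inspecting the collected messages (hypothetical usage, not part of this PR):

```julia
try
    create_internal_tables!(connection)
catch ex
    if ex isa TulipaEnergyModel.DataValidationException
        # Each entry describes one issue, e.g. one duplicated key in one table
        foreach(println, ex.error_messages)
    else
        rethrow()
    end
end
```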