Skip to content

Commit

Permalink
Allow ordering by id if created_at/updated_at is null
Browse files Browse the repository at this point in the history
  • Loading branch information
ericaporter committed Sep 5, 2024
1 parent 79378cc commit f62bc40
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 7 deletions.
12 changes: 10 additions & 2 deletions lib/dfe/analytics/services/entity_table_checks.rb
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@ def order_column_exposed_for_entity?(entity_name, columns)
end

def determine_order_column(entity_name, columns)
if connection.column_exists?(entity_name, :updated_at) && columns.include?('updated_at')
if connection.column_exists?(entity_name, :updated_at) && columns.include?('updated_at') && !null_values_in_column?('updated_at')
'UPDATED_AT'
elsif connection.column_exists?(entity_name, :created_at) && columns.include?('created_at')
elsif connection.column_exists?(entity_name, :created_at) && columns.include?('created_at') && !null_values_in_column?('created_at')
'CREATED_AT'
elsif connection.column_exists?(entity_name, :id) && columns.include?('id')
'ID'
Expand All @@ -78,6 +78,14 @@ def determine_order_column(entity_name, columns)
end
end

def null_values_in_column?(column)
connection.select_value(<<-SQL).to_i.positive?
SELECT COUNT(*)
FROM #{connection.quote_table_name(entity_name)}
WHERE #{column} IS NULL
SQL
end

def entity_table_check_data(entity_name, order_column)
checksum_calculated_at = fetch_current_timestamp_in_time_zone

Expand Down
3 changes: 2 additions & 1 deletion lib/dfe/analytics/services/generic_checksum_calculator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def build_select_and_order_clause(order_column, table_name_sanitized)
def build_where_clause(order_column, table_name_sanitized, checksum_calculated_at_sanitized)
return '' unless WHERE_CLAUSE_ORDER_COLUMNS.map(&:downcase).include?(order_column.downcase)

"WHERE #{table_name_sanitized}.#{order_column.downcase} < #{checksum_calculated_at_sanitized}"
# Add IS NULL to include records with null updated_at / created_at values
"WHERE (#{table_name_sanitized}.#{order_column.downcase} IS NULL OR #{table_name_sanitized}.#{order_column.downcase} < #{checksum_calculated_at_sanitized})"
end
end
end
Expand Down
7 changes: 5 additions & 2 deletions lib/dfe/analytics/services/postgres_checksum_calculator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ def initialize(entity, order_column, checksum_calculated_at)
end

def call
calculate_checksum
ActiveRecord::Base.transaction do
calculate_checksum
end
end

private
Expand Down Expand Up @@ -65,7 +67,8 @@ def build_select_and_order_clause(order_column, table_name_sanitized)
def build_where_clause(order_column, table_name_sanitized, checksum_calculated_at_sanitized)
return '' unless WHERE_CLAUSE_ORDER_COLUMNS.map(&:downcase).include?(order_column.downcase)

"WHERE DATE_TRUNC('milliseconds', #{table_name_sanitized}.#{order_column.downcase}) < DATE_TRUNC('milliseconds', #{checksum_calculated_at_sanitized}::timestamp)"
# Add IS NULL to include records with null updated_at / created_at values
"WHERE (#{table_name_sanitized}.#{order_column.downcase} IS NULL OR DATE_TRUNC('milliseconds', #{table_name_sanitized}.#{order_column.downcase}) < DATE_TRUNC('milliseconds', #{checksum_calculated_at_sanitized}::timestamp))"
end
end
end
Expand Down
78 changes: 76 additions & 2 deletions spec/dfe/analytics/services/entity_table_checks_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
t.string :email_address
t.string :first_name
t.string :last_name
t.datetime :created_at
t.datetime :updated_at
end
end
Expand Down Expand Up @@ -48,7 +49,7 @@
DfE::Analytics.config.entity_table_checks_enabled = true
allow(DfE::Analytics::SendEvents).to receive(:perform_later)
allow(DfE::Analytics).to receive(:allowlist).and_return({
Candidate.table_name.to_sym => %w[updated_at],
Candidate.table_name.to_sym => %w[updated_at created_at id],
Application.table_name.to_sym => %w[type created_at],
Course.table_name.to_sym => %w[name duration],
Department.table_name.to_sym => %w[name],
Expand Down Expand Up @@ -96,6 +97,80 @@
expect(Rails.logger).to have_received(:info).with(expected_message)
end

it 'uses updated_at if it exists and has no null values' do
Candidate.create!(id: 1, email_address: 'first@example.com', updated_at: Time.current)
Candidate.create!(id: 2, email_address: 'second@example.com', updated_at: Time.current + 1.minute)

table_ids = Candidate.order(:updated_at).pluck(:id)
checksum = Digest::MD5.hexdigest(table_ids.join)

described_class.call(entity_name: candidate_entity, entity_type: entity_type, entity_tag: nil)

expect(DfE::Analytics::SendEvents).to have_received(:perform_later)
.with([a_hash_including({
'data' => [
{ 'key' => 'row_count', 'value' => [table_ids.size] },
{ 'key' => 'checksum', 'value' => [checksum] },
{ 'key' => 'checksum_calculated_at', 'value' => [checksum_calculated_at] },
{ 'key' => 'order_column', 'value' => ['UPDATED_AT'] }
]
})])
end

it 'falls back to id when both updated_at and created_at are null' do
frozen_time = Time.zone.now.in_time_zone('London') # Use London timezone
Timecop.freeze(frozen_time)

candidate1 = Candidate.create!(id: 1, email_address: 'first@example.com')
candidate2 = Candidate.create!(id: 2, email_address: 'second@example.com')
candidate3 = Candidate.create!(id: 3, email_address: 'third@example.com')

candidate1.update_columns(created_at: nil, updated_at: nil)
candidate2.update_columns(created_at: nil, updated_at: nil)
candidate3.update_columns(created_at: nil, updated_at: nil)

table_ids = Candidate.order(:id).pluck(:id)
checksum = Digest::MD5.hexdigest(table_ids.join)

described_class.call(entity_name: candidate_entity, entity_type: entity_type, entity_tag: nil)

expect(DfE::Analytics::SendEvents).to have_received(:perform_later)
.with([a_hash_including({
'data' => [
{ 'key' => 'row_count', 'value' => [table_ids.size] },
{ 'key' => 'checksum', 'value' => [checksum] },
{ 'key' => 'checksum_calculated_at', 'value' => a_string_including(frozen_time.iso8601(6)) }, # Expect the +01:00 (or +00:00) timezone for London
{ 'key' => 'order_column', 'value' => ['ID'] }
]
})])

Timecop.return
end

it 'falls back to created_at when updated_at is null but created_at exists' do
Candidate.create!(id: 1, email_address: 'first@example.com', updated_at: nil, created_at: 2.hours.ago)
Candidate.create!(id: 2, email_address: 'second@example.com', updated_at: nil, created_at: 1.hour.ago)
Candidate.create!(id: 3, email_address: 'third@example.com', updated_at: nil, created_at: 3.hours.ago)

# Ensure updated_at is nil
Candidate.update_all(updated_at: nil)

table_ids = Candidate.order(:created_at).pluck(:id)
checksum = Digest::MD5.hexdigest(table_ids.join)

described_class.call(entity_name: candidate_entity, entity_type: entity_type, entity_tag: nil)

expect(DfE::Analytics::SendEvents).to have_received(:perform_later)
.with([a_hash_including({
'data' => [
{ 'key' => 'row_count', 'value' => [table_ids.size] },
{ 'key' => 'checksum', 'value' => [checksum] },
{ 'key' => 'checksum_calculated_at', 'value' => [checksum_calculated_at] },
{ 'key' => 'order_column', 'value' => ['CREATED_AT'] }
]
})])
end

it 'orders by created_at if updated_at is missing' do
order_column = 'CREATED_AT'
[123, 124, 125].map { |id| Application.create(id: id) }
Expand Down Expand Up @@ -147,7 +222,6 @@
checksum = Digest::MD5.hexdigest(table_ids.join)

described_class.call(entity_name: candidate_entity, entity_type: entity_type, entity_tag: nil)
puts "@checksum_calculated_at set to #{checksum_calculated_at}" # Debugging output

expect(DfE::Analytics::SendEvents).to have_received(:perform_later)
.with([a_hash_including({
Expand Down
16 changes: 16 additions & 0 deletions spec/dfe/analytics/services/generic_checksum_calculator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,20 @@
expect(row_count).to eq(candidate_table_ids.count)
expect(checksum).to eq(entity_table_checksum)
end

it 'calculates checksum and correct row_count when updated_at has null values' do
Candidate.create(id: 1, email_address: 'candidate1@example.com', updated_at: nil)
Candidate.create(id: 2, email_address: 'candidate2@example.com', updated_at: nil)
Candidate.create(id: 3, email_address: 'candidate3@example.com', updated_at: Time.current - 1.day)
Candidate.create(id: 4, email_address: 'candidate4@example.com', updated_at: Time.current - 2.days)

order_column = 'ID' # defaults to ID when order_columns contain null values
expected_ids = [1, 2, 3, 4]
expected_checksum = Digest::MD5.hexdigest(expected_ids.join)

row_count, checksum = described_class.call(candidate_entity, order_column, checksum_calculated_at)

expect(row_count).to eq(4)
expect(checksum).to eq(expected_checksum)
end
end
14 changes: 14 additions & 0 deletions spec/dfe/analytics/services/postgres_checksum_calculator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,18 @@
expect(row_count).to eq(candidate_table_ids.count)
expect(checksum).to eq('mocked_checksum')
end

it 'calculates checksum and correct row_count when updated_at has null values' do
Candidate.create(id: 1, email_address: 'candidate1@example.com', updated_at: nil)
Candidate.create(id: 2, email_address: 'candidate2@example.com', updated_at: nil)
Candidate.create(id: 3, email_address: 'candidate3@example.com', updated_at: Time.current - 1.day)
Candidate.create(id: 4, email_address: 'candidate4@example.com', updated_at: Time.current - 2.days)

allow(ActiveRecord::Base.connection).to receive(:execute).and_return([{ 'row_count' => 4, 'checksum' => 'mocked_checksum' }])

row_count, checksum = described_class.call(candidate_entity, order_column, checksum_calculated_at)

expect(row_count).to eq(4)
expect(checksum).to eq('mocked_checksum')
end
end

0 comments on commit f62bc40

Please # to comment.