Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(targets): Safely skip parsing record field as date-time if it is missing in schema #1844

Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions singer_sdk/sinks/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,12 +365,15 @@ def _parse_timestamps_in_record(
schema: TODO
treatment: TODO
"""
for key in record:
for key, value in record.items():
if key not in schema["properties"]:
self.logger.debug("No schema for record field '%s'", key)
continue
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

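Using the `in` operator is a bit faster than `try`/`except` when a meaningful share of the lookups miss (in the benchmark below, 20,000 of every 70,000 keys are absent from the dict):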

import timeit

# Fixture handed to timeit as its setup code: a dict keyed 0..49_999.
setup = """
big_dict = {
    i: i * i
    for i in range(50_000)
}
"""

# Variant 1: index directly and skip on KeyError. Keys 50_000..69_999 are not
# in big_dict, so 20_000 of the 70_000 lookups in each pass raise.
try_except = """
for i in range(70_000):
    try:
        x = big_dict[i]
    except KeyError:
        continue
"""

# Variant 2: test membership first, then index only when the key is present.
in_operator = """
for i in range(70_000):
    if i in big_dict:
        x = big_dict[i]
"""

# Run 1000 passes of each variant, in the order defined above, and print the
# total that timeit reports. Values recorded by the original author:
#   try_except  -> 4.430988875003095
#   in_operator -> 2.137157458000729
for stmt in (try_except, in_operator):
    print(timeit.timeit(stmt, setup, number=1000))

datelike_type = get_datelike_property_type(schema["properties"][key])
if datelike_type:
date_val = record[key]
date_val = value
try:
if record[key] is not None:
if value is not None:
date_val = parser.parse(date_val)
except parser.ParserError as ex:
date_val = handle_invalid_timestamp_in_record(
Expand Down
44 changes: 44 additions & 0 deletions tests/core/sinks/test_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import annotations

import datetime

from tests.conftest import BatchSinkMock, TargetMock


def test_validate_record():
    """Check that timestamp parsing only touches fields declared in the schema.

    The record carries two identical timestamp strings: ``created_at`` is
    declared in the schema with ``format: date-time``, while
    ``missing_datetime`` has no schema entry at all. After
    ``_validate_and_parse`` the former must be a timezone-aware ``datetime``
    and the latter must still be the original string.
    """
    raw_timestamp = "2021-01-01T00:00:00+00:00"

    schema = {
        "type": "object",
        "properties": {
            "id": {"type": "integer"},
            "created_at": {"type": "string", "format": "date-time"},
        },
    }
    sink = BatchSinkMock(TargetMock(), "users", schema, ["id"])

    record_message = {
        "type": "RECORD",
        "stream": "users",
        "record": {
            "id": 1,
            "created_at": raw_timestamp,
            # Deliberately absent from the schema's "properties".
            "missing_datetime": raw_timestamp,
        },
        "time_extracted": raw_timestamp,
        "version": 100,
    }

    updated_record = sink._validate_and_parse(record_message["record"])

    # Field declared as date-time: parsed into an aware datetime (UTC).
    expected_created_at = datetime.datetime(
        2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc
    )
    assert updated_record["created_at"] == expected_created_at

    # Field without a schema entry: passed through unchanged.
    assert updated_record["missing_datetime"] == raw_timestamp