diff --git a/README.md b/README.md index 9e6ab68..cf5045d 100644 --- a/README.md +++ b/README.md @@ -11,12 +11,12 @@ Built with the [Meltano SDK](https://sdk.meltano.com) for Singer Taps and Target ## Capabilities -* `about` -* `stream-maps` -* `schema-flattening` -* `validate-records` -* `target-schema` -* `hard-delete` +- `about` +- `stream-maps` +- `schema-flattening` +- `validate-records` +- `target-schema` +- `hard-delete` ## Supported Python and PostgreSQL Versions @@ -24,33 +24,34 @@ This target is tested with all actively supported [Python](https://devguide.pyth ## Settings -| Setting | Required | Default | Description | -| :------------------------------ | :------- | :--------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| host | False | None | Hostname for postgres instance. | -| port | False | 5432 | The port on which postgres is awaiting connections. | -| user | False | None | User name used to authenticate. | -| password | False | None | Password used to authenticate. | -| database | False | None | Database name. | -| use_copy | False | None | Use the COPY command to insert data. This is usually faster than INSERT statements. This option is only available for the postgres+psycopg dialect+driver combination. | -| default_target_schema | False | melty | Postgres schema to send data to, example: tap-clickup | -| activate_version | False | 1 | If set to false, the tap will ignore activate version messages. If set to true, add_record_metadata must be set to true as well. | -| hard_delete | False | 0 | When activate version is sent from a tap this specefies if we should delete the records that don't match, or mark them with a date in the `_sdc_deleted_at` column. 
This config option is ignored if `activate_version` is set to false. | -| add_record_metadata | False | 1 | Note that this must be enabled for activate_version to work!This adds _sdc_extracted_at, _sdc_batched_at, and more to every table. See https://sdk.meltano.com/en/latest/implementation/record_metadata.html for more information. | -| interpret_content_encoding | False | 0 | If set to true, the target will interpret the content encoding of the schema to determine how to store the data. Using this option may result in a more efficient storage of the data but may also result in an error if the data is not encoded as expected. | -| ssl_enable | False | 0 | Whether or not to use ssl to verify the server's identity. Use ssl_certificate_authority and ssl_mode for further customization. To use a client certificate to authenticate yourself to the server, use ssl_client_certificate_enable instead. | -| ssl_client_certificate_enable | False | 0 | Whether or not to provide client-side certificates as a method of authentication to the server. Use ssl_client_certificate and ssl_client_private_key for further customization. To use SSL to verify the server's identity, use ssl_enable instead. | -| ssl_mode | False | verify-full | SSL Protection method, see [postgres documentation](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-PROTECTION) for more information. Must be one of disable, allow, prefer, require, verify-ca, or verify-full. | -| ssl_certificate_authority | False | ~/.postgresql/root.crl | The certificate authority that should be used to verify the server's identity. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. | -| ssl_client_certificate | False | ~/.postgresql/postgresql.crt | The certificate that should be used to verify your identity to the server. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. 
| -| ssl_client_private_key | False | ~/.postgresql/postgresql.key | The private key for the certificate you provided. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. | -| ssl_storage_directory | False | .secrets | The folder in which to store SSL certificates provided as raw values. When a certificate/key is provided as a raw value instead of as a filepath, it must be written to a file before it can be used. This configuration option determines where that file is created. | -| ssh_tunnel | False | None | SSH Tunnel Configuration, this is a json object | -| ssh_tunnel.enable | False | 0 | Enable an ssh tunnel (also known as bastion host), see the other ssh_tunnel.* properties for more details | -| ssh_tunnel.host | False | None | Host of the bastion host, this is the host we'll connect to via ssh | -| ssh_tunnel.username | False | None | Username to connect to bastion host | -| ssh_tunnel.port | False | 22 | Port to connect to bastion host | -| ssh_tunnel.private_key | False | None | Private Key for authentication to the bastion host | -| ssh_tunnel.private_key_password | False | None | Private Key Password, leave None if no password is set | +| Setting | Required | Default | Description | +| :------------------------------ | :------- | :--------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| host | False | None | Hostname for postgres instance. | +| port | False | 5432 | The port on which postgres is awaiting connections. | +| user | False | None | User name used to authenticate. | +| password | False | None | Password used to authenticate. | +| database | False | None | Database name. | +| use_copy | False | None | Use the COPY command to insert data. 
This is usually faster than INSERT statements. This option is only available for the postgres+psycopg dialect+driver combination. | +| default_target_schema | False | melty | Postgres schema to send data to, example: tap-clickup | +| activate_version | False | 1 | If set to false, the target will ignore activate version messages. If set to true, add_record_metadata must be set to true as well. | +| hard_delete | False | 0 | When activate version is sent from a tap this specifies if we should delete the records that don't match, or mark them with a date in the `_sdc_deleted_at` column. This config option is ignored if `activate_version` is set to false. | +| add_record_metadata | False | 1 | Note that this must be enabled for activate_version to work! This adds \_sdc_extracted_at, \_sdc_batched_at, and more to every table. See https://sdk.meltano.com/en/latest/implementation/record_metadata.html for more information. | +| interpret_content_encoding | False | 0 | If set to true, the target will interpret the content encoding of the schema to determine how to store the data. Using this option may result in a more efficient storage of the data but may also result in an error if the data is not encoded as expected. | +| sanitize_null_text_characters | False | 0 | If set to true, the target will sanitize null characters in char/text/varchar fields, as they are not supported by Postgres. See [postgres documentation](https://www.postgresql.org/docs/current/functions-string.html) for more information about chr(0) not being supported. | +| ssl_enable | False | 0 | Whether or not to use ssl to verify the server's identity. Use ssl_certificate_authority and ssl_mode for further customization. To use a client certificate to authenticate yourself to the server, use ssl_client_certificate_enable instead. | +| ssl_client_certificate_enable | False | 0 | Whether or not to provide client-side certificates as a method of authentication to the server. 
Use ssl_client_certificate and ssl_client_private_key for further customization. To use SSL to verify the server's identity, use ssl_enable instead. | +| ssl_mode | False | verify-full | SSL Protection method, see [postgres documentation](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-PROTECTION) for more information. Must be one of disable, allow, prefer, require, verify-ca, or verify-full. | +| ssl_certificate_authority | False | ~/.postgresql/root.crl | The certificate authority that should be used to verify the server's identity. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. | +| ssl_client_certificate | False | ~/.postgresql/postgresql.crt | The certificate that should be used to verify your identity to the server. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. | +| ssl_client_private_key | False | ~/.postgresql/postgresql.key | The private key for the certificate you provided. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. | +| ssl_storage_directory | False | .secrets | The folder in which to store SSL certificates provided as raw values. When a certificate/key is provided as a raw value instead of as a filepath, it must be written to a file before it can be used. This configuration option determines where that file is created. 
| +| ssh_tunnel | False | None | SSH Tunnel Configuration, this is a json object | +| ssh_tunnel.enable | False | 0 | Enable an ssh tunnel (also known as bastion host), see the other ssh_tunnel.\* properties for more details | +| ssh_tunnel.host | False | None | Host of the bastion host, this is the host we'll connect to via ssh | +| ssh_tunnel.username | False | None | Username to connect to bastion host | +| ssh_tunnel.port | False | 22 | Port to connect to bastion host | +| ssh_tunnel.private_key | False | None | Private Key for authentication to the bastion host | +| ssh_tunnel.private_key_password | False | None | Private Key Password, leave None if no password is set | A full list of supported settings and capabilities is available by running: `target-postgres --about` @@ -58,18 +59,18 @@ A full list of supported settings and capabilities is available by running: `tar The following settings are automatically supported by the Meltano SDK and inherited by this target. -| Setting | Required | Default | Description | -| :------------------------------ | :------- | :---------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| load_method | False | TargetLoadMethods.APPEND_ONLY | The method to use when loading data into the destination. `append-only` will always write all input records whether that records already exists or not. `upsert` will update existing records and insert new records. `overwrite` will delete all existing records and insert all input records. | -| batch_size_rows | False | None | Maximum number of rows in each batch. | -| validate_records | False | 1 | Whether to validate the schema of the incoming streams. 
| -| stream_maps | False | None | Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html). | -| stream_map_config | False | None | User-defined config values to be used within map expressions. | -| faker_config | False | None | Config for the [`Faker`](https://faker.readthedocs.io/en/master/) instance variable `fake` used within map expressions. Only applicable if the plugin specifies `faker` as an addtional dependency (through the `singer-sdk` `faker` extra or directly). | -| faker_config.seed | False | None | Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator | -| faker_config.locale | False | None | One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization | -| flattening_enabled | False | None | 'True' to enable schema flattening and automatically expand nested properties. | -| flattening_max_depth | False | None | The max depth to flatten schemas. | +| Setting | Required | Default | Description | +| :------------------- | :------- | :---------------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| load_method | False | TargetLoadMethods.APPEND_ONLY | The method to use when loading data into the destination. `append-only` will always write all input records whether that records already exists or not. `upsert` will update existing records and insert new records. `overwrite` will delete all existing records and insert all input records. | +| batch_size_rows | False | None | Maximum number of rows in each batch. 
| +| validate_records | False | 1 | Whether to validate the schema of the incoming streams. | +| stream_maps | False | None | Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html). | +| stream_map_config | False | None | User-defined config values to be used within map expressions. | +| faker_config | False | None | Config for the [`Faker`](https://faker.readthedocs.io/en/master/) instance variable `fake` used within map expressions. Only applicable if the plugin specifies `faker` as an additional dependency (through the `singer-sdk` `faker` extra or directly). | +| faker_config.seed | False | None | Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator | +| faker_config.locale | False | None | One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization | +| flattening_enabled | False | None | 'True' to enable schema flattening and automatically expand nested properties. | +| flattening_max_depth | False | None | The max depth to flatten schemas. | #### Note on generating fake data @@ -79,16 +80,16 @@ If you're using [Meltano](https://docs.meltano.com/), you can add the `faker` ex 1. If you're installing the plugin from PyPI: - ```yaml - pip_url: "meltanolabs-target-postgres[faker]==" - ``` +```yaml +pip_url: "meltanolabs-target-postgres[faker]==" +``` 2. 
If you're installing the plugin from the Git repository: - ```yaml - # Note the nested quotes - pip_url: "'meltanolabs-target-postgres[faker] @ git+https://github.com/MeltanoLabs/target-postgres.git@'" - ``` +```yaml +# Note the nested quotes +pip_url: "'meltanolabs-target-postgres[faker] @ git+https://github.com/MeltanoLabs/target-postgres.git@'" +``` ## Installation @@ -123,6 +124,7 @@ environment variable is set either in the terminal context or in the `.env` file ### Source Authentication and Authorization The database account provided must have access to: + 1. Create schemas 1. Create tables (DDL) 1. Push Data to tables (DML) @@ -161,6 +163,7 @@ pre-commit install We have set the provided keys in the .ssl directory to be valid for multiple centuries. However, we have also provided configuration instructions below to create all of the necessary files for testing SSL. A list of each file and its purpose: + 1. `ca.crt`: CA for client's certificate (stored on the server) 1. `cert.crt`: Client's certificate (stored on the client) 1. `pkey.key`: Client's private key (stored on the client) @@ -188,12 +191,13 @@ Now that all of the SSL files have been set up, you're ready to set up tests wit ### Create and Run Tests Start the test databases using Docker Compose: + ```bash docker-compose up -d ``` Create tests within the `target_postgres/tests` subfolder and - then run: +then run: ```bash poetry run pytest @@ -237,55 +241,56 @@ develop your own Singer taps and targets. The below table shows how this tap will map between jsonschema datatypes and Postgres datatypes. 
-| jsonschema | Postgres | -|--------------------------------|-----------------------------------------| -| integer | bigint | -| UNSUPPORTED | bigserial | -| UNSUPPORTED | bit [ (n) ] | -| UNSUPPORTED | bit varying [ (n) ] | -| boolean | boolean | -| UNSUPPORTED | box | +| jsonschema | Postgres | +| ---------------------------------------------------------------------------------- | --------------------------------------- | +| integer | bigint | +| UNSUPPORTED | bigserial | +| UNSUPPORTED | bit [ (n) ] | +| UNSUPPORTED | bit varying [ (n) ] | +| boolean | boolean | +| UNSUPPORTED | box | | string with contentEncoding="base16" ([opt-in feature](#content-encoding-support)) | bytea | -| UNSUPPORTED | character [ (n) ] | -| UNSUPPORTED | character varying [ (n) ] | -| UNSUPPORTED | cidr | -| UNSUPPORTED | circle | -| string with format="date" | date | -| UNSUPPORTED | double precision | -| UNSUPPORTED | inet | -| UNSUPPORTED | integer | -| UNSUPPORTED | interval [ fields ] [ (p) ] | -| UNSUPPORTED | json | -| array; object | jsonb | -| UNSUPPORTED | line | -| UNSUPPORTED | lseg | -| UNSUPPORTED | macaddr | -| UNSUPPORTED | macaddr8 | -| UNSUPPORTED | money | -| number | numeric [ (p, s) ] | -| UNSUPPORTED | path | -| UNSUPPORTED | pg_lsn | -| UNSUPPORTED | pg_snapshot | -| UNSUPPORTED | point | -| UNSUPPORTED | polygon | -| UNSUPPORTED | real | -| UNSUPPORTED | smallint | -| UNSUPPORTED | smallserial | -| UNSUPPORTED | serial | -| string without format; untyped | text | -| string with format="time" | time [ (p) ] [ without time zone ] | -| UNSUPPORTED | time [ (p) ] with time zone | -| string with format="date-time" | timestamp [ (p) ] [ without time zone ] | -| UNSUPPORTED | timestamp [ (p) ] with time zone | -| UNSUPPORTED | tsquery | -| UNSUPPORTED | tsvector | -| UNSUPPORTED | txid_snapshot | -| string with format="uuid" | uuid | -| UNSUPPORTED | xml | +| UNSUPPORTED | character [ (n) ] | +| UNSUPPORTED | character varying [ (n) ] | +| UNSUPPORTED | cidr | +| 
UNSUPPORTED | circle | +| string with format="date" | date | +| UNSUPPORTED | double precision | +| UNSUPPORTED | inet | +| UNSUPPORTED | integer | +| UNSUPPORTED | interval [ fields ] [ (p) ] | +| UNSUPPORTED | json | +| array; object | jsonb | +| UNSUPPORTED | line | +| UNSUPPORTED | lseg | +| UNSUPPORTED | macaddr | +| UNSUPPORTED | macaddr8 | +| UNSUPPORTED | money | +| number | numeric [ (p, s) ] | +| UNSUPPORTED | path | +| UNSUPPORTED | pg_lsn | +| UNSUPPORTED | pg_snapshot | +| UNSUPPORTED | point | +| UNSUPPORTED | polygon | +| UNSUPPORTED | real | +| UNSUPPORTED | smallint | +| UNSUPPORTED | smallserial | +| UNSUPPORTED | serial | +| string without format; untyped | text | +| string with format="time" | time [ (p) ] [ without time zone ] | +| UNSUPPORTED | time [ (p) ] with time zone | +| string with format="date-time" | timestamp [ (p) ] [ without time zone ] | +| UNSUPPORTED | timestamp [ (p) ] with time zone | +| UNSUPPORTED | tsquery | +| UNSUPPORTED | tsvector | +| UNSUPPORTED | txid_snapshot | +| string with format="uuid" | uuid | +| UNSUPPORTED | xml | Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array. If a column has multiple jsonschema types, the following order is using to order Postgres types, from highest priority to lowest priority. + - BYTEA - ARRAY(JSONB) - JSONB @@ -314,10 +319,10 @@ To enable it, set the `interpret_content_encoding` option to `True`. ### base16 -The string is encoded using the base16 encoding, as defined in [RFC 4648](https://json-schema.org/draft/2020-12/draft-bhutton-json-schema-validation-00#rfc.section.8.3 -). +The string is encoded using the base16 encoding, as defined in [RFC 4648](https://json-schema.org/draft/2020-12/draft-bhutton-json-schema-validation-00#rfc.section.8.3). Example schema: + ```json { "type": "object", @@ -333,6 +338,7 @@ Example schema: Data will be stored as a `bytea` in the database. 
Example data:
+
 ```json
 # valid data
 { "my_hex": "01AF" }
diff --git a/target_postgres/connector.py b/target_postgres/connector.py
index f627876..252b1a1 100644
--- a/target_postgres/connector.py
+++ b/target_postgres/connector.py
@@ -111,6 +111,15 @@ def interpret_content_encoding(self) -> bool:
             True if the feature is enabled, False otherwise.
         """
         return self.config.get("interpret_content_encoding", False)
+
+    @cached_property
+    def sanitize_null_text_characters(self) -> bool:
+        """Whether to sanitize null text characters.
+
+        Returns:
+            True if the feature is enabled, False otherwise.
+        """
+        return self.config.get("sanitize_null_text_characters", False)
 
     def prepare_table(  # type: ignore[override] # noqa: PLR0913
         self,
diff --git a/target_postgres/sinks.py b/target_postgres/sinks.py
index d6959cd..32a622e 100644
--- a/target_postgres/sinks.py
+++ b/target_postgres/sinks.py
@@ -170,6 +170,27 @@ def _do_copy(
 
                 copy.write_row(processed_row)
 
+    def sanitize_null_text_characters(self, data):
+        """Replace NUL characters (U+0000) in text with U+FFFD.
+
+        Postgres rejects NUL in text and jsonb values, so strings are cleaned
+        wherever they occur, including inside nested dicts and lists.
+
+        Args:
+            data: A record value; non-string scalars are returned unchanged.
+
+        Returns:
+            The sanitized value. Containers are rebuilt, not mutated in place.
+        """
+        if isinstance(data, str):
+            return data.replace("\u0000", "\ufffd")
+        # Recurse so strings nested at any depth are sanitized too.
+        if isinstance(data, dict):
+            return {k: self.sanitize_null_text_characters(v) for k, v in data.items()}
+        if isinstance(data, list):
+            return [self.sanitize_null_text_characters(v) for v in data]
+        return data
+
     def bulk_insert_records(  # type: ignore[override]
         self,
         table: sa.Table,
@@ -204,7 +225,12 @@ def bulk_insert_records(  # type: ignore[override]
             unique_records: dict[tuple, dict] = {}  # pk tuple: values
             for record in records:
                 insert_record = {
-                    column.name: record.get(column.name) for column in columns
+                    column.name: (
+                        self.sanitize_null_text_characters(record.get(column.name))
+                        if self.connector.sanitize_null_text_characters
+                        else record.get(column.name)
+                    )
+                    for column in columns
                 }
                 # No need to 
check for a KeyError here because the SDK already # guarantees that all key properties exist in the record. @@ -214,7 +240,12 @@ def bulk_insert_records( # type: ignore[override] else: for record in records: insert_record = { - column.name: record.get(column.name) for column in columns + column.name: ( + self.sanitize_null_text_characters(record.get(column.name)) + if self.connector.sanitize_null_text_characters + else record.get(column.name) + ) + for column in columns } data.append(insert_record) diff --git a/target_postgres/target.py b/target_postgres/target.py index 1d4bf5a..b54d7fb 100644 --- a/target_postgres/target.py +++ b/target_postgres/target.py @@ -218,6 +218,17 @@ def __init__( "in an error if the data is not encoded as expected." ), ), + th.Property( + "sanitize_null_text_characters", + th.BooleanType, + default=False, + description=( + "If set to true, the target will sanitize null characters in " + "char/text/varchar fields, as they are not supported by Postgres. " + "See [postgres documentation](https://www.postgresql.org/docs/current/functions-string.html) " # noqa: E501 + "for more information about chr(0) not being supported." + ), + ), th.Property( "ssl_enable", th.BooleanType,