Add tests for few json options (#4698)
* add tests for few json options

Signed-off-by: Niranjan Artal <nartal@nvidia.com>

* addressed review comments

Signed-off-by: Niranjan Artal <nartal@nvidia.com>
nartal1 authored Feb 9, 2022
1 parent 9705fab commit 667449d
Showing 4 changed files with 51 additions and 2 deletions.
24 changes: 24 additions & 0 deletions docs/compatibility.md
@@ -489,6 +489,30 @@ these formats when unquoted, will produce `null` on the CPU and may produce vali
Another limitation of the GPU JSON reader is that it will parse strings containing floating-point values where
Spark will treat them as invalid inputs and will just return `null`.

### JSON Schema discovery

Spark SQL can automatically infer the schema of a JSON dataset if the schema is not provided explicitly. Schema
discovery is handled on the CPU, and there is no GPU acceleration of it. By default Spark will read/parse the entire
dataset to determine the schema. This means that some options/errors which are ignored by the GPU may still
result in an exception if used with schema discovery.
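As a rough illustration (a toy sketch, not Spark's actual implementation), schema inference has to visit every record, because a new field or a wider type may first appear anywhere in the dataset:

```python
import json

def infer_schema(lines):
    # Toy sketch of JSON schema inference: like Spark's default behavior,
    # it must parse every line, since a field can first appear in any record.
    fields = {}
    for line in lines:
        for key, value in json.loads(line).items():
            fields.setdefault(key, type(value).__name__)
    return fields

print(infer_schema(['{"a": 1}', '{"a": 2, "b": "x"}']))
```

This is why providing an explicit schema avoids the full read/parse pass described above.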

### JSON options

Spark supports passing options to the JSON parser when reading a dataset. In most cases, if the RAPIDS Accelerator
sees one of these options that it does not support, it will fall back to the CPU. In some cases it does not, as
described for the options below.

- `allowNumericLeadingZeros` - Allows leading zeros in numbers (e.g. 00012). By default this is set to false.
When it is false, Spark throws an exception if it encounters this type of number. The RAPIDS Accelerator
strips leading zeros from all numbers, so this config has no impact on it.

- `allowUnquotedControlChars` - Allows JSON strings to contain unquoted control characters (ASCII characters with a
value less than 32, including tab and line feed characters). By default this is set to false. If a schema
is provided while reading the JSON file, this flag has no impact on the RAPIDS Accelerator, which always allows
unquoted control characters, while Spark reads these entries incorrectly as null. However, if no schema is provided
and the option is false, the RAPIDS Accelerator behaves the same as Spark and throws an exception,
as discussed in the `JSON Schema discovery` section.
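Python's `json` module happens to expose an analogous strict/lenient switch, which can illustrate the distinction (this is only an analogy, not Spark's or the RAPIDS Accelerator's code):

```python
import json

# A JSON document whose string value contains a literal (unescaped)
# newline -- an unquoted control character.
raw = '{"a": "line1\nline2"}'

# Strict parsing rejects it, like Spark's default
# allowUnquotedControlChars=false...
try:
    json.loads(raw)
    rejected = False
except json.JSONDecodeError:
    rejected = True
assert rejected

# ...while lenient parsing accepts it, analogous to
# allowUnquotedControlChars=true (and to the GPU reader's
# always-lenient behavior when a schema is provided).
print(json.loads(raw, strict=False))
```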

## Regular Expressions

The following Apache Spark regular expression functions and expressions are supported on the GPU:
21 changes: 19 additions & 2 deletions integration_tests/src/main/python/json_test.py
@@ -44,6 +44,9 @@
_double_schema = StructType([
StructField('number', DoubleType())])

_string_schema = StructType([
StructField('a', StringType())])

def read_json_df(data_path, schema, options = {}):
def read_impl(spark):
reader = spark.read
@@ -170,17 +173,31 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena
'nan_and_inf.json',
pytest.param('nan_and_inf_edge_cases.json', marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/4646')),
'floats.json',
'floats_leading_zeros.json',
'floats_invalid.json',
pytest.param('floats_edge_cases.json', marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/4647')),
])
@pytest.mark.parametrize('schema', [_float_schema, _double_schema])
@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
@pytest.mark.parametrize('allow_non_numeric_numbers', ["true", "false"])
@pytest.mark.parametrize('allow_numeric_leading_zeros', ["true"])
@pytest.mark.parametrize('ansi_enabled', ["true", "false"])
def test_basic_json_read(std_input_path, filename, schema, read_func, allow_non_numeric_numbers, ansi_enabled):
def test_basic_json_read(std_input_path, filename, schema, read_func, allow_non_numeric_numbers, allow_numeric_leading_zeros, ansi_enabled):
updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.ansi.enabled': ansi_enabled})
assert_gpu_and_cpu_are_equal_collect(
read_func(std_input_path + '/' + filename,
schema,
{ "allowNonNumericNumbers": allow_non_numeric_numbers }),
{ "allowNonNumericNumbers": allow_non_numeric_numbers,
"allowNumericLeadingZeros": allow_numeric_leading_zeros}),
conf=updated_conf)

@pytest.mark.parametrize('schema', [_string_schema])
@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
@pytest.mark.parametrize('allow_unquoted_chars', ["true"])
@pytest.mark.parametrize('filename', ['unquotedChars.json'])
def test_json_unquotedCharacters(std_input_path, filename, schema, read_func, allow_unquoted_chars):
assert_gpu_and_cpu_are_equal_collect(
read_func(std_input_path + '/' + filename,
schema,
{"allowUnquotedControlChars": allow_unquoted_chars}),
conf=_enable_all_types_conf)
5 changes: 5 additions & 0 deletions integration_tests/src/test/resources/floats_leading_zeros.json
@@ -0,0 +1,5 @@
{ "number": -03.141592 }
{ "number": 3.141592 }
{ "number": 10 }
{ "number": -00.0 }
{ "number": 000.012 }
3 changes: 3 additions & 0 deletions integration_tests/src/test/resources/unquotedChars.json
@@ -0,0 +1,3 @@
{ "a": "quotedChar-27" }
{ "a": "quotedChar-7" }
{ "a": "quotedChar-9 " }