Skip to content

Column support for array_to_string #6940

New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Merged
merged 4 commits into from
Jul 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 80 additions & 32 deletions datafusion/core/tests/sqllogictests/test_files/array.slt
Original file line number Diff line number Diff line change
Expand Up @@ -344,22 +344,22 @@ select array_prepend(100.1, column2), array_prepend('.', column3) from arrays;
## array_fill

# array_fill scalar function #1
query error DataFusion error: SQL error: ParserError\("Expected an SQL statement, found: caused"\)
caused by
Error during planning: Cannot automatically convert List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) to List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\)
query ???
select array_fill(11, make_array(1, 2, 3)), array_fill(3, make_array(2, 3)), array_fill(2, make_array(2));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

array_fill will not be used in future versions (due to the inability to properly set the data type)
It will be replaced with array_repeat.

----
[[[11, 11, 11], [11, 11, 11]]] [[3, 3, 3], [3, 3, 3]] [2, 2]

# array_fill scalar function #2
query error DataFusion error: SQL error: ParserError\("Expected an SQL statement, found: caused"\)
caused by
Error during planning: Cannot automatically convert List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) to List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\)
query ??
select array_fill(1, make_array(1, 1, 1)), array_fill(2, make_array(2, 2, 2, 2, 2));
----
[[[1]]] [[[[[2, 2], [2, 2]], [[2, 2], [2, 2]]], [[[2, 2], [2, 2]], [[2, 2], [2, 2]]]], [[[[2, 2], [2, 2]], [[2, 2], [2, 2]]], [[[2, 2], [2, 2]], [[2, 2], [2, 2]]]]]

# array_fill scalar function #3
query error DataFusion error: SQL error: TokenizerError\("Unterminated string literal at Line: 2, Column 856"\)
caused by
Internal error: Optimizer rule 'simplify_expressions' failed, due to generate a different schema, original schema: DFSchema \{ fields: \[DFField \{ qualifier: None, field: Field \{ name: "array_fill\(Int64\(1\),make_array\(\)\)", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \} \}\], metadata: \{\} \}, new schema: DFSchema \{ fields: \[DFField \{ qualifier: None, field: Field \{ name: "array_fill\(Int64\(1\),make_array\(\)\)", data_type: List\(Field \{ name: "item", data_type: Null, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: \{\} \} \}\], metadata: \{\} \}\. This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker
query ?
select array_fill(1, make_array())
----
[]

## array_concat

Expand Down Expand Up @@ -498,16 +498,16 @@ select array_to_string(['h', 'e', 'l', 'l', 'o'], ','), array_to_string([1, 2, 3
h,e,l,l,o 1-2-3-4-5 1|2|3

# array_to_string scalar function #2
query error DataFusion error: SQL error: ParserError\("Expected an SQL statement, found: caused"\)
caused by
Arrow error: Cast error: Cannot cast string '1\+2\+3\+4\+5\+6' to value of Int64 type
query TTT
select array_to_string([1, 1, 1], '1'), array_to_string([[1, 2], [3, 4], [5, 6]], '+'), array_to_string(array_fill(3, [3, 2, 2]), '/\');
----
11111 1+2+3+4+5+6 3/\3/\3/\3/\3/\3/\3/\3/\3/\3/\3/\3

# array_to_string scalar function #3
query error DataFusion error: SQL error: ParserError\("Expected an SQL statement, found: caused"\)
caused by
Error during planning: Cannot automatically convert Utf8 to List\(Field \{ name: "item", data_type: Null, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\)
query T
select array_to_string(make_array(), ',')
----
(empty)

# array_to_string scalar function with nulls #1
query TTT
Expand All @@ -521,6 +521,56 @@ select array_to_string(make_array('h', NULL, NULL, NULL, 'o'), ',', '-'), array_
----
h,-,-,-,o nil-2-nil-4-5 1|0|3

# array_to_string with columns #1

# For reference
# select column1, column4 from arrays_values;
# ----
# [, 2, 3, 4, 5, 6, 7, 8, 9, 10] ,
# [11, 12, 13, 14, 15, 16, 17, 18, , 20] .
# [21, 22, 23, , 25, 26, 27, 28, 29, 30] -
# [31, 32, 33, 34, 35, , 37, 38, 39, 40] ok
# NULL @
# [41, 42, 43, 44, 45, 46, 47, 48, 49, 50] $
# [51, 52, , 54, 55, 56, 57, 58, 59, 60] ^
# [61, 62, 63, 64, 65, 66, 67, 68, 69, 70] NULL

query T
select array_to_string(column1, column4) from arrays_values;
----
2,3,4,5,6,7,8,9,10
11.12.13.14.15.16.17.18.20
21-22-23-25-26-27-28-29-30
31ok32ok33ok34ok35ok37ok38ok39ok40
NULL
41$42$43$44$45$46$47$48$49$50
51^52^54^55^56^57^58^59^60
NULL

query TT
select array_to_string(column1, '_'), array_to_string(make_array(1,2,3), '/') from arrays_values;
----
2_3_4_5_6_7_8_9_10 1/2/3
11_12_13_14_15_16_17_18_20 1/2/3
21_22_23_25_26_27_28_29_30 1/2/3
31_32_33_34_35_37_38_39_40 1/2/3
NULL 1/2/3
41_42_43_44_45_46_47_48_49_50 1/2/3
51_52_54_55_56_57_58_59_60 1/2/3
61_62_63_64_65_66_67_68_69_70 1/2/3

query TT
select array_to_string(column1, '_', '*'), array_to_string(make_array(make_array(1,2,3)), '.') from arrays_values;
----
*_2_3_4_5_6_7_8_9_10 1.2.3
11_12_13_14_15_16_17_18_*_20 1.2.3
21_22_23_*_25_26_27_28_29_30 1.2.3
31_32_33_34_35_*_37_38_39_40 1.2.3
NULL 1.2.3
41_42_43_44_45_46_47_48_49_50 1.2.3
51_52_*_54_55_56_57_58_59_60 1.2.3
61_62_63_64_65_66_67_68_69_70 1.2.3

## cardinality

# cardinality scalar function
Expand All @@ -530,10 +580,10 @@ select cardinality(make_array(1, 2, 3, 4, 5)), cardinality([1, 3, 5]), cardinali
5 3 5

# cardinality scalar function #2
query error DataFusion error: SQL error: ParserError\("Expected an SQL statement, found: caused"\)
caused by
Error during planning: Cannot automatically convert List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) to List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\)
query II
select cardinality(make_array([1, 2], [3, 4], [5, 6])), cardinality(array_fill(3, array[3, 2, 3]));
----
6 18

# cardinality scalar function #3
query II
Expand Down Expand Up @@ -562,10 +612,10 @@ select trim_array(make_array(1, 2, 3, 4, 5), 2), trim_array(['h', 'e', 'l', 'l',
[1, 2, 3] [h, e] [1.0]

# trim_array scalar function #2
query error DataFusion error: SQL error: ParserError\("Expected an SQL statement, found: caused"\)
caused by
Error during planning: Cannot automatically convert List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) to List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\)
query ??
select trim_array([[1, 2], [3, 4], [5, 6]], 2), trim_array(array_fill(4, [3, 4, 2]), 2);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

trim_array too (See: #6855 (reply in thread)) (it will be replaced with array_slice).

----
[[1, 2]] [[[4, 4], [4, 4], [4, 4], [4, 4]]]

# trim_array scalar function #3
query ?
Expand Down Expand Up @@ -600,10 +650,10 @@ select array_length(make_array(1, 2, 3, 4, 5), 2), array_length(make_array(1, 2,
NULL NULL 2

# array_length scalar function #4
query error DataFusion error: SQL error: ParserError\("Expected an SQL statement, found: caused"\)
caused by
Error during planning: Cannot automatically convert List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) to List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\)
query IIII
select array_length(array_fill(3, [3, 2, 5]), 1), array_length(array_fill(3, [3, 2, 5]), 2), array_length(array_fill(3, [3, 2, 5]), 3), array_length(array_fill(3, [3, 2, 5]), 4);
----
3 2 5 NULL

# array_length scalar function #5
query III
Expand Down Expand Up @@ -646,10 +696,10 @@ select array_dims(make_array(1, 2, 3)), array_dims(make_array([1, 2], [3, 4])),
[3] [2, 2] [1, 1, 1, 2, 1]

# array_dims scalar function #2
query error DataFusion error: SQL error: ParserError\("Expected an SQL statement, found: caused"\)
caused by
Error during planning: Cannot automatically convert List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) to List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\)
query ??
select array_dims(array_fill(2, [1, 2, 3])), array_dims(array_fill(3, [2, 5, 4]));
----
[1, 2, 3] [2, 5, 4]

# array_dims scalar function #3
query ??
Expand Down Expand Up @@ -678,10 +728,10 @@ select array_ndims(make_array(1, 2, 3)), array_ndims(make_array([1, 2], [3, 4]))
1 2 5

# array_ndims scalar function #2
query error DataFusion error: SQL error: ParserError\("Expected an SQL statement, found: caused"\)
caused by
Error during planning: Cannot automatically convert List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) to List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\)
query II
select array_ndims(array_fill(1, [1, 2, 3])), array_ndims([[[[[[[[[[[[[[[[[[[[[1]]]]]]]]]]]]]]]]]]]]]);
----
3 21

# array_ndims scalar function #3
query II
Expand Down Expand Up @@ -763,10 +813,8 @@ select 1 || make_array(2, 3, 4), 1.0 || make_array(2.0, 3.0, 4.0), 'h' || make_a
----
[1, 2, 3, 4] [1.0, 2.0, 3.0, 4.0] [h, e, l, l, o]


### Array casting tests


## make_array

# make_array scalar function #1
Expand Down
67 changes: 51 additions & 16 deletions datafusion/physical-expr/src/array_expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1040,17 +1040,17 @@ macro_rules! to_string {
}
}
}

Ok($ARG)
}};
}

/// Array_to_string SQL function
pub fn array_to_string(args: &[ArrayRef]) -> Result<ArrayRef> {
let arr = &args[0];
let delimeter = as_generic_string_array::<i32>(&args[1])?
.value(0)
.to_string();

let delimeters = as_generic_string_array::<i32>(&args[1])?;
let delimeters: Vec<Option<&str>> = delimeters.iter().collect();

let mut null_string = String::from("");
let mut with_null_string = false;
if args.len() == 3 {
Expand Down Expand Up @@ -1195,21 +1195,56 @@ pub fn array_to_string(args: &[ArrayRef]) -> Result<ArrayRef> {
}

let mut arg = String::from("");
let mut res = compute_array_to_string(
&mut arg,
arr.clone(),
delimeter.clone(),
null_string,
with_null_string,
)?
.clone();
match res.as_str() {
"" => Ok(Arc::new(StringArray::from(vec![Some(res)]))),
let mut res: Vec<Option<String>> = Vec::new();

match arr.data_type() {
DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _) => {
let list_array = arr.as_list::<i32>();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is going to work (will panic) for LargeList as it should be i64 -- we would probably have to make a generic version of this function.

However, for now I think it looks reasonable and a step forward to me

for (arr, &delimeter) in list_array.iter().zip(delimeters.iter()) {
if let (Some(arr), Some(delimeter)) = (arr, delimeter) {
arg = String::from("");
let s = compute_array_to_string(
&mut arg,
arr,
delimeter.to_string(),
null_string.clone(),
with_null_string,
)?
.clone();

if let Some(s) = s.strip_suffix(delimeter) {
res.push(Some(s.to_string()));
} else {
res.push(Some(s));
}
} else {
res.push(None);
}
}
}
_ => {
res.truncate(res.len() - delimeter.len());
Ok(Arc::new(StringArray::from(vec![Some(res)])))
// delimeter length is 1
assert_eq!(delimeters.len(), 1);
let delimeter = delimeters[0].unwrap();
let s = compute_array_to_string(
&mut arg,
arr.clone(),
delimeter.to_string(),
null_string,
with_null_string,
)?
.clone();

if !s.is_empty() {
let s = s.strip_suffix(delimeter).unwrap().to_string();
res.push(Some(s));
Comment on lines +1239 to +1240
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not quite sure if it is sure that delimiter is always present. If it is not, this code will panic.

Perhaps it could be something like the following to aavoid the unwrap?

Suggested change
let s = s.strip_suffix(delimeter).unwrap().to_string();
res.push(Some(s));
let s = s.strip_suffix(delimeter).unwrap_or(s).to_string();
res.push(Some(s));

} else {
res.push(Some(s));
}
}
}

Ok(Arc::new(StringArray::from(res)))
}

/// Trim_array SQL function
Expand Down