diff --git a/distribution/std-lib/Base/src/Error/Extensions.enso b/distribution/std-lib/Base/src/Error/Extensions.enso index 5ae59204e962..ec61324189ee 100644 --- a/distribution/std-lib/Base/src/Error/Extensions.enso +++ b/distribution/std-lib/Base/src/Error/Extensions.enso @@ -26,3 +26,7 @@ unimplemented message="" = Panic.throw (Unimplemented_Error message) Error.catch : (Error -> Any) -> Any Error.catch (handler = x->x) = this.catch_primitive handler +## Takes any value, and if it is a dataflow error, throws it as a Panic. + Otherwise, returns the original value unchanged. +Panic.rethrow : (Any ! Any) -> Any +Panic.rethrow value = value.catch Panic.throw diff --git a/distribution/std-lib/Table/src/Data/Order_Rule.enso b/distribution/std-lib/Table/src/Data/Order_Rule.enso new file mode 100644 index 000000000000..904109a646c3 --- /dev/null +++ b/distribution/std-lib/Table/src/Data/Order_Rule.enso @@ -0,0 +1,29 @@ +from Base import all + +type Order_Rule + ## UNSTABLE + + A rule used for sorting table-like structures. + + Arguments: + - column: a value representing the data dimension by which this rule is + sorting. This type does not specify the underlying representation of + a column, assuming that the sorting engine defines its own column + representation. + - comparator: a function taking two elements of the data being sorted + on and returning an `Ordering`. The function may be `Nothing`, in + which case a natural ordering will be used. Note that certain table + backends (such as database connectors) may not support this field + being set to a non-`Nothing` value. + - order: specifies whether the table should be sorted in an ascending + or descending order. The default value of `Nothing` delegates the + decision to the sorting function. Can be set to + `Sort_Order.Ascending` or `Sort_Order.Descending` from the `Base` + library, to specify the ordering. + - missing_last: whether the missing values should be placed at the + beginning or end of the sorted table. 
Note that this argument is + independent from `order`, i.e. missing values will always be sorted + according to this rule, ignoring the ascending / descending setting. + The default value of `Nothing` delegates the decision to the sorting + function. + type Order_Rule column comparator=Nothing order=Nothing missing_last=Nothing diff --git a/distribution/std-lib/Table/src/Data/Table.enso b/distribution/std-lib/Table/src/Data/Table.enso index 696a1cf94cf5..c7c6963f5126 100644 --- a/distribution/std-lib/Table/src/Data/Table.enso +++ b/distribution/std-lib/Table/src/Data/Table.enso @@ -2,8 +2,13 @@ from Base import all import Table.Io.Csv import Table.Data.Column import Base.System.Platform +from Table.Data.Order_Rule as Order_Rule_Module import Order_Rule polyglot java import org.enso.table.data.table.Table as Java_Table +polyglot java import org.enso.table.operations.OrderBuilder + +## An error returned when a non-existent column is being looked up. +type No_Such_Column_Error column_name ## Represents a column-oriented table data structure. type Table @@ -48,9 +53,9 @@ type Table Json.Object fields ## Returns the column with the given name. - at : Text -> Column | Nothing + at : Text -> Column ! No_Such_Column_Error at name = case this.java_table.getColumnByName name of - Nothing -> Nothing + Nothing -> Error.throw (No_Such_Column_Error name) c -> Column.Column c ## Selects only the rows of this table that correspond to `True` values in @@ -165,6 +170,109 @@ type Table group by=Nothing = Aggregate_Table (this.java_table.group by) + ## UNSTABLE + + Sorts the table according to the specified rules. + + Arguments: + - by: specifies the columns used for reordering the table. This + argument may be one of: + - a text: the text is treated as a column name. + - a column: any column, that may or may not belong to this table. + Sorting by a column will result in reordering the rows of this + table in a way that would result in sorting the given column. 
+ - an order rule: specifies both the sorting column and additional + settings, that will take precedence over the global parameters of + this sort operation. The `column` field of the rule may be a text + or a column, with the semantics described above. + - a vector of any of the above: this will result in a hierarchical + sorting, such that the first rule is applied first, the second is + used for breaking ties, etc. + - order: specifies the default sort order for this operation. All the + rules specified in the `by` argument will default to this setting, + unless specified in the rule. + - missing_last: specifies the default placement of missing values when + compared to non-missing ones. This setting may be overridden by the + particular rules of the `by` argument. Note that this argument is + independent from `order`, i.e. missing values will always be sorted + according to this rule, ignoring the ascending / descending setting. + + > Example + Sorting `table` in ascending order by the value in column `'Quantity'` + table.sort by='Quantity' + + > Example + Sorting `table` in descending order by the value in column `'Quantity'`, + placing missing values at the top of the table. + table.sort by='Quantity' order=Sort_Order.Descending missing_last=False + + > Example + Sorting `table` in ascending order by the value in column `'Quantity'`, + using the value in column `'Rating'` for breaking ties. + table.sort by=['Quantity', 'Rating'] + + > Example + Sorting `table` in ascending order by the value in column `'Quantity'`, + using the value in column `'Rating'` in descending order for breaking + ties. + table.sort by=['Quantity', Order_Rule 'Rating' (order=Sort_Order.Descending)] + + > Example + Sorting `table` in ascending order by the value in an externally + computed column, using the value in column `'Rating'` for breaking + ties. 
+ quality_ratio = table.at 'Rating' / table.at 'Price' + table.sort by=[quality_ratio, 'Rating'] + + > Sorting `table` in ascending order, by the value in column + `'position'`, using a custom comparator function. + manhattan_comparator a b = (a.x.abs + a.y.abs) . compare_to (b.x.abs + b.y.abs) + table.sort by=(Order_Rule 'position' comparator=manhattan_comparator) + sort : Text | Column.Column | Order_Rule | Vector.Vector (Text | Column.Column | Order_Rule) -> Sort_Order -> Boolean -> Table + sort by order=Sort_Order.Ascending missing_last=True = Panic.recover <| + rules = this.build_java_order_rules by order missing_last + fallback_cmp = here.comparator_to_java .compare_to + mask = OrderBuilder.buildOrderMask rules.to_array fallback_cmp + new_table = this.java_table.applyMask mask + Table new_table + + ## PRIVATE + build_java_order_rules rules order missing_last = case rules of + Text -> [this.build_java_order_rule rules order missing_last] + Column.Column _ -> [this.build_java_order_rule rules order missing_last] + Order_Rule _ _ _ _ -> [this.build_java_order_rule rules order missing_last] + Vector.Vector _ -> rules.map (this.build_java_order_rule _ order missing_last) + + ## PRIVATE + build_java_order_rule rule order missing_last = + order_bool = case order of + Sort_Order.Ascending -> True + Sort_Order.Descending -> False + case rule of + Text -> + column = Panic.rethrow (this.at rule) + OrderBuilder.OrderRule.new column.java_column Nothing order_bool missing_last + Column.Column c -> + OrderBuilder.OrderRule.new c Nothing order_bool missing_last + Order_Rule col_ref cmp rule_order rule_nulls_last -> + c = case col_ref of + Text -> this.at col_ref . 
java_column + Column.Column c -> c + o = case rule_order of + Nothing -> order_bool + Sort_Order.Ascending -> True + Sort_Order.Descending -> False + nulls = case rule_nulls_last of + Nothing -> missing_last + _ -> rule_nulls_last + java_cmp = case cmp of + Nothing -> Nothing + c -> here.comparator_to_java c + OrderBuilder.OrderRule.new c java_cmp o nulls + +## PRIVATE +comparator_to_java cmp x y = cmp x y . to_sign + ## Represents a table with grouped rows. type Aggregate_Table type Aggregate_Table java_table diff --git a/distribution/std-lib/Table/src/Main.enso b/distribution/std-lib/Table/src/Main.enso index 9f87c8cd3b40..9ed740bc941c 100644 --- a/distribution/std-lib/Table/src/Main.enso +++ b/distribution/std-lib/Table/src/Main.enso @@ -3,10 +3,12 @@ from Base import all import Table.Io.Csv import Table.Data.Table import Table.Data.Column +import Table.Data.Order_Rule from Table.Io.Csv export all hiding Parser export Table.Data.Column -from Table.Data.Table export new, join +from Table.Data.Table export new, join, No_Such_Column_Error +from Table.Data.Order_Rule export Order_Rule ## Converts a JSON array into a dataframe, by looking up the requested keys from each item. 
diff --git a/table/src/main/java/org/enso/table/data/column/storage/BoolStorage.java b/table/src/main/java/org/enso/table/data/column/storage/BoolStorage.java index 8b1a7f069498..48c5d300e373 100644 --- a/table/src/main/java/org/enso/table/data/column/storage/BoolStorage.java +++ b/table/src/main/java/org/enso/table/data/column/storage/BoolStorage.java @@ -1,10 +1,13 @@ package org.enso.table.data.column.storage; import java.util.BitSet; +import java.util.Comparator; + import org.enso.table.data.column.operation.map.MapOpStorage; import org.enso.table.data.column.operation.map.MapOperation; import org.enso.table.data.column.operation.map.UnaryMapOperation; import org.enso.table.data.index.Index; +import org.enso.table.data.mask.OrderMask; import org.enso.table.error.UnexpectedColumnTypeException; import org.enso.table.error.UnexpectedTypeException; @@ -120,7 +123,8 @@ public Storage mask(BitSet mask, int cardinality) { } @Override - public Storage orderMask(int[] positions) { + public Storage applyMask(OrderMask mask) { + int[] positions = mask.getPositions(); BitSet newNa = new BitSet(); BitSet newVals = new BitSet(); for (int i = 0; i < positions.length; i++) { @@ -297,4 +301,10 @@ public static BitSet toMask(BoolStorage storage) { mask.andNot(storage.getIsMissing()); return mask; } + + @SuppressWarnings("unchecked") + @Override + public Comparator getDefaultComparator() { + return Comparator.naturalOrder(); + } } diff --git a/table/src/main/java/org/enso/table/data/column/storage/DoubleStorage.java b/table/src/main/java/org/enso/table/data/column/storage/DoubleStorage.java index e640f4f46c8d..4cb290fcc089 100644 --- a/table/src/main/java/org/enso/table/data/column/storage/DoubleStorage.java +++ b/table/src/main/java/org/enso/table/data/column/storage/DoubleStorage.java @@ -1,12 +1,15 @@ package org.enso.table.data.column.storage; import java.util.BitSet; +import java.util.Comparator; + import org.enso.table.data.column.builder.object.NumericBuilder; import 
org.enso.table.data.column.operation.map.MapOpStorage; import org.enso.table.data.column.operation.map.UnaryMapOperation; import org.enso.table.data.column.operation.map.numeric.DoubleBooleanOp; import org.enso.table.data.column.operation.map.numeric.DoubleNumericOp; import org.enso.table.data.index.Index; +import org.enso.table.data.mask.OrderMask; /** A column containing floating point numbers. */ public class DoubleStorage extends NumericStorage { @@ -126,7 +129,8 @@ public DoubleStorage mask(BitSet mask, int cardinality) { } @Override - public Storage orderMask(int[] positions) { + public Storage applyMask(OrderMask mask) { + int[] positions = mask.getPositions(); long[] newData = new long[positions.length]; BitSet newMissing = new BitSet(); for (int i = 0; i < positions.length; i++) { @@ -157,6 +161,11 @@ public Storage countMask(int[] counts, int total) { return new DoubleStorage(newData, total, newMissing); } + @Override + public Comparator getDefaultComparator() { + return Comparator.naturalOrder(); + } + public BitSet getIsMissing() { return isMissing; } diff --git a/table/src/main/java/org/enso/table/data/column/storage/LongStorage.java b/table/src/main/java/org/enso/table/data/column/storage/LongStorage.java index 3a573bd115b0..792d47d4a968 100644 --- a/table/src/main/java/org/enso/table/data/column/storage/LongStorage.java +++ b/table/src/main/java/org/enso/table/data/column/storage/LongStorage.java @@ -1,21 +1,17 @@ package org.enso.table.data.column.storage; -import java.util.Arrays; -import java.util.BitSet; -import java.util.OptionalDouble; -import java.util.OptionalLong; -import java.util.stream.DoubleStream; +import java.util.*; import java.util.stream.LongStream; import org.enso.table.data.column.builder.object.NumericBuilder; import org.enso.table.data.column.operation.aggregate.Aggregator; import org.enso.table.data.column.operation.aggregate.numeric.LongToLongAggregator; -import 
org.enso.table.data.column.operation.aggregate.numeric.NumericAggregator; import org.enso.table.data.column.operation.map.MapOpStorage; import org.enso.table.data.column.operation.map.UnaryMapOperation; import org.enso.table.data.column.operation.map.numeric.LongBooleanOp; import org.enso.table.data.column.operation.map.numeric.LongNumericOp; import org.enso.table.data.index.Index; +import org.enso.table.data.mask.OrderMask; /** A column storing 64-bit integers. */ public class LongStorage extends NumericStorage { @@ -196,7 +192,8 @@ public LongStorage mask(BitSet mask, int cardinality) { } @Override - public Storage orderMask(int[] positions) { + public Storage applyMask(OrderMask mask) { + int[] positions = mask.getPositions(); long[] newData = new long[positions.length]; BitSet newMissing = new BitSet(); for (int i = 0; i < positions.length; i++) { @@ -227,6 +224,12 @@ public Storage countMask(int[] counts, int total) { return new LongStorage(newData, total, newMissing); } + @SuppressWarnings("unchecked") + @Override + public Comparator getDefaultComparator() { + return Comparator.naturalOrder(); + } + public BitSet getIsMissing() { return isMissing; } diff --git a/table/src/main/java/org/enso/table/data/column/storage/ObjectStorage.java b/table/src/main/java/org/enso/table/data/column/storage/ObjectStorage.java index 2048ce165be8..38369a5f245c 100644 --- a/table/src/main/java/org/enso/table/data/column/storage/ObjectStorage.java +++ b/table/src/main/java/org/enso/table/data/column/storage/ObjectStorage.java @@ -1,10 +1,12 @@ package org.enso.table.data.column.storage; import java.util.BitSet; +import java.util.Comparator; import org.enso.table.data.column.operation.map.MapOpStorage; import org.enso.table.data.column.operation.map.UnaryMapOperation; import org.enso.table.data.index.Index; +import org.enso.table.data.mask.OrderMask; /** A column storing arbitrary objects. 
*/ public class ObjectStorage extends Storage { @@ -92,7 +94,8 @@ public ObjectStorage mask(BitSet mask, int cardinality) { } @Override - public ObjectStorage orderMask(int[] positions) { + public ObjectStorage applyMask(OrderMask mask) { + int[] positions = mask.getPositions(); Object[] newData = new Object[positions.length]; for (int i = 0; i < positions.length; i++) { if (positions[i] == Index.NOT_FOUND) { @@ -120,6 +123,11 @@ public Object[] getData() { return data; } + @Override + public Comparator getDefaultComparator() { + return null; + } + private static MapOpStorage buildOps() { MapOpStorage ops = new MapOpStorage<>(); ops.add( diff --git a/table/src/main/java/org/enso/table/data/column/storage/Storage.java b/table/src/main/java/org/enso/table/data/column/storage/Storage.java index 152c9192c5cd..0fa259bfe2de 100644 --- a/table/src/main/java/org/enso/table/data/column/storage/Storage.java +++ b/table/src/main/java/org/enso/table/data/column/storage/Storage.java @@ -7,12 +7,13 @@ import org.enso.table.data.column.operation.aggregate.FunctionAggregator; import java.util.BitSet; +import java.util.Comparator; import java.util.List; import java.util.function.BiFunction; import java.util.function.Function; -import org.enso.table.data.column.builder.object.Builder; -import org.enso.table.data.column.builder.object.InferredBuilder; + import org.enso.table.data.column.builder.object.ObjectBuilder; +import org.enso.table.data.mask.OrderMask; /** An abstract representation of a data column. */ public abstract class Storage { @@ -228,16 +229,11 @@ protected final Storage fillMissingHelper(Object arg, Builder builder) { public abstract Storage mask(BitSet mask, int cardinality); /** - * Returns a new storage, ordered according to the rules specified in a mask. The resulting - * storage should contain the {@code positions[i]}-th element of the original storage at the i-th - * position. 
{@code positions[i]} may be equal to {@link - * org.enso.table.data.index.Index.NOT_FOUND}, in which case a missing value should be inserted at - * this position. + * Returns a new storage, ordered according to the rules specified in a mask. * - * @param positions an array specifying the ordering as described - * @return a storage resulting from applying the reordering rules + * @param mask the order mask describing the reordering; the result is a storage with the reordering rules applied */ - public abstract Storage orderMask(int[] positions); + public abstract Storage applyMask(OrderMask mask); /** * Returns a new storage, resulting from applying the rules specified in a mask. The resulting @@ -251,4 +247,10 @@ protected final Storage fillMissingHelper(Object arg, Builder builder) { * @return the storage masked according to the specified rules */ public abstract Storage countMask(int[] counts, int total); + + /** + * @return a comparator comparing objects in this storage in a natural order. May be {@code null} + * to specify no natural ordering. + */ + public abstract Comparator getDefaultComparator(); } diff --git a/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java b/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java index 9436d29a2a34..125ba0ddf30e 100644 --- a/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java +++ b/table/src/main/java/org/enso/table/data/column/storage/StringStorage.java @@ -1,10 +1,13 @@ package org.enso.table.data.column.storage; import java.util.BitSet; +import java.util.Comparator; + import org.enso.table.data.column.builder.object.StringBuilder; import org.enso.table.data.column.operation.map.MapOpStorage; import org.enso.table.data.column.operation.map.MapOperation; import org.enso.table.data.column.operation.map.text.StringBooleanOp; +import org.enso.table.data.mask.OrderMask; /** A column storing strings. 
*/ public class StringStorage extends ObjectStorage { @@ -64,8 +67,8 @@ public StringStorage mask(BitSet mask, int cardinality) { } @Override - public StringStorage orderMask(int[] positions) { - ObjectStorage storage = super.orderMask(positions); + public StringStorage applyMask(OrderMask mask) { + ObjectStorage storage = super.applyMask(mask); return new StringStorage(storage.getData(), (int) storage.size()); } @@ -75,6 +78,11 @@ public StringStorage countMask(int[] counts, int total) { return new StringStorage(storage.getData(), total); } + @Override + public Comparator getDefaultComparator() { + return Comparator.naturalOrder(); + } + private static MapOpStorage buildOps() { MapOpStorage t = ObjectStorage.ops.makeChild(); t.add( diff --git a/table/src/main/java/org/enso/table/data/index/DefaultIndex.java b/table/src/main/java/org/enso/table/data/index/DefaultIndex.java index e507ea4c2517..8baf5ca94a6d 100644 --- a/table/src/main/java/org/enso/table/data/index/DefaultIndex.java +++ b/table/src/main/java/org/enso/table/data/index/DefaultIndex.java @@ -1,5 +1,7 @@ package org.enso.table.data.index; +import org.enso.table.data.mask.OrderMask; + import java.util.BitSet; import java.util.Collections; import java.util.List; @@ -60,4 +62,9 @@ public Index unique() { public int size() { return size; } + + @Override + public Index applyMask(OrderMask mask) { + return this; + } } diff --git a/table/src/main/java/org/enso/table/data/index/HashIndex.java b/table/src/main/java/org/enso/table/data/index/HashIndex.java index 526f8d379405..50bbaed1bfa7 100644 --- a/table/src/main/java/org/enso/table/data/index/HashIndex.java +++ b/table/src/main/java/org/enso/table/data/index/HashIndex.java @@ -1,6 +1,8 @@ package org.enso.table.data.index; import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.mask.OrderMask; + import java.util.*; public class HashIndex extends Index { @@ -61,6 +63,12 @@ public Index countMask(int[] counts, int total) { return 
HashIndex.fromStorage(name, newSt); } + @Override + public Index applyMask(OrderMask mask) { + Storage newSt = items.applyMask(mask); + return HashIndex.fromStorage(name, newSt); + } + @Override public Index unique() { HashMap> newLocs = new HashMap<>(); diff --git a/table/src/main/java/org/enso/table/data/index/Index.java b/table/src/main/java/org/enso/table/data/index/Index.java index ac44d79018a5..89a5e73040d0 100644 --- a/table/src/main/java/org/enso/table/data/index/Index.java +++ b/table/src/main/java/org/enso/table/data/index/Index.java @@ -1,5 +1,8 @@ package org.enso.table.data.index; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.mask.OrderMask; + import java.util.BitSet; import java.util.List; @@ -63,6 +66,14 @@ public abstract class Index { */ public abstract Index countMask(int[] counts, int total); + /** + * Returns a new index, ordered according to the rules specified in a mask. + * + * @param mask an order mask specifying the reordering + * @return an index resulting from applying the reordering rules + */ + public abstract Index applyMask(OrderMask mask); + /** @return the number of elements in this index. */ public abstract int size(); } diff --git a/table/src/main/java/org/enso/table/data/mask/OrderMask.java b/table/src/main/java/org/enso/table/data/mask/OrderMask.java new file mode 100644 index 000000000000..48364df1f06a --- /dev/null +++ b/table/src/main/java/org/enso/table/data/mask/OrderMask.java @@ -0,0 +1,28 @@ +package org.enso.table.data.mask; + +/** Describes a storage reordering operator. */ +public class OrderMask { + private final int[] positions; + + /** + * Creates a new reordering operator, with the specified characteristics. See {@link + * #getPositions()} for a description of the semantics. 
+ * + * @param positions the positions array, as described by {@link #getPositions()} + */ + public OrderMask(int[] positions) { + this.positions = positions; + } + + /** + * Describes the reordering that should happen on the applying storage. + * + *

The resulting storage should contain the {@code positions[i]}-th element of the original + * storage at the i-th position. {@code positions[i]} may be equal to {@link + * org.enso.table.data.index.Index.NOT_FOUND}, in which case a missing value should be inserted at + * this position. + */ + public int[] getPositions() { + return positions; + } +} diff --git a/table/src/main/java/org/enso/table/data/table/Table.java b/table/src/main/java/org/enso/table/data/table/Table.java index 50c34abe7516..0b77b5e32663 100644 --- a/table/src/main/java/org/enso/table/data/table/Table.java +++ b/table/src/main/java/org/enso/table/data/table/Table.java @@ -11,6 +11,7 @@ import org.enso.table.data.index.DefaultIndex; import org.enso.table.data.index.HashIndex; import org.enso.table.data.index.Index; +import org.enso.table.data.mask.OrderMask; import org.enso.table.data.table.aggregate.AggregateTable; import org.enso.table.error.NoSuchColumnException; import org.enso.table.error.UnexpectedColumnTypeException; @@ -213,19 +214,20 @@ public Table join(Table other, boolean dropUnmatched, String on, String lsuffix, } outSize += countMask[i]; } - int[] orderMask = new int[outSize]; + int[] orderMaskArr = new int[outSize]; int orderMaskPosition = 0; for (int i = 0; i < s; i++) { if (matches[i] == null) { if (!dropUnmatched) { - orderMask[orderMaskPosition++] = Index.NOT_FOUND; + orderMaskArr[orderMaskPosition++] = Index.NOT_FOUND; } } else { for (Integer x : matches[i]) { - orderMask[orderMaskPosition++] = x; + orderMaskArr[orderMaskPosition++] = x; } } } + OrderMask orderMask = new OrderMask(orderMaskArr); Column[] newColumns = new Column[this.columns.length + other.columns.length]; Index newIndex = index.countMask(countMask, outSize); Set lnames = @@ -246,11 +248,30 @@ public Table join(Table other, boolean dropUnmatched, String on, String lsuffix, new Column( suffixIfNecessary(lnames, original.getName(), rsuffix), newIndex, - original.getStorage().orderMask(orderMask)); + 
original.getStorage().applyMask(orderMask)); } return new Table(newColumns, newIndex); } + /** + * Applies an order mask to all columns and indexes of this table. + * + * @param orderMask the mask to apply + * @return a new table, with all columns and indexes reordered accordingly + */ + public Table applyMask(OrderMask orderMask) { + final Index newIndex = index.applyMask(orderMask); + Column[] newColumns = + Arrays.stream(columns) + .map( + column -> { + Storage newStorage = column.getStorage().applyMask(orderMask); + return new Column(column.getName(), newIndex, newStorage); + }) + .toArray(Column[]::new); + return new Table(newColumns, newIndex); + } + private String suffixIfNecessary(Set names, String name, String suffix) { return names.contains(name) ? name + suffix : name; } diff --git a/table/src/main/java/org/enso/table/operations/OrderBuilder.java b/table/src/main/java/org/enso/table/operations/OrderBuilder.java new file mode 100644 index 000000000000..1b001d709ac8 --- /dev/null +++ b/table/src/main/java/org/enso/table/operations/OrderBuilder.java @@ -0,0 +1,99 @@ +package org.enso.table.operations; + +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.mask.OrderMask; +import org.enso.table.data.table.Column; + +import java.util.Comparator; +import java.util.List; +import java.util.stream.IntStream; + +/** Builds an order mask resulting in sorting storages according to specified rules. */ +public class OrderBuilder { + public static class OrderRule { + private final Column column; + private final Comparator customComparator; + private final boolean ascending; + private final boolean missingLast; + + /** + * A single-column ordering rule. 
+ * + * @param column the column to use for ordering + * @param customComparator a comparator that should be used instead of natural ordering of the + * values + * @param ascending whether column should be sorted ascending or descending + * @param missingLast whether or not missing values should be placed at the start or end of the + * ordering + */ + public OrderRule( + Column column, + Comparator customComparator, + boolean ascending, + boolean missingLast) { + this.column = column; + this.customComparator = customComparator; + this.ascending = ascending; + this.missingLast = missingLast; + } + + /** + * Builds an index-comparing comparator, that will sort array indexes according to the specified + * ordering of the underlying column. + * + * @param fallbackComparator a base value comparator, used in case the column does not define a + * natural ordering. + * @return a comparator with properties described above + */ + public Comparator toComparator(Comparator fallbackComparator) { + final Storage storage = column.getStorage(); + Comparator itemCmp = customComparator; + if (itemCmp == null) { + itemCmp = column.getStorage().getDefaultComparator(); + } + if (itemCmp == null) { + itemCmp = fallbackComparator; + } + if (!ascending) { + itemCmp = itemCmp.reversed(); + } + if (missingLast) { + itemCmp = Comparator.nullsLast(itemCmp); + } else { + itemCmp = Comparator.nullsFirst(itemCmp); + } + + final Comparator cmp = itemCmp; + Comparator result = + (i, j) -> cmp.compare(storage.getItemBoxed(i), storage.getItemBoxed(j)); + return result; + } + } + + /** + * Builds an order mask based on the specified set of rules. + * + * @param rules a list of rules that should be used in generating the ordering. The rules are + * treated hierarchically, i.e. the first rule is applied first, all the groups of equal + * elements are then internally reordered according to the second rule etc. The ordering is + * stable, i.e. 
if no rule disambiguates the ordering, the original position in the storage is + * used instead. + * @param fallbackComparator a comparator that should be used for columns that do not define a + * natural ordering. + * @return an order mask that will result in sorting any storage according to the specified + * rules. + */ + public static OrderMask buildOrderMask( + List rules, Comparator fallbackComparator) { + int size = (int) rules.get(0).column.getSize(); + Comparator comparator = + rules.stream() + .map(rule -> rule.toComparator(fallbackComparator)) + .reduce(Comparator::thenComparing) + .get(); + + int[] positions = + IntStream.range(0, size).boxed().sorted(comparator).mapToInt(i -> i).toArray(); + return new OrderMask(positions); + } +} diff --git a/test/Table_Tests/data/clothes.csv b/test/Table_Tests/data/clothes.csv new file mode 100644 index 000000000000..24036de98462 --- /dev/null +++ b/test/Table_Tests/data/clothes.csv @@ -0,0 +1,7 @@ +Id,Name,Quantity,Rating,Price +1,shoes,20,3.0,37.2 +2,trousers,10,,42.1 +3,dress,20,7.3,64.1 +4,skirt,10,3.0,87.4 +5,blouse,30,2.2,13.5 +6,t-shirt,30,,64.2 diff --git a/test/Table_Tests/src/Table_Spec.enso b/test/Table_Tests/src/Table_Spec.enso index a1271459bb71..80deeb129170 100644 --- a/test/Table_Tests/src/Table_Spec.enso +++ b/test/Table_Tests/src/Table_Spec.enso @@ -9,6 +9,8 @@ My.== that = case that of My x1 y1 -> (this.x + this.y) == (x1 + y1) _ -> False +My.compare_to that = this.x+this.y . compare_to that.x+that.y + My.frobnicate = case this of My x1 y1 -> My y1 x1 @@ -321,4 +323,55 @@ spec = if sorted.is_empty then Nothing else sorted.at (sorted.length-1 / 2).floor agg.at 'quantity' . reduce median . to_vector . should_equal [30, 20, 40, 70] + Test.group "Sorting" <| + df = (Enso_Project.data / "clothes.csv").read_csv + + Test.specify "should allow sorting by a single column name" <| + r_1 = df.sort by="Quantity" + r_1.at 'Id' . to_vector . 
should_equal [2,4,1,3,5,6] + + r_2 = df.sort by="Rating" missing_last=False + r_2.at 'Id' . to_vector . should_equal [2,6,5,1,4,3] + + r_3 = df.sort by="Rating" missing_last=False order=Sort_Order.Descending + r_3.at 'Id' . to_vector . should_equal [2,6,3,1,4,5] + + Test.specify 'should allow sorting by multiple column names' <| + r_1 = df.sort by=['Quantity', 'Rating'] + r_1.at 'Id' . to_vector . should_equal [4,2,1,3,5,6] + + r_2 = df.sort by=['Rating', 'Quantity'] missing_last=False order=Sort_Order.Descending + r_2.at 'Id' . to_vector . should_equal [6,2,3,1,4,5] + + Test.specify 'should allow sorting by external columns' <| + quality_ratio = df.at 'Rating' / df.at 'Price' + + r_1 = df.sort by=quality_ratio + r_1.at 'Id' . to_vector . should_equal [4,1,3,5,2,6] + + r_2 = df.sort by=['Quantity', quality_ratio] + r_2.at 'Id' . to_vector . should_equal [4,2,1,3,5,6] + + Test.specify 'should allow sorting with specific by-column rules' <| + r_1 = df.sort by=['Quantity', (Order_Rule 'Price' order=Sort_Order.Descending)] + r_1.at 'Id' . to_vector . should_equal [4,2,3,1,6,5] + + Test.specify 'should respect defined comparison operations for custom types' <| + c_1 = ['id', [1, 2, 3, 4, 5, 6]] + c_2 = ['val', [My 1 2, My 3 4, My 2 1, My 5 2, My 7 0, My 4 -1]] + df = Table.new [c_1, c_2] + r = df.sort by='val' + r.at 'id' . to_vector . should_equal [1,3,6,2,4,5] + + Test.specify 'should allow passing a custom comparator per column and should missing-proof it' <| + c_1 = ['id', [1, 2, 3, 4, 5, 6]] + c_2 = ['val', [My 1 2, My 2 5, My 3 4, My 6 3, Nothing, My 1 0]] + df = Table.new [c_1, c_2] + + cmp a b = (a.x-a.y).abs . compare_to (b.x-b.y).abs + r = df.sort by=(Order_Rule 'val' comparator=cmp) + r.at 'id' . to_vector . should_equal [1,3,6,2,4,5] + Test.specify 'should return dataflow error when passed a non-existent column' <| + r = df.sort by='foobar' + r.should_fail_with No_Such_Column_Error