From 67d9004a88bcb295e8eef51130c4742f143b85f7 Mon Sep 17 00:00:00 2001 From: Vitalie Spinu Date: Sun, 28 Jan 2018 22:28:02 +0100 Subject: [PATCH] [Fix #622] Implement ISO 8601 parsing of periods and durations --- R/periods.r | 55 ++++++++++++++++++++++----------- src/period.c | 86 +++++++++++++++++++++++++++++++++++++++------------- src/utils.c | 37 +++++++++++----------- 3 files changed, 119 insertions(+), 59 deletions(-) diff --git a/R/periods.r b/R/periods.r index 1a4d7045..1b6ab38a 100644 --- a/R/periods.r +++ b/R/periods.r @@ -272,9 +272,9 @@ setMethod("$<-", signature(x = "Period"), function(x, name, value) { x }) -#' Create a period object +#' Create or parse period objects #' -#' `period()` creates a period object with the specified values. +#' `period()` creates or parses a period object with the specified values. #' #' Within a Period object, time units do not have a fixed length (except for #' seconds) until they are added to a date-time. The length of each time unit @@ -290,11 +290,11 @@ setMethod("$<-", signature(x = "Period"), function(x, name, value) { #' minutes, and seconds. Each unit except for seconds must be expressed in #' integer values. #' -#' Period objects can be easily created with the helper functions -#' [years()], [months()], [weeks()], -#' [days()], [hours()], [minutes()], -#' and [seconds()]. These objects can be added to and subtracted -#' to date-times to create a user interface similar to object oriented programming. +#' Besides the main constructor and parser [period()], period objects can also +#' be created with the specialized functions [years()], [months()], [weeks()], +#' [days()], [hours()], [minutes()], and [seconds()]. These objects can be added +#' to and subtracted to date-times to create a user interface similar to object +#' oriented programming. #' #' Note: Arithmetic with periods can results in undefined behavior when #' non-existent dates are involved (such as February 29th). Please see @@ -307,12 +307,12 @@ setMethod("$<-", signature(x = "Period"), function(x, name, value) { #' #' @name period #' @aliases periods -#' @param num a numeric vector that lists the number of time units to be -#' included in the period. From v1.6.0 `num` can also be a character vector -#' that specifies durations in a convenient shorthand format. All unambiguous -#' name units and abbreviations are supported, "m" stands for months, "M" for -#' minutes; see examples. Fractional units are supported but the fractional -#' part is always converted to seconds. +#' @param num a numeric or character vector. A character vector can specify +#' periods in a convenient shorthand format or ISO 8601 specification. All +#' unambiguous name units and abbreviations are supported, "m" stands for +#' months, "M" for minutes unless ISO 8601 "P" modifier is present (see +#' examples). Fractional units are supported but the fractional part is always +#' converted to seconds. #' @param units a character vector that lists the type of units to be used. The #' units in units are matched to the values in num according to their #' order. When `num` is character, this argument is ignored. @@ -330,28 +330,46 @@ setMethod("$<-", signature(x = "Period"), function(x, name, value) { #' @keywords chron classes #' @examples #' +#' ### Separate period and units vectors +#' #' period(c(90, 5), c("second", "minute")) #' # "5M 90S" #' period(-1, "days") #' period(c(3, 1, 2, 13, 1), c("second", "minute", "hour", "day", "week")) #' period(c(1, -60), c("hour", "minute")) #' period(0, "second") +#' +#' ### Units as arguments +#' #' period (second = 90, minute = 5) #' period(day = -1) #' period(second = 3, minute = 1, hour = 2, day = 13, week = 1) #' period(hour = 1, minute = -60) #' period(second = 0) #' period(c(1, -60), c("hour", "minute"), hour = c(1, 2), minute = c(3, 4)) +#' +#' ### Lubridate style parsing +#' #' period("2M 1sec") #' period("2hours 2minutes 1second") #' period("2d 2H 2M 2S") #' period("2days 2hours 2mins 2secs") +#' period("2 days, 2 hours, 2 mins, 2 secs") #' # Missing numerals default to 1. Repeated units are added up. #' duration("day day") -#' # Comparison with characters is supported from v1.6.0. +#' +#' ### ISO 8601 parsing +#' +#' period("P3Y6M4DT12H30M5S") +#' period("P23DT23H") # M stands for months +#' period("10DT10M") # M stands for minutes +#' period("P23DT60H 20min 100 sec") # mixing ISO and lubridate style parsing +#' +#' ### Comparison with characters (from v1.6.0) +#' #' duration("day 2 sec") > "day 1sec" #' -#' ### ELEMENTARY CONSTRUCTORS +#' ### Elementary Constructors #' #' x <- ymd("2009-08-03") #' x + days(1) + hours(6) + minutes(30) @@ -366,7 +384,7 @@ setMethod("$<-", signature(x = "Period"), function(x, name, value) { #' c(1:3) * hours(1) #' hours(1:3) #' -#' #sequencing +#' # sequencing #' y <- ymd(090101) # "2009-01-01 CST" #' y + months(0:11) #' @@ -391,8 +409,9 @@ period <- function(num = NULL, units = "second", ...) { } parse_period <- function(x) { - out <- matrix(.Call(C_parse_period, as.character(x)), nrow = 7L) - new("Period", out[1, ], + out <- .Call(C_parse_period, as.character(x)) + new("Period", + out[1, ], minute = out[2, ], hour = out[3, ], day = out[4, ] + 7L*out[5, ], diff --git a/src/period.c b/src/period.c index 4b25e210..a4d7faba 100644 --- a/src/period.c +++ b/src/period.c @@ -25,17 +25,24 @@ #include "constants.h" #include "utils.h" -static const char *en_units[] = {"SECS", "secs", "seconds", - "MINS", "mins", "minutes", - "HOURS", "hours", - "days", - "weeks", - "months", - "years"}; +static const char *EN_UNITS[] = {"S", "secs", "seconds", + "M", "mins", "minutes", + "H", "hours", // 6 + "D", "days", // 8 + "W", "weeks", // 10 + "M", "months", // 12 + "Y", "years", // 14 + // ISO period delimiters + "M", // 16 + "P", // 17 + "T" // 18 +}; +#define N_EN_UNITS 19 // S=0, M=1, H=2, d=3, w=4, m=5, y=6 -#define N_EN_UNITS 12 +static const char *PERIOD_UNITS[] = {"seconds", "minutes", "hours", + "days", "weeks", "months", "years"}; #define N_PERIOD_UNITS 7 fractionUnit parse_period_unit (const char **c) { @@ -57,17 +64,16 @@ fractionUnit parse_period_unit (const char **c) { } if(**c){ - out.unit = parse_alphanum(c, en_units, N_EN_UNITS, 0); - if (out.unit < 0){ + out.unit = parse_alphanum(c, EN_UNITS, N_EN_UNITS, 0); + if (out.unit < 0 || out.unit > 16) { return out; } else { // if only unit name supplied, default to 1 units if(out.val == -1) out.val = 1; - if (out.unit < 3) out.unit = 0; - else if (out.unit < 6) out.unit = 1; - else if (out.unit < 8) out.unit = 2; - else out.unit = out.unit - 5; + if (out.unit < 3) out.unit = 0; // seconds + else if (out.unit < 6) out.unit = 1; // minutes + else if (out.unit < 16) out.unit = (out.unit - 6)/2 + 2; return out; } } else { @@ -76,31 +82,58 @@ fractionUnit parse_period_unit (const char **c) { } void parse_period_1 (const char **c, double ret[N_PERIOD_UNITS]){ + int P = 0; // ISO period flag + int parsed1 = 0; while (**c) { fractionUnit fu = parse_period_unit(c); + /* Rprintf("P:%d UNIT:%d\n", P, fu.unit); */ if (fu.unit >= 0) { - ret[fu.unit] += fu.val; - if (fu.fraction > 0) { - if (fu.unit == 0) ret[fu.unit] += fu.fraction; - else ret[0] += fu.fraction * SECONDS_IN_ONE[fu.unit]; + if (fu.unit == 17) { // ISO P + P = 1; + } else if (fu.unit == 18) { // ISO T + P = 0; + } else { + if (fu.unit == 16) { // month or minute + fu.unit = P ? 5 : 1; + } + parsed1 = 1; + ret[fu.unit] += fu.val; + if (fu.fraction > 0) { + if (fu.unit == 0) ret[fu.unit] += fu.fraction; + else ret[0] += fu.fraction * SECONDS_IN_ONE[fu.unit]; + } } } else { ret[0] = NA_REAL; break; } } + if (!parsed1) { + ret[0] = NA_REAL; + } +} + +SEXP period_names() { + SEXP names = PROTECT(allocVector(STRSXP, N_PERIOD_UNITS)); + for (int i = 0; i < N_PERIOD_UNITS; i++) { + SET_STRING_ELT(names, i, mkChar(PERIOD_UNITS[i])); + } + UNPROTECT(1); + return names; } SEXP C_parse_period(SEXP str) { + if (TYPEOF(str) != STRSXP) error("STR argument must be a character vector"); + int n = LENGTH(str); - // store parsed units in N_PERIOD_UNITS x n matrix - SEXP out = allocVector(REALSXP, n * N_PERIOD_UNITS); + + // store parsed units in a N_PERIOD_UNITS x n matrix + SEXP out = PROTECT(allocMatrix(REALSXP, N_PERIOD_UNITS, n)); double *data = REAL(out); for (int i = 0; i < n; i++) { const char *c = CHAR(STRING_ELT(str, i)); - double ret[N_PERIOD_UNITS] = {0}; parse_period_1(&c, ret); int j = i * N_PERIOD_UNITS; @@ -108,5 +141,16 @@ SEXP C_parse_period(SEXP str) { data[j + k] = ret[k]; } } + + // Not adding names as mat[i, ] retains names when mat is a single column, thus + // requiring additional pre-processing at R level + + /* SEXP dimnames = PROTECT(allocVector(VECSXP, 2)); */ + /* SET_VECTOR_ELT(dimnames, 0, period_names()); */ + /* SET_VECTOR_ELT(dimnames, 1, R_NilValue); */ + /* setAttrib(out, R_DimNamesSymbol, dimnames); */ + + UNPROTECT(1); + return out; } diff --git a/src/utils.c b/src/utils.c index cd5b7aad..958951aa 100644 --- a/src/utils.c +++ b/src/utils.c @@ -89,15 +89,15 @@ int parse_int (const char **c, const int N, const int strict) { } -// Find partial match in `strings`. +// Find maximal partial match in `strings`. // // Increment *c and return index in 0..(length(strings)-1) if match was found, // -1 if not. Matching starts from *c, with all non-alpha-numeric characters // pre-skipped. // // - *c: pointer to a character in a C string (incremented by side effect) -// - *stings: pointer to an array of C strings to be matched to. -// - strings_len: length of strings array. +// - *stings: pointer to an array of C strings to be matched to +// - strings_len: length of strings array int parse_alphanum(const char **c, const char **strings, const int strings_len, const char ignore_case){ // tracking array: all valid objects are marked with 1, invalid with 0 @@ -106,36 +106,33 @@ int parse_alphanum(const char **c, const char **strings, const int strings_len, track[i] = 1; } - int j = 0, go = 1, out = -1; + int j = 0, out = -1, good_tracks = strings_len; while (**c && !ALPHA(**c) && !DIGIT(**c)) (*c)++; - while (**c && go) { - // stop when all tracks where exhausted - go = 0; + while (**c && good_tracks) { + // stop when all tracks have been exhausted for (int i = 0; i < strings_len; i++){ + + // keep going while at least one valid track if (track[i]){ - // keep going while at least one valid track - if (strings[i][j]){ - if(**c == strings[i][j] || (ignore_case && (tolower(**c) == strings[i][j]))){ + + if (strings[i][j]) { + if (**c == strings[i][j] || (ignore_case && (tolower(**c) == strings[i][j]))) { out = i; - go = 1; - } else { - // invalidate track i if not matching + } else { // invalidate track i if not matching track[i] = 0; + good_tracks--; } - } else { - // reached to the end of string i; return it - go = 0; + } else { // reached to the end of string i; return it if the last track + good_tracks--; out = i; - break; } } } - if(go){ + if (good_tracks) { (*c)++; j++; } } - if (out >= 0) return out; - else return -1; + return out; }