Skip to content

Commit

Permalink
[Fix #622] Implement ISO 8601 parsing of periods and durations
Browse files Browse the repository at this point in the history
  • Loading branch information
vspinu committed Jan 28, 2018
1 parent a63c88a commit 67d9004
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 59 deletions.
55 changes: 37 additions & 18 deletions R/periods.r
Original file line number Diff line number Diff line change
Expand Up @@ -272,9 +272,9 @@ setMethod("$<-", signature(x = "Period"), function(x, name, value) {
x
})

#' Create a period object
#' Create or parse period objects
#'
#' `period()` creates a period object with the specified values.
#' `period()` creates or parses a period object with the specified values.
#'
#' Within a Period object, time units do not have a fixed length (except for
#' seconds) until they are added to a date-time. The length of each time unit
Expand All @@ -290,11 +290,11 @@ setMethod("$<-", signature(x = "Period"), function(x, name, value) {
#' minutes, and seconds. Each unit except for seconds must be expressed in
#' integer values.
#'
#' Period objects can be easily created with the helper functions
#' [years()], [months()], [weeks()],
#' [days()], [hours()], [minutes()],
#' and [seconds()]. These objects can be added to and subtracted
#' to date-times to create a user interface similar to object oriented programming.
#' Besides the main constructor and parser [period()], period objects can also
#' be created with the specialized functions [years()], [months()], [weeks()],
#' [days()], [hours()], [minutes()], and [seconds()]. These objects can be added
#' to and subtracted to date-times to create a user interface similar to object
#' oriented programming.
#'
#' Note: Arithmetic with periods can results in undefined behavior when
#' non-existent dates are involved (such as February 29th). Please see
Expand All @@ -307,12 +307,12 @@ setMethod("$<-", signature(x = "Period"), function(x, name, value) {
#'
#' @name period
#' @aliases periods
#' @param num a numeric vector that lists the number of time units to be
#' included in the period. From v1.6.0 `num` can also be a character vector
#' that specifies durations in a convenient shorthand format. All unambiguous
#' name units and abbreviations are supported, "m" stands for months, "M" for
#' minutes; see examples. Fractional units are supported but the fractional
#' part is always converted to seconds.
#' @param num a numeric or character vector. A character vector can specify
#' periods in a convenient shorthand format or ISO 8601 specification. All
#' unambiguous name units and abbreviations are supported, "m" stands for
#' months, "M" for minutes unless ISO 8601 "P" modifier is present (see
#' examples). Fractional units are supported but the fractional part is always
#' converted to seconds.
#' @param units a character vector that lists the type of units to be used. The
#' units in units are matched to the values in num according to their
#' order. When `num` is character, this argument is ignored.
Expand All @@ -330,28 +330,46 @@ setMethod("$<-", signature(x = "Period"), function(x, name, value) {
#' @keywords chron classes
#' @examples
#'
#' ### Separate period and units vectors
#'
#' period(c(90, 5), c("second", "minute"))
#' # "5M 90S"
#' period(-1, "days")
#' period(c(3, 1, 2, 13, 1), c("second", "minute", "hour", "day", "week"))
#' period(c(1, -60), c("hour", "minute"))
#' period(0, "second")
#'
#' ### Units as arguments
#'
#' period (second = 90, minute = 5)
#' period(day = -1)
#' period(second = 3, minute = 1, hour = 2, day = 13, week = 1)
#' period(hour = 1, minute = -60)
#' period(second = 0)
#' period(c(1, -60), c("hour", "minute"), hour = c(1, 2), minute = c(3, 4))
#'
#' ### Lubridate style parsing
#'
#' period("2M 1sec")
#' period("2hours 2minutes 1second")
#' period("2d 2H 2M 2S")
#' period("2days 2hours 2mins 2secs")
#' period("2 days, 2 hours, 2 mins, 2 secs")
#' # Missing numerals default to 1. Repeated units are added up.
#' duration("day day")
#' # Comparison with characters is supported from v1.6.0.
#'
#' ### ISO 8601 parsing
#'
#' period("P3Y6M4DT12H30M5S")
#' period("P23DT23H") # M stands for months
#' period("10DT10M") # M stands for minutes
#' period("P23DT60H 20min 100 sec") # mixing ISO and lubridate style parsing
#'
#' ### Comparison with characters (from v1.6.0)
#'
#' duration("day 2 sec") > "day 1sec"
#'
#' ### ELEMENTARY CONSTRUCTORS
#' ### Elementary Constructors
#'
#' x <- ymd("2009-08-03")
#' x + days(1) + hours(6) + minutes(30)
Expand All @@ -366,7 +384,7 @@ setMethod("$<-", signature(x = "Period"), function(x, name, value) {
#' c(1:3) * hours(1)
#' hours(1:3)
#'
#' #sequencing
#' # sequencing
#' y <- ymd(090101) # "2009-01-01 CST"
#' y + months(0:11)
#'
Expand All @@ -391,8 +409,9 @@ period <- function(num = NULL, units = "second", ...) {
}

parse_period <- function(x) {
out <- matrix(.Call(C_parse_period, as.character(x)), nrow = 7L)
new("Period", out[1, ],
out <- .Call(C_parse_period, as.character(x))
new("Period",
out[1, ],
minute = out[2, ],
hour = out[3, ],
day = out[4, ] + 7L*out[5, ],
Expand Down
86 changes: 65 additions & 21 deletions src/period.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,24 @@
#include "constants.h"
#include "utils.h"

static const char *en_units[] = {"SECS", "secs", "seconds",
"MINS", "mins", "minutes",
"HOURS", "hours",
"days",
"weeks",
"months",
"years"};
static const char *EN_UNITS[] = {"S", "secs", "seconds",
"M", "mins", "minutes",
"H", "hours", // 6
"D", "days", // 8
"W", "weeks", // 10
"M", "months", // 12
"Y", "years", // 14
// ISO period delimiters
"M", // 16
"P", // 17
"T" // 18
};
#define N_EN_UNITS 19

// S=0, M=1, H=2, d=3, w=4, m=5, y=6
#define N_EN_UNITS 12

static const char *PERIOD_UNITS[] = {"seconds", "minutes", "hours",
"days", "weeks", "months", "years"};
#define N_PERIOD_UNITS 7

fractionUnit parse_period_unit (const char **c) {
Expand All @@ -57,17 +64,16 @@ fractionUnit parse_period_unit (const char **c) {
}

if(**c){
out.unit = parse_alphanum(c, en_units, N_EN_UNITS, 0);
if (out.unit < 0){
out.unit = parse_alphanum(c, EN_UNITS, N_EN_UNITS, 0);
if (out.unit < 0 || out.unit > 16) {
return out;
} else {
// if only unit name supplied, default to 1 units
if(out.val == -1)
out.val = 1;
if (out.unit < 3) out.unit = 0;
else if (out.unit < 6) out.unit = 1;
else if (out.unit < 8) out.unit = 2;
else out.unit = out.unit - 5;
if (out.unit < 3) out.unit = 0; // seconds
else if (out.unit < 6) out.unit = 1; // minutes
else if (out.unit < 16) out.unit = (out.unit - 6)/2 + 2;
return out;
}
} else {
Expand All @@ -76,37 +82,75 @@ fractionUnit parse_period_unit (const char **c) {
}

void parse_period_1 (const char **c, double ret[N_PERIOD_UNITS]){
int P = 0; // ISO period flag
int parsed1 = 0;
while (**c) {
fractionUnit fu = parse_period_unit(c);
/* Rprintf("P:%d UNIT:%d\n", P, fu.unit); */
if (fu.unit >= 0) {
ret[fu.unit] += fu.val;
if (fu.fraction > 0) {
if (fu.unit == 0) ret[fu.unit] += fu.fraction;
else ret[0] += fu.fraction * SECONDS_IN_ONE[fu.unit];
if (fu.unit == 17) { // ISO P
P = 1;
} else if (fu.unit == 18) { // ISO T
P = 0;
} else {
if (fu.unit == 16) { // month or minute
fu.unit = P ? 5 : 1;
}
parsed1 = 1;
ret[fu.unit] += fu.val;
if (fu.fraction > 0) {
if (fu.unit == 0) ret[fu.unit] += fu.fraction;
else ret[0] += fu.fraction * SECONDS_IN_ONE[fu.unit];
}
}
} else {
ret[0] = NA_REAL;
break;
}
}
if (!parsed1) {
ret[0] = NA_REAL;
}
}

SEXP period_names() {
SEXP names = PROTECT(allocVector(STRSXP, N_PERIOD_UNITS));
for (int i = 0; i < N_PERIOD_UNITS; i++) {
SET_STRING_ELT(names, i, mkChar(PERIOD_UNITS[i]));
}
UNPROTECT(1);
return names;
}

SEXP C_parse_period(SEXP str) {

if (TYPEOF(str) != STRSXP) error("STR argument must be a character vector");

int n = LENGTH(str);
// store parsed units in N_PERIOD_UNITS x n matrix
SEXP out = allocVector(REALSXP, n * N_PERIOD_UNITS);

// store parsed units in a N_PERIOD_UNITS x n matrix
SEXP out = PROTECT(allocMatrix(REALSXP, N_PERIOD_UNITS, n));
double *data = REAL(out);

for (int i = 0; i < n; i++) {
const char *c = CHAR(STRING_ELT(str, i));

double ret[N_PERIOD_UNITS] = {0};
parse_period_1(&c, ret);
int j = i * N_PERIOD_UNITS;
for(int k = 0; k < N_PERIOD_UNITS; k++) {
data[j + k] = ret[k];
}
}

// Not adding names as mat[i, ] retains names when mat is a single column, thus
// requiring additional pre-processing at R level

/* SEXP dimnames = PROTECT(allocVector(VECSXP, 2)); */
/* SET_VECTOR_ELT(dimnames, 0, period_names()); */
/* SET_VECTOR_ELT(dimnames, 1, R_NilValue); */
/* setAttrib(out, R_DimNamesSymbol, dimnames); */

UNPROTECT(1);

return out;
}
37 changes: 17 additions & 20 deletions src/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,15 @@ int parse_int (const char **c, const int N, const int strict) {
}


// Find partial match in `strings`.
// Find maximal partial match in `strings`.
//
// Increment *c and return index in 0..(length(strings)-1) if match was found,
// -1 if not. Matching starts from *c, with all non-alpha-numeric characters
// pre-skipped.
//
// - *c: pointer to a character in a C string (incremented by side effect)
// - *stings: pointer to an array of C strings to be matched to.
// - strings_len: length of strings array.
// - *stings: pointer to an array of C strings to be matched to
// - strings_len: length of strings array
int parse_alphanum(const char **c, const char **strings, const int strings_len, const char ignore_case){

// tracking array: all valid objects are marked with 1, invalid with 0
Expand All @@ -106,36 +106,33 @@ int parse_alphanum(const char **c, const char **strings, const int strings_len,
track[i] = 1;
}

int j = 0, go = 1, out = -1;
int j = 0, out = -1, good_tracks = strings_len;
while (**c && !ALPHA(**c) && !DIGIT(**c)) (*c)++;

while (**c && go) {
// stop when all tracks where exhausted
go = 0;
while (**c && good_tracks) {
// stop when all tracks have been exhausted
for (int i = 0; i < strings_len; i++){

// keep going while at least one valid track
if (track[i]){
// keep going while at least one valid track
if (strings[i][j]){
if(**c == strings[i][j] || (ignore_case && (tolower(**c) == strings[i][j]))){

if (strings[i][j]) {
if (**c == strings[i][j] || (ignore_case && (tolower(**c) == strings[i][j]))) {
out = i;
go = 1;
} else {
// invalidate track i if not matching
} else { // invalidate track i if not matching
track[i] = 0;
good_tracks--;
}
} else {
// reached to the end of string i; return it
go = 0;
} else { // reached to the end of string i; return it if the last track
good_tracks--;
out = i;
break;
}
}
}
if(go){
if (good_tracks) {
(*c)++;
j++;
}
}
if (out >= 0) return out;
else return -1;
return out;
}

0 comments on commit 67d9004

Please # to comment.