Package dtutils¶

A previous article (group-by processing with Rcpp and data.table) shows how to use forderv function to make group-by processing with Rcpp and data.table. When writing C or C++ extensions to R, a fastidous task is that almost the same lines must be written for each R type: logical, integer, numeric, character and complex. But C++ has templates. And Rcpp can use it. See how with the na_fill function.

Template¶

In group-by processing with Rcpp and data.table, all Cna_fill_* must be defined and are almost the same:

// [[Rcpp::export]]
StringVector Cna_fill_char(StringVector xx, unsigned int type = 1, bool inplace = false) {
  StringVector x = inplace ? xx : clone(xx);
  R_xlen_t n = x.size();
  if (type & 0b01) {
    for(R_xlen_t i = 1; i<n; i++) {
      if(StringVector::is_na(x[i]) && !StringVector::is_na(x[i-1])) {
        x[i] = x[i-1];
      }
    }
  }
  if (type & 0b10) {
    for(R_xlen_t i = n - 1; i>=0; i--) {
      if(StringVector::is_na(x[i-1]) && !StringVector::is_na(x[i])) {
        x[i-1] = x[i];
      }
    }
  }
  return x;
}

// [[Rcpp::export]]
IntegerVector Cna_fill_char(IntegerVector xx, unsigned int type = 1, bool inplace = false) {
  IntegerVector x = inplace ? xx : clone(xx);
  R_xlen_t n = x.size();
  if (type & 0b01) {
    for(R_xlen_t i = 1; i<n; i++) {
      if(IntegerVector::is_na(x[i]) && !IntegerVector::is_na(x[i-1])) {
        x[i] = x[i-1];
      }
    }
  }
  if (type & 0b10) {
    for(R_xlen_t i = n - 1; i>=0; i--) {
      if(IntegerVector::is_na(x[i-1]) && !IntegerVector::is_na(x[i])) {
        x[i-1] = x[i];
      }
    }
  }
  return x;
}

And the same for NumericVector, LogicalVector, etc…

What would be great is to use a template by remplacing R types by generic T like this:

T Cna_fill_char(T xx, unsigned int type = 1, bool inplace = false) {
  T x = inplace ? xx : clone(xx);
  R_xlen_t n = x.size();
  if (type & 0b01) {
    for(R_xlen_t i = 1; i<n; i++) {
      if(T::is_na(x[i]) && !T::is_na(x[i-1])) {
        x[i] = x[i-1];
      }
    }
  }
  if (type & 0b10) {
    for(R_xlen_t i = n - 1; i>=0; i--) {
      if(T::is_na(x[i-1]) && !T::is_na(x[i])) {
        x[i-1] = x[i];
      }
    }
  }
  return x;
}

It’s possible in Rcpp with C++ templateSee this enhanced version of na_fill with group-by processing and replacement by a constant (fill):

template<typename T>
T Cna_fill_type(T x, IntegerVector rows, T fill, unsigned int type = 1, bool inplace = false) {
  R_xlen_t n = x.size();
  R_xlen_t nrows = rows.size();

  if (nrows != 0 && n != rows.size())
    stop("x and rows must have the same length");
  if (!rows.hasAttribute("starts"))
    stop("rows must have 'starts' attribute");
  if (fill.size() != 1)
    stop("fill must be a 1-length vector");

  IntegerVector grps = rows.attr("starts");
  R_xlen_t ngrps = grps.size();

  T ret = inplace ? x : clone(x);

  if (type == 0)
  for(R_xlen_t i = 0; i<n; i++) {
    if(T::is_na(x[i])) {
      ret[i] = fill[0];
    }
  } else {
    for(int g=0; g<ngrps; g++) {
      R_xlen_t f = grps[g] - 1; // start indice of group g (C indice = R indice - 1)
      R_xlen_t l = g == (ngrps - 1) ? n : grps[g + 1] - 1; // last indice (n if last group)

      if (type & 0b01) {
        for(R_xlen_t i = f + 1; i < l; i++) {
          R_xlen_t r  = nrows == 0 ? i : rows[i] - 1;
          R_xlen_t r1 = nrows == 0 ? i - 1 : rows[i - 1] - 1;
          if(T::is_na(ret[r]) && !T::is_na(ret[r1])) {
            ret[r] = ret[r1];
          }
        }
      }

      if (type & 0b10) {
        for(R_xlen_t i = l - 1; i > f; i--) {
          R_xlen_t r  = nrows == 0 ? i : rows[i] - 1;
          R_xlen_t r1 = nrows == 0 ? i - 1 : rows[i - 1] - 1;
          if(T::is_na(ret[r1]) && !T::is_na(ret[r])) {
            ret[r1] = ret[r];
          }
        }
      }
    }
  }
  return ret;
}

Note that this template definition is not exported.

Dispatch function¶

Elementary functons are defined. Now a dispatch function, which is exported, is needed : it takes a list as main parameter and an iterator crosses each element and dispatches the function calls:

// [[Rcpp::export]]
List Cna_fill_by(List x, IntegerVector rows, unsigned int type = 1, bool inplace = false, RObject fill = R_NilValue) {
  for(List::iterator it = x.begin(); it != x.end(); ++it) {
    if(is<NumericVector>(*it)){
      *it = Cna_fill_type<NumericVector>(as<NumericVector>(*it), rows, as<NumericVector>(fill), type, inplace);
    } else if(is<IntegerVector>(*it)){
      *it = Cna_fill_type<IntegerVector>(as<IntegerVector>(*it), rows, as<IntegerVector>(fill), type, inplace);
    } else if(is<StringVector>(*it)){
      *it = Cna_fill_type<StringVector> (as<StringVector>(*it),  rows, as<StringVector>( fill), type, inplace);
    } else if(is<LogicalVector>(*it)){
      *it = Cna_fill_type<LogicalVector>(as<LogicalVector>(*it), rows, as<LogicalVector>(fill), type, inplace);
    } else if(is<ComplexVector>(*it)){
      *it = Cna_fill_type<ComplexVector>(as<ComplexVector>(*it), rows, as<ComplexVector>(fill), type, inplace);
    } else {
      stop("na_fill error: unimplemented type");
    }
  }
  return x;
}

That’s all.

R wrapper function¶

A thin wrapper function in R is written and calls the forderv data.table function and the dispatch function to test entry values and prepare C++ parameters:

#' @title Replace NA by constant, previous or next value, optionally by group
#' @export
na_fill_by <- function(dt, var = NULL, by = NULL, type = 1L, inplace = FALSE, fill = NA) {
  nm <- names(dt)
  if (!is.null(by) && !all(by %in% nm)) {
    stop("When by is not NULL, all names in 'by' must be dt colnames")
  }
  if (is.null(var)) {
    var <- setdiff(nm, by)
  }
  if (!all(var %in% nm)) {
    stop("All names in 'var' must be dt colnames")
  }
  tt1 <- intersect(by, var)
  if (length(tt1) > 0) {
    stop("Some variables are in 'by' and in 'var': ", tt1)
  }
  if (!length(type) == 1L && type < 0 && type > 3) {
    stop("type must be 0, 1, 2 or 3")
  }
  if (!length(inplace) == 1L) {
    stop("inplace must be TRUE or FALSE")
  }
  if (is.null(by)) {
    grp = numeric(0)
    attr(grp, "starts") = 1
    attr(grp, "maxgrpn") = 1
  } else {
    grp = data.table:::forderv(dt, by = by, retGrp = T)
  }
  ldt <- lapply(var, function(x) dt[[x]])
  names(ldt) <- var
  ret <- Cna_fill_by(ldt, grp, type, inplace, fill)
  if (inplace)
    invisible(ret)
  else
    ret
}

Package dtutils¶

Some functions have been written with the same model in a R package called dtutils.