Package dtutils¶
A previous article (group-by processing with Rcpp and data.table) shows how to use forderv
function
to make group-by processing with Rcpp and data.table.
When writing C or C++ extensions to R, a fastidous task is that almost the same
lines must be written for each R type: logical, integer, numeric, character and
complex.
But C++ has templates. And Rcpp can use it. See how with the na_fill
function.
Template¶
In group-by processing with Rcpp and data.table,
all Cna_fill_*
must be defined and are almost the same:
// [[Rcpp::export]]
StringVector Cna_fill_char(StringVector xx, unsigned int type = 1, bool inplace = false) {
StringVector x = inplace ? xx : clone(xx);
R_xlen_t n = x.size();
if (type & 0b01) {
for(R_xlen_t i = 1; i<n; i++) {
if(StringVector::is_na(x[i]) && !StringVector::is_na(x[i-1])) {
x[i] = x[i-1];
}
}
}
if (type & 0b10) {
for(R_xlen_t i = n - 1; i>=0; i--) {
if(StringVector::is_na(x[i-1]) && !StringVector::is_na(x[i])) {
x[i-1] = x[i];
}
}
}
return x;
}
// [[Rcpp::export]]
IntegerVector Cna_fill_char(IntegerVector xx, unsigned int type = 1, bool inplace = false) {
IntegerVector x = inplace ? xx : clone(xx);
R_xlen_t n = x.size();
if (type & 0b01) {
for(R_xlen_t i = 1; i<n; i++) {
if(IntegerVector::is_na(x[i]) && !IntegerVector::is_na(x[i-1])) {
x[i] = x[i-1];
}
}
}
if (type & 0b10) {
for(R_xlen_t i = n - 1; i>=0; i--) {
if(IntegerVector::is_na(x[i-1]) && !IntegerVector::is_na(x[i])) {
x[i-1] = x[i];
}
}
}
return x;
}
And the same for NumericVector, LogicalVector, etc…
What would be great is to use a template by remplacing R types by generic T like this:
T Cna_fill_char(T xx, unsigned int type = 1, bool inplace = false) {
T x = inplace ? xx : clone(xx);
R_xlen_t n = x.size();
if (type & 0b01) {
for(R_xlen_t i = 1; i<n; i++) {
if(T::is_na(x[i]) && !T::is_na(x[i-1])) {
x[i] = x[i-1];
}
}
}
if (type & 0b10) {
for(R_xlen_t i = n - 1; i>=0; i--) {
if(T::is_na(x[i-1]) && !T::is_na(x[i])) {
x[i-1] = x[i];
}
}
}
return x;
}
It’s possible in Rcpp with C++ templateSee this enhanced version of na_fill
with group-by processing and replacement by a constant (fill):
template<typename T>
T Cna_fill_type(T x, IntegerVector rows, T fill, unsigned int type = 1, bool inplace = false) {
R_xlen_t n = x.size();
R_xlen_t nrows = rows.size();
if (nrows != 0 && n != rows.size())
stop("x and rows must have the same length");
if (!rows.hasAttribute("starts"))
stop("rows must have 'starts' attribute");
if (fill.size() != 1)
stop("fill must be a 1-length vector");
IntegerVector grps = rows.attr("starts");
R_xlen_t ngrps = grps.size();
T ret = inplace ? x : clone(x);
if (type == 0)
for(R_xlen_t i = 0; i<n; i++) {
if(T::is_na(x[i])) {
ret[i] = fill[0];
}
} else {
for(int g=0; g<ngrps; g++) {
R_xlen_t f = grps[g] - 1; // start indice of group g (C indice = R indice - 1)
R_xlen_t l = g == (ngrps - 1) ? n : grps[g + 1] - 1; // last indice (n if last group)
if (type & 0b01) {
for(R_xlen_t i = f + 1; i < l; i++) {
R_xlen_t r = nrows == 0 ? i : rows[i] - 1;
R_xlen_t r1 = nrows == 0 ? i - 1 : rows[i - 1] - 1;
if(T::is_na(ret[r]) && !T::is_na(ret[r1])) {
ret[r] = ret[r1];
}
}
}
if (type & 0b10) {
for(R_xlen_t i = l - 1; i > f; i--) {
R_xlen_t r = nrows == 0 ? i : rows[i] - 1;
R_xlen_t r1 = nrows == 0 ? i - 1 : rows[i - 1] - 1;
if(T::is_na(ret[r1]) && !T::is_na(ret[r])) {
ret[r1] = ret[r];
}
}
}
}
}
return ret;
}
Note that this template definition is not exported.
Dispatch function¶
Elementary functons are defined. Now a dispatch function, which is exported, is needed : it takes a list as main parameter and an iterator crosses each element and dispatches the function calls:
// [[Rcpp::export]]
List Cna_fill_by(List x, IntegerVector rows, unsigned int type = 1, bool inplace = false, RObject fill = R_NilValue) {
for(List::iterator it = x.begin(); it != x.end(); ++it) {
if(is<NumericVector>(*it)){
*it = Cna_fill_type<NumericVector>(as<NumericVector>(*it), rows, as<NumericVector>(fill), type, inplace);
} else if(is<IntegerVector>(*it)){
*it = Cna_fill_type<IntegerVector>(as<IntegerVector>(*it), rows, as<IntegerVector>(fill), type, inplace);
} else if(is<StringVector>(*it)){
*it = Cna_fill_type<StringVector> (as<StringVector>(*it), rows, as<StringVector>( fill), type, inplace);
} else if(is<LogicalVector>(*it)){
*it = Cna_fill_type<LogicalVector>(as<LogicalVector>(*it), rows, as<LogicalVector>(fill), type, inplace);
} else if(is<ComplexVector>(*it)){
*it = Cna_fill_type<ComplexVector>(as<ComplexVector>(*it), rows, as<ComplexVector>(fill), type, inplace);
} else {
stop("na_fill error: unimplemented type");
}
}
return x;
}
That’s all.
R wrapper function¶
A thin wrapper function in R is written and calls the forderv
data.table
function and the dispatch function to test entry values and prepare C++
parameters:
#' @title Replace NA by constant, previous or next value, optionally by group
#' @export
na_fill_by <- function(dt, var = NULL, by = NULL, type = 1L, inplace = FALSE, fill = NA) {
nm <- names(dt)
if (!is.null(by) && !all(by %in% nm)) {
stop("When by is not NULL, all names in 'by' must be dt colnames")
}
if (is.null(var)) {
var <- setdiff(nm, by)
}
if (!all(var %in% nm)) {
stop("All names in 'var' must be dt colnames")
}
tt1 <- intersect(by, var)
if (length(tt1) > 0) {
stop("Some variables are in 'by' and in 'var': ", tt1)
}
if (!length(type) == 1L && type < 0 && type > 3) {
stop("type must be 0, 1, 2 or 3")
}
if (!length(inplace) == 1L) {
stop("inplace must be TRUE or FALSE")
}
if (is.null(by)) {
grp = numeric(0)
attr(grp, "starts") = 1
attr(grp, "maxgrpn") = 1
} else {
grp = data.table:::forderv(dt, by = by, retGrp = T)
}
ldt <- lapply(var, function(x) dt[[x]])
names(ldt) <- var
ret <- Cna_fill_by(ldt, grp, type, inplace, fill)
if (inplace)
invisible(ret)
else
ret
}