Overview

dsTidyVerseClient is the DataSHIELD implementation of selected functions from the Tidyverse (https://www.tidyverse.org/). Most of these functions are from dplyr, with some planned implementations for functions from purrr and tidyr.

As long as not potentially disclosing, all functionality from the original Tidyverse functions has been preserved. The main argument which is normally passed to the ... parameter in Tidyverse functions is passed as a list to the parameter tidy_select. Other arguments are passed to the relevant parameters which retain the same names as the original function.

To illustrate the usage of these functions, we use DSLite which creates a virtual DataSHIELD session.

Install and load packages

## install.packages("dplyr")
## install.packages("DSLite")
## install.packages(c("dsBase", "dsBaseClient"), repos = "https://cran.obiba.org/")
## devtools::install_github("molgenis/dsTidyverse")
## devtools::install_github("molgenis/dsTidyverseClient")
require(DSLite)
require(dplyr)
require(dsBase)
require(dsBaseClient)
require(dsTidyverse)
require(dsTidyverseClient)
require(DSI)

Set up DSLite environment and log in.

<<<<<<< HEAD
=======

>>>>>>> 2999dd7 (docs: added group examples to vignette)
data("mtcars")
mtcars_group <- mtcars %>%
  group_by(cyl) %>%
  mutate(drop_test = factor("a", levels = c("a", "b")))

dslite.server <- newDSLiteServer(
  tables = list(
    mtcars = mtcars,
    mtcars_group = mtcars_group
  )
)

dslite.server$config(defaultDSConfiguration(include=c("dsBase", "dsTidyverse")))
dslite.server$assignMethod("selectDS", "selectDS")
dslite.server$assignMethod("renameDS", "renameDS")
dslite.server$assignMethod("mutateDS", "mutateDS")
dslite.server$assignMethod("ifElseDS", "ifElseDS")
dslite.server$assignMethod("caseWhenDS", "caseWhenDS")
dslite.server$assignMethod("bindRowsDS", "bindRowsDS")
dslite.server$assignMethod("bindColsDS", "bindColsDS")
<<<<<<< HEAD
<<<<<<< HEAD
dslite.server$assignMethod("filterDS", "filterDS")
dslite.server$assignMethod("sliceDS", "sliceDS")
=======
dslite.server$assignMethod("arrangeDS", "arrangeDS")
>>>>>>> 5ccd846 (docs: added vignette)
conns <- datashield.login(logindata.dslite.cnsim[1, ], assign=TRUE)
=======
dslite.server$assignMethod("groupByDS", "groupByDS")
dslite.server$assignMethod("ungroupDS", "ungroupDS")
dslite.server$aggregateMethod("groupKeysDS", "groupKeysDS")

builder <- DSI::newDSLoginBuilder()

builder$append(
  server="server_1",
  url="dslite.server",
  table = "mtcars",
  driver = "DSLiteDriver")

logindata <- builder$build()
conns <- DSI::datashield.login(logins = logindata, assign = TRUE)

datashield.assign.table(
  conns = conns,
  table = "mtcars",
  symbol = "mtcars")

datashield.assign.table(
  conns = conns,
  table = "mtcars_group",
  symbol = "mtcars_group")
>>>>>>> 2999dd7 (docs: added group examples to vignette)

Functions

Select

ds.select, implements all the flexibility of dplyr::select in sub-setting columns. This includes: (i) the ability to pass unquoted column names which are evaluated as columns of df.name, and (ii) the ability to use tidyselect helper functions, such as contains. Below are some examples, for full details see the documentation for dplyr::select.

Subset columns by name

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)
ds.select(df.name = "mtcars", tidy_select = list(mpg, disp, wt), newobj = "new_df")
ds.colnames("new_df")
#> $server_1
#> [1] "mpg"  "disp" "wt"

Specify columns to exclude

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)
ds.select(df.name = "mtcars", tidy_select = list(!mpg), newobj = "new_df")
ds.colnames("new_df")
#> $server_1
#>  [1] "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
#> [10] "carb"

Specify a range of columns to keep

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)
ds.select(df.name = "mtcars", tidy_select = list(mpg:drat), newobj = "new_df")
ds.colnames("new_df")
#> $server_1
#> [1] "mpg"  "cyl"  "disp" "hp"   "drat"

Specify column names using a regular expression

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)
ds.select(df.name = "mtcars", tidy_select = list(matches('[aeiou]')), newobj = "new_df")
ds.colnames("new_df")
#> $server_1
#> [1] "disp" "drat" "qsec" "am"   "gear" "carb"

Combine multiple conditions with ‘&’

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)
ds.select(df.name = "mtcars", tidy_select = list(starts_with('c') & ends_with('b')), newobj = "new_df")
ds.colnames("new_df")
#> $server_1
#> [1] "carb"

Combine multiple conditions with ‘|’

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)
ds.select(df.name = "mtcars", tidy_select = list(starts_with('c') | ends_with('b')), newobj = "new_df")
ds.colnames("new_df")
#> $server_1
#> [1] "cyl"  "carb"

Rename

ds.rename renames columns within a server-side dataframe. Column names are passed unquoted and are evaluated as column names within df.name. Below are some examples, see dplyr::rename for full details.

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
ds.rename(df.name = "mtcars", tidy_select = list(effiency = mpg, power = hp), newobj = "new_df")
ds.colnames("new_df")
#> $sim1
#>  [1] "effiency" "cyl"      "disp"     "power"    "drat"     "wt"       "qsec"    
#>  [8] "vs"       "am"       "gear"     "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
ds.rename(df.name = "mtcars", tidy_select = list(effiency = mpg, power = hp), newobj = "new_df")
ds.colnames("new_df")
#> $server_1
#>  [1] "effiency" "cyl"      "disp"     "power"    "drat"    
#>  [6] "wt"       "qsec"     "vs"       "am"       "gear"    
#> [11] "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)

Mutate

ds.mutate creates new columns in a server-side dataframe. These new columns are normally transformations of existing columns. This reduces the number of steps currently required in DataSHIELD, i.e. creating a new vector and joining it back to an existing data frame. Again, column names are passed unquoted. Below are some examples, see dplyr::mutate for full details.

Create columns which are transformations of existing variables:

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)
ds.mutate(
  df.name = "mtcars",
  tidy_select = list(mpg_trans = cyl*1000, new_var = (hp-drat)/qsec),
  newobj = "new_df")
ds.colnames("new_df")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"       "cyl"       "disp"      "hp"        "drat"      "wt"       
#>  [7] "qsec"      "vs"        "am"        "gear"      "carb"      "mpg_trans"
#> [13] "new_var"
ds.mean("mtcars$cyl")$Mean.by.Study
#>      EstimatedMean Nmissing Nvalid Ntotal
#> sim1        6.1875        0     32     32
ds.mean("new_df$mpg_trans")$Mean.by.Study
#>      EstimatedMean Nmissing Nvalid Ntotal
#> sim1        6187.5        0     32     32
ds.mean("mtcars$hp")$Mean.by.Study
#>      EstimatedMean Nmissing Nvalid Ntotal
#> sim1      146.6875        0     32     32
ds.mean("mtcars$drat")$Mean.by.Study
#>      EstimatedMean Nmissing Nvalid Ntotal
#> sim1      3.596563        0     32     32
ds.mean("mtcars$qsec")$Mean.by.Study
#>      EstimatedMean Nmissing Nvalid Ntotal
#> sim1      17.84875        0     32     32
=======
#> $server_1
#>  [1] "mpg"       "cyl"       "disp"      "hp"        "drat"     
#>  [6] "wt"        "qsec"      "vs"        "am"        "gear"     
#> [11] "carb"      "mpg_trans" "new_var"
ds.mean("mtcars$cyl")$Mean.by.Study
#>          EstimatedMean Nmissing Nvalid Ntotal
#> server_1        6.1875        0     32     32
ds.mean("new_df$mpg_trans")$Mean.by.Study
#>          EstimatedMean Nmissing Nvalid Ntotal
#> server_1        6187.5        0     32     32
ds.mean("mtcars$hp")$Mean.by.Study
#>          EstimatedMean Nmissing Nvalid Ntotal
#> server_1      146.6875        0     32     32
ds.mean("mtcars$drat")$Mean.by.Study
#>          EstimatedMean Nmissing Nvalid Ntotal
#> server_1      3.596563        0     32     32
ds.mean("mtcars$qsec")$Mean.by.Study
#>          EstimatedMean Nmissing Nvalid Ntotal
#> server_1      17.84875        0     32     32
>>>>>>> 2999dd7 (docs: added group examples to vignette)
ds.mean("new_df$new_var")$Mean.by.Study
#>          EstimatedMean Nmissing Nvalid Ntotal
#> server_1      8.372669        0     32     32

Choose where the new columns are positioned in the data frame with .before and .after:

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)
ds.mutate(
  df.name = "mtcars",
  tidy_select = list(mpg_trans = cyl*1000, new_var = (hp-drat)/qsec),
  newobj = "new_df",
  .before = "disp")
ds.colnames("new_df")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"       "cyl"       "mpg_trans" "new_var"   "disp"      "hp"       
#>  [7] "drat"      "wt"        "qsec"      "vs"        "am"        "gear"     
#> [13] "carb"
=======
#> $server_1
#>  [1] "mpg"       "cyl"       "mpg_trans" "new_var"   "disp"     
#>  [6] "hp"        "drat"      "wt"        "qsec"      "vs"       
#> [11] "am"        "gear"      "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)

Only keep the newly created variables with .keep:

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)
ds.mutate(
  df.name = "mtcars",
  tidy_select = list(mpg_trans = cyl*1000, new_var = (hp-drat)/qsec),
  newobj = "new_df",
  .keep = "none")
ds.colnames("new_df")
#> $server_1
#> [1] "mpg_trans" "new_var"

if_else

A shinier version of base::ifelse. As with the other implementations of tidyverse, variable/object names can be passed unquoted, this time in the condition argument.

ds.if_else(
  condition = list(mtcars$mpg > 20),
  "high",
  "low",
  newobj = "mpg_cat")

ds.table("mpg_cat")$output.list$TABLE_rvar.by.study_counts
#> 
#>  Data in all studies were valid 
#> 
#> Study 1 :  No errors reported from this study
#>        study
#> mpg_cat server_1
#>    high       14
#>    low        18
#>    NA          0

bind_rows

Bind any number of data frames by row, making a longer result. This is similar to do.call(rbind, dfs), but the output will contain all columns that appear in any of the inputs.

ds.dim("mtcars")[[1]]
#> [1] 32 11

ds.bind_rows(
  to_combine = list(mtcars, mtcars),
  newobj = "df_bound",
  datasources = conns
)

ds.dim("df_bound")[[1]]
#> [1] 64 11

The argument .id can be used to create an additional column which records which dataframe each row came from:

ds.bind_rows(
  to_combine = list(mtcars, mtcars),
  newobj = "df_bound",
  datasources = conns,
  .id = "where_it_came_from"
)

ds.colnames("df_bound")
<<<<<<< HEAD
#> $sim1
=======
#> $server_1
>>>>>>> 2999dd7 (docs: added group examples to vignette)
#>  [1] "where_it_came_from" "mpg"                "cyl"               
#>  [4] "disp"               "hp"                 "drat"              
#>  [7] "wt"                 "qsec"               "vs"                
#> [10] "am"                 "gear"               "carb"

bind_cols

Bind any number of data frames by column, making a wider result. This is similar to do.call(cbind, dfs).

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)

ds.bind_cols(
  to_combine = list(mtcars, mtcars),
  newobj = "df_bound",
  datasources = conns
)

ds.colnames("df_bound")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg...1"   "cyl...2"   "disp...3"  "hp...4"    "drat...5"  "wt...6"   
#>  [7] "qsec...7"  "vs...8"    "am...9"    "gear...10" "carb...11" "mpg...12" 
#> [13] "cyl...13"  "disp...14" "hp...15"   "drat...16" "wt...17"   "qsec...18"
#> [19] "vs...19"   "am...20"   "gear...21" "carb...22"
=======
#> $server_1
#>  [1] "mpg...1"   "cyl...2"   "disp...3"  "hp...4"    "drat...5" 
#>  [6] "wt...6"    "qsec...7"  "vs...8"    "am...9"    "gear...10"
#> [11] "carb...11" "mpg...12"  "cyl...13"  "disp...14" "hp...15"  
#> [16] "drat...16" "wt...17"   "qsec...18" "vs...19"   "am...20"  
#> [21] "gear...21" "carb...22"
>>>>>>> 2999dd7 (docs: added group examples to vignette)

The argument .name_repair handles duplicate or broken names, e.g.

ds.colnames("mtcars")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)

ds.bind_cols(
  to_combine = list(mtcars, mtcars),
  newobj = "df_bound",
  .name_repair = "minimal",
  datasources = conns
)

ds.colnames("df_bound")
<<<<<<< HEAD
#> $sim1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
#> [12] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
=======
#> $server_1
#>  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"  
#> [10] "gear" "carb" "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec"
#> [19] "vs"   "am"   "gear" "carb"
>>>>>>> 2999dd7 (docs: added group examples to vignette)

case_when

An extension of dplyr::if_else which allows the specification of multiple conditions. Extremely useful for recoding variables.

  ds.case_when(
    dynamic_dots = list(
      mtcars$mpg < 20 ~ "low",
      mtcars$mpg >= 20 & mtcars$mpg < 30 ~ "medium",
      mtcars$mpg >= 30 ~ "high"),
    newobj = "recoded",
    datasources = conns)

ds.table("recoded")$output.list$TABLE_rvar.by.study_counts
#> 
#>  Data in all studies were valid 
#> 
#> Study 1 :  No errors reported from this study
#>         study
#> recoded  server_1
#>   high          4
#>   low          18
#>   medium       10
#>   NA            0

You can also use the .default argument to control what happens if the condition is not met:

  ds.case_when(
    dynamic_dots = list(
      mtcars$mpg < 20 ~ "low",
      mtcars$mpg >= 30 ~ "high"),
    newobj = "recoded_missing",
    .default = "something_missing",
    datasources = conns)

ds.table("recoded_missing")$output.list$TABLE_rvar.by.study_counts
#> 
#>  Data in all studies were valid 
#> 
#> Study 1 :  No errors reported from this study
#>                    study
#> recoded_missing     server_1
#>   high                     4
#>   low                     18
#>   something_missing       10
#>   NA                       0

See the help file for dplyr::case_when for all available arguments.

filter

Subset set a data frame flexibly, retaining all rows that satisfy the condition supplied to expression

ds.dim("mtcars")[[1]]
#> [1] 32 11
  ds.filter(
    df.name = "mtcars",
    expr = list(cyl == 4 & mpg > 20),
    newobj = "filtered",
    datasources = conns)
ds.dim("filtered")[[1]]
#> [1] 11 11

Use the .by argument to perform the filtering by a specified group.

ds.dim("mtcars")[[1]]
#> [1] 32 11
  ds.filter(
    df.name = "mtcars",
    expr = list(mpg > median(mpg)),
    .by = "cyl",
    newobj = "filtered_by",
    datasources = conns)
ds.dim("filtered_by")[[1]]
#> [1] 14 11

If filtering on an already grouped tibble, use the preserve argument to specify whether to retain the original groups or recalculate groups based on the resulting data. See the help file for dplyr::filter for more information.

ds.dim("mtcars")[[1]]
#> [1] 32 11
    ds.filter(
      df.name = "mtcars",
      expr = list(mpg > median(mpg)),
      .preserve = T,
      newobj = "preserved_t",
      datasources = conns
      )
ds.dim("preserved_t")[[1]]
#> [1] 15 11

Slice

Subset rows using their positions. This is particularly useful if you want to take one observation within a group (for example if you have repeated measures data, and want to take one measurements per individual within an age group). Currently this can be done using dh.createSubset, however this should be replaced by group_by and slice as it will be vastly quicker.

ds.dim("mtcars")[1]
#> $`dimensions of mtcars in sim1`
#> [1] 32 11
ds.slice(
  df.name = "mtcars",
  expr = list(1:10),
  newobj = "sliced_df"
)
ds.dim("sliced_df")[1]
#> $`dimensions of sliced_df in sim1`
#> [1] 10 11
ds.dim("mtcars")[1]
#> $`dimensions of mtcars in sim1`
#> [1] 32 11
ds.slice(
  df.name = "mtcars",
  expr = list(1),
  .by = "cyl",
  newobj = "sliced_df_group"
)
ds.dim("sliced_df_group")[1]
#> $`dimensions of sliced_df_group in sim1`
#> [1]  3 11

Arrange

arrange is a more flexible version of base::sort, which orders the rows of a data frame by the values of selected columns.

    ds.arrange(
      df.name = "mtcars",
      expr = list(mpg, cyl),
      newobj = "arranged_df",
      datasources = conns
      )

arrange can be used on a sorted data frame or tibble. The arrange argument .by_group (TRUE or FALSE) determines whether or not data is sorted by group.

Grouping

Many data operations are performed on groups defined by variables. group_by takes an existing tibble or data frame and converts it into a grouped tibble. Subsequent operations can then be performed ‘by group’ rather than to the whole dataframe.

To group a data frame:

ds.group_by(
  df.name = "mtcars",
  expr = list(mpg, cyl),
  newobj = "grouped"
)
ds.class("grouped")[[1]]
#> [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

To ungroup a data frame:

ds.ungroup("grouped", "no_longer_grouped")
ds.class("no_longer_grouped")[[1]]
#> [1] "tbl_df"     "tbl"        "data.frame"

To return to clientside details of the groups, use ds.group_keys. Note that this is subject to disclosure controls and will return an error if the number of groups is too high.

my_groups <- ds.group_keys("mtcars_group")
my_groups
#> $server_1
#> # A tibble: 3 × 1
#>     cyl
#>   <dbl>
#> 1     4
#> 2     6
#> 3     8