dataset • torch

library(torch)

# In deep learning models you don't usually have all your data in RAM
# because you are usually training using mini-batch gradient descent
# thus only needing a mini-batch on RAM each time.

# In torch we use the `datasets` abstraction to define the process of
# loading data. Once you have defined your dataset you can use torch
# dataloaders that allows you to iterate over this dataset in batches.

# Note that datasets are optional in torch. They are jut there as a
# recommended way to load data.

# Below you will see an example of how to create a simple torch dataset
# that pre-process a data.frame into tensors so you can feed them to
# a model.

df_dataset <- dataset(
  "mydataset",

  # the input data to your dataset goes in the initialize function.
  # our dataset will take a dataframe and the name of the response
  # variable.
  initialize = function(df, response_variable) {
    self$df <- df[,-which(names(df) == response_variable)]
    self$response_variable <- df[[response_variable]]
  },

  # the .getitem method takes an index as input and returns the
  # corresponding item from the dataset.
  # the index could be anything. the dataframe could have many
  # rows for each index and the .getitem method would do some
  # kind of aggregation before returning the element.
  # in our case the index will be a row of the data.frame,
  .getitem = function(index) {
    response <- torch_tensor(self$response_variable[index])
    x <- torch_tensor(as.numeric(self$df[index,]))

    # note that the dataloaders will automatically stack tensors
    # creating a new dimension
    list(x = x, y = response)
  },

  # It's optional, but helpful to define the .length method returning
  # the number of elements in the dataset. This is needed if you want
  # to shuffle your dataset.
  .length = function() {
    length(self$response_variable)
  }

)


# we can now initialize an instance of our dataset.
# for example
mtcars_dataset <- df_dataset(mtcars, "mpg")

# now we can get an item with
mtcars_dataset$.getitem(1)

## $x
## torch_tensor
##    6.0000
##  160.0000
##  110.0000
##    3.9000
##    2.6200
##   16.4600
##    0.0000
##    1.0000
##    4.0000
##    4.0000
## [ CPUFloatType{10} ]
## 
## $y
## torch_tensor
##  21
## [ CPUFloatType{1} ]

# Given a dataset you can create a dataloader with
dl <- dataloader(mtcars_dataset, batch_size = 15, shuffle = TRUE)

# we can then loop trough the elements of the dataloader with
coro::loop(for(batch in dl) {
  cat("X size:  ")
  print(batch[[1]]$size())
  cat("Y size:  ")
  print(batch[[2]]$size())
})

## X size:  [1] 15 10
## Y size:  [1] 15  1
## X size:  [1] 15 10
## Y size:  [1] 15  1
## X size:  [1]  2 10
## Y size:  [1] 2 1