Badawi Aminu Muhammed

Data Analyst • Business Intelligence Expert • Research Scientist

Building & Performing Neural Network Analysis in R with Torch

INTRODUCTION

The torch package brings deep learning to R by providing bindings to libtorch, the C++ library behind PyTorch. This project demonstrates how to build, train, and evaluate a simple neural network using torch in R.

# Installation (run once):
# install.packages("torch")
# torch::install_torch()   # downloads the libtorch backend

library(torch)
library(ggplot2)
library(magrittr)  # provides the %>% pipe used in the model definition below
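As a quick illustration of what these bindings provide (a minimal sketch, not part of the main analysis), torch tensors can be created from ordinary R objects and manipulated much like matrices:

# Illustrative only: basic tensor operations
a <- torch_tensor(matrix(c(1, 2, 3, 4), nrow = 2))  # 2 x 2 tensor from an R matrix
b <- torch_ones(2, 2)                                # 2 x 2 tensor of ones
a + b                                                # element-wise addition
torch_matmul(a, b)                                   # matrix multiplication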

# Set seeds for reproducibility. torch uses its own random number generator,
# so torch_manual_seed() is needed in addition to set.seed().

set.seed(42)
torch_manual_seed(42)

Generate training data: y = 3x + 2 + noise

x <- torch_randn(100, 1)
y <- 3 * x + 2 + torch_randn(100, 1) * 0.3

# Display the first few data points
head(data.frame(
  x = as.numeric(x$squeeze()),
  y = as.numeric(y$squeeze())
))
##            x           y
## 1  1.5389941  6.48198080
## 2  0.5420342  3.83502460
## 3 -0.2955945  0.52947181
## 4 -0.6785704  0.09556369
## 5 -0.5023355  0.29653335
## 6  2.5532901 10.54154968

NEURAL NETWORK MODULE

Now we define the neural network architecture using torch's module system:

Define a simple feedforward neural network

nnet <- nn_module(
  initialize = function() {
    # Define layers
    self$layer1 <- nn_linear(1, 8)
    self$layer2 <- nn_linear(8, 1)
  },
  forward = function(x) {
    # Define the forward pass
    x %>%
      self$layer1() %>%   # first linear transformation
      nnf_relu() %>%      # ReLU activation function
      self$layer2()       # second linear transformation
  }
)

# Instantiate the model
model <- nnet()

# Display the model structure
print(model)
## An `nn_module` containing 25 parameters.
## 
## ── Modules ─────────────────────────────────────────────────────────────────────
## • layer1: <nn_linear> #16 parameters
## • layer2: <nn_linear> #9 parameters
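The 25 parameters break down as 8 weights + 8 biases in layer1 (16) and 8 weights + 1 bias in layer2 (9). As a quick sanity check before training (illustrative only, not part of the original output), we can push a few random inputs through the untrained model to confirm the output shape:

check <- model(torch_randn(5, 1))
check$shape  # expect 5 x 1: one (untrained, essentially random) prediction per input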

SET UP OPTIMIZER AND LOSS FUNCTION

The training process requires defining how the model will learn from the data:

We set up an Adam optimizer with a learning rate of 0.02:

optimizer <- optim_adam(model$parameters, lr = 0.02)

# Define the loss function (MSE)
loss_fn <- nnf_mse_loss
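To make the loss concrete, here is a small hand-checkable sketch (values chosen purely for illustration) of what nnf_mse_loss computes:

# Illustrative only: MSE is the mean of the squared differences
pred   <- torch_tensor(c(2.5, 0.0, 2.0))
target <- torch_tensor(c(3.0, -0.5, 2.0))
nnf_mse_loss(pred, target)  # (0.25 + 0.25 + 0) / 3 ≈ 0.1667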

TRAINING LOOP

We store the loss at each epoch so we can plot the training progress:

loss_history <- numeric(300)

# Training loop
for (epoch in 1:300) {
  # Set the model to training mode
  model$train()
  
  # Reset gradients
  optimizer$zero_grad()
  
  # Forward pass
  y_pred <- model(x)
  
  # Calculate the loss
  loss <- loss_fn(y_pred, y)
  
  # Backward pass
  loss$backward()
  
  # Update parameters
  optimizer$step()
  
  # Store the loss for plotting
  loss_history[epoch] <- loss$item()
}


# Visualize the training progress
# Create a data frame for plotting
training_df <- data.frame(
  epoch = 1:300,
  loss = loss_history
)

# Plot the training loss
ggplot(training_df, aes(x = epoch, y = loss)) +
  geom_line(color = "#2c3e65", size = 1) +
  labs(
    title = "Training Loss Over Time",
    subtitle = "Neural Network Learning Progress",
    x = "Epoch",
    y = "Mean Squared Error Loss"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray60")
  )
Training loss over epochs

VISUALIZE THE RESULTS

Set the model to evaluation mode and generate predictions without tracking gradients:

model$eval()

# Generate predictions
with_no_grad({
  y_pred <- model(x)
})

# Convert to R vectors for plotting
x_np <- as.numeric(x$squeeze())
y_np <- as.numeric(y$squeeze())
y_pred_np <- as.numeric(y_pred$squeeze())

# Create a data frame for ggplot
plot_df <- data.frame(
  x = x_np,
  y_actual = y_np,
  y_predicted = y_pred_np
)

# Create the plot
ggplot(plot_df, aes(x = x)) +
  geom_point(aes(y = y_actual, color = "Actual"), alpha = 0.7, size = 2) +
  geom_point(aes(y = y_predicted, color = "Predicted"), alpha = 0.7, size = 1) +
  geom_smooth(aes(y = y_predicted), method = "loess", se = FALSE, color = "#e88c3c", linetype = "dashed") +
  labs(
    title = "Neural Network Regression Results",
    subtitle = "Comparing actual vs predicted values",
    x = "Input (x)",
    y = "Output (y)",
    color = "Data type"
  ) +
  scale_color_manual(
    values = c("Actual" = "#3658dc", "Predicted" = "#e48c3c")) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray25"),
    legend.position = "top"
  )
Neural network predictions vs actual values

MODEL PERFORMANCE ANALYSIS

mse <- mean((y_pred_np - y_np)^2)
rmse <- sqrt(mse)
mae <- mean(abs(y_pred_np - y_np))
r_squared <- cor(y_pred_np, y_np)^2

# Create a performance summary
performance_summary <- data.frame(
  metric = c("Mean Squared Error", "Root Mean Squared Error", "Mean Absolute Error", "R-Squared"),
  value = c(mse, rmse, mae, r_squared)
)

print(performance_summary)
##                    metric      value
## 1      Mean Squared Error 0.09719032
## 2 Root Mean Squared Error 0.31175362
## 3     Mean Absolute Error 0.23808641
## 4               R-Squared 0.98831562
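Note that R-squared here is computed as the squared correlation between predictions and actual values. A closely related cross-check (a sketch, not part of the original output) is the proportion of variance explained, which should be very similar for a well-fitting model:

ss_res <- sum((y_np - y_pred_np)^2)
ss_tot <- sum((y_np - mean(y_np))^2)
1 - ss_res / ss_tot  # variance explained; expected to be close to the value above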

COMPARISON WITH THE TRUE RELATIONSHIP (y = 3x + 2)

Generate predictions on a grid for comparison

x_grid <- torch_linspace(-3, 3, 100)$unsqueeze(2)
with_no_grad({
  y_grid_pred <- model(x_grid)
})

x_grid_np <- as.numeric(x_grid$squeeze())
y_grid_pred_np <- as.numeric(y_grid_pred$squeeze())
y_grid_true <- 3 * x_grid_np + 2
# Plot comparison
comparison_df <- data.frame(
  x = x_grid_np,
  y_true = y_grid_true,
  y_predicted = y_grid_pred_np
)

ggplot(comparison_df, aes(x = x)) +
  geom_line(aes(y = y_true, color = "True Function"), size = 2) +
  geom_line(aes(y = y_predicted, color = "Neural Network"), size = 2, linetype = "dashed") +
  geom_point(data = plot_df, aes(y = y_actual), alpha = 0.3, color = "gray50") +
  labs(
    title = "Neural Network vs True Function",
    subtitle = "Model learning assessment against the underlying pattern",
    x = "Input (x)",
    y = "Output (y)",
    color = "Function Type"
  ) +
  scale_color_manual(values = c("True Function" = "#2c3e50", "Neural Network" = "#e74c3c")) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray60"),
    legend.position = "top"
  )
Neural network fit vs the true function y = 3x + 2

UNDERSTANDING THE NEURAL NETWORK

This part reveals what the network learned by inspecting its parameters:

# Extract learned parameters
layer1_weight <- as.matrix(model$layer1$weight$detach())
layer1_bias <- as.numeric(model$layer1$bias$detach())
layer2_weight <- as.matrix(model$layer2$weight$detach())
layer2_bias <- as.numeric(model$layer2$bias$detach())

cat("First layer (layer1) parameters:\n")
## First layer (layer1) parameters:
cat("Weight matrix shape:", dim(layer1_weight), "\n")
## Weight matrix shape: 8 1
cat("Bias vector length:", length(layer1_bias), "\n\n")
## Bias vector length: 8

cat("Second layer (layer2) parameters:\n")
## Second layer (layer2) parameters:
cat("Weight matrix shape:", dim(layer2_weight), "\n")
## Weight matrix shape: 1 8
cat("Bias value:", layer2_bias, "\n\n")
## Bias value: 0.6431097

Display the first layer weights and the second (output) layer bias:

cat("First layer weights:\n")
## First layer weights:
print(round(layer1_weight, 4))
##         [,1]
## [1,]  1.6058
## [2,]  0.4265
## [3,]  1.2185
## [4,] -0.8082
## [5,] -1.6228
## [6,]  0.1594
## [7,] -0.0204
## [8,]  0.3282
cat("\nFirst layer biases:\n")
## 
## First layer biases:
print(round(layer2_bias, 4))
## [1] 0.6431

EXPERIMENTING WITH DIFFERENT ARCHITECTURES

Defining different network architectures

create_network <- function(hidden_sizes) {
  nn_module(
    initialize = function() {
      self$layers <- nn_module_list()
      
      # Input layer
      prev_size <- 1
      
      for(i in seq_along(hidden_sizes)) {
        self$layers$append(nn_linear(prev_size, hidden_sizes[i]))
        prev_size <- hidden_sizes[i]
      }
      # Output layer
      self$layers$append(nn_linear(prev_size, 1))
    },
    forward = function(x) {
      for(i in 1:(length(self$layers) - 1)) {
        x <- nnf_relu(self$layers[[i]](x))
      }
      # No activation on output layer
      self$layers[[length(self$layers)]](x)
    }
  )
}

# Train different architectures
architectures <- list(
  "Simple (8)" = c(8),
  "Deep (16-8)" = c(16, 8),
  "Wide (32)" = c(32),
  "Very Deep (16-16-8)" = c(16, 16, 8)
)

results <- list()

for(arch_name in names(architectures)) {
  
  # Create and train model
  net_class <- create_network(architectures[[arch_name]])
  model_temp <- net_class()
  optimizer_temp <- optim_adam(model_temp$parameters, lr = 0.01)
  
  # Quick training (fewer epochs for comparison)
  for(epoch in 1:200) {
    model_temp$train()
    optimizer_temp$zero_grad()
    y_pred_temp <- model_temp(x)
    loss_temp <- loss_fn(y_pred_temp, y)
    loss_temp$backward()
    optimizer_temp$step()
  }
  
  # Generate predictions
  model_temp$eval()
  with_no_grad({
    y_pred_arch <- model_temp(x_grid)
  })
  
  results[[arch_name]] <- data.frame(
    x = x_grid_np,
    y_pred = as.numeric(y_pred_arch$squeeze()),
    architecture = arch_name
  )
}

# Combine results
all_results <- do.call(rbind, results)
# Plot comparison
ggplot(all_results, aes(x = x, y = y_pred, color = architecture)) +
  geom_line(size = 1.2) +
  geom_line(data = comparison_df, aes(y = y_true, color = "True Function"), 
            size = 2, linetype = "solid") +
  geom_point(data = plot_df, aes(x = x, y = y_actual),
             color = "gray50", alpha = 0.3, inherit.aes = FALSE) +
  labs(
    title = "Comparison of Different Neural Network Architectures",
    subtitle = "Effects of network depth and width on learning performance",
    x = "Input (x)",
    y = "Output (y)",
    color = "Architecture"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray60"),
    legend.position = "top"
  )
Comparison of predictions from different architectures

KEY TAKEAWAYS

- Simple Architecture: even a simple 2-layer network can learn complex patterns effectively.
- Training Process: a proper training loop with gradient computation is essential.
- Visualization: effective methods for visualizing both training progress and results.
- Model Evaluation: understanding model performance through multiple metrics.
- Architecture Comparison: how different network structures affect learning capabilities.

The torch package provides a straightforward approach to building and experimenting with neural networks in R, bringing the power of deep learning to the R ecosystem. This approach can be extended to more complex datasets and deeper architectures as needed.
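As one possible extension (a hypothetical sketch; the layer sizes and learning rate are illustrative, not tuned for any particular dataset), a deeper model for multi-feature input could be assembled with nn_sequential and trained with the same loop pattern shown above:

# Hypothetical deeper network for, say, 10 input features
deep_model <- nn_sequential(
  nn_linear(10, 64),
  nn_relu(),
  nn_linear(64, 32),
  nn_relu(),
  nn_linear(32, 1)
)
optimizer_deep <- optim_adam(deep_model$parameters, lr = 0.001)
# Training reuses the same pattern: zero_grad(), forward pass, loss_fn(),
# backward(), step().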