sink("output.txt")
sessionInfo()
library(rstan)
rstan_options(auto_write = TRUE)
# Define the Stan model as a string.
# This model simulates two independent normal vectors, y1 and y2,
# then performs a Welch's two-sample t-test on these simulated data,
# outputting the t-statistic, degrees of freedom, and two-sided p-value.
stan_model_code <- "
data {
  int<lower=1> vector_length; // Length of the simulated vectors y1 and y2
  real mu1;                   // Mean for the first normal vector (y1)
  real sigma1;                // Standard deviation for y1
  real mu2;                   // Mean for the second normal vector (y2)
  real sigma2;                // Standard deviation for y2
}

generated quantities {
  // Declare Stan vector types for y1 and y2.
  // Using 'vector' is generally more efficient for vector operations in Stan.
  vector[vector_length] y1;
  vector[vector_length] y2;

  // Declare variables to store the t-test results for each simulation.
  real t_statistic;
  real df; // Degrees of freedom (Welch-Satterthwaite approximation)
  real p_value;

  // Simulate 'vector_length' observations for y1 and y2 from normal distributions.
  for (i in 1:vector_length) {
    y1[i] = normal_rng(mu1, sigma1);
    y2[i] = normal_rng(mu2, sigma2);
  }

  // Calculate the sample means for y1 and y2.
  real mean_y1 = mean(y1);
  real mean_y2 = mean(y2);

  // Calculate the unbiased sample variances for y1 and y2.
  real var_y1 = variance(y1);
  real var_y2 = variance(y2);

  // Calculate Welch's two-sample t-statistic.
  t_statistic = (mean_y1 - mean_y2) / sqrt(var_y1 / vector_length + var_y2 / vector_length);

  // Calculate Welch-Satterthwaite degrees of freedom.
  real v1_component = var_y1 / vector_length;
  real v2_component = var_y2 / vector_length;
  df = square(v1_component + v2_component) / 
       (square(v1_component) / (vector_length - 1) + square(v2_component) / (vector_length - 1));

  // Calculate the two-sided p-value using the Student's t cumulative distribution function (CDF).
  // The `student_t_cdf(y, nu, mu, sigma)` function is used.
  // For a standard t-distribution, mu=0 and sigma=1.
  // For a two-sided p-value, we calculate 2 * P(T < -|t_statistic|).
  p_value = 2 * student_t_cdf(-fabs(t_statistic)| df, 0, 1);
}
"

# Define the simulation parameters as specified in the task
vector_length <- 100       # Length of each vector (y1, y2)
mu1 <- 5                  # Mean for y1
sigma1 <- 2               # Standard deviation for y1
mu2 <- 8                  # Mean for y2
sigma2 <- 2               # Standard deviation for y2
num_simulations <- 10000  # Total number of simulations (N)

# Prepare the data list to be passed to Stan
stan_data <- list(
  vector_length = vector_length,
  mu1 = mu1,
  sigma1 = sigma1,
  mu2 = mu2,
  sigma2 = sigma2
)


message("Compiling and running the Stan model. This may take a moment...")
stan_fit <- rstan::stan(
  model_code = stan_model_code,
  data = stan_data,
  algorithm = "Fixed_param",
  chains = 1,
  iter = num_simulations,
  warmup = 0,             # No warmup needed for Fixed_param
  refresh = 0,            # Suppress intermediate output during sampling
  seed = 42               # Set a seed for reproducibility of random numbers
)
message("Stan model finished running.")

# Extract the generated variables from the Stan fit object.
# The 'pars' argument specifies which variables to extract.
extracted_samples <- rstan::extract(stan_fit, pars = c("y1", "y2", "t_statistic", "df", "p_value"))

# Combine the extracted variables into a single R array (matrix).
# Each row of the resulting matrix will correspond to one simulation.
# The columns will contain the simulated values of y1, then y2, followed by
# the t-statistic, degrees of freedom, and p-value for that simulation.
# Create descriptive column names for clarity.
y1_col_names <- paste0("y1_sim_", 1:vector_length)
y2_col_names <- paste0("y2_sim_", 1:vector_length)
result_col_names <- c(y1_col_names, y2_col_names, "t_statistic", "df", "p_value")

results_array <- cbind(
  extracted_samples$y1,
  extracted_samples$y2,
  extracted_samples$t_statistic,
  extracted_samples$df,
  extracted_samples$p_value
)

# Assign the descriptive column names to the results array.
colnames(results_array) <- result_col_names

### MANUAL PART
# now manually check t-stat and p-value with t.test
myres<-matrix(data=NA,ncol=2,nrow=10000);
for(i in 1:10000){
  res<-t.test(x=results_array[i,1:100],y=results_array[i,101:200])
  myres[i,1:2]<-c(res$statistic,res$p.value)
}
results_array<-cbind(results_array,myres);
colnames(results_array)<-c(result_col_names,c("t.test$stat","t.test$p.val"))

# this gives p_values = 0 incorrect - must come from student_t_cdf
results_array[1:10,201:205]
sink();