sink("output.txt") sessionInfo() library(rstan) rstan_options(auto_write = TRUE) # Define the Stan model as a string. # This model simulates two independent normal vectors, y1 and y2, # then performs a Welch's two-sample t-test on these simulated data, # outputting the t-statistic, degrees of freedom, and two-sided p-value. stan_model_code <- " data { int vector_length; // Length of the simulated vectors y1 and y2 real mu1; // Mean for the first normal vector (y1) real sigma1; // Standard deviation for y1 real mu2; // Mean for the second normal vector (y2) real sigma2; // Standard deviation for y2 } generated quantities { // Declare Stan vector types for y1 and y2. // Using 'vector' is generally more efficient for vector operations in Stan. vector[vector_length] y1; vector[vector_length] y2; // Declare variables to store the t-test results for each simulation. real t_statistic; real df; // Degrees of freedom (Welch-Satterthwaite approximation) real p_value; // Simulate 'vector_length' observations for y1 and y2 from normal distributions. for (i in 1:vector_length) { y1[i] = normal_rng(mu1, sigma1); y2[i] = normal_rng(mu2, sigma2); } // Calculate the sample means for y1 and y2. real mean_y1 = mean(y1); real mean_y2 = mean(y2); // Calculate the unbiased sample variances for y1 and y2. real var_y1 = variance(y1); real var_y2 = variance(y2); // Calculate Welch's two-sample t-statistic. t_statistic = (mean_y1 - mean_y2) / sqrt(var_y1 / vector_length + var_y2 / vector_length); // Calculate Welch-Satterthwaite degrees of freedom. real v1_component = var_y1 / vector_length; real v2_component = var_y2 / vector_length; df = square(v1_component + v2_component) / (square(v1_component) / (vector_length - 1) + square(v2_component) / (vector_length - 1)); // Calculate the two-sided p-value using the Student's t cumulative distribution function (CDF). // The `student_t_cdf(y, nu, mu, sigma)` function is used. // For a standard t-distribution, mu=0 and sigma=1. // For a two-sided p-value, we calculate 2 * P(T < -|t_statistic|). p_value = 2 * student_t_cdf(-fabs(t_statistic)| df, 0, 1); } " # Define the simulation parameters as specified in the task vector_length <- 100 # Length of each vector (y1, y2) mu1 <- 5 # Mean for y1 sigma1 <- 2 # Standard deviation for y1 mu2 <- 8 # Mean for y2 sigma2 <- 2 # Standard deviation for y2 num_simulations <- 10000 # Total number of simulations (N) # Prepare the data list to be passed to Stan stan_data <- list( vector_length = vector_length, mu1 = mu1, sigma1 = sigma1, mu2 = mu2, sigma2 = sigma2 ) message("Compiling and running the Stan model. This may take a moment...") stan_fit <- rstan::stan( model_code = stan_model_code, data = stan_data, algorithm = "Fixed_param", chains = 1, iter = num_simulations, warmup = 0, # No warmup needed for Fixed_param refresh = 0, # Suppress intermediate output during sampling seed = 42 # Set a seed for reproducibility of random numbers ) message("Stan model finished running.") # Extract the generated variables from the Stan fit object. # The 'pars' argument specifies which variables to extract. extracted_samples <- rstan::extract(stan_fit, pars = c("y1", "y2", "t_statistic", "df", "p_value")) # Combine the extracted variables into a single R array (matrix). # Each row of the resulting matrix will correspond to one simulation. # The columns will contain the simulated values of y1, then y2, followed by # the t-statistic, degrees of freedom, and p-value for that simulation. # Create descriptive column names for clarity. y1_col_names <- paste0("y1_sim_", 1:vector_length) y2_col_names <- paste0("y2_sim_", 1:vector_length) result_col_names <- c(y1_col_names, y2_col_names, "t_statistic", "df", "p_value") results_array <- cbind( extracted_samples$y1, extracted_samples$y2, extracted_samples$t_statistic, extracted_samples$df, extracted_samples$p_value ) # Assign the descriptive column names to the results array. colnames(results_array) <- result_col_names ### MANUAL PART # now manually check t-stat and p-value with t.test myres<-matrix(data=NA,ncol=2,nrow=10000); for(i in 1:10000){ res<-t.test(x=results_array[i,1:100],y=results_array[i,101:200]) myres[i,1:2]<-c(res$statistic,res$p.value) } results_array<-cbind(results_array,myres); colnames(results_array)<-c(result_col_names,c("t.test$stat","t.test$p.val")) # this gives p_values = 0 incorrect - must come from student_t_cdf results_array[1:10,201:205] sink();