ArtStudies/M1/Monte Carlo Methods/Project 2/Projet2.rmd

---
title: "R Notebook"
output: html_notebook
---

### Random Variable simulation with stratification

##### Question 9

We consider a partition $\mathcal{P}= (D_0,D_1,...,D_k)$, $k \in \mathbb{N}$ of $\mathbb{R}$ such that $D_0$ covers the tails of $f_1$ and $f_1$ is upper bounded and $f_2$ lower bounded in $D_1, \dots, D_k$.

To simulate under $f$ using the accept-reject algorithm, we need to find a density function $g$ such that $f(x) \le M g(x)$ for all $x \in \mathbb{R}$, where M is a constant.\

We generate $X \sim g$ and $U \sim \mathcal{U}([0,1])$ $\textit{(1)}$.\
We accept $Y = X$ if $U \le \frac{f(X)}{Mg(X)}$. Otherwise, we return to $\textit{(1)}$.

Here we pose $g(x) = f_1(x)$ if $x \in D_0$ and $g(x) = sup_{D_i} f_1(x)$ if $x \in D_i, i \in \{1, \dots, k\}$.

To conclude, we have $M = \frac{1}{1-a}$ and the probability of acceptance is $r = \frac{1}{M} = 1-a$


##### Question 10

Let $P_n = (D_0, \dots D_n)$ a partition of $\mathbb{R}$ for $n \in \mathbb{N}$.
We have $\forall x \in P_n$ and $\forall i \in \{0, \dots, n\}$, $\lim_{n \to\infty} sup_{D_i} f_1(x) = f_1(x)$ and $\lim_{x\to\infty} inf_{D_i} f_2(x) = f_2(x)$.

$\Rightarrow \lim_{x\to\infty} g(x) = f(x)$

$\Rightarrow \lim_{x\to\infty} \frac{g(x)}{f(x)} = 1$ as $f(x) > 0$ as $f$ is a density function.

$\Rightarrow \forall \epsilon > 0, \exists n_{\epsilon} \in \mathbb{N}$ such that $\forall n \ge n_{\epsilon}$, $M = sup_{x \in P_n} \frac{g(x)}{f(x)} < \epsilon$

$\Rightarrow r = \frac{1}{M} > \frac{1}{\epsilon} := \delta \in ]0, 1]$ where $r$ is the acceptance rate defined in the question 10.

##### Question 11

We recall the parameters and the functions of the problem.
```{r}
mu1 <- 0
mu2 <- 1
s1 <- 3
s2 <- 1
a <- 0.2

f1 <- function(x) {
  dnorm(x, mu1, s1)
}

f2 <- function(x) {
  dnorm(x, mu2, s2)
}

f <- function(x) {
  fx <- f1(x) - a * f2(x)
  fx[fx < 0] <- 0
  return(fx / (1 - a))
}

f1_bounds <- c(mu1 - 3 * s1, mu1 + 3 * s1)
```

We implement the partition, the given $g$ function to understand the behavior of $g$ compared to $f$ and the computation of the supremum and infimum of $f_1$ and $f_2$ on each partition.
```{r}
create_partition <- function(k = 10) {
  return(seq(f1_bounds[1], f1_bounds[2], , length.out = k))
}

sup_inf <- function(f, P, i) {
  x <- seq(P[i], P[i + 1], length.out = 1000)
  f_values <- sapply(x, f)
  return(c(max(f_values), min(f_values)))
}

g <- function(X, P) {
  values <- numeric(0)
  for (x in X) {
    if (x <= P[1] | x >= P[length(P)]) {
      values <- c(values, 1 / (1 - a) * f1(x))
    } else {
      for (i in 1:(length(P) - 1)) {
        if (x >= P[i] & x <= P[i + 1]) {
          values <- c(values, 1 / (1 - a) * (sup_inf(f1, P, i)[1]) - a * sup_inf(f2, P, i)[2])
        }
      }
    }
  }
  return(values)
}
```

We plot the function $f$ and the dominating function $g$ for different sizes of the partition.
```{r}
library(ggplot2)

X <- seq(-12, 12, length.out = 1000)

# Plot for different k with ggplot on same graph
k_values <- c(10, 20, 50, 100)
P_values <- lapply(k_values, create_partition)
g_values <- lapply(P_values, g, X)

ggplot() +
  geom_line(aes(x = X, y = f(X)), col = "red") +
  geom_line(aes(x = X, y = g(X, P_values[[1]])), col = "green") +
  geom_line(aes(x = X, y = g(X, P_values[[2]])), col = "orange") +
  geom_line(aes(x = X, y = g(X, P_values[[3]])), col = "purple") +
  geom_line(aes(x = X, y = g(X, P_values[[4]])), col = "black") +
  labs(title = "Dominating function g of f for different size of partition", x = "x", y = "Density") +
  theme_minimal()
```

Here, we implement the algorithm of accept-reject with the given partition and an appropriate function $g$ to compute $f$.
```{r}
set.seed(123)

g_accept_reject <- function(x, P) {
  if (x < P[1] | x >= P[length(P)]) {
    return(f1(x))
  } else {
    for (i in seq_along(P)) {
      if (x >= P[i] & x < P[i + 1]) {
        return(sup_inf(f1, P, i)[1])
      }
    }
  }
}

stratified <- function(n, P) {
  samples <- numeric(0)
  rate <- 0
  while (length(samples) < n) {
    x <- rnorm(1, mu1, s1)
    u <- runif(1)
    if (u <= f(x) * (1 - a) / g_accept_reject(x, P)) {
      samples <- c(samples, x)
    }
    rate <- rate + 1
  }
  list(samples = samples, acceptance_rate = n / rate)
}

n <- 10000
k <- 100

P <- create_partition(k)
samples <- stratified(n, P)
X <- seq(-10, 10, length.out = 1000)

hist(samples$samples, breaks = 50, freq = FALSE, col = "lightblue", main = "Empirical density function f", xlab = "x")
lines(X, f(X), col = "red")
```

We also compute the acceptance rate of the algorithm.
```{r}
theorical_acceptance_rate <- 1 - a
cat(sprintf("Empirical acceptance rate: %f, Theoretical acceptance rate: %f \n", samples$acceptance_rate, theorical_acceptance_rate))
```

##### Question 12

```{r}
set.seed(123)

stratified_delta <- function(n, delta) {
  samples <- numeric(0)
  P <- create_partition(n * delta)
  rate <- 0
  while (length(samples) < n | rate < delta * n) {
    x <- rnorm(1, mu1, s1)
    u <- runif(1)
    if (u <= f(x) * delta / g_accept_reject(x, P)) {
      samples <- c(samples, x)
    }
    rate <- rate + 1
  }
  list(samples = samples, partition = P, acceptance_rate = n / rate)
}

n <- 10000
delta <- 0.8

samples_delta <- stratified_delta(n, delta)
X <- seq(-10, 10, length.out = 1000)

hist(samples_delta$samples, breaks = 50, freq = FALSE, col = "lightblue", main = "Empirical density function f", xlab = "x")
lines(X, f(X), col = "red")
```

We also compute the acceptance rate of the algorithm.
```{r}
theorical_acceptance_rate <- 1 - a
cat(sprintf("Empirical acceptance rate: %f, Theoretical acceptance rate: %f \n", samples_delta$acceptance_rate, theorical_acceptance_rate))
```

Now, we will test the stratified_delta function for different delta:

```{r}
set.seed(123)
n <- 1000
deltas <- seq(0.1, 1, by = 0.1)

for (delta in deltas) {
  samples <- stratified_delta(n, delta)
  cat(sprintf("Delta: %.1f, Empirical acceptance rate: %f \n", delta, samples$acceptance_rate))
}
```

### Cumulative density function.

##### Question 13

The cumulative density function $\forall x \in \mathbb{R}$ $F_X(x) = \int_{-\infty}^{x} f(t) \,dt = \int_{\mathbb{R}} f(t) h(t), dt$ where $h(x) = \mathbb{1}_{X_n \le x}$

For a given $x \in \mathbb{R}$, a Monte Carlo estimator $F_n(x) = \frac{1}{n} \sum_{i=1}^{n} h(X_i)$ where $h$ is the same function as above and $(X_i)_{i=1}^{n} \sim^{iid} X$

##### Question 14

As $X_1, \dots, X_n$ are iid and follows the law of X, and $h$ is continuous and positive, we have $h(X_1), \dots h(X_n)$ are iid and $\mathbb{E}[h(X_i)] < + \infty$. By the law of large numbers, we have $F_n(X) = \frac{1}{n} \sum_{i=1}^{n} h(X_i) \xrightarrow{a.s} \mathbb{E}[h(X_1)] = F_X(x)$.

Moreover, $\forall \epsilon > 0$, $\exists N \in \mathbb{N}$ such that $\forall n \le N$, $|F_n(x) - F_X(x)| < \epsilon$, ie, $sup_{x \in \mathbb{R}} |F_n(x) - F_X(x)| \xrightarrow{a.s} 0$, by Glivenko-Cantelli theorem.

Hence, $F_n$ is a good estimate of $F_X$ as a function of $x$.

##### Question 15
```{r}
set.seed(123)
n <- 10000
Xn <- (rnorm(n, mu1, s1) - a * rnorm(n, mu2, s2)) / (1 - a)

h <- function(x, Xn) {
  return(Xn <= x)
}

# Fn empirical
empirical_cdf <- function(x, Xn) {
  return(mean(h(x, Xn)))
}

# F theoretical
F <- function(x) {
  Fx <- pnorm(x, mu1, s1) - a * pnorm(x, mu2, s2)
  return(Fx / (1 - a))
}

X <- seq(-10, 10, length.out = n)
cat(sprintf("Empirical cdf: %f, Theoretical cdf: %f \n", empirical_cdf(X, Xn), mean(F(Xn))))

length(X)
length(empirical_cdf(X, Xn))
plot(X, empirical_cdf(X, Xn), col = "red", type = "l", xlab = "x", ylab = "F(x)")
lines(X, F(X), col = "blue")
```

Now we plot the empirical and theoretical cumulative density functions for different n.
```{r}
n_values <- seq(100, 10000, length.out = 10)

```

##### Question 16

As $X_1, \dots, X_n$ are iid and follows the law of X, and $h$ is continuous and positive, we have $h(X_1), \dots h(X_n)$ are iid and $\mathbb{E}[h(X_i)^2] < + \infty$. By the Central Limit Theorem, we have $\sqrt{n} \frac{(F_n(x) - F_X(x))}{\sigma} \xrightarrow{d} \mathcal{N}(0, 1)$ where $\sigma^2 = Var(h(X_1))$.

So we have $\lim_{x\to\infty} \mathbb{P}(\sqrt{n} \frac{(F_n(x) - F_X(x))}{\sigma} \le q_{1-\frac{\alpha}{2}}^{\mathcal{N}(0, 1)})= 1 - \alpha$

So by computing the quantile of the normal distribution, we can have a confidence interval for $F_X(x)$ :
$F_X(x) \in [F_n(x) - \frac{q_{1-\frac{\alpha}{2}}^{\mathcal{N}(0, 1)} \sigma}{\sqrt{n}} ; F_n(x) + \frac{q_{1-\frac{\alpha}{2}}^{\mathcal{N}(0, 1)} \sigma}{\sqrt{n}}]$

```{r}
Fn <- empirical_cdf(X, Xn)
sigma <- sqrt(Fn - Fn^2)
q <- qnorm(0.975)
interval <- c(Fn - q * sigma / sqrt(n), Fn + q * sigma / sqrt(n))
cat(sprintf("Confidence interval: [%f, %f] \n", interval[1], interval[2]))
```

##### Question 17

```{r}
q <- qnorm(0.975)
x <- 1
epsilon <- 0.01
n_required <- (1.96^2 * F(x) * (1 - F(x))) / epsilon^2
ceiling(n_required)
```