This commit is contained in:
2024-11-20 18:32:54 +01:00
parent 10e9191969
commit af27fbba72
4 changed files with 533 additions and 1 deletions

View File

@@ -75,7 +75,6 @@ autoplot(model_full, 1:4)
set.seed(12)
durbinWatsonTest(model_full)
```
[P3] is verified as the p-value is 0.7 > 0.05, so we do not reject H0 so the residuals are not auto-correlated
# Bonus : Type II Test

View File

@@ -0,0 +1,134 @@
```{r}
setwd('/Users/arthurdanjou/Workspace/studies/M1/General Linear Models/TP3')
```
# Question 1 : Import dataset and check variables
```{r}
library(dplyr)
ozone <- read.table("ozone.txt", header = TRUE, sep = " ", dec = ".")
ozone$vent <- as.factor(ozone$vent)
ozone$temps <- as.factor(ozone$temps)
ozone <- ozone %>% mutate(across(where(is.character), as.numeric))
ozone <- ozone %>% mutate(across(where(is.integer), as.numeric))
paged_table(ozone)
```
# Question 2 : max03 ~ T12
```{r}
model_T12 <- lm(maxO3 ~ T12, data = ozone)
summary(model_T12)
```
```{r}
library(ggplot2)
ggplot(ozone, aes(x = T12, y = maxO3)) +
geom_smooth(method = 'lm', se = T) +
geom_point(col = 'red', size = 0.5) +
labs(title = "maxO3 ~ T12") +
theme_minimal()
```
```{r}
library(ggfortify)
library(car)
autoplot(model_T12, 1:4)
durbinWatsonTest(model_T12)
```
The p-value of the Durbin-Watson test is 0, so [P3] is not valid. The residuals are correlated.
The model is not valid.
# Question 3 : max03 ~ T12 + temps
```{r}
library(ggplot2)
ggplot(ozone, aes(y = maxO3, x = temps, colour = temps, fill = temps)) +
geom_boxplot(alpha = 0.5, outlier.alpha = 0) +
geom_jitter(width = 0.25, size = 1) +
stat_summary(fun = mean, colour = "black", geom = "point", shape = 18, size = 3)
```
```{r}
model_temps <- lm(maxO3 ~ -1 + temps, data = ozone)
summary(model_temps)
```
```{r}
autoplot(model_temps, 1:4)
durbinWatsonTest(model_temps)
```
```{r}
model_vent <- lm(maxO3 ~ -1 + vent, data = ozone)
summary(model_vent)
```
```{r}
autoplot(model_vent, 1:4)
durbinWatsonTest(model_vent)
```
The p-value of the Durbin-Watson test is 0, so [P3] is not valid. The residuals are correlated.
The model is not valid.
```{r}
model_temps_vent <- lm(maxO3 ~ temps * vent, data = ozone)
summary(model_temps_vent)
```
```{r}
autoplot(model_temps_vent, 1:4)
durbinWatsonTest(model_temps_vent)
```
# Question 4 : Multiple linear regression
```{r}
model <- lm(maxO3 ~ ., data = ozone)
summary(model)
```
```{r}
autoplot(model, 1:4)
durbinWatsonTest(model)
```
[P1] is verified as the 'Residuals vs Fitted' plot shows that the points are well distributed around 0
[P2] is verified as the 'Scale-Location' plot shows that the points are well distributed around 1
[P4] is verified as the 'QQPlot' is aligned with the 'y=x' line
[P3] is verified as the p-value is 0.7 > 0.05, so we do not reject H0 so the residuals are not auto-correlated
The model is valid.
```{r}
library(GGally)
ggpairs(ozone, progress = FALSE)
```
```{r}
library(MASS)
model_backward <- stepAIC(model, ~., trace = FALSE, direction = c("backward"))
summary(model_backward)
```
```{r}
AIC(model_backward)
autoplot(model_backward, 1:4)
durbinWatsonTest(model_backward)
```
# Question 5 : Prediction
```{r}
new_obs <- list(
T12 = 18,
Ne9 = 3,
Vx9 = 0.7,
maxO3v = 85
)
predict(model_backward, new_obs, interval = "confidence")
```

View File

@@ -0,0 +1,113 @@
maxO3 T9 T12 T15 Ne9 Ne12 Ne15 Vx9 Vx12 Vx15 maxO3v vent temps
87 15.6 18.5 18.4 4 4 8 0.6946 -1.7101 -0.6946 84 Nord Sec
82 17 18.4 17.7 5 5 7 -4.3301 -4 -3 87 Nord Sec
92 15.3 17.6 19.5 2 5 4 2.9544 1.8794 0.5209 82 Est Sec
114 16.2 19.7 22.5 1 1 0 0.9848 0.3473 -0.1736 92 Nord Sec
94 17.4 20.5 20.4 8 8 7 -0.5 -2.9544 -4.3301 114 Ouest Sec
80 17.7 19.8 18.3 6 6 7 -5.6382 -5 -6 94 Ouest Pluie
79 16.8 15.6 14.9 7 8 8 -4.3301 -1.8794 -3.7588 80 Ouest Sec
79 14.9 17.5 18.9 5 5 4 0 -1.0419 -1.3892 99 Nord Sec
101 16.1 19.6 21.4 2 4 4 -0.766 -1.0261 -2.2981 79 Nord Sec
106 18.3 21.9 22.9 5 6 8 1.2856 -2.2981 -3.9392 101 Ouest Sec
101 17.3 19.3 20.2 7 7 3 -1.5 -1.5 -0.8682 106 Nord Sec
90 17.6 20.3 17.4 7 6 8 0.6946 -1.0419 -0.6946 101 Sud Sec
72 18.3 19.6 19.4 7 5 6 -0.8682 -2.7362 -6.8944 90 Sud Sec
70 17.1 18.2 18 7 7 7 -4.3301 -7.8785 -5.1962 72 Ouest Pluie
83 15.4 17.4 16.6 8 7 7 -4.3301 -2.0521 -3 70 Nord Sec
88 15.9 19.1 21.5 6 5 4 0.5209 -2.9544 -1.0261 83 Ouest Sec
145 21 24.6 26.9 0 1 1 -0.342 -1.5321 -0.684 121 Ouest Sec
81 16.2 22.4 23.4 8 3 1 0 0.3473 -2.5712 145 Nord Sec
121 19.7 24.2 26.9 2 1 0 1.5321 1.7321 2 81 Est Sec
146 23.6 28.6 28.4 1 1 2 1 -1.9284 -1.2155 121 Sud Sec
121 20.4 25.2 27.7 1 0 0 0 -0.5209 1.0261 146 Nord Sec
146 27 32.7 33.7 0 0 0 2.9544 6.5778 4.3301 121 Est Sec
108 24 23.5 25.1 4 4 0 -2.5712 -3.8567 -4.6985 146 Sud Sec
83 19.7 22.9 24.8 7 6 6 -2.5981 -3.9392 -4.924 108 Ouest Sec
57 20.1 22.4 22.8 7 6 7 -5.6382 -3.8302 -4.5963 83 Ouest Pluie
81 19.6 25.1 27.2 3 4 4 -1.9284 -2.5712 -4.3301 57 Sud Sec
67 19.5 23.4 23.7 5 5 4 -1.5321 -3.0642 -0.8682 81 Ouest Sec
70 18.8 22.7 24.9 5 2 1 0.684 0 1.3681 67 Nord Sec
106 24.1 28.4 30.1 0 0 1 2.8191 3.9392 3.4641 70 Est Sec
139 26.6 30.1 31.9 0 1 4 1.8794 2 1.3681 106 Sud Sec
79 19.5 18.8 17.8 8 8 8 0.6946 -0.866 -1.0261 139 Ouest Sec
93 16.8 18.2 22 8 8 6 0 0 1.2856 79 Sud Pluie
97 20.8 23.7 25 2 3 4 0 1.7101 -2.7362 93 Nord Sec
113 17.5 18.2 22.7 8 8 5 -3.7588 -3.9392 -4.6985 97 Ouest Pluie
72 18.1 21.2 23.9 7 6 4 -2.5981 -3.9392 -3.7588 113 Ouest Pluie
88 19.2 22 25.2 4 7 4 -1.9696 -3.0642 -4 72 Ouest Sec
77 19.4 20.7 22.5 7 8 7 -6.5778 -5.6382 -9 88 Ouest Sec
71 19.2 21 22.4 6 4 6 -7.8785 -6.8937 -6.8937 77 Ouest Sec
56 13.8 17.3 18.5 8 8 6 1.5 -3.8302 -2.0521 71 Ouest Pluie
45 14.3 14.5 15.2 8 8 8 0.684 4 2.9544 56 Est Pluie
67 15.6 18.6 20.3 5 7 5 -3.2139 -3.7588 -4 45 Ouest Pluie
67 16.9 19.1 19.5 5 5 6 -2.2981 -3.7588 0 67 Ouest Pluie
84 17.4 20.4 21.4 3 4 6 0 0.3473 -2.5981 67 Sud Sec
63 15.1 20.5 20.6 8 6 6 2 -5.3623 -6.1284 84 Ouest Pluie
69 15.1 15.6 15.9 8 8 8 -4.5963 -3.8302 -4.3301 63 Ouest Pluie
92 16.7 19.1 19.3 7 6 4 -2.0521 -4.4995 -2.7362 69 Nord Sec
88 16.9 20.3 20.7 6 6 5 -2.8191 -3.4641 -3 92 Ouest Pluie
66 18 21.6 23.3 8 6 5 -3 -3.5 -3.2139 88 Sud Sec
72 18.6 21.9 23.6 4 7 6 0.866 -1.9696 -1.0261 66 Ouest Sec
81 18.8 22.5 23.9 6 3 2 0.5209 -1 -2 72 Nord Sec
83 19 22.5 24.1 2 4 6 0 -1.0261 0.5209 81 Nord Sec
149 19.9 26.9 29 3 4 3 1 -0.9397 -0.6428 83 Ouest Sec
153 23.8 27.7 29.4 1 1 4 0.9397 1.5 0 149 Nord Sec
159 24 28.3 26.5 2 2 7 -0.342 1.2856 -2 153 Nord Sec
149 23.3 27.6 28.8 4 6 3 0.866 -1.5321 -0.1736 159 Ouest Sec
160 25 29.6 31.1 0 3 5 1.5321 -0.684 2.8191 149 Sud Sec
156 24.9 30.5 32.2 0 1 4 -0.5 -1.8794 -1.2856 160 Ouest Sec
84 20.5 26.3 27.8 1 0 2 -1.3681 -0.6946 0 156 Nord Sec
126 25.3 29.5 31.2 1 4 4 3 3.7588 5 84 Est Sec
116 21.3 23.8 22.1 7 7 8 0 -2.3941 -1.3892 126 Sud Pluie
77 20 18.2 23.6 5 7 6 -3.4641 -2.5981 -3.7588 116 Ouest Pluie
63 18.7 20.6 20.3 6 7 7 -5 -4.924 -5.6382 77 Ouest Pluie
54 18.6 18.7 17.8 8 8 8 -4.6985 -2.5 -0.8682 63 Sud Pluie
65 19.2 23 22.7 8 7 7 -3.8302 -4.924 -5.6382 54 Ouest Sec
72 19.9 21.6 20.4 7 7 8 -3 -4.5963 -5.1962 65 Ouest Pluie
60 18.7 21.4 21.7 7 7 7 -5.6382 -6.0622 -6.8937 72 Ouest Pluie
70 18.4 17.1 20.5 3 6 3 -5.9088 -3.2139 -4.4995 60 Nord Pluie
77 17.1 20 20.8 4 5 4 -1.9284 -1.0261 0.5209 70 Nord Sec
98 17.8 22.8 24.3 1 1 0 0 -1.5321 -1 77 Ouest Pluie
111 20.9 25.2 26.7 1 5 2 -1.0261 -3 -2.2981 98 Ouest Sec
75 18.8 20.5 26 8 7 1 -0.866 0 0 111 Nord Sec
116 23.5 29.8 31.7 1 3 5 1.8794 1.3681 0.6946 75 Sud Sec
109 20.8 23.7 26.6 8 5 4 -1.0261 -1.7101 -3.2139 116 Sud Sec
67 18.8 21.1 18.9 7 7 8 -5.3623 -5.3623 -2.5 86 Ouest Pluie
76 17.8 21.3 24 7 5 5 -3.0642 -2.2981 -3.9392 67 Ouest Pluie
113 20.6 24.8 27 1 1 2 1.3681 0.8682 -2.2981 76 Sud Sec
117 21.6 26.9 28.6 6 6 4 1.5321 1.9284 1.9284 113 Sud Pluie
131 22.7 28.4 30.1 5 3 3 0.1736 -1.9696 -1.9284 117 Ouest Sec
166 19.8 27.2 30.8 4 0 1 0.6428 -0.866 0.684 131 Ouest Sec
159 25 33.5 35.5 1 1 1 1 0.6946 -1.7101 166 Sud Sec
100 20.1 22.9 27.6 8 8 6 1.2856 -1.7321 -0.684 159 Ouest Sec
114 21 26.3 26.4 7 4 5 3.0642 2.8191 1.3681 100 Est Sec
112 21 24.4 26.8 1 6 3 4 4 3.7588 114 Est Sec
101 16.9 17.8 20.6 7 7 7 -2 -0.5209 1.8794 112 Nord Pluie
76 17.5 18.6 18.7 7 7 7 -3.4641 -4 -1.7321 101 Ouest Sec
59 16.5 20.3 20.3 5 7 6 -4.3301 -5.3623 -4.5 76 Ouest Pluie
78 17.7 20.2 21.5 5 5 3 0 0.5209 0 59 Nord Pluie
76 17.3 22.7 24.6 4 5 6 -2.9544 -2.9544 -2 78 Ouest Pluie
55 15.3 16.8 19.2 8 7 5 -1.8794 -1.8794 -2.3941 76 Ouest Pluie
71 15.9 19.2 19.5 7 5 3 -6.1284 0 -1.3892 55 Nord Pluie
66 16.2 18.9 19.3 2 5 6 -1.3681 -0.8682 1.7101 71 Nord Pluie
59 18.3 18.3 19 7 7 7 -3.9392 -1.9284 -1.7101 66 Nord Pluie
68 16.9 20.8 22.5 6 5 7 -1.5 -3.4641 -3.0642 59 Ouest Pluie
63 17.3 19.8 19.4 7 8 8 -4.5963 -6.0622 -4.3301 68 Ouest Sec
78 14.2 22.2 22 5 5 6 -0.866 -5 -5 62 Ouest Sec
74 15.8 18.7 19.1 8 7 7 -4.5963 -6.8937 -7.5175 78 Ouest Pluie
71 15.2 17.9 18.6 6 5 1 -1.0419 -1.3681 -1.0419 74 Nord Pluie
69 17.1 17.7 17.5 6 7 8 -5.1962 -2.7362 -1.0419 71 Nord Pluie
71 15.4 17.7 16.6 4 5 5 -3.8302 0 1.3892 69 Nord Sec
60 13.7 14 15.8 4 5 4 0 3.2139 0 71 Nord Pluie
42 12.7 14.3 14.9 8 7 7 -2.5 -3.2139 -2.5 60 Nord Pluie
65 14.8 16.3 15.9 7 7 7 -4.3301 -6.0622 -5.1962 42 Ouest Pluie
71 15.5 18 17.4 7 7 6 -3.9392 -3.0642 0 65 Ouest Sec
96 11.3 19.4 20.2 3 3 3 -0.1736 3.7588 3.8302 71 Est Pluie
98 15.2 19.7 20.3 2 2 2 4 5 4.3301 96 Est Sec
92 14.7 17.6 18.2 1 4 6 5.1962 5.1423 3.5 98 Nord Sec
76 13.3 17.7 17.7 7 7 6 -0.9397 -0.766 -0.5 92 Ouest Pluie
84 13.3 17.7 17.8 3 5 6 0 -1 -1.2856 76 Sud Sec
77 16.2 20.8 22.1 6 5 5 -0.6946 -2 -1.3681 71 Sud Pluie
99 16.9 23 22.6 6 4 7 1.5 0.8682 0.8682 77 Sud Sec
83 16.9 19.8 22.1 6 5 3 -4 -3.7588 -4 99 Ouest Pluie
70 15.7 18.6 20.7 7 7 7 0 -1.0419 -4 83 Sud Sec

View File

@@ -0,0 +1,286 @@
---
title: "R Notebook"
output: html_notebook
---
### Random Variable simulation with stratification
##### Question 9
We consider a partition $\mathcal{P}= (D_0,D_1,...,D_k)$, $k \in \mathbb{N}$ of $\mathbb{R}$ such that $D_0$ covers the tails of $f_1$ and $f_1$ is upper bounded and $f_2$ lower bounded in $D_1, \dots, D_k$.
To simulate under $f$ using the accept-reject algorithm, we need to find a density function $g$ such that $f(x) \le M g(x)$ for all $x \in \mathbb{R}$, where M is a constant.\
We generate $X \sim g$ and $U \sim \mathcal{U}([0,1])$ $\textit{(1)}$.\
We accept $Y = X$ if $U \le \frac{f(X)}{Mg(X)}$. Otherwise, we return to $\textit{(1)}$.
Here we pose $g(x) = f_1(x)$ if $x \in D_0$ and $g(x) = sup_{D_i} f_1(x)$ if $x \in D_i, i \in \{1, \dots, k\}$.
To conclude, we have $M = \frac{1}{1-a}$ and the probability of acceptance is $r = \frac{1}{M} = 1-a$
##### Question 10
Let $P_n = (D_0, \dots D_n)$ a partition of $\mathbb{R}$ for $n \in \mathbb{N}$.
We have $\forall x \in P_n$ and $\forall i \in \{0, \dots, n\}$, $\lim_{n \to\infty} sup_{D_i} f_1(x) = f_1(x)$ and $\lim_{x\to\infty} inf_{D_i} f_2(x) = f_2(x)$.
$\Rightarrow \lim_{x\to\infty} g(x) = f(x)$
$\Rightarrow \lim_{x\to\infty} \frac{g(x)}{f(x)} = 1$ as $f(x) > 0$ as $f$ is a density function.
$\Rightarrow \forall \epsilon > 0, \exists n_{\epsilon} \in \mathbb{N}$ such that $\forall n \ge n_{\epsilon}$, $M = sup_{x \in P_n} \frac{g(x)}{f(x)} < \epsilon$
$\Rightarrow r = \frac{1}{M} > \frac{1}{\epsilon} := \delta \in ]0, 1]$ where $r$ is the acceptance rate defined in the question 10.
##### Question 11
We recall the parameters and the functions of the problem.
```{r}
mu1 <- 0
mu2 <- 1
s1 <- 3
s2 <- 1
a <- 0.2
f1 <- function(x) {
dnorm(x, mu1, s1)
}
f2 <- function(x) {
dnorm(x, mu2, s2)
}
f <- function(x) {
fx <- f1(x) - a * f2(x)
fx[fx < 0] <- 0
return(fx / (1 - a))
}
f1_bounds <- c(mu1 - 3 * s1, mu1 + 3 * s1)
```
We implement the partition, the given $g$ function to understand the behavior of $g$ compared to $f$ and the computation of the supremum and infimum of $f_1$ and $f_2$ on each partition.
```{r}
create_partition <- function(k = 10) {
return(seq(f1_bounds[1], f1_bounds[2], , length.out = k))
}
sup_inf <- function(f, P, i) {
x <- seq(P[i], P[i + 1], length.out = 1000)
f_values <- sapply(x, f)
return(c(max(f_values), min(f_values)))
}
g <- function(X, P) {
values <- numeric(0)
for (x in X) {
if (x <= P[1] | x >= P[length(P)]) {
values <- c(values, 1 / (1 - a) * f1(x))
} else {
for (i in 1:(length(P) - 1)) {
if (x >= P[i] & x <= P[i + 1]) {
values <- c(values, 1 / (1 - a) * (sup_inf(f1, P, i)[1]) - a * sup_inf(f2, P, i)[2])
}
}
}
}
return(values)
}
```
We plot the function $f$ and the dominating function $g$ for different sizes of the partition.
```{r}
library(ggplot2)
X <- seq(-12, 12, length.out = 1000)
# Plot for different k with ggplot on same graph
k_values <- c(10, 20, 50, 100)
P_values <- lapply(k_values, create_partition)
g_values <- lapply(P_values, g, X)
ggplot() +
geom_line(aes(x = X, y = f(X)), col = "red") +
geom_line(aes(x = X, y = g(X, P_values[[1]])), col = "green") +
geom_line(aes(x = X, y = g(X, P_values[[2]])), col = "orange") +
geom_line(aes(x = X, y = g(X, P_values[[3]])), col = "purple") +
geom_line(aes(x = X, y = g(X, P_values[[4]])), col = "black") +
labs(title = "Dominating function g of f for different size of partition", x = "x", y = "Density") +
theme_minimal()
```
Here, we implement the algorithm of accept-reject with the given partition and an appropriate function $g$ to compute $f$.
```{r}
set.seed(123)
g_accept_reject <- function(x, P) {
if (x < P[1] | x >= P[length(P)]) {
return(f1(x))
} else {
for (i in seq_along(P)) {
if (x >= P[i] & x < P[i + 1]) {
return(sup_inf(f1, P, i)[1])
}
}
}
}
stratified <- function(n, P) {
samples <- numeric(0)
rate <- 0
while (length(samples) < n) {
x <- rnorm(1, mu1, s1)
u <- runif(1)
if (u <= f(x) * (1 - a) / g_accept_reject(x, P)) {
samples <- c(samples, x)
}
rate <- rate + 1
}
list(samples = samples, acceptance_rate = n / rate)
}
n <- 10000
k <- 100
P <- create_partition(k)
samples <- stratified(n, P)
X <- seq(-10, 10, length.out = 1000)
hist(samples$samples, breaks = 50, freq = FALSE, col = "lightblue", main = "Empirical density function f", xlab = "x")
lines(X, f(X), col = "red")
```
We also compute the acceptance rate of the algorithm.
```{r}
theorical_acceptance_rate <- 1 - a
cat(sprintf("Empirical acceptance rate: %f, Theoretical acceptance rate: %f \n", samples$acceptance_rate, theorical_acceptance_rate))
```
##### Question 12
```{r}
set.seed(123)
stratified_delta <- function(n, delta) {
samples <- numeric(0)
P <- create_partition(n * delta)
rate <- 0
while (length(samples) < n | rate < delta * n) {
x <- rnorm(1, mu1, s1)
u <- runif(1)
if (u <= f(x) * delta / g_accept_reject(x, P)) {
samples <- c(samples, x)
}
rate <- rate + 1
}
list(samples = samples, partition = P, acceptance_rate = n / rate)
}
n <- 10000
delta <- 0.8
samples_delta <- stratified_delta(n, delta)
X <- seq(-10, 10, length.out = 1000)
hist(samples_delta$samples, breaks = 50, freq = FALSE, col = "lightblue", main = "Empirical density function f", xlab = "x")
lines(X, f(X), col = "red")
```
We also compute the acceptance rate of the algorithm.
```{r}
theorical_acceptance_rate <- 1 - a
cat(sprintf("Empirical acceptance rate: %f, Theoretical acceptance rate: %f \n", samples_delta$acceptance_rate, theorical_acceptance_rate))
```
Now, we will test the stratified_delta function for different delta:
```{r}
set.seed(123)
n <- 1000
deltas <- seq(0.1, 1, by = 0.1)
for (delta in deltas) {
samples <- stratified_delta(n, delta)
cat(sprintf("Delta: %.1f, Empirical acceptance rate: %f \n", delta, samples$acceptance_rate))
}
```
### Cumulative density function.
##### Question 13
The cumulative density function $\forall x \in \mathbb{R}$ $F_X(x) = \int_{-\infty}^{x} f(t) \,dt = \int_{\mathbb{R}} f(t) h(t), dt$ where $h(x) = \mathbb{1}_{X_n \le x}$
For a given $x \in \mathbb{R}$, a Monte Carlo estimator $F_n(x) = \frac{1}{n} \sum_{i=1}^{n} h(X_i)$ where $h$ is the same function as above and $(X_i)_{i=1}^{n} \sim^{iid} X$
##### Question 14
As $X_1, \dots, X_n$ are iid and follows the law of X, and $h$ is continuous and positive, we have $h(X_1), \dots h(X_n)$ are iid and $\mathbb{E}[h(X_i)] < + \infty$. By the law of large numbers, we have $F_n(X) = \frac{1}{n} \sum_{i=1}^{n} h(X_i) \xrightarrow{a.s} \mathbb{E}[h(X_1)] = F_X(x)$.
Moreover, $\forall \epsilon > 0$, $\exists N \in \mathbb{N}$ such that $\forall n \le N$, $|F_n(x) - F_X(x)| < \epsilon$, ie, $sup_{x \in \mathbb{R}} |F_n(x) - F_X(x)| \xrightarrow{a.s} 0$, by Glivenko-Cantelli theorem.
Hence, $F_n$ is a good estimate of $F_X$ as a function of $x$.
##### Question 15
```{r}
set.seed(123)
n <- 10000
Xn <- (rnorm(n, mu1, s1) - a * rnorm(n, mu2, s2)) / (1 - a)
h <- function(x, Xn) {
return(Xn <= x)
}
# Fn empirical
empirical_cdf <- function(x, Xn) {
return(mean(h(x, Xn)))
}
# F theoretical
F <- function(x) {
Fx <- pnorm(x, mu1, s1) - a * pnorm(x, mu2, s2)
return(Fx / (1 - a))
}
X <- seq(-10, 10, length.out = n)
cat(sprintf("Empirical cdf: %f, Theoretical cdf: %f \n", empirical_cdf(X, Xn), mean(F(Xn))))
length(X)
length(empirical_cdf(X, Xn))
plot(X, empirical_cdf(X, Xn), col = "red", type = "l", xlab = "x", ylab = "F(x)")
lines(X, F(X), col = "blue")
```
Now we plot the empirical and theoretical cumulative density functions for different n.
```{r}
n_values <- seq(100, 10000, length.out = 10)
```
##### Question 16
As $X_1, \dots, X_n$ are iid and follows the law of X, and $h$ is continuous and positive, we have $h(X_1), \dots h(X_n)$ are iid and $\mathbb{E}[h(X_i)^2] < + \infty$. By the Central Limit Theorem, we have $\sqrt{n} \frac{(F_n(x) - F_X(x))}{\sigma} \xrightarrow{d} \mathcal{N}(0, 1)$ where $\sigma^2 = Var(h(X_1))$.
So we have $\lim_{x\to\infty} \mathbb{P}(\sqrt{n} \frac{(F_n(x) - F_X(x))}{\sigma} \le q_{1-\frac{\alpha}{2}}^{\mathcal{N}(0, 1)})= 1 - \alpha$
So by computing the quantile of the normal distribution, we can have a confidence interval for $F_X(x)$ :
$F_X(x) \in [F_n(x) - \frac{q_{1-\frac{\alpha}{2}}^{\mathcal{N}(0, 1)} \sigma}{\sqrt{n}} ; F_n(x) + \frac{q_{1-\frac{\alpha}{2}}^{\mathcal{N}(0, 1)} \sigma}{\sqrt{n}}]$
```{r}
Fn <- empirical_cdf(X, Xn)
sigma <- sqrt(Fn - Fn^2)
q <- qnorm(0.975)
interval <- c(Fn - q * sigma / sqrt(n), Fn + q * sigma / sqrt(n))
cat(sprintf("Confidence interval: [%f, %f] \n", interval[1], interval[2]))
```
##### Question 17
```{r}
q <- qnorm(0.975)
x <- 1
epsilon <- 0.01
n_required <- (1.96^2 * F(x) * (1 - F(x))) / epsilon^2
ceiling(n_required)
```