diff --git a/M1/General Linear Models/TP1/.RData b/M1/General Linear Models/TP1/.RData new file mode 100644 index 0000000..c51e7bb Binary files /dev/null and b/M1/General Linear Models/TP1/.RData differ diff --git a/M1/General Linear Models/TP1/.Rhistory b/M1/General Linear Models/TP1/.Rhistory new file mode 100644 index 0000000..bd91032 --- /dev/null +++ b/M1/General Linear Models/TP1/.Rhistory @@ -0,0 +1,345 @@ +library(rmarkdown) +health <- read.table("health.txt", header = TRUE, sep = " ", dec = ".") +paged_table(health) +library(dplyr) +names(health) +library(dplyr) +names(health) +Health<-health[,2:4] +names(Ozone) +library(dplyr) +names(health) +Health<-health[,2:4] +names(Health) +library(dplyr) +names(health) +Health<-health[,2:5] +names(Health) +library(ggplot2) +library(plotly) +p1<-ggplot(health) + aes(x = T12, y = maxO3) + geom_point(col = 'red', size = 0.5) + geom_smooth(method = "lm", se = FALSE) +ggplotly(p1) +library(ggplot2) +library(plotly) +p1<-ggplot(health) + aes(x = age, y = y) + geom_point(col = 'red', size = 0.5) + geom_smooth(method = "lm", se = FALSE) +ggplotly(p1) +library(rmarkdown) +health <- read.table("health.txt", header = TRUE, sep = " ", dec = ".") +paged_table(health) +library(rmarkdown) +health <- read.table("health.txt", header = TRUE, sep = " ", dec = ".") +paged_table(health) +library(dplyr) +library(corrplot) +install.packages("corrplot") +library(dplyr) +library(corrplot) +correlation_matrix<-cor(Health) +library(dplyr) +library(corrplot) +correlation_matrix<-cor(health) +corrplot(correlation_matrix, order = 'hclust',addrect = 3) +GGally +install.packages("GGally") +library(dplyr) +library(corrplot) +correlation_matrix<-cor(health) +corrplot(correlation_matrix, order = 'hclust',addrect = 3) +library(GGally) +ggpairs(Ozone) +library(dplyr) +library(corrplot) +correlation_matrix<-cor(health) +corrplot(correlation_matrix, order = 'hclust',addrect = 3) +library(GGally) +ggpairs(health) +library(dplyr) +library(corrplot) +correlation_matrix<-cor(health) +corrplot(correlation_matrix, order = 'hclust',addrect = 3) +library(GGally) +ggpairs(health) +library(dplyr) +library(corrplot) +correlation_matrix<-cor(health) +corrplot(correlation_matrix, order = 'hclust',addrect = 3) +Health <- health[2:5] +library(dplyr) +library(corrplot) +correlation_matrix<-cor(Health) +corrplot(correlation_matrix, order = 'hclust',addrect = 3) +model <- lm(y ~ ., data = Health) +coefficients(model) +summary(model) +library(ggfortify) +install.packages("ggfortify") +library(ggfortify) +autoplot(mod,1) +library(ggfortify) +autoplot(model,1) +library(ggfortify) +autoplot(model,1) +autoplot(model,3) +library(ggfortify) +autoplot(model,1) +autoplot(model,3) +set.seed(0) +durbinWatsonTest(model) +install.packages("car") +library(ggfortify) +autoplot(model,1) +autoplot(model,3) +set.seed(0) +durbinWatsonTest(model) +library(ggfortify) +autoplot(model,1) +autoplot(model,3) +set.seed(0) +durbinWatsonTest(model) +library(ggfortify) +library(car) +autoplot(model,1) +autoplot(model,3) +set.seed(0) +durbinWatsonTest(model) +library(ggfortify) +library(car) +autoplot(model,1) +autoplot(model,3) +set.seed(0) +durbinWatsonTest(model) +library(ggfortify) +library(car) +autoplot(model,1) +autoplot(model,3) +set.seed(0) +durbinWatsonTest(model) +autoplot(model,2) +model <- lm(y ~ age, data = Health) +coefficients(model) +library(ggfortify) +library(car) +autoplot(model,1) +autoplot(model,3) +set.seed(0) +durbinWatsonTest(model) +autoplot(model,2) +model <- lm(y ~ ., data = Health) +coefficients(model) +library(ggfortify) +library(car) +autoplot(model,1) +autoplot(model,3) +set.seed(0) +durbinWatsonTest(model) +autoplot(model,2) +library(ggfortify) +library(car) +autoplot(model,1) +autoplot(model,3) +set.seed(0) +durbinWatsonTest(model) +autoplot(model,2) +model <- lm(y ~ ., data = Health) +coefficients(model) +summary(model) +model <- lm(y ~ age, data = Health) +coefficients(model) +summary(model) +library(ggfortify) +library(car) +autoplot(model,1) +model <- lm(y ~ ., data = Health) +coefficients(model) +summary(model) +library(ggfortify) +library(car) +autoplot(model,1) +library(GGally) +ggpairs(model) +library(GGally) +ggpairs(health) +library(GGally) +ggpairs(Health) +library(GGally) +ggpairs(Health, mapping = F) +library(GGally) +ggpairs(Health, progress = F) +model2 <- lm(y ~ age**2 + tri + chol, data = Health) +model2 <- lm(y ~ age**2 + tri + chol, data = Health) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +autoplot(model2,3) +set.seed(0) +durbinWatsonTest(model2) +autoplot(model2,2) +model2 <- lm(y ~ sqrt(age) + tri + chol, data = Health) +summary(model2) +coefficients(model2) +model2 <- lm(y ~ sqrt(age) + tri + chol, data = Health) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +Health2 <- Health +Health2[1] <- Health2[1]^2 +model2 <- lm(y ~ ., data = Health) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +View(Health) +View(Health) +View(Health2) +View(Health2) +Health2 <- Health +View(Health2) +View(Health2) +Health2 <- Health +Health2[2] <- Health2[2]^2 +model2 <- lm(y ~ ., data = Health) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +View(Health2) +View(Health2) +autoplot(model2,3) +Health2 <- Health +Health2[2] <- Health2[2]^2 +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +autoplot(model2,3) +set.seed(0) +durbinWatsonTest(model2) +autoplot(model2,2) +Health2 <- Health +Health2[2] <- sqrt(Health2[2]) +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +Health2 <- Health +Health2[2] <- Health2[2]^2 +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) +Health2 <- Health +Health2[2] <- Health2[2]^2 +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +autoplot(model2,3) +Health2 <- Health +Health2[6] <- Health2[2]^2 +View(Health2) +View(Health2) +Health2 <- c(Health, c()) +Health2[6] <- Health2[2]^2 +Health2 <- c(Health, c()) +Health2[5] <- Health2[2]^2 +View(Health2) +View(Health2) +Health2 <- Health +Health2 <- append(Health2, Health[2]^2) +model2 <- lm(y ~ ., data = Health2) +View(Health2) +Health2 <- Health +Health2 <- append(Health2, Health[2]^2) +model2 <- lm(y ~ ., data = Health2) +Health2 <- Health +# Add to Health a new column with the square of the age named age_sq +Health2$age_sq <- Health2$age^2 +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) +View(Health2) +Health2 <- Health +the age named age_sq +library(ggfortify) +library(car) +autoplot(model2,1) +Health2 <- Health +Health2$age_sq <- Health2$age^2 +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +autoplot(model2,3) +autoplot(model2,2) +View(Health2) +View(Health2) +Health2 <- Health +Health2$age_sq <- sqrt(Health2$age) +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +Health2 <- Health +Health2$age_sq <- Health2$age^2 +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +View(Health2) +View(Health2) +Health2 <- Health +Health2$age_sq <- Health2$age^2 +Health2 <- Health2[1:99,] +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +Health2 <- Health +Health2$age_sq <- Health2$age^2 +Health2 <- Health2[1:5,24] +model2 <- lm(y ~ ., data = Health2) +Health2 <- Health +Health2$age_sq <- Health2$age^2 +Health2 <- Health2[1:5,] +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) +View(Health2) +Health2 <- Health +Health2$age_sq <- Health2$age^2 +Health2 <- Health2[1:24,] +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) +library(ggfortify) +library(car) +autoplot(model2,1) +autoplot(model2,3) +set.seed(0) +durbinWatsonTest(model2) +autoplot(model2,2) +Health2 <- Health +Health2$age_sq <- Health2$age^2 +Health2 <- Health2[1:24,] +model2 <- lm(y ~ ., data = Health2) +summary(model2) +coefficients(model2) diff --git a/M1/General Linear Models/TP1/TP1.Rmd b/M1/General Linear Models/TP1/TP1.Rmd index 27ee182..aaa6653 100644 --- a/M1/General Linear Models/TP1/TP1.Rmd +++ b/M1/General Linear Models/TP1/TP1.Rmd @@ -23,16 +23,12 @@ summary(model) ```{r} library(ggfortify) library(car) -autoplot(model,1) +autoplot(model,1:3) ``` The points are not well distributed around 0 -> [P1] is not verified - -```{r} -autoplot(model,3) -``` - The points are not well distributed around 1 -> [P2] is not verified +The QQPlot is align with the line y = x, so it is globally gaussian -> [P4] is verified ```{r} set.seed(0) @@ -41,12 +37,6 @@ durbinWatsonTest(model) The p-value is 0.58 > 0.05 -> We do not reject H0 so the residuals are not autocorrelated -> [P3] is verified -```{r} -autoplot(model,2) -``` - -The QQPlot is align with the line y = x, so it is globally gaussian -> [P4] is verified - ```{r} library(GGally) ggpairs(Health, progress = F) @@ -68,16 +58,12 @@ coefficients(model2) ```{r} library(ggfortify) library(car) -autoplot(model2,1) +autoplot(model2,1:4) ``` The points are well distributed around 0 -> [P1] is verified - -```{r} -autoplot(model2,3) -``` - The points are not well distributed around 1 -> [P2] is verified +The QQPlot is align with the line y = x, so it is gaussian -> [P4] is verified ```{r} set.seed(0) @@ -86,8 +72,3 @@ durbinWatsonTest(model2) The p-value is 0.294 > 0.05 -> We do not reject H0 so the residuals are not autocorrelated -> [P3] is verified -```{r} -autoplot(model2,2) -``` - -The QQPlot is align with the line y = x, so it is gaussian -> [P4] is verified diff --git a/M1/Monte Carlo Methods/Project 1/003_rapport_DANJOU_DUROUSSEAU.html b/M1/Monte Carlo Methods/Project 1/003_rapport_DANJOU_DUROUSSEAU.html new file mode 100644 index 0000000..d2e375f --- /dev/null +++ b/M1/Monte Carlo Methods/Project 1/003_rapport_DANJOU_DUROUSSEAU.html @@ -0,0 +1,730 @@ + + + + + + + + + + + + + +Groupe 03 Projet DANJOU - DUROUSSEAU + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Exercise 1 : Negative weighted mixture

+
+

Definition

+
+
Question 1
+

The conditions for a function f to be a probability density are :

+
    +
  • f is defined on \(\mathbb{R}\)
  • +
  • f is non-negative, ie \(f(x) \ge +0\), \(\forall x \in +\mathbb{R}\)
  • +
  • f is Lebesgue-integrable
  • +
  • and \(\int_{\mathbb{R}} f(x) \,dx = +1\)
  • +
+

The function f, to be a density, needs to be non-negative, ie \(f(|x|) \ge 0\) when \(|x| \rightarrow \infty\)

+

We have, when \(|x| \rightarrow +\infty\), \(f_i(|x|) \sim +exp(\frac{-x^2}{\sigma_i^2})\) for \(i += 1, 2\)

+

Then, \(f(|x|) \sim +exp(\frac{-x^2}{\sigma_1^2}) - exp(\frac{-x^2}{\sigma_2^2})\)

+

We want \(f(|x|) \ge 0\), ie, \(exp(\frac{-x^2}{\sigma_1^2}) - +exp(\frac{-x^2}{\sigma_2^2}) \ge 0\)

+

\(\Leftrightarrow \sigma_1^2 \ge +\sigma_2^2\)

+

We can see that \(f_1\) dominates +the tail behavior.

+


+

+
+
+
Question 2
+

For given parameters \((\mu_1, +\sigma_1^2)\) and \((\mu_2, +\sigma_2^2)\), we have \(\forall x \in +\mathbb{R}\), \(f(x) \ge 0\)

+

\(\Leftrightarrow \frac{1}{\sigma_1} +exp(\frac{-(x-\mu_1)^2}{2 \sigma_1^2}) \ge \frac{a}{\sigma_2} +exp(\frac{-(x-\mu_2)^2}{2 \sigma_2^2})\)

+

\(\Leftrightarrow 0 < a \le a^* = +\min_{x \in \mathbb{R}} \frac{f_1(x)}{f_2(x)} = \min_{x \in \mathbb{R}} +\frac{\sigma_2}{\sigma_1} exp(\frac{(x-\mu_2)^2}{2 \sigma_2^2} - +\frac{(x-\mu_1)^2}{2 \sigma_1^2})\)

+

To find \(a^*\), we just have to +minimize \(g(x) := \frac{(x-\mu_2)^2}{2 +\sigma_2^2} - \frac{(x-\mu_1)^2}{2 \sigma_1^2}\)

+

First we derive \(g\): \(\forall x \in \mathbb{R}, g^{\prime}(x) = \frac{x +- \mu_2}{\sigma_2^2} - \frac{x - \mu_1}{\sigma_1^2}\)

+

We search \(x^*\) such that \(g^{\prime}(x^*) = 0\)

+

\(\Leftrightarrow x^* = \frac{\mu_2 +\sigma_1^2 - \mu_1 \sigma_2^2}{\sigma_1^2 - \sigma_2^2}\)

+

Then, we compute \(a^* = +\frac{f_1(x^*)}{f_2(x^*)}\)

+

We call \(C \in \mathbb{R}\) the +normalization constant such that \(f(x) = C +(f_1(x) - af_2(x))\)

+

To find \(C\), we know that \(1 = \int_{\mathbb{R}} f(x) \,dx = +\int_{\mathbb{R}} C (f_1(x) - af_2(x)) \,dx = C \int_{\mathbb{R}} f_1(x) +\,dx - Ca \int_{\mathbb{R}} f_2(x) \,dx = C(1-a)\) as \(f_1\) and \(f_2\) are density functions and by +linearity of the integrals.

+

\(\Leftrightarrow C = +\frac{1}{1-a}\)

+


+

+
+
+
Question 3
+
f <- function(a, mu1, mu2, s1, s2, x) {
+  fx <- dnorm(x, mu1, s1) - a * dnorm(x, mu2, s2)
+  fx[fx < 0] <- 0
+  return(fx / (1 - a))
+}
+
+a_star <- function(mu1, mu2, s1, s2) {
+  x_star <- (mu2 * s1^2 - mu1 * s2^2) / (s1^2 - s2^2)
+  return(dnorm(x_star, mu1, s1) / dnorm(x_star, mu2, s2))
+}
+
mu1 <- 0
+mu2 <- 1
+s1 <- 3
+s2 <- 1
+
+x <- seq(-10, 10, length.out = 1000)
+as <- a_star(mu1, mu2, s1, s2)
+a_values <- c(0.1, 0.2, 0.3, 0.4, 0.5, as)
+
+plot(x, f(as, mu1, mu2, s1, s2, x),
+     type = "l",
+     col = "red",
+     xlab = "x",
+     ylab = "f(a, mu1, mu2, s1, s2, x)",
+     main = "Density function of f(a, mu1, mu2, s1, s2, x) for different a"
+)
+for (i in (length(a_values) - 1):1) {
+  lines(x, f(a_values[i], mu1, mu2, s1, s2, x), lty = 3, col = "blue")
+}
+legend("topright", legend = c("a = a*", "a != a*"), col = c("red", "blue"), lty = 1)
+

+

We observe that for small values of a, the density f is close to the +density of \(f_1 \sim \mathcal{N}(\mu_1, +\sigma_1^2)\). When a increases, the shape of evolves into the +combinaison of two normal distributions. We observe that for \(a = a^*\), the density the largest value of +a for which the density is still a density function, indeed for \(a > a^*\), the function f takes negative +values so it is no longer a density.

+
s2_values <- seq(1, 10, length.out = 5)
+a <- 0.2
+
+plot(x, f(a, mu1, mu2, s1, max(s2_values), x),
+     type = "l",
+     xlab = "x",
+     ylab = "f(a, mu1, mu2, s1, s2, x)",
+     col = "red",
+     main = "Density function of f(a, mu1, mu2, s1, s2, x) for different s2"
+)
+
+for (i in length(s2_values):1) {
+  lines(x, f(a, mu1, mu2, s1, s2_values[i], x), lty = 1, col = rainbow(length(s2_values))[i])
+}
+
+legend("topright", legend = paste("s2 =", s2_values), col = rainbow(length(s2_values)), lty = 1)
+

+

We observe that when \(\sigma_2^2 = +1\), the density \(f\) has two +peaks and when \(\sigma_2^1 > 1\), +the density \(f\) has only one +peak.

+
mu1 <- 0
+mu2 <- 1
+sigma1 <- 3
+sigma2 <- 1
+a <- 0.2
+as <- a_star(mu1, mu2, sigma1, sigma2)
+
+cat(sprintf("a* = %f, a = %f, a <= a* [%s]", as, a, a <= as))
+
## a* = 0.313138, a = 0.200000, a <= a* [TRUE]
+

We have \(\sigma_1^2 \ge +\sigma_2^2\) and \(0 < a \le +a^*\), so the numerical values are compatible with the +constraints defined above.

+
+
+
+

Inverse c.d.f Random Variable simulation

+
+
Question 4
+

To prove that the cumulative density function F associated with f is +available in closed form, we need to compute \(F(x) = \int_{-\infty}^{x} f(t) \,dt = +\frac{1}{1-a} (\int_{-\infty}^{x} f_1(t)\, dt - a \int_{-\infty}^{x} +f_2(t)\, dt) = \frac{1}{1-a} (F_1(x) - aF_2(x))\) where \(F_1\) and \(F_2\) are the cumulative density functions +of \(f_1 \sim \mathcal{N}(\mu1, +\sigma_1^2)\) and \(f_2 \sim +\mathcal{N}(\mu2, \sigma_2^2)\) respectively.

+

Then, \(F\) is a closed-form as a +finite sum of closed forms.

+
F <- function(a, mu1, mu2, s1, s2, x) {
+  Fx <- pnorm(x, mu1, s1) - a * pnorm(x, mu2, s2)
+  return(Fx / (1 - a))
+}
+

To construct an algorithm that returns the value of the inverse +function method as a function of u ∈(0,1), of the parameters \(a, \mu_1, \mu_2, \sigma_1, \sigma_2, x\), +and of an approximation precision \(\epsilon\), we can use the bisection +method.

+

We fixe \(\epsilon > 0\).
+We set \(u \in (0, 1)\).
+We define \(L = -10\) and \(U = -L\), the bounds and \(M = \frac{L + U}{2}\), the middle of our +interval.
+While \(U - L > \epsilon\) :

+
    +
  • We compute \(F(M) = \frac{1}{1-a} (F_1(M) +- aF_2(M))\)
  • +
  • If \(F(M) < u\), we set \(L = M\)
  • +
  • Else, we set \(U = M\)
  • +
  • We set \(M = \frac{L + U}{2}\)
  • +
+

End while

+

For the generation of random variables from F, we can use the inverse +transform sampling method.
+

+

We set \(X\) as an empty array and +\(n\) the number of random variables we +want to generate.
+We fixe \(\epsilon > 0\).
+For \(i = 1, \dots, n\) :

+
    +
  • We set \(u \in (0, 1)\).
    +

  • +
  • We define \(L = -10\) and \(U = -L\), the bounds and \(M = \frac{L + U}{2}\), the middle of our +interval.
    +

  • +
  • While \(U - L > \epsilon\) +:

  • +
  • We compute \(F(M) = \frac{1}{1-a} +(F_1(M) - aF_2(M))\)

  • +
  • If \(F(M) < u\), we set \(L = M\)

  • +
  • Else, we set \(U = M\)

  • +
  • We set \(M = \frac{L + +U}{2}\)

  • +
  • End while

  • +
  • We add \(M\) to \(X\)

  • +
+

End for We return \(X\)

+
+
+
Question 5
+
inv_cdf <- function(n) {
+  X <- numeric(n)
+  for (i in 1:n) {
+    u <- runif(1)
+    L <- -10
+    U <- -L
+    M <- (L + U) / 2
+    while (U - L > 1e-6) {
+      FM <- F(a, mu1, mu2, s1, s2, M)
+      if (FM < u) {
+        L <- M
+      } else {
+        U <- M
+      }
+      M <- (L + U) / 2
+    }
+    X[i] <- M
+  }
+  return(X)
+}
+
set.seed(123)
+n <- 10000
+X <- inv_cdf(n)
+x <- seq(-10, 10, length.out = 1000)
+
+hist(X, breaks = 100, freq = FALSE, col = "lightblue", main = "Empirical density function", xlab = "x")
+lines(x, f(a, mu1, mu2, s1, s2, x), col = "red")
+legend("topright", legend = c("Theorical density", "Empirical density"), col = c("red", "lightblue"), lty = 1)
+

+
plot(ecdf(X), col = "blue", main = "Empirical cumulative distribution function", xlab = "x", ylab = "F(x)")
+lines(x, F(a, mu1, mu2, s1, s2, x), col = "red")
+legend("bottomright", legend = c("Theoretical cdf", "Empirical cdf"), col = c("red", "blue"), lty = 1)
+

+

We can see of both graphs that the empirical cumulative distribution +function is close to the theoretical one.

+
+
+
+

Accept-Reject Random Variable simulation

+
+
Question 6
+

To simulate under f using the accept-reject algorithm, we need to +find a density function g such that \(f(x) \le +M g(x)\) for all \(x \in +\mathbb{R}\), where M is a constant.
+

+

Then, we generate \(X \sim g\) and +\(U \sim \mathcal{U}([0,1])\).
+We accept \(Y = X\) if \(U \le \frac{f(X)}{Mg(X)}\). Return \(1\) otherwise.

+

The probability of acceptance is \(\int_{\mathbb{R}} \frac{f(x)}{Mg(x)} g(x) \,dx = +\frac{1}{M} \int_{\mathbb{R}} f(x) \,dx = \frac{1}{M}\)

+

Here we pose \(g = f1\).

+

Then we have \(\frac{f(x)}{g(x)} = +\frac{1}{1-a} (1 - a\frac{f_2(x)}{f_1(x)})\)

+

We pose earlier that \(a^* = \min_{x \in +\mathbb{R}} \frac{f_1(x)}{f_2(x)} \Rightarrow \frac{1}{a^*} = \max_{x +\in \mathbb{R}} \frac{f_2(x)}{f_1(x)}\).

+

We compute in our fist equation : \(\frac{1}{1-a} (1 - a\frac{f_2(x)}{f_1(x)}) +\leq \frac{1}{1-a} (1 - \frac{a}{a^*}) \leq \frac{1}{1-a}\) +because \(a \leq a^* \Rightarrow 1 - +\frac{a}{a^*} \leq 0\)

+

To conclude, we have \(M = +\frac{1}{1-a}\) and the probability of acceptance is \(\frac{1}{M} = 1-a\)

+
+
+
Question 7
+
accept_reject <- function(n, a) {
+  X <- numeric(0)
+  M <- 1 / (1 - a)
+
+  while (length(X) < n) {
+    Y <- rnorm(1, mu1, s1)
+    U <- runif(1)
+
+    if (U <= f(a, mu1, mu2, s1, s2, Y) / (M * dnorm(Y, mu1, s1))) {
+      X <- append(X, Y)
+    }
+  }
+  return(X)
+}
+
set.seed(123)
+n <- 10000
+a <- 0.2
+X <- accept_reject(n, a)
+x <- seq(-10, 10, length.out = 1000)
+
+hist(X, breaks = 100, freq = FALSE, col = "lightblue", main = "Empirical density function", xlab = "x")
+lines(x, f(a, mu1, mu2, s1, s2, x), col = "red")
+legend("topright", legend = c("Theorical density", "Empirical density"), col = c("red", "lightblue"), lty = 1)
+

+
set.seed(123)
+
+acceptance_rate <- function(n, a = 0.2) {
+  Y <- rnorm(n, mu1, s1)
+  U <- runif(n)
+  return(mean(U <= f(a, mu1, mu2, s1, s2, Y) / (M * dnorm(Y, mu1, s1))))
+}
+
+M <- 1 / (1 - a)
+n <- 10000
+cat(sprintf("[M = %.2f] Empirical acceptance rate: %f, Theoretical acceptance rate: %f \n", M, acceptance_rate(n), 1 / M))
+
## [M = 1.25] Empirical acceptance rate: 0.802600, Theoretical acceptance rate: 0.800000
+
+
+
Question 8
+
set.seed(123)
+a_values <- seq(0.01, 1, length.out = 100)
+acceptance_rates <- numeric(length(a_values))
+as <- a_star(mu1, mu2, s1, s2)
+
+for (i in seq_along(a_values)) {
+  acceptance_rates[i] <- acceptance_rate(n, a_values[i])
+}
+
+plot(a_values, acceptance_rates, type = "l", col = "blue", xlab = "a", ylab = "Acceptance rate", main = "Acceptance rate as a function of a")
+points(as, acceptance_rate(n, as), col = "red", pch = 16)
+legend("topright", legend = c("Acceptance rate for all a", "Acceptance rate for a_star"), col = c("lightblue", "red"), lty = c(1, NA), pch = c(NA, 16))
+

+
+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/M1/Monte Carlo Methods/Project 1/003_rapport_DANJOU_DUROUSSEAU.rmd b/M1/Monte Carlo Methods/Project 1/003_rapport_DANJOU_DUROUSSEAU.rmd new file mode 100644 index 0000000..97c2d21 --- /dev/null +++ b/M1/Monte Carlo Methods/Project 1/003_rapport_DANJOU_DUROUSSEAU.rmd @@ -0,0 +1,313 @@ +--- +title: "Groupe 03 Projet DANJOU - DUROUSSEAU" +output: html_document +--- + +# Exercise 1 : Negative weighted mixture + +### Definition + +##### Question 1 + +The conditions for a function f to be a probability density are : + +- f is defined on $\mathbb{R}$ +- f is non-negative, ie $f(x) \ge 0$, $\forall x \in \mathbb{R}$ +- f is Lebesgue-integrable +- and $\int_{\mathbb{R}} f(x) \,dx = 1$ + +The function f, to be a density, needs to be non-negative, ie $f(|x|) \ge 0$ when $|x| \rightarrow \infty$ + +We have, when $|x| \rightarrow \infty$, $f_i(|x|) \sim exp(\frac{-x^2}{\sigma_i^2})$ for $i = 1, 2$ + +Then, $f(|x|) \sim exp(\frac{-x^2}{\sigma_1^2}) - exp(\frac{-x^2}{\sigma_2^2})$ + +We want $f(|x|) \ge 0$, ie, $exp(\frac{-x^2}{\sigma_1^2}) - exp(\frac{-x^2}{\sigma_2^2}) \ge 0$ + +$\Leftrightarrow \sigma_1^2 \ge \sigma_2^2$ + +We can see that $f_1$ dominates the tail behavior. + +\ + +##### Question 2 + +For given parameters $(\mu_1, \sigma_1^2)$ and $(\mu_2, \sigma_2^2)$, we have $\forall x \in \mathbb{R}$, $f(x) \ge 0$ + +$\Leftrightarrow \frac{1}{\sigma_1} exp(\frac{-(x-\mu_1)^2}{2 \sigma_1^2}) \ge \frac{a}{\sigma_2} exp(\frac{-(x-\mu_2)^2}{2 \sigma_2^2})$ + +$\Leftrightarrow 0 < a \le a^* = \min_{x \in \mathbb{R}} \frac{f_1(x)}{f_2(x)} = \min_{x \in \mathbb{R}} \frac{\sigma_2}{\sigma_1} exp(\frac{(x-\mu_2)^2}{2 \sigma_2^2} - \frac{(x-\mu_1)^2}{2 \sigma_1^2})$ + +To find $a^*$, we just have to minimize $g(x) := \frac{(x-\mu_2)^2}{2 \sigma_2^2} - \frac{(x-\mu_1)^2}{2 \sigma_1^2}$ + +First we derive $g$: $\forall x \in \mathbb{R}, g^{\prime}(x) = \frac{x - \mu_2}{\sigma_2^2} - \frac{x - \mu_1}{\sigma_1^2}$ + +We search $x^*$ such that $g^{\prime}(x^*) = 0$ + +$\Leftrightarrow x^* = \frac{\mu_2 \sigma_1^2 - \mu_1 \sigma_2^2}{\sigma_1^2 - \sigma_2^2}$ + +Then, we compute $a^* = \frac{f_1(x^*)}{f_2(x^*)}$ + +We call $C \in \mathbb{R}$ the normalization constant such that $f(x) = C (f_1(x) - af_2(x))$ + +To find $C$, we know that $1 = \int_{\mathbb{R}} f(x) \,dx = \int_{\mathbb{R}} C (f_1(x) - af_2(x)) \,dx = C \int_{\mathbb{R}} f_1(x) \,dx - Ca \int_{\mathbb{R}} f_2(x) \,dx = C(1-a)$ as $f_1$ and $f_2$ are density functions and by linearity of the integrals. + +$\Leftrightarrow C = \frac{1}{1-a}$ + +\ + +##### Question 3 + +```{r} +f <- function(a, mu1, mu2, s1, s2, x) { + fx <- dnorm(x, mu1, s1) - a * dnorm(x, mu2, s2) + fx[fx < 0] <- 0 + return(fx / (1 - a)) +} + +a_star <- function(mu1, mu2, s1, s2) { + x_star <- (mu2 * s1^2 - mu1 * s2^2) / (s1^2 - s2^2) + return(dnorm(x_star, mu1, s1) / dnorm(x_star, mu2, s2)) +} +``` + +```{r} +mu1 <- 0 +mu2 <- 1 +s1 <- 3 +s2 <- 1 + +x <- seq(-10, 10, length.out = 1000) +as <- a_star(mu1, mu2, s1, s2) +a_values <- c(0.1, 0.2, 0.3, 0.4, 0.5, as) + +plot(x, f(as, mu1, mu2, s1, s2, x), + type = "l", + col = "red", + xlab = "x", + ylab = "f(a, mu1, mu2, s1, s2, x)", + main = "Density function of f(a, mu1, mu2, s1, s2, x) for different a" +) +for (i in (length(a_values) - 1):1) { + lines(x, f(a_values[i], mu1, mu2, s1, s2, x), lty = 3, col = "blue") +} +legend("topright", legend = c("a = a*", "a != a*"), col = c("red", "blue"), lty = 1) +``` + +We observe that for small values of a, the density f is close to the density of $f_1 \sim \mathcal{N}(\mu_1, \sigma_1^2)$. When a increases, the shape of evolves into the combinaison of two normal distributions. We observe that for $a = a^*$, the density the largest value of a for which the density is still a density function, indeed for $a > a^*$, the function f takes negative values so it is no longer a density. + +```{r} +s2_values <- seq(1, 10, length.out = 5) +a <- 0.2 + +plot(x, f(a, mu1, mu2, s1, max(s2_values), x), + type = "l", + xlab = "x", + ylab = "f(a, mu1, mu2, s1, s2, x)", + col = "red", + main = "Density function of f(a, mu1, mu2, s1, s2, x) for different s2" +) + +for (i in length(s2_values):1) { + lines(x, f(a, mu1, mu2, s1, s2_values[i], x), lty = 1, col = rainbow(length(s2_values))[i]) +} + +legend("topright", legend = paste("s2 =", s2_values), col = rainbow(length(s2_values)), lty = 1) +``` + +We observe that when $\sigma_2^2 = 1$, the density $f$ has two peaks and when $\sigma_2^1 > 1$, the density $f$ has only one peak. + +```{r} +mu1 <- 0 +mu2 <- 1 +sigma1 <- 3 +sigma2 <- 1 +a <- 0.2 +as <- a_star(mu1, mu2, sigma1, sigma2) + +cat(sprintf("a* = %f, a = %f, a <= a* [%s]", as, a, a <= as)) +``` + +We have $\sigma_1^2 \ge \sigma_2^2$ and $0 < a \le a^*$, so the numerical values are compatible with the constraints defined above. + +### Inverse c.d.f Random Variable simulation + +##### Question 4 + +To prove that the cumulative density function F associated with f is available in closed form, we need to compute $F(x) = \int_{-\infty}^{x} f(t) \,dt = \frac{1}{1-a} (\int_{-\infty}^{x} f_1(t)\, dt - a \int_{-\infty}^{x} f_2(t)\, dt) = \frac{1}{1-a} (F_1(x) - aF_2(x))$ where $F_1$ and $F_2$ are the cumulative density functions of $f_1 \sim \mathcal{N}(\mu1, \sigma_1^2)$ and $f_2 \sim \mathcal{N}(\mu2, \sigma_2^2)$ respectively. + +Then, $F$ is a closed-form as a finite sum of closed forms. + +```{r} +F <- function(a, mu1, mu2, s1, s2, x) { + Fx <- pnorm(x, mu1, s1) - a * pnorm(x, mu2, s2) + return(Fx / (1 - a)) +} +``` + +To construct an algorithm that returns the value of the inverse function method as a function of u ∈(0,1), of the parameters $a, \mu_1, \mu_2, \sigma_1, \sigma_2, x$, and of an approximation precision $\epsilon$, we can use the bisection method. + +We fixe $\epsilon > 0$.\ +We set $u \in (0, 1)$.\ +We define $L = -10$ and $U = -L$, the bounds and $M = \frac{L + U}{2}$, the middle of our interval.\ +While $U - L > \epsilon$ : + +- We compute $F(M) = \frac{1}{1-a} (F_1(M) - aF_2(M))$ +- If $F(M) < u$, we set $L = M$ +- Else, we set $U = M$ +- We set $M = \frac{L + U}{2}$ + +End while + +For the generation of random variables from F, we can use the inverse transform sampling method.\ + +We set $X$ as an empty array and $n$ the number of random variables we want to generate.\ +We fixe $\epsilon > 0$.\ +For $i = 1, \dots, n$ : + +- We set $u \in (0, 1)$.\ + +- We define $L = -10$ and $U = -L$, the bounds and $M = \frac{L + U}{2}$, the middle of our interval.\ + +- While $U - L > \epsilon$ : + +- We compute $F(M) = \frac{1}{1-a} (F_1(M) - aF_2(M))$ + +- If $F(M) < u$, we set $L = M$ + +- Else, we set $U = M$ + +- We set $M = \frac{L + U}{2}$ + +- End while + +- We add $M$ to $X$ + +End for We return $X$ + +##### Question 5 + +```{r} +inv_cdf <- function(n) { + X <- numeric(n) + for (i in 1:n) { + u <- runif(1) + L <- -10 + U <- -L + M <- (L + U) / 2 + while (U - L > 1e-6) { + FM <- F(a, mu1, mu2, s1, s2, M) + if (FM < u) { + L <- M + } else { + U <- M + } + M <- (L + U) / 2 + } + X[i] <- M + } + return(X) +} +``` + +```{r} +set.seed(123) +n <- 10000 +X <- inv_cdf(n) +x <- seq(-10, 10, length.out = 1000) + +hist(X, breaks = 100, freq = FALSE, col = "lightblue", main = "Empirical density function", xlab = "x") +lines(x, f(a, mu1, mu2, s1, s2, x), col = "red") +legend("topright", legend = c("Theorical density", "Empirical density"), col = c("red", "lightblue"), lty = 1) +``` + +```{r} +plot(ecdf(X), col = "blue", main = "Empirical cumulative distribution function", xlab = "x", ylab = "F(x)") +lines(x, F(a, mu1, mu2, s1, s2, x), col = "red") +legend("bottomright", legend = c("Theoretical cdf", "Empirical cdf"), col = c("red", "blue"), lty = 1) +``` + +We can see of both graphs that the empirical cumulative distribution function is close to the theoretical one. + +### Accept-Reject Random Variable simulation + +##### Question 6 + +To simulate under f using the accept-reject algorithm, we need to find a density function g such that $f(x) \le M g(x)$ for all $x \in \mathbb{R}$, where M is a constant.\ + +Then, we generate $X \sim g$ and $U \sim \mathcal{U}([0,1])$.\ +We accept $Y = X$ if $U \le \frac{f(X)}{Mg(X)}$. Return $1$ otherwise. + +The probability of acceptance is $\int_{\mathbb{R}} \frac{f(x)}{Mg(x)} g(x) \,dx = \frac{1}{M} \int_{\mathbb{R}} f(x) \,dx = \frac{1}{M}$ + +Here we pose $g = f1$. + +Then we have $\frac{f(x)}{g(x)} = \frac{1}{1-a} (1 - a\frac{f_2(x)}{f_1(x)})$ + +We pose earlier that $a^* = \min_{x \in \mathbb{R}} \frac{f_1(x)}{f_2(x)} \Rightarrow \frac{1}{a^*} = \max_{x \in \mathbb{R}} \frac{f_2(x)}{f_1(x)}$. + +We compute in our fist equation : $\frac{1}{1-a} (1 - a\frac{f_2(x)}{f_1(x)}) \leq \frac{1}{1-a} (1 - \frac{a}{a^*}) \leq \frac{1}{1-a}$ because $a \leq a^* \Rightarrow 1 - \frac{a}{a^*} \leq 0$ + +To conclude, we have $M = \frac{1}{1-a}$ and the probability of acceptance is $\frac{1}{M} = 1-a$ + +##### Question 7 + +```{r} +accept_reject <- function(n, a) { + X <- numeric(0) + M <- 1 / (1 - a) + + while (length(X) < n) { + Y <- rnorm(1, mu1, s1) + U <- runif(1) + + if (U <= f(a, mu1, mu2, s1, s2, Y) / (M * dnorm(Y, mu1, s1))) { + X <- append(X, Y) + } + } + return(X) +} +``` + +```{r} +set.seed(123) +n <- 10000 +a <- 0.2 +X <- accept_reject(n, a) +x <- seq(-10, 10, length.out = 1000) + +hist(X, breaks = 100, freq = FALSE, col = "lightblue", main = "Empirical density function", xlab = "x") +lines(x, f(a, mu1, mu2, s1, s2, x), col = "red") +legend("topright", legend = c("Theorical density", "Empirical density"), col = c("red", "lightblue"), lty = 1) +``` + +```{r} +set.seed(123) + +acceptance_rate <- function(n, a = 0.2) { + Y <- rnorm(n, mu1, s1) + U <- runif(n) + return(mean(U <= f(a, mu1, mu2, s1, s2, Y) / (M * dnorm(Y, mu1, s1)))) +} + +M <- 1 / (1 - a) +n <- 10000 +cat(sprintf("[M = %.2f] Empirical acceptance rate: %f, Theoretical acceptance rate: %f \n", M, acceptance_rate(n), 1 / M)) +``` + +##### Question 8 + +```{r} +set.seed(123) +a_values <- seq(0.01, 1, length.out = 100) +acceptance_rates <- numeric(length(a_values)) +as <- a_star(mu1, mu2, s1, s2) + +for (i in seq_along(a_values)) { + acceptance_rates[i] <- acceptance_rate(n, a_values[i]) +} + +plot(a_values, acceptance_rates, type = "l", col = "blue", xlab = "a", ylab = "Acceptance rate", main = "Acceptance rate as a function of a") +points(as, acceptance_rate(n, as), col = "red", pch = 16) +legend("topright", legend = c("Acceptance rate for all a", "Acceptance rate for a_star"), col = c("lightblue", "red"), lty = c(1, NA), pch = c(NA, 16)) +```