Commit 7e62e6fa authored by Gilles Kratzer's avatar Gilles Kratzer
Browse files

update datasets

parent d49a1da6
Pipeline #1570 passed with stage
in 2 seconds
......@@ -205,7 +205,7 @@ switch (method.choice, "MC3" = {
iterations = mcmc.scheme[1]*(mcmc.scheme[2]+1),
thinning = mcmc.scheme[2],
burnin = mcmc.scheme[3],
dist = data.dists
data.dist = data.dists
)
class(out) <- "mcmcabn"
......
......@@ -16,23 +16,27 @@ summary.mcmcabn <- function(object, quantiles = c(0.025, 0.25, 0.5, 0.75, 0.975)
cat("Maximum score: ", max(object$scores),"\n",sep = "")
cat("Empirical mean: ", mean(object$scores),"\n",sep = "")
cat("Empirical standard deviation: ", sd(object$scores),"\n",sep = "")
#cat("Standard error of the mean: ", sd(object$scores)/sqrt(object$iterations),"\n\n",sep = "")
cat("Quantiles of the posterior network score:\n")
out2 <- matrix(data = quantile(x = object$scores,probs = quantiles),nrow = 1,ncol = length(quantiles),dimnames = list("BN score", quantiles))
out1 <- matrix(data = quantile(x = object$scores,probs = quantiles),nrow = 1,ncol = length(quantiles),dimnames = list("BN score", quantiles))
print(out1, ...)
cat("\n\nGlobal acceptance rate: ", 1-mean(object$rejection),"\n",sep = "")
out2<- matrix(data = table(object$method,object$rejection),ncol = 2,dimnames = list(levels(factor(object$method)),c("Accepted", "Rejected")))
print(out2, ...)
cat("\n\nAcceptance rate: ", 1-mean(object$rejection),"\n",sep = "")
cat("Sample size adjusted for autocorrelation: ", unname(effectiveSize(object$scores)),"\n",sep = "")
cat("\n\nSample size adjusted for autocorrelation: ", unname(effectiveSize(object$scores)),"\n",sep = "")
unname(acf(object$scores, lag.max = 10, plot = FALSE))
cat("\nAutocorrelations by lag:\n")
out2 <- matrix(data = acf(mcmc.out$scores, lag.max = 10, plot = FALSE)$acf,nrow = 1,ncol = (lag.max+1),dimnames = list("acf",acf(mcmc.out$scores, lag.max = 10, plot = FALSE)$lag))
out2 <- matrix(data = acf(object$scores, lag.max = 10, plot = FALSE)$acf,nrow = 1,ncol = (lag.max+1),dimnames = list("acf",acf(object$scores, lag.max = 10, plot = FALSE)$lag))
print(out2, ...)
......
No preview for this file type
\name{bsc.compute.asia}
\docType{data}
\alias{bsc.compute.asia}
\title{Cache of pre-computed scores related to the asia dataset}
\description{This dataframe contains a cache of pre-computed scores with a maximum of two parents per node for the asia dataset.
}
\usage{data("mcmc_run_asia")}
\format{
The data contains a cache of pre-computed scores with a maximum of two parents per node.
\itemize{
\item \code{bsc.compute.asia}: cache of score with a maximum of two parents per node.
}}
\examples{
\dontrun{
## This data set was generated using the following code:
library(bnlearn) #for the dataset
library(abn) #for the cache of score function
#renaming columns of the dataset
colnames(asia) <- c("Asia",
"Smoking",
"Tuberculosis",
"LungCancer",
"Bronchitis",
"Either",
"XRay",
"Dyspnea")
#lets define the distribution list
dist.asia <- list(Asia = "binomial",
Smoking = "binomial",
Tuberculosis = "binomial",
LungCancer = "binomial",
Bronchitis = "binomial",
Either = "binomial",
XRay = "binomial",
Dyspnea = "binomial")
bsc.compute.asia <- buildscorecache(data.df = asia,
data.dists = dist.asia,
max.parents = 2)
mcmc.out.asia <- mcmcabn(score.cache = bsc.compute.asia,
score = "mlik",
data.dists = dist.asia,
max.parents = 2,
mcmc.scheme = c(1000,99,1000),
seed = 42,
verbose = FALSE,
start.dag = "random",
prob.rev = 0.03,
prob.mbr = 0.03,
prior.choice = 2)
}
}
\keyword{datasets}
\name{dist.asia}
\docType{data}
\alias{dist.asia}
\title{Named list of distribution to analyze asia dataset}
\description{Named list of distribution to analyze asia dataset
}
\usage{data("mcmc_run_asia")}
\format{
The data contains a cache of pre-computed scores with a maximum of two parents per node.
\itemize{
\item \code{dist.asia}: a named list giving the distribution for each node in the network.
}}
\examples{
\dontrun{
## This data set was generated using the following code:
library(bnlearn) #for the dataset
#renaming columns of the dataset
colnames(asia) <- c("Asia",
"Smoking",
"Tuberculosis",
"LungCancer",
"Bronchitis",
"Either",
"XRay",
"Dyspnea")
#lets define the distribution list
dist.asia <- list(Asia = "binomial",
Smoking = "binomial",
Tuberculosis = "binomial",
LungCancer = "binomial",
Bronchitis = "binomial",
Either = "binomial",
XRay = "binomial",
Dyspnea = "binomial")
}
}
\keyword{datasets}
\name{mcmc.out.asia}
\docType{data}
\alias{mcmc.out.asia}
\title{MCMC search from the synthetic asia dataset for use with mcmcabn library examples}
\description{This dataframe contains a cache of pre-computed scores with a maximum of two parents per node for the asia dataset.
}
\usage{data("mcmc_run_asia")}
\format{
The data contains an object of class mcmcabn.
\itemize{
\item \code{mcmc.out.asia}: an object of class mcmcabn.
}}
\examples{
\dontrun{
## This data set was generated using the following code:
library(bnlearn) #for the dataset
library(abn) #for the cache of score function
#renaming columns of the dataset
colnames(asia) <- c("Asia",
"Smoking",
"Tuberculosis",
"LungCancer",
"Bronchitis",
"Either",
"XRay",
"Dyspnea")
#lets define the distribution list
dist.asia <- list(Asia = "binomial",
Smoking = "binomial",
Tuberculosis = "binomial",
LungCancer = "binomial",
Bronchitis = "binomial",
Either = "binomial",
XRay = "binomial",
Dyspnea = "binomial")
bsc.compute.asia <- buildscorecache(data.df = asia,
data.dists = dist.asia,
max.parents = 2)
mcmc.out.asia <- mcmcabn(score.cache = bsc.compute.asia,
score = "mlik",
data.dists = dist.asia,
max.parents = 2,
mcmc.scheme = c(1000,99,1000),
seed = 42,
verbose = FALSE,
start.dag = "random",
prob.rev = 0.03,
prob.mbr = 0.03,
prior.choice = 2)
}
}
\keyword{datasets}
......@@ -82,7 +82,7 @@ Werhli, A. V., & Husmeier, D. (2007). "Reconstructing gene regulatory networks w
Imoto, S., Higuchi, T., Goto, T., Tashiro, K., Kuhara, S., & Miyano, S. (2003). Using Bayesian networks for estimating gene networks from microarrays and biological knowledge. In Proceedings of the European Conference on Computational Biology.
For the asia dataset:
Scutari, M. (2010). Learning Bayesian Networks with the bnlearn R Package. Journal of Statistical Software, 35(3), 1 - 22. doi:http://dx.doi.org/10.18637/jss.v035.i03.
}
......@@ -94,17 +94,17 @@ Scutari, M. (2010). Learning Bayesian Networks with the bnlearn R Package. Journ
data("mcmc_run_asia")
mcmc.out <- mcmcabn(score.cache = bsc.compute,
mcmc.out.asia <- mcmcabn(score.cache = bsc.compute.asia,
score = "mlik",
data.dists = dist,
data.dists = dist.asia,
max.parents = 2,
mcmc.scheme = c(10000,9,1000),
seed = 6,
mcmc.scheme = c(1000,99,10000),
seed = 42,
verbose = FALSE,
start.dag = "random",
prob.rev = 0.005,
prob.mbr = 0.005,
prior.choice = 1)
prob.rev = 0.03,
prob.mbr = 0.03,
prior.choice = 2)
summary(mcmc.out)
summary(mcmc.out.asia)
}}
......@@ -12,9 +12,9 @@
The data contains an object of class mcmcabn and a cache of score compute with abn R package.
\itemize{
\item \code{bsc.compute}: cache of score with a maximum of two parents per node.
\item \code{dist}: a named list giving the distribution for each node in the network.
\item \code{mcmc.out}: an object of class mcmcabn.
\item \code{bsc.compute.asia}: cache of score with a maximum of two parents per node.
\item \code{dist.asia}: a named list giving the distribution for each node in the network.
\item \code{mcmc.out.asia}: an object of class mcmcabn.
}}
\examples{
......@@ -34,7 +34,7 @@ colnames(asia) <- c("Asia",
"Dyspnea")
#lets define the distribution list
dist <- list(Asia = "binomial",
dist.asia <- list(Asia = "binomial",
Smoking = "binomial",
Tuberculosis = "binomial",
LungCancer = "binomial",
......@@ -43,21 +43,21 @@ dist <- list(Asia = "binomial",
XRay = "binomial",
Dyspnea = "binomial")
bsc.compute <- buildscorecache(data.df = asia,
data.dists = dist,
bsc.compute.asia <- buildscorecache(data.df = asia,
data.dists = dist.asia,
max.parents = 2)
mcmc.out <- mcmcabn(score.cache = bsc.compute,
mcmc.out.asia <- mcmcabn(score.cache = bsc.compute.asia,
score = "mlik",
data.dists = dist,
data.dists = dist.asia,
max.parents = 2,
mcmc.scheme = c(10000,9,1000),
seed = 6,
mcmc.scheme = c(1000,99,1000),
seed = 42,
verbose = FALSE,
start.dag = "random",
prob.rev = 0.005,
prob.mbr = 0.005,
prior.choice = 1)
prob.rev = 0.03,
prob.mbr = 0.03,
prior.choice = 2)
}
}
......
......@@ -34,6 +34,9 @@ Alboukadel Kassambara (2018). ggpubr: 'ggplot2' Based Publication Ready Plots. R
Claus O. Wilke (2019). cowplot: Streamlined Plot Theme and Plot Annotations for 'ggplot2'. R package version 0.9.4. https://CRAN.R-project.org/package=cowplot
M. Barrett (2018). ggdag: Analyze and Create Elegant Directed Acyclic Graphs. R package version
0.1.0. https://CRAN.R-project.org/package=ggdag
Scutari, M. (2010). Learning Bayesian Networks with the bnlearn R Package. Journal of Statistical Software, 35(3), 1 - 22. doi:http://dx.doi.org/10.18637/jss.v035.i03.
}
......@@ -44,5 +47,5 @@ Scutari, M. (2010). Learning Bayesian Networks with the bnlearn R Package. Journ
data("mcmc_run_asia")
#plot the mcmc run
plot(mcmc.out)
plot(mcmc.out.asia)
}}
......@@ -48,19 +48,19 @@ Scutari, M. (2010). Learning Bayesian Networks with the bnlearn R Package. Journ
data("mcmc_run_asia")
##return a named matrix with individual arc support
query(mcmcabn = mcmc.out)
query(mcmcabn = mcmc.out.asia)
## what is the probability of LungCancer node being children of the Smoking node?
query(mcmcabn = mcmc.out,formula = ~LungCancer|Smoking)
query(mcmcabn = mcmc.out.asia,formula = ~LungCancer|Smoking)
## what is the probability of Smoking node being parent of
## both LungCancer and Bronchitis node?
query(mcmcabn = mcmc.out,
query(mcmcabn = mcmc.out.asia,
formula = ~ LungCancer|Smoking+Bronchitis|Smoking)
## what is the probability of previous statement,
## when there is no arc from Smoking to Tuberculosis and from Bronchitis to XRay?
query(mcmcabn = mcmc.out,
query(mcmcabn = mcmc.out.asia,
formula = ~LungCancer|Smoking +
Bronchitis|Smoking -
Tuberculosis|Smoking -
......
......@@ -77,7 +77,7 @@ library(ggdag) #to plot the DAG
colnames(asia) <- c("Asia","Smoking", "Tuberculosis", "LungCancer", "Bronchitis", "Either", "XRay", "Dyspnea")
#lets define the distribution list
dist <- list(Asia = "binomial",
dist.asia <- list(Asia = "binomial",
Smoking = "binomial",
Tuberculosis = "binomial",
LungCancer = "binomial",
......@@ -103,10 +103,10 @@ dag <- dagify(Tuberculosis~Asia,
For the first search, it is advised to limite the number of parent per node to 2.
```{r}
```{r, eval=FALSE}
#loglikelihood scores
bsc.compute <- buildscorecache(data.df = asia,
data.dists = dist,
bsc.compute.asia <- buildscorecache(data.df = asia,
data.dists = dist.asia,
max.parents = 2)
```
......@@ -115,9 +115,9 @@ Let us use the `mcmcabn()` function. One needs to define the score used (here is
Let us run this search ...
```{r, eval=FALSE}
mcmc.out <- mcmcabn(score.cache = bsc.compute,
mcmc.out.asia <- mcmcabn(score.cache = bsc.compute.asia,
score = "mlik",
data.dists = dist,
data.dists = dist.asia,
max.parents = 2,
mcmc.scheme = c(1000,99,10000),
seed = 42,
......@@ -128,7 +128,7 @@ mcmc.out <- mcmcabn(score.cache = bsc.compute,
prior.choice = 2)
```
The function `mcmcabn()` returns a list with multiple entries: "dags", "scores", "alpha", "method", "rejection", "iteration", "thinning", "burnin", "dist".
The function `mcmcabn()` returns a list with multiple entries: "dags", "scores", "alpha", "method", "rejection", "iteration", "thinning", "burnin", "data.dist".
- dags: is the list of sampled DAGs.
- scores is the list of DAGs scores.
......@@ -138,7 +138,7 @@ The function `mcmcabn()` returns a list with multiple entries: "dags", "scores",
- iterations is the total number of MCMC iterations.
- thinning is the number of thinned MCMC moves for one returned.
- burnin is the length of the burn in phase.
- dist is the list giving the distribution for each node in the network.
- data.dist is the list giving the distribution for each node in the network.
The object `mcmc.out` is of class `mcmcabn`. We can check that we reach the maximum possible posterior DAG score even with a small number of MCMC steps.
......@@ -147,57 +147,57 @@ The object `mcmc.out` is of class `mcmcabn`. We can check that we reach the maxi
data("mcmc_run_asia")
#maximum score get from the MCMC sampler
max(mcmc.out$scores)
max(mcmc.out.asia$scores)
#maximum scoring network using exact search (not MCMC based)
dag <- mostprobable(score.cache = bsc.compute)
fitabn(dag.m = dag,data.df = asia,data.dists = dist)$mlik
dag <- mostprobable(score.cache = bsc.compute.asia)
fitabn(dag.m = dag,data.df = asia,data.dists = dist.asia)$mlik
```
One can plot the output using the *plot()*. On the graph below, one can see the trace plot of the posterior structure score. The dashed red line is the maximum reached score (as expected = -11151). The colored dot on the trace plot indicate when different methods have been used. The densities on the left represent the relative occuring frequencies of the methods.
```{r}
plot(mcmc.out)
plot(mcmc.out.asia)
```
One can also plot the cumulative best score over the MCMC steps. On this graph one can see the cumulated maximum posterior structure score (as expected = -11151). The dashed red line is the maximum reached score. The colored dot on the trace plot indicate when different methods have been used.
```{r}
plot(mcmc.out,max.score = TRUE)
plot(mcmc.out.asia,max.score = TRUE)
```
On can also plot the summary of the MCMC run:
```{r}
summary(mcmc.out)
summary(mcmc.out.asia)
```
The chosen number of MCMC steps is certainly not sufficient to draw any valid conclusion form the posterior sample. But one can still query it. Below is displayed the individual frequencies of the arcs.
```{r}
query(mcmcabn = mcmc.out)
query(mcmcabn = mcmc.out.asia)
```
The `query()` function recognize fomula statements. So it is possible to explicitelly query: **what is the probability of LungCancer node being children of the Smoking node?**
```{r}
query(mcmcabn = mcmc.out,formula = ~LungCancer|Smoking)
query(mcmcabn = mcmc.out.asia, formula = ~LungCancer|Smoking)
```
This means that an arrow from Smoking to LungCancer appears in `r query(mcmcabn = mcmc.out,formula = ~LungCancer|Smoking)*100`% of the sampled DAGs.
This means that an arrow from Smoking to LungCancer appears in `r query(mcmcabn = mcmc.out.asia,formula = ~LungCancer|Smoking)*100`% of the sampled DAGs.
One can also ask more complicated strctural requests. If want to know **what is the probability of Smoking node being parent of both LungCancer and Bronchitis node?**
```{r}
query(mcmcabn = mcmc.out,formula = ~ LungCancer|Smoking+Bronchitis|Smoking)
query(mcmcabn = mcmc.out.asia, formula = ~ LungCancer|Smoking+Bronchitis|Smoking)
```
It means that an arrow from Smoking to LungCancer **and** an arrow from Smoking to Bronchitis appears in `r query(mcmcabn = mcmc.out,formula = ~ LungCancer|Smoking+Bronchitis|Smoking)*100`% of the sampled DAGs. This probability cannot be read in the first matrix above. Indeed, it is a joint statement.
It means that an arrow from Smoking to LungCancer **and** an arrow from Smoking to Bronchitis appears in `r query(mcmcabn = mcmc.out.asia,formula = ~ LungCancer|Smoking+Bronchitis|Smoking)*100`% of the sampled DAGs. This probability cannot be read in the first matrix above. Indeed, it is a joint statement.
If one want to query positive and negative statements such as **What is the probability of previous statement when there is no arc from Smoking to Tuberculosis and from Bronchitis to XRay?**
```{r}
query(mcmcabn = mcmc.out,formula = ~LungCancer|Smoking+Bronchitis|Smoking-Tuberculosis|Smoking-XRay|Bronchitis)
query(mcmcabn = mcmc.out.asia ,formula = ~LungCancer|Smoking+Bronchitis|Smoking-Tuberculosis|Smoking-XRay|Bronchitis)
```
Essentially zero!
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment