Commit fb4b59d9 authored by Gilles Kratzer's avatar Gilles Kratzer
Browse files

vignette

parent 8e462df2
Pipeline #1568 passed with stage
in 3 seconds
^Meta$
^doc$
^docs$
^_pkgdown\.yml$
^.*\.Rproj$
......@@ -6,4 +8,4 @@
^\.gitlab-ci\.yml$
^NEWS.md$
^README.md$
^mcmcabn_0.1.tar.gz$
\ No newline at end of file
^mcmcabn_0.1.tar.gz$
Meta
doc
.Rproj.user
.Rhistory
.RData
......@@ -4,7 +4,7 @@ plot.mcmcabn
)
importFrom(gRbase, topoSortMAT)
importFrom(stats,rbinom)
importFrom(stats,rbinom, acf)
importFrom(ggplot2,ggplot)
importFrom(coda,effectiveSize)
import(abn)
......
......@@ -95,22 +95,36 @@ for (j in order.child) {
#from score take out parent + descendant
sc.tmp <- sc[score.cache$children==j,]
sc.tmp <- sc.tmp[sc.tmp[,selected.node]==1,]
#print(class(sc.tmp))
# remove descendents of i
if(is.null(ncol(sc.tmp))){
if(!is.null(descendents.j)){
if(length(descendents.j)==1){
sc.tmp <- sc.tmp[sc.tmp[descendents.j]==0]
}else{
sc.tmp <- sc.tmp[sum(sc.tmp[descendents.j]==0)==length(descendents.j)]}
}
}else{
if(!is.null(descendents.j)){
if(length(descendents.j)==1){
sc.tmp <- sc.tmp[sc.tmp[,descendents.j]==0,]
}else{
sc.tmp <- sc.tmp[rowSums(sc.tmp[,descendents.j]==0)==length(descendents.j),]}
}
}
if(nrow(sc.tmp)>0 || is.vector(sc.tmp)){
if(!identical(unname(sc.tmp[length(sc.tmp)]), numeric(0))){
if(is.vector(sc.tmp)){
new.parent.j <- sc.tmp
new.parent.j<-new.parent.j[-length(new.parent.j)]
#store partition function
#print(sc.tmp[length(sc.tmp)])
score.G[j] <- sc.tmp[length(sc.tmp)]
}else{
......@@ -122,7 +136,7 @@ for (j in order.child) {
}
dag.G.1[j,] <- new.parent.j
}
}}
}}}
dag.MBR <- dag.G.1
......@@ -198,16 +212,30 @@ if(!is.character(descendents.j)){
sc.tmp <- sc.tmp[sc.tmp[,selected.node]==1,]
# remove descendents of j
if(!is.null(descendents.j)){
if(length(descendents.j)==1){
sc.tmp <- sc.tmp[sc.tmp[,descendents.j]==0,]
}else{
sc.tmp <- sc.tmp[rowSums(sc.tmp[,descendents.j]==0)==length(descendents.j),]}
if(is.null(ncol(sc.tmp))){
if(!is.null(descendents.j)){
if(length(descendents.j)==1){
sc.tmp <- sc.tmp[sc.tmp[descendents.j]==0]
}else{
sc.tmp <- sc.tmp[sum(sc.tmp[descendents.j]==0)==length(descendents.j)]}
}
}else{
if(!is.null(descendents.j)){
if(length(descendents.j)==1){
sc.tmp <- sc.tmp[sc.tmp[,descendents.j]==0,]
}else{
sc.tmp <- sc.tmp[rowSums(sc.tmp[,descendents.j]==0)==length(descendents.j),]}
}
}
dag.G.1[j,] <- dag.tmp[j,]
score.G.prime[j] <- sum(sc.tmp[,ncol(sc.tmp)])
if(is.null(ncol(sc.tmp))){
score.G.prime[j] <- sum(sc.tmp[length(sc.tmp)])
}else{
score.G.prime[j] <- sum(sc.tmp[,ncol(sc.tmp)])}
}}}
......
......@@ -31,13 +31,24 @@ sc.tmp <- sc[score.cache$children==selected.edge[2],]
sc.tmp <- sc.tmp[sc.tmp[,selected.edge[1]]==1,]
# remove descendents of i
if(!is.null(descendents.M.dot.i)){
if(length(descendents.M.dot.i)==1){
sc.tmp <- sc.tmp[sc.tmp[,descendents.M.dot.i]==0,]
}else{
sc.tmp <- sc.tmp[rowSums(sc.tmp[,descendents.M.dot.i]==0)==length(descendents.M.dot.i),]}
if(is.null(ncol(sc.tmp))){
if(!is.null(descendents.M.dot.i)){
if(length(descendents.M.dot.i)==1){
sc.tmp <- sc.tmp[sc.tmp[descendents.M.dot.i]==0]
}else{
sc.tmp <- sc.tmp[sum(sc.tmp[descendents.M.dot.i]==0)==length(descendents.M.dot.i)]}
}
}else{
if(!is.null(descendents.M.dot.i)){
if(length(descendents.M.dot.i)==1){
sc.tmp <- sc.tmp[sc.tmp[,descendents.M.dot.i]==0,]
}else{
sc.tmp <- sc.tmp[rowSums(sc.tmp[,descendents.M.dot.i]==0)==length(descendents.M.dot.i),]}
}
}
#sample new parent set
if(nrow(sc.tmp)>0 || is.vector(sc.tmp)){
......@@ -105,11 +116,25 @@ n.edges.tilde <- sum(dag.M.tilde)
sc.tmp <- sc[score.cache$children==selected.edge[1],]
sc.tmp <- sc.tmp[sc.tmp[,selected.edge[2]]==1,]
# remove descendents of i
if(is.vector(sc.tmp)){
if(!is.null(descendents.M.dot.j)){
if(length(descendents.M.dot.j)==1){
sc.tmp <- sc.tmp[sc.tmp[,descendents.M.dot.j]==0,]
sc.tmp <- sc.tmp[sc.tmp[descendents.M.dot.j]==0] #
}else{
sc.tmp <- sc.tmp[rowSums(sc.tmp[,descendents.M.dot.j]==0)==length(descendents.M.dot.j),]}
#print(class(sc.tmp))
#print(descendents.M.dot.j)
sc.tmp <- sc.tmp[sum(sc.tmp[descendents.M.dot.j]==0)==length(descendents.M.dot.j)]}
}}else{
if(!is.null(descendents.M.dot.j)){
if(length(descendents.M.dot.j)==1){
sc.tmp <- sc.tmp[sc.tmp[,descendents.M.dot.j]==0,]
}else{
#print(class(sc.tmp))
#print(descendents.M.dot.j)
sc.tmp <- sc.tmp[rowSums(sc.tmp[,descendents.M.dot.j]==0)==length(descendents.M.dot.j),]}
}
}
if(is.vector(sc.tmp)){
......
......@@ -181,3 +181,4 @@ is.integer0 <- function(x)
{
is.integer(x) && length(x) == 0L
}
......@@ -20,7 +20,7 @@ plot.mcmcabn <- function(x,
geom_line(alpha = 0.5) +
geom_hline(yintercept = max.score., linetype="dashed", color = "red", alpha = 0.8) +
geom_text(aes(0,max.score.,label = round(max.score., digits = 2), vjust = -.5),color="red")+
geom_point(data = dta[dta$method %in% c("REV", "MBR"),],aes_string(color = factor("method")))+
geom_point(data = dta[dta$method %in% c("REV", "MBR"),],aes_string(color = "factor(method)"))+
#geom_point(aes(color=as.factor(method)))+
labs(x = "DAG index", y = "DAG scores",colour="Methods",title = "Trace plot") + theme_pubr() +
scale_color_manual(values = c("#F2C500", "#56B4E9"))
......@@ -46,7 +46,7 @@ p<-ggplot(data = dta,aes_string(x = "X",y = "cummax"))+
geom_line() +
geom_hline(yintercept = max.score., linetype="dashed", color = "red", alpha = 0.8) +
geom_text(aes(0,max.score.,label = round(max.score., digits = 2), vjust = -.5),color="red") +
geom_point(data = dta[dta$method %in% c("REV", "MBR"),],aes_string(color = factor("method")))+
geom_point(data = dta[dta$method %in% c("REV", "MBR"),],aes_string(color = "factor(method)"))+
theme_pubr() +
scale_colour_manual(values = c("#F2C500", "#56B4E9")) +
labs(x = "DAG index", y = "DAG scores",color="Methods",title = "Trace plot")
......
......@@ -88,7 +88,7 @@ query <- function(mcmcabn = NULL,
#creat array
n <- length(mcmcabn$scores)
n.var <- length(dist)
n.var <- length(mcmcabn$dist)
m.array <- array(data = 0,dim = c(n.var,n.var,n))
for(i in 1:n.var){
for(j in 1:n.var){
......
......@@ -5,7 +5,7 @@
## :
###############################################################################
summary.mcmcabn <- function(object, quantiles = c(0.025, 0.25, 0.5, 0.75, 0.975), ...){
summary.mcmcabn <- function(object, quantiles = c(0.025, 0.25, 0.5, 0.75, 0.975),lag.max = 10, ...){
cat("MCMC summary:\n",sep = "")
......@@ -20,12 +20,20 @@ summary.mcmcabn <- function(object, quantiles = c(0.025, 0.25, 0.5, 0.75, 0.975)
cat("Quantiles of the posterior network score:\n")
out2<-matrix(data = quantile(x = object$scores,probs = quantiles),nrow = 1,ncol = length(quantiles),dimnames = list("BN score", quantiles))
out2 <- matrix(data = quantile(x = object$scores,probs = quantiles),nrow = 1,ncol = length(quantiles),dimnames = list("BN score", quantiles))
print(out2, ...)
cat("\n\nAcceptance rate: ", 1-mean(object$rejection),"\n",sep = "")
cat("Sample size adjusted for autocorrelation: ", unname(effectiveSize(object$scores)),"\n",sep = "")
unname(acf(object$scores, lag.max = 10, plot = FALSE))
cat("\nAutocorrelations by lag:\n")
out2 <- matrix(data = acf(mcmc.out$scores, lag.max = 10, plot = FALSE)$acf,nrow = 1,ncol = (lag.max+1),dimnames = list("acf",acf(mcmc.out$scores, lag.max = 10, plot = FALSE)$lag))
print(out2, ...)
}#EOF
......@@ -19,9 +19,17 @@ knitr::opts_chunk$set(
___
## mcmcabn: Flexible implementation of a structural MCMC sampler for DAGs
# mcmcabn: An R Package for sampling DAGs using structural MCMC
Flexible implementation of a structural MCMC sampler for Directed Acyclic Graphs (DAGs). It supports the new edge reversal move from Grzegorczyk and Husmeier (2008) and the Markov blanket resampling from Su and Borsuk (2016). It supports three priors: the one from Koivisto and Sood (2004), an uninformative prior and a user-defined prior. The three main problems that can be addressed by this R package are selecting the most probable structure based on a cache of pre-computed scores, controlling for overfitting and sampling the landscape of high scoring structures, which can be used as an alternative to parametric bootstrapping and allows the end user to perform model averaging with DAGs.
## Installation
You can install `mcmcabn` with:
``` r
install.packages("mcmcabn")
```
mcmcabn is a flexible implementation of a structural MCMC sampler for Directed Acyclic Graphs (DAGs). It supports the new edge reversal move from Grzegorczyk and Husmeier (2008) and the Markov blanket resampling from Su and Borsuk (2016). It supports three priors: the one from Koivisto and Sood (2004), an uninformative prior and a user-defined prior. The three main problems that can be addressed by this R package are selecting the most probable structure based on a cache of pre-computed scores, controlling for overfitting and sampling the landscape of high scoring structures, which can be used as an alternative to parametric bootstrapping and allows the end user to perform model averaging with DAGs.
___
## What's New
......
# mcmcabn (PUBLIC)
!!! UNSTABLE VERSION !!!
# mcmcabn: An R Package for sampling DAGs using structural MCMC
(PUBLIC) !!! UNSTABLE VERSION !!!
mcmcabn is a one-man show (me!) and is made of more than 10000 lines of code which are not bug-free! So use it with caution and awareness.
......
No preview for this file type
......@@ -8,6 +8,7 @@ vignette: >
%\VignetteEngine{knitr::rmarkdown}
%\usepackage[utf8]{inputenc}
bibliography: bib_mcmcabn.bib
link-citations: yes
---
```{r setup, include = FALSE, cache = FALSE}
......@@ -28,23 +29,23 @@ options(digits = 3)
The three main problems addressed by this R package are:
1. selecting the most probable structure based on a cache of pre-computed scores
2. controlling for overfitting
3. sampling the landscape of high scoring networks which can be used as an alternative to parametric bootstrapping.
1. selecting the most probable structure based on a cache of pre-computed scores.
2. controlling for overfitting.
3. sampling the landscape of high scoring structures.
The latter could be very useful from an applied perspective to avoid reducing the richness of Bayesian network modelling to only **one** structure!
The latter could be very useful from an applied perspective to avoid reducing the richness of Bayesian network modelling to only **one** structure. Indeed, it allows the user to quantify the marginal impact of relationships of interest by marginalising out over structures or nuisance dependencies. Structural MCMC seems a very elegant and natural way to estimate the *true* marginal impact, so one can determine if its magnitude is big enough to be considered a worthwhile intervention.
**How does it work?**
The *mcmcabn* R package takes a cache of pre-computed scores as input (possibly computed using the **buildscorecache()** function from the *abn* R package). Then *mcmcabn* generates Markov chain Monte Carlo (MCMC) samples from the posterior distribution of the most supported Directed Acyclic Graphs (DAGs). To do so, the user should choose a structural prior and a learning scheme. *mcmcabn* is equipped with multiple structural priors: the so-called Koivisto prior, an uninformative prior, the Ellis prior, a modular flat prior and a possibly user-defined prior, and three algorithms for inferring the structure of Bayesian networks: the classical Monte Carlo Markov Chain Model Choice (MC^3), the new edge reversal (REV) from Grzegorczyk and Husmeier (2008) and the Markov blanket resampling (MBR) from Su and Borsuk (2016).
The `mcmcabn` R package takes a cache of pre-computed scores as input (possibly computed using the `buildscorecache()` function from the `abn` R package). Then the `mcmcabn()` function generates Markov chain Monte Carlo (MCMC) samples from the posterior distribution of the most supported Directed Acyclic Graphs (DAGs). To do so, the user should choose a structural prior and a learning scheme. `mcmcabn` is equipped with multiple structural priors: the so-called Koivisto prior, an uninformative prior, the Ellis prior, a modular flat prior and a possibly user-defined prior, and three algorithms for inferring the structure of Bayesian networks: the classical Monte Carlo Markov Chain Model Choice ($MC^3$), the new edge reversal (REV) from Grzegorczyk and Husmeier (2008) and the Markov blanket resampling (MBR) from Su and Borsuk (2016).
**What are the R package functionalities?**
The workhorse of the R package is the *mcmc()* function. It generates MCMC samples from the posterior distribution of the data-supported DAGs. This object can be summarized with a comprehensive verbal description or a plot. From those samples one can either compute the most probable structure and plot the cumulative maximum score achieved, or query it using the *query()* R function with a formula statement. For example, what is the probability of having an arc between two nodes, of one node being in the Markov blanket of another, or what is the probability of **not** having an arc between two nodes?
The workhorse of the R package is the `mcmcabn()` function. It generates MCMC samples from the posterior distribution of the data-supported DAGs. This object can be summarized with a comprehensive verbal description or a plot. From those samples one can either compute the most probable structure and plot the cumulative maximum score achieved, or query it using the `query()` R function with a formula statement. For example, what is the probability of having an arc between two nodes, of one node being in the Markov blanket of another, or what is the probability of **not** having an arc between two nodes? Alternatively, one can query the frequency of an arc in one direction relative to the opposite direction.
**What is the structure of this document?**
We first illustrate the package with a simple example. In a second step we compare the output of the main function *mcmc()* with alternative approaches. Then some details are provided. For a full description of the technical details we refer to the original publication [XXX].
We first illustrate the package with a simple example. Then some more theoretical details are provided. For a full description of the technical details we refer to the original publication [XXX].
# Simple *mcmcabn* example
......@@ -60,13 +61,13 @@ Once installed, the *mcmcabn* R package can be loaded using:
library(mcmcabn)
```
Let us start with a ranking example from the *bnlearn* R package [XXX]. The first example is about a small synthetic dataset from Lauritzen and Spiegelhalter (1988) about lung diseases (tuberculosis, lung cancer or bronchitis) and visits to Asia.
Let us start with a ranking example from the `bnlearn` R package [XXX]. The first example is about a small synthetic dataset from Lauritzen and Spiegelhalter (1988) about lung diseases (tuberculosis, lung cancer or bronchitis) and visits to Asia.
First things first, one needs to precompute a cache of scores. We use the R package **abn** to do it.
First things first, one needs to precompute a cache of scores. We use the R package `abn` to do it.
```{r, warning = FALSE, message = FALSE}
library(bnlearn) #for the dataset
library(abn)
library(abn) #to precompute the scores
library(ggplot2) #plotting
library(ggpubr) #plotting
library(cowplot) #plotting
......@@ -95,7 +96,7 @@ dag <- dagify(Tuberculosis~Asia,
Either~LungCancer,
Dyspnea~Bronchitis)
ggdag_classic(dag) + theme_dag_blank()
ggdag_classic(dag,size = 6) + theme_dag_blank()
```
......@@ -109,7 +110,7 @@ bsc.compute <- buildscorecache(data.df = asia,
max.parents = 2)
```
Let us use the *mcmc()* function. One needs to define the score used (here it is the marginal likelihood `mlik`), and the maximum number of parents per node (same as the one used in *buildscorecache()*). The MCMC learning scheme: number of MCMC samples, number of thinned samples (to avoid autocorrelation) and the length of the burn-in phase. Here we choose: 500 returned MCMC steps and 10 thinned steps in between. This means that 5000 = 10*500 total MCMC steps have been performed. The burn-in phase is 500 steps. We decide to initiate the algorithm with a random DAG `start.dag = "random"`. This means that the function will randomly select a valid DAG. In this context, valid means no cycle and an appropriate maximum number of parents. The probabilities for the REV and the MBR jumps are set to 5%. It is usually a good idea to keep those probabilities low. Indeed, REV and MBR are computationally costly. We select the Koivisto prior (`prior.choice = 2`).
Let us use the `mcmcabn()` function. One needs to define the score used (here it is the marginal likelihood `mlik`), and the maximum number of parents per node (same as the one used in `buildscorecache()`). The MCMC learning scheme: number of MCMC samples, number of thinned samples (to avoid autocorrelation) and the length of the burn-in phase. Here we choose: 1000 returned MCMC steps and 99 thinned steps in between. This means that for every returned MCMC move, 99 have been thinned. The burn-in phase is 10k steps. We decide to initiate the algorithm with a random DAG `start.dag = "random"`. This means that the function will randomly select a valid DAG. In this context, valid means no cycle and an appropriate maximum number of parents. The probabilities for the REV and the MBR jumps are set to 3%. It is usually a good idea to keep those probabilities low. Indeed, REV and MBR are efficient in producing high scoring structures but computationally costly compared to classical MCMC jumps. We select the Koivisto prior (`prior.choice = 2`).
Let us run this search ...
......@@ -118,27 +119,32 @@ mcmc.out <- mcmcabn(score.cache = bsc.compute,
score = "mlik",
data.dists = dist,
max.parents = 2,
mcmc.scheme = c(10000,9,1000),
seed = 6,
mcmc.scheme = c(1000,99,10000),
seed = 42,
verbose = FALSE,
start.dag = "random",
prob.rev = 0.005,
prob.mbr = 0.005,
prior.choice = 1)
prob.rev = 0.03,
prob.mbr = 0.03,
prior.choice = 2)
```
The function *mcmcabn()* returns a list with multiple entries: "dags", "scores", "alpha" and "method".
The function `mcmcabn()` returns a list with multiple entries: "dags", "scores", "alpha", "method", "rejection", "iteration", "thinning", "burnin", "dist".
- dags: is the list of sampled DAGs.
- scores is the list of DAGs scores.
- alpha is the acceptance probability of the Metropolis-Hasting sampler.
- method is the algorithm chosen for the index MCMC step.
- rejection if a MCMC jumps is rejected or not.
- iterations is the total number of MCMC iterations.
- thinning is the number of thinned MCMC moves for one returned.
- burnin is the length of the burn in phase.
- dist is the list giving the distribution for each node in the network.
The object `mcmc.out` is of class `mcmcabn`. We can check that we reach the maximum possible posterior DAG score even with a small number of MCMC steps.
```{r}
##to speed up building of the vignette, we store the output
load(file = "../data/mcmc_run_asia.rda")
data("mcmc_run_asia")
#maximum score get from the MCMC sampler
max(mcmc.out$scores)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment