Commit 1229a3f8 authored by Gilles Kratzer's avatar Gilles Kratzer
Browse files

Initial commit

parents
^.*\.Rproj$
^\.Rproj\.user$
.Rproj.user
.Rhistory
.RData
Package: mcmcabn
Title: Flexible Implementation of an MCMC Sampler for DAGs
Version: 0.0.0.9000
Authors@R: c(person("Gilles", "Kratzer", role = c("aut", "cre"),
email = "gilles.kratzer@math.uzh.ch",
comment = c(ORCID = "0000-0002-5929-8935")),
person("Reinhard", "Furrer", role = c("ctb"),
email = "reinhard.furrer@math.uzh.ch",
comment = c(ORCID = "0000-0002-6319-2332")))
Author: Gilles Kratzer [aut, cre] (<https://orcid.org/0000-0002-5929-8935>),
Reinhard Furrer [ctb] (<https://orcid.org/0000-0002-6319-2332>)
Maintainer: Gilles Kratzer <gilles.kratzer@math.uzh.ch>
Description: Flexible implementation of an MCMC sampler for DAGs. It supports the new edge reversal move from Grzegorczyk and Husmeier (2008) <doi:10.1007/s10994-008-5057-7> and the Markov blanket resampling from Su and Borsuk (2016).
Depends: R (>= 3.5.1)
License: GPL-3
Encoding: UTF-8
LazyData: true
Imports: gRbase, abn
RoxygenNote: 6.1.0
export(mcmc,
REV,
mc3)
importFrom(gRbase, topoSortMAT)
importFrom(stats,rbinom)
\ No newline at end of file
## REV: single edge-reversal MCMC move in the spirit of
## Grzegorczyk & Husmeier (2008). Draws a reversible arc (a, b), orphans
## both endpoints, re-samples their parent sets from the score cache, and
## accepts or rejects the resulting DAG via a Metropolis-Hastings step.
##
## Arguments:
##   n.var       - number of nodes in the DAG
##   dag.tmp     - current adjacency matrix; row i is treated throughout as
##                 node i's parent-set indicator vector -- TODO confirm
##                 orientation against topoSortMAT's convention
##   max.parents - maximum allowed number of parents per node
##   sc          - score cache laid out as a matrix: n.var parent-indicator
##                 columns followed by one score column (log scale)
##   score.cache - cache object; score.cache$children gives, for each row
##                 of sc, the child node that row refers to
##   score       - score of the current DAG (returned unchanged on reject)
##   prior       - structural prior selector (2 = Koivisto parent-set prior)
##   verbose     - if TRUE, print a message when the move is accepted
##
## Returns list(dag.tmp, score, alpha): the (possibly updated) DAG, its
## score, and the acceptance probability of the proposal.
REV<-function(n.var, dag.tmp, max.parents, sc,score.cache, score, prior, verbose){
#neighbours.list <- NULL
## Log-sum-exp: log(sum(exp(x))) computed stably by factoring out max(x).
sumlogs<-function(x){
xprime<-sort(x,decreasing = TRUE)
n<-length(xprime)
out1<-0
if(n>1){
for(i in 2:n){
out1<-out1+exp(1)**(xprime[i]-xprime[1])
}}
out<-xprime[1]+log(1+out1)
return(out)
}
## alpha stays 0 (certain rejection) when no reversible arc exists; this
## also keeps the then-undefined dag.gprime/score.Gprime from being read
## in the acceptance block at the bottom (rbinom with prob = 0 returns 0).
alpha <- 0
indices <- NULL
## Enumerate every arc (a, b) whose reversal keeps the graph acyclic and
## respects the parent limit on node b; store candidate (a, b) pairs.
for(a in 1:n.var){
for(b in 1:n.var){
if(dag.tmp[a,b]==1 & sum(dag.tmp[b,])<=(max.parents-1)){
tmp <- dag.tmp
tmp[a,b] <- 0
tmp[b,a] <- 1
## topoSortMAT returns character(0) when the matrix has a cycle
if(!identical(topoSortMAT(amat = tmp,index = FALSE),character(0))){
#neighbours.list[[length(neighbours.list)+1]] <- tmp
indices[[length(indices)+1]] <- c(a,b)
}
}
}
}
if(length(indices)!=0){
##step1: draw one candidate arc uniformly at random
n.G <- length(indices)
mcmc.move <- sample(x = 1:n.G,size = 1,replace = FALSE)
dag.gprime <- dag.tmp
## Partition functions of the reverse move (all quantities on log scale)
##Q1: cache rows for child a, restricted to parent sets that contain b
sc.tmp <- sc[score.cache$children==indices[[mcmc.move]][1],]
sc.tmp <- sc.tmp[sc.tmp[,indices[[mcmc.move]][2]]==1,]
#Q1.tmp <- sc.tmp[which(apply(sc.tmp, 1, function(x) identical(unname(x[1:n.var]), unname(dag.gprime[indices[[mcmc.move]][1],])))),ncol(sc.tmp)]
#Q.G1 <- Q1.tmp/sum(sc.tmp[,ncol(sc.tmp)])
Q.G1 <- sumlogs(sc.tmp[,ncol(sc.tmp)])
##Q2: all cache rows for child b
sc.tmp <- sc[score.cache$children==indices[[mcmc.move]][2],]
#Q2.tmp <- sc.tmp[which(apply(sc.tmp, 1, function(x) identical(unname(x[1:n.var]), unname(dag.gprime[indices[[mcmc.move]][2],])))),ncol(sc.tmp)]
Q.G2 <- sumlogs(sc.tmp[,ncol(sc.tmp)])
##Orphaning: N.G = arc count of G; then clear the parent sets of a and b
N.G <- sum(dag.tmp)
dag.gprime[indices[[mcmc.move]][1],] <- dag.gprime[indices[[mcmc.move]][2],] <- rep(0,n.var)
##step2: re-sample b's parent set among cached sets that contain a
sc.tmp <- sc[score.cache$children==indices[[mcmc.move]][2],]
sc.tmp <- sc.tmp[sc.tmp[,indices[[mcmc.move]][1]]==1,]
d.tmp <- dag.gprime
d.list.tmp <- NULL
s.list.tmp <-NULL
##test all b -> a : keep only candidate parent sets that leave G' acyclic
for(i in 1:dim(sc.tmp)[1]){
d.tmp[indices[[mcmc.move]][2],] <- sc.tmp[i,-ncol(sc.tmp)]
if(!identical(topoSortMAT(amat = d.tmp,index = FALSE),character(0))){
d.list.tmp[[length(d.list.tmp)+1]] <- d.tmp
s.list.tmp[[length(s.list.tmp)+1]] <- sc.tmp[i,ncol(sc.tmp)]
}}
## NOTE(review): unlike step3 below there is no is.null guard here; if every
## candidate is cyclic, sample(1:0, ...) misbehaves -- TODO confirm reachable.
## NOTE(review): step2 indexes the *filtered* list d.list.tmp, yet is used
## below to index rows of the unfiltered sc.tmp; when rows were dropped the
## chosen row may not match the sampled DAG -- TODO confirm.
## NOTE(review): prob = 1/|log score| is not proportional to the posterior;
## presumably a deliberate heuristic weighting -- verify against the paper.
step2 <- sample(x = 1:length(d.list.tmp),size = 1,replace = FALSE,prob = 1/abs(s.list.tmp))
##end step2: install the sampled parent set for b
dag.gprime[indices[[mcmc.move]][2],] <- sc.tmp[step2,-ncol(sc.tmp)]
Q.Gprime1 <- sumlogs(sc.tmp[,ncol(sc.tmp)])
##step3: re-sample a's parent set among all cached sets for a
sc.tmp <- sc[score.cache$children==indices[[mcmc.move]][1],]
d.tmp <- dag.gprime
d.list.tmp <- NULL
s.list.tmp <- NULL
##test all b -> a : keep only acyclic candidates, as in step2
for(i in 1:dim(sc.tmp)[1]){
d.tmp[indices[[mcmc.move]][1],] <- sc.tmp[i,-ncol(sc.tmp)]
if(!identical(topoSortMAT(amat = d.tmp,index = FALSE),character(0))){
d.list.tmp[[length(d.list.tmp)+1]] <- d.tmp
s.list.tmp[[length(s.list.tmp)+1]] <- sc.tmp[i,ncol(sc.tmp)]
}}
## Guard: if no acyclic candidate remains, alpha stays 0 and we reject.
if(!is.null(s.list.tmp)){
## NOTE(review): same filtered-index vs sc.tmp row mismatch as in step2.
step3 <- sample(x = 1:length(d.list.tmp),size = 1,replace = FALSE,prob = 1/abs(s.list.tmp))
##endstep3: install the sampled parent set for a
dag.gprime[indices[[mcmc.move]][1],] <- sc.tmp[step3,-ncol(sc.tmp)]
Q.Gprime2 <- sumlogs(sc.tmp[,ncol(sc.tmp)])
N.Gprime <- sum(dag.gprime)
##output: accumulate per-node priors and cached scores of G and G'
score.G <- score.Gprime <- 0
prior.G <- prior.Gprime <- 1
for(a in 1:n.var){
#Koivisto priors: 1/choose(n-1, #parents) multiplied over nodes
if(prior==2){
prior.G <- prod(1/choose(n = (n.var-1),k = sum(dag.tmp[a,])),prior.G)
prior.Gprime <- prod(1/choose(n = (n.var-1),k = sum(dag.gprime[a,])),prior.Gprime)
}
## Look up the cached score of node a's exact parent set in each DAG
sc.tmp <- sc[score.cache$children==a,]
score.G <- sum(min(sc.tmp[(apply(sc.tmp, 1, function(x) identical(unname(x[1:n.var]), unname(dag.tmp[a,])))),n.var+1]),score.G)
score.Gprime <- sum(min(sc.tmp[(apply(sc.tmp, 1, function(x) identical(unname(x[1:n.var]), unname(dag.gprime[a,])))),n.var+1]),score.Gprime)
}
score <- score.G
#print(min(exp(log(prior.G) + log(N.G) + Q.G2 + Q.G1 - (log(prior.Gprime) + log(N.Gprime) + Q.Gprime2 + Q.Gprime1)),1))
#alpha<-1
## Acceptance probability: ratio of (prior x arc count x partition terms)
## of G over G', capped at 1; all terms combined on the log scale.
alpha <- min(exp((log(prior.G) + log(N.G) + Q.G2 + Q.G1) - (log(prior.Gprime) + log(N.Gprime) + Q.Gprime2 + Q.Gprime1)),1)
}}#if tests!
## Bernoulli accept/reject with probability alpha
if(rbinom(n = 1,size = 1,prob = alpha)==1){
dag.tmp <- dag.gprime
if(verbose)cat("REV move\n");
score <- score.Gprime
}
return(list(dag.tmp=dag.tmp, score=score, alpha=alpha))
}
## mc3: single Metropolis-Hastings structure move (MC^3, Madigan & York
## 1995). Proposes one DAG uniformly from the one-arc neighbourhood of the
## current DAG (arc deletion, acyclic addition, acyclic reversal) and
## accepts it with the usual MH probability including the Hastings
## correction for unequal neighbourhood sizes.
##
## Arguments:
##   n.var       - number of nodes in the DAG
##   dag.tmp     - current adjacency matrix; row i is node i's parent set
##   max.parents - maximum allowed number of parents per node
##   sc          - score cache: n.var parent-indicator columns + one score
##                 column (log scale)
##   score.cache - cache object; score.cache$children maps rows of sc to
##                 their child node
##   score       - score of the current DAG (returned unchanged on reject)
##   prior       - structural prior selector (2 = Koivisto parent-set prior)
##   verbose     - if TRUE, print a message when the move is accepted
##
## Returns list(dag.tmp, score, alpha).
mc3 <- function(n.var, dag.tmp, max.parents, sc, score.cache, score, prior, verbose) {
  ## One-arc neighbourhood of a DAG g: every arc deletion, plus every arc
  ## addition and arc reversal that keeps the graph acyclic and respects
  ## the parent limit. (Deletion cannot create a cycle, so no check there.)
  neighbourhood <- function(g) {
    nb <- list()
    ## Removal of an arc
    for (a in 1:n.var) {
      for (b in 1:n.var) {
        if (g[a, b] == 1) {
          tmp <- g
          tmp[a, b] <- 0
          nb[[length(nb) + 1]] <- tmp
        }
      }
    }
    ## Addition of an arc
    for (a in 1:n.var) {
      for (b in 1:n.var) {
        if ((g[a, b] == 0 & g[b, a] == 0) & sum(g[a, ]) <= (max.parents - 1)) {
          tmp <- g
          tmp[a, b] <- 1
          ## topoSortMAT returns character(0) when the matrix has a cycle
          if (!identical(topoSortMAT(amat = tmp, index = FALSE), character(0))) {
            nb[[length(nb) + 1]] <- tmp
          }
        }
      }
    }
    ## Reversal of an arc
    for (a in 1:n.var) {
      for (b in 1:n.var) {
        if (g[a, b] == 1 & sum(g[b, ]) <= (max.parents - 1)) {
          tmp <- g
          tmp[a, b] <- 0
          tmp[b, a] <- 1
          if (!identical(topoSortMAT(amat = tmp, index = FALSE), character(0))) {
            nb[[length(nb) + 1]] <- tmp
          }
        }
      }
    }
    nb
  }

  ## Neighbourhood of the current DAG G and a uniformly drawn proposal G'
  neighbours.list <- neighbourhood(dag.tmp)
  n.G <- length(neighbours.list)
  mcmc.move <- sample(x = 1:n.G, size = 1, replace = FALSE)
  dag.gprime <- neighbours.list[[mcmc.move]]

  ## Neighbourhood size of the proposal G'.
  ## BUGFIX: the original enumerated this neighbourhood testing the
  ## addition/reversal conditions on dag.tmp (the current DAG) while
  ## mutating copies of dag.gprime, so n.Gprime — and hence the Hastings
  ## correction — was wrong. The neighbourhood of G' is now built from G'.
  n.Gprime <- length(neighbourhood(dag.gprime))

  ## Per-node priors and cached scores of G and G'
  prior.G <- prior.Gprime <- 1
  score.G <- score.Gprime <- 0
  for (a in 1:n.var) {
    ## Koivisto parent-set prior: 1/choose(n-1, #parents), over all nodes
    if (prior == 2) {
      prior.G <- prod(1 / choose(n = (n.var - 1), k = sum(dag.tmp[a, ])), prior.G)
      prior.Gprime <- prod(1 / choose(n = (n.var - 1), k = sum(dag.gprime[a, ])), prior.Gprime)
    }
    ## Look up the cached score of node a's exact parent set in each DAG
    sc.tmp <- sc[score.cache$children == a, ]
    score.G <- sum(min(sc.tmp[(apply(sc.tmp, 1, function(x) identical(unname(x[1:n.var]), unname(dag.tmp[a, ])))), n.var + 1]), score.G)
    score.Gprime <- sum(min(sc.tmp[(apply(sc.tmp, 1, function(x) identical(unname(x[1:n.var]), unname(dag.gprime[a, ])))), n.var + 1]), score.Gprime)
  }

  ## MH acceptance probability on the log scale.
  ## BUGFIX: the Hastings correction is |nbd(G)| / |nbd(G')|, i.e.
  ## + log(n.G) - log(n.Gprime); the original had the ratio inverted.
  alpha <- min(exp((score.Gprime + log(prior.Gprime) + log(n.G)) - ((score.G + log(prior.G)) + log(n.Gprime))), 1)
  score <- score.G
  if (rbinom(n = 1, size = 1, prob = alpha) == 1) {
    dag.tmp <- dag.gprime
    score <- score.Gprime
    ## BUGFIX: message now gated on `verbose` (consistent with REV) and
    ## terminated with a newline.
    if (verbose) cat("Regular move\n")
  }
  return(list(dag.tmp = dag.tmp, score = score, alpha = alpha))
}
################################################################################
## heuristic.search.R ---
## Author : Gilles Kratzer
## Document created: 22/10/2018
## -
################################################################################
##-------------------------------------------------------------------------
## MCMC procedure
##
##-------------------------------------------------------------------------
## mcmc: main driver for the structural MCMC search over DAGs.
## At each step it attempts a REV move with probability prob.rev and an
## MC^3 single-arc move otherwise; after mcmc.scheme[3] burn-in steps it
## records one thinned sample (every mcmc.scheme[2] steps) for each of the
## mcmc.scheme[1] retained iterations.
##
## Arguments:
##   score.cache - pre-computed node score cache; must expose $node.defn,
##                 $children and one element per supported score name
##   score       - which cached score to use: "mlik", "bic", "aic" or "mdl"
##                 (NOTE: this argument is later overwritten with the
##                 numeric score of the current DAG)
##   data.dists  - named list of node distributions; only its length
##                 (the number of nodes) is used here
##   max.parents - parent limit forwarded to REV/mc3 and used for the
##                 random start DAG
##   mcmc.scheme - c(#returned, #thinned, #burned)
##   seed        - RNG seed (set.seed is called: a global side effect)
##   verbose     - print progress messages
##   start.dag   - NULL or "random" for a random acyclic start, or an
##                 adjacency matrix to start from
##   dag.retained, dag.banned - accepted but never read in this body --
##                 TODO confirm intended for a later version
##   prob.rev    - probability of attempting a REV move at each step
##   prior       - structural prior selector forwarded to REV/mc3
##
## Returns list(dags, scores, alpha): an n.var x n.var x (m+1) array of
## sampled DAGs (slice 1 = start DAG), their scores, and the acceptance
## probability recorded at the end of each thinning window.
mcmc <- function(score.cache = NULL,
score = "mlik",
data.dists = NULL,
max.parents = 1,
mcmc.scheme = c(100,1000,1000), #returned, #thinned, #burned
seed = 42,
verbose = FALSE,
start.dag = NULL,
dag.retained = NULL,
dag.banned = NULL,
prob.rev = 0.15,
prior = 2){
n.var <- length(data.dists)
##output containers: slot 1 holds the initial state, hence "+1"
out.dags <- array(data = NA,dim = c(n.var,n.var,mcmc.scheme[1]+1))
out.scores <- rep(NA,length = mcmc.scheme[1]+1)
out.alpha <- rep(NA,length = mcmc.scheme[1]+1)
##seeding (global side effect on the session RNG state)
set.seed(seed = seed)
##Initializing matrix
dag.tmp <- matrix(data = 0,nrow = n.var,ncol = n.var)
##start zero matrix
# if(!is.null(start.dag)){
# colnames(dag.tmp) <- rownames(dag.tmp)<-name.dag<-sample(1:n.var)
# }
##start random matrix: fill the strict lower triangle (guarantees
##acyclicity) with at most max.parents ones per row, then apply a random
##node relabelling and reorder back.
if(is.null(start.dag)){start.dag <- "random"}
if(start.dag=="random"){
## candidate pool: max.parents ones among 2*n.var entries, sampled per row
vec.tmp <- c(rep(1,max.parents),rep(0,2*n.var-max.parents))
for(lines in 1:(n.var-1)){
dag.tmp[1+lines,1:lines] <- sample(vec.tmp)[1:lines]
}
colnames(dag.tmp) <- rownames(dag.tmp)<-name.dag<-sample(1:n.var)
dag.tmp<-dag.tmp[order(name.dag),order(name.dag)]
}
## NOTE(review): a user-supplied start.dag is used as-is; the score setup
## below requires numeric-coercible colnames on it -- TODO confirm callers
## always provide them.
if(is.matrix(start.dag)) dag.tmp <- start.dag
##scores: build the cache matrix `sc` with parent-indicator columns
##permuted to match dag.tmp's (relabelled) column order, plus the score
##column. bic/aic/mdl are negated -- presumably stored as penalties so
##that all scores are maximised -- TODO confirm against the abn cache.
if(score %in% c("bic", "aic", "mdl")){
tmp<-score.cache$node.defn[,as.numeric(colnames(dag.tmp))]
colnames(tmp)<-as.numeric(colnames(dag.tmp))
sc <- cbind(tmp,-score.cache[[score]])
}
if(score=="mlik"){
tmp<-score.cache$node.defn[,as.numeric(colnames(dag.tmp))]
colnames(tmp)<-as.numeric(colnames(dag.tmp))
sc <- cbind(tmp,score.cache[[score]])
}
##scoring init: sum the cached score of each node's exact parent set
score.init <- 0
for(a in 1:n.var){
sc.tmp <- sc[score.cache$children==as.numeric(colnames(dag.tmp)[a]),]
score.init <- sum(min(sc.tmp[which(apply(sc.tmp, 1, function(x) identical(unname(x[1:n.var]), unname(dag.tmp[a,])))),n.var+1]),score.init)
}
##out: record the initial state in slot 1
out.dags[,,1] <- dag.tmp
out.scores[1] <- score <- score.init
out.alpha[1] <- alpha <- 0
##start mcmc search: burn-in phase, every proposal counts as one step
if(verbose) cat("Start MCMC burn in \n");
j <- 1
while(j<=mcmc.scheme[3]){
####################################################
##REV move (attempted with probability prob.rev)
####################################################
if(rbinom(n = 1,size = 1,prob = prob.rev)==1){
out <- REV(n.var,(dag.tmp),max.parents,sc,score.cache,score,prior,verbose)
dag.tmp <- out$dag.tmp
score <- out$score
j<-j+1
}#EOF: REV move
else{
out <- mc3(n.var,(dag.tmp),max.parents,sc,score.cache,score,prior,verbose)
dag.tmp <- out$dag.tmp
score <- out$score
j<-j+1
}
}
###EOF: Burnin phase!
if(verbose) cat("Start MCMC search \n");
##sampling phase: one thinning window per retained sample
for(mcmc.search in 1:mcmc.scheme[1]){
if (verbose) cat("processing search ...", mcmc.search,"\n");
##start blind search
j <- 0
while(j<=mcmc.scheme[2]){
if(rbinom(n = 1,size = 1,prob = prob.rev)==1){
out <- REV(n.var,(dag.tmp),max.parents,sc,score.cache,score,prior,verbose)
dag.tmp <- out$dag.tmp
score <- out$score
alpha <- out$alpha
j <- j+1 ## while update
}#EOF: REV move
else{
out <- mc3(n.var,(dag.tmp),max.parents,sc,score.cache,score,prior,verbose)
dag.tmp <- out$dag.tmp
score <- out$score
alpha <- out$alpha
j <- j+1 ## while update
}
}#EOF mcmc search
## store one thinned sample (state at the end of the window)
out.dags[,,mcmc.search+1] <- dag.tmp
out.scores[mcmc.search+1] <- score
out.alpha[mcmc.search+1] <- alpha
}#EOF mcmc search
return(list(dags = out.dags,
scores = out.scores,
alpha = out.alpha))
}#eof
% varrank.Rd ---
% Author : Gilles Kratzer
% Created on : 29.11.2017
% Last modification :
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\name{varrank}
\alias{varrank}
\title{Heuristics Tools Based on Mutual Information for Variable Ranking and Feature Selection}
\usage{
varrank(data.df = NULL, variable.important = NULL, method =
c("battiti", "kwak", "peng", "estevez"), algorithm =
c("forward", "backward"),
scheme = c("mid", "miq"),
discretization.method = NULL, ratio = NULL, n.var =
NULL, verbose = TRUE)
}
\arguments{
\item{data.df}{a named data frame with either numeric or factor variables.}
\item{variable.important}{a list of variables that is the target variables.}
\item{method}{the method to be used. See \sQuote{Details}. Default is \code{"estevez"}.}
\item{algorithm}{the algorithm scheme to be used. Default is \code{"forward"}.}
\item{scheme}{the scheme search to be used. \code{"mid"} and \code{"miq"} stands for the Mutual Information Difference and Quotient schemes, respectively. Those are two ways to combine the relevance and redundancy. They are the two most used mRMRe schemes}
\item{discretization.method}{a character vector giving the discretization method to use. See \code{\link{discretization}}.}
\item{ratio}{parameter to be used in \code{"battiti"} and \code{"kwak"}.}
\item{n.var}{number of variables to be returned}
\item{verbose}{logical. Should a progress bar be plotted as the search proceeds?}
}
\description{
This function heuristically estimates the variables ranks based on mutual information with multiple model and multiple search schemes.
}
\details{
By default \code{varrank} performs a variable ranking based on forward search algorithm using mutual information. The scoring is based on one of the four models implemented. The input dataset can be discrete, continuous or mixed variables. The target variables can be a set.
The filter approach based on mutual information is the Minimum Redundancy Maximum Relevance (mRMRe) algorithm. A general formulation of the ensemble of mRMRe techniques is, given a set of features F, a subset of important features C, a candidate feature f_i and possibly some already selected features f_s in S. The local score function for a mid scheme (Mutual Information Difference) is expressed as:
\deqn{g_i(A, C, S, f_i) = MI(f_i;C) - \sum_{f_s} A(f_i, f_s, C) MI(f_i; f_s)}{%
g_i(A, C, S, F) = MI(f_i;C) - \sum_{f_s} A(f_i, f_s, C) MI(f_i; f_s)}
Depending on the value of \code{method}, the values of A and B are set as follows:
\code{battiti} defined A=B, where B is a user defined parameter (called ratio). Battiti (1994).
\code{kwak} defined A = B MI(f_s;C)/H(f_s), where B a user defined parameter (called ratio). Kwak et al. (2002).
\code{peng} defined A=1/|S|. Peng et al. (2005).
\code{estevez} defined A = 1/|S| min(H(f_i), H(f_s)). Estévez et al. (2009).
The search algorithms implemented are a forward search, i.e. start with an empty set S and fill it in; the returned list is ranked by decreasing order of relevance. A backward search starts with a full set, i.e. all variables except \code{variable.importance} are considered as selected, and the returned list is in increasing order of relevance based solely on mutual information estimation. Thus a backward search will only check for relevance and not for redundancy. \code{n.var} is optional; if it is not provided, all candidate variables will be ranked.
}
\value{A list with an entry for the variables ranked and another entry for the score for each ranked variables.}
\author{Gilles Kratzer}
\references{Kratzer, G. Furrer, R. "varrank: an R package for variable ranking based on mutual information with applications to system epidemiology"}
\examples{
library(mlbench)
data(PimaIndiansDiabetes)
##forward search for all variables
varrank(data.df = PimaIndiansDiabetes,
method = "estevez",
variable.important = "diabetes",
discretization.method = "sturges",
algorithm = "forward", scheme = "mid")
##forward search for 3 variables
varrank(data.df = PimaIndiansDiabetes,
method = "estevez",
variable.important = "diabetes",
discretization.method = "sturges",
algorithm = "forward",
scheme = "mid",
n.var=3)
##backward search for all variables
varrank(data.df = PimaIndiansDiabetes,
method = "peng",
variable.important = "diabetes",
discretization.method = "sturges",
algorithm = "backward",
scheme = "mid",
n.var=NULL)
}
Version: 1.0
RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: knitr
LaTeX: pdfLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
model{
###-----------------------
###Gaussian nodes
###-----------------------
a ~ dnorm(mu.a, precision.a); #Gausianne response
mu.a <- 0; #linear regression
precision.a <- 1;
###-----------------------
###Gaussian nodes
###-----------------------
b ~ dnorm(mu.b, precision.b); #Gausianne response
mu.b <- 0 + 0.09*a; #linear regression
precision.b <- 1;
###-----------------------
###Gaussian nodes
###-----------------------
c ~ dnorm(mu.c, precision.c); #Gausianne response
mu.c <- 0 + 0.09*a + 0.09*b; #linear regression
precision.c <- 1;
###-----------------------
###Gaussian nodes