Commit 160dc3fe authored by Manuela Brunner's avatar Manuela Brunner

Addition of ad.test

Modification of fitting and backtransformation to allow for any type of distribution.
Problem with the argument CDF. It currently is a variable, which needs to be pre-specified. How can we just provide any character string e.g. "gev" instead of using CDF <- "gev"?
parent 6d468b45
......@@ -17,7 +17,7 @@ Description: Provides a simulation framework to simulate streamflow time series
streamflow values and their temporal correlation as expressed by short- and long-range
dependence. The approach is based on the randomization of the phases of the Fourier
transform. We further use the flexible four-parameter Kappa distribution, which allows
for the extrapolation to yet unobserved low and high flows. A detailed description of
for the extrapolation to yet unobserved low and high flows. Alternatively, the empirical or any other distribution can be used. A detailed description of
the simulation approach and an application example can be found
in https://www.hydrol-earth-syst-sci-discuss.net/hess-2019-142/.
URL: https://git.math.uzh.ch/reinhard.furrer/PRSim-devel
......
importFrom("homtest", "Lmoments", "rand.kappa", "par.kappa")
importFrom("stats", "rnorm", "runif", "fft", "ks.test")
importFrom("graphics", "hist")
export("prsim")
prsim <- function(data, station_id="Qobs", number_sim=1, win_h_length=15,
marginal=c("kappa","empirical","CDF.fit"), draw="rCDF",
marginal=c("kappa","empirical",CDF),n_par=4,
verbose=TRUE, marginalpar=TRUE, suppWarn=FALSE, GoFtest=c(NULL, "AD", "KS"), ...){
ifft <- function (x) fft(x, inverse = TRUE)/length(x)
......@@ -31,6 +31,8 @@ prsim <- function(data, station_id="Qobs", number_sim=1, win_h_length=15,
data$timestamp <- as.POSIXct(strptime(tmp, format="%Y %m %d", tz="GMT"))
}
### test whether r_CDF, p_CDF, and CDF_fit are properly defined.
### @Reinhard: could you please implement these tests?
### remove February 29
data <- data[format(data$timestamp, "%m %d") != "02 29",]
......@@ -81,8 +83,8 @@ prsim <- function(data, station_id="Qobs", number_sim=1, win_h_length=15,
### To enable a sufficient sample size by using daily values in moving window around day i (i.e., reduce uncertainty due to fitting)
if(marginal=="kappa"){
p_vals <- numeric(365)
kap_par_day <- matrix(0, nrow=365, ncol=4)
density_kap <- list()
par_day <- matrix(0, nrow=365, ncol=4)
# density_kap <- list()
### define window length
win_length <- c(1:win_h_length)
for(d in c(1:365)){
......@@ -105,7 +107,7 @@ prsim <- function(data, station_id="Qobs", number_sim=1, win_h_length=15,
if(length(test)>1){
kap_par <- test
kap_par_day[d,] <- unlist(kap_par)
par_day[d,] <- unlist(kap_par)
### define vector of quantiles
quant <- sort(data_window)
thresh <- kap_par$xi + kap_par$alfa*(1 - kap_par$h^(-kap_par$k))/kap_par$k
......@@ -123,32 +125,73 @@ prsim <- function(data, station_id="Qobs", number_sim=1, win_h_length=15,
if (tolower(GoFtest)=="ks")
p_vals[d] <- ks.test(data_window, data_kap)$p.value ### kappa distribution not rejected at alpha=0.05
if (tolower(GoFtest)=="ad")
p_vals[d] <- ad.test(data_window, F.kappa, xi=kap_par$xi,alfa=kap_par$alfa, k=kap_par$k, h=kap_par$h )$p.value ### NOTE: TRY
if(length(try(ad.test(data_window,F.kappa,xi=kap_par$xi,alfa=kap_par$alfa,k=kap_par$k,h=kap_par$h)))==1){
p_vals[d] <- NA
}else{
p_vals[d] <- ad.test(data_window, F.kappa, xi=kap_par$xi, alfa=kap_par$alfa, k=kap_par$k, h=kap_par$h)$p.value
}
} else{
if(d==1){
p_vals[d] <- NA
kap_par_day[d,] <- NA
par_day[d,] <- NA
}else{
p_vals[d] <- p_vals[d-1]
kap_par_day[d,] <- kap_par_day[d-1,]
par_day[d,] <- par_day[d-1,]
}
}
}
###qu: check if subsequent NAs work properly.
### replace NA entries by values of subsequent day
if(length(which(is.na(kap_par_day[,1])))>0){
indices <- rev(which(is.na(kap_par_day[,1])))
if(length(which(is.na(par_day[,1])))>0){
indices <- rev(which(is.na(par_day[,1])))
for(i in 1:length(indices)){
kap_par_day[indices[i],] <- kap_par_day[indices[i]+1,]
par_day[indices[i],] <- par_day[indices[i]+1,]
}
}
}
# which(unlist(p_val_list)<0.05) ### non-rejected for most days, difficulty with winter months (relation to hp production?)
### use either a predefined distribution in R or define own function
if(marginal!=c("kappa","empirical")){
p_vals <- numeric(365)
par_day <- matrix(0, nrow=365, ncol=n_par)
for(d in c(1:365)){
### define window length
win_length <- seq(1:15)
### define start and end of window
before <- data$index[d+365-win_length]
after <- data$index[d+365+win_length-1]
### define days within window
ids <- c(before,after)
### determine values in window around day i
data_window <- data$Qobs[which(data$index%in%ids)]
### fit generalized gamma distribution
# fitdistr(x=data_window,dgengamma.stacy,start=list("scale"=1,"k"=6,"d"=2)) ### package VGAM: fitting does not work
### use function from package felxsurv
# theta <- CDF.fit(xdat=data_window)
theta <- match.fun(paste(CDF,"_fit",sep=""))(xdat=data_window)
# do.call(paste(CDF,".fit",sep=""),list(xdat=data_window))
### goodness of fit test
# data_random <- rCDF(n=length(data_window),theta=theta)
data_random <- match.fun(paste("r_",CDF,sep=""))(n=length(data_window),theta=theta)
# density_gengam[[d]] <- density(data_gengam)
hist(data_window)
hist(data_random,add=T,col="red")
if (tolower(GoFtest)=="ks"){
p_vals[d] <- ks.test(data_window,data_random)$p.value
}
if (tolower(GoFtest)=="ad"){
p_vals[d] <- ad.test(data_window,match.fun(paste("p_",CDF,sep="")),theta)$p.value
}
### store parameters
par_day[d,] <- theta
}
}
### Deseasonalization was not found to be necessary. Use normalized data directly.
data$des <- data$norm
......@@ -249,13 +292,13 @@ prsim <- function(data, station_id="Qobs", number_sim=1, win_h_length=15,
### use kappa distribution for backtransformation
if(marginal=="kappa"){
colnames(kap_par_day) <- names(kap_par)
colnames(par_day) <- names(kap_par)
### use monthly Kappa distribution for backtransformation
### simulate random sample of size n from Kappa disribution
data_day$kappa <- rand.kappa(length(data_day$Qobs),
xi=kap_par_day[d,"xi"],alfa=kap_par_day[d,"alfa"],
k=kap_par_day[d,"k"],h=kap_par_day[d,"h"])
xi=par_day[d,"xi"],alfa=par_day[d,"alfa"],
k=par_day[d,"k"],h=par_day[d,"h"])
# if(marginal=="wakeby"){
# ### use Wakeby distribution for backtrasformation
......@@ -295,6 +338,25 @@ prsim <- function(data, station_id="Qobs", number_sim=1, win_h_length=15,
data_new$simulated_seasonal[which(data_new$index%in%c(d))] <- data_ordered$Qobs[data_new$rank[which(data$index%in%c(d))]]
# }
}
### use any predefined distribution for backtransformation
if(marginal!=c("kappa","empirical")){
### use monthly distribution for backtransformation
### simulate random sample of size n from disribution
data_day$cdf <- match.fun(paste("r_",CDF,sep=""))(n=length(data_day$Qobs),par_day[d,])
data_day$rank <- rank(data_day$cdf)
data_new$rank <- rank(data_new$seasonal)
# hist(data_day$Qobs)
# hist(data_day$cdf,add=T,col="blue")
# data_day$rank <- rank(data_day$cdf)
data_new$rank[which(data$index%in%c(d))] <- rank(data_new[which(data$index%in%c(d)),]$seasonal)
### derive corresponding values from the kappa distribution
### identify value corresponding to rank in the kappa time series
data_ordered <- data_day[order(data_day$rank),]
data_new$simulated_seasonal[which(data_new$index%in%c(d))] <- data_ordered$cdf[data_new$rank[which(data$index%in%c(d))]]
}
} # end for loop
data_sim[[r]] <- data_new$simulated_seasonal
......@@ -310,13 +372,13 @@ prsim <- function(data, station_id="Qobs", number_sim=1, win_h_length=15,
deseaonalized=data$des,
data_sim)
if (!KStest) {
if (is.null(GoFtest)) {
p_vals <- NULL
}
if(marginal=="kappa"){
if (marginalpar) { # also return intermediate results
return(list( simulation=data_stoch, pars=kap_par_day, p_val=p_vals))
return(list( simulation=data_stoch, pars=par_day, p_val=p_vals))
} else {
return(list( simulation=data_stoch, pars=NULL, p_val=p_vals))
}
......
new argument
marginal=c("kappa", "empirical", "gev.fit")
drawmarginal="rgev"
rCDF <- function(n, theta) {
rnorm(n, theta[1], sqrt(theta[2]))
}
CDF.fit <- function(xdat,...) { gev.fit(xdat,..., show=FALSE)$mle}
> data(portpirie)
> gev.fttt(portpirie[,2])
\ No newline at end of file
......@@ -2,8 +2,40 @@
data(runoff)
out <- prsim(data=runoff, number_sim=1, marginal="empirical")
out <- prsim(data=runoff, number_sim=1, marginal="kappa")
out <- prsim(data=runoff, number_sim=1, marginal="kappa",GoFtest = "KS")
### GEV distribution
r_gev <- function(n, theta){
rgev(n, theta[1], theta[2], theta[3])
}
p_gev <- function(x, theta){
pgev(x, theta[1], theta[2], theta[3])
}
gev_fit <- function( xdat, ...) {
gev.fit( xdat, ...)$mle
}
### generalized Pareto distribution of the second kind
r_gb2 <- function(n, theta){
rgb2(n, theta[1], theta[2], theta[3], theta[4])
}
p_gb2 <- function(x, theta){
pgb2(x, theta[1], theta[2], theta[3], theta[4])
}
gb2_fit <- function( xdat, ...) {
mlfit.gb2( xdat, ...)[[2]]$par
}
### @ Reinhard: This is not very elegant yet. CDF needs to be specified before giving it to the function.
### I guess this can be alternatively resolved by using a different argument for marginal. Any ideas?
CDF <-"gev"
out <- prsim(data=runoff, number_sim=1, marginal=CDF,GoFtest = "KS",n_par=3)
CDF <-"gb2"
out <- prsim(data=runoff, number_sim=1, marginal="gb2",GoFtest = "KS",n_par=4)
sim <- out$simulation
# p_val <- out$p_val
par(mai=c(.9,.9,.1,.1))
plot(sim$timestamp[1:1000], sim$Qobs[1:1000], type="l",
xlab="Time [d]", ylab=expression(paste("Discharge [m"^3,"/s]")))
......
......@@ -6,19 +6,20 @@
Applies the algorithm to a single station
}
\usage{
prsim(data, station_id="Qobs", number_sim=1, win_h_length=15,marginal="kappa",
verbose=TRUE, kappapar=TRUE, suppWarn=FALSE, KStest=FALSE, ...)
prsim(data, station_id="Qobs", number_sim=1, win_h_length=15,marginal="kappa",n_par=4,
verbose=TRUE, marginalpar=TRUE, suppWarn=FALSE, GoFtest=FALSE, ...)
}
\arguments{
\item{data}{data frame containing the time indications and runoff of at least one station. See \sQuote{Details}.}
\item{station_id}{identifies the station in case several runoffs are present in \code{data}. See \sQuote{Details}.}
\item{win_h_length}{(half-)length of moving window size.}
\item{number_sim}{number of simulations to be carried out.}
\item{marginal}{marginal distribution to be used for the backtransformation.}
\item{marginal}{marginal distribution to be used for the backtransformation. Can be either "kappa", "empirical", or CDF. "kappa" uses the four-parameter kappa distribution for backtransformation, "empirical" uses the empirical distribution. CDF allows for specifying any distribution \sQuote{Examples}.}
\item{n_par}{number of parameters of the marginal distribution used}
\item{verbose}{logical. Should progress be reported?}
\item{marginalpar}{logical. Should the estimated parameters of the kappa distribution be returned?}
\item{marginalpar}{logical. Should the estimated parameters of the distribution used be returned?}
\item{suppWarn}{logical. See \sQuote{Details}.}
\item{KStest}{logical. Should a \code{ks.test} for daily data be performed and the p-values be passed back?}
\item{GoFtest}{NULL, KS or AD. Should a goodness-of-fit test for daily data be performed and the p-values be passed back? NULL performs no test, KS performs a Kolmogorof-Smirnov test, and AD performs an Anderson-Darling test.}
\item{...}{any other argument passed to the sub-function specifying the cdf for fitting and drawing. See \sQuote{Details} and \sQuote{Examples}. }
}
\details{
......@@ -61,20 +62,33 @@ out <- prsim( runoff[ runoff$YYYY<1980, ], "Qobs", 1, suppWarn=TRUE)
# warnings() # as a follow-up to `suppWarn=TRUE`
## Specifying particular CDF:
## example with generalized Beta distribution of the second kind
require( "GB2")
rCDF <- function(n, theta){
CDF <- "gb2"
r_gb2 <- function(n, theta){
rgb2(n, theta[1], theta[2], theta[3], theta[4])
}
pCDF <- function(x, theta){
p_gb2 <- function(x, theta){
pgb2(x, theta[1], theta[2], theta[3], theta[4])
}
CDF.fit <- function( xdat, ...) {
gb2_fit <- function( xdat, ...) {
mlfit.gb2( xdat, ...)[[2]]$par
}
out <- prsim( runoff[ runoff$YYYY<1980, ], "Qobs", 1, suppWarn=TRUE,
marginal="CDF.fit", draw="rCDF")
## example with the Generalized Extreme Value (GEV) distribution
### test with GEV distribution
CDF <- "gev"
r_gev <- function(n, theta){
rgev(n, theta[1], theta[2], theta[3])
}
p_gev <- function(x, theta){
pgev(x, theta[1], theta[2], theta[3])
}
gev_fit <- function( xdat, ...) {
gev.fit( xdat, ...)$mle
}
}
\keyword{ts}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment